{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999969938373666, "eval_steps": 500, "global_step": 16632, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.05028445, "auxiliary_loss_mlp": 0.02215396, "balance_loss_clip": 1.76983953, "balance_loss_mlp": 2.43573999, "epoch": 6.012325266796934e-05, "flos": 24456507091200.0, "grad_norm": 55.002780037447025, "language_loss": 2.85272503, "learning_rate": 0.0, "loss": 1.94613922, "num_input_tokens_seen": 19155, "router_z_loss_clip": 4.4375, "router_z_loss_mlp": 26.0, "step": 1, "time_per_iteration": 20.923267126083374 }, { "auxiliary_loss_clip": 0.03380736, "auxiliary_loss_mlp": 0.01460456, "balance_loss_clip": 1.19037545, "balance_loss_mlp": 1.62782836, "epoch": 0.00012024650533593868, "flos": 20225931246720.0, "grad_norm": 35.00766799207303, "language_loss": 1.82724655, "learning_rate": 4.4628432569317594e-07, "loss": 1.87565851, "num_input_tokens_seen": 36175, "router_z_loss_clip": 2.703125, "router_z_loss_mlp": 17.5, "step": 2, "time_per_iteration": 2.515800952911377 }, { "auxiliary_loss_clip": 0.03319475, "auxiliary_loss_mlp": 0.01440756, "balance_loss_clip": 1.1884141, "balance_loss_mlp": 1.62563479, "epoch": 0.000180369758003908, "flos": 22309935454080.0, "grad_norm": 32.4544237690822, "language_loss": 1.57471132, "learning_rate": 7.073439208833112e-07, "loss": 1.62231374, "num_input_tokens_seen": 54870, "router_z_loss_clip": 2.53125, "router_z_loss_mlp": 17.0, "step": 3, "time_per_iteration": 2.447206974029541 }, { "auxiliary_loss_clip": 0.03362551, "auxiliary_loss_mlp": 0.0145335, "balance_loss_clip": 1.15675771, "balance_loss_mlp": 1.62406373, "epoch": 0.00024049301067187735, "flos": 22414650577920.0, "grad_norm": 51.450822891669105, "language_loss": 1.67611313, "learning_rate": 8.925686513863519e-07, "loss": 1.72427213, "num_input_tokens_seen": 74575, "router_z_loss_clip": 2.96875, "router_z_loss_mlp": 17.375, "step": 4, "time_per_iteration": 2.492478132247925 }, { "auxiliary_loss_clip": 0.03402324, "auxiliary_loss_mlp": 0.0150522, "balance_loss_clip": 1.21740139, "balance_loss_mlp": 1.62474608, "epoch": 0.0003006162633398467, "flos": 21396978449280.0, "grad_norm": 56.3167392166388, "language_loss": 1.9156754, "learning_rate": 1.0362401141348472e-06, "loss": 1.96475077, "num_input_tokens_seen": 92580, "router_z_loss_clip": 2.875, "router_z_loss_mlp": 17.75, "step": 5, "time_per_iteration": 2.653923273086548 }, { "auxiliary_loss_clip": 0.03370897, "auxiliary_loss_mlp": 0.01514012, "balance_loss_clip": 1.21989846, "balance_loss_mlp": 1.61578369, "epoch": 0.000360739516007816, "flos": 21652375127040.0, "grad_norm": 33.570335476146674, "language_loss": 1.61133313, "learning_rate": 1.153628246576487e-06, "loss": 1.66018212, "num_input_tokens_seen": 109705, "router_z_loss_clip": 2.9375, "router_z_loss_mlp": 17.625, "step": 6, "time_per_iteration": 2.646034002304077 }, { "auxiliary_loss_clip": 0.03354521, "auxiliary_loss_mlp": 0.01487391, "balance_loss_clip": 1.20376813, "balance_loss_mlp": 1.61579108, "epoch": 0.0004208627686757854, "flos": 27159742897920.0, "grad_norm": 24.790091262266923, "language_loss": 1.53451025, "learning_rate": 1.2528784983718962e-06, "loss": 1.58292937, "num_input_tokens_seen": 129425, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 17.375, "step": 7, "time_per_iteration": 2.695610761642456 }, { "auxiliary_loss_clip": 0.03321864, "auxiliary_loss_mlp": 0.0144262, "balance_loss_clip": 1.16510093, "balance_loss_mlp": 1.61204803, "epoch": 0.0004809860213437547, "flos": 31319096135040.0, "grad_norm": 31.737457325875923, "language_loss": 1.43369198, "learning_rate": 1.338852977079528e-06, "loss": 1.48133683, "num_input_tokens_seen": 149210, "router_z_loss_clip": 2.78125, "router_z_loss_mlp": 17.125, "step": 8, "time_per_iteration": 2.764416217803955 }, { "auxiliary_loss_clip": 0.03369654, "auxiliary_loss_mlp": 0.0149824, "balance_loss_clip": 1.21385467, "balance_loss_mlp": 1.61175644, "epoch": 0.000541109274011724, "flos": 32160411463680.0, "grad_norm": 28.326617474751497, "language_loss": 1.50079083, "learning_rate": 1.4146878417666224e-06, "loss": 1.54946971, "num_input_tokens_seen": 169055, "router_z_loss_clip": 2.84375, "router_z_loss_mlp": 17.5, "step": 9, "time_per_iteration": 2.8139901161193848 }, { "auxiliary_loss_clip": 0.03309875, "auxiliary_loss_mlp": 0.01476065, "balance_loss_clip": 1.20693851, "balance_loss_mlp": 1.6156323, "epoch": 0.0006012325266796934, "flos": 18916808163840.0, "grad_norm": 23.315567799801094, "language_loss": 1.44711208, "learning_rate": 1.4825244398280232e-06, "loss": 1.49497151, "num_input_tokens_seen": 188045, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 17.0, "step": 10, "time_per_iteration": 2.680021047592163 }, { "auxiliary_loss_clip": 0.03365522, "auxiliary_loss_mlp": 0.01493334, "balance_loss_clip": 1.21943903, "balance_loss_mlp": 1.62093043, "epoch": 0.0006613557793476627, "flos": 20774861867520.0, "grad_norm": 18.20693783158409, "language_loss": 1.45029891, "learning_rate": 1.5438901072051983e-06, "loss": 1.49888754, "num_input_tokens_seen": 207035, "router_z_loss_clip": 2.75, "router_z_loss_mlp": 17.5, "step": 11, "time_per_iteration": 2.6827776432037354 }, { "auxiliary_loss_clip": 0.03293183, "auxiliary_loss_mlp": 0.01449295, "balance_loss_clip": 1.17253923, "balance_loss_mlp": 1.60774899, "epoch": 0.000721479032015632, "flos": 16581680997120.0, "grad_norm": 16.77783148949443, "language_loss": 1.45074821, "learning_rate": 1.5999125722696629e-06, "loss": 1.498173, "num_input_tokens_seen": 223225, "router_z_loss_clip": 2.765625, "router_z_loss_mlp": 16.75, "step": 12, "time_per_iteration": 2.7423782348632812 }, { "auxiliary_loss_clip": 0.03322964, "auxiliary_loss_mlp": 0.01403731, "balance_loss_clip": 1.1456672, "balance_loss_mlp": 1.61755228, "epoch": 0.0007816022846836014, "flos": 23805471144960.0, "grad_norm": 11.028926443177502, "language_loss": 1.23468924, "learning_rate": 1.6514482443788434e-06, "loss": 1.2819562, "num_input_tokens_seen": 242570, "router_z_loss_clip": 2.578125, "router_z_loss_mlp": 17.125, "step": 13, "time_per_iteration": 2.7579774856567383 }, { "auxiliary_loss_clip": 0.03287422, "auxiliary_loss_mlp": 0.01469266, "balance_loss_clip": 1.20128357, "balance_loss_mlp": 1.61271715, "epoch": 0.0008417255373515708, "flos": 19172204841600.0, "grad_norm": 5.9487907386258545, "language_loss": 1.20655715, "learning_rate": 1.6991628240650723e-06, "loss": 1.25412416, "num_input_tokens_seen": 261215, "router_z_loss_clip": 2.6875, "router_z_loss_mlp": 16.75, "step": 14, "time_per_iteration": 2.6400089263916016 }, { "auxiliary_loss_clip": 0.03272541, "auxiliary_loss_mlp": 0.01431463, "balance_loss_clip": 1.16824889, "balance_loss_mlp": 1.6179924, "epoch": 0.00090184879001954, "flos": 26395564026240.0, "grad_norm": 6.53506849130552, "language_loss": 1.12771595, "learning_rate": 1.7435840350181584e-06, "loss": 1.17475605, "num_input_tokens_seen": 280035, "router_z_loss_clip": 2.640625, "router_z_loss_mlp": 16.5, "step": 15, "time_per_iteration": 2.7431955337524414 }, { "auxiliary_loss_clip": 0.03238257, "auxiliary_loss_mlp": 0.01410712, "balance_loss_clip": 1.1612308, "balance_loss_mlp": 1.60280132, "epoch": 0.0009619720426875094, "flos": 24679500785280.0, "grad_norm": 5.827494999702572, "language_loss": 1.11190605, "learning_rate": 1.7851373027727038e-06, "loss": 1.15839577, "num_input_tokens_seen": 300265, "router_z_loss_clip": 2.5, "router_z_loss_mlp": 16.375, "step": 16, "time_per_iteration": 2.69842791557312 }, { "auxiliary_loss_clip": 0.03223906, "auxiliary_loss_mlp": 0.01417185, "balance_loss_clip": 1.17800379, "balance_loss_mlp": 1.60904455, "epoch": 0.0010220952953554788, "flos": 18624531196800.0, "grad_norm": 5.846935462451212, "language_loss": 1.12538886, "learning_rate": 1.8241705979033208e-06, "loss": 1.17179978, "num_input_tokens_seen": 317375, "router_z_loss_clip": 2.390625, "router_z_loss_mlp": 16.125, "step": 17, "time_per_iteration": 4.803100109100342 }, { "auxiliary_loss_clip": 0.03159669, "auxiliary_loss_mlp": 0.01378532, "balance_loss_clip": 1.14736104, "balance_loss_mlp": 1.60659766, "epoch": 0.001082218548023448, "flos": 26142537646080.0, "grad_norm": 3.9624754448671076, "language_loss": 1.08050489, "learning_rate": 1.860972167459798e-06, "loss": 1.12588692, "num_input_tokens_seen": 337975, "router_z_loss_clip": 2.3125, "router_z_loss_mlp": 15.5625, "step": 18, "time_per_iteration": 4.165529012680054 }, { "auxiliary_loss_clip": 0.03187014, "auxiliary_loss_mlp": 0.01400723, "balance_loss_clip": 1.13541031, "balance_loss_mlp": 1.60598075, "epoch": 0.0011423418006914173, "flos": 19609776322560.0, "grad_norm": 4.813716243136311, "language_loss": 1.02276397, "learning_rate": 1.89578346593066e-06, "loss": 1.06864142, "num_input_tokens_seen": 356635, "router_z_loss_clip": 2.65625, "router_z_loss_mlp": 15.8125, "step": 19, "time_per_iteration": 2.673365354537964 }, { "auxiliary_loss_clip": 0.03129146, "auxiliary_loss_mlp": 0.01340713, "balance_loss_clip": 1.12089109, "balance_loss_mlp": 1.60713637, "epoch": 0.0012024650533593868, "flos": 17895365107200.0, "grad_norm": 3.78901252667527, "language_loss": 1.16628087, "learning_rate": 1.928808765521199e-06, "loss": 1.21097946, "num_input_tokens_seen": 375625, "router_z_loss_clip": 2.203125, "router_z_loss_mlp": 15.1875, "step": 20, "time_per_iteration": 2.6582229137420654 }, { "auxiliary_loss_clip": 0.03115927, "auxiliary_loss_mlp": 0.0137943, "balance_loss_clip": 1.1303302, "balance_loss_mlp": 1.58817291, "epoch": 0.001262588306027356, "flos": 21252043071360.0, "grad_norm": 4.331909729197668, "language_loss": 1.06073928, "learning_rate": 1.9602224192552076e-06, "loss": 1.10569286, "num_input_tokens_seen": 394350, "router_z_loss_clip": 2.484375, "router_z_loss_mlp": 15.25, "step": 21, "time_per_iteration": 2.6800522804260254 }, { "auxiliary_loss_clip": 0.03008573, "auxiliary_loss_mlp": 0.01376065, "balance_loss_clip": 1.14355898, "balance_loss_mlp": 1.56995535, "epoch": 0.0013227115586953253, "flos": 26104077158400.0, "grad_norm": 3.331300770836244, "language_loss": 1.05753624, "learning_rate": 1.9901744328983746e-06, "loss": 1.10138273, "num_input_tokens_seen": 413255, "router_z_loss_clip": 2.328125, "router_z_loss_mlp": 14.375, "step": 22, "time_per_iteration": 2.7255592346191406 }, { "auxiliary_loss_clip": 0.02962594, "auxiliary_loss_mlp": 0.01335442, "balance_loss_clip": 1.12601566, "balance_loss_mlp": 1.571437, "epoch": 0.0013828348113632948, "flos": 23951376190080.0, "grad_norm": 3.60007074757765, "language_loss": 0.91700411, "learning_rate": 2.018794797290208e-06, "loss": 0.95998454, "num_input_tokens_seen": 433065, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 13.9375, "step": 23, "time_per_iteration": 2.686047315597534 }, { "auxiliary_loss_clip": 0.02931367, "auxiliary_loss_mlp": 0.01361907, "balance_loss_clip": 1.14198995, "balance_loss_mlp": 1.56360352, "epoch": 0.001442958064031264, "flos": 15959851724160.0, "grad_norm": 2.8346819440035254, "language_loss": 1.08114934, "learning_rate": 2.046196897962839e-06, "loss": 1.12408209, "num_input_tokens_seen": 451175, "router_z_loss_clip": 2.1875, "router_z_loss_mlp": 13.625, "step": 24, "time_per_iteration": 2.71943998336792 }, { "auxiliary_loss_clip": 0.0282186, "auxiliary_loss_mlp": 0.01329136, "balance_loss_clip": 1.11904109, "balance_loss_mlp": 1.5570333, "epoch": 0.0015030813166992333, "flos": 18108350801280.0, "grad_norm": 3.603074420734016, "language_loss": 1.0131526, "learning_rate": 2.0724802282696944e-06, "loss": 1.05466247, "num_input_tokens_seen": 468775, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 12.625, "step": 25, "time_per_iteration": 2.6328227519989014 }, { "auxiliary_loss_clip": 0.0281557, "auxiliary_loss_mlp": 0.01309778, "balance_loss_clip": 1.10120916, "balance_loss_mlp": 1.55955076, "epoch": 0.0015632045693672028, "flos": 22234558763520.0, "grad_norm": 3.1324755320936757, "language_loss": 1.06605864, "learning_rate": 2.0977325700720194e-06, "loss": 1.1073122, "num_input_tokens_seen": 488530, "router_z_loss_clip": 2.09375, "router_z_loss_mlp": 12.5625, "step": 26, "time_per_iteration": 2.7081079483032227 }, { "auxiliary_loss_clip": 0.02761435, "auxiliary_loss_mlp": 0.01326279, "balance_loss_clip": 1.12648439, "balance_loss_mlp": 1.55064106, "epoch": 0.001623327822035172, "flos": 23991955580160.0, "grad_norm": 2.540777821148759, "language_loss": 0.95503241, "learning_rate": 2.122031762649933e-06, "loss": 0.99590945, "num_input_tokens_seen": 510495, "router_z_loss_clip": 2.0, "router_z_loss_mlp": 12.125, "step": 27, "time_per_iteration": 2.729092836380005 }, { "auxiliary_loss_clip": 0.02739357, "auxiliary_loss_mlp": 0.01314131, "balance_loss_clip": 1.13321924, "balance_loss_mlp": 1.5550952, "epoch": 0.0016834510747031415, "flos": 19677647070720.0, "grad_norm": 3.1855353690189077, "language_loss": 1.06314528, "learning_rate": 2.1454471497582483e-06, "loss": 1.10368013, "num_input_tokens_seen": 528605, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 11.875, "step": 28, "time_per_iteration": 2.6816701889038086 }, { "auxiliary_loss_clip": 0.02705851, "auxiliary_loss_mlp": 0.01319635, "balance_loss_clip": 1.13395417, "balance_loss_mlp": 1.53992271, "epoch": 0.0017435743273711108, "flos": 20923819568640.0, "grad_norm": 2.875022540776029, "language_loss": 1.02511978, "learning_rate": 2.1680407726407727e-06, "loss": 1.06537473, "num_input_tokens_seen": 548515, "router_z_loss_clip": 1.859375, "router_z_loss_mlp": 11.6875, "step": 29, "time_per_iteration": 2.70896053314209 }, { "auxiliary_loss_clip": 0.02700753, "auxiliary_loss_mlp": 0.0131403, "balance_loss_clip": 1.12720549, "balance_loss_mlp": 1.53668237, "epoch": 0.00180369758003908, "flos": 19528976678400.0, "grad_norm": 5.120489109097819, "language_loss": 1.19444013, "learning_rate": 2.189868360711334e-06, "loss": 1.23458803, "num_input_tokens_seen": 564025, "router_z_loss_clip": 1.8671875, "router_z_loss_mlp": 11.625, "step": 30, "time_per_iteration": 2.6940596103668213 }, { "auxiliary_loss_clip": 0.0262282, "auxiliary_loss_mlp": 0.01338797, "balance_loss_clip": 1.15721726, "balance_loss_mlp": 1.52367866, "epoch": 0.0018638208327070496, "flos": 27453169100160.0, "grad_norm": 3.1503465252635223, "language_loss": 1.02464557, "learning_rate": 2.2109801597326265e-06, "loss": 1.06426179, "num_input_tokens_seen": 583345, "router_z_loss_clip": 1.8125, "router_z_loss_mlp": 11.0, "step": 31, "time_per_iteration": 2.79750394821167 }, { "auxiliary_loss_clip": 0.02597047, "auxiliary_loss_mlp": 0.01330844, "balance_loss_clip": 1.15069485, "balance_loss_mlp": 1.52611506, "epoch": 0.0019239440853750188, "flos": 13589460380160.0, "grad_norm": 2.7511595753847713, "language_loss": 0.95525706, "learning_rate": 2.2314216284658796e-06, "loss": 0.99453598, "num_input_tokens_seen": 600010, "router_z_loss_clip": 1.8046875, "router_z_loss_mlp": 10.6875, "step": 32, "time_per_iteration": 2.689899444580078 }, { "auxiliary_loss_clip": 0.02582589, "auxiliary_loss_mlp": 0.01302524, "balance_loss_clip": 1.13496304, "balance_loss_mlp": 1.52225137, "epoch": 0.001984067338042988, "flos": 11253866336640.0, "grad_norm": 3.156644206633407, "language_loss": 0.95300889, "learning_rate": 2.2512340280885094e-06, "loss": 0.99186003, "num_input_tokens_seen": 616295, "router_z_loss_clip": 1.671875, "router_z_loss_mlp": 10.625, "step": 33, "time_per_iteration": 2.637873649597168 }, { "auxiliary_loss_clip": 0.0243552, "auxiliary_loss_mlp": 0.01302185, "balance_loss_clip": 1.14463758, "balance_loss_mlp": 1.489182, "epoch": 0.0020441905907109576, "flos": 22386245898240.0, "grad_norm": 2.3545609601380795, "language_loss": 0.91486943, "learning_rate": 2.270454923596497e-06, "loss": 0.95224637, "num_input_tokens_seen": 637640, "router_z_loss_clip": 1.578125, "router_z_loss_mlp": 9.4375, "step": 34, "time_per_iteration": 2.7548704147338867 }, { "auxiliary_loss_clip": 0.02392591, "auxiliary_loss_mlp": 0.01271569, "balance_loss_clip": 1.11573899, "balance_loss_mlp": 1.45482337, "epoch": 0.0021043138433789266, "flos": 49778580337920.0, "grad_norm": 2.3416821430160377, "language_loss": 0.76565546, "learning_rate": 2.2891186125067434e-06, "loss": 0.80229706, "num_input_tokens_seen": 659710, "router_z_loss_clip": 1.5625, "router_z_loss_mlp": 9.375, "step": 35, "time_per_iteration": 2.934955596923828 }, { "auxiliary_loss_clip": 0.02366294, "auxiliary_loss_mlp": 0.01273817, "balance_loss_clip": 1.12943065, "balance_loss_mlp": 1.46861744, "epoch": 0.002164437096046896, "flos": 20557961591040.0, "grad_norm": 2.631936605910482, "language_loss": 0.88633162, "learning_rate": 2.307256493152974e-06, "loss": 0.92273271, "num_input_tokens_seen": 679670, "router_z_loss_clip": 1.4453125, "router_z_loss_mlp": 9.0, "step": 36, "time_per_iteration": 2.701292037963867 }, { "auxiliary_loss_clip": 0.02309334, "auxiliary_loss_mlp": 0.01334175, "balance_loss_clip": 1.18664145, "balance_loss_mlp": 1.45559943, "epoch": 0.0022245603487148656, "flos": 26542295084160.0, "grad_norm": 2.3215730324259445, "language_loss": 0.93035555, "learning_rate": 2.3248973825097614e-06, "loss": 0.96679068, "num_input_tokens_seen": 700170, "router_z_loss_clip": 1.4765625, "router_z_loss_mlp": 8.5, "step": 37, "time_per_iteration": 2.742539644241333 }, { "auxiliary_loss_clip": 0.02269471, "auxiliary_loss_mlp": 0.01277537, "balance_loss_clip": 1.15460825, "balance_loss_mlp": 1.45036364, "epoch": 0.0022846836013828346, "flos": 20338188226560.0, "grad_norm": 2.484327632279398, "language_loss": 1.04076195, "learning_rate": 2.3420677916238357e-06, "loss": 1.07623208, "num_input_tokens_seen": 718545, "router_z_loss_clip": 1.234375, "router_z_loss_mlp": 8.1875, "step": 38, "time_per_iteration": 2.666599988937378 }, { "auxiliary_loss_clip": 0.02236819, "auxiliary_loss_mlp": 0.0125331, "balance_loss_clip": 1.12923682, "balance_loss_mlp": 1.44274569, "epoch": 0.002344806854050804, "flos": 26247575992320.0, "grad_norm": 2.162565373621999, "language_loss": 0.85372066, "learning_rate": 2.358792165262154e-06, "loss": 0.88862193, "num_input_tokens_seen": 739865, "router_z_loss_clip": 1.2421875, "router_z_loss_mlp": 7.9375, "step": 39, "time_per_iteration": 2.7330455780029297 }, { "auxiliary_loss_clip": 0.02214752, "auxiliary_loss_mlp": 0.01246089, "balance_loss_clip": 1.11658049, "balance_loss_mlp": 1.4347769, "epoch": 0.0024049301067187736, "flos": 11801539981440.0, "grad_norm": 5.9571024314955645, "language_loss": 0.90163505, "learning_rate": 2.3750930912143747e-06, "loss": 0.93624353, "num_input_tokens_seen": 755770, "router_z_loss_clip": 1.296875, "router_z_loss_mlp": 7.8125, "step": 40, "time_per_iteration": 2.6310997009277344 }, { "auxiliary_loss_clip": 0.02163602, "auxiliary_loss_mlp": 0.01272977, "balance_loss_clip": 1.15882254, "balance_loss_mlp": 1.42392838, "epoch": 0.0024650533593867426, "flos": 20631506688000.0, "grad_norm": 2.6202574550618176, "language_loss": 0.93157917, "learning_rate": 2.3909914837471044e-06, "loss": 0.96594501, "num_input_tokens_seen": 773440, "router_z_loss_clip": 1.140625, "router_z_loss_mlp": 7.40625, "step": 41, "time_per_iteration": 2.7227513790130615 }, { "auxiliary_loss_clip": 0.02127693, "auxiliary_loss_mlp": 0.01252035, "balance_loss_clip": 1.14703536, "balance_loss_mlp": 1.41595221, "epoch": 0.002525176612054712, "flos": 18406122549120.0, "grad_norm": 2.2801277547628804, "language_loss": 0.97434628, "learning_rate": 2.4065067449483835e-06, "loss": 1.00814354, "num_input_tokens_seen": 790455, "router_z_loss_clip": 1.046875, "router_z_loss_mlp": 7.125, "step": 42, "time_per_iteration": 2.6547205448150635 }, { "auxiliary_loss_clip": 0.02088515, "auxiliary_loss_mlp": 0.01296129, "balance_loss_clip": 1.18779171, "balance_loss_mlp": 1.41714573, "epoch": 0.0025852998647226816, "flos": 28184023128960.0, "grad_norm": 2.464098414136866, "language_loss": 0.97523272, "learning_rate": 2.4216569070848724e-06, "loss": 1.00907922, "num_input_tokens_seen": 810645, "router_z_loss_clip": 1.0859375, "router_z_loss_mlp": 6.71875, "step": 43, "time_per_iteration": 2.746276617050171 }, { "auxiliary_loss_clip": 0.02107279, "auxiliary_loss_mlp": 0.013103, "balance_loss_clip": 1.19652677, "balance_loss_mlp": 1.41514277, "epoch": 0.0026454231173906506, "flos": 14283110897280.0, "grad_norm": 2.4798839818697784, "language_loss": 0.93797463, "learning_rate": 2.4364587585915504e-06, "loss": 0.97215044, "num_input_tokens_seen": 827470, "router_z_loss_clip": 1.140625, "router_z_loss_mlp": 6.90625, "step": 44, "time_per_iteration": 2.651607036590576 }, { "auxiliary_loss_clip": 0.02062101, "auxiliary_loss_mlp": 0.01271251, "balance_loss_clip": 1.17135394, "balance_loss_mlp": 1.40930784, "epoch": 0.00270554637005862, "flos": 22419211605120.0, "grad_norm": 2.3480063497036263, "language_loss": 0.98771811, "learning_rate": 2.450927955901469e-06, "loss": 1.02105165, "num_input_tokens_seen": 847285, "router_z_loss_clip": 1.0, "router_z_loss_mlp": 6.53125, "step": 45, "time_per_iteration": 2.7254068851470947 }, { "auxiliary_loss_clip": 0.0203771, "auxiliary_loss_mlp": 0.01225503, "balance_loss_clip": 1.13738322, "balance_loss_mlp": 1.39589262, "epoch": 0.0027656696227265896, "flos": 23985778440960.0, "grad_norm": 1.869175502981157, "language_loss": 1.02652001, "learning_rate": 2.465079122983384e-06, "loss": 1.05915213, "num_input_tokens_seen": 867545, "router_z_loss_clip": 0.8828125, "router_z_loss_mlp": 6.40625, "step": 46, "time_per_iteration": 2.729715347290039 }, { "auxiliary_loss_clip": 0.02005585, "auxiliary_loss_mlp": 0.01269285, "balance_loss_clip": 1.17754138, "balance_loss_mlp": 1.38812184, "epoch": 0.0028257928753945586, "flos": 37669503087360.0, "grad_norm": 2.1373158371818093, "language_loss": 0.88059092, "learning_rate": 2.4789259401737868e-06, "loss": 0.91333961, "num_input_tokens_seen": 889915, "router_z_loss_clip": 0.91796875, "router_z_loss_mlp": 6.1875, "step": 47, "time_per_iteration": 2.8143527507781982 }, { "auxiliary_loss_clip": 0.0196521, "auxiliary_loss_mlp": 0.01251283, "balance_loss_clip": 1.16435516, "balance_loss_mlp": 1.37663555, "epoch": 0.002885916128062528, "flos": 22454547609600.0, "grad_norm": 2.0063247970426503, "language_loss": 0.87766618, "learning_rate": 2.492481223656015e-06, "loss": 0.90983117, "num_input_tokens_seen": 908975, "router_z_loss_clip": 0.8671875, "router_z_loss_mlp": 5.875, "step": 48, "time_per_iteration": 2.6887054443359375 }, { "auxiliary_loss_clip": 0.01964084, "auxiliary_loss_mlp": 0.01241283, "balance_loss_clip": 1.14882398, "balance_loss_mlp": 1.36585879, "epoch": 0.0029460393807304976, "flos": 27012796358400.0, "grad_norm": 2.2580966190295593, "language_loss": 0.89735377, "learning_rate": 2.5057569967437924e-06, "loss": 0.92940748, "num_input_tokens_seen": 929810, "router_z_loss_clip": 0.92578125, "router_z_loss_mlp": 6.0, "step": 49, "time_per_iteration": 2.702658176422119 }, { "auxiliary_loss_clip": 0.01955703, "auxiliary_loss_mlp": 0.01229356, "balance_loss_clip": 1.14357328, "balance_loss_mlp": 1.35999548, "epoch": 0.0030061626333984666, "flos": 15851832549120.0, "grad_norm": 2.2852168867281004, "language_loss": 0.90764284, "learning_rate": 2.51876455396287e-06, "loss": 0.93949342, "num_input_tokens_seen": 948650, "router_z_loss_clip": 0.85546875, "router_z_loss_mlp": 5.96875, "step": 50, "time_per_iteration": 2.6902081966400146 }, { "auxiliary_loss_clip": 0.01951803, "auxiliary_loss_mlp": 0.01195155, "balance_loss_clip": 1.11309123, "balance_loss_mlp": 1.36428452, "epoch": 0.003066285886066436, "flos": 31827052316160.0, "grad_norm": 3.6361185802098306, "language_loss": 0.87046158, "learning_rate": 2.5315145187866316e-06, "loss": 0.90193117, "num_input_tokens_seen": 966455, "router_z_loss_clip": 0.8203125, "router_z_loss_mlp": 5.875, "step": 51, "time_per_iteration": 2.739928722381592 }, { "auxiliary_loss_clip": 0.01905451, "auxiliary_loss_mlp": 0.01202788, "balance_loss_clip": 1.12329912, "balance_loss_mlp": 1.35297322, "epoch": 0.0031264091387344056, "flos": 41427482774400.0, "grad_norm": 2.043789539131404, "language_loss": 0.95093298, "learning_rate": 2.5440168957651953e-06, "loss": 0.98201537, "num_input_tokens_seen": 988110, "router_z_loss_clip": 0.796875, "router_z_loss_mlp": 5.53125, "step": 52, "time_per_iteration": 2.854283571243286 }, { "auxiliary_loss_clip": 0.01900722, "auxiliary_loss_mlp": 0.0123614, "balance_loss_clip": 1.15646088, "balance_loss_mlp": 1.34913778, "epoch": 0.0031865323914023747, "flos": 23440941970560.0, "grad_norm": 3.712964090580492, "language_loss": 0.92187595, "learning_rate": 2.5562811176888872e-06, "loss": 0.95324457, "num_input_tokens_seen": 1008550, "router_z_loss_clip": 0.796875, "router_z_loss_mlp": 5.53125, "step": 53, "time_per_iteration": 2.6874890327453613 }, { "auxiliary_loss_clip": 0.0188845, "auxiliary_loss_mlp": 0.01189503, "balance_loss_clip": 1.10910869, "balance_loss_mlp": 1.35091841, "epoch": 0.003246655644070344, "flos": 14429195510400.0, "grad_norm": 2.4786472055640054, "language_loss": 0.82612383, "learning_rate": 2.5683160883431093e-06, "loss": 0.85690331, "num_input_tokens_seen": 1026840, "router_z_loss_clip": 0.8046875, "router_z_loss_mlp": 5.375, "step": 54, "time_per_iteration": 2.648531675338745 }, { "auxiliary_loss_clip": 0.01884502, "auxiliary_loss_mlp": 0.012044, "balance_loss_clip": 1.12538838, "balance_loss_mlp": 1.34077334, "epoch": 0.0033067788967383136, "flos": 35918247496320.0, "grad_norm": 2.6561604163832073, "language_loss": 0.81207103, "learning_rate": 2.580130221340046e-06, "loss": 0.84296006, "num_input_tokens_seen": 1048875, "router_z_loss_clip": 0.7890625, "router_z_loss_mlp": 5.4375, "step": 55, "time_per_iteration": 2.8001625537872314 }, { "auxiliary_loss_clip": 0.01868537, "auxiliary_loss_mlp": 0.01197689, "balance_loss_clip": 1.11848617, "balance_loss_mlp": 1.33241689, "epoch": 0.003366902149406283, "flos": 22958732862720.0, "grad_norm": 4.210418445241831, "language_loss": 0.86982387, "learning_rate": 2.5917314754514246e-06, "loss": 0.90048611, "num_input_tokens_seen": 1066435, "router_z_loss_clip": 0.7890625, "router_z_loss_mlp": 5.375, "step": 56, "time_per_iteration": 2.700359582901001 }, { "auxiliary_loss_clip": 0.01866949, "auxiliary_loss_mlp": 0.01160718, "balance_loss_clip": 1.08718956, "balance_loss_mlp": 1.32464385, "epoch": 0.003427025402074252, "flos": 26582838560640.0, "grad_norm": 2.0165589555805568, "language_loss": 0.92792726, "learning_rate": 2.6031273868139713e-06, "loss": 0.95820397, "num_input_tokens_seen": 1090330, "router_z_loss_clip": 0.734375, "router_z_loss_mlp": 5.4375, "step": 57, "time_per_iteration": 2.7842140197753906 }, { "auxiliary_loss_clip": 0.01835354, "auxiliary_loss_mlp": 0.01212226, "balance_loss_clip": 1.13960361, "balance_loss_mlp": 1.33106017, "epoch": 0.0034871486547422216, "flos": 23951196622080.0, "grad_norm": 2.1194453008948257, "language_loss": 0.99723607, "learning_rate": 2.614325098333948e-06, "loss": 1.02771187, "num_input_tokens_seen": 1109840, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 5.0625, "step": 58, "time_per_iteration": 2.6832425594329834 }, { "auxiliary_loss_clip": 0.01818008, "auxiliary_loss_mlp": 0.01192592, "balance_loss_clip": 1.12159109, "balance_loss_mlp": 1.31774092, "epoch": 0.003547271907410191, "flos": 21214983214080.0, "grad_norm": 5.3575668918727315, "language_loss": 0.88016164, "learning_rate": 2.625331386578098e-06, "loss": 0.91026759, "num_input_tokens_seen": 1128415, "router_z_loss_clip": 0.7109375, "router_z_loss_mlp": 5.0, "step": 59, "time_per_iteration": 5.972346067428589 }, { "auxiliary_loss_clip": 0.01838309, "auxiliary_loss_mlp": 0.0115739, "balance_loss_clip": 1.08462429, "balance_loss_mlp": 1.32489634, "epoch": 0.00360739516007816, "flos": 16504903676160.0, "grad_norm": 2.337198862646033, "language_loss": 0.93331194, "learning_rate": 2.63615268640451e-06, "loss": 0.96326888, "num_input_tokens_seen": 1146515, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 5.125, "step": 60, "time_per_iteration": 2.6535885334014893 }, { "auxiliary_loss_clip": 0.01815624, "auxiliary_loss_mlp": 0.0117173, "balance_loss_clip": 1.10359001, "balance_loss_mlp": 1.30899191, "epoch": 0.0036675184127461296, "flos": 19464805031040.0, "grad_norm": 2.6834969936612243, "language_loss": 0.89743686, "learning_rate": 2.6467951135575943e-06, "loss": 0.92731047, "num_input_tokens_seen": 1166330, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 5.0625, "step": 61, "time_per_iteration": 2.6490519046783447 }, { "auxiliary_loss_clip": 0.01800153, "auxiliary_loss_mlp": 0.01141077, "balance_loss_clip": 1.07427204, "balance_loss_mlp": 1.30626488, "epoch": 0.003727641665414099, "flos": 20957323979520.0, "grad_norm": 2.276308563247295, "language_loss": 0.88411087, "learning_rate": 2.657264485425803e-06, "loss": 0.9135232, "num_input_tokens_seen": 1186010, "router_z_loss_clip": 0.66796875, "router_z_loss_mlp": 4.9375, "step": 62, "time_per_iteration": 2.641284942626953 }, { "auxiliary_loss_clip": 0.01781494, "auxiliary_loss_mlp": 0.01162032, "balance_loss_clip": 1.09217548, "balance_loss_mlp": 1.29768777, "epoch": 0.003787764918082068, "flos": 18406050721920.0, "grad_norm": 2.0413596435641765, "language_loss": 0.96253467, "learning_rate": 2.6675663401385186e-06, "loss": 0.99196994, "num_input_tokens_seen": 1204985, "router_z_loss_clip": 0.6953125, "router_z_loss_mlp": 4.84375, "step": 63, "time_per_iteration": 2.626486301422119 }, { "auxiliary_loss_clip": 0.0179054, "auxiliary_loss_mlp": 0.01169598, "balance_loss_clip": 1.1030314, "balance_loss_mlp": 1.30553293, "epoch": 0.0038478881707500376, "flos": 12459243962880.0, "grad_norm": 6.557327599990323, "language_loss": 0.98783666, "learning_rate": 2.677705954159056e-06, "loss": 1.01743805, "num_input_tokens_seen": 1223545, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 4.84375, "step": 64, "time_per_iteration": 2.638758420944214 }, { "auxiliary_loss_clip": 0.01799421, "auxiliary_loss_mlp": 0.01149467, "balance_loss_clip": 1.08151793, "balance_loss_mlp": 1.30568302, "epoch": 0.003908011423418007, "flos": 13553334276480.0, "grad_norm": 2.2053069793656963, "language_loss": 0.85475338, "learning_rate": 2.6876883585136904e-06, "loss": 0.8842423, "num_input_tokens_seen": 1241175, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 4.9375, "step": 65, "time_per_iteration": 2.6252458095550537 }, { "auxiliary_loss_clip": 0.01776163, "auxiliary_loss_mlp": 0.01155685, "balance_loss_clip": 1.08816504, "balance_loss_mlp": 1.29325438, "epoch": 0.003968134676085976, "flos": 18333475292160.0, "grad_norm": 2.3829536057424803, "language_loss": 0.85111558, "learning_rate": 2.697518353781685e-06, "loss": 0.88043404, "num_input_tokens_seen": 1259315, "router_z_loss_clip": 0.67578125, "router_z_loss_mlp": 4.8125, "step": 66, "time_per_iteration": 2.6503427028656006 }, { "auxiliary_loss_clip": 0.01778943, "auxiliary_loss_mlp": 0.01153198, "balance_loss_clip": 1.07776237, "balance_loss_mlp": 1.29076433, "epoch": 0.004028257928753946, "flos": 20485242506880.0, "grad_norm": 3.4001704241207116, "language_loss": 0.96182454, "learning_rate": 2.7072005239581103e-06, "loss": 0.99114591, "num_input_tokens_seen": 1277055, "router_z_loss_clip": 0.75390625, "router_z_loss_mlp": 4.875, "step": 67, "time_per_iteration": 2.6376609802246094 }, { "auxiliary_loss_clip": 0.01751533, "auxiliary_loss_mlp": 0.01156037, "balance_loss_clip": 1.08360553, "balance_loss_mlp": 1.28410339, "epoch": 0.004088381181421915, "flos": 18843837684480.0, "grad_norm": 2.065819005908995, "language_loss": 0.94427085, "learning_rate": 2.7167392492896727e-06, "loss": 0.97334659, "num_input_tokens_seen": 1294355, "router_z_loss_clip": 0.7265625, "router_z_loss_mlp": 4.6875, "step": 68, "time_per_iteration": 2.583876371383667 }, { "auxiliary_loss_clip": 0.01745356, "auxiliary_loss_mlp": 0.01157114, "balance_loss_clip": 1.08701873, "balance_loss_mlp": 1.28202391, "epoch": 0.004148504434089885, "flos": 19427817000960.0, "grad_norm": 2.2753582177945444, "language_loss": 0.95807081, "learning_rate": 2.7261387181735195e-06, "loss": 0.98709553, "num_input_tokens_seen": 1313525, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 4.625, "step": 69, "time_per_iteration": 2.664411783218384 }, { "auxiliary_loss_clip": 0.01741004, "auxiliary_loss_mlp": 0.0115858, "balance_loss_clip": 1.09344387, "balance_loss_mlp": 1.2855711, "epoch": 0.004208627686757853, "flos": 20811023884800.0, "grad_norm": 2.240590709957607, "language_loss": 0.97951424, "learning_rate": 2.7354029381999196e-06, "loss": 1.00851011, "num_input_tokens_seen": 1330505, "router_z_loss_clip": 0.65234375, "router_z_loss_mlp": 4.5625, "step": 70, "time_per_iteration": 2.599648952484131 }, { "auxiliary_loss_clip": 0.01746239, "auxiliary_loss_mlp": 0.01147959, "balance_loss_clip": 1.07824576, "balance_loss_mlp": 1.27523732, "epoch": 0.004268750939425823, "flos": 19098623831040.0, "grad_norm": 3.1469495897954123, "language_loss": 0.93982041, "learning_rate": 2.7445357464116983e-06, "loss": 0.96876234, "num_input_tokens_seen": 1349615, "router_z_loss_clip": 0.6953125, "router_z_loss_mlp": 4.71875, "step": 71, "time_per_iteration": 2.6172404289245605 }, { "auxiliary_loss_clip": 0.01814978, "auxiliary_loss_mlp": 0.01344053, "balance_loss_clip": 1.3038075, "balance_loss_mlp": 1.43948507, "epoch": 0.004328874192093792, "flos": 52439635514880.0, "grad_norm": 2.4372991147919154, "language_loss": 0.65723848, "learning_rate": 2.75354081884615e-06, "loss": 0.68882883, "num_input_tokens_seen": 1410275, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 3.75, "step": 72, "time_per_iteration": 3.278514862060547 }, { "auxiliary_loss_clip": 0.01799137, "auxiliary_loss_mlp": 0.01301496, "balance_loss_clip": 1.2610606, "balance_loss_mlp": 1.43354785, "epoch": 0.004388997444761762, "flos": 66473239564800.0, "grad_norm": 2.268398500126119, "language_loss": 0.63763267, "learning_rate": 2.7624216794188286e-06, "loss": 0.668639, "num_input_tokens_seen": 1473020, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 3.65625, "step": 73, "time_per_iteration": 3.7629637718200684 }, { "auxiliary_loss_clip": 0.01720904, "auxiliary_loss_mlp": 0.01142466, "balance_loss_clip": 1.0741353, "balance_loss_mlp": 1.26690948, "epoch": 0.004449120697429731, "flos": 18952970181120.0, "grad_norm": 2.4939124185642023, "language_loss": 0.86179161, "learning_rate": 2.771181708202938e-06, "loss": 0.89042532, "num_input_tokens_seen": 1490385, "router_z_loss_clip": 0.68359375, "router_z_loss_mlp": 4.53125, "step": 74, "time_per_iteration": 2.621570110321045 }, { "auxiliary_loss_clip": 0.01723202, "auxiliary_loss_mlp": 0.01158989, "balance_loss_clip": 1.0897522, "balance_loss_mlp": 1.26562166, "epoch": 0.004509243950097701, "flos": 21105491581440.0, "grad_norm": 9.432365828431832, "language_loss": 0.96796864, "learning_rate": 2.779824149153005e-06, "loss": 0.99679053, "num_input_tokens_seen": 1509725, "router_z_loss_clip": 0.6953125, "router_z_loss_mlp": 4.5625, "step": 75, "time_per_iteration": 2.5959765911102295 }, { "auxiliary_loss_clip": 0.01703303, "auxiliary_loss_mlp": 0.01143125, "balance_loss_clip": 1.07693958, "balance_loss_mlp": 1.26235557, "epoch": 0.004569367202765669, "flos": 20698730991360.0, "grad_norm": 2.015175402063309, "language_loss": 0.87647915, "learning_rate": 2.788352117317012e-06, "loss": 0.90494347, "num_input_tokens_seen": 1527245, "router_z_loss_clip": 0.66015625, "router_z_loss_mlp": 4.40625, "step": 76, "time_per_iteration": 2.6204538345336914 }, { "auxiliary_loss_clip": 0.017042, "auxiliary_loss_mlp": 0.01142026, "balance_loss_clip": 1.07236052, "balance_loss_mlp": 1.26151299, "epoch": 0.004629490455433639, "flos": 28658474899200.0, "grad_norm": 1.907392085917649, "language_loss": 0.91641521, "learning_rate": 2.796768605577095e-06, "loss": 0.94487751, "num_input_tokens_seen": 1548930, "router_z_loss_clip": 0.6953125, "router_z_loss_mlp": 4.4375, "step": 77, "time_per_iteration": 2.699547052383423 }, { "auxiliary_loss_clip": 0.01694542, "auxiliary_loss_mlp": 0.01170283, "balance_loss_clip": 1.09952044, "balance_loss_mlp": 1.26257372, "epoch": 0.004689613708101608, "flos": 11072409805440.0, "grad_norm": 2.2287946871635604, "language_loss": 0.92074168, "learning_rate": 2.80507649095533e-06, "loss": 0.94938993, "num_input_tokens_seen": 1565695, "router_z_loss_clip": 0.70703125, "router_z_loss_mlp": 4.3125, "step": 78, "time_per_iteration": 2.6152431964874268 }, { "auxiliary_loss_clip": 0.01691339, "auxiliary_loss_mlp": 0.0115804, "balance_loss_clip": 1.08928013, "balance_loss_mlp": 1.25771534, "epoch": 0.004749736960769578, "flos": 21799106184960.0, "grad_norm": 2.260081396034204, "language_loss": 0.82567471, "learning_rate": 2.813278540517843e-06, "loss": 0.85416853, "num_input_tokens_seen": 1582625, "router_z_loss_clip": 0.6875, "router_z_loss_mlp": 4.34375, "step": 79, "time_per_iteration": 2.6257522106170654 }, { "auxiliary_loss_clip": 0.01704235, "auxiliary_loss_mlp": 0.01134297, "balance_loss_clip": 1.06396317, "balance_loss_mlp": 1.26015639, "epoch": 0.004809860213437547, "flos": 19792597570560.0, "grad_norm": 3.2008013993021445, "language_loss": 0.9144851, "learning_rate": 2.8213774169075505e-06, "loss": 0.94287038, "num_input_tokens_seen": 1601725, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 4.4375, "step": 80, "time_per_iteration": 2.6655619144439697 }, { "auxiliary_loss_clip": 0.01674351, "auxiliary_loss_mlp": 0.01142962, "balance_loss_clip": 1.07253289, "balance_loss_mlp": 1.25332332, "epoch": 0.004869983466105517, "flos": 26574327037440.0, "grad_norm": 2.04027888675451, "language_loss": 0.95258987, "learning_rate": 2.829375683533245e-06, "loss": 0.98076302, "num_input_tokens_seen": 1622420, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 4.21875, "step": 81, "time_per_iteration": 2.702755928039551 }, { "auxiliary_loss_clip": 0.01689156, "auxiliary_loss_mlp": 0.01146, "balance_loss_clip": 1.07976711, "balance_loss_mlp": 1.25740814, "epoch": 0.004930106718773485, "flos": 12823378087680.0, "grad_norm": 3.28636160263665, "language_loss": 0.96280622, "learning_rate": 2.8372758094402803e-06, "loss": 0.99115783, "num_input_tokens_seen": 1640715, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 4.3125, "step": 82, "time_per_iteration": 2.647310256958008 }, { "auxiliary_loss_clip": 0.01672832, "auxiliary_loss_mlp": 0.01160878, "balance_loss_clip": 1.09068704, "balance_loss_mlp": 1.24652946, "epoch": 0.004990229971441455, "flos": 25774919902080.0, "grad_norm": 1.8914322573241695, "language_loss": 0.86576384, "learning_rate": 2.84508017388607e-06, "loss": 0.89410096, "num_input_tokens_seen": 1662210, "router_z_loss_clip": 0.703125, "router_z_loss_mlp": 4.25, "step": 83, "time_per_iteration": 2.67777681350708 }, { "auxiliary_loss_clip": 0.01665852, "auxiliary_loss_mlp": 0.01156913, "balance_loss_clip": 1.08629334, "balance_loss_mlp": 1.24767256, "epoch": 0.005050353224109424, "flos": 17457254922240.0, "grad_norm": 2.3648909701895158, "language_loss": 0.91776752, "learning_rate": 2.852791070641559e-06, "loss": 0.94599509, "num_input_tokens_seen": 1681070, "router_z_loss_clip": 0.70703125, "router_z_loss_mlp": 4.1875, "step": 84, "time_per_iteration": 2.6318089962005615 }, { "auxiliary_loss_clip": 0.01656594, "auxiliary_loss_mlp": 0.01230431, "balance_loss_clip": 1.19514501, "balance_loss_mlp": 1.36215615, "epoch": 0.005110476476777394, "flos": 69805460367360.0, "grad_norm": 1.4312636888550874, "language_loss": 0.62603879, "learning_rate": 2.8604107120381682e-06, "loss": 0.65490901, "num_input_tokens_seen": 1747140, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 2.953125, "step": 85, "time_per_iteration": 3.242306709289551 }, { "auxiliary_loss_clip": 0.01653108, "auxiliary_loss_mlp": 0.01127501, "balance_loss_clip": 1.05683339, "balance_loss_mlp": 1.23753893, "epoch": 0.005170599729445363, "flos": 24790105739520.0, "grad_norm": 1.9163321567646565, "language_loss": 0.90995711, "learning_rate": 2.8679412327780482e-06, "loss": 0.93776321, "num_input_tokens_seen": 1767475, "router_z_loss_clip": 0.70703125, "router_z_loss_mlp": 4.15625, "step": 86, "time_per_iteration": 2.6620845794677734 }, { "auxiliary_loss_clip": 0.01656196, "auxiliary_loss_mlp": 0.0116228, "balance_loss_clip": 1.09118402, "balance_loss_mlp": 1.2436738, "epoch": 0.005230722982113333, "flos": 23258048895360.0, "grad_norm": 2.35093826776701, "language_loss": 0.82086504, "learning_rate": 2.8753846935240833e-06, "loss": 0.84904981, "num_input_tokens_seen": 1784980, "router_z_loss_clip": 0.7109375, "router_z_loss_mlp": 4.125, "step": 87, "time_per_iteration": 2.646270513534546 }, { "auxiliary_loss_clip": 0.01645073, "auxiliary_loss_mlp": 0.01157243, "balance_loss_clip": 1.08848357, "balance_loss_mlp": 1.24175143, "epoch": 0.005290846234781301, "flos": 16727909264640.0, "grad_norm": 2.1380032765855, "language_loss": 0.95745713, "learning_rate": 2.8827430842847267e-06, "loss": 0.98548031, "num_input_tokens_seen": 1803030, "router_z_loss_clip": 0.6875, "router_z_loss_mlp": 4.03125, "step": 88, "time_per_iteration": 2.598013162612915 }, { "auxiliary_loss_clip": 0.01661603, "auxiliary_loss_mlp": 0.01151178, "balance_loss_clip": 1.08346701, "balance_loss_mlp": 1.24086094, "epoch": 0.005350969487449271, "flos": 20886077352960.0, "grad_norm": 2.181064419522056, "language_loss": 0.8599754, "learning_rate": 2.8900183276075957e-06, "loss": 0.88810325, "num_input_tokens_seen": 1822865, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 4.21875, "step": 89, "time_per_iteration": 2.620514392852783 }, { "auxiliary_loss_clip": 0.01650773, "auxiliary_loss_mlp": 0.01130833, "balance_loss_clip": 1.06383777, "balance_loss_mlp": 1.23526454, "epoch": 0.00541109274011724, "flos": 26209977431040.0, "grad_norm": 2.351490067106702, "language_loss": 0.91511643, "learning_rate": 2.8972122815946455e-06, "loss": 0.94293249, "num_input_tokens_seen": 1842435, "router_z_loss_clip": 0.66796875, "router_z_loss_mlp": 4.15625, "step": 90, "time_per_iteration": 2.655165195465088 }, { "auxiliary_loss_clip": 0.01629708, "auxiliary_loss_mlp": 0.01133837, "balance_loss_clip": 1.06560111, "balance_loss_mlp": 1.2302717, "epoch": 0.00547121599278521, "flos": 21178569801600.0, "grad_norm": 2.382446254015029, "language_loss": 0.85918391, "learning_rate": 2.90432674275074e-06, "loss": 0.88681936, "num_input_tokens_seen": 1860065, "router_z_loss_clip": 0.68359375, "router_z_loss_mlp": 4.0, "step": 91, "time_per_iteration": 2.7135961055755615 }, { "auxiliary_loss_clip": 0.0162937, "auxiliary_loss_mlp": 0.01141148, "balance_loss_clip": 1.0744859, "balance_loss_mlp": 1.22587013, "epoch": 0.005531339245453179, "flos": 19718801078400.0, "grad_norm": 2.6159147208166535, "language_loss": 0.87102938, "learning_rate": 2.91136344867656e-06, "loss": 0.89873457, "num_input_tokens_seen": 1878135, "router_z_loss_clip": 0.66796875, "router_z_loss_mlp": 4.0625, "step": 92, "time_per_iteration": 2.6679303646087646 }, { "auxiliary_loss_clip": 0.0162123, "auxiliary_loss_mlp": 0.01177302, "balance_loss_clip": 1.10925698, "balance_loss_mlp": 1.21780264, "epoch": 0.005591462498121149, "flos": 17636089760640.0, "grad_norm": 9.811177093501167, "language_loss": 0.92058563, "learning_rate": 2.918324080615938e-06, "loss": 0.94857091, "num_input_tokens_seen": 1894895, "router_z_loss_clip": 0.6796875, "router_z_loss_mlp": 4.03125, "step": 93, "time_per_iteration": 2.6048171520233154 }, { "auxiliary_loss_clip": 0.01634028, "auxiliary_loss_mlp": 0.01152329, "balance_loss_clip": 1.08056521, "balance_loss_mlp": 1.22368491, "epoch": 0.005651585750789117, "flos": 20011221699840.0, "grad_norm": 2.162558797451376, "language_loss": 0.87273675, "learning_rate": 2.925210265866963e-06, "loss": 0.90060031, "num_input_tokens_seen": 1913220, "router_z_loss_clip": 0.71875, "router_z_loss_mlp": 4.125, "step": 94, "time_per_iteration": 2.6444578170776367 }, { "auxiliary_loss_clip": 0.0157273, "auxiliary_loss_mlp": 0.01065411, "balance_loss_clip": 1.03203201, "balance_loss_mlp": 1.3177104, "epoch": 0.005711709003457087, "flos": 59812957981440.0, "grad_norm": 1.3972786334990903, "language_loss": 0.6814115, "learning_rate": 2.932023580065507e-06, "loss": 0.70779288, "num_input_tokens_seen": 1970970, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 2.546875, "step": 95, "time_per_iteration": 3.114142894744873 }, { "auxiliary_loss_clip": 0.0161246, "auxiliary_loss_mlp": 0.01155043, "balance_loss_clip": 1.08656883, "balance_loss_mlp": 1.2127223, "epoch": 0.005771832256125056, "flos": 15559591495680.0, "grad_norm": 2.3329691410778373, "language_loss": 0.90340936, "learning_rate": 2.9387655493491906e-06, "loss": 0.93108439, "num_input_tokens_seen": 1988930, "router_z_loss_clip": 0.68359375, "router_z_loss_mlp": 4.0, "step": 96, "time_per_iteration": 2.6197524070739746 }, { "auxiliary_loss_clip": 0.01602854, "auxiliary_loss_mlp": 0.01141956, "balance_loss_clip": 1.07944286, "balance_loss_mlp": 1.21462727, "epoch": 0.005831955508793026, "flos": 22528380015360.0, "grad_norm": 3.5630150780182186, "language_loss": 0.89677739, "learning_rate": 2.9454376524092147e-06, "loss": 0.92422545, "num_input_tokens_seen": 2006285, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 3.890625, "step": 97, "time_per_iteration": 2.6580440998077393 }, { "auxiliary_loss_clip": 0.01591704, "auxiliary_loss_mlp": 0.01142383, "balance_loss_clip": 1.073385, "balance_loss_mlp": 1.20744586, "epoch": 0.005892078761460995, "flos": 22049834094720.0, "grad_norm": 2.007013545562774, "language_loss": 0.76595819, "learning_rate": 2.952041322436969e-06, "loss": 0.79329908, "num_input_tokens_seen": 2024905, "router_z_loss_clip": 0.6875, "router_z_loss_mlp": 3.84375, "step": 98, "time_per_iteration": 2.661900043487549 }, { "auxiliary_loss_clip": 0.01538532, "auxiliary_loss_mlp": 0.01040592, "balance_loss_clip": 1.00645018, "balance_loss_mlp": 1.29400969, "epoch": 0.005952202014128965, "flos": 68539143317760.0, "grad_norm": 1.0405348440148077, "language_loss": 0.65492129, "learning_rate": 2.9585779489718204e-06, "loss": 0.68071252, "num_input_tokens_seen": 2086220, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 2.4375, "step": 99, "time_per_iteration": 3.259221076965332 }, { "auxiliary_loss_clip": 0.01590709, "auxiliary_loss_mlp": 0.01148566, "balance_loss_clip": 1.07751703, "balance_loss_mlp": 1.20651639, "epoch": 0.006012325266796933, "flos": 22960887678720.0, "grad_norm": 2.7678394711144687, "language_loss": 0.909567, "learning_rate": 2.9650488796560464e-06, "loss": 0.93695974, "num_input_tokens_seen": 2103365, "router_z_loss_clip": 0.7109375, "router_z_loss_mlp": 3.84375, "step": 100, "time_per_iteration": 4.609083890914917 }, { "auxiliary_loss_clip": 0.01603357, "auxiliary_loss_mlp": 0.01151485, "balance_loss_clip": 1.08434606, "balance_loss_mlp": 1.20936906, "epoch": 0.006072448519464903, "flos": 17347942857600.0, "grad_norm": 3.854046683910609, "language_loss": 0.91064131, "learning_rate": 2.971455421902446e-06, "loss": 0.93818969, "num_input_tokens_seen": 2121995, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 3.9375, "step": 101, "time_per_iteration": 4.169233560562134 }, { "auxiliary_loss_clip": 0.01590893, "auxiliary_loss_mlp": 0.01151531, "balance_loss_clip": 1.08052993, "balance_loss_mlp": 1.20943415, "epoch": 0.006132571772132872, "flos": 24681116897280.0, "grad_norm": 2.1062010558227975, "language_loss": 0.90868771, "learning_rate": 2.9777988444798075e-06, "loss": 0.93611205, "num_input_tokens_seen": 2141815, "router_z_loss_clip": 0.7109375, "router_z_loss_mlp": 3.8125, "step": 102, "time_per_iteration": 2.6929092407226562 }, { "auxiliary_loss_clip": 0.01585761, "auxiliary_loss_mlp": 0.01136392, "balance_loss_clip": 1.07130349, "balance_loss_mlp": 1.20704806, "epoch": 0.006192695024800842, "flos": 21465675210240.0, "grad_norm": 3.8167502714140964, "language_loss": 0.87758744, "learning_rate": 2.9840803790210285e-06, "loss": 0.904809, "num_input_tokens_seen": 2161125, "router_z_loss_clip": 0.65234375, "router_z_loss_mlp": 3.78125, "step": 103, "time_per_iteration": 2.649204730987549 }, { "auxiliary_loss_clip": 0.01585901, "auxiliary_loss_mlp": 0.01138134, "balance_loss_clip": 1.07104254, "balance_loss_mlp": 1.20899677, "epoch": 0.006252818277468811, "flos": 17420410546560.0, "grad_norm": 1.899025339113099, "language_loss": 0.93727517, "learning_rate": 2.990301221458371e-06, "loss": 0.96451551, "num_input_tokens_seen": 2179510, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 3.765625, "step": 104, "time_per_iteration": 2.6155827045440674 }, { "auxiliary_loss_clip": 0.01575804, "auxiliary_loss_mlp": 0.01148034, "balance_loss_clip": 1.08347082, "balance_loss_mlp": 1.1988163, "epoch": 0.006312941530136781, "flos": 19099557584640.0, "grad_norm": 3.1406625984532184, "language_loss": 0.96472341, "learning_rate": 2.9964625333900544e-06, "loss": 0.99196184, "num_input_tokens_seen": 2197870, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 3.765625, "step": 105, "time_per_iteration": 2.689396619796753 }, { "auxiliary_loss_clip": 0.01575601, "auxiliary_loss_mlp": 0.01161366, "balance_loss_clip": 1.08979249, "balance_loss_mlp": 1.19915247, "epoch": 0.006373064782804749, "flos": 24060831909120.0, "grad_norm": 2.351789567863476, "language_loss": 0.87035739, "learning_rate": 3.002565443382063e-06, "loss": 0.89772707, "num_input_tokens_seen": 2217495, "router_z_loss_clip": 0.71484375, "router_z_loss_mlp": 3.765625, "step": 106, "time_per_iteration": 2.6870486736297607 }, { "auxiliary_loss_clip": 0.01559643, "auxiliary_loss_mlp": 0.0114484, "balance_loss_clip": 1.07636666, "balance_loss_mlp": 1.18594027, "epoch": 0.006433188035472719, "flos": 18332433797760.0, "grad_norm": 2.589260460232272, "language_loss": 0.83349192, "learning_rate": 3.008611048208843e-06, "loss": 0.86053669, "num_input_tokens_seen": 2236520, "router_z_loss_clip": 0.68359375, "router_z_loss_mlp": 3.734375, "step": 107, "time_per_iteration": 2.56502366065979 }, { "auxiliary_loss_clip": 0.01472695, "auxiliary_loss_mlp": 0.01034599, "balance_loss_clip": 1.00389087, "balance_loss_mlp": 1.25196552, "epoch": 0.006493311288140688, "flos": 62562387594240.0, "grad_norm": 0.9878231281239863, "language_loss": 0.647241, "learning_rate": 3.014600414036285e-06, "loss": 0.67231393, "num_input_tokens_seen": 2300140, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 2.203125, "step": 108, "time_per_iteration": 3.2481274604797363 }, { "auxiliary_loss_clip": 0.01550602, "auxiliary_loss_mlp": 0.01132231, "balance_loss_clip": 1.06289935, "balance_loss_mlp": 1.18802714, "epoch": 0.006553434540808658, "flos": 19500141035520.0, "grad_norm": 1.8639471692492782, "language_loss": 0.97735691, "learning_rate": 3.0205345775501937e-06, "loss": 1.0041852, "num_input_tokens_seen": 2317320, "router_z_loss_clip": 0.69140625, "router_z_loss_mlp": 3.625, "step": 109, "time_per_iteration": 2.5916998386383057 }, { "auxiliary_loss_clip": 0.01549201, "auxiliary_loss_mlp": 0.01144224, "balance_loss_clip": 1.07794356, "balance_loss_mlp": 1.18985844, "epoch": 0.006613557793476627, "flos": 21105132445440.0, "grad_norm": 1.6427825952822173, "language_loss": 0.84118265, "learning_rate": 3.0264145470332218e-06, "loss": 0.86811692, "num_input_tokens_seen": 2337820, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 3.59375, "step": 110, "time_per_iteration": 2.682626962661743 }, { "auxiliary_loss_clip": 0.01544942, "auxiliary_loss_mlp": 0.01151551, "balance_loss_clip": 1.08446026, "balance_loss_mlp": 1.18407512, "epoch": 0.006673681046144597, "flos": 26030747543040.0, "grad_norm": 2.088085539330413, "language_loss": 0.83018363, "learning_rate": 3.032241303393073e-06, "loss": 0.85714859, "num_input_tokens_seen": 2358560, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 3.609375, "step": 111, "time_per_iteration": 2.6620359420776367 }, { "auxiliary_loss_clip": 0.01545288, "auxiliary_loss_mlp": 0.01132966, "balance_loss_clip": 1.06907022, "balance_loss_mlp": 1.18676448, "epoch": 0.006733804298812566, "flos": 23147767163520.0, "grad_norm": 2.729871262794293, "language_loss": 0.93887258, "learning_rate": 3.0380158011446e-06, "loss": 0.96565503, "num_input_tokens_seen": 2379005, "router_z_loss_clip": 0.63671875, "router_z_loss_mlp": 3.59375, "step": 112, "time_per_iteration": 2.689648151397705 }, { "auxiliary_loss_clip": 0.01548575, "auxiliary_loss_mlp": 0.01136539, "balance_loss_clip": 1.07192779, "balance_loss_mlp": 1.18315542, "epoch": 0.006793927551480535, "flos": 11764444210560.0, "grad_norm": 2.5944991793678676, "language_loss": 0.79384077, "learning_rate": 3.0437389693482466e-06, "loss": 0.82069194, "num_input_tokens_seen": 2395610, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 3.65625, "step": 113, "time_per_iteration": 2.56445050239563 }, { "auxiliary_loss_clip": 0.01536212, "auxiliary_loss_mlp": 0.01136909, "balance_loss_clip": 1.07043815, "balance_loss_mlp": 1.17948818, "epoch": 0.006854050804148504, "flos": 19171953446400.0, "grad_norm": 2.233612364884071, "language_loss": 0.93529266, "learning_rate": 3.0494117125071475e-06, "loss": 0.96202385, "num_input_tokens_seen": 2415005, "router_z_loss_clip": 0.6640625, "router_z_loss_mlp": 3.5625, "step": 114, "time_per_iteration": 2.6220247745513916 }, { "auxiliary_loss_clip": 0.01543464, "auxiliary_loss_mlp": 0.01136686, "balance_loss_clip": 1.07636619, "balance_loss_mlp": 1.17936015, "epoch": 0.006914174056816474, "flos": 21981891519360.0, "grad_norm": 1.9469512975947585, "language_loss": 0.9464221, "learning_rate": 3.055034911425055e-06, "loss": 0.97322363, "num_input_tokens_seen": 2433965, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 3.640625, "step": 115, "time_per_iteration": 2.6085734367370605 }, { "auxiliary_loss_clip": 0.01537024, "auxiliary_loss_mlp": 0.01119394, "balance_loss_clip": 1.05216026, "balance_loss_mlp": 1.17648935, "epoch": 0.006974297309484443, "flos": 16289152634880.0, "grad_norm": 2.3925382403411284, "language_loss": 0.81913668, "learning_rate": 3.0606094240271244e-06, "loss": 0.84570086, "num_input_tokens_seen": 2451605, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 3.609375, "step": 116, "time_per_iteration": 2.6010639667510986 }, { "auxiliary_loss_clip": 0.01528766, "auxiliary_loss_mlp": 0.01125325, "balance_loss_clip": 1.06061864, "balance_loss_mlp": 1.17730904, "epoch": 0.007034420562152413, "flos": 26104005331200.0, "grad_norm": 2.256522195692769, "language_loss": 0.87939125, "learning_rate": 3.0661360861454656e-06, "loss": 0.90593219, "num_input_tokens_seen": 2472035, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 3.515625, "step": 117, "time_per_iteration": 2.6327316761016846 }, { "auxiliary_loss_clip": 0.01525579, "auxiliary_loss_mlp": 0.01147757, "balance_loss_clip": 1.08204913, "balance_loss_mlp": 1.17414165, "epoch": 0.007094543814820382, "flos": 14204609723520.0, "grad_norm": 3.1418270215027846, "language_loss": 0.84602773, "learning_rate": 3.071615712271274e-06, "loss": 0.87276113, "num_input_tokens_seen": 2489285, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 3.515625, "step": 118, "time_per_iteration": 2.6496846675872803 }, { "auxiliary_loss_clip": 0.0153661, "auxiliary_loss_mlp": 0.01159611, "balance_loss_clip": 1.09409368, "balance_loss_mlp": 1.17555571, "epoch": 0.007154667067488351, "flos": 14976007228800.0, "grad_norm": 4.485696022773183, "language_loss": 0.99400902, "learning_rate": 3.0770490962752172e-06, "loss": 1.0209713, "num_input_tokens_seen": 2506460, "router_z_loss_clip": 0.65625, "router_z_loss_mlp": 3.609375, "step": 119, "time_per_iteration": 2.611893892288208 }, { "auxiliary_loss_clip": 0.01539051, "auxiliary_loss_mlp": 0.01121805, "balance_loss_clip": 1.05886292, "balance_loss_mlp": 1.17223763, "epoch": 0.00721479032015632, "flos": 20193288762240.0, "grad_norm": 3.096945248868154, "language_loss": 0.89429474, "learning_rate": 3.082437012097686e-06, "loss": 0.92090333, "num_input_tokens_seen": 2525565, "router_z_loss_clip": 0.62890625, "router_z_loss_mlp": 3.671875, "step": 120, "time_per_iteration": 2.6107001304626465 }, { "auxiliary_loss_clip": 0.01523576, "auxiliary_loss_mlp": 0.01132726, "balance_loss_clip": 1.06811464, "balance_loss_mlp": 1.17244732, "epoch": 0.00727491357282429, "flos": 23147228459520.0, "grad_norm": 1.8747343481919831, "language_loss": 0.9333272, "learning_rate": 3.0877802144103967e-06, "loss": 0.95989025, "num_input_tokens_seen": 2546605, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 3.5, "step": 121, "time_per_iteration": 2.6304168701171875 }, { "auxiliary_loss_clip": 0.01524122, "auxiliary_loss_mlp": 0.01147062, "balance_loss_clip": 1.08407199, "balance_loss_mlp": 1.17273569, "epoch": 0.007335036825492259, "flos": 15521669712000.0, "grad_norm": 2.1788452057833756, "language_loss": 0.90428597, "learning_rate": 3.09307943925077e-06, "loss": 0.93099779, "num_input_tokens_seen": 2560730, "router_z_loss_clip": 0.62890625, "router_z_loss_mlp": 3.515625, "step": 122, "time_per_iteration": 2.570563316345215 }, { "auxiliary_loss_clip": 0.0151956, "auxiliary_loss_mlp": 0.01145897, "balance_loss_clip": 1.07847238, "balance_loss_mlp": 1.16684914, "epoch": 0.007395160078160229, "flos": 24243365848320.0, "grad_norm": 2.6015654584061485, "language_loss": 0.92713153, "learning_rate": 3.0983354046304154e-06, "loss": 0.95378613, "num_input_tokens_seen": 2579550, "router_z_loss_clip": 0.671875, "router_z_loss_mlp": 3.53125, "step": 123, "time_per_iteration": 2.610058069229126 }, { "auxiliary_loss_clip": 0.01519116, "auxiliary_loss_mlp": 0.01126735, "balance_loss_clip": 1.0638876, "balance_loss_mlp": 1.16311121, "epoch": 0.007455283330828198, "flos": 31759792099200.0, "grad_norm": 3.6218757270449546, "language_loss": 0.71191609, "learning_rate": 3.103548811118979e-06, "loss": 0.73837465, "num_input_tokens_seen": 2600390, "router_z_loss_clip": 0.62890625, "router_z_loss_mlp": 3.5625, "step": 124, "time_per_iteration": 2.6953225135803223 }, { "auxiliary_loss_clip": 0.01505405, "auxiliary_loss_mlp": 0.01125153, "balance_loss_clip": 1.06144738, "balance_loss_mlp": 1.16303468, "epoch": 0.007515406583496167, "flos": 26615157822720.0, "grad_norm": 2.885257486848377, "language_loss": 0.88281298, "learning_rate": 3.108720342404542e-06, "loss": 0.90911853, "num_input_tokens_seen": 2620770, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 3.421875, "step": 125, "time_per_iteration": 2.636683225631714 }, { "auxiliary_loss_clip": 0.01519701, "auxiliary_loss_mlp": 0.01143072, "balance_loss_clip": 1.08008146, "balance_loss_mlp": 1.1632216, "epoch": 0.007575529836164136, "flos": 18223696350720.0, "grad_norm": 3.428157412216112, "language_loss": 0.8219012, "learning_rate": 3.1138506658316945e-06, "loss": 0.84852886, "num_input_tokens_seen": 2639900, "router_z_loss_clip": 0.62890625, "router_z_loss_mlp": 3.5625, "step": 126, "time_per_iteration": 2.621777296066284 }, { "auxiliary_loss_clip": 0.01514184, "auxiliary_loss_mlp": 0.01139718, "balance_loss_clip": 1.07796788, "balance_loss_mlp": 1.16274285, "epoch": 0.007635653088832106, "flos": 21580410228480.0, "grad_norm": 3.25168914439579, "language_loss": 0.67548168, "learning_rate": 3.1189404329183404e-06, "loss": 0.70202065, "num_input_tokens_seen": 2657450, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 3.515625, "step": 127, "time_per_iteration": 2.597898483276367 }, { "auxiliary_loss_clip": 0.01501819, "auxiliary_loss_mlp": 0.01132467, "balance_loss_clip": 1.0689044, "balance_loss_mlp": 1.16415679, "epoch": 0.007695776341500075, "flos": 25375054723200.0, "grad_norm": 2.8540355839200546, "language_loss": 0.88139397, "learning_rate": 3.1239902798522317e-06, "loss": 0.90773684, "num_input_tokens_seen": 2678150, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 3.375, "step": 128, "time_per_iteration": 2.7277519702911377 }, { "auxiliary_loss_clip": 0.01504399, "auxiliary_loss_mlp": 0.0114023, "balance_loss_clip": 1.07704926, "balance_loss_mlp": 1.15937424, "epoch": 0.007755899594168045, "flos": 22343906741760.0, "grad_norm": 1.751533972799487, "language_loss": 0.84580743, "learning_rate": 3.129000827968184e-06, "loss": 0.87225366, "num_input_tokens_seen": 2698290, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 3.453125, "step": 129, "time_per_iteration": 2.6072933673858643 }, { "auxiliary_loss_clip": 0.01498275, "auxiliary_loss_mlp": 0.01134766, "balance_loss_clip": 1.07149005, "balance_loss_mlp": 1.15838289, "epoch": 0.007816022846836013, "flos": 22638230784000.0, "grad_norm": 2.5930800109063745, "language_loss": 0.97494572, "learning_rate": 3.133972684206866e-06, "loss": 1.00127614, "num_input_tokens_seen": 2717630, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 3.40625, "step": 130, "time_per_iteration": 2.6049721240997314 }, { "auxiliary_loss_clip": 0.01491706, "auxiliary_loss_mlp": 0.01133773, "balance_loss_clip": 1.06940031, "balance_loss_mlp": 1.15546036, "epoch": 0.007876146099503984, "flos": 18182901479040.0, "grad_norm": 1.8260644898892548, "language_loss": 0.8261897, "learning_rate": 3.138906441556014e-06, "loss": 0.85244447, "num_input_tokens_seen": 2735835, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 3.375, "step": 131, "time_per_iteration": 2.5464224815368652 }, { "auxiliary_loss_clip": 0.01500447, "auxiliary_loss_mlp": 0.01128042, "balance_loss_clip": 1.06691134, "balance_loss_mlp": 1.15809488, "epoch": 0.007936269352171952, "flos": 27119486730240.0, "grad_norm": 5.794643856128801, "language_loss": 0.82573891, "learning_rate": 3.143802679474861e-06, "loss": 0.85202372, "num_input_tokens_seen": 2756335, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 3.421875, "step": 132, "time_per_iteration": 2.655134916305542 }, { "auxiliary_loss_clip": 0.0149199, "auxiliary_loss_mlp": 0.01129965, "balance_loss_clip": 1.06850123, "balance_loss_mlp": 1.15276921, "epoch": 0.007996392604839923, "flos": 19026335710080.0, "grad_norm": 2.39056446589385, "language_loss": 0.95289159, "learning_rate": 3.1486619643025565e-06, "loss": 0.97911114, "num_input_tokens_seen": 2775090, "router_z_loss_clip": 0.61328125, "router_z_loss_mlp": 3.390625, "step": 133, "time_per_iteration": 2.6257967948913574 }, { "auxiliary_loss_clip": 0.01487947, "auxiliary_loss_mlp": 0.01128634, "balance_loss_clip": 1.06850517, "balance_loss_mlp": 1.16096163, "epoch": 0.008056515857507891, "flos": 25484151306240.0, "grad_norm": 1.6775657510836985, "language_loss": 0.73201722, "learning_rate": 3.153484849651286e-06, "loss": 0.75818306, "num_input_tokens_seen": 2795320, "router_z_loss_clip": 0.6015625, "router_z_loss_mlp": 3.265625, "step": 134, "time_per_iteration": 2.6783180236816406 }, { "auxiliary_loss_clip": 0.01484144, "auxiliary_loss_mlp": 0.01130998, "balance_loss_clip": 1.06657743, "balance_loss_mlp": 1.14953518, "epoch": 0.00811663911017586, "flos": 20557566541440.0, "grad_norm": 2.771130986455974, "language_loss": 0.88904142, "learning_rate": 3.1582718767847806e-06, "loss": 0.91519284, "num_input_tokens_seen": 2812815, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 3.34375, "step": 135, "time_per_iteration": 2.5667669773101807 }, { "auxiliary_loss_clip": 0.01487414, "auxiliary_loss_mlp": 0.01132348, "balance_loss_clip": 1.06730747, "balance_loss_mlp": 1.15387917, "epoch": 0.00817676236284383, "flos": 18799738761600.0, "grad_norm": 1.9728502808059323, "language_loss": 0.89111102, "learning_rate": 3.1630235749828485e-06, "loss": 0.91730869, "num_input_tokens_seen": 2830445, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 3.34375, "step": 136, "time_per_iteration": 2.556986093521118 }, { "auxiliary_loss_clip": 0.01483813, "auxiliary_loss_mlp": 0.01109925, "balance_loss_clip": 1.05003417, "balance_loss_mlp": 1.14822447, "epoch": 0.008236885615511799, "flos": 23873593288320.0, "grad_norm": 2.517938352077064, "language_loss": 0.84191978, "learning_rate": 3.1677404618925676e-06, "loss": 0.8678571, "num_input_tokens_seen": 2846965, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 3.34375, "step": 137, "time_per_iteration": 2.6207714080810547 }, { "auxiliary_loss_clip": 0.01480776, "auxiliary_loss_mlp": 0.01118739, "balance_loss_clip": 1.05880094, "balance_loss_mlp": 1.14796519, "epoch": 0.00829700886817977, "flos": 24643626076800.0, "grad_norm": 2.172000343357505, "language_loss": 0.90245324, "learning_rate": 3.1724230438666953e-06, "loss": 0.92844832, "num_input_tokens_seen": 2867520, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 3.34375, "step": 138, "time_per_iteration": 2.6559319496154785 }, { "auxiliary_loss_clip": 0.01470341, "auxiliary_loss_mlp": 0.01123348, "balance_loss_clip": 1.05916619, "balance_loss_mlp": 1.14715683, "epoch": 0.008357132120847738, "flos": 25262007644160.0, "grad_norm": 2.255941163921244, "language_loss": 0.91384262, "learning_rate": 3.177071816289865e-06, "loss": 0.93977946, "num_input_tokens_seen": 2885675, "router_z_loss_clip": 0.640625, "router_z_loss_mlp": 3.234375, "step": 139, "time_per_iteration": 2.595163345336914 }, { "auxiliary_loss_clip": 0.01487268, "auxiliary_loss_mlp": 0.01123098, "balance_loss_clip": 1.06048894, "balance_loss_mlp": 1.15402102, "epoch": 0.008417255373515706, "flos": 27344898529920.0, "grad_norm": 4.417191251635987, "language_loss": 0.8552348, "learning_rate": 3.181687263893095e-06, "loss": 0.88133848, "num_input_tokens_seen": 2905960, "router_z_loss_clip": 0.625, "router_z_loss_mlp": 3.328125, "step": 140, "time_per_iteration": 2.662053346633911 }, { "auxiliary_loss_clip": 0.01473174, "auxiliary_loss_mlp": 0.01123302, "balance_loss_clip": 1.06245756, "balance_loss_mlp": 1.14820397, "epoch": 0.008477378626183677, "flos": 17639070589440.0, "grad_norm": 2.916505170980686, "language_loss": 0.84068793, "learning_rate": 3.186269861057098e-06, "loss": 0.86665273, "num_input_tokens_seen": 2922780, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 3.25, "step": 141, "time_per_iteration": 2.538428544998169 }, { "auxiliary_loss_clip": 0.01478065, "auxiliary_loss_mlp": 0.01133532, "balance_loss_clip": 1.07168627, "balance_loss_mlp": 1.14585233, "epoch": 0.008537501878851645, "flos": 13881342297600.0, "grad_norm": 2.242389299265565, "language_loss": 0.81446183, "learning_rate": 3.1908200721048745e-06, "loss": 0.84057772, "num_input_tokens_seen": 2938765, "router_z_loss_clip": 0.6171875, "router_z_loss_mlp": 3.3125, "step": 142, "time_per_iteration": 4.117454528808594 }, { "auxiliary_loss_clip": 0.01384295, "auxiliary_loss_mlp": 0.01027478, "balance_loss_clip": 1.00172842, "balance_loss_mlp": 1.20230532, "epoch": 0.008597625131519616, "flos": 71248101281280.0, "grad_norm": 1.0528641895945292, "language_loss": 0.66911113, "learning_rate": 3.195338351584042e-06, "loss": 0.69322884, "num_input_tokens_seen": 3006665, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.8203125, "step": 143, "time_per_iteration": 4.944696664810181 }, { "auxiliary_loss_clip": 0.01468882, "auxiliary_loss_mlp": 0.01125959, "balance_loss_clip": 1.06482887, "balance_loss_mlp": 1.14547014, "epoch": 0.008657748384187584, "flos": 17602836744960.0, "grad_norm": 2.240080778348313, "language_loss": 0.8420589, "learning_rate": 3.1998251445393258e-06, "loss": 0.8680073, "num_input_tokens_seen": 3024335, "router_z_loss_clip": 0.609375, "router_z_loss_mlp": 3.234375, "step": 144, "time_per_iteration": 3.075165033340454 }, { "auxiliary_loss_clip": 0.01455361, "auxiliary_loss_mlp": 0.01113608, "balance_loss_clip": 1.05033159, "balance_loss_mlp": 1.13824272, "epoch": 0.008717871636855555, "flos": 19715317459200.0, "grad_norm": 2.197977383254954, "language_loss": 0.88446194, "learning_rate": 3.204280886775619e-06, "loss": 0.9101516, "num_input_tokens_seen": 3043300, "router_z_loss_clip": 0.6328125, "router_z_loss_mlp": 3.171875, "step": 145, "time_per_iteration": 2.5826945304870605 }, { "auxiliary_loss_clip": 0.01473524, "auxiliary_loss_mlp": 0.01127568, "balance_loss_clip": 1.06376755, "balance_loss_mlp": 1.14138937, "epoch": 0.008777994889523523, "flos": 24717422568960.0, "grad_norm": 2.3371943269057365, "language_loss": 0.86005169, "learning_rate": 3.208706005112005e-06, "loss": 0.88606262, "num_input_tokens_seen": 3064610, "router_z_loss_clip": 0.63671875, "router_z_loss_mlp": 3.3125, "step": 146, "time_per_iteration": 2.643402099609375 }, { "auxiliary_loss_clip": 0.01368896, "auxiliary_loss_mlp": 0.01025024, "balance_loss_clip": 0.99984688, "balance_loss_mlp": 1.19047809, "epoch": 0.008838118142191492, "flos": 70132067758080.0, "grad_norm": 0.8604502878041347, "language_loss": 0.60152549, "learning_rate": 3.213100917627104e-06, "loss": 0.62546468, "num_input_tokens_seen": 3130385, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.78125, "step": 147, "time_per_iteration": 3.217038154602051 }, { "auxiliary_loss_clip": 0.01463682, "auxiliary_loss_mlp": 0.01126592, "balance_loss_clip": 1.06841755, "balance_loss_mlp": 1.14494038, "epoch": 0.008898241394859462, "flos": 20044797937920.0, "grad_norm": 2.297983603103852, "language_loss": 0.8446154, "learning_rate": 3.2174660338961135e-06, "loss": 0.87051809, "num_input_tokens_seen": 3149760, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 3.1875, "step": 148, "time_per_iteration": 2.602626085281372 }, { "auxiliary_loss_clip": 0.01465798, "auxiliary_loss_mlp": 0.01144272, "balance_loss_clip": 1.079422, "balance_loss_mlp": 1.14476502, "epoch": 0.008958364647527431, "flos": 10743611685120.0, "grad_norm": 4.720383379700633, "language_loss": 0.88635504, "learning_rate": 3.2218017552198588e-06, "loss": 0.91245574, "num_input_tokens_seen": 3164500, "router_z_loss_clip": 0.6484375, "router_z_loss_mlp": 3.21875, "step": 149, "time_per_iteration": 2.556466579437256 }, { "auxiliary_loss_clip": 0.01463529, "auxiliary_loss_mlp": 0.01113398, "balance_loss_clip": 1.0546999, "balance_loss_mlp": 1.1399852, "epoch": 0.009018487900195401, "flos": 29127467802240.0, "grad_norm": 2.6155331483112825, "language_loss": 0.93124229, "learning_rate": 3.226108474846181e-06, "loss": 0.95701158, "num_input_tokens_seen": 3182455, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 3.234375, "step": 150, "time_per_iteration": 2.658618450164795 }, { "auxiliary_loss_clip": 0.01451057, "auxiliary_loss_mlp": 0.01111672, "balance_loss_clip": 1.05497599, "balance_loss_mlp": 1.13572133, "epoch": 0.00907861115286337, "flos": 32963661354240.0, "grad_norm": 1.8616151165747956, "language_loss": 0.74011046, "learning_rate": 3.2303865781839817e-06, "loss": 0.76573777, "num_input_tokens_seen": 3203995, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 3.15625, "step": 151, "time_per_iteration": 2.7262299060821533 }, { "auxiliary_loss_clip": 0.01463856, "auxiliary_loss_mlp": 0.01122857, "balance_loss_clip": 1.06430149, "balance_loss_mlp": 1.14082611, "epoch": 0.009138734405531338, "flos": 21762441377280.0, "grad_norm": 3.344429900318653, "language_loss": 0.88242143, "learning_rate": 3.234636443010188e-06, "loss": 0.90828848, "num_input_tokens_seen": 3222575, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 3.21875, "step": 152, "time_per_iteration": 2.6037039756774902 }, { "auxiliary_loss_clip": 0.014623, "auxiliary_loss_mlp": 0.01119384, "balance_loss_clip": 1.06011319, "balance_loss_mlp": 1.14523029, "epoch": 0.009198857658199309, "flos": 20842517134080.0, "grad_norm": 3.1063298106428294, "language_loss": 0.84016138, "learning_rate": 3.238858439669943e-06, "loss": 0.86597818, "num_input_tokens_seen": 3240180, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 3.171875, "step": 153, "time_per_iteration": 2.5580942630767822 }, { "auxiliary_loss_clip": 0.01453597, "auxiliary_loss_mlp": 0.01136638, "balance_loss_clip": 1.07603216, "balance_loss_mlp": 1.13759387, "epoch": 0.009258980910867277, "flos": 24827381078400.0, "grad_norm": 1.8676653479142935, "language_loss": 0.89842188, "learning_rate": 3.2430529312702712e-06, "loss": 0.92432415, "num_input_tokens_seen": 3259800, "router_z_loss_clip": 0.60546875, "router_z_loss_mlp": 3.15625, "step": 154, "time_per_iteration": 2.6255428791046143 }, { "auxiliary_loss_clip": 0.01458045, "auxiliary_loss_mlp": 0.01155763, "balance_loss_clip": 1.09603941, "balance_loss_mlp": 1.14055145, "epoch": 0.009319104163535248, "flos": 28767786963840.0, "grad_norm": 2.0577218404025657, "language_loss": 0.89804024, "learning_rate": 3.2472202738674737e-06, "loss": 0.92417836, "num_input_tokens_seen": 3280400, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 3.171875, "step": 155, "time_per_iteration": 2.6767754554748535 }, { "auxiliary_loss_clip": 0.01460511, "auxiliary_loss_mlp": 0.01122746, "balance_loss_clip": 1.06471503, "balance_loss_mlp": 1.13692307, "epoch": 0.009379227416203216, "flos": 16582004219520.0, "grad_norm": 9.595109188864248, "language_loss": 0.86590308, "learning_rate": 3.2513608166485063e-06, "loss": 0.89173567, "num_input_tokens_seen": 3297600, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 3.234375, "step": 156, "time_per_iteration": 2.558194875717163 }, { "auxiliary_loss_clip": 0.01460204, "auxiliary_loss_mlp": 0.01114187, "balance_loss_clip": 1.05629909, "balance_loss_mlp": 1.14228916, "epoch": 0.009439350668871187, "flos": 18329919845760.0, "grad_norm": 3.2290515925042773, "language_loss": 0.99511933, "learning_rate": 3.2554749021065498e-06, "loss": 1.02086329, "num_input_tokens_seen": 3313635, "router_z_loss_clip": 0.578125, "router_z_loss_mlp": 3.1875, "step": 157, "time_per_iteration": 2.531776189804077 }, { "auxiliary_loss_clip": 0.01444768, "auxiliary_loss_mlp": 0.01143119, "balance_loss_clip": 1.08475423, "balance_loss_mlp": 1.13706493, "epoch": 0.009499473921539155, "flos": 24349912565760.0, "grad_norm": 2.73789240562158, "language_loss": 0.88302529, "learning_rate": 3.2595628662110186e-06, "loss": 0.90890419, "num_input_tokens_seen": 3333735, "router_z_loss_clip": 0.5859375, "router_z_loss_mlp": 3.0625, "step": 158, "time_per_iteration": 2.5994873046875 }, { "auxiliary_loss_clip": 0.01451659, "auxiliary_loss_mlp": 0.01127953, "balance_loss_clip": 1.06849146, "balance_loss_mlp": 1.1365881, "epoch": 0.009559597174207124, "flos": 16399326625920.0, "grad_norm": 2.2545788708686865, "language_loss": 0.86666334, "learning_rate": 3.2636250385721982e-06, "loss": 0.89245939, "num_input_tokens_seen": 3348800, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 3.15625, "step": 159, "time_per_iteration": 2.529146194458008 }, { "auxiliary_loss_clip": 0.01440728, "auxiliary_loss_mlp": 0.01136869, "balance_loss_clip": 1.07697821, "balance_loss_mlp": 1.13162756, "epoch": 0.009619720426875094, "flos": 22856890826880.0, "grad_norm": 1.660820366492542, "language_loss": 0.86914206, "learning_rate": 3.2676617426007263e-06, "loss": 0.89491796, "num_input_tokens_seen": 3368595, "router_z_loss_clip": 0.59765625, "router_z_loss_mlp": 3.09375, "step": 160, "time_per_iteration": 2.5962650775909424 }, { "auxiliary_loss_clip": 0.01447887, "auxiliary_loss_mlp": 0.01122555, "balance_loss_clip": 1.06728995, "balance_loss_mlp": 1.13701451, "epoch": 0.009679843679543063, "flos": 19135001329920.0, "grad_norm": 2.5040773699710854, "language_loss": 0.91654533, "learning_rate": 3.2716732956621042e-06, "loss": 0.94224977, "num_input_tokens_seen": 3384975, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 3.109375, "step": 161, "time_per_iteration": 2.5669331550598145 }, { "auxiliary_loss_clip": 0.01452959, "auxiliary_loss_mlp": 0.0110981, "balance_loss_clip": 1.05459261, "balance_loss_mlp": 1.13790357, "epoch": 0.009739966932211033, "flos": 20302995876480.0, "grad_norm": 1.9807828161050722, "language_loss": 0.91674471, "learning_rate": 3.2756600092264203e-06, "loss": 0.94237244, "num_input_tokens_seen": 3404755, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 3.15625, "step": 162, "time_per_iteration": 2.584282875061035 }, { "auxiliary_loss_clip": 0.01320475, "auxiliary_loss_mlp": 0.01044155, "balance_loss_clip": 1.02098072, "balance_loss_mlp": 1.15468454, "epoch": 0.009800090184879002, "flos": 67034234177280.0, "grad_norm": 1.1744814305355962, "language_loss": 0.72354436, "learning_rate": 3.279622189013474e-06, "loss": 0.74719071, "num_input_tokens_seen": 3467210, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 1.65625, "step": 163, "time_per_iteration": 3.142191171646118 }, { "auxiliary_loss_clip": 0.01436698, "auxiliary_loss_mlp": 0.01118572, "balance_loss_clip": 1.06178045, "balance_loss_mlp": 1.13418531, "epoch": 0.00986021343754697, "flos": 17164690646400.0, "grad_norm": 2.5285431408560943, "language_loss": 0.84382576, "learning_rate": 3.283560135133457e-06, "loss": 0.86937845, "num_input_tokens_seen": 3483220, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 3.03125, "step": 164, "time_per_iteration": 2.527116060256958 }, { "auxiliary_loss_clip": 0.01428628, "auxiliary_loss_mlp": 0.01101744, "balance_loss_clip": 1.04624021, "balance_loss_mlp": 1.12677789, "epoch": 0.00992033669021494, "flos": 17749424148480.0, "grad_norm": 1.951572780833929, "language_loss": 0.89162302, "learning_rate": 3.2874741422233565e-06, "loss": 0.91692674, "num_input_tokens_seen": 3501465, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 3.015625, "step": 165, "time_per_iteration": 2.561903953552246 }, { "auxiliary_loss_clip": 0.01431162, "auxiliary_loss_mlp": 0.0112733, "balance_loss_clip": 1.06791639, "balance_loss_mlp": 1.12673283, "epoch": 0.00998045994288291, "flos": 25297164080640.0, "grad_norm": 1.9273428133598292, "language_loss": 0.79975772, "learning_rate": 3.2913644995792465e-06, "loss": 0.82534266, "num_input_tokens_seen": 3520480, "router_z_loss_clip": 0.59375, "router_z_loss_mlp": 3.046875, "step": 166, "time_per_iteration": 2.6117656230926514 }, { "auxiliary_loss_clip": 0.01436619, "auxiliary_loss_mlp": 0.01124512, "balance_loss_clip": 1.06621861, "balance_loss_mlp": 1.1313206, "epoch": 0.01004058319555088, "flos": 32298954220800.0, "grad_norm": 2.6897984053364916, "language_loss": 0.91628033, "learning_rate": 3.2952314912845914e-06, "loss": 0.94189155, "num_input_tokens_seen": 3539570, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 3.0625, "step": 167, "time_per_iteration": 2.6713342666625977 }, { "auxiliary_loss_clip": 0.01427467, "auxiliary_loss_mlp": 0.01131309, "balance_loss_clip": 1.07628214, "balance_loss_mlp": 1.12948275, "epoch": 0.010100706448218848, "flos": 11319941404800.0, "grad_norm": 2.4038953727735204, "language_loss": 0.90631187, "learning_rate": 3.299075396334735e-06, "loss": 0.93189967, "num_input_tokens_seen": 3555465, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 2.96875, "step": 168, "time_per_iteration": 2.530662775039673 }, { "auxiliary_loss_clip": 0.01422558, "auxiliary_loss_mlp": 0.01102553, "balance_loss_clip": 1.04538047, "balance_loss_mlp": 1.12507987, "epoch": 0.010160829700886819, "flos": 29719491765120.0, "grad_norm": 1.795370788468224, "language_loss": 0.87055135, "learning_rate": 3.3028964887576868e-06, "loss": 0.8958025, "num_input_tokens_seen": 3578970, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 2.96875, "step": 169, "time_per_iteration": 2.6814229488372803 }, { "auxiliary_loss_clip": 0.01423609, "auxiliary_loss_mlp": 0.01112633, "balance_loss_clip": 1.05536485, "balance_loss_mlp": 1.12714696, "epoch": 0.010220952953554787, "flos": 20412343854720.0, "grad_norm": 2.867396246227983, "language_loss": 0.84742451, "learning_rate": 3.306695037731344e-06, "loss": 0.87278694, "num_input_tokens_seen": 3597275, "router_z_loss_clip": 0.57421875, "router_z_loss_mlp": 2.96875, "step": 170, "time_per_iteration": 2.5584795475006104 }, { "auxiliary_loss_clip": 0.01432972, "auxiliary_loss_mlp": 0.01132532, "balance_loss_clip": 1.07445312, "balance_loss_mlp": 1.12688446, "epoch": 0.010281076206222756, "flos": 31285124847360.0, "grad_norm": 2.293409738519377, "language_loss": 0.90079266, "learning_rate": 3.3104713076972827e-06, "loss": 0.92644769, "num_input_tokens_seen": 3618905, "router_z_loss_clip": 0.58203125, "router_z_loss_mlp": 3.0625, "step": 171, "time_per_iteration": 2.646279811859131 }, { "auxiliary_loss_clip": 0.01427343, "auxiliary_loss_mlp": 0.01106153, "balance_loss_clip": 1.051126, "balance_loss_mlp": 1.12982559, "epoch": 0.010341199458890726, "flos": 21982286568960.0, "grad_norm": 1.9288414758860233, "language_loss": 0.88904381, "learning_rate": 3.314225558471224e-06, "loss": 0.91437882, "num_input_tokens_seen": 3639610, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 2.96875, "step": 172, "time_per_iteration": 2.568794012069702 }, { "auxiliary_loss_clip": 0.01415278, "auxiliary_loss_mlp": 0.01123356, "balance_loss_clip": 1.0681386, "balance_loss_mlp": 1.12314856, "epoch": 0.010401322711558695, "flos": 30810529422720.0, "grad_norm": 1.8399717753405576, "language_loss": 0.81099862, "learning_rate": 3.317958045350308e-06, "loss": 0.83638501, "num_input_tokens_seen": 3664030, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 2.921875, "step": 173, "time_per_iteration": 2.6580986976623535 }, { "auxiliary_loss_clip": 0.01427635, "auxiliary_loss_mlp": 0.01108377, "balance_loss_clip": 1.05544806, "balance_loss_mlp": 1.12764752, "epoch": 0.010461445964226665, "flos": 24715124098560.0, "grad_norm": 1.8077177676489302, "language_loss": 0.82704943, "learning_rate": 3.3216690192172596e-06, "loss": 0.85240954, "num_input_tokens_seen": 3683615, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 3.0, "step": 174, "time_per_iteration": 2.60903000831604 }, { "auxiliary_loss_clip": 0.01420446, "auxiliary_loss_mlp": 0.01114539, "balance_loss_clip": 1.05908275, "balance_loss_mlp": 1.12339628, "epoch": 0.010521569216894634, "flos": 27710361457920.0, "grad_norm": 2.630644352632287, "language_loss": 0.72963071, "learning_rate": 3.325358726641591e-06, "loss": 0.75498056, "num_input_tokens_seen": 3704540, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 2.96875, "step": 175, "time_per_iteration": 2.6190378665924072 }, { "auxiliary_loss_clip": 0.01422494, "auxiliary_loss_mlp": 0.0112704, "balance_loss_clip": 1.06981993, "balance_loss_mlp": 1.12464833, "epoch": 0.010581692469562603, "flos": 12458346122880.0, "grad_norm": 2.8876300679775544, "language_loss": 0.98135823, "learning_rate": 3.329027409977902e-06, "loss": 1.00685358, "num_input_tokens_seen": 3721320, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 2.984375, "step": 176, "time_per_iteration": 2.532853364944458 }, { "auxiliary_loss_clip": 0.01409874, "auxiliary_loss_mlp": 0.01129764, "balance_loss_clip": 1.07635832, "balance_loss_mlp": 1.12221861, "epoch": 0.010641815722230573, "flos": 19427601519360.0, "grad_norm": 2.8201551782742516, "language_loss": 0.76973891, "learning_rate": 3.3326753074614087e-06, "loss": 0.79513526, "num_input_tokens_seen": 3739385, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 2.875, "step": 177, "time_per_iteration": 2.5695581436157227 }, { "auxiliary_loss_clip": 0.01418947, "auxiliary_loss_mlp": 0.01102849, "balance_loss_clip": 1.04839444, "balance_loss_mlp": 1.11981535, "epoch": 0.010701938974898541, "flos": 18332577452160.0, "grad_norm": 2.546821708868483, "language_loss": 0.76860279, "learning_rate": 3.3363026533007716e-06, "loss": 0.79382074, "num_input_tokens_seen": 3756360, "router_z_loss_clip": 0.54296875, "router_z_loss_mlp": 3.0, "step": 178, "time_per_iteration": 2.5375661849975586 }, { "auxiliary_loss_clip": 0.01426841, "auxiliary_loss_mlp": 0.0110662, "balance_loss_clip": 1.05021048, "balance_loss_mlp": 1.12639809, "epoch": 0.010762062227566512, "flos": 19203985399680.0, "grad_norm": 2.3287409682159432, "language_loss": 0.84424639, "learning_rate": 3.3399096777683303e-06, "loss": 0.86958098, "num_input_tokens_seen": 3773930, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 3.015625, "step": 179, "time_per_iteration": 2.541346311569214 }, { "auxiliary_loss_clip": 0.01415325, "auxiliary_loss_mlp": 0.01110215, "balance_loss_clip": 1.05344713, "balance_loss_mlp": 1.11836493, "epoch": 0.01082218548023448, "flos": 31425427370880.0, "grad_norm": 2.509176447878497, "language_loss": 0.83653003, "learning_rate": 3.3434966072878213e-06, "loss": 0.86178541, "num_input_tokens_seen": 3793630, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 2.96875, "step": 180, "time_per_iteration": 2.6287312507629395 }, { "auxiliary_loss_clip": 0.0141768, "auxiliary_loss_mlp": 0.01122131, "balance_loss_clip": 1.06641293, "balance_loss_mlp": 1.12201333, "epoch": 0.01088230873290245, "flos": 25046436170880.0, "grad_norm": 3.171288043345318, "language_loss": 0.77846748, "learning_rate": 3.3470636645196674e-06, "loss": 0.80386561, "num_input_tokens_seen": 3813610, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 2.953125, "step": 181, "time_per_iteration": 2.6043121814727783 }, { "auxiliary_loss_clip": 0.01412075, "auxiliary_loss_mlp": 0.0112441, "balance_loss_clip": 1.07050347, "balance_loss_mlp": 1.11772752, "epoch": 0.01094243198557042, "flos": 22893411980160.0, "grad_norm": 3.3572627620654667, "language_loss": 0.76328558, "learning_rate": 3.3506110684439156e-06, "loss": 0.78865039, "num_input_tokens_seen": 3831390, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 2.9375, "step": 182, "time_per_iteration": 2.562706470489502 }, { "auxiliary_loss_clip": 0.01410727, "auxiliary_loss_mlp": 0.01128402, "balance_loss_clip": 1.07227778, "balance_loss_mlp": 1.11753607, "epoch": 0.011002555238238388, "flos": 17165049782400.0, "grad_norm": 2.2023197724429675, "language_loss": 0.87676775, "learning_rate": 3.3541390344409054e-06, "loss": 0.90215898, "num_input_tokens_seen": 3849705, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 2.9375, "step": 183, "time_per_iteration": 4.062732934951782 }, { "auxiliary_loss_clip": 0.01412296, "auxiliary_loss_mlp": 0.01108182, "balance_loss_clip": 1.05756557, "balance_loss_mlp": 1.11998188, "epoch": 0.011062678490906358, "flos": 22310150935680.0, "grad_norm": 2.2992097026192435, "language_loss": 0.86546576, "learning_rate": 3.357647774369736e-06, "loss": 0.89067054, "num_input_tokens_seen": 3869230, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 2.921875, "step": 184, "time_per_iteration": 5.487316608428955 }, { "auxiliary_loss_clip": 0.01407452, "auxiliary_loss_mlp": 0.01108048, "balance_loss_clip": 1.05285454, "balance_loss_mlp": 1.12009692, "epoch": 0.011122801743574327, "flos": 24388373053440.0, "grad_norm": 1.7372377938655275, "language_loss": 0.83790082, "learning_rate": 3.3611374966446085e-06, "loss": 0.86305583, "num_input_tokens_seen": 3889735, "router_z_loss_clip": 0.5546875, "router_z_loss_mlp": 2.875, "step": 185, "time_per_iteration": 2.5885889530181885 }, { "auxiliary_loss_clip": 0.01416744, "auxiliary_loss_mlp": 0.0110788, "balance_loss_clip": 1.05068374, "balance_loss_mlp": 1.11797452, "epoch": 0.011182924996242297, "flos": 18150258994560.0, "grad_norm": 3.2479787483231815, "language_loss": 0.71061254, "learning_rate": 3.3646084063091142e-06, "loss": 0.7358588, "num_input_tokens_seen": 3908855, "router_z_loss_clip": 0.5703125, "router_z_loss_mlp": 2.984375, "step": 186, "time_per_iteration": 2.5172743797302246 }, { "auxiliary_loss_clip": 0.01412589, "auxiliary_loss_mlp": 0.01106635, "balance_loss_clip": 1.05535114, "balance_loss_mlp": 1.11784601, "epoch": 0.011243048248910266, "flos": 15486800584320.0, "grad_norm": 2.705167501563703, "language_loss": 1.02221584, "learning_rate": 3.3680607051085194e-06, "loss": 1.0474081, "num_input_tokens_seen": 3923865, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 2.953125, "step": 187, "time_per_iteration": 2.5496914386749268 }, { "auxiliary_loss_clip": 0.01401393, "auxiliary_loss_mlp": 0.01108278, "balance_loss_clip": 1.05351317, "balance_loss_mlp": 1.11710048, "epoch": 0.011303171501578235, "flos": 40916868986880.0, "grad_norm": 1.6788161238136958, "language_loss": 0.75200713, "learning_rate": 3.371494591560139e-06, "loss": 0.77710378, "num_input_tokens_seen": 3946870, "router_z_loss_clip": 0.55078125, "router_z_loss_mlp": 2.84375, "step": 188, "time_per_iteration": 2.7331364154815674 }, { "auxiliary_loss_clip": 0.01302241, "auxiliary_loss_mlp": 0.01045969, "balance_loss_clip": 1.0240345, "balance_loss_mlp": 1.14617753, "epoch": 0.011363294754246205, "flos": 66302697790080.0, "grad_norm": 0.7558668611448344, "language_loss": 0.56232047, "learning_rate": 3.3749102610218297e-06, "loss": 0.58580256, "num_input_tokens_seen": 4010005, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.5625, "step": 189, "time_per_iteration": 3.2732386589050293 }, { "auxiliary_loss_clip": 0.01399591, "auxiliary_loss_mlp": 0.01122084, "balance_loss_clip": 1.06729555, "balance_loss_mlp": 1.11277282, "epoch": 0.011423418006914174, "flos": 24900279730560.0, "grad_norm": 2.7888953655042803, "language_loss": 0.95156622, "learning_rate": 3.3783079057586833e-06, "loss": 0.97678298, "num_input_tokens_seen": 4029035, "router_z_loss_clip": 0.546875, "router_z_loss_mlp": 2.875, "step": 190, "time_per_iteration": 2.6255650520324707 }, { "auxiliary_loss_clip": 0.01399465, "auxiliary_loss_mlp": 0.01100392, "balance_loss_clip": 1.0485363, "balance_loss_mlp": 1.11401463, "epoch": 0.011483541259582144, "flos": 19791879298560.0, "grad_norm": 3.0389505717610974, "language_loss": 0.84785712, "learning_rate": 3.3816877150079665e-06, "loss": 0.87285566, "num_input_tokens_seen": 4046995, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 2.859375, "step": 191, "time_per_iteration": 2.542107582092285 }, { "auxiliary_loss_clip": 0.01398505, "auxiliary_loss_mlp": 0.01120643, "balance_loss_clip": 1.06912041, "balance_loss_mlp": 1.11044931, "epoch": 0.011543664512250112, "flos": 26176939896960.0, "grad_norm": 1.867695226496383, "language_loss": 0.91873699, "learning_rate": 3.385049875042367e-06, "loss": 0.94392854, "num_input_tokens_seen": 4065865, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 2.875, "step": 192, "time_per_iteration": 2.6052939891815186 }, { "auxiliary_loss_clip": 0.01393836, "auxiliary_loss_mlp": 0.01118879, "balance_loss_clip": 1.0621835, "balance_loss_mlp": 1.11151803, "epoch": 0.011603787764918083, "flos": 23768985905280.0, "grad_norm": 2.2832839025661484, "language_loss": 0.87018973, "learning_rate": 3.3883945692315938e-06, "loss": 0.89531684, "num_input_tokens_seen": 4085305, "router_z_loss_clip": 0.56640625, "router_z_loss_mlp": 2.828125, "step": 193, "time_per_iteration": 2.582078218460083 }, { "auxiliary_loss_clip": 0.01397755, "auxiliary_loss_mlp": 0.01098942, "balance_loss_clip": 1.04782486, "balance_loss_mlp": 1.10995841, "epoch": 0.011663911017586051, "flos": 25954688494080.0, "grad_norm": 2.3044840469495402, "language_loss": 0.92089742, "learning_rate": 3.3917219781023906e-06, "loss": 0.94586438, "num_input_tokens_seen": 4105185, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 2.875, "step": 194, "time_per_iteration": 2.5897371768951416 }, { "auxiliary_loss_clip": 0.01402839, "auxiliary_loss_mlp": 0.01109824, "balance_loss_clip": 1.05789638, "balance_loss_mlp": 1.11490226, "epoch": 0.01172403427025402, "flos": 17895149625600.0, "grad_norm": 3.6912456059647685, "language_loss": 0.89951777, "learning_rate": 3.3950322793970014e-06, "loss": 0.92464447, "num_input_tokens_seen": 4123160, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 2.890625, "step": 195, "time_per_iteration": 2.5092999935150146 }, { "auxiliary_loss_clip": 0.01396398, "auxiliary_loss_mlp": 0.01114767, "balance_loss_clip": 1.06090832, "balance_loss_mlp": 1.11301148, "epoch": 0.01178415752292199, "flos": 17894539094400.0, "grad_norm": 3.1043341153884336, "language_loss": 0.85907334, "learning_rate": 3.3983256481301445e-06, "loss": 0.88418496, "num_input_tokens_seen": 4140425, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 2.828125, "step": 196, "time_per_iteration": 2.508283853530884 }, { "auxiliary_loss_clip": 0.01394682, "auxiliary_loss_mlp": 0.01108329, "balance_loss_clip": 1.05504274, "balance_loss_mlp": 1.11051834, "epoch": 0.011844280775589959, "flos": 22893555634560.0, "grad_norm": 2.40860388886501, "language_loss": 0.93113518, "learning_rate": 3.4016022566445335e-06, "loss": 0.95616531, "num_input_tokens_seen": 4159555, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 2.84375, "step": 197, "time_per_iteration": 2.5503621101379395 }, { "auxiliary_loss_clip": 0.01392165, "auxiliary_loss_mlp": 0.01112839, "balance_loss_clip": 1.06038725, "balance_loss_mlp": 1.111323, "epoch": 0.01190440402825793, "flos": 26980333441920.0, "grad_norm": 2.3713379646157087, "language_loss": 0.78991562, "learning_rate": 3.4048622746649966e-06, "loss": 0.81496567, "num_input_tokens_seen": 4180480, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 2.8125, "step": 198, "time_per_iteration": 2.6280856132507324 }, { "auxiliary_loss_clip": 0.01387923, "auxiliary_loss_mlp": 0.01117358, "balance_loss_clip": 1.06562138, "balance_loss_mlp": 1.11162615, "epoch": 0.011964527280925898, "flos": 20521584092160.0, "grad_norm": 2.567445119498013, "language_loss": 0.88120365, "learning_rate": 3.4081058693512278e-06, "loss": 0.90625644, "num_input_tokens_seen": 4198835, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 2.765625, "step": 199, "time_per_iteration": 2.5325253009796143 }, { "auxiliary_loss_clip": 0.01400492, "auxiliary_loss_mlp": 0.01123698, "balance_loss_clip": 1.06757402, "balance_loss_mlp": 1.11422002, "epoch": 0.012024650533593867, "flos": 27745984771200.0, "grad_norm": 2.1773960675983615, "language_loss": 0.81394607, "learning_rate": 3.411333205349222e-06, "loss": 0.83918798, "num_input_tokens_seen": 4219335, "router_z_loss_clip": 0.5625, "router_z_loss_mlp": 2.859375, "step": 200, "time_per_iteration": 2.6176793575286865 }, { "auxiliary_loss_clip": 0.0139756, "auxiliary_loss_mlp": 0.01100758, "balance_loss_clip": 1.04735208, "balance_loss_mlp": 1.11067653, "epoch": 0.012084773786261837, "flos": 10452017076480.0, "grad_norm": 12.446576687020395, "language_loss": 0.87477684, "learning_rate": 3.4145444448414217e-06, "loss": 0.89976007, "num_input_tokens_seen": 4236940, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 2.875, "step": 201, "time_per_iteration": 2.5192699432373047 }, { "auxiliary_loss_clip": 0.01396273, "auxiliary_loss_mlp": 0.01112532, "balance_loss_clip": 1.05864906, "balance_loss_mlp": 1.11282408, "epoch": 0.012144897038929806, "flos": 23105751229440.0, "grad_norm": 1.9541498787069242, "language_loss": 0.84134316, "learning_rate": 3.4177397475956223e-06, "loss": 0.86643118, "num_input_tokens_seen": 4256755, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 2.828125, "step": 202, "time_per_iteration": 2.5667331218719482 }, { "auxiliary_loss_clip": 0.01386067, "auxiliary_loss_mlp": 0.01107009, "balance_loss_clip": 1.05508173, "balance_loss_mlp": 1.10618997, "epoch": 0.012205020291597776, "flos": 21033203460480.0, "grad_norm": 2.4818669688074317, "language_loss": 0.90054262, "learning_rate": 3.4209192710126685e-06, "loss": 0.92547339, "num_input_tokens_seen": 4276505, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 2.796875, "step": 203, "time_per_iteration": 2.544362783432007 }, { "auxiliary_loss_clip": 0.01298394, "auxiliary_loss_mlp": 0.01065855, "balance_loss_clip": 1.04411149, "balance_loss_mlp": 1.11566663, "epoch": 0.012265143544265745, "flos": 68447785075200.0, "grad_norm": 1.0779079266696805, "language_loss": 0.61234933, "learning_rate": 3.4240831701729837e-06, "loss": 0.63599181, "num_input_tokens_seen": 4330965, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.828125, "step": 204, "time_per_iteration": 3.0674383640289307 }, { "auxiliary_loss_clip": 0.01397262, "auxiliary_loss_mlp": 0.01114218, "balance_loss_clip": 1.06162333, "balance_loss_mlp": 1.11012149, "epoch": 0.012325266796933715, "flos": 17019252478080.0, "grad_norm": 2.2166715609231957, "language_loss": 0.91504025, "learning_rate": 3.4272315978819516e-06, "loss": 0.94015503, "num_input_tokens_seen": 4348200, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 2.875, "step": 205, "time_per_iteration": 2.527839183807373 }, { "auxiliary_loss_clip": 0.01404277, "auxiliary_loss_mlp": 0.01120314, "balance_loss_clip": 1.06643105, "balance_loss_mlp": 1.11452317, "epoch": 0.012385390049601683, "flos": 20190056538240.0, "grad_norm": 2.3498766193061487, "language_loss": 0.89287794, "learning_rate": 3.4303647047142043e-06, "loss": 0.91812396, "num_input_tokens_seen": 4365460, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 2.90625, "step": 206, "time_per_iteration": 2.550358533859253 }, { "auxiliary_loss_clip": 0.01393837, "auxiliary_loss_mlp": 0.01101257, "balance_loss_clip": 1.04928148, "balance_loss_mlp": 1.10799813, "epoch": 0.012445513302269652, "flos": 16253134272000.0, "grad_norm": 2.235557709080016, "language_loss": 0.95362753, "learning_rate": 3.43348263905683e-06, "loss": 0.97857845, "num_input_tokens_seen": 4383650, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 2.859375, "step": 207, "time_per_iteration": 2.5071187019348145 }, { "auxiliary_loss_clip": 0.01394935, "auxiliary_loss_mlp": 0.0112008, "balance_loss_clip": 1.06705558, "balance_loss_mlp": 1.11429513, "epoch": 0.012505636554937622, "flos": 23769380954880.0, "grad_norm": 2.9321717105316987, "language_loss": 0.75952744, "learning_rate": 3.436585547151547e-06, "loss": 0.78467757, "num_input_tokens_seen": 4403765, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 2.8125, "step": 208, "time_per_iteration": 2.5757555961608887 }, { "auxiliary_loss_clip": 0.01383306, "auxiliary_loss_mlp": 0.01107596, "balance_loss_clip": 1.05490541, "balance_loss_mlp": 1.10842752, "epoch": 0.012565759807605591, "flos": 30591546157440.0, "grad_norm": 3.197124579283733, "language_loss": 0.98604006, "learning_rate": 3.4396735731358586e-06, "loss": 1.01094902, "num_input_tokens_seen": 4421935, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 2.75, "step": 209, "time_per_iteration": 2.5986828804016113 }, { "auxiliary_loss_clip": 0.01388262, "auxiliary_loss_mlp": 0.01112998, "balance_loss_clip": 1.05978298, "balance_loss_mlp": 1.1093154, "epoch": 0.012625883060273561, "flos": 40113511355520.0, "grad_norm": 2.886285968176816, "language_loss": 0.85446668, "learning_rate": 3.4427468590832302e-06, "loss": 0.87947935, "num_input_tokens_seen": 4441470, "router_z_loss_clip": 0.53125, "router_z_loss_mlp": 2.78125, "step": 210, "time_per_iteration": 2.7097630500793457 }, { "auxiliary_loss_clip": 0.01386165, "auxiliary_loss_mlp": 0.01120388, "balance_loss_clip": 1.07003427, "balance_loss_mlp": 1.1077044, "epoch": 0.01268600631294153, "flos": 27089178629760.0, "grad_norm": 3.9215234066689475, "language_loss": 0.97128499, "learning_rate": 3.445805545042314e-06, "loss": 0.99635047, "num_input_tokens_seen": 4459950, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 2.78125, "step": 211, "time_per_iteration": 2.6352903842926025 }, { "auxiliary_loss_clip": 0.01394979, "auxiliary_loss_mlp": 0.01118643, "balance_loss_clip": 1.06528473, "balance_loss_mlp": 1.1134038, "epoch": 0.012746129565609499, "flos": 16982767238400.0, "grad_norm": 2.559116389485065, "language_loss": 0.94765723, "learning_rate": 3.448849769075239e-06, "loss": 0.97279346, "num_input_tokens_seen": 4478390, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 2.8125, "step": 212, "time_per_iteration": 2.578293561935425 }, { "auxiliary_loss_clip": 0.01383068, "auxiliary_loss_mlp": 0.01116646, "balance_loss_clip": 1.06433725, "balance_loss_mlp": 1.10985136, "epoch": 0.012806252818277469, "flos": 46533476995200.0, "grad_norm": 1.918788455832942, "language_loss": 0.76299739, "learning_rate": 3.4518796672950093e-06, "loss": 0.7879945, "num_input_tokens_seen": 4501665, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 2.71875, "step": 213, "time_per_iteration": 2.794999599456787 }, { "auxiliary_loss_clip": 0.01387538, "auxiliary_loss_mlp": 0.01108809, "balance_loss_clip": 1.05745339, "balance_loss_mlp": 1.10831201, "epoch": 0.012866376070945438, "flos": 14388616120320.0, "grad_norm": 2.9472034188142344, "language_loss": 0.86508143, "learning_rate": 3.4548953739020187e-06, "loss": 0.89004493, "num_input_tokens_seen": 4519055, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 2.78125, "step": 214, "time_per_iteration": 2.5185632705688477 }, { "auxiliary_loss_clip": 0.01383344, "auxiliary_loss_mlp": 0.01124371, "balance_loss_clip": 1.07032132, "balance_loss_mlp": 1.112288, "epoch": 0.012926499323613408, "flos": 26140813793280.0, "grad_norm": 2.2191374406950164, "language_loss": 0.77532566, "learning_rate": 3.4578970212197196e-06, "loss": 0.80040282, "num_input_tokens_seen": 4540870, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 2.71875, "step": 215, "time_per_iteration": 2.6434929370880127 }, { "auxiliary_loss_clip": 0.01394323, "auxiliary_loss_mlp": 0.01116938, "balance_loss_clip": 1.06503451, "balance_loss_mlp": 1.11182916, "epoch": 0.012986622576281377, "flos": 30117202128000.0, "grad_norm": 2.4475576603744953, "language_loss": 0.90301299, "learning_rate": 3.460884739729461e-06, "loss": 0.92812556, "num_input_tokens_seen": 4560395, "router_z_loss_clip": 0.51953125, "router_z_loss_mlp": 2.828125, "step": 216, "time_per_iteration": 2.607722282409668 }, { "auxiliary_loss_clip": 0.01384614, "auxiliary_loss_mlp": 0.01108452, "balance_loss_clip": 1.05676293, "balance_loss_mlp": 1.10599744, "epoch": 0.013046745828949347, "flos": 13954025468160.0, "grad_norm": 3.0763643100198514, "language_loss": 0.93857372, "learning_rate": 3.463858658104523e-06, "loss": 0.96350437, "num_input_tokens_seen": 4575785, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 2.78125, "step": 217, "time_per_iteration": 2.502384901046753 }, { "auxiliary_loss_clip": 0.01378779, "auxiliary_loss_mlp": 0.0110725, "balance_loss_clip": 1.05365384, "balance_loss_mlp": 1.10481608, "epoch": 0.013106869081617315, "flos": 17347835116800.0, "grad_norm": 2.136620810959797, "language_loss": 0.93773878, "learning_rate": 3.4668189032433696e-06, "loss": 0.96259904, "num_input_tokens_seen": 4594985, "router_z_loss_clip": 0.5390625, "router_z_loss_mlp": 2.75, "step": 218, "time_per_iteration": 2.5120351314544678 }, { "auxiliary_loss_clip": 0.01375664, "auxiliary_loss_mlp": 0.01106668, "balance_loss_clip": 1.05574203, "balance_loss_mlp": 1.10495245, "epoch": 0.013166992334285284, "flos": 25884914325120.0, "grad_norm": 2.0812658232589607, "language_loss": 0.86060667, "learning_rate": 3.46976560030214e-06, "loss": 0.88543004, "num_input_tokens_seen": 4616125, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 2.703125, "step": 219, "time_per_iteration": 2.5875296592712402 }, { "auxiliary_loss_clip": 0.0138017, "auxiliary_loss_mlp": 0.01101656, "balance_loss_clip": 1.05092096, "balance_loss_mlp": 1.10748315, "epoch": 0.013227115586953254, "flos": 31175956437120.0, "grad_norm": 1.9977449440863648, "language_loss": 0.87664866, "learning_rate": 3.4726988727263976e-06, "loss": 0.90146691, "num_input_tokens_seen": 4637795, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 2.71875, "step": 220, "time_per_iteration": 2.5992214679718018 }, { "auxiliary_loss_clip": 0.01372188, "auxiliary_loss_mlp": 0.0110621, "balance_loss_clip": 1.05959964, "balance_loss_mlp": 1.10345709, "epoch": 0.013287238839621223, "flos": 20409470766720.0, "grad_norm": 2.793824620530796, "language_loss": 0.8648138, "learning_rate": 3.475618842282164e-06, "loss": 0.88959783, "num_input_tokens_seen": 4656835, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 2.6875, "step": 221, "time_per_iteration": 2.5275168418884277 }, { "auxiliary_loss_clip": 0.01378361, "auxiliary_loss_mlp": 0.01109767, "balance_loss_clip": 1.05884111, "balance_loss_mlp": 1.10367703, "epoch": 0.013347362092289193, "flos": 14137134024960.0, "grad_norm": 2.383868460112561, "language_loss": 0.9197197, "learning_rate": 3.4785256290862486e-06, "loss": 0.944601, "num_input_tokens_seen": 4673015, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 2.75, "step": 222, "time_per_iteration": 2.48663067817688 }, { "auxiliary_loss_clip": 0.01374401, "auxiliary_loss_mlp": 0.01103039, "balance_loss_clip": 1.04934692, "balance_loss_mlp": 1.10569668, "epoch": 0.013407485344957162, "flos": 21797705554560.0, "grad_norm": 2.305176844210774, "language_loss": 0.95557398, "learning_rate": 3.481419351635897e-06, "loss": 0.98034835, "num_input_tokens_seen": 4692355, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 2.6875, "step": 223, "time_per_iteration": 2.546250820159912 }, { "auxiliary_loss_clip": 0.01376177, "auxiliary_loss_mlp": 0.01105044, "balance_loss_clip": 1.05521464, "balance_loss_mlp": 1.10614359, "epoch": 0.013467608597625132, "flos": 18621622195200.0, "grad_norm": 2.5884846137815507, "language_loss": 0.88136375, "learning_rate": 3.484300126837776e-06, "loss": 0.90617585, "num_input_tokens_seen": 4710080, "router_z_loss_clip": 0.49804688, "router_z_loss_mlp": 2.6875, "step": 224, "time_per_iteration": 2.510716676712036 }, { "auxiliary_loss_clip": 0.01375335, "auxiliary_loss_mlp": 0.01103372, "balance_loss_clip": 1.0497992, "balance_loss_mlp": 1.10526109, "epoch": 0.013527731850293101, "flos": 18552314903040.0, "grad_norm": 1.9263567782834914, "language_loss": 0.89491385, "learning_rate": 3.487168070036317e-06, "loss": 0.91970092, "num_input_tokens_seen": 4728980, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 2.703125, "step": 225, "time_per_iteration": 5.466860055923462 }, { "auxiliary_loss_clip": 0.0137235, "auxiliary_loss_mlp": 0.0111697, "balance_loss_clip": 1.06466091, "balance_loss_mlp": 1.10536098, "epoch": 0.01358785510296107, "flos": 19165381257600.0, "grad_norm": 2.4342829749340202, "language_loss": 0.99007666, "learning_rate": 3.4900232950414224e-06, "loss": 1.01496994, "num_input_tokens_seen": 4747020, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 2.671875, "step": 226, "time_per_iteration": 3.9983134269714355 }, { "auxiliary_loss_clip": 0.01378215, "auxiliary_loss_mlp": 0.0111277, "balance_loss_clip": 1.0591737, "balance_loss_mlp": 1.10806966, "epoch": 0.01364797835562904, "flos": 23329941966720.0, "grad_norm": 2.432118897448922, "language_loss": 0.91275084, "learning_rate": 3.4928659141555727e-06, "loss": 0.93766069, "num_input_tokens_seen": 4765000, "router_z_loss_clip": 0.53515625, "router_z_loss_mlp": 2.703125, "step": 227, "time_per_iteration": 2.568648338317871 }, { "auxiliary_loss_clip": 0.01267201, "auxiliary_loss_mlp": 0.01029667, "balance_loss_clip": 1.00992572, "balance_loss_mlp": 1.11420023, "epoch": 0.013708101608297009, "flos": 70993746097920.0, "grad_norm": 1.0443531736803868, "language_loss": 0.57678187, "learning_rate": 3.4956960382003234e-06, "loss": 0.59975052, "num_input_tokens_seen": 4833210, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.53125, "step": 228, "time_per_iteration": 3.2147223949432373 }, { "auxiliary_loss_clip": 0.01367763, "auxiliary_loss_mlp": 0.01111192, "balance_loss_clip": 1.0621016, "balance_loss_mlp": 1.10421646, "epoch": 0.013768224860964979, "flos": 16325170997760.0, "grad_norm": 2.4380317718818763, "language_loss": 0.88039589, "learning_rate": 3.4985137765422354e-06, "loss": 0.90518546, "num_input_tokens_seen": 4850120, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 2.640625, "step": 229, "time_per_iteration": 2.534351110458374 }, { "auxiliary_loss_clip": 0.01376428, "auxiliary_loss_mlp": 0.01098185, "balance_loss_clip": 1.04854608, "balance_loss_mlp": 1.10455585, "epoch": 0.013828348113632948, "flos": 20193037367040.0, "grad_norm": 2.745906473980769, "language_loss": 0.84297878, "learning_rate": 3.501319237118231e-06, "loss": 0.8677249, "num_input_tokens_seen": 4866215, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 2.71875, "step": 230, "time_per_iteration": 2.5028982162475586 }, { "auxiliary_loss_clip": 0.01375442, "auxiliary_loss_mlp": 0.01117317, "balance_loss_clip": 1.06796432, "balance_loss_mlp": 1.10695219, "epoch": 0.013888471366300916, "flos": 20741070147840.0, "grad_norm": 2.180039599187197, "language_loss": 0.90538323, "learning_rate": 3.5041125264604056e-06, "loss": 0.93031085, "num_input_tokens_seen": 4885630, "router_z_loss_clip": 0.49414062, "router_z_loss_mlp": 2.6875, "step": 231, "time_per_iteration": 2.550241470336914 }, { "auxiliary_loss_clip": 0.01378129, "auxiliary_loss_mlp": 0.01106065, "balance_loss_clip": 1.05754709, "balance_loss_mlp": 1.11045396, "epoch": 0.013948594618968886, "flos": 22090628966400.0, "grad_norm": 2.3299116109650546, "language_loss": 0.84092283, "learning_rate": 3.5068937497203002e-06, "loss": 0.86576474, "num_input_tokens_seen": 4905570, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 2.671875, "step": 232, "time_per_iteration": 2.528928756713867 }, { "auxiliary_loss_clip": 0.01379124, "auxiliary_loss_mlp": 0.01092603, "balance_loss_clip": 1.04205799, "balance_loss_mlp": 1.10156703, "epoch": 0.014008717871636855, "flos": 19063108258560.0, "grad_norm": 3.909368233271189, "language_loss": 0.74456751, "learning_rate": 3.509663010692652e-06, "loss": 0.76928478, "num_input_tokens_seen": 4923535, "router_z_loss_clip": 0.5078125, "router_z_loss_mlp": 2.765625, "step": 233, "time_per_iteration": 2.5652828216552734 }, { "auxiliary_loss_clip": 0.01382298, "auxiliary_loss_mlp": 0.01120696, "balance_loss_clip": 1.06893551, "balance_loss_mlp": 1.10846245, "epoch": 0.014068841124304825, "flos": 14530822064640.0, "grad_norm": 2.373170508232446, "language_loss": 0.85600466, "learning_rate": 3.512420411838642e-06, "loss": 0.88103455, "num_input_tokens_seen": 4939200, "router_z_loss_clip": 0.515625, "router_z_loss_mlp": 2.734375, "step": 234, "time_per_iteration": 2.5272157192230225 }, { "auxiliary_loss_clip": 0.0137551, "auxiliary_loss_mlp": 0.0111046, "balance_loss_clip": 1.06198955, "balance_loss_mlp": 1.10870099, "epoch": 0.014128964376972794, "flos": 18077396256000.0, "grad_norm": 2.4820616500309347, "language_loss": 0.89420748, "learning_rate": 3.515166054308634e-06, "loss": 0.91906714, "num_input_tokens_seen": 4956620, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 2.65625, "step": 235, "time_per_iteration": 2.5231070518493652 }, { "auxiliary_loss_clip": 0.01374311, "auxiliary_loss_mlp": 0.01121368, "balance_loss_clip": 1.0713712, "balance_loss_mlp": 1.11012244, "epoch": 0.014189087629640764, "flos": 25334331678720.0, "grad_norm": 2.845723954787431, "language_loss": 0.85788113, "learning_rate": 3.5179000379644498e-06, "loss": 0.88283795, "num_input_tokens_seen": 4975650, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 2.640625, "step": 236, "time_per_iteration": 2.5640292167663574 }, { "auxiliary_loss_clip": 0.01370837, "auxiliary_loss_mlp": 0.01097287, "balance_loss_clip": 1.04752922, "balance_loss_mlp": 1.10266542, "epoch": 0.014249210882308733, "flos": 36139744713600.0, "grad_norm": 1.8689018533859567, "language_loss": 0.8259452, "learning_rate": 3.520622461401154e-06, "loss": 0.85062647, "num_input_tokens_seen": 4997415, "router_z_loss_clip": 0.49804688, "router_z_loss_mlp": 2.6875, "step": 237, "time_per_iteration": 2.6984920501708984 }, { "auxiliary_loss_clip": 0.01370561, "auxiliary_loss_mlp": 0.01118888, "balance_loss_clip": 1.06676972, "balance_loss_mlp": 1.1063019, "epoch": 0.014309334134976702, "flos": 12932977461120.0, "grad_norm": 2.758527654769079, "language_loss": 0.77250671, "learning_rate": 3.5233334219683935e-06, "loss": 0.79740125, "num_input_tokens_seen": 5013905, "router_z_loss_clip": 0.5234375, "router_z_loss_mlp": 2.640625, "step": 238, "time_per_iteration": 2.5168800354003906 }, { "auxiliary_loss_clip": 0.013678, "auxiliary_loss_mlp": 0.01110766, "balance_loss_clip": 1.06451237, "balance_loss_mlp": 1.10842633, "epoch": 0.014369457387644672, "flos": 20777519473920.0, "grad_norm": 2.2290508046735016, "language_loss": 0.87085044, "learning_rate": 3.526033015791284e-06, "loss": 0.89563608, "num_input_tokens_seen": 5033645, "router_z_loss_clip": 0.46289062, "router_z_loss_mlp": 2.59375, "step": 239, "time_per_iteration": 2.5919413566589355 }, { "auxiliary_loss_clip": 0.0135278, "auxiliary_loss_mlp": 0.01102633, "balance_loss_clip": 1.05642796, "balance_loss_mlp": 1.09991825, "epoch": 0.01442958064031264, "flos": 25848536826240.0, "grad_norm": 2.185664980822043, "language_loss": 0.93097413, "learning_rate": 3.528721337790862e-06, "loss": 0.95552826, "num_input_tokens_seen": 5052875, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 2.53125, "step": 240, "time_per_iteration": 2.5696496963500977 }, { "auxiliary_loss_clip": 0.01362106, "auxiliary_loss_mlp": 0.0109974, "balance_loss_clip": 1.05391574, "balance_loss_mlp": 1.1046102, "epoch": 0.014489703892980611, "flos": 28219718269440.0, "grad_norm": 6.501107619860489, "language_loss": 0.85076874, "learning_rate": 3.531398481704111e-06, "loss": 0.87538719, "num_input_tokens_seen": 5075005, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 2.578125, "step": 241, "time_per_iteration": 2.5995266437530518 }, { "auxiliary_loss_clip": 0.01361417, "auxiliary_loss_mlp": 0.01119444, "balance_loss_clip": 1.0704968, "balance_loss_mlp": 1.11159897, "epoch": 0.01454982714564858, "flos": 22490925108480.0, "grad_norm": 2.028622151062323, "language_loss": 0.88611639, "learning_rate": 3.534064540103573e-06, "loss": 0.91092503, "num_input_tokens_seen": 5091875, "router_z_loss_clip": 0.49023438, "router_z_loss_mlp": 2.5, "step": 242, "time_per_iteration": 2.5516774654388428 }, { "auxiliary_loss_clip": 0.01359561, "auxiliary_loss_mlp": 0.01100719, "balance_loss_clip": 1.05184317, "balance_loss_mlp": 1.10313678, "epoch": 0.014609950398316548, "flos": 21653201139840.0, "grad_norm": 2.1784949101197606, "language_loss": 0.86744255, "learning_rate": 3.536719604416555e-06, "loss": 0.89204538, "num_input_tokens_seen": 5111290, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 2.5625, "step": 243, "time_per_iteration": 2.5382399559020996 }, { "auxiliary_loss_clip": 0.01367137, "auxiliary_loss_mlp": 0.01104179, "balance_loss_clip": 1.05465961, "balance_loss_mlp": 1.10733068, "epoch": 0.014670073650984519, "flos": 21869993675520.0, "grad_norm": 2.127500008283092, "language_loss": 0.84371388, "learning_rate": 3.5393637649439464e-06, "loss": 0.86842704, "num_input_tokens_seen": 5132265, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 2.59375, "step": 244, "time_per_iteration": 2.543245553970337 }, { "auxiliary_loss_clip": 0.01375533, "auxiliary_loss_mlp": 0.01113392, "balance_loss_clip": 1.06237018, "balance_loss_mlp": 1.10966396, "epoch": 0.014730196903652487, "flos": 23183713699200.0, "grad_norm": 2.454493988408214, "language_loss": 0.78606188, "learning_rate": 3.54199711087864e-06, "loss": 0.81095117, "num_input_tokens_seen": 5148575, "router_z_loss_clip": 0.51171875, "router_z_loss_mlp": 2.65625, "step": 245, "time_per_iteration": 2.5540196895599365 }, { "auxiliary_loss_clip": 0.01374208, "auxiliary_loss_mlp": 0.01101482, "balance_loss_clip": 1.04871988, "balance_loss_mlp": 1.10561776, "epoch": 0.014790320156320457, "flos": 23222605150080.0, "grad_norm": 2.0783396933445264, "language_loss": 0.84130061, "learning_rate": 3.5446197303235913e-06, "loss": 0.86605752, "num_input_tokens_seen": 5170415, "router_z_loss_clip": 0.52734375, "router_z_loss_mlp": 2.6875, "step": 246, "time_per_iteration": 2.6024787425994873 }, { "auxiliary_loss_clip": 0.01367211, "auxiliary_loss_mlp": 0.01096236, "balance_loss_clip": 1.04664528, "balance_loss_mlp": 1.10388827, "epoch": 0.014850443408988426, "flos": 15815490963840.0, "grad_norm": 4.513708925387945, "language_loss": 0.8997879, "learning_rate": 3.5472317103095034e-06, "loss": 0.92442238, "num_input_tokens_seen": 5188565, "router_z_loss_clip": 0.49609375, "router_z_loss_mlp": 2.625, "step": 247, "time_per_iteration": 2.524625539779663 }, { "auxiliary_loss_clip": 0.01366545, "auxiliary_loss_mlp": 0.01098482, "balance_loss_clip": 1.05075085, "balance_loss_mlp": 1.09955788, "epoch": 0.014910566661656396, "flos": 22781657790720.0, "grad_norm": 2.0765764209455027, "language_loss": 0.78208518, "learning_rate": 3.549833136812155e-06, "loss": 0.80673552, "num_input_tokens_seen": 5207810, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 2.671875, "step": 248, "time_per_iteration": 2.6159374713897705 }, { "auxiliary_loss_clip": 0.01367053, "auxiliary_loss_mlp": 0.01103517, "balance_loss_clip": 1.05468845, "balance_loss_mlp": 1.10798609, "epoch": 0.014970689914324365, "flos": 26865023806080.0, "grad_norm": 2.5164251128133084, "language_loss": 0.8391301, "learning_rate": 3.552424094769381e-06, "loss": 0.86383575, "num_input_tokens_seen": 5226210, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 2.59375, "step": 249, "time_per_iteration": 2.5864601135253906 }, { "auxiliary_loss_clip": 0.01360788, "auxiliary_loss_mlp": 0.01102386, "balance_loss_clip": 1.05534613, "balance_loss_mlp": 1.10222101, "epoch": 0.015030813166992334, "flos": 13985662371840.0, "grad_norm": 2.736899736267203, "language_loss": 0.93598241, "learning_rate": 3.5550046680977174e-06, "loss": 0.96061409, "num_input_tokens_seen": 5241660, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 2.578125, "step": 250, "time_per_iteration": 2.5398812294006348 }, { "auxiliary_loss_clip": 0.01368827, "auxiliary_loss_mlp": 0.0111119, "balance_loss_clip": 1.06105089, "balance_loss_mlp": 1.1068306, "epoch": 0.015090936419660304, "flos": 24717817618560.0, "grad_norm": 2.3289734657687284, "language_loss": 0.97004038, "learning_rate": 3.5575749397087034e-06, "loss": 0.9948405, "num_input_tokens_seen": 5261090, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 2.625, "step": 251, "time_per_iteration": 2.58241868019104 }, { "auxiliary_loss_clip": 0.01362932, "auxiliary_loss_mlp": 0.0110762, "balance_loss_clip": 1.05953121, "balance_loss_mlp": 1.10146904, "epoch": 0.015151059672328273, "flos": 25738793798400.0, "grad_norm": 1.8799336236169382, "language_loss": 0.84324682, "learning_rate": 3.5601349915248707e-06, "loss": 0.86795235, "num_input_tokens_seen": 5279175, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 2.625, "step": 252, "time_per_iteration": 2.6041908264160156 }, { "auxiliary_loss_clip": 0.01358122, "auxiliary_loss_mlp": 0.01110798, "balance_loss_clip": 1.06263733, "balance_loss_mlp": 1.10379362, "epoch": 0.015211182924996243, "flos": 21871214737920.0, "grad_norm": 2.4567644196808063, "language_loss": 0.98210096, "learning_rate": 3.5626849044954064e-06, "loss": 1.00679016, "num_input_tokens_seen": 5296975, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 2.546875, "step": 253, "time_per_iteration": 2.5209951400756836 }, { "auxiliary_loss_clip": 0.01251335, "auxiliary_loss_mlp": 0.01031776, "balance_loss_clip": 1.01441944, "balance_loss_mlp": 1.11586905, "epoch": 0.015271306177664212, "flos": 66895080888960.0, "grad_norm": 0.848826166727114, "language_loss": 0.5559845, "learning_rate": 3.5652247586115167e-06, "loss": 0.57881558, "num_input_tokens_seen": 5358375, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.3515625, "step": 254, "time_per_iteration": 3.1301915645599365 }, { "auxiliary_loss_clip": 0.01363852, "auxiliary_loss_mlp": 0.01113882, "balance_loss_clip": 1.06574571, "balance_loss_mlp": 1.10072541, "epoch": 0.01533142943033218, "flos": 26834069260800.0, "grad_norm": 4.478275182090854, "language_loss": 0.90273011, "learning_rate": 3.567754632921479e-06, "loss": 0.92750746, "num_input_tokens_seen": 5377255, "router_z_loss_clip": 0.48242188, "router_z_loss_mlp": 2.625, "step": 255, "time_per_iteration": 2.5792081356048584 }, { "auxiliary_loss_clip": 0.0135879, "auxiliary_loss_mlp": 0.01127623, "balance_loss_clip": 1.07891393, "balance_loss_mlp": 1.10130537, "epoch": 0.01539155268300015, "flos": 20813753318400.0, "grad_norm": 2.419260178233055, "language_loss": 0.85451412, "learning_rate": 3.5702746055454075e-06, "loss": 0.8793782, "num_input_tokens_seen": 5395320, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 2.578125, "step": 256, "time_per_iteration": 2.5613856315612793 }, { "auxiliary_loss_clip": 0.01365997, "auxiliary_loss_mlp": 0.01113056, "balance_loss_clip": 1.0641799, "balance_loss_mlp": 1.10230911, "epoch": 0.01545167593566812, "flos": 15961862885760.0, "grad_norm": 2.5973057313389227, "language_loss": 0.71346384, "learning_rate": 3.5727847536897254e-06, "loss": 0.73825437, "num_input_tokens_seen": 5411970, "router_z_loss_clip": 0.48828125, "router_z_loss_mlp": 2.625, "step": 257, "time_per_iteration": 2.5090572834014893 }, { "auxiliary_loss_clip": 0.01358059, "auxiliary_loss_mlp": 0.01102375, "balance_loss_clip": 1.05414295, "balance_loss_mlp": 1.10182536, "epoch": 0.01551179918833609, "flos": 22601745544320.0, "grad_norm": 2.7729645289556024, "language_loss": 0.9466126, "learning_rate": 3.5752851536613596e-06, "loss": 0.97121692, "num_input_tokens_seen": 5430245, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 2.5625, "step": 258, "time_per_iteration": 2.5567331314086914 }, { "auxiliary_loss_clip": 0.01359, "auxiliary_loss_mlp": 0.01106716, "balance_loss_clip": 1.05943763, "balance_loss_mlp": 1.10046351, "epoch": 0.015571922441004058, "flos": 22816706486400.0, "grad_norm": 2.2450074094754044, "language_loss": 0.92959511, "learning_rate": 3.577775880881658e-06, "loss": 0.95425218, "num_input_tokens_seen": 5448905, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 2.59375, "step": 259, "time_per_iteration": 2.5155255794525146 }, { "auxiliary_loss_clip": 0.01351674, "auxiliary_loss_mlp": 0.0109759, "balance_loss_clip": 1.05205202, "balance_loss_mlp": 1.1033082, "epoch": 0.015632045693672027, "flos": 18947439486720.0, "grad_norm": 1.7851721914882286, "language_loss": 0.9723646, "learning_rate": 3.5802570099000424e-06, "loss": 0.99685729, "num_input_tokens_seen": 5466405, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 2.484375, "step": 260, "time_per_iteration": 2.544299364089966 }, { "auxiliary_loss_clip": 0.01366653, "auxiliary_loss_mlp": 0.01115038, "balance_loss_clip": 1.0683794, "balance_loss_mlp": 1.10341442, "epoch": 0.015692168946339995, "flos": 29971728046080.0, "grad_norm": 2.139768288987081, "language_loss": 0.87899017, "learning_rate": 3.5827286144073947e-06, "loss": 0.90380716, "num_input_tokens_seen": 5487055, "router_z_loss_clip": 0.46679688, "router_z_loss_mlp": 2.625, "step": 261, "time_per_iteration": 2.605931520462036 }, { "auxiliary_loss_clip": 0.01355906, "auxiliary_loss_mlp": 0.01105632, "balance_loss_clip": 1.05782855, "balance_loss_mlp": 1.0983032, "epoch": 0.015752292199007967, "flos": 19392085946880.0, "grad_norm": 3.2674426511391874, "language_loss": 0.67222679, "learning_rate": 3.5851907672491904e-06, "loss": 0.69684219, "num_input_tokens_seen": 5506600, "router_z_loss_clip": 0.47851562, "router_z_loss_mlp": 2.578125, "step": 262, "time_per_iteration": 2.558858633041382 }, { "auxiliary_loss_clip": 0.01354584, "auxiliary_loss_mlp": 0.01123129, "balance_loss_clip": 1.07387137, "balance_loss_mlp": 1.10070419, "epoch": 0.015812415451675936, "flos": 20339804338560.0, "grad_norm": 2.2255060224620906, "language_loss": 0.68245518, "learning_rate": 3.587643540438383e-06, "loss": 0.70723236, "num_input_tokens_seen": 5524350, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 2.53125, "step": 263, "time_per_iteration": 2.5524160861968994 }, { "auxiliary_loss_clip": 0.01356852, "auxiliary_loss_mlp": 0.01106724, "balance_loss_clip": 1.05834842, "balance_loss_mlp": 1.09741569, "epoch": 0.015872538704343905, "flos": 17525412979200.0, "grad_norm": 2.760690508127858, "language_loss": 0.85293746, "learning_rate": 3.590087005168037e-06, "loss": 0.87757325, "num_input_tokens_seen": 5542145, "router_z_loss_clip": 0.484375, "router_z_loss_mlp": 2.59375, "step": 264, "time_per_iteration": 2.609189987182617 }, { "auxiliary_loss_clip": 0.01360698, "auxiliary_loss_mlp": 0.01090581, "balance_loss_clip": 1.04520953, "balance_loss_mlp": 1.1020658, "epoch": 0.015932661957011873, "flos": 15260490944640.0, "grad_norm": 2.3415104274618104, "language_loss": 1.04191995, "learning_rate": 3.5925212318237344e-06, "loss": 1.06643271, "num_input_tokens_seen": 5557920, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 2.59375, "step": 265, "time_per_iteration": 2.5057365894317627 }, { "auxiliary_loss_clip": 0.01365025, "auxiliary_loss_mlp": 0.01110202, "balance_loss_clip": 1.05968118, "balance_loss_mlp": 1.10510933, "epoch": 0.015992785209679845, "flos": 20302528999680.0, "grad_norm": 5.3868608103183595, "language_loss": 0.75128371, "learning_rate": 3.5949462899957323e-06, "loss": 0.77603602, "num_input_tokens_seen": 5576290, "router_z_loss_clip": 0.50390625, "router_z_loss_mlp": 2.59375, "step": 266, "time_per_iteration": 2.5603187084198 }, { "auxiliary_loss_clip": 0.01351247, "auxiliary_loss_mlp": 0.01101818, "balance_loss_clip": 1.05370498, "balance_loss_mlp": 1.10146356, "epoch": 0.016052908462347814, "flos": 23362368969600.0, "grad_norm": 1.7716247551893975, "language_loss": 0.90637523, "learning_rate": 3.5973622484909068e-06, "loss": 0.93090594, "num_input_tokens_seen": 5595205, "router_z_loss_clip": 0.48046875, "router_z_loss_mlp": 2.5, "step": 267, "time_per_iteration": 6.880797386169434 }, { "auxiliary_loss_clip": 0.01360477, "auxiliary_loss_mlp": 0.01114869, "balance_loss_clip": 1.06868768, "balance_loss_mlp": 1.10168576, "epoch": 0.016113031715015783, "flos": 21286588976640.0, "grad_norm": 4.282366814412464, "language_loss": 0.85988832, "learning_rate": 3.599769175344462e-06, "loss": 0.88464177, "num_input_tokens_seen": 5612645, "router_z_loss_clip": 0.46289062, "router_z_loss_mlp": 2.59375, "step": 268, "time_per_iteration": 2.5493011474609375 }, { "auxiliary_loss_clip": 0.01354285, "auxiliary_loss_mlp": 0.01094452, "balance_loss_clip": 1.04853249, "balance_loss_mlp": 1.10428953, "epoch": 0.01617315496768375, "flos": 18914689261440.0, "grad_norm": 2.274617501699293, "language_loss": 0.88355774, "learning_rate": 3.602167137831432e-06, "loss": 0.90804511, "num_input_tokens_seen": 5628345, "router_z_loss_clip": 0.45898438, "router_z_loss_mlp": 2.5, "step": 269, "time_per_iteration": 2.5022950172424316 }, { "auxiliary_loss_clip": 0.01357341, "auxiliary_loss_mlp": 0.01098979, "balance_loss_clip": 1.04907775, "balance_loss_mlp": 1.09992361, "epoch": 0.01623327822035172, "flos": 16546488647040.0, "grad_norm": 2.2673895522305223, "language_loss": 0.96939617, "learning_rate": 3.6045562024779565e-06, "loss": 0.99395937, "num_input_tokens_seen": 5645940, "router_z_loss_clip": 0.5, "router_z_loss_mlp": 2.578125, "step": 270, "time_per_iteration": 2.518014430999756 }, { "auxiliary_loss_clip": 0.01358876, "auxiliary_loss_mlp": 0.01115371, "balance_loss_clip": 1.06842589, "balance_loss_mlp": 1.10588121, "epoch": 0.016293401473019692, "flos": 23513481486720.0, "grad_norm": 2.146803186492245, "language_loss": 0.86142069, "learning_rate": 3.606936435072361e-06, "loss": 0.88616318, "num_input_tokens_seen": 5665690, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 2.53125, "step": 271, "time_per_iteration": 2.5545215606689453 }, { "auxiliary_loss_clip": 0.01354934, "auxiliary_loss_mlp": 0.01104064, "balance_loss_clip": 1.05683303, "balance_loss_mlp": 1.09625793, "epoch": 0.01635352472568766, "flos": 29016072748800.0, "grad_norm": 2.394712665324855, "language_loss": 0.81122267, "learning_rate": 3.609307900676025e-06, "loss": 0.83581269, "num_input_tokens_seen": 5683190, "router_z_loss_clip": 0.47265625, "router_z_loss_mlp": 2.578125, "step": 272, "time_per_iteration": 2.5856080055236816 }, { "auxiliary_loss_clip": 0.01349839, "auxiliary_loss_mlp": 0.01119131, "balance_loss_clip": 1.07373619, "balance_loss_mlp": 1.09980679, "epoch": 0.01641364797835563, "flos": 13370513028480.0, "grad_norm": 2.443675754047478, "language_loss": 0.81058455, "learning_rate": 3.611670663634051e-06, "loss": 0.83527428, "num_input_tokens_seen": 5699780, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 2.5, "step": 273, "time_per_iteration": 2.4901235103607178 }, { "auxiliary_loss_clip": 0.01350322, "auxiliary_loss_mlp": 0.01103658, "balance_loss_clip": 1.05742836, "balance_loss_mlp": 1.09601605, "epoch": 0.016473771231023598, "flos": 18878239935360.0, "grad_norm": 5.648848107231918, "language_loss": 0.91497123, "learning_rate": 3.614024787585744e-06, "loss": 0.93951094, "num_input_tokens_seen": 5716980, "router_z_loss_clip": 0.46289062, "router_z_loss_mlp": 2.546875, "step": 274, "time_per_iteration": 2.519705295562744 }, { "auxiliary_loss_clip": 0.01346047, "auxiliary_loss_mlp": 0.01105902, "balance_loss_clip": 1.05874276, "balance_loss_mlp": 1.09723711, "epoch": 0.016533894483691566, "flos": 22601637803520.0, "grad_norm": 1.850578731771583, "language_loss": 0.88308626, "learning_rate": 3.6163703354748927e-06, "loss": 0.90760577, "num_input_tokens_seen": 5737780, "router_z_loss_clip": 0.47070312, "router_z_loss_mlp": 2.5, "step": 275, "time_per_iteration": 2.5548794269561768 }, { "auxiliary_loss_clip": 0.01348891, "auxiliary_loss_mlp": 0.01102261, "balance_loss_clip": 1.05307484, "balance_loss_mlp": 1.09760237, "epoch": 0.01659401773635954, "flos": 21507188353920.0, "grad_norm": 1.700144925355286, "language_loss": 0.80724609, "learning_rate": 3.6187073695598707e-06, "loss": 0.83175761, "num_input_tokens_seen": 5758330, "router_z_loss_clip": 0.4921875, "router_z_loss_mlp": 2.515625, "step": 276, "time_per_iteration": 2.5665156841278076 }, { "auxiliary_loss_clip": 0.0134149, "auxiliary_loss_mlp": 0.01098474, "balance_loss_clip": 1.05479586, "balance_loss_mlp": 1.09692168, "epoch": 0.016654140989027507, "flos": 32850973411200.0, "grad_norm": 1.759031957892041, "language_loss": 0.81140125, "learning_rate": 3.621035951423551e-06, "loss": 0.83580083, "num_input_tokens_seen": 5778340, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 2.4375, "step": 277, "time_per_iteration": 2.6310436725616455 }, { "auxiliary_loss_clip": 0.01340076, "auxiliary_loss_mlp": 0.01093297, "balance_loss_clip": 1.0468049, "balance_loss_mlp": 1.09149456, "epoch": 0.016714264241695476, "flos": 12306228024960.0, "grad_norm": 2.365282504091954, "language_loss": 0.80404222, "learning_rate": 3.623356141983041e-06, "loss": 0.82837594, "num_input_tokens_seen": 5794295, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 2.484375, "step": 278, "time_per_iteration": 2.5065183639526367 }, { "auxiliary_loss_clip": 0.01344958, "auxiliary_loss_mlp": 0.01098951, "balance_loss_clip": 1.05315053, "balance_loss_mlp": 1.09663451, "epoch": 0.016774387494363444, "flos": 27123796362240.0, "grad_norm": 1.9284639754624229, "language_loss": 0.90659571, "learning_rate": 3.6256680014992486e-06, "loss": 0.9310348, "num_input_tokens_seen": 5814405, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 2.484375, "step": 279, "time_per_iteration": 2.5926995277404785 }, { "auxiliary_loss_clip": 0.013496, "auxiliary_loss_mlp": 0.01114133, "balance_loss_clip": 1.0677371, "balance_loss_mlp": 1.09661818, "epoch": 0.016834510747031413, "flos": 20191493082240.0, "grad_norm": 2.6067797187008455, "language_loss": 0.93848324, "learning_rate": 3.6279715895862713e-06, "loss": 0.96312058, "num_input_tokens_seen": 5832795, "router_z_loss_clip": 0.46484375, "router_z_loss_mlp": 2.53125, "step": 280, "time_per_iteration": 2.5523064136505127 }, { "auxiliary_loss_clip": 0.01351169, "auxiliary_loss_mlp": 0.01109048, "balance_loss_clip": 1.06117308, "balance_loss_mlp": 1.09571922, "epoch": 0.016894633999699385, "flos": 27274262434560.0, "grad_norm": 2.077176787588556, "language_loss": 0.74151003, "learning_rate": 3.6302669652206183e-06, "loss": 0.76611209, "num_input_tokens_seen": 5855750, "router_z_loss_clip": 0.47851562, "router_z_loss_mlp": 2.5625, "step": 281, "time_per_iteration": 2.6008400917053223 }, { "auxiliary_loss_clip": 0.01344592, "auxiliary_loss_mlp": 0.01114444, "balance_loss_clip": 1.07043231, "balance_loss_mlp": 1.09692693, "epoch": 0.016954757252367354, "flos": 14902964922240.0, "grad_norm": 2.5241592991780113, "language_loss": 0.80143046, "learning_rate": 3.632554186750274e-06, "loss": 0.82602078, "num_input_tokens_seen": 5872610, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 2.484375, "step": 282, "time_per_iteration": 2.488654136657715 }, { "auxiliary_loss_clip": 0.01350196, "auxiliary_loss_mlp": 0.0111731, "balance_loss_clip": 1.07167637, "balance_loss_mlp": 1.09861541, "epoch": 0.017014880505035322, "flos": 21358805270400.0, "grad_norm": 1.9658252011297952, "language_loss": 0.77516758, "learning_rate": 3.6348333119035937e-06, "loss": 0.79984272, "num_input_tokens_seen": 5892985, "router_z_loss_clip": 0.45703125, "router_z_loss_mlp": 2.515625, "step": 283, "time_per_iteration": 2.5344631671905518 }, { "auxiliary_loss_clip": 0.01350501, "auxiliary_loss_mlp": 0.01092862, "balance_loss_clip": 1.04951739, "balance_loss_mlp": 1.10017228, "epoch": 0.01707500375770329, "flos": 35333154858240.0, "grad_norm": 2.4090141873415085, "language_loss": 0.8407799, "learning_rate": 3.6371043977980503e-06, "loss": 0.86521351, "num_input_tokens_seen": 5914060, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 2.5, "step": 284, "time_per_iteration": 2.6349682807922363 }, { "auxiliary_loss_clip": 0.01339898, "auxiliary_loss_mlp": 0.01099405, "balance_loss_clip": 1.05260372, "balance_loss_mlp": 1.09432817, "epoch": 0.01713512701037126, "flos": 23582070506880.0, "grad_norm": 2.7676441846182747, "language_loss": 0.97166061, "learning_rate": 3.639367500948819e-06, "loss": 0.9960537, "num_input_tokens_seen": 5932860, "router_z_loss_clip": 0.46875, "router_z_loss_mlp": 2.453125, "step": 285, "time_per_iteration": 2.533193588256836 }, { "auxiliary_loss_clip": 0.01344477, "auxiliary_loss_mlp": 0.01094761, "balance_loss_clip": 1.05120242, "balance_loss_mlp": 1.09676802, "epoch": 0.01719525026303923, "flos": 27634661544960.0, "grad_norm": 2.1950903573043914, "language_loss": 0.93831563, "learning_rate": 3.6416226772772178e-06, "loss": 0.962708, "num_input_tokens_seen": 5952725, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 2.484375, "step": 286, "time_per_iteration": 2.618377685546875 }, { "auxiliary_loss_clip": 0.01334871, "auxiliary_loss_mlp": 0.01089697, "balance_loss_clip": 1.04461193, "balance_loss_mlp": 1.0915066, "epoch": 0.0172553735157072, "flos": 26979722910720.0, "grad_norm": 1.7617835717943322, "language_loss": 0.92200089, "learning_rate": 3.643869982119001e-06, "loss": 0.9462465, "num_input_tokens_seen": 5970560, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 2.4375, "step": 287, "time_per_iteration": 2.557338237762451 }, { "auxiliary_loss_clip": 0.01340913, "auxiliary_loss_mlp": 0.01089469, "balance_loss_clip": 1.04440808, "balance_loss_mlp": 1.09156847, "epoch": 0.01731549676837517, "flos": 14056621689600.0, "grad_norm": 2.53213931765849, "language_loss": 1.02137005, "learning_rate": 3.646109470232502e-06, "loss": 1.04567385, "num_input_tokens_seen": 5982980, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 2.5, "step": 288, "time_per_iteration": 2.475588321685791 }, { "auxiliary_loss_clip": 0.01232335, "auxiliary_loss_mlp": 0.01054401, "balance_loss_clip": 1.0400008, "balance_loss_mlp": 1.10198736, "epoch": 0.017375620021043137, "flos": 66510694471680.0, "grad_norm": 0.9104682962860843, "language_loss": 0.63830692, "learning_rate": 3.6483411958066417e-06, "loss": 0.6611743, "num_input_tokens_seen": 6049445, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 1.3046875, "step": 289, "time_per_iteration": 3.2557427883148193 }, { "auxiliary_loss_clip": 0.01342126, "auxiliary_loss_mlp": 0.01107749, "balance_loss_clip": 1.06573915, "balance_loss_mlp": 1.09729731, "epoch": 0.01743574327371111, "flos": 15225154940160.0, "grad_norm": 2.5747960080168504, "language_loss": 0.88728976, "learning_rate": 3.6505652124687957e-06, "loss": 0.91178846, "num_input_tokens_seen": 6064150, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 2.453125, "step": 290, "time_per_iteration": 2.457124948501587 }, { "auxiliary_loss_clip": 0.01339197, "auxiliary_loss_mlp": 0.0109061, "balance_loss_clip": 1.04674053, "balance_loss_mlp": 1.09488249, "epoch": 0.017495866526379078, "flos": 25373869574400.0, "grad_norm": 3.5873338414729665, "language_loss": 0.8483901, "learning_rate": 3.6527815732925258e-06, "loss": 0.87268817, "num_input_tokens_seen": 6083920, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 2.4375, "step": 291, "time_per_iteration": 2.5482912063598633 }, { "auxiliary_loss_clip": 0.01346436, "auxiliary_loss_mlp": 0.01104241, "balance_loss_clip": 1.05658126, "balance_loss_mlp": 1.1027559, "epoch": 0.017555989779047047, "flos": 26359473836160.0, "grad_norm": 1.6594822776274643, "language_loss": 0.72655612, "learning_rate": 3.6549903308051806e-06, "loss": 0.75106287, "num_input_tokens_seen": 6105460, "router_z_loss_clip": 0.4765625, "router_z_loss_mlp": 2.4375, "step": 292, "time_per_iteration": 2.5769221782684326 }, { "auxiliary_loss_clip": 0.01332906, "auxiliary_loss_mlp": 0.01101795, "balance_loss_clip": 1.05816483, "balance_loss_mlp": 1.09311819, "epoch": 0.017616113031715015, "flos": 22338807010560.0, "grad_norm": 2.313538895559701, "language_loss": 0.87254131, "learning_rate": 3.6571915369953646e-06, "loss": 0.89688832, "num_input_tokens_seen": 6122890, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 2.40625, "step": 293, "time_per_iteration": 2.529618263244629 }, { "auxiliary_loss_clip": 0.0133465, "auxiliary_loss_mlp": 0.01104267, "balance_loss_clip": 1.06108975, "balance_loss_mlp": 1.09355807, "epoch": 0.017676236284382984, "flos": 20156911263360.0, "grad_norm": 2.1675596309365788, "language_loss": 0.81045449, "learning_rate": 3.6593852433202797e-06, "loss": 0.8348437, "num_input_tokens_seen": 6142890, "router_z_loss_clip": 0.43164062, "router_z_loss_mlp": 2.40625, "step": 294, "time_per_iteration": 2.5307679176330566 }, { "auxiliary_loss_clip": 0.01333813, "auxiliary_loss_mlp": 0.01110759, "balance_loss_clip": 1.06653225, "balance_loss_mlp": 1.09006405, "epoch": 0.017736359537050956, "flos": 25223331674880.0, "grad_norm": 1.993056482544507, "language_loss": 0.84062266, "learning_rate": 3.6615715007129453e-06, "loss": 0.86506832, "num_input_tokens_seen": 6162030, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 2.4375, "step": 295, "time_per_iteration": 2.571242332458496 }, { "auxiliary_loss_clip": 0.0134011, "auxiliary_loss_mlp": 0.01112691, "balance_loss_clip": 1.06915569, "balance_loss_mlp": 1.10007691, "epoch": 0.017796482789718925, "flos": 20338798757760.0, "grad_norm": 2.175839863705816, "language_loss": 0.84689069, "learning_rate": 3.6637503595892897e-06, "loss": 0.87141865, "num_input_tokens_seen": 6180540, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 2.40625, "step": 296, "time_per_iteration": 2.501227378845215 }, { "auxiliary_loss_clip": 0.01339548, "auxiliary_loss_mlp": 0.01098849, "balance_loss_clip": 1.05590951, "balance_loss_mlp": 1.09557295, "epoch": 0.017856606042386893, "flos": 22379206832640.0, "grad_norm": 2.2486854336453472, "language_loss": 0.8801049, "learning_rate": 3.665921869855132e-06, "loss": 0.9044888, "num_input_tokens_seen": 6199425, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 2.4375, "step": 297, "time_per_iteration": 2.5258405208587646 }, { "auxiliary_loss_clip": 0.01338417, "auxiliary_loss_mlp": 0.01096924, "balance_loss_clip": 1.05443799, "balance_loss_mlp": 1.09367323, "epoch": 0.017916729295054862, "flos": 20230061310720.0, "grad_norm": 2.829307739060305, "language_loss": 0.88657022, "learning_rate": 3.6680860809130346e-06, "loss": 0.9109236, "num_input_tokens_seen": 6219170, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 2.4375, "step": 298, "time_per_iteration": 2.508406162261963 }, { "auxiliary_loss_clip": 0.01335425, "auxiliary_loss_mlp": 0.01118107, "balance_loss_clip": 1.0730226, "balance_loss_mlp": 1.09604931, "epoch": 0.01797685254772283, "flos": 19390972625280.0, "grad_norm": 2.5733203794026376, "language_loss": 0.88871539, "learning_rate": 3.6702430416690516e-06, "loss": 0.91325068, "num_input_tokens_seen": 6237930, "router_z_loss_clip": 0.45117188, "router_z_loss_mlp": 2.40625, "step": 299, "time_per_iteration": 2.524803638458252 }, { "auxiliary_loss_clip": 0.01341153, "auxiliary_loss_mlp": 0.01103621, "balance_loss_clip": 1.05896485, "balance_loss_mlp": 1.09519386, "epoch": 0.018036975800390802, "flos": 24426007528320.0, "grad_norm": 4.259928524174069, "language_loss": 0.65228081, "learning_rate": 3.672392800539357e-06, "loss": 0.67672849, "num_input_tokens_seen": 6257170, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 2.46875, "step": 300, "time_per_iteration": 2.545060396194458 }, { "auxiliary_loss_clip": 0.01338972, "auxiliary_loss_mlp": 0.0111334, "balance_loss_clip": 1.06870842, "balance_loss_mlp": 1.0971365, "epoch": 0.01809709905305877, "flos": 15778933896960.0, "grad_norm": 2.3953497008946587, "language_loss": 0.88289803, "learning_rate": 3.6745354054567686e-06, "loss": 0.90742117, "num_input_tokens_seen": 6274780, "router_z_loss_clip": 0.44726562, "router_z_loss_mlp": 2.421875, "step": 301, "time_per_iteration": 2.547278642654419 }, { "auxiliary_loss_clip": 0.01218723, "auxiliary_loss_mlp": 0.01090702, "balance_loss_clip": 1.07620609, "balance_loss_mlp": 1.09111476, "epoch": 0.01815722230572674, "flos": 67348382526720.0, "grad_norm": 0.8423080957913763, "language_loss": 0.62148595, "learning_rate": 3.676670903877158e-06, "loss": 0.64458025, "num_input_tokens_seen": 6340435, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 1.2734375, "step": 302, "time_per_iteration": 3.282719612121582 }, { "auxiliary_loss_clip": 0.01329728, "auxiliary_loss_mlp": 0.01104486, "balance_loss_clip": 1.0599966, "balance_loss_mlp": 1.0906105, "epoch": 0.01821734555839471, "flos": 15485615435520.0, "grad_norm": 2.5586757271288385, "language_loss": 0.89616013, "learning_rate": 3.6787993427857567e-06, "loss": 0.92050219, "num_input_tokens_seen": 6358160, "router_z_loss_clip": 0.4453125, "router_z_loss_mlp": 2.375, "step": 303, "time_per_iteration": 2.6277682781219482 }, { "auxiliary_loss_clip": 0.01337535, "auxiliary_loss_mlp": 0.01118174, "balance_loss_clip": 1.07273126, "balance_loss_mlp": 1.0960958, "epoch": 0.018277468811062677, "flos": 24097424889600.0, "grad_norm": 1.732092127909313, "language_loss": 0.80122077, "learning_rate": 3.680920768703364e-06, "loss": 0.82577783, "num_input_tokens_seen": 6378485, "router_z_loss_clip": 0.45507812, "router_z_loss_mlp": 2.40625, "step": 304, "time_per_iteration": 2.5581061840057373 }, { "auxiliary_loss_clip": 0.0133262, "auxiliary_loss_mlp": 0.0109684, "balance_loss_clip": 1.054878, "balance_loss_mlp": 1.09881318, "epoch": 0.01833759206373065, "flos": 20959335141120.0, "grad_norm": 1.6334355927358795, "language_loss": 0.82915628, "learning_rate": 3.6830352276924415e-06, "loss": 0.85345089, "num_input_tokens_seen": 6397845, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 2.34375, "step": 305, "time_per_iteration": 2.532565116882324 }, { "auxiliary_loss_clip": 0.01333978, "auxiliary_loss_mlp": 0.010914, "balance_loss_clip": 1.04977167, "balance_loss_mlp": 1.09198141, "epoch": 0.018397715316398618, "flos": 19390757143680.0, "grad_norm": 2.1362029305169603, "language_loss": 0.91140556, "learning_rate": 3.685142765363119e-06, "loss": 0.93565929, "num_input_tokens_seen": 6416475, "router_z_loss_clip": 0.41601562, "router_z_loss_mlp": 2.421875, "step": 306, "time_per_iteration": 2.5173048973083496 }, { "auxiliary_loss_clip": 0.01326345, "auxiliary_loss_mlp": 0.01092154, "balance_loss_clip": 1.05026364, "balance_loss_mlp": 1.08795166, "epoch": 0.018457838569066586, "flos": 29132531619840.0, "grad_norm": 3.5610546616378205, "language_loss": 0.8670876, "learning_rate": 3.687243426879095e-06, "loss": 0.89127254, "num_input_tokens_seen": 6437520, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 2.390625, "step": 307, "time_per_iteration": 2.573647975921631 }, { "auxiliary_loss_clip": 0.01327885, "auxiliary_loss_mlp": 0.01105593, "balance_loss_clip": 1.05941129, "balance_loss_mlp": 1.09343052, "epoch": 0.018517961821734555, "flos": 19208654167680.0, "grad_norm": 2.3088565469532076, "language_loss": 0.71706998, "learning_rate": 3.6893372569634466e-06, "loss": 0.74140477, "num_input_tokens_seen": 6455680, "router_z_loss_clip": 0.4609375, "router_z_loss_mlp": 2.34375, "step": 308, "time_per_iteration": 3.9615001678466797 }, { "auxiliary_loss_clip": 0.01334543, "auxiliary_loss_mlp": 0.01105104, "balance_loss_clip": 1.06199753, "balance_loss_mlp": 1.09007239, "epoch": 0.018578085074402523, "flos": 19863018184320.0, "grad_norm": 2.2197975549275744, "language_loss": 0.92018056, "learning_rate": 3.6914242999043395e-06, "loss": 0.94457704, "num_input_tokens_seen": 6474880, "router_z_loss_clip": 0.43164062, "router_z_loss_mlp": 2.453125, "step": 309, "time_per_iteration": 3.985370635986328 }, { "auxiliary_loss_clip": 0.01344187, "auxiliary_loss_mlp": 0.01113545, "balance_loss_clip": 1.06814981, "balance_loss_mlp": 1.0924952, "epoch": 0.018638208327070496, "flos": 29606947476480.0, "grad_norm": 2.0706648626427615, "language_loss": 0.72512555, "learning_rate": 3.69350459956065e-06, "loss": 0.74970281, "num_input_tokens_seen": 6495945, "router_z_loss_clip": 0.453125, "router_z_loss_mlp": 2.515625, "step": 310, "time_per_iteration": 2.582362651824951 }, { "auxiliary_loss_clip": 0.01332248, "auxiliary_loss_mlp": 0.01113803, "balance_loss_clip": 1.06974375, "balance_loss_mlp": 1.09596467, "epoch": 0.018698331579738464, "flos": 45731555907840.0, "grad_norm": 2.403984632221903, "language_loss": 0.73799443, "learning_rate": 3.695578199367497e-06, "loss": 0.76245487, "num_input_tokens_seen": 6519930, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 2.359375, "step": 311, "time_per_iteration": 2.7544467449188232 }, { "auxiliary_loss_clip": 0.01341272, "auxiliary_loss_mlp": 0.01111413, "balance_loss_clip": 1.06899858, "balance_loss_mlp": 1.09231508, "epoch": 0.018758454832406433, "flos": 20483662308480.0, "grad_norm": 2.4916581103535953, "language_loss": 0.91628766, "learning_rate": 3.6976451423416825e-06, "loss": 0.9408145, "num_input_tokens_seen": 6535070, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 2.5, "step": 312, "time_per_iteration": 2.516752243041992 }, { "auxiliary_loss_clip": 0.01339628, "auxiliary_loss_mlp": 0.01116127, "balance_loss_clip": 1.07123327, "balance_loss_mlp": 1.09434938, "epoch": 0.0188185780850744, "flos": 15777784661760.0, "grad_norm": 2.4261274611390955, "language_loss": 0.89802587, "learning_rate": 3.699705471087043e-06, "loss": 0.92258346, "num_input_tokens_seen": 6554135, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 2.453125, "step": 313, "time_per_iteration": 2.5052692890167236 }, { "auxiliary_loss_clip": 0.01342791, "auxiliary_loss_mlp": 0.01103021, "balance_loss_clip": 1.05815113, "balance_loss_mlp": 1.09276998, "epoch": 0.018878701337742373, "flos": 22455732758400.0, "grad_norm": 3.8597071017574747, "language_loss": 0.73206389, "learning_rate": 3.7017592277997256e-06, "loss": 0.756522, "num_input_tokens_seen": 6572275, "router_z_loss_clip": 0.44726562, "router_z_loss_mlp": 2.5, "step": 314, "time_per_iteration": 2.5341238975524902 }, { "auxiliary_loss_clip": 0.01330664, "auxiliary_loss_mlp": 0.01108385, "balance_loss_clip": 1.06573224, "balance_loss_mlp": 1.09082556, "epoch": 0.018938824590410342, "flos": 30993530238720.0, "grad_norm": 2.7948599373201866, "language_loss": 0.89708591, "learning_rate": 3.7038064542733654e-06, "loss": 0.92147636, "num_input_tokens_seen": 6594520, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 2.40625, "step": 315, "time_per_iteration": 2.602092981338501 }, { "auxiliary_loss_clip": 0.01331552, "auxiliary_loss_mlp": 0.01098419, "balance_loss_clip": 1.05452645, "balance_loss_mlp": 1.09172845, "epoch": 0.01899894784307831, "flos": 23258910821760.0, "grad_norm": 1.8847260430390418, "language_loss": 0.80452079, "learning_rate": 3.7058471919041945e-06, "loss": 0.82882047, "num_input_tokens_seen": 6614245, "router_z_loss_clip": 0.43945312, "router_z_loss_mlp": 2.40625, "step": 316, "time_per_iteration": 2.522705554962158 }, { "auxiliary_loss_clip": 0.01326122, "auxiliary_loss_mlp": 0.01096977, "balance_loss_clip": 1.05430007, "balance_loss_mlp": 1.089185, "epoch": 0.01905907109574628, "flos": 17457901367040.0, "grad_norm": 2.2552452065708652, "language_loss": 0.90320945, "learning_rate": 3.7078814816960605e-06, "loss": 0.9274404, "num_input_tokens_seen": 6632015, "router_z_loss_clip": 0.42773438, "router_z_loss_mlp": 2.375, "step": 317, "time_per_iteration": 2.5083999633789062 }, { "auxiliary_loss_clip": 0.01324122, "auxiliary_loss_mlp": 0.01098099, "balance_loss_clip": 1.054039, "balance_loss_mlp": 1.08887863, "epoch": 0.019119194348414248, "flos": 14970225139200.0, "grad_norm": 2.4245767671309144, "language_loss": 0.91063154, "learning_rate": 3.709909364265374e-06, "loss": 0.93485373, "num_input_tokens_seen": 6649015, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 2.359375, "step": 318, "time_per_iteration": 2.4710183143615723 }, { "auxiliary_loss_clip": 0.01325041, "auxiliary_loss_mlp": 0.01087098, "balance_loss_clip": 1.0460186, "balance_loss_mlp": 1.08735704, "epoch": 0.01917931760108222, "flos": 25482822503040.0, "grad_norm": 4.341422018094531, "language_loss": 0.93924999, "learning_rate": 3.7119308798459706e-06, "loss": 0.9633714, "num_input_tokens_seen": 6669225, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 2.375, "step": 319, "time_per_iteration": 2.555241346359253 }, { "auxiliary_loss_clip": 0.01215044, "auxiliary_loss_mlp": 0.01142321, "balance_loss_clip": 1.12954223, "balance_loss_mlp": 1.08222866, "epoch": 0.01923944085375019, "flos": 71556967353600.0, "grad_norm": 0.9389228822403781, "language_loss": 0.59850258, "learning_rate": 3.7139460682939026e-06, "loss": 0.62207621, "num_input_tokens_seen": 6725775, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 1.328125, "step": 320, "time_per_iteration": 3.044332504272461 }, { "auxiliary_loss_clip": 0.01322676, "auxiliary_loss_mlp": 0.01100977, "balance_loss_clip": 1.05863404, "balance_loss_mlp": 1.08743227, "epoch": 0.019299564106418157, "flos": 19682495406720.0, "grad_norm": 2.3170134563612543, "language_loss": 0.90033007, "learning_rate": 3.715954969092154e-06, "loss": 0.92456663, "num_input_tokens_seen": 6744170, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 2.359375, "step": 321, "time_per_iteration": 2.500312566757202 }, { "auxiliary_loss_clip": 0.01332895, "auxiliary_loss_mlp": 0.01116442, "balance_loss_clip": 1.07274008, "balance_loss_mlp": 1.09163702, "epoch": 0.019359687359086126, "flos": 24387151991040.0, "grad_norm": 2.4830634465364465, "language_loss": 0.8275491, "learning_rate": 3.7179576213552805e-06, "loss": 0.85204256, "num_input_tokens_seen": 6764565, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 2.40625, "step": 322, "time_per_iteration": 2.554072380065918 }, { "auxiliary_loss_clip": 0.01332217, "auxiliary_loss_mlp": 0.010884, "balance_loss_clip": 1.04774988, "balance_loss_mlp": 1.0896678, "epoch": 0.019419810611754094, "flos": 23951376190080.0, "grad_norm": 2.425725583358623, "language_loss": 0.72544193, "learning_rate": 3.719954063833981e-06, "loss": 0.74964815, "num_input_tokens_seen": 6785310, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 2.4375, "step": 323, "time_per_iteration": 2.5795133113861084 }, { "auxiliary_loss_clip": 0.01322425, "auxiliary_loss_mlp": 0.01089664, "balance_loss_clip": 1.04744017, "balance_loss_mlp": 1.08477235, "epoch": 0.019479933864422067, "flos": 22160223567360.0, "grad_norm": 1.935234999304951, "language_loss": 0.92461991, "learning_rate": 3.721944334919596e-06, "loss": 0.94874084, "num_input_tokens_seen": 6803290, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 2.375, "step": 324, "time_per_iteration": 2.5986714363098145 }, { "auxiliary_loss_clip": 0.01330378, "auxiliary_loss_mlp": 0.01090017, "balance_loss_clip": 1.04953396, "balance_loss_mlp": 1.09102356, "epoch": 0.019540057117090035, "flos": 22236821320320.0, "grad_norm": 2.7938312238503236, "language_loss": 0.65501744, "learning_rate": 3.7239284726485375e-06, "loss": 0.67922139, "num_input_tokens_seen": 6822570, "router_z_loss_clip": 0.40429688, "router_z_loss_mlp": 2.390625, "step": 325, "time_per_iteration": 2.5224552154541016 }, { "auxiliary_loss_clip": 0.01330348, "auxiliary_loss_mlp": 0.01099276, "balance_loss_clip": 1.05657554, "balance_loss_mlp": 1.09648824, "epoch": 0.019600180369758004, "flos": 23076771932160.0, "grad_norm": 1.7498062313669267, "language_loss": 0.7645545, "learning_rate": 3.72590651470665e-06, "loss": 0.78885078, "num_input_tokens_seen": 6841910, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 2.328125, "step": 326, "time_per_iteration": 2.568861961364746 }, { "auxiliary_loss_clip": 0.01321812, "auxiliary_loss_mlp": 0.01099188, "balance_loss_clip": 1.05725014, "balance_loss_mlp": 1.09118927, "epoch": 0.019660303622425972, "flos": 25410857604480.0, "grad_norm": 2.3727911100053034, "language_loss": 0.79531157, "learning_rate": 3.727878498433505e-06, "loss": 0.81952155, "num_input_tokens_seen": 6862480, "router_z_loss_clip": 0.41992188, "router_z_loss_mlp": 2.3125, "step": 327, "time_per_iteration": 2.5558691024780273 }, { "auxiliary_loss_clip": 0.01329837, "auxiliary_loss_mlp": 0.01113545, "balance_loss_clip": 1.07184565, "balance_loss_mlp": 1.09200382, "epoch": 0.01972042687509394, "flos": 23657519024640.0, "grad_norm": 23.172246408293073, "language_loss": 0.80757403, "learning_rate": 3.7298444608266328e-06, "loss": 0.83200783, "num_input_tokens_seen": 6882015, "router_z_loss_clip": 0.41601562, "router_z_loss_mlp": 2.375, "step": 328, "time_per_iteration": 2.5691726207733154 }, { "auxiliary_loss_clip": 0.01326652, "auxiliary_loss_mlp": 0.0109407, "balance_loss_clip": 1.05079675, "balance_loss_mlp": 1.08625293, "epoch": 0.019780550127761913, "flos": 18223480869120.0, "grad_norm": 2.536768821940242, "language_loss": 0.93699944, "learning_rate": 3.731804438545683e-06, "loss": 0.96120667, "num_input_tokens_seen": 6899785, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 2.40625, "step": 329, "time_per_iteration": 2.4940669536590576 }, { "auxiliary_loss_clip": 0.0133633, "auxiliary_loss_mlp": 0.01106727, "balance_loss_clip": 1.06412125, "balance_loss_mlp": 1.09289241, "epoch": 0.01984067338042988, "flos": 22418780641920.0, "grad_norm": 3.526948740383348, "language_loss": 0.74846971, "learning_rate": 3.7337584679165324e-06, "loss": 0.77290022, "num_input_tokens_seen": 6918575, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 2.4375, "step": 330, "time_per_iteration": 2.5228540897369385 }, { "auxiliary_loss_clip": 0.01331365, "auxiliary_loss_mlp": 0.01121765, "balance_loss_clip": 1.07882619, "balance_loss_mlp": 1.09069121, "epoch": 0.01990079663309785, "flos": 17055199013760.0, "grad_norm": 3.4813178823992392, "language_loss": 0.93813872, "learning_rate": 3.7357065849353186e-06, "loss": 0.96267009, "num_input_tokens_seen": 6936965, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 2.40625, "step": 331, "time_per_iteration": 2.527099847793579 }, { "auxiliary_loss_clip": 0.01317588, "auxiliary_loss_mlp": 0.01094628, "balance_loss_clip": 1.05466914, "balance_loss_mlp": 1.08747745, "epoch": 0.01996091988576582, "flos": 15961791058560.0, "grad_norm": 2.2053906139645707, "language_loss": 0.92777777, "learning_rate": 3.737648825272422e-06, "loss": 0.95189989, "num_input_tokens_seen": 6953475, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 2.296875, "step": 332, "time_per_iteration": 2.508551836013794 }, { "auxiliary_loss_clip": 0.01325064, "auxiliary_loss_mlp": 0.01097199, "balance_loss_clip": 1.05468869, "balance_loss_mlp": 1.09235787, "epoch": 0.02002104313843379, "flos": 23586451966080.0, "grad_norm": 2.597614842862832, "language_loss": 0.75640428, "learning_rate": 3.739585224276384e-06, "loss": 0.78062683, "num_input_tokens_seen": 6971630, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 2.328125, "step": 333, "time_per_iteration": 2.5378963947296143 }, { "auxiliary_loss_clip": 0.01328691, "auxiliary_loss_mlp": 0.01091264, "balance_loss_clip": 1.05039907, "balance_loss_mlp": 1.09161329, "epoch": 0.02008116639110176, "flos": 34094883352320.0, "grad_norm": 3.329228554728317, "language_loss": 0.79130971, "learning_rate": 3.7415158169777673e-06, "loss": 0.81550926, "num_input_tokens_seen": 6992775, "router_z_loss_clip": 0.40820312, "router_z_loss_mlp": 2.375, "step": 334, "time_per_iteration": 2.6625661849975586 }, { "auxiliary_loss_clip": 0.01325239, "auxiliary_loss_mlp": 0.01099303, "balance_loss_clip": 1.05519533, "balance_loss_mlp": 1.08508921, "epoch": 0.020141289643769728, "flos": 19683716469120.0, "grad_norm": 1.7450379640864333, "language_loss": 0.8321054, "learning_rate": 3.7434406380929575e-06, "loss": 0.85635084, "num_input_tokens_seen": 7011425, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 2.40625, "step": 335, "time_per_iteration": 2.5004706382751465 }, { "auxiliary_loss_clip": 0.01323352, "auxiliary_loss_mlp": 0.01094761, "balance_loss_clip": 1.05356216, "balance_loss_mlp": 1.088696, "epoch": 0.020201412896437697, "flos": 20740567357440.0, "grad_norm": 3.426061557872147, "language_loss": 0.92242247, "learning_rate": 3.745359722027911e-06, "loss": 0.94660354, "num_input_tokens_seen": 7029450, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 2.34375, "step": 336, "time_per_iteration": 2.5425310134887695 }, { "auxiliary_loss_clip": 0.01323528, "auxiliary_loss_mlp": 0.01088767, "balance_loss_clip": 1.04775894, "balance_loss_mlp": 1.08675027, "epoch": 0.020261536149105665, "flos": 20266510636800.0, "grad_norm": 1.7277665879107293, "language_loss": 0.88289708, "learning_rate": 3.7472731028818428e-06, "loss": 0.90701997, "num_input_tokens_seen": 7047555, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 2.359375, "step": 337, "time_per_iteration": 2.545161485671997 }, { "auxiliary_loss_clip": 0.01311723, "auxiliary_loss_mlp": 0.01104969, "balance_loss_clip": 1.06176722, "balance_loss_mlp": 1.08334804, "epoch": 0.020321659401773638, "flos": 25848752307840.0, "grad_norm": 1.6442992502216442, "language_loss": 0.89967299, "learning_rate": 3.7491808144508626e-06, "loss": 0.92383987, "num_input_tokens_seen": 7068185, "router_z_loss_clip": 0.43164062, "router_z_loss_mlp": 2.28125, "step": 338, "time_per_iteration": 2.5727241039276123 }, { "auxiliary_loss_clip": 0.01324649, "auxiliary_loss_mlp": 0.01106535, "balance_loss_clip": 1.06443036, "balance_loss_mlp": 1.08821774, "epoch": 0.020381782654441606, "flos": 17495033051520.0, "grad_norm": 2.3391666645876774, "language_loss": 0.84997487, "learning_rate": 3.7510828902315576e-06, "loss": 0.87428677, "num_input_tokens_seen": 7085955, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 2.375, "step": 339, "time_per_iteration": 2.5549676418304443 }, { "auxiliary_loss_clip": 0.01330428, "auxiliary_loss_mlp": 0.01097985, "balance_loss_clip": 1.05440187, "balance_loss_mlp": 1.0916903, "epoch": 0.020441905907109575, "flos": 24243940465920.0, "grad_norm": 1.7521073224724872, "language_loss": 0.88754416, "learning_rate": 3.75297936342452e-06, "loss": 0.91182828, "num_input_tokens_seen": 7106345, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 2.390625, "step": 340, "time_per_iteration": 2.5222108364105225 }, { "auxiliary_loss_clip": 0.0132409, "auxiliary_loss_mlp": 0.01088374, "balance_loss_clip": 1.04417086, "balance_loss_mlp": 1.08785892, "epoch": 0.020502029159777543, "flos": 22233301787520.0, "grad_norm": 1.9925351408118266, "language_loss": 0.88413811, "learning_rate": 3.7548702669378253e-06, "loss": 0.90826273, "num_input_tokens_seen": 7125070, "router_z_loss_clip": 0.44140625, "router_z_loss_mlp": 2.359375, "step": 341, "time_per_iteration": 2.508939504623413 }, { "auxiliary_loss_clip": 0.01326806, "auxiliary_loss_mlp": 0.0109998, "balance_loss_clip": 1.0576607, "balance_loss_mlp": 1.08755732, "epoch": 0.020562152412445512, "flos": 23987861429760.0, "grad_norm": 2.3788760348579765, "language_loss": 0.80963254, "learning_rate": 3.756755633390458e-06, "loss": 0.83390045, "num_input_tokens_seen": 7144675, "router_z_loss_clip": 0.421875, "router_z_loss_mlp": 2.390625, "step": 342, "time_per_iteration": 2.5338046550750732 }, { "auxiliary_loss_clip": 0.01315573, "auxiliary_loss_mlp": 0.0109864, "balance_loss_clip": 1.05386496, "balance_loss_mlp": 1.08529854, "epoch": 0.020622275665113484, "flos": 26975305537920.0, "grad_norm": 1.7650570275888082, "language_loss": 0.89595807, "learning_rate": 3.7586354951156886e-06, "loss": 0.92010021, "num_input_tokens_seen": 7165505, "router_z_loss_clip": 0.44921875, "router_z_loss_mlp": 2.296875, "step": 343, "time_per_iteration": 2.6203551292419434 }, { "auxiliary_loss_clip": 0.01326332, "auxiliary_loss_mlp": 0.01097587, "balance_loss_clip": 1.05655479, "balance_loss_mlp": 1.09214187, "epoch": 0.020682398917781453, "flos": 22600704049920.0, "grad_norm": 4.754426759825993, "language_loss": 0.78314829, "learning_rate": 3.7605098841644e-06, "loss": 0.80738753, "num_input_tokens_seen": 7184605, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 2.34375, "step": 344, "time_per_iteration": 2.525099039077759 }, { "auxiliary_loss_clip": 0.01312363, "auxiliary_loss_mlp": 0.01104114, "balance_loss_clip": 1.06048369, "balance_loss_mlp": 1.08571482, "epoch": 0.02074252217044942, "flos": 15013605790080.0, "grad_norm": 1.7740621392052416, "language_loss": 0.75231695, "learning_rate": 3.7623788323083666e-06, "loss": 0.77648175, "num_input_tokens_seen": 7203065, "router_z_loss_clip": 0.43554688, "router_z_loss_mlp": 2.265625, "step": 345, "time_per_iteration": 2.478710889816284 }, { "auxiliary_loss_clip": 0.0131957, "auxiliary_loss_mlp": 0.01105863, "balance_loss_clip": 1.06299531, "balance_loss_mlp": 1.09040451, "epoch": 0.02080264542311739, "flos": 25337958952320.0, "grad_norm": 2.0583889738570806, "language_loss": 0.90386617, "learning_rate": 3.7642423710434837e-06, "loss": 0.92812049, "num_input_tokens_seen": 7222995, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 2.28125, "step": 346, "time_per_iteration": 2.548940896987915 }, { "auxiliary_loss_clip": 0.0131547, "auxiliary_loss_mlp": 0.01097653, "balance_loss_clip": 1.05781353, "balance_loss_mlp": 1.08616948, "epoch": 0.02086276867578536, "flos": 24388804016640.0, "grad_norm": 2.099065297385466, "language_loss": 0.78899336, "learning_rate": 3.7661005315929563e-06, "loss": 0.81312454, "num_input_tokens_seen": 7244625, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 2.28125, "step": 347, "time_per_iteration": 2.5569944381713867 }, { "auxiliary_loss_clip": 0.01318964, "auxiliary_loss_mlp": 0.0109768, "balance_loss_clip": 1.05407321, "balance_loss_mlp": 1.09097588, "epoch": 0.02092289192845333, "flos": 24462205459200.0, "grad_norm": 1.9991572878064565, "language_loss": 0.71241486, "learning_rate": 3.7679533449104354e-06, "loss": 0.73658121, "num_input_tokens_seen": 7263255, "router_z_loss_clip": 0.4375, "router_z_loss_mlp": 2.28125, "step": 348, "time_per_iteration": 2.6028711795806885 }, { "auxiliary_loss_clip": 0.01321072, "auxiliary_loss_mlp": 0.01106332, "balance_loss_clip": 1.06291628, "balance_loss_mlp": 1.08669186, "epoch": 0.0209830151811213, "flos": 17451185523840.0, "grad_norm": 2.476652342190157, "language_loss": 0.76953119, "learning_rate": 3.7698008416831116e-06, "loss": 0.79380524, "num_input_tokens_seen": 7279275, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 2.34375, "step": 349, "time_per_iteration": 2.4514026641845703 }, { "auxiliary_loss_clip": 0.013058, "auxiliary_loss_mlp": 0.01100348, "balance_loss_clip": 1.05922079, "balance_loss_mlp": 1.0854032, "epoch": 0.021043138433789268, "flos": 24573995562240.0, "grad_norm": 1.798800769926751, "language_loss": 0.85107028, "learning_rate": 3.7716430523347664e-06, "loss": 0.87513185, "num_input_tokens_seen": 7300180, "router_z_loss_clip": 0.41210938, "router_z_loss_mlp": 2.203125, "step": 350, "time_per_iteration": 6.938042163848877 }, { "auxiliary_loss_clip": 0.01315464, "auxiliary_loss_mlp": 0.01092779, "balance_loss_clip": 1.0537498, "balance_loss_mlp": 1.09038782, "epoch": 0.021103261686457236, "flos": 24454053072000.0, "grad_norm": 2.1743756609095173, "language_loss": 0.79864544, "learning_rate": 3.773480007028776e-06, "loss": 0.8227278, "num_input_tokens_seen": 7317430, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 2.25, "step": 351, "time_per_iteration": 2.490898609161377 }, { "auxiliary_loss_clip": 0.01321757, "auxiliary_loss_mlp": 0.01105725, "balance_loss_clip": 1.06233275, "balance_loss_mlp": 1.0902462, "epoch": 0.021163384939125205, "flos": 14683083816960.0, "grad_norm": 2.0008938414393294, "language_loss": 0.87708282, "learning_rate": 3.775311735671078e-06, "loss": 0.90135759, "num_input_tokens_seen": 7334875, "router_z_loss_clip": 0.43359375, "router_z_loss_mlp": 2.3125, "step": 352, "time_per_iteration": 2.458524703979492 }, { "auxiliary_loss_clip": 0.01314514, "auxiliary_loss_mlp": 0.01103913, "balance_loss_clip": 1.06154585, "balance_loss_mlp": 1.08895445, "epoch": 0.021223508191793177, "flos": 24493195918080.0, "grad_norm": 5.038840163336693, "language_loss": 0.82542479, "learning_rate": 3.7771382679130878e-06, "loss": 0.84960902, "num_input_tokens_seen": 7355185, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 2.25, "step": 353, "time_per_iteration": 2.512474536895752 }, { "auxiliary_loss_clip": 0.01309862, "auxiliary_loss_mlp": 0.01095757, "balance_loss_clip": 1.05548787, "balance_loss_mlp": 1.0867815, "epoch": 0.021283631444461146, "flos": 24126978804480.0, "grad_norm": 3.622872785270022, "language_loss": 0.80926037, "learning_rate": 3.7789596331545845e-06, "loss": 0.83331656, "num_input_tokens_seen": 7374425, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 2.21875, "step": 354, "time_per_iteration": 2.525636911392212 }, { "auxiliary_loss_clip": 0.01317334, "auxiliary_loss_mlp": 0.01094517, "balance_loss_clip": 1.05145884, "balance_loss_mlp": 1.08542418, "epoch": 0.021343754697129114, "flos": 25192233475200.0, "grad_norm": 2.457496721979758, "language_loss": 0.8090387, "learning_rate": 3.780775860546545e-06, "loss": 0.83315718, "num_input_tokens_seen": 7394175, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 2.3125, "step": 355, "time_per_iteration": 2.536756753921509 }, { "auxiliary_loss_clip": 0.0131292, "auxiliary_loss_mlp": 0.01092633, "balance_loss_clip": 1.05191088, "balance_loss_mlp": 1.08477998, "epoch": 0.021403877949797083, "flos": 17274182279040.0, "grad_norm": 2.307648166706137, "language_loss": 0.89465719, "learning_rate": 3.7825869789939474e-06, "loss": 0.91871274, "num_input_tokens_seen": 7412645, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 2.28125, "step": 356, "time_per_iteration": 2.493438959121704 }, { "auxiliary_loss_clip": 0.01309232, "auxiliary_loss_mlp": 0.01085736, "balance_loss_clip": 1.04332173, "balance_loss_mlp": 1.08655775, "epoch": 0.021464001202465055, "flos": 30917435276160.0, "grad_norm": 2.675082225228011, "language_loss": 0.80234122, "learning_rate": 3.784393017158528e-06, "loss": 0.82629097, "num_input_tokens_seen": 7432275, "router_z_loss_clip": 0.42382812, "router_z_loss_mlp": 2.21875, "step": 357, "time_per_iteration": 2.5662033557891846 }, { "auxiliary_loss_clip": 0.01310532, "auxiliary_loss_mlp": 0.01087119, "balance_loss_clip": 1.04885268, "balance_loss_mlp": 1.08397782, "epoch": 0.021524124455133024, "flos": 18186385098240.0, "grad_norm": 3.929392883060757, "language_loss": 0.76885372, "learning_rate": 3.786194003461506e-06, "loss": 0.79283023, "num_input_tokens_seen": 7450245, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 2.265625, "step": 358, "time_per_iteration": 2.4756102561950684 }, { "auxiliary_loss_clip": 0.01309101, "auxiliary_loss_mlp": 0.01093541, "balance_loss_clip": 1.05095935, "balance_loss_mlp": 1.0826683, "epoch": 0.021584247707800992, "flos": 13805786039040.0, "grad_norm": 2.339565077482264, "language_loss": 0.88794184, "learning_rate": 3.787989966086264e-06, "loss": 0.91196835, "num_input_tokens_seen": 7466845, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 2.265625, "step": 359, "time_per_iteration": 2.458895683288574 }, { "auxiliary_loss_clip": 0.01318057, "auxiliary_loss_mlp": 0.01090228, "balance_loss_clip": 1.051247, "balance_loss_mlp": 1.08772588, "epoch": 0.02164437096046896, "flos": 23294713703040.0, "grad_norm": 3.53871238592718, "language_loss": 0.7587136, "learning_rate": 3.789780932980997e-06, "loss": 0.78279638, "num_input_tokens_seen": 7485450, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 2.3125, "step": 360, "time_per_iteration": 2.5144340991973877 }, { "auxiliary_loss_clip": 0.01196413, "auxiliary_loss_mlp": 0.01107018, "balance_loss_clip": 1.09657538, "balance_loss_mlp": 1.07073259, "epoch": 0.02170449421313693, "flos": 68899578341760.0, "grad_norm": 0.8528611824429059, "language_loss": 0.64872646, "learning_rate": 3.79156693186132e-06, "loss": 0.6717608, "num_input_tokens_seen": 7553780, "router_z_loss_clip": 0.10449219, "router_z_loss_mlp": 1.2578125, "step": 361, "time_per_iteration": 3.24495005607605 }, { "auxiliary_loss_clip": 0.01306863, "auxiliary_loss_mlp": 0.01088385, "balance_loss_clip": 1.04785419, "balance_loss_mlp": 1.08032274, "epoch": 0.0217646174658049, "flos": 25228539146880.0, "grad_norm": 3.190326128185742, "language_loss": 0.78388387, "learning_rate": 3.7933479902128433e-06, "loss": 0.80783635, "num_input_tokens_seen": 7574155, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 2.265625, "step": 362, "time_per_iteration": 2.571192979812622 }, { "auxiliary_loss_clip": 0.0131143, "auxiliary_loss_mlp": 0.01095051, "balance_loss_clip": 1.05490112, "balance_loss_mlp": 1.08420289, "epoch": 0.02182474071847287, "flos": 22893124671360.0, "grad_norm": 2.0580148648615517, "language_loss": 0.92470711, "learning_rate": 3.7951241352937077e-06, "loss": 0.94877195, "num_input_tokens_seen": 7592320, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 2.265625, "step": 363, "time_per_iteration": 2.496018648147583 }, { "auxiliary_loss_clip": 0.01307637, "auxiliary_loss_mlp": 0.01103435, "balance_loss_clip": 1.06328547, "balance_loss_mlp": 1.08313799, "epoch": 0.02188486397114084, "flos": 23658991482240.0, "grad_norm": 2.0084308262136314, "language_loss": 0.89521313, "learning_rate": 3.7968953941370915e-06, "loss": 0.9193238, "num_input_tokens_seen": 7611185, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 2.25, "step": 364, "time_per_iteration": 2.555741548538208 }, { "auxiliary_loss_clip": 0.01312853, "auxiliary_loss_mlp": 0.01091796, "balance_loss_clip": 1.05040646, "balance_loss_mlp": 1.08668709, "epoch": 0.021944987223808807, "flos": 21543637680000.0, "grad_norm": 2.4365955151018808, "language_loss": 0.79513538, "learning_rate": 3.798661793553676e-06, "loss": 0.81918192, "num_input_tokens_seen": 7631970, "router_z_loss_clip": 0.4140625, "router_z_loss_mlp": 2.25, "step": 365, "time_per_iteration": 2.493415594100952 }, { "auxiliary_loss_clip": 0.01307764, "auxiliary_loss_mlp": 0.01098802, "balance_loss_clip": 1.05567217, "balance_loss_mlp": 1.08414006, "epoch": 0.022005110476476776, "flos": 16070887641600.0, "grad_norm": 2.026571452187941, "language_loss": 0.84473419, "learning_rate": 3.8004233601340808e-06, "loss": 0.86879981, "num_input_tokens_seen": 7649745, "router_z_loss_clip": 0.43164062, "router_z_loss_mlp": 2.25, "step": 366, "time_per_iteration": 2.4871299266815186 }, { "auxiliary_loss_clip": 0.01313116, "auxiliary_loss_mlp": 0.0109043, "balance_loss_clip": 1.05187726, "balance_loss_mlp": 1.0860281, "epoch": 0.022065233729144748, "flos": 21433715084160.0, "grad_norm": 2.2919336059409456, "language_loss": 0.87237614, "learning_rate": 3.8021801202512694e-06, "loss": 0.8964116, "num_input_tokens_seen": 7668830, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 2.28125, "step": 367, "time_per_iteration": 2.480649471282959 }, { "auxiliary_loss_clip": 0.0131595, "auxiliary_loss_mlp": 0.01094107, "balance_loss_clip": 1.05150151, "balance_loss_mlp": 1.08498836, "epoch": 0.022125356981812717, "flos": 21543709507200.0, "grad_norm": 2.419002984625343, "language_loss": 0.8476246, "learning_rate": 3.803932100062912e-06, "loss": 0.8717252, "num_input_tokens_seen": 7687240, "router_z_loss_clip": 0.42578125, "router_z_loss_mlp": 2.3125, "step": 368, "time_per_iteration": 2.5121419429779053 }, { "auxiliary_loss_clip": 0.01315228, "auxiliary_loss_mlp": 0.01081267, "balance_loss_clip": 1.04097426, "balance_loss_mlp": 1.083462, "epoch": 0.022185480234480685, "flos": 20704153944960.0, "grad_norm": 2.470816731725873, "language_loss": 0.75492239, "learning_rate": 3.8056793255137264e-06, "loss": 0.77888733, "num_input_tokens_seen": 7704440, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 2.328125, "step": 369, "time_per_iteration": 2.4871225357055664 }, { "auxiliary_loss_clip": 0.01308342, "auxiliary_loss_mlp": 0.01101489, "balance_loss_clip": 1.06143427, "balance_loss_mlp": 1.08407533, "epoch": 0.022245603487148654, "flos": 25193203142400.0, "grad_norm": 2.1252870798092, "language_loss": 0.8265397, "learning_rate": 3.8074218223377844e-06, "loss": 0.85063803, "num_input_tokens_seen": 7727160, "router_z_loss_clip": 0.40039062, "router_z_loss_mlp": 2.234375, "step": 370, "time_per_iteration": 2.565810203552246 }, { "auxiliary_loss_clip": 0.01307158, "auxiliary_loss_mlp": 0.01104251, "balance_loss_clip": 1.06360102, "balance_loss_mlp": 1.08266723, "epoch": 0.022305726739816623, "flos": 21395936954880.0, "grad_norm": 1.8883374502640995, "language_loss": 0.81681871, "learning_rate": 3.8091596160607834e-06, "loss": 0.84093285, "num_input_tokens_seen": 7747730, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 2.25, "step": 371, "time_per_iteration": 2.491245985031128 }, { "auxiliary_loss_clip": 0.01312827, "auxiliary_loss_mlp": 0.01097649, "balance_loss_clip": 1.05583048, "balance_loss_mlp": 1.0880779, "epoch": 0.022365849992484595, "flos": 22492146170880.0, "grad_norm": 1.929291293430794, "language_loss": 0.83182752, "learning_rate": 3.8108927320022896e-06, "loss": 0.85593224, "num_input_tokens_seen": 7766765, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 2.25, "step": 372, "time_per_iteration": 2.5471487045288086 }, { "auxiliary_loss_clip": 0.013053, "auxiliary_loss_mlp": 0.01094615, "balance_loss_clip": 1.05370235, "balance_loss_mlp": 1.08386409, "epoch": 0.022425973245152563, "flos": 17856581397120.0, "grad_norm": 3.924114074741921, "language_loss": 0.78555596, "learning_rate": 3.8126211952779548e-06, "loss": 0.80955505, "num_input_tokens_seen": 7784010, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 2.21875, "step": 373, "time_per_iteration": 2.4561538696289062 }, { "auxiliary_loss_clip": 0.01310007, "auxiliary_loss_mlp": 0.01091677, "balance_loss_clip": 1.04978728, "balance_loss_mlp": 1.08613682, "epoch": 0.022486096497820532, "flos": 15483029656320.0, "grad_norm": 2.2392205952260427, "language_loss": 0.7784096, "learning_rate": 3.8143450308016952e-06, "loss": 0.80242646, "num_input_tokens_seen": 7801305, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 2.25, "step": 374, "time_per_iteration": 2.471271276473999 }, { "auxiliary_loss_clip": 0.01300133, "auxiliary_loss_mlp": 0.01075731, "balance_loss_clip": 1.03403163, "balance_loss_mlp": 1.07669485, "epoch": 0.0225462197504885, "flos": 27784157950080.0, "grad_norm": 1.807934159647282, "language_loss": 0.86194676, "learning_rate": 3.8160642632878525e-06, "loss": 0.88570547, "num_input_tokens_seen": 7823965, "router_z_loss_clip": 0.41796875, "router_z_loss_mlp": 2.234375, "step": 375, "time_per_iteration": 2.560253381729126 }, { "auxiliary_loss_clip": 0.01307934, "auxiliary_loss_mlp": 0.01100467, "balance_loss_clip": 1.05762291, "balance_loss_mlp": 1.08587635, "epoch": 0.02260634300315647, "flos": 19975490645760.0, "grad_norm": 2.056028409937585, "language_loss": 0.89064789, "learning_rate": 3.817778917253314e-06, "loss": 0.91473192, "num_input_tokens_seen": 7842115, "router_z_loss_clip": 0.4296875, "router_z_loss_mlp": 2.21875, "step": 376, "time_per_iteration": 2.5093841552734375 }, { "auxiliary_loss_clip": 0.01308244, "auxiliary_loss_mlp": 0.01089946, "balance_loss_clip": 1.05103588, "balance_loss_mlp": 1.0801723, "epoch": 0.02266646625582444, "flos": 16028189349120.0, "grad_norm": 2.111240959061147, "language_loss": 0.74903786, "learning_rate": 3.8194890170196155e-06, "loss": 0.77301979, "num_input_tokens_seen": 7857830, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 2.28125, "step": 377, "time_per_iteration": 2.458573579788208 }, { "auxiliary_loss_clip": 0.01298906, "auxiliary_loss_mlp": 0.01090075, "balance_loss_clip": 1.04904342, "balance_loss_mlp": 1.08354282, "epoch": 0.02272658950849241, "flos": 20404622430720.0, "grad_norm": 2.403800638153332, "language_loss": 0.99200201, "learning_rate": 3.8211945867150055e-06, "loss": 1.01589179, "num_input_tokens_seen": 7875840, "router_z_loss_clip": 0.41015625, "router_z_loss_mlp": 2.15625, "step": 378, "time_per_iteration": 2.5031659603118896 }, { "auxiliary_loss_clip": 0.01197255, "auxiliary_loss_mlp": 0.01050853, "balance_loss_clip": 1.03945684, "balance_loss_mlp": 1.08154607, "epoch": 0.02278671276116038, "flos": 69847332647040.0, "grad_norm": 0.9905124252632194, "language_loss": 0.75387597, "learning_rate": 3.822895650276492e-06, "loss": 0.77635705, "num_input_tokens_seen": 7940190, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 1.15625, "step": 379, "time_per_iteration": 3.1521594524383545 }, { "auxiliary_loss_clip": 0.0130888, "auxiliary_loss_mlp": 0.01086634, "balance_loss_clip": 1.04786694, "balance_loss_mlp": 1.07978153, "epoch": 0.022846836013828347, "flos": 38508771340800.0, "grad_norm": 2.624591598118141, "language_loss": 0.78621769, "learning_rate": 3.824592231451859e-06, "loss": 0.81017286, "num_input_tokens_seen": 7960840, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 2.28125, "step": 380, "time_per_iteration": 2.6523005962371826 }, { "auxiliary_loss_clip": 0.01300747, "auxiliary_loss_mlp": 0.01088247, "balance_loss_clip": 1.05033839, "balance_loss_mlp": 1.0815773, "epoch": 0.02290695926649632, "flos": 20959478795520.0, "grad_norm": 2.2896858732571497, "language_loss": 0.96765554, "learning_rate": 3.826284353801652e-06, "loss": 0.9915455, "num_input_tokens_seen": 7975500, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 2.1875, "step": 381, "time_per_iteration": 2.5140540599823 }, { "auxiliary_loss_clip": 0.01309562, "auxiliary_loss_mlp": 0.01094721, "balance_loss_clip": 1.0550487, "balance_loss_mlp": 1.08318126, "epoch": 0.022967082519164288, "flos": 24022407335040.0, "grad_norm": 2.9462675446442246, "language_loss": 0.87759614, "learning_rate": 3.827972040701142e-06, "loss": 0.90163898, "num_input_tokens_seen": 7993880, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 2.265625, "step": 382, "time_per_iteration": 2.5225658416748047 }, { "auxiliary_loss_clip": 0.01302332, "auxiliary_loss_mlp": 0.01098178, "balance_loss_clip": 1.05936337, "balance_loss_mlp": 1.0830723, "epoch": 0.023027205771832256, "flos": 20997149184000.0, "grad_norm": 2.226403593117741, "language_loss": 0.84774226, "learning_rate": 3.829655315342268e-06, "loss": 0.87174737, "num_input_tokens_seen": 8012730, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 2.1875, "step": 383, "time_per_iteration": 2.4713456630706787 }, { "auxiliary_loss_clip": 0.01299138, "auxiliary_loss_mlp": 0.01110141, "balance_loss_clip": 1.07149339, "balance_loss_mlp": 1.08290935, "epoch": 0.023087329024500225, "flos": 21360816432000.0, "grad_norm": 2.2251958712876774, "language_loss": 0.83514428, "learning_rate": 3.831334200735543e-06, "loss": 0.85923707, "num_input_tokens_seen": 8031275, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 2.15625, "step": 384, "time_per_iteration": 2.490431547164917 }, { "auxiliary_loss_clip": 0.01297551, "auxiliary_loss_mlp": 0.01091346, "balance_loss_clip": 1.05462968, "balance_loss_mlp": 1.08521771, "epoch": 0.023147452277168194, "flos": 21872435800320.0, "grad_norm": 1.9382909451901866, "language_loss": 0.89328873, "learning_rate": 3.8330087197119426e-06, "loss": 0.91717768, "num_input_tokens_seen": 8051600, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 2.125, "step": 385, "time_per_iteration": 2.4746804237365723 }, { "auxiliary_loss_clip": 0.01303635, "auxiliary_loss_mlp": 0.01109633, "balance_loss_clip": 1.07167721, "balance_loss_mlp": 1.08365393, "epoch": 0.023207575529836166, "flos": 18916700423040.0, "grad_norm": 1.9212279203737739, "language_loss": 0.69949746, "learning_rate": 3.83467889492477e-06, "loss": 0.72363019, "num_input_tokens_seen": 8070600, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 2.203125, "step": 386, "time_per_iteration": 2.488243579864502 }, { "auxiliary_loss_clip": 0.01304307, "auxiliary_loss_mlp": 0.01091196, "balance_loss_clip": 1.0534308, "balance_loss_mlp": 1.08524323, "epoch": 0.023267698782504134, "flos": 25046005207680.0, "grad_norm": 1.8854578494230896, "language_loss": 0.8790496, "learning_rate": 3.836344748851495e-06, "loss": 0.90300465, "num_input_tokens_seen": 8090680, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 2.1875, "step": 387, "time_per_iteration": 2.513505458831787 }, { "auxiliary_loss_clip": 0.01303295, "auxiliary_loss_mlp": 0.0108001, "balance_loss_clip": 1.04062366, "balance_loss_mlp": 1.08274555, "epoch": 0.023327822035172103, "flos": 28879217930880.0, "grad_norm": 1.8470074153043163, "language_loss": 0.83626294, "learning_rate": 3.838006303795566e-06, "loss": 0.8600961, "num_input_tokens_seen": 8114610, "router_z_loss_clip": 0.39453125, "router_z_loss_mlp": 2.203125, "step": 388, "time_per_iteration": 2.590317726135254 }, { "auxiliary_loss_clip": 0.01302007, "auxiliary_loss_mlp": 0.01088884, "balance_loss_clip": 1.0525732, "balance_loss_mlp": 1.08302546, "epoch": 0.02338794528784007, "flos": 27121533805440.0, "grad_norm": 2.725087269138004, "language_loss": 0.93657917, "learning_rate": 3.839663581888206e-06, "loss": 0.96048814, "num_input_tokens_seen": 8133975, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 2.203125, "step": 389, "time_per_iteration": 2.5092716217041016 }, { "auxiliary_loss_clip": 0.01295381, "auxiliary_loss_mlp": 0.01081794, "balance_loss_clip": 1.04274106, "balance_loss_mlp": 1.08329558, "epoch": 0.02344806854050804, "flos": 21322355944320.0, "grad_norm": 2.3623351573235443, "language_loss": 0.88019693, "learning_rate": 3.841316605090178e-06, "loss": 0.90396869, "num_input_tokens_seen": 8153570, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 2.125, "step": 390, "time_per_iteration": 2.5181307792663574 }, { "auxiliary_loss_clip": 0.01298106, "auxiliary_loss_mlp": 0.01090139, "balance_loss_clip": 1.05440044, "balance_loss_mlp": 1.08233333, "epoch": 0.023508191793176012, "flos": 24789997998720.0, "grad_norm": 2.527347885985162, "language_loss": 0.89514202, "learning_rate": 3.842965395193529e-06, "loss": 0.91902441, "num_input_tokens_seen": 8170075, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 2.15625, "step": 391, "time_per_iteration": 4.035652160644531 }, { "auxiliary_loss_clip": 0.01297165, "auxiliary_loss_mlp": 0.01069921, "balance_loss_clip": 1.03334808, "balance_loss_mlp": 1.08165216, "epoch": 0.02356831504584398, "flos": 25995375624960.0, "grad_norm": 2.009929925900034, "language_loss": 0.86107433, "learning_rate": 3.84460997382332e-06, "loss": 0.88474512, "num_input_tokens_seen": 8190420, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 2.15625, "step": 392, "time_per_iteration": 5.390367269515991 }, { "auxiliary_loss_clip": 0.0129245, "auxiliary_loss_mlp": 0.01085406, "balance_loss_clip": 1.04861832, "balance_loss_mlp": 1.08006799, "epoch": 0.02362843829851195, "flos": 19062461813760.0, "grad_norm": 2.45427942567474, "language_loss": 0.89275968, "learning_rate": 3.8462503624393256e-06, "loss": 0.91653818, "num_input_tokens_seen": 8208790, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 2.125, "step": 393, "time_per_iteration": 2.458430528640747 }, { "auxiliary_loss_clip": 0.01302032, "auxiliary_loss_mlp": 0.01103345, "balance_loss_clip": 1.06345701, "balance_loss_mlp": 1.08469772, "epoch": 0.023688561551179918, "flos": 16071031296000.0, "grad_norm": 1.8352072402229427, "language_loss": 0.8170687, "learning_rate": 3.84788658233771e-06, "loss": 0.84112245, "num_input_tokens_seen": 8226885, "router_z_loss_clip": 0.3984375, "router_z_loss_mlp": 2.171875, "step": 394, "time_per_iteration": 2.4904367923736572 }, { "auxiliary_loss_clip": 0.01294554, "auxiliary_loss_mlp": 0.0108575, "balance_loss_clip": 1.04757893, "balance_loss_mlp": 1.07907546, "epoch": 0.023748684803847887, "flos": 21724375939200.0, "grad_norm": 2.0495000507229246, "language_loss": 0.86018264, "learning_rate": 3.84951865465269e-06, "loss": 0.88398564, "num_input_tokens_seen": 8246825, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 2.15625, "step": 395, "time_per_iteration": 2.5021002292633057 }, { "auxiliary_loss_clip": 0.01188566, "auxiliary_loss_mlp": 0.01016016, "balance_loss_clip": 1.00595522, "balance_loss_mlp": 1.07741857, "epoch": 0.02380880805651586, "flos": 61926192881280.0, "grad_norm": 0.9280358149755081, "language_loss": 0.63845634, "learning_rate": 3.851146600358172e-06, "loss": 0.6605022, "num_input_tokens_seen": 8302835, "router_z_loss_clip": 0.10058594, "router_z_loss_mlp": 1.109375, "step": 396, "time_per_iteration": 2.9559810161590576 }, { "auxiliary_loss_clip": 0.01292292, "auxiliary_loss_mlp": 0.01070191, "balance_loss_clip": 1.03354573, "balance_loss_mlp": 1.07832718, "epoch": 0.023868931309183827, "flos": 20266331068800.0, "grad_norm": 2.461967546053096, "language_loss": 0.83712846, "learning_rate": 3.852770440269372e-06, "loss": 0.8607533, "num_input_tokens_seen": 8320745, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 2.140625, "step": 397, "time_per_iteration": 2.4740302562713623 }, { "auxiliary_loss_clip": 0.01294733, "auxiliary_loss_mlp": 0.01093391, "balance_loss_clip": 1.05493426, "balance_loss_mlp": 1.08044791, "epoch": 0.023929054561851796, "flos": 21139103733120.0, "grad_norm": 2.544167755855679, "language_loss": 0.84444159, "learning_rate": 3.854390195044404e-06, "loss": 0.86832285, "num_input_tokens_seen": 8339540, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 2.140625, "step": 398, "time_per_iteration": 2.497681140899658 }, { "auxiliary_loss_clip": 0.01297366, "auxiliary_loss_mlp": 0.01079156, "balance_loss_clip": 1.04029429, "balance_loss_mlp": 1.07947075, "epoch": 0.023989177814519765, "flos": 13698521049600.0, "grad_norm": 2.772484250177722, "language_loss": 0.85755789, "learning_rate": 3.856005885185868e-06, "loss": 0.8813231, "num_input_tokens_seen": 8354890, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 2.171875, "step": 399, "time_per_iteration": 2.496906042098999 }, { "auxiliary_loss_clip": 0.01290043, "auxiliary_loss_mlp": 0.0108872, "balance_loss_clip": 1.05100238, "balance_loss_mlp": 1.08026242, "epoch": 0.024049301067187733, "flos": 26322018929280.0, "grad_norm": 2.032431101289683, "language_loss": 0.8626911, "learning_rate": 3.857617531042398e-06, "loss": 0.88647872, "num_input_tokens_seen": 8375845, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 2.109375, "step": 400, "time_per_iteration": 2.5805623531341553 }, { "auxiliary_loss_clip": 0.01297046, "auxiliary_loss_mlp": 0.01076283, "balance_loss_clip": 1.03944767, "balance_loss_mlp": 1.08291721, "epoch": 0.024109424319855705, "flos": 24425432910720.0, "grad_norm": 1.7577683896846148, "language_loss": 0.79356134, "learning_rate": 3.8592251528102065e-06, "loss": 0.8172946, "num_input_tokens_seen": 8395240, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 2.140625, "step": 401, "time_per_iteration": 2.522374391555786 }, { "auxiliary_loss_clip": 0.01292444, "auxiliary_loss_mlp": 0.01089996, "balance_loss_clip": 1.05306458, "balance_loss_mlp": 1.0796845, "epoch": 0.024169547572523674, "flos": 29604397610880.0, "grad_norm": 2.0043944310213084, "language_loss": 0.78673089, "learning_rate": 3.8608287705345976e-06, "loss": 0.81055522, "num_input_tokens_seen": 8416950, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 2.125, "step": 402, "time_per_iteration": 2.580019235610962 }, { "auxiliary_loss_clip": 0.01296931, "auxiliary_loss_mlp": 0.01078343, "balance_loss_clip": 1.0391947, "balance_loss_mlp": 1.07885981, "epoch": 0.024229670825191642, "flos": 22601458235520.0, "grad_norm": 2.335272541246681, "language_loss": 0.94869506, "learning_rate": 3.86242840411147e-06, "loss": 0.97244787, "num_input_tokens_seen": 8433660, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 2.1875, "step": 403, "time_per_iteration": 2.4918553829193115 }, { "auxiliary_loss_clip": 0.01300673, "auxiliary_loss_mlp": 0.01089629, "balance_loss_clip": 1.05052876, "balance_loss_mlp": 1.0794276, "epoch": 0.02428979407785961, "flos": 18150258994560.0, "grad_norm": 2.65675577169217, "language_loss": 0.99993372, "learning_rate": 3.864024073288798e-06, "loss": 1.02383685, "num_input_tokens_seen": 8450180, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 2.203125, "step": 404, "time_per_iteration": 2.4820148944854736 }, { "auxiliary_loss_clip": 0.0129894, "auxiliary_loss_mlp": 0.01093329, "balance_loss_clip": 1.05549216, "balance_loss_mlp": 1.0813787, "epoch": 0.024349917330527583, "flos": 15304984917120.0, "grad_norm": 2.081748165618841, "language_loss": 0.87971115, "learning_rate": 3.865615797668091e-06, "loss": 0.90363377, "num_input_tokens_seen": 8467775, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 2.1875, "step": 405, "time_per_iteration": 2.4640517234802246 }, { "auxiliary_loss_clip": 0.01306453, "auxiliary_loss_mlp": 0.01094783, "balance_loss_clip": 1.05596828, "balance_loss_mlp": 1.0855149, "epoch": 0.024410040583195552, "flos": 20773892200320.0, "grad_norm": 2.71179995677528, "language_loss": 0.93594038, "learning_rate": 3.867203596705844e-06, "loss": 0.95995271, "num_input_tokens_seen": 8486765, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 2.203125, "step": 406, "time_per_iteration": 2.5149219036102295 }, { "auxiliary_loss_clip": 0.0129881, "auxiliary_loss_mlp": 0.01088047, "balance_loss_clip": 1.04918456, "balance_loss_mlp": 1.08282256, "epoch": 0.02447016383586352, "flos": 21798854789760.0, "grad_norm": 1.9609422767030988, "language_loss": 0.87378943, "learning_rate": 3.86878748971496e-06, "loss": 0.89765799, "num_input_tokens_seen": 8506515, "router_z_loss_clip": 0.38867188, "router_z_loss_mlp": 2.15625, "step": 407, "time_per_iteration": 2.503056526184082 }, { "auxiliary_loss_clip": 0.01295764, "auxiliary_loss_mlp": 0.01083266, "balance_loss_clip": 1.04490459, "balance_loss_mlp": 1.08274174, "epoch": 0.02453028708853149, "flos": 33948116380800.0, "grad_norm": 2.411487227374454, "language_loss": 0.74252051, "learning_rate": 3.8703674958661596e-06, "loss": 0.76631081, "num_input_tokens_seen": 8528035, "router_z_loss_clip": 0.38476562, "router_z_loss_mlp": 2.125, "step": 408, "time_per_iteration": 2.617107629776001 }, { "auxiliary_loss_clip": 0.01300759, "auxiliary_loss_mlp": 0.0109448, "balance_loss_clip": 1.05478358, "balance_loss_mlp": 1.08299553, "epoch": 0.024590410341199458, "flos": 21793000872960.0, "grad_norm": 2.476863830461904, "language_loss": 0.92577755, "learning_rate": 3.871943634189376e-06, "loss": 0.94972998, "num_input_tokens_seen": 8546455, "router_z_loss_clip": 0.39648438, "router_z_loss_mlp": 2.1875, "step": 409, "time_per_iteration": 2.496760368347168 }, { "auxiliary_loss_clip": 0.01302093, "auxiliary_loss_mlp": 0.01075944, "balance_loss_clip": 1.04075372, "balance_loss_mlp": 1.08636391, "epoch": 0.02465053359386743, "flos": 35114782124160.0, "grad_norm": 2.3838030628485765, "language_loss": 0.82729089, "learning_rate": 3.873515923575128e-06, "loss": 0.8510713, "num_input_tokens_seen": 8568450, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 2.15625, "step": 410, "time_per_iteration": 2.629286050796509 }, { "auxiliary_loss_clip": 0.01300014, "auxiliary_loss_mlp": 0.01088426, "balance_loss_clip": 1.05170953, "balance_loss_mlp": 1.0836246, "epoch": 0.0247106568465354, "flos": 27451409333760.0, "grad_norm": 2.18514095022853, "language_loss": 0.77934563, "learning_rate": 3.875084382775879e-06, "loss": 0.80323005, "num_input_tokens_seen": 8589340, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 2.15625, "step": 411, "time_per_iteration": 2.5396440029144287 }, { "auxiliary_loss_clip": 0.01297557, "auxiliary_loss_mlp": 0.01100053, "balance_loss_clip": 1.06128621, "balance_loss_mlp": 1.08029735, "epoch": 0.024770780099203367, "flos": 20703794808960.0, "grad_norm": 2.2794211614217827, "language_loss": 0.86446655, "learning_rate": 3.87664903040738e-06, "loss": 0.88844264, "num_input_tokens_seen": 8607150, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 2.171875, "step": 412, "time_per_iteration": 2.5078916549682617 }, { "auxiliary_loss_clip": 0.01174004, "auxiliary_loss_mlp": 0.01044201, "balance_loss_clip": 1.0345217, "balance_loss_mlp": 1.06806636, "epoch": 0.024830903351871336, "flos": 69551859369600.0, "grad_norm": 0.8420661935293754, "language_loss": 0.58514661, "learning_rate": 3.878209884949994e-06, "loss": 0.60732865, "num_input_tokens_seen": 8669865, "router_z_loss_clip": 0.09667969, "router_z_loss_mlp": 1.0625, "step": 413, "time_per_iteration": 3.18768048286438 }, { "auxiliary_loss_clip": 0.01293087, "auxiliary_loss_mlp": 0.01090503, "balance_loss_clip": 1.05013919, "balance_loss_mlp": 1.07911646, "epoch": 0.024891026604539304, "flos": 32270477713920.0, "grad_norm": 1.7965264553737315, "language_loss": 0.80766052, "learning_rate": 3.879766964750006e-06, "loss": 0.83149648, "num_input_tokens_seen": 8690235, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 2.140625, "step": 414, "time_per_iteration": 2.6067585945129395 }, { "auxiliary_loss_clip": 0.01288444, "auxiliary_loss_mlp": 0.01095101, "balance_loss_clip": 1.05776501, "balance_loss_mlp": 1.07835662, "epoch": 0.024951149857207276, "flos": 18840282238080.0, "grad_norm": 3.062506642470889, "language_loss": 0.80450886, "learning_rate": 3.881320288020917e-06, "loss": 0.82834435, "num_input_tokens_seen": 8706295, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 2.09375, "step": 415, "time_per_iteration": 2.5038139820098877 }, { "auxiliary_loss_clip": 0.0130438, "auxiliary_loss_mlp": 0.01088692, "balance_loss_clip": 1.05104578, "balance_loss_mlp": 1.08450878, "epoch": 0.025011273109875245, "flos": 15377201210880.0, "grad_norm": 2.7899251083714067, "language_loss": 0.96033758, "learning_rate": 3.882869872844723e-06, "loss": 0.98426831, "num_input_tokens_seen": 8724200, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 2.1875, "step": 416, "time_per_iteration": 2.5006580352783203 }, { "auxiliary_loss_clip": 0.01295261, "auxiliary_loss_mlp": 0.01073602, "balance_loss_clip": 1.03440559, "balance_loss_mlp": 1.08009577, "epoch": 0.025071396362543213, "flos": 18915515274240.0, "grad_norm": 1.6693684015564747, "language_loss": 0.77544838, "learning_rate": 3.884415737173176e-06, "loss": 0.799137, "num_input_tokens_seen": 8744170, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 2.15625, "step": 417, "time_per_iteration": 2.5189380645751953 }, { "auxiliary_loss_clip": 0.01294302, "auxiliary_loss_mlp": 0.0109139, "balance_loss_clip": 1.05307627, "balance_loss_mlp": 1.08476734, "epoch": 0.025131519615211182, "flos": 25337958952320.0, "grad_norm": 1.6896232135149034, "language_loss": 0.76901674, "learning_rate": 3.8859578988290344e-06, "loss": 0.79287374, "num_input_tokens_seen": 8765120, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 2.09375, "step": 418, "time_per_iteration": 2.555765151977539 }, { "auxiliary_loss_clip": 0.01300028, "auxiliary_loss_mlp": 0.01071158, "balance_loss_clip": 1.03458428, "balance_loss_mlp": 1.08455086, "epoch": 0.02519164286787915, "flos": 18953149749120.0, "grad_norm": 2.4704384368438155, "language_loss": 0.81363136, "learning_rate": 3.887496375507294e-06, "loss": 0.83734322, "num_input_tokens_seen": 8783500, "router_z_loss_clip": 0.36523438, "router_z_loss_mlp": 2.15625, "step": 419, "time_per_iteration": 2.574657917022705 }, { "auxiliary_loss_clip": 0.01295701, "auxiliary_loss_mlp": 0.01083941, "balance_loss_clip": 1.0438149, "balance_loss_mlp": 1.08423662, "epoch": 0.025251766120547123, "flos": 17421092904960.0, "grad_norm": 1.9326893256625823, "language_loss": 0.73579091, "learning_rate": 3.8890311847764065e-06, "loss": 0.75958729, "num_input_tokens_seen": 8801175, "router_z_loss_clip": 0.40234375, "router_z_loss_mlp": 2.125, "step": 420, "time_per_iteration": 2.4595046043395996 }, { "auxiliary_loss_clip": 0.01292393, "auxiliary_loss_mlp": 0.01095893, "balance_loss_clip": 1.05922389, "balance_loss_mlp": 1.07815075, "epoch": 0.02531188937321509, "flos": 25045430590080.0, "grad_norm": 1.761838257606799, "language_loss": 0.79025161, "learning_rate": 3.890562344079484e-06, "loss": 0.81413448, "num_input_tokens_seen": 8820215, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 2.15625, "step": 421, "time_per_iteration": 2.5596554279327393 }, { "auxiliary_loss_clip": 0.01293318, "auxiliary_loss_mlp": 0.01085292, "balance_loss_clip": 1.04602468, "balance_loss_mlp": 1.08298039, "epoch": 0.02537201262588306, "flos": 30592228515840.0, "grad_norm": 2.7716813520596486, "language_loss": 0.81713158, "learning_rate": 3.89208987073549e-06, "loss": 0.84091771, "num_input_tokens_seen": 8839660, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 2.109375, "step": 422, "time_per_iteration": 2.5660159587860107 }, { "auxiliary_loss_clip": 0.01297516, "auxiliary_loss_mlp": 0.010787, "balance_loss_clip": 1.04319882, "balance_loss_mlp": 1.08082128, "epoch": 0.02543213587855103, "flos": 26065365275520.0, "grad_norm": 1.952576712684812, "language_loss": 0.8367061, "learning_rate": 3.893613781940409e-06, "loss": 0.86046827, "num_input_tokens_seen": 8859280, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 2.171875, "step": 423, "time_per_iteration": 2.538947105407715 }, { "auxiliary_loss_clip": 0.01289255, "auxiliary_loss_mlp": 0.01078367, "balance_loss_clip": 1.04138851, "balance_loss_mlp": 1.07816505, "epoch": 0.025492259131218997, "flos": 36022818965760.0, "grad_norm": 1.905455024656147, "language_loss": 0.74346733, "learning_rate": 3.895134094768415e-06, "loss": 0.76714355, "num_input_tokens_seen": 8880560, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 2.109375, "step": 424, "time_per_iteration": 2.616703987121582 }, { "auxiliary_loss_clip": 0.01300029, "auxiliary_loss_mlp": 0.01097889, "balance_loss_clip": 1.06126773, "balance_loss_mlp": 1.08386254, "epoch": 0.02555238238388697, "flos": 18588045957120.0, "grad_norm": 2.7881353400822393, "language_loss": 0.83040535, "learning_rate": 3.896650826173015e-06, "loss": 0.85438454, "num_input_tokens_seen": 8899155, "router_z_loss_clip": 0.3671875, "router_z_loss_mlp": 2.15625, "step": 425, "time_per_iteration": 2.484200954437256 }, { "auxiliary_loss_clip": 0.01295606, "auxiliary_loss_mlp": 0.01092717, "balance_loss_clip": 1.05373597, "balance_loss_mlp": 1.07593393, "epoch": 0.025612505636554938, "flos": 24243186280320.0, "grad_norm": 2.138468935176219, "language_loss": 0.85423982, "learning_rate": 3.898163992988186e-06, "loss": 0.87812304, "num_input_tokens_seen": 8917890, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 2.1875, "step": 426, "time_per_iteration": 2.4962286949157715 }, { "auxiliary_loss_clip": 0.01168622, "auxiliary_loss_mlp": 0.01021046, "balance_loss_clip": 1.01117599, "balance_loss_mlp": 1.06479192, "epoch": 0.025672628889222907, "flos": 60586941265920.0, "grad_norm": 0.8989927127984938, "language_loss": 0.57240885, "learning_rate": 3.899673611929491e-06, "loss": 0.59430552, "num_input_tokens_seen": 8978260, "router_z_loss_clip": 0.09863281, "router_z_loss_mlp": 1.0390625, "step": 427, "time_per_iteration": 3.201742649078369 }, { "auxiliary_loss_clip": 0.01294732, "auxiliary_loss_mlp": 0.01100114, "balance_loss_clip": 1.06401753, "balance_loss_mlp": 1.0851593, "epoch": 0.025732752141890875, "flos": 19573255169280.0, "grad_norm": 2.3396278163771322, "language_loss": 0.88319099, "learning_rate": 3.901179699595194e-06, "loss": 0.90713942, "num_input_tokens_seen": 8994460, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 2.09375, "step": 428, "time_per_iteration": 2.4867336750030518 }, { "auxiliary_loss_clip": 0.01287189, "auxiliary_loss_mlp": 0.01079061, "balance_loss_clip": 1.03998446, "balance_loss_mlp": 1.0781548, "epoch": 0.025792875394558847, "flos": 31284262920960.0, "grad_norm": 1.7867432675776536, "language_loss": 0.85783553, "learning_rate": 3.902682272467353e-06, "loss": 0.88149798, "num_input_tokens_seen": 9016670, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 2.09375, "step": 429, "time_per_iteration": 2.604266405105591 }, { "auxiliary_loss_clip": 0.01294273, "auxiliary_loss_mlp": 0.01082134, "balance_loss_clip": 1.04293776, "balance_loss_mlp": 1.07764459, "epoch": 0.025852998647226816, "flos": 32379610210560.0, "grad_norm": 2.294302207169178, "language_loss": 0.88297474, "learning_rate": 3.904181346912895e-06, "loss": 0.90673888, "num_input_tokens_seen": 9039720, "router_z_loss_clip": 0.39257812, "router_z_loss_mlp": 2.171875, "step": 430, "time_per_iteration": 2.5944652557373047 }, { "auxiliary_loss_clip": 0.01294545, "auxiliary_loss_mlp": 0.0108624, "balance_loss_clip": 1.05069184, "balance_loss_mlp": 1.08476019, "epoch": 0.025913121899894784, "flos": 20193288762240.0, "grad_norm": 2.383322047642397, "language_loss": 0.83996141, "learning_rate": 3.905676939184698e-06, "loss": 0.86376929, "num_input_tokens_seen": 9059850, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 2.09375, "step": 431, "time_per_iteration": 2.5321199893951416 }, { "auxiliary_loss_clip": 0.01288185, "auxiliary_loss_mlp": 0.01078231, "balance_loss_clip": 1.04358852, "balance_loss_mlp": 1.07858312, "epoch": 0.025973245152562753, "flos": 14720430983040.0, "grad_norm": 3.5943868890199178, "language_loss": 0.86798042, "learning_rate": 3.907169065422638e-06, "loss": 0.8916446, "num_input_tokens_seen": 9077590, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 2.09375, "step": 432, "time_per_iteration": 2.459805965423584 }, { "auxiliary_loss_clip": 0.01291776, "auxiliary_loss_mlp": 0.01080389, "balance_loss_clip": 1.04524589, "balance_loss_mlp": 1.08046913, "epoch": 0.02603336840523072, "flos": 30992991534720.0, "grad_norm": 1.9554324730957076, "language_loss": 0.75916064, "learning_rate": 3.908657741654636e-06, "loss": 0.78288233, "num_input_tokens_seen": 9099880, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 2.109375, "step": 433, "time_per_iteration": 5.573594093322754 }, { "auxiliary_loss_clip": 0.01290248, "auxiliary_loss_mlp": 0.0108611, "balance_loss_clip": 1.04705739, "balance_loss_mlp": 1.07756531, "epoch": 0.026093491657898694, "flos": 17674262939520.0, "grad_norm": 2.052320945738291, "language_loss": 0.89771473, "learning_rate": 3.910142983797699e-06, "loss": 0.92147827, "num_input_tokens_seen": 9118620, "router_z_loss_clip": 0.390625, "router_z_loss_mlp": 2.125, "step": 434, "time_per_iteration": 3.9220850467681885 }, { "auxiliary_loss_clip": 0.01293673, "auxiliary_loss_mlp": 0.01101661, "balance_loss_clip": 1.06341827, "balance_loss_mlp": 1.08416915, "epoch": 0.026153614910566662, "flos": 17857874286720.0, "grad_norm": 2.601659869554926, "language_loss": 0.80619311, "learning_rate": 3.9116248076589305e-06, "loss": 0.83014643, "num_input_tokens_seen": 9135655, "router_z_loss_clip": 0.3828125, "router_z_loss_mlp": 2.09375, "step": 435, "time_per_iteration": 2.5221235752105713 }, { "auxiliary_loss_clip": 0.01286909, "auxiliary_loss_mlp": 0.01086235, "balance_loss_clip": 1.04906523, "balance_loss_mlp": 1.07594347, "epoch": 0.02621373816323463, "flos": 20011113959040.0, "grad_norm": 2.7689291357796204, "language_loss": 0.86162752, "learning_rate": 3.913103228936546e-06, "loss": 0.88535899, "num_input_tokens_seen": 9153520, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 2.109375, "step": 436, "time_per_iteration": 2.5119833946228027 }, { "auxiliary_loss_clip": 0.01291836, "auxiliary_loss_mlp": 0.01096893, "balance_loss_clip": 1.0599618, "balance_loss_mlp": 1.08063817, "epoch": 0.0262738614159026, "flos": 19281193683840.0, "grad_norm": 2.2038091570254443, "language_loss": 0.75079465, "learning_rate": 3.914578263220868e-06, "loss": 0.77468193, "num_input_tokens_seen": 9170750, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 2.109375, "step": 437, "time_per_iteration": 2.5289196968078613 }, { "auxiliary_loss_clip": 0.01289252, "auxiliary_loss_mlp": 0.01093441, "balance_loss_clip": 1.05481756, "balance_loss_mlp": 1.08071208, "epoch": 0.026333984668570568, "flos": 18807208790400.0, "grad_norm": 2.5421657114480856, "language_loss": 0.91513395, "learning_rate": 3.916049925995316e-06, "loss": 0.93896091, "num_input_tokens_seen": 9188430, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 2.09375, "step": 438, "time_per_iteration": 2.492929220199585 }, { "auxiliary_loss_clip": 0.01154295, "auxiliary_loss_mlp": 0.01045012, "balance_loss_clip": 1.03604722, "balance_loss_mlp": 1.05438566, "epoch": 0.02639410792123854, "flos": 64572020691840.0, "grad_norm": 0.8742703997815472, "language_loss": 0.62635475, "learning_rate": 3.917518232637377e-06, "loss": 0.64834785, "num_input_tokens_seen": 9255835, "router_z_loss_clip": 0.08984375, "router_z_loss_mlp": 1.0, "step": 439, "time_per_iteration": 3.3115170001983643 }, { "auxiliary_loss_clip": 0.01297579, "auxiliary_loss_mlp": 0.01096462, "balance_loss_clip": 1.05910134, "balance_loss_mlp": 1.08445108, "epoch": 0.02645423117390651, "flos": 28473462921600.0, "grad_norm": 1.8964232215387262, "language_loss": 0.75777292, "learning_rate": 3.918983198419573e-06, "loss": 0.78171337, "num_input_tokens_seen": 9276835, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 2.125, "step": 440, "time_per_iteration": 2.8171889781951904 }, { "auxiliary_loss_clip": 0.01289103, "auxiliary_loss_mlp": 0.01075074, "balance_loss_clip": 1.03766632, "balance_loss_mlp": 1.08079541, "epoch": 0.026514354426574478, "flos": 18551237495040.0, "grad_norm": 2.3876521778552062, "language_loss": 0.83152699, "learning_rate": 3.920444838510415e-06, "loss": 0.8551687, "num_input_tokens_seen": 9295075, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 2.078125, "step": 441, "time_per_iteration": 2.570892810821533 }, { "auxiliary_loss_clip": 0.01292607, "auxiliary_loss_mlp": 0.01088261, "balance_loss_clip": 1.05023313, "balance_loss_mlp": 1.07852316, "epoch": 0.026574477679242446, "flos": 20667812359680.0, "grad_norm": 2.3282975969395605, "language_loss": 0.78697348, "learning_rate": 3.92190316797534e-06, "loss": 0.81078213, "num_input_tokens_seen": 9314205, "router_z_loss_clip": 0.37890625, "router_z_loss_mlp": 2.140625, "step": 442, "time_per_iteration": 2.4972474575042725 }, { "auxiliary_loss_clip": 0.01148617, "auxiliary_loss_mlp": 0.01011435, "balance_loss_clip": 1.00251818, "balance_loss_mlp": 1.04899716, "epoch": 0.026634600931910415, "flos": 57956125340160.0, "grad_norm": 0.9662720289653316, "language_loss": 0.64454865, "learning_rate": 3.92335820177765e-06, "loss": 0.66614914, "num_input_tokens_seen": 9367395, "router_z_loss_clip": 0.08935547, "router_z_loss_mlp": 0.99609375, "step": 443, "time_per_iteration": 3.0013296604156494 }, { "auxiliary_loss_clip": 0.0129046, "auxiliary_loss_mlp": 0.01083386, "balance_loss_clip": 1.04647899, "balance_loss_mlp": 1.08182073, "epoch": 0.026694724184578387, "flos": 15815131827840.0, "grad_norm": 1.901262994816523, "language_loss": 0.820072, "learning_rate": 3.924809954779425e-06, "loss": 0.84381044, "num_input_tokens_seen": 9385185, "router_z_loss_clip": 0.36914062, "router_z_loss_mlp": 2.09375, "step": 444, "time_per_iteration": 2.477682113647461 }, { "auxiliary_loss_clip": 0.01296216, "auxiliary_loss_mlp": 0.01084359, "balance_loss_clip": 1.04375589, "balance_loss_mlp": 1.08076906, "epoch": 0.026754847437246355, "flos": 23440259612160.0, "grad_norm": 2.0762345022645117, "language_loss": 0.95632827, "learning_rate": 3.9262584417424425e-06, "loss": 0.98013401, "num_input_tokens_seen": 9403225, "router_z_loss_clip": 0.40625, "router_z_loss_mlp": 2.15625, "step": 445, "time_per_iteration": 2.5627338886260986 }, { "auxiliary_loss_clip": 0.0129114, "auxiliary_loss_mlp": 0.0109304, "balance_loss_clip": 1.05434501, "balance_loss_mlp": 1.0809418, "epoch": 0.026814970689914324, "flos": 17341801632000.0, "grad_norm": 2.2126566129638494, "language_loss": 0.91583228, "learning_rate": 3.9277036773290725e-06, "loss": 0.93967408, "num_input_tokens_seen": 9420540, "router_z_loss_clip": 0.38671875, "router_z_loss_mlp": 2.109375, "step": 446, "time_per_iteration": 2.4557971954345703 }, { "auxiliary_loss_clip": 0.01286913, "auxiliary_loss_mlp": 0.01086248, "balance_loss_clip": 1.04898298, "balance_loss_mlp": 1.08030617, "epoch": 0.026875093942582293, "flos": 17894718662400.0, "grad_norm": 2.3020744072049313, "language_loss": 0.79808521, "learning_rate": 3.92914567610317e-06, "loss": 0.8218168, "num_input_tokens_seen": 9438840, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 2.0625, "step": 447, "time_per_iteration": 2.4830286502838135 }, { "auxiliary_loss_clip": 0.01286764, "auxiliary_loss_mlp": 0.01074131, "balance_loss_clip": 1.03956056, "balance_loss_mlp": 1.07928753, "epoch": 0.026935217195250265, "flos": 21723980889600.0, "grad_norm": 2.757964638709849, "language_loss": 0.86350799, "learning_rate": 3.930584452530952e-06, "loss": 0.88711691, "num_input_tokens_seen": 9457215, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 2.078125, "step": 448, "time_per_iteration": 2.5236806869506836 }, { "auxiliary_loss_clip": 0.0128256, "auxiliary_loss_mlp": 0.01093984, "balance_loss_clip": 1.05984211, "balance_loss_mlp": 1.07850409, "epoch": 0.026995340447918233, "flos": 23622685810560.0, "grad_norm": 1.9937943842476253, "language_loss": 0.88869244, "learning_rate": 3.9320200209818755e-06, "loss": 0.91245788, "num_input_tokens_seen": 9475615, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 2.03125, "step": 449, "time_per_iteration": 2.5515220165252686 }, { "auxiliary_loss_clip": 0.01292775, "auxiliary_loss_mlp": 0.01087912, "balance_loss_clip": 1.05024195, "balance_loss_mlp": 1.08060217, "epoch": 0.027055463700586202, "flos": 17931275729280.0, "grad_norm": 2.1665586476712413, "language_loss": 0.80768448, "learning_rate": 3.933452395729493e-06, "loss": 0.83149135, "num_input_tokens_seen": 9493975, "router_z_loss_clip": 0.37695312, "router_z_loss_mlp": 2.125, "step": 450, "time_per_iteration": 2.4678428173065186 }, { "auxiliary_loss_clip": 0.0128685, "auxiliary_loss_mlp": 0.01081268, "balance_loss_clip": 1.04395545, "balance_loss_mlp": 1.08402717, "epoch": 0.02711558695325417, "flos": 25118903859840.0, "grad_norm": 1.8071692279862404, "language_loss": 0.8167572, "learning_rate": 3.934881590952304e-06, "loss": 0.84043837, "num_input_tokens_seen": 9514810, "router_z_loss_clip": 0.37304688, "router_z_loss_mlp": 2.03125, "step": 451, "time_per_iteration": 2.554708242416382 }, { "auxiliary_loss_clip": 0.01285532, "auxiliary_loss_mlp": 0.01094336, "balance_loss_clip": 1.05692792, "balance_loss_mlp": 1.08223414, "epoch": 0.02717571020592214, "flos": 24239559006720.0, "grad_norm": 1.6507257192477984, "language_loss": 0.76853102, "learning_rate": 3.936307620734599e-06, "loss": 0.79232967, "num_input_tokens_seen": 9533635, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 2.03125, "step": 452, "time_per_iteration": 2.559234142303467 }, { "auxiliary_loss_clip": 0.01287229, "auxiliary_loss_mlp": 0.01083868, "balance_loss_clip": 1.04767656, "balance_loss_mlp": 1.08222103, "epoch": 0.02723583345859011, "flos": 25118939773440.0, "grad_norm": 2.0685092586428206, "language_loss": 0.73021495, "learning_rate": 3.937730499067294e-06, "loss": 0.75392592, "num_input_tokens_seen": 9555420, "router_z_loss_clip": 0.36132812, "router_z_loss_mlp": 2.046875, "step": 453, "time_per_iteration": 2.545044422149658 }, { "auxiliary_loss_clip": 0.01280185, "auxiliary_loss_mlp": 0.01082921, "balance_loss_clip": 1.04825521, "balance_loss_mlp": 1.07824373, "epoch": 0.02729595671125808, "flos": 42741597847680.0, "grad_norm": 1.7617394546207295, "language_loss": 0.82227391, "learning_rate": 3.939150239848748e-06, "loss": 0.84590495, "num_input_tokens_seen": 9578950, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 2.015625, "step": 454, "time_per_iteration": 2.789292812347412 }, { "auxiliary_loss_clip": 0.01285245, "auxiliary_loss_mlp": 0.01077386, "balance_loss_clip": 1.04403174, "balance_loss_mlp": 1.08118832, "epoch": 0.02735607996392605, "flos": 21430985650560.0, "grad_norm": 1.786334160609171, "language_loss": 0.75358629, "learning_rate": 3.9405668568855866e-06, "loss": 0.77721262, "num_input_tokens_seen": 9598160, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 2.046875, "step": 455, "time_per_iteration": 2.649275779724121 }, { "auxiliary_loss_clip": 0.01285299, "auxiliary_loss_mlp": 0.01091507, "balance_loss_clip": 1.05688822, "balance_loss_mlp": 1.07899666, "epoch": 0.027416203216594017, "flos": 20851280052480.0, "grad_norm": 2.1561967192214153, "language_loss": 0.8053422, "learning_rate": 3.941980363893499e-06, "loss": 0.82911026, "num_input_tokens_seen": 9616010, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 2.0625, "step": 456, "time_per_iteration": 2.557905673980713 }, { "auxiliary_loss_clip": 0.01280476, "auxiliary_loss_mlp": 0.01076464, "balance_loss_clip": 1.04051018, "balance_loss_mlp": 1.07859969, "epoch": 0.027476326469261986, "flos": 13224500242560.0, "grad_norm": 2.144029482686567, "language_loss": 0.81555212, "learning_rate": 3.9433907744980384e-06, "loss": 0.83912158, "num_input_tokens_seen": 9634000, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 2.015625, "step": 457, "time_per_iteration": 2.4631614685058594 }, { "auxiliary_loss_clip": 0.01285993, "auxiliary_loss_mlp": 0.01083313, "balance_loss_clip": 1.0484798, "balance_loss_mlp": 1.07914221, "epoch": 0.027536449721929958, "flos": 24024526237440.0, "grad_norm": 19.690178992701142, "language_loss": 0.94007403, "learning_rate": 3.944798102235412e-06, "loss": 0.96376711, "num_input_tokens_seen": 9653455, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 2.0625, "step": 458, "time_per_iteration": 2.541727066040039 }, { "auxiliary_loss_clip": 0.0128209, "auxiliary_loss_mlp": 0.01092161, "balance_loss_clip": 1.05806661, "balance_loss_mlp": 1.07833016, "epoch": 0.027596572974597926, "flos": 13006055681280.0, "grad_norm": 3.3934606425515903, "language_loss": 0.79464662, "learning_rate": 3.9462023605532545e-06, "loss": 0.81838912, "num_input_tokens_seen": 9669650, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 2.03125, "step": 459, "time_per_iteration": 2.498356342315674 }, { "auxiliary_loss_clip": 0.01288217, "auxiliary_loss_mlp": 0.01080034, "balance_loss_clip": 1.0425781, "balance_loss_mlp": 1.08410096, "epoch": 0.027656696227265895, "flos": 26143076350080.0, "grad_norm": 2.080515116566068, "language_loss": 0.83311903, "learning_rate": 3.947603562811407e-06, "loss": 0.85680157, "num_input_tokens_seen": 9691415, "router_z_loss_clip": 0.375, "router_z_loss_mlp": 2.03125, "step": 460, "time_per_iteration": 2.6425600051879883 }, { "auxiliary_loss_clip": 0.01150847, "auxiliary_loss_mlp": 0.01032881, "balance_loss_clip": 1.02434587, "balance_loss_mlp": 1.0539459, "epoch": 0.027716819479933864, "flos": 60697222997760.0, "grad_norm": 1.5998537914780995, "language_loss": 0.73649347, "learning_rate": 3.949001722282675e-06, "loss": 0.75833076, "num_input_tokens_seen": 9755605, "router_z_loss_clip": 0.08544922, "router_z_loss_mlp": 0.96875, "step": 461, "time_per_iteration": 3.0989482402801514 }, { "auxiliary_loss_clip": 0.01279697, "auxiliary_loss_mlp": 0.01080105, "balance_loss_clip": 1.04770446, "balance_loss_mlp": 1.08251679, "epoch": 0.027776942732601832, "flos": 31211938886400.0, "grad_norm": 2.5457011345044447, "language_loss": 0.81568658, "learning_rate": 3.950396852153582e-06, "loss": 0.8392846, "num_input_tokens_seen": 9776270, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.96875, "step": 462, "time_per_iteration": 2.633282423019409 }, { "auxiliary_loss_clip": 0.01281121, "auxiliary_loss_mlp": 0.01073662, "balance_loss_clip": 1.04180944, "balance_loss_mlp": 1.0798738, "epoch": 0.027837065985269804, "flos": 22674644196480.0, "grad_norm": 2.650931583592543, "language_loss": 0.90404987, "learning_rate": 3.951788965525118e-06, "loss": 0.92759776, "num_input_tokens_seen": 9794465, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 2.015625, "step": 463, "time_per_iteration": 2.4789392948150635 }, { "auxiliary_loss_clip": 0.01143621, "auxiliary_loss_mlp": 0.01010829, "balance_loss_clip": 1.00286567, "balance_loss_mlp": 1.04660583, "epoch": 0.027897189237937773, "flos": 62182487399040.0, "grad_norm": 0.8963357564482474, "language_loss": 0.590891, "learning_rate": 3.953178075413476e-06, "loss": 0.61243546, "num_input_tokens_seen": 9849685, "router_z_loss_clip": 0.07958984, "router_z_loss_mlp": 0.96875, "step": 464, "time_per_iteration": 3.0857157707214355 }, { "auxiliary_loss_clip": 0.01293743, "auxiliary_loss_mlp": 0.01092942, "balance_loss_clip": 1.05801344, "balance_loss_mlp": 1.08482409, "epoch": 0.02795731249060574, "flos": 24493160004480.0, "grad_norm": 2.3306737201700254, "language_loss": 0.81384587, "learning_rate": 3.954564194750784e-06, "loss": 0.83771271, "num_input_tokens_seen": 9869505, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 2.09375, "step": 465, "time_per_iteration": 2.5535106658935547 }, { "auxiliary_loss_clip": 0.01281341, "auxiliary_loss_mlp": 0.01085117, "balance_loss_clip": 1.05002201, "balance_loss_mlp": 1.07836747, "epoch": 0.02801743574327371, "flos": 23733003456000.0, "grad_norm": 2.194187477435355, "language_loss": 0.78999674, "learning_rate": 3.955947336385828e-06, "loss": 0.81366134, "num_input_tokens_seen": 9890950, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 2.03125, "step": 466, "time_per_iteration": 2.5274574756622314 }, { "auxiliary_loss_clip": 0.01280567, "auxiliary_loss_mlp": 0.0108787, "balance_loss_clip": 1.05387187, "balance_loss_mlp": 1.0810082, "epoch": 0.02807755899594168, "flos": 20629100476800.0, "grad_norm": 1.9156710385614366, "language_loss": 0.87665874, "learning_rate": 3.957327513084761e-06, "loss": 0.90034312, "num_input_tokens_seen": 9911265, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 1.9921875, "step": 467, "time_per_iteration": 2.527629852294922 }, { "auxiliary_loss_clip": 0.01284508, "auxiliary_loss_mlp": 0.01099812, "balance_loss_clip": 1.06357265, "balance_loss_mlp": 1.08179069, "epoch": 0.02813768224860965, "flos": 19244564789760.0, "grad_norm": 2.3314485686301394, "language_loss": 0.86053085, "learning_rate": 3.958704737531818e-06, "loss": 0.88437414, "num_input_tokens_seen": 9929025, "router_z_loss_clip": 0.36328125, "router_z_loss_mlp": 2.03125, "step": 468, "time_per_iteration": 2.506098985671997 }, { "auxiliary_loss_clip": 0.01281306, "auxiliary_loss_mlp": 0.01082335, "balance_loss_clip": 1.04633379, "balance_loss_mlp": 1.07847619, "epoch": 0.02819780550127762, "flos": 20813968800000.0, "grad_norm": 2.181776421270866, "language_loss": 0.91666919, "learning_rate": 3.9600790223300065e-06, "loss": 0.94030559, "num_input_tokens_seen": 9945190, "router_z_loss_clip": 0.359375, "router_z_loss_mlp": 2.03125, "step": 469, "time_per_iteration": 2.5338449478149414 }, { "auxiliary_loss_clip": 0.01279902, "auxiliary_loss_mlp": 0.01093633, "balance_loss_clip": 1.05849028, "balance_loss_mlp": 1.07997656, "epoch": 0.028257928753945588, "flos": 19974125928960.0, "grad_norm": 4.751028717011765, "language_loss": 0.81662178, "learning_rate": 3.96145038000181e-06, "loss": 0.84035707, "num_input_tokens_seen": 9962820, "router_z_loss_clip": 0.3515625, "router_z_loss_mlp": 2.0, "step": 470, "time_per_iteration": 2.519056558609009 }, { "auxiliary_loss_clip": 0.01282873, "auxiliary_loss_mlp": 0.01086376, "balance_loss_clip": 1.0510428, "balance_loss_mlp": 1.07887173, "epoch": 0.028318052006613557, "flos": 20484488321280.0, "grad_norm": 1.8057370994648945, "language_loss": 0.93266684, "learning_rate": 3.962818822989861e-06, "loss": 0.95635939, "num_input_tokens_seen": 9982595, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 2.046875, "step": 471, "time_per_iteration": 2.4901297092437744 }, { "auxiliary_loss_clip": 0.01275635, "auxiliary_loss_mlp": 0.01095611, "balance_loss_clip": 1.06015801, "balance_loss_mlp": 1.07548046, "epoch": 0.02837817525928153, "flos": 28514832410880.0, "grad_norm": 4.175671162406699, "language_loss": 0.75945079, "learning_rate": 3.964184363657625e-06, "loss": 0.78316319, "num_input_tokens_seen": 10004645, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 2.0, "step": 472, "time_per_iteration": 2.5882437229156494 }, { "auxiliary_loss_clip": 0.01283099, "auxiliary_loss_mlp": 0.01079449, "balance_loss_clip": 1.04564142, "balance_loss_mlp": 1.07609677, "epoch": 0.028438298511949497, "flos": 18551668458240.0, "grad_norm": 1.8108333006755108, "language_loss": 0.93576276, "learning_rate": 3.965547014290071e-06, "loss": 0.95938826, "num_input_tokens_seen": 10022555, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 2.078125, "step": 473, "time_per_iteration": 2.4807121753692627 }, { "auxiliary_loss_clip": 0.01287978, "auxiliary_loss_mlp": 0.01117562, "balance_loss_clip": 1.08408785, "balance_loss_mlp": 1.0810616, "epoch": 0.028498421764617466, "flos": 16910227722240.0, "grad_norm": 3.068254270177605, "language_loss": 0.88699019, "learning_rate": 3.96690678709433e-06, "loss": 0.91104555, "num_input_tokens_seen": 10041025, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 2.0625, "step": 474, "time_per_iteration": 2.4863440990448 }, { "auxiliary_loss_clip": 0.01278435, "auxiliary_loss_mlp": 0.01090372, "balance_loss_clip": 1.05499101, "balance_loss_mlp": 1.07760382, "epoch": 0.028558545017285435, "flos": 27778699082880.0, "grad_norm": 3.424601507652988, "language_loss": 0.78946614, "learning_rate": 3.968263694200355e-06, "loss": 0.81315422, "num_input_tokens_seen": 10060775, "router_z_loss_clip": 0.35546875, "router_z_loss_mlp": 2.0, "step": 475, "time_per_iteration": 5.570080757141113 }, { "auxiliary_loss_clip": 0.01146966, "auxiliary_loss_mlp": 0.01056971, "balance_loss_clip": 1.0496279, "balance_loss_mlp": 1.05132604, "epoch": 0.028618668269953403, "flos": 65654367258240.0, "grad_norm": 0.9350865802041639, "language_loss": 0.66993666, "learning_rate": 3.969617747661569e-06, "loss": 0.69197601, "num_input_tokens_seen": 10120225, "router_z_loss_clip": 0.07324219, "router_z_loss_mlp": 0.953125, "step": 476, "time_per_iteration": 3.050954580307007 }, { "auxiliary_loss_clip": 0.01277919, "auxiliary_loss_mlp": 0.01078373, "balance_loss_clip": 1.04251516, "balance_loss_mlp": 1.07678843, "epoch": 0.028678791522621375, "flos": 21937074324480.0, "grad_norm": 3.2530483569730824, "language_loss": 0.83790302, "learning_rate": 3.970968959455509e-06, "loss": 0.86146593, "num_input_tokens_seen": 10137880, "router_z_loss_clip": 0.35742188, "router_z_loss_mlp": 2.015625, "step": 477, "time_per_iteration": 2.4992995262145996 }, { "auxiliary_loss_clip": 0.01285255, "auxiliary_loss_mlp": 0.01086057, "balance_loss_clip": 1.05119979, "balance_loss_mlp": 1.08097339, "epoch": 0.028738914775289344, "flos": 24572128055040.0, "grad_norm": 2.121713909258134, "language_loss": 0.82357562, "learning_rate": 3.97231734148446e-06, "loss": 0.84728873, "num_input_tokens_seen": 10156930, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 2.046875, "step": 478, "time_per_iteration": 2.539304733276367 }, { "auxiliary_loss_clip": 0.01277872, "auxiliary_loss_mlp": 0.0107781, "balance_loss_clip": 1.04335856, "balance_loss_mlp": 1.07620454, "epoch": 0.028799038027957313, "flos": 23257977068160.0, "grad_norm": 4.350753524297235, "language_loss": 0.81230682, "learning_rate": 3.973662905576082e-06, "loss": 0.83586365, "num_input_tokens_seen": 10176295, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 2.015625, "step": 479, "time_per_iteration": 2.5180585384368896 }, { "auxiliary_loss_clip": 0.01276585, "auxiliary_loss_mlp": 0.01078578, "balance_loss_clip": 1.04150438, "balance_loss_mlp": 1.07636285, "epoch": 0.02885916128062528, "flos": 22164102236160.0, "grad_norm": 3.311144773356861, "language_loss": 0.73729718, "learning_rate": 3.975005663484038e-06, "loss": 0.76084882, "num_input_tokens_seen": 10195790, "router_z_loss_clip": 0.37109375, "router_z_loss_mlp": 2.0, "step": 480, "time_per_iteration": 2.5396831035614014 }, { "auxiliary_loss_clip": 0.01276463, "auxiliary_loss_mlp": 0.01068834, "balance_loss_clip": 1.03745854, "balance_loss_mlp": 1.07900453, "epoch": 0.02891928453329325, "flos": 22932842135040.0, "grad_norm": 1.7620468446680526, "language_loss": 0.87908369, "learning_rate": 3.976345626888605e-06, "loss": 0.90253669, "num_input_tokens_seen": 10218405, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 1.9765625, "step": 481, "time_per_iteration": 2.6025030612945557 }, { "auxiliary_loss_clip": 0.01140237, "auxiliary_loss_mlp": 0.01009237, "balance_loss_clip": 1.0019896, "balance_loss_mlp": 1.04643011, "epoch": 0.028979407785961222, "flos": 57432941792640.0, "grad_norm": 0.8412156570888841, "language_loss": 0.6605109, "learning_rate": 3.9776828073972864e-06, "loss": 0.68200564, "num_input_tokens_seen": 10271005, "router_z_loss_clip": 0.07226562, "router_z_loss_mlp": 0.9375, "step": 482, "time_per_iteration": 2.8994739055633545 }, { "auxiliary_loss_clip": 0.01289281, "auxiliary_loss_mlp": 0.01076955, "balance_loss_clip": 1.04319453, "balance_loss_mlp": 1.08056641, "epoch": 0.02903953103862919, "flos": 16722737706240.0, "grad_norm": 2.5589020832662372, "language_loss": 0.79164147, "learning_rate": 3.979017216545415e-06, "loss": 0.81530386, "num_input_tokens_seen": 10288405, "router_z_loss_clip": 0.33789062, "router_z_loss_mlp": 2.09375, "step": 483, "time_per_iteration": 2.4574239253997803 }, { "auxiliary_loss_clip": 0.01284081, "auxiliary_loss_mlp": 0.01089643, "balance_loss_clip": 1.05531073, "balance_loss_mlp": 1.08136797, "epoch": 0.02909965429129716, "flos": 16763640318720.0, "grad_norm": 2.121864018008646, "language_loss": 0.75462365, "learning_rate": 3.980348865796749e-06, "loss": 0.7783609, "num_input_tokens_seen": 10306875, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 2.03125, "step": 484, "time_per_iteration": 2.4995274543762207 }, { "auxiliary_loss_clip": 0.01279096, "auxiliary_loss_mlp": 0.01079887, "balance_loss_clip": 1.04731917, "balance_loss_mlp": 1.07820535, "epoch": 0.029159777543965128, "flos": 19785343023360.0, "grad_norm": 2.050634803476252, "language_loss": 0.84029806, "learning_rate": 3.9816777665440615e-06, "loss": 0.86388785, "num_input_tokens_seen": 10323965, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 2.015625, "step": 485, "time_per_iteration": 2.514767646789551 }, { "auxiliary_loss_clip": 0.01286889, "auxiliary_loss_mlp": 0.01082779, "balance_loss_clip": 1.04827976, "balance_loss_mlp": 1.0858171, "epoch": 0.029219900796633096, "flos": 19642670202240.0, "grad_norm": 2.2704236201891934, "language_loss": 0.84436297, "learning_rate": 3.983003930109732e-06, "loss": 0.86805964, "num_input_tokens_seen": 10342620, "router_z_loss_clip": 0.34375, "router_z_loss_mlp": 2.0, "step": 486, "time_per_iteration": 2.582458019256592 }, { "auxiliary_loss_clip": 0.01280525, "auxiliary_loss_mlp": 0.0108633, "balance_loss_clip": 1.05180717, "balance_loss_mlp": 1.0787065, "epoch": 0.02928002404930107, "flos": 25885704424320.0, "grad_norm": 1.779507876844119, "language_loss": 0.8883909, "learning_rate": 3.984327367746315e-06, "loss": 0.91205943, "num_input_tokens_seen": 10364610, "router_z_loss_clip": 0.34570312, "router_z_loss_mlp": 2.015625, "step": 487, "time_per_iteration": 2.58760666847229 }, { "auxiliary_loss_clip": 0.01282902, "auxiliary_loss_mlp": 0.01068929, "balance_loss_clip": 1.03721929, "balance_loss_mlp": 1.08207655, "epoch": 0.029340147301969037, "flos": 20660234590080.0, "grad_norm": 2.514402285294437, "language_loss": 0.88539422, "learning_rate": 3.985648090637122e-06, "loss": 0.90891248, "num_input_tokens_seen": 10380910, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 2.0, "step": 488, "time_per_iteration": 2.4943253993988037 }, { "auxiliary_loss_clip": 0.01278083, "auxiliary_loss_mlp": 0.01081533, "balance_loss_clip": 1.0472964, "balance_loss_mlp": 1.07877839, "epoch": 0.029400270554637006, "flos": 24428018689920.0, "grad_norm": 1.960793600822513, "language_loss": 0.88801819, "learning_rate": 3.986966109896785e-06, "loss": 0.9116143, "num_input_tokens_seen": 10400665, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 2.0, "step": 489, "time_per_iteration": 2.543733596801758 }, { "auxiliary_loss_clip": 0.01274792, "auxiliary_loss_mlp": 0.01073195, "balance_loss_clip": 1.03891015, "balance_loss_mlp": 1.07584858, "epoch": 0.029460393807304974, "flos": 20120892900480.0, "grad_norm": 1.963419039280727, "language_loss": 0.88513762, "learning_rate": 3.988281436571815e-06, "loss": 0.9086175, "num_input_tokens_seen": 10420150, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 1.984375, "step": 490, "time_per_iteration": 2.542288303375244 }, { "auxiliary_loss_clip": 0.01279372, "auxiliary_loss_mlp": 0.01085653, "balance_loss_clip": 1.05272746, "balance_loss_mlp": 1.07758605, "epoch": 0.029520517059972943, "flos": 17675914965120.0, "grad_norm": 3.4981639058245872, "language_loss": 0.9140532, "learning_rate": 3.989594081641164e-06, "loss": 0.93770343, "num_input_tokens_seen": 10438210, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 2.015625, "step": 491, "time_per_iteration": 2.4809136390686035 }, { "auxiliary_loss_clip": 0.01270355, "auxiliary_loss_mlp": 0.01072725, "balance_loss_clip": 1.04058671, "balance_loss_mlp": 1.07639992, "epoch": 0.029580640312640915, "flos": 18953185662720.0, "grad_norm": 1.8840986256977632, "language_loss": 0.85324478, "learning_rate": 3.9909040560167675e-06, "loss": 0.87667561, "num_input_tokens_seen": 10455125, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.9375, "step": 492, "time_per_iteration": 2.720346212387085 }, { "auxiliary_loss_clip": 0.01280412, "auxiliary_loss_mlp": 0.0110069, "balance_loss_clip": 1.06592846, "balance_loss_mlp": 1.08128381, "epoch": 0.029640763565308884, "flos": 18726121837440.0, "grad_norm": 3.195370775170887, "language_loss": 0.84048307, "learning_rate": 3.992211370544093e-06, "loss": 0.86429405, "num_input_tokens_seen": 10470990, "router_z_loss_clip": 0.34765625, "router_z_loss_mlp": 1.984375, "step": 493, "time_per_iteration": 2.8871009349823 }, { "auxiliary_loss_clip": 0.01274472, "auxiliary_loss_mlp": 0.01070829, "balance_loss_clip": 1.03811824, "balance_loss_mlp": 1.07487059, "epoch": 0.029700886817976852, "flos": 20595308757120.0, "grad_norm": 2.797232375201074, "language_loss": 0.86676013, "learning_rate": 3.99351603600268e-06, "loss": 0.89021313, "num_input_tokens_seen": 10490685, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 2.0, "step": 494, "time_per_iteration": 2.8328700065612793 }, { "auxiliary_loss_clip": 0.01282448, "auxiliary_loss_mlp": 0.01080948, "balance_loss_clip": 1.04993033, "balance_loss_mlp": 1.07982588, "epoch": 0.02976101007064482, "flos": 22236857233920.0, "grad_norm": 2.686104025030144, "language_loss": 0.86788881, "learning_rate": 3.994818063106668e-06, "loss": 0.89152277, "num_input_tokens_seen": 10509435, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 2.03125, "step": 495, "time_per_iteration": 2.843939781188965 }, { "auxiliary_loss_clip": 0.0127044, "auxiliary_loss_mlp": 0.01076498, "balance_loss_clip": 1.04464555, "balance_loss_mlp": 1.07702231, "epoch": 0.029821133323312793, "flos": 23732644320000.0, "grad_norm": 2.6405055300366245, "language_loss": 0.6196627, "learning_rate": 3.99611746250533e-06, "loss": 0.64313209, "num_input_tokens_seen": 10530050, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.9375, "step": 496, "time_per_iteration": 2.553208589553833 }, { "auxiliary_loss_clip": 0.01272882, "auxiliary_loss_mlp": 0.01085769, "balance_loss_clip": 1.05306983, "balance_loss_mlp": 1.07950175, "epoch": 0.02988125657598076, "flos": 22419498913920.0, "grad_norm": 1.9054143339124545, "language_loss": 0.88626558, "learning_rate": 3.997414244783595e-06, "loss": 0.90985209, "num_input_tokens_seen": 10551370, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.9296875, "step": 497, "time_per_iteration": 2.543546676635742 }, { "auxiliary_loss_clip": 0.0128025, "auxiliary_loss_mlp": 0.01079357, "balance_loss_clip": 1.0461688, "balance_loss_mlp": 1.0803616, "epoch": 0.02994137982864873, "flos": 13845108453120.0, "grad_norm": 2.7970527893630326, "language_loss": 0.85137701, "learning_rate": 3.998708420462557e-06, "loss": 0.87497306, "num_input_tokens_seen": 10569225, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 2.0, "step": 498, "time_per_iteration": 2.4741671085357666 }, { "auxiliary_loss_clip": 0.01274854, "auxiliary_loss_mlp": 0.01075121, "balance_loss_clip": 1.04446077, "balance_loss_mlp": 1.07787538, "epoch": 0.0300015030813167, "flos": 23908354675200.0, "grad_norm": 3.5185291985078155, "language_loss": 0.78066677, "learning_rate": 4e-06, "loss": 0.80416656, "num_input_tokens_seen": 10586170, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.96875, "step": 499, "time_per_iteration": 2.5136983394622803 }, { "auxiliary_loss_clip": 0.01277745, "auxiliary_loss_mlp": 0.01080017, "balance_loss_clip": 1.04795003, "balance_loss_mlp": 1.0803628, "epoch": 0.030061626333984667, "flos": 22016796560640.0, "grad_norm": 1.6917066863693555, "language_loss": 0.82635379, "learning_rate": 3.9999999620799e-06, "loss": 0.84993148, "num_input_tokens_seen": 10606205, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.9765625, "step": 500, "time_per_iteration": 2.498228073120117 }, { "auxiliary_loss_clip": 0.01271475, "auxiliary_loss_mlp": 0.01084657, "balance_loss_clip": 1.04934752, "balance_loss_mlp": 1.07567227, "epoch": 0.03012174958665264, "flos": 23039747988480.0, "grad_norm": 2.748283437834822, "language_loss": 0.8801741, "learning_rate": 3.9999998483196e-06, "loss": 0.90373546, "num_input_tokens_seen": 10625995, "router_z_loss_clip": 0.35351562, "router_z_loss_mlp": 1.9609375, "step": 501, "time_per_iteration": 2.5077388286590576 }, { "auxiliary_loss_clip": 0.0127659, "auxiliary_loss_mlp": 0.01071101, "balance_loss_clip": 1.03979635, "balance_loss_mlp": 1.07704425, "epoch": 0.030181872839320608, "flos": 18953257489920.0, "grad_norm": 4.376843760172703, "language_loss": 0.8638308, "learning_rate": 3.9999996587191065e-06, "loss": 0.8873077, "num_input_tokens_seen": 10644105, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.9921875, "step": 502, "time_per_iteration": 2.4645867347717285 }, { "auxiliary_loss_clip": 0.01274323, "auxiliary_loss_mlp": 0.01076472, "balance_loss_clip": 1.04368949, "balance_loss_mlp": 1.07892561, "epoch": 0.030241996091988577, "flos": 16728017005440.0, "grad_norm": 2.385958642549728, "language_loss": 0.84553325, "learning_rate": 3.999999393278425e-06, "loss": 0.8690412, "num_input_tokens_seen": 10661090, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.953125, "step": 503, "time_per_iteration": 2.4754319190979004 }, { "auxiliary_loss_clip": 0.01267399, "auxiliary_loss_mlp": 0.01084958, "balance_loss_clip": 1.05274737, "balance_loss_mlp": 1.07710457, "epoch": 0.030302119344656545, "flos": 28621271387520.0, "grad_norm": 1.597451111341248, "language_loss": 0.88102537, "learning_rate": 3.999999051997567e-06, "loss": 0.90454894, "num_input_tokens_seen": 10682380, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 1.90625, "step": 504, "time_per_iteration": 2.5359039306640625 }, { "auxiliary_loss_clip": 0.01270602, "auxiliary_loss_mlp": 0.01089008, "balance_loss_clip": 1.05715489, "balance_loss_mlp": 1.07630455, "epoch": 0.030362242597324514, "flos": 15669334523520.0, "grad_norm": 1.8616956991861708, "language_loss": 0.78355908, "learning_rate": 3.9999986348765425e-06, "loss": 0.80715513, "num_input_tokens_seen": 10699925, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.9375, "step": 505, "time_per_iteration": 2.477348566055298 }, { "auxiliary_loss_clip": 0.01135593, "auxiliary_loss_mlp": 0.01036322, "balance_loss_clip": 1.02945578, "balance_loss_mlp": 1.04475522, "epoch": 0.030422365849992486, "flos": 72125973676800.0, "grad_norm": 5.145308035484059, "language_loss": 0.55033195, "learning_rate": 3.999998141915371e-06, "loss": 0.57205111, "num_input_tokens_seen": 10766525, "router_z_loss_clip": 0.06884766, "router_z_loss_mlp": 0.90625, "step": 506, "time_per_iteration": 3.258871078491211 }, { "auxiliary_loss_clip": 0.01270394, "auxiliary_loss_mlp": 0.01090196, "balance_loss_clip": 1.05755615, "balance_loss_mlp": 1.07550144, "epoch": 0.030482489102660455, "flos": 19427817000960.0, "grad_norm": 2.053334544978655, "language_loss": 0.83253646, "learning_rate": 3.999997573114069e-06, "loss": 0.8561424, "num_input_tokens_seen": 10786725, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 1.9453125, "step": 507, "time_per_iteration": 2.5366201400756836 }, { "auxiliary_loss_clip": 0.01274177, "auxiliary_loss_mlp": 0.0107998, "balance_loss_clip": 1.04745948, "balance_loss_mlp": 1.0763849, "epoch": 0.030542612355328423, "flos": 20375822701440.0, "grad_norm": 2.2583116600820152, "language_loss": 0.88789904, "learning_rate": 3.999996928472659e-06, "loss": 0.91144061, "num_input_tokens_seen": 10805390, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.9765625, "step": 508, "time_per_iteration": 2.506281614303589 }, { "auxiliary_loss_clip": 0.0127823, "auxiliary_loss_mlp": 0.01065047, "balance_loss_clip": 1.03257418, "balance_loss_mlp": 1.07832575, "epoch": 0.030602735607996392, "flos": 34677354297600.0, "grad_norm": 2.197468684983657, "language_loss": 0.71828747, "learning_rate": 3.999996207991165e-06, "loss": 0.7417202, "num_input_tokens_seen": 10828030, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 2.0, "step": 509, "time_per_iteration": 2.738222122192383 }, { "auxiliary_loss_clip": 0.01270495, "auxiliary_loss_mlp": 0.01072083, "balance_loss_clip": 1.041327, "balance_loss_mlp": 1.07780027, "epoch": 0.03066285886066436, "flos": 23658668259840.0, "grad_norm": 2.1020433126376363, "language_loss": 0.82456106, "learning_rate": 3.999995411669614e-06, "loss": 0.84798682, "num_input_tokens_seen": 10845240, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.921875, "step": 510, "time_per_iteration": 2.5395631790161133 }, { "auxiliary_loss_clip": 0.01274131, "auxiliary_loss_mlp": 0.01077005, "balance_loss_clip": 1.04529536, "balance_loss_mlp": 1.08014154, "epoch": 0.030722982113332332, "flos": 23002975440000.0, "grad_norm": 2.053116495540718, "language_loss": 0.83880591, "learning_rate": 3.999994539508036e-06, "loss": 0.86231726, "num_input_tokens_seen": 10864325, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.9375, "step": 511, "time_per_iteration": 2.5397706031799316 }, { "auxiliary_loss_clip": 0.01274986, "auxiliary_loss_mlp": 0.01077294, "balance_loss_clip": 1.04575109, "balance_loss_mlp": 1.07663846, "epoch": 0.0307831053660003, "flos": 24750855152640.0, "grad_norm": 2.03868234167311, "language_loss": 0.82212436, "learning_rate": 3.9999935915064655e-06, "loss": 0.8456471, "num_input_tokens_seen": 10883860, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.984375, "step": 512, "time_per_iteration": 2.532201051712036 }, { "auxiliary_loss_clip": 0.01269121, "auxiliary_loss_mlp": 0.01083059, "balance_loss_clip": 1.05030036, "balance_loss_mlp": 1.07433283, "epoch": 0.03084322861866827, "flos": 26140885620480.0, "grad_norm": 1.998359733473131, "language_loss": 0.86915505, "learning_rate": 3.9999925676649374e-06, "loss": 0.89267689, "num_input_tokens_seen": 10904555, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.953125, "step": 513, "time_per_iteration": 2.546088695526123 }, { "auxiliary_loss_clip": 0.01277771, "auxiliary_loss_mlp": 0.01077839, "balance_loss_clip": 1.0455575, "balance_loss_mlp": 1.07871604, "epoch": 0.03090335187133624, "flos": 18771298168320.0, "grad_norm": 1.6392843850791334, "language_loss": 0.79272121, "learning_rate": 3.999991467983491e-06, "loss": 0.81627727, "num_input_tokens_seen": 10923700, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 1.9921875, "step": 514, "time_per_iteration": 2.4906444549560547 }, { "auxiliary_loss_clip": 0.01269709, "auxiliary_loss_mlp": 0.01063748, "balance_loss_clip": 1.03385067, "balance_loss_mlp": 1.07851481, "epoch": 0.030963475124004207, "flos": 23221886878080.0, "grad_norm": 3.0292477087447645, "language_loss": 0.77703929, "learning_rate": 3.999990292462167e-06, "loss": 0.80037391, "num_input_tokens_seen": 10942730, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.90625, "step": 515, "time_per_iteration": 2.526031017303467 }, { "auxiliary_loss_clip": 0.01266965, "auxiliary_loss_mlp": 0.01069735, "balance_loss_clip": 1.03764343, "balance_loss_mlp": 1.07271671, "epoch": 0.03102359837667218, "flos": 42525595411200.0, "grad_norm": 1.9834494871868078, "language_loss": 0.83066177, "learning_rate": 3.999989041101011e-06, "loss": 0.85402882, "num_input_tokens_seen": 10967120, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.9375, "step": 516, "time_per_iteration": 5.799305438995361 }, { "auxiliary_loss_clip": 0.01266761, "auxiliary_loss_mlp": 0.01071944, "balance_loss_clip": 1.0400908, "balance_loss_mlp": 1.07560039, "epoch": 0.031083721629340148, "flos": 21176953689600.0, "grad_norm": 2.20588006118191, "language_loss": 0.7876122, "learning_rate": 3.999987713900071e-06, "loss": 0.81099927, "num_input_tokens_seen": 10986775, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.9140625, "step": 517, "time_per_iteration": 4.101670980453491 }, { "auxiliary_loss_clip": 0.0126389, "auxiliary_loss_mlp": 0.01070465, "balance_loss_clip": 1.03920865, "balance_loss_mlp": 1.07528925, "epoch": 0.031143844882008116, "flos": 29716187713920.0, "grad_norm": 1.621392588955328, "language_loss": 0.90752733, "learning_rate": 3.999986310859396e-06, "loss": 0.93087089, "num_input_tokens_seen": 11011360, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.8828125, "step": 518, "time_per_iteration": 2.583815097808838 }, { "auxiliary_loss_clip": 0.01276321, "auxiliary_loss_mlp": 0.01089593, "balance_loss_clip": 1.05571413, "balance_loss_mlp": 1.08264983, "epoch": 0.031203968134676085, "flos": 23112467072640.0, "grad_norm": 2.142889051237342, "language_loss": 0.86322463, "learning_rate": 3.999984831979039e-06, "loss": 0.88688374, "num_input_tokens_seen": 11030150, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 1.9375, "step": 519, "time_per_iteration": 2.5460057258605957 }, { "auxiliary_loss_clip": 0.01270743, "auxiliary_loss_mlp": 0.01093055, "balance_loss_clip": 1.06177485, "balance_loss_mlp": 1.07316422, "epoch": 0.03126409138734405, "flos": 20954379064320.0, "grad_norm": 2.2597246892251106, "language_loss": 0.86736524, "learning_rate": 3.999983277259057e-06, "loss": 0.89100319, "num_input_tokens_seen": 11049145, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.9765625, "step": 520, "time_per_iteration": 2.5227909088134766 }, { "auxiliary_loss_clip": 0.01275177, "auxiliary_loss_mlp": 0.01086605, "balance_loss_clip": 1.05396569, "balance_loss_mlp": 1.07735085, "epoch": 0.031324214640012026, "flos": 21650112570240.0, "grad_norm": 1.9707812689306878, "language_loss": 0.89207697, "learning_rate": 3.999981646699509e-06, "loss": 0.91569483, "num_input_tokens_seen": 11068835, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 1.9765625, "step": 521, "time_per_iteration": 2.5351991653442383 }, { "auxiliary_loss_clip": 0.01269545, "auxiliary_loss_mlp": 0.01083881, "balance_loss_clip": 1.05097938, "balance_loss_mlp": 1.07676637, "epoch": 0.03138433789267999, "flos": 23441337020160.0, "grad_norm": 1.9286624465511242, "language_loss": 0.71142006, "learning_rate": 3.999979940300456e-06, "loss": 0.7349543, "num_input_tokens_seen": 11088980, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.9296875, "step": 522, "time_per_iteration": 2.6425938606262207 }, { "auxiliary_loss_clip": 0.01271081, "auxiliary_loss_mlp": 0.01079282, "balance_loss_clip": 1.04840636, "balance_loss_mlp": 1.07357216, "epoch": 0.03144446114534796, "flos": 18982164960000.0, "grad_norm": 3.207110993088017, "language_loss": 0.84888846, "learning_rate": 3.999978158061963e-06, "loss": 0.87239206, "num_input_tokens_seen": 11104300, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.9765625, "step": 523, "time_per_iteration": 2.490490436553955 }, { "auxiliary_loss_clip": 0.01274023, "auxiliary_loss_mlp": 0.01075884, "balance_loss_clip": 1.04379261, "balance_loss_mlp": 1.07523274, "epoch": 0.031504584398015935, "flos": 22637692080000.0, "grad_norm": 2.4893092567774064, "language_loss": 0.90125734, "learning_rate": 3.999976299984099e-06, "loss": 0.92475641, "num_input_tokens_seen": 11123335, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.984375, "step": 524, "time_per_iteration": 2.49940824508667 }, { "auxiliary_loss_clip": 0.01276955, "auxiliary_loss_mlp": 0.01081798, "balance_loss_clip": 1.04939651, "balance_loss_mlp": 1.07817233, "epoch": 0.0315647076506839, "flos": 25297056339840.0, "grad_norm": 2.6950663979267153, "language_loss": 0.80156803, "learning_rate": 3.999974366066933e-06, "loss": 0.82515556, "num_input_tokens_seen": 11140880, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.984375, "step": 525, "time_per_iteration": 2.537539482116699 }, { "auxiliary_loss_clip": 0.01269548, "auxiliary_loss_mlp": 0.0107772, "balance_loss_clip": 1.04591453, "balance_loss_mlp": 1.07299304, "epoch": 0.03162483090335187, "flos": 16982839065600.0, "grad_norm": 2.270632990987524, "language_loss": 0.81029993, "learning_rate": 3.999972356310538e-06, "loss": 0.83377254, "num_input_tokens_seen": 11158710, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.96875, "step": 526, "time_per_iteration": 2.4579432010650635 }, { "auxiliary_loss_clip": 0.01276375, "auxiliary_loss_mlp": 0.0106704, "balance_loss_clip": 1.03366184, "balance_loss_mlp": 1.07807612, "epoch": 0.03168495415601984, "flos": 18734489706240.0, "grad_norm": 2.5962566888680283, "language_loss": 0.81345379, "learning_rate": 3.999970270714991e-06, "loss": 0.83688796, "num_input_tokens_seen": 11177550, "router_z_loss_clip": 0.33398438, "router_z_loss_mlp": 1.984375, "step": 527, "time_per_iteration": 2.485442876815796 }, { "auxiliary_loss_clip": 0.01265889, "auxiliary_loss_mlp": 0.01076422, "balance_loss_clip": 1.04390216, "balance_loss_mlp": 1.07249832, "epoch": 0.03174507740868781, "flos": 21214875473280.0, "grad_norm": 2.2618844298286307, "language_loss": 0.93905455, "learning_rate": 3.999968109280371e-06, "loss": 0.96247768, "num_input_tokens_seen": 11196230, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.9375, "step": 528, "time_per_iteration": 2.510507106781006 }, { "auxiliary_loss_clip": 0.01267195, "auxiliary_loss_mlp": 0.01070705, "balance_loss_clip": 1.04011548, "balance_loss_mlp": 1.07223189, "epoch": 0.03180520066135578, "flos": 24787663614720.0, "grad_norm": 2.14072191720094, "language_loss": 0.84083599, "learning_rate": 3.99996587200676e-06, "loss": 0.86421496, "num_input_tokens_seen": 11214935, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.953125, "step": 529, "time_per_iteration": 2.5378541946411133 }, { "auxiliary_loss_clip": 0.01269158, "auxiliary_loss_mlp": 0.01083248, "balance_loss_clip": 1.05213428, "balance_loss_mlp": 1.0792942, "epoch": 0.03186532391402375, "flos": 24864261367680.0, "grad_norm": 1.904885169054086, "language_loss": 0.90249312, "learning_rate": 3.999963558894243e-06, "loss": 0.92601722, "num_input_tokens_seen": 11235310, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 1.8984375, "step": 530, "time_per_iteration": 2.542607307434082 }, { "auxiliary_loss_clip": 0.01265097, "auxiliary_loss_mlp": 0.01069548, "balance_loss_clip": 1.03717029, "balance_loss_mlp": 1.07041669, "epoch": 0.03192544716669172, "flos": 21215055041280.0, "grad_norm": 2.195499920281593, "language_loss": 0.76305389, "learning_rate": 3.999961169942907e-06, "loss": 0.78640032, "num_input_tokens_seen": 11254425, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.953125, "step": 531, "time_per_iteration": 2.492338180541992 }, { "auxiliary_loss_clip": 0.01262793, "auxiliary_loss_mlp": 0.01060342, "balance_loss_clip": 1.02846527, "balance_loss_mlp": 1.07061851, "epoch": 0.03198557041935969, "flos": 24353216616960.0, "grad_norm": 2.3409790798001042, "language_loss": 0.90581262, "learning_rate": 3.999958705152843e-06, "loss": 0.92904401, "num_input_tokens_seen": 11274595, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.921875, "step": 532, "time_per_iteration": 2.5345263481140137 }, { "auxiliary_loss_clip": 0.01134299, "auxiliary_loss_mlp": 0.01016792, "balance_loss_clip": 1.00944924, "balance_loss_mlp": 1.04314184, "epoch": 0.032045693672027656, "flos": 61827367587840.0, "grad_norm": 0.7365865423683059, "language_loss": 0.58026636, "learning_rate": 3.9999561645241445e-06, "loss": 0.6017772, "num_input_tokens_seen": 11336705, "router_z_loss_clip": 0.07324219, "router_z_loss_mlp": 0.9140625, "step": 533, "time_per_iteration": 3.163919687271118 }, { "auxiliary_loss_clip": 0.01262215, "auxiliary_loss_mlp": 0.01077185, "balance_loss_clip": 1.04688215, "balance_loss_mlp": 1.0707202, "epoch": 0.03210581692469563, "flos": 28401174800640.0, "grad_norm": 6.067785132521438, "language_loss": 0.86543858, "learning_rate": 3.999953548056907e-06, "loss": 0.88883257, "num_input_tokens_seen": 11356820, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.9140625, "step": 534, "time_per_iteration": 2.5736827850341797 }, { "auxiliary_loss_clip": 0.01263473, "auxiliary_loss_mlp": 0.01063073, "balance_loss_clip": 1.03279412, "balance_loss_mlp": 1.07333159, "epoch": 0.03216594017736359, "flos": 24717709877760.0, "grad_norm": 2.815623698936664, "language_loss": 0.77194643, "learning_rate": 3.999950855751232e-06, "loss": 0.79521191, "num_input_tokens_seen": 11376645, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.8984375, "step": 535, "time_per_iteration": 2.5287625789642334 }, { "auxiliary_loss_clip": 0.0126578, "auxiliary_loss_mlp": 0.01076443, "balance_loss_clip": 1.04611659, "balance_loss_mlp": 1.07369161, "epoch": 0.032226063430031565, "flos": 31175453646720.0, "grad_norm": 2.4630384432635344, "language_loss": 0.80838263, "learning_rate": 3.999948087607219e-06, "loss": 0.83180493, "num_input_tokens_seen": 11397310, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.921875, "step": 536, "time_per_iteration": 2.604050397872925 }, { "auxiliary_loss_clip": 0.01268873, "auxiliary_loss_mlp": 0.01072557, "balance_loss_clip": 1.03963113, "balance_loss_mlp": 1.07633877, "epoch": 0.03228618668269954, "flos": 32198225506560.0, "grad_norm": 2.8722942436916, "language_loss": 0.69771981, "learning_rate": 3.999945243624975e-06, "loss": 0.72113413, "num_input_tokens_seen": 11418475, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.9296875, "step": 537, "time_per_iteration": 2.5898778438568115 }, { "auxiliary_loss_clip": 0.01266536, "auxiliary_loss_mlp": 0.01071254, "balance_loss_clip": 1.04054523, "balance_loss_mlp": 1.07868898, "epoch": 0.0323463099353675, "flos": 22670154996480.0, "grad_norm": 3.997340254059608, "language_loss": 0.82593894, "learning_rate": 3.999942323804607e-06, "loss": 0.84931684, "num_input_tokens_seen": 11436630, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.875, "step": 538, "time_per_iteration": 2.515735149383545 }, { "auxiliary_loss_clip": 0.01272239, "auxiliary_loss_mlp": 0.01072859, "balance_loss_clip": 1.04215109, "balance_loss_mlp": 1.07509136, "epoch": 0.032406433188035474, "flos": 26905172232960.0, "grad_norm": 1.9967482550163373, "language_loss": 0.79396391, "learning_rate": 3.999939328146225e-06, "loss": 0.81741488, "num_input_tokens_seen": 11457275, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.96875, "step": 539, "time_per_iteration": 2.535090923309326 }, { "auxiliary_loss_clip": 0.01265436, "auxiliary_loss_mlp": 0.01064461, "balance_loss_clip": 1.03205979, "balance_loss_mlp": 1.07453346, "epoch": 0.03246655644070344, "flos": 31503928544640.0, "grad_norm": 4.315776727929783, "language_loss": 0.77649587, "learning_rate": 3.999936256649943e-06, "loss": 0.79979491, "num_input_tokens_seen": 11476925, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.90625, "step": 540, "time_per_iteration": 2.5968143939971924 }, { "auxiliary_loss_clip": 0.01274157, "auxiliary_loss_mlp": 0.01068144, "balance_loss_clip": 1.03786492, "balance_loss_mlp": 1.07942438, "epoch": 0.03252667969337141, "flos": 23218331431680.0, "grad_norm": 2.3480560960976207, "language_loss": 0.85729975, "learning_rate": 3.999933109315878e-06, "loss": 0.88072276, "num_input_tokens_seen": 11496830, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.9453125, "step": 541, "time_per_iteration": 2.4877400398254395 }, { "auxiliary_loss_clip": 0.0126383, "auxiliary_loss_mlp": 0.01077473, "balance_loss_clip": 1.04509556, "balance_loss_mlp": 1.075091, "epoch": 0.032586802946039384, "flos": 14757454926720.0, "grad_norm": 2.659575550914408, "language_loss": 0.88981307, "learning_rate": 3.9999298861441496e-06, "loss": 0.91322601, "num_input_tokens_seen": 11515605, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.8828125, "step": 542, "time_per_iteration": 2.51326847076416 }, { "auxiliary_loss_clip": 0.01264654, "auxiliary_loss_mlp": 0.01072591, "balance_loss_clip": 1.0419544, "balance_loss_mlp": 1.07390296, "epoch": 0.03264692619870735, "flos": 24280677100800.0, "grad_norm": 2.2546609806794566, "language_loss": 0.71027792, "learning_rate": 3.999926587134879e-06, "loss": 0.73365033, "num_input_tokens_seen": 11536230, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.90625, "step": 543, "time_per_iteration": 2.5176641941070557 }, { "auxiliary_loss_clip": 0.01263883, "auxiliary_loss_mlp": 0.01084056, "balance_loss_clip": 1.05279899, "balance_loss_mlp": 1.06902695, "epoch": 0.03270704945137532, "flos": 22893160584960.0, "grad_norm": 3.4332445539916514, "language_loss": 0.91837347, "learning_rate": 3.999923212288192e-06, "loss": 0.94185281, "num_input_tokens_seen": 11554715, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.953125, "step": 544, "time_per_iteration": 2.5108227729797363 }, { "auxiliary_loss_clip": 0.01267468, "auxiliary_loss_mlp": 0.01072724, "balance_loss_clip": 1.0439949, "balance_loss_mlp": 1.07614136, "epoch": 0.032767172704043286, "flos": 18041018757120.0, "grad_norm": 3.44690342456265, "language_loss": 0.65642512, "learning_rate": 3.999919761604216e-06, "loss": 0.67982703, "num_input_tokens_seen": 11571370, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.9140625, "step": 545, "time_per_iteration": 2.4671437740325928 }, { "auxiliary_loss_clip": 0.01264191, "auxiliary_loss_mlp": 0.01066404, "balance_loss_clip": 1.0360533, "balance_loss_mlp": 1.07173586, "epoch": 0.03282729595671126, "flos": 22528739151360.0, "grad_norm": 2.337334080631783, "language_loss": 0.92116034, "learning_rate": 3.999916235083083e-06, "loss": 0.94446623, "num_input_tokens_seen": 11588560, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.9296875, "step": 546, "time_per_iteration": 2.49176025390625 }, { "auxiliary_loss_clip": 0.01262861, "auxiliary_loss_mlp": 0.01068438, "balance_loss_clip": 1.03627574, "balance_loss_mlp": 1.07025123, "epoch": 0.03288741920937923, "flos": 20410620001920.0, "grad_norm": 2.3458833739107083, "language_loss": 0.81958294, "learning_rate": 3.999912632724925e-06, "loss": 0.84289587, "num_input_tokens_seen": 11605685, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 1.921875, "step": 547, "time_per_iteration": 2.4903180599212646 }, { "auxiliary_loss_clip": 0.0126514, "auxiliary_loss_mlp": 0.01069044, "balance_loss_clip": 1.0385983, "balance_loss_mlp": 1.0740217, "epoch": 0.032947542462047195, "flos": 20777986350720.0, "grad_norm": 1.9403038872815082, "language_loss": 0.81539601, "learning_rate": 3.999908954529881e-06, "loss": 0.83873785, "num_input_tokens_seen": 11626290, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.90625, "step": 548, "time_per_iteration": 2.5236217975616455 }, { "auxiliary_loss_clip": 0.01264817, "auxiliary_loss_mlp": 0.01074986, "balance_loss_clip": 1.042418, "balance_loss_mlp": 1.07318914, "epoch": 0.03300766571471517, "flos": 19901263190400.0, "grad_norm": 3.4405510046975842, "language_loss": 0.6737386, "learning_rate": 3.999905200498087e-06, "loss": 0.69713652, "num_input_tokens_seen": 11643950, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.9140625, "step": 549, "time_per_iteration": 2.4950718879699707 }, { "auxiliary_loss_clip": 0.01258224, "auxiliary_loss_mlp": 0.01071505, "balance_loss_clip": 1.04067779, "balance_loss_mlp": 1.07228458, "epoch": 0.03306778896738313, "flos": 17967760968960.0, "grad_norm": 2.1608138554633145, "language_loss": 0.85931897, "learning_rate": 3.999901370629689e-06, "loss": 0.88261628, "num_input_tokens_seen": 11662560, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.859375, "step": 550, "time_per_iteration": 2.4719536304473877 }, { "auxiliary_loss_clip": 0.01263463, "auxiliary_loss_mlp": 0.01083352, "balance_loss_clip": 1.05202413, "balance_loss_mlp": 1.07423043, "epoch": 0.033127912220051105, "flos": 21653380707840.0, "grad_norm": 2.176904164160749, "language_loss": 0.81698465, "learning_rate": 3.99989746492483e-06, "loss": 0.84045279, "num_input_tokens_seen": 11682265, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.890625, "step": 551, "time_per_iteration": 2.492631196975708 }, { "auxiliary_loss_clip": 0.01269535, "auxiliary_loss_mlp": 0.01078278, "balance_loss_clip": 1.04706883, "balance_loss_mlp": 1.07502067, "epoch": 0.03318803547271908, "flos": 30188376927360.0, "grad_norm": 3.1954665751787132, "language_loss": 0.85879517, "learning_rate": 3.999893483383658e-06, "loss": 0.8822732, "num_input_tokens_seen": 11699300, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.9453125, "step": 552, "time_per_iteration": 2.5530803203582764 }, { "auxiliary_loss_clip": 0.01268806, "auxiliary_loss_mlp": 0.01075708, "balance_loss_clip": 1.04299641, "balance_loss_mlp": 1.07636714, "epoch": 0.03324815872538704, "flos": 20376038183040.0, "grad_norm": 2.5314551104054175, "language_loss": 0.9298588, "learning_rate": 3.999889426006326e-06, "loss": 0.95330393, "num_input_tokens_seen": 11716955, "router_z_loss_clip": 0.328125, "router_z_loss_mlp": 1.921875, "step": 553, "time_per_iteration": 2.487058401107788 }, { "auxiliary_loss_clip": 0.01263255, "auxiliary_loss_mlp": 0.0107077, "balance_loss_clip": 1.03839231, "balance_loss_mlp": 1.07290721, "epoch": 0.033308281978055014, "flos": 24494560634880.0, "grad_norm": 2.5151614769062927, "language_loss": 0.78905261, "learning_rate": 3.999885292792986e-06, "loss": 0.81239283, "num_input_tokens_seen": 11736130, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.90625, "step": 554, "time_per_iteration": 2.556032180786133 }, { "auxiliary_loss_clip": 0.01258051, "auxiliary_loss_mlp": 0.01086714, "balance_loss_clip": 1.05369306, "balance_loss_mlp": 1.07079256, "epoch": 0.03336840523072298, "flos": 23400326666880.0, "grad_norm": 2.5639275077352495, "language_loss": 0.82213181, "learning_rate": 3.999881083743795e-06, "loss": 0.8455795, "num_input_tokens_seen": 11754425, "router_z_loss_clip": 0.33007812, "router_z_loss_mlp": 1.875, "step": 555, "time_per_iteration": 2.505932569503784 }, { "auxiliary_loss_clip": 0.01263603, "auxiliary_loss_mlp": 0.01081038, "balance_loss_clip": 1.04877949, "balance_loss_mlp": 1.07148218, "epoch": 0.03342852848339095, "flos": 30550571717760.0, "grad_norm": 3.6611230061361995, "language_loss": 0.8901518, "learning_rate": 3.999876798858914e-06, "loss": 0.91359818, "num_input_tokens_seen": 11772845, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 1.921875, "step": 556, "time_per_iteration": 2.601979970932007 }, { "auxiliary_loss_clip": 0.01261617, "auxiliary_loss_mlp": 0.0108184, "balance_loss_clip": 1.04943871, "balance_loss_mlp": 1.07148647, "epoch": 0.03348865173605892, "flos": 22893304239360.0, "grad_norm": 2.4713657330626932, "language_loss": 0.83816975, "learning_rate": 3.999872438138503e-06, "loss": 0.86160433, "num_input_tokens_seen": 11792850, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.8984375, "step": 557, "time_per_iteration": 2.5117013454437256 }, { "auxiliary_loss_clip": 0.0126902, "auxiliary_loss_mlp": 0.01063727, "balance_loss_clip": 1.03373444, "balance_loss_mlp": 1.07710505, "epoch": 0.03354877498872689, "flos": 17676022705920.0, "grad_norm": 2.994103782498396, "language_loss": 0.9425317, "learning_rate": 3.999868001582729e-06, "loss": 0.96585917, "num_input_tokens_seen": 11809670, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.921875, "step": 558, "time_per_iteration": 5.510988712310791 }, { "auxiliary_loss_clip": 0.01260461, "auxiliary_loss_mlp": 0.01074754, "balance_loss_clip": 1.04371238, "balance_loss_mlp": 1.06996596, "epoch": 0.03360889824139486, "flos": 21652985658240.0, "grad_norm": 2.277305407304813, "language_loss": 0.76765239, "learning_rate": 3.99986348919176e-06, "loss": 0.7910046, "num_input_tokens_seen": 11829665, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 1.90625, "step": 559, "time_per_iteration": 3.9502739906311035 }, { "auxiliary_loss_clip": 0.0126165, "auxiliary_loss_mlp": 0.01081682, "balance_loss_clip": 1.05183148, "balance_loss_mlp": 1.07124186, "epoch": 0.033669021494062826, "flos": 21795730306560.0, "grad_norm": 2.083139553061574, "language_loss": 0.87508583, "learning_rate": 3.9998589009657675e-06, "loss": 0.89851916, "num_input_tokens_seen": 11848190, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.90625, "step": 560, "time_per_iteration": 2.486494541168213 }, { "auxiliary_loss_clip": 0.01257544, "auxiliary_loss_mlp": 0.01068243, "balance_loss_clip": 1.03915584, "balance_loss_mlp": 1.06976068, "epoch": 0.0337291447467308, "flos": 21866222747520.0, "grad_norm": 2.408453170290826, "language_loss": 0.81909907, "learning_rate": 3.999854236904925e-06, "loss": 0.84235692, "num_input_tokens_seen": 11864795, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.8828125, "step": 561, "time_per_iteration": 2.4676904678344727 }, { "auxiliary_loss_clip": 0.01257256, "auxiliary_loss_mlp": 0.01074104, "balance_loss_clip": 1.04411066, "balance_loss_mlp": 1.07189834, "epoch": 0.03378926799939877, "flos": 24245951627520.0, "grad_norm": 1.8196184253526884, "language_loss": 0.82285404, "learning_rate": 3.999849497009409e-06, "loss": 0.84616768, "num_input_tokens_seen": 11885275, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.8515625, "step": 562, "time_per_iteration": 2.534841775894165 }, { "auxiliary_loss_clip": 0.0126465, "auxiliary_loss_mlp": 0.01086385, "balance_loss_clip": 1.0549612, "balance_loss_mlp": 1.07328701, "epoch": 0.033849391252066735, "flos": 16507812677760.0, "grad_norm": 2.7758884722693438, "language_loss": 0.84347296, "learning_rate": 3.999844681279401e-06, "loss": 0.86698329, "num_input_tokens_seen": 11903595, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 1.9140625, "step": 563, "time_per_iteration": 2.457418203353882 }, { "auxiliary_loss_clip": 0.0126332, "auxiliary_loss_mlp": 0.01081303, "balance_loss_clip": 1.05002272, "balance_loss_mlp": 1.07305813, "epoch": 0.03390951450473471, "flos": 15669298609920.0, "grad_norm": 2.1171578542618006, "language_loss": 0.94238049, "learning_rate": 3.99983978971508e-06, "loss": 0.96582669, "num_input_tokens_seen": 11917815, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.90625, "step": 564, "time_per_iteration": 2.4796009063720703 }, { "auxiliary_loss_clip": 0.01263305, "auxiliary_loss_mlp": 0.0107089, "balance_loss_clip": 1.03858447, "balance_loss_mlp": 1.07062328, "epoch": 0.03396963775740267, "flos": 22674787850880.0, "grad_norm": 2.3875543180656837, "language_loss": 0.94401157, "learning_rate": 3.999834822316635e-06, "loss": 0.96735358, "num_input_tokens_seen": 11936305, "router_z_loss_clip": 0.32421875, "router_z_loss_mlp": 1.921875, "step": 565, "time_per_iteration": 2.496396780014038 }, { "auxiliary_loss_clip": 0.01135918, "auxiliary_loss_mlp": 0.01047224, "balance_loss_clip": 1.04143071, "balance_loss_mlp": 1.04344296, "epoch": 0.034029761010070644, "flos": 64392683063040.0, "grad_norm": 1.2898609390902043, "language_loss": 0.54867595, "learning_rate": 3.9998297790842535e-06, "loss": 0.57050741, "num_input_tokens_seen": 11998940, "router_z_loss_clip": 0.05786133, "router_z_loss_mlp": 0.921875, "step": 566, "time_per_iteration": 3.167177677154541 }, { "auxiliary_loss_clip": 0.01264026, "auxiliary_loss_mlp": 0.01070791, "balance_loss_clip": 1.03810441, "balance_loss_mlp": 1.07224607, "epoch": 0.034089884262738616, "flos": 25004204755200.0, "grad_norm": 3.1224623129671127, "language_loss": 0.77012992, "learning_rate": 3.999824660018126e-06, "loss": 0.79347807, "num_input_tokens_seen": 12018860, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 1.921875, "step": 567, "time_per_iteration": 2.584554672241211 }, { "auxiliary_loss_clip": 0.012587, "auxiliary_loss_mlp": 0.01082055, "balance_loss_clip": 1.05182338, "balance_loss_mlp": 1.07287765, "epoch": 0.03415000751540658, "flos": 28439096584320.0, "grad_norm": 1.9696336234244818, "language_loss": 0.80785835, "learning_rate": 3.999819465118447e-06, "loss": 0.83126587, "num_input_tokens_seen": 12039675, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.859375, "step": 568, "time_per_iteration": 2.59576678276062 }, { "auxiliary_loss_clip": 0.01257879, "auxiliary_loss_mlp": 0.01079526, "balance_loss_clip": 1.04810202, "balance_loss_mlp": 1.07259417, "epoch": 0.034210130768074554, "flos": 21468727866240.0, "grad_norm": 1.6860782833470038, "language_loss": 0.86570454, "learning_rate": 3.999814194385413e-06, "loss": 0.88907856, "num_input_tokens_seen": 12057680, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 1.8515625, "step": 569, "time_per_iteration": 2.502708673477173 }, { "auxiliary_loss_clip": 0.01259772, "auxiliary_loss_mlp": 0.01074655, "balance_loss_clip": 1.04356527, "balance_loss_mlp": 1.07180679, "epoch": 0.03427025402074252, "flos": 18697501676160.0, "grad_norm": 1.8620486428555003, "language_loss": 0.95878911, "learning_rate": 3.9998088478192255e-06, "loss": 0.98213339, "num_input_tokens_seen": 12076135, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 1.875, "step": 570, "time_per_iteration": 2.5020155906677246 }, { "auxiliary_loss_clip": 0.01260663, "auxiliary_loss_mlp": 0.01074642, "balance_loss_clip": 1.04061973, "balance_loss_mlp": 1.06856632, "epoch": 0.03433037727341049, "flos": 20849987162880.0, "grad_norm": 2.2338437606452928, "language_loss": 0.79428303, "learning_rate": 3.9998034254200846e-06, "loss": 0.81763613, "num_input_tokens_seen": 12094785, "router_z_loss_clip": 0.33984375, "router_z_loss_mlp": 1.921875, "step": 571, "time_per_iteration": 2.509748697280884 }, { "auxiliary_loss_clip": 0.01258703, "auxiliary_loss_mlp": 0.0108381, "balance_loss_clip": 1.05064535, "balance_loss_mlp": 1.07140088, "epoch": 0.03439050052607846, "flos": 25410282986880.0, "grad_norm": 2.244124792690199, "language_loss": 0.80367458, "learning_rate": 3.999797927188199e-06, "loss": 0.82709968, "num_input_tokens_seen": 12114590, "router_z_loss_clip": 0.33203125, "router_z_loss_mlp": 1.875, "step": 572, "time_per_iteration": 2.564319610595703 }, { "auxiliary_loss_clip": 0.01266983, "auxiliary_loss_mlp": 0.01067905, "balance_loss_clip": 1.03633869, "balance_loss_mlp": 1.07519817, "epoch": 0.03445062377874643, "flos": 17640147997440.0, "grad_norm": 1.8696502567824154, "language_loss": 0.84685385, "learning_rate": 3.999792353123774e-06, "loss": 0.87020278, "num_input_tokens_seen": 12132390, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 1.9140625, "step": 573, "time_per_iteration": 2.4684958457946777 }, { "auxiliary_loss_clip": 0.01261038, "auxiliary_loss_mlp": 0.01067259, "balance_loss_clip": 1.03726602, "balance_loss_mlp": 1.06998837, "epoch": 0.0345107470314144, "flos": 16764502245120.0, "grad_norm": 2.298079342248199, "language_loss": 0.76520652, "learning_rate": 3.999786703227023e-06, "loss": 0.78848946, "num_input_tokens_seen": 12149035, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.90625, "step": 574, "time_per_iteration": 2.539382219314575 }, { "auxiliary_loss_clip": 0.01259235, "auxiliary_loss_mlp": 0.01069066, "balance_loss_clip": 1.0390017, "balance_loss_mlp": 1.07074165, "epoch": 0.03457087028408237, "flos": 14684448533760.0, "grad_norm": 3.314557509232503, "language_loss": 0.83424044, "learning_rate": 3.9997809774981606e-06, "loss": 0.85752344, "num_input_tokens_seen": 12167530, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.8828125, "step": 575, "time_per_iteration": 2.4713995456695557 }, { "auxiliary_loss_clip": 0.01256018, "auxiliary_loss_mlp": 0.01068766, "balance_loss_clip": 1.03786671, "balance_loss_mlp": 1.07190144, "epoch": 0.03463099353675034, "flos": 20011293527040.0, "grad_norm": 3.0607917492822216, "language_loss": 0.83891702, "learning_rate": 3.9997751759374025e-06, "loss": 0.86216486, "num_input_tokens_seen": 12186340, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.84375, "step": 576, "time_per_iteration": 2.49153470993042 }, { "auxiliary_loss_clip": 0.01259109, "auxiliary_loss_mlp": 0.01070501, "balance_loss_clip": 1.0415566, "balance_loss_mlp": 1.07763529, "epoch": 0.03469111678941831, "flos": 25301150490240.0, "grad_norm": 2.4120729484949868, "language_loss": 0.86441463, "learning_rate": 3.99976929854497e-06, "loss": 0.88771069, "num_input_tokens_seen": 12204090, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.8125, "step": 577, "time_per_iteration": 2.5159568786621094 }, { "auxiliary_loss_clip": 0.01257349, "auxiliary_loss_mlp": 0.01074411, "balance_loss_clip": 1.04375029, "balance_loss_mlp": 1.07442546, "epoch": 0.034751240042086275, "flos": 23259413612160.0, "grad_norm": 2.162316213250321, "language_loss": 0.72246575, "learning_rate": 3.9997633453210845e-06, "loss": 0.74578333, "num_input_tokens_seen": 12224850, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.828125, "step": 578, "time_per_iteration": 2.5187623500823975 }, { "auxiliary_loss_clip": 0.01259563, "auxiliary_loss_mlp": 0.01073776, "balance_loss_clip": 1.04177988, "balance_loss_mlp": 1.07218623, "epoch": 0.03481136329475425, "flos": 23769237300480.0, "grad_norm": 4.997320735945946, "language_loss": 0.77656257, "learning_rate": 3.999757316265973e-06, "loss": 0.79989588, "num_input_tokens_seen": 12244935, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.875, "step": 579, "time_per_iteration": 2.512347936630249 }, { "auxiliary_loss_clip": 0.01254612, "auxiliary_loss_mlp": 0.0108251, "balance_loss_clip": 1.05122948, "balance_loss_mlp": 1.0705328, "epoch": 0.03487148654742222, "flos": 20157521794560.0, "grad_norm": 2.1582259078349098, "language_loss": 0.86720538, "learning_rate": 3.999751211379863e-06, "loss": 0.8905766, "num_input_tokens_seen": 12262140, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.84375, "step": 580, "time_per_iteration": 2.5133838653564453 }, { "auxiliary_loss_clip": 0.01260855, "auxiliary_loss_mlp": 0.01060022, "balance_loss_clip": 1.03200817, "balance_loss_mlp": 1.07155252, "epoch": 0.034931609800090184, "flos": 15669585918720.0, "grad_norm": 2.325346911389336, "language_loss": 0.81886041, "learning_rate": 3.999745030662987e-06, "loss": 0.84206921, "num_input_tokens_seen": 12280930, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.8984375, "step": 581, "time_per_iteration": 2.4624385833740234 }, { "auxiliary_loss_clip": 0.0125566, "auxiliary_loss_mlp": 0.01068366, "balance_loss_clip": 1.03935027, "balance_loss_mlp": 1.07249665, "epoch": 0.034991733052758156, "flos": 16362374509440.0, "grad_norm": 2.2199880306542785, "language_loss": 0.77156758, "learning_rate": 3.99973877411558e-06, "loss": 0.79480785, "num_input_tokens_seen": 12299125, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.828125, "step": 582, "time_per_iteration": 2.4742088317871094 }, { "auxiliary_loss_clip": 0.01255025, "auxiliary_loss_mlp": 0.01073023, "balance_loss_clip": 1.04224277, "balance_loss_mlp": 1.07309532, "epoch": 0.03505185630542612, "flos": 19387309438080.0, "grad_norm": 2.0398060210261684, "language_loss": 0.87811971, "learning_rate": 3.999732441737877e-06, "loss": 0.90140015, "num_input_tokens_seen": 12316905, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.8203125, "step": 583, "time_per_iteration": 2.4674181938171387 }, { "auxiliary_loss_clip": 0.01261081, "auxiliary_loss_mlp": 0.01084357, "balance_loss_clip": 1.05374384, "balance_loss_mlp": 1.07220364, "epoch": 0.03511197955809409, "flos": 21323828401920.0, "grad_norm": 2.504321522518801, "language_loss": 0.81013548, "learning_rate": 3.99972603353012e-06, "loss": 0.83358991, "num_input_tokens_seen": 12335070, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.890625, "step": 584, "time_per_iteration": 2.5165956020355225 }, { "auxiliary_loss_clip": 0.01257317, "auxiliary_loss_mlp": 0.01065853, "balance_loss_clip": 1.03602672, "balance_loss_mlp": 1.06996071, "epoch": 0.035172102810762065, "flos": 14136595320960.0, "grad_norm": 4.714468616426622, "language_loss": 0.92902172, "learning_rate": 3.999719549492551e-06, "loss": 0.95225346, "num_input_tokens_seen": 12350315, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.875, "step": 585, "time_per_iteration": 2.4501218795776367 }, { "auxiliary_loss_clip": 0.01257645, "auxiliary_loss_mlp": 0.01072315, "balance_loss_clip": 1.04270315, "balance_loss_mlp": 1.07203293, "epoch": 0.03523222606343003, "flos": 20296890564480.0, "grad_norm": 3.8781489810252086, "language_loss": 0.8763628, "learning_rate": 3.9997129896254165e-06, "loss": 0.89966238, "num_input_tokens_seen": 12366030, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.859375, "step": 586, "time_per_iteration": 2.4820823669433594 }, { "auxiliary_loss_clip": 0.01261555, "auxiliary_loss_mlp": 0.01076109, "balance_loss_clip": 1.04637861, "balance_loss_mlp": 1.07341456, "epoch": 0.035292349316098, "flos": 20375822701440.0, "grad_norm": 2.407596008026584, "language_loss": 0.76694781, "learning_rate": 3.999706353928965e-06, "loss": 0.79032445, "num_input_tokens_seen": 12384895, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.8828125, "step": 587, "time_per_iteration": 2.479261636734009 }, { "auxiliary_loss_clip": 0.01261533, "auxiliary_loss_mlp": 0.01061332, "balance_loss_clip": 1.0309093, "balance_loss_mlp": 1.07342267, "epoch": 0.03535247256876597, "flos": 21468871520640.0, "grad_norm": 2.0265671219003996, "language_loss": 0.78730059, "learning_rate": 3.999699642403449e-06, "loss": 0.81052923, "num_input_tokens_seen": 12404980, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.8828125, "step": 588, "time_per_iteration": 2.5407555103302 }, { "auxiliary_loss_clip": 0.01259637, "auxiliary_loss_mlp": 0.01075721, "balance_loss_clip": 1.04303384, "balance_loss_mlp": 1.07076144, "epoch": 0.03541259582143394, "flos": 23623044946560.0, "grad_norm": 2.201723020899977, "language_loss": 0.94087958, "learning_rate": 3.99969285504912e-06, "loss": 0.96423328, "num_input_tokens_seen": 12423835, "router_z_loss_clip": 0.32617188, "router_z_loss_mlp": 1.890625, "step": 589, "time_per_iteration": 2.4948627948760986 }, { "auxiliary_loss_clip": 0.01262599, "auxiliary_loss_mlp": 0.01071231, "balance_loss_clip": 1.04145217, "balance_loss_mlp": 1.07263362, "epoch": 0.03547271907410191, "flos": 33726367768320.0, "grad_norm": 2.156496064693264, "language_loss": 0.8395949, "learning_rate": 3.99968599186624e-06, "loss": 0.86293322, "num_input_tokens_seen": 12443135, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.8984375, "step": 590, "time_per_iteration": 2.611581802368164 }, { "auxiliary_loss_clip": 0.01254738, "auxiliary_loss_mlp": 0.0106417, "balance_loss_clip": 1.03589332, "balance_loss_mlp": 1.0715754, "epoch": 0.03553284232676988, "flos": 21142695093120.0, "grad_norm": 2.8639621721973856, "language_loss": 0.87200916, "learning_rate": 3.999679052855065e-06, "loss": 0.89519823, "num_input_tokens_seen": 12462895, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.828125, "step": 591, "time_per_iteration": 2.499316930770874 }, { "auxiliary_loss_clip": 0.01258263, "auxiliary_loss_mlp": 0.01077597, "balance_loss_clip": 1.04667437, "balance_loss_mlp": 1.0702529, "epoch": 0.03559296557943785, "flos": 20046593617920.0, "grad_norm": 4.6518711209826495, "language_loss": 0.82738227, "learning_rate": 3.999672038015861e-06, "loss": 0.85074085, "num_input_tokens_seen": 12481515, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.875, "step": 592, "time_per_iteration": 2.5055134296417236 }, { "auxiliary_loss_clip": 0.0114151, "auxiliary_loss_mlp": 0.01014455, "balance_loss_clip": 1.00858974, "balance_loss_mlp": 1.04697847, "epoch": 0.035653088832105814, "flos": 60334597244160.0, "grad_norm": 0.8574487747130051, "language_loss": 0.59781623, "learning_rate": 3.999664947348893e-06, "loss": 0.61937582, "num_input_tokens_seen": 12548220, "router_z_loss_clip": 0.05859375, "router_z_loss_mlp": 0.9453125, "step": 593, "time_per_iteration": 3.1401002407073975 }, { "auxiliary_loss_clip": 0.0125878, "auxiliary_loss_mlp": 0.01067234, "balance_loss_clip": 1.03650153, "balance_loss_mlp": 1.07598996, "epoch": 0.035713212084773786, "flos": 20113135562880.0, "grad_norm": 2.5476573137532226, "language_loss": 0.87049437, "learning_rate": 3.999657780854429e-06, "loss": 0.89375448, "num_input_tokens_seen": 12566105, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.828125, "step": 594, "time_per_iteration": 2.521101951599121 }, { "auxiliary_loss_clip": 0.01258617, "auxiliary_loss_mlp": 0.01069853, "balance_loss_clip": 1.04121935, "balance_loss_mlp": 1.07291389, "epoch": 0.03577333533744176, "flos": 26285785084800.0, "grad_norm": 2.146056795604613, "language_loss": 0.8385185, "learning_rate": 3.999650538532742e-06, "loss": 0.86180323, "num_input_tokens_seen": 12586680, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.859375, "step": 595, "time_per_iteration": 2.5749728679656982 }, { "auxiliary_loss_clip": 0.01257501, "auxiliary_loss_mlp": 0.0108006, "balance_loss_clip": 1.05077064, "balance_loss_mlp": 1.07470989, "epoch": 0.035833458590109724, "flos": 10889732211840.0, "grad_norm": 3.171165474099718, "language_loss": 0.96021116, "learning_rate": 3.999643220384106e-06, "loss": 0.98358679, "num_input_tokens_seen": 12601605, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.828125, "step": 596, "time_per_iteration": 2.5107734203338623 }, { "auxiliary_loss_clip": 0.01264207, "auxiliary_loss_mlp": 0.01078657, "balance_loss_clip": 1.05059505, "balance_loss_mlp": 1.07923675, "epoch": 0.035893581842777696, "flos": 22090198003200.0, "grad_norm": 2.6693931988451056, "language_loss": 0.82703161, "learning_rate": 3.999635826408799e-06, "loss": 0.85046023, "num_input_tokens_seen": 12620365, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.8515625, "step": 597, "time_per_iteration": 2.5096065998077393 }, { "auxiliary_loss_clip": 0.01258867, "auxiliary_loss_mlp": 0.01072135, "balance_loss_clip": 1.04273784, "balance_loss_mlp": 1.07760096, "epoch": 0.03595370509544566, "flos": 23038347358080.0, "grad_norm": 1.8742264103258393, "language_loss": 0.81398094, "learning_rate": 3.999628356607101e-06, "loss": 0.83729094, "num_input_tokens_seen": 12641140, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.8125, "step": 598, "time_per_iteration": 2.507936716079712 }, { "auxiliary_loss_clip": 0.01256841, "auxiliary_loss_mlp": 0.01071622, "balance_loss_clip": 1.04112792, "balance_loss_mlp": 1.07997251, "epoch": 0.03601382834811363, "flos": 20777734955520.0, "grad_norm": 1.770364544357873, "language_loss": 0.81147116, "learning_rate": 3.999620810979295e-06, "loss": 0.83475578, "num_input_tokens_seen": 12661080, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.765625, "step": 599, "time_per_iteration": 4.1066813468933105 }, { "auxiliary_loss_clip": 0.01263153, "auxiliary_loss_mlp": 0.01072271, "balance_loss_clip": 1.04376841, "balance_loss_mlp": 1.0756743, "epoch": 0.036073951600781605, "flos": 23951627585280.0, "grad_norm": 2.1789532182546054, "language_loss": 0.86378181, "learning_rate": 3.999613189525668e-06, "loss": 0.88713604, "num_input_tokens_seen": 12678270, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.875, "step": 600, "time_per_iteration": 5.417440176010132 }, { "auxiliary_loss_clip": 0.01250839, "auxiliary_loss_mlp": 0.01077862, "balance_loss_clip": 1.04827428, "balance_loss_mlp": 1.06961703, "epoch": 0.03613407485344957, "flos": 18912283050240.0, "grad_norm": 2.2509963208810957, "language_loss": 0.82349372, "learning_rate": 3.999605492246508e-06, "loss": 0.84678078, "num_input_tokens_seen": 12697295, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.8125, "step": 601, "time_per_iteration": 2.488349676132202 }, { "auxiliary_loss_clip": 0.01250821, "auxiliary_loss_mlp": 0.01066484, "balance_loss_clip": 1.03665757, "balance_loss_mlp": 1.07095909, "epoch": 0.03619419810611754, "flos": 23038526926080.0, "grad_norm": 2.7187466823215796, "language_loss": 0.75119424, "learning_rate": 3.999597719142107e-06, "loss": 0.77436721, "num_input_tokens_seen": 12716165, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.796875, "step": 602, "time_per_iteration": 2.506977081298828 }, { "auxiliary_loss_clip": 0.01250483, "auxiliary_loss_mlp": 0.01060696, "balance_loss_clip": 1.03208601, "balance_loss_mlp": 1.0711081, "epoch": 0.03625432135878551, "flos": 29457774293760.0, "grad_norm": 2.1181294930274452, "language_loss": 0.79590267, "learning_rate": 3.999589870212761e-06, "loss": 0.81901443, "num_input_tokens_seen": 12735475, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.796875, "step": 603, "time_per_iteration": 2.5421698093414307 }, { "auxiliary_loss_clip": 0.01252386, "auxiliary_loss_mlp": 0.01067735, "balance_loss_clip": 1.04005432, "balance_loss_mlp": 1.07351613, "epoch": 0.03631444461145348, "flos": 23508525409920.0, "grad_norm": 1.9007148000045675, "language_loss": 0.86939734, "learning_rate": 3.9995819454587664e-06, "loss": 0.89259863, "num_input_tokens_seen": 12754540, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.7890625, "step": 604, "time_per_iteration": 2.5013041496276855 }, { "auxiliary_loss_clip": 0.01256132, "auxiliary_loss_mlp": 0.01063805, "balance_loss_clip": 1.03333509, "balance_loss_mlp": 1.075073, "epoch": 0.03637456786412145, "flos": 16618130323200.0, "grad_norm": 4.679796475944965, "language_loss": 0.81003201, "learning_rate": 3.999573944880424e-06, "loss": 0.83323145, "num_input_tokens_seen": 12773050, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.8046875, "step": 605, "time_per_iteration": 2.456016778945923 }, { "auxiliary_loss_clip": 0.01253411, "auxiliary_loss_mlp": 0.01068862, "balance_loss_clip": 1.04120564, "balance_loss_mlp": 1.07259464, "epoch": 0.03643469111678942, "flos": 15851832549120.0, "grad_norm": 2.514778577702537, "language_loss": 0.85937786, "learning_rate": 3.9995658684780375e-06, "loss": 0.88260067, "num_input_tokens_seen": 12791240, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.8125, "step": 606, "time_per_iteration": 2.485832691192627 }, { "auxiliary_loss_clip": 0.01257995, "auxiliary_loss_mlp": 0.01072654, "balance_loss_clip": 1.04373395, "balance_loss_mlp": 1.07275569, "epoch": 0.03649481436945739, "flos": 23620387340160.0, "grad_norm": 8.88432217691545, "language_loss": 0.82447428, "learning_rate": 3.999557716251912e-06, "loss": 0.8477807, "num_input_tokens_seen": 12812245, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.8515625, "step": 607, "time_per_iteration": 2.498627185821533 }, { "auxiliary_loss_clip": 0.01250292, "auxiliary_loss_mlp": 0.01065697, "balance_loss_clip": 1.03815985, "balance_loss_mlp": 1.07165706, "epoch": 0.036554937622125354, "flos": 21755581879680.0, "grad_norm": 4.181731471366393, "language_loss": 0.83541405, "learning_rate": 3.999549488202358e-06, "loss": 0.85857391, "num_input_tokens_seen": 12831085, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.7890625, "step": 608, "time_per_iteration": 2.515286445617676 }, { "auxiliary_loss_clip": 0.01254775, "auxiliary_loss_mlp": 0.01062088, "balance_loss_clip": 1.03111792, "balance_loss_mlp": 1.07363057, "epoch": 0.036615060874793326, "flos": 17819772935040.0, "grad_norm": 2.12907077891396, "language_loss": 0.819121, "learning_rate": 3.999541184329688e-06, "loss": 0.84228963, "num_input_tokens_seen": 12849115, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.8125, "step": 609, "time_per_iteration": 2.4703996181488037 }, { "auxiliary_loss_clip": 0.01265872, "auxiliary_loss_mlp": 0.01077939, "balance_loss_clip": 1.04975772, "balance_loss_mlp": 1.08239913, "epoch": 0.0366751841274613, "flos": 26753808320640.0, "grad_norm": 1.9297102675090705, "language_loss": 0.79525244, "learning_rate": 3.999532804634215e-06, "loss": 0.81869054, "num_input_tokens_seen": 12868005, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.8359375, "step": 610, "time_per_iteration": 2.532620429992676 }, { "auxiliary_loss_clip": 0.01262251, "auxiliary_loss_mlp": 0.01075581, "balance_loss_clip": 1.04663718, "balance_loss_mlp": 1.07675183, "epoch": 0.03673530738012926, "flos": 22196960202240.0, "grad_norm": 2.1787834525190797, "language_loss": 0.87451982, "learning_rate": 3.9995243491162575e-06, "loss": 0.89789814, "num_input_tokens_seen": 12886890, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.8515625, "step": 611, "time_per_iteration": 2.492891550064087 }, { "auxiliary_loss_clip": 0.01252089, "auxiliary_loss_mlp": 0.01081131, "balance_loss_clip": 1.05243754, "balance_loss_mlp": 1.07361698, "epoch": 0.036795430632797235, "flos": 24681655601280.0, "grad_norm": 2.0831606195559647, "language_loss": 0.72561085, "learning_rate": 3.999515817776136e-06, "loss": 0.74894297, "num_input_tokens_seen": 12906130, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.78125, "step": 612, "time_per_iteration": 2.5428686141967773 }, { "auxiliary_loss_clip": 0.01253326, "auxiliary_loss_mlp": 0.01068859, "balance_loss_clip": 1.03910446, "balance_loss_mlp": 1.0710547, "epoch": 0.0368555538854652, "flos": 17748921358080.0, "grad_norm": 2.769340321681168, "language_loss": 0.78919482, "learning_rate": 3.999507210614175e-06, "loss": 0.81241667, "num_input_tokens_seen": 12925260, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.8203125, "step": 613, "time_per_iteration": 2.539445638656616 }, { "auxiliary_loss_clip": 0.01251681, "auxiliary_loss_mlp": 0.0107444, "balance_loss_clip": 1.04628325, "balance_loss_mlp": 1.07150662, "epoch": 0.03691567713813317, "flos": 20594554571520.0, "grad_norm": 3.9208289640324607, "language_loss": 0.93274701, "learning_rate": 3.9994985276307e-06, "loss": 0.95600832, "num_input_tokens_seen": 12944590, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.8046875, "step": 614, "time_per_iteration": 2.4995996952056885 }, { "auxiliary_loss_clip": 0.01259643, "auxiliary_loss_mlp": 0.01074016, "balance_loss_clip": 1.04254484, "balance_loss_mlp": 1.07480192, "epoch": 0.036975800390801145, "flos": 33650380546560.0, "grad_norm": 5.52758766746175, "language_loss": 0.72702652, "learning_rate": 3.999489768826041e-06, "loss": 0.75036317, "num_input_tokens_seen": 12964785, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 1.8515625, "step": 615, "time_per_iteration": 2.586097240447998 }, { "auxiliary_loss_clip": 0.01254235, "auxiliary_loss_mlp": 0.0106536, "balance_loss_clip": 1.03669, "balance_loss_mlp": 1.07052755, "epoch": 0.03703592364346911, "flos": 28293694329600.0, "grad_norm": 1.7347615671076717, "language_loss": 0.81536162, "learning_rate": 3.999480934200528e-06, "loss": 0.8385576, "num_input_tokens_seen": 12986705, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.84375, "step": 616, "time_per_iteration": 2.5439200401306152 }, { "auxiliary_loss_clip": 0.01257576, "auxiliary_loss_mlp": 0.01067643, "balance_loss_clip": 1.03998601, "balance_loss_mlp": 1.07525539, "epoch": 0.03709604689613708, "flos": 31504215853440.0, "grad_norm": 2.2089972448807775, "language_loss": 0.68388319, "learning_rate": 3.999472023754499e-06, "loss": 0.70713538, "num_input_tokens_seen": 13010560, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.8203125, "step": 617, "time_per_iteration": 2.5829484462738037 }, { "auxiliary_loss_clip": 0.01260349, "auxiliary_loss_mlp": 0.0106529, "balance_loss_clip": 1.03484416, "balance_loss_mlp": 1.07698464, "epoch": 0.03715617014880505, "flos": 19609381272960.0, "grad_norm": 2.1622647647895734, "language_loss": 0.80396569, "learning_rate": 3.99946303748829e-06, "loss": 0.82722205, "num_input_tokens_seen": 13028935, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.8359375, "step": 618, "time_per_iteration": 2.5125279426574707 }, { "auxiliary_loss_clip": 0.01258699, "auxiliary_loss_mlp": 0.01068828, "balance_loss_clip": 1.03795278, "balance_loss_mlp": 1.07254696, "epoch": 0.03721629340147302, "flos": 15924192497280.0, "grad_norm": 2.31868389765309, "language_loss": 0.91245085, "learning_rate": 3.999453975402242e-06, "loss": 0.93572605, "num_input_tokens_seen": 13046000, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.859375, "step": 619, "time_per_iteration": 2.4644317626953125 }, { "auxiliary_loss_clip": 0.01255602, "auxiliary_loss_mlp": 0.01073464, "balance_loss_clip": 1.04425824, "balance_loss_mlp": 1.07499337, "epoch": 0.03727641665414099, "flos": 21104090951040.0, "grad_norm": 2.7186136637802583, "language_loss": 0.9455933, "learning_rate": 3.9994448374967e-06, "loss": 0.96888399, "num_input_tokens_seen": 13062995, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.8046875, "step": 620, "time_per_iteration": 2.500775098800659 }, { "auxiliary_loss_clip": 0.0125383, "auxiliary_loss_mlp": 0.01075839, "balance_loss_clip": 1.04484439, "balance_loss_mlp": 1.07182717, "epoch": 0.037336539906808956, "flos": 24131683486080.0, "grad_norm": 2.421581392446416, "language_loss": 0.77245939, "learning_rate": 3.999435623772008e-06, "loss": 0.79575604, "num_input_tokens_seen": 13084120, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 1.8203125, "step": 621, "time_per_iteration": 2.519137382507324 }, { "auxiliary_loss_clip": 0.0125191, "auxiliary_loss_mlp": 0.0106093, "balance_loss_clip": 1.03084111, "balance_loss_mlp": 1.07389104, "epoch": 0.03739666315947693, "flos": 22346384780160.0, "grad_norm": 4.798737446507042, "language_loss": 0.86969656, "learning_rate": 3.999426334228518e-06, "loss": 0.89282489, "num_input_tokens_seen": 13100035, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.78125, "step": 622, "time_per_iteration": 2.4761440753936768 }, { "auxiliary_loss_clip": 0.0125178, "auxiliary_loss_mlp": 0.01063335, "balance_loss_clip": 1.03355694, "balance_loss_mlp": 1.07202709, "epoch": 0.0374567864121449, "flos": 20449511452800.0, "grad_norm": 2.2377150162690045, "language_loss": 0.89810532, "learning_rate": 3.999416968866581e-06, "loss": 0.92125648, "num_input_tokens_seen": 13118070, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.796875, "step": 623, "time_per_iteration": 2.4739041328430176 }, { "auxiliary_loss_clip": 0.01253761, "auxiliary_loss_mlp": 0.01080334, "balance_loss_clip": 1.05072212, "balance_loss_mlp": 1.07294309, "epoch": 0.037516909664812866, "flos": 19208043636480.0, "grad_norm": 1.8147392263338222, "language_loss": 0.84094483, "learning_rate": 3.999407527686551e-06, "loss": 0.86428577, "num_input_tokens_seen": 13136355, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.8125, "step": 624, "time_per_iteration": 2.4785773754119873 }, { "auxiliary_loss_clip": 0.01255483, "auxiliary_loss_mlp": 0.01068721, "balance_loss_clip": 1.03829885, "balance_loss_mlp": 1.07069695, "epoch": 0.03757703291748084, "flos": 35005218664320.0, "grad_norm": 2.9048547374896128, "language_loss": 0.66751933, "learning_rate": 3.999398010688788e-06, "loss": 0.69076133, "num_input_tokens_seen": 13155435, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.84375, "step": 625, "time_per_iteration": 2.5923573970794678 }, { "auxiliary_loss_clip": 0.01245272, "auxiliary_loss_mlp": 0.01066384, "balance_loss_clip": 1.03631914, "balance_loss_mlp": 1.06726432, "epoch": 0.0376371561701488, "flos": 25483899911040.0, "grad_norm": 1.9182677254295812, "language_loss": 0.76943469, "learning_rate": 3.999388417873652e-06, "loss": 0.79255128, "num_input_tokens_seen": 13174295, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.78125, "step": 626, "time_per_iteration": 2.5512707233428955 }, { "auxiliary_loss_clip": 0.0125276, "auxiliary_loss_mlp": 0.01072917, "balance_loss_clip": 1.04366326, "balance_loss_mlp": 1.07126653, "epoch": 0.037697279422816775, "flos": 18185630912640.0, "grad_norm": 1.8332628563277091, "language_loss": 0.81755698, "learning_rate": 3.999378749241506e-06, "loss": 0.84081376, "num_input_tokens_seen": 13192500, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.8125, "step": 627, "time_per_iteration": 2.4552857875823975 }, { "auxiliary_loss_clip": 0.01255417, "auxiliary_loss_mlp": 0.01071067, "balance_loss_clip": 1.04150319, "balance_loss_mlp": 1.07242942, "epoch": 0.03775740267548475, "flos": 24644272521600.0, "grad_norm": 1.6585288777030274, "language_loss": 0.88853657, "learning_rate": 3.999369004792719e-06, "loss": 0.91180134, "num_input_tokens_seen": 13213470, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.828125, "step": 628, "time_per_iteration": 2.529195547103882 }, { "auxiliary_loss_clip": 0.01249147, "auxiliary_loss_mlp": 0.01067163, "balance_loss_clip": 1.03752768, "balance_loss_mlp": 1.06689632, "epoch": 0.03781752592815271, "flos": 21288205088640.0, "grad_norm": 3.3716300606901175, "language_loss": 0.79900485, "learning_rate": 3.999359184527658e-06, "loss": 0.82216799, "num_input_tokens_seen": 13232365, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.8203125, "step": 629, "time_per_iteration": 2.486372709274292 }, { "auxiliary_loss_clip": 0.01248615, "auxiliary_loss_mlp": 0.01062087, "balance_loss_clip": 1.0338459, "balance_loss_mlp": 1.06683254, "epoch": 0.037877649180820684, "flos": 22089623385600.0, "grad_norm": 3.5724671627366495, "language_loss": 0.76881695, "learning_rate": 3.999349288446696e-06, "loss": 0.79192394, "num_input_tokens_seen": 13251920, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.8203125, "step": 630, "time_per_iteration": 2.5155937671661377 }, { "auxiliary_loss_clip": 0.01254144, "auxiliary_loss_mlp": 0.01065613, "balance_loss_clip": 1.03676462, "balance_loss_mlp": 1.06922066, "epoch": 0.03793777243348865, "flos": 14501339976960.0, "grad_norm": 2.661044593248301, "language_loss": 0.918347, "learning_rate": 3.99933931655021e-06, "loss": 0.94154453, "num_input_tokens_seen": 13267440, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.8515625, "step": 631, "time_per_iteration": 2.510401964187622 }, { "auxiliary_loss_clip": 0.01242627, "auxiliary_loss_mlp": 0.01074941, "balance_loss_clip": 1.04349363, "balance_loss_mlp": 1.06466961, "epoch": 0.03799789568615662, "flos": 21908418249600.0, "grad_norm": 1.594838694763877, "language_loss": 0.92231488, "learning_rate": 3.999329268838575e-06, "loss": 0.9454906, "num_input_tokens_seen": 13287850, "router_z_loss_clip": 0.31445312, "router_z_loss_mlp": 1.78125, "step": 632, "time_per_iteration": 2.4942612648010254 }, { "auxiliary_loss_clip": 0.01247203, "auxiliary_loss_mlp": 0.01062331, "balance_loss_clip": 1.03362536, "balance_loss_mlp": 1.06710529, "epoch": 0.03805801893882459, "flos": 24827021942400.0, "grad_norm": 1.9258872765563109, "language_loss": 0.83113456, "learning_rate": 3.999319145312175e-06, "loss": 0.85422981, "num_input_tokens_seen": 13307760, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.796875, "step": 633, "time_per_iteration": 2.5251669883728027 }, { "auxiliary_loss_clip": 0.01246834, "auxiliary_loss_mlp": 0.01066234, "balance_loss_clip": 1.03730178, "balance_loss_mlp": 1.06512856, "epoch": 0.03811814219149256, "flos": 30482952364800.0, "grad_norm": 1.6466843113326288, "language_loss": 0.6954205, "learning_rate": 3.999308945971392e-06, "loss": 0.71855116, "num_input_tokens_seen": 13331230, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.8125, "step": 634, "time_per_iteration": 2.563466787338257 }, { "auxiliary_loss_clip": 0.01142803, "auxiliary_loss_mlp": 0.01040177, "balance_loss_clip": 1.03428769, "balance_loss_mlp": 1.04393339, "epoch": 0.03817826544416053, "flos": 66992577379200.0, "grad_norm": 0.9028356863205277, "language_loss": 0.61712945, "learning_rate": 3.999298670816614e-06, "loss": 0.63895929, "num_input_tokens_seen": 13394760, "router_z_loss_clip": 0.05883789, "router_z_loss_mlp": 0.9921875, "step": 635, "time_per_iteration": 3.148911237716675 }, { "auxiliary_loss_clip": 0.01244721, "auxiliary_loss_mlp": 0.01067482, "balance_loss_clip": 1.03808522, "balance_loss_mlp": 1.0654943, "epoch": 0.038238388696828496, "flos": 20485350247680.0, "grad_norm": 2.2571289216762804, "language_loss": 0.83621705, "learning_rate": 3.9992883198482294e-06, "loss": 0.85933912, "num_input_tokens_seen": 13412775, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.7890625, "step": 636, "time_per_iteration": 2.4668045043945312 }, { "auxiliary_loss_clip": 0.01244938, "auxiliary_loss_mlp": 0.01082002, "balance_loss_clip": 1.05327272, "balance_loss_mlp": 1.06517446, "epoch": 0.03829851194949647, "flos": 17965893461760.0, "grad_norm": 2.3950058057543653, "language_loss": 0.7930848, "learning_rate": 3.999277893066632e-06, "loss": 0.81635422, "num_input_tokens_seen": 13427835, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.796875, "step": 637, "time_per_iteration": 2.4686312675476074 }, { "auxiliary_loss_clip": 0.01247128, "auxiliary_loss_mlp": 0.01075597, "balance_loss_clip": 1.04529405, "balance_loss_mlp": 1.06427574, "epoch": 0.03835863520216444, "flos": 22456522857600.0, "grad_norm": 1.797998250547975, "language_loss": 0.83836555, "learning_rate": 3.999267390472215e-06, "loss": 0.86159277, "num_input_tokens_seen": 13447295, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.828125, "step": 638, "time_per_iteration": 2.495990037918091 }, { "auxiliary_loss_clip": 0.01251751, "auxiliary_loss_mlp": 0.01070653, "balance_loss_clip": 1.04063642, "balance_loss_mlp": 1.06511867, "epoch": 0.038418758454832405, "flos": 22164425458560.0, "grad_norm": 2.703796580362641, "language_loss": 0.70150977, "learning_rate": 3.999256812065381e-06, "loss": 0.72473377, "num_input_tokens_seen": 13468455, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.8671875, "step": 639, "time_per_iteration": 2.513421058654785 }, { "auxiliary_loss_clip": 0.01246426, "auxiliary_loss_mlp": 0.01073836, "balance_loss_clip": 1.04312754, "balance_loss_mlp": 1.06494522, "epoch": 0.03847888170750038, "flos": 22747435107840.0, "grad_norm": 2.521129555093386, "language_loss": 0.85076463, "learning_rate": 3.999246157846526e-06, "loss": 0.87396717, "num_input_tokens_seen": 13489085, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.8125, "step": 640, "time_per_iteration": 2.4878945350646973 }, { "auxiliary_loss_clip": 0.0124939, "auxiliary_loss_mlp": 0.01075837, "balance_loss_clip": 1.04365075, "balance_loss_mlp": 1.06570196, "epoch": 0.03853900496016834, "flos": 22711201263360.0, "grad_norm": 2.029227854273315, "language_loss": 0.82201183, "learning_rate": 3.9992354278160574e-06, "loss": 0.84526408, "num_input_tokens_seen": 13509120, "router_z_loss_clip": 0.32226562, "router_z_loss_mlp": 1.8359375, "step": 641, "time_per_iteration": 5.480987071990967 }, { "auxiliary_loss_clip": 0.01139962, "auxiliary_loss_mlp": 0.0101131, "balance_loss_clip": 1.0051111, "balance_loss_mlp": 1.04340124, "epoch": 0.038599128212836314, "flos": 70399136355840.0, "grad_norm": 0.9125109479601351, "language_loss": 0.65486574, "learning_rate": 3.999224621974381e-06, "loss": 0.67637849, "num_input_tokens_seen": 13562005, "router_z_loss_clip": 0.06201172, "router_z_loss_mlp": 0.96484375, "step": 642, "time_per_iteration": 4.565070629119873 }, { "auxiliary_loss_clip": 0.01242744, "auxiliary_loss_mlp": 0.01060269, "balance_loss_clip": 1.03139615, "balance_loss_mlp": 1.06187606, "epoch": 0.03865925146550429, "flos": 23295144666240.0, "grad_norm": 1.7887406874972043, "language_loss": 0.79445255, "learning_rate": 3.999213740321906e-06, "loss": 0.81748271, "num_input_tokens_seen": 13582185, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.8046875, "step": 643, "time_per_iteration": 2.517549514770508 }, { "auxiliary_loss_clip": 0.01239851, "auxiliary_loss_mlp": 0.01066273, "balance_loss_clip": 1.03750777, "balance_loss_mlp": 1.05999112, "epoch": 0.03871937471817225, "flos": 21430446946560.0, "grad_norm": 2.0061340043498332, "language_loss": 0.82491398, "learning_rate": 3.999202782859046e-06, "loss": 0.84797525, "num_input_tokens_seen": 13599555, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.8046875, "step": 644, "time_per_iteration": 2.4845926761627197 }, { "auxiliary_loss_clip": 0.01243628, "auxiliary_loss_mlp": 0.01065237, "balance_loss_clip": 1.0346241, "balance_loss_mlp": 1.06222653, "epoch": 0.038779497970840224, "flos": 34277309550720.0, "grad_norm": 2.6216594000098206, "language_loss": 0.82158846, "learning_rate": 3.9991917495862165e-06, "loss": 0.84467709, "num_input_tokens_seen": 13621160, "router_z_loss_clip": 0.30664062, "router_z_loss_mlp": 1.8125, "step": 645, "time_per_iteration": 2.6168181896209717 }, { "auxiliary_loss_clip": 0.01247386, "auxiliary_loss_mlp": 0.01068403, "balance_loss_clip": 1.03786182, "balance_loss_mlp": 1.06335545, "epoch": 0.03883962122350819, "flos": 22748189293440.0, "grad_norm": 2.3586265359044374, "language_loss": 0.81859362, "learning_rate": 3.9991806405038345e-06, "loss": 0.84175146, "num_input_tokens_seen": 13641915, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.8359375, "step": 646, "time_per_iteration": 2.4883172512054443 }, { "auxiliary_loss_clip": 0.01246368, "auxiliary_loss_mlp": 0.01077951, "balance_loss_clip": 1.04767203, "balance_loss_mlp": 1.06588089, "epoch": 0.03889974447617616, "flos": 21945837242880.0, "grad_norm": 2.2193376721956612, "language_loss": 0.82015055, "learning_rate": 3.999169455612323e-06, "loss": 0.8433938, "num_input_tokens_seen": 13661410, "router_z_loss_clip": 0.30273438, "router_z_loss_mlp": 1.8046875, "step": 647, "time_per_iteration": 2.5190136432647705 }, { "auxiliary_loss_clip": 0.01241709, "auxiliary_loss_mlp": 0.01063786, "balance_loss_clip": 1.03466344, "balance_loss_mlp": 1.06193757, "epoch": 0.03895986772884413, "flos": 31504826384640.0, "grad_norm": 1.9857703387741847, "language_loss": 0.84367704, "learning_rate": 3.999158194912106e-06, "loss": 0.866732, "num_input_tokens_seen": 13681705, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.796875, "step": 648, "time_per_iteration": 2.568091630935669 }, { "auxiliary_loss_clip": 0.01241187, "auxiliary_loss_mlp": 0.01070164, "balance_loss_clip": 1.04067159, "balance_loss_mlp": 1.06171954, "epoch": 0.0390199909815121, "flos": 19901011795200.0, "grad_norm": 2.118033849719161, "language_loss": 0.8453896, "learning_rate": 3.9991468584036086e-06, "loss": 0.86850309, "num_input_tokens_seen": 13700400, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.796875, "step": 649, "time_per_iteration": 2.521519184112549 }, { "auxiliary_loss_clip": 0.01242435, "auxiliary_loss_mlp": 0.0106485, "balance_loss_clip": 1.03473735, "balance_loss_mlp": 1.06123924, "epoch": 0.03908011423418007, "flos": 21612478095360.0, "grad_norm": 1.7480229058323316, "language_loss": 0.79877651, "learning_rate": 3.999135446087263e-06, "loss": 0.82184935, "num_input_tokens_seen": 13720145, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.8125, "step": 650, "time_per_iteration": 2.4646353721618652 }, { "auxiliary_loss_clip": 0.01236741, "auxiliary_loss_mlp": 0.01068825, "balance_loss_clip": 1.03880787, "balance_loss_mlp": 1.05819035, "epoch": 0.039140237486848035, "flos": 18661411486080.0, "grad_norm": 2.5593989808891457, "language_loss": 0.78510797, "learning_rate": 3.9991239579635e-06, "loss": 0.80816364, "num_input_tokens_seen": 13737500, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.78125, "step": 651, "time_per_iteration": 2.5202229022979736 }, { "auxiliary_loss_clip": 0.01239455, "auxiliary_loss_mlp": 0.01074014, "balance_loss_clip": 1.04206574, "balance_loss_mlp": 1.05956888, "epoch": 0.03920036073951601, "flos": 18661124177280.0, "grad_norm": 2.908361460820879, "language_loss": 0.87685609, "learning_rate": 3.999112394032757e-06, "loss": 0.8999908, "num_input_tokens_seen": 13754750, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.796875, "step": 652, "time_per_iteration": 2.4443416595458984 }, { "auxiliary_loss_clip": 0.01231671, "auxiliary_loss_mlp": 0.01068909, "balance_loss_clip": 1.04075181, "balance_loss_mlp": 1.05766618, "epoch": 0.03926048399218398, "flos": 31354468053120.0, "grad_norm": 2.818875261946874, "language_loss": 0.79261947, "learning_rate": 3.999100754295471e-06, "loss": 0.81562531, "num_input_tokens_seen": 13771990, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.7421875, "step": 653, "time_per_iteration": 2.555471658706665 }, { "auxiliary_loss_clip": 0.01246611, "auxiliary_loss_mlp": 0.01067113, "balance_loss_clip": 1.03771615, "balance_loss_mlp": 1.06175613, "epoch": 0.039320607244851945, "flos": 29603499770880.0, "grad_norm": 4.5801364744739805, "language_loss": 0.86039209, "learning_rate": 3.999089038752085e-06, "loss": 0.88352931, "num_input_tokens_seen": 13792750, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.84375, "step": 654, "time_per_iteration": 2.5269715785980225 }, { "auxiliary_loss_clip": 0.01124584, "auxiliary_loss_mlp": 0.01012921, "balance_loss_clip": 1.00691271, "balance_loss_mlp": 1.03200173, "epoch": 0.03938073049751992, "flos": 66534609951360.0, "grad_norm": 0.7244466786262149, "language_loss": 0.49983484, "learning_rate": 3.999077247403041e-06, "loss": 0.5212099, "num_input_tokens_seen": 13858570, "router_z_loss_clip": 0.06005859, "router_z_loss_mlp": 0.92578125, "step": 655, "time_per_iteration": 3.174342632293701 }, { "auxiliary_loss_clip": 0.01234161, "auxiliary_loss_mlp": 0.01067305, "balance_loss_clip": 1.0387187, "balance_loss_mlp": 1.06044817, "epoch": 0.03944085375018788, "flos": 23367827836800.0, "grad_norm": 2.1485824492693997, "language_loss": 0.80848777, "learning_rate": 3.9990653802487886e-06, "loss": 0.83150244, "num_input_tokens_seen": 13876335, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.734375, "step": 656, "time_per_iteration": 2.4920220375061035 }, { "auxiliary_loss_clip": 0.01247676, "auxiliary_loss_mlp": 0.01083005, "balance_loss_clip": 1.04879236, "balance_loss_mlp": 1.06340957, "epoch": 0.039500977002855854, "flos": 18548292579840.0, "grad_norm": 2.4333941446832554, "language_loss": 0.76292008, "learning_rate": 3.999053437289776e-06, "loss": 0.78622693, "num_input_tokens_seen": 13892640, "router_z_loss_clip": 0.34179688, "router_z_loss_mlp": 1.84375, "step": 657, "time_per_iteration": 2.482020139694214 }, { "auxiliary_loss_clip": 0.01242689, "auxiliary_loss_mlp": 0.01064695, "balance_loss_clip": 1.03486896, "balance_loss_mlp": 1.0620724, "epoch": 0.039561100255523826, "flos": 25338174433920.0, "grad_norm": 3.3260991365148453, "language_loss": 0.8172828, "learning_rate": 3.999041418526457e-06, "loss": 0.84035665, "num_input_tokens_seen": 13910085, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.8046875, "step": 658, "time_per_iteration": 2.522108793258667 }, { "auxiliary_loss_clip": 0.01235906, "auxiliary_loss_mlp": 0.01070576, "balance_loss_clip": 1.03877115, "balance_loss_mlp": 1.059304, "epoch": 0.03962122350819179, "flos": 18219889509120.0, "grad_norm": 2.782100303360271, "language_loss": 0.91524899, "learning_rate": 3.999029323959287e-06, "loss": 0.93831384, "num_input_tokens_seen": 13928800, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.765625, "step": 659, "time_per_iteration": 2.4999022483825684 }, { "auxiliary_loss_clip": 0.01240048, "auxiliary_loss_mlp": 0.01070566, "balance_loss_clip": 1.04133606, "balance_loss_mlp": 1.05979037, "epoch": 0.03968134676085976, "flos": 20522230536960.0, "grad_norm": 2.46334626996438, "language_loss": 0.79223824, "learning_rate": 3.999017153588724e-06, "loss": 0.81534439, "num_input_tokens_seen": 13948325, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.8046875, "step": 660, "time_per_iteration": 2.524019956588745 }, { "auxiliary_loss_clip": 0.01239535, "auxiliary_loss_mlp": 0.01071463, "balance_loss_clip": 1.04142189, "balance_loss_mlp": 1.06261039, "epoch": 0.03974147001352773, "flos": 22422587483520.0, "grad_norm": 1.6153691859122632, "language_loss": 0.8169558, "learning_rate": 3.999004907415231e-06, "loss": 0.84006584, "num_input_tokens_seen": 13969090, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.765625, "step": 661, "time_per_iteration": 2.535318374633789 }, { "auxiliary_loss_clip": 0.01118871, "auxiliary_loss_mlp": 0.01007551, "balance_loss_clip": 1.00213933, "balance_loss_mlp": 1.02702355, "epoch": 0.0398015932661957, "flos": 71128769322240.0, "grad_norm": 0.9111086055284222, "language_loss": 0.69400322, "learning_rate": 3.998992585439272e-06, "loss": 0.71526742, "num_input_tokens_seen": 14037555, "router_z_loss_clip": 0.05419922, "router_z_loss_mlp": 0.921875, "step": 662, "time_per_iteration": 3.243648052215576 }, { "auxiliary_loss_clip": 0.01242868, "auxiliary_loss_mlp": 0.01072485, "balance_loss_clip": 1.04251599, "balance_loss_mlp": 1.06418216, "epoch": 0.03986171651886367, "flos": 16800951571200.0, "grad_norm": 1.8098551104355112, "language_loss": 0.82922542, "learning_rate": 3.998980187661314e-06, "loss": 0.85237896, "num_input_tokens_seen": 14055765, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.78125, "step": 663, "time_per_iteration": 2.455369710922241 }, { "auxiliary_loss_clip": 0.01242553, "auxiliary_loss_mlp": 0.01064381, "balance_loss_clip": 1.03381574, "balance_loss_mlp": 1.06252694, "epoch": 0.03992183977153164, "flos": 24535068197760.0, "grad_norm": 2.507882315225917, "language_loss": 0.87587619, "learning_rate": 3.998967714081826e-06, "loss": 0.89894551, "num_input_tokens_seen": 14074195, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.796875, "step": 664, "time_per_iteration": 2.494264841079712 }, { "auxiliary_loss_clip": 0.01235468, "auxiliary_loss_mlp": 0.01064197, "balance_loss_clip": 1.03412092, "balance_loss_mlp": 1.06041515, "epoch": 0.03998196302419961, "flos": 15595897167360.0, "grad_norm": 2.1951677166896038, "language_loss": 0.85143203, "learning_rate": 3.998955164701281e-06, "loss": 0.87442863, "num_input_tokens_seen": 14090215, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.75, "step": 665, "time_per_iteration": 2.4719972610473633 }, { "auxiliary_loss_clip": 0.01246949, "auxiliary_loss_mlp": 0.01080404, "balance_loss_clip": 1.04871809, "balance_loss_mlp": 1.06363332, "epoch": 0.04004208627686758, "flos": 25305065072640.0, "grad_norm": 2.103403393083134, "language_loss": 0.81582844, "learning_rate": 3.998942539520158e-06, "loss": 0.83910203, "num_input_tokens_seen": 14112150, "router_z_loss_clip": 0.31640625, "router_z_loss_mlp": 1.8359375, "step": 666, "time_per_iteration": 2.527268648147583 }, { "auxiliary_loss_clip": 0.01234572, "auxiliary_loss_mlp": 0.0107087, "balance_loss_clip": 1.03999472, "balance_loss_mlp": 1.05861938, "epoch": 0.04010220952953555, "flos": 23475847011840.0, "grad_norm": 4.406669958476304, "language_loss": 0.8701936, "learning_rate": 3.998929838538932e-06, "loss": 0.89324796, "num_input_tokens_seen": 14131475, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.7578125, "step": 667, "time_per_iteration": 2.5125062465667725 }, { "auxiliary_loss_clip": 0.012336, "auxiliary_loss_mlp": 0.01061152, "balance_loss_clip": 1.03270841, "balance_loss_mlp": 1.06195974, "epoch": 0.04016233278220352, "flos": 18617025254400.0, "grad_norm": 2.1172291397321423, "language_loss": 0.80977416, "learning_rate": 3.998917061758087e-06, "loss": 0.83272171, "num_input_tokens_seen": 14146165, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.71875, "step": 668, "time_per_iteration": 2.4625473022460938 }, { "auxiliary_loss_clip": 0.01111842, "auxiliary_loss_mlp": 0.01012426, "balance_loss_clip": 1.00718045, "balance_loss_mlp": 1.02309012, "epoch": 0.040222456034871484, "flos": 70906194696960.0, "grad_norm": 0.7914064806378739, "language_loss": 0.60121483, "learning_rate": 3.998904209178107e-06, "loss": 0.62245756, "num_input_tokens_seen": 14215005, "router_z_loss_clip": 0.05249023, "router_z_loss_mlp": 0.88671875, "step": 669, "time_per_iteration": 3.2090156078338623 }, { "auxiliary_loss_clip": 0.01234472, "auxiliary_loss_mlp": 0.01067605, "balance_loss_clip": 1.03880405, "balance_loss_mlp": 1.05810022, "epoch": 0.040282579287539456, "flos": 23764712186880.0, "grad_norm": 1.6654915713087362, "language_loss": 0.86339968, "learning_rate": 3.9988912807994785e-06, "loss": 0.88642049, "num_input_tokens_seen": 14235510, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.765625, "step": 670, "time_per_iteration": 2.5716168880462646 }, { "auxiliary_loss_clip": 0.01233876, "auxiliary_loss_mlp": 0.01070233, "balance_loss_clip": 1.04162323, "balance_loss_mlp": 1.06057715, "epoch": 0.04034270254020743, "flos": 18478518410880.0, "grad_norm": 1.7866703302571962, "language_loss": 0.75126874, "learning_rate": 3.998878276622692e-06, "loss": 0.77430987, "num_input_tokens_seen": 14254565, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.734375, "step": 671, "time_per_iteration": 2.499302387237549 }, { "auxiliary_loss_clip": 0.01237736, "auxiliary_loss_mlp": 0.01066987, "balance_loss_clip": 1.038234, "balance_loss_mlp": 1.06093383, "epoch": 0.040402825792875394, "flos": 17201858244480.0, "grad_norm": 1.981350946247716, "language_loss": 0.92277509, "learning_rate": 3.998865196648242e-06, "loss": 0.9458223, "num_input_tokens_seen": 14271885, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.765625, "step": 672, "time_per_iteration": 2.475755214691162 }, { "auxiliary_loss_clip": 0.01235952, "auxiliary_loss_mlp": 0.01072573, "balance_loss_clip": 1.0416379, "balance_loss_mlp": 1.06021726, "epoch": 0.040462949045543366, "flos": 19172168928000.0, "grad_norm": 2.0619397974043236, "language_loss": 0.89888024, "learning_rate": 3.998852040876622e-06, "loss": 0.92196548, "num_input_tokens_seen": 14289670, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.7578125, "step": 673, "time_per_iteration": 2.4545366764068604 }, { "auxiliary_loss_clip": 0.01230092, "auxiliary_loss_mlp": 0.01073467, "balance_loss_clip": 1.04385495, "balance_loss_mlp": 1.05678391, "epoch": 0.04052307229821133, "flos": 24019821555840.0, "grad_norm": 2.8759699253484774, "language_loss": 0.74916744, "learning_rate": 3.998838809308334e-06, "loss": 0.77220309, "num_input_tokens_seen": 14309285, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.734375, "step": 674, "time_per_iteration": 2.519322633743286 }, { "auxiliary_loss_clip": 0.01241949, "auxiliary_loss_mlp": 0.01058817, "balance_loss_clip": 1.02912247, "balance_loss_mlp": 1.06101727, "epoch": 0.0405831955508793, "flos": 16436601964800.0, "grad_norm": 2.225749280387157, "language_loss": 0.7803483, "learning_rate": 3.9988255019438766e-06, "loss": 0.80335593, "num_input_tokens_seen": 14328300, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.8125, "step": 675, "time_per_iteration": 2.4606199264526367 }, { "auxiliary_loss_clip": 0.01234378, "auxiliary_loss_mlp": 0.01071699, "balance_loss_clip": 1.04122925, "balance_loss_mlp": 1.05863905, "epoch": 0.040643318803547275, "flos": 24279922915200.0, "grad_norm": 1.8596445050250694, "language_loss": 0.77202654, "learning_rate": 3.998812118783757e-06, "loss": 0.79508734, "num_input_tokens_seen": 14346395, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.7578125, "step": 676, "time_per_iteration": 2.5295088291168213 }, { "auxiliary_loss_clip": 0.01237506, "auxiliary_loss_mlp": 0.01074143, "balance_loss_clip": 1.04436493, "balance_loss_mlp": 1.06103528, "epoch": 0.04070344205621524, "flos": 17712076982400.0, "grad_norm": 3.291631373615813, "language_loss": 0.85480309, "learning_rate": 3.9987986598284804e-06, "loss": 0.87791955, "num_input_tokens_seen": 14364605, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.765625, "step": 677, "time_per_iteration": 2.454719305038452 }, { "auxiliary_loss_clip": 0.0123173, "auxiliary_loss_mlp": 0.01064583, "balance_loss_clip": 1.03509068, "balance_loss_mlp": 1.05850267, "epoch": 0.04076356530888321, "flos": 26177658168960.0, "grad_norm": 1.8722050958362322, "language_loss": 0.76512283, "learning_rate": 3.998785125078559e-06, "loss": 0.78808594, "num_input_tokens_seen": 14385265, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.734375, "step": 678, "time_per_iteration": 2.5385332107543945 }, { "auxiliary_loss_clip": 0.01234548, "auxiliary_loss_mlp": 0.01067796, "balance_loss_clip": 1.03878069, "balance_loss_mlp": 1.05807757, "epoch": 0.04082368856155118, "flos": 35773455772800.0, "grad_norm": 1.9279594044960906, "language_loss": 0.82099396, "learning_rate": 3.998771514534505e-06, "loss": 0.84401739, "num_input_tokens_seen": 14406090, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.765625, "step": 679, "time_per_iteration": 2.589613437652588 }, { "auxiliary_loss_clip": 0.01236944, "auxiliary_loss_mlp": 0.01063487, "balance_loss_clip": 1.03438759, "balance_loss_mlp": 1.06294703, "epoch": 0.04088381181421915, "flos": 28146640049280.0, "grad_norm": 1.7131171004842303, "language_loss": 0.76491696, "learning_rate": 3.998757828196835e-06, "loss": 0.78792125, "num_input_tokens_seen": 14425130, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.7421875, "step": 680, "time_per_iteration": 2.571058511734009 }, { "auxiliary_loss_clip": 0.01234727, "auxiliary_loss_mlp": 0.01065196, "balance_loss_clip": 1.033391, "balance_loss_mlp": 1.05623806, "epoch": 0.04094393506688712, "flos": 27597673514880.0, "grad_norm": 1.713236131680358, "language_loss": 0.83481288, "learning_rate": 3.9987440660660685e-06, "loss": 0.85781211, "num_input_tokens_seen": 14447355, "router_z_loss_clip": 0.31835938, "router_z_loss_mlp": 1.78125, "step": 681, "time_per_iteration": 2.5226662158966064 }, { "auxiliary_loss_clip": 0.01234698, "auxiliary_loss_mlp": 0.01062757, "balance_loss_clip": 1.03342009, "balance_loss_mlp": 1.0579586, "epoch": 0.04100405831955509, "flos": 23112036109440.0, "grad_norm": 1.9995971000099868, "language_loss": 0.71572793, "learning_rate": 3.998730228142726e-06, "loss": 0.73870242, "num_input_tokens_seen": 14466790, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.765625, "step": 682, "time_per_iteration": 4.059685707092285 }, { "auxiliary_loss_clip": 0.01231374, "auxiliary_loss_mlp": 0.01070829, "balance_loss_clip": 1.04233825, "balance_loss_mlp": 1.05663598, "epoch": 0.04106418157222306, "flos": 20156731695360.0, "grad_norm": 1.699268597874826, "language_loss": 0.72486347, "learning_rate": 3.998716314427333e-06, "loss": 0.74788547, "num_input_tokens_seen": 14485195, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.75, "step": 683, "time_per_iteration": 5.42483115196228 }, { "auxiliary_loss_clip": 0.01232851, "auxiliary_loss_mlp": 0.01076243, "balance_loss_clip": 1.04748917, "balance_loss_mlp": 1.06356263, "epoch": 0.041124304824891024, "flos": 17420697855360.0, "grad_norm": 2.3688176984745386, "language_loss": 0.81589711, "learning_rate": 3.998702324920417e-06, "loss": 0.83898807, "num_input_tokens_seen": 14503370, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.6875, "step": 684, "time_per_iteration": 2.4594876766204834 }, { "auxiliary_loss_clip": 0.01231276, "auxiliary_loss_mlp": 0.01065338, "balance_loss_clip": 1.03522635, "balance_loss_mlp": 1.05916476, "epoch": 0.041184428077558996, "flos": 25780163287680.0, "grad_norm": 1.4834955654617772, "language_loss": 0.90717971, "learning_rate": 3.9986882596225085e-06, "loss": 0.93014586, "num_input_tokens_seen": 14526415, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.71875, "step": 685, "time_per_iteration": 2.568763256072998 }, { "auxiliary_loss_clip": 0.01235781, "auxiliary_loss_mlp": 0.01067253, "balance_loss_clip": 1.0381186, "balance_loss_mlp": 1.06042063, "epoch": 0.04124455133022697, "flos": 22964766347520.0, "grad_norm": 1.9942383242377357, "language_loss": 0.87913787, "learning_rate": 3.998674118534141e-06, "loss": 0.90216821, "num_input_tokens_seen": 14546595, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.7578125, "step": 686, "time_per_iteration": 2.5170514583587646 }, { "auxiliary_loss_clip": 0.012403, "auxiliary_loss_mlp": 0.01070423, "balance_loss_clip": 1.04131222, "balance_loss_mlp": 1.06065309, "epoch": 0.04130467458289493, "flos": 21289067015040.0, "grad_norm": 2.213571381664148, "language_loss": 0.71640766, "learning_rate": 3.998659901655851e-06, "loss": 0.73951483, "num_input_tokens_seen": 14566590, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.796875, "step": 687, "time_per_iteration": 2.4792561531066895 }, { "auxiliary_loss_clip": 0.01231261, "auxiliary_loss_mlp": 0.01066519, "balance_loss_clip": 1.03933883, "balance_loss_mlp": 1.06243289, "epoch": 0.041364797835562905, "flos": 19974233669760.0, "grad_norm": 1.6675709502228913, "language_loss": 0.85910225, "learning_rate": 3.998645608988177e-06, "loss": 0.88208008, "num_input_tokens_seen": 14585965, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.6875, "step": 688, "time_per_iteration": 2.4698991775512695 }, { "auxiliary_loss_clip": 0.01230982, "auxiliary_loss_mlp": 0.0107415, "balance_loss_clip": 1.04648161, "balance_loss_mlp": 1.06055212, "epoch": 0.04142492108823087, "flos": 21906227520000.0, "grad_norm": 2.0702817177489656, "language_loss": 0.83325046, "learning_rate": 3.998631240531661e-06, "loss": 0.85630178, "num_input_tokens_seen": 14606015, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.703125, "step": 689, "time_per_iteration": 2.535933017730713 }, { "auxiliary_loss_clip": 0.01228742, "auxiliary_loss_mlp": 0.01068175, "balance_loss_clip": 1.0402081, "balance_loss_mlp": 1.05647707, "epoch": 0.04148504434089884, "flos": 27639617621760.0, "grad_norm": 2.1359447430583494, "language_loss": 0.68234974, "learning_rate": 3.998616796286848e-06, "loss": 0.70531893, "num_input_tokens_seen": 14629955, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.71875, "step": 690, "time_per_iteration": 2.602114677429199 }, { "auxiliary_loss_clip": 0.01228405, "auxiliary_loss_mlp": 0.01072741, "balance_loss_clip": 1.04430985, "balance_loss_mlp": 1.05830896, "epoch": 0.041545167593566815, "flos": 20518387781760.0, "grad_norm": 1.7356650649657464, "language_loss": 0.75067884, "learning_rate": 3.998602276254286e-06, "loss": 0.77369028, "num_input_tokens_seen": 14648000, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.703125, "step": 691, "time_per_iteration": 2.4518299102783203 }, { "auxiliary_loss_clip": 0.01229135, "auxiliary_loss_mlp": 0.01075363, "balance_loss_clip": 1.04633522, "balance_loss_mlp": 1.05946755, "epoch": 0.04160529084623478, "flos": 11868907939200.0, "grad_norm": 2.4316418166953917, "language_loss": 0.84380805, "learning_rate": 3.998587680434526e-06, "loss": 0.86685312, "num_input_tokens_seen": 14662235, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.6953125, "step": 692, "time_per_iteration": 2.465664863586426 }, { "auxiliary_loss_clip": 0.01233786, "auxiliary_loss_mlp": 0.01067315, "balance_loss_clip": 1.03734624, "balance_loss_mlp": 1.05757082, "epoch": 0.04166541409890275, "flos": 14828306503680.0, "grad_norm": 2.418314808680366, "language_loss": 0.88815594, "learning_rate": 3.99857300882812e-06, "loss": 0.91116697, "num_input_tokens_seen": 14676065, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.765625, "step": 693, "time_per_iteration": 2.4377667903900146 }, { "auxiliary_loss_clip": 0.01236517, "auxiliary_loss_mlp": 0.01059001, "balance_loss_clip": 1.03113008, "balance_loss_mlp": 1.0633657, "epoch": 0.04172553735157072, "flos": 25808137004160.0, "grad_norm": 2.3651445085777865, "language_loss": 0.81850529, "learning_rate": 3.998558261435626e-06, "loss": 0.84146047, "num_input_tokens_seen": 14694955, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.734375, "step": 694, "time_per_iteration": 2.611248016357422 }, { "auxiliary_loss_clip": 0.01235587, "auxiliary_loss_mlp": 0.0106507, "balance_loss_clip": 1.03619742, "balance_loss_mlp": 1.05845356, "epoch": 0.04178566060423869, "flos": 24279815174400.0, "grad_norm": 2.2157764850472224, "language_loss": 0.83665049, "learning_rate": 3.9985434382576015e-06, "loss": 0.85965705, "num_input_tokens_seen": 14715510, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.7734375, "step": 695, "time_per_iteration": 2.5110278129577637 }, { "auxiliary_loss_clip": 0.01232125, "auxiliary_loss_mlp": 0.01070958, "balance_loss_clip": 1.04144204, "balance_loss_mlp": 1.05991161, "epoch": 0.04184578385690666, "flos": 18222008411520.0, "grad_norm": 2.2485582313925625, "language_loss": 0.84445673, "learning_rate": 3.99852853929461e-06, "loss": 0.86748755, "num_input_tokens_seen": 14731755, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.71875, "step": 696, "time_per_iteration": 2.462726354598999 }, { "auxiliary_loss_clip": 0.0123161, "auxiliary_loss_mlp": 0.01070778, "balance_loss_clip": 1.04128563, "balance_loss_mlp": 1.05940747, "epoch": 0.041905907109574626, "flos": 22776342577920.0, "grad_norm": 2.2039946215700157, "language_loss": 0.92901206, "learning_rate": 3.998513564547216e-06, "loss": 0.9520359, "num_input_tokens_seen": 14750810, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.71875, "step": 697, "time_per_iteration": 2.486125946044922 }, { "auxiliary_loss_clip": 0.01229312, "auxiliary_loss_mlp": 0.01061984, "balance_loss_clip": 1.03414845, "balance_loss_mlp": 1.06003785, "epoch": 0.0419660303622426, "flos": 20156947176960.0, "grad_norm": 2.1729359405489173, "language_loss": 0.83864188, "learning_rate": 3.998498514015987e-06, "loss": 0.86155486, "num_input_tokens_seen": 14768435, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.6953125, "step": 698, "time_per_iteration": 2.4906487464904785 }, { "auxiliary_loss_clip": 0.01231872, "auxiliary_loss_mlp": 0.01083387, "balance_loss_clip": 1.05236852, "balance_loss_mlp": 1.05847609, "epoch": 0.042026153614910564, "flos": 23076376882560.0, "grad_norm": 1.9391668718734687, "language_loss": 0.91226345, "learning_rate": 3.998483387701495e-06, "loss": 0.93541598, "num_input_tokens_seen": 14786690, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 1.734375, "step": 699, "time_per_iteration": 2.483621120452881 }, { "auxiliary_loss_clip": 0.01125831, "auxiliary_loss_mlp": 0.01045191, "balance_loss_clip": 1.03884935, "balance_loss_mlp": 1.03802633, "epoch": 0.042086276867578536, "flos": 64495243370880.0, "grad_norm": 0.905624306687762, "language_loss": 0.67936003, "learning_rate": 3.998468185604312e-06, "loss": 0.70107025, "num_input_tokens_seen": 14853840, "router_z_loss_clip": 0.06347656, "router_z_loss_mlp": 0.875, "step": 700, "time_per_iteration": 3.1560704708099365 }, { "auxiliary_loss_clip": 0.01237985, "auxiliary_loss_mlp": 0.01076253, "balance_loss_clip": 1.04528236, "balance_loss_mlp": 1.06261098, "epoch": 0.04214640012024651, "flos": 15487016065920.0, "grad_norm": 2.4597559808880685, "language_loss": 0.88736033, "learning_rate": 3.998452907725016e-06, "loss": 0.91050273, "num_input_tokens_seen": 14869580, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.7578125, "step": 701, "time_per_iteration": 2.4762141704559326 }, { "auxiliary_loss_clip": 0.01235464, "auxiliary_loss_mlp": 0.01072601, "balance_loss_clip": 1.04265606, "balance_loss_mlp": 1.06473672, "epoch": 0.04220652337291447, "flos": 23877040993920.0, "grad_norm": 1.682440669046644, "language_loss": 0.67140806, "learning_rate": 3.998437554064184e-06, "loss": 0.69448876, "num_input_tokens_seen": 14891065, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.703125, "step": 702, "time_per_iteration": 2.558002471923828 }, { "auxiliary_loss_clip": 0.01111962, "auxiliary_loss_mlp": 0.01008432, "balance_loss_clip": 1.00285268, "balance_loss_mlp": 1.0276444, "epoch": 0.042266646625582445, "flos": 63795451628160.0, "grad_norm": 0.8413961050964159, "language_loss": 0.60825205, "learning_rate": 3.9984221246224006e-06, "loss": 0.62945598, "num_input_tokens_seen": 14954815, "router_z_loss_clip": 0.0559082, "router_z_loss_mlp": 0.84375, "step": 703, "time_per_iteration": 3.163212537765503 }, { "auxiliary_loss_clip": 0.01109025, "auxiliary_loss_mlp": 0.01010451, "balance_loss_clip": 1.00506246, "balance_loss_mlp": 1.02492738, "epoch": 0.04232676987825041, "flos": 50018863345920.0, "grad_norm": 1.0199084274712533, "language_loss": 0.57766271, "learning_rate": 3.9984066194002494e-06, "loss": 0.5988574, "num_input_tokens_seen": 15003050, "router_z_loss_clip": 0.05395508, "router_z_loss_mlp": 0.84375, "step": 704, "time_per_iteration": 2.996978521347046 }, { "auxiliary_loss_clip": 0.01234256, "auxiliary_loss_mlp": 0.01068991, "balance_loss_clip": 1.04013038, "balance_loss_mlp": 1.06129408, "epoch": 0.04238689313091838, "flos": 21616105368960.0, "grad_norm": 2.3457048369679634, "language_loss": 0.87380004, "learning_rate": 3.998391038398319e-06, "loss": 0.89683247, "num_input_tokens_seen": 15021990, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.734375, "step": 705, "time_per_iteration": 2.5000107288360596 }, { "auxiliary_loss_clip": 0.01220907, "auxiliary_loss_mlp": 0.01067836, "balance_loss_clip": 1.0410738, "balance_loss_mlp": 1.05511618, "epoch": 0.042447016383586354, "flos": 19135109070720.0, "grad_norm": 1.7986481517595572, "language_loss": 0.71469194, "learning_rate": 3.998375381617201e-06, "loss": 0.73757935, "num_input_tokens_seen": 15040700, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.65625, "step": 706, "time_per_iteration": 2.533337354660034 }, { "auxiliary_loss_clip": 0.0122723, "auxiliary_loss_mlp": 0.01065704, "balance_loss_clip": 1.03577042, "balance_loss_mlp": 1.05729461, "epoch": 0.04250713963625432, "flos": 24426007528320.0, "grad_norm": 3.308239422504588, "language_loss": 0.93755156, "learning_rate": 3.9983596490574875e-06, "loss": 0.96048093, "num_input_tokens_seen": 15056725, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.703125, "step": 707, "time_per_iteration": 2.5230109691619873 }, { "auxiliary_loss_clip": 0.01231628, "auxiliary_loss_mlp": 0.01065034, "balance_loss_clip": 1.03539824, "balance_loss_mlp": 1.05555439, "epoch": 0.04256726288892229, "flos": 30367391333760.0, "grad_norm": 1.8095598027188295, "language_loss": 0.8157199, "learning_rate": 3.998343840719776e-06, "loss": 0.83868653, "num_input_tokens_seen": 15077550, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.765625, "step": 708, "time_per_iteration": 2.5965726375579834 }, { "auxiliary_loss_clip": 0.01235792, "auxiliary_loss_mlp": 0.01075906, "balance_loss_clip": 1.0459125, "balance_loss_mlp": 1.06062317, "epoch": 0.04262738614159026, "flos": 16362661818240.0, "grad_norm": 2.5336195078599633, "language_loss": 0.82098925, "learning_rate": 3.998327956604666e-06, "loss": 0.8441062, "num_input_tokens_seen": 15094955, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.75, "step": 709, "time_per_iteration": 2.4647531509399414 }, { "auxiliary_loss_clip": 0.01238894, "auxiliary_loss_mlp": 0.01063691, "balance_loss_clip": 1.03462768, "balance_loss_mlp": 1.06232905, "epoch": 0.04268750939425823, "flos": 20412379768320.0, "grad_norm": 4.041234520376237, "language_loss": 0.8537336, "learning_rate": 3.99831199671276e-06, "loss": 0.87675941, "num_input_tokens_seen": 15113395, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.765625, "step": 710, "time_per_iteration": 2.4812979698181152 }, { "auxiliary_loss_clip": 0.01236362, "auxiliary_loss_mlp": 0.01067123, "balance_loss_clip": 1.03825045, "balance_loss_mlp": 1.06283069, "epoch": 0.0427476326469262, "flos": 20302959962880.0, "grad_norm": 2.5241021593532382, "language_loss": 0.84592414, "learning_rate": 3.998295961044662e-06, "loss": 0.86895901, "num_input_tokens_seen": 15132920, "router_z_loss_clip": 0.2890625, "router_z_loss_mlp": 1.734375, "step": 711, "time_per_iteration": 2.4583516120910645 }, { "auxiliary_loss_clip": 0.01229439, "auxiliary_loss_mlp": 0.01066836, "balance_loss_clip": 1.03736734, "balance_loss_mlp": 1.05785358, "epoch": 0.042807755899594166, "flos": 21650794928640.0, "grad_norm": 1.6828912716632158, "language_loss": 0.85343528, "learning_rate": 3.9982798496009804e-06, "loss": 0.87639809, "num_input_tokens_seen": 15153115, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.71875, "step": 712, "time_per_iteration": 2.502645254135132 }, { "auxiliary_loss_clip": 0.01236852, "auxiliary_loss_mlp": 0.01067238, "balance_loss_clip": 1.03943896, "balance_loss_mlp": 1.05850363, "epoch": 0.04286787915226214, "flos": 21435007973760.0, "grad_norm": 3.2754462072804165, "language_loss": 0.9108851, "learning_rate": 3.998263662382328e-06, "loss": 0.93392605, "num_input_tokens_seen": 15172770, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.78125, "step": 713, "time_per_iteration": 2.4993438720703125 }, { "auxiliary_loss_clip": 0.01103291, "auxiliary_loss_mlp": 0.01041326, "balance_loss_clip": 1.03598583, "balance_loss_mlp": 1.02188993, "epoch": 0.04292800240493011, "flos": 66397970615040.0, "grad_norm": 0.8776597664564715, "language_loss": 0.63778579, "learning_rate": 3.9982473993893165e-06, "loss": 0.65923202, "num_input_tokens_seen": 15240055, "router_z_loss_clip": 0.0534668, "router_z_loss_mlp": 0.8125, "step": 714, "time_per_iteration": 3.201490640640259 }, { "auxiliary_loss_clip": 0.01229194, "auxiliary_loss_mlp": 0.01073665, "balance_loss_clip": 1.04523396, "balance_loss_mlp": 1.06078923, "epoch": 0.042988125657598075, "flos": 31650264552960.0, "grad_norm": 1.8218136295915337, "language_loss": 0.75194609, "learning_rate": 3.998231060622563e-06, "loss": 0.7749747, "num_input_tokens_seen": 15261585, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.6875, "step": 715, "time_per_iteration": 2.585618495941162 }, { "auxiliary_loss_clip": 0.01234913, "auxiliary_loss_mlp": 0.01067946, "balance_loss_clip": 1.03753555, "balance_loss_mlp": 1.063241, "epoch": 0.04304824891026605, "flos": 33248468292480.0, "grad_norm": 2.1525301576217766, "language_loss": 0.72387272, "learning_rate": 3.998214646082688e-06, "loss": 0.74690127, "num_input_tokens_seen": 15281160, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.71875, "step": 716, "time_per_iteration": 2.5704329013824463 }, { "auxiliary_loss_clip": 0.0110116, "auxiliary_loss_mlp": 0.01013884, "balance_loss_clip": 1.00871015, "balance_loss_mlp": 1.02131867, "epoch": 0.04310837216293401, "flos": 64064782782720.0, "grad_norm": 2.6791447134535247, "language_loss": 0.65610909, "learning_rate": 3.998198155770314e-06, "loss": 0.6772595, "num_input_tokens_seen": 15344505, "router_z_loss_clip": 0.05175781, "router_z_loss_mlp": 0.796875, "step": 717, "time_per_iteration": 3.1990246772766113 }, { "auxiliary_loss_clip": 0.01102356, "auxiliary_loss_mlp": 0.01007679, "balance_loss_clip": 1.00238609, "balance_loss_mlp": 1.02240038, "epoch": 0.043168495415601985, "flos": 61343757849600.0, "grad_norm": 0.9890424773370202, "language_loss": 0.58802521, "learning_rate": 3.998181589686065e-06, "loss": 0.60912555, "num_input_tokens_seen": 15404050, "router_z_loss_clip": 0.05297852, "router_z_loss_mlp": 0.796875, "step": 718, "time_per_iteration": 2.9342174530029297 }, { "auxiliary_loss_clip": 0.0123225, "auxiliary_loss_mlp": 0.01070616, "balance_loss_clip": 1.03969359, "balance_loss_mlp": 1.06319451, "epoch": 0.04322861866826996, "flos": 20704261685760.0, "grad_norm": 3.2269446479790656, "language_loss": 0.91375333, "learning_rate": 3.99816494783057e-06, "loss": 0.936782, "num_input_tokens_seen": 15424190, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.6875, "step": 719, "time_per_iteration": 2.5018410682678223 }, { "auxiliary_loss_clip": 0.0122954, "auxiliary_loss_mlp": 0.01069087, "balance_loss_clip": 1.04054832, "balance_loss_mlp": 1.05783999, "epoch": 0.04328874192093792, "flos": 30373352991360.0, "grad_norm": 1.8618775255477404, "language_loss": 0.66548479, "learning_rate": 3.99814823020446e-06, "loss": 0.68847108, "num_input_tokens_seen": 15446500, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.7109375, "step": 720, "time_per_iteration": 2.5665676593780518 }, { "auxiliary_loss_clip": 0.01230117, "auxiliary_loss_mlp": 0.01071645, "balance_loss_clip": 1.04221249, "balance_loss_mlp": 1.06008458, "epoch": 0.043348865173605894, "flos": 21944795748480.0, "grad_norm": 1.8660933193978844, "language_loss": 0.77698499, "learning_rate": 3.9981314368083684e-06, "loss": 0.80000263, "num_input_tokens_seen": 15465830, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.703125, "step": 721, "time_per_iteration": 2.5513393878936768 }, { "auxiliary_loss_clip": 0.01231992, "auxiliary_loss_mlp": 0.01083677, "balance_loss_clip": 1.05582941, "balance_loss_mlp": 1.06130719, "epoch": 0.04340898842627386, "flos": 15264225959040.0, "grad_norm": 2.7843284500503014, "language_loss": 0.87896478, "learning_rate": 3.998114567642933e-06, "loss": 0.90212148, "num_input_tokens_seen": 15479985, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.703125, "step": 722, "time_per_iteration": 2.446063756942749 }, { "auxiliary_loss_clip": 0.01238157, "auxiliary_loss_mlp": 0.01070205, "balance_loss_clip": 1.04207182, "balance_loss_mlp": 1.06360781, "epoch": 0.04346911167894183, "flos": 27965434913280.0, "grad_norm": 2.6065620519444312, "language_loss": 0.84350717, "learning_rate": 3.998097622708792e-06, "loss": 0.8665908, "num_input_tokens_seen": 15501545, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.75, "step": 723, "time_per_iteration": 2.550096035003662 }, { "auxiliary_loss_clip": 0.01239268, "auxiliary_loss_mlp": 0.01071433, "balance_loss_clip": 1.04163074, "balance_loss_mlp": 1.06526256, "epoch": 0.0435292349316098, "flos": 29242202820480.0, "grad_norm": 2.133554550901691, "language_loss": 0.82811165, "learning_rate": 3.99808060200659e-06, "loss": 0.8512187, "num_input_tokens_seen": 15521725, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.734375, "step": 724, "time_per_iteration": 4.020902872085571 }, { "auxiliary_loss_clip": 0.01234415, "auxiliary_loss_mlp": 0.01080483, "balance_loss_clip": 1.05062103, "balance_loss_mlp": 1.06314492, "epoch": 0.04358935818427777, "flos": 20558356640640.0, "grad_norm": 1.9688064768636726, "language_loss": 0.79794383, "learning_rate": 3.998063505536971e-06, "loss": 0.82109284, "num_input_tokens_seen": 15540910, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.7109375, "step": 725, "time_per_iteration": 3.909710645675659 }, { "auxiliary_loss_clip": 0.0124348, "auxiliary_loss_mlp": 0.01069469, "balance_loss_clip": 1.03926182, "balance_loss_mlp": 1.06380939, "epoch": 0.04364948143694574, "flos": 14464926564480.0, "grad_norm": 2.973003656049809, "language_loss": 0.8720206, "learning_rate": 3.998046333300584e-06, "loss": 0.89515007, "num_input_tokens_seen": 15558640, "router_z_loss_clip": 0.30078125, "router_z_loss_mlp": 1.796875, "step": 726, "time_per_iteration": 2.480562925338745 }, { "auxiliary_loss_clip": 0.0110213, "auxiliary_loss_mlp": 0.01015556, "balance_loss_clip": 1.01059651, "balance_loss_mlp": 1.02438498, "epoch": 0.043709604689613706, "flos": 50067268922880.0, "grad_norm": 0.9188561820390252, "language_loss": 0.559677, "learning_rate": 3.998029085298079e-06, "loss": 0.58085382, "num_input_tokens_seen": 15612975, "router_z_loss_clip": 0.04956055, "router_z_loss_mlp": 0.77734375, "step": 727, "time_per_iteration": 3.2461915016174316 }, { "auxiliary_loss_clip": 0.01236614, "auxiliary_loss_mlp": 0.01073143, "balance_loss_clip": 1.04338884, "balance_loss_mlp": 1.06268358, "epoch": 0.04376972794228168, "flos": 13991588115840.0, "grad_norm": 2.100082621050904, "language_loss": 0.82192922, "learning_rate": 3.998011761530112e-06, "loss": 0.84502673, "num_input_tokens_seen": 15631070, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.734375, "step": 728, "time_per_iteration": 2.5246074199676514 }, { "auxiliary_loss_clip": 0.01229661, "auxiliary_loss_mlp": 0.01064901, "balance_loss_clip": 1.03695893, "balance_loss_mlp": 1.06136882, "epoch": 0.04382985119494965, "flos": 22009901149440.0, "grad_norm": 3.4305660895122827, "language_loss": 0.77154279, "learning_rate": 3.997994361997338e-06, "loss": 0.79448849, "num_input_tokens_seen": 15647825, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.6875, "step": 729, "time_per_iteration": 2.5130207538604736 }, { "auxiliary_loss_clip": 0.01236291, "auxiliary_loss_mlp": 0.01069034, "balance_loss_clip": 1.04061437, "balance_loss_mlp": 1.06155169, "epoch": 0.043889974447617615, "flos": 24206521472640.0, "grad_norm": 2.834445769847079, "language_loss": 0.95229381, "learning_rate": 3.997976886700417e-06, "loss": 0.97534704, "num_input_tokens_seen": 15668260, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.75, "step": 730, "time_per_iteration": 2.511198043823242 }, { "auxiliary_loss_clip": 0.01231394, "auxiliary_loss_mlp": 0.010645, "balance_loss_clip": 1.03376794, "balance_loss_mlp": 1.05921197, "epoch": 0.04395009770028559, "flos": 17274541415040.0, "grad_norm": 4.733252648777729, "language_loss": 0.88195682, "learning_rate": 3.997959335640013e-06, "loss": 0.90491569, "num_input_tokens_seen": 15685630, "router_z_loss_clip": 0.30859375, "router_z_loss_mlp": 1.71875, "step": 731, "time_per_iteration": 2.4987633228302 }, { "auxiliary_loss_clip": 0.01234996, "auxiliary_loss_mlp": 0.01065426, "balance_loss_clip": 1.03834128, "balance_loss_mlp": 1.06346989, "epoch": 0.04401022095295355, "flos": 12310286261760.0, "grad_norm": 2.975362688652488, "language_loss": 0.8896603, "learning_rate": 3.997941708816791e-06, "loss": 0.91266447, "num_input_tokens_seen": 15698645, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.71875, "step": 732, "time_per_iteration": 2.418785333633423 }, { "auxiliary_loss_clip": 0.01234525, "auxiliary_loss_mlp": 0.01072789, "balance_loss_clip": 1.04310584, "balance_loss_mlp": 1.06147921, "epoch": 0.044070344205621524, "flos": 20959658363520.0, "grad_norm": 2.4633390058240257, "language_loss": 0.85939348, "learning_rate": 3.997924006231419e-06, "loss": 0.88246667, "num_input_tokens_seen": 15716775, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.734375, "step": 733, "time_per_iteration": 2.5068118572235107 }, { "auxiliary_loss_clip": 0.01239596, "auxiliary_loss_mlp": 0.01080525, "balance_loss_clip": 1.04852915, "balance_loss_mlp": 1.06424212, "epoch": 0.044130467458289496, "flos": 13845288021120.0, "grad_norm": 3.067141884003415, "language_loss": 0.91250134, "learning_rate": 3.9979062278845685e-06, "loss": 0.93570244, "num_input_tokens_seen": 15733320, "router_z_loss_clip": 0.3203125, "router_z_loss_mlp": 1.75, "step": 734, "time_per_iteration": 2.4399847984313965 }, { "auxiliary_loss_clip": 0.01230221, "auxiliary_loss_mlp": 0.01063969, "balance_loss_clip": 1.03628874, "balance_loss_mlp": 1.06279051, "epoch": 0.04419059071095746, "flos": 28655063107200.0, "grad_norm": 2.140319973108993, "language_loss": 0.7793175, "learning_rate": 3.9978883737769125e-06, "loss": 0.80225939, "num_input_tokens_seen": 15752705, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.671875, "step": 735, "time_per_iteration": 2.554816722869873 }, { "auxiliary_loss_clip": 0.0122597, "auxiliary_loss_mlp": 0.01062834, "balance_loss_clip": 1.03490329, "balance_loss_mlp": 1.05776739, "epoch": 0.04425071396362543, "flos": 28183304856960.0, "grad_norm": 2.722236448752062, "language_loss": 0.88777357, "learning_rate": 3.9978704439091305e-06, "loss": 0.91066164, "num_input_tokens_seen": 15772800, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.6796875, "step": 736, "time_per_iteration": 2.5267717838287354 }, { "auxiliary_loss_clip": 0.01229111, "auxiliary_loss_mlp": 0.01071368, "balance_loss_clip": 1.04397321, "balance_loss_mlp": 1.06400907, "epoch": 0.0443108372162934, "flos": 23658452778240.0, "grad_norm": 1.987780491218451, "language_loss": 0.84302747, "learning_rate": 3.997852438281901e-06, "loss": 0.86603224, "num_input_tokens_seen": 15793665, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.6484375, "step": 737, "time_per_iteration": 2.512930393218994 }, { "auxiliary_loss_clip": 0.01234971, "auxiliary_loss_mlp": 0.01068273, "balance_loss_clip": 1.03711128, "balance_loss_mlp": 1.06302738, "epoch": 0.04437096046896137, "flos": 33979861025280.0, "grad_norm": 2.088212444492136, "language_loss": 0.85034573, "learning_rate": 3.997834356895906e-06, "loss": 0.87337822, "num_input_tokens_seen": 15813175, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.71875, "step": 738, "time_per_iteration": 2.5685296058654785 }, { "auxiliary_loss_clip": 0.01094418, "auxiliary_loss_mlp": 0.01020615, "balance_loss_clip": 1.01620471, "balance_loss_mlp": 1.01837945, "epoch": 0.04443108372162934, "flos": 67397506375680.0, "grad_norm": 1.000642559580194, "language_loss": 0.59204972, "learning_rate": 3.9978161997518324e-06, "loss": 0.61320007, "num_input_tokens_seen": 15872050, "router_z_loss_clip": 0.04418945, "router_z_loss_mlp": 0.76171875, "step": 739, "time_per_iteration": 3.0761353969573975 }, { "auxiliary_loss_clip": 0.01233894, "auxiliary_loss_mlp": 0.01063965, "balance_loss_clip": 1.03561699, "balance_loss_mlp": 1.0642736, "epoch": 0.04449120697429731, "flos": 29752672953600.0, "grad_norm": 2.339871933531581, "language_loss": 0.91635245, "learning_rate": 3.997797966850369e-06, "loss": 0.939331, "num_input_tokens_seen": 15891085, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.6953125, "step": 740, "time_per_iteration": 2.5444204807281494 }, { "auxiliary_loss_clip": 0.01237641, "auxiliary_loss_mlp": 0.01063371, "balance_loss_clip": 1.03591692, "balance_loss_mlp": 1.06662035, "epoch": 0.04455133022696528, "flos": 36502119072000.0, "grad_norm": 2.142534320053521, "language_loss": 0.71970433, "learning_rate": 3.997779658192205e-06, "loss": 0.74271446, "num_input_tokens_seen": 15914225, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.7109375, "step": 741, "time_per_iteration": 2.6295182704925537 }, { "auxiliary_loss_clip": 0.01228361, "auxiliary_loss_mlp": 0.0107417, "balance_loss_clip": 1.04656148, "balance_loss_mlp": 1.06108117, "epoch": 0.044611453479633245, "flos": 28803661672320.0, "grad_norm": 2.6194595181509683, "language_loss": 0.88712698, "learning_rate": 3.997761273778037e-06, "loss": 0.91015232, "num_input_tokens_seen": 15934540, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.671875, "step": 742, "time_per_iteration": 2.532167911529541 }, { "auxiliary_loss_clip": 0.01230504, "auxiliary_loss_mlp": 0.01058538, "balance_loss_clip": 1.02923608, "balance_loss_mlp": 1.06283414, "epoch": 0.04467157673230122, "flos": 20010970304640.0, "grad_norm": 2.1661995932505307, "language_loss": 0.84203947, "learning_rate": 3.997742813608561e-06, "loss": 0.86492991, "num_input_tokens_seen": 15952560, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.6796875, "step": 743, "time_per_iteration": 2.5094141960144043 }, { "auxiliary_loss_clip": 0.01235432, "auxiliary_loss_mlp": 0.0106431, "balance_loss_clip": 1.03639126, "balance_loss_mlp": 1.06326151, "epoch": 0.04473169998496919, "flos": 18004964480640.0, "grad_norm": 2.099902751653203, "language_loss": 0.80203128, "learning_rate": 3.997724277684479e-06, "loss": 0.82502872, "num_input_tokens_seen": 15970620, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.71875, "step": 744, "time_per_iteration": 2.4661717414855957 }, { "auxiliary_loss_clip": 0.01230086, "auxiliary_loss_mlp": 0.01065169, "balance_loss_clip": 1.03713131, "balance_loss_mlp": 1.06169033, "epoch": 0.044791823237637154, "flos": 20631722169600.0, "grad_norm": 3.468437534582151, "language_loss": 0.85115314, "learning_rate": 3.99770566600649e-06, "loss": 0.87410569, "num_input_tokens_seen": 15987325, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.6875, "step": 745, "time_per_iteration": 2.488011598587036 }, { "auxiliary_loss_clip": 0.0122764, "auxiliary_loss_mlp": 0.01059249, "balance_loss_clip": 1.03025734, "balance_loss_mlp": 1.05937338, "epoch": 0.04485194649030513, "flos": 31176171918720.0, "grad_norm": 1.8118279012734568, "language_loss": 0.69129443, "learning_rate": 3.997686978575302e-06, "loss": 0.7141633, "num_input_tokens_seen": 16008310, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.6875, "step": 746, "time_per_iteration": 2.5789718627929688 }, { "auxiliary_loss_clip": 0.0123697, "auxiliary_loss_mlp": 0.01075326, "balance_loss_clip": 1.04588151, "balance_loss_mlp": 1.06698334, "epoch": 0.04491206974297309, "flos": 26143291831680.0, "grad_norm": 2.7310818974157267, "language_loss": 0.68637538, "learning_rate": 3.997668215391625e-06, "loss": 0.70949829, "num_input_tokens_seen": 16029620, "router_z_loss_clip": 0.29492188, "router_z_loss_mlp": 1.703125, "step": 747, "time_per_iteration": 2.5597610473632812 }, { "auxiliary_loss_clip": 0.01234127, "auxiliary_loss_mlp": 0.0107395, "balance_loss_clip": 1.04415977, "balance_loss_mlp": 1.06419742, "epoch": 0.044972192995641064, "flos": 20667668705280.0, "grad_norm": 2.053000756295382, "language_loss": 0.66831207, "learning_rate": 3.997649376456168e-06, "loss": 0.69139278, "num_input_tokens_seen": 16049065, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.703125, "step": 748, "time_per_iteration": 2.454868793487549 }, { "auxiliary_loss_clip": 0.01237911, "auxiliary_loss_mlp": 0.01075428, "balance_loss_clip": 1.04618549, "balance_loss_mlp": 1.06852078, "epoch": 0.045032316248309036, "flos": 16106834177280.0, "grad_norm": 3.015033242759349, "language_loss": 0.7696498, "learning_rate": 3.997630461769647e-06, "loss": 0.79278314, "num_input_tokens_seen": 16066765, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.6953125, "step": 749, "time_per_iteration": 2.466132164001465 }, { "auxiliary_loss_clip": 0.01236286, "auxiliary_loss_mlp": 0.0107513, "balance_loss_clip": 1.04644847, "balance_loss_mlp": 1.06593204, "epoch": 0.045092439500977, "flos": 17858843953920.0, "grad_norm": 2.2532093866585843, "language_loss": 0.88799739, "learning_rate": 3.997611471332778e-06, "loss": 0.91111153, "num_input_tokens_seen": 16085980, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.703125, "step": 750, "time_per_iteration": 2.4274022579193115 }, { "auxiliary_loss_clip": 0.01230866, "auxiliary_loss_mlp": 0.01068083, "balance_loss_clip": 1.03685069, "balance_loss_mlp": 1.05953741, "epoch": 0.04515256275364497, "flos": 24462815990400.0, "grad_norm": 1.7919707301283034, "language_loss": 0.74486744, "learning_rate": 3.9975924051462825e-06, "loss": 0.7678569, "num_input_tokens_seen": 16106260, "router_z_loss_clip": 0.3125, "router_z_loss_mlp": 1.7109375, "step": 751, "time_per_iteration": 2.5556604862213135 }, { "auxiliary_loss_clip": 0.01228, "auxiliary_loss_mlp": 0.01068538, "balance_loss_clip": 1.040452, "balance_loss_mlp": 1.05936348, "epoch": 0.04521268600631294, "flos": 20916385453440.0, "grad_norm": 2.057733357960098, "language_loss": 0.69335282, "learning_rate": 3.997573263210883e-06, "loss": 0.71631825, "num_input_tokens_seen": 16123475, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.6875, "step": 752, "time_per_iteration": 2.4885904788970947 }, { "auxiliary_loss_clip": 0.01229361, "auxiliary_loss_mlp": 0.01054922, "balance_loss_clip": 1.02711082, "balance_loss_mlp": 1.06052375, "epoch": 0.04527280925898091, "flos": 13371374954880.0, "grad_norm": 2.794273813525743, "language_loss": 0.92575246, "learning_rate": 3.997554045527305e-06, "loss": 0.94859529, "num_input_tokens_seen": 16138335, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.6875, "step": 753, "time_per_iteration": 2.4479458332061768 }, { "auxiliary_loss_clip": 0.01235885, "auxiliary_loss_mlp": 0.01075614, "balance_loss_clip": 1.04724264, "balance_loss_mlp": 1.06464934, "epoch": 0.04533293251164888, "flos": 23254565276160.0, "grad_norm": 2.2430378617923887, "language_loss": 0.91378999, "learning_rate": 3.997534752096277e-06, "loss": 0.93690497, "num_input_tokens_seen": 16157110, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.7109375, "step": 754, "time_per_iteration": 2.4919497966766357 }, { "auxiliary_loss_clip": 0.01222977, "auxiliary_loss_mlp": 0.01071088, "balance_loss_clip": 1.04152429, "balance_loss_mlp": 1.05968666, "epoch": 0.04539305576431685, "flos": 12422004537600.0, "grad_norm": 2.0974585520186446, "language_loss": 0.78626585, "learning_rate": 3.997515382918531e-06, "loss": 0.80920649, "num_input_tokens_seen": 16174155, "router_z_loss_clip": 0.296875, "router_z_loss_mlp": 1.6328125, "step": 755, "time_per_iteration": 2.4546313285827637 }, { "auxiliary_loss_clip": 0.01233529, "auxiliary_loss_mlp": 0.01075611, "balance_loss_clip": 1.04729843, "balance_loss_mlp": 1.0630281, "epoch": 0.04545317901698482, "flos": 16070995382400.0, "grad_norm": 2.083993469737354, "language_loss": 0.78919709, "learning_rate": 3.9974959379948015e-06, "loss": 0.81228852, "num_input_tokens_seen": 16192240, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.703125, "step": 756, "time_per_iteration": 2.4255306720733643 }, { "auxiliary_loss_clip": 0.01098235, "auxiliary_loss_mlp": 0.01008731, "balance_loss_clip": 1.00389123, "balance_loss_mlp": 1.02263355, "epoch": 0.045513302269652785, "flos": 66396139021440.0, "grad_norm": 0.8071188007683795, "language_loss": 0.62762433, "learning_rate": 3.997476417325827e-06, "loss": 0.64869398, "num_input_tokens_seen": 16255775, "router_z_loss_clip": 0.04833984, "router_z_loss_mlp": 0.7578125, "step": 757, "time_per_iteration": 3.1675868034362793 }, { "auxiliary_loss_clip": 0.0122931, "auxiliary_loss_mlp": 0.01069719, "balance_loss_clip": 1.04243171, "balance_loss_mlp": 1.06219244, "epoch": 0.04557342552232076, "flos": 21471169991040.0, "grad_norm": 2.0750082532783107, "language_loss": 0.84246886, "learning_rate": 3.997456820912346e-06, "loss": 0.8654592, "num_input_tokens_seen": 16277015, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.671875, "step": 758, "time_per_iteration": 2.5169906616210938 }, { "auxiliary_loss_clip": 0.01223122, "auxiliary_loss_mlp": 0.01062126, "balance_loss_clip": 1.03503001, "balance_loss_mlp": 1.05715346, "epoch": 0.04563354877498873, "flos": 23732680233600.0, "grad_norm": 2.081944004857274, "language_loss": 0.88208044, "learning_rate": 3.997437148755101e-06, "loss": 0.90493286, "num_input_tokens_seen": 16296005, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.65625, "step": 759, "time_per_iteration": 2.5224692821502686 }, { "auxiliary_loss_clip": 0.01232544, "auxiliary_loss_mlp": 0.01068773, "balance_loss_clip": 1.03944707, "balance_loss_mlp": 1.06365204, "epoch": 0.045693672027656694, "flos": 25735741142400.0, "grad_norm": 1.8483939368597457, "language_loss": 0.73914254, "learning_rate": 3.9974174008548405e-06, "loss": 0.76215565, "num_input_tokens_seen": 16315300, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.6875, "step": 760, "time_per_iteration": 2.5166401863098145 }, { "auxiliary_loss_clip": 0.01233427, "auxiliary_loss_mlp": 0.0107227, "balance_loss_clip": 1.04532838, "balance_loss_mlp": 1.06633365, "epoch": 0.045753795280324666, "flos": 19719016560000.0, "grad_norm": 2.320973805358062, "language_loss": 0.82273901, "learning_rate": 3.9973975772123105e-06, "loss": 0.84579599, "num_input_tokens_seen": 16333820, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.671875, "step": 761, "time_per_iteration": 2.4980223178863525 }, { "auxiliary_loss_clip": 0.01224526, "auxiliary_loss_mlp": 0.01072379, "balance_loss_clip": 1.04441237, "balance_loss_mlp": 1.05938268, "epoch": 0.04581391853299264, "flos": 23255786338560.0, "grad_norm": 1.8143627124117092, "language_loss": 0.79913855, "learning_rate": 3.997377677828266e-06, "loss": 0.82210755, "num_input_tokens_seen": 16355290, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.6484375, "step": 762, "time_per_iteration": 2.498945474624634 }, { "auxiliary_loss_clip": 0.01097421, "auxiliary_loss_mlp": 0.01015519, "balance_loss_clip": 1.01103687, "balance_loss_mlp": 1.02348745, "epoch": 0.0458740417856606, "flos": 64231155601920.0, "grad_norm": 1.0124393412544088, "language_loss": 0.58789432, "learning_rate": 3.9973577027034585e-06, "loss": 0.60902375, "num_input_tokens_seen": 16415995, "router_z_loss_clip": 0.04492188, "router_z_loss_mlp": 0.7421875, "step": 763, "time_per_iteration": 3.1875100135803223 }, { "auxiliary_loss_clip": 0.01228957, "auxiliary_loss_mlp": 0.01075808, "balance_loss_clip": 1.04800832, "balance_loss_mlp": 1.0609839, "epoch": 0.045934165038328575, "flos": 20770121272320.0, "grad_norm": 2.1606825605427407, "language_loss": 0.87666142, "learning_rate": 3.9973376518386475e-06, "loss": 0.89970911, "num_input_tokens_seen": 16433120, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.6796875, "step": 764, "time_per_iteration": 2.497767210006714 }, { "auxiliary_loss_clip": 0.01231915, "auxiliary_loss_mlp": 0.01079162, "balance_loss_clip": 1.05132675, "balance_loss_mlp": 1.06436729, "epoch": 0.04599428829099654, "flos": 30262891691520.0, "grad_norm": 2.346949562408539, "language_loss": 0.85842949, "learning_rate": 3.997317525234592e-06, "loss": 0.8815403, "num_input_tokens_seen": 16453360, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.671875, "step": 765, "time_per_iteration": 4.046855449676514 }, { "auxiliary_loss_clip": 0.0123088, "auxiliary_loss_mlp": 0.01073593, "balance_loss_clip": 1.04319441, "balance_loss_mlp": 1.06133747, "epoch": 0.04605441154366451, "flos": 23038921975680.0, "grad_norm": 2.507834333528023, "language_loss": 0.88466156, "learning_rate": 3.997297322892056e-06, "loss": 0.90770626, "num_input_tokens_seen": 16471160, "router_z_loss_clip": 0.3046875, "router_z_loss_mlp": 1.6953125, "step": 766, "time_per_iteration": 5.3579771518707275 }, { "auxiliary_loss_clip": 0.01226137, "auxiliary_loss_mlp": 0.01068457, "balance_loss_clip": 1.04096723, "balance_loss_mlp": 1.05925035, "epoch": 0.046114534796332485, "flos": 22017407091840.0, "grad_norm": 3.4928065866855813, "language_loss": 0.84011352, "learning_rate": 3.997277044811806e-06, "loss": 0.86305946, "num_input_tokens_seen": 16488940, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.671875, "step": 767, "time_per_iteration": 2.48663067817688 }, { "auxiliary_loss_clip": 0.01228645, "auxiliary_loss_mlp": 0.01060345, "balance_loss_clip": 1.03200889, "balance_loss_mlp": 1.06294513, "epoch": 0.04617465804900045, "flos": 29862380067840.0, "grad_norm": 1.98058761930258, "language_loss": 0.86731255, "learning_rate": 3.99725669099461e-06, "loss": 0.89020252, "num_input_tokens_seen": 16509505, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.65625, "step": 768, "time_per_iteration": 2.5755348205566406 }, { "auxiliary_loss_clip": 0.0122479, "auxiliary_loss_mlp": 0.01066433, "balance_loss_clip": 1.03909826, "balance_loss_mlp": 1.05632615, "epoch": 0.04623478130166842, "flos": 25630056351360.0, "grad_norm": 12.47212217391572, "language_loss": 0.75294429, "learning_rate": 3.9972362614412395e-06, "loss": 0.77585649, "num_input_tokens_seen": 16528840, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.6875, "step": 769, "time_per_iteration": 2.560673713684082 }, { "auxiliary_loss_clip": 0.01221921, "auxiliary_loss_mlp": 0.01061848, "balance_loss_clip": 1.03566968, "balance_loss_mlp": 1.05849779, "epoch": 0.04629490455433639, "flos": 20449080489600.0, "grad_norm": 2.045366293021004, "language_loss": 0.86354959, "learning_rate": 3.997215756152471e-06, "loss": 0.88638723, "num_input_tokens_seen": 16548335, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.6328125, "step": 770, "time_per_iteration": 2.521881103515625 }, { "auxiliary_loss_clip": 0.01232606, "auxiliary_loss_mlp": 0.01064114, "balance_loss_clip": 1.03615892, "balance_loss_mlp": 1.06003523, "epoch": 0.04635502780700436, "flos": 23148736830720.0, "grad_norm": 1.9915810713667665, "language_loss": 0.86701167, "learning_rate": 3.99719517512908e-06, "loss": 0.88997889, "num_input_tokens_seen": 16567725, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.7265625, "step": 771, "time_per_iteration": 2.485168218612671 }, { "auxiliary_loss_clip": 0.01229071, "auxiliary_loss_mlp": 0.01072877, "balance_loss_clip": 1.04364657, "balance_loss_mlp": 1.05676055, "epoch": 0.04641515105967233, "flos": 23292020183040.0, "grad_norm": 2.467960125773576, "language_loss": 0.83733976, "learning_rate": 3.997174518371848e-06, "loss": 0.86035919, "num_input_tokens_seen": 16588175, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.71875, "step": 772, "time_per_iteration": 2.4794793128967285 }, { "auxiliary_loss_clip": 0.01225944, "auxiliary_loss_mlp": 0.01062804, "balance_loss_clip": 1.03629231, "balance_loss_mlp": 1.06026387, "epoch": 0.046475274312340296, "flos": 25115204759040.0, "grad_norm": 1.7744286698412572, "language_loss": 0.73689067, "learning_rate": 3.997153785881557e-06, "loss": 0.7597782, "num_input_tokens_seen": 16607735, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.65625, "step": 773, "time_per_iteration": 2.5028228759765625 }, { "auxiliary_loss_clip": 0.01225206, "auxiliary_loss_mlp": 0.01065776, "balance_loss_clip": 1.03734434, "balance_loss_mlp": 1.06204951, "epoch": 0.04653539756500827, "flos": 25264916645760.0, "grad_norm": 2.2304963758274807, "language_loss": 0.78532577, "learning_rate": 3.997132977658996e-06, "loss": 0.80823559, "num_input_tokens_seen": 16627225, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.6328125, "step": 774, "time_per_iteration": 2.510788917541504 }, { "auxiliary_loss_clip": 0.01222913, "auxiliary_loss_mlp": 0.01066989, "balance_loss_clip": 1.04091752, "balance_loss_mlp": 1.05965245, "epoch": 0.046595520817676234, "flos": 35404150089600.0, "grad_norm": 1.96090429379014, "language_loss": 0.73360586, "learning_rate": 3.997112093704952e-06, "loss": 0.75650489, "num_input_tokens_seen": 16647785, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.6328125, "step": 775, "time_per_iteration": 2.595896005630493 }, { "auxiliary_loss_clip": 0.01225035, "auxiliary_loss_mlp": 0.0105664, "balance_loss_clip": 1.02838731, "balance_loss_mlp": 1.05904114, "epoch": 0.046655644070344206, "flos": 18112516778880.0, "grad_norm": 1.63435042305249, "language_loss": 0.77419436, "learning_rate": 3.997091134020217e-06, "loss": 0.79701114, "num_input_tokens_seen": 16667555, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.65625, "step": 776, "time_per_iteration": 2.467724323272705 }, { "auxiliary_loss_clip": 0.0121954, "auxiliary_loss_mlp": 0.01064472, "balance_loss_clip": 1.03800797, "balance_loss_mlp": 1.0571053, "epoch": 0.04671576732301218, "flos": 29205286617600.0, "grad_norm": 17.345146157461407, "language_loss": 0.71351933, "learning_rate": 3.997070098605585e-06, "loss": 0.73635948, "num_input_tokens_seen": 16686875, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.625, "step": 777, "time_per_iteration": 2.56067156791687 }, { "auxiliary_loss_clip": 0.01225082, "auxiliary_loss_mlp": 0.01069253, "balance_loss_clip": 1.04069066, "balance_loss_mlp": 1.05975854, "epoch": 0.04677589057568014, "flos": 30478319510400.0, "grad_norm": 1.9163773417531171, "language_loss": 0.76878315, "learning_rate": 3.997048987461856e-06, "loss": 0.79172653, "num_input_tokens_seen": 16706420, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.65625, "step": 778, "time_per_iteration": 2.548060655593872 }, { "auxiliary_loss_clip": 0.01222182, "auxiliary_loss_mlp": 0.01065315, "balance_loss_clip": 1.03714538, "balance_loss_mlp": 1.05803514, "epoch": 0.046836013828348115, "flos": 20557674282240.0, "grad_norm": 2.1178218487688065, "language_loss": 0.79456347, "learning_rate": 3.997027800589829e-06, "loss": 0.81743842, "num_input_tokens_seen": 16726390, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.640625, "step": 779, "time_per_iteration": 2.5139973163604736 }, { "auxiliary_loss_clip": 0.01216134, "auxiliary_loss_mlp": 0.01065094, "balance_loss_clip": 1.03815317, "balance_loss_mlp": 1.05633783, "epoch": 0.04689613708101608, "flos": 25447378757760.0, "grad_norm": 1.8714973963969865, "language_loss": 0.77437639, "learning_rate": 3.997006537990308e-06, "loss": 0.7971887, "num_input_tokens_seen": 16748965, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.59375, "step": 780, "time_per_iteration": 2.5502278804779053 }, { "auxiliary_loss_clip": 0.01222025, "auxiliary_loss_mlp": 0.01070908, "balance_loss_clip": 1.0455879, "balance_loss_mlp": 1.06027162, "epoch": 0.04695626033368405, "flos": 23001395241600.0, "grad_norm": 1.7095448612053739, "language_loss": 0.76480192, "learning_rate": 3.996985199664099e-06, "loss": 0.78773129, "num_input_tokens_seen": 16768620, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.6171875, "step": 781, "time_per_iteration": 2.5528297424316406 }, { "auxiliary_loss_clip": 0.0123212, "auxiliary_loss_mlp": 0.01071899, "balance_loss_clip": 1.04278827, "balance_loss_mlp": 1.06259871, "epoch": 0.047016383586352024, "flos": 29133357632640.0, "grad_norm": 4.014845128939442, "language_loss": 0.7392174, "learning_rate": 3.99696378561201e-06, "loss": 0.76225758, "num_input_tokens_seen": 16789755, "router_z_loss_clip": 0.29101562, "router_z_loss_mlp": 1.6953125, "step": 782, "time_per_iteration": 2.531026601791382 }, { "auxiliary_loss_clip": 0.01226252, "auxiliary_loss_mlp": 0.01065993, "balance_loss_clip": 1.04049373, "balance_loss_mlp": 1.06246686, "epoch": 0.04707650683901999, "flos": 14976330451200.0, "grad_norm": 2.2139745227218692, "language_loss": 0.80395019, "learning_rate": 3.996942295834855e-06, "loss": 0.82687271, "num_input_tokens_seen": 16807585, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.640625, "step": 783, "time_per_iteration": 2.5594542026519775 }, { "auxiliary_loss_clip": 0.01217886, "auxiliary_loss_mlp": 0.0105519, "balance_loss_clip": 1.02989352, "balance_loss_mlp": 1.05945778, "epoch": 0.04713663009168796, "flos": 21651118151040.0, "grad_norm": 1.8624366282305702, "language_loss": 0.8181107, "learning_rate": 3.996920730333448e-06, "loss": 0.84084141, "num_input_tokens_seen": 16827220, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.5859375, "step": 784, "time_per_iteration": 2.494961977005005 }, { "auxiliary_loss_clip": 0.01226121, "auxiliary_loss_mlp": 0.01068665, "balance_loss_clip": 1.04241538, "balance_loss_mlp": 1.05887794, "epoch": 0.04719675334435593, "flos": 21325408600320.0, "grad_norm": 3.779442502495342, "language_loss": 0.80751491, "learning_rate": 3.996899089108607e-06, "loss": 0.83046275, "num_input_tokens_seen": 16846230, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.671875, "step": 785, "time_per_iteration": 2.4996423721313477 }, { "auxiliary_loss_clip": 0.01227655, "auxiliary_loss_mlp": 0.01063101, "balance_loss_clip": 1.03779244, "balance_loss_mlp": 1.06450105, "epoch": 0.0472568765970239, "flos": 17931383470080.0, "grad_norm": 1.8210993432142064, "language_loss": 0.89610827, "learning_rate": 3.996877372161152e-06, "loss": 0.91901588, "num_input_tokens_seen": 16865325, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.6328125, "step": 786, "time_per_iteration": 2.4561681747436523 }, { "auxiliary_loss_clip": 0.01226658, "auxiliary_loss_mlp": 0.0106448, "balance_loss_clip": 1.0351311, "balance_loss_mlp": 1.05544925, "epoch": 0.04731699984969187, "flos": 18077324428800.0, "grad_norm": 4.521722684245679, "language_loss": 0.76440537, "learning_rate": 3.9968555794919065e-06, "loss": 0.78731674, "num_input_tokens_seen": 16882930, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.7109375, "step": 787, "time_per_iteration": 2.4596948623657227 }, { "auxiliary_loss_clip": 0.01230593, "auxiliary_loss_mlp": 0.01065501, "balance_loss_clip": 1.03844047, "balance_loss_mlp": 1.06471133, "epoch": 0.047377123102359836, "flos": 23185078416000.0, "grad_norm": 2.496899492567907, "language_loss": 0.81230384, "learning_rate": 3.996833711101698e-06, "loss": 0.83526474, "num_input_tokens_seen": 16900710, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.65625, "step": 788, "time_per_iteration": 2.4694926738739014 }, { "auxiliary_loss_clip": 0.01222867, "auxiliary_loss_mlp": 0.01069713, "balance_loss_clip": 1.04167533, "balance_loss_mlp": 1.06122935, "epoch": 0.04743724635502781, "flos": 22747794243840.0, "grad_norm": 1.9109050616111563, "language_loss": 0.84652609, "learning_rate": 3.996811766991355e-06, "loss": 0.86945194, "num_input_tokens_seen": 16919210, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.6171875, "step": 789, "time_per_iteration": 2.4858720302581787 }, { "auxiliary_loss_clip": 0.01227758, "auxiliary_loss_mlp": 0.01067065, "balance_loss_clip": 1.04068398, "balance_loss_mlp": 1.0626967, "epoch": 0.04749736960769577, "flos": 17238702620160.0, "grad_norm": 12.168131087095757, "language_loss": 0.82243657, "learning_rate": 3.996789747161709e-06, "loss": 0.84538478, "num_input_tokens_seen": 16937125, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.6484375, "step": 790, "time_per_iteration": 2.4369828701019287 }, { "auxiliary_loss_clip": 0.01223156, "auxiliary_loss_mlp": 0.0106321, "balance_loss_clip": 1.03582788, "balance_loss_mlp": 1.05894208, "epoch": 0.047557492860363745, "flos": 40479261592320.0, "grad_norm": 2.345321210689448, "language_loss": 0.88043618, "learning_rate": 3.996767651613597e-06, "loss": 0.90329981, "num_input_tokens_seen": 16958610, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.640625, "step": 791, "time_per_iteration": 2.631448984146118 }, { "auxiliary_loss_clip": 0.01223728, "auxiliary_loss_mlp": 0.01062207, "balance_loss_clip": 1.03474104, "balance_loss_mlp": 1.06062126, "epoch": 0.04761761611303172, "flos": 18698004466560.0, "grad_norm": 2.4413411082767493, "language_loss": 0.90302801, "learning_rate": 3.996745480347854e-06, "loss": 0.92588735, "num_input_tokens_seen": 16977300, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.625, "step": 792, "time_per_iteration": 2.435323715209961 }, { "auxiliary_loss_clip": 0.01226601, "auxiliary_loss_mlp": 0.0107177, "balance_loss_clip": 1.04580593, "balance_loss_mlp": 1.06022584, "epoch": 0.04767773936569968, "flos": 20921987975040.0, "grad_norm": 2.1009529996641083, "language_loss": 0.73943317, "learning_rate": 3.996723233365324e-06, "loss": 0.7624169, "num_input_tokens_seen": 16994950, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.6640625, "step": 793, "time_per_iteration": 2.4511513710021973 }, { "auxiliary_loss_clip": 0.01231169, "auxiliary_loss_mlp": 0.01067491, "balance_loss_clip": 1.03929853, "balance_loss_mlp": 1.06307471, "epoch": 0.047737862618367655, "flos": 23732680233600.0, "grad_norm": 1.8515497306048143, "language_loss": 0.86227632, "learning_rate": 3.996700910666847e-06, "loss": 0.88526291, "num_input_tokens_seen": 17014760, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.6796875, "step": 794, "time_per_iteration": 2.4927399158477783 }, { "auxiliary_loss_clip": 0.01226027, "auxiliary_loss_mlp": 0.01069161, "balance_loss_clip": 1.04076564, "balance_loss_mlp": 1.05905366, "epoch": 0.04779798587103562, "flos": 23695764030720.0, "grad_norm": 3.1737268721048286, "language_loss": 0.70150113, "learning_rate": 3.996678512253272e-06, "loss": 0.72445303, "num_input_tokens_seen": 17032715, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.671875, "step": 795, "time_per_iteration": 2.4917244911193848 }, { "auxiliary_loss_clip": 0.01222549, "auxiliary_loss_mlp": 0.01072056, "balance_loss_clip": 1.04487658, "balance_loss_mlp": 1.05851984, "epoch": 0.04785810912370359, "flos": 23183641872000.0, "grad_norm": 1.9935112593331201, "language_loss": 0.806422, "learning_rate": 3.996656038125449e-06, "loss": 0.82936811, "num_input_tokens_seen": 17052215, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.640625, "step": 796, "time_per_iteration": 2.481081247329712 }, { "auxiliary_loss_clip": 0.01226201, "auxiliary_loss_mlp": 0.01061517, "balance_loss_clip": 1.03388429, "balance_loss_mlp": 1.06113005, "epoch": 0.047918232376371564, "flos": 18040623707520.0, "grad_norm": 2.4578794387279177, "language_loss": 0.81444615, "learning_rate": 3.996633488284228e-06, "loss": 0.83732337, "num_input_tokens_seen": 17069225, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.65625, "step": 797, "time_per_iteration": 2.4770383834838867 }, { "auxiliary_loss_clip": 0.01101668, "auxiliary_loss_mlp": 0.01008256, "balance_loss_clip": 1.00384474, "balance_loss_mlp": 1.02757752, "epoch": 0.04797835562903953, "flos": 62442588758400.0, "grad_norm": 0.9290158013559202, "language_loss": 0.64398146, "learning_rate": 3.996610862730465e-06, "loss": 0.66508067, "num_input_tokens_seen": 17126680, "router_z_loss_clip": 0.04418945, "router_z_loss_mlp": 0.7421875, "step": 798, "time_per_iteration": 3.057276725769043 }, { "auxiliary_loss_clip": 0.01228446, "auxiliary_loss_mlp": 0.01065934, "balance_loss_clip": 1.03945708, "balance_loss_mlp": 1.05704403, "epoch": 0.0480384788817075, "flos": 21507296094720.0, "grad_norm": 3.0150960624079546, "language_loss": 0.91179669, "learning_rate": 3.996588161465018e-06, "loss": 0.93474048, "num_input_tokens_seen": 17144835, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.7109375, "step": 799, "time_per_iteration": 2.466618061065674 }, { "auxiliary_loss_clip": 0.01228692, "auxiliary_loss_mlp": 0.01064307, "balance_loss_clip": 1.0360899, "balance_loss_mlp": 1.065907, "epoch": 0.048098602134375466, "flos": 21726710323200.0, "grad_norm": 2.4871519369379547, "language_loss": 0.8658632, "learning_rate": 3.996565384488748e-06, "loss": 0.88879323, "num_input_tokens_seen": 17165030, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.625, "step": 800, "time_per_iteration": 2.4991190433502197 }, { "auxiliary_loss_clip": 0.01227628, "auxiliary_loss_mlp": 0.0106736, "balance_loss_clip": 1.04146743, "balance_loss_mlp": 1.06039059, "epoch": 0.04815872538704344, "flos": 22931082368640.0, "grad_norm": 2.9786696319586055, "language_loss": 0.84204239, "learning_rate": 3.996542531802518e-06, "loss": 0.86499226, "num_input_tokens_seen": 17184895, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.671875, "step": 801, "time_per_iteration": 2.4909474849700928 }, { "auxiliary_loss_clip": 0.01227702, "auxiliary_loss_mlp": 0.01067114, "balance_loss_clip": 1.04089975, "balance_loss_mlp": 1.0615344, "epoch": 0.04821884863971141, "flos": 43174716042240.0, "grad_norm": 2.106409783983983, "language_loss": 0.79797292, "learning_rate": 3.996519603407196e-06, "loss": 0.82092112, "num_input_tokens_seen": 17208225, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.65625, "step": 802, "time_per_iteration": 2.6870436668395996 }, { "auxiliary_loss_clip": 0.01228134, "auxiliary_loss_mlp": 0.0106146, "balance_loss_clip": 1.0358777, "balance_loss_mlp": 1.06366611, "epoch": 0.048278971892379376, "flos": 18620006083200.0, "grad_norm": 1.882098315954798, "language_loss": 0.86625022, "learning_rate": 3.996496599303649e-06, "loss": 0.88914621, "num_input_tokens_seen": 17226305, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.640625, "step": 803, "time_per_iteration": 2.438622236251831 }, { "auxiliary_loss_clip": 0.01221143, "auxiliary_loss_mlp": 0.01059827, "balance_loss_clip": 1.03381574, "balance_loss_mlp": 1.06098628, "epoch": 0.04833909514504735, "flos": 20230061310720.0, "grad_norm": 2.539087443301006, "language_loss": 0.85336447, "learning_rate": 3.996473519492753e-06, "loss": 0.87617421, "num_input_tokens_seen": 17244545, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.6015625, "step": 804, "time_per_iteration": 2.4882497787475586 }, { "auxiliary_loss_clip": 0.012248, "auxiliary_loss_mlp": 0.01068026, "balance_loss_clip": 1.04222882, "balance_loss_mlp": 1.06079495, "epoch": 0.04839921839771532, "flos": 24645170361600.0, "grad_norm": 2.7819666983679063, "language_loss": 0.86389124, "learning_rate": 3.99645036397538e-06, "loss": 0.88681948, "num_input_tokens_seen": 17265730, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.640625, "step": 805, "time_per_iteration": 2.495243787765503 }, { "auxiliary_loss_clip": 0.01218527, "auxiliary_loss_mlp": 0.01071995, "balance_loss_clip": 1.0467819, "balance_loss_mlp": 1.05729544, "epoch": 0.048459341650383285, "flos": 24827452905600.0, "grad_norm": 2.1144500031911098, "language_loss": 0.67987919, "learning_rate": 3.9964271327524085e-06, "loss": 0.70278436, "num_input_tokens_seen": 17284820, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.609375, "step": 806, "time_per_iteration": 2.507822036743164 }, { "auxiliary_loss_clip": 0.01219583, "auxiliary_loss_mlp": 0.01057745, "balance_loss_clip": 1.03161454, "balance_loss_mlp": 1.0591476, "epoch": 0.04851946490305126, "flos": 22163204396160.0, "grad_norm": 2.176533410276917, "language_loss": 0.76324147, "learning_rate": 3.9964038258247214e-06, "loss": 0.7860148, "num_input_tokens_seen": 17305085, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.609375, "step": 807, "time_per_iteration": 3.9514153003692627 }, { "auxiliary_loss_clip": 0.01216161, "auxiliary_loss_mlp": 0.0106781, "balance_loss_clip": 1.04226375, "balance_loss_mlp": 1.05519199, "epoch": 0.04857958815571922, "flos": 19792022952960.0, "grad_norm": 4.651902802013241, "language_loss": 0.86673224, "learning_rate": 3.9963804431932005e-06, "loss": 0.88957191, "num_input_tokens_seen": 17322715, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.609375, "step": 808, "time_per_iteration": 5.384095191955566 }, { "auxiliary_loss_clip": 0.01227921, "auxiliary_loss_mlp": 0.01064475, "balance_loss_clip": 1.03829682, "balance_loss_mlp": 1.06165147, "epoch": 0.048639711408387194, "flos": 18697968552960.0, "grad_norm": 1.8104162161272905, "language_loss": 0.89861864, "learning_rate": 3.996356984858732e-06, "loss": 0.92154264, "num_input_tokens_seen": 17341455, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.6640625, "step": 809, "time_per_iteration": 2.4869956970214844 }, { "auxiliary_loss_clip": 0.01223944, "auxiliary_loss_mlp": 0.01074038, "balance_loss_clip": 1.04816973, "balance_loss_mlp": 1.06262541, "epoch": 0.048699834661055166, "flos": 24863507182080.0, "grad_norm": 1.928614591326603, "language_loss": 0.84927106, "learning_rate": 3.996333450822208e-06, "loss": 0.87225091, "num_input_tokens_seen": 17360765, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.6171875, "step": 810, "time_per_iteration": 2.483480215072632 }, { "auxiliary_loss_clip": 0.01224914, "auxiliary_loss_mlp": 0.01066736, "balance_loss_clip": 1.04066455, "balance_loss_mlp": 1.05934072, "epoch": 0.04875995791372313, "flos": 20704010290560.0, "grad_norm": 2.215956473154052, "language_loss": 0.81024605, "learning_rate": 3.99630984108452e-06, "loss": 0.83316255, "num_input_tokens_seen": 17380625, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.65625, "step": 811, "time_per_iteration": 2.4782896041870117 }, { "auxiliary_loss_clip": 0.01218299, "auxiliary_loss_mlp": 0.01069059, "balance_loss_clip": 1.04410839, "balance_loss_mlp": 1.05814481, "epoch": 0.048820081166391104, "flos": 18588297352320.0, "grad_norm": 1.7287291279083157, "language_loss": 0.74605644, "learning_rate": 3.9962861556465615e-06, "loss": 0.76893008, "num_input_tokens_seen": 17399355, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.6015625, "step": 812, "time_per_iteration": 2.4458916187286377 }, { "auxiliary_loss_clip": 0.0122025, "auxiliary_loss_mlp": 0.01076525, "balance_loss_clip": 1.05165839, "balance_loss_mlp": 1.06219995, "epoch": 0.04888020441905907, "flos": 22707322594560.0, "grad_norm": 2.0009383484298713, "language_loss": 0.90891242, "learning_rate": 3.996262394509233e-06, "loss": 0.93188012, "num_input_tokens_seen": 17418240, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.578125, "step": 813, "time_per_iteration": 2.5027413368225098 }, { "auxiliary_loss_clip": 0.01217043, "auxiliary_loss_mlp": 0.01056071, "balance_loss_clip": 1.03152537, "balance_loss_mlp": 1.0580523, "epoch": 0.04894032767172704, "flos": 22784351310720.0, "grad_norm": 2.235128446738837, "language_loss": 0.74778378, "learning_rate": 3.9962385576734335e-06, "loss": 0.77051497, "num_input_tokens_seen": 17436250, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.59375, "step": 814, "time_per_iteration": 2.4741930961608887 }, { "auxiliary_loss_clip": 0.01222607, "auxiliary_loss_mlp": 0.01065465, "balance_loss_clip": 1.03909636, "balance_loss_mlp": 1.06088877, "epoch": 0.04900045092439501, "flos": 25516147345920.0, "grad_norm": 2.520999723015682, "language_loss": 0.83469248, "learning_rate": 3.9962146451400675e-06, "loss": 0.85757327, "num_input_tokens_seen": 17455750, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.609375, "step": 815, "time_per_iteration": 2.5401086807250977 }, { "auxiliary_loss_clip": 0.01224783, "auxiliary_loss_mlp": 0.01063521, "balance_loss_clip": 1.03758097, "balance_loss_mlp": 1.06222188, "epoch": 0.04906057417706298, "flos": 25958136199680.0, "grad_norm": 2.3427238552313905, "language_loss": 0.91167629, "learning_rate": 3.996190656910043e-06, "loss": 0.93455935, "num_input_tokens_seen": 17474995, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.625, "step": 816, "time_per_iteration": 2.506207227706909 }, { "auxiliary_loss_clip": 0.01225174, "auxiliary_loss_mlp": 0.01056689, "balance_loss_clip": 1.03014088, "balance_loss_mlp": 1.06047356, "epoch": 0.04912069742973095, "flos": 18624638937600.0, "grad_norm": 3.6932681401489633, "language_loss": 0.8017689, "learning_rate": 3.996166592984268e-06, "loss": 0.82458752, "num_input_tokens_seen": 17493395, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.6484375, "step": 817, "time_per_iteration": 2.484373092651367 }, { "auxiliary_loss_clip": 0.012193, "auxiliary_loss_mlp": 0.01067571, "balance_loss_clip": 1.04175019, "balance_loss_mlp": 1.06014752, "epoch": 0.049180820682398915, "flos": 23699786353920.0, "grad_norm": 1.8839230200078028, "language_loss": 0.84892714, "learning_rate": 3.996142453363656e-06, "loss": 0.87179583, "num_input_tokens_seen": 17514565, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.59375, "step": 818, "time_per_iteration": 2.5128681659698486 }, { "auxiliary_loss_clip": 0.01227517, "auxiliary_loss_mlp": 0.01063782, "balance_loss_clip": 1.03650641, "balance_loss_mlp": 1.06034338, "epoch": 0.04924094393506689, "flos": 22420396753920.0, "grad_norm": 2.1138153837778826, "language_loss": 0.76008439, "learning_rate": 3.996118238049124e-06, "loss": 0.78299737, "num_input_tokens_seen": 17534590, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.671875, "step": 819, "time_per_iteration": 2.5008277893066406 }, { "auxiliary_loss_clip": 0.01224715, "auxiliary_loss_mlp": 0.01058951, "balance_loss_clip": 1.03565764, "balance_loss_mlp": 1.06349993, "epoch": 0.04930106718773486, "flos": 15738246766080.0, "grad_norm": 2.462158760870924, "language_loss": 0.84820414, "learning_rate": 3.996093947041586e-06, "loss": 0.87104082, "num_input_tokens_seen": 17551900, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.609375, "step": 820, "time_per_iteration": 2.455585479736328 }, { "auxiliary_loss_clip": 0.01221759, "auxiliary_loss_mlp": 0.01062984, "balance_loss_clip": 1.03679371, "balance_loss_mlp": 1.05812454, "epoch": 0.049361190440402825, "flos": 26250628648320.0, "grad_norm": 2.206525389444392, "language_loss": 0.90701061, "learning_rate": 3.996069580341966e-06, "loss": 0.92985803, "num_input_tokens_seen": 17571485, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.640625, "step": 821, "time_per_iteration": 2.5401623249053955 }, { "auxiliary_loss_clip": 0.01219506, "auxiliary_loss_mlp": 0.010709, "balance_loss_clip": 1.04584229, "balance_loss_mlp": 1.05854344, "epoch": 0.0494213136930708, "flos": 21252366293760.0, "grad_norm": 1.9649365161693255, "language_loss": 0.90202057, "learning_rate": 3.996045137951188e-06, "loss": 0.92492461, "num_input_tokens_seen": 17591410, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.609375, "step": 822, "time_per_iteration": 2.4925155639648438 }, { "auxiliary_loss_clip": 0.01222171, "auxiliary_loss_mlp": 0.01061333, "balance_loss_clip": 1.03412914, "balance_loss_mlp": 1.06189132, "epoch": 0.04948143694573876, "flos": 27965506740480.0, "grad_norm": 1.7083359672063425, "language_loss": 0.67336345, "learning_rate": 3.996020619870178e-06, "loss": 0.69619846, "num_input_tokens_seen": 17612010, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.6015625, "step": 823, "time_per_iteration": 2.529418706893921 }, { "auxiliary_loss_clip": 0.01098153, "auxiliary_loss_mlp": 0.010214, "balance_loss_clip": 1.01758504, "balance_loss_mlp": 1.02630758, "epoch": 0.049541560198406734, "flos": 66180995533440.0, "grad_norm": 1.4087988686972597, "language_loss": 0.62424421, "learning_rate": 3.995996026099866e-06, "loss": 0.64543968, "num_input_tokens_seen": 17673430, "router_z_loss_clip": 0.03808594, "router_z_loss_mlp": 0.71875, "step": 824, "time_per_iteration": 3.1637580394744873 }, { "auxiliary_loss_clip": 0.01226981, "auxiliary_loss_mlp": 0.01069187, "balance_loss_clip": 1.04199529, "balance_loss_mlp": 1.06192076, "epoch": 0.049601683451074706, "flos": 22892693708160.0, "grad_norm": 1.9375117984840726, "language_loss": 0.90854156, "learning_rate": 3.995971356641185e-06, "loss": 0.93150324, "num_input_tokens_seen": 17689545, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.6484375, "step": 825, "time_per_iteration": 2.4619786739349365 }, { "auxiliary_loss_clip": 0.01222369, "auxiliary_loss_mlp": 0.01065893, "balance_loss_clip": 1.03873718, "balance_loss_mlp": 1.05964375, "epoch": 0.04966180670374267, "flos": 21433643256960.0, "grad_norm": 8.38194085727824, "language_loss": 0.66524595, "learning_rate": 3.9959466114950695e-06, "loss": 0.68812853, "num_input_tokens_seen": 17705965, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.625, "step": 826, "time_per_iteration": 2.4727602005004883 }, { "auxiliary_loss_clip": 0.01224846, "auxiliary_loss_mlp": 0.0106216, "balance_loss_clip": 1.03586268, "balance_loss_mlp": 1.06118155, "epoch": 0.04972192995641064, "flos": 23107367341440.0, "grad_norm": 2.085201533622005, "language_loss": 0.78070825, "learning_rate": 3.995921790662459e-06, "loss": 0.80357832, "num_input_tokens_seen": 17724580, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.640625, "step": 827, "time_per_iteration": 2.4689583778381348 }, { "auxiliary_loss_clip": 0.01226867, "auxiliary_loss_mlp": 0.01074665, "balance_loss_clip": 1.04700828, "balance_loss_mlp": 1.06128788, "epoch": 0.04978205320907861, "flos": 40406147458560.0, "grad_norm": 1.897707720450754, "language_loss": 0.79019397, "learning_rate": 3.995896894144294e-06, "loss": 0.8132093, "num_input_tokens_seen": 17747755, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.65625, "step": 828, "time_per_iteration": 2.6566343307495117 }, { "auxiliary_loss_clip": 0.01212392, "auxiliary_loss_mlp": 0.01060286, "balance_loss_clip": 1.0350852, "balance_loss_mlp": 1.0546869, "epoch": 0.04984217646174658, "flos": 25228539146880.0, "grad_norm": 1.8850377944636654, "language_loss": 0.83761913, "learning_rate": 3.995871921941519e-06, "loss": 0.86034596, "num_input_tokens_seen": 17768550, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.578125, "step": 829, "time_per_iteration": 2.492023468017578 }, { "auxiliary_loss_clip": 0.01221668, "auxiliary_loss_mlp": 0.0107435, "balance_loss_clip": 1.04494095, "balance_loss_mlp": 1.05811822, "epoch": 0.04990229971441455, "flos": 15959636242560.0, "grad_norm": 2.095526952042825, "language_loss": 0.75487083, "learning_rate": 3.99584687405508e-06, "loss": 0.77783102, "num_input_tokens_seen": 17786080, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.6328125, "step": 830, "time_per_iteration": 2.4595932960510254 }, { "auxiliary_loss_clip": 0.01221946, "auxiliary_loss_mlp": 0.01068379, "balance_loss_clip": 1.04084158, "balance_loss_mlp": 1.05865216, "epoch": 0.04996242296708252, "flos": 18405116968320.0, "grad_norm": 1.8228478407910487, "language_loss": 0.79882991, "learning_rate": 3.995821750485929e-06, "loss": 0.82173318, "num_input_tokens_seen": 17803635, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.6328125, "step": 831, "time_per_iteration": 2.4450180530548096 }, { "auxiliary_loss_clip": 0.01223698, "auxiliary_loss_mlp": 0.01077248, "balance_loss_clip": 1.05160594, "balance_loss_mlp": 1.05996752, "epoch": 0.05002254621975049, "flos": 17858053854720.0, "grad_norm": 2.6296175935571946, "language_loss": 0.91431558, "learning_rate": 3.995796551235016e-06, "loss": 0.93732512, "num_input_tokens_seen": 17822190, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.640625, "step": 832, "time_per_iteration": 2.4538230895996094 }, { "auxiliary_loss_clip": 0.01216205, "auxiliary_loss_mlp": 0.0107715, "balance_loss_clip": 1.05215144, "balance_loss_mlp": 1.05785847, "epoch": 0.050082669472418455, "flos": 45660273367680.0, "grad_norm": 2.116711992554241, "language_loss": 0.83449411, "learning_rate": 3.9957712763032974e-06, "loss": 0.8574276, "num_input_tokens_seen": 17846915, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.578125, "step": 833, "time_per_iteration": 2.6827008724212646 }, { "auxiliary_loss_clip": 0.0122237, "auxiliary_loss_mlp": 0.01058588, "balance_loss_clip": 1.03155124, "balance_loss_mlp": 1.05897677, "epoch": 0.05014279272508643, "flos": 37962067363200.0, "grad_norm": 2.010627669066052, "language_loss": 0.82106853, "learning_rate": 3.995745925691733e-06, "loss": 0.84387815, "num_input_tokens_seen": 17867270, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.6328125, "step": 834, "time_per_iteration": 2.60917067527771 }, { "auxiliary_loss_clip": 0.01224495, "auxiliary_loss_mlp": 0.01063815, "balance_loss_clip": 1.03608692, "balance_loss_mlp": 1.0592593, "epoch": 0.0502029159777544, "flos": 20996179516800.0, "grad_norm": 2.679258328517798, "language_loss": 0.92012364, "learning_rate": 3.995720499401282e-06, "loss": 0.94300669, "num_input_tokens_seen": 17884880, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.65625, "step": 835, "time_per_iteration": 2.4636552333831787 }, { "auxiliary_loss_clip": 0.01221732, "auxiliary_loss_mlp": 0.01069035, "balance_loss_clip": 1.04075885, "balance_loss_mlp": 1.05627775, "epoch": 0.050263039230422364, "flos": 15888066393600.0, "grad_norm": 2.7778512941902584, "language_loss": 0.76395541, "learning_rate": 3.995694997432911e-06, "loss": 0.78686309, "num_input_tokens_seen": 17903695, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.65625, "step": 836, "time_per_iteration": 2.454082489013672 }, { "auxiliary_loss_clip": 0.0121741, "auxiliary_loss_mlp": 0.01070388, "balance_loss_clip": 1.04443622, "balance_loss_mlp": 1.05934787, "epoch": 0.050323162483090336, "flos": 23732752060800.0, "grad_norm": 2.0687076334432133, "language_loss": 0.83653808, "learning_rate": 3.9956694197875855e-06, "loss": 0.85941607, "num_input_tokens_seen": 17920745, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.5859375, "step": 837, "time_per_iteration": 2.473996162414551 }, { "auxiliary_loss_clip": 0.01220994, "auxiliary_loss_mlp": 0.01065351, "balance_loss_clip": 1.03979266, "balance_loss_mlp": 1.06048644, "epoch": 0.0503832857357583, "flos": 20266223328000.0, "grad_norm": 2.304191891175959, "language_loss": 0.73183638, "learning_rate": 3.995643766466275e-06, "loss": 0.75469983, "num_input_tokens_seen": 17938220, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.6015625, "step": 838, "time_per_iteration": 2.4817233085632324 }, { "auxiliary_loss_clip": 0.0121737, "auxiliary_loss_mlp": 0.01066308, "balance_loss_clip": 1.03981984, "balance_loss_mlp": 1.05545712, "epoch": 0.05044340898842627, "flos": 17785011548160.0, "grad_norm": 1.834404978673594, "language_loss": 0.83381546, "learning_rate": 3.995618037469953e-06, "loss": 0.85665226, "num_input_tokens_seen": 17957325, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.6171875, "step": 839, "time_per_iteration": 2.4350850582122803 }, { "auxiliary_loss_clip": 0.01217177, "auxiliary_loss_mlp": 0.01074092, "balance_loss_clip": 1.04793715, "balance_loss_mlp": 1.05774665, "epoch": 0.050503532241094246, "flos": 22966526113920.0, "grad_norm": 1.9815829786693528, "language_loss": 0.8545512, "learning_rate": 3.995592232799595e-06, "loss": 0.87746382, "num_input_tokens_seen": 17975875, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.59375, "step": 840, "time_per_iteration": 2.4991455078125 }, { "auxiliary_loss_clip": 0.01223249, "auxiliary_loss_mlp": 0.01064927, "balance_loss_clip": 1.03713953, "balance_loss_mlp": 1.06070685, "epoch": 0.05056365549376221, "flos": 22776989022720.0, "grad_norm": 1.7517796115493334, "language_loss": 0.94856918, "learning_rate": 3.99556635245618e-06, "loss": 0.97145104, "num_input_tokens_seen": 17994340, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.625, "step": 841, "time_per_iteration": 2.4739551544189453 }, { "auxiliary_loss_clip": 0.01222189, "auxiliary_loss_mlp": 0.0107335, "balance_loss_clip": 1.04504943, "balance_loss_mlp": 1.05843055, "epoch": 0.05062377874643018, "flos": 30916968399360.0, "grad_norm": 2.3089225485321943, "language_loss": 0.77672505, "learning_rate": 3.995540396440688e-06, "loss": 0.79968047, "num_input_tokens_seen": 18015260, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.640625, "step": 842, "time_per_iteration": 2.5683412551879883 }, { "auxiliary_loss_clip": 0.01227297, "auxiliary_loss_mlp": 0.0107062, "balance_loss_clip": 1.04277265, "balance_loss_mlp": 1.0613482, "epoch": 0.05068390199909815, "flos": 19647159402240.0, "grad_norm": 1.9635969585592346, "language_loss": 0.78148156, "learning_rate": 3.995514364754105e-06, "loss": 0.8044607, "num_input_tokens_seen": 18033960, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.65625, "step": 843, "time_per_iteration": 2.4616036415100098 }, { "auxiliary_loss_clip": 0.01226501, "auxiliary_loss_mlp": 0.01061757, "balance_loss_clip": 1.0365448, "balance_loss_mlp": 1.06207943, "epoch": 0.05074402525176612, "flos": 37962103276800.0, "grad_norm": 1.838491040432003, "language_loss": 0.83002388, "learning_rate": 3.995488257397417e-06, "loss": 0.85290647, "num_input_tokens_seen": 18056700, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.640625, "step": 844, "time_per_iteration": 2.630467653274536 }, { "auxiliary_loss_clip": 0.01220273, "auxiliary_loss_mlp": 0.01063357, "balance_loss_clip": 1.03709495, "balance_loss_mlp": 1.05814981, "epoch": 0.05080414850443409, "flos": 22054610603520.0, "grad_norm": 2.9080777544924823, "language_loss": 0.76234287, "learning_rate": 3.995462074371614e-06, "loss": 0.78517914, "num_input_tokens_seen": 18075815, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.625, "step": 845, "time_per_iteration": 2.4773976802825928 }, { "auxiliary_loss_clip": 0.01218105, "auxiliary_loss_mlp": 0.01071275, "balance_loss_clip": 1.04416692, "balance_loss_mlp": 1.05641139, "epoch": 0.05086427175710206, "flos": 20225787592320.0, "grad_norm": 2.2483263518186347, "language_loss": 0.87731397, "learning_rate": 3.99543581567769e-06, "loss": 0.90020776, "num_input_tokens_seen": 18095095, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.6171875, "step": 846, "time_per_iteration": 2.461580991744995 }, { "auxiliary_loss_clip": 0.01220399, "auxiliary_loss_mlp": 0.010695, "balance_loss_clip": 1.04295194, "balance_loss_mlp": 1.05942881, "epoch": 0.05092439500977003, "flos": 15159223526400.0, "grad_norm": 1.6762532099007348, "language_loss": 0.87698472, "learning_rate": 3.9954094813166394e-06, "loss": 0.89988375, "num_input_tokens_seen": 18112675, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.609375, "step": 847, "time_per_iteration": 2.422255039215088 }, { "auxiliary_loss_clip": 0.01217042, "auxiliary_loss_mlp": 0.01070764, "balance_loss_clip": 1.04335785, "balance_loss_mlp": 1.05949807, "epoch": 0.050984518262437994, "flos": 22055149307520.0, "grad_norm": 2.0687779485303817, "language_loss": 0.82132888, "learning_rate": 3.995383071289462e-06, "loss": 0.84420699, "num_input_tokens_seen": 18130745, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.578125, "step": 848, "time_per_iteration": 3.9743380546569824 }, { "auxiliary_loss_clip": 0.01220369, "auxiliary_loss_mlp": 0.0106892, "balance_loss_clip": 1.04197907, "balance_loss_mlp": 1.06102204, "epoch": 0.05104464151510597, "flos": 30225329043840.0, "grad_norm": 1.7241676507539223, "language_loss": 0.87094635, "learning_rate": 3.995356585597158e-06, "loss": 0.89383924, "num_input_tokens_seen": 18152410, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.59375, "step": 849, "time_per_iteration": 4.072125196456909 }, { "auxiliary_loss_clip": 0.01217426, "auxiliary_loss_mlp": 0.01064098, "balance_loss_clip": 1.03712106, "balance_loss_mlp": 1.05744553, "epoch": 0.05110476476777394, "flos": 18332900674560.0, "grad_norm": 2.0160999041574095, "language_loss": 0.83295405, "learning_rate": 3.995330024240732e-06, "loss": 0.85576928, "num_input_tokens_seen": 18170870, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.6015625, "step": 850, "time_per_iteration": 3.874636650085449 }, { "auxiliary_loss_clip": 0.01220156, "auxiliary_loss_mlp": 0.01064751, "balance_loss_clip": 1.03856134, "balance_loss_mlp": 1.05816364, "epoch": 0.051164888020441904, "flos": 37998732170880.0, "grad_norm": 2.008032710086171, "language_loss": 0.65153301, "learning_rate": 3.995303387221192e-06, "loss": 0.67438203, "num_input_tokens_seen": 18191555, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.625, "step": 851, "time_per_iteration": 2.6049985885620117 }, { "auxiliary_loss_clip": 0.01217187, "auxiliary_loss_mlp": 0.01071873, "balance_loss_clip": 1.043239, "balance_loss_mlp": 1.05613017, "epoch": 0.051225011273109876, "flos": 23038634666880.0, "grad_norm": 2.174477841432049, "language_loss": 0.83484304, "learning_rate": 3.995276674539547e-06, "loss": 0.85773361, "num_input_tokens_seen": 18208620, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.609375, "step": 852, "time_per_iteration": 2.4498343467712402 }, { "auxiliary_loss_clip": 0.01221992, "auxiliary_loss_mlp": 0.01074825, "balance_loss_clip": 1.04743052, "balance_loss_mlp": 1.05848682, "epoch": 0.05128513452577785, "flos": 18259822454400.0, "grad_norm": 2.076583342265765, "language_loss": 0.80164665, "learning_rate": 3.995249886196811e-06, "loss": 0.82461488, "num_input_tokens_seen": 18226370, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.6328125, "step": 853, "time_per_iteration": 2.451916456222534 }, { "auxiliary_loss_clip": 0.01215526, "auxiliary_loss_mlp": 0.01070106, "balance_loss_clip": 1.04266453, "balance_loss_mlp": 1.056476, "epoch": 0.05134525777844581, "flos": 27198957571200.0, "grad_norm": 1.9502683968280619, "language_loss": 0.75819588, "learning_rate": 3.995223022193999e-06, "loss": 0.78105223, "num_input_tokens_seen": 18247075, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.59375, "step": 854, "time_per_iteration": 2.50819993019104 }, { "auxiliary_loss_clip": 0.01224449, "auxiliary_loss_mlp": 0.01066803, "balance_loss_clip": 1.03880036, "balance_loss_mlp": 1.06156039, "epoch": 0.051405381031113785, "flos": 28362247436160.0, "grad_norm": 2.523608182366558, "language_loss": 0.8156786, "learning_rate": 3.99519608253213e-06, "loss": 0.8385911, "num_input_tokens_seen": 18265680, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.625, "step": 855, "time_per_iteration": 2.52390456199646 }, { "auxiliary_loss_clip": 0.01114468, "auxiliary_loss_mlp": 0.01009439, "balance_loss_clip": 1.00571942, "balance_loss_mlp": 1.03954327, "epoch": 0.05146550428378175, "flos": 65618169327360.0, "grad_norm": 0.9737032931593018, "language_loss": 0.65693164, "learning_rate": 3.995169067212227e-06, "loss": 0.67817074, "num_input_tokens_seen": 18327015, "router_z_loss_clip": 0.03710938, "router_z_loss_mlp": 0.75, "step": 856, "time_per_iteration": 3.083124876022339 }, { "auxiliary_loss_clip": 0.01215548, "auxiliary_loss_mlp": 0.01056954, "balance_loss_clip": 1.03065681, "balance_loss_mlp": 1.05791664, "epoch": 0.05152562753644972, "flos": 22054861998720.0, "grad_norm": 1.8395406546168611, "language_loss": 0.76839268, "learning_rate": 3.9951419762353116e-06, "loss": 0.79111767, "num_input_tokens_seen": 18345235, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.578125, "step": 857, "time_per_iteration": 2.487651824951172 }, { "auxiliary_loss_clip": 0.0121995, "auxiliary_loss_mlp": 0.01057714, "balance_loss_clip": 1.03124976, "balance_loss_mlp": 1.05821753, "epoch": 0.051585750789117694, "flos": 18509544783360.0, "grad_norm": 2.043305217876723, "language_loss": 0.88715708, "learning_rate": 3.995114809602412e-06, "loss": 0.90993381, "num_input_tokens_seen": 18362350, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.6171875, "step": 858, "time_per_iteration": 2.456463098526001 }, { "auxiliary_loss_clip": 0.01216572, "auxiliary_loss_mlp": 0.01058689, "balance_loss_clip": 1.03215337, "balance_loss_mlp": 1.05710506, "epoch": 0.05164587404178566, "flos": 23730238108800.0, "grad_norm": 1.8563963653144933, "language_loss": 0.75665659, "learning_rate": 3.9950875673145605e-06, "loss": 0.77940923, "num_input_tokens_seen": 18383390, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.59375, "step": 859, "time_per_iteration": 2.5076866149902344 }, { "auxiliary_loss_clip": 0.0122743, "auxiliary_loss_mlp": 0.01075152, "balance_loss_clip": 1.04644632, "balance_loss_mlp": 1.06039691, "epoch": 0.05170599729445363, "flos": 16252882876800.0, "grad_norm": 2.5280054427706538, "language_loss": 0.90884769, "learning_rate": 3.995060249372788e-06, "loss": 0.9318735, "num_input_tokens_seen": 18399220, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.671875, "step": 860, "time_per_iteration": 2.4666857719421387 }, { "auxiliary_loss_clip": 0.01221289, "auxiliary_loss_mlp": 0.01057983, "balance_loss_clip": 1.03229332, "balance_loss_mlp": 1.06163383, "epoch": 0.0517661205471216, "flos": 23985922095360.0, "grad_norm": 1.945292146468344, "language_loss": 0.82527792, "learning_rate": 3.99503285577813e-06, "loss": 0.84807062, "num_input_tokens_seen": 18419005, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.59375, "step": 861, "time_per_iteration": 2.4866721630096436 }, { "auxiliary_loss_clip": 0.0121929, "auxiliary_loss_mlp": 0.01059412, "balance_loss_clip": 1.03406858, "balance_loss_mlp": 1.05781603, "epoch": 0.05182624379978957, "flos": 29277718392960.0, "grad_norm": 1.835040728388185, "language_loss": 0.7863971, "learning_rate": 3.995005386531627e-06, "loss": 0.80918419, "num_input_tokens_seen": 18440550, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.6171875, "step": 862, "time_per_iteration": 2.534636974334717 }, { "auxiliary_loss_clip": 0.012149, "auxiliary_loss_mlp": 0.01066782, "balance_loss_clip": 1.04153395, "balance_loss_mlp": 1.06003988, "epoch": 0.05188636705245754, "flos": 24170826332160.0, "grad_norm": 2.090389748678251, "language_loss": 0.88957918, "learning_rate": 3.9949778416343195e-06, "loss": 0.91239601, "num_input_tokens_seen": 18461950, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.546875, "step": 863, "time_per_iteration": 2.4783661365509033 }, { "auxiliary_loss_clip": 0.01223462, "auxiliary_loss_mlp": 0.0106023, "balance_loss_clip": 1.03216803, "balance_loss_mlp": 1.06276393, "epoch": 0.051946490305125506, "flos": 26760703731840.0, "grad_norm": 3.6772244506864133, "language_loss": 0.75897956, "learning_rate": 3.9949502210872525e-06, "loss": 0.78181648, "num_input_tokens_seen": 18480555, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.6015625, "step": 864, "time_per_iteration": 2.5095722675323486 }, { "auxiliary_loss_clip": 0.01222282, "auxiliary_loss_mlp": 0.01067058, "balance_loss_clip": 1.04035497, "balance_loss_mlp": 1.0601052, "epoch": 0.05200661355779348, "flos": 21502519585920.0, "grad_norm": 5.035251271562377, "language_loss": 0.79191267, "learning_rate": 3.994922524891474e-06, "loss": 0.81480604, "num_input_tokens_seen": 18499645, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.625, "step": 865, "time_per_iteration": 2.4638543128967285 }, { "auxiliary_loss_clip": 0.01219064, "auxiliary_loss_mlp": 0.01067051, "balance_loss_clip": 1.04068184, "balance_loss_mlp": 1.0587585, "epoch": 0.05206673681046144, "flos": 18114492026880.0, "grad_norm": 2.313210964650785, "language_loss": 0.859501, "learning_rate": 3.994894753048032e-06, "loss": 0.88236219, "num_input_tokens_seen": 18516810, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.6015625, "step": 866, "time_per_iteration": 2.4840316772460938 }, { "auxiliary_loss_clip": 0.01222633, "auxiliary_loss_mlp": 0.01063008, "balance_loss_clip": 1.0366745, "balance_loss_mlp": 1.06289554, "epoch": 0.052126860063129415, "flos": 17524191916800.0, "grad_norm": 2.3869729611849113, "language_loss": 0.87264729, "learning_rate": 3.9948669055579815e-06, "loss": 0.89550364, "num_input_tokens_seen": 18532510, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.6015625, "step": 867, "time_per_iteration": 2.425755500793457 }, { "auxiliary_loss_clip": 0.01213223, "auxiliary_loss_mlp": 0.01068272, "balance_loss_clip": 1.04487085, "balance_loss_mlp": 1.05975032, "epoch": 0.05218698331579739, "flos": 32598054771840.0, "grad_norm": 1.516115172246761, "language_loss": 0.63773066, "learning_rate": 3.9948389824223785e-06, "loss": 0.66054559, "num_input_tokens_seen": 18557380, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.5390625, "step": 868, "time_per_iteration": 2.587620735168457 }, { "auxiliary_loss_clip": 0.01222635, "auxiliary_loss_mlp": 0.01068718, "balance_loss_clip": 1.04076385, "balance_loss_mlp": 1.05954242, "epoch": 0.05224710656846535, "flos": 22127293774080.0, "grad_norm": 2.7373878512262264, "language_loss": 0.83317536, "learning_rate": 3.994810983642281e-06, "loss": 0.85608888, "num_input_tokens_seen": 18575720, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.6328125, "step": 869, "time_per_iteration": 2.464980363845825 }, { "auxiliary_loss_clip": 0.0122188, "auxiliary_loss_mlp": 0.01056192, "balance_loss_clip": 1.03019261, "balance_loss_mlp": 1.05926561, "epoch": 0.052307229821133325, "flos": 11145092976000.0, "grad_norm": 2.1399820161345735, "language_loss": 0.87278414, "learning_rate": 3.994782909218751e-06, "loss": 0.89556491, "num_input_tokens_seen": 18592185, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.625, "step": 870, "time_per_iteration": 2.442814350128174 }, { "auxiliary_loss_clip": 0.01220564, "auxiliary_loss_mlp": 0.01063093, "balance_loss_clip": 1.03762984, "balance_loss_mlp": 1.05980062, "epoch": 0.05236735307380129, "flos": 19128070005120.0, "grad_norm": 2.0833230663872984, "language_loss": 0.80521917, "learning_rate": 3.994754759152854e-06, "loss": 0.82805574, "num_input_tokens_seen": 18609560, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.609375, "step": 871, "time_per_iteration": 2.4393954277038574 }, { "auxiliary_loss_clip": 0.01218225, "auxiliary_loss_mlp": 0.01061187, "balance_loss_clip": 1.03630853, "balance_loss_mlp": 1.06068349, "epoch": 0.05242747632646926, "flos": 20960663944320.0, "grad_norm": 1.6794279952682307, "language_loss": 0.8164283, "learning_rate": 3.994726533445656e-06, "loss": 0.83922243, "num_input_tokens_seen": 18629405, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.578125, "step": 872, "time_per_iteration": 2.481635093688965 }, { "auxiliary_loss_clip": 0.01096401, "auxiliary_loss_mlp": 0.01011767, "balance_loss_clip": 1.00802374, "balance_loss_mlp": 1.02462578, "epoch": 0.052487599579137234, "flos": 65020542842880.0, "grad_norm": 0.8769193703394604, "language_loss": 0.61688566, "learning_rate": 3.9946982320982274e-06, "loss": 0.63796735, "num_input_tokens_seen": 18681480, "router_z_loss_clip": 0.03735352, "router_z_loss_mlp": 0.71875, "step": 873, "time_per_iteration": 2.9893956184387207 }, { "auxiliary_loss_clip": 0.01218357, "auxiliary_loss_mlp": 0.01058498, "balance_loss_clip": 1.03255832, "balance_loss_mlp": 1.0595665, "epoch": 0.0525477228318052, "flos": 23288859786240.0, "grad_norm": 2.029356128756201, "language_loss": 0.89020419, "learning_rate": 3.994669855111643e-06, "loss": 0.91297269, "num_input_tokens_seen": 18700390, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.5859375, "step": 874, "time_per_iteration": 2.5074806213378906 }, { "auxiliary_loss_clip": 0.01218902, "auxiliary_loss_mlp": 0.01064326, "balance_loss_clip": 1.03804016, "balance_loss_mlp": 1.05730247, "epoch": 0.05260784608447317, "flos": 32230221546240.0, "grad_norm": 1.7953411913641926, "language_loss": 0.7455976, "learning_rate": 3.994641402486977e-06, "loss": 0.76842988, "num_input_tokens_seen": 18721280, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.6171875, "step": 875, "time_per_iteration": 2.5442540645599365 }, { "auxiliary_loss_clip": 0.01216032, "auxiliary_loss_mlp": 0.01053133, "balance_loss_clip": 1.0265615, "balance_loss_mlp": 1.05797577, "epoch": 0.052667969337141136, "flos": 24463211040000.0, "grad_norm": 1.6936865544180661, "language_loss": 0.92942524, "learning_rate": 3.99461287422531e-06, "loss": 0.95211691, "num_input_tokens_seen": 18741545, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.578125, "step": 876, "time_per_iteration": 2.5278003215789795 }, { "auxiliary_loss_clip": 0.01091549, "auxiliary_loss_mlp": 0.01002187, "balance_loss_clip": 0.9986586, "balance_loss_mlp": 1.02055478, "epoch": 0.05272809258980911, "flos": 57784329567360.0, "grad_norm": 0.8295444542268139, "language_loss": 0.62937129, "learning_rate": 3.994584270327722e-06, "loss": 0.65030861, "num_input_tokens_seen": 18801400, "router_z_loss_clip": 0.03540039, "router_z_loss_mlp": 0.7109375, "step": 877, "time_per_iteration": 3.098057508468628 }, { "auxiliary_loss_clip": 0.01219681, "auxiliary_loss_mlp": 0.01067668, "balance_loss_clip": 1.04032099, "balance_loss_mlp": 1.05920541, "epoch": 0.05278821584247708, "flos": 17420805596160.0, "grad_norm": 2.9240568589357423, "language_loss": 0.85522711, "learning_rate": 3.994555590795299e-06, "loss": 0.87810057, "num_input_tokens_seen": 18819670, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.6015625, "step": 878, "time_per_iteration": 2.482764959335327 }, { "auxiliary_loss_clip": 0.01221754, "auxiliary_loss_mlp": 0.01054739, "balance_loss_clip": 1.0288353, "balance_loss_mlp": 1.06062472, "epoch": 0.052848339095145046, "flos": 26137258346880.0, "grad_norm": 1.809906490225536, "language_loss": 0.82817006, "learning_rate": 3.9945268356291275e-06, "loss": 0.85093498, "num_input_tokens_seen": 18840580, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.609375, "step": 879, "time_per_iteration": 2.5185353755950928 }, { "auxiliary_loss_clip": 0.01217798, "auxiliary_loss_mlp": 0.01066391, "balance_loss_clip": 1.03966427, "balance_loss_mlp": 1.05986893, "epoch": 0.05290846234781302, "flos": 16472081623680.0, "grad_norm": 2.300845560719434, "language_loss": 0.84299982, "learning_rate": 3.9944980048302985e-06, "loss": 0.86584175, "num_input_tokens_seen": 18859295, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.578125, "step": 880, "time_per_iteration": 2.4753000736236572 }, { "auxiliary_loss_clip": 0.0122427, "auxiliary_loss_mlp": 0.01063479, "balance_loss_clip": 1.03731227, "balance_loss_mlp": 1.06212783, "epoch": 0.05296858560048098, "flos": 19865173000320.0, "grad_norm": 2.129461790172949, "language_loss": 0.87128079, "learning_rate": 3.994469098399906e-06, "loss": 0.8941583, "num_input_tokens_seen": 18877485, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.625, "step": 881, "time_per_iteration": 2.4547998905181885 }, { "auxiliary_loss_clip": 0.01218769, "auxiliary_loss_mlp": 0.01061667, "balance_loss_clip": 1.03470182, "balance_loss_mlp": 1.05852532, "epoch": 0.053028708853148955, "flos": 24388588535040.0, "grad_norm": 1.7889011056556638, "language_loss": 0.87889552, "learning_rate": 3.994440116339046e-06, "loss": 0.90169978, "num_input_tokens_seen": 18898275, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.6015625, "step": 882, "time_per_iteration": 2.4978511333465576 }, { "auxiliary_loss_clip": 0.01221653, "auxiliary_loss_mlp": 0.01062512, "balance_loss_clip": 1.03498697, "balance_loss_mlp": 1.05932331, "epoch": 0.05308883210581693, "flos": 36393166143360.0, "grad_norm": 2.46154222650579, "language_loss": 0.692523, "learning_rate": 3.994411058648816e-06, "loss": 0.71536469, "num_input_tokens_seen": 18920665, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.625, "step": 883, "time_per_iteration": 2.590074300765991 }, { "auxiliary_loss_clip": 0.01216162, "auxiliary_loss_mlp": 0.0106004, "balance_loss_clip": 1.03457725, "balance_loss_mlp": 1.06099701, "epoch": 0.05314895535848489, "flos": 22855095146880.0, "grad_norm": 2.387759493610435, "language_loss": 0.75941288, "learning_rate": 3.994381925330319e-06, "loss": 0.78217489, "num_input_tokens_seen": 18939835, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.5546875, "step": 884, "time_per_iteration": 2.4747726917266846 }, { "auxiliary_loss_clip": 0.01213618, "auxiliary_loss_mlp": 0.01055801, "balance_loss_clip": 1.03148246, "balance_loss_mlp": 1.05906832, "epoch": 0.053209078611152864, "flos": 12860330204160.0, "grad_norm": 2.131912308118146, "language_loss": 0.86289072, "learning_rate": 3.994352716384659e-06, "loss": 0.88558495, "num_input_tokens_seen": 18958405, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 1.546875, "step": 885, "time_per_iteration": 2.4338276386260986 }, { "auxiliary_loss_clip": 0.01218634, "auxiliary_loss_mlp": 0.01062234, "balance_loss_clip": 1.03602028, "balance_loss_mlp": 1.057693, "epoch": 0.05326920186382083, "flos": 12164596698240.0, "grad_norm": 2.9346042206501757, "language_loss": 0.86330211, "learning_rate": 3.994323431812945e-06, "loss": 0.88611084, "num_input_tokens_seen": 18975445, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.609375, "step": 886, "time_per_iteration": 2.4278478622436523 }, { "auxiliary_loss_clip": 0.01216074, "auxiliary_loss_mlp": 0.0106325, "balance_loss_clip": 1.03636813, "balance_loss_mlp": 1.05882931, "epoch": 0.0533293251164888, "flos": 22704485420160.0, "grad_norm": 2.2610965069221725, "language_loss": 0.892308, "learning_rate": 3.994294071616286e-06, "loss": 0.91510117, "num_input_tokens_seen": 18991930, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.578125, "step": 887, "time_per_iteration": 2.467837333679199 }, { "auxiliary_loss_clip": 0.01216339, "auxiliary_loss_mlp": 0.01067044, "balance_loss_clip": 1.03826654, "balance_loss_mlp": 1.055673, "epoch": 0.053389448369156774, "flos": 26940939200640.0, "grad_norm": 2.221340146733075, "language_loss": 0.75294173, "learning_rate": 3.994264635795796e-06, "loss": 0.77577555, "num_input_tokens_seen": 19009790, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.609375, "step": 888, "time_per_iteration": 2.4953560829162598 }, { "auxiliary_loss_clip": 0.01217695, "auxiliary_loss_mlp": 0.01072171, "balance_loss_clip": 1.04404926, "balance_loss_mlp": 1.05963159, "epoch": 0.05344957162182474, "flos": 25556331686400.0, "grad_norm": 2.028857777614197, "language_loss": 0.8844499, "learning_rate": 3.994235124352592e-06, "loss": 0.90734857, "num_input_tokens_seen": 19030170, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.578125, "step": 889, "time_per_iteration": 2.4983088970184326 }, { "auxiliary_loss_clip": 0.01213241, "auxiliary_loss_mlp": 0.01050205, "balance_loss_clip": 1.02470565, "balance_loss_mlp": 1.05751264, "epoch": 0.05350969487449271, "flos": 19719591177600.0, "grad_norm": 1.869509209537226, "language_loss": 0.8853457, "learning_rate": 3.994205537287791e-06, "loss": 0.9079802, "num_input_tokens_seen": 19048075, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.5546875, "step": 890, "time_per_iteration": 3.8823068141937256 }, { "auxiliary_loss_clip": 0.01215881, "auxiliary_loss_mlp": 0.01068548, "balance_loss_clip": 1.04357398, "balance_loss_mlp": 1.05762935, "epoch": 0.053569818127160676, "flos": 27016351804800.0, "grad_norm": 3.4567612757449866, "language_loss": 0.93378794, "learning_rate": 3.994175874602517e-06, "loss": 0.95663226, "num_input_tokens_seen": 19067465, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.578125, "step": 891, "time_per_iteration": 5.441504955291748 }, { "auxiliary_loss_clip": 0.01214973, "auxiliary_loss_mlp": 0.01065232, "balance_loss_clip": 1.03764653, "balance_loss_mlp": 1.05758286, "epoch": 0.05362994137982865, "flos": 13188338225280.0, "grad_norm": 2.18267669343725, "language_loss": 0.72024012, "learning_rate": 3.994146136297893e-06, "loss": 0.74304217, "num_input_tokens_seen": 19085505, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.578125, "step": 892, "time_per_iteration": 2.4660820960998535 }, { "auxiliary_loss_clip": 0.01216722, "auxiliary_loss_mlp": 0.01072259, "balance_loss_clip": 1.04664135, "balance_loss_mlp": 1.0581398, "epoch": 0.05369006463249662, "flos": 28658008022400.0, "grad_norm": 1.8763248342750587, "language_loss": 0.82328951, "learning_rate": 3.994116322375049e-06, "loss": 0.84617937, "num_input_tokens_seen": 19104360, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.5859375, "step": 893, "time_per_iteration": 2.508021116256714 }, { "auxiliary_loss_clip": 0.01216683, "auxiliary_loss_mlp": 0.01061267, "balance_loss_clip": 1.03549373, "balance_loss_mlp": 1.05627155, "epoch": 0.053750187885164585, "flos": 28913153304960.0, "grad_norm": 2.099129257369959, "language_loss": 0.81242061, "learning_rate": 3.994086432835114e-06, "loss": 0.83520013, "num_input_tokens_seen": 19124680, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.6015625, "step": 894, "time_per_iteration": 2.5224719047546387 }, { "auxiliary_loss_clip": 0.01216864, "auxiliary_loss_mlp": 0.01059317, "balance_loss_clip": 1.03341317, "balance_loss_mlp": 1.05823076, "epoch": 0.05381031113783256, "flos": 15158828476800.0, "grad_norm": 2.1659508917984125, "language_loss": 0.75260377, "learning_rate": 3.994056467679221e-06, "loss": 0.77536559, "num_input_tokens_seen": 19142895, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.5859375, "step": 895, "time_per_iteration": 2.429800033569336 }, { "auxiliary_loss_clip": 0.01224949, "auxiliary_loss_mlp": 0.01060875, "balance_loss_clip": 1.03509033, "balance_loss_mlp": 1.06181931, "epoch": 0.05387043439050053, "flos": 21835232288640.0, "grad_norm": 1.8826712687803842, "language_loss": 0.86556655, "learning_rate": 3.9940264269085065e-06, "loss": 0.88842481, "num_input_tokens_seen": 19163125, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.6328125, "step": 896, "time_per_iteration": 2.494812488555908 }, { "auxiliary_loss_clip": 0.01219574, "auxiliary_loss_mlp": 0.01062683, "balance_loss_clip": 1.03527725, "balance_loss_mlp": 1.05815506, "epoch": 0.053930557643168495, "flos": 17310308382720.0, "grad_norm": 2.22636843144368, "language_loss": 0.88517839, "learning_rate": 3.9939963105241115e-06, "loss": 0.90800095, "num_input_tokens_seen": 19179385, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.609375, "step": 897, "time_per_iteration": 2.432173252105713 }, { "auxiliary_loss_clip": 0.01214705, "auxiliary_loss_mlp": 0.01066832, "balance_loss_clip": 1.03898466, "balance_loss_mlp": 1.05672765, "epoch": 0.05399068089583647, "flos": 17348481561600.0, "grad_norm": 1.650031574417938, "language_loss": 0.90480733, "learning_rate": 3.993966118527175e-06, "loss": 0.92762268, "num_input_tokens_seen": 19198725, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.578125, "step": 898, "time_per_iteration": 2.5033581256866455 }, { "auxiliary_loss_clip": 0.01222206, "auxiliary_loss_mlp": 0.01079071, "balance_loss_clip": 1.05223703, "balance_loss_mlp": 1.05975008, "epoch": 0.05405080414850443, "flos": 17486952491520.0, "grad_norm": 2.917577790779689, "language_loss": 0.92192721, "learning_rate": 3.993935850918845e-06, "loss": 0.94493997, "num_input_tokens_seen": 19212380, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.625, "step": 899, "time_per_iteration": 2.421766757965088 }, { "auxiliary_loss_clip": 0.01213254, "auxiliary_loss_mlp": 0.01065255, "balance_loss_clip": 1.03975606, "balance_loss_mlp": 1.05709815, "epoch": 0.054110927401172404, "flos": 24496787278080.0, "grad_norm": 2.5180020134062486, "language_loss": 0.75276399, "learning_rate": 3.9939055077002665e-06, "loss": 0.77554911, "num_input_tokens_seen": 19232235, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.5625, "step": 900, "time_per_iteration": 2.511098623275757 }, { "auxiliary_loss_clip": 0.01219713, "auxiliary_loss_mlp": 0.01060534, "balance_loss_clip": 1.03538108, "balance_loss_mlp": 1.05904722, "epoch": 0.054171050653840376, "flos": 22930040874240.0, "grad_norm": 2.291125485927803, "language_loss": 0.73712397, "learning_rate": 3.993875088872592e-06, "loss": 0.75992644, "num_input_tokens_seen": 19251460, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.609375, "step": 901, "time_per_iteration": 2.452735662460327 }, { "auxiliary_loss_clip": 0.01211515, "auxiliary_loss_mlp": 0.01071375, "balance_loss_clip": 1.04641283, "balance_loss_mlp": 1.05866981, "epoch": 0.05423117390650834, "flos": 12933192942720.0, "grad_norm": 2.2326696213881507, "language_loss": 0.85059345, "learning_rate": 3.9938445944369745e-06, "loss": 0.87342238, "num_input_tokens_seen": 19269060, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.53125, "step": 902, "time_per_iteration": 2.4436633586883545 }, { "auxiliary_loss_clip": 0.01213712, "auxiliary_loss_mlp": 0.01068505, "balance_loss_clip": 1.04247022, "balance_loss_mlp": 1.05604172, "epoch": 0.05429129715917631, "flos": 19901335017600.0, "grad_norm": 1.9238670094194843, "language_loss": 0.86814135, "learning_rate": 3.993814024394569e-06, "loss": 0.89096355, "num_input_tokens_seen": 19288620, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.578125, "step": 903, "time_per_iteration": 2.493544101715088 }, { "auxiliary_loss_clip": 0.01215059, "auxiliary_loss_mlp": 0.01063702, "balance_loss_clip": 1.03920484, "balance_loss_mlp": 1.05802941, "epoch": 0.05435142041184428, "flos": 16908611610240.0, "grad_norm": 2.0189125638431427, "language_loss": 0.74763519, "learning_rate": 3.993783378746537e-06, "loss": 0.77042282, "num_input_tokens_seen": 19306615, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.5703125, "step": 904, "time_per_iteration": 2.442856550216675 }, { "auxiliary_loss_clip": 0.01219294, "auxiliary_loss_mlp": 0.01074936, "balance_loss_clip": 1.04929376, "balance_loss_mlp": 1.05918014, "epoch": 0.05441154366451225, "flos": 23948323534080.0, "grad_norm": 7.305543389555527, "language_loss": 0.86143857, "learning_rate": 3.993752657494039e-06, "loss": 0.88438088, "num_input_tokens_seen": 19321680, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.6015625, "step": 905, "time_per_iteration": 2.4730653762817383 }, { "auxiliary_loss_clip": 0.01215705, "auxiliary_loss_mlp": 0.01069749, "balance_loss_clip": 1.0457406, "balance_loss_mlp": 1.0618645, "epoch": 0.05447166691718022, "flos": 19975382904960.0, "grad_norm": 1.8903726528202245, "language_loss": 0.74356896, "learning_rate": 3.993721860638241e-06, "loss": 0.76642346, "num_input_tokens_seen": 19339760, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.5390625, "step": 906, "time_per_iteration": 2.464919328689575 }, { "auxiliary_loss_clip": 0.01218855, "auxiliary_loss_mlp": 0.01057661, "balance_loss_clip": 1.032341, "balance_loss_mlp": 1.05902064, "epoch": 0.05453179016984819, "flos": 24936513575040.0, "grad_norm": 3.0457290918802595, "language_loss": 0.87800318, "learning_rate": 3.993690988180309e-06, "loss": 0.90076828, "num_input_tokens_seen": 19359585, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.59375, "step": 907, "time_per_iteration": 2.4814870357513428 }, { "auxiliary_loss_clip": 0.01218543, "auxiliary_loss_mlp": 0.01067872, "balance_loss_clip": 1.04172897, "balance_loss_mlp": 1.06134391, "epoch": 0.05459191342251616, "flos": 18115102558080.0, "grad_norm": 1.803440016325384, "language_loss": 0.87261373, "learning_rate": 3.9936600401214165e-06, "loss": 0.89547789, "num_input_tokens_seen": 19378590, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.5703125, "step": 908, "time_per_iteration": 2.44952130317688 }, { "auxiliary_loss_clip": 0.0121674, "auxiliary_loss_mlp": 0.01071769, "balance_loss_clip": 1.04477954, "balance_loss_mlp": 1.05997109, "epoch": 0.054652036675184125, "flos": 19208295031680.0, "grad_norm": 2.148590649638238, "language_loss": 0.89789373, "learning_rate": 3.9936290164627345e-06, "loss": 0.92077881, "num_input_tokens_seen": 19397910, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.5625, "step": 909, "time_per_iteration": 2.440624952316284 }, { "auxiliary_loss_clip": 0.01219403, "auxiliary_loss_mlp": 0.01074239, "balance_loss_clip": 1.04759598, "balance_loss_mlp": 1.05989063, "epoch": 0.0547121599278521, "flos": 16325745615360.0, "grad_norm": 5.107613062608846, "language_loss": 0.71169347, "learning_rate": 3.99359791720544e-06, "loss": 0.73462987, "num_input_tokens_seen": 19415950, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.59375, "step": 910, "time_per_iteration": 2.430516481399536 }, { "auxiliary_loss_clip": 0.01214178, "auxiliary_loss_mlp": 0.01057125, "balance_loss_clip": 1.03283048, "balance_loss_mlp": 1.0579735, "epoch": 0.05477228318052007, "flos": 20339014239360.0, "grad_norm": 1.9847599751486305, "language_loss": 0.8398419, "learning_rate": 3.993566742350714e-06, "loss": 0.86255491, "num_input_tokens_seen": 19435275, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.5625, "step": 911, "time_per_iteration": 2.476816415786743 }, { "auxiliary_loss_clip": 0.01214863, "auxiliary_loss_mlp": 0.01068307, "balance_loss_clip": 1.04160464, "balance_loss_mlp": 1.05674994, "epoch": 0.054832406433188034, "flos": 21973092687360.0, "grad_norm": 2.4489426452937004, "language_loss": 0.75920045, "learning_rate": 3.993535491899736e-06, "loss": 0.78203219, "num_input_tokens_seen": 19452090, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.578125, "step": 912, "time_per_iteration": 2.4665889739990234 }, { "auxiliary_loss_clip": 0.01210336, "auxiliary_loss_mlp": 0.01054031, "balance_loss_clip": 1.02951002, "balance_loss_mlp": 1.05745256, "epoch": 0.054892529685856006, "flos": 16398931576320.0, "grad_norm": 2.098905268204812, "language_loss": 0.82561976, "learning_rate": 3.993504165853694e-06, "loss": 0.84826338, "num_input_tokens_seen": 19470865, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.53125, "step": 913, "time_per_iteration": 2.476060628890991 }, { "auxiliary_loss_clip": 0.01212905, "auxiliary_loss_mlp": 0.01059143, "balance_loss_clip": 1.03450191, "balance_loss_mlp": 1.05999637, "epoch": 0.05495265293852397, "flos": 23912341084800.0, "grad_norm": 1.6542456126049216, "language_loss": 0.83483171, "learning_rate": 3.993472764213772e-06, "loss": 0.85755217, "num_input_tokens_seen": 19492145, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 1.53125, "step": 914, "time_per_iteration": 2.485229015350342 }, { "auxiliary_loss_clip": 0.01217888, "auxiliary_loss_mlp": 0.01054931, "balance_loss_clip": 1.03020644, "balance_loss_mlp": 1.06131268, "epoch": 0.055012776191191944, "flos": 23586954756480.0, "grad_norm": 2.355943258881255, "language_loss": 0.89983797, "learning_rate": 3.9934412869811655e-06, "loss": 0.92256618, "num_input_tokens_seen": 19511015, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.5703125, "step": 915, "time_per_iteration": 2.5157394409179688 }, { "auxiliary_loss_clip": 0.01215994, "auxiliary_loss_mlp": 0.01057165, "balance_loss_clip": 1.03306055, "balance_loss_mlp": 1.06112695, "epoch": 0.055072899443859916, "flos": 17528501548800.0, "grad_norm": 3.109750289321145, "language_loss": 0.89769065, "learning_rate": 3.993409734157064e-06, "loss": 0.92042226, "num_input_tokens_seen": 19529040, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.546875, "step": 916, "time_per_iteration": 2.4590511322021484 }, { "auxiliary_loss_clip": 0.01218297, "auxiliary_loss_mlp": 0.01067629, "balance_loss_clip": 1.04209483, "balance_loss_mlp": 1.05962133, "epoch": 0.05513302269652788, "flos": 21687172427520.0, "grad_norm": 1.8336692259851621, "language_loss": 0.802858, "learning_rate": 3.993378105742666e-06, "loss": 0.82571733, "num_input_tokens_seen": 19549540, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.5859375, "step": 917, "time_per_iteration": 2.4913928508758545 }, { "auxiliary_loss_clip": 0.01217512, "auxiliary_loss_mlp": 0.01064199, "balance_loss_clip": 1.03853321, "balance_loss_mlp": 1.05817294, "epoch": 0.05519314594919585, "flos": 21613340021760.0, "grad_norm": 2.693260963246125, "language_loss": 0.79623407, "learning_rate": 3.9933464017391705e-06, "loss": 0.81905121, "num_input_tokens_seen": 19567570, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.59375, "step": 918, "time_per_iteration": 2.47973895072937 }, { "auxiliary_loss_clip": 0.01214825, "auxiliary_loss_mlp": 0.01058685, "balance_loss_clip": 1.03372228, "balance_loss_mlp": 1.05749989, "epoch": 0.05525326920186382, "flos": 21798567480960.0, "grad_norm": 2.3397132394389546, "language_loss": 0.89111364, "learning_rate": 3.99331462214778e-06, "loss": 0.91384876, "num_input_tokens_seen": 19585330, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.5703125, "step": 919, "time_per_iteration": 2.4896059036254883 }, { "auxiliary_loss_clip": 0.01213557, "auxiliary_loss_mlp": 0.0107372, "balance_loss_clip": 1.04792261, "balance_loss_mlp": 1.05787683, "epoch": 0.05531339245453179, "flos": 28439635288320.0, "grad_norm": 2.0569057204123435, "language_loss": 0.86956418, "learning_rate": 3.993282766969699e-06, "loss": 0.89243698, "num_input_tokens_seen": 19604970, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.5546875, "step": 920, "time_per_iteration": 2.572732448577881 }, { "auxiliary_loss_clip": 0.01215312, "auxiliary_loss_mlp": 0.01059718, "balance_loss_clip": 1.03480327, "balance_loss_mlp": 1.06114244, "epoch": 0.05537351570719976, "flos": 37375143131520.0, "grad_norm": 2.2726858797834457, "language_loss": 0.65910184, "learning_rate": 3.993250836206136e-06, "loss": 0.6818521, "num_input_tokens_seen": 19626235, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.546875, "step": 921, "time_per_iteration": 2.5960896015167236 }, { "auxiliary_loss_clip": 0.01220735, "auxiliary_loss_mlp": 0.010644, "balance_loss_clip": 1.03631473, "balance_loss_mlp": 1.06149495, "epoch": 0.05543363895986773, "flos": 20084479488000.0, "grad_norm": 2.1900806088644913, "language_loss": 0.72105139, "learning_rate": 3.993218829858301e-06, "loss": 0.7439028, "num_input_tokens_seen": 19644305, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.59375, "step": 922, "time_per_iteration": 2.442821741104126 }, { "auxiliary_loss_clip": 0.01216428, "auxiliary_loss_mlp": 0.01065783, "balance_loss_clip": 1.03989112, "balance_loss_mlp": 1.05767775, "epoch": 0.0554937622125357, "flos": 24533200690560.0, "grad_norm": 3.0165590940554208, "language_loss": 0.82117838, "learning_rate": 3.993186747927408e-06, "loss": 0.84400052, "num_input_tokens_seen": 19662130, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.5859375, "step": 923, "time_per_iteration": 2.4892263412475586 }, { "auxiliary_loss_clip": 0.01213052, "auxiliary_loss_mlp": 0.0106669, "balance_loss_clip": 1.03986847, "balance_loss_mlp": 1.05503368, "epoch": 0.055553885465203665, "flos": 14320063013760.0, "grad_norm": 2.284350414731528, "language_loss": 0.78992581, "learning_rate": 3.993154590414675e-06, "loss": 0.81272316, "num_input_tokens_seen": 19680715, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.578125, "step": 924, "time_per_iteration": 2.4372854232788086 }, { "auxiliary_loss_clip": 0.01211385, "auxiliary_loss_mlp": 0.01061629, "balance_loss_clip": 1.03528357, "balance_loss_mlp": 1.05668807, "epoch": 0.05561400871787164, "flos": 27381132374400.0, "grad_norm": 2.032095897027873, "language_loss": 1.02352047, "learning_rate": 3.993122357321319e-06, "loss": 1.04625058, "num_input_tokens_seen": 19700535, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.546875, "step": 925, "time_per_iteration": 2.5159029960632324 }, { "auxiliary_loss_clip": 0.01211951, "auxiliary_loss_mlp": 0.01055028, "balance_loss_clip": 1.02968407, "balance_loss_mlp": 1.05432987, "epoch": 0.05567413197053961, "flos": 23221096778880.0, "grad_norm": 2.5851083048092613, "language_loss": 0.8128925, "learning_rate": 3.993090048648564e-06, "loss": 0.83556223, "num_input_tokens_seen": 19718825, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.578125, "step": 926, "time_per_iteration": 2.461786985397339 }, { "auxiliary_loss_clip": 0.01224867, "auxiliary_loss_mlp": 0.01065947, "balance_loss_clip": 1.03888667, "balance_loss_mlp": 1.06101227, "epoch": 0.055734255223207574, "flos": 25264952559360.0, "grad_norm": 4.6864973734980415, "language_loss": 0.73413509, "learning_rate": 3.993057664397634e-06, "loss": 0.75704324, "num_input_tokens_seen": 19739080, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.640625, "step": 927, "time_per_iteration": 2.5085511207580566 }, { "auxiliary_loss_clip": 0.01096617, "auxiliary_loss_mlp": 0.01020607, "balance_loss_clip": 1.01579118, "balance_loss_mlp": 1.02557945, "epoch": 0.055794378475875546, "flos": 66503116702080.0, "grad_norm": 0.7836134187061049, "language_loss": 0.59899992, "learning_rate": 3.9930252045697585e-06, "loss": 0.62017214, "num_input_tokens_seen": 19802960, "router_z_loss_clip": 0.0480957, "router_z_loss_mlp": 0.7109375, "step": 928, "time_per_iteration": 3.144382953643799 }, { "auxiliary_loss_clip": 0.01218539, "auxiliary_loss_mlp": 0.01066295, "balance_loss_clip": 1.04029512, "balance_loss_mlp": 1.0605669, "epoch": 0.05585450172854351, "flos": 25337635729920.0, "grad_norm": 2.2916249416301935, "language_loss": 0.95335048, "learning_rate": 3.992992669166168e-06, "loss": 0.97619885, "num_input_tokens_seen": 19822765, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.578125, "step": 929, "time_per_iteration": 2.480210065841675 }, { "auxiliary_loss_clip": 0.0121474, "auxiliary_loss_mlp": 0.01071956, "balance_loss_clip": 1.04383481, "balance_loss_mlp": 1.05747938, "epoch": 0.05591462498121148, "flos": 33911738881920.0, "grad_norm": 2.0439051763464495, "language_loss": 0.71817189, "learning_rate": 3.992960058188094e-06, "loss": 0.74103886, "num_input_tokens_seen": 19843590, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.5703125, "step": 930, "time_per_iteration": 2.58419132232666 }, { "auxiliary_loss_clip": 0.01218276, "auxiliary_loss_mlp": 0.01060668, "balance_loss_clip": 1.03386927, "balance_loss_mlp": 1.05961442, "epoch": 0.055974748233879455, "flos": 17930880679680.0, "grad_norm": 4.014446068324964, "language_loss": 0.84963274, "learning_rate": 3.992927371636776e-06, "loss": 0.87242222, "num_input_tokens_seen": 19860230, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.59375, "step": 931, "time_per_iteration": 3.864170789718628 }, { "auxiliary_loss_clip": 0.01218706, "auxiliary_loss_mlp": 0.01072713, "balance_loss_clip": 1.04566467, "balance_loss_mlp": 1.05941033, "epoch": 0.05603487148654742, "flos": 24021976371840.0, "grad_norm": 6.024703654727697, "language_loss": 0.83725178, "learning_rate": 3.9928946095134525e-06, "loss": 0.86016595, "num_input_tokens_seen": 19880795, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.59375, "step": 932, "time_per_iteration": 2.4998111724853516 }, { "auxiliary_loss_clip": 0.01217603, "auxiliary_loss_mlp": 0.01069114, "balance_loss_clip": 1.04069459, "balance_loss_mlp": 1.05941367, "epoch": 0.05609499473921539, "flos": 17307758517120.0, "grad_norm": 2.285430516320898, "language_loss": 0.73513961, "learning_rate": 3.992861771819365e-06, "loss": 0.75800681, "num_input_tokens_seen": 19897960, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.578125, "step": 933, "time_per_iteration": 6.868191480636597 }, { "auxiliary_loss_clip": 0.01213768, "auxiliary_loss_mlp": 0.01074571, "balance_loss_clip": 1.04736769, "balance_loss_mlp": 1.05702615, "epoch": 0.05615511799188336, "flos": 20994742972800.0, "grad_norm": 4.936387515654604, "language_loss": 0.86578345, "learning_rate": 3.99282885855576e-06, "loss": 0.88866675, "num_input_tokens_seen": 19913315, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.5625, "step": 934, "time_per_iteration": 2.467925548553467 }, { "auxiliary_loss_clip": 0.01211629, "auxiliary_loss_mlp": 0.01064132, "balance_loss_clip": 1.03919291, "balance_loss_mlp": 1.05979514, "epoch": 0.05621524124455133, "flos": 17273535834240.0, "grad_norm": 2.8033273863871835, "language_loss": 0.80426764, "learning_rate": 3.992795869723885e-06, "loss": 0.82702523, "num_input_tokens_seen": 19928790, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.5234375, "step": 935, "time_per_iteration": 2.417855978012085 }, { "auxiliary_loss_clip": 0.01092198, "auxiliary_loss_mlp": 0.01015691, "balance_loss_clip": 1.01166213, "balance_loss_mlp": 1.02498674, "epoch": 0.0562753644972193, "flos": 58719370458240.0, "grad_norm": 0.8279215637065304, "language_loss": 0.69159281, "learning_rate": 3.99276280532499e-06, "loss": 0.71267164, "num_input_tokens_seen": 19988785, "router_z_loss_clip": 0.0402832, "router_z_loss_mlp": 0.671875, "step": 936, "time_per_iteration": 3.0083510875701904 }, { "auxiliary_loss_clip": 0.01215568, "auxiliary_loss_mlp": 0.01063692, "balance_loss_clip": 1.03775227, "balance_loss_mlp": 1.05825353, "epoch": 0.05633548774988727, "flos": 17457039440640.0, "grad_norm": 3.138511760214011, "language_loss": 0.75935793, "learning_rate": 3.992729665360331e-06, "loss": 0.78215057, "num_input_tokens_seen": 20007685, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.578125, "step": 937, "time_per_iteration": 2.486640691757202 }, { "auxiliary_loss_clip": 0.01090703, "auxiliary_loss_mlp": 0.01006228, "balance_loss_clip": 1.00222266, "balance_loss_mlp": 1.02408147, "epoch": 0.05639561100255524, "flos": 70654928083200.0, "grad_norm": 1.167322808268864, "language_loss": 0.6445117, "learning_rate": 3.992696449831162e-06, "loss": 0.66548103, "num_input_tokens_seen": 20072750, "router_z_loss_clip": 0.04003906, "router_z_loss_mlp": 0.6640625, "step": 938, "time_per_iteration": 3.062978744506836 }, { "auxiliary_loss_clip": 0.01221866, "auxiliary_loss_mlp": 0.01063733, "balance_loss_clip": 1.03669643, "balance_loss_mlp": 1.05930388, "epoch": 0.056455734255223204, "flos": 20485996692480.0, "grad_norm": 3.2792411116585347, "language_loss": 0.79500461, "learning_rate": 3.992663158738745e-06, "loss": 0.8178606, "num_input_tokens_seen": 20089070, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.625, "step": 939, "time_per_iteration": 2.447234630584717 }, { "auxiliary_loss_clip": 0.01213877, "auxiliary_loss_mlp": 0.01068766, "balance_loss_clip": 1.04330301, "balance_loss_mlp": 1.05859113, "epoch": 0.056515857507891176, "flos": 22053569109120.0, "grad_norm": 1.6722278579898544, "language_loss": 0.74103415, "learning_rate": 3.992629792084341e-06, "loss": 0.76386058, "num_input_tokens_seen": 20108790, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.5546875, "step": 940, "time_per_iteration": 2.498835325241089 }, { "auxiliary_loss_clip": 0.01215924, "auxiliary_loss_mlp": 0.01067271, "balance_loss_clip": 1.04005516, "balance_loss_mlp": 1.06055069, "epoch": 0.05657598076055915, "flos": 24025316336640.0, "grad_norm": 1.9341170040884392, "language_loss": 0.70643651, "learning_rate": 3.992596349869216e-06, "loss": 0.72926855, "num_input_tokens_seen": 20128455, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.5546875, "step": 941, "time_per_iteration": 2.4765257835388184 }, { "auxiliary_loss_clip": 0.01211086, "auxiliary_loss_mlp": 0.01062864, "balance_loss_clip": 1.03749633, "balance_loss_mlp": 1.05688334, "epoch": 0.05663610401322711, "flos": 20480609652480.0, "grad_norm": 1.976540508262694, "language_loss": 0.81041431, "learning_rate": 3.992562832094637e-06, "loss": 0.83315384, "num_input_tokens_seen": 20145775, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.5390625, "step": 942, "time_per_iteration": 2.4586503505706787 }, { "auxiliary_loss_clip": 0.0120839, "auxiliary_loss_mlp": 0.01060238, "balance_loss_clip": 1.03504896, "balance_loss_mlp": 1.05509233, "epoch": 0.056696227265895086, "flos": 21069042255360.0, "grad_norm": 2.081899541808235, "language_loss": 0.88755322, "learning_rate": 3.9925292387618755e-06, "loss": 0.91023946, "num_input_tokens_seen": 20164315, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.53125, "step": 943, "time_per_iteration": 2.450437307357788 }, { "auxiliary_loss_clip": 0.01216233, "auxiliary_loss_mlp": 0.01063256, "balance_loss_clip": 1.03834116, "balance_loss_mlp": 1.06098294, "epoch": 0.05675635051856306, "flos": 17821317219840.0, "grad_norm": 2.521438198814715, "language_loss": 0.75285316, "learning_rate": 3.992495569872206e-06, "loss": 0.775648, "num_input_tokens_seen": 20182760, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.5546875, "step": 944, "time_per_iteration": 2.443230152130127 }, { "auxiliary_loss_clip": 0.01212331, "auxiliary_loss_mlp": 0.01062183, "balance_loss_clip": 1.03780437, "balance_loss_mlp": 1.05694914, "epoch": 0.05681647377123102, "flos": 23114945111040.0, "grad_norm": 1.6507103038868898, "language_loss": 0.79561317, "learning_rate": 3.992461825426906e-06, "loss": 0.8183583, "num_input_tokens_seen": 20203830, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.5546875, "step": 945, "time_per_iteration": 2.4710187911987305 }, { "auxiliary_loss_clip": 0.01214433, "auxiliary_loss_mlp": 0.01062947, "balance_loss_clip": 1.03793693, "balance_loss_mlp": 1.05772066, "epoch": 0.056876597023898995, "flos": 16070528505600.0, "grad_norm": 2.270906291769495, "language_loss": 0.82555509, "learning_rate": 3.992428005427252e-06, "loss": 0.84832883, "num_input_tokens_seen": 20220365, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.5625, "step": 946, "time_per_iteration": 2.4432363510131836 }, { "auxiliary_loss_clip": 0.01218673, "auxiliary_loss_mlp": 0.01059843, "balance_loss_clip": 1.03247261, "balance_loss_mlp": 1.05957627, "epoch": 0.05693672027656696, "flos": 16835641130880.0, "grad_norm": 1.8777071752973147, "language_loss": 0.78931212, "learning_rate": 3.992394109874529e-06, "loss": 0.81209725, "num_input_tokens_seen": 20238640, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.59375, "step": 947, "time_per_iteration": 2.436964988708496 }, { "auxiliary_loss_clip": 0.01223969, "auxiliary_loss_mlp": 0.01077192, "balance_loss_clip": 1.05108535, "balance_loss_mlp": 1.06259441, "epoch": 0.05699684352923493, "flos": 21389113370880.0, "grad_norm": 2.8428673312361217, "language_loss": 0.85779202, "learning_rate": 3.9923601387700225e-06, "loss": 0.88080359, "num_input_tokens_seen": 20251025, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.6171875, "step": 948, "time_per_iteration": 2.438612699508667 }, { "auxiliary_loss_clip": 0.01215189, "auxiliary_loss_mlp": 0.01069801, "balance_loss_clip": 1.04212081, "balance_loss_mlp": 1.05874121, "epoch": 0.057056966781902904, "flos": 15560309767680.0, "grad_norm": 2.284479523536822, "language_loss": 0.87604702, "learning_rate": 3.992326092115019e-06, "loss": 0.89889693, "num_input_tokens_seen": 20269775, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.5625, "step": 949, "time_per_iteration": 2.4739813804626465 }, { "auxiliary_loss_clip": 0.01209847, "auxiliary_loss_mlp": 0.01069163, "balance_loss_clip": 1.04424858, "balance_loss_mlp": 1.05712032, "epoch": 0.05711709003457087, "flos": 19937856170880.0, "grad_norm": 2.3370839633931917, "language_loss": 0.79011887, "learning_rate": 3.992291969910811e-06, "loss": 0.81290901, "num_input_tokens_seen": 20287715, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.53125, "step": 950, "time_per_iteration": 2.467104196548462 }, { "auxiliary_loss_clip": 0.01219201, "auxiliary_loss_mlp": 0.01076324, "balance_loss_clip": 1.05032432, "balance_loss_mlp": 1.06066442, "epoch": 0.05717721328723884, "flos": 30332701774080.0, "grad_norm": 2.1543627756490733, "language_loss": 0.82527846, "learning_rate": 3.992257772158691e-06, "loss": 0.8482337, "num_input_tokens_seen": 20307070, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.5859375, "step": 951, "time_per_iteration": 2.519195795059204 }, { "auxiliary_loss_clip": 0.01210759, "auxiliary_loss_mlp": 0.01061746, "balance_loss_clip": 1.03432798, "balance_loss_mlp": 1.05447865, "epoch": 0.05723733653990681, "flos": 23654358627840.0, "grad_norm": 3.21544642284726, "language_loss": 0.86402988, "learning_rate": 3.992223498859958e-06, "loss": 0.88675487, "num_input_tokens_seen": 20324945, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.5625, "step": 952, "time_per_iteration": 2.4812700748443604 }, { "auxiliary_loss_clip": 0.01216627, "auxiliary_loss_mlp": 0.01063008, "balance_loss_clip": 1.03454041, "balance_loss_mlp": 1.0550921, "epoch": 0.05729745979257478, "flos": 22055759838720.0, "grad_norm": 2.2240386875467157, "language_loss": 0.79365528, "learning_rate": 3.9921891500159084e-06, "loss": 0.81645155, "num_input_tokens_seen": 20346135, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.609375, "step": 953, "time_per_iteration": 2.4781713485717773 }, { "auxiliary_loss_clip": 0.01217664, "auxiliary_loss_mlp": 0.01070479, "balance_loss_clip": 1.04357362, "balance_loss_mlp": 1.0612793, "epoch": 0.05735758304524275, "flos": 19604353368960.0, "grad_norm": 2.565977914822496, "language_loss": 0.8694014, "learning_rate": 3.992154725627848e-06, "loss": 0.89228278, "num_input_tokens_seen": 20364450, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.5625, "step": 954, "time_per_iteration": 2.460033655166626 }, { "auxiliary_loss_clip": 0.01218867, "auxiliary_loss_mlp": 0.01063203, "balance_loss_clip": 1.03771603, "balance_loss_mlp": 1.05983818, "epoch": 0.057417706297910716, "flos": 19099018880640.0, "grad_norm": 2.5893606760170016, "language_loss": 0.8840462, "learning_rate": 3.9921202256970804e-06, "loss": 0.90686691, "num_input_tokens_seen": 20383500, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.59375, "step": 955, "time_per_iteration": 2.4449660778045654 }, { "auxiliary_loss_clip": 0.01215449, "auxiliary_loss_mlp": 0.0107102, "balance_loss_clip": 1.04456758, "balance_loss_mlp": 1.05931246, "epoch": 0.05747782955057869, "flos": 16654507822080.0, "grad_norm": 2.17355043101371, "language_loss": 0.89406765, "learning_rate": 3.992085650224914e-06, "loss": 0.91693228, "num_input_tokens_seen": 20400295, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.5625, "step": 956, "time_per_iteration": 2.446621894836426 }, { "auxiliary_loss_clip": 0.01211685, "auxiliary_loss_mlp": 0.01059317, "balance_loss_clip": 1.03247154, "balance_loss_mlp": 1.06007147, "epoch": 0.05753795280324665, "flos": 14502058248960.0, "grad_norm": 2.077005230472628, "language_loss": 0.75391126, "learning_rate": 3.99205099921266e-06, "loss": 0.77662134, "num_input_tokens_seen": 20419085, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.515625, "step": 957, "time_per_iteration": 2.4360761642456055 }, { "auxiliary_loss_clip": 0.01216218, "auxiliary_loss_mlp": 0.01076498, "balance_loss_clip": 1.04719663, "balance_loss_mlp": 1.05831504, "epoch": 0.057598076055914625, "flos": 18076318848000.0, "grad_norm": 2.1104534432034114, "language_loss": 0.80266488, "learning_rate": 3.992016272661633e-06, "loss": 0.82559204, "num_input_tokens_seen": 20437465, "router_z_loss_clip": 0.29296875, "router_z_loss_mlp": 1.578125, "step": 958, "time_per_iteration": 2.476039171218872 }, { "auxiliary_loss_clip": 0.0121357, "auxiliary_loss_mlp": 0.01059614, "balance_loss_clip": 1.03504515, "balance_loss_mlp": 1.05747831, "epoch": 0.0576581993085826, "flos": 22124600254080.0, "grad_norm": 3.3434411780747473, "language_loss": 0.88402182, "learning_rate": 3.99198147057315e-06, "loss": 0.90675372, "num_input_tokens_seen": 20456235, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.5625, "step": 959, "time_per_iteration": 2.4573965072631836 }, { "auxiliary_loss_clip": 0.01208823, "auxiliary_loss_mlp": 0.01060104, "balance_loss_clip": 1.03385425, "balance_loss_mlp": 1.05874467, "epoch": 0.05771832256125056, "flos": 33181746779520.0, "grad_norm": 2.0823791027276135, "language_loss": 0.7868405, "learning_rate": 3.991946592948529e-06, "loss": 0.80952972, "num_input_tokens_seen": 20476825, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.5, "step": 960, "time_per_iteration": 2.5775134563446045 }, { "auxiliary_loss_clip": 0.01215457, "auxiliary_loss_mlp": 0.01065576, "balance_loss_clip": 1.03795481, "balance_loss_mlp": 1.05746305, "epoch": 0.057778445813918534, "flos": 24170143973760.0, "grad_norm": 3.623214678730032, "language_loss": 0.93172979, "learning_rate": 3.991911639789094e-06, "loss": 0.95454007, "num_input_tokens_seen": 20496965, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.578125, "step": 961, "time_per_iteration": 2.4904792308807373 }, { "auxiliary_loss_clip": 0.01214101, "auxiliary_loss_mlp": 0.01067204, "balance_loss_clip": 1.0392729, "balance_loss_mlp": 1.0574261, "epoch": 0.0578385690665865, "flos": 29643037666560.0, "grad_norm": 2.0325486989861976, "language_loss": 0.6821121, "learning_rate": 3.991876611096169e-06, "loss": 0.70492518, "num_input_tokens_seen": 20518035, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.5625, "step": 962, "time_per_iteration": 2.542943000793457 }, { "auxiliary_loss_clip": 0.01214024, "auxiliary_loss_mlp": 0.01065916, "balance_loss_clip": 1.04114425, "balance_loss_mlp": 1.05985665, "epoch": 0.05789869231925447, "flos": 20885430908160.0, "grad_norm": 2.396655419022381, "language_loss": 0.88721478, "learning_rate": 3.991841506871084e-06, "loss": 0.91001415, "num_input_tokens_seen": 20534740, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.5390625, "step": 963, "time_per_iteration": 2.4528636932373047 }, { "auxiliary_loss_clip": 0.01222718, "auxiliary_loss_mlp": 0.01060178, "balance_loss_clip": 1.03417802, "balance_loss_mlp": 1.06448281, "epoch": 0.057958815571922444, "flos": 26031106679040.0, "grad_norm": 2.4806413641200153, "language_loss": 0.84801614, "learning_rate": 3.99180632711517e-06, "loss": 0.87084508, "num_input_tokens_seen": 20553485, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.5859375, "step": 964, "time_per_iteration": 2.5224993228912354 }, { "auxiliary_loss_clip": 0.01216651, "auxiliary_loss_mlp": 0.01072166, "balance_loss_clip": 1.04505777, "balance_loss_mlp": 1.06063581, "epoch": 0.05801893882459041, "flos": 18077683564800.0, "grad_norm": 2.9883910216740186, "language_loss": 0.78234935, "learning_rate": 3.99177107182976e-06, "loss": 0.80523753, "num_input_tokens_seen": 20572155, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.5625, "step": 965, "time_per_iteration": 2.4421653747558594 }, { "auxiliary_loss_clip": 0.01209153, "auxiliary_loss_mlp": 0.01066761, "balance_loss_clip": 1.04151225, "balance_loss_mlp": 1.05801845, "epoch": 0.05807906207725838, "flos": 17748885444480.0, "grad_norm": 1.9000329805521556, "language_loss": 0.81486583, "learning_rate": 3.99173574101619e-06, "loss": 0.83762503, "num_input_tokens_seen": 20590395, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.5078125, "step": 966, "time_per_iteration": 2.5682413578033447 }, { "auxiliary_loss_clip": 0.01215384, "auxiliary_loss_mlp": 0.01066464, "balance_loss_clip": 1.04194283, "balance_loss_mlp": 1.06153154, "epoch": 0.058139185329926346, "flos": 18040372312320.0, "grad_norm": 1.956062382127811, "language_loss": 0.76279485, "learning_rate": 3.9917003346758035e-06, "loss": 0.7856133, "num_input_tokens_seen": 20608435, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.5390625, "step": 967, "time_per_iteration": 2.4300689697265625 }, { "auxiliary_loss_clip": 0.01099118, "auxiliary_loss_mlp": 0.01098493, "balance_loss_clip": 1.09396315, "balance_loss_mlp": 1.02987659, "epoch": 0.05819930858259432, "flos": 62363297485440.0, "grad_norm": 0.8560489828164268, "language_loss": 0.5737468, "learning_rate": 3.991664852809939e-06, "loss": 0.59572291, "num_input_tokens_seen": 20668575, "router_z_loss_clip": 0.04541016, "router_z_loss_mlp": 0.69140625, "step": 968, "time_per_iteration": 3.0210230350494385 }, { "auxiliary_loss_clip": 0.01219137, "auxiliary_loss_mlp": 0.01062779, "balance_loss_clip": 1.03536105, "balance_loss_mlp": 1.06410968, "epoch": 0.05825943183526229, "flos": 19135360465920.0, "grad_norm": 2.0218651122404387, "language_loss": 0.82237303, "learning_rate": 3.991629295419945e-06, "loss": 0.84519219, "num_input_tokens_seen": 20687355, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.546875, "step": 969, "time_per_iteration": 2.4655497074127197 }, { "auxiliary_loss_clip": 0.01217112, "auxiliary_loss_mlp": 0.01059527, "balance_loss_clip": 1.03349125, "balance_loss_mlp": 1.06105208, "epoch": 0.058319555087930255, "flos": 29022465369600.0, "grad_norm": 2.2705331883541064, "language_loss": 0.77761596, "learning_rate": 3.991593662507167e-06, "loss": 0.80038238, "num_input_tokens_seen": 20705710, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.5625, "step": 970, "time_per_iteration": 2.5158231258392334 }, { "auxiliary_loss_clip": 0.0121673, "auxiliary_loss_mlp": 0.01063298, "balance_loss_clip": 1.03611851, "balance_loss_mlp": 1.06034744, "epoch": 0.05837967834059823, "flos": 18879999701760.0, "grad_norm": 2.4422550304199633, "language_loss": 0.92036986, "learning_rate": 3.991557954072958e-06, "loss": 0.94317007, "num_input_tokens_seen": 20722405, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.5625, "step": 971, "time_per_iteration": 2.459357261657715 }, { "auxiliary_loss_clip": 0.01212589, "auxiliary_loss_mlp": 0.01062927, "balance_loss_clip": 1.03718984, "balance_loss_mlp": 1.05843449, "epoch": 0.05843980159326619, "flos": 25703062744320.0, "grad_norm": 1.7792993473228154, "language_loss": 0.8612113, "learning_rate": 3.991522170118673e-06, "loss": 0.88396645, "num_input_tokens_seen": 20741480, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.5390625, "step": 972, "time_per_iteration": 2.50174617767334 }, { "auxiliary_loss_clip": 0.0121446, "auxiliary_loss_mlp": 0.01068663, "balance_loss_clip": 1.04308105, "balance_loss_mlp": 1.06235266, "epoch": 0.058499924845934165, "flos": 25552129795200.0, "grad_norm": 2.0448626748736016, "language_loss": 0.8769182, "learning_rate": 3.991486310645667e-06, "loss": 0.8997494, "num_input_tokens_seen": 20759685, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.515625, "step": 973, "time_per_iteration": 3.9567408561706543 }, { "auxiliary_loss_clip": 0.01214691, "auxiliary_loss_mlp": 0.0107575, "balance_loss_clip": 1.0488565, "balance_loss_mlp": 1.06036961, "epoch": 0.05856004809860214, "flos": 16436171001600.0, "grad_norm": 2.9623199827045883, "language_loss": 0.75131428, "learning_rate": 3.991450375655301e-06, "loss": 0.77421868, "num_input_tokens_seen": 20778180, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.546875, "step": 974, "time_per_iteration": 3.88228440284729 }, { "auxiliary_loss_clip": 0.01212765, "auxiliary_loss_mlp": 0.01067996, "balance_loss_clip": 1.0417347, "balance_loss_mlp": 1.06089461, "epoch": 0.0586201713512701, "flos": 39458824116480.0, "grad_norm": 1.6218394336353763, "language_loss": 0.76790637, "learning_rate": 3.991414365148936e-06, "loss": 0.79071397, "num_input_tokens_seen": 20802705, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.515625, "step": 975, "time_per_iteration": 4.043028831481934 }, { "auxiliary_loss_clip": 0.01219888, "auxiliary_loss_mlp": 0.0108008, "balance_loss_clip": 1.0545218, "balance_loss_mlp": 1.06290245, "epoch": 0.058680294603938074, "flos": 23365170230400.0, "grad_norm": 2.5743899956110976, "language_loss": 0.76815802, "learning_rate": 3.99137827912794e-06, "loss": 0.79115772, "num_input_tokens_seen": 20822540, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.5703125, "step": 976, "time_per_iteration": 2.50241756439209 }, { "auxiliary_loss_clip": 0.01209872, "auxiliary_loss_mlp": 0.01082679, "balance_loss_clip": 1.05681014, "balance_loss_mlp": 1.05706537, "epoch": 0.05874041785660604, "flos": 32232017226240.0, "grad_norm": 1.916891134727089, "language_loss": 0.87497187, "learning_rate": 3.991342117593679e-06, "loss": 0.89789748, "num_input_tokens_seen": 20844175, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.53125, "step": 977, "time_per_iteration": 2.5534119606018066 }, { "auxiliary_loss_clip": 0.01213204, "auxiliary_loss_mlp": 0.01073577, "balance_loss_clip": 1.04658759, "balance_loss_mlp": 1.06044531, "epoch": 0.05880054110927401, "flos": 22310043194880.0, "grad_norm": 2.2707511704572876, "language_loss": 0.79446459, "learning_rate": 3.991305880547527e-06, "loss": 0.81733239, "num_input_tokens_seen": 20864730, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.5234375, "step": 978, "time_per_iteration": 2.5121076107025146 }, { "auxiliary_loss_clip": 0.01221026, "auxiliary_loss_mlp": 0.01079994, "balance_loss_clip": 1.0523138, "balance_loss_mlp": 1.06320679, "epoch": 0.05886066436194198, "flos": 27380450016000.0, "grad_norm": 2.291722155862527, "language_loss": 0.8077963, "learning_rate": 3.991269567990855e-06, "loss": 0.83080649, "num_input_tokens_seen": 20885200, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.578125, "step": 979, "time_per_iteration": 2.5205633640289307 }, { "auxiliary_loss_clip": 0.01096845, "auxiliary_loss_mlp": 0.01062659, "balance_loss_clip": 1.05851078, "balance_loss_mlp": 1.03229976, "epoch": 0.05892078761460995, "flos": 59584493525760.0, "grad_norm": 0.9483428712610524, "language_loss": 0.590096, "learning_rate": 3.9912331799250415e-06, "loss": 0.61169112, "num_input_tokens_seen": 20940325, "router_z_loss_clip": 0.04150391, "router_z_loss_mlp": 0.64453125, "step": 980, "time_per_iteration": 2.9823544025421143 }, { "auxiliary_loss_clip": 0.01213758, "auxiliary_loss_mlp": 0.01076696, "balance_loss_clip": 1.04952812, "balance_loss_mlp": 1.06258368, "epoch": 0.05898091086727792, "flos": 15414081500160.0, "grad_norm": 2.1533857184713585, "language_loss": 0.86897337, "learning_rate": 3.9911967163514665e-06, "loss": 0.89187789, "num_input_tokens_seen": 20958220, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.515625, "step": 981, "time_per_iteration": 2.445178270339966 }, { "auxiliary_loss_clip": 0.01212138, "auxiliary_loss_mlp": 0.01060512, "balance_loss_clip": 1.03638399, "balance_loss_mlp": 1.06065786, "epoch": 0.059041034119945886, "flos": 23655328295040.0, "grad_norm": 2.365324546733283, "language_loss": 0.79594839, "learning_rate": 3.991160177271513e-06, "loss": 0.81867492, "num_input_tokens_seen": 20978920, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.515625, "step": 982, "time_per_iteration": 2.4995646476745605 }, { "auxiliary_loss_clip": 0.01221774, "auxiliary_loss_mlp": 0.01067528, "balance_loss_clip": 1.04093218, "balance_loss_mlp": 1.06229341, "epoch": 0.05910115737261386, "flos": 24754087376640.0, "grad_norm": 2.1117941570317496, "language_loss": 0.84610921, "learning_rate": 3.9911235626865654e-06, "loss": 0.86900222, "num_input_tokens_seen": 20999490, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.59375, "step": 983, "time_per_iteration": 2.4960386753082275 }, { "auxiliary_loss_clip": 0.01212213, "auxiliary_loss_mlp": 0.01067745, "balance_loss_clip": 1.04287851, "balance_loss_mlp": 1.06139302, "epoch": 0.05916128062528183, "flos": 11728749070080.0, "grad_norm": 1.9465956779405262, "language_loss": 0.84563637, "learning_rate": 3.9910868725980125e-06, "loss": 0.86843598, "num_input_tokens_seen": 21017865, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.5078125, "step": 984, "time_per_iteration": 2.453754425048828 }, { "auxiliary_loss_clip": 0.01209405, "auxiliary_loss_mlp": 0.01060426, "balance_loss_clip": 1.03579712, "balance_loss_mlp": 1.06019092, "epoch": 0.059221403877949795, "flos": 21902995296000.0, "grad_norm": 2.506849135245851, "language_loss": 0.77264094, "learning_rate": 3.9910501070072465e-06, "loss": 0.79533929, "num_input_tokens_seen": 21035900, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.4921875, "step": 985, "time_per_iteration": 2.4529192447662354 }, { "auxiliary_loss_clip": 0.01218334, "auxiliary_loss_mlp": 0.01061486, "balance_loss_clip": 1.03670251, "balance_loss_mlp": 1.06272721, "epoch": 0.05928152713061777, "flos": 20514580940160.0, "grad_norm": 1.838936751177948, "language_loss": 0.90604264, "learning_rate": 3.991013265915661e-06, "loss": 0.92884082, "num_input_tokens_seen": 21053235, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 1.5546875, "step": 986, "time_per_iteration": 2.46939754486084 }, { "auxiliary_loss_clip": 0.01216298, "auxiliary_loss_mlp": 0.01062304, "balance_loss_clip": 1.03431356, "balance_loss_mlp": 1.05861926, "epoch": 0.05934165038328574, "flos": 24495135252480.0, "grad_norm": 8.015668199859075, "language_loss": 0.75518692, "learning_rate": 3.9909763493246525e-06, "loss": 0.77797294, "num_input_tokens_seen": 21073090, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.578125, "step": 987, "time_per_iteration": 2.4983086585998535 }, { "auxiliary_loss_clip": 0.01219009, "auxiliary_loss_mlp": 0.01057516, "balance_loss_clip": 1.03223133, "balance_loss_mlp": 1.05958378, "epoch": 0.059401773635953704, "flos": 38728041914880.0, "grad_norm": 2.4352841195929855, "language_loss": 0.71703231, "learning_rate": 3.990939357235621e-06, "loss": 0.73979759, "num_input_tokens_seen": 21094895, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.59375, "step": 988, "time_per_iteration": 2.616101026535034 }, { "auxiliary_loss_clip": 0.01085558, "auxiliary_loss_mlp": 0.01008973, "balance_loss_clip": 1.00520563, "balance_loss_mlp": 1.02138853, "epoch": 0.059461896888621676, "flos": 58023565125120.0, "grad_norm": 0.9293052491680932, "language_loss": 0.71207404, "learning_rate": 3.99090228964997e-06, "loss": 0.73301935, "num_input_tokens_seen": 21147555, "router_z_loss_clip": 0.03759766, "router_z_loss_mlp": 0.640625, "step": 989, "time_per_iteration": 2.946591377258301 }, { "auxiliary_loss_clip": 0.01220574, "auxiliary_loss_mlp": 0.01069446, "balance_loss_clip": 1.04200363, "balance_loss_mlp": 1.06093645, "epoch": 0.05952202014128964, "flos": 22127760650880.0, "grad_norm": 3.0100020258184252, "language_loss": 0.78827202, "learning_rate": 3.990865146569105e-06, "loss": 0.81117219, "num_input_tokens_seen": 21167845, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.59375, "step": 990, "time_per_iteration": 2.501009702682495 }, { "auxiliary_loss_clip": 0.01209517, "auxiliary_loss_mlp": 0.01060605, "balance_loss_clip": 1.03479671, "balance_loss_mlp": 1.05724227, "epoch": 0.059582143393957614, "flos": 20445776438400.0, "grad_norm": 2.08076065224744, "language_loss": 0.86413586, "learning_rate": 3.990827927994434e-06, "loss": 0.88683712, "num_input_tokens_seen": 21185085, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.5234375, "step": 991, "time_per_iteration": 2.4534811973571777 }, { "auxiliary_loss_clip": 0.01217387, "auxiliary_loss_mlp": 0.01065576, "balance_loss_clip": 1.03981495, "balance_loss_mlp": 1.05910516, "epoch": 0.059642266646625586, "flos": 20594877793920.0, "grad_norm": 1.9801625232567484, "language_loss": 0.77245873, "learning_rate": 3.9907906339273674e-06, "loss": 0.79528832, "num_input_tokens_seen": 21204230, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.578125, "step": 992, "time_per_iteration": 2.466351270675659 }, { "auxiliary_loss_clip": 0.01215289, "auxiliary_loss_mlp": 0.01064458, "balance_loss_clip": 1.04037738, "balance_loss_mlp": 1.06132293, "epoch": 0.05970238989929355, "flos": 19352655792000.0, "grad_norm": 2.298117603880455, "language_loss": 0.75272584, "learning_rate": 3.9907532643693215e-06, "loss": 0.7755233, "num_input_tokens_seen": 21222655, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.5390625, "step": 993, "time_per_iteration": 2.4494543075561523 }, { "auxiliary_loss_clip": 0.01212804, "auxiliary_loss_mlp": 0.01075896, "balance_loss_clip": 1.05009913, "balance_loss_mlp": 1.06105351, "epoch": 0.05976251315196152, "flos": 30264040926720.0, "grad_norm": 1.865365782851439, "language_loss": 0.78626168, "learning_rate": 3.990715819321712e-06, "loss": 0.80914867, "num_input_tokens_seen": 21242310, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.515625, "step": 994, "time_per_iteration": 2.5654990673065186 }, { "auxiliary_loss_clip": 0.01213199, "auxiliary_loss_mlp": 0.01087714, "balance_loss_clip": 1.06234646, "balance_loss_mlp": 1.06023622, "epoch": 0.05982263640462949, "flos": 23185150243200.0, "grad_norm": 2.4951893691581972, "language_loss": 0.79963022, "learning_rate": 3.99067829878596e-06, "loss": 0.82263935, "num_input_tokens_seen": 21261410, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.53125, "step": 995, "time_per_iteration": 2.4976961612701416 }, { "auxiliary_loss_clip": 0.01211008, "auxiliary_loss_mlp": 0.01069571, "balance_loss_clip": 1.04255795, "balance_loss_mlp": 1.05697966, "epoch": 0.05988275965729746, "flos": 27850879463040.0, "grad_norm": 1.9945909974189167, "language_loss": 0.86847073, "learning_rate": 3.990640702763487e-06, "loss": 0.89127648, "num_input_tokens_seen": 21280080, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.5390625, "step": 996, "time_per_iteration": 2.525127410888672 }, { "auxiliary_loss_clip": 0.01217502, "auxiliary_loss_mlp": 0.01082323, "balance_loss_clip": 1.05132818, "balance_loss_mlp": 1.06177807, "epoch": 0.05994288290996543, "flos": 24680003575680.0, "grad_norm": 3.358639272790157, "language_loss": 0.88080859, "learning_rate": 3.990603031255718e-06, "loss": 0.90380687, "num_input_tokens_seen": 21296765, "router_z_loss_clip": 0.31054688, "router_z_loss_mlp": 1.5546875, "step": 997, "time_per_iteration": 2.4760518074035645 }, { "auxiliary_loss_clip": 0.01082553, "auxiliary_loss_mlp": 0.01007419, "balance_loss_clip": 1.00400949, "balance_loss_mlp": 1.01856232, "epoch": 0.0600030061626334, "flos": 69929568835200.0, "grad_norm": 1.0082482695243922, "language_loss": 0.75412893, "learning_rate": 3.990565284264083e-06, "loss": 0.77502871, "num_input_tokens_seen": 21363345, "router_z_loss_clip": 0.03417969, "router_z_loss_mlp": 0.640625, "step": 998, "time_per_iteration": 3.1731607913970947 }, { "auxiliary_loss_clip": 0.01212496, "auxiliary_loss_mlp": 0.01067612, "balance_loss_clip": 1.04192269, "balance_loss_mlp": 1.06010461, "epoch": 0.06006312941530137, "flos": 26540140268160.0, "grad_norm": 2.019191538673633, "language_loss": 0.75920761, "learning_rate": 3.990527461790013e-06, "loss": 0.78200865, "num_input_tokens_seen": 21385290, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.5234375, "step": 999, "time_per_iteration": 2.537480115890503 }, { "auxiliary_loss_clip": 0.01215394, "auxiliary_loss_mlp": 0.01063655, "balance_loss_clip": 1.03708303, "balance_loss_mlp": 1.05879569, "epoch": 0.060123252667969335, "flos": 27344000689920.0, "grad_norm": 1.6991863882427605, "language_loss": 0.82563257, "learning_rate": 3.990489563834943e-06, "loss": 0.848423, "num_input_tokens_seen": 21407625, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.5625, "step": 1000, "time_per_iteration": 2.5251777172088623 }, { "auxiliary_loss_clip": 0.01217196, "auxiliary_loss_mlp": 0.01066253, "balance_loss_clip": 1.04124296, "balance_loss_mlp": 1.06282806, "epoch": 0.06018337592063731, "flos": 27016710940800.0, "grad_norm": 4.292246338922873, "language_loss": 0.86156118, "learning_rate": 3.990451590400309e-06, "loss": 0.8843956, "num_input_tokens_seen": 21426835, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.546875, "step": 1001, "time_per_iteration": 2.5503175258636475 }, { "auxiliary_loss_clip": 0.01213492, "auxiliary_loss_mlp": 0.0106895, "balance_loss_clip": 1.04370165, "balance_loss_mlp": 1.0622592, "epoch": 0.06024349917330528, "flos": 25592960580480.0, "grad_norm": 2.020509714400331, "language_loss": 0.74128199, "learning_rate": 3.990413541487551e-06, "loss": 0.76410639, "num_input_tokens_seen": 21444920, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.515625, "step": 1002, "time_per_iteration": 2.50848126411438 }, { "auxiliary_loss_clip": 0.01213994, "auxiliary_loss_mlp": 0.01063811, "balance_loss_clip": 1.0378828, "balance_loss_mlp": 1.06089127, "epoch": 0.060303622425973244, "flos": 26133271937280.0, "grad_norm": 2.4484239595240957, "language_loss": 0.75670052, "learning_rate": 3.990375417098112e-06, "loss": 0.77947855, "num_input_tokens_seen": 21463555, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.53125, "step": 1003, "time_per_iteration": 2.513476848602295 }, { "auxiliary_loss_clip": 0.01222897, "auxiliary_loss_mlp": 0.01066749, "balance_loss_clip": 1.04101205, "balance_loss_mlp": 1.064291, "epoch": 0.060363745678641216, "flos": 20377187418240.0, "grad_norm": 3.436954084483884, "language_loss": 0.69789052, "learning_rate": 3.990337217233437e-06, "loss": 0.72078705, "num_input_tokens_seen": 21481990, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.5859375, "step": 1004, "time_per_iteration": 2.453671455383301 }, { "auxiliary_loss_clip": 0.01222636, "auxiliary_loss_mlp": 0.01081316, "balance_loss_clip": 1.05556679, "balance_loss_mlp": 1.0657953, "epoch": 0.06042386893130918, "flos": 17749172753280.0, "grad_norm": 3.2978302850160635, "language_loss": 0.83479762, "learning_rate": 3.990298941894976e-06, "loss": 0.85783708, "num_input_tokens_seen": 21500385, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.5703125, "step": 1005, "time_per_iteration": 2.4599320888519287 }, { "auxiliary_loss_clip": 0.01078902, "auxiliary_loss_mlp": 0.01003684, "balance_loss_clip": 1.00046492, "balance_loss_mlp": 1.01656127, "epoch": 0.06048399218397715, "flos": 68538496872960.0, "grad_norm": 0.9015438012593998, "language_loss": 0.59140265, "learning_rate": 3.9902605910841794e-06, "loss": 0.61222851, "num_input_tokens_seen": 21561040, "router_z_loss_clip": 0.03222656, "router_z_loss_mlp": 0.625, "step": 1006, "time_per_iteration": 3.1326534748077393 }, { "auxiliary_loss_clip": 0.01213005, "auxiliary_loss_mlp": 0.01056405, "balance_loss_clip": 1.03047657, "balance_loss_mlp": 1.05692911, "epoch": 0.060544115436645125, "flos": 23258515772160.0, "grad_norm": 2.069912855374522, "language_loss": 0.74226767, "learning_rate": 3.990222164802503e-06, "loss": 0.76496172, "num_input_tokens_seen": 21580655, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.5625, "step": 1007, "time_per_iteration": 2.486415147781372 }, { "auxiliary_loss_clip": 0.01216728, "auxiliary_loss_mlp": 0.01062182, "balance_loss_clip": 1.03595626, "balance_loss_mlp": 1.06172609, "epoch": 0.06060423868931309, "flos": 23878441624320.0, "grad_norm": 3.254936518192381, "language_loss": 0.80509317, "learning_rate": 3.9901836630514006e-06, "loss": 0.82788223, "num_input_tokens_seen": 21599650, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.546875, "step": 1008, "time_per_iteration": 2.4931488037109375 }, { "auxiliary_loss_clip": 0.01218295, "auxiliary_loss_mlp": 0.01065814, "balance_loss_clip": 1.03957617, "balance_loss_mlp": 1.0652554, "epoch": 0.06066436194198106, "flos": 18728061171840.0, "grad_norm": 1.907345609330277, "language_loss": 0.78178954, "learning_rate": 3.990145085832335e-06, "loss": 0.80463052, "num_input_tokens_seen": 21617550, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.53125, "step": 1009, "time_per_iteration": 2.462628126144409 }, { "auxiliary_loss_clip": 0.01213769, "auxiliary_loss_mlp": 0.01059093, "balance_loss_clip": 1.03389239, "balance_loss_mlp": 1.06323576, "epoch": 0.06072448519464903, "flos": 24640465680000.0, "grad_norm": 1.7443446544278325, "language_loss": 0.92773122, "learning_rate": 3.990106433146769e-06, "loss": 0.9504599, "num_input_tokens_seen": 21635865, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.5078125, "step": 1010, "time_per_iteration": 2.4786322116851807 }, { "auxiliary_loss_clip": 0.01223548, "auxiliary_loss_mlp": 0.01068341, "balance_loss_clip": 1.04037404, "balance_loss_mlp": 1.06230879, "epoch": 0.060784608447317, "flos": 17378825575680.0, "grad_norm": 2.7171239127729243, "language_loss": 0.71367961, "learning_rate": 3.9900677049961665e-06, "loss": 0.73659849, "num_input_tokens_seen": 21653945, "router_z_loss_clip": 0.27929688, "router_z_loss_mlp": 1.609375, "step": 1011, "time_per_iteration": 2.463777542114258 }, { "auxiliary_loss_clip": 0.01217904, "auxiliary_loss_mlp": 0.01070455, "balance_loss_clip": 1.04183292, "balance_loss_mlp": 1.06152725, "epoch": 0.06084473169998497, "flos": 23692208584320.0, "grad_norm": 1.8737552969951228, "language_loss": 0.87444925, "learning_rate": 3.990028901381999e-06, "loss": 0.89733291, "num_input_tokens_seen": 21671230, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.5625, "step": 1012, "time_per_iteration": 2.480933904647827 }, { "auxiliary_loss_clip": 0.01211288, "auxiliary_loss_mlp": 0.01063511, "balance_loss_clip": 1.03792882, "balance_loss_mlp": 1.05805564, "epoch": 0.06090485495265294, "flos": 23546339452800.0, "grad_norm": 2.5278079939383216, "language_loss": 0.76920348, "learning_rate": 3.989990022305734e-06, "loss": 0.79195142, "num_input_tokens_seen": 21691155, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.53125, "step": 1013, "time_per_iteration": 2.4850101470947266 }, { "auxiliary_loss_clip": 0.01219983, "auxiliary_loss_mlp": 0.01066688, "balance_loss_clip": 1.03824425, "balance_loss_mlp": 1.0620085, "epoch": 0.06096497820532091, "flos": 20339301548160.0, "grad_norm": 2.343824757836373, "language_loss": 0.8593514, "learning_rate": 3.98995106776885e-06, "loss": 0.88221812, "num_input_tokens_seen": 21707405, "router_z_loss_clip": 0.28515625, "router_z_loss_mlp": 1.578125, "step": 1014, "time_per_iteration": 2.479571580886841 }, { "auxiliary_loss_clip": 0.01222925, "auxiliary_loss_mlp": 0.01068572, "balance_loss_clip": 1.04072499, "balance_loss_mlp": 1.06359649, "epoch": 0.061025101457988874, "flos": 26939035779840.0, "grad_norm": 3.294523453353664, "language_loss": 0.73359424, "learning_rate": 3.98991203777282e-06, "loss": 0.7565093, "num_input_tokens_seen": 21728090, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.59375, "step": 1015, "time_per_iteration": 4.107834100723267 }, { "auxiliary_loss_clip": 0.01212228, "auxiliary_loss_mlp": 0.01069276, "balance_loss_clip": 1.04288304, "balance_loss_mlp": 1.06145597, "epoch": 0.061085224710656846, "flos": 25375054723200.0, "grad_norm": 1.611456651020526, "language_loss": 0.79272014, "learning_rate": 3.9898729323191275e-06, "loss": 0.81553519, "num_input_tokens_seen": 21747950, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.5078125, "step": 1016, "time_per_iteration": 3.9858665466308594 }, { "auxiliary_loss_clip": 0.01215294, "auxiliary_loss_mlp": 0.01060568, "balance_loss_clip": 1.03554606, "balance_loss_mlp": 1.06092882, "epoch": 0.06114534796332482, "flos": 24824759385600.0, "grad_norm": 1.702618618348019, "language_loss": 0.76288521, "learning_rate": 3.989833751409254e-06, "loss": 0.78564388, "num_input_tokens_seen": 21767900, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.546875, "step": 1017, "time_per_iteration": 2.4924821853637695 }, { "auxiliary_loss_clip": 0.01225448, "auxiliary_loss_mlp": 0.0107671, "balance_loss_clip": 1.05042386, "balance_loss_mlp": 1.06641495, "epoch": 0.061205471215992784, "flos": 20631434860800.0, "grad_norm": 1.9107111568255433, "language_loss": 0.8600474, "learning_rate": 3.989794495044685e-06, "loss": 0.88306904, "num_input_tokens_seen": 21787375, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.59375, "step": 1018, "time_per_iteration": 2.4822349548339844 }, { "auxiliary_loss_clip": 0.01213237, "auxiliary_loss_mlp": 0.01080368, "balance_loss_clip": 1.0528667, "balance_loss_mlp": 1.0614531, "epoch": 0.061265594468660756, "flos": 16508351381760.0, "grad_norm": 2.8893293652810743, "language_loss": 0.77604997, "learning_rate": 3.989755163226909e-06, "loss": 0.79898602, "num_input_tokens_seen": 21806275, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.515625, "step": 1019, "time_per_iteration": 2.457731008529663 }, { "auxiliary_loss_clip": 0.01216143, "auxiliary_loss_mlp": 0.01064251, "balance_loss_clip": 1.0375483, "balance_loss_mlp": 1.06080604, "epoch": 0.06132571772132872, "flos": 26246211275520.0, "grad_norm": 2.2052730277028676, "language_loss": 0.84405863, "learning_rate": 3.989715755957418e-06, "loss": 0.86686254, "num_input_tokens_seen": 21826430, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.5546875, "step": 1020, "time_per_iteration": 2.506040096282959 }, { "auxiliary_loss_clip": 0.0122149, "auxiliary_loss_mlp": 0.01064665, "balance_loss_clip": 1.03880858, "balance_loss_mlp": 1.06552458, "epoch": 0.06138584097399669, "flos": 37414788768000.0, "grad_norm": 1.89928642879142, "language_loss": 0.79230952, "learning_rate": 3.989676273237705e-06, "loss": 0.81517106, "num_input_tokens_seen": 21847800, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.5625, "step": 1021, "time_per_iteration": 2.6032164096832275 }, { "auxiliary_loss_clip": 0.01212323, "auxiliary_loss_mlp": 0.01064901, "balance_loss_clip": 1.04202437, "balance_loss_mlp": 1.06086802, "epoch": 0.061445964226664665, "flos": 17420661941760.0, "grad_norm": 1.903575442133639, "language_loss": 0.88013822, "learning_rate": 3.9896367150692705e-06, "loss": 0.90291041, "num_input_tokens_seen": 21863385, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.515625, "step": 1022, "time_per_iteration": 2.4610252380371094 }, { "auxiliary_loss_clip": 0.01219188, "auxiliary_loss_mlp": 0.01063998, "balance_loss_clip": 1.03770041, "balance_loss_mlp": 1.06645727, "epoch": 0.06150608747933263, "flos": 22600021691520.0, "grad_norm": 1.6496007703833597, "language_loss": 0.83032346, "learning_rate": 3.989597081453611e-06, "loss": 0.85315531, "num_input_tokens_seen": 21881880, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.53125, "step": 1023, "time_per_iteration": 2.466681480407715 }, { "auxiliary_loss_clip": 0.01085124, "auxiliary_loss_mlp": 0.01014215, "balance_loss_clip": 1.01023316, "balance_loss_mlp": 1.02222335, "epoch": 0.0615662107320006, "flos": 56741482005120.0, "grad_norm": 0.8858061802729967, "language_loss": 0.65020531, "learning_rate": 3.989557372392231e-06, "loss": 0.67119873, "num_input_tokens_seen": 21940550, "router_z_loss_clip": 0.03979492, "router_z_loss_mlp": 0.62890625, "step": 1024, "time_per_iteration": 3.168614625930786 }, { "auxiliary_loss_clip": 0.01219996, "auxiliary_loss_mlp": 0.01069707, "balance_loss_clip": 1.04252684, "balance_loss_mlp": 1.06662095, "epoch": 0.06162633398466857, "flos": 22564793427840.0, "grad_norm": 2.149249081144086, "language_loss": 0.88160878, "learning_rate": 3.989517587886636e-06, "loss": 0.90450585, "num_input_tokens_seen": 21958390, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.53125, "step": 1025, "time_per_iteration": 2.4704339504241943 }, { "auxiliary_loss_clip": 0.01218815, "auxiliary_loss_mlp": 0.01065851, "balance_loss_clip": 1.04078126, "balance_loss_mlp": 1.06530738, "epoch": 0.06168645723733654, "flos": 25593104234880.0, "grad_norm": 2.8038335277911837, "language_loss": 0.84895384, "learning_rate": 3.989477727938335e-06, "loss": 0.87180054, "num_input_tokens_seen": 21978625, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.53125, "step": 1026, "time_per_iteration": 2.5396981239318848 }, { "auxiliary_loss_clip": 0.01217877, "auxiliary_loss_mlp": 0.01073319, "balance_loss_clip": 1.04785609, "balance_loss_mlp": 1.06256199, "epoch": 0.06174658049000451, "flos": 15997917162240.0, "grad_norm": 1.7347224632621658, "language_loss": 0.82255077, "learning_rate": 3.989437792548839e-06, "loss": 0.84546274, "num_input_tokens_seen": 21996035, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.5546875, "step": 1027, "time_per_iteration": 2.446767807006836 }, { "auxiliary_loss_clip": 0.01214807, "auxiliary_loss_mlp": 0.0106322, "balance_loss_clip": 1.037256, "balance_loss_mlp": 1.06200159, "epoch": 0.06180670374267248, "flos": 11285970117120.0, "grad_norm": 2.2280300379212274, "language_loss": 0.8450067, "learning_rate": 3.989397781719663e-06, "loss": 0.867787, "num_input_tokens_seen": 22011625, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.53125, "step": 1028, "time_per_iteration": 2.4557509422302246 }, { "auxiliary_loss_clip": 0.01082507, "auxiliary_loss_mlp": 0.01005172, "balance_loss_clip": 1.0012145, "balance_loss_mlp": 1.02104187, "epoch": 0.06186682699534045, "flos": 65130142216320.0, "grad_norm": 0.9357810114260717, "language_loss": 0.60575032, "learning_rate": 3.989357695452323e-06, "loss": 0.62662715, "num_input_tokens_seen": 22066035, "router_z_loss_clip": 0.03955078, "router_z_loss_mlp": 0.6171875, "step": 1029, "time_per_iteration": 2.9132068157196045 }, { "auxiliary_loss_clip": 0.01209688, "auxiliary_loss_mlp": 0.01062967, "balance_loss_clip": 1.03771853, "balance_loss_mlp": 1.05836535, "epoch": 0.061926950248008414, "flos": 21105742976640.0, "grad_norm": 2.5165110912863224, "language_loss": 0.82789606, "learning_rate": 3.98931753374834e-06, "loss": 0.85062265, "num_input_tokens_seen": 22085015, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.515625, "step": 1030, "time_per_iteration": 2.504511833190918 }, { "auxiliary_loss_clip": 0.0121972, "auxiliary_loss_mlp": 0.01075694, "balance_loss_clip": 1.04906273, "balance_loss_mlp": 1.0637908, "epoch": 0.061987073500676386, "flos": 17748454481280.0, "grad_norm": 4.177274531703238, "language_loss": 0.79623234, "learning_rate": 3.989277296609237e-06, "loss": 0.81918645, "num_input_tokens_seen": 22102775, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.5625, "step": 1031, "time_per_iteration": 2.459197759628296 }, { "auxiliary_loss_clip": 0.01214758, "auxiliary_loss_mlp": 0.01076326, "balance_loss_clip": 1.04922974, "balance_loss_mlp": 1.06132543, "epoch": 0.06204719675334436, "flos": 21836237869440.0, "grad_norm": 1.9246264632585794, "language_loss": 0.77489746, "learning_rate": 3.98923698403654e-06, "loss": 0.79780829, "num_input_tokens_seen": 22121680, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.5390625, "step": 1032, "time_per_iteration": 2.4608607292175293 }, { "auxiliary_loss_clip": 0.01215481, "auxiliary_loss_mlp": 0.01070315, "balance_loss_clip": 1.04345703, "balance_loss_mlp": 1.06098104, "epoch": 0.06210732000601232, "flos": 19353697286400.0, "grad_norm": 2.137045685086952, "language_loss": 0.89444113, "learning_rate": 3.989196596031776e-06, "loss": 0.91729909, "num_input_tokens_seen": 22138155, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.546875, "step": 1033, "time_per_iteration": 2.453871965408325 }, { "auxiliary_loss_clip": 0.01217365, "auxiliary_loss_mlp": 0.0106591, "balance_loss_clip": 1.04024446, "balance_loss_mlp": 1.06141567, "epoch": 0.062167443258680295, "flos": 24749382695040.0, "grad_norm": 1.9416847745649966, "language_loss": 0.84615982, "learning_rate": 3.989156132596479e-06, "loss": 0.86899251, "num_input_tokens_seen": 22157420, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.5625, "step": 1034, "time_per_iteration": 2.511230945587158 }, { "auxiliary_loss_clip": 0.01206716, "auxiliary_loss_mlp": 0.01064161, "balance_loss_clip": 1.03772068, "balance_loss_mlp": 1.06105185, "epoch": 0.06222756651134827, "flos": 34458478773120.0, "grad_norm": 1.8630119917436754, "language_loss": 0.81213498, "learning_rate": 3.989115593732182e-06, "loss": 0.83484375, "num_input_tokens_seen": 22178620, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.453125, "step": 1035, "time_per_iteration": 2.5855026245117188 }, { "auxiliary_loss_clip": 0.01215569, "auxiliary_loss_mlp": 0.01069649, "balance_loss_clip": 1.04093122, "balance_loss_mlp": 1.06266558, "epoch": 0.06228768976401623, "flos": 25666469763840.0, "grad_norm": 2.0338530659908525, "language_loss": 0.7871505, "learning_rate": 3.989074979440421e-06, "loss": 0.81000262, "num_input_tokens_seen": 22197125, "router_z_loss_clip": 0.28710938, "router_z_loss_mlp": 1.53125, "step": 1036, "time_per_iteration": 2.5049664974212646 }, { "auxiliary_loss_clip": 0.01206639, "auxiliary_loss_mlp": 0.01070281, "balance_loss_clip": 1.04484153, "balance_loss_mlp": 1.05886889, "epoch": 0.062347813016684205, "flos": 25295619795840.0, "grad_norm": 1.6582403364795466, "language_loss": 0.86880249, "learning_rate": 3.989034289722739e-06, "loss": 0.89157164, "num_input_tokens_seen": 22217575, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.4765625, "step": 1037, "time_per_iteration": 2.5130276679992676 }, { "auxiliary_loss_clip": 0.01209579, "auxiliary_loss_mlp": 0.01057913, "balance_loss_clip": 1.03044701, "balance_loss_mlp": 1.06033635, "epoch": 0.06240793626935217, "flos": 26907039740160.0, "grad_norm": 2.256413728414613, "language_loss": 0.81369174, "learning_rate": 3.988993524580676e-06, "loss": 0.83636665, "num_input_tokens_seen": 22236840, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.4921875, "step": 1038, "time_per_iteration": 2.5062201023101807 }, { "auxiliary_loss_clip": 0.01209607, "auxiliary_loss_mlp": 0.01070503, "balance_loss_clip": 1.0436573, "balance_loss_mlp": 1.06243968, "epoch": 0.06246805952202014, "flos": 21615782146560.0, "grad_norm": 2.0019403881210285, "language_loss": 0.85527802, "learning_rate": 3.98895268401578e-06, "loss": 0.87807912, "num_input_tokens_seen": 22256465, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.46875, "step": 1039, "time_per_iteration": 2.4681289196014404 }, { "auxiliary_loss_clip": 0.01209534, "auxiliary_loss_mlp": 0.01067016, "balance_loss_clip": 1.03999162, "balance_loss_mlp": 1.05945778, "epoch": 0.0625281827746881, "flos": 19311896833920.0, "grad_norm": 1.9119205873509786, "language_loss": 0.80918068, "learning_rate": 3.9889117680296e-06, "loss": 0.83194613, "num_input_tokens_seen": 22274025, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.5, "step": 1040, "time_per_iteration": 2.470421075820923 }, { "auxiliary_loss_clip": 0.01213786, "auxiliary_loss_mlp": 0.01064176, "balance_loss_clip": 1.03778267, "balance_loss_mlp": 1.06368816, "epoch": 0.06258830602735609, "flos": 27745769289600.0, "grad_norm": 2.1623970945234015, "language_loss": 0.69786459, "learning_rate": 3.988870776623685e-06, "loss": 0.72064424, "num_input_tokens_seen": 22292245, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.5, "step": 1041, "time_per_iteration": 2.566394567489624 }, { "auxiliary_loss_clip": 0.01211502, "auxiliary_loss_mlp": 0.01059852, "balance_loss_clip": 1.03339946, "balance_loss_mlp": 1.05790198, "epoch": 0.06264842928002405, "flos": 23222605150080.0, "grad_norm": 2.495217818445787, "language_loss": 0.81445467, "learning_rate": 3.9888297097995905e-06, "loss": 0.83716822, "num_input_tokens_seen": 22311455, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.53125, "step": 1042, "time_per_iteration": 2.4741015434265137 }, { "auxiliary_loss_clip": 0.01209468, "auxiliary_loss_mlp": 0.01053675, "balance_loss_clip": 1.02868891, "balance_loss_mlp": 1.05888438, "epoch": 0.06270855253269202, "flos": 38399495189760.0, "grad_norm": 1.652078834647491, "language_loss": 0.76185846, "learning_rate": 3.988788567558874e-06, "loss": 0.78448987, "num_input_tokens_seen": 22333750, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.5078125, "step": 1043, "time_per_iteration": 2.6338307857513428 }, { "auxiliary_loss_clip": 0.01204685, "auxiliary_loss_mlp": 0.0106581, "balance_loss_clip": 1.04078794, "balance_loss_mlp": 1.06014085, "epoch": 0.06276867578535998, "flos": 22453542028800.0, "grad_norm": 1.8526891230425218, "language_loss": 0.92481464, "learning_rate": 3.988747349903097e-06, "loss": 0.9475196, "num_input_tokens_seen": 22351940, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.4453125, "step": 1044, "time_per_iteration": 2.449979066848755 }, { "auxiliary_loss_clip": 0.01207251, "auxiliary_loss_mlp": 0.01072151, "balance_loss_clip": 1.04631829, "balance_loss_mlp": 1.05747795, "epoch": 0.06282879903802796, "flos": 22930435923840.0, "grad_norm": 2.714907896879035, "language_loss": 0.86168981, "learning_rate": 3.988706056833821e-06, "loss": 0.88448381, "num_input_tokens_seen": 22372085, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.5, "step": 1045, "time_per_iteration": 2.510779619216919 }, { "auxiliary_loss_clip": 0.01204297, "auxiliary_loss_mlp": 0.01065881, "balance_loss_clip": 1.04100168, "balance_loss_mlp": 1.05752909, "epoch": 0.06288892229069593, "flos": 34819237019520.0, "grad_norm": 1.9908988737572353, "language_loss": 0.78329754, "learning_rate": 3.9886646883526125e-06, "loss": 0.80599934, "num_input_tokens_seen": 22392020, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.46875, "step": 1046, "time_per_iteration": 2.5561046600341797 }, { "auxiliary_loss_clip": 0.0120643, "auxiliary_loss_mlp": 0.01071952, "balance_loss_clip": 1.04704881, "balance_loss_mlp": 1.05850124, "epoch": 0.06294904554336389, "flos": 19427134642560.0, "grad_norm": 2.2075901188891556, "language_loss": 0.77388543, "learning_rate": 3.988623244461039e-06, "loss": 0.79666924, "num_input_tokens_seen": 22411180, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.484375, "step": 1047, "time_per_iteration": 2.4726710319519043 }, { "auxiliary_loss_clip": 0.01214475, "auxiliary_loss_mlp": 0.01063886, "balance_loss_clip": 1.03750515, "balance_loss_mlp": 1.05977035, "epoch": 0.06300916879603187, "flos": 40661867358720.0, "grad_norm": 2.157065555611277, "language_loss": 0.77005959, "learning_rate": 3.988581725160672e-06, "loss": 0.79284316, "num_input_tokens_seen": 22435105, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.546875, "step": 1048, "time_per_iteration": 2.64678955078125 }, { "auxiliary_loss_clip": 0.01209752, "auxiliary_loss_mlp": 0.01072357, "balance_loss_clip": 1.04610753, "balance_loss_mlp": 1.05834651, "epoch": 0.06306929204869983, "flos": 23804142341760.0, "grad_norm": 2.2853142996111995, "language_loss": 0.775401, "learning_rate": 3.988540130453087e-06, "loss": 0.79822206, "num_input_tokens_seen": 22452710, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.515625, "step": 1049, "time_per_iteration": 2.523597478866577 }, { "auxiliary_loss_clip": 0.0120863, "auxiliary_loss_mlp": 0.01056909, "balance_loss_clip": 1.03191054, "balance_loss_mlp": 1.05884194, "epoch": 0.0631294153013678, "flos": 18915802583040.0, "grad_norm": 2.324637329227423, "language_loss": 0.83022225, "learning_rate": 3.988498460339862e-06, "loss": 0.85287762, "num_input_tokens_seen": 22470175, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.5, "step": 1050, "time_per_iteration": 2.4506049156188965 }, { "auxiliary_loss_clip": 0.01210392, "auxiliary_loss_mlp": 0.01067051, "balance_loss_clip": 1.04263687, "balance_loss_mlp": 1.06259394, "epoch": 0.06318953855403578, "flos": 24280174310400.0, "grad_norm": 1.7363416163453604, "language_loss": 0.76826966, "learning_rate": 3.988456714822575e-06, "loss": 0.79104412, "num_input_tokens_seen": 22490020, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.484375, "step": 1051, "time_per_iteration": 2.5192530155181885 }, { "auxiliary_loss_clip": 0.01209727, "auxiliary_loss_mlp": 0.01077382, "balance_loss_clip": 1.05134654, "balance_loss_mlp": 1.05962539, "epoch": 0.06324966180670374, "flos": 22528918719360.0, "grad_norm": 2.227316267189946, "language_loss": 0.79989684, "learning_rate": 3.98841489390281e-06, "loss": 0.82276797, "num_input_tokens_seen": 22509685, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.5, "step": 1052, "time_per_iteration": 2.4786906242370605 }, { "auxiliary_loss_clip": 0.01211302, "auxiliary_loss_mlp": 0.0106038, "balance_loss_clip": 1.03446388, "balance_loss_mlp": 1.06059718, "epoch": 0.06330978505937171, "flos": 15778107884160.0, "grad_norm": 2.125747929027282, "language_loss": 0.78117323, "learning_rate": 3.988372997582155e-06, "loss": 0.80388999, "num_input_tokens_seen": 22527905, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.5078125, "step": 1053, "time_per_iteration": 2.480353593826294 }, { "auxiliary_loss_clip": 0.01209869, "auxiliary_loss_mlp": 0.01055488, "balance_loss_clip": 1.03058541, "balance_loss_mlp": 1.05961442, "epoch": 0.06336990831203967, "flos": 21471098163840.0, "grad_norm": 1.97699261517924, "language_loss": 0.84733409, "learning_rate": 3.988331025862195e-06, "loss": 0.86998761, "num_input_tokens_seen": 22546335, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.5, "step": 1054, "time_per_iteration": 2.46647572517395 }, { "auxiliary_loss_clip": 0.0120808, "auxiliary_loss_mlp": 0.01066958, "balance_loss_clip": 1.04215026, "balance_loss_mlp": 1.0611316, "epoch": 0.06343003156470765, "flos": 18478877546880.0, "grad_norm": 2.186935765800818, "language_loss": 0.85636365, "learning_rate": 3.9882889787445225e-06, "loss": 0.87911403, "num_input_tokens_seen": 22563885, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.46875, "step": 1055, "time_per_iteration": 2.470257520675659 }, { "auxiliary_loss_clip": 0.01212106, "auxiliary_loss_mlp": 0.01067, "balance_loss_clip": 1.04166806, "balance_loss_mlp": 1.05964708, "epoch": 0.06349015481737562, "flos": 25154886309120.0, "grad_norm": 3.6427086234983626, "language_loss": 0.80978394, "learning_rate": 3.988246856230734e-06, "loss": 0.83257502, "num_input_tokens_seen": 22583035, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.5234375, "step": 1056, "time_per_iteration": 3.921149492263794 }, { "auxiliary_loss_clip": 0.01213089, "auxiliary_loss_mlp": 0.01065154, "balance_loss_clip": 1.03737807, "balance_loss_mlp": 1.05672097, "epoch": 0.06355027807004358, "flos": 26871775562880.0, "grad_norm": 2.475711426101451, "language_loss": 0.81390887, "learning_rate": 3.988204658322426e-06, "loss": 0.83669132, "num_input_tokens_seen": 22605055, "router_z_loss_clip": 0.27734375, "router_z_loss_mlp": 1.5625, "step": 1057, "time_per_iteration": 3.961085081100464 }, { "auxiliary_loss_clip": 0.01199332, "auxiliary_loss_mlp": 0.01069015, "balance_loss_clip": 1.04594839, "balance_loss_mlp": 1.05661535, "epoch": 0.06361040132271156, "flos": 21396691140480.0, "grad_norm": 1.7552989066360771, "language_loss": 0.83421361, "learning_rate": 3.988162385021196e-06, "loss": 0.85689712, "num_input_tokens_seen": 22623760, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.4296875, "step": 1058, "time_per_iteration": 5.276554346084595 }, { "auxiliary_loss_clip": 0.01208541, "auxiliary_loss_mlp": 0.01063686, "balance_loss_clip": 1.03638744, "balance_loss_mlp": 1.05943584, "epoch": 0.06367052457537953, "flos": 25733765894400.0, "grad_norm": 1.887761458652131, "language_loss": 0.87842798, "learning_rate": 3.988120036328651e-06, "loss": 0.90115029, "num_input_tokens_seen": 22643000, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.4921875, "step": 1059, "time_per_iteration": 2.5119619369506836 }, { "auxiliary_loss_clip": 0.012168, "auxiliary_loss_mlp": 0.01065427, "balance_loss_clip": 1.03881955, "balance_loss_mlp": 1.0640254, "epoch": 0.0637306478280475, "flos": 17631420992640.0, "grad_norm": 3.1463872157522474, "language_loss": 0.91367918, "learning_rate": 3.988077612246394e-06, "loss": 0.9365015, "num_input_tokens_seen": 22660460, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.53125, "step": 1060, "time_per_iteration": 2.434664249420166 }, { "auxiliary_loss_clip": 0.01208609, "auxiliary_loss_mlp": 0.01066307, "balance_loss_clip": 1.04060555, "balance_loss_mlp": 1.05950165, "epoch": 0.06379077108071547, "flos": 13662610427520.0, "grad_norm": 1.8868508691495713, "language_loss": 0.87613583, "learning_rate": 3.988035112776035e-06, "loss": 0.89888501, "num_input_tokens_seen": 22679270, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.4921875, "step": 1061, "time_per_iteration": 2.4687447547912598 }, { "auxiliary_loss_clip": 0.01213542, "auxiliary_loss_mlp": 0.01065729, "balance_loss_clip": 1.03747606, "balance_loss_mlp": 1.05769682, "epoch": 0.06385089433338344, "flos": 28478849961600.0, "grad_norm": 2.5202307887458324, "language_loss": 0.76986229, "learning_rate": 3.987992537919185e-06, "loss": 0.79265499, "num_input_tokens_seen": 22699330, "router_z_loss_clip": 0.28320312, "router_z_loss_mlp": 1.5625, "step": 1062, "time_per_iteration": 2.55314302444458 }, { "auxiliary_loss_clip": 0.01209359, "auxiliary_loss_mlp": 0.01062256, "balance_loss_clip": 1.03744876, "balance_loss_mlp": 1.05714595, "epoch": 0.0639110175860514, "flos": 24311057028480.0, "grad_norm": 1.9167934888165599, "language_loss": 0.86556959, "learning_rate": 3.987949887677459e-06, "loss": 0.88828576, "num_input_tokens_seen": 22717945, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.5234375, "step": 1063, "time_per_iteration": 2.497105360031128 }, { "auxiliary_loss_clip": 0.01207622, "auxiliary_loss_mlp": 0.01060083, "balance_loss_clip": 1.03414285, "balance_loss_mlp": 1.05781198, "epoch": 0.06397114083871938, "flos": 22090772620800.0, "grad_norm": 2.2246139381687002, "language_loss": 0.80396897, "learning_rate": 3.9879071620524744e-06, "loss": 0.82664597, "num_input_tokens_seen": 22736790, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.5, "step": 1064, "time_per_iteration": 2.513364791870117 }, { "auxiliary_loss_clip": 0.01209385, "auxiliary_loss_mlp": 0.01072741, "balance_loss_clip": 1.04554939, "balance_loss_mlp": 1.05924428, "epoch": 0.06403126409138735, "flos": 19572824206080.0, "grad_norm": 2.3803890139567176, "language_loss": 0.84032035, "learning_rate": 3.987864361045851e-06, "loss": 0.86314166, "num_input_tokens_seen": 22754745, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.5, "step": 1065, "time_per_iteration": 2.4973270893096924 }, { "auxiliary_loss_clip": 0.01208615, "auxiliary_loss_mlp": 0.01057892, "balance_loss_clip": 1.03389502, "balance_loss_mlp": 1.06056023, "epoch": 0.06409138734405531, "flos": 40807413267840.0, "grad_norm": 1.6232320169083827, "language_loss": 0.68746126, "learning_rate": 3.987821484659211e-06, "loss": 0.7101264, "num_input_tokens_seen": 22776780, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.484375, "step": 1066, "time_per_iteration": 2.61976957321167 }, { "auxiliary_loss_clip": 0.01207804, "auxiliary_loss_mlp": 0.01079058, "balance_loss_clip": 1.05175865, "balance_loss_mlp": 1.05985284, "epoch": 0.06415151059672328, "flos": 20441610460800.0, "grad_norm": 1.8242070017638663, "language_loss": 0.90360266, "learning_rate": 3.987778532894181e-06, "loss": 0.92647123, "num_input_tokens_seen": 22793915, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.4765625, "step": 1067, "time_per_iteration": 2.4712328910827637 }, { "auxiliary_loss_clip": 0.01209957, "auxiliary_loss_mlp": 0.01064312, "balance_loss_clip": 1.03956461, "balance_loss_mlp": 1.05880713, "epoch": 0.06421163384939126, "flos": 18072045129600.0, "grad_norm": 2.5133891509590285, "language_loss": 0.83557338, "learning_rate": 3.987735505752391e-06, "loss": 0.85831606, "num_input_tokens_seen": 22812670, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.515625, "step": 1068, "time_per_iteration": 2.460989236831665 }, { "auxiliary_loss_clip": 0.01206505, "auxiliary_loss_mlp": 0.01062313, "balance_loss_clip": 1.03804231, "balance_loss_mlp": 1.06062579, "epoch": 0.06427175710205922, "flos": 25119442563840.0, "grad_norm": 2.364480872555252, "language_loss": 0.89616221, "learning_rate": 3.987692403235471e-06, "loss": 0.91885042, "num_input_tokens_seen": 22832440, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 1.4609375, "step": 1069, "time_per_iteration": 2.518115758895874 }, { "auxiliary_loss_clip": 0.01208759, "auxiliary_loss_mlp": 0.01077507, "balance_loss_clip": 1.05047035, "balance_loss_mlp": 1.05756497, "epoch": 0.06433188035472719, "flos": 17380549428480.0, "grad_norm": 5.693711675470714, "language_loss": 0.96283835, "learning_rate": 3.987649225345056e-06, "loss": 0.98570102, "num_input_tokens_seen": 22845495, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.515625, "step": 1070, "time_per_iteration": 2.431722640991211 }, { "auxiliary_loss_clip": 0.01209921, "auxiliary_loss_mlp": 0.01054571, "balance_loss_clip": 1.02760565, "balance_loss_mlp": 1.05899, "epoch": 0.06439200360739517, "flos": 23546267625600.0, "grad_norm": 1.8416249803959568, "language_loss": 0.88135201, "learning_rate": 3.987605972082782e-06, "loss": 0.90399694, "num_input_tokens_seen": 22865390, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.515625, "step": 1071, "time_per_iteration": 2.5255329608917236 }, { "auxiliary_loss_clip": 0.01204334, "auxiliary_loss_mlp": 0.01053122, "balance_loss_clip": 1.02880383, "balance_loss_mlp": 1.05682874, "epoch": 0.06445212686006313, "flos": 21979772616960.0, "grad_norm": 1.6125799386335764, "language_loss": 0.76150084, "learning_rate": 3.987562643450292e-06, "loss": 0.78407538, "num_input_tokens_seen": 22885495, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 1.4765625, "step": 1072, "time_per_iteration": 2.465691328048706 }, { "auxiliary_loss_clip": 0.01212361, "auxiliary_loss_mlp": 0.01073291, "balance_loss_clip": 1.04531288, "balance_loss_mlp": 1.05972528, "epoch": 0.0645122501127311, "flos": 25921291824000.0, "grad_norm": 2.0798485900542785, "language_loss": 0.80556959, "learning_rate": 3.987519239449226e-06, "loss": 0.82842612, "num_input_tokens_seen": 22904845, "router_z_loss_clip": 0.28125, "router_z_loss_mlp": 1.5234375, "step": 1073, "time_per_iteration": 2.507908344268799 }, { "auxiliary_loss_clip": 0.01200511, "auxiliary_loss_mlp": 0.01061007, "balance_loss_clip": 1.03591299, "balance_loss_mlp": 1.05665636, "epoch": 0.06457237336539907, "flos": 25626034028160.0, "grad_norm": 1.7986977953602907, "language_loss": 0.80321938, "learning_rate": 3.987475760081233e-06, "loss": 0.82583457, "num_input_tokens_seen": 22925940, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.4375, "step": 1074, "time_per_iteration": 2.4906723499298096 }, { "auxiliary_loss_clip": 0.01205889, "auxiliary_loss_mlp": 0.01067068, "balance_loss_clip": 1.04102039, "balance_loss_mlp": 1.0567565, "epoch": 0.06463249661806704, "flos": 19463979018240.0, "grad_norm": 1.6968477535323565, "language_loss": 0.79381537, "learning_rate": 3.987432205347958e-06, "loss": 0.81654495, "num_input_tokens_seen": 22944375, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.4921875, "step": 1075, "time_per_iteration": 2.4622578620910645 }, { "auxiliary_loss_clip": 0.01210577, "auxiliary_loss_mlp": 0.010619, "balance_loss_clip": 1.03848767, "balance_loss_mlp": 1.06197751, "epoch": 0.064692619870735, "flos": 24498044254080.0, "grad_norm": 2.8827857062115543, "language_loss": 0.8781386, "learning_rate": 3.987388575251055e-06, "loss": 0.90086341, "num_input_tokens_seen": 22959145, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.484375, "step": 1076, "time_per_iteration": 2.520028829574585 }, { "auxiliary_loss_clip": 0.01201648, "auxiliary_loss_mlp": 0.01054588, "balance_loss_clip": 1.02987599, "balance_loss_mlp": 1.05535316, "epoch": 0.06475274312340297, "flos": 17018677860480.0, "grad_norm": 2.0011437882067886, "language_loss": 0.80398929, "learning_rate": 3.98734486979218e-06, "loss": 0.82655162, "num_input_tokens_seen": 22978100, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.4609375, "step": 1077, "time_per_iteration": 2.465439558029175 }, { "auxiliary_loss_clip": 0.01209606, "auxiliary_loss_mlp": 0.01066813, "balance_loss_clip": 1.04059839, "balance_loss_mlp": 1.05679655, "epoch": 0.06481286637607095, "flos": 24572379450240.0, "grad_norm": 2.035372674006196, "language_loss": 0.91797376, "learning_rate": 3.987301088972986e-06, "loss": 0.94073796, "num_input_tokens_seen": 22997285, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.53125, "step": 1078, "time_per_iteration": 2.489267349243164 }, { "auxiliary_loss_clip": 0.01216155, "auxiliary_loss_mlp": 0.01061457, "balance_loss_clip": 1.03544521, "balance_loss_mlp": 1.06057835, "epoch": 0.06487298962873891, "flos": 21105635235840.0, "grad_norm": 2.0712228520032165, "language_loss": 0.78882909, "learning_rate": 3.987257232795137e-06, "loss": 0.81160522, "num_input_tokens_seen": 23016285, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.5546875, "step": 1079, "time_per_iteration": 2.473101854324341 }, { "auxiliary_loss_clip": 0.01205626, "auxiliary_loss_mlp": 0.01066825, "balance_loss_clip": 1.04042053, "balance_loss_mlp": 1.05621552, "epoch": 0.06493311288140688, "flos": 24608182331520.0, "grad_norm": 1.9449296887491883, "language_loss": 0.69416732, "learning_rate": 3.987213301260294e-06, "loss": 0.71689188, "num_input_tokens_seen": 23036420, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.5, "step": 1080, "time_per_iteration": 2.4856133460998535 }, { "auxiliary_loss_clip": 0.0120717, "auxiliary_loss_mlp": 0.01065134, "balance_loss_clip": 1.03814483, "balance_loss_mlp": 1.05621743, "epoch": 0.06499323613407486, "flos": 25337994865920.0, "grad_norm": 1.7114849515186765, "language_loss": 0.72148383, "learning_rate": 3.987169294370123e-06, "loss": 0.74420685, "num_input_tokens_seen": 23056945, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.5078125, "step": 1081, "time_per_iteration": 2.512742280960083 }, { "auxiliary_loss_clip": 0.01204144, "auxiliary_loss_mlp": 0.0106263, "balance_loss_clip": 1.03556919, "balance_loss_mlp": 1.05749607, "epoch": 0.06505335938674282, "flos": 20375714960640.0, "grad_norm": 2.548933905713627, "language_loss": 0.84203041, "learning_rate": 3.987125212126294e-06, "loss": 0.86469817, "num_input_tokens_seen": 23074940, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.46875, "step": 1082, "time_per_iteration": 2.4455103874206543 }, { "auxiliary_loss_clip": 0.01217173, "auxiliary_loss_mlp": 0.010688, "balance_loss_clip": 1.04214478, "balance_loss_mlp": 1.05945826, "epoch": 0.06511348263941079, "flos": 25337923038720.0, "grad_norm": 2.248068695795138, "language_loss": 0.8257345, "learning_rate": 3.987081054530478e-06, "loss": 0.84859419, "num_input_tokens_seen": 23093420, "router_z_loss_clip": 0.265625, "router_z_loss_mlp": 1.578125, "step": 1083, "time_per_iteration": 2.5088601112365723 }, { "auxiliary_loss_clip": 0.01209135, "auxiliary_loss_mlp": 0.01065139, "balance_loss_clip": 1.03828096, "balance_loss_mlp": 1.06000352, "epoch": 0.06517360589207877, "flos": 20332801186560.0, "grad_norm": 2.6192657624472164, "language_loss": 0.79774696, "learning_rate": 3.987036821584348e-06, "loss": 0.82048965, "num_input_tokens_seen": 23111550, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.4921875, "step": 1084, "time_per_iteration": 2.4502639770507812 }, { "auxiliary_loss_clip": 0.01207256, "auxiliary_loss_mlp": 0.01064251, "balance_loss_clip": 1.0388236, "balance_loss_mlp": 1.05852532, "epoch": 0.06523372914474673, "flos": 31681650061440.0, "grad_norm": 2.5087701237461815, "language_loss": 0.66072226, "learning_rate": 3.986992513289584e-06, "loss": 0.68343735, "num_input_tokens_seen": 23130335, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.484375, "step": 1085, "time_per_iteration": 2.5452492237091064 }, { "auxiliary_loss_clip": 0.01202811, "auxiliary_loss_mlp": 0.01070227, "balance_loss_clip": 1.04516947, "balance_loss_mlp": 1.05727208, "epoch": 0.0652938523974147, "flos": 20778165918720.0, "grad_norm": 1.8014126652788516, "language_loss": 0.76409161, "learning_rate": 3.9869481296478645e-06, "loss": 0.78682196, "num_input_tokens_seen": 23152380, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.453125, "step": 1086, "time_per_iteration": 2.483318567276001 }, { "auxiliary_loss_clip": 0.01203447, "auxiliary_loss_mlp": 0.01064127, "balance_loss_clip": 1.0381155, "balance_loss_mlp": 1.05629611, "epoch": 0.06535397565008266, "flos": 16690993061760.0, "grad_norm": 4.917990870961866, "language_loss": 0.84926867, "learning_rate": 3.986903670660872e-06, "loss": 0.87194443, "num_input_tokens_seen": 23171630, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.46875, "step": 1087, "time_per_iteration": 2.464637041091919 }, { "auxiliary_loss_clip": 0.01207206, "auxiliary_loss_mlp": 0.01058255, "balance_loss_clip": 1.03273261, "balance_loss_mlp": 1.05785131, "epoch": 0.06541409890275064, "flos": 26868220116480.0, "grad_norm": 1.7724625326744359, "language_loss": 0.78121275, "learning_rate": 3.9868591363302945e-06, "loss": 0.8038674, "num_input_tokens_seen": 23192520, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.5, "step": 1088, "time_per_iteration": 2.5083367824554443 }, { "auxiliary_loss_clip": 0.01207727, "auxiliary_loss_mlp": 0.01071431, "balance_loss_clip": 1.0478754, "balance_loss_mlp": 1.05998039, "epoch": 0.06547422215541861, "flos": 20521620005760.0, "grad_norm": 1.7073570222506675, "language_loss": 0.71451914, "learning_rate": 3.9868145266578186e-06, "loss": 0.73731077, "num_input_tokens_seen": 23210710, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 1.4765625, "step": 1089, "time_per_iteration": 2.470134973526001 }, { "auxiliary_loss_clip": 0.0120431, "auxiliary_loss_mlp": 0.01063577, "balance_loss_clip": 1.03994942, "balance_loss_mlp": 1.05846596, "epoch": 0.06553434540808657, "flos": 22016616992640.0, "grad_norm": 9.194662664055365, "language_loss": 0.85697252, "learning_rate": 3.9867698416451366e-06, "loss": 0.87965131, "num_input_tokens_seen": 23230305, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.453125, "step": 1090, "time_per_iteration": 2.4730494022369385 }, { "auxiliary_loss_clip": 0.01205993, "auxiliary_loss_mlp": 0.01059969, "balance_loss_clip": 1.03491092, "balance_loss_mlp": 1.05803823, "epoch": 0.06559446866075455, "flos": 24608649208320.0, "grad_norm": 1.8315813407091397, "language_loss": 0.72048986, "learning_rate": 3.9867250812939434e-06, "loss": 0.74314952, "num_input_tokens_seen": 23249015, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.4765625, "step": 1091, "time_per_iteration": 2.532548666000366 }, { "auxiliary_loss_clip": 0.01203903, "auxiliary_loss_mlp": 0.01065538, "balance_loss_clip": 1.04015779, "balance_loss_mlp": 1.05583608, "epoch": 0.06565459191342252, "flos": 24274679529600.0, "grad_norm": 2.4028006008750213, "language_loss": 0.82810128, "learning_rate": 3.986680245605936e-06, "loss": 0.85079575, "num_input_tokens_seen": 23265105, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.484375, "step": 1092, "time_per_iteration": 2.487614393234253 }, { "auxiliary_loss_clip": 0.01209617, "auxiliary_loss_mlp": 0.01065251, "balance_loss_clip": 1.03821421, "balance_loss_mlp": 1.05749512, "epoch": 0.06571471516609048, "flos": 24787124910720.0, "grad_norm": 1.8520877006543277, "language_loss": 0.71006465, "learning_rate": 3.986635334582814e-06, "loss": 0.73281336, "num_input_tokens_seen": 23283950, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.515625, "step": 1093, "time_per_iteration": 2.5074195861816406 }, { "auxiliary_loss_clip": 0.01206082, "auxiliary_loss_mlp": 0.01063336, "balance_loss_clip": 1.03640628, "balance_loss_mlp": 1.05843079, "epoch": 0.06577483841875846, "flos": 26214071581440.0, "grad_norm": 1.6199577714488957, "language_loss": 0.88004327, "learning_rate": 3.986590348226282e-06, "loss": 0.90273738, "num_input_tokens_seen": 23305005, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.4765625, "step": 1094, "time_per_iteration": 2.5085091590881348 }, { "auxiliary_loss_clip": 0.01210819, "auxiliary_loss_mlp": 0.01061914, "balance_loss_clip": 1.03461528, "balance_loss_mlp": 1.05994225, "epoch": 0.06583496167142643, "flos": 25080802508160.0, "grad_norm": 1.5579495153216782, "language_loss": 0.81425864, "learning_rate": 3.986545286538044e-06, "loss": 0.83698601, "num_input_tokens_seen": 23323220, "router_z_loss_clip": 0.2734375, "router_z_loss_mlp": 1.5078125, "step": 1095, "time_per_iteration": 2.5004427433013916 }, { "auxiliary_loss_clip": 0.01204387, "auxiliary_loss_mlp": 0.01059213, "balance_loss_clip": 1.03633726, "balance_loss_mlp": 1.05826712, "epoch": 0.06589508492409439, "flos": 25629804956160.0, "grad_norm": 2.1115896147924764, "language_loss": 0.70030451, "learning_rate": 3.986500149519811e-06, "loss": 0.72294044, "num_input_tokens_seen": 23342235, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.4609375, "step": 1096, "time_per_iteration": 2.482991933822632 }, { "auxiliary_loss_clip": 0.01210147, "auxiliary_loss_mlp": 0.01071842, "balance_loss_clip": 1.04599786, "balance_loss_mlp": 1.06140614, "epoch": 0.06595520817676236, "flos": 23621249266560.0, "grad_norm": 2.259507491294274, "language_loss": 0.77606744, "learning_rate": 3.986454937173292e-06, "loss": 0.79888737, "num_input_tokens_seen": 23363680, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.484375, "step": 1097, "time_per_iteration": 2.4790666103363037 }, { "auxiliary_loss_clip": 0.01208354, "auxiliary_loss_mlp": 0.01064423, "balance_loss_clip": 1.03940046, "balance_loss_mlp": 1.05830264, "epoch": 0.06601533142943034, "flos": 33801708545280.0, "grad_norm": 1.8632181024573358, "language_loss": 0.78204632, "learning_rate": 3.986409649500203e-06, "loss": 0.80477405, "num_input_tokens_seen": 23385590, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.5, "step": 1098, "time_per_iteration": 4.068421840667725 }, { "auxiliary_loss_clip": 0.01207007, "auxiliary_loss_mlp": 0.01076259, "balance_loss_clip": 1.0495199, "balance_loss_mlp": 1.05991626, "epoch": 0.0660754546820983, "flos": 20259184262400.0, "grad_norm": 1.776107818365578, "language_loss": 0.81720752, "learning_rate": 3.986364286502261e-06, "loss": 0.84004021, "num_input_tokens_seen": 23402945, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.4765625, "step": 1099, "time_per_iteration": 3.8823423385620117 }, { "auxiliary_loss_clip": 0.01199202, "auxiliary_loss_mlp": 0.01058277, "balance_loss_clip": 1.0332191, "balance_loss_mlp": 1.05415189, "epoch": 0.06613557793476627, "flos": 19354164163200.0, "grad_norm": 1.9854225310859965, "language_loss": 0.82808131, "learning_rate": 3.986318848181186e-06, "loss": 0.85065615, "num_input_tokens_seen": 23421410, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.4453125, "step": 1100, "time_per_iteration": 3.8695931434631348 }, { "auxiliary_loss_clip": 0.01205841, "auxiliary_loss_mlp": 0.01060579, "balance_loss_clip": 1.03578341, "balance_loss_mlp": 1.0593816, "epoch": 0.06619570118743424, "flos": 13772568936960.0, "grad_norm": 3.4084944539544546, "language_loss": 0.73612678, "learning_rate": 3.986273334538702e-06, "loss": 0.75879097, "num_input_tokens_seen": 23438870, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.46875, "step": 1101, "time_per_iteration": 2.4344277381896973 }, { "auxiliary_loss_clip": 0.01202387, "auxiliary_loss_mlp": 0.01063972, "balance_loss_clip": 1.03968871, "balance_loss_mlp": 1.05542636, "epoch": 0.06625582444010221, "flos": 17857874286720.0, "grad_norm": 2.3668910256104687, "language_loss": 0.85866511, "learning_rate": 3.986227745576533e-06, "loss": 0.8813287, "num_input_tokens_seen": 23456975, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.46875, "step": 1102, "time_per_iteration": 2.446394443511963 }, { "auxiliary_loss_clip": 0.01204169, "auxiliary_loss_mlp": 0.01057883, "balance_loss_clip": 1.03278947, "balance_loss_mlp": 1.05836833, "epoch": 0.06631594769277017, "flos": 11838707579520.0, "grad_norm": 2.039637458522551, "language_loss": 0.81715512, "learning_rate": 3.98618208129641e-06, "loss": 0.83977568, "num_input_tokens_seen": 23473440, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.453125, "step": 1103, "time_per_iteration": 2.4427337646484375 }, { "auxiliary_loss_clip": 0.01207621, "auxiliary_loss_mlp": 0.01062344, "balance_loss_clip": 1.03900325, "balance_loss_mlp": 1.06213069, "epoch": 0.06637607094543815, "flos": 19793351756160.0, "grad_norm": 2.0168228186292523, "language_loss": 0.82315373, "learning_rate": 3.986136341700063e-06, "loss": 0.84585339, "num_input_tokens_seen": 23493880, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.453125, "step": 1104, "time_per_iteration": 2.4874205589294434 }, { "auxiliary_loss_clip": 0.0119833, "auxiliary_loss_mlp": 0.01051774, "balance_loss_clip": 1.02566743, "balance_loss_mlp": 1.05288267, "epoch": 0.06643619419810612, "flos": 25485659677440.0, "grad_norm": 1.6769428014064884, "language_loss": 0.80554259, "learning_rate": 3.986090526789227e-06, "loss": 0.82804358, "num_input_tokens_seen": 23514920, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.453125, "step": 1105, "time_per_iteration": 2.5046746730804443 }, { "auxiliary_loss_clip": 0.01200038, "auxiliary_loss_mlp": 0.01060093, "balance_loss_clip": 1.03725207, "balance_loss_mlp": 1.057868, "epoch": 0.06649631745077408, "flos": 16946533393920.0, "grad_norm": 1.7910313793845003, "language_loss": 0.9697423, "learning_rate": 3.986044636565639e-06, "loss": 0.99234366, "num_input_tokens_seen": 23531635, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.421875, "step": 1106, "time_per_iteration": 2.4553775787353516 }, { "auxiliary_loss_clip": 0.01207399, "auxiliary_loss_mlp": 0.01061726, "balance_loss_clip": 1.03639364, "balance_loss_mlp": 1.05717278, "epoch": 0.06655644070344206, "flos": 17858592558720.0, "grad_norm": 2.0337051646969306, "language_loss": 0.82567394, "learning_rate": 3.985998671031039e-06, "loss": 0.84836519, "num_input_tokens_seen": 23551020, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.5078125, "step": 1107, "time_per_iteration": 2.44316029548645 }, { "auxiliary_loss_clip": 0.01085608, "auxiliary_loss_mlp": 0.01008438, "balance_loss_clip": 1.00400341, "balance_loss_mlp": 1.02718079, "epoch": 0.06661656395611003, "flos": 61419350021760.0, "grad_norm": 0.7946422928279512, "language_loss": 0.56813908, "learning_rate": 3.9859526301871705e-06, "loss": 0.5890795, "num_input_tokens_seen": 23610675, "router_z_loss_clip": 0.04443359, "router_z_loss_mlp": 0.5859375, "step": 1108, "time_per_iteration": 3.061859607696533 }, { "auxiliary_loss_clip": 0.01202552, "auxiliary_loss_mlp": 0.01063695, "balance_loss_clip": 1.03811252, "balance_loss_mlp": 1.0550096, "epoch": 0.066676687208778, "flos": 20662856282880.0, "grad_norm": 4.838674963071793, "language_loss": 0.72814894, "learning_rate": 3.9859065140357795e-06, "loss": 0.75081134, "num_input_tokens_seen": 23628710, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.4765625, "step": 1109, "time_per_iteration": 2.521239757537842 }, { "auxiliary_loss_clip": 0.01199781, "auxiliary_loss_mlp": 0.01064081, "balance_loss_clip": 1.03808117, "balance_loss_mlp": 1.05392838, "epoch": 0.06673681046144596, "flos": 20923280864640.0, "grad_norm": 1.7173087933128681, "language_loss": 0.77959418, "learning_rate": 3.985860322578614e-06, "loss": 0.8022328, "num_input_tokens_seen": 23649160, "router_z_loss_clip": 0.25976562, "router_z_loss_mlp": 1.453125, "step": 1110, "time_per_iteration": 2.4560108184814453 }, { "auxiliary_loss_clip": 0.01201969, "auxiliary_loss_mlp": 0.01057079, "balance_loss_clip": 1.03360653, "balance_loss_mlp": 1.05575609, "epoch": 0.06679693371411394, "flos": 31065818359680.0, "grad_norm": 1.918494667266849, "language_loss": 0.71699804, "learning_rate": 3.985814055817427e-06, "loss": 0.7395885, "num_input_tokens_seen": 23671995, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.46875, "step": 1111, "time_per_iteration": 2.5565083026885986 }, { "auxiliary_loss_clip": 0.01207403, "auxiliary_loss_mlp": 0.01068193, "balance_loss_clip": 1.04394555, "balance_loss_mlp": 1.05802441, "epoch": 0.0668570569667819, "flos": 21726135705600.0, "grad_norm": 1.7996971544071971, "language_loss": 0.78520977, "learning_rate": 3.985767713753971e-06, "loss": 0.80796576, "num_input_tokens_seen": 23690705, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.4921875, "step": 1112, "time_per_iteration": 2.507981538772583 }, { "auxiliary_loss_clip": 0.01204082, "auxiliary_loss_mlp": 0.01059309, "balance_loss_clip": 1.03521681, "balance_loss_mlp": 1.0565542, "epoch": 0.06691718021944987, "flos": 22747255539840.0, "grad_norm": 2.1836003960045214, "language_loss": 0.79052186, "learning_rate": 3.985721296390005e-06, "loss": 0.81315577, "num_input_tokens_seen": 23709990, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.4765625, "step": 1113, "time_per_iteration": 2.48008131980896 }, { "auxiliary_loss_clip": 0.01195249, "auxiliary_loss_mlp": 0.01052674, "balance_loss_clip": 1.02929699, "balance_loss_mlp": 1.05175591, "epoch": 0.06697730347211785, "flos": 16545626720640.0, "grad_norm": 2.0572667515207863, "language_loss": 0.8236661, "learning_rate": 3.985674803727289e-06, "loss": 0.84614533, "num_input_tokens_seen": 23728485, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.4375, "step": 1114, "time_per_iteration": 2.4265692234039307 }, { "auxiliary_loss_clip": 0.01080924, "auxiliary_loss_mlp": 0.010053, "balance_loss_clip": 1.00124645, "balance_loss_mlp": 1.02262926, "epoch": 0.06703742672478581, "flos": 59782326658560.0, "grad_norm": 0.8279985817435634, "language_loss": 0.58190584, "learning_rate": 3.985628235767584e-06, "loss": 0.60276806, "num_input_tokens_seen": 23786650, "router_z_loss_clip": 0.04052734, "router_z_loss_mlp": 0.58203125, "step": 1115, "time_per_iteration": 3.0433883666992188 }, { "auxiliary_loss_clip": 0.01202053, "auxiliary_loss_mlp": 0.01059731, "balance_loss_clip": 1.03354025, "balance_loss_mlp": 1.05614424, "epoch": 0.06709754997745378, "flos": 16800197385600.0, "grad_norm": 2.442676645122879, "language_loss": 0.91452599, "learning_rate": 3.985581592512658e-06, "loss": 0.9371438, "num_input_tokens_seen": 23802555, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.4609375, "step": 1116, "time_per_iteration": 2.4243273735046387 }, { "auxiliary_loss_clip": 0.01209966, "auxiliary_loss_mlp": 0.01063666, "balance_loss_clip": 1.0384059, "balance_loss_mlp": 1.05995727, "epoch": 0.06715767323012176, "flos": 22123917895680.0, "grad_norm": 1.9142107597087712, "language_loss": 0.87094486, "learning_rate": 3.985534873964279e-06, "loss": 0.89368117, "num_input_tokens_seen": 23822945, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.5, "step": 1117, "time_per_iteration": 2.518495798110962 }, { "auxiliary_loss_clip": 0.01077908, "auxiliary_loss_mlp": 0.01004046, "balance_loss_clip": 1.0003264, "balance_loss_mlp": 1.01994848, "epoch": 0.06721779648278972, "flos": 66618100137600.0, "grad_norm": 0.8712113575336745, "language_loss": 0.59790039, "learning_rate": 3.985488080124218e-06, "loss": 0.61871994, "num_input_tokens_seen": 23874075, "router_z_loss_clip": 0.03710938, "router_z_loss_mlp": 0.578125, "step": 1118, "time_per_iteration": 3.023294687271118 }, { "auxiliary_loss_clip": 0.01202467, "auxiliary_loss_mlp": 0.0105338, "balance_loss_clip": 1.02892995, "balance_loss_mlp": 1.05427289, "epoch": 0.06727791973545769, "flos": 22382474970240.0, "grad_norm": 3.457767048124595, "language_loss": 0.83945805, "learning_rate": 3.985441210994251e-06, "loss": 0.86201656, "num_input_tokens_seen": 23889720, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.484375, "step": 1119, "time_per_iteration": 2.4752655029296875 }, { "auxiliary_loss_clip": 0.01199649, "auxiliary_loss_mlp": 0.01056803, "balance_loss_clip": 1.03424895, "balance_loss_mlp": 1.05605721, "epoch": 0.06733804298812565, "flos": 24280210224000.0, "grad_norm": 1.7859909001173666, "language_loss": 0.84715188, "learning_rate": 3.9853942665761545e-06, "loss": 0.86971635, "num_input_tokens_seen": 23909385, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 1.4375, "step": 1120, "time_per_iteration": 2.486453056335449 }, { "auxiliary_loss_clip": 0.01209017, "auxiliary_loss_mlp": 0.01067492, "balance_loss_clip": 1.04252946, "balance_loss_mlp": 1.06031275, "epoch": 0.06739816624079363, "flos": 15918230839680.0, "grad_norm": 1.8211200433920853, "language_loss": 0.79157972, "learning_rate": 3.985347246871708e-06, "loss": 0.81434482, "num_input_tokens_seen": 23926830, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.484375, "step": 1121, "time_per_iteration": 2.4519882202148438 }, { "auxiliary_loss_clip": 0.0107425, "auxiliary_loss_mlp": 0.01004997, "balance_loss_clip": 1.00115836, "balance_loss_mlp": 1.01638162, "epoch": 0.0674582894934616, "flos": 71398567353600.0, "grad_norm": 0.7569855624147593, "language_loss": 0.58411789, "learning_rate": 3.985300151882694e-06, "loss": 0.60491037, "num_input_tokens_seen": 23992640, "router_z_loss_clip": 0.03833008, "router_z_loss_mlp": 0.578125, "step": 1122, "time_per_iteration": 3.2340753078460693 }, { "auxiliary_loss_clip": 0.01205614, "auxiliary_loss_mlp": 0.01057838, "balance_loss_clip": 1.03421092, "balance_loss_mlp": 1.05961275, "epoch": 0.06751841274612956, "flos": 25264952559360.0, "grad_norm": 1.9589377057476867, "language_loss": 0.71745604, "learning_rate": 3.985252981610901e-06, "loss": 0.74009061, "num_input_tokens_seen": 24011135, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.4609375, "step": 1123, "time_per_iteration": 2.5027151107788086 }, { "auxiliary_loss_clip": 0.01203689, "auxiliary_loss_mlp": 0.01056946, "balance_loss_clip": 1.03136373, "balance_loss_mlp": 1.05664945, "epoch": 0.06757853599879754, "flos": 23802741711360.0, "grad_norm": 1.7317568414736735, "language_loss": 0.79180348, "learning_rate": 3.985205736058114e-06, "loss": 0.81440985, "num_input_tokens_seen": 24030695, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.46875, "step": 1124, "time_per_iteration": 2.5012052059173584 }, { "auxiliary_loss_clip": 0.01198975, "auxiliary_loss_mlp": 0.01053054, "balance_loss_clip": 1.03026152, "balance_loss_mlp": 1.05585647, "epoch": 0.0676386592514655, "flos": 21033742164480.0, "grad_norm": 2.3656865582835827, "language_loss": 0.71427584, "learning_rate": 3.985158415226128e-06, "loss": 0.73679614, "num_input_tokens_seen": 24050680, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.4296875, "step": 1125, "time_per_iteration": 2.469292402267456 }, { "auxiliary_loss_clip": 0.01204392, "auxiliary_loss_mlp": 0.01069482, "balance_loss_clip": 1.04275525, "balance_loss_mlp": 1.05881929, "epoch": 0.06769878250413347, "flos": 25556331686400.0, "grad_norm": 2.6869836934768796, "language_loss": 0.81312382, "learning_rate": 3.985111019116736e-06, "loss": 0.83586252, "num_input_tokens_seen": 24067205, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.453125, "step": 1126, "time_per_iteration": 2.572234630584717 }, { "auxiliary_loss_clip": 0.01070721, "auxiliary_loss_mlp": 0.010101, "balance_loss_clip": 1.00623798, "balance_loss_mlp": 1.01308107, "epoch": 0.06775890575680145, "flos": 70655251305600.0, "grad_norm": 0.7763117026320523, "language_loss": 0.59744799, "learning_rate": 3.985063547731735e-06, "loss": 0.61825621, "num_input_tokens_seen": 24131320, "router_z_loss_clip": 0.03857422, "router_z_loss_mlp": 0.578125, "step": 1127, "time_per_iteration": 3.1070668697357178 }, { "auxiliary_loss_clip": 0.01202041, "auxiliary_loss_mlp": 0.01059438, "balance_loss_clip": 1.03498864, "balance_loss_mlp": 1.05776978, "epoch": 0.06781902900946941, "flos": 24235500769920.0, "grad_norm": 2.078039860633893, "language_loss": 0.81609917, "learning_rate": 3.985016001072925e-06, "loss": 0.838714, "num_input_tokens_seen": 24149930, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.4375, "step": 1128, "time_per_iteration": 2.5032145977020264 }, { "auxiliary_loss_clip": 0.01209448, "auxiliary_loss_mlp": 0.01052869, "balance_loss_clip": 1.02770352, "balance_loss_mlp": 1.0605042, "epoch": 0.06787915226213738, "flos": 22417523665920.0, "grad_norm": 2.1635536602762437, "language_loss": 0.75925565, "learning_rate": 3.984968379142109e-06, "loss": 0.78187877, "num_input_tokens_seen": 24169590, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.484375, "step": 1129, "time_per_iteration": 2.457608461380005 }, { "auxiliary_loss_clip": 0.01204357, "auxiliary_loss_mlp": 0.0106157, "balance_loss_clip": 1.03695321, "balance_loss_mlp": 1.05714107, "epoch": 0.06793927551480534, "flos": 37706922080640.0, "grad_norm": 1.8262073234483707, "language_loss": 0.72000813, "learning_rate": 3.984920681941094e-06, "loss": 0.74266744, "num_input_tokens_seen": 24189965, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.46875, "step": 1130, "time_per_iteration": 2.6096818447113037 }, { "auxiliary_loss_clip": 0.01201539, "auxiliary_loss_mlp": 0.01063865, "balance_loss_clip": 1.0391407, "balance_loss_mlp": 1.05764687, "epoch": 0.06799939876747332, "flos": 20631398947200.0, "grad_norm": 2.021381379098316, "language_loss": 0.80585444, "learning_rate": 3.984872909471688e-06, "loss": 0.8285085, "num_input_tokens_seen": 24208045, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 1.4375, "step": 1131, "time_per_iteration": 2.4881677627563477 }, { "auxiliary_loss_clip": 0.01198513, "auxiliary_loss_mlp": 0.01067934, "balance_loss_clip": 1.04325747, "balance_loss_mlp": 1.05619335, "epoch": 0.06805952202014129, "flos": 14864755829760.0, "grad_norm": 2.247183338031825, "language_loss": 0.80636942, "learning_rate": 3.984825061735701e-06, "loss": 0.82903397, "num_input_tokens_seen": 24223805, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.421875, "step": 1132, "time_per_iteration": 2.451589345932007 }, { "auxiliary_loss_clip": 0.01201199, "auxiliary_loss_mlp": 0.01066395, "balance_loss_clip": 1.04156363, "balance_loss_mlp": 1.05643439, "epoch": 0.06811964527280925, "flos": 48909434947200.0, "grad_norm": 1.4397329609269784, "language_loss": 0.6357882, "learning_rate": 3.9847771387349495e-06, "loss": 0.65846413, "num_input_tokens_seen": 24249475, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.4453125, "step": 1133, "time_per_iteration": 2.7115650177001953 }, { "auxiliary_loss_clip": 0.01203427, "auxiliary_loss_mlp": 0.0105661, "balance_loss_clip": 1.02909684, "balance_loss_mlp": 1.05456424, "epoch": 0.06817976852547723, "flos": 15377273038080.0, "grad_norm": 2.0195687268621807, "language_loss": 0.74721098, "learning_rate": 3.9847291404712506e-06, "loss": 0.76981133, "num_input_tokens_seen": 24267980, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.484375, "step": 1134, "time_per_iteration": 2.4575564861297607 }, { "auxiliary_loss_clip": 0.01202241, "auxiliary_loss_mlp": 0.01060373, "balance_loss_clip": 1.03718674, "balance_loss_mlp": 1.05957091, "epoch": 0.0682398917781452, "flos": 20155690200960.0, "grad_norm": 2.9395232896548444, "language_loss": 0.87222433, "learning_rate": 3.984681066946423e-06, "loss": 0.89485049, "num_input_tokens_seen": 24286805, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.421875, "step": 1135, "time_per_iteration": 2.4445958137512207 }, { "auxiliary_loss_clip": 0.01203822, "auxiliary_loss_mlp": 0.01055563, "balance_loss_clip": 1.03008771, "balance_loss_mlp": 1.05556536, "epoch": 0.06830001503081316, "flos": 23440618748160.0, "grad_norm": 2.3506544437944927, "language_loss": 0.78577346, "learning_rate": 3.984632918162291e-06, "loss": 0.80836737, "num_input_tokens_seen": 24305855, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.484375, "step": 1136, "time_per_iteration": 2.482506275177002 }, { "auxiliary_loss_clip": 0.01208516, "auxiliary_loss_mlp": 0.01069434, "balance_loss_clip": 1.0448885, "balance_loss_mlp": 1.0625937, "epoch": 0.06836013828348114, "flos": 34349813153280.0, "grad_norm": 2.1182501666615585, "language_loss": 0.83987415, "learning_rate": 3.984584694120679e-06, "loss": 0.86265367, "num_input_tokens_seen": 24326535, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.453125, "step": 1137, "time_per_iteration": 2.5643889904022217 }, { "auxiliary_loss_clip": 0.01199468, "auxiliary_loss_mlp": 0.01060012, "balance_loss_clip": 1.03549027, "balance_loss_mlp": 1.0566082, "epoch": 0.06842026153614911, "flos": 23148844571520.0, "grad_norm": 2.3166542698007953, "language_loss": 0.78944778, "learning_rate": 3.984536394823418e-06, "loss": 0.81204259, "num_input_tokens_seen": 24345810, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.4296875, "step": 1138, "time_per_iteration": 2.490246534347534 }, { "auxiliary_loss_clip": 0.01205357, "auxiliary_loss_mlp": 0.01055272, "balance_loss_clip": 1.03076315, "balance_loss_mlp": 1.0584929, "epoch": 0.06848038478881707, "flos": 24608972430720.0, "grad_norm": 3.612587476829004, "language_loss": 0.85460699, "learning_rate": 3.984488020272336e-06, "loss": 0.87721324, "num_input_tokens_seen": 24366095, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.46875, "step": 1139, "time_per_iteration": 2.4978857040405273 }, { "auxiliary_loss_clip": 0.01202915, "auxiliary_loss_mlp": 0.01057193, "balance_loss_clip": 1.0323379, "balance_loss_mlp": 1.05817282, "epoch": 0.06854050804148504, "flos": 40880994278400.0, "grad_norm": 2.2104321496145807, "language_loss": 0.74908942, "learning_rate": 3.984439570469271e-06, "loss": 0.77169049, "num_input_tokens_seen": 24388665, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.4453125, "step": 1140, "time_per_iteration": 4.0662548542022705 }, { "auxiliary_loss_clip": 0.01200304, "auxiliary_loss_mlp": 0.01067162, "balance_loss_clip": 1.0415442, "balance_loss_mlp": 1.05759752, "epoch": 0.06860063129415302, "flos": 31686354743040.0, "grad_norm": 2.441157404870021, "language_loss": 0.68150318, "learning_rate": 3.9843910454160574e-06, "loss": 0.70417786, "num_input_tokens_seen": 24407705, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.4296875, "step": 1141, "time_per_iteration": 6.872981309890747 }, { "auxiliary_loss_clip": 0.01206205, "auxiliary_loss_mlp": 0.01066719, "balance_loss_clip": 1.04029036, "balance_loss_mlp": 1.05938005, "epoch": 0.06866075454682098, "flos": 26542007775360.0, "grad_norm": 4.615793391391671, "language_loss": 0.79479921, "learning_rate": 3.984342445114538e-06, "loss": 0.81752843, "num_input_tokens_seen": 24428390, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.46875, "step": 1142, "time_per_iteration": 2.52908992767334 }, { "auxiliary_loss_clip": 0.01203112, "auxiliary_loss_mlp": 0.01060314, "balance_loss_clip": 1.03580475, "balance_loss_mlp": 1.05906487, "epoch": 0.06872087779948895, "flos": 29789768724480.0, "grad_norm": 1.7812590652460294, "language_loss": 0.68556535, "learning_rate": 3.984293769566553e-06, "loss": 0.70819962, "num_input_tokens_seen": 24450810, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.4375, "step": 1143, "time_per_iteration": 2.5400502681732178 }, { "auxiliary_loss_clip": 0.01197971, "auxiliary_loss_mlp": 0.01058946, "balance_loss_clip": 1.03666532, "balance_loss_mlp": 1.06017971, "epoch": 0.06878100105215693, "flos": 26941118768640.0, "grad_norm": 1.8571517095159615, "language_loss": 0.74535871, "learning_rate": 3.98424501877395e-06, "loss": 0.76792789, "num_input_tokens_seen": 24469965, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.375, "step": 1144, "time_per_iteration": 2.5308852195739746 }, { "auxiliary_loss_clip": 0.01208738, "auxiliary_loss_mlp": 0.01062725, "balance_loss_clip": 1.03666568, "balance_loss_mlp": 1.05933201, "epoch": 0.06884112430482489, "flos": 10670748946560.0, "grad_norm": 2.098109992180984, "language_loss": 0.91785717, "learning_rate": 3.984196192738577e-06, "loss": 0.94057178, "num_input_tokens_seen": 24486370, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.5, "step": 1145, "time_per_iteration": 2.448420763015747 }, { "auxiliary_loss_clip": 0.01210536, "auxiliary_loss_mlp": 0.01069474, "balance_loss_clip": 1.04263961, "balance_loss_mlp": 1.05935025, "epoch": 0.06890124755749286, "flos": 20193647898240.0, "grad_norm": 6.446468876443106, "language_loss": 0.82170916, "learning_rate": 3.984147291462285e-06, "loss": 0.84450924, "num_input_tokens_seen": 24503780, "router_z_loss_clip": 0.26953125, "router_z_loss_mlp": 1.515625, "step": 1146, "time_per_iteration": 2.4779090881347656 }, { "auxiliary_loss_clip": 0.01203034, "auxiliary_loss_mlp": 0.01065404, "balance_loss_clip": 1.04165792, "balance_loss_mlp": 1.06041336, "epoch": 0.06896137081016084, "flos": 20449224144000.0, "grad_norm": 2.183821034979968, "language_loss": 0.85199469, "learning_rate": 3.98409831494693e-06, "loss": 0.87467903, "num_input_tokens_seen": 24522320, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.421875, "step": 1147, "time_per_iteration": 2.4666836261749268 }, { "auxiliary_loss_clip": 0.01204926, "auxiliary_loss_mlp": 0.01067175, "balance_loss_clip": 1.04211724, "balance_loss_mlp": 1.0597806, "epoch": 0.0690214940628288, "flos": 18368703555840.0, "grad_norm": 1.7200858912371721, "language_loss": 0.85954094, "learning_rate": 3.984049263194367e-06, "loss": 0.88226193, "num_input_tokens_seen": 24540445, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.453125, "step": 1148, "time_per_iteration": 2.4448983669281006 }, { "auxiliary_loss_clip": 0.01205291, "auxiliary_loss_mlp": 0.01058617, "balance_loss_clip": 1.03397632, "balance_loss_mlp": 1.05967283, "epoch": 0.06908161731549677, "flos": 20558033418240.0, "grad_norm": 2.90431715565894, "language_loss": 0.69073176, "learning_rate": 3.9840001362064575e-06, "loss": 0.71337092, "num_input_tokens_seen": 24557105, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.453125, "step": 1149, "time_per_iteration": 2.4588494300842285 }, { "auxiliary_loss_clip": 0.01208353, "auxiliary_loss_mlp": 0.01054845, "balance_loss_clip": 1.02932286, "balance_loss_mlp": 1.05971169, "epoch": 0.06914174056816474, "flos": 27563666313600.0, "grad_norm": 1.9155451603648272, "language_loss": 0.83941019, "learning_rate": 3.983950933985064e-06, "loss": 0.86204219, "num_input_tokens_seen": 24578240, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.484375, "step": 1150, "time_per_iteration": 2.527686834335327 }, { "auxiliary_loss_clip": 0.01210176, "auxiliary_loss_mlp": 0.01062277, "balance_loss_clip": 1.03702855, "balance_loss_mlp": 1.06356823, "epoch": 0.06920186382083271, "flos": 15304015249920.0, "grad_norm": 4.260253216308726, "language_loss": 0.8155511, "learning_rate": 3.983901656532052e-06, "loss": 0.83827561, "num_input_tokens_seen": 24593585, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.46875, "step": 1151, "time_per_iteration": 2.4315848350524902 }, { "auxiliary_loss_clip": 0.01207327, "auxiliary_loss_mlp": 0.01063324, "balance_loss_clip": 1.03937459, "balance_loss_mlp": 1.06446791, "epoch": 0.06926198707350067, "flos": 25191227894400.0, "grad_norm": 1.8202335771073468, "language_loss": 0.85529912, "learning_rate": 3.983852303849291e-06, "loss": 0.87800562, "num_input_tokens_seen": 24613110, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.4296875, "step": 1152, "time_per_iteration": 2.5262417793273926 }, { "auxiliary_loss_clip": 0.01203164, "auxiliary_loss_mlp": 0.01062182, "balance_loss_clip": 1.03866208, "balance_loss_mlp": 1.06120753, "epoch": 0.06932211032616864, "flos": 13256137146240.0, "grad_norm": 2.1222359108617526, "language_loss": 0.90695065, "learning_rate": 3.983802875938651e-06, "loss": 0.92960411, "num_input_tokens_seen": 24628795, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 1.421875, "step": 1153, "time_per_iteration": 2.4298641681671143 }, { "auxiliary_loss_clip": 0.01207672, "auxiliary_loss_mlp": 0.01056144, "balance_loss_clip": 1.03176546, "balance_loss_mlp": 1.06300271, "epoch": 0.06938223357883662, "flos": 24827381078400.0, "grad_norm": 2.1138550250948316, "language_loss": 0.81575179, "learning_rate": 3.983753372802008e-06, "loss": 0.83838987, "num_input_tokens_seen": 24645480, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.4453125, "step": 1154, "time_per_iteration": 2.5205652713775635 }, { "auxiliary_loss_clip": 0.01207987, "auxiliary_loss_mlp": 0.01062579, "balance_loss_clip": 1.03858209, "balance_loss_mlp": 1.06455874, "epoch": 0.06944235683150458, "flos": 27267977554560.0, "grad_norm": 2.1584014622449366, "language_loss": 0.75218564, "learning_rate": 3.983703794441237e-06, "loss": 0.77489138, "num_input_tokens_seen": 24664630, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.4296875, "step": 1155, "time_per_iteration": 2.516846179962158 }, { "auxiliary_loss_clip": 0.01199947, "auxiliary_loss_mlp": 0.01065624, "balance_loss_clip": 1.0420922, "balance_loss_mlp": 1.0573678, "epoch": 0.06950248008417255, "flos": 25808065176960.0, "grad_norm": 1.7947168726342968, "language_loss": 0.70868635, "learning_rate": 3.98365414085822e-06, "loss": 0.73134208, "num_input_tokens_seen": 24684210, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 1.421875, "step": 1156, "time_per_iteration": 2.532663345336914 }, { "auxiliary_loss_clip": 0.01203609, "auxiliary_loss_mlp": 0.01068082, "balance_loss_clip": 1.04195094, "balance_loss_mlp": 1.06006479, "epoch": 0.06956260333684053, "flos": 22271546793600.0, "grad_norm": 1.9591076130454665, "language_loss": 0.75116593, "learning_rate": 3.98360441205484e-06, "loss": 0.77388287, "num_input_tokens_seen": 24702490, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.4375, "step": 1157, "time_per_iteration": 2.4671928882598877 }, { "auxiliary_loss_clip": 0.01204444, "auxiliary_loss_mlp": 0.01057475, "balance_loss_clip": 1.03261971, "balance_loss_mlp": 1.05881155, "epoch": 0.0696227265895085, "flos": 29681390413440.0, "grad_norm": 2.5944963368085325, "language_loss": 0.71742868, "learning_rate": 3.983554608032982e-06, "loss": 0.74004787, "num_input_tokens_seen": 24724340, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.453125, "step": 1158, "time_per_iteration": 2.6506943702697754 }, { "auxiliary_loss_clip": 0.01207278, "auxiliary_loss_mlp": 0.01061691, "balance_loss_clip": 1.03628778, "balance_loss_mlp": 1.05995083, "epoch": 0.06968284984217646, "flos": 25523545547520.0, "grad_norm": 1.8292817533395571, "language_loss": 0.79899377, "learning_rate": 3.983504728794533e-06, "loss": 0.82168353, "num_input_tokens_seen": 24745550, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.4765625, "step": 1159, "time_per_iteration": 2.5262746810913086 }, { "auxiliary_loss_clip": 0.01208421, "auxiliary_loss_mlp": 0.01065428, "balance_loss_clip": 1.037938, "balance_loss_mlp": 1.06219184, "epoch": 0.06974297309484444, "flos": 20698192287360.0, "grad_norm": 2.6258723899707808, "language_loss": 0.81065226, "learning_rate": 3.983454774341387e-06, "loss": 0.83339071, "num_input_tokens_seen": 24762575, "router_z_loss_clip": 0.27539062, "router_z_loss_mlp": 1.4609375, "step": 1160, "time_per_iteration": 2.4920237064361572 }, { "auxiliary_loss_clip": 0.01201043, "auxiliary_loss_mlp": 0.01060556, "balance_loss_clip": 1.03491378, "balance_loss_mlp": 1.05613959, "epoch": 0.0698030963475124, "flos": 26505199313280.0, "grad_norm": 1.7067647104842956, "language_loss": 0.75694895, "learning_rate": 3.983404744675437e-06, "loss": 0.77956486, "num_input_tokens_seen": 24782605, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.453125, "step": 1161, "time_per_iteration": 2.5365798473358154 }, { "auxiliary_loss_clip": 0.01200529, "auxiliary_loss_mlp": 0.01064997, "balance_loss_clip": 1.03888965, "balance_loss_mlp": 1.05665946, "epoch": 0.06986321960018037, "flos": 23040430346880.0, "grad_norm": 1.6526342603731508, "language_loss": 0.82894349, "learning_rate": 3.9833546397985794e-06, "loss": 0.85159874, "num_input_tokens_seen": 24802910, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.4375, "step": 1162, "time_per_iteration": 2.50223708152771 }, { "auxiliary_loss_clip": 0.01198912, "auxiliary_loss_mlp": 0.01054506, "balance_loss_clip": 1.02934086, "balance_loss_mlp": 1.05725145, "epoch": 0.06992334285284833, "flos": 28584822061440.0, "grad_norm": 2.304653899046666, "language_loss": 0.79329717, "learning_rate": 3.983304459712716e-06, "loss": 0.81583142, "num_input_tokens_seen": 24823305, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.4140625, "step": 1163, "time_per_iteration": 2.5293312072753906 }, { "auxiliary_loss_clip": 0.01203086, "auxiliary_loss_mlp": 0.01062282, "balance_loss_clip": 1.03609157, "balance_loss_mlp": 1.05754995, "epoch": 0.06998346610551631, "flos": 20595344670720.0, "grad_norm": 2.0405371528211145, "language_loss": 0.79513949, "learning_rate": 3.983254204419749e-06, "loss": 0.81779325, "num_input_tokens_seen": 24842155, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.453125, "step": 1164, "time_per_iteration": 2.500373125076294 }, { "auxiliary_loss_clip": 0.01203615, "auxiliary_loss_mlp": 0.01069818, "balance_loss_clip": 1.04305506, "balance_loss_mlp": 1.058056, "epoch": 0.07004358935818428, "flos": 22528810978560.0, "grad_norm": 1.5105820718780631, "language_loss": 0.72804826, "learning_rate": 3.983203873921583e-06, "loss": 0.75078261, "num_input_tokens_seen": 24862080, "router_z_loss_clip": 0.26757812, "router_z_loss_mlp": 1.453125, "step": 1165, "time_per_iteration": 2.5157628059387207 }, { "auxiliary_loss_clip": 0.01201959, "auxiliary_loss_mlp": 0.01057321, "balance_loss_clip": 1.03295445, "balance_loss_mlp": 1.05844033, "epoch": 0.07010371261085224, "flos": 28949997680640.0, "grad_norm": 1.7392771963035423, "language_loss": 0.81135631, "learning_rate": 3.983153468220128e-06, "loss": 0.83394909, "num_input_tokens_seen": 24886165, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.4375, "step": 1166, "time_per_iteration": 2.5585641860961914 }, { "auxiliary_loss_clip": 0.01199626, "auxiliary_loss_mlp": 0.01046566, "balance_loss_clip": 1.02092433, "balance_loss_mlp": 1.05650592, "epoch": 0.07016383586352022, "flos": 23659171050240.0, "grad_norm": 2.1782626443104363, "language_loss": 0.84516537, "learning_rate": 3.983102987317295e-06, "loss": 0.86762726, "num_input_tokens_seen": 24905775, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.4296875, "step": 1167, "time_per_iteration": 2.4844260215759277 }, { "auxiliary_loss_clip": 0.01204718, "auxiliary_loss_mlp": 0.01060059, "balance_loss_clip": 1.0345006, "balance_loss_mlp": 1.05901039, "epoch": 0.07022395911618819, "flos": 19792130693760.0, "grad_norm": 2.2103413407720782, "language_loss": 0.89582729, "learning_rate": 3.983052431214997e-06, "loss": 0.91847509, "num_input_tokens_seen": 24924295, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.453125, "step": 1168, "time_per_iteration": 2.472085952758789 }, { "auxiliary_loss_clip": 0.01209481, "auxiliary_loss_mlp": 0.01068501, "balance_loss_clip": 1.03865123, "balance_loss_mlp": 1.05960548, "epoch": 0.07028408236885615, "flos": 21689147675520.0, "grad_norm": 1.8434404902635362, "language_loss": 0.88828194, "learning_rate": 3.983001799915153e-06, "loss": 0.91106176, "num_input_tokens_seen": 24943210, "router_z_loss_clip": 0.29882812, "router_z_loss_mlp": 1.5, "step": 1169, "time_per_iteration": 2.4582154750823975 }, { "auxiliary_loss_clip": 0.01207071, "auxiliary_loss_mlp": 0.01065389, "balance_loss_clip": 1.03928208, "balance_loss_mlp": 1.06063664, "epoch": 0.07034420562152413, "flos": 25630271832960.0, "grad_norm": 3.3154493316349436, "language_loss": 0.84321278, "learning_rate": 3.982951093419681e-06, "loss": 0.86593735, "num_input_tokens_seen": 24960360, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.46875, "step": 1170, "time_per_iteration": 2.5419678688049316 }, { "auxiliary_loss_clip": 0.01203571, "auxiliary_loss_mlp": 0.01065514, "balance_loss_clip": 1.03841758, "balance_loss_mlp": 1.05910027, "epoch": 0.0704043288741921, "flos": 20810449267200.0, "grad_norm": 1.9575280653459157, "language_loss": 0.75547493, "learning_rate": 3.982900311730506e-06, "loss": 0.77816582, "num_input_tokens_seen": 24978290, "router_z_loss_clip": 0.27148438, "router_z_loss_mlp": 1.4453125, "step": 1171, "time_per_iteration": 2.4702653884887695 }, { "auxiliary_loss_clip": 0.01204679, "auxiliary_loss_mlp": 0.01061788, "balance_loss_clip": 1.03669488, "balance_loss_mlp": 1.06036425, "epoch": 0.07046445212686006, "flos": 25593176062080.0, "grad_norm": 1.7058359668184966, "language_loss": 0.88975537, "learning_rate": 3.9828494548495514e-06, "loss": 0.91242003, "num_input_tokens_seen": 24997055, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.4453125, "step": 1172, "time_per_iteration": 2.522852659225464 }, { "auxiliary_loss_clip": 0.01203399, "auxiliary_loss_mlp": 0.01052785, "balance_loss_clip": 1.02700019, "balance_loss_mlp": 1.0557065, "epoch": 0.07052457537952803, "flos": 25556978131200.0, "grad_norm": 1.6192080913000075, "language_loss": 0.82087004, "learning_rate": 3.982798522778748e-06, "loss": 0.84343189, "num_input_tokens_seen": 25017490, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.4765625, "step": 1173, "time_per_iteration": 2.4999754428863525 }, { "auxiliary_loss_clip": 0.01202793, "auxiliary_loss_mlp": 0.01052686, "balance_loss_clip": 1.02662647, "balance_loss_mlp": 1.05785155, "epoch": 0.070584698632196, "flos": 17968515154560.0, "grad_norm": 2.0245987265421954, "language_loss": 0.82536721, "learning_rate": 3.9827475155200245e-06, "loss": 0.84792197, "num_input_tokens_seen": 25035660, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.453125, "step": 1174, "time_per_iteration": 2.458705425262451 }, { "auxiliary_loss_clip": 0.01199897, "auxiliary_loss_mlp": 0.01056478, "balance_loss_clip": 1.0322423, "balance_loss_mlp": 1.05562568, "epoch": 0.07064482188486397, "flos": 25370888745600.0, "grad_norm": 1.7565771944836739, "language_loss": 0.85236073, "learning_rate": 3.982696433075317e-06, "loss": 0.87492442, "num_input_tokens_seen": 25054785, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.4453125, "step": 1175, "time_per_iteration": 2.490591526031494 }, { "auxiliary_loss_clip": 0.0120437, "auxiliary_loss_mlp": 0.0106961, "balance_loss_clip": 1.04554152, "balance_loss_mlp": 1.05984879, "epoch": 0.07070494513753194, "flos": 24899848767360.0, "grad_norm": 1.7293368342087139, "language_loss": 0.83320701, "learning_rate": 3.982645275446563e-06, "loss": 0.85594684, "num_input_tokens_seen": 25075180, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.4453125, "step": 1176, "time_per_iteration": 2.549057960510254 }, { "auxiliary_loss_clip": 0.0120109, "auxiliary_loss_mlp": 0.01063387, "balance_loss_clip": 1.03717303, "balance_loss_mlp": 1.05746579, "epoch": 0.07076506839019991, "flos": 22338447874560.0, "grad_norm": 2.4660194043292365, "language_loss": 0.74266589, "learning_rate": 3.982594042635701e-06, "loss": 0.76531065, "num_input_tokens_seen": 25093035, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.4375, "step": 1177, "time_per_iteration": 2.4562792778015137 }, { "auxiliary_loss_clip": 0.01205442, "auxiliary_loss_mlp": 0.01058944, "balance_loss_clip": 1.03307521, "balance_loss_mlp": 1.05981851, "epoch": 0.07082519164286788, "flos": 18660800954880.0, "grad_norm": 1.6407731642985295, "language_loss": 0.85675919, "learning_rate": 3.982542734644673e-06, "loss": 0.87940311, "num_input_tokens_seen": 25112520, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.453125, "step": 1178, "time_per_iteration": 2.4890365600585938 }, { "auxiliary_loss_clip": 0.01078986, "auxiliary_loss_mlp": 0.01006036, "balance_loss_clip": 1.00195873, "balance_loss_mlp": 1.02230787, "epoch": 0.07088531489553584, "flos": 63654107610240.0, "grad_norm": 0.8370777062264242, "language_loss": 0.63254988, "learning_rate": 3.982491351475427e-06, "loss": 0.65340006, "num_input_tokens_seen": 25177760, "router_z_loss_clip": 0.04077148, "router_z_loss_mlp": 0.56640625, "step": 1179, "time_per_iteration": 3.200862407684326 }, { "auxiliary_loss_clip": 0.01207945, "auxiliary_loss_mlp": 0.01062227, "balance_loss_clip": 1.03845692, "balance_loss_mlp": 1.06025195, "epoch": 0.07094543814820382, "flos": 21572688804480.0, "grad_norm": 2.4472586206397935, "language_loss": 0.83833206, "learning_rate": 3.98243989312991e-06, "loss": 0.86103374, "num_input_tokens_seen": 25195260, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 1.4765625, "step": 1180, "time_per_iteration": 2.5159451961517334 }, { "auxiliary_loss_clip": 0.01199898, "auxiliary_loss_mlp": 0.0107026, "balance_loss_clip": 1.04513109, "balance_loss_mlp": 1.0576787, "epoch": 0.07100556140087179, "flos": 22089946608000.0, "grad_norm": 2.6996812528575225, "language_loss": 0.88389575, "learning_rate": 3.982388359610074e-06, "loss": 0.90659726, "num_input_tokens_seen": 25212740, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.421875, "step": 1181, "time_per_iteration": 3.976541757583618 }, { "auxiliary_loss_clip": 0.01200286, "auxiliary_loss_mlp": 0.01060031, "balance_loss_clip": 1.0364399, "balance_loss_mlp": 1.05826199, "epoch": 0.07106568465353975, "flos": 47922286400640.0, "grad_norm": 1.789728232599628, "language_loss": 0.83462167, "learning_rate": 3.9823367509178725e-06, "loss": 0.85722488, "num_input_tokens_seen": 25236420, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.421875, "step": 1182, "time_per_iteration": 4.157965660095215 }, { "auxiliary_loss_clip": 0.01195664, "auxiliary_loss_mlp": 0.01061819, "balance_loss_clip": 1.03672504, "balance_loss_mlp": 1.05828977, "epoch": 0.07112580790620772, "flos": 23440798316160.0, "grad_norm": 2.137442532265807, "language_loss": 0.792521, "learning_rate": 3.982285067055262e-06, "loss": 0.81509578, "num_input_tokens_seen": 25255120, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.375, "step": 1183, "time_per_iteration": 4.03586483001709 }, { "auxiliary_loss_clip": 0.01203503, "auxiliary_loss_mlp": 0.0105742, "balance_loss_clip": 1.03224289, "balance_loss_mlp": 1.05573392, "epoch": 0.0711859311588757, "flos": 31868888682240.0, "grad_norm": 2.001777939151209, "language_loss": 0.79414237, "learning_rate": 3.982233308024204e-06, "loss": 0.81675154, "num_input_tokens_seen": 25275150, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.4765625, "step": 1184, "time_per_iteration": 2.542015314102173 }, { "auxiliary_loss_clip": 0.01197666, "auxiliary_loss_mlp": 0.01059396, "balance_loss_clip": 1.03587651, "balance_loss_mlp": 1.05836225, "epoch": 0.07124605441154366, "flos": 19610315026560.0, "grad_norm": 1.9231305664827436, "language_loss": 0.76949978, "learning_rate": 3.98218147382666e-06, "loss": 0.79207039, "num_input_tokens_seen": 25293680, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 1.390625, "step": 1185, "time_per_iteration": 2.4629547595977783 }, { "auxiliary_loss_clip": 0.01198809, "auxiliary_loss_mlp": 0.01062339, "balance_loss_clip": 1.03846157, "balance_loss_mlp": 1.05745506, "epoch": 0.07130617766421163, "flos": 14684448533760.0, "grad_norm": 2.480329973145114, "language_loss": 0.65106344, "learning_rate": 3.982129564464596e-06, "loss": 0.67367494, "num_input_tokens_seen": 25310050, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.4140625, "step": 1186, "time_per_iteration": 2.482564926147461 }, { "auxiliary_loss_clip": 0.01196994, "auxiliary_loss_mlp": 0.01053344, "balance_loss_clip": 1.02921605, "balance_loss_mlp": 1.0572753, "epoch": 0.07136630091687961, "flos": 26067915141120.0, "grad_norm": 1.9960386182188874, "language_loss": 0.69889247, "learning_rate": 3.98207757993998e-06, "loss": 0.72139585, "num_input_tokens_seen": 25331020, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.3984375, "step": 1187, "time_per_iteration": 2.5017611980438232 }, { "auxiliary_loss_clip": 0.01194278, "auxiliary_loss_mlp": 0.01053699, "balance_loss_clip": 1.03058457, "balance_loss_mlp": 1.05669928, "epoch": 0.07142642416954757, "flos": 15669190869120.0, "grad_norm": 2.6172387164705313, "language_loss": 0.78907186, "learning_rate": 3.9820255202547845e-06, "loss": 0.81155163, "num_input_tokens_seen": 25347875, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 1.375, "step": 1188, "time_per_iteration": 2.4578919410705566 }, { "auxiliary_loss_clip": 0.01196538, "auxiliary_loss_mlp": 0.01058229, "balance_loss_clip": 1.0337435, "balance_loss_mlp": 1.05722463, "epoch": 0.07148654742221554, "flos": 19755322231680.0, "grad_norm": 1.802749378519314, "language_loss": 0.8474012, "learning_rate": 3.981973385410981e-06, "loss": 0.86994886, "num_input_tokens_seen": 25366715, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.390625, "step": 1189, "time_per_iteration": 2.4513440132141113 }, { "auxiliary_loss_clip": 0.01193721, "auxiliary_loss_mlp": 0.01060305, "balance_loss_clip": 1.03451967, "balance_loss_mlp": 1.05466187, "epoch": 0.07154667067488352, "flos": 23471824688640.0, "grad_norm": 1.6540847366253792, "language_loss": 0.76938987, "learning_rate": 3.9819211754105494e-06, "loss": 0.79193014, "num_input_tokens_seen": 25385450, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.390625, "step": 1190, "time_per_iteration": 2.4929096698760986 }, { "auxiliary_loss_clip": 0.01200453, "auxiliary_loss_mlp": 0.01068897, "balance_loss_clip": 1.04251647, "balance_loss_mlp": 1.05710816, "epoch": 0.07160679392755148, "flos": 18332936588160.0, "grad_norm": 2.2772350351939936, "language_loss": 0.7540338, "learning_rate": 3.981868890255468e-06, "loss": 0.77672732, "num_input_tokens_seen": 25403940, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.4375, "step": 1191, "time_per_iteration": 2.4545202255249023 }, { "auxiliary_loss_clip": 0.01196437, "auxiliary_loss_mlp": 0.01058537, "balance_loss_clip": 1.03266883, "balance_loss_mlp": 1.05509257, "epoch": 0.07166691718021945, "flos": 17747017937280.0, "grad_norm": 2.413685270021228, "language_loss": 0.73760241, "learning_rate": 3.981816529947719e-06, "loss": 0.7601521, "num_input_tokens_seen": 25420410, "router_z_loss_clip": 0.2578125, "router_z_loss_mlp": 1.4140625, "step": 1192, "time_per_iteration": 2.466991662979126 }, { "auxiliary_loss_clip": 0.01192837, "auxiliary_loss_mlp": 0.01054421, "balance_loss_clip": 1.03127098, "balance_loss_mlp": 1.05182314, "epoch": 0.07172704043288743, "flos": 22451925916800.0, "grad_norm": 2.0765802699610934, "language_loss": 0.78102446, "learning_rate": 3.9817640944892896e-06, "loss": 0.80349696, "num_input_tokens_seen": 25439415, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 1.4140625, "step": 1193, "time_per_iteration": 2.462348699569702 }, { "auxiliary_loss_clip": 0.0120025, "auxiliary_loss_mlp": 0.01055448, "balance_loss_clip": 1.03031874, "balance_loss_mlp": 1.05800378, "epoch": 0.07178716368555539, "flos": 23222210100480.0, "grad_norm": 2.2066141934164656, "language_loss": 0.85673285, "learning_rate": 3.981711583882166e-06, "loss": 0.87928987, "num_input_tokens_seen": 25458715, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.421875, "step": 1194, "time_per_iteration": 2.502373218536377 }, { "auxiliary_loss_clip": 0.01194742, "auxiliary_loss_mlp": 0.01059715, "balance_loss_clip": 1.03506279, "balance_loss_mlp": 1.05501056, "epoch": 0.07184728693822336, "flos": 25150828072320.0, "grad_norm": 3.4223674702262885, "language_loss": 0.81715006, "learning_rate": 3.981658998128341e-06, "loss": 0.83969462, "num_input_tokens_seen": 25477985, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 1.3984375, "step": 1195, "time_per_iteration": 2.4993505477905273 }, { "auxiliary_loss_clip": 0.01197752, "auxiliary_loss_mlp": 0.01052346, "balance_loss_clip": 1.02948201, "balance_loss_mlp": 1.05856586, "epoch": 0.07190741019089132, "flos": 22711237176960.0, "grad_norm": 1.7758292591462637, "language_loss": 0.7989589, "learning_rate": 3.981606337229808e-06, "loss": 0.82145989, "num_input_tokens_seen": 25497110, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.390625, "step": 1196, "time_per_iteration": 2.5023481845855713 }, { "auxiliary_loss_clip": 0.01194943, "auxiliary_loss_mlp": 0.01064275, "balance_loss_clip": 1.0387882, "balance_loss_mlp": 1.05606365, "epoch": 0.0719675334435593, "flos": 29349791032320.0, "grad_norm": 6.161665528719659, "language_loss": 0.71442252, "learning_rate": 3.9815536011885655e-06, "loss": 0.73701477, "num_input_tokens_seen": 25516555, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.390625, "step": 1197, "time_per_iteration": 2.554457426071167 }, { "auxiliary_loss_clip": 0.01194398, "auxiliary_loss_mlp": 0.01048946, "balance_loss_clip": 1.02564096, "balance_loss_mlp": 1.05559301, "epoch": 0.07202765669622727, "flos": 17639788861440.0, "grad_norm": 3.0761793703983127, "language_loss": 0.85690093, "learning_rate": 3.98150079000661e-06, "loss": 0.87933439, "num_input_tokens_seen": 25533895, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.390625, "step": 1198, "time_per_iteration": 2.48696231842041 }, { "auxiliary_loss_clip": 0.01196227, "auxiliary_loss_mlp": 0.01062204, "balance_loss_clip": 1.03775406, "balance_loss_mlp": 1.05806565, "epoch": 0.07208777994889523, "flos": 21434038306560.0, "grad_norm": 1.9517784490161916, "language_loss": 0.83957005, "learning_rate": 3.981447903685947e-06, "loss": 0.86215436, "num_input_tokens_seen": 25554195, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.3828125, "step": 1199, "time_per_iteration": 2.4702799320220947 }, { "auxiliary_loss_clip": 0.01202091, "auxiliary_loss_mlp": 0.010534, "balance_loss_clip": 1.03067863, "balance_loss_mlp": 1.06239057, "epoch": 0.07214790320156321, "flos": 26940867373440.0, "grad_norm": 1.9415357867781786, "language_loss": 0.76703244, "learning_rate": 3.981394942228581e-06, "loss": 0.78958738, "num_input_tokens_seen": 25574155, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.3984375, "step": 1200, "time_per_iteration": 2.5214309692382812 }, { "auxiliary_loss_clip": 0.01196659, "auxiliary_loss_mlp": 0.01065398, "balance_loss_clip": 1.0405426, "balance_loss_mlp": 1.05793858, "epoch": 0.07220802645423118, "flos": 23879949995520.0, "grad_norm": 2.776058634090227, "language_loss": 0.82715142, "learning_rate": 3.98134190563652e-06, "loss": 0.84977198, "num_input_tokens_seen": 25592735, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.3828125, "step": 1201, "time_per_iteration": 2.5096993446350098 }, { "auxiliary_loss_clip": 0.01200318, "auxiliary_loss_mlp": 0.01059166, "balance_loss_clip": 1.03299975, "balance_loss_mlp": 1.05720711, "epoch": 0.07226814970689914, "flos": 19243631036160.0, "grad_norm": 2.241987242231911, "language_loss": 0.68860972, "learning_rate": 3.981288793911775e-06, "loss": 0.71120465, "num_input_tokens_seen": 25611510, "router_z_loss_clip": 0.26171875, "router_z_loss_mlp": 1.4296875, "step": 1202, "time_per_iteration": 2.497474193572998 }, { "auxiliary_loss_clip": 0.01199073, "auxiliary_loss_mlp": 0.01059143, "balance_loss_clip": 1.03269041, "balance_loss_mlp": 1.05832052, "epoch": 0.07232827295956712, "flos": 19172025273600.0, "grad_norm": 1.7946496014588773, "language_loss": 0.87607849, "learning_rate": 3.98123560705636e-06, "loss": 0.89866066, "num_input_tokens_seen": 25629560, "router_z_loss_clip": 0.26367188, "router_z_loss_mlp": 1.40625, "step": 1203, "time_per_iteration": 2.4437475204467773 }, { "auxiliary_loss_clip": 0.0119884, "auxiliary_loss_mlp": 0.01056197, "balance_loss_clip": 1.0320456, "balance_loss_mlp": 1.05526268, "epoch": 0.07238839621223508, "flos": 17639752947840.0, "grad_norm": 2.6733983442429192, "language_loss": 0.78717202, "learning_rate": 3.981182345072293e-06, "loss": 0.80972242, "num_input_tokens_seen": 25648330, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.4375, "step": 1204, "time_per_iteration": 2.464689254760742 }, { "auxiliary_loss_clip": 0.01197588, "auxiliary_loss_mlp": 0.0106266, "balance_loss_clip": 1.0385201, "balance_loss_mlp": 1.05763268, "epoch": 0.07244851946490305, "flos": 28292401440000.0, "grad_norm": 1.4239863963339732, "language_loss": 0.82039583, "learning_rate": 3.981129007961593e-06, "loss": 0.84299827, "num_input_tokens_seen": 25669470, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.40625, "step": 1205, "time_per_iteration": 2.5178630352020264 }, { "auxiliary_loss_clip": 0.01202544, "auxiliary_loss_mlp": 0.01066987, "balance_loss_clip": 1.04243004, "balance_loss_mlp": 1.05977881, "epoch": 0.07250864271757101, "flos": 22564829341440.0, "grad_norm": 1.5823073374025196, "language_loss": 0.76564097, "learning_rate": 3.981075595726283e-06, "loss": 0.78833628, "num_input_tokens_seen": 25690470, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.4296875, "step": 1206, "time_per_iteration": 2.5194003582000732 }, { "auxiliary_loss_clip": 0.01196012, "auxiliary_loss_mlp": 0.01056905, "balance_loss_clip": 1.03303969, "balance_loss_mlp": 1.05714941, "epoch": 0.072568765970239, "flos": 21762405463680.0, "grad_norm": 1.9411953802683117, "language_loss": 0.7752732, "learning_rate": 3.981022108368387e-06, "loss": 0.79780245, "num_input_tokens_seen": 25709205, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.390625, "step": 1207, "time_per_iteration": 2.4680676460266113 }, { "auxiliary_loss_clip": 0.01195181, "auxiliary_loss_mlp": 0.01054297, "balance_loss_clip": 1.03192115, "balance_loss_mlp": 1.05780172, "epoch": 0.07262888922290696, "flos": 25519702792320.0, "grad_norm": 2.8623387681544927, "language_loss": 0.79683495, "learning_rate": 3.9809685458899345e-06, "loss": 0.81932968, "num_input_tokens_seen": 25728485, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.375, "step": 1208, "time_per_iteration": 2.530632734298706 }, { "auxiliary_loss_clip": 0.01194681, "auxiliary_loss_mlp": 0.01051518, "balance_loss_clip": 1.02970302, "balance_loss_mlp": 1.05826044, "epoch": 0.07268901247557492, "flos": 21246548290560.0, "grad_norm": 1.8265920783186618, "language_loss": 0.78738737, "learning_rate": 3.980914908292955e-06, "loss": 0.80984938, "num_input_tokens_seen": 25747730, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.359375, "step": 1209, "time_per_iteration": 2.4515647888183594 }, { "auxiliary_loss_clip": 0.01196857, "auxiliary_loss_mlp": 0.01061023, "balance_loss_clip": 1.03797972, "balance_loss_mlp": 1.05785346, "epoch": 0.0727491357282429, "flos": 25479302970240.0, "grad_norm": 2.3346464558218605, "language_loss": 0.81080627, "learning_rate": 3.980861195579486e-06, "loss": 0.83338511, "num_input_tokens_seen": 25768050, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.390625, "step": 1210, "time_per_iteration": 2.5219478607177734 }, { "auxiliary_loss_clip": 0.0119748, "auxiliary_loss_mlp": 0.01064666, "balance_loss_clip": 1.0411818, "balance_loss_mlp": 1.06085825, "epoch": 0.07280925898091087, "flos": 24462169545600.0, "grad_norm": 1.9399581514499253, "language_loss": 0.8500005, "learning_rate": 3.98080740775156e-06, "loss": 0.87262189, "num_input_tokens_seen": 25787985, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.3671875, "step": 1211, "time_per_iteration": 2.4880359172821045 }, { "auxiliary_loss_clip": 0.01193127, "auxiliary_loss_mlp": 0.01050738, "balance_loss_clip": 1.02813625, "balance_loss_mlp": 1.0562849, "epoch": 0.07286938223357883, "flos": 18288191220480.0, "grad_norm": 2.3285461870731896, "language_loss": 0.90697706, "learning_rate": 3.98075354481122e-06, "loss": 0.9294157, "num_input_tokens_seen": 25803620, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.3671875, "step": 1212, "time_per_iteration": 2.4461312294006348 }, { "auxiliary_loss_clip": 0.01195605, "auxiliary_loss_mlp": 0.01054299, "balance_loss_clip": 1.0315063, "balance_loss_mlp": 1.05883741, "epoch": 0.07292950548624681, "flos": 21214803646080.0, "grad_norm": 1.7557313924851743, "language_loss": 0.72767276, "learning_rate": 3.9806996067605055e-06, "loss": 0.75017178, "num_input_tokens_seen": 25823315, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.3671875, "step": 1213, "time_per_iteration": 2.4601762294769287 }, { "auxiliary_loss_clip": 0.01195658, "auxiliary_loss_mlp": 0.01053698, "balance_loss_clip": 1.03008235, "balance_loss_mlp": 1.05477011, "epoch": 0.07298962873891478, "flos": 24642009964800.0, "grad_norm": 1.7865887093557822, "language_loss": 0.84776127, "learning_rate": 3.980645593601465e-06, "loss": 0.87025487, "num_input_tokens_seen": 25842605, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.40625, "step": 1214, "time_per_iteration": 2.511245012283325 }, { "auxiliary_loss_clip": 0.01198819, "auxiliary_loss_mlp": 0.01053168, "balance_loss_clip": 1.0295651, "balance_loss_mlp": 1.05780613, "epoch": 0.07304975199158274, "flos": 27052765217280.0, "grad_norm": 2.013852454967255, "language_loss": 0.84325826, "learning_rate": 3.980591505336144e-06, "loss": 0.86577815, "num_input_tokens_seen": 25863030, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.40625, "step": 1215, "time_per_iteration": 2.513493776321411 }, { "auxiliary_loss_clip": 0.011958, "auxiliary_loss_mlp": 0.01058792, "balance_loss_clip": 1.03486681, "balance_loss_mlp": 1.05670452, "epoch": 0.07310987524425071, "flos": 33549544091520.0, "grad_norm": 1.5741044398415922, "language_loss": 0.81222701, "learning_rate": 3.980537341966595e-06, "loss": 0.83477288, "num_input_tokens_seen": 25888015, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.390625, "step": 1216, "time_per_iteration": 2.612281560897827 }, { "auxiliary_loss_clip": 0.01199079, "auxiliary_loss_mlp": 0.01056945, "balance_loss_clip": 1.03443837, "balance_loss_mlp": 1.0599966, "epoch": 0.07316999849691869, "flos": 28110944908800.0, "grad_norm": 2.733092073543679, "language_loss": 0.75924742, "learning_rate": 3.980483103494872e-06, "loss": 0.78180766, "num_input_tokens_seen": 25908660, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 1.390625, "step": 1217, "time_per_iteration": 2.538496494293213 }, { "auxiliary_loss_clip": 0.01194226, "auxiliary_loss_mlp": 0.01058229, "balance_loss_clip": 1.0367837, "balance_loss_mlp": 1.05706334, "epoch": 0.07323012174958665, "flos": 14392602529920.0, "grad_norm": 2.2561100728832, "language_loss": 0.86596358, "learning_rate": 3.98042878992303e-06, "loss": 0.88848811, "num_input_tokens_seen": 25927215, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.375, "step": 1218, "time_per_iteration": 2.483955144882202 }, { "auxiliary_loss_clip": 0.01196688, "auxiliary_loss_mlp": 0.01062572, "balance_loss_clip": 1.04026783, "balance_loss_mlp": 1.05663562, "epoch": 0.07329024500225462, "flos": 21616428591360.0, "grad_norm": 1.7631136981475404, "language_loss": 0.86436898, "learning_rate": 3.9803744012531305e-06, "loss": 0.88696146, "num_input_tokens_seen": 25945500, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.3984375, "step": 1219, "time_per_iteration": 2.4750688076019287 }, { "auxiliary_loss_clip": 0.0119057, "auxiliary_loss_mlp": 0.01055213, "balance_loss_clip": 1.03312373, "balance_loss_mlp": 1.05344057, "epoch": 0.0733503682549226, "flos": 13224141106560.0, "grad_norm": 2.1037638344148486, "language_loss": 0.84614354, "learning_rate": 3.980319937487235e-06, "loss": 0.86860144, "num_input_tokens_seen": 25963105, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.375, "step": 1220, "time_per_iteration": 2.4631569385528564 }, { "auxiliary_loss_clip": 0.01194039, "auxiliary_loss_mlp": 0.0106172, "balance_loss_clip": 1.03805733, "balance_loss_mlp": 1.0559876, "epoch": 0.07341049150759056, "flos": 20886975192960.0, "grad_norm": 2.6994766329063444, "language_loss": 0.77293754, "learning_rate": 3.98026539862741e-06, "loss": 0.79549515, "num_input_tokens_seen": 25981690, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.375, "step": 1221, "time_per_iteration": 2.4666531085968018 }, { "auxiliary_loss_clip": 0.01196979, "auxiliary_loss_mlp": 0.01058452, "balance_loss_clip": 1.03558755, "balance_loss_mlp": 1.05883443, "epoch": 0.07347061476025853, "flos": 15413614623360.0, "grad_norm": 3.9385202537277784, "language_loss": 0.92265177, "learning_rate": 3.980210784675722e-06, "loss": 0.94520605, "num_input_tokens_seen": 25999890, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.3828125, "step": 1222, "time_per_iteration": 2.480388879776001 }, { "auxiliary_loss_clip": 0.01199542, "auxiliary_loss_mlp": 0.01054774, "balance_loss_clip": 1.03304172, "balance_loss_mlp": 1.05995977, "epoch": 0.0735307380129265, "flos": 11108859131520.0, "grad_norm": 2.674327270212127, "language_loss": 0.91047484, "learning_rate": 3.980156095634242e-06, "loss": 0.93301797, "num_input_tokens_seen": 26016445, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.390625, "step": 1223, "time_per_iteration": 5.324740648269653 }, { "auxiliary_loss_clip": 0.01195687, "auxiliary_loss_mlp": 0.01070867, "balance_loss_clip": 1.04877782, "balance_loss_mlp": 1.0587914, "epoch": 0.07359086126559447, "flos": 23732392924800.0, "grad_norm": 2.11779667475943, "language_loss": 0.82075459, "learning_rate": 3.980101331505045e-06, "loss": 0.84342009, "num_input_tokens_seen": 26036080, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.3671875, "step": 1224, "time_per_iteration": 5.290279388427734 }, { "auxiliary_loss_clip": 0.01192173, "auxiliary_loss_mlp": 0.01060941, "balance_loss_clip": 1.03556108, "balance_loss_mlp": 1.0549159, "epoch": 0.07365098451826244, "flos": 20993270515200.0, "grad_norm": 2.1600372065532363, "language_loss": 0.83442271, "learning_rate": 3.9800464922902076e-06, "loss": 0.85695386, "num_input_tokens_seen": 26055805, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.375, "step": 1225, "time_per_iteration": 2.466627597808838 }, { "auxiliary_loss_clip": 0.01193307, "auxiliary_loss_mlp": 0.01051746, "balance_loss_clip": 1.02871501, "balance_loss_mlp": 1.05555522, "epoch": 0.0737111077709304, "flos": 19933582452480.0, "grad_norm": 1.918087471754054, "language_loss": 0.90364385, "learning_rate": 3.979991577991808e-06, "loss": 0.92609429, "num_input_tokens_seen": 26073905, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.3828125, "step": 1226, "time_per_iteration": 2.4596824645996094 }, { "auxiliary_loss_clip": 0.01201667, "auxiliary_loss_mlp": 0.01048086, "balance_loss_clip": 1.02437484, "balance_loss_mlp": 1.05472314, "epoch": 0.07377123102359838, "flos": 16581537342720.0, "grad_norm": 2.8004819677128046, "language_loss": 0.76545918, "learning_rate": 3.97993658861193e-06, "loss": 0.78795671, "num_input_tokens_seen": 26091700, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 1.46875, "step": 1227, "time_per_iteration": 2.4304440021514893 }, { "auxiliary_loss_clip": 0.01193085, "auxiliary_loss_mlp": 0.01048898, "balance_loss_clip": 1.02623618, "balance_loss_mlp": 1.0596242, "epoch": 0.07383135427626634, "flos": 28328563457280.0, "grad_norm": 1.4831253612895368, "language_loss": 0.85658109, "learning_rate": 3.9798815241526575e-06, "loss": 0.8790009, "num_input_tokens_seen": 26114105, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.3359375, "step": 1228, "time_per_iteration": 2.5527021884918213 }, { "auxiliary_loss_clip": 0.01193738, "auxiliary_loss_mlp": 0.01058748, "balance_loss_clip": 1.03619397, "balance_loss_mlp": 1.05457985, "epoch": 0.07389147752893431, "flos": 20047168235520.0, "grad_norm": 2.25436757027875, "language_loss": 0.79877245, "learning_rate": 3.97982638461608e-06, "loss": 0.82129729, "num_input_tokens_seen": 26131165, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 1.390625, "step": 1229, "time_per_iteration": 2.4516568183898926 }, { "auxiliary_loss_clip": 0.01198065, "auxiliary_loss_mlp": 0.01059081, "balance_loss_clip": 1.03547728, "balance_loss_mlp": 1.0587368, "epoch": 0.07395160078160229, "flos": 18114132890880.0, "grad_norm": 1.8036130814191194, "language_loss": 0.78279269, "learning_rate": 3.979771170004287e-06, "loss": 0.80536413, "num_input_tokens_seen": 26150040, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.390625, "step": 1230, "time_per_iteration": 2.4577126502990723 }, { "auxiliary_loss_clip": 0.01192887, "auxiliary_loss_mlp": 0.01048033, "balance_loss_clip": 1.02459633, "balance_loss_mlp": 1.05772936, "epoch": 0.07401172403427025, "flos": 23586918842880.0, "grad_norm": 3.0763243959919713, "language_loss": 0.81192744, "learning_rate": 3.979715880319372e-06, "loss": 0.83433664, "num_input_tokens_seen": 26169380, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.3515625, "step": 1231, "time_per_iteration": 2.474881172180176 }, { "auxiliary_loss_clip": 0.01196926, "auxiliary_loss_mlp": 0.01056579, "balance_loss_clip": 1.03324926, "balance_loss_mlp": 1.05567741, "epoch": 0.07407184728693822, "flos": 26359904799360.0, "grad_norm": 2.336762218887071, "language_loss": 0.9465791, "learning_rate": 3.979660515563434e-06, "loss": 0.96911412, "num_input_tokens_seen": 26189420, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.4140625, "step": 1232, "time_per_iteration": 2.511270523071289 }, { "auxiliary_loss_clip": 0.01194011, "auxiliary_loss_mlp": 0.0105865, "balance_loss_clip": 1.03702509, "balance_loss_mlp": 1.05784905, "epoch": 0.0741319705396062, "flos": 22200443821440.0, "grad_norm": 1.718332666066412, "language_loss": 0.80824685, "learning_rate": 3.979605075738569e-06, "loss": 0.83077347, "num_input_tokens_seen": 26209300, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.359375, "step": 1233, "time_per_iteration": 2.4561805725097656 }, { "auxiliary_loss_clip": 0.01197214, "auxiliary_loss_mlp": 0.01055775, "balance_loss_clip": 1.03091955, "balance_loss_mlp": 1.05594873, "epoch": 0.07419209379227416, "flos": 39200482523520.0, "grad_norm": 2.455976686037663, "language_loss": 0.707367, "learning_rate": 3.979549560846883e-06, "loss": 0.7298969, "num_input_tokens_seen": 26228110, "router_z_loss_clip": 0.24902344, "router_z_loss_mlp": 1.4140625, "step": 1234, "time_per_iteration": 2.6026298999786377 }, { "auxiliary_loss_clip": 0.01194801, "auxiliary_loss_mlp": 0.0106024, "balance_loss_clip": 1.0364573, "balance_loss_mlp": 1.05781162, "epoch": 0.07425221704494213, "flos": 22781657790720.0, "grad_norm": 1.8324995912599693, "language_loss": 0.77197111, "learning_rate": 3.979493970890478e-06, "loss": 0.79452157, "num_input_tokens_seen": 26247020, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.3671875, "step": 1235, "time_per_iteration": 2.4694225788116455 }, { "auxiliary_loss_clip": 0.01188748, "auxiliary_loss_mlp": 0.01050559, "balance_loss_clip": 1.02844536, "balance_loss_mlp": 1.05470049, "epoch": 0.0743123402976101, "flos": 22272983337600.0, "grad_norm": 1.8435226245275453, "language_loss": 0.82799661, "learning_rate": 3.979438305871464e-06, "loss": 0.85038972, "num_input_tokens_seen": 26265750, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.34375, "step": 1236, "time_per_iteration": 2.4938440322875977 }, { "auxiliary_loss_clip": 0.01195019, "auxiliary_loss_mlp": 0.01054756, "balance_loss_clip": 1.03124809, "balance_loss_mlp": 1.05626774, "epoch": 0.07437246355027807, "flos": 29315029645440.0, "grad_norm": 3.1840631232744787, "language_loss": 0.75922066, "learning_rate": 3.979382565791951e-06, "loss": 0.78171849, "num_input_tokens_seen": 26287905, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 1.390625, "step": 1237, "time_per_iteration": 2.5290887355804443 }, { "auxiliary_loss_clip": 0.01191328, "auxiliary_loss_mlp": 0.01058529, "balance_loss_clip": 1.03627229, "balance_loss_mlp": 1.05466723, "epoch": 0.07443258680294604, "flos": 31944732249600.0, "grad_norm": 1.7162059926268478, "language_loss": 0.77412421, "learning_rate": 3.979326750654053e-06, "loss": 0.79662275, "num_input_tokens_seen": 26311795, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.3671875, "step": 1238, "time_per_iteration": 2.581259250640869 }, { "auxiliary_loss_clip": 0.01197718, "auxiliary_loss_mlp": 0.0105529, "balance_loss_clip": 1.03182948, "balance_loss_mlp": 1.05609775, "epoch": 0.074492710055614, "flos": 22675290641280.0, "grad_norm": 1.9596184308583087, "language_loss": 0.86606085, "learning_rate": 3.9792708604598854e-06, "loss": 0.88859093, "num_input_tokens_seen": 26330330, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.421875, "step": 1239, "time_per_iteration": 2.472813367843628 }, { "auxiliary_loss_clip": 0.01193343, "auxiliary_loss_mlp": 0.01047684, "balance_loss_clip": 1.02357984, "balance_loss_mlp": 1.05533195, "epoch": 0.07455283330828198, "flos": 21284901037440.0, "grad_norm": 2.1272563286624053, "language_loss": 0.89118516, "learning_rate": 3.979214895211569e-06, "loss": 0.91359544, "num_input_tokens_seen": 26348865, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.3828125, "step": 1240, "time_per_iteration": 2.513500690460205 }, { "auxiliary_loss_clip": 0.01195951, "auxiliary_loss_mlp": 0.01060731, "balance_loss_clip": 1.03650749, "balance_loss_mlp": 1.05803537, "epoch": 0.07461295656094995, "flos": 24388408967040.0, "grad_norm": 1.7637798371329896, "language_loss": 0.88893044, "learning_rate": 3.979158854911225e-06, "loss": 0.91149724, "num_input_tokens_seen": 26368210, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.3828125, "step": 1241, "time_per_iteration": 2.5066921710968018 }, { "auxiliary_loss_clip": 0.01078245, "auxiliary_loss_mlp": 0.01034868, "balance_loss_clip": 1.03176832, "balance_loss_mlp": 1.02405047, "epoch": 0.07467307981361791, "flos": 62109660574080.0, "grad_norm": 0.9326237263840725, "language_loss": 0.63050669, "learning_rate": 3.979102739560979e-06, "loss": 0.65163779, "num_input_tokens_seen": 26424890, "router_z_loss_clip": 0.03088379, "router_z_loss_mlp": 0.5390625, "step": 1242, "time_per_iteration": 3.141724109649658 }, { "auxiliary_loss_clip": 0.01206131, "auxiliary_loss_mlp": 0.01057184, "balance_loss_clip": 1.03161383, "balance_loss_mlp": 1.05827332, "epoch": 0.07473320306628589, "flos": 24863148046080.0, "grad_norm": 2.129990667620913, "language_loss": 0.63113844, "learning_rate": 3.9790465491629595e-06, "loss": 0.65377158, "num_input_tokens_seen": 26446405, "router_z_loss_clip": 0.25585938, "router_z_loss_mlp": 1.4765625, "step": 1243, "time_per_iteration": 2.516077995300293 }, { "auxiliary_loss_clip": 0.01192551, "auxiliary_loss_mlp": 0.01051685, "balance_loss_clip": 1.02839172, "balance_loss_mlp": 1.05638683, "epoch": 0.07479332631895386, "flos": 24897442556160.0, "grad_norm": 3.8879290779311324, "language_loss": 0.7630322, "learning_rate": 3.978990283719296e-06, "loss": 0.78547454, "num_input_tokens_seen": 26466070, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.359375, "step": 1244, "time_per_iteration": 2.4952778816223145 }, { "auxiliary_loss_clip": 0.01197618, "auxiliary_loss_mlp": 0.01056924, "balance_loss_clip": 1.03323674, "balance_loss_mlp": 1.05813777, "epoch": 0.07485344957162182, "flos": 17815247821440.0, "grad_norm": 3.2147423553548053, "language_loss": 0.69159675, "learning_rate": 3.978933943232123e-06, "loss": 0.7141422, "num_input_tokens_seen": 26479350, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.3984375, "step": 1245, "time_per_iteration": 2.4405059814453125 }, { "auxiliary_loss_clip": 0.01194791, "auxiliary_loss_mlp": 0.01056677, "balance_loss_clip": 1.03250158, "balance_loss_mlp": 1.05697513, "epoch": 0.0749135728242898, "flos": 25010202326400.0, "grad_norm": 2.017563341873818, "language_loss": 0.88633561, "learning_rate": 3.978877527703576e-06, "loss": 0.90885031, "num_input_tokens_seen": 26498255, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.375, "step": 1246, "time_per_iteration": 2.494155168533325 }, { "auxiliary_loss_clip": 0.01205809, "auxiliary_loss_mlp": 0.01065529, "balance_loss_clip": 1.04000592, "balance_loss_mlp": 1.05895424, "epoch": 0.07497369607695777, "flos": 17822071405440.0, "grad_norm": 2.1770861617908124, "language_loss": 0.88029063, "learning_rate": 3.9788210371357945e-06, "loss": 0.90300399, "num_input_tokens_seen": 26515375, "router_z_loss_clip": 0.25390625, "router_z_loss_mlp": 1.46875, "step": 1247, "time_per_iteration": 2.441128969192505 }, { "auxiliary_loss_clip": 0.01194775, "auxiliary_loss_mlp": 0.01059486, "balance_loss_clip": 1.03541732, "balance_loss_mlp": 1.0581969, "epoch": 0.07503381932962573, "flos": 15121086261120.0, "grad_norm": 2.1486784452708205, "language_loss": 0.64862442, "learning_rate": 3.978764471530921e-06, "loss": 0.67116702, "num_input_tokens_seen": 26533595, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.3671875, "step": 1248, "time_per_iteration": 2.4220693111419678 }, { "auxiliary_loss_clip": 0.01193677, "auxiliary_loss_mlp": 0.01058627, "balance_loss_clip": 1.03790855, "balance_loss_mlp": 1.05950236, "epoch": 0.0750939425822937, "flos": 12816734071680.0, "grad_norm": 1.956597671171663, "language_loss": 0.74448967, "learning_rate": 3.978707830891102e-06, "loss": 0.76701272, "num_input_tokens_seen": 26549405, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.34375, "step": 1249, "time_per_iteration": 2.410961627960205 }, { "auxiliary_loss_clip": 0.01198271, "auxiliary_loss_mlp": 0.01065259, "balance_loss_clip": 1.04153681, "balance_loss_mlp": 1.05818903, "epoch": 0.07515406583496168, "flos": 24206844695040.0, "grad_norm": 2.506620127166205, "language_loss": 0.82022685, "learning_rate": 3.978651115218482e-06, "loss": 0.84286213, "num_input_tokens_seen": 26567200, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 1.40625, "step": 1250, "time_per_iteration": 2.4982311725616455 }, { "auxiliary_loss_clip": 0.01195982, "auxiliary_loss_mlp": 0.01061707, "balance_loss_clip": 1.0388068, "balance_loss_mlp": 1.06096804, "epoch": 0.07521418908762964, "flos": 26688164215680.0, "grad_norm": 3.220752690276317, "language_loss": 0.66636515, "learning_rate": 3.978594324515215e-06, "loss": 0.68894196, "num_input_tokens_seen": 26586190, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 1.3515625, "step": 1251, "time_per_iteration": 2.5065667629241943 }, { "auxiliary_loss_clip": 0.0107739, "auxiliary_loss_mlp": 0.01026331, "balance_loss_clip": 1.02339888, "balance_loss_mlp": 1.02383876, "epoch": 0.0752743123402976, "flos": 59095140589440.0, "grad_norm": 0.9038670467488412, "language_loss": 0.70447683, "learning_rate": 3.9785374587834515e-06, "loss": 0.72551405, "num_input_tokens_seen": 26650710, "router_z_loss_clip": 0.02929688, "router_z_loss_mlp": 0.53515625, "step": 1252, "time_per_iteration": 3.1357531547546387 }, { "auxiliary_loss_clip": 0.01193803, "auxiliary_loss_mlp": 0.01062924, "balance_loss_clip": 1.03986931, "balance_loss_mlp": 1.05687237, "epoch": 0.07533443559296558, "flos": 23477032160640.0, "grad_norm": 2.1498162523020174, "language_loss": 0.79909688, "learning_rate": 3.97848051802535e-06, "loss": 0.82166415, "num_input_tokens_seen": 26669000, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.375, "step": 1253, "time_per_iteration": 2.489551305770874 }, { "auxiliary_loss_clip": 0.01199133, "auxiliary_loss_mlp": 0.01059542, "balance_loss_clip": 1.03736877, "balance_loss_mlp": 1.06017089, "epoch": 0.07539455884563355, "flos": 20879110114560.0, "grad_norm": 2.8814481813982384, "language_loss": 0.93755221, "learning_rate": 3.978423502243069e-06, "loss": 0.96013892, "num_input_tokens_seen": 26683075, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.390625, "step": 1254, "time_per_iteration": 2.445035696029663 }, { "auxiliary_loss_clip": 0.01195049, "auxiliary_loss_mlp": 0.01058387, "balance_loss_clip": 1.03660738, "balance_loss_mlp": 1.06210291, "epoch": 0.07545468209830151, "flos": 27672906551040.0, "grad_norm": 2.469754558763291, "language_loss": 0.88121569, "learning_rate": 3.97836641143877e-06, "loss": 0.90375006, "num_input_tokens_seen": 26701875, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.328125, "step": 1255, "time_per_iteration": 2.530710220336914 }, { "auxiliary_loss_clip": 0.0119315, "auxiliary_loss_mlp": 0.01067021, "balance_loss_clip": 1.04326296, "balance_loss_mlp": 1.05816746, "epoch": 0.0755148053509695, "flos": 14136990370560.0, "grad_norm": 1.9697903878827825, "language_loss": 0.79174376, "learning_rate": 3.978309245614618e-06, "loss": 0.81434548, "num_input_tokens_seen": 26719050, "router_z_loss_clip": 0.23730469, "router_z_loss_mlp": 1.3515625, "step": 1256, "time_per_iteration": 2.4325625896453857 }, { "auxiliary_loss_clip": 0.01078056, "auxiliary_loss_mlp": 0.01015819, "balance_loss_clip": 1.01295841, "balance_loss_mlp": 1.02530169, "epoch": 0.07557492860363746, "flos": 58235257929600.0, "grad_norm": 0.773386637893189, "language_loss": 0.58072853, "learning_rate": 3.9782520047727825e-06, "loss": 0.60166728, "num_input_tokens_seen": 26780650, "router_z_loss_clip": 0.02856445, "router_z_loss_mlp": 0.52734375, "step": 1257, "time_per_iteration": 3.2139780521392822 }, { "auxiliary_loss_clip": 0.01200604, "auxiliary_loss_mlp": 0.01059998, "balance_loss_clip": 1.03784943, "balance_loss_mlp": 1.0651834, "epoch": 0.07563505185630542, "flos": 24644380262400.0, "grad_norm": 1.9951921405040538, "language_loss": 0.89882338, "learning_rate": 3.978194688915432e-06, "loss": 0.9214294, "num_input_tokens_seen": 26798725, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.359375, "step": 1258, "time_per_iteration": 2.506359815597534 }, { "auxiliary_loss_clip": 0.01195518, "auxiliary_loss_mlp": 0.01055312, "balance_loss_clip": 1.03410459, "balance_loss_mlp": 1.06326628, "epoch": 0.07569517510897339, "flos": 15522998515200.0, "grad_norm": 1.968007537804403, "language_loss": 0.8130089, "learning_rate": 3.978137298044741e-06, "loss": 0.83551717, "num_input_tokens_seen": 26817005, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.3203125, "step": 1259, "time_per_iteration": 2.4694037437438965 }, { "auxiliary_loss_clip": 0.0120046, "auxiliary_loss_mlp": 0.01059564, "balance_loss_clip": 1.03818989, "balance_loss_mlp": 1.0639478, "epoch": 0.07575529836164137, "flos": 22928532503040.0, "grad_norm": 1.9935343174350948, "language_loss": 0.76019752, "learning_rate": 3.978079832162885e-06, "loss": 0.78279769, "num_input_tokens_seen": 26836655, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.3671875, "step": 1260, "time_per_iteration": 2.4849629402160645 }, { "auxiliary_loss_clip": 0.0119471, "auxiliary_loss_mlp": 0.01064921, "balance_loss_clip": 1.04187775, "balance_loss_mlp": 1.06010294, "epoch": 0.07581542161430933, "flos": 19500428344320.0, "grad_norm": 1.699694730480749, "language_loss": 0.8517859, "learning_rate": 3.978022291272044e-06, "loss": 0.87438226, "num_input_tokens_seen": 26854925, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.34375, "step": 1261, "time_per_iteration": 2.468233108520508 }, { "auxiliary_loss_clip": 0.01202198, "auxiliary_loss_mlp": 0.01060293, "balance_loss_clip": 1.03877544, "balance_loss_mlp": 1.06440091, "epoch": 0.0758755448669773, "flos": 24973465691520.0, "grad_norm": 1.786987690386392, "language_loss": 0.8264637, "learning_rate": 3.977964675374399e-06, "loss": 0.84908861, "num_input_tokens_seen": 26876170, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.375, "step": 1262, "time_per_iteration": 2.560882806777954 }, { "auxiliary_loss_clip": 0.01195341, "auxiliary_loss_mlp": 0.01061499, "balance_loss_clip": 1.03918266, "balance_loss_mlp": 1.05939627, "epoch": 0.07593566811964528, "flos": 22747973811840.0, "grad_norm": 2.4712550913732727, "language_loss": 0.82470399, "learning_rate": 3.977906984472136e-06, "loss": 0.8472724, "num_input_tokens_seen": 26895005, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.359375, "step": 1263, "time_per_iteration": 2.5241312980651855 }, { "auxiliary_loss_clip": 0.01198919, "auxiliary_loss_mlp": 0.01056178, "balance_loss_clip": 1.03463662, "balance_loss_mlp": 1.06038165, "epoch": 0.07599579137231324, "flos": 23112395245440.0, "grad_norm": 2.037186988949323, "language_loss": 0.76223946, "learning_rate": 3.977849218567442e-06, "loss": 0.78479046, "num_input_tokens_seen": 26913930, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.3828125, "step": 1264, "time_per_iteration": 2.5103487968444824 }, { "auxiliary_loss_clip": 0.01198578, "auxiliary_loss_mlp": 0.01063109, "balance_loss_clip": 1.04063797, "balance_loss_mlp": 1.06153762, "epoch": 0.07605591462498121, "flos": 14502058248960.0, "grad_norm": 2.202734967382067, "language_loss": 0.81435728, "learning_rate": 3.977791377662507e-06, "loss": 0.83697414, "num_input_tokens_seen": 26931485, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.375, "step": 1265, "time_per_iteration": 5.433353900909424 }, { "auxiliary_loss_clip": 0.01196163, "auxiliary_loss_mlp": 0.01050512, "balance_loss_clip": 1.02780235, "balance_loss_mlp": 1.05824232, "epoch": 0.07611603787764919, "flos": 23514199758720.0, "grad_norm": 1.9584375753119594, "language_loss": 0.65412438, "learning_rate": 3.977733461759524e-06, "loss": 0.67659104, "num_input_tokens_seen": 26951670, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.3828125, "step": 1266, "time_per_iteration": 5.26707649230957 }, { "auxiliary_loss_clip": 0.01196224, "auxiliary_loss_mlp": 0.01063411, "balance_loss_clip": 1.04059482, "balance_loss_mlp": 1.05815828, "epoch": 0.07617616113031715, "flos": 21507188353920.0, "grad_norm": 1.943272686343156, "language_loss": 0.79625201, "learning_rate": 3.977675470860691e-06, "loss": 0.81884837, "num_input_tokens_seen": 26970335, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.3828125, "step": 1267, "time_per_iteration": 2.4870543479919434 }, { "auxiliary_loss_clip": 0.01193879, "auxiliary_loss_mlp": 0.01050094, "balance_loss_clip": 1.02916098, "balance_loss_mlp": 1.05711603, "epoch": 0.07623628438298512, "flos": 14573161221120.0, "grad_norm": 3.86944342517368, "language_loss": 0.73533523, "learning_rate": 3.977617404968205e-06, "loss": 0.75777501, "num_input_tokens_seen": 26986025, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.3671875, "step": 1268, "time_per_iteration": 2.4398112297058105 }, { "auxiliary_loss_clip": 0.01195131, "auxiliary_loss_mlp": 0.01054295, "balance_loss_clip": 1.03189552, "balance_loss_mlp": 1.05842781, "epoch": 0.07629640763565308, "flos": 14720395069440.0, "grad_norm": 1.9797118746740199, "language_loss": 0.82271558, "learning_rate": 3.977559264084269e-06, "loss": 0.84520984, "num_input_tokens_seen": 27004045, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.3671875, "step": 1269, "time_per_iteration": 2.452974319458008 }, { "auxiliary_loss_clip": 0.01196225, "auxiliary_loss_mlp": 0.01053143, "balance_loss_clip": 1.03069615, "balance_loss_mlp": 1.05990648, "epoch": 0.07635653088832106, "flos": 14902929008640.0, "grad_norm": 2.116457860432095, "language_loss": 0.88275957, "learning_rate": 3.977501048211088e-06, "loss": 0.90525329, "num_input_tokens_seen": 27022070, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.359375, "step": 1270, "time_per_iteration": 2.4297401905059814 }, { "auxiliary_loss_clip": 0.01197997, "auxiliary_loss_mlp": 0.01055101, "balance_loss_clip": 1.03266609, "balance_loss_mlp": 1.0599227, "epoch": 0.07641665414098903, "flos": 26651571235200.0, "grad_norm": 1.9353501115947638, "language_loss": 0.71030295, "learning_rate": 3.977442757350869e-06, "loss": 0.73283398, "num_input_tokens_seen": 27041755, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.3828125, "step": 1271, "time_per_iteration": 2.5064756870269775 }, { "auxiliary_loss_clip": 0.01189938, "auxiliary_loss_mlp": 0.01062874, "balance_loss_clip": 1.04066515, "balance_loss_mlp": 1.05936849, "epoch": 0.07647677739365699, "flos": 25192808092800.0, "grad_norm": 1.54554067990617, "language_loss": 0.82954574, "learning_rate": 3.977384391505823e-06, "loss": 0.85207385, "num_input_tokens_seen": 27061540, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.3046875, "step": 1272, "time_per_iteration": 2.504385471343994 }, { "auxiliary_loss_clip": 0.01192433, "auxiliary_loss_mlp": 0.0105485, "balance_loss_clip": 1.03364313, "balance_loss_mlp": 1.0566771, "epoch": 0.07653690064632497, "flos": 20558141159040.0, "grad_norm": 1.9436233437676136, "language_loss": 0.79944611, "learning_rate": 3.977325950678162e-06, "loss": 0.8219189, "num_input_tokens_seen": 27081395, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.359375, "step": 1273, "time_per_iteration": 2.50011944770813 }, { "auxiliary_loss_clip": 0.01195719, "auxiliary_loss_mlp": 0.01054775, "balance_loss_clip": 1.03223264, "balance_loss_mlp": 1.05821276, "epoch": 0.07659702389899294, "flos": 22269320150400.0, "grad_norm": 1.6291626473633767, "language_loss": 0.81307584, "learning_rate": 3.977267434870103e-06, "loss": 0.83558083, "num_input_tokens_seen": 27101175, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 1.375, "step": 1274, "time_per_iteration": 2.5447123050689697 }, { "auxiliary_loss_clip": 0.01195107, "auxiliary_loss_mlp": 0.01064483, "balance_loss_clip": 1.04124916, "balance_loss_mlp": 1.05865407, "epoch": 0.0766571471516609, "flos": 32636120209920.0, "grad_norm": 1.690014993638547, "language_loss": 0.72950536, "learning_rate": 3.977208844083865e-06, "loss": 0.75210118, "num_input_tokens_seen": 27124505, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.3671875, "step": 1275, "time_per_iteration": 2.5679352283477783 }, { "auxiliary_loss_clip": 0.01192382, "auxiliary_loss_mlp": 0.01057712, "balance_loss_clip": 1.03381014, "balance_loss_mlp": 1.05619955, "epoch": 0.07671727040432888, "flos": 15267386355840.0, "grad_norm": 2.912189924681835, "language_loss": 0.79536819, "learning_rate": 3.9771501783216685e-06, "loss": 0.81786913, "num_input_tokens_seen": 27140960, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.359375, "step": 1276, "time_per_iteration": 2.4314053058624268 }, { "auxiliary_loss_clip": 0.01196814, "auxiliary_loss_mlp": 0.01050364, "balance_loss_clip": 1.0283463, "balance_loss_mlp": 1.05828571, "epoch": 0.07677739365699685, "flos": 28184094956160.0, "grad_norm": 3.1735514364073816, "language_loss": 0.5909276, "learning_rate": 3.97709143758574e-06, "loss": 0.61339933, "num_input_tokens_seen": 27160985, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.390625, "step": 1277, "time_per_iteration": 2.532219409942627 }, { "auxiliary_loss_clip": 0.01199176, "auxiliary_loss_mlp": 0.01057305, "balance_loss_clip": 1.03403497, "balance_loss_mlp": 1.05695152, "epoch": 0.07683751690966481, "flos": 18296128126080.0, "grad_norm": 2.095997965434649, "language_loss": 0.74764788, "learning_rate": 3.977032621878305e-06, "loss": 0.77021259, "num_input_tokens_seen": 27178390, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.421875, "step": 1278, "time_per_iteration": 2.446889877319336 }, { "auxiliary_loss_clip": 0.01189773, "auxiliary_loss_mlp": 0.01054881, "balance_loss_clip": 1.03276777, "balance_loss_mlp": 1.0550828, "epoch": 0.07689764016233278, "flos": 21981101420160.0, "grad_norm": 2.3932715669266273, "language_loss": 0.88718498, "learning_rate": 3.976973731201596e-06, "loss": 0.90963155, "num_input_tokens_seen": 27197505, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.34375, "step": 1279, "time_per_iteration": 2.4778666496276855 }, { "auxiliary_loss_clip": 0.01189065, "auxiliary_loss_mlp": 0.01060535, "balance_loss_clip": 1.03759897, "balance_loss_mlp": 1.05539834, "epoch": 0.07695776341500075, "flos": 22235995307520.0, "grad_norm": 2.420335185338537, "language_loss": 0.83141619, "learning_rate": 3.976914765557845e-06, "loss": 0.85391217, "num_input_tokens_seen": 27214260, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 1.3359375, "step": 1280, "time_per_iteration": 2.4532878398895264 }, { "auxiliary_loss_clip": 0.01189387, "auxiliary_loss_mlp": 0.01058928, "balance_loss_clip": 1.03648114, "balance_loss_mlp": 1.055565, "epoch": 0.07701788666766872, "flos": 16143750380160.0, "grad_norm": 2.369849178667951, "language_loss": 0.76185906, "learning_rate": 3.9768557249492875e-06, "loss": 0.78434217, "num_input_tokens_seen": 27232525, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.3359375, "step": 1281, "time_per_iteration": 2.4593148231506348 }, { "auxiliary_loss_clip": 0.01194736, "auxiliary_loss_mlp": 0.01052733, "balance_loss_clip": 1.02923644, "balance_loss_mlp": 1.05433905, "epoch": 0.07707800992033668, "flos": 19463045264640.0, "grad_norm": 2.1206371996289124, "language_loss": 0.75239319, "learning_rate": 3.9767966093781634e-06, "loss": 0.77486783, "num_input_tokens_seen": 27249800, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 1.40625, "step": 1282, "time_per_iteration": 2.443384885787964 }, { "auxiliary_loss_clip": 0.01193078, "auxiliary_loss_mlp": 0.01063043, "balance_loss_clip": 1.0394516, "balance_loss_mlp": 1.05686975, "epoch": 0.07713813317300466, "flos": 18990281433600.0, "grad_norm": 1.8350543722445438, "language_loss": 0.84272408, "learning_rate": 3.976737418846713e-06, "loss": 0.86528528, "num_input_tokens_seen": 27268895, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 1.359375, "step": 1283, "time_per_iteration": 2.473249673843384 }, { "auxiliary_loss_clip": 0.0119305, "auxiliary_loss_mlp": 0.01060953, "balance_loss_clip": 1.03614533, "balance_loss_mlp": 1.05607367, "epoch": 0.07719825642567263, "flos": 18113953322880.0, "grad_norm": 1.822445728178162, "language_loss": 0.75190783, "learning_rate": 3.976678153357181e-06, "loss": 0.77444792, "num_input_tokens_seen": 27288180, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.3671875, "step": 1284, "time_per_iteration": 2.419189453125 }, { "auxiliary_loss_clip": 0.01187833, "auxiliary_loss_mlp": 0.01061252, "balance_loss_clip": 1.03955603, "balance_loss_mlp": 1.05374825, "epoch": 0.0772583796783406, "flos": 42194426993280.0, "grad_norm": 1.7391382440513428, "language_loss": 0.76207995, "learning_rate": 3.976618812911817e-06, "loss": 0.78457081, "num_input_tokens_seen": 27311815, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.3359375, "step": 1285, "time_per_iteration": 2.6484835147857666 }, { "auxiliary_loss_clip": 0.01196076, "auxiliary_loss_mlp": 0.01064514, "balance_loss_clip": 1.04228175, "balance_loss_mlp": 1.05778551, "epoch": 0.07731850293100857, "flos": 24753692327040.0, "grad_norm": 1.8691325764945308, "language_loss": 0.84252477, "learning_rate": 3.9765593975128685e-06, "loss": 0.86513072, "num_input_tokens_seen": 27331890, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.3828125, "step": 1286, "time_per_iteration": 2.533121347427368 }, { "auxiliary_loss_clip": 0.01194099, "auxiliary_loss_mlp": 0.01056576, "balance_loss_clip": 1.03378344, "balance_loss_mlp": 1.05486524, "epoch": 0.07737862618367654, "flos": 17565884628480.0, "grad_norm": 2.5032123326713007, "language_loss": 0.7682454, "learning_rate": 3.97649990716259e-06, "loss": 0.79075217, "num_input_tokens_seen": 27348320, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.390625, "step": 1287, "time_per_iteration": 2.4256749153137207 }, { "auxiliary_loss_clip": 0.01188608, "auxiliary_loss_mlp": 0.01054586, "balance_loss_clip": 1.0324012, "balance_loss_mlp": 1.05436277, "epoch": 0.0774387494363445, "flos": 25627147349760.0, "grad_norm": 1.6900170479621326, "language_loss": 0.84723926, "learning_rate": 3.976440341863237e-06, "loss": 0.86967117, "num_input_tokens_seen": 27367670, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.34375, "step": 1288, "time_per_iteration": 2.5058605670928955 }, { "auxiliary_loss_clip": 0.01192139, "auxiliary_loss_mlp": 0.01055737, "balance_loss_clip": 1.03356421, "balance_loss_mlp": 1.05375934, "epoch": 0.07749887268901248, "flos": 12239865648000.0, "grad_norm": 2.006051377619914, "language_loss": 0.85478324, "learning_rate": 3.976380701617068e-06, "loss": 0.877262, "num_input_tokens_seen": 27385485, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.3828125, "step": 1289, "time_per_iteration": 2.4491608142852783 }, { "auxiliary_loss_clip": 0.01192531, "auxiliary_loss_mlp": 0.0104748, "balance_loss_clip": 1.02494943, "balance_loss_mlp": 1.05527258, "epoch": 0.07755899594168045, "flos": 25081736261760.0, "grad_norm": 2.0173249029944427, "language_loss": 0.85208523, "learning_rate": 3.976320986426344e-06, "loss": 0.87448537, "num_input_tokens_seen": 27405110, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.375, "step": 1290, "time_per_iteration": 2.499552011489868 }, { "auxiliary_loss_clip": 0.01188255, "auxiliary_loss_mlp": 0.01059114, "balance_loss_clip": 1.03511715, "balance_loss_mlp": 1.05568242, "epoch": 0.07761911919434841, "flos": 14246410176000.0, "grad_norm": 2.497260450708053, "language_loss": 0.90757358, "learning_rate": 3.9762611962933315e-06, "loss": 0.93004727, "num_input_tokens_seen": 27422855, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.328125, "step": 1291, "time_per_iteration": 2.4678826332092285 }, { "auxiliary_loss_clip": 0.01071062, "auxiliary_loss_mlp": 0.01012796, "balance_loss_clip": 1.00960159, "balance_loss_mlp": 1.01939797, "epoch": 0.07767924244701638, "flos": 67237202954880.0, "grad_norm": 0.9343148069455951, "language_loss": 0.6506694, "learning_rate": 3.9762013312202955e-06, "loss": 0.67150795, "num_input_tokens_seen": 27487190, "router_z_loss_clip": 0.03198242, "router_z_loss_mlp": 0.515625, "step": 1292, "time_per_iteration": 3.179517984390259 }, { "auxiliary_loss_clip": 0.01192159, "auxiliary_loss_mlp": 0.01053105, "balance_loss_clip": 1.03102732, "balance_loss_mlp": 1.05599296, "epoch": 0.07773936569968436, "flos": 28550635292160.0, "grad_norm": 1.7330062554809482, "language_loss": 0.87936413, "learning_rate": 3.9761413912095075e-06, "loss": 0.90181684, "num_input_tokens_seen": 27510465, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.359375, "step": 1293, "time_per_iteration": 2.5529494285583496 }, { "auxiliary_loss_clip": 0.011938, "auxiliary_loss_mlp": 0.01062079, "balance_loss_clip": 1.03755796, "balance_loss_mlp": 1.05762637, "epoch": 0.07779948895235232, "flos": 27490264871040.0, "grad_norm": 2.0294944341490977, "language_loss": 0.84802771, "learning_rate": 3.976081376263239e-06, "loss": 0.87058651, "num_input_tokens_seen": 27528645, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.359375, "step": 1294, "time_per_iteration": 2.508366107940674 }, { "auxiliary_loss_clip": 0.01198832, "auxiliary_loss_mlp": 0.01056527, "balance_loss_clip": 1.03314972, "balance_loss_mlp": 1.06069851, "epoch": 0.07785961220502029, "flos": 18223301301120.0, "grad_norm": 2.6697959221245626, "language_loss": 0.79348332, "learning_rate": 3.976021286383768e-06, "loss": 0.81603694, "num_input_tokens_seen": 27546165, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.3828125, "step": 1295, "time_per_iteration": 2.45554518699646 }, { "auxiliary_loss_clip": 0.01192285, "auxiliary_loss_mlp": 0.01053955, "balance_loss_clip": 1.02993441, "balance_loss_mlp": 1.05588174, "epoch": 0.07791973545768827, "flos": 24608218245120.0, "grad_norm": 2.146128175665706, "language_loss": 0.88467675, "learning_rate": 3.975961121573371e-06, "loss": 0.90713918, "num_input_tokens_seen": 27566520, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.359375, "step": 1296, "time_per_iteration": 2.5007052421569824 }, { "auxiliary_loss_clip": 0.01196431, "auxiliary_loss_mlp": 0.01057793, "balance_loss_clip": 1.03343821, "balance_loss_mlp": 1.05769587, "epoch": 0.07797985871035623, "flos": 14282069402880.0, "grad_norm": 3.009822330831532, "language_loss": 0.96281779, "learning_rate": 3.9759008818343305e-06, "loss": 0.98536003, "num_input_tokens_seen": 27581960, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.390625, "step": 1297, "time_per_iteration": 2.516206979751587 }, { "auxiliary_loss_clip": 0.01193501, "auxiliary_loss_mlp": 0.01052541, "balance_loss_clip": 1.02986741, "balance_loss_mlp": 1.0551672, "epoch": 0.0780399819630242, "flos": 26610453141120.0, "grad_norm": 2.041639989641837, "language_loss": 0.75952578, "learning_rate": 3.97584056716893e-06, "loss": 0.78198624, "num_input_tokens_seen": 27601415, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.3828125, "step": 1298, "time_per_iteration": 2.5279102325439453 }, { "auxiliary_loss_clip": 0.0119488, "auxiliary_loss_mlp": 0.01059888, "balance_loss_clip": 1.03720295, "balance_loss_mlp": 1.05827498, "epoch": 0.07810010521569218, "flos": 21834514016640.0, "grad_norm": 1.7114495810992398, "language_loss": 0.8073917, "learning_rate": 3.9757801775794575e-06, "loss": 0.82993937, "num_input_tokens_seen": 27621490, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.3671875, "step": 1299, "time_per_iteration": 2.4805917739868164 }, { "auxiliary_loss_clip": 0.01189239, "auxiliary_loss_mlp": 0.0105697, "balance_loss_clip": 1.03291404, "balance_loss_mlp": 1.05640709, "epoch": 0.07816022846836014, "flos": 25081233471360.0, "grad_norm": 1.896935643432527, "language_loss": 0.86618078, "learning_rate": 3.975719713068202e-06, "loss": 0.88864291, "num_input_tokens_seen": 27640600, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.328125, "step": 1300, "time_per_iteration": 2.485255002975464 }, { "auxiliary_loss_clip": 0.01195281, "auxiliary_loss_mlp": 0.01051846, "balance_loss_clip": 1.02769434, "balance_loss_mlp": 1.05797887, "epoch": 0.0782203517210281, "flos": 40917515431680.0, "grad_norm": 3.510995479532107, "language_loss": 0.71810353, "learning_rate": 3.975659173637458e-06, "loss": 0.74057484, "num_input_tokens_seen": 27663070, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.375, "step": 1301, "time_per_iteration": 2.635979175567627 }, { "auxiliary_loss_clip": 0.01199835, "auxiliary_loss_mlp": 0.01061396, "balance_loss_clip": 1.0380187, "balance_loss_mlp": 1.06030822, "epoch": 0.07828047497369607, "flos": 41172014269440.0, "grad_norm": 1.6146566757580014, "language_loss": 0.70924783, "learning_rate": 3.97559855928952e-06, "loss": 0.73186016, "num_input_tokens_seen": 27686425, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.390625, "step": 1302, "time_per_iteration": 2.6619691848754883 }, { "auxiliary_loss_clip": 0.0119533, "auxiliary_loss_mlp": 0.0105535, "balance_loss_clip": 1.03055453, "balance_loss_mlp": 1.05816913, "epoch": 0.07834059822636405, "flos": 23508130360320.0, "grad_norm": 2.0274211828285793, "language_loss": 0.82144153, "learning_rate": 3.9755378700266864e-06, "loss": 0.84394836, "num_input_tokens_seen": 27704900, "router_z_loss_clip": 0.24804688, "router_z_loss_mlp": 1.375, "step": 1303, "time_per_iteration": 2.4775097370147705 }, { "auxiliary_loss_clip": 0.01193266, "auxiliary_loss_mlp": 0.01063152, "balance_loss_clip": 1.03892851, "balance_loss_mlp": 1.05679846, "epoch": 0.07840072147903202, "flos": 20193899293440.0, "grad_norm": 1.520738417134122, "language_loss": 0.74689376, "learning_rate": 3.9754771058512585e-06, "loss": 0.76945794, "num_input_tokens_seen": 27724890, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.359375, "step": 1304, "time_per_iteration": 2.5464327335357666 }, { "auxiliary_loss_clip": 0.01195912, "auxiliary_loss_mlp": 0.01062808, "balance_loss_clip": 1.03900146, "balance_loss_mlp": 1.06008017, "epoch": 0.07846084473169998, "flos": 21360816432000.0, "grad_norm": 1.6486237064966849, "language_loss": 0.76427424, "learning_rate": 3.975416266765542e-06, "loss": 0.78686142, "num_input_tokens_seen": 27743115, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.359375, "step": 1305, "time_per_iteration": 2.55131196975708 }, { "auxiliary_loss_clip": 0.01196305, "auxiliary_loss_mlp": 0.01062666, "balance_loss_clip": 1.03853762, "balance_loss_mlp": 1.05804491, "epoch": 0.07852096798436796, "flos": 25410965345280.0, "grad_norm": 1.7509237236613557, "language_loss": 0.85097057, "learning_rate": 3.975355352771841e-06, "loss": 0.87356031, "num_input_tokens_seen": 27763570, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.375, "step": 1306, "time_per_iteration": 4.039067268371582 }, { "auxiliary_loss_clip": 0.01194148, "auxiliary_loss_mlp": 0.01045664, "balance_loss_clip": 1.02355099, "balance_loss_mlp": 1.05856645, "epoch": 0.07858109123703592, "flos": 24571481610240.0, "grad_norm": 2.5664958199116037, "language_loss": 0.9079771, "learning_rate": 3.975294363872468e-06, "loss": 0.93037522, "num_input_tokens_seen": 27780030, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.3515625, "step": 1307, "time_per_iteration": 5.414397478103638 }, { "auxiliary_loss_clip": 0.01191576, "auxiliary_loss_mlp": 0.01055046, "balance_loss_clip": 1.03087032, "balance_loss_mlp": 1.05567551, "epoch": 0.07864121448970389, "flos": 20698874645760.0, "grad_norm": 1.8375367738483017, "language_loss": 0.83297771, "learning_rate": 3.975233300069735e-06, "loss": 0.85544395, "num_input_tokens_seen": 27796225, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.359375, "step": 1308, "time_per_iteration": 3.944891929626465 }, { "auxiliary_loss_clip": 0.01192091, "auxiliary_loss_mlp": 0.01051154, "balance_loss_clip": 1.02886176, "balance_loss_mlp": 1.0559783, "epoch": 0.07870133774237187, "flos": 22966526113920.0, "grad_norm": 1.4942200227135103, "language_loss": 0.77265233, "learning_rate": 3.975172161365958e-06, "loss": 0.79508477, "num_input_tokens_seen": 27815975, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.359375, "step": 1309, "time_per_iteration": 2.468369483947754 }, { "auxiliary_loss_clip": 0.01199501, "auxiliary_loss_mlp": 0.01065605, "balance_loss_clip": 1.04059482, "balance_loss_mlp": 1.05796504, "epoch": 0.07876146099503983, "flos": 18842832103680.0, "grad_norm": 1.8090831871127762, "language_loss": 0.80537844, "learning_rate": 3.975110947763453e-06, "loss": 0.82802951, "num_input_tokens_seen": 27832255, "router_z_loss_clip": 0.25, "router_z_loss_mlp": 1.4140625, "step": 1310, "time_per_iteration": 2.450739860534668 }, { "auxiliary_loss_clip": 0.01188316, "auxiliary_loss_mlp": 0.01051642, "balance_loss_clip": 1.02948093, "balance_loss_mlp": 1.05666518, "epoch": 0.0788215842477078, "flos": 23805794367360.0, "grad_norm": 1.9348612430928431, "language_loss": 0.73303807, "learning_rate": 3.9750496592645435e-06, "loss": 0.75543761, "num_input_tokens_seen": 27852180, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.3125, "step": 1311, "time_per_iteration": 2.4925637245178223 }, { "auxiliary_loss_clip": 0.01194155, "auxiliary_loss_mlp": 0.01075491, "balance_loss_clip": 1.05104089, "balance_loss_mlp": 1.0600189, "epoch": 0.07888170750037576, "flos": 21579907438080.0, "grad_norm": 1.7057699607601668, "language_loss": 0.85804331, "learning_rate": 3.974988295871553e-06, "loss": 0.88073981, "num_input_tokens_seen": 27871435, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.34375, "step": 1312, "time_per_iteration": 2.4845151901245117 }, { "auxiliary_loss_clip": 0.0119057, "auxiliary_loss_mlp": 0.01058473, "balance_loss_clip": 1.03660953, "balance_loss_mlp": 1.05864596, "epoch": 0.07894183075304374, "flos": 19864849777920.0, "grad_norm": 1.7401878025104764, "language_loss": 0.82077777, "learning_rate": 3.9749268575868085e-06, "loss": 0.84326816, "num_input_tokens_seen": 27890625, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.3203125, "step": 1313, "time_per_iteration": 2.4496676921844482 }, { "auxiliary_loss_clip": 0.01198496, "auxiliary_loss_mlp": 0.0105592, "balance_loss_clip": 1.03120828, "balance_loss_mlp": 1.05766749, "epoch": 0.07900195400571171, "flos": 16143463071360.0, "grad_norm": 2.4280321119097588, "language_loss": 0.73486996, "learning_rate": 3.97486534441264e-06, "loss": 0.7574141, "num_input_tokens_seen": 27906530, "router_z_loss_clip": 0.24609375, "router_z_loss_mlp": 1.40625, "step": 1314, "time_per_iteration": 2.4555561542510986 }, { "auxiliary_loss_clip": 0.01191409, "auxiliary_loss_mlp": 0.01054029, "balance_loss_clip": 1.03223777, "balance_loss_mlp": 1.0548842, "epoch": 0.07906207725837967, "flos": 23730417676800.0, "grad_norm": 1.5450996007311364, "language_loss": 0.79715884, "learning_rate": 3.974803756351379e-06, "loss": 0.81961316, "num_input_tokens_seen": 27926725, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.359375, "step": 1315, "time_per_iteration": 2.502606153488159 }, { "auxiliary_loss_clip": 0.01190958, "auxiliary_loss_mlp": 0.0105895, "balance_loss_clip": 1.03446448, "balance_loss_mlp": 1.05379343, "epoch": 0.07912220051104765, "flos": 24315905364480.0, "grad_norm": 1.7834030766542506, "language_loss": 0.73969728, "learning_rate": 3.974742093405362e-06, "loss": 0.76219642, "num_input_tokens_seen": 27947875, "router_z_loss_clip": 0.24511719, "router_z_loss_mlp": 1.375, "step": 1316, "time_per_iteration": 2.4908905029296875 }, { "auxiliary_loss_clip": 0.01196538, "auxiliary_loss_mlp": 0.01062948, "balance_loss_clip": 1.03893936, "balance_loss_mlp": 1.05696559, "epoch": 0.07918232376371562, "flos": 18880035615360.0, "grad_norm": 2.361745164844023, "language_loss": 0.6512444, "learning_rate": 3.974680355576927e-06, "loss": 0.67383921, "num_input_tokens_seen": 27965040, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.3984375, "step": 1317, "time_per_iteration": 2.444304943084717 }, { "auxiliary_loss_clip": 0.01203337, "auxiliary_loss_mlp": 0.01061222, "balance_loss_clip": 1.03647423, "balance_loss_mlp": 1.06119823, "epoch": 0.07924244701638358, "flos": 27376284038400.0, "grad_norm": 2.9885605328303346, "language_loss": 0.7312513, "learning_rate": 3.974618542868415e-06, "loss": 0.75389689, "num_input_tokens_seen": 27985330, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 1.421875, "step": 1318, "time_per_iteration": 2.529545545578003 }, { "auxiliary_loss_clip": 0.01192588, "auxiliary_loss_mlp": 0.0106094, "balance_loss_clip": 1.03938675, "balance_loss_mlp": 1.05699277, "epoch": 0.07930257026905156, "flos": 25120340403840.0, "grad_norm": 1.5925630572200455, "language_loss": 0.90449876, "learning_rate": 3.97455665528217e-06, "loss": 0.92703402, "num_input_tokens_seen": 28007615, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.359375, "step": 1319, "time_per_iteration": 2.5151729583740234 }, { "auxiliary_loss_clip": 0.01193501, "auxiliary_loss_mlp": 0.01056175, "balance_loss_clip": 1.03372765, "balance_loss_mlp": 1.05544615, "epoch": 0.07936269352171953, "flos": 21834478103040.0, "grad_norm": 2.197877988089978, "language_loss": 0.80009902, "learning_rate": 3.974494692820539e-06, "loss": 0.82259583, "num_input_tokens_seen": 28027765, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.375, "step": 1320, "time_per_iteration": 2.5373613834381104 }, { "auxiliary_loss_clip": 0.01195426, "auxiliary_loss_mlp": 0.01058529, "balance_loss_clip": 1.03651094, "balance_loss_mlp": 1.06046605, "epoch": 0.07942281677438749, "flos": 16939889377920.0, "grad_norm": 1.9396640181342004, "language_loss": 0.69344467, "learning_rate": 3.974432655485872e-06, "loss": 0.71598423, "num_input_tokens_seen": 28044225, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.3515625, "step": 1321, "time_per_iteration": 2.438751459121704 }, { "auxiliary_loss_clip": 0.01190114, "auxiliary_loss_mlp": 0.0106005, "balance_loss_clip": 1.03812766, "balance_loss_mlp": 1.05755579, "epoch": 0.07948294002705546, "flos": 18986941468800.0, "grad_norm": 2.802703272319673, "language_loss": 0.83672082, "learning_rate": 3.9743705432805195e-06, "loss": 0.85922241, "num_input_tokens_seen": 28062915, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.328125, "step": 1322, "time_per_iteration": 2.4684321880340576 }, { "auxiliary_loss_clip": 0.01189684, "auxiliary_loss_mlp": 0.01055488, "balance_loss_clip": 1.03270745, "balance_loss_mlp": 1.05290365, "epoch": 0.07954306327972344, "flos": 21653452535040.0, "grad_norm": 2.0155045123085844, "language_loss": 0.90455472, "learning_rate": 3.974308356206838e-06, "loss": 0.92700648, "num_input_tokens_seen": 28082175, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.3671875, "step": 1323, "time_per_iteration": 2.471036195755005 }, { "auxiliary_loss_clip": 0.01189101, "auxiliary_loss_mlp": 0.0106127, "balance_loss_clip": 1.0390017, "balance_loss_mlp": 1.05708838, "epoch": 0.0796031865323914, "flos": 23220270766080.0, "grad_norm": 1.570870003645657, "language_loss": 0.82202148, "learning_rate": 3.974246094267187e-06, "loss": 0.84452522, "num_input_tokens_seen": 28102645, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.3203125, "step": 1324, "time_per_iteration": 2.4965169429779053 }, { "auxiliary_loss_clip": 0.01192077, "auxiliary_loss_mlp": 0.01047987, "balance_loss_clip": 1.02468193, "balance_loss_mlp": 1.05609167, "epoch": 0.07966330978505937, "flos": 23294534135040.0, "grad_norm": 2.7313966128628087, "language_loss": 0.79133362, "learning_rate": 3.974183757463925e-06, "loss": 0.81373429, "num_input_tokens_seen": 28122805, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.359375, "step": 1325, "time_per_iteration": 2.4955642223358154 }, { "auxiliary_loss_clip": 0.0119376, "auxiliary_loss_mlp": 0.01064704, "balance_loss_clip": 1.04107642, "balance_loss_mlp": 1.05856586, "epoch": 0.07972343303772735, "flos": 18363783392640.0, "grad_norm": 2.3008404677573844, "language_loss": 0.88544166, "learning_rate": 3.974121345799418e-06, "loss": 0.90802628, "num_input_tokens_seen": 28140530, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.3515625, "step": 1326, "time_per_iteration": 2.4621522426605225 }, { "auxiliary_loss_clip": 0.01187004, "auxiliary_loss_mlp": 0.01052566, "balance_loss_clip": 1.02868879, "balance_loss_mlp": 1.05435705, "epoch": 0.07978355629039531, "flos": 21762513204480.0, "grad_norm": 2.130535743168356, "language_loss": 0.83382398, "learning_rate": 3.974058859276032e-06, "loss": 0.85621965, "num_input_tokens_seen": 28159640, "router_z_loss_clip": 0.23828125, "router_z_loss_mlp": 1.328125, "step": 1327, "time_per_iteration": 2.456493854522705 }, { "auxiliary_loss_clip": 0.01195594, "auxiliary_loss_mlp": 0.01050451, "balance_loss_clip": 1.02694249, "balance_loss_mlp": 1.05903947, "epoch": 0.07984367954306328, "flos": 18551309322240.0, "grad_norm": 5.191581618904735, "language_loss": 0.78380901, "learning_rate": 3.9739962978961354e-06, "loss": 0.80626953, "num_input_tokens_seen": 28177050, "router_z_loss_clip": 0.23535156, "router_z_loss_mlp": 1.359375, "step": 1328, "time_per_iteration": 2.48891544342041 }, { "auxiliary_loss_clip": 0.01195291, "auxiliary_loss_mlp": 0.01051787, "balance_loss_clip": 1.02852964, "balance_loss_mlp": 1.05894709, "epoch": 0.07990380279573125, "flos": 16904050583040.0, "grad_norm": 3.2544133913570237, "language_loss": 0.74460608, "learning_rate": 3.973933661662101e-06, "loss": 0.76707685, "num_input_tokens_seen": 28193245, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.359375, "step": 1329, "time_per_iteration": 2.4212141036987305 }, { "auxiliary_loss_clip": 0.01189713, "auxiliary_loss_mlp": 0.01059608, "balance_loss_clip": 1.0375663, "balance_loss_mlp": 1.05544281, "epoch": 0.07996392604839922, "flos": 24098358643200.0, "grad_norm": 1.697628589272219, "language_loss": 0.81332552, "learning_rate": 3.973870950576305e-06, "loss": 0.83581877, "num_input_tokens_seen": 28213570, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.34375, "step": 1330, "time_per_iteration": 2.5129785537719727 }, { "auxiliary_loss_clip": 0.01194091, "auxiliary_loss_mlp": 0.0105723, "balance_loss_clip": 1.03483033, "balance_loss_mlp": 1.05724955, "epoch": 0.08002404930106718, "flos": 14278729438080.0, "grad_norm": 2.7402084920756873, "language_loss": 0.88727629, "learning_rate": 3.9738081646411255e-06, "loss": 0.90978944, "num_input_tokens_seen": 28229980, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.3671875, "step": 1331, "time_per_iteration": 2.413100004196167 }, { "auxiliary_loss_clip": 0.01197974, "auxiliary_loss_mlp": 0.01062079, "balance_loss_clip": 1.03867793, "balance_loss_mlp": 1.05764508, "epoch": 0.08008417255373516, "flos": 40406219285760.0, "grad_norm": 2.045674320199945, "language_loss": 0.73218393, "learning_rate": 3.973745303858942e-06, "loss": 0.75478446, "num_input_tokens_seen": 28253840, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.40625, "step": 1332, "time_per_iteration": 2.647573947906494 }, { "auxiliary_loss_clip": 0.01192217, "auxiliary_loss_mlp": 0.01051979, "balance_loss_clip": 1.03021097, "balance_loss_mlp": 1.05835962, "epoch": 0.08014429580640313, "flos": 18478913460480.0, "grad_norm": 2.293325559556138, "language_loss": 0.82714677, "learning_rate": 3.973682368232138e-06, "loss": 0.84958863, "num_input_tokens_seen": 28271675, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.3359375, "step": 1333, "time_per_iteration": 2.532564163208008 }, { "auxiliary_loss_clip": 0.0119201, "auxiliary_loss_mlp": 0.01053027, "balance_loss_clip": 1.03106868, "balance_loss_mlp": 1.05571795, "epoch": 0.0802044190590711, "flos": 22053461368320.0, "grad_norm": 2.9259150418823436, "language_loss": 0.74840212, "learning_rate": 3.9736193577631015e-06, "loss": 0.77085251, "num_input_tokens_seen": 28291850, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.359375, "step": 1334, "time_per_iteration": 2.51759934425354 }, { "auxiliary_loss_clip": 0.01195783, "auxiliary_loss_mlp": 0.01060156, "balance_loss_clip": 1.03788745, "balance_loss_mlp": 1.06231761, "epoch": 0.08026454231173906, "flos": 24572128055040.0, "grad_norm": 1.917038667761494, "language_loss": 0.80316389, "learning_rate": 3.973556272454221e-06, "loss": 0.82572329, "num_input_tokens_seen": 28310780, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.3359375, "step": 1335, "time_per_iteration": 2.527366876602173 }, { "auxiliary_loss_clip": 0.0108334, "auxiliary_loss_mlp": 0.01002856, "balance_loss_clip": 1.00035238, "balance_loss_mlp": 1.03130782, "epoch": 0.08032466556440704, "flos": 52581841459200.0, "grad_norm": 0.7405955711303981, "language_loss": 0.56065995, "learning_rate": 3.973493112307889e-06, "loss": 0.58152187, "num_input_tokens_seen": 28369985, "router_z_loss_clip": 0.02502441, "router_z_loss_mlp": 0.51953125, "step": 1336, "time_per_iteration": 3.1537179946899414 }, { "auxiliary_loss_clip": 0.01189459, "auxiliary_loss_mlp": 0.01056863, "balance_loss_clip": 1.03559566, "balance_loss_mlp": 1.05678201, "epoch": 0.080384788817075, "flos": 23842602829440.0, "grad_norm": 1.8743831138127867, "language_loss": 0.67600894, "learning_rate": 3.9734298773265005e-06, "loss": 0.69847214, "num_input_tokens_seen": 28388670, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.328125, "step": 1337, "time_per_iteration": 2.477642297744751 }, { "auxiliary_loss_clip": 0.01190678, "auxiliary_loss_mlp": 0.01064298, "balance_loss_clip": 1.04262614, "balance_loss_mlp": 1.05839562, "epoch": 0.08044491206974297, "flos": 25300719527040.0, "grad_norm": 1.726034498771916, "language_loss": 0.87183917, "learning_rate": 3.973366567512453e-06, "loss": 0.89438897, "num_input_tokens_seen": 28411845, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.3203125, "step": 1338, "time_per_iteration": 2.5571603775024414 }, { "auxiliary_loss_clip": 0.01189547, "auxiliary_loss_mlp": 0.01062042, "balance_loss_clip": 1.0392487, "balance_loss_mlp": 1.05412984, "epoch": 0.08050503532241095, "flos": 22376549226240.0, "grad_norm": 3.0661073760800934, "language_loss": 0.87379628, "learning_rate": 3.973303182868147e-06, "loss": 0.89631224, "num_input_tokens_seen": 28427875, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.3515625, "step": 1339, "time_per_iteration": 2.4285595417022705 }, { "auxiliary_loss_clip": 0.01188078, "auxiliary_loss_mlp": 0.01049425, "balance_loss_clip": 1.02877808, "balance_loss_mlp": 1.05766904, "epoch": 0.08056515857507891, "flos": 18369421827840.0, "grad_norm": 1.8143925112531099, "language_loss": 0.89394838, "learning_rate": 3.973239723395988e-06, "loss": 0.91632342, "num_input_tokens_seen": 28446615, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.3046875, "step": 1340, "time_per_iteration": 2.475186586380005 }, { "auxiliary_loss_clip": 0.01080419, "auxiliary_loss_mlp": 0.01002692, "balance_loss_clip": 1.00012898, "balance_loss_mlp": 1.0287559, "epoch": 0.08062528182774688, "flos": 51348130980480.0, "grad_norm": 0.8951753082597589, "language_loss": 0.64854974, "learning_rate": 3.97317618909838e-06, "loss": 0.66938084, "num_input_tokens_seen": 28505290, "router_z_loss_clip": 0.02563477, "router_z_loss_mlp": 0.515625, "step": 1341, "time_per_iteration": 3.0075201988220215 }, { "auxiliary_loss_clip": 0.01196172, "auxiliary_loss_mlp": 0.01054389, "balance_loss_clip": 1.03027284, "balance_loss_mlp": 1.05616283, "epoch": 0.08068540508041486, "flos": 17599712261760.0, "grad_norm": 2.072319915528199, "language_loss": 0.89648724, "learning_rate": 3.973112579977733e-06, "loss": 0.91899282, "num_input_tokens_seen": 28522735, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.40625, "step": 1342, "time_per_iteration": 2.4608538150787354 }, { "auxiliary_loss_clip": 0.01201515, "auxiliary_loss_mlp": 0.01053562, "balance_loss_clip": 1.03009021, "balance_loss_mlp": 1.06382167, "epoch": 0.08074552833308282, "flos": 10561185486720.0, "grad_norm": 2.2509413599414874, "language_loss": 0.76425362, "learning_rate": 3.973048896036459e-06, "loss": 0.78680432, "num_input_tokens_seen": 28539460, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.375, "step": 1343, "time_per_iteration": 2.4457099437713623 }, { "auxiliary_loss_clip": 0.01077765, "auxiliary_loss_mlp": 0.01008612, "balance_loss_clip": 1.00594223, "balance_loss_mlp": 1.02671146, "epoch": 0.08080565158575079, "flos": 60840254954880.0, "grad_norm": 0.8152203983861334, "language_loss": 0.57548618, "learning_rate": 3.972985137276974e-06, "loss": 0.59634995, "num_input_tokens_seen": 28599855, "router_z_loss_clip": 0.0267334, "router_z_loss_mlp": 0.51171875, "step": 1344, "time_per_iteration": 3.0195460319519043 }, { "auxiliary_loss_clip": 0.01193662, "auxiliary_loss_mlp": 0.0105851, "balance_loss_clip": 1.03510928, "balance_loss_mlp": 1.05835128, "epoch": 0.08086577483841875, "flos": 18332361970560.0, "grad_norm": 2.226055547768755, "language_loss": 0.86344242, "learning_rate": 3.972921303701695e-06, "loss": 0.88596416, "num_input_tokens_seen": 28617585, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.3515625, "step": 1345, "time_per_iteration": 2.478146553039551 }, { "auxiliary_loss_clip": 0.01190236, "auxiliary_loss_mlp": 0.01052331, "balance_loss_clip": 1.03093278, "balance_loss_mlp": 1.05791593, "epoch": 0.08092589809108673, "flos": 21543601766400.0, "grad_norm": 1.7436782357237668, "language_loss": 0.87557006, "learning_rate": 3.972857395313042e-06, "loss": 0.89799571, "num_input_tokens_seen": 28636355, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.328125, "step": 1346, "time_per_iteration": 2.5276784896850586 }, { "auxiliary_loss_clip": 0.01188836, "auxiliary_loss_mlp": 0.01053179, "balance_loss_clip": 1.03100622, "balance_loss_mlp": 1.05550003, "epoch": 0.0809860213437547, "flos": 22128012046080.0, "grad_norm": 1.5357020446121097, "language_loss": 0.92586505, "learning_rate": 3.972793412113439e-06, "loss": 0.94828516, "num_input_tokens_seen": 28656260, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.3359375, "step": 1347, "time_per_iteration": 2.483421564102173 }, { "auxiliary_loss_clip": 0.01188024, "auxiliary_loss_mlp": 0.01059297, "balance_loss_clip": 1.03520441, "balance_loss_mlp": 1.05597365, "epoch": 0.08104614459642266, "flos": 21725489260800.0, "grad_norm": 2.9883345718232546, "language_loss": 0.89295202, "learning_rate": 3.972729354105312e-06, "loss": 0.9154253, "num_input_tokens_seen": 28675865, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.3203125, "step": 1348, "time_per_iteration": 5.449762582778931 }, { "auxiliary_loss_clip": 0.01189086, "auxiliary_loss_mlp": 0.010521, "balance_loss_clip": 1.03125012, "balance_loss_mlp": 1.05986285, "epoch": 0.08110626784909064, "flos": 23951878980480.0, "grad_norm": 1.8215194330841347, "language_loss": 0.76738197, "learning_rate": 3.97266522129109e-06, "loss": 0.78979385, "num_input_tokens_seen": 28696255, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.2890625, "step": 1349, "time_per_iteration": 5.210908651351929 }, { "auxiliary_loss_clip": 0.01191195, "auxiliary_loss_mlp": 0.01058219, "balance_loss_clip": 1.03556919, "balance_loss_mlp": 1.05609059, "epoch": 0.0811663911017586, "flos": 19025689265280.0, "grad_norm": 1.8749215813667326, "language_loss": 0.88896453, "learning_rate": 3.972601013673205e-06, "loss": 0.91145873, "num_input_tokens_seen": 28713905, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.3515625, "step": 1350, "time_per_iteration": 2.456059694290161 }, { "auxiliary_loss_clip": 0.01188269, "auxiliary_loss_mlp": 0.01058405, "balance_loss_clip": 1.03573179, "balance_loss_mlp": 1.05660105, "epoch": 0.08122651435442657, "flos": 15341290588800.0, "grad_norm": 1.9242607912555068, "language_loss": 0.82265258, "learning_rate": 3.972536731254092e-06, "loss": 0.84511924, "num_input_tokens_seen": 28732075, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.3125, "step": 1351, "time_per_iteration": 2.4818613529205322 }, { "auxiliary_loss_clip": 0.01187543, "auxiliary_loss_mlp": 0.01051397, "balance_loss_clip": 1.02775812, "balance_loss_mlp": 1.05336738, "epoch": 0.08128663760709455, "flos": 23221563655680.0, "grad_norm": 2.252899686322969, "language_loss": 0.75513405, "learning_rate": 3.972472374036189e-06, "loss": 0.77752346, "num_input_tokens_seen": 28751150, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.34375, "step": 1352, "time_per_iteration": 2.4887747764587402 }, { "auxiliary_loss_clip": 0.01194315, "auxiliary_loss_mlp": 0.0105954, "balance_loss_clip": 1.03582895, "balance_loss_mlp": 1.05765796, "epoch": 0.08134676085976252, "flos": 22965628273920.0, "grad_norm": 2.6080368927153352, "language_loss": 0.8270539, "learning_rate": 3.972407942021935e-06, "loss": 0.84959245, "num_input_tokens_seen": 28773360, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.3671875, "step": 1353, "time_per_iteration": 2.4966776371002197 }, { "auxiliary_loss_clip": 0.01079998, "auxiliary_loss_mlp": 0.0100775, "balance_loss_clip": 1.00517511, "balance_loss_mlp": 1.03015757, "epoch": 0.08140688411243048, "flos": 64322115816960.0, "grad_norm": 0.8519962337094097, "language_loss": 0.5973357, "learning_rate": 3.972343435213775e-06, "loss": 0.61821318, "num_input_tokens_seen": 28833390, "router_z_loss_clip": 0.02575684, "router_z_loss_mlp": 0.49804688, "step": 1354, "time_per_iteration": 3.103447675704956 }, { "auxiliary_loss_clip": 0.01185726, "auxiliary_loss_mlp": 0.01051999, "balance_loss_clip": 1.03037429, "balance_loss_mlp": 1.05562544, "epoch": 0.08146700736509845, "flos": 22491858862080.0, "grad_norm": 2.023379338341147, "language_loss": 0.82465386, "learning_rate": 3.972278853614154e-06, "loss": 0.84703112, "num_input_tokens_seen": 28852430, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.296875, "step": 1355, "time_per_iteration": 2.515320301055908 }, { "auxiliary_loss_clip": 0.01187819, "auxiliary_loss_mlp": 0.01056392, "balance_loss_clip": 1.03188312, "balance_loss_mlp": 1.05371976, "epoch": 0.08152713061776642, "flos": 20447823513600.0, "grad_norm": 1.8886687771425832, "language_loss": 0.71131849, "learning_rate": 3.972214197225521e-06, "loss": 0.73376065, "num_input_tokens_seen": 28870685, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.34375, "step": 1356, "time_per_iteration": 2.462819814682007 }, { "auxiliary_loss_clip": 0.01185467, "auxiliary_loss_mlp": 0.0104712, "balance_loss_clip": 1.02395797, "balance_loss_mlp": 1.0519954, "epoch": 0.08158725387043439, "flos": 23550218121600.0, "grad_norm": 5.688464175822198, "language_loss": 0.70513701, "learning_rate": 3.972149466050329e-06, "loss": 0.72746301, "num_input_tokens_seen": 28889860, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 1.3359375, "step": 1357, "time_per_iteration": 2.5149848461151123 }, { "auxiliary_loss_clip": 0.01190545, "auxiliary_loss_mlp": 0.01052886, "balance_loss_clip": 1.03085613, "balance_loss_mlp": 1.0557065, "epoch": 0.08164737712310235, "flos": 22017335264640.0, "grad_norm": 2.5603390276146527, "language_loss": 0.83861738, "learning_rate": 3.97208466009103e-06, "loss": 0.86105168, "num_input_tokens_seen": 28905865, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.3515625, "step": 1358, "time_per_iteration": 2.470489978790283 }, { "auxiliary_loss_clip": 0.01189537, "auxiliary_loss_mlp": 0.01054821, "balance_loss_clip": 1.03085971, "balance_loss_mlp": 1.05465388, "epoch": 0.08170750037577033, "flos": 23367827836800.0, "grad_norm": 2.6668996857640286, "language_loss": 1.02398646, "learning_rate": 3.972019779350084e-06, "loss": 1.04642999, "num_input_tokens_seen": 28925250, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.34375, "step": 1359, "time_per_iteration": 2.4836950302124023 }, { "auxiliary_loss_clip": 0.01186381, "auxiliary_loss_mlp": 0.01052901, "balance_loss_clip": 1.03003693, "balance_loss_mlp": 1.05321264, "epoch": 0.0817676236284383, "flos": 28397978490240.0, "grad_norm": 2.282778021103112, "language_loss": 0.83497596, "learning_rate": 3.971954823829951e-06, "loss": 0.85736877, "num_input_tokens_seen": 28943445, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.328125, "step": 1360, "time_per_iteration": 2.5135915279388428 }, { "auxiliary_loss_clip": 0.01190618, "auxiliary_loss_mlp": 0.01060366, "balance_loss_clip": 1.03802633, "balance_loss_mlp": 1.05488729, "epoch": 0.08182774688110626, "flos": 19208905562880.0, "grad_norm": 2.2011344635318224, "language_loss": 0.72180331, "learning_rate": 3.971889793533093e-06, "loss": 0.74431312, "num_input_tokens_seen": 28962695, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.359375, "step": 1361, "time_per_iteration": 2.46833872795105 }, { "auxiliary_loss_clip": 0.0118129, "auxiliary_loss_mlp": 0.01056572, "balance_loss_clip": 1.03225374, "balance_loss_mlp": 1.05030918, "epoch": 0.08188787013377424, "flos": 22784099915520.0, "grad_norm": 2.2032180127489718, "language_loss": 0.76699817, "learning_rate": 3.971824688461976e-06, "loss": 0.78937685, "num_input_tokens_seen": 28982120, "router_z_loss_clip": 0.24316406, "router_z_loss_mlp": 1.3125, "step": 1362, "time_per_iteration": 2.473252773284912 }, { "auxiliary_loss_clip": 0.01187499, "auxiliary_loss_mlp": 0.0104955, "balance_loss_clip": 1.02735317, "balance_loss_mlp": 1.05550516, "epoch": 0.08194799338644221, "flos": 16468095214080.0, "grad_norm": 2.1617589969259052, "language_loss": 0.73026693, "learning_rate": 3.971759508619069e-06, "loss": 0.75263739, "num_input_tokens_seen": 28998100, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.3203125, "step": 1363, "time_per_iteration": 2.442387342453003 }, { "auxiliary_loss_clip": 0.01189676, "auxiliary_loss_mlp": 0.01055233, "balance_loss_clip": 1.03114045, "balance_loss_mlp": 1.05771565, "epoch": 0.08200811663911017, "flos": 23913633974400.0, "grad_norm": 2.0190177571172687, "language_loss": 0.77121091, "learning_rate": 3.971694254006844e-06, "loss": 0.79366004, "num_input_tokens_seen": 29017095, "router_z_loss_clip": 0.24023438, "router_z_loss_mlp": 1.3203125, "step": 1364, "time_per_iteration": 2.495680809020996 }, { "auxiliary_loss_clip": 0.01190857, "auxiliary_loss_mlp": 0.01056327, "balance_loss_clip": 1.03305745, "balance_loss_mlp": 1.05715549, "epoch": 0.08206823989177814, "flos": 17896550256000.0, "grad_norm": 1.6281406052396599, "language_loss": 0.81780338, "learning_rate": 3.971628924627776e-06, "loss": 0.84027523, "num_input_tokens_seen": 29037240, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.3359375, "step": 1365, "time_per_iteration": 2.484238624572754 }, { "auxiliary_loss_clip": 0.01189572, "auxiliary_loss_mlp": 0.01057656, "balance_loss_clip": 1.03613877, "balance_loss_mlp": 1.05797505, "epoch": 0.08212836314444612, "flos": 22088186841600.0, "grad_norm": 1.7982899140000708, "language_loss": 0.81883198, "learning_rate": 3.97156352048434e-06, "loss": 0.84130424, "num_input_tokens_seen": 29056250, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.3125, "step": 1366, "time_per_iteration": 2.495712995529175 }, { "auxiliary_loss_clip": 0.01187513, "auxiliary_loss_mlp": 0.01054393, "balance_loss_clip": 1.03303099, "balance_loss_mlp": 1.05369139, "epoch": 0.08218848639711408, "flos": 17597485618560.0, "grad_norm": 1.8189700791500432, "language_loss": 0.81712377, "learning_rate": 3.97149804157902e-06, "loss": 0.83954287, "num_input_tokens_seen": 29073380, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.3359375, "step": 1367, "time_per_iteration": 2.4604735374450684 }, { "auxiliary_loss_clip": 0.0119136, "auxiliary_loss_mlp": 0.01057732, "balance_loss_clip": 1.03559482, "balance_loss_mlp": 1.05523872, "epoch": 0.08224860964978205, "flos": 17857838373120.0, "grad_norm": 2.2506335868501344, "language_loss": 0.83470178, "learning_rate": 3.9714324879142946e-06, "loss": 0.85719264, "num_input_tokens_seen": 29091330, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.359375, "step": 1368, "time_per_iteration": 2.4453554153442383 }, { "auxiliary_loss_clip": 0.01182765, "auxiliary_loss_mlp": 0.01047654, "balance_loss_clip": 1.02679217, "balance_loss_mlp": 1.05546892, "epoch": 0.08230873290245003, "flos": 25227533566080.0, "grad_norm": 2.774603015951136, "language_loss": 0.81321961, "learning_rate": 3.971366859492653e-06, "loss": 0.83552384, "num_input_tokens_seen": 29110375, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.2734375, "step": 1369, "time_per_iteration": 2.53346848487854 }, { "auxiliary_loss_clip": 0.01186023, "auxiliary_loss_mlp": 0.01050663, "balance_loss_clip": 1.03064799, "balance_loss_mlp": 1.05766869, "epoch": 0.08236885615511799, "flos": 31759935753600.0, "grad_norm": 2.4573168678798587, "language_loss": 0.74707735, "learning_rate": 3.971301156316582e-06, "loss": 0.76944423, "num_input_tokens_seen": 29129395, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.28125, "step": 1370, "time_per_iteration": 2.5349347591400146 }, { "auxiliary_loss_clip": 0.01191698, "auxiliary_loss_mlp": 0.01057268, "balance_loss_clip": 1.03430808, "balance_loss_mlp": 1.05762649, "epoch": 0.08242897940778596, "flos": 23185832601600.0, "grad_norm": 1.6377739632715238, "language_loss": 0.74675679, "learning_rate": 3.971235378388573e-06, "loss": 0.76924646, "num_input_tokens_seen": 29148650, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 1.34375, "step": 1371, "time_per_iteration": 2.5252339839935303 }, { "auxiliary_loss_clip": 0.01187824, "auxiliary_loss_mlp": 0.01057144, "balance_loss_clip": 1.03424394, "balance_loss_mlp": 1.05508006, "epoch": 0.08248910266045394, "flos": 34491480393600.0, "grad_norm": 1.9587300218040986, "language_loss": 0.71194881, "learning_rate": 3.971169525711122e-06, "loss": 0.73439848, "num_input_tokens_seen": 29170785, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.328125, "step": 1372, "time_per_iteration": 2.5934057235717773 }, { "auxiliary_loss_clip": 0.01189501, "auxiliary_loss_mlp": 0.0105353, "balance_loss_clip": 1.03047442, "balance_loss_mlp": 1.05446923, "epoch": 0.0825492259131219, "flos": 13436228960640.0, "grad_norm": 2.4415393061307156, "language_loss": 0.88167202, "learning_rate": 3.9711035982867246e-06, "loss": 0.90410239, "num_input_tokens_seen": 29185210, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.3515625, "step": 1373, "time_per_iteration": 2.4655921459198 }, { "auxiliary_loss_clip": 0.01187479, "auxiliary_loss_mlp": 0.01064441, "balance_loss_clip": 1.04189825, "balance_loss_mlp": 1.05452037, "epoch": 0.08260934916578987, "flos": 25812446636160.0, "grad_norm": 1.850049022014929, "language_loss": 0.82281339, "learning_rate": 3.971037596117882e-06, "loss": 0.84533256, "num_input_tokens_seen": 29205210, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 1.328125, "step": 1374, "time_per_iteration": 2.4992754459381104 }, { "auxiliary_loss_clip": 0.01075018, "auxiliary_loss_mlp": 0.01015609, "balance_loss_clip": 1.01277149, "balance_loss_mlp": 1.0258379, "epoch": 0.08266947241845783, "flos": 63460009491840.0, "grad_norm": 0.8408428561885192, "language_loss": 0.60723078, "learning_rate": 3.970971519207095e-06, "loss": 0.62813705, "num_input_tokens_seen": 29265350, "router_z_loss_clip": 0.02832031, "router_z_loss_mlp": 0.4921875, "step": 1375, "time_per_iteration": 3.092310905456543 }, { "auxiliary_loss_clip": 0.01073195, "auxiliary_loss_mlp": 0.0101595, "balance_loss_clip": 1.01311326, "balance_loss_mlp": 1.02410078, "epoch": 0.08272959567112581, "flos": 69993704568960.0, "grad_norm": 0.9100061944785393, "language_loss": 0.62299621, "learning_rate": 3.970905367556871e-06, "loss": 0.64388764, "num_input_tokens_seen": 29321475, "router_z_loss_clip": 0.02832031, "router_z_loss_mlp": 0.4921875, "step": 1376, "time_per_iteration": 3.01914644241333 }, { "auxiliary_loss_clip": 0.01196199, "auxiliary_loss_mlp": 0.01073656, "balance_loss_clip": 1.05142379, "balance_loss_mlp": 1.06040978, "epoch": 0.08278971892379378, "flos": 20413205781120.0, "grad_norm": 2.449950659914033, "language_loss": 0.82852912, "learning_rate": 3.970839141169718e-06, "loss": 0.85122764, "num_input_tokens_seen": 29341405, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.359375, "step": 1377, "time_per_iteration": 2.4992384910583496 }, { "auxiliary_loss_clip": 0.0118638, "auxiliary_loss_mlp": 0.01062024, "balance_loss_clip": 1.03968394, "balance_loss_mlp": 1.05447543, "epoch": 0.08284984217646174, "flos": 26250233598720.0, "grad_norm": 2.268491245885168, "language_loss": 0.8513037, "learning_rate": 3.970772840048147e-06, "loss": 0.8737877, "num_input_tokens_seen": 29361955, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.3125, "step": 1378, "time_per_iteration": 2.5098304748535156 }, { "auxiliary_loss_clip": 0.01187354, "auxiliary_loss_mlp": 0.01067526, "balance_loss_clip": 1.04414928, "balance_loss_mlp": 1.05367565, "epoch": 0.08290996542912972, "flos": 27194683852800.0, "grad_norm": 2.081315163584935, "language_loss": 0.8762908, "learning_rate": 3.970706464194672e-06, "loss": 0.89883965, "num_input_tokens_seen": 29382395, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.3359375, "step": 1379, "time_per_iteration": 2.5375967025756836 }, { "auxiliary_loss_clip": 0.01187882, "auxiliary_loss_mlp": 0.01062662, "balance_loss_clip": 1.04109752, "balance_loss_mlp": 1.05695343, "epoch": 0.08297008868179769, "flos": 38618191146240.0, "grad_norm": 1.8550663894542403, "language_loss": 0.78385103, "learning_rate": 3.970640013611812e-06, "loss": 0.80635643, "num_input_tokens_seen": 29404460, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.3125, "step": 1380, "time_per_iteration": 2.6378419399261475 }, { "auxiliary_loss_clip": 0.01184884, "auxiliary_loss_mlp": 0.01063045, "balance_loss_clip": 1.03946543, "balance_loss_mlp": 1.05573523, "epoch": 0.08303021193446565, "flos": 19974736460160.0, "grad_norm": 8.299147751639193, "language_loss": 0.86343122, "learning_rate": 3.970573488302083e-06, "loss": 0.88591051, "num_input_tokens_seen": 29422675, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.2890625, "step": 1381, "time_per_iteration": 2.481668710708618 }, { "auxiliary_loss_clip": 0.01197577, "auxiliary_loss_mlp": 0.01055277, "balance_loss_clip": 1.03247261, "balance_loss_mlp": 1.05931473, "epoch": 0.08309033518713363, "flos": 13662646341120.0, "grad_norm": 4.3763857467500396, "language_loss": 0.87793964, "learning_rate": 3.970506888268011e-06, "loss": 0.90046823, "num_input_tokens_seen": 29439840, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.3828125, "step": 1382, "time_per_iteration": 2.4292619228363037 }, { "auxiliary_loss_clip": 0.0118868, "auxiliary_loss_mlp": 0.01058371, "balance_loss_clip": 1.03740168, "balance_loss_mlp": 1.05498815, "epoch": 0.0831504584398016, "flos": 17968551068160.0, "grad_norm": 2.0608516149770937, "language_loss": 0.77429491, "learning_rate": 3.970440213512121e-06, "loss": 0.79676545, "num_input_tokens_seen": 29457360, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.3359375, "step": 1383, "time_per_iteration": 2.445611000061035 }, { "auxiliary_loss_clip": 0.01191747, "auxiliary_loss_mlp": 0.01059443, "balance_loss_clip": 1.03735316, "balance_loss_mlp": 1.05597854, "epoch": 0.08321058169246956, "flos": 22601386408320.0, "grad_norm": 1.874293637615888, "language_loss": 0.82848853, "learning_rate": 3.97037346403694e-06, "loss": 0.85100043, "num_input_tokens_seen": 29477040, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.359375, "step": 1384, "time_per_iteration": 2.5187504291534424 }, { "auxiliary_loss_clip": 0.01197383, "auxiliary_loss_mlp": 0.01060924, "balance_loss_clip": 1.03677249, "balance_loss_mlp": 1.05861521, "epoch": 0.08327070494513754, "flos": 22850426378880.0, "grad_norm": 2.613200353217896, "language_loss": 0.84453118, "learning_rate": 3.970306639845e-06, "loss": 0.86711419, "num_input_tokens_seen": 29492010, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.3828125, "step": 1385, "time_per_iteration": 2.5200788974761963 }, { "auxiliary_loss_clip": 0.01192957, "auxiliary_loss_mlp": 0.01065042, "balance_loss_clip": 1.04172456, "balance_loss_mlp": 1.05720913, "epoch": 0.0833308281978055, "flos": 22782986593920.0, "grad_norm": 1.7322944166671521, "language_loss": 0.6886363, "learning_rate": 3.970239740938835e-06, "loss": 0.71121627, "num_input_tokens_seen": 29511850, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.359375, "step": 1386, "time_per_iteration": 2.481346368789673 }, { "auxiliary_loss_clip": 0.01184968, "auxiliary_loss_mlp": 0.01053752, "balance_loss_clip": 1.03080463, "balance_loss_mlp": 1.05137897, "epoch": 0.08339095145047347, "flos": 20812604083200.0, "grad_norm": 1.6729825325865206, "language_loss": 0.82010275, "learning_rate": 3.97017276732098e-06, "loss": 0.84248996, "num_input_tokens_seen": 29531415, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 1.3359375, "step": 1387, "time_per_iteration": 2.4729626178741455 }, { "auxiliary_loss_clip": 0.011908, "auxiliary_loss_mlp": 0.01065295, "balance_loss_clip": 1.04100025, "balance_loss_mlp": 1.05311906, "epoch": 0.08345107470314143, "flos": 18515326872960.0, "grad_norm": 2.3631402045092265, "language_loss": 0.77207434, "learning_rate": 3.970105718993978e-06, "loss": 0.7946353, "num_input_tokens_seen": 29549525, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.375, "step": 1388, "time_per_iteration": 2.457338809967041 }, { "auxiliary_loss_clip": 0.01185403, "auxiliary_loss_mlp": 0.01061789, "balance_loss_clip": 1.03785205, "balance_loss_mlp": 1.05556285, "epoch": 0.08351119795580941, "flos": 18807567926400.0, "grad_norm": 2.1214478260034513, "language_loss": 0.79262209, "learning_rate": 3.970038595960369e-06, "loss": 0.81509399, "num_input_tokens_seen": 29568705, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.296875, "step": 1389, "time_per_iteration": 2.461562156677246 }, { "auxiliary_loss_clip": 0.01193964, "auxiliary_loss_mlp": 0.01059703, "balance_loss_clip": 1.03707719, "balance_loss_mlp": 1.05812788, "epoch": 0.08357132120847738, "flos": 18441817689600.0, "grad_norm": 2.5249217799959363, "language_loss": 0.87505722, "learning_rate": 3.969971398222699e-06, "loss": 0.89759386, "num_input_tokens_seen": 29585855, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 1.359375, "step": 1390, "time_per_iteration": 6.774719715118408 }, { "auxiliary_loss_clip": 0.0118572, "auxiliary_loss_mlp": 0.01060321, "balance_loss_clip": 1.03732586, "balance_loss_mlp": 1.05245197, "epoch": 0.08363144446114534, "flos": 25922333318400.0, "grad_norm": 1.6243795364020606, "language_loss": 0.86683214, "learning_rate": 3.969904125783517e-06, "loss": 0.88929254, "num_input_tokens_seen": 29607280, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.3359375, "step": 1391, "time_per_iteration": 3.8849756717681885 }, { "auxiliary_loss_clip": 0.01198395, "auxiliary_loss_mlp": 0.01072344, "balance_loss_clip": 1.04870439, "balance_loss_mlp": 1.05895948, "epoch": 0.08369156771381332, "flos": 18041306065920.0, "grad_norm": 2.3816552877798074, "language_loss": 0.87723398, "learning_rate": 3.969836778645371e-06, "loss": 0.89994144, "num_input_tokens_seen": 29624130, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.390625, "step": 1392, "time_per_iteration": 2.4626011848449707 }, { "auxiliary_loss_clip": 0.01187021, "auxiliary_loss_mlp": 0.01058804, "balance_loss_clip": 1.03546286, "balance_loss_mlp": 1.05289698, "epoch": 0.08375169096648129, "flos": 22675111073280.0, "grad_norm": 2.4014391561013686, "language_loss": 0.79914367, "learning_rate": 3.969769356810819e-06, "loss": 0.82160199, "num_input_tokens_seen": 29643210, "router_z_loss_clip": 0.23339844, "router_z_loss_mlp": 1.34375, "step": 1393, "time_per_iteration": 2.463818311691284 }, { "auxiliary_loss_clip": 0.01189518, "auxiliary_loss_mlp": 0.01052602, "balance_loss_clip": 1.03051233, "balance_loss_mlp": 1.05774999, "epoch": 0.08381181421914925, "flos": 26103215232000.0, "grad_norm": 2.153018137141168, "language_loss": 0.84879184, "learning_rate": 3.969701860282415e-06, "loss": 0.87121308, "num_input_tokens_seen": 29663920, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.3203125, "step": 1394, "time_per_iteration": 2.527294874191284 }, { "auxiliary_loss_clip": 0.01189312, "auxiliary_loss_mlp": 0.01051501, "balance_loss_clip": 1.02927995, "balance_loss_mlp": 1.05570006, "epoch": 0.08387193747181723, "flos": 20629782835200.0, "grad_norm": 2.2883910732704367, "language_loss": 0.82903093, "learning_rate": 3.969634289062719e-06, "loss": 0.85143912, "num_input_tokens_seen": 29683825, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.3359375, "step": 1395, "time_per_iteration": 2.4701027870178223 }, { "auxiliary_loss_clip": 0.01191334, "auxiliary_loss_mlp": 0.01055253, "balance_loss_clip": 1.03100538, "balance_loss_mlp": 1.05806398, "epoch": 0.0839320607244852, "flos": 13443196199040.0, "grad_norm": 2.2567891784776175, "language_loss": 0.82429898, "learning_rate": 3.969566643154293e-06, "loss": 0.84676492, "num_input_tokens_seen": 29698775, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.328125, "step": 1396, "time_per_iteration": 2.4575259685516357 }, { "auxiliary_loss_clip": 0.011906, "auxiliary_loss_mlp": 0.01056006, "balance_loss_clip": 1.03155613, "balance_loss_mlp": 1.05919659, "epoch": 0.08399218397715316, "flos": 23477247642240.0, "grad_norm": 1.91886284328595, "language_loss": 0.7657041, "learning_rate": 3.969498922559703e-06, "loss": 0.78817016, "num_input_tokens_seen": 29719430, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.3125, "step": 1397, "time_per_iteration": 2.4665510654449463 }, { "auxiliary_loss_clip": 0.01189875, "auxiliary_loss_mlp": 0.01049565, "balance_loss_clip": 1.02624726, "balance_loss_mlp": 1.05741894, "epoch": 0.08405230722982113, "flos": 25920717206400.0, "grad_norm": 2.1099460363848825, "language_loss": 0.77637625, "learning_rate": 3.969431127281516e-06, "loss": 0.79877067, "num_input_tokens_seen": 29739685, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.328125, "step": 1398, "time_per_iteration": 2.5210089683532715 }, { "auxiliary_loss_clip": 0.01184206, "auxiliary_loss_mlp": 0.01051279, "balance_loss_clip": 1.02946377, "balance_loss_mlp": 1.05553341, "epoch": 0.0841124304824891, "flos": 17967437746560.0, "grad_norm": 2.244213514273242, "language_loss": 0.94872296, "learning_rate": 3.969363257322304e-06, "loss": 0.9710778, "num_input_tokens_seen": 29756165, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.28125, "step": 1399, "time_per_iteration": 2.4275104999542236 }, { "auxiliary_loss_clip": 0.01190802, "auxiliary_loss_mlp": 0.01061552, "balance_loss_clip": 1.03685212, "balance_loss_mlp": 1.05503488, "epoch": 0.08417255373515707, "flos": 25629661301760.0, "grad_norm": 1.7458558590128632, "language_loss": 0.81775367, "learning_rate": 3.96929531268464e-06, "loss": 0.84027719, "num_input_tokens_seen": 29776425, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 1.359375, "step": 1400, "time_per_iteration": 2.521982431411743 }, { "auxiliary_loss_clip": 0.01188175, "auxiliary_loss_mlp": 0.01055747, "balance_loss_clip": 1.03287113, "balance_loss_mlp": 1.0542326, "epoch": 0.08423267698782504, "flos": 26249730808320.0, "grad_norm": 2.0958160313357577, "language_loss": 0.86794907, "learning_rate": 3.969227293371099e-06, "loss": 0.89038825, "num_input_tokens_seen": 29796440, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 1.3359375, "step": 1401, "time_per_iteration": 2.5206527709960938 }, { "auxiliary_loss_clip": 0.01190882, "auxiliary_loss_mlp": 0.01058684, "balance_loss_clip": 1.03449678, "balance_loss_mlp": 1.05618811, "epoch": 0.08429280024049302, "flos": 20119707751680.0, "grad_norm": 2.203066087633078, "language_loss": 0.87412715, "learning_rate": 3.969159199384263e-06, "loss": 0.8966229, "num_input_tokens_seen": 29814755, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.34375, "step": 1402, "time_per_iteration": 2.4970996379852295 }, { "auxiliary_loss_clip": 0.01185513, "auxiliary_loss_mlp": 0.01047692, "balance_loss_clip": 1.02555513, "balance_loss_mlp": 1.05379081, "epoch": 0.08435292349316098, "flos": 42924526836480.0, "grad_norm": 2.1856149045935234, "language_loss": 0.89368343, "learning_rate": 3.9690910307267125e-06, "loss": 0.91601539, "num_input_tokens_seen": 29834785, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.3203125, "step": 1403, "time_per_iteration": 2.6490554809570312 }, { "auxiliary_loss_clip": 0.01188055, "auxiliary_loss_mlp": 0.01051805, "balance_loss_clip": 1.02833247, "balance_loss_mlp": 1.05315828, "epoch": 0.08441304674582895, "flos": 22857285876480.0, "grad_norm": 1.8675909791814314, "language_loss": 0.80280888, "learning_rate": 3.969022787401033e-06, "loss": 0.82520747, "num_input_tokens_seen": 29854695, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.34375, "step": 1404, "time_per_iteration": 2.515673875808716 }, { "auxiliary_loss_clip": 0.0119541, "auxiliary_loss_mlp": 0.01063443, "balance_loss_clip": 1.04054284, "balance_loss_mlp": 1.05944562, "epoch": 0.08447316999849692, "flos": 18697501676160.0, "grad_norm": 1.9601906448949729, "language_loss": 0.83474201, "learning_rate": 3.968954469409811e-06, "loss": 0.85733056, "num_input_tokens_seen": 29872180, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.359375, "step": 1405, "time_per_iteration": 2.420783519744873 }, { "auxiliary_loss_clip": 0.01187544, "auxiliary_loss_mlp": 0.0105531, "balance_loss_clip": 1.03349423, "balance_loss_mlp": 1.05507612, "epoch": 0.08453329325116489, "flos": 25483971738240.0, "grad_norm": 1.8828979780146442, "language_loss": 0.80161929, "learning_rate": 3.968886076755639e-06, "loss": 0.82404786, "num_input_tokens_seen": 29893205, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.328125, "step": 1406, "time_per_iteration": 2.5275039672851562 }, { "auxiliary_loss_clip": 0.01190692, "auxiliary_loss_mlp": 0.01058923, "balance_loss_clip": 1.03626132, "balance_loss_mlp": 1.05801702, "epoch": 0.08459341650383286, "flos": 20920048640640.0, "grad_norm": 1.9019985929482468, "language_loss": 0.79811746, "learning_rate": 3.96881760944111e-06, "loss": 0.82061362, "num_input_tokens_seen": 29911970, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.328125, "step": 1407, "time_per_iteration": 2.463310480117798 }, { "auxiliary_loss_clip": 0.01187565, "auxiliary_loss_mlp": 0.01050662, "balance_loss_clip": 1.02858496, "balance_loss_mlp": 1.05597258, "epoch": 0.08465353975650082, "flos": 13043079624960.0, "grad_norm": 2.719996263931142, "language_loss": 0.915025, "learning_rate": 3.968749067468819e-06, "loss": 0.93740726, "num_input_tokens_seen": 29929925, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.3125, "step": 1408, "time_per_iteration": 2.468627691268921 }, { "auxiliary_loss_clip": 0.01089028, "auxiliary_loss_mlp": 0.01062891, "balance_loss_clip": 1.05941057, "balance_loss_mlp": 1.03707695, "epoch": 0.0847136630091688, "flos": 60877422552960.0, "grad_norm": 1.0056472667060936, "language_loss": 0.61936474, "learning_rate": 3.968680450841368e-06, "loss": 0.64088398, "num_input_tokens_seen": 29985950, "router_z_loss_clip": 0.03491211, "router_z_loss_mlp": 0.51953125, "step": 1409, "time_per_iteration": 3.1690518856048584 }, { "auxiliary_loss_clip": 0.01182828, "auxiliary_loss_mlp": 0.01065566, "balance_loss_clip": 1.04372644, "balance_loss_mlp": 1.05606031, "epoch": 0.08477378626183676, "flos": 22046530043520.0, "grad_norm": 2.037661127546271, "language_loss": 0.86367673, "learning_rate": 3.968611759561355e-06, "loss": 0.88616073, "num_input_tokens_seen": 30004330, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.265625, "step": 1410, "time_per_iteration": 2.495478391647339 }, { "auxiliary_loss_clip": 0.01190177, "auxiliary_loss_mlp": 0.01061011, "balance_loss_clip": 1.03682375, "balance_loss_mlp": 1.05699658, "epoch": 0.08483390951450473, "flos": 16690059308160.0, "grad_norm": 1.8563536331138133, "language_loss": 0.74016356, "learning_rate": 3.968542993631388e-06, "loss": 0.76267546, "num_input_tokens_seen": 30022555, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.328125, "step": 1411, "time_per_iteration": 2.4622738361358643 }, { "auxiliary_loss_clip": 0.01079842, "auxiliary_loss_mlp": 0.01009318, "balance_loss_clip": 1.00614667, "balance_loss_mlp": 1.02892327, "epoch": 0.08489403276717271, "flos": 51584640082560.0, "grad_norm": 0.901175967714762, "language_loss": 0.56761408, "learning_rate": 3.968474153054073e-06, "loss": 0.58850563, "num_input_tokens_seen": 30077220, "router_z_loss_clip": 0.03173828, "router_z_loss_mlp": 0.5078125, "step": 1412, "time_per_iteration": 3.0460805892944336 }, { "auxiliary_loss_clip": 0.01182696, "auxiliary_loss_mlp": 0.01067793, "balance_loss_clip": 1.04564357, "balance_loss_mlp": 1.05331218, "epoch": 0.08495415601984067, "flos": 17092330698240.0, "grad_norm": 2.023455938803462, "language_loss": 0.89121616, "learning_rate": 3.96840523783202e-06, "loss": 0.91372108, "num_input_tokens_seen": 30094600, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.296875, "step": 1413, "time_per_iteration": 2.4488697052001953 }, { "auxiliary_loss_clip": 0.01185693, "auxiliary_loss_mlp": 0.01055442, "balance_loss_clip": 1.03225541, "balance_loss_mlp": 1.05637646, "epoch": 0.08501427927250864, "flos": 23148413608320.0, "grad_norm": 1.8836445524922687, "language_loss": 0.8794601, "learning_rate": 3.968336247967844e-06, "loss": 0.90187144, "num_input_tokens_seen": 30114475, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.296875, "step": 1414, "time_per_iteration": 2.5139472484588623 }, { "auxiliary_loss_clip": 0.01187308, "auxiliary_loss_mlp": 0.01069274, "balance_loss_clip": 1.04803109, "balance_loss_mlp": 1.05587077, "epoch": 0.08507440252517662, "flos": 19063467394560.0, "grad_norm": 1.8057338983574014, "language_loss": 0.77568722, "learning_rate": 3.96826718346416e-06, "loss": 0.79825306, "num_input_tokens_seen": 30133350, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.3125, "step": 1415, "time_per_iteration": 2.437440872192383 }, { "auxiliary_loss_clip": 0.01183022, "auxiliary_loss_mlp": 0.01067503, "balance_loss_clip": 1.04641485, "balance_loss_mlp": 1.05434597, "epoch": 0.08513452577784458, "flos": 60182296600320.0, "grad_norm": 3.997781627831339, "language_loss": 0.70370138, "learning_rate": 3.968198044323587e-06, "loss": 0.72620672, "num_input_tokens_seen": 30159005, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.2890625, "step": 1416, "time_per_iteration": 2.833951950073242 }, { "auxiliary_loss_clip": 0.01189833, "auxiliary_loss_mlp": 0.01069027, "balance_loss_clip": 1.04491043, "balance_loss_mlp": 1.05584502, "epoch": 0.08519464903051255, "flos": 27308485117440.0, "grad_norm": 1.9428705639341894, "language_loss": 0.74778295, "learning_rate": 3.968128830548748e-06, "loss": 0.7703715, "num_input_tokens_seen": 30179450, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.34375, "step": 1417, "time_per_iteration": 2.50217342376709 }, { "auxiliary_loss_clip": 0.01183038, "auxiliary_loss_mlp": 0.01061398, "balance_loss_clip": 1.038867, "balance_loss_mlp": 1.05410898, "epoch": 0.08525477228318051, "flos": 20266438809600.0, "grad_norm": 2.361551632256464, "language_loss": 0.81889814, "learning_rate": 3.968059542142265e-06, "loss": 0.84134251, "num_input_tokens_seen": 30197235, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.2890625, "step": 1418, "time_per_iteration": 2.4811294078826904 }, { "auxiliary_loss_clip": 0.01075357, "auxiliary_loss_mlp": 0.01032395, "balance_loss_clip": 1.02841341, "balance_loss_mlp": 1.02416825, "epoch": 0.08531489553584849, "flos": 67615017183360.0, "grad_norm": 0.9047956691381082, "language_loss": 0.56639338, "learning_rate": 3.9679901791067685e-06, "loss": 0.58747089, "num_input_tokens_seen": 30257410, "router_z_loss_clip": 0.03979492, "router_z_loss_mlp": 0.515625, "step": 1419, "time_per_iteration": 3.043708324432373 }, { "auxiliary_loss_clip": 0.01184213, "auxiliary_loss_mlp": 0.0106457, "balance_loss_clip": 1.04182458, "balance_loss_mlp": 1.05297279, "epoch": 0.08537501878851646, "flos": 27526965592320.0, "grad_norm": 2.1371768540849754, "language_loss": 0.70584369, "learning_rate": 3.967920741444886e-06, "loss": 0.72833157, "num_input_tokens_seen": 30277865, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.3125, "step": 1420, "time_per_iteration": 2.523298501968384 }, { "auxiliary_loss_clip": 0.01182006, "auxiliary_loss_mlp": 0.01051807, "balance_loss_clip": 1.02926445, "balance_loss_mlp": 1.05196607, "epoch": 0.08543514204118442, "flos": 22784243569920.0, "grad_norm": 1.7066906715318755, "language_loss": 0.88224548, "learning_rate": 3.967851229159252e-06, "loss": 0.90458357, "num_input_tokens_seen": 30298545, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 1.3046875, "step": 1421, "time_per_iteration": 2.487041711807251 }, { "auxiliary_loss_clip": 0.01071152, "auxiliary_loss_mlp": 0.01019523, "balance_loss_clip": 1.01585186, "balance_loss_mlp": 1.02047801, "epoch": 0.0854952652938524, "flos": 60990721027200.0, "grad_norm": 0.8074205471827623, "language_loss": 0.635414, "learning_rate": 3.967781642252502e-06, "loss": 0.65632081, "num_input_tokens_seen": 30361725, "router_z_loss_clip": 0.03662109, "router_z_loss_mlp": 0.5078125, "step": 1422, "time_per_iteration": 3.106898069381714 }, { "auxiliary_loss_clip": 0.01182122, "auxiliary_loss_mlp": 0.01058672, "balance_loss_clip": 1.03664184, "balance_loss_mlp": 1.05617082, "epoch": 0.08555538854652037, "flos": 28038046256640.0, "grad_norm": 2.3453485058369083, "language_loss": 0.83021104, "learning_rate": 3.967711980727276e-06, "loss": 0.85261893, "num_input_tokens_seen": 30382180, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.2578125, "step": 1423, "time_per_iteration": 2.5357825756073 }, { "auxiliary_loss_clip": 0.01187373, "auxiliary_loss_mlp": 0.0105895, "balance_loss_clip": 1.03721786, "balance_loss_mlp": 1.05622661, "epoch": 0.08561551179918833, "flos": 23509279595520.0, "grad_norm": 1.8087974738004489, "language_loss": 0.74874222, "learning_rate": 3.967642244586213e-06, "loss": 0.77120543, "num_input_tokens_seen": 30402980, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.3125, "step": 1424, "time_per_iteration": 2.5319440364837646 }, { "auxiliary_loss_clip": 0.0118523, "auxiliary_loss_mlp": 0.01053751, "balance_loss_clip": 1.03203118, "balance_loss_mlp": 1.05556965, "epoch": 0.08567563505185631, "flos": 17926930183680.0, "grad_norm": 2.0501169400459376, "language_loss": 0.75643408, "learning_rate": 3.96757243383196e-06, "loss": 0.77882385, "num_input_tokens_seen": 30420800, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.296875, "step": 1425, "time_per_iteration": 2.484860897064209 }, { "auxiliary_loss_clip": 0.01182895, "auxiliary_loss_mlp": 0.0104727, "balance_loss_clip": 1.024966, "balance_loss_mlp": 1.05478454, "epoch": 0.08573575830452428, "flos": 19719519350400.0, "grad_norm": 2.5367158218561987, "language_loss": 0.93310046, "learning_rate": 3.9675025484671624e-06, "loss": 0.95540214, "num_input_tokens_seen": 30439620, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.28125, "step": 1426, "time_per_iteration": 2.481652021408081 }, { "auxiliary_loss_clip": 0.01191789, "auxiliary_loss_mlp": 0.01058839, "balance_loss_clip": 1.0341624, "balance_loss_mlp": 1.05847812, "epoch": 0.08579588155719224, "flos": 17931563038080.0, "grad_norm": 2.5018759815118785, "language_loss": 0.75873613, "learning_rate": 3.967432588494471e-06, "loss": 0.78124243, "num_input_tokens_seen": 30457300, "router_z_loss_clip": 0.24707031, "router_z_loss_mlp": 1.328125, "step": 1427, "time_per_iteration": 2.4718217849731445 }, { "auxiliary_loss_clip": 0.01182784, "auxiliary_loss_mlp": 0.01050845, "balance_loss_clip": 1.02957845, "balance_loss_mlp": 1.0542587, "epoch": 0.08585600480986022, "flos": 16033324993920.0, "grad_norm": 2.721536858755322, "language_loss": 0.81688941, "learning_rate": 3.96736255391654e-06, "loss": 0.83922571, "num_input_tokens_seen": 30471580, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.28125, "step": 1428, "time_per_iteration": 2.443143606185913 }, { "auxiliary_loss_clip": 0.01188308, "auxiliary_loss_mlp": 0.01058778, "balance_loss_clip": 1.03567505, "balance_loss_mlp": 1.05570257, "epoch": 0.08591612806252819, "flos": 28657433404800.0, "grad_norm": 2.0629214668554305, "language_loss": 0.80070746, "learning_rate": 3.967292444736023e-06, "loss": 0.82317829, "num_input_tokens_seen": 30492720, "router_z_loss_clip": 0.23144531, "router_z_loss_mlp": 1.328125, "step": 1429, "time_per_iteration": 2.535679340362549 }, { "auxiliary_loss_clip": 0.01187693, "auxiliary_loss_mlp": 0.01059446, "balance_loss_clip": 1.03752375, "balance_loss_mlp": 1.0564723, "epoch": 0.08597625131519615, "flos": 20959119659520.0, "grad_norm": 1.9260859328377817, "language_loss": 0.87746894, "learning_rate": 3.967222260955578e-06, "loss": 0.89994037, "num_input_tokens_seen": 30509535, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.3125, "step": 1430, "time_per_iteration": 2.4424545764923096 }, { "auxiliary_loss_clip": 0.01183732, "auxiliary_loss_mlp": 0.01059269, "balance_loss_clip": 1.03747797, "balance_loss_mlp": 1.0575192, "epoch": 0.08603637456786412, "flos": 23256360956160.0, "grad_norm": 1.6258188898162744, "language_loss": 0.81712925, "learning_rate": 3.96715200257787e-06, "loss": 0.83955926, "num_input_tokens_seen": 30529490, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.265625, "step": 1431, "time_per_iteration": 3.9255552291870117 }, { "auxiliary_loss_clip": 0.01184396, "auxiliary_loss_mlp": 0.01050165, "balance_loss_clip": 1.02790833, "balance_loss_mlp": 1.0551517, "epoch": 0.0860964978205321, "flos": 28694170039680.0, "grad_norm": 1.6846920424279828, "language_loss": 0.77699685, "learning_rate": 3.967081669605559e-06, "loss": 0.79934245, "num_input_tokens_seen": 30550205, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.2890625, "step": 1432, "time_per_iteration": 4.011981010437012 }, { "auxiliary_loss_clip": 0.01182132, "auxiliary_loss_mlp": 0.01058208, "balance_loss_clip": 1.03511715, "balance_loss_mlp": 1.05273652, "epoch": 0.08615662107320006, "flos": 19318397195520.0, "grad_norm": 3.3660105594473277, "language_loss": 0.73422742, "learning_rate": 3.967011262041315e-06, "loss": 0.75663078, "num_input_tokens_seen": 30568830, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.296875, "step": 1433, "time_per_iteration": 3.8051111698150635 }, { "auxiliary_loss_clip": 0.01186891, "auxiliary_loss_mlp": 0.01059859, "balance_loss_clip": 1.03462255, "balance_loss_mlp": 1.05529094, "epoch": 0.08621674432586802, "flos": 15851688894720.0, "grad_norm": 2.4776462563673305, "language_loss": 0.85958803, "learning_rate": 3.9669407798878065e-06, "loss": 0.88205552, "num_input_tokens_seen": 30585730, "router_z_loss_clip": 0.25195312, "router_z_loss_mlp": 1.3125, "step": 1434, "time_per_iteration": 2.4783596992492676 }, { "auxiliary_loss_clip": 0.01182219, "auxiliary_loss_mlp": 0.01054251, "balance_loss_clip": 1.03266215, "balance_loss_mlp": 1.05232191, "epoch": 0.086276867578536, "flos": 14100648785280.0, "grad_norm": 2.2935171073518377, "language_loss": 0.79016984, "learning_rate": 3.966870223147707e-06, "loss": 0.81253457, "num_input_tokens_seen": 30603180, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.296875, "step": 1435, "time_per_iteration": 2.4435315132141113 }, { "auxiliary_loss_clip": 0.01076062, "auxiliary_loss_mlp": 0.01032434, "balance_loss_clip": 1.0290488, "balance_loss_mlp": 1.02665281, "epoch": 0.08633699083120397, "flos": 70184857772160.0, "grad_norm": 0.9712164809047884, "language_loss": 0.57957852, "learning_rate": 3.96679959182369e-06, "loss": 0.60066354, "num_input_tokens_seen": 30668895, "router_z_loss_clip": 0.03393555, "router_z_loss_mlp": 0.49414062, "step": 1436, "time_per_iteration": 3.208350419998169 }, { "auxiliary_loss_clip": 0.01184488, "auxiliary_loss_mlp": 0.01050044, "balance_loss_clip": 1.02734625, "balance_loss_mlp": 1.05378795, "epoch": 0.08639711408387193, "flos": 30298874140800.0, "grad_norm": 2.3212359854662985, "language_loss": 0.69138896, "learning_rate": 3.966728885918437e-06, "loss": 0.71373433, "num_input_tokens_seen": 30688955, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.3125, "step": 1437, "time_per_iteration": 2.547868251800537 }, { "auxiliary_loss_clip": 0.01182728, "auxiliary_loss_mlp": 0.01052648, "balance_loss_clip": 1.03114307, "balance_loss_mlp": 1.05278778, "epoch": 0.08645723733653991, "flos": 20297680663680.0, "grad_norm": 2.134099594667648, "language_loss": 0.7271893, "learning_rate": 3.966658105434627e-06, "loss": 0.74954307, "num_input_tokens_seen": 30706095, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.296875, "step": 1438, "time_per_iteration": 2.4324746131896973 }, { "auxiliary_loss_clip": 0.01181029, "auxiliary_loss_mlp": 0.01050688, "balance_loss_clip": 1.02878964, "balance_loss_mlp": 1.05559695, "epoch": 0.08651736058920788, "flos": 32890583134080.0, "grad_norm": 1.9149952515065702, "language_loss": 0.64499658, "learning_rate": 3.966587250374945e-06, "loss": 0.66731375, "num_input_tokens_seen": 30729025, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.25, "step": 1439, "time_per_iteration": 2.6195595264434814 }, { "auxiliary_loss_clip": 0.01184157, "auxiliary_loss_mlp": 0.01051289, "balance_loss_clip": 1.02892506, "balance_loss_mlp": 1.0558002, "epoch": 0.08657748384187584, "flos": 22637368857600.0, "grad_norm": 3.7442452199241814, "language_loss": 0.87570781, "learning_rate": 3.966516320742077e-06, "loss": 0.89806223, "num_input_tokens_seen": 30746155, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.28125, "step": 1440, "time_per_iteration": 2.469248056411743 }, { "auxiliary_loss_clip": 0.01187489, "auxiliary_loss_mlp": 0.01054426, "balance_loss_clip": 1.0313946, "balance_loss_mlp": 1.05453849, "epoch": 0.08663760709454381, "flos": 23658380951040.0, "grad_norm": 2.0438340337337553, "language_loss": 0.83974004, "learning_rate": 3.9664453165387124e-06, "loss": 0.86215925, "num_input_tokens_seen": 30761410, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.328125, "step": 1441, "time_per_iteration": 2.4995360374450684 }, { "auxiliary_loss_clip": 0.01066661, "auxiliary_loss_mlp": 0.01003918, "balance_loss_clip": 1.00081813, "balance_loss_mlp": 1.01824307, "epoch": 0.08669773034721179, "flos": 62686564911360.0, "grad_norm": 0.8524550144756052, "language_loss": 0.6046496, "learning_rate": 3.966374237767545e-06, "loss": 0.62535536, "num_input_tokens_seen": 30823010, "router_z_loss_clip": 0.03100586, "router_z_loss_mlp": 0.484375, "step": 1442, "time_per_iteration": 3.2165510654449463 }, { "auxiliary_loss_clip": 0.01185501, "auxiliary_loss_mlp": 0.01048801, "balance_loss_clip": 1.02700973, "balance_loss_mlp": 1.05355251, "epoch": 0.08675785359987975, "flos": 20667489137280.0, "grad_norm": 4.085326631137823, "language_loss": 0.79049236, "learning_rate": 3.96630308443127e-06, "loss": 0.81283534, "num_input_tokens_seen": 30841980, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.3125, "step": 1443, "time_per_iteration": 2.507537364959717 }, { "auxiliary_loss_clip": 0.01182269, "auxiliary_loss_mlp": 0.0104621, "balance_loss_clip": 1.02433467, "balance_loss_mlp": 1.05146599, "epoch": 0.08681797685254772, "flos": 26941118768640.0, "grad_norm": 1.6019741305487456, "language_loss": 0.82433784, "learning_rate": 3.966231856532584e-06, "loss": 0.84662259, "num_input_tokens_seen": 30863280, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.3046875, "step": 1444, "time_per_iteration": 2.544611692428589 }, { "auxiliary_loss_clip": 0.01185214, "auxiliary_loss_mlp": 0.01048256, "balance_loss_clip": 1.02614319, "balance_loss_mlp": 1.05356121, "epoch": 0.0868781001052157, "flos": 17712831168000.0, "grad_norm": 2.3601063122785813, "language_loss": 0.86561483, "learning_rate": 3.966160554074189e-06, "loss": 0.88794959, "num_input_tokens_seen": 30881710, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.3203125, "step": 1445, "time_per_iteration": 2.4608047008514404 }, { "auxiliary_loss_clip": 0.01184091, "auxiliary_loss_mlp": 0.01048087, "balance_loss_clip": 1.02821493, "balance_loss_mlp": 1.05626476, "epoch": 0.08693822335788366, "flos": 19896522595200.0, "grad_norm": 2.430005276769059, "language_loss": 0.81695306, "learning_rate": 3.96608917705879e-06, "loss": 0.83927488, "num_input_tokens_seen": 30900225, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.2734375, "step": 1446, "time_per_iteration": 2.4787824153900146 }, { "auxiliary_loss_clip": 0.01065363, "auxiliary_loss_mlp": 0.01005924, "balance_loss_clip": 1.00306284, "balance_loss_mlp": 1.01750612, "epoch": 0.08699834661055163, "flos": 67023747406080.0, "grad_norm": 0.7469457697337646, "language_loss": 0.54812789, "learning_rate": 3.966017725489091e-06, "loss": 0.56884074, "num_input_tokens_seen": 30959580, "router_z_loss_clip": 0.02856445, "router_z_loss_mlp": 0.47851562, "step": 1447, "time_per_iteration": 3.1677067279815674 }, { "auxiliary_loss_clip": 0.01176653, "auxiliary_loss_mlp": 0.01051448, "balance_loss_clip": 1.03106308, "balance_loss_mlp": 1.05173039, "epoch": 0.0870584698632196, "flos": 13480507451520.0, "grad_norm": 2.863355866337447, "language_loss": 0.84763885, "learning_rate": 3.965946199367804e-06, "loss": 0.86991984, "num_input_tokens_seen": 30976775, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.25, "step": 1448, "time_per_iteration": 2.4295554161071777 }, { "auxiliary_loss_clip": 0.01185161, "auxiliary_loss_mlp": 0.01051749, "balance_loss_clip": 1.03041053, "balance_loss_mlp": 1.05429721, "epoch": 0.08711859311588757, "flos": 16107013745280.0, "grad_norm": 12.040794927899368, "language_loss": 0.80507612, "learning_rate": 3.965874598697638e-06, "loss": 0.82744527, "num_input_tokens_seen": 30990495, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.3125, "step": 1449, "time_per_iteration": 2.440974235534668 }, { "auxiliary_loss_clip": 0.01180384, "auxiliary_loss_mlp": 0.01046127, "balance_loss_clip": 1.0250864, "balance_loss_mlp": 1.0544517, "epoch": 0.08717871636855554, "flos": 38472357928320.0, "grad_norm": 1.4992862073197546, "language_loss": 0.71003664, "learning_rate": 3.965802923481313e-06, "loss": 0.73230177, "num_input_tokens_seen": 31014080, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.2578125, "step": 1450, "time_per_iteration": 2.618800640106201 }, { "auxiliary_loss_clip": 0.0118223, "auxiliary_loss_mlp": 0.01050465, "balance_loss_clip": 1.02885282, "balance_loss_mlp": 1.05557215, "epoch": 0.0872388396212235, "flos": 17600574188160.0, "grad_norm": 3.9698268081058075, "language_loss": 0.83725226, "learning_rate": 3.965731173721542e-06, "loss": 0.85957921, "num_input_tokens_seen": 31031210, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.265625, "step": 1451, "time_per_iteration": 2.4504430294036865 }, { "auxiliary_loss_clip": 0.01177474, "auxiliary_loss_mlp": 0.01048885, "balance_loss_clip": 1.0289768, "balance_loss_mlp": 1.0527873, "epoch": 0.08729896287389148, "flos": 25259385951360.0, "grad_norm": 1.8736882384536455, "language_loss": 0.74179792, "learning_rate": 3.965659349421049e-06, "loss": 0.76406151, "num_input_tokens_seen": 31049710, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.25, "step": 1452, "time_per_iteration": 2.47497820854187 }, { "auxiliary_loss_clip": 0.01184712, "auxiliary_loss_mlp": 0.01058049, "balance_loss_clip": 1.03528023, "balance_loss_mlp": 1.05410266, "epoch": 0.08735908612655945, "flos": 15632454234240.0, "grad_norm": 4.509156282940375, "language_loss": 0.795627, "learning_rate": 3.965587450582556e-06, "loss": 0.81805468, "num_input_tokens_seen": 31066160, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.3125, "step": 1453, "time_per_iteration": 2.456409454345703 }, { "auxiliary_loss_clip": 0.01181349, "auxiliary_loss_mlp": 0.01056957, "balance_loss_clip": 1.03536844, "balance_loss_mlp": 1.05432582, "epoch": 0.08741920937922741, "flos": 20339660684160.0, "grad_norm": 1.9034279688054887, "language_loss": 0.71171224, "learning_rate": 3.96551547720879e-06, "loss": 0.73409534, "num_input_tokens_seen": 31085270, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.265625, "step": 1454, "time_per_iteration": 2.45811128616333 }, { "auxiliary_loss_clip": 0.01065615, "auxiliary_loss_mlp": 0.0100892, "balance_loss_clip": 1.00651205, "balance_loss_mlp": 1.0186125, "epoch": 0.08747933263189539, "flos": 62819795433600.0, "grad_norm": 0.7917602147483741, "language_loss": 0.58639348, "learning_rate": 3.96544342930248e-06, "loss": 0.60713875, "num_input_tokens_seen": 31148445, "router_z_loss_clip": 0.02404785, "router_z_loss_mlp": 0.46875, "step": 1455, "time_per_iteration": 3.1103055477142334 }, { "auxiliary_loss_clip": 0.01180901, "auxiliary_loss_mlp": 0.01054874, "balance_loss_clip": 1.03295183, "balance_loss_mlp": 1.05265999, "epoch": 0.08753945588456336, "flos": 33035877648000.0, "grad_norm": 1.5506458885871774, "language_loss": 0.773215, "learning_rate": 3.965371306866359e-06, "loss": 0.79557276, "num_input_tokens_seen": 31168770, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.28125, "step": 1456, "time_per_iteration": 2.5570013523101807 }, { "auxiliary_loss_clip": 0.01181319, "auxiliary_loss_mlp": 0.01047118, "balance_loss_clip": 1.02630365, "balance_loss_mlp": 1.05345106, "epoch": 0.08759957913723132, "flos": 35547182046720.0, "grad_norm": 1.9890534969455542, "language_loss": 0.72142172, "learning_rate": 3.96529910990316e-06, "loss": 0.74370605, "num_input_tokens_seen": 31189270, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.28125, "step": 1457, "time_per_iteration": 2.5660200119018555 }, { "auxiliary_loss_clip": 0.0117668, "auxiliary_loss_mlp": 0.01040875, "balance_loss_clip": 1.02090704, "balance_loss_mlp": 1.05244446, "epoch": 0.0876597023898993, "flos": 23911120022400.0, "grad_norm": 1.5497935716669582, "language_loss": 0.86492217, "learning_rate": 3.965226838415622e-06, "loss": 0.88709772, "num_input_tokens_seen": 31210385, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.2421875, "step": 1458, "time_per_iteration": 2.5651919841766357 }, { "auxiliary_loss_clip": 0.01185635, "auxiliary_loss_mlp": 0.01058071, "balance_loss_clip": 1.03623223, "balance_loss_mlp": 1.05746377, "epoch": 0.08771982564256726, "flos": 18114025150080.0, "grad_norm": 1.8792202124996749, "language_loss": 0.80528623, "learning_rate": 3.965154492406486e-06, "loss": 0.82772326, "num_input_tokens_seen": 31229745, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.28125, "step": 1459, "time_per_iteration": 2.4742043018341064 }, { "auxiliary_loss_clip": 0.01186143, "auxiliary_loss_mlp": 0.01049769, "balance_loss_clip": 1.02814436, "balance_loss_mlp": 1.0553273, "epoch": 0.08777994889523523, "flos": 17712005155200.0, "grad_norm": 2.094108033109641, "language_loss": 0.84023839, "learning_rate": 3.9650820718784945e-06, "loss": 0.86259747, "num_input_tokens_seen": 31248280, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.3125, "step": 1460, "time_per_iteration": 2.4539310932159424 }, { "auxiliary_loss_clip": 0.0117679, "auxiliary_loss_mlp": 0.01048509, "balance_loss_clip": 1.02821934, "balance_loss_mlp": 1.05066025, "epoch": 0.0878400721479032, "flos": 12819930382080.0, "grad_norm": 6.592371818905983, "language_loss": 0.80863464, "learning_rate": 3.965009576834394e-06, "loss": 0.83088756, "num_input_tokens_seen": 31262190, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.2578125, "step": 1461, "time_per_iteration": 2.4368412494659424 }, { "auxiliary_loss_clip": 0.01185039, "auxiliary_loss_mlp": 0.01054974, "balance_loss_clip": 1.03421938, "balance_loss_mlp": 1.05589187, "epoch": 0.08790019540057117, "flos": 26392690938240.0, "grad_norm": 2.2464981866168316, "language_loss": 0.76692545, "learning_rate": 3.964937007276932e-06, "loss": 0.78932559, "num_input_tokens_seen": 31283690, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.296875, "step": 1462, "time_per_iteration": 2.5124945640563965 }, { "auxiliary_loss_clip": 0.01185275, "auxiliary_loss_mlp": 0.01050664, "balance_loss_clip": 1.0279901, "balance_loss_mlp": 1.05529761, "epoch": 0.08796031865323914, "flos": 19134031662720.0, "grad_norm": 2.5106900114751136, "language_loss": 0.74314213, "learning_rate": 3.9648643632088634e-06, "loss": 0.7655015, "num_input_tokens_seen": 31302505, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.296875, "step": 1463, "time_per_iteration": 2.4846906661987305 }, { "auxiliary_loss_clip": 0.01187939, "auxiliary_loss_mlp": 0.01057902, "balance_loss_clip": 1.03487062, "balance_loss_mlp": 1.0550127, "epoch": 0.0880204419059071, "flos": 26064287867520.0, "grad_norm": 2.081595549920921, "language_loss": 0.83582723, "learning_rate": 3.964791644632941e-06, "loss": 0.85828567, "num_input_tokens_seen": 31323070, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.328125, "step": 1464, "time_per_iteration": 2.503065347671509 }, { "auxiliary_loss_clip": 0.01181521, "auxiliary_loss_mlp": 0.01053369, "balance_loss_clip": 1.03182781, "balance_loss_mlp": 1.05454707, "epoch": 0.08808056515857508, "flos": 22377842115840.0, "grad_norm": 1.9378560426156535, "language_loss": 0.7823723, "learning_rate": 3.964718851551923e-06, "loss": 0.80472124, "num_input_tokens_seen": 31341880, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.265625, "step": 1465, "time_per_iteration": 2.482527732849121 }, { "auxiliary_loss_clip": 0.01185967, "auxiliary_loss_mlp": 0.0105002, "balance_loss_clip": 1.02928948, "balance_loss_mlp": 1.05447698, "epoch": 0.08814068841124305, "flos": 23185293897600.0, "grad_norm": 1.843226444096735, "language_loss": 0.85002899, "learning_rate": 3.9646459839685675e-06, "loss": 0.87238884, "num_input_tokens_seen": 31361995, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.3125, "step": 1466, "time_per_iteration": 2.468902826309204 }, { "auxiliary_loss_clip": 0.01179645, "auxiliary_loss_mlp": 0.0105043, "balance_loss_clip": 1.02850759, "balance_loss_mlp": 1.05220151, "epoch": 0.08820081166391101, "flos": 25155281358720.0, "grad_norm": 2.256448940381805, "language_loss": 0.83872008, "learning_rate": 3.964573041885641e-06, "loss": 0.86102086, "num_input_tokens_seen": 31381515, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.2734375, "step": 1467, "time_per_iteration": 2.4876251220703125 }, { "auxiliary_loss_clip": 0.01179337, "auxiliary_loss_mlp": 0.01046505, "balance_loss_clip": 1.02532113, "balance_loss_mlp": 1.05200815, "epoch": 0.08826093491657899, "flos": 22231685675520.0, "grad_norm": 1.8512513363726268, "language_loss": 0.75754738, "learning_rate": 3.964500025305907e-06, "loss": 0.77980578, "num_input_tokens_seen": 31400345, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.2734375, "step": 1468, "time_per_iteration": 2.455698013305664 }, { "auxiliary_loss_clip": 0.01178691, "auxiliary_loss_mlp": 0.01045777, "balance_loss_clip": 1.02610707, "balance_loss_mlp": 1.05291402, "epoch": 0.08832105816924696, "flos": 22126826897280.0, "grad_norm": 1.8047935451742942, "language_loss": 0.80055875, "learning_rate": 3.9644269342321355e-06, "loss": 0.8228035, "num_input_tokens_seen": 31419620, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.2578125, "step": 1469, "time_per_iteration": 2.4828379154205322 }, { "auxiliary_loss_clip": 0.01182137, "auxiliary_loss_mlp": 0.01048252, "balance_loss_clip": 1.02758121, "balance_loss_mlp": 1.05304241, "epoch": 0.08838118142191492, "flos": 17566495159680.0, "grad_norm": 2.1100529289394467, "language_loss": 0.77397621, "learning_rate": 3.9643537686670974e-06, "loss": 0.79628015, "num_input_tokens_seen": 31437970, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.2890625, "step": 1470, "time_per_iteration": 2.4519219398498535 }, { "auxiliary_loss_clip": 0.01176966, "auxiliary_loss_mlp": 0.01052836, "balance_loss_clip": 1.03067517, "balance_loss_mlp": 1.05241179, "epoch": 0.0884413046745829, "flos": 20777196251520.0, "grad_norm": 2.0405573078809116, "language_loss": 0.8402223, "learning_rate": 3.964280528613569e-06, "loss": 0.8625204, "num_input_tokens_seen": 31457040, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.25, "step": 1471, "time_per_iteration": 2.4843950271606445 }, { "auxiliary_loss_clip": 0.01175062, "auxiliary_loss_mlp": 0.0104471, "balance_loss_clip": 1.02581561, "balance_loss_mlp": 1.05456066, "epoch": 0.08850142792725087, "flos": 22125462180480.0, "grad_norm": 1.7514680200475938, "language_loss": 0.83393067, "learning_rate": 3.964207214074324e-06, "loss": 0.85612839, "num_input_tokens_seen": 31477520, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.203125, "step": 1472, "time_per_iteration": 2.4830262660980225 }, { "auxiliary_loss_clip": 0.01181893, "auxiliary_loss_mlp": 0.01050678, "balance_loss_clip": 1.02975678, "balance_loss_mlp": 1.05537009, "epoch": 0.08856155117991883, "flos": 22418744728320.0, "grad_norm": 3.3507329047570513, "language_loss": 0.82414484, "learning_rate": 3.964133825052146e-06, "loss": 0.84647059, "num_input_tokens_seen": 31495575, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.265625, "step": 1473, "time_per_iteration": 5.4242942333221436 }, { "auxiliary_loss_clip": 0.01177248, "auxiliary_loss_mlp": 0.01052553, "balance_loss_clip": 1.03266931, "balance_loss_mlp": 1.05079865, "epoch": 0.0886216744325868, "flos": 29937002572800.0, "grad_norm": 1.607217435095721, "language_loss": 0.78545129, "learning_rate": 3.964060361549816e-06, "loss": 0.80774939, "num_input_tokens_seen": 31520020, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.265625, "step": 1474, "time_per_iteration": 3.9238109588623047 }, { "auxiliary_loss_clip": 0.01177636, "auxiliary_loss_mlp": 0.01052858, "balance_loss_clip": 1.0321157, "balance_loss_mlp": 1.05371988, "epoch": 0.08868179768525478, "flos": 23982833525760.0, "grad_norm": 1.7613335494587283, "language_loss": 0.78904152, "learning_rate": 3.963986823570121e-06, "loss": 0.81134647, "num_input_tokens_seen": 31539265, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.234375, "step": 1475, "time_per_iteration": 2.500107765197754 }, { "auxiliary_loss_clip": 0.01179057, "auxiliary_loss_mlp": 0.01045469, "balance_loss_clip": 1.0246315, "balance_loss_mlp": 1.05294418, "epoch": 0.08874192093792274, "flos": 43177553216640.0, "grad_norm": 1.5727370294556926, "language_loss": 0.74048388, "learning_rate": 3.963913211115848e-06, "loss": 0.76272917, "num_input_tokens_seen": 31563425, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.265625, "step": 1476, "time_per_iteration": 2.6608147621154785 }, { "auxiliary_loss_clip": 0.01181022, "auxiliary_loss_mlp": 0.01056996, "balance_loss_clip": 1.03630078, "balance_loss_mlp": 1.05405402, "epoch": 0.0888020441905907, "flos": 32852445868800.0, "grad_norm": 1.494103805007143, "language_loss": 0.74086833, "learning_rate": 3.9638395241897895e-06, "loss": 0.76324844, "num_input_tokens_seen": 31584525, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.2734375, "step": 1477, "time_per_iteration": 2.6021158695220947 }, { "auxiliary_loss_clip": 0.01178957, "auxiliary_loss_mlp": 0.01048266, "balance_loss_clip": 1.02682054, "balance_loss_mlp": 1.0535109, "epoch": 0.08886216744325869, "flos": 23149347361920.0, "grad_norm": 2.0322201897557504, "language_loss": 0.86802095, "learning_rate": 3.963765762794739e-06, "loss": 0.89029324, "num_input_tokens_seen": 31603325, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.2578125, "step": 1478, "time_per_iteration": 2.472676992416382 }, { "auxiliary_loss_clip": 0.01176677, "auxiliary_loss_mlp": 0.01049664, "balance_loss_clip": 1.02918386, "balance_loss_mlp": 1.05199611, "epoch": 0.08892229069592665, "flos": 23331593992320.0, "grad_norm": 2.6457487302860754, "language_loss": 0.77743101, "learning_rate": 3.963691926933495e-06, "loss": 0.79969442, "num_input_tokens_seen": 31624820, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.25, "step": 1479, "time_per_iteration": 2.5002472400665283 }, { "auxiliary_loss_clip": 0.01177471, "auxiliary_loss_mlp": 0.01049525, "balance_loss_clip": 1.02736449, "balance_loss_mlp": 1.05181074, "epoch": 0.08898241394859462, "flos": 26213784272640.0, "grad_norm": 2.639682371616482, "language_loss": 0.78101122, "learning_rate": 3.9636180166088555e-06, "loss": 0.80328113, "num_input_tokens_seen": 31646080, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.2578125, "step": 1480, "time_per_iteration": 2.498056411743164 }, { "auxiliary_loss_clip": 0.01183516, "auxiliary_loss_mlp": 0.01053752, "balance_loss_clip": 1.03120947, "balance_loss_mlp": 1.05382931, "epoch": 0.0890425372012626, "flos": 23550613171200.0, "grad_norm": 1.6685408375277933, "language_loss": 0.66494775, "learning_rate": 3.963544031823624e-06, "loss": 0.68732035, "num_input_tokens_seen": 31665770, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 1.296875, "step": 1481, "time_per_iteration": 2.5184857845306396 }, { "auxiliary_loss_clip": 0.01179713, "auxiliary_loss_mlp": 0.01042315, "balance_loss_clip": 1.02260983, "balance_loss_mlp": 1.0544858, "epoch": 0.08910266045393056, "flos": 23002795872000.0, "grad_norm": 2.020676614614525, "language_loss": 0.96522403, "learning_rate": 3.9634699725806065e-06, "loss": 0.98744428, "num_input_tokens_seen": 31683805, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.25, "step": 1482, "time_per_iteration": 2.47959566116333 }, { "auxiliary_loss_clip": 0.01185996, "auxiliary_loss_mlp": 0.01051687, "balance_loss_clip": 1.03051496, "balance_loss_mlp": 1.05536342, "epoch": 0.08916278370659853, "flos": 31936508035200.0, "grad_norm": 1.8539000598359534, "language_loss": 0.78510422, "learning_rate": 3.96339583888261e-06, "loss": 0.80748105, "num_input_tokens_seen": 31704630, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.3046875, "step": 1483, "time_per_iteration": 2.5782434940338135 }, { "auxiliary_loss_clip": 0.01180822, "auxiliary_loss_mlp": 0.01072027, "balance_loss_clip": 1.05015182, "balance_loss_mlp": 1.05383897, "epoch": 0.08922290695926649, "flos": 17530404969600.0, "grad_norm": 2.539524242752539, "language_loss": 0.85645956, "learning_rate": 3.963321630732448e-06, "loss": 0.87898815, "num_input_tokens_seen": 31723255, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.265625, "step": 1484, "time_per_iteration": 2.4379465579986572 }, { "auxiliary_loss_clip": 0.01187354, "auxiliary_loss_mlp": 0.01055456, "balance_loss_clip": 1.03417706, "balance_loss_mlp": 1.05657136, "epoch": 0.08928303021193447, "flos": 32125075459200.0, "grad_norm": 2.202441904897864, "language_loss": 0.80294895, "learning_rate": 3.963247348132932e-06, "loss": 0.82537705, "num_input_tokens_seen": 31747045, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.3125, "step": 1485, "time_per_iteration": 2.5910375118255615 }, { "auxiliary_loss_clip": 0.01180365, "auxiliary_loss_mlp": 0.01051056, "balance_loss_clip": 1.03007555, "balance_loss_mlp": 1.05307305, "epoch": 0.08934315346460243, "flos": 22125210785280.0, "grad_norm": 1.6603718869147768, "language_loss": 0.82862139, "learning_rate": 3.96317299108688e-06, "loss": 0.85093558, "num_input_tokens_seen": 31766615, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.2734375, "step": 1486, "time_per_iteration": 2.4724578857421875 }, { "auxiliary_loss_clip": 0.01181314, "auxiliary_loss_mlp": 0.01059029, "balance_loss_clip": 1.03705895, "balance_loss_mlp": 1.05447483, "epoch": 0.0894032767172704, "flos": 22565583527040.0, "grad_norm": 1.9407073357435747, "language_loss": 0.763044, "learning_rate": 3.963098559597111e-06, "loss": 0.78544742, "num_input_tokens_seen": 31785855, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.265625, "step": 1487, "time_per_iteration": 2.48809552192688 }, { "auxiliary_loss_clip": 0.01178555, "auxiliary_loss_mlp": 0.01054849, "balance_loss_clip": 1.03283143, "balance_loss_mlp": 1.05273318, "epoch": 0.08946339996993838, "flos": 20193396503040.0, "grad_norm": 2.1715901011168257, "language_loss": 0.82871926, "learning_rate": 3.963024053666449e-06, "loss": 0.85105324, "num_input_tokens_seen": 31804210, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.2578125, "step": 1488, "time_per_iteration": 2.4583404064178467 }, { "auxiliary_loss_clip": 0.0117764, "auxiliary_loss_mlp": 0.0104236, "balance_loss_clip": 1.02270186, "balance_loss_mlp": 1.05300951, "epoch": 0.08952352322260634, "flos": 48360181104000.0, "grad_norm": 3.3793437702725733, "language_loss": 0.71643388, "learning_rate": 3.962949473297718e-06, "loss": 0.73863387, "num_input_tokens_seen": 31826150, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.25, "step": 1489, "time_per_iteration": 2.703153133392334 }, { "auxiliary_loss_clip": 0.01175935, "auxiliary_loss_mlp": 0.01046571, "balance_loss_clip": 1.02591217, "balance_loss_mlp": 1.05009472, "epoch": 0.08958364647527431, "flos": 31793081028480.0, "grad_norm": 1.9929596181970584, "language_loss": 0.89356017, "learning_rate": 3.962874818493745e-06, "loss": 0.91578525, "num_input_tokens_seen": 31848060, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.2578125, "step": 1490, "time_per_iteration": 2.5377275943756104 }, { "auxiliary_loss_clip": 0.01184774, "auxiliary_loss_mlp": 0.01057488, "balance_loss_clip": 1.03668606, "balance_loss_mlp": 1.05326653, "epoch": 0.08964376972794229, "flos": 23368186972800.0, "grad_norm": 2.266481945079257, "language_loss": 0.73828775, "learning_rate": 3.9628000892573635e-06, "loss": 0.76071036, "num_input_tokens_seen": 31870040, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.3125, "step": 1491, "time_per_iteration": 2.5106661319732666 }, { "auxiliary_loss_clip": 0.01181447, "auxiliary_loss_mlp": 0.01045514, "balance_loss_clip": 1.02621412, "balance_loss_mlp": 1.05666018, "epoch": 0.08970389298061025, "flos": 23294785530240.0, "grad_norm": 2.873954995339997, "language_loss": 0.77007526, "learning_rate": 3.9627252855914055e-06, "loss": 0.79234487, "num_input_tokens_seen": 31890400, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.25, "step": 1492, "time_per_iteration": 2.4886269569396973 }, { "auxiliary_loss_clip": 0.01176337, "auxiliary_loss_mlp": 0.01050369, "balance_loss_clip": 1.03005624, "balance_loss_mlp": 1.05370557, "epoch": 0.08976401623327822, "flos": 33761703772800.0, "grad_norm": 2.569402486900136, "language_loss": 0.71219224, "learning_rate": 3.962650407498707e-06, "loss": 0.73445934, "num_input_tokens_seen": 31913435, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.2265625, "step": 1493, "time_per_iteration": 2.6255836486816406 }, { "auxiliary_loss_clip": 0.01176324, "auxiliary_loss_mlp": 0.01055085, "balance_loss_clip": 1.03373468, "balance_loss_mlp": 1.05138373, "epoch": 0.08982413948594618, "flos": 23911335504000.0, "grad_norm": 1.6366303596109635, "language_loss": 0.86949182, "learning_rate": 3.962575454982109e-06, "loss": 0.89180601, "num_input_tokens_seen": 31932435, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.25, "step": 1494, "time_per_iteration": 2.476994752883911 }, { "auxiliary_loss_clip": 0.01173498, "auxiliary_loss_mlp": 0.01058616, "balance_loss_clip": 1.03724217, "balance_loss_mlp": 1.05043232, "epoch": 0.08988426273861416, "flos": 16837544551680.0, "grad_norm": 1.9598817557931156, "language_loss": 0.83257985, "learning_rate": 3.962500428044454e-06, "loss": 0.85490096, "num_input_tokens_seen": 31950125, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.234375, "step": 1495, "time_per_iteration": 2.4648780822753906 }, { "auxiliary_loss_clip": 0.01186002, "auxiliary_loss_mlp": 0.01053513, "balance_loss_clip": 1.03286588, "balance_loss_mlp": 1.05886745, "epoch": 0.08994438599128213, "flos": 14793365548800.0, "grad_norm": 2.1166253755012603, "language_loss": 0.70151472, "learning_rate": 3.962425326688585e-06, "loss": 0.72390985, "num_input_tokens_seen": 31968050, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.265625, "step": 1496, "time_per_iteration": 2.4335885047912598 }, { "auxiliary_loss_clip": 0.01173808, "auxiliary_loss_mlp": 0.01046807, "balance_loss_clip": 1.02786446, "balance_loss_mlp": 1.05012655, "epoch": 0.09000450924395009, "flos": 17384320356480.0, "grad_norm": 1.6240654886621193, "language_loss": 0.80103052, "learning_rate": 3.962350150917351e-06, "loss": 0.8232367, "num_input_tokens_seen": 31985675, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.234375, "step": 1497, "time_per_iteration": 2.451899766921997 }, { "auxiliary_loss_clip": 0.01183119, "auxiliary_loss_mlp": 0.01051656, "balance_loss_clip": 1.02990067, "balance_loss_mlp": 1.0527029, "epoch": 0.09006463249661807, "flos": 24280317964800.0, "grad_norm": 2.0792350307124146, "language_loss": 0.82353717, "learning_rate": 3.9622749007336035e-06, "loss": 0.84588486, "num_input_tokens_seen": 32005180, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.3046875, "step": 1498, "time_per_iteration": 2.496817111968994 }, { "auxiliary_loss_clip": 0.01182172, "auxiliary_loss_mlp": 0.01055986, "balance_loss_clip": 1.03573251, "balance_loss_mlp": 1.05434561, "epoch": 0.09012475574928604, "flos": 13661928069120.0, "grad_norm": 2.657723435312474, "language_loss": 0.78727698, "learning_rate": 3.962199576140195e-06, "loss": 0.80965865, "num_input_tokens_seen": 32022970, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.28125, "step": 1499, "time_per_iteration": 2.451822519302368 }, { "auxiliary_loss_clip": 0.01175016, "auxiliary_loss_mlp": 0.01050324, "balance_loss_clip": 1.02999878, "balance_loss_mlp": 1.05283058, "epoch": 0.090184879001954, "flos": 23327751237120.0, "grad_norm": 1.664527342652402, "language_loss": 0.93176544, "learning_rate": 3.962124177139981e-06, "loss": 0.95401883, "num_input_tokens_seen": 32043055, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.2265625, "step": 1500, "time_per_iteration": 2.4744584560394287 }, { "auxiliary_loss_clip": 0.01179773, "auxiliary_loss_mlp": 0.01046708, "balance_loss_clip": 1.02439237, "balance_loss_mlp": 1.05238438, "epoch": 0.09024500225462198, "flos": 23002688131200.0, "grad_norm": 2.104893283575632, "language_loss": 0.74548781, "learning_rate": 3.962048703735822e-06, "loss": 0.76775265, "num_input_tokens_seen": 32061900, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.2734375, "step": 1501, "time_per_iteration": 2.4886586666107178 }, { "auxiliary_loss_clip": 0.01072201, "auxiliary_loss_mlp": 0.0100149, "balance_loss_clip": 0.99907035, "balance_loss_mlp": 1.02552521, "epoch": 0.09030512550728995, "flos": 62189203242240.0, "grad_norm": 0.8424903931831339, "language_loss": 0.58331662, "learning_rate": 3.96197315593058e-06, "loss": 0.60405356, "num_input_tokens_seen": 32122745, "router_z_loss_clip": 0.02416992, "router_z_loss_mlp": 0.46679688, "step": 1502, "time_per_iteration": 3.1043736934661865 }, { "auxiliary_loss_clip": 0.01173601, "auxiliary_loss_mlp": 0.01050305, "balance_loss_clip": 1.03057599, "balance_loss_mlp": 1.05067134, "epoch": 0.09036524875995791, "flos": 38800689171840.0, "grad_norm": 2.2960669935042657, "language_loss": 0.69492882, "learning_rate": 3.961897533727119e-06, "loss": 0.71716791, "num_input_tokens_seen": 32145125, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.2265625, "step": 1503, "time_per_iteration": 2.631666898727417 }, { "auxiliary_loss_clip": 0.01180277, "auxiliary_loss_mlp": 0.01060334, "balance_loss_clip": 1.03990173, "balance_loss_mlp": 1.05294037, "epoch": 0.09042537201262588, "flos": 21690081429120.0, "grad_norm": 2.22555011557249, "language_loss": 0.86416095, "learning_rate": 3.961821837128306e-06, "loss": 0.88656706, "num_input_tokens_seen": 32166255, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.2734375, "step": 1504, "time_per_iteration": 2.481149911880493 }, { "auxiliary_loss_clip": 0.01187343, "auxiliary_loss_mlp": 0.01061588, "balance_loss_clip": 1.03748417, "balance_loss_mlp": 1.05508804, "epoch": 0.09048549526529386, "flos": 22267021680000.0, "grad_norm": 1.9796539513317735, "language_loss": 0.72492456, "learning_rate": 3.961746066137014e-06, "loss": 0.74741387, "num_input_tokens_seen": 32184010, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.3203125, "step": 1505, "time_per_iteration": 2.490039348602295 }, { "auxiliary_loss_clip": 0.01177708, "auxiliary_loss_mlp": 0.01048728, "balance_loss_clip": 1.02786648, "balance_loss_mlp": 1.05460322, "epoch": 0.09054561851796182, "flos": 14610939350400.0, "grad_norm": 3.774910420190476, "language_loss": 0.80837274, "learning_rate": 3.961670220756114e-06, "loss": 0.83063704, "num_input_tokens_seen": 32201635, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.2265625, "step": 1506, "time_per_iteration": 2.431644916534424 }, { "auxiliary_loss_clip": 0.01176379, "auxiliary_loss_mlp": 0.01048186, "balance_loss_clip": 1.02848113, "balance_loss_mlp": 1.0529983, "epoch": 0.09060574177062979, "flos": 27636169916160.0, "grad_norm": 3.027821031932893, "language_loss": 0.76158607, "learning_rate": 3.961594300988482e-06, "loss": 0.78383172, "num_input_tokens_seen": 32221940, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.234375, "step": 1507, "time_per_iteration": 2.530848264694214 }, { "auxiliary_loss_clip": 0.01073378, "auxiliary_loss_mlp": 0.01004545, "balance_loss_clip": 1.00192261, "balance_loss_mlp": 1.02738571, "epoch": 0.09066586502329776, "flos": 66085797513600.0, "grad_norm": 0.7632470651530601, "language_loss": 0.57664943, "learning_rate": 3.961518306836998e-06, "loss": 0.59742868, "num_input_tokens_seen": 32276495, "router_z_loss_clip": 0.02624512, "router_z_loss_mlp": 0.4609375, "step": 1508, "time_per_iteration": 2.9799728393554688 }, { "auxiliary_loss_clip": 0.01180924, "auxiliary_loss_mlp": 0.01046303, "balance_loss_clip": 1.0250957, "balance_loss_mlp": 1.0542295, "epoch": 0.09072598827596573, "flos": 18916449027840.0, "grad_norm": 1.9325122064664129, "language_loss": 0.84912252, "learning_rate": 3.961442238304543e-06, "loss": 0.87139475, "num_input_tokens_seen": 32294130, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.265625, "step": 1509, "time_per_iteration": 2.4493160247802734 }, { "auxiliary_loss_clip": 0.0118752, "auxiliary_loss_mlp": 0.01062839, "balance_loss_clip": 1.04073727, "balance_loss_mlp": 1.05631518, "epoch": 0.0907861115286337, "flos": 24821742643200.0, "grad_norm": 3.148663231681924, "language_loss": 0.84371674, "learning_rate": 3.961366095394002e-06, "loss": 0.86622036, "num_input_tokens_seen": 32313555, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.3125, "step": 1510, "time_per_iteration": 2.5023326873779297 }, { "auxiliary_loss_clip": 0.01181228, "auxiliary_loss_mlp": 0.01051738, "balance_loss_clip": 1.03068542, "balance_loss_mlp": 1.05425763, "epoch": 0.09084623478130167, "flos": 21652842003840.0, "grad_norm": 2.0067733929739493, "language_loss": 0.85354716, "learning_rate": 3.961289878108262e-06, "loss": 0.87587678, "num_input_tokens_seen": 32331430, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.2734375, "step": 1511, "time_per_iteration": 2.461904764175415 }, { "auxiliary_loss_clip": 0.01177793, "auxiliary_loss_mlp": 0.0104799, "balance_loss_clip": 1.02659178, "balance_loss_mlp": 1.05371737, "epoch": 0.09090635803396964, "flos": 27639258485760.0, "grad_norm": 1.5179715645637044, "language_loss": 0.84856421, "learning_rate": 3.9612135864502135e-06, "loss": 0.87082207, "num_input_tokens_seen": 32353705, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.2421875, "step": 1512, "time_per_iteration": 2.555875539779663 }, { "auxiliary_loss_clip": 0.01173869, "auxiliary_loss_mlp": 0.01047803, "balance_loss_clip": 1.02791882, "balance_loss_mlp": 1.05193305, "epoch": 0.0909664812866376, "flos": 17669127294720.0, "grad_norm": 2.4334091762944166, "language_loss": 0.86770993, "learning_rate": 3.961137220422749e-06, "loss": 0.88992667, "num_input_tokens_seen": 32370520, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.21875, "step": 1513, "time_per_iteration": 2.4110279083251953 }, { "auxiliary_loss_clip": 0.01177062, "auxiliary_loss_mlp": 0.01043396, "balance_loss_clip": 1.0239172, "balance_loss_mlp": 1.05323887, "epoch": 0.09102660453930557, "flos": 23951448017280.0, "grad_norm": 1.8401702222513878, "language_loss": 0.86520797, "learning_rate": 3.961060780028764e-06, "loss": 0.88741255, "num_input_tokens_seen": 32389105, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.2421875, "step": 1514, "time_per_iteration": 2.5133614540100098 }, { "auxiliary_loss_clip": 0.01176726, "auxiliary_loss_mlp": 0.0105314, "balance_loss_clip": 1.0335784, "balance_loss_mlp": 1.05379045, "epoch": 0.09108672779197355, "flos": 25812949426560.0, "grad_norm": 2.30265915554896, "language_loss": 0.90078968, "learning_rate": 3.960984265271159e-06, "loss": 0.92308837, "num_input_tokens_seen": 32408065, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.234375, "step": 1515, "time_per_iteration": 5.5595009326934814 }, { "auxiliary_loss_clip": 0.01177211, "auxiliary_loss_mlp": 0.01048288, "balance_loss_clip": 1.02675915, "balance_loss_mlp": 1.05263031, "epoch": 0.09114685104464151, "flos": 29639482220160.0, "grad_norm": 2.2896274199071804, "language_loss": 0.85347199, "learning_rate": 3.9609076761528335e-06, "loss": 0.87572694, "num_input_tokens_seen": 32427225, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.25, "step": 1516, "time_per_iteration": 3.931147336959839 }, { "auxiliary_loss_clip": 0.01182435, "auxiliary_loss_mlp": 0.01049941, "balance_loss_clip": 1.0288527, "balance_loss_mlp": 1.05316591, "epoch": 0.09120697429730948, "flos": 33729635905920.0, "grad_norm": 1.5023307876081378, "language_loss": 0.81067342, "learning_rate": 3.960831012676692e-06, "loss": 0.8329972, "num_input_tokens_seen": 32450510, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.296875, "step": 1517, "time_per_iteration": 2.5848770141601562 }, { "auxiliary_loss_clip": 0.01183654, "auxiliary_loss_mlp": 0.01060237, "balance_loss_clip": 1.03846908, "balance_loss_mlp": 1.05555606, "epoch": 0.09126709754997746, "flos": 18401381953920.0, "grad_norm": 1.617948999683787, "language_loss": 0.77864689, "learning_rate": 3.960754274845642e-06, "loss": 0.80108583, "num_input_tokens_seen": 32468425, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.28125, "step": 1518, "time_per_iteration": 2.440779685974121 }, { "auxiliary_loss_clip": 0.01180398, "auxiliary_loss_mlp": 0.0105445, "balance_loss_clip": 1.03383899, "balance_loss_mlp": 1.05469465, "epoch": 0.09132722080264542, "flos": 22091957769600.0, "grad_norm": 1.8113291494035562, "language_loss": 0.86220688, "learning_rate": 3.960677462662594e-06, "loss": 0.88455534, "num_input_tokens_seen": 32487510, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.25, "step": 1519, "time_per_iteration": 2.4900858402252197 }, { "auxiliary_loss_clip": 0.01180186, "auxiliary_loss_mlp": 0.0104788, "balance_loss_clip": 1.02542114, "balance_loss_mlp": 1.05320239, "epoch": 0.09138734405531339, "flos": 21033131633280.0, "grad_norm": 2.14893293809731, "language_loss": 0.73414564, "learning_rate": 3.96060057613046e-06, "loss": 0.75642622, "num_input_tokens_seen": 32507250, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.265625, "step": 1520, "time_per_iteration": 2.476055145263672 }, { "auxiliary_loss_clip": 0.01184073, "auxiliary_loss_mlp": 0.01047784, "balance_loss_clip": 1.02613544, "balance_loss_mlp": 1.05637968, "epoch": 0.09144746730798137, "flos": 20083940784000.0, "grad_norm": 2.919221971885584, "language_loss": 0.85459, "learning_rate": 3.960523615252156e-06, "loss": 0.87690866, "num_input_tokens_seen": 32526045, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.28125, "step": 1521, "time_per_iteration": 2.4743120670318604 }, { "auxiliary_loss_clip": 0.01183371, "auxiliary_loss_mlp": 0.0105374, "balance_loss_clip": 1.03193712, "balance_loss_mlp": 1.05513382, "epoch": 0.09150759056064933, "flos": 22778210085120.0, "grad_norm": 2.6216900461512105, "language_loss": 0.84022385, "learning_rate": 3.960446580030599e-06, "loss": 0.86259496, "num_input_tokens_seen": 32546575, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.28125, "step": 1522, "time_per_iteration": 2.471815586090088 }, { "auxiliary_loss_clip": 0.01174001, "auxiliary_loss_mlp": 0.01055323, "balance_loss_clip": 1.03357935, "balance_loss_mlp": 1.05303836, "epoch": 0.0915677138133173, "flos": 27564205017600.0, "grad_norm": 2.602784784288194, "language_loss": 0.80904508, "learning_rate": 3.960369470468711e-06, "loss": 0.83133835, "num_input_tokens_seen": 32568795, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.2109375, "step": 1523, "time_per_iteration": 2.5367259979248047 }, { "auxiliary_loss_clip": 0.01183426, "auxiliary_loss_mlp": 0.01056224, "balance_loss_clip": 1.03533888, "balance_loss_mlp": 1.05727649, "epoch": 0.09162783706598528, "flos": 17674765729920.0, "grad_norm": 2.370412073267458, "language_loss": 0.74975467, "learning_rate": 3.960292286569418e-06, "loss": 0.77215117, "num_input_tokens_seen": 32587010, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.265625, "step": 1524, "time_per_iteration": 2.4271841049194336 }, { "auxiliary_loss_clip": 0.01180552, "auxiliary_loss_mlp": 0.01056727, "balance_loss_clip": 1.03476882, "balance_loss_mlp": 1.05477643, "epoch": 0.09168796031865324, "flos": 18478195188480.0, "grad_norm": 2.07760650948229, "language_loss": 0.85953516, "learning_rate": 3.960215028335644e-06, "loss": 0.881908, "num_input_tokens_seen": 32602375, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.2578125, "step": 1525, "time_per_iteration": 2.4521398544311523 }, { "auxiliary_loss_clip": 0.01181784, "auxiliary_loss_mlp": 0.01043603, "balance_loss_clip": 1.02244365, "balance_loss_mlp": 1.05638039, "epoch": 0.0917480835713212, "flos": 29387605075200.0, "grad_norm": 2.1936446642545877, "language_loss": 0.7488941, "learning_rate": 3.96013769577032e-06, "loss": 0.77114797, "num_input_tokens_seen": 32621460, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.25, "step": 1526, "time_per_iteration": 2.5265562534332275 }, { "auxiliary_loss_clip": 0.01177862, "auxiliary_loss_mlp": 0.0105082, "balance_loss_clip": 1.02967215, "balance_loss_mlp": 1.05462313, "epoch": 0.09180820682398917, "flos": 19829262378240.0, "grad_norm": 1.9154559766663026, "language_loss": 0.7730521, "learning_rate": 3.960060288876378e-06, "loss": 0.79533887, "num_input_tokens_seen": 32640440, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.234375, "step": 1527, "time_per_iteration": 2.469834804534912 }, { "auxiliary_loss_clip": 0.01178335, "auxiliary_loss_mlp": 0.01052771, "balance_loss_clip": 1.03074074, "balance_loss_mlp": 1.05324721, "epoch": 0.09186833007665715, "flos": 23841848643840.0, "grad_norm": 1.836991532236521, "language_loss": 0.78325766, "learning_rate": 3.959982807656753e-06, "loss": 0.8055687, "num_input_tokens_seen": 32660020, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.25, "step": 1528, "time_per_iteration": 2.4827613830566406 }, { "auxiliary_loss_clip": 0.0117969, "auxiliary_loss_mlp": 0.0104835, "balance_loss_clip": 1.02771497, "balance_loss_mlp": 1.05556977, "epoch": 0.09192845332932512, "flos": 12932726065920.0, "grad_norm": 2.772010023075929, "language_loss": 0.76489937, "learning_rate": 3.959905252114384e-06, "loss": 0.78717977, "num_input_tokens_seen": 32678170, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.2421875, "step": 1529, "time_per_iteration": 2.442352056503296 }, { "auxiliary_loss_clip": 0.01181563, "auxiliary_loss_mlp": 0.0104648, "balance_loss_clip": 1.02452195, "balance_loss_mlp": 1.05385804, "epoch": 0.09198857658199308, "flos": 24568177559040.0, "grad_norm": 1.9235583071754276, "language_loss": 0.82882178, "learning_rate": 3.959827622252211e-06, "loss": 0.85110217, "num_input_tokens_seen": 32697540, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.28125, "step": 1530, "time_per_iteration": 2.4806370735168457 }, { "auxiliary_loss_clip": 0.01177193, "auxiliary_loss_mlp": 0.01057197, "balance_loss_clip": 1.03573966, "balance_loss_mlp": 1.05552721, "epoch": 0.09204869983466106, "flos": 20266941600000.0, "grad_norm": 4.382075004360578, "language_loss": 0.83965898, "learning_rate": 3.959749918073179e-06, "loss": 0.86200291, "num_input_tokens_seen": 32716805, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.21875, "step": 1531, "time_per_iteration": 2.4725048542022705 }, { "auxiliary_loss_clip": 0.01178186, "auxiliary_loss_mlp": 0.01043045, "balance_loss_clip": 1.02128911, "balance_loss_mlp": 1.05408478, "epoch": 0.09210882308732903, "flos": 20885646389760.0, "grad_norm": 2.1751712864426764, "language_loss": 0.81115669, "learning_rate": 3.959672139580233e-06, "loss": 0.83336902, "num_input_tokens_seen": 32736385, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.2421875, "step": 1532, "time_per_iteration": 2.47764253616333 }, { "auxiliary_loss_clip": 0.01180512, "auxiliary_loss_mlp": 0.01049177, "balance_loss_clip": 1.02813697, "balance_loss_mlp": 1.05667639, "epoch": 0.09216894633999699, "flos": 30956326727040.0, "grad_norm": 1.960937052434569, "language_loss": 0.83981431, "learning_rate": 3.9595942867763235e-06, "loss": 0.86211121, "num_input_tokens_seen": 32757140, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.234375, "step": 1533, "time_per_iteration": 2.5340230464935303 }, { "auxiliary_loss_clip": 0.01180234, "auxiliary_loss_mlp": 0.01047827, "balance_loss_clip": 1.02725136, "balance_loss_mlp": 1.05531907, "epoch": 0.09222906959266497, "flos": 13151565676800.0, "grad_norm": 2.0499275780734054, "language_loss": 0.90121758, "learning_rate": 3.959516359664402e-06, "loss": 0.92349827, "num_input_tokens_seen": 32774860, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.25, "step": 1534, "time_per_iteration": 2.440793991088867 }, { "auxiliary_loss_clip": 0.01181163, "auxiliary_loss_mlp": 0.01059788, "balance_loss_clip": 1.0365181, "balance_loss_mlp": 1.0553658, "epoch": 0.09228919284533293, "flos": 25994477784960.0, "grad_norm": 2.113706243512768, "language_loss": 0.75845766, "learning_rate": 3.959438358247424e-06, "loss": 0.78086716, "num_input_tokens_seen": 32795250, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.2578125, "step": 1535, "time_per_iteration": 2.5011801719665527 }, { "auxiliary_loss_clip": 0.01171177, "auxiliary_loss_mlp": 0.01040614, "balance_loss_clip": 1.02146912, "balance_loss_mlp": 1.05207109, "epoch": 0.0923493160980009, "flos": 18660800954880.0, "grad_norm": 1.6904259873631329, "language_loss": 0.81708634, "learning_rate": 3.959360282528346e-06, "loss": 0.83920419, "num_input_tokens_seen": 32813805, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.1875, "step": 1536, "time_per_iteration": 2.439572811126709 }, { "auxiliary_loss_clip": 0.01176243, "auxiliary_loss_mlp": 0.01052932, "balance_loss_clip": 1.03302395, "balance_loss_mlp": 1.05496192, "epoch": 0.09240943935066886, "flos": 21140576190720.0, "grad_norm": 1.9287097543492155, "language_loss": 0.88869536, "learning_rate": 3.959282132510131e-06, "loss": 0.91098714, "num_input_tokens_seen": 32830960, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.2109375, "step": 1537, "time_per_iteration": 2.477121353149414 }, { "auxiliary_loss_clip": 0.01176434, "auxiliary_loss_mlp": 0.01055926, "balance_loss_clip": 1.03370547, "balance_loss_mlp": 1.05330408, "epoch": 0.09246956260333684, "flos": 20592435669120.0, "grad_norm": 2.1068491585588056, "language_loss": 0.80557793, "learning_rate": 3.959203908195741e-06, "loss": 0.82790148, "num_input_tokens_seen": 32848275, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.234375, "step": 1538, "time_per_iteration": 2.451533079147339 }, { "auxiliary_loss_clip": 0.01068252, "auxiliary_loss_mlp": 0.01017707, "balance_loss_clip": 1.01503706, "balance_loss_mlp": 1.02379274, "epoch": 0.09252968585600481, "flos": 67558710614400.0, "grad_norm": 0.7512199999544652, "language_loss": 0.57451123, "learning_rate": 3.959125609588142e-06, "loss": 0.59537083, "num_input_tokens_seen": 32917730, "router_z_loss_clip": 0.0267334, "router_z_loss_mlp": 0.4453125, "step": 1539, "time_per_iteration": 3.1917593479156494 }, { "auxiliary_loss_clip": 0.01181039, "auxiliary_loss_mlp": 0.01047971, "balance_loss_clip": 1.02668023, "balance_loss_mlp": 1.05702949, "epoch": 0.09258980910867277, "flos": 17383853479680.0, "grad_norm": 3.8924157201749368, "language_loss": 0.67327487, "learning_rate": 3.959047236690304e-06, "loss": 0.69556499, "num_input_tokens_seen": 32934910, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.234375, "step": 1540, "time_per_iteration": 2.4536545276641846 }, { "auxiliary_loss_clip": 0.01179848, "auxiliary_loss_mlp": 0.01048162, "balance_loss_clip": 1.02651334, "balance_loss_mlp": 1.05652046, "epoch": 0.09264993236134075, "flos": 19865927185920.0, "grad_norm": 1.841669892605231, "language_loss": 0.83632696, "learning_rate": 3.958968789505198e-06, "loss": 0.85860711, "num_input_tokens_seen": 32953840, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.234375, "step": 1541, "time_per_iteration": 2.476675033569336 }, { "auxiliary_loss_clip": 0.01067076, "auxiliary_loss_mlp": 0.01005274, "balance_loss_clip": 1.00242484, "balance_loss_mlp": 1.02223873, "epoch": 0.09271005561400872, "flos": 62284401262080.0, "grad_norm": 0.8941617248457063, "language_loss": 0.61887693, "learning_rate": 3.9588902680358e-06, "loss": 0.63960046, "num_input_tokens_seen": 33011410, "router_z_loss_clip": 0.02844238, "router_z_loss_mlp": 0.44921875, "step": 1542, "time_per_iteration": 3.0965371131896973 }, { "auxiliary_loss_clip": 0.01178648, "auxiliary_loss_mlp": 0.01052386, "balance_loss_clip": 1.03272831, "balance_loss_mlp": 1.05550933, "epoch": 0.09277017886667668, "flos": 23329870139520.0, "grad_norm": 1.8986524029802503, "language_loss": 0.82795614, "learning_rate": 3.958811672285086e-06, "loss": 0.85026646, "num_input_tokens_seen": 33031675, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.234375, "step": 1543, "time_per_iteration": 2.4743478298187256 }, { "auxiliary_loss_clip": 0.011723, "auxiliary_loss_mlp": 0.01054187, "balance_loss_clip": 1.03368306, "balance_loss_mlp": 1.05347419, "epoch": 0.09283030211934466, "flos": 54745169875200.0, "grad_norm": 1.8309222273697243, "language_loss": 0.72529846, "learning_rate": 3.958733002256038e-06, "loss": 0.74756336, "num_input_tokens_seen": 33056355, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1875, "step": 1544, "time_per_iteration": 2.812309503555298 }, { "auxiliary_loss_clip": 0.01179801, "auxiliary_loss_mlp": 0.0104678, "balance_loss_clip": 1.02546501, "balance_loss_mlp": 1.05492413, "epoch": 0.09289042537201263, "flos": 30334784762880.0, "grad_norm": 1.9181525628900407, "language_loss": 0.77403104, "learning_rate": 3.958654257951637e-06, "loss": 0.79629672, "num_input_tokens_seen": 33079520, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.25, "step": 1545, "time_per_iteration": 2.538339614868164 }, { "auxiliary_loss_clip": 0.01173987, "auxiliary_loss_mlp": 0.01046113, "balance_loss_clip": 1.02562094, "balance_loss_mlp": 1.054919, "epoch": 0.09295054862468059, "flos": 17746838369280.0, "grad_norm": 3.046022643964191, "language_loss": 0.74868566, "learning_rate": 3.9585754393748706e-06, "loss": 0.77088666, "num_input_tokens_seen": 33096135, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1875, "step": 1546, "time_per_iteration": 2.438930034637451 }, { "auxiliary_loss_clip": 0.01180678, "auxiliary_loss_mlp": 0.01052472, "balance_loss_clip": 1.03147876, "balance_loss_mlp": 1.05568027, "epoch": 0.09301067187734856, "flos": 23658021815040.0, "grad_norm": 2.204622244150018, "language_loss": 0.84309554, "learning_rate": 3.9584965465287275e-06, "loss": 0.86542702, "num_input_tokens_seen": 33115245, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.25, "step": 1547, "time_per_iteration": 2.4758121967315674 }, { "auxiliary_loss_clip": 0.01177047, "auxiliary_loss_mlp": 0.01044298, "balance_loss_clip": 1.02397299, "balance_loss_mlp": 1.05405343, "epoch": 0.09307079513001654, "flos": 27527719777920.0, "grad_norm": 2.8378681814681066, "language_loss": 0.6737985, "learning_rate": 3.958417579416199e-06, "loss": 0.6960119, "num_input_tokens_seen": 33136640, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.234375, "step": 1548, "time_per_iteration": 2.5372564792633057 }, { "auxiliary_loss_clip": 0.01180786, "auxiliary_loss_mlp": 0.01047851, "balance_loss_clip": 1.02691805, "balance_loss_mlp": 1.05686486, "epoch": 0.0931309183826845, "flos": 20627340710400.0, "grad_norm": 1.7313041381159675, "language_loss": 0.83509767, "learning_rate": 3.9583385380402795e-06, "loss": 0.85738409, "num_input_tokens_seen": 33155060, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.234375, "step": 1549, "time_per_iteration": 2.4747469425201416 }, { "auxiliary_loss_clip": 0.01182661, "auxiliary_loss_mlp": 0.01049221, "balance_loss_clip": 1.02882433, "balance_loss_mlp": 1.05879378, "epoch": 0.09319104163535247, "flos": 29020921084800.0, "grad_norm": 1.662019057061161, "language_loss": 0.75794995, "learning_rate": 3.958259422403966e-06, "loss": 0.78026879, "num_input_tokens_seen": 33175420, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.2421875, "step": 1550, "time_per_iteration": 2.537527322769165 }, { "auxiliary_loss_clip": 0.01182337, "auxiliary_loss_mlp": 0.01058444, "balance_loss_clip": 1.03574634, "balance_loss_mlp": 1.05647218, "epoch": 0.09325116488802045, "flos": 25301545539840.0, "grad_norm": 2.173973798009713, "language_loss": 0.8301717, "learning_rate": 3.95818023251026e-06, "loss": 0.85257959, "num_input_tokens_seen": 33194120, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.25, "step": 1551, "time_per_iteration": 2.5133304595947266 }, { "auxiliary_loss_clip": 0.0106497, "auxiliary_loss_mlp": 0.01005862, "balance_loss_clip": 1.00316751, "balance_loss_mlp": 1.02083278, "epoch": 0.09331128814068841, "flos": 61536203942400.0, "grad_norm": 0.7446880962196561, "language_loss": 0.61870676, "learning_rate": 3.958100968362163e-06, "loss": 0.63941509, "num_input_tokens_seen": 33261080, "router_z_loss_clip": 0.02697754, "router_z_loss_mlp": 0.44140625, "step": 1552, "time_per_iteration": 3.22652530670166 }, { "auxiliary_loss_clip": 0.01064701, "auxiliary_loss_mlp": 0.01001539, "balance_loss_clip": 0.99874973, "balance_loss_mlp": 1.0208261, "epoch": 0.09337141139335638, "flos": 53293700171520.0, "grad_norm": 0.8728978030934327, "language_loss": 0.59037507, "learning_rate": 3.958021629962681e-06, "loss": 0.61103743, "num_input_tokens_seen": 33330235, "router_z_loss_clip": 0.0279541, "router_z_loss_mlp": 0.4375, "step": 1553, "time_per_iteration": 3.2269957065582275 }, { "auxiliary_loss_clip": 0.01181206, "auxiliary_loss_mlp": 0.01053952, "balance_loss_clip": 1.03266132, "balance_loss_mlp": 1.0543406, "epoch": 0.09343153464602436, "flos": 23476852592640.0, "grad_norm": 1.8744406519822556, "language_loss": 0.87610924, "learning_rate": 3.957942217314823e-06, "loss": 0.89846092, "num_input_tokens_seen": 33349035, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.265625, "step": 1554, "time_per_iteration": 2.497659921646118 }, { "auxiliary_loss_clip": 0.01176549, "auxiliary_loss_mlp": 0.01054234, "balance_loss_clip": 1.03358746, "balance_loss_mlp": 1.05621445, "epoch": 0.09349165789869232, "flos": 19353481804800.0, "grad_norm": 1.8787888084368747, "language_loss": 0.8148272, "learning_rate": 3.957862730421599e-06, "loss": 0.83713496, "num_input_tokens_seen": 33368060, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.203125, "step": 1555, "time_per_iteration": 2.461670398712158 }, { "auxiliary_loss_clip": 0.01062928, "auxiliary_loss_mlp": 0.01007093, "balance_loss_clip": 1.00453031, "balance_loss_mlp": 1.01958108, "epoch": 0.09355178115136029, "flos": 67502580635520.0, "grad_norm": 0.877968662640152, "language_loss": 0.59713024, "learning_rate": 3.957783169286024e-06, "loss": 0.61783046, "num_input_tokens_seen": 33430825, "router_z_loss_clip": 0.02563477, "router_z_loss_mlp": 0.43359375, "step": 1556, "time_per_iteration": 4.627899646759033 }, { "auxiliary_loss_clip": 0.01178298, "auxiliary_loss_mlp": 0.01055453, "balance_loss_clip": 1.03512776, "balance_loss_mlp": 1.05599165, "epoch": 0.09361190440402825, "flos": 37341638720640.0, "grad_norm": 1.5934552021986512, "language_loss": 0.84266257, "learning_rate": 3.9577035339111155e-06, "loss": 0.86500007, "num_input_tokens_seen": 33454855, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.21875, "step": 1557, "time_per_iteration": 4.056652307510376 }, { "auxiliary_loss_clip": 0.01174808, "auxiliary_loss_mlp": 0.01061803, "balance_loss_clip": 1.0392487, "balance_loss_mlp": 1.05246663, "epoch": 0.09367202765669623, "flos": 24899705112960.0, "grad_norm": 5.1489512377832725, "language_loss": 0.77593791, "learning_rate": 3.957623824299893e-06, "loss": 0.79830408, "num_input_tokens_seen": 33476000, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 1.2265625, "step": 1558, "time_per_iteration": 3.9095802307128906 }, { "auxiliary_loss_clip": 0.0118369, "auxiliary_loss_mlp": 0.01052991, "balance_loss_clip": 1.03186738, "balance_loss_mlp": 1.05776787, "epoch": 0.0937321509093642, "flos": 15705568368000.0, "grad_norm": 2.009875855464051, "language_loss": 0.79753137, "learning_rate": 3.957544040455379e-06, "loss": 0.81989813, "num_input_tokens_seen": 33493845, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.2578125, "step": 1559, "time_per_iteration": 2.460463523864746 }, { "auxiliary_loss_clip": 0.01173664, "auxiliary_loss_mlp": 0.01058926, "balance_loss_clip": 1.03873229, "balance_loss_mlp": 1.05314851, "epoch": 0.09379227416203216, "flos": 20483698222080.0, "grad_norm": 2.9287856728578543, "language_loss": 0.76458061, "learning_rate": 3.957464182380599e-06, "loss": 0.78690648, "num_input_tokens_seen": 33510850, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.2109375, "step": 1560, "time_per_iteration": 2.503559112548828 }, { "auxiliary_loss_clip": 0.01179294, "auxiliary_loss_mlp": 0.01055775, "balance_loss_clip": 1.03517544, "balance_loss_mlp": 1.05297709, "epoch": 0.09385239741470014, "flos": 24352498344960.0, "grad_norm": 1.8355301534801294, "language_loss": 0.810552, "learning_rate": 3.95738425007858e-06, "loss": 0.83290267, "num_input_tokens_seen": 33530430, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.265625, "step": 1561, "time_per_iteration": 2.5073392391204834 }, { "auxiliary_loss_clip": 0.01175958, "auxiliary_loss_mlp": 0.01047602, "balance_loss_clip": 1.026752, "balance_loss_mlp": 1.05142903, "epoch": 0.0939125206673681, "flos": 33291489807360.0, "grad_norm": 5.4149183721673015, "language_loss": 0.61058348, "learning_rate": 3.957304243552354e-06, "loss": 0.63281906, "num_input_tokens_seen": 33551975, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.2421875, "step": 1562, "time_per_iteration": 2.5784687995910645 }, { "auxiliary_loss_clip": 0.01175156, "auxiliary_loss_mlp": 0.01055026, "balance_loss_clip": 1.03547573, "balance_loss_mlp": 1.05598819, "epoch": 0.09397264392003607, "flos": 19244923925760.0, "grad_norm": 3.3254284376076297, "language_loss": 0.84952343, "learning_rate": 3.957224162804956e-06, "loss": 0.87182528, "num_input_tokens_seen": 33569850, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.1875, "step": 1563, "time_per_iteration": 2.4369308948516846 }, { "auxiliary_loss_clip": 0.01172298, "auxiliary_loss_mlp": 0.01048603, "balance_loss_clip": 1.02911258, "balance_loss_mlp": 1.05214572, "epoch": 0.09403276717270405, "flos": 19317930318720.0, "grad_norm": 2.6590597541464907, "language_loss": 0.76109684, "learning_rate": 3.9571440078394205e-06, "loss": 0.78330588, "num_input_tokens_seen": 33590510, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.203125, "step": 1564, "time_per_iteration": 2.475741147994995 }, { "auxiliary_loss_clip": 0.0117658, "auxiliary_loss_mlp": 0.01053285, "balance_loss_clip": 1.03285289, "balance_loss_mlp": 1.05591142, "epoch": 0.09409289042537201, "flos": 23583471137280.0, "grad_norm": 2.0265015064170906, "language_loss": 0.80228072, "learning_rate": 3.9570637786587895e-06, "loss": 0.82457936, "num_input_tokens_seen": 33608810, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.203125, "step": 1565, "time_per_iteration": 2.473223924636841 }, { "auxiliary_loss_clip": 0.01172574, "auxiliary_loss_mlp": 0.0106139, "balance_loss_clip": 1.04096937, "balance_loss_mlp": 1.04977643, "epoch": 0.09415301367803998, "flos": 20078446003200.0, "grad_norm": 2.9233270490842123, "language_loss": 0.75455236, "learning_rate": 3.956983475266103e-06, "loss": 0.77689201, "num_input_tokens_seen": 33627265, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.2265625, "step": 1566, "time_per_iteration": 2.4666686058044434 }, { "auxiliary_loss_clip": 0.01173412, "auxiliary_loss_mlp": 0.0105677, "balance_loss_clip": 1.03552699, "balance_loss_mlp": 1.05103898, "epoch": 0.09421313693070796, "flos": 21062075016960.0, "grad_norm": 2.05005833437113, "language_loss": 0.77711362, "learning_rate": 3.956903097664407e-06, "loss": 0.79941541, "num_input_tokens_seen": 33644810, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.21875, "step": 1567, "time_per_iteration": 2.458827495574951 }, { "auxiliary_loss_clip": 0.01177, "auxiliary_loss_mlp": 0.01051042, "balance_loss_clip": 1.03167105, "balance_loss_mlp": 1.05289328, "epoch": 0.09427326018337592, "flos": 24316156759680.0, "grad_norm": 1.8339555724699899, "language_loss": 0.82689512, "learning_rate": 3.956822645856749e-06, "loss": 0.84917557, "num_input_tokens_seen": 33665665, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.2421875, "step": 1568, "time_per_iteration": 2.5159530639648438 }, { "auxiliary_loss_clip": 0.0117877, "auxiliary_loss_mlp": 0.01045519, "balance_loss_clip": 1.02439523, "balance_loss_mlp": 1.05340767, "epoch": 0.09433338343604389, "flos": 20263888944000.0, "grad_norm": 2.119895691122424, "language_loss": 0.76687932, "learning_rate": 3.9567421198461814e-06, "loss": 0.78912222, "num_input_tokens_seen": 33684760, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.25, "step": 1569, "time_per_iteration": 2.4640893936157227 }, { "auxiliary_loss_clip": 0.01169419, "auxiliary_loss_mlp": 0.01044254, "balance_loss_clip": 1.02339196, "balance_loss_mlp": 1.05062199, "epoch": 0.09439350668871185, "flos": 12742973493120.0, "grad_norm": 2.3066784895281454, "language_loss": 0.85756624, "learning_rate": 3.956661519635756e-06, "loss": 0.87970304, "num_input_tokens_seen": 33700750, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.1875, "step": 1570, "time_per_iteration": 2.4310531616210938 }, { "auxiliary_loss_clip": 0.01176596, "auxiliary_loss_mlp": 0.01044213, "balance_loss_clip": 1.02370954, "balance_loss_mlp": 1.05459344, "epoch": 0.09445362994137983, "flos": 25962266263680.0, "grad_norm": 1.6527954005854075, "language_loss": 0.76676786, "learning_rate": 3.95658084522853e-06, "loss": 0.78897595, "num_input_tokens_seen": 33724430, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.21875, "step": 1571, "time_per_iteration": 2.5418548583984375 }, { "auxiliary_loss_clip": 0.0116689, "auxiliary_loss_mlp": 0.01044601, "balance_loss_clip": 1.02457428, "balance_loss_mlp": 1.04966927, "epoch": 0.0945137531940478, "flos": 19715353372800.0, "grad_norm": 1.9452565457460145, "language_loss": 0.79529136, "learning_rate": 3.956500096627561e-06, "loss": 0.8174063, "num_input_tokens_seen": 33743455, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.171875, "step": 1572, "time_per_iteration": 2.4644062519073486 }, { "auxiliary_loss_clip": 0.01168622, "auxiliary_loss_mlp": 0.0104774, "balance_loss_clip": 1.02727199, "balance_loss_mlp": 1.04955828, "epoch": 0.09457387644671576, "flos": 23617047375360.0, "grad_norm": 1.8958341893355968, "language_loss": 0.87761122, "learning_rate": 3.956419273835913e-06, "loss": 0.89977479, "num_input_tokens_seen": 33763435, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1875, "step": 1573, "time_per_iteration": 2.5008175373077393 }, { "auxiliary_loss_clip": 0.01174298, "auxiliary_loss_mlp": 0.01057128, "balance_loss_clip": 1.03354812, "balance_loss_mlp": 1.0505302, "epoch": 0.09463399969938374, "flos": 26907291135360.0, "grad_norm": 2.3593715567705846, "language_loss": 0.81380975, "learning_rate": 3.95633837685665e-06, "loss": 0.836124, "num_input_tokens_seen": 33784325, "router_z_loss_clip": 0.23632812, "router_z_loss_mlp": 1.234375, "step": 1574, "time_per_iteration": 2.5535407066345215 }, { "auxiliary_loss_clip": 0.01173281, "auxiliary_loss_mlp": 0.01047251, "balance_loss_clip": 1.02700877, "balance_loss_mlp": 1.05186677, "epoch": 0.0946941229520517, "flos": 23659566099840.0, "grad_norm": 1.7753704733824, "language_loss": 0.80737895, "learning_rate": 3.95625740569284e-06, "loss": 0.82958424, "num_input_tokens_seen": 33802510, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.2109375, "step": 1575, "time_per_iteration": 2.4897680282592773 }, { "auxiliary_loss_clip": 0.01168312, "auxiliary_loss_mlp": 0.01054643, "balance_loss_clip": 1.03378153, "balance_loss_mlp": 1.04890847, "epoch": 0.09475424620471967, "flos": 24134053783680.0, "grad_norm": 1.9348990621927649, "language_loss": 0.86385012, "learning_rate": 3.956176360347553e-06, "loss": 0.88607967, "num_input_tokens_seen": 33819980, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1875, "step": 1576, "time_per_iteration": 2.50148344039917 }, { "auxiliary_loss_clip": 0.01066665, "auxiliary_loss_mlp": 0.01006105, "balance_loss_clip": 1.0034703, "balance_loss_mlp": 1.0224309, "epoch": 0.09481436945738765, "flos": 68426168065920.0, "grad_norm": 0.9756546846439438, "language_loss": 0.65851653, "learning_rate": 3.956095240823862e-06, "loss": 0.67924422, "num_input_tokens_seen": 33878925, "router_z_loss_clip": 0.02636719, "router_z_loss_mlp": 0.44140625, "step": 1577, "time_per_iteration": 3.047621726989746 }, { "auxiliary_loss_clip": 0.01171006, "auxiliary_loss_mlp": 0.01040444, "balance_loss_clip": 1.02041662, "balance_loss_mlp": 1.04963505, "epoch": 0.09487449271005562, "flos": 16654076858880.0, "grad_norm": 3.894413054361652, "language_loss": 0.79130304, "learning_rate": 3.956014047124844e-06, "loss": 0.81341755, "num_input_tokens_seen": 33897600, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.2109375, "step": 1578, "time_per_iteration": 2.5125343799591064 }, { "auxiliary_loss_clip": 0.01167827, "auxiliary_loss_mlp": 0.01053946, "balance_loss_clip": 1.03338253, "balance_loss_mlp": 1.04744458, "epoch": 0.09493461596272358, "flos": 24275685110400.0, "grad_norm": 2.03185100065273, "language_loss": 0.77783293, "learning_rate": 3.955932779253578e-06, "loss": 0.80005068, "num_input_tokens_seen": 33917365, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.203125, "step": 1579, "time_per_iteration": 2.5074682235717773 }, { "auxiliary_loss_clip": 0.01170025, "auxiliary_loss_mlp": 0.01049085, "balance_loss_clip": 1.02790165, "balance_loss_mlp": 1.04933918, "epoch": 0.09499473921539155, "flos": 21870173243520.0, "grad_norm": 2.05442505011154, "language_loss": 0.73087645, "learning_rate": 3.955851437213144e-06, "loss": 0.75306761, "num_input_tokens_seen": 33936680, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.203125, "step": 1580, "time_per_iteration": 2.4560821056365967 }, { "auxiliary_loss_clip": 0.01164293, "auxiliary_loss_mlp": 0.01045552, "balance_loss_clip": 1.02552509, "balance_loss_mlp": 1.04710543, "epoch": 0.09505486246805953, "flos": 33547137880320.0, "grad_norm": 1.934282566733891, "language_loss": 0.77910256, "learning_rate": 3.955770021006627e-06, "loss": 0.80120105, "num_input_tokens_seen": 33960685, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.171875, "step": 1581, "time_per_iteration": 2.6257596015930176 }, { "auxiliary_loss_clip": 0.01171236, "auxiliary_loss_mlp": 0.01049947, "balance_loss_clip": 1.02984822, "balance_loss_mlp": 1.0503993, "epoch": 0.09511498572072749, "flos": 21215342350080.0, "grad_norm": 1.855623532584483, "language_loss": 0.8685807, "learning_rate": 3.955688530637116e-06, "loss": 0.89079255, "num_input_tokens_seen": 33980015, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.2109375, "step": 1582, "time_per_iteration": 2.446220874786377 }, { "auxiliary_loss_clip": 0.01170427, "auxiliary_loss_mlp": 0.01049468, "balance_loss_clip": 1.02730691, "balance_loss_mlp": 1.04958653, "epoch": 0.09517510897339546, "flos": 14611262572800.0, "grad_norm": 2.050407072581013, "language_loss": 0.67224777, "learning_rate": 3.955606966107699e-06, "loss": 0.69444674, "num_input_tokens_seen": 33997705, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.2109375, "step": 1583, "time_per_iteration": 2.488556146621704 }, { "auxiliary_loss_clip": 0.01175267, "auxiliary_loss_mlp": 0.01047274, "balance_loss_clip": 1.02614999, "balance_loss_mlp": 1.05388653, "epoch": 0.09523523222606343, "flos": 27817339138560.0, "grad_norm": 1.9480707149539933, "language_loss": 0.70576006, "learning_rate": 3.95552532742147e-06, "loss": 0.7279855, "num_input_tokens_seen": 34017465, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.21875, "step": 1584, "time_per_iteration": 2.5167393684387207 }, { "auxiliary_loss_clip": 0.01172045, "auxiliary_loss_mlp": 0.01044278, "balance_loss_clip": 1.02527642, "balance_loss_mlp": 1.05140376, "epoch": 0.0952953554787314, "flos": 20706272847360.0, "grad_norm": 1.518167004875822, "language_loss": 0.80822366, "learning_rate": 3.955443614581525e-06, "loss": 0.83038688, "num_input_tokens_seen": 34038550, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.203125, "step": 1585, "time_per_iteration": 2.529506206512451 }, { "auxiliary_loss_clip": 0.01174022, "auxiliary_loss_mlp": 0.01052626, "balance_loss_clip": 1.03044152, "balance_loss_mlp": 1.05079865, "epoch": 0.09535547873139937, "flos": 24787627701120.0, "grad_norm": 1.6595525769097854, "language_loss": 0.72259152, "learning_rate": 3.955361827590961e-06, "loss": 0.74485803, "num_input_tokens_seen": 34058665, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.234375, "step": 1586, "time_per_iteration": 2.5005428791046143 }, { "auxiliary_loss_clip": 0.01063528, "auxiliary_loss_mlp": 0.0101667, "balance_loss_clip": 1.01384449, "balance_loss_mlp": 1.01984477, "epoch": 0.09541560198406734, "flos": 71912194905600.0, "grad_norm": 0.8526102890041511, "language_loss": 0.55480969, "learning_rate": 3.955279966452883e-06, "loss": 0.57561171, "num_input_tokens_seen": 34109655, "router_z_loss_clip": 0.02819824, "router_z_loss_mlp": 0.4375, "step": 1587, "time_per_iteration": 2.903942108154297 }, { "auxiliary_loss_clip": 0.01173544, "auxiliary_loss_mlp": 0.01050137, "balance_loss_clip": 1.02914417, "balance_loss_mlp": 1.05065417, "epoch": 0.09547572523673531, "flos": 28982604251520.0, "grad_norm": 2.01835366545467, "language_loss": 0.81317389, "learning_rate": 3.955198031170391e-06, "loss": 0.83541071, "num_input_tokens_seen": 34131115, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.234375, "step": 1588, "time_per_iteration": 2.5147554874420166 }, { "auxiliary_loss_clip": 0.01170463, "auxiliary_loss_mlp": 0.01052951, "balance_loss_clip": 1.03244686, "balance_loss_mlp": 1.05002713, "epoch": 0.09553584848940327, "flos": 24133910129280.0, "grad_norm": 1.4673679127177333, "language_loss": 0.81571758, "learning_rate": 3.955116021746594e-06, "loss": 0.83795166, "num_input_tokens_seen": 34151925, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.203125, "step": 1589, "time_per_iteration": 2.5081913471221924 }, { "auxiliary_loss_clip": 0.01167801, "auxiliary_loss_mlp": 0.01050988, "balance_loss_clip": 1.02895784, "balance_loss_mlp": 1.04995561, "epoch": 0.09559597174207124, "flos": 42851376789120.0, "grad_norm": 1.4788459176220592, "language_loss": 0.6440984, "learning_rate": 3.955033938184601e-06, "loss": 0.66628623, "num_input_tokens_seen": 34175395, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.1796875, "step": 1590, "time_per_iteration": 2.653949499130249 }, { "auxiliary_loss_clip": 0.01167472, "auxiliary_loss_mlp": 0.01052759, "balance_loss_clip": 1.03213596, "balance_loss_mlp": 1.04861975, "epoch": 0.09565609499473922, "flos": 32670845683200.0, "grad_norm": 1.6497649502172755, "language_loss": 0.83133036, "learning_rate": 3.954951780487526e-06, "loss": 0.85353267, "num_input_tokens_seen": 34197760, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1875, "step": 1591, "time_per_iteration": 2.578859567642212 }, { "auxiliary_loss_clip": 0.01173982, "auxiliary_loss_mlp": 0.01051794, "balance_loss_clip": 1.0308969, "balance_loss_mlp": 1.05089355, "epoch": 0.09571621824740718, "flos": 18478410670080.0, "grad_norm": 4.166344961204136, "language_loss": 0.73836607, "learning_rate": 3.9548695486584835e-06, "loss": 0.76062381, "num_input_tokens_seen": 34215330, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.234375, "step": 1592, "time_per_iteration": 2.436042308807373 }, { "auxiliary_loss_clip": 0.01168265, "auxiliary_loss_mlp": 0.01046898, "balance_loss_clip": 1.02620339, "balance_loss_mlp": 1.04738045, "epoch": 0.09577634150007515, "flos": 29387497334400.0, "grad_norm": 2.0242382844096736, "language_loss": 0.7415849, "learning_rate": 3.954787242700592e-06, "loss": 0.76373649, "num_input_tokens_seen": 34237745, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.2109375, "step": 1593, "time_per_iteration": 2.5635576248168945 }, { "auxiliary_loss_clip": 0.01170169, "auxiliary_loss_mlp": 0.01052529, "balance_loss_clip": 1.03268087, "balance_loss_mlp": 1.05122471, "epoch": 0.09583646475274313, "flos": 22747830157440.0, "grad_norm": 1.8208444741568772, "language_loss": 0.697438, "learning_rate": 3.954704862616971e-06, "loss": 0.71966499, "num_input_tokens_seen": 34256565, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.1875, "step": 1594, "time_per_iteration": 2.485074996948242 }, { "auxiliary_loss_clip": 0.01169749, "auxiliary_loss_mlp": 0.01048999, "balance_loss_clip": 1.02842402, "balance_loss_mlp": 1.048751, "epoch": 0.0958965880054111, "flos": 23218367345280.0, "grad_norm": 2.5532449287343817, "language_loss": 0.82534546, "learning_rate": 3.954622408410747e-06, "loss": 0.84753299, "num_input_tokens_seen": 34275970, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.2109375, "step": 1595, "time_per_iteration": 2.4905943870544434 }, { "auxiliary_loss_clip": 0.01171061, "auxiliary_loss_mlp": 0.01049867, "balance_loss_clip": 1.02814698, "balance_loss_mlp": 1.05023026, "epoch": 0.09595671125807906, "flos": 21324438933120.0, "grad_norm": 2.5138818238053307, "language_loss": 0.85096294, "learning_rate": 3.954539880085045e-06, "loss": 0.87317222, "num_input_tokens_seen": 34295490, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.2109375, "step": 1596, "time_per_iteration": 2.4628682136535645 }, { "auxiliary_loss_clip": 0.01174224, "auxiliary_loss_mlp": 0.01051143, "balance_loss_clip": 1.02904177, "balance_loss_mlp": 1.05260003, "epoch": 0.09601683451074704, "flos": 39603472185600.0, "grad_norm": 1.8804340427975557, "language_loss": 0.68702626, "learning_rate": 3.9544572776429945e-06, "loss": 0.70927989, "num_input_tokens_seen": 34319990, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.21875, "step": 1597, "time_per_iteration": 2.6424784660339355 }, { "auxiliary_loss_clip": 0.01171416, "auxiliary_loss_mlp": 0.01048039, "balance_loss_clip": 1.02667665, "balance_loss_mlp": 1.04916096, "epoch": 0.096076957763415, "flos": 23732716147200.0, "grad_norm": 2.058629012322874, "language_loss": 0.7483958, "learning_rate": 3.954374601087729e-06, "loss": 0.77059031, "num_input_tokens_seen": 34339225, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.21875, "step": 1598, "time_per_iteration": 5.586432695388794 }, { "auxiliary_loss_clip": 0.01177177, "auxiliary_loss_mlp": 0.01046085, "balance_loss_clip": 1.02369702, "balance_loss_mlp": 1.05383945, "epoch": 0.09613708101608297, "flos": 34678108483200.0, "grad_norm": 1.6642112065466554, "language_loss": 0.68833357, "learning_rate": 3.954291850422382e-06, "loss": 0.71056616, "num_input_tokens_seen": 34361020, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.234375, "step": 1599, "time_per_iteration": 4.029689311981201 }, { "auxiliary_loss_clip": 0.01173082, "auxiliary_loss_mlp": 0.01058563, "balance_loss_clip": 1.03785622, "balance_loss_mlp": 1.05190682, "epoch": 0.09619720426875093, "flos": 20740028653440.0, "grad_norm": 1.9908740287860123, "language_loss": 0.83876079, "learning_rate": 3.954209025650093e-06, "loss": 0.86107719, "num_input_tokens_seen": 34378630, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.203125, "step": 1600, "time_per_iteration": 2.4613842964172363 }, { "auxiliary_loss_clip": 0.01171182, "auxiliary_loss_mlp": 0.01053199, "balance_loss_clip": 1.03261113, "balance_loss_mlp": 1.05016136, "epoch": 0.09625732752141891, "flos": 13042720488960.0, "grad_norm": 1.9655963249667696, "language_loss": 0.80305398, "learning_rate": 3.954126126774001e-06, "loss": 0.82529777, "num_input_tokens_seen": 34397110, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.2109375, "step": 1601, "time_per_iteration": 2.4630346298217773 }, { "auxiliary_loss_clip": 0.01174525, "auxiliary_loss_mlp": 0.01052101, "balance_loss_clip": 1.03008294, "balance_loss_mlp": 1.0516628, "epoch": 0.09631745077408688, "flos": 22273629782400.0, "grad_norm": 2.299469479001269, "language_loss": 0.82129574, "learning_rate": 3.954043153797251e-06, "loss": 0.84356201, "num_input_tokens_seen": 34414165, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.2265625, "step": 1602, "time_per_iteration": 2.476591110229492 }, { "auxiliary_loss_clip": 0.01171319, "auxiliary_loss_mlp": 0.01051731, "balance_loss_clip": 1.03020215, "balance_loss_mlp": 1.05241013, "epoch": 0.09637757402675484, "flos": 24754266944640.0, "grad_norm": 2.0943634043953536, "language_loss": 0.62623626, "learning_rate": 3.953960106722989e-06, "loss": 0.64846677, "num_input_tokens_seen": 34434445, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.1875, "step": 1603, "time_per_iteration": 2.5238540172576904 }, { "auxiliary_loss_clip": 0.01175277, "auxiliary_loss_mlp": 0.01049472, "balance_loss_clip": 1.02609551, "balance_loss_mlp": 1.05229533, "epoch": 0.09643769727942282, "flos": 22525758322560.0, "grad_norm": 2.519129226879951, "language_loss": 0.70726573, "learning_rate": 3.953876985554364e-06, "loss": 0.72951317, "num_input_tokens_seen": 34453095, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.2265625, "step": 1604, "time_per_iteration": 2.486598491668701 }, { "auxiliary_loss_clip": 0.0117008, "auxiliary_loss_mlp": 0.01050875, "balance_loss_clip": 1.0307045, "balance_loss_mlp": 1.05053163, "epoch": 0.09649782053209079, "flos": 30921026636160.0, "grad_norm": 25.172977793113077, "language_loss": 0.79931188, "learning_rate": 3.953793790294527e-06, "loss": 0.8215214, "num_input_tokens_seen": 34473680, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1953125, "step": 1605, "time_per_iteration": 2.5988802909851074 }, { "auxiliary_loss_clip": 0.01175908, "auxiliary_loss_mlp": 0.01043884, "balance_loss_clip": 1.02262926, "balance_loss_mlp": 1.05200911, "epoch": 0.09655794378475875, "flos": 25337635729920.0, "grad_norm": 5.0109516924815125, "language_loss": 0.74474925, "learning_rate": 3.953710520946634e-06, "loss": 0.76694721, "num_input_tokens_seen": 34492610, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.234375, "step": 1606, "time_per_iteration": 2.4919283390045166 }, { "auxiliary_loss_clip": 0.01173781, "auxiliary_loss_mlp": 0.01046906, "balance_loss_clip": 1.02574658, "balance_loss_mlp": 1.05209994, "epoch": 0.09661806703742673, "flos": 22346061557760.0, "grad_norm": 1.873319906799762, "language_loss": 0.75834417, "learning_rate": 3.953627177513843e-06, "loss": 0.78055108, "num_input_tokens_seen": 34511855, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.21875, "step": 1607, "time_per_iteration": 2.4952523708343506 }, { "auxiliary_loss_clip": 0.01172674, "auxiliary_loss_mlp": 0.01043509, "balance_loss_clip": 1.02290928, "balance_loss_mlp": 1.0495764, "epoch": 0.0966781902900947, "flos": 17457578144640.0, "grad_norm": 2.4123538685973704, "language_loss": 0.86710572, "learning_rate": 3.953543759999312e-06, "loss": 0.8892675, "num_input_tokens_seen": 34528905, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.234375, "step": 1608, "time_per_iteration": 2.435626745223999 }, { "auxiliary_loss_clip": 0.01181394, "auxiliary_loss_mlp": 0.0105451, "balance_loss_clip": 1.03268278, "balance_loss_mlp": 1.05320919, "epoch": 0.09673831354276266, "flos": 36903995412480.0, "grad_norm": 3.5086574236909143, "language_loss": 0.71402436, "learning_rate": 3.953460268406207e-06, "loss": 0.73638344, "num_input_tokens_seen": 34548480, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.28125, "step": 1609, "time_per_iteration": 2.6085219383239746 }, { "auxiliary_loss_clip": 0.01173817, "auxiliary_loss_mlp": 0.01052744, "balance_loss_clip": 1.0322876, "balance_loss_mlp": 1.05149209, "epoch": 0.09679843679543064, "flos": 20701388597760.0, "grad_norm": 6.51953271346783, "language_loss": 0.84602833, "learning_rate": 3.953376702737693e-06, "loss": 0.868294, "num_input_tokens_seen": 34565410, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.21875, "step": 1610, "time_per_iteration": 2.4632697105407715 }, { "auxiliary_loss_clip": 0.01176108, "auxiliary_loss_mlp": 0.01046702, "balance_loss_clip": 1.02482748, "balance_loss_mlp": 1.05418348, "epoch": 0.0968585600480986, "flos": 23514415240320.0, "grad_norm": 2.528350780554148, "language_loss": 0.66730618, "learning_rate": 3.953293062996939e-06, "loss": 0.68953431, "num_input_tokens_seen": 34584840, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.21875, "step": 1611, "time_per_iteration": 2.4959657192230225 }, { "auxiliary_loss_clip": 0.01173483, "auxiliary_loss_mlp": 0.01052292, "balance_loss_clip": 1.03219342, "balance_loss_mlp": 1.05158305, "epoch": 0.09691868330076657, "flos": 20121072468480.0, "grad_norm": 1.7554668739544486, "language_loss": 0.80989331, "learning_rate": 3.953209349187115e-06, "loss": 0.83215106, "num_input_tokens_seen": 34603360, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.21875, "step": 1612, "time_per_iteration": 2.4758007526397705 }, { "auxiliary_loss_clip": 0.01179806, "auxiliary_loss_mlp": 0.01060451, "balance_loss_clip": 1.03871918, "balance_loss_mlp": 1.05664349, "epoch": 0.09697880655343454, "flos": 16544692967040.0, "grad_norm": 2.938120589979012, "language_loss": 0.80811441, "learning_rate": 3.953125561311398e-06, "loss": 0.83051693, "num_input_tokens_seen": 34620760, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.234375, "step": 1613, "time_per_iteration": 2.483218193054199 }, { "auxiliary_loss_clip": 0.01173729, "auxiliary_loss_mlp": 0.01051414, "balance_loss_clip": 1.02906251, "balance_loss_mlp": 1.05173802, "epoch": 0.09703892980610251, "flos": 26104184899200.0, "grad_norm": 1.938828948781003, "language_loss": 0.84161669, "learning_rate": 3.953041699372964e-06, "loss": 0.86386812, "num_input_tokens_seen": 34640695, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.21875, "step": 1614, "time_per_iteration": 2.511791706085205 }, { "auxiliary_loss_clip": 0.01077815, "auxiliary_loss_mlp": 0.01015571, "balance_loss_clip": 1.01208997, "balance_loss_mlp": 1.03092206, "epoch": 0.09709905305877048, "flos": 60443622000000.0, "grad_norm": 0.7117396071784851, "language_loss": 0.54667008, "learning_rate": 3.952957763374992e-06, "loss": 0.56760395, "num_input_tokens_seen": 34702395, "router_z_loss_clip": 0.03491211, "router_z_loss_mlp": 0.46875, "step": 1615, "time_per_iteration": 3.0880677700042725 }, { "auxiliary_loss_clip": 0.01077292, "auxiliary_loss_mlp": 0.01011613, "balance_loss_clip": 1.0081085, "balance_loss_mlp": 1.03033793, "epoch": 0.09715917631143844, "flos": 57639932893440.0, "grad_norm": 0.7750849682947079, "language_loss": 0.58262813, "learning_rate": 3.952873753320666e-06, "loss": 0.60351729, "num_input_tokens_seen": 34768910, "router_z_loss_clip": 0.03515625, "router_z_loss_mlp": 0.46875, "step": 1616, "time_per_iteration": 3.25060772895813 }, { "auxiliary_loss_clip": 0.01174578, "auxiliary_loss_mlp": 0.01057324, "balance_loss_clip": 1.0345788, "balance_loss_mlp": 1.05253959, "epoch": 0.09721929956410642, "flos": 20558212986240.0, "grad_norm": 2.038347093301129, "language_loss": 0.68944722, "learning_rate": 3.952789669213172e-06, "loss": 0.7117663, "num_input_tokens_seen": 34787680, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.21875, "step": 1617, "time_per_iteration": 2.4792635440826416 }, { "auxiliary_loss_clip": 0.01175842, "auxiliary_loss_mlp": 0.0105363, "balance_loss_clip": 1.02929926, "balance_loss_mlp": 1.05204606, "epoch": 0.09727942281677439, "flos": 27344359825920.0, "grad_norm": 1.8012433528237477, "language_loss": 0.80822718, "learning_rate": 3.952705511055698e-06, "loss": 0.83052194, "num_input_tokens_seen": 34808330, "router_z_loss_clip": 0.24414062, "router_z_loss_mlp": 1.234375, "step": 1618, "time_per_iteration": 2.51401948928833 }, { "auxiliary_loss_clip": 0.01170048, "auxiliary_loss_mlp": 0.01050263, "balance_loss_clip": 1.03046203, "balance_loss_mlp": 1.05063915, "epoch": 0.09733954606944235, "flos": 24900028335360.0, "grad_norm": 1.687245625435821, "language_loss": 0.92978942, "learning_rate": 3.952621278851435e-06, "loss": 0.95199251, "num_input_tokens_seen": 34830020, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.1953125, "step": 1619, "time_per_iteration": 2.5838325023651123 }, { "auxiliary_loss_clip": 0.01171601, "auxiliary_loss_mlp": 0.01052803, "balance_loss_clip": 1.03185844, "balance_loss_mlp": 1.05375767, "epoch": 0.09739966932211033, "flos": 31503928544640.0, "grad_norm": 4.493022222644303, "language_loss": 0.88225269, "learning_rate": 3.9525369726035784e-06, "loss": 0.90449679, "num_input_tokens_seen": 34850330, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.1796875, "step": 1620, "time_per_iteration": 2.5573017597198486 }, { "auxiliary_loss_clip": 0.0117418, "auxiliary_loss_mlp": 0.01062657, "balance_loss_clip": 1.03969729, "balance_loss_mlp": 1.05132031, "epoch": 0.0974597925747783, "flos": 23878764846720.0, "grad_norm": 4.760849813645528, "language_loss": 0.7724551, "learning_rate": 3.952452592315324e-06, "loss": 0.79482347, "num_input_tokens_seen": 34871640, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 1.2265625, "step": 1621, "time_per_iteration": 2.492478609085083 }, { "auxiliary_loss_clip": 0.0117137, "auxiliary_loss_mlp": 0.0106913, "balance_loss_clip": 1.04783893, "balance_loss_mlp": 1.04887474, "epoch": 0.09751991582744626, "flos": 17019575700480.0, "grad_norm": 2.1971710786697827, "language_loss": 0.77671379, "learning_rate": 3.952368137989871e-06, "loss": 0.79911876, "num_input_tokens_seen": 34888100, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.2265625, "step": 1622, "time_per_iteration": 2.4420063495635986 }, { "auxiliary_loss_clip": 0.0117691, "auxiliary_loss_mlp": 0.01066965, "balance_loss_clip": 1.04468513, "balance_loss_mlp": 1.05170059, "epoch": 0.09758003908011423, "flos": 28402826826240.0, "grad_norm": 1.8360350687144373, "language_loss": 0.85950828, "learning_rate": 3.9522836096304225e-06, "loss": 0.88194704, "num_input_tokens_seen": 34910485, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.25, "step": 1623, "time_per_iteration": 2.5321474075317383 }, { "auxiliary_loss_clip": 0.01171694, "auxiliary_loss_mlp": 0.01068683, "balance_loss_clip": 1.04718995, "balance_loss_mlp": 1.05164218, "epoch": 0.09764016233278221, "flos": 18144297336960.0, "grad_norm": 2.0265505654548255, "language_loss": 0.80414212, "learning_rate": 3.952199007240184e-06, "loss": 0.82654583, "num_input_tokens_seen": 34928615, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.203125, "step": 1624, "time_per_iteration": 2.425762176513672 }, { "auxiliary_loss_clip": 0.01169878, "auxiliary_loss_mlp": 0.01054351, "balance_loss_clip": 1.03424037, "balance_loss_mlp": 1.0479306, "epoch": 0.09770028558545017, "flos": 15265842071040.0, "grad_norm": 2.3803635489452017, "language_loss": 0.85555357, "learning_rate": 3.952114330822364e-06, "loss": 0.87779588, "num_input_tokens_seen": 34946045, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.21875, "step": 1625, "time_per_iteration": 2.4395790100097656 }, { "auxiliary_loss_clip": 0.0117535, "auxiliary_loss_mlp": 0.01065549, "balance_loss_clip": 1.04434216, "balance_loss_mlp": 1.05179381, "epoch": 0.09776040883811814, "flos": 23472435219840.0, "grad_norm": 1.9772717716784158, "language_loss": 0.85328472, "learning_rate": 3.952029580380172e-06, "loss": 0.8756938, "num_input_tokens_seen": 34962865, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.2421875, "step": 1626, "time_per_iteration": 2.4784977436065674 }, { "auxiliary_loss_clip": 0.01177614, "auxiliary_loss_mlp": 0.01066741, "balance_loss_clip": 1.04432964, "balance_loss_mlp": 1.05289125, "epoch": 0.09782053209078612, "flos": 24499480798080.0, "grad_norm": 2.387881336464795, "language_loss": 0.82992673, "learning_rate": 3.9519447559168234e-06, "loss": 0.85237026, "num_input_tokens_seen": 34983505, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.25, "step": 1627, "time_per_iteration": 2.4968862533569336 }, { "auxiliary_loss_clip": 0.01169794, "auxiliary_loss_mlp": 0.01059939, "balance_loss_clip": 1.03942275, "balance_loss_mlp": 1.04982305, "epoch": 0.09788065534345408, "flos": 21580158833280.0, "grad_norm": 2.180932833094052, "language_loss": 0.8417924, "learning_rate": 3.951859857435534e-06, "loss": 0.86408973, "num_input_tokens_seen": 35001825, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.203125, "step": 1628, "time_per_iteration": 2.473182201385498 }, { "auxiliary_loss_clip": 0.01169171, "auxiliary_loss_mlp": 0.0104983, "balance_loss_clip": 1.02985072, "balance_loss_mlp": 1.04919398, "epoch": 0.09794077859612205, "flos": 23842459175040.0, "grad_norm": 1.5585237421403222, "language_loss": 0.75826001, "learning_rate": 3.951774884939523e-06, "loss": 0.78044999, "num_input_tokens_seen": 35023075, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.203125, "step": 1629, "time_per_iteration": 2.499044895172119 }, { "auxiliary_loss_clip": 0.01175748, "auxiliary_loss_mlp": 0.01056634, "balance_loss_clip": 1.03441393, "balance_loss_mlp": 1.05519283, "epoch": 0.09800090184879003, "flos": 23659889322240.0, "grad_norm": 1.6366567606170346, "language_loss": 0.78287429, "learning_rate": 3.951689838432013e-06, "loss": 0.80519813, "num_input_tokens_seen": 35043480, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.203125, "step": 1630, "time_per_iteration": 2.505138874053955 }, { "auxiliary_loss_clip": 0.01177158, "auxiliary_loss_mlp": 0.01053816, "balance_loss_clip": 1.03132081, "balance_loss_mlp": 1.0553565, "epoch": 0.09806102510145799, "flos": 17055773631360.0, "grad_norm": 1.9925912367439118, "language_loss": 0.86721957, "learning_rate": 3.951604717916228e-06, "loss": 0.88952935, "num_input_tokens_seen": 35061490, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.21875, "step": 1631, "time_per_iteration": 2.497303009033203 }, { "auxiliary_loss_clip": 0.0117334, "auxiliary_loss_mlp": 0.01056902, "balance_loss_clip": 1.03682733, "balance_loss_mlp": 1.05382609, "epoch": 0.09812114835412596, "flos": 23878477537920.0, "grad_norm": 2.118359949253288, "language_loss": 0.83230555, "learning_rate": 3.9515195233953975e-06, "loss": 0.85460794, "num_input_tokens_seen": 35079670, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1953125, "step": 1632, "time_per_iteration": 2.484312057495117 }, { "auxiliary_loss_clip": 0.01174659, "auxiliary_loss_mlp": 0.01060132, "balance_loss_clip": 1.03985417, "balance_loss_mlp": 1.05261803, "epoch": 0.09818127160679392, "flos": 20595488325120.0, "grad_norm": 3.198498848007094, "language_loss": 0.78841257, "learning_rate": 3.951434254872751e-06, "loss": 0.81076044, "num_input_tokens_seen": 35099205, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.21875, "step": 1633, "time_per_iteration": 2.4687893390655518 }, { "auxiliary_loss_clip": 0.01168435, "auxiliary_loss_mlp": 0.01047644, "balance_loss_clip": 1.02668655, "balance_loss_mlp": 1.05096984, "epoch": 0.0982413948594619, "flos": 15487339288320.0, "grad_norm": 4.796333847886843, "language_loss": 0.73203766, "learning_rate": 3.951348912351521e-06, "loss": 0.75419849, "num_input_tokens_seen": 35115270, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.171875, "step": 1634, "time_per_iteration": 2.4473931789398193 }, { "auxiliary_loss_clip": 0.01177468, "auxiliary_loss_mlp": 0.01061669, "balance_loss_clip": 1.03990173, "balance_loss_mlp": 1.05142188, "epoch": 0.09830151811212987, "flos": 24207958016640.0, "grad_norm": 2.6299282568220335, "language_loss": 0.73112905, "learning_rate": 3.951263495834947e-06, "loss": 0.75352049, "num_input_tokens_seen": 35134065, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.2578125, "step": 1635, "time_per_iteration": 2.537071943283081 }, { "auxiliary_loss_clip": 0.01179127, "auxiliary_loss_mlp": 0.01056035, "balance_loss_clip": 1.0339694, "balance_loss_mlp": 1.05490541, "epoch": 0.09836164136479783, "flos": 20594590485120.0, "grad_norm": 1.853335718730047, "language_loss": 0.78812271, "learning_rate": 3.951178005326264e-06, "loss": 0.81047434, "num_input_tokens_seen": 35154870, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.2421875, "step": 1636, "time_per_iteration": 2.5176453590393066 }, { "auxiliary_loss_clip": 0.01171992, "auxiliary_loss_mlp": 0.01051539, "balance_loss_clip": 1.03099895, "balance_loss_mlp": 1.05235624, "epoch": 0.09842176461746581, "flos": 19934157070080.0, "grad_norm": 2.102658185427914, "language_loss": 0.70108789, "learning_rate": 3.951092440828715e-06, "loss": 0.72332323, "num_input_tokens_seen": 35171850, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1953125, "step": 1637, "time_per_iteration": 2.44867205619812 }, { "auxiliary_loss_clip": 0.01173099, "auxiliary_loss_mlp": 0.01054424, "balance_loss_clip": 1.03275156, "balance_loss_mlp": 1.05133533, "epoch": 0.09848188787013377, "flos": 21214659991680.0, "grad_norm": 2.2461597821637667, "language_loss": 0.77069992, "learning_rate": 3.951006802345545e-06, "loss": 0.79297507, "num_input_tokens_seen": 35188795, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.21875, "step": 1638, "time_per_iteration": 2.456275224685669 }, { "auxiliary_loss_clip": 0.01169534, "auxiliary_loss_mlp": 0.01043172, "balance_loss_clip": 1.02277493, "balance_loss_mlp": 1.05131209, "epoch": 0.09854201112280174, "flos": 30154226071680.0, "grad_norm": 1.4156132364449667, "language_loss": 0.72434556, "learning_rate": 3.950921089880003e-06, "loss": 0.74647272, "num_input_tokens_seen": 35212100, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.1796875, "step": 1639, "time_per_iteration": 4.042776823043823 }, { "auxiliary_loss_clip": 0.01170747, "auxiliary_loss_mlp": 0.01042176, "balance_loss_clip": 1.02093291, "balance_loss_mlp": 1.05008912, "epoch": 0.09860213437546972, "flos": 21795730306560.0, "grad_norm": 1.7270542052029745, "language_loss": 0.88558787, "learning_rate": 3.950835303435337e-06, "loss": 0.90771711, "num_input_tokens_seen": 35230390, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.203125, "step": 1640, "time_per_iteration": 3.8982715606689453 }, { "auxiliary_loss_clip": 0.01174309, "auxiliary_loss_mlp": 0.01038486, "balance_loss_clip": 1.0176959, "balance_loss_mlp": 1.05298042, "epoch": 0.09866225762813768, "flos": 21835555511040.0, "grad_norm": 2.378057682580366, "language_loss": 0.8049618, "learning_rate": 3.950749443014801e-06, "loss": 0.82708979, "num_input_tokens_seen": 35250405, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.2109375, "step": 1641, "time_per_iteration": 3.898247718811035 }, { "auxiliary_loss_clip": 0.01173198, "auxiliary_loss_mlp": 0.01055096, "balance_loss_clip": 1.0331136, "balance_loss_mlp": 1.05218804, "epoch": 0.09872238088080565, "flos": 17599855916160.0, "grad_norm": 2.7990608280507856, "language_loss": 0.86427391, "learning_rate": 3.95066350862165e-06, "loss": 0.8865568, "num_input_tokens_seen": 35262820, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.2109375, "step": 1642, "time_per_iteration": 2.413634777069092 }, { "auxiliary_loss_clip": 0.01173381, "auxiliary_loss_mlp": 0.01046081, "balance_loss_clip": 1.02604198, "balance_loss_mlp": 1.05220306, "epoch": 0.09878250413347361, "flos": 27636134002560.0, "grad_norm": 1.5581891116881634, "language_loss": 0.80499309, "learning_rate": 3.950577500259144e-06, "loss": 0.82718766, "num_input_tokens_seen": 35284490, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.2109375, "step": 1643, "time_per_iteration": 2.5500283241271973 }, { "auxiliary_loss_clip": 0.01171938, "auxiliary_loss_mlp": 0.01058498, "balance_loss_clip": 1.03756475, "balance_loss_mlp": 1.05130851, "epoch": 0.0988426273861416, "flos": 16544728880640.0, "grad_norm": 2.212878435661174, "language_loss": 0.82592934, "learning_rate": 3.950491417930543e-06, "loss": 0.8482337, "num_input_tokens_seen": 35302815, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.203125, "step": 1644, "time_per_iteration": 2.437661647796631 }, { "auxiliary_loss_clip": 0.01168407, "auxiliary_loss_mlp": 0.01043084, "balance_loss_clip": 1.02257943, "balance_loss_mlp": 1.05044043, "epoch": 0.09890275063880956, "flos": 21215270522880.0, "grad_norm": 4.369977464926624, "language_loss": 0.68225121, "learning_rate": 3.9504052616391124e-06, "loss": 0.70436609, "num_input_tokens_seen": 35321175, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1796875, "step": 1645, "time_per_iteration": 2.492497444152832 }, { "auxiliary_loss_clip": 0.0108257, "auxiliary_loss_mlp": 0.0108153, "balance_loss_clip": 1.07814443, "balance_loss_mlp": 1.03444958, "epoch": 0.09896287389147752, "flos": 59379372910080.0, "grad_norm": 0.872428554240557, "language_loss": 0.60862964, "learning_rate": 3.950319031388119e-06, "loss": 0.63027066, "num_input_tokens_seen": 35381740, "router_z_loss_clip": 0.03393555, "router_z_loss_mlp": 0.48046875, "step": 1646, "time_per_iteration": 3.0271265506744385 }, { "auxiliary_loss_clip": 0.01169788, "auxiliary_loss_mlp": 0.01052114, "balance_loss_clip": 1.03004837, "balance_loss_mlp": 1.04954886, "epoch": 0.0990229971441455, "flos": 29642678530560.0, "grad_norm": 1.7445666707767438, "language_loss": 0.72879958, "learning_rate": 3.950232727180833e-06, "loss": 0.75101864, "num_input_tokens_seen": 35403760, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.203125, "step": 1647, "time_per_iteration": 2.543555974960327 }, { "auxiliary_loss_clip": 0.0117442, "auxiliary_loss_mlp": 0.01057929, "balance_loss_clip": 1.03809261, "balance_loss_mlp": 1.05345535, "epoch": 0.09908312039681347, "flos": 21834873152640.0, "grad_norm": 2.036454462543396, "language_loss": 0.84408647, "learning_rate": 3.950146349020525e-06, "loss": 0.86640996, "num_input_tokens_seen": 35424050, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.2109375, "step": 1648, "time_per_iteration": 2.4768013954162598 }, { "auxiliary_loss_clip": 0.01078003, "auxiliary_loss_mlp": 0.0100998, "balance_loss_clip": 1.0063324, "balance_loss_mlp": 1.03247726, "epoch": 0.09914324364948143, "flos": 57564304807680.0, "grad_norm": 0.7288997540013307, "language_loss": 0.5567503, "learning_rate": 3.950059896910473e-06, "loss": 0.57763016, "num_input_tokens_seen": 35481690, "router_z_loss_clip": 0.03637695, "router_z_loss_mlp": 0.45703125, "step": 1649, "time_per_iteration": 3.0394458770751953 }, { "auxiliary_loss_clip": 0.0116749, "auxiliary_loss_mlp": 0.01049009, "balance_loss_clip": 1.02826703, "balance_loss_mlp": 1.04871023, "epoch": 0.09920336690214941, "flos": 34123934476800.0, "grad_norm": 3.632329009355036, "language_loss": 0.89815855, "learning_rate": 3.949973370853954e-06, "loss": 0.92032361, "num_input_tokens_seen": 35498635, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1875, "step": 1650, "time_per_iteration": 2.5646443367004395 }, { "auxiliary_loss_clip": 0.01075628, "auxiliary_loss_mlp": 0.01008938, "balance_loss_clip": 1.00512338, "balance_loss_mlp": 1.03017926, "epoch": 0.09926349015481738, "flos": 71216428464000.0, "grad_norm": 0.7989033761097607, "language_loss": 0.63706279, "learning_rate": 3.94988677085425e-06, "loss": 0.65790844, "num_input_tokens_seen": 35565720, "router_z_loss_clip": 0.03808594, "router_z_loss_mlp": 0.45507812, "step": 1651, "time_per_iteration": 3.2651846408843994 }, { "auxiliary_loss_clip": 0.011695, "auxiliary_loss_mlp": 0.0106088, "balance_loss_clip": 1.03905237, "balance_loss_mlp": 1.05089641, "epoch": 0.09932361340748534, "flos": 23148700917120.0, "grad_norm": 1.7960237728277328, "language_loss": 0.88141453, "learning_rate": 3.949800096914643e-06, "loss": 0.90371823, "num_input_tokens_seen": 35586000, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.1875, "step": 1652, "time_per_iteration": 2.5080759525299072 }, { "auxiliary_loss_clip": 0.01174716, "auxiliary_loss_mlp": 0.01057081, "balance_loss_clip": 1.03668451, "balance_loss_mlp": 1.05328214, "epoch": 0.09938373666015332, "flos": 19828651847040.0, "grad_norm": 2.118996694828721, "language_loss": 0.81956518, "learning_rate": 3.949713349038422e-06, "loss": 0.84188312, "num_input_tokens_seen": 35604355, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.21875, "step": 1653, "time_per_iteration": 2.4601540565490723 }, { "auxiliary_loss_clip": 0.0117657, "auxiliary_loss_mlp": 0.01057916, "balance_loss_clip": 1.03751945, "balance_loss_mlp": 1.05452597, "epoch": 0.09944385991282129, "flos": 22090664880000.0, "grad_norm": 2.1760584551800823, "language_loss": 0.79617751, "learning_rate": 3.949626527228875e-06, "loss": 0.81852233, "num_input_tokens_seen": 35625495, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.21875, "step": 1654, "time_per_iteration": 2.528794765472412 }, { "auxiliary_loss_clip": 0.01175043, "auxiliary_loss_mlp": 0.01058716, "balance_loss_clip": 1.03907061, "balance_loss_mlp": 1.05693555, "epoch": 0.09950398316548925, "flos": 19828867328640.0, "grad_norm": 2.02807906030958, "language_loss": 0.81225193, "learning_rate": 3.949539631489295e-06, "loss": 0.83458954, "num_input_tokens_seen": 35645030, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.1796875, "step": 1655, "time_per_iteration": 2.449885845184326 }, { "auxiliary_loss_clip": 0.01168832, "auxiliary_loss_mlp": 0.01050233, "balance_loss_clip": 1.02918017, "balance_loss_mlp": 1.05162179, "epoch": 0.09956410641815722, "flos": 25003701964800.0, "grad_norm": 1.7129297372891388, "language_loss": 0.80905193, "learning_rate": 3.9494526618229765e-06, "loss": 0.83124256, "num_input_tokens_seen": 35664305, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.171875, "step": 1656, "time_per_iteration": 2.530778169631958 }, { "auxiliary_loss_clip": 0.01175367, "auxiliary_loss_mlp": 0.01060989, "balance_loss_clip": 1.0387094, "balance_loss_mlp": 1.05612326, "epoch": 0.0996242296708252, "flos": 19317714837120.0, "grad_norm": 1.6812036132744934, "language_loss": 0.8903383, "learning_rate": 3.949365618233217e-06, "loss": 0.91270185, "num_input_tokens_seen": 35684060, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.1875, "step": 1657, "time_per_iteration": 2.459310293197632 }, { "auxiliary_loss_clip": 0.01176289, "auxiliary_loss_mlp": 0.01048319, "balance_loss_clip": 1.02698088, "balance_loss_mlp": 1.05225503, "epoch": 0.09968435292349316, "flos": 21871609787520.0, "grad_norm": 3.0996903711128794, "language_loss": 0.84910667, "learning_rate": 3.9492785007233195e-06, "loss": 0.87135273, "num_input_tokens_seen": 35703250, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.2421875, "step": 1658, "time_per_iteration": 2.501915454864502 }, { "auxiliary_loss_clip": 0.01070331, "auxiliary_loss_mlp": 0.01015459, "balance_loss_clip": 1.01200187, "balance_loss_mlp": 1.02610517, "epoch": 0.09974447617616113, "flos": 65384533313280.0, "grad_norm": 0.9073602867388783, "language_loss": 0.60828668, "learning_rate": 3.949191309296585e-06, "loss": 0.62914455, "num_input_tokens_seen": 35762165, "router_z_loss_clip": 0.03466797, "router_z_loss_mlp": 0.44140625, "step": 1659, "time_per_iteration": 3.1132805347442627 }, { "auxiliary_loss_clip": 0.01172511, "auxiliary_loss_mlp": 0.01048182, "balance_loss_clip": 1.02701044, "balance_loss_mlp": 1.05279386, "epoch": 0.0998045994288291, "flos": 23659817495040.0, "grad_norm": 1.9905066260641964, "language_loss": 0.84927201, "learning_rate": 3.949104043956321e-06, "loss": 0.87147892, "num_input_tokens_seen": 35781520, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.1953125, "step": 1660, "time_per_iteration": 2.494677782058716 }, { "auxiliary_loss_clip": 0.01173504, "auxiliary_loss_mlp": 0.010511, "balance_loss_clip": 1.02967823, "balance_loss_mlp": 1.05504215, "epoch": 0.09986472268149707, "flos": 19609704495360.0, "grad_norm": 2.2409977180409184, "language_loss": 0.79634774, "learning_rate": 3.949016704705836e-06, "loss": 0.8185938, "num_input_tokens_seen": 35799565, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.1875, "step": 1661, "time_per_iteration": 2.458092451095581 }, { "auxiliary_loss_clip": 0.01175794, "auxiliary_loss_mlp": 0.01048339, "balance_loss_clip": 1.02634478, "balance_loss_mlp": 1.05099452, "epoch": 0.09992484593416504, "flos": 26213317395840.0, "grad_norm": 2.3518080186116537, "language_loss": 0.83642262, "learning_rate": 3.948929291548443e-06, "loss": 0.85866392, "num_input_tokens_seen": 35821085, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.25, "step": 1662, "time_per_iteration": 2.518554210662842 }, { "auxiliary_loss_clip": 0.01170774, "auxiliary_loss_mlp": 0.01059862, "balance_loss_clip": 1.03735507, "balance_loss_mlp": 1.05121291, "epoch": 0.09998496918683301, "flos": 17493632421120.0, "grad_norm": 2.151450724538351, "language_loss": 0.89217663, "learning_rate": 3.9488418044874546e-06, "loss": 0.91448301, "num_input_tokens_seen": 35839840, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.1953125, "step": 1663, "time_per_iteration": 2.431018590927124 }, { "auxiliary_loss_clip": 0.01176526, "auxiliary_loss_mlp": 0.01051282, "balance_loss_clip": 1.02958572, "balance_loss_mlp": 1.05402052, "epoch": 0.10004509243950098, "flos": 22784925928320.0, "grad_norm": 1.7416073338806617, "language_loss": 0.69943166, "learning_rate": 3.948754243526191e-06, "loss": 0.72170973, "num_input_tokens_seen": 35861545, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.2265625, "step": 1664, "time_per_iteration": 2.5157032012939453 }, { "auxiliary_loss_clip": 0.01176012, "auxiliary_loss_mlp": 0.01049525, "balance_loss_clip": 1.02862787, "balance_loss_mlp": 1.05576301, "epoch": 0.10010521569216894, "flos": 16253385667200.0, "grad_norm": 2.2066261780415495, "language_loss": 0.78785717, "learning_rate": 3.94866660866797e-06, "loss": 0.8101126, "num_input_tokens_seen": 35878295, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.203125, "step": 1665, "time_per_iteration": 2.433717727661133 }, { "auxiliary_loss_clip": 0.01178007, "auxiliary_loss_mlp": 0.0106533, "balance_loss_clip": 1.04431343, "balance_loss_mlp": 1.05734658, "epoch": 0.10016533894483691, "flos": 23402589223680.0, "grad_norm": 2.141656814463905, "language_loss": 0.69620532, "learning_rate": 3.9485788999161165e-06, "loss": 0.71863872, "num_input_tokens_seen": 35898990, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.203125, "step": 1666, "time_per_iteration": 2.52534556388855 }, { "auxiliary_loss_clip": 0.01175327, "auxiliary_loss_mlp": 0.01066298, "balance_loss_clip": 1.0435884, "balance_loss_mlp": 1.05163729, "epoch": 0.10022546219750489, "flos": 19354164163200.0, "grad_norm": 1.8915477388363935, "language_loss": 0.78580213, "learning_rate": 3.948491117273956e-06, "loss": 0.80821836, "num_input_tokens_seen": 35916225, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.234375, "step": 1667, "time_per_iteration": 2.440080165863037 }, { "auxiliary_loss_clip": 0.01174951, "auxiliary_loss_mlp": 0.01056196, "balance_loss_clip": 1.03457117, "balance_loss_mlp": 1.05388355, "epoch": 0.10028558545017285, "flos": 27085766837760.0, "grad_norm": 2.31869248595842, "language_loss": 0.771999, "learning_rate": 3.948403260744817e-06, "loss": 0.79431057, "num_input_tokens_seen": 35934630, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.203125, "step": 1668, "time_per_iteration": 2.560103416442871 }, { "auxiliary_loss_clip": 0.01171676, "auxiliary_loss_mlp": 0.01050212, "balance_loss_clip": 1.02800322, "balance_loss_mlp": 1.05158496, "epoch": 0.10034570870284082, "flos": 25847136195840.0, "grad_norm": 1.947587980213688, "language_loss": 0.77877688, "learning_rate": 3.948315330332031e-06, "loss": 0.80099577, "num_input_tokens_seen": 35953855, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.203125, "step": 1669, "time_per_iteration": 2.5190842151641846 }, { "auxiliary_loss_clip": 0.01179354, "auxiliary_loss_mlp": 0.01066732, "balance_loss_clip": 1.04461884, "balance_loss_mlp": 1.0548954, "epoch": 0.1004058319555088, "flos": 26249587153920.0, "grad_norm": 4.139339198707787, "language_loss": 0.85491538, "learning_rate": 3.948227326038933e-06, "loss": 0.8773762, "num_input_tokens_seen": 35974555, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.25, "step": 1670, "time_per_iteration": 2.5335936546325684 }, { "auxiliary_loss_clip": 0.01167278, "auxiliary_loss_mlp": 0.01052207, "balance_loss_clip": 1.03115463, "balance_loss_mlp": 1.05053091, "epoch": 0.10046595520817676, "flos": 25374480105600.0, "grad_norm": 1.6015022368336311, "language_loss": 0.77001619, "learning_rate": 3.9481392478688586e-06, "loss": 0.79221106, "num_input_tokens_seen": 35996830, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.171875, "step": 1671, "time_per_iteration": 2.512782096862793 }, { "auxiliary_loss_clip": 0.01064048, "auxiliary_loss_mlp": 0.01034534, "balance_loss_clip": 1.03167284, "balance_loss_mlp": 1.02092385, "epoch": 0.10052607846084473, "flos": 67461821677440.0, "grad_norm": 0.7853473283520085, "language_loss": 0.60831267, "learning_rate": 3.948051095825149e-06, "loss": 0.62929851, "num_input_tokens_seen": 36054465, "router_z_loss_clip": 0.02856445, "router_z_loss_mlp": 0.4296875, "step": 1672, "time_per_iteration": 3.0809266567230225 }, { "auxiliary_loss_clip": 0.01176064, "auxiliary_loss_mlp": 0.01055531, "balance_loss_clip": 1.03382349, "balance_loss_mlp": 1.05327857, "epoch": 0.10058620171351271, "flos": 21360493209600.0, "grad_norm": 1.9992471875621125, "language_loss": 0.77060276, "learning_rate": 3.947962869911147e-06, "loss": 0.79291868, "num_input_tokens_seen": 36073480, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.2265625, "step": 1673, "time_per_iteration": 2.488798141479492 }, { "auxiliary_loss_clip": 0.01171521, "auxiliary_loss_mlp": 0.01048435, "balance_loss_clip": 1.02756095, "balance_loss_mlp": 1.0500139, "epoch": 0.10064632496618067, "flos": 16800125558400.0, "grad_norm": 2.3886207948886544, "language_loss": 0.73271286, "learning_rate": 3.947874570130197e-06, "loss": 0.75491238, "num_input_tokens_seen": 36091830, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.21875, "step": 1674, "time_per_iteration": 2.4716339111328125 }, { "auxiliary_loss_clip": 0.01172941, "auxiliary_loss_mlp": 0.0104729, "balance_loss_clip": 1.02750158, "balance_loss_mlp": 1.05111051, "epoch": 0.10070644821884864, "flos": 23624445576960.0, "grad_norm": 1.8771844838403977, "language_loss": 0.79371953, "learning_rate": 3.947786196485649e-06, "loss": 0.81592178, "num_input_tokens_seen": 36111400, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.21875, "step": 1675, "time_per_iteration": 2.4874112606048584 }, { "auxiliary_loss_clip": 0.0117034, "auxiliary_loss_mlp": 0.01063252, "balance_loss_clip": 1.04351091, "balance_loss_mlp": 1.05181289, "epoch": 0.1007665714715166, "flos": 24462564595200.0, "grad_norm": 2.0489226352238985, "language_loss": 0.81669867, "learning_rate": 3.947697748980853e-06, "loss": 0.83903456, "num_input_tokens_seen": 36129345, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.1875, "step": 1676, "time_per_iteration": 2.530782461166382 }, { "auxiliary_loss_clip": 0.01178661, "auxiliary_loss_mlp": 0.01059186, "balance_loss_clip": 1.03777647, "balance_loss_mlp": 1.0562067, "epoch": 0.10082669472418458, "flos": 16799119977600.0, "grad_norm": 2.9550341278314134, "language_loss": 0.86027008, "learning_rate": 3.947609227619163e-06, "loss": 0.88264859, "num_input_tokens_seen": 36146255, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.21875, "step": 1677, "time_per_iteration": 2.4380741119384766 }, { "auxiliary_loss_clip": 0.011754, "auxiliary_loss_mlp": 0.01050644, "balance_loss_clip": 1.0295558, "balance_loss_mlp": 1.0545013, "epoch": 0.10088681797685255, "flos": 13553513844480.0, "grad_norm": 1.8893625436836068, "language_loss": 0.86049491, "learning_rate": 3.947520632403936e-06, "loss": 0.8827554, "num_input_tokens_seen": 36164050, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.2109375, "step": 1678, "time_per_iteration": 2.471874713897705 }, { "auxiliary_loss_clip": 0.01176537, "auxiliary_loss_mlp": 0.01051387, "balance_loss_clip": 1.0296793, "balance_loss_mlp": 1.05599904, "epoch": 0.10094694122952051, "flos": 25265706744960.0, "grad_norm": 1.9564550175978734, "language_loss": 0.89841819, "learning_rate": 3.947431963338532e-06, "loss": 0.92069745, "num_input_tokens_seen": 36183530, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.203125, "step": 1679, "time_per_iteration": 2.4899418354034424 }, { "auxiliary_loss_clip": 0.01062236, "auxiliary_loss_mlp": 0.0100596, "balance_loss_clip": 1.00306356, "balance_loss_mlp": 1.01880121, "epoch": 0.10100706448218849, "flos": 69854299885440.0, "grad_norm": 0.7797947907861645, "language_loss": 0.5298748, "learning_rate": 3.947343220426312e-06, "loss": 0.55055672, "num_input_tokens_seen": 36248550, "router_z_loss_clip": 0.02893066, "router_z_loss_mlp": 0.43359375, "step": 1680, "time_per_iteration": 3.1425259113311768 }, { "auxiliary_loss_clip": 0.01171134, "auxiliary_loss_mlp": 0.01056847, "balance_loss_clip": 1.03605723, "balance_loss_mlp": 1.05216229, "epoch": 0.10106718773485646, "flos": 20007163463040.0, "grad_norm": 2.157256411955729, "language_loss": 0.76536047, "learning_rate": 3.947254403670641e-06, "loss": 0.78764027, "num_input_tokens_seen": 36266065, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.1875, "step": 1681, "time_per_iteration": 4.0117528438568115 }, { "auxiliary_loss_clip": 0.01180816, "auxiliary_loss_mlp": 0.01061349, "balance_loss_clip": 1.03714931, "balance_loss_mlp": 1.05443013, "epoch": 0.10112731098752442, "flos": 13479825093120.0, "grad_norm": 2.3086744523321636, "language_loss": 0.93932724, "learning_rate": 3.947165513074889e-06, "loss": 0.96174896, "num_input_tokens_seen": 36280960, "router_z_loss_clip": 0.2421875, "router_z_loss_mlp": 1.265625, "step": 1682, "time_per_iteration": 5.3677544593811035 }, { "auxiliary_loss_clip": 0.01173125, "auxiliary_loss_mlp": 0.01050267, "balance_loss_clip": 1.02911937, "balance_loss_mlp": 1.05235076, "epoch": 0.1011874342401924, "flos": 18515901490560.0, "grad_norm": 2.9837497576546914, "language_loss": 0.87691605, "learning_rate": 3.947076548642425e-06, "loss": 0.89915001, "num_input_tokens_seen": 36299010, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.2109375, "step": 1683, "time_per_iteration": 2.4400742053985596 }, { "auxiliary_loss_clip": 0.01170037, "auxiliary_loss_mlp": 0.01056777, "balance_loss_clip": 1.03580785, "balance_loss_mlp": 1.05318809, "epoch": 0.10124755749286037, "flos": 20702861055360.0, "grad_norm": 1.7230388436147503, "language_loss": 0.74570215, "learning_rate": 3.946987510376624e-06, "loss": 0.76797032, "num_input_tokens_seen": 36318400, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.171875, "step": 1684, "time_per_iteration": 2.504993438720703 }, { "auxiliary_loss_clip": 0.01060589, "auxiliary_loss_mlp": 0.01007666, "balance_loss_clip": 1.00495994, "balance_loss_mlp": 1.01734161, "epoch": 0.10130768074552833, "flos": 56109456247680.0, "grad_norm": 0.7740536401305262, "language_loss": 0.61103743, "learning_rate": 3.9468983982808615e-06, "loss": 0.63171995, "num_input_tokens_seen": 36381815, "router_z_loss_clip": 0.02709961, "router_z_loss_mlp": 0.43164062, "step": 1685, "time_per_iteration": 3.1496798992156982 }, { "auxiliary_loss_clip": 0.01174284, "auxiliary_loss_mlp": 0.01056116, "balance_loss_clip": 1.03453875, "balance_loss_mlp": 1.05343318, "epoch": 0.1013678039981963, "flos": 33402346156800.0, "grad_norm": 2.15504875997031, "language_loss": 0.61400473, "learning_rate": 3.946809212358516e-06, "loss": 0.63630873, "num_input_tokens_seen": 36404320, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.203125, "step": 1686, "time_per_iteration": 2.5819637775421143 }, { "auxiliary_loss_clip": 0.01174823, "auxiliary_loss_mlp": 0.01053244, "balance_loss_clip": 1.03145313, "balance_loss_mlp": 1.05683517, "epoch": 0.10142792725086427, "flos": 31905338008320.0, "grad_norm": 2.5315912866144763, "language_loss": 0.81164956, "learning_rate": 3.946719952612972e-06, "loss": 0.83393025, "num_input_tokens_seen": 36427510, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.1796875, "step": 1687, "time_per_iteration": 2.572051525115967 }, { "auxiliary_loss_clip": 0.01178635, "auxiliary_loss_mlp": 0.01047661, "balance_loss_clip": 1.02658522, "balance_loss_mlp": 1.05620706, "epoch": 0.10148805050353224, "flos": 28475905046400.0, "grad_norm": 1.9191649980861187, "language_loss": 0.72081262, "learning_rate": 3.94663061904761e-06, "loss": 0.74307561, "num_input_tokens_seen": 36448230, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.2265625, "step": 1688, "time_per_iteration": 2.5614044666290283 }, { "auxiliary_loss_clip": 0.01173016, "auxiliary_loss_mlp": 0.01056963, "balance_loss_clip": 1.03602982, "balance_loss_mlp": 1.0544591, "epoch": 0.1015481737562002, "flos": 25148888737920.0, "grad_norm": 2.2403330124434393, "language_loss": 0.87149549, "learning_rate": 3.94654121166582e-06, "loss": 0.89379525, "num_input_tokens_seen": 36464395, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.1875, "step": 1689, "time_per_iteration": 2.4744327068328857 }, { "auxiliary_loss_clip": 0.01170409, "auxiliary_loss_mlp": 0.01054968, "balance_loss_clip": 1.03488171, "balance_loss_mlp": 1.05069733, "epoch": 0.10160829700886818, "flos": 30882781630080.0, "grad_norm": 1.8236361142833322, "language_loss": 0.88057733, "learning_rate": 3.946451730470993e-06, "loss": 0.9028312, "num_input_tokens_seen": 36486475, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1953125, "step": 1690, "time_per_iteration": 2.5685272216796875 }, { "auxiliary_loss_clip": 0.01173695, "auxiliary_loss_mlp": 0.0105128, "balance_loss_clip": 1.02998972, "balance_loss_mlp": 1.0519495, "epoch": 0.10166842026153615, "flos": 20412020632320.0, "grad_norm": 2.0993986629714865, "language_loss": 0.83860588, "learning_rate": 3.946362175466521e-06, "loss": 0.86085558, "num_input_tokens_seen": 36505310, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.21875, "step": 1691, "time_per_iteration": 2.452873468399048 }, { "auxiliary_loss_clip": 0.01177327, "auxiliary_loss_mlp": 0.01047548, "balance_loss_clip": 1.02619791, "balance_loss_mlp": 1.05608082, "epoch": 0.10172854351420411, "flos": 33476968661760.0, "grad_norm": 1.585073297383619, "language_loss": 0.66704869, "learning_rate": 3.946272546655801e-06, "loss": 0.68929744, "num_input_tokens_seen": 36529820, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.2109375, "step": 1692, "time_per_iteration": 2.612335443496704 }, { "auxiliary_loss_clip": 0.01172363, "auxiliary_loss_mlp": 0.01061015, "balance_loss_clip": 1.039855, "balance_loss_mlp": 1.05148995, "epoch": 0.1017886667668721, "flos": 23550325862400.0, "grad_norm": 1.6122747783794773, "language_loss": 0.76199305, "learning_rate": 3.94618284404223e-06, "loss": 0.78432685, "num_input_tokens_seen": 36549000, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.2109375, "step": 1693, "time_per_iteration": 2.493123769760132 }, { "auxiliary_loss_clip": 0.0117525, "auxiliary_loss_mlp": 0.01052016, "balance_loss_clip": 1.0299145, "balance_loss_mlp": 1.05461097, "epoch": 0.10184879001954006, "flos": 23296078419840.0, "grad_norm": 1.7889718979121296, "language_loss": 0.87285984, "learning_rate": 3.9460930676292105e-06, "loss": 0.89513242, "num_input_tokens_seen": 36567515, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.203125, "step": 1694, "time_per_iteration": 2.4811012744903564 }, { "auxiliary_loss_clip": 0.01180724, "auxiliary_loss_mlp": 0.01050647, "balance_loss_clip": 1.02850962, "balance_loss_mlp": 1.05526161, "epoch": 0.10190891327220802, "flos": 18333116156160.0, "grad_norm": 1.9907675479233486, "language_loss": 0.79121256, "learning_rate": 3.946003217420147e-06, "loss": 0.81352627, "num_input_tokens_seen": 36586190, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.25, "step": 1695, "time_per_iteration": 2.4356203079223633 }, { "auxiliary_loss_clip": 0.01175623, "auxiliary_loss_mlp": 0.0105309, "balance_loss_clip": 1.03141785, "balance_loss_mlp": 1.0542804, "epoch": 0.10196903652487599, "flos": 26465374108800.0, "grad_norm": 1.6163221254090376, "language_loss": 0.86361897, "learning_rate": 3.945913293418447e-06, "loss": 0.8859061, "num_input_tokens_seen": 36607495, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.2109375, "step": 1696, "time_per_iteration": 2.5214524269104004 }, { "auxiliary_loss_clip": 0.01173149, "auxiliary_loss_mlp": 0.01057646, "balance_loss_clip": 1.03695107, "balance_loss_mlp": 1.05465126, "epoch": 0.10202915977754397, "flos": 21869526798720.0, "grad_norm": 3.6684961864782357, "language_loss": 0.82241297, "learning_rate": 3.945823295627519e-06, "loss": 0.84472096, "num_input_tokens_seen": 36628555, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1875, "step": 1697, "time_per_iteration": 2.488661050796509 }, { "auxiliary_loss_clip": 0.01176641, "auxiliary_loss_mlp": 0.01048078, "balance_loss_clip": 1.02640557, "balance_loss_mlp": 1.0547061, "epoch": 0.10208928303021193, "flos": 22309755886080.0, "grad_norm": 2.098508510816458, "language_loss": 0.80634773, "learning_rate": 3.9457332240507775e-06, "loss": 0.82859486, "num_input_tokens_seen": 36646250, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.21875, "step": 1698, "time_per_iteration": 2.4672720432281494 }, { "auxiliary_loss_clip": 0.01177137, "auxiliary_loss_mlp": 0.01047298, "balance_loss_clip": 1.02719903, "balance_loss_mlp": 1.05634618, "epoch": 0.1021494062828799, "flos": 22125569921280.0, "grad_norm": 2.5583460215420066, "language_loss": 0.76035666, "learning_rate": 3.945643078691637e-06, "loss": 0.782601, "num_input_tokens_seen": 36666675, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.2109375, "step": 1699, "time_per_iteration": 2.471041440963745 }, { "auxiliary_loss_clip": 0.0117475, "auxiliary_loss_mlp": 0.01046518, "balance_loss_clip": 1.02564502, "balance_loss_mlp": 1.0557065, "epoch": 0.10220952953554788, "flos": 19646728439040.0, "grad_norm": 1.77620414037664, "language_loss": 0.80172682, "learning_rate": 3.945552859553516e-06, "loss": 0.82393956, "num_input_tokens_seen": 36685225, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1875, "step": 1700, "time_per_iteration": 2.4720230102539062 }, { "auxiliary_loss_clip": 0.01176516, "auxiliary_loss_mlp": 0.01043638, "balance_loss_clip": 1.02283621, "balance_loss_mlp": 1.05459297, "epoch": 0.10226965278821584, "flos": 29787290686080.0, "grad_norm": 2.1598668382809634, "language_loss": 0.76848531, "learning_rate": 3.945462566639836e-06, "loss": 0.79068679, "num_input_tokens_seen": 36705985, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.21875, "step": 1701, "time_per_iteration": 2.52449893951416 }, { "auxiliary_loss_clip": 0.01177386, "auxiliary_loss_mlp": 0.01049831, "balance_loss_clip": 1.02815902, "balance_loss_mlp": 1.0548135, "epoch": 0.10232977604088381, "flos": 27016818681600.0, "grad_norm": 2.112262583306974, "language_loss": 0.78039598, "learning_rate": 3.945372199954019e-06, "loss": 0.80266815, "num_input_tokens_seen": 36725815, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.2265625, "step": 1702, "time_per_iteration": 2.5324201583862305 }, { "auxiliary_loss_clip": 0.01171524, "auxiliary_loss_mlp": 0.01046352, "balance_loss_clip": 1.02616966, "balance_loss_mlp": 1.05310237, "epoch": 0.10238989929355179, "flos": 20777519473920.0, "grad_norm": 1.9919547656741, "language_loss": 0.94623697, "learning_rate": 3.945281759499494e-06, "loss": 0.96841574, "num_input_tokens_seen": 36742345, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1875, "step": 1703, "time_per_iteration": 2.466526746749878 }, { "auxiliary_loss_clip": 0.01077121, "auxiliary_loss_mlp": 0.01031466, "balance_loss_clip": 1.02891469, "balance_loss_mlp": 1.0332396, "epoch": 0.10245002254621975, "flos": 57698322451200.0, "grad_norm": 0.902004809609206, "language_loss": 0.55060935, "learning_rate": 3.94519124527969e-06, "loss": 0.57169521, "num_input_tokens_seen": 36798775, "router_z_loss_clip": 0.0255127, "router_z_loss_mlp": 0.4375, "step": 1704, "time_per_iteration": 3.0226032733917236 }, { "auxiliary_loss_clip": 0.01173962, "auxiliary_loss_mlp": 0.01046207, "balance_loss_clip": 1.02526152, "balance_loss_mlp": 1.05394673, "epoch": 0.10251014579888772, "flos": 16800125558400.0, "grad_norm": 2.1511702084103046, "language_loss": 0.84112799, "learning_rate": 3.945100657298039e-06, "loss": 0.86332965, "num_input_tokens_seen": 36816295, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.203125, "step": 1705, "time_per_iteration": 2.4776151180267334 }, { "auxiliary_loss_clip": 0.0106914, "auxiliary_loss_mlp": 0.01008622, "balance_loss_clip": 1.00611901, "balance_loss_mlp": 1.02519107, "epoch": 0.1025702690515557, "flos": 68565500922240.0, "grad_norm": 0.7895134110199773, "language_loss": 0.60445893, "learning_rate": 3.9450099955579765e-06, "loss": 0.62523663, "num_input_tokens_seen": 36882030, "router_z_loss_clip": 0.02502441, "router_z_loss_mlp": 0.43945312, "step": 1706, "time_per_iteration": 3.154102325439453 }, { "auxiliary_loss_clip": 0.01176096, "auxiliary_loss_mlp": 0.01046021, "balance_loss_clip": 1.02481365, "balance_loss_mlp": 1.05453229, "epoch": 0.10263039230422366, "flos": 14866623336960.0, "grad_norm": 2.528293549733404, "language_loss": 0.86321878, "learning_rate": 3.94491926006294e-06, "loss": 0.88543987, "num_input_tokens_seen": 36899245, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.21875, "step": 1707, "time_per_iteration": 2.4780635833740234 }, { "auxiliary_loss_clip": 0.01170472, "auxiliary_loss_mlp": 0.01045204, "balance_loss_clip": 1.0257256, "balance_loss_mlp": 1.05200553, "epoch": 0.10269051555689163, "flos": 25337599816320.0, "grad_norm": 1.505888262801849, "language_loss": 0.73221219, "learning_rate": 3.944828450816369e-06, "loss": 0.75436896, "num_input_tokens_seen": 36920950, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.1875, "step": 1708, "time_per_iteration": 2.5223758220672607 }, { "auxiliary_loss_clip": 0.01172861, "auxiliary_loss_mlp": 0.01051725, "balance_loss_clip": 1.03020799, "balance_loss_mlp": 1.05359769, "epoch": 0.10275063880955959, "flos": 21068826773760.0, "grad_norm": 1.652105447205804, "language_loss": 0.90864438, "learning_rate": 3.944737567821709e-06, "loss": 0.9308902, "num_input_tokens_seen": 36938900, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.1953125, "step": 1709, "time_per_iteration": 2.4769582748413086 }, { "auxiliary_loss_clip": 0.01173534, "auxiliary_loss_mlp": 0.01051798, "balance_loss_clip": 1.03065014, "balance_loss_mlp": 1.05550158, "epoch": 0.10281076206222757, "flos": 30366780802560.0, "grad_norm": 2.160094843538502, "language_loss": 0.88107431, "learning_rate": 3.944646611082406e-06, "loss": 0.90332758, "num_input_tokens_seen": 36957010, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1796875, "step": 1710, "time_per_iteration": 2.5315043926239014 }, { "auxiliary_loss_clip": 0.01167756, "auxiliary_loss_mlp": 0.01053556, "balance_loss_clip": 1.03321934, "balance_loss_mlp": 1.05100405, "epoch": 0.10287088531489554, "flos": 22418313765120.0, "grad_norm": 1.8063266854301745, "language_loss": 0.7923184, "learning_rate": 3.944555580601908e-06, "loss": 0.81453145, "num_input_tokens_seen": 36977690, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1640625, "step": 1711, "time_per_iteration": 2.4847164154052734 }, { "auxiliary_loss_clip": 0.01174202, "auxiliary_loss_mlp": 0.01056075, "balance_loss_clip": 1.0343076, "balance_loss_mlp": 1.05390835, "epoch": 0.1029310085675635, "flos": 25115994858240.0, "grad_norm": 1.780401867118776, "language_loss": 0.73577082, "learning_rate": 3.944464476383668e-06, "loss": 0.75807357, "num_input_tokens_seen": 36997300, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.203125, "step": 1712, "time_per_iteration": 2.4758987426757812 }, { "auxiliary_loss_clip": 0.0117058, "auxiliary_loss_mlp": 0.01055059, "balance_loss_clip": 1.03548491, "balance_loss_mlp": 1.05543971, "epoch": 0.10299113182023148, "flos": 19865639877120.0, "grad_norm": 2.120977729526623, "language_loss": 0.86855978, "learning_rate": 3.94437329843114e-06, "loss": 0.89081621, "num_input_tokens_seen": 37016110, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.15625, "step": 1713, "time_per_iteration": 2.4783196449279785 }, { "auxiliary_loss_clip": 0.01169572, "auxiliary_loss_mlp": 0.01061378, "balance_loss_clip": 1.04133916, "balance_loss_mlp": 1.05206323, "epoch": 0.10305125507289944, "flos": 20447608032000.0, "grad_norm": 1.6225981713288307, "language_loss": 0.72411227, "learning_rate": 3.944282046747782e-06, "loss": 0.74642181, "num_input_tokens_seen": 37036405, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.171875, "step": 1714, "time_per_iteration": 2.484065532684326 }, { "auxiliary_loss_clip": 0.0117505, "auxiliary_loss_mlp": 0.0106303, "balance_loss_clip": 1.0415957, "balance_loss_mlp": 1.05329537, "epoch": 0.10311137832556741, "flos": 26250772302720.0, "grad_norm": 1.9833797823449268, "language_loss": 0.9119215, "learning_rate": 3.944190721337053e-06, "loss": 0.93430233, "num_input_tokens_seen": 37057580, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.21875, "step": 1715, "time_per_iteration": 2.5311686992645264 }, { "auxiliary_loss_clip": 0.01171916, "auxiliary_loss_mlp": 0.01054315, "balance_loss_clip": 1.03396595, "balance_loss_mlp": 1.05403459, "epoch": 0.10317150157823539, "flos": 35298932175360.0, "grad_norm": 2.2846183093353676, "language_loss": 0.75529963, "learning_rate": 3.944099322202418e-06, "loss": 0.7775619, "num_input_tokens_seen": 37079120, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1796875, "step": 1716, "time_per_iteration": 2.5805068016052246 }, { "auxiliary_loss_clip": 0.01174942, "auxiliary_loss_mlp": 0.01063672, "balance_loss_clip": 1.04272723, "balance_loss_mlp": 1.05344629, "epoch": 0.10323162483090335, "flos": 25739943033600.0, "grad_norm": 1.8624781795485001, "language_loss": 0.84897745, "learning_rate": 3.944007849347342e-06, "loss": 0.87136364, "num_input_tokens_seen": 37099710, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.21875, "step": 1717, "time_per_iteration": 2.5089309215545654 }, { "auxiliary_loss_clip": 0.01174214, "auxiliary_loss_mlp": 0.01062217, "balance_loss_clip": 1.04261887, "balance_loss_mlp": 1.05645728, "epoch": 0.10329174808357132, "flos": 16289870906880.0, "grad_norm": 2.0013413172461374, "language_loss": 0.83005571, "learning_rate": 3.943916302775292e-06, "loss": 0.85242003, "num_input_tokens_seen": 37117775, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.1796875, "step": 1718, "time_per_iteration": 2.4423468112945557 }, { "auxiliary_loss_clip": 0.01170154, "auxiliary_loss_mlp": 0.01052543, "balance_loss_clip": 1.03172898, "balance_loss_mlp": 1.05344081, "epoch": 0.10335187133623928, "flos": 36687166963200.0, "grad_norm": 1.8962669885911998, "language_loss": 0.73123676, "learning_rate": 3.943824682489742e-06, "loss": 0.75346375, "num_input_tokens_seen": 37140280, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.171875, "step": 1719, "time_per_iteration": 2.643075942993164 }, { "auxiliary_loss_clip": 0.01173271, "auxiliary_loss_mlp": 0.01045948, "balance_loss_clip": 1.02626646, "balance_loss_mlp": 1.05438113, "epoch": 0.10341199458890726, "flos": 14975648092800.0, "grad_norm": 1.7896514027941186, "language_loss": 0.92665905, "learning_rate": 3.9437329884941665e-06, "loss": 0.94885123, "num_input_tokens_seen": 37158350, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.1875, "step": 1720, "time_per_iteration": 2.4328370094299316 }, { "auxiliary_loss_clip": 0.01172003, "auxiliary_loss_mlp": 0.01052687, "balance_loss_clip": 1.0305258, "balance_loss_mlp": 1.05266094, "epoch": 0.10347211784157523, "flos": 21031587348480.0, "grad_norm": 1.7383651698872444, "language_loss": 0.79107195, "learning_rate": 3.943641220792039e-06, "loss": 0.81331885, "num_input_tokens_seen": 37177120, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.1953125, "step": 1721, "time_per_iteration": 2.472567319869995 }, { "auxiliary_loss_clip": 0.01177306, "auxiliary_loss_mlp": 0.01054637, "balance_loss_clip": 1.03158188, "balance_loss_mlp": 1.05411458, "epoch": 0.1035322410942432, "flos": 19792094780160.0, "grad_norm": 1.7833495130265415, "language_loss": 0.80764806, "learning_rate": 3.9435493793868434e-06, "loss": 0.82996744, "num_input_tokens_seen": 37195895, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.2265625, "step": 1722, "time_per_iteration": 3.9974045753479004 }, { "auxiliary_loss_clip": 0.01070977, "auxiliary_loss_mlp": 0.01047596, "balance_loss_clip": 1.04533124, "balance_loss_mlp": 1.02830565, "epoch": 0.10359236434691117, "flos": 52698874947840.0, "grad_norm": 0.9526814270322669, "language_loss": 0.67223406, "learning_rate": 3.943457464282059e-06, "loss": 0.69341981, "num_input_tokens_seen": 37247270, "router_z_loss_clip": 0.02270508, "router_z_loss_mlp": 0.42578125, "step": 1723, "time_per_iteration": 4.282569646835327 }, { "auxiliary_loss_clip": 0.0117318, "auxiliary_loss_mlp": 0.01054079, "balance_loss_clip": 1.03370595, "balance_loss_mlp": 1.05165803, "epoch": 0.10365248759957914, "flos": 18405404277120.0, "grad_norm": 15.34530060728734, "language_loss": 0.77894235, "learning_rate": 3.9433654754811745e-06, "loss": 0.80121499, "num_input_tokens_seen": 37265595, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.21875, "step": 1724, "time_per_iteration": 3.8838295936584473 }, { "auxiliary_loss_clip": 0.01177574, "auxiliary_loss_mlp": 0.01052061, "balance_loss_clip": 1.03172374, "balance_loss_mlp": 1.05503392, "epoch": 0.1037126108522471, "flos": 47553555335040.0, "grad_norm": 1.7472538535429054, "language_loss": 0.75119925, "learning_rate": 3.943273412987676e-06, "loss": 0.77349561, "num_input_tokens_seen": 37286660, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.2265625, "step": 1725, "time_per_iteration": 2.696239709854126 }, { "auxiliary_loss_clip": 0.01172886, "auxiliary_loss_mlp": 0.01047706, "balance_loss_clip": 1.0276072, "balance_loss_mlp": 1.05459619, "epoch": 0.10377273410491508, "flos": 22816670572800.0, "grad_norm": 2.3563807001547574, "language_loss": 0.75163102, "learning_rate": 3.943181276805054e-06, "loss": 0.77383691, "num_input_tokens_seen": 37304915, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1796875, "step": 1726, "time_per_iteration": 2.464900255203247 }, { "auxiliary_loss_clip": 0.01176307, "auxiliary_loss_mlp": 0.01057341, "balance_loss_clip": 1.03618157, "balance_loss_mlp": 1.05514789, "epoch": 0.10383285735758305, "flos": 26138694890880.0, "grad_norm": 2.193834232277544, "language_loss": 0.73579824, "learning_rate": 3.9430890669368035e-06, "loss": 0.75813466, "num_input_tokens_seen": 37325265, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.2109375, "step": 1727, "time_per_iteration": 2.5240707397460938 }, { "auxiliary_loss_clip": 0.01172729, "auxiliary_loss_mlp": 0.01053492, "balance_loss_clip": 1.03289306, "balance_loss_mlp": 1.05291414, "epoch": 0.10389298061025101, "flos": 17091791994240.0, "grad_norm": 2.234022868483945, "language_loss": 0.84752774, "learning_rate": 3.942996783386422e-06, "loss": 0.86979002, "num_input_tokens_seen": 37341650, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.203125, "step": 1728, "time_per_iteration": 2.4543540477752686 }, { "auxiliary_loss_clip": 0.01172049, "auxiliary_loss_mlp": 0.01050282, "balance_loss_clip": 1.02933645, "balance_loss_mlp": 1.05313313, "epoch": 0.10395310386291898, "flos": 20776513893120.0, "grad_norm": 1.9108335372948677, "language_loss": 0.70424151, "learning_rate": 3.942904426157406e-06, "loss": 0.72646481, "num_input_tokens_seen": 37360270, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1875, "step": 1729, "time_per_iteration": 2.481889247894287 }, { "auxiliary_loss_clip": 0.01172685, "auxiliary_loss_mlp": 0.01050708, "balance_loss_clip": 1.02890515, "balance_loss_mlp": 1.05329657, "epoch": 0.10401322711558696, "flos": 12820540913280.0, "grad_norm": 4.856923250997846, "language_loss": 0.81460953, "learning_rate": 3.9428119952532605e-06, "loss": 0.83684343, "num_input_tokens_seen": 37375225, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.1953125, "step": 1730, "time_per_iteration": 2.4158685207366943 }, { "auxiliary_loss_clip": 0.01170828, "auxiliary_loss_mlp": 0.01049113, "balance_loss_clip": 1.02946723, "balance_loss_mlp": 1.05309641, "epoch": 0.10407335036825492, "flos": 23184683366400.0, "grad_norm": 2.034249616478397, "language_loss": 0.75762779, "learning_rate": 3.942719490677489e-06, "loss": 0.77982724, "num_input_tokens_seen": 37395165, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.171875, "step": 1731, "time_per_iteration": 2.5068368911743164 }, { "auxiliary_loss_clip": 0.01165907, "auxiliary_loss_mlp": 0.0104754, "balance_loss_clip": 1.02860999, "balance_loss_mlp": 1.05158496, "epoch": 0.10413347362092289, "flos": 26104184899200.0, "grad_norm": 1.7194502863504195, "language_loss": 0.82399213, "learning_rate": 3.9426269124336e-06, "loss": 0.84612662, "num_input_tokens_seen": 37414845, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.140625, "step": 1732, "time_per_iteration": 2.506211996078491 }, { "auxiliary_loss_clip": 0.01172005, "auxiliary_loss_mlp": 0.01044211, "balance_loss_clip": 1.0251497, "balance_loss_mlp": 1.0548892, "epoch": 0.10419359687359087, "flos": 12641059630080.0, "grad_norm": 2.2077481462557773, "language_loss": 0.83196706, "learning_rate": 3.942534260525104e-06, "loss": 0.8541292, "num_input_tokens_seen": 37432490, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.171875, "step": 1733, "time_per_iteration": 2.4483988285064697 }, { "auxiliary_loss_clip": 0.01174786, "auxiliary_loss_mlp": 0.01050633, "balance_loss_clip": 1.03086829, "balance_loss_mlp": 1.05326605, "epoch": 0.10425372012625883, "flos": 12125094716160.0, "grad_norm": 2.2108238150167305, "language_loss": 0.76529968, "learning_rate": 3.942441534955514e-06, "loss": 0.78755391, "num_input_tokens_seen": 37449435, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.21875, "step": 1734, "time_per_iteration": 2.427457571029663 }, { "auxiliary_loss_clip": 0.01167192, "auxiliary_loss_mlp": 0.01047408, "balance_loss_clip": 1.02815533, "balance_loss_mlp": 1.05085945, "epoch": 0.1043138433789268, "flos": 25337563902720.0, "grad_norm": 1.7107458555035038, "language_loss": 0.75280112, "learning_rate": 3.9423487357283465e-06, "loss": 0.77494711, "num_input_tokens_seen": 37469105, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.1640625, "step": 1735, "time_per_iteration": 2.5220863819122314 }, { "auxiliary_loss_clip": 0.01171913, "auxiliary_loss_mlp": 0.01053199, "balance_loss_clip": 1.03323102, "balance_loss_mlp": 1.05364156, "epoch": 0.10437396663159478, "flos": 29167149352320.0, "grad_norm": 2.308736537451229, "language_loss": 0.78625435, "learning_rate": 3.94225586284712e-06, "loss": 0.80850542, "num_input_tokens_seen": 37490540, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.1796875, "step": 1736, "time_per_iteration": 2.537466526031494 }, { "auxiliary_loss_clip": 0.01169364, "auxiliary_loss_mlp": 0.01060288, "balance_loss_clip": 1.0398196, "balance_loss_mlp": 1.05299497, "epoch": 0.10443408988426274, "flos": 25080946162560.0, "grad_norm": 2.305207835567326, "language_loss": 0.70346284, "learning_rate": 3.942162916315356e-06, "loss": 0.72575927, "num_input_tokens_seen": 37511905, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.1640625, "step": 1737, "time_per_iteration": 2.525601625442505 }, { "auxiliary_loss_clip": 0.01172648, "auxiliary_loss_mlp": 0.01053015, "balance_loss_clip": 1.02993619, "balance_loss_mlp": 1.05048311, "epoch": 0.1044942131369307, "flos": 26759662237440.0, "grad_norm": 1.8668226147905458, "language_loss": 0.81552976, "learning_rate": 3.942069896136581e-06, "loss": 0.83778644, "num_input_tokens_seen": 37533635, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.21875, "step": 1738, "time_per_iteration": 2.499835968017578 }, { "auxiliary_loss_clip": 0.01170648, "auxiliary_loss_mlp": 0.01058292, "balance_loss_clip": 1.03666782, "balance_loss_mlp": 1.05074632, "epoch": 0.10455433638959867, "flos": 18442571875200.0, "grad_norm": 2.843437423238513, "language_loss": 0.75115484, "learning_rate": 3.9419768023143196e-06, "loss": 0.77344429, "num_input_tokens_seen": 37552035, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.203125, "step": 1739, "time_per_iteration": 2.4788858890533447 }, { "auxiliary_loss_clip": 0.01169138, "auxiliary_loss_mlp": 0.01055571, "balance_loss_clip": 1.03587759, "balance_loss_mlp": 1.05214643, "epoch": 0.10461445964226665, "flos": 23218977876480.0, "grad_norm": 1.7813952995879616, "language_loss": 0.77746987, "learning_rate": 3.941883634852104e-06, "loss": 0.79971695, "num_input_tokens_seen": 37571540, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.171875, "step": 1740, "time_per_iteration": 2.4829957485198975 }, { "auxiliary_loss_clip": 0.01172485, "auxiliary_loss_mlp": 0.01046735, "balance_loss_clip": 1.0271486, "balance_loss_mlp": 1.05463719, "epoch": 0.10467458289493461, "flos": 24345243797760.0, "grad_norm": 2.184408557662549, "language_loss": 0.86085969, "learning_rate": 3.941790393753467e-06, "loss": 0.88305187, "num_input_tokens_seen": 37588265, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.171875, "step": 1741, "time_per_iteration": 2.5187063217163086 }, { "auxiliary_loss_clip": 0.0117309, "auxiliary_loss_mlp": 0.01056251, "balance_loss_clip": 1.03453112, "balance_loss_mlp": 1.05267453, "epoch": 0.10473470614760258, "flos": 21287953693440.0, "grad_norm": 3.348807755799902, "language_loss": 0.75272954, "learning_rate": 3.941697079021942e-06, "loss": 0.77502292, "num_input_tokens_seen": 37606860, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.203125, "step": 1742, "time_per_iteration": 2.458249568939209 }, { "auxiliary_loss_clip": 0.01173148, "auxiliary_loss_mlp": 0.01051836, "balance_loss_clip": 1.03259611, "balance_loss_mlp": 1.05725372, "epoch": 0.10479482940027056, "flos": 21687208341120.0, "grad_norm": 2.8666301261912217, "language_loss": 0.87428814, "learning_rate": 3.94160369066107e-06, "loss": 0.89653796, "num_input_tokens_seen": 37625210, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.15625, "step": 1743, "time_per_iteration": 2.494318962097168 }, { "auxiliary_loss_clip": 0.01167207, "auxiliary_loss_mlp": 0.01047693, "balance_loss_clip": 1.0265094, "balance_loss_mlp": 1.05089355, "epoch": 0.10485495265293852, "flos": 21573694385280.0, "grad_norm": 2.183572397860958, "language_loss": 0.75939429, "learning_rate": 3.941510228674391e-06, "loss": 0.78154337, "num_input_tokens_seen": 37644110, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.1640625, "step": 1744, "time_per_iteration": 2.461723566055298 }, { "auxiliary_loss_clip": 0.01168893, "auxiliary_loss_mlp": 0.01054863, "balance_loss_clip": 1.03597999, "balance_loss_mlp": 1.05329466, "epoch": 0.10491507590560649, "flos": 37961923708800.0, "grad_norm": 4.215698499640567, "language_loss": 0.79535234, "learning_rate": 3.941416693065451e-06, "loss": 0.81758988, "num_input_tokens_seen": 37665800, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.15625, "step": 1745, "time_per_iteration": 2.6356611251831055 }, { "auxiliary_loss_clip": 0.01166623, "auxiliary_loss_mlp": 0.01062788, "balance_loss_clip": 1.04298687, "balance_loss_mlp": 1.04928136, "epoch": 0.10497519915827447, "flos": 26396282298240.0, "grad_norm": 2.192676127099306, "language_loss": 0.83474004, "learning_rate": 3.941323083837794e-06, "loss": 0.85703421, "num_input_tokens_seen": 37685095, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.171875, "step": 1746, "time_per_iteration": 2.4933247566223145 }, { "auxiliary_loss_clip": 0.01168151, "auxiliary_loss_mlp": 0.01056144, "balance_loss_clip": 1.03630805, "balance_loss_mlp": 1.05216718, "epoch": 0.10503532241094243, "flos": 40662190581120.0, "grad_norm": 3.9212479955033817, "language_loss": 0.70335281, "learning_rate": 3.941229400994971e-06, "loss": 0.72559571, "num_input_tokens_seen": 37707445, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.15625, "step": 1747, "time_per_iteration": 2.652832269668579 }, { "auxiliary_loss_clip": 0.01176725, "auxiliary_loss_mlp": 0.010591, "balance_loss_clip": 1.03909636, "balance_loss_mlp": 1.05375314, "epoch": 0.1050954456636104, "flos": 29789409588480.0, "grad_norm": 2.1734335263408706, "language_loss": 0.84428257, "learning_rate": 3.941135644540535e-06, "loss": 0.86664081, "num_input_tokens_seen": 37728325, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.234375, "step": 1748, "time_per_iteration": 2.5535550117492676 }, { "auxiliary_loss_clip": 0.01163575, "auxiliary_loss_mlp": 0.01049406, "balance_loss_clip": 1.02853286, "balance_loss_mlp": 1.04822052, "epoch": 0.10515556891627838, "flos": 23948754497280.0, "grad_norm": 1.6747407327875974, "language_loss": 0.71413183, "learning_rate": 3.941041814478041e-06, "loss": 0.73626173, "num_input_tokens_seen": 37748910, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.15625, "step": 1749, "time_per_iteration": 2.4984233379364014 }, { "auxiliary_loss_clip": 0.01165877, "auxiliary_loss_mlp": 0.01060986, "balance_loss_clip": 1.04019582, "balance_loss_mlp": 1.05170846, "epoch": 0.10521569216894634, "flos": 18259606972800.0, "grad_norm": 2.060157542461934, "language_loss": 0.82150209, "learning_rate": 3.940947910811047e-06, "loss": 0.84377068, "num_input_tokens_seen": 37765745, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.140625, "step": 1750, "time_per_iteration": 2.4385130405426025 }, { "auxiliary_loss_clip": 0.01172629, "auxiliary_loss_mlp": 0.01059563, "balance_loss_clip": 1.03878522, "balance_loss_mlp": 1.05444193, "epoch": 0.10527581542161431, "flos": 15630909949440.0, "grad_norm": 6.684598473684916, "language_loss": 0.92030847, "learning_rate": 3.940853933543114e-06, "loss": 0.94263041, "num_input_tokens_seen": 37780520, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.1796875, "step": 1751, "time_per_iteration": 2.4160006046295166 }, { "auxiliary_loss_clip": 0.01167277, "auxiliary_loss_mlp": 0.01045463, "balance_loss_clip": 1.02507818, "balance_loss_mlp": 1.05155349, "epoch": 0.10533593867428227, "flos": 18296559089280.0, "grad_norm": 2.0059953615466037, "language_loss": 0.7892741, "learning_rate": 3.940759882677805e-06, "loss": 0.81140155, "num_input_tokens_seen": 37799515, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.15625, "step": 1752, "time_per_iteration": 2.4520914554595947 }, { "auxiliary_loss_clip": 0.0116725, "auxiliary_loss_mlp": 0.01050465, "balance_loss_clip": 1.0303421, "balance_loss_mlp": 1.05263674, "epoch": 0.10539606192695025, "flos": 29023219555200.0, "grad_norm": 2.048340452831511, "language_loss": 0.76251775, "learning_rate": 3.940665758218686e-06, "loss": 0.78469491, "num_input_tokens_seen": 37818695, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.140625, "step": 1753, "time_per_iteration": 2.5425314903259277 }, { "auxiliary_loss_clip": 0.01172672, "auxiliary_loss_mlp": 0.01056577, "balance_loss_clip": 1.03453481, "balance_loss_mlp": 1.05116069, "epoch": 0.10545618517961822, "flos": 19969313506560.0, "grad_norm": 2.215704924744879, "language_loss": 0.83828503, "learning_rate": 3.940571560169328e-06, "loss": 0.86057752, "num_input_tokens_seen": 37837860, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.2109375, "step": 1754, "time_per_iteration": 2.42757248878479 }, { "auxiliary_loss_clip": 0.01177354, "auxiliary_loss_mlp": 0.01048637, "balance_loss_clip": 1.02670228, "balance_loss_mlp": 1.05841517, "epoch": 0.10551630843228618, "flos": 16143427157760.0, "grad_norm": 2.834353321534917, "language_loss": 0.69198728, "learning_rate": 3.940477288533302e-06, "loss": 0.71424723, "num_input_tokens_seen": 37856260, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.1875, "step": 1755, "time_per_iteration": 2.444383382797241 }, { "auxiliary_loss_clip": 0.01173402, "auxiliary_loss_mlp": 0.01059106, "balance_loss_clip": 1.03745806, "balance_loss_mlp": 1.05348599, "epoch": 0.10557643168495416, "flos": 23440115957760.0, "grad_norm": 2.2494919327054004, "language_loss": 0.76847464, "learning_rate": 3.940382943314182e-06, "loss": 0.79079974, "num_input_tokens_seen": 37876960, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.203125, "step": 1756, "time_per_iteration": 2.475432872772217 }, { "auxiliary_loss_clip": 0.01172907, "auxiliary_loss_mlp": 0.01059735, "balance_loss_clip": 1.03985119, "balance_loss_mlp": 1.05366528, "epoch": 0.10563655493762213, "flos": 21799034357760.0, "grad_norm": 1.6816442740765403, "language_loss": 0.80197704, "learning_rate": 3.940288524515547e-06, "loss": 0.82430351, "num_input_tokens_seen": 37897070, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.1953125, "step": 1757, "time_per_iteration": 2.4784250259399414 }, { "auxiliary_loss_clip": 0.01172328, "auxiliary_loss_mlp": 0.01052477, "balance_loss_clip": 1.03227091, "balance_loss_mlp": 1.05408323, "epoch": 0.10569667819029009, "flos": 53800863275520.0, "grad_norm": 1.6087749804668727, "language_loss": 0.78681898, "learning_rate": 3.940194032140976e-06, "loss": 0.80906713, "num_input_tokens_seen": 37923635, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.1875, "step": 1758, "time_per_iteration": 2.769742488861084 }, { "auxiliary_loss_clip": 0.01177993, "auxiliary_loss_mlp": 0.01049203, "balance_loss_clip": 1.02873456, "balance_loss_mlp": 1.05608153, "epoch": 0.10575680144295807, "flos": 22925515760640.0, "grad_norm": 2.1076700146585603, "language_loss": 0.91706371, "learning_rate": 3.940099466194054e-06, "loss": 0.93933564, "num_input_tokens_seen": 37942650, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.21875, "step": 1759, "time_per_iteration": 2.490849733352661 }, { "auxiliary_loss_clip": 0.0117359, "auxiliary_loss_mlp": 0.01049687, "balance_loss_clip": 1.02738333, "balance_loss_mlp": 1.05243194, "epoch": 0.10581692469562604, "flos": 14136667148160.0, "grad_norm": 2.3030014353455592, "language_loss": 0.76932645, "learning_rate": 3.940004826678365e-06, "loss": 0.79155922, "num_input_tokens_seen": 37960660, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.2109375, "step": 1760, "time_per_iteration": 2.4241750240325928 }, { "auxiliary_loss_clip": 0.01174968, "auxiliary_loss_mlp": 0.01056051, "balance_loss_clip": 1.0337472, "balance_loss_mlp": 1.05366111, "epoch": 0.105877047948294, "flos": 25958674903680.0, "grad_norm": 38.69345020712143, "language_loss": 0.89112842, "learning_rate": 3.939910113597498e-06, "loss": 0.91343856, "num_input_tokens_seen": 37978625, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.2109375, "step": 1761, "time_per_iteration": 2.5168721675872803 }, { "auxiliary_loss_clip": 0.01172802, "auxiliary_loss_mlp": 0.01057123, "balance_loss_clip": 1.03630924, "balance_loss_mlp": 1.0540725, "epoch": 0.10593717120096197, "flos": 30664768032000.0, "grad_norm": 2.2687365336507734, "language_loss": 0.78438783, "learning_rate": 3.9398153269550464e-06, "loss": 0.80668706, "num_input_tokens_seen": 38000005, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.1875, "step": 1762, "time_per_iteration": 2.5147340297698975 }, { "auxiliary_loss_clip": 0.0107934, "auxiliary_loss_mlp": 0.01009947, "balance_loss_clip": 1.00668073, "balance_loss_mlp": 1.03591335, "epoch": 0.10599729445362994, "flos": 66436682497920.0, "grad_norm": 0.7566878633580542, "language_loss": 0.60559201, "learning_rate": 3.939720466754602e-06, "loss": 0.62648487, "num_input_tokens_seen": 38066165, "router_z_loss_clip": 0.03271484, "router_z_loss_mlp": 0.43359375, "step": 1763, "time_per_iteration": 3.2843878269195557 }, { "auxiliary_loss_clip": 0.01172334, "auxiliary_loss_mlp": 0.01049341, "balance_loss_clip": 1.02855098, "balance_loss_mlp": 1.05197406, "epoch": 0.10605741770629791, "flos": 23948179879680.0, "grad_norm": 1.7806943832849995, "language_loss": 0.80307162, "learning_rate": 3.939625532999763e-06, "loss": 0.82528836, "num_input_tokens_seen": 38086150, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.203125, "step": 1764, "time_per_iteration": 4.022074937820435 }, { "auxiliary_loss_clip": 0.01173763, "auxiliary_loss_mlp": 0.01052274, "balance_loss_clip": 1.03066099, "balance_loss_mlp": 1.05552554, "epoch": 0.10611754095896588, "flos": 19387524919680.0, "grad_norm": 1.9292025141246718, "language_loss": 0.80024606, "learning_rate": 3.9395305256941314e-06, "loss": 0.82250643, "num_input_tokens_seen": 38104205, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.1796875, "step": 1765, "time_per_iteration": 3.9232022762298584 }, { "auxiliary_loss_clip": 0.01168025, "auxiliary_loss_mlp": 0.01053619, "balance_loss_clip": 1.03235221, "balance_loss_mlp": 1.05171251, "epoch": 0.10617766421163385, "flos": 22237755073920.0, "grad_norm": 1.667217034174365, "language_loss": 0.76789969, "learning_rate": 3.939435444841306e-06, "loss": 0.79011619, "num_input_tokens_seen": 38122005, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.1640625, "step": 1766, "time_per_iteration": 2.475419282913208 }, { "auxiliary_loss_clip": 0.01173396, "auxiliary_loss_mlp": 0.01060358, "balance_loss_clip": 1.03868532, "balance_loss_mlp": 1.05519032, "epoch": 0.10623778746430182, "flos": 28404407024640.0, "grad_norm": 3.336615673772163, "language_loss": 0.77498543, "learning_rate": 3.939340290444895e-06, "loss": 0.79732293, "num_input_tokens_seen": 38143365, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.1796875, "step": 1767, "time_per_iteration": 2.5962564945220947 }, { "auxiliary_loss_clip": 0.01074065, "auxiliary_loss_mlp": 0.01006883, "balance_loss_clip": 1.00330663, "balance_loss_mlp": 1.03090572, "epoch": 0.10629791071696978, "flos": 64234639221120.0, "grad_norm": 0.6858707130130124, "language_loss": 0.57885504, "learning_rate": 3.939245062508506e-06, "loss": 0.59966451, "num_input_tokens_seen": 38210035, "router_z_loss_clip": 0.03564453, "router_z_loss_mlp": 0.43164062, "step": 1768, "time_per_iteration": 3.1948814392089844 }, { "auxiliary_loss_clip": 0.01175062, "auxiliary_loss_mlp": 0.01042584, "balance_loss_clip": 1.02358198, "balance_loss_mlp": 1.0567379, "epoch": 0.10635803396963776, "flos": 22747578762240.0, "grad_norm": 1.4228227589116416, "language_loss": 0.8633759, "learning_rate": 3.939149761035749e-06, "loss": 0.88555241, "num_input_tokens_seen": 38231230, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.1796875, "step": 1769, "time_per_iteration": 2.5249850749969482 }, { "auxiliary_loss_clip": 0.01176581, "auxiliary_loss_mlp": 0.01047302, "balance_loss_clip": 1.02580833, "balance_loss_mlp": 1.05606329, "epoch": 0.10641815722230573, "flos": 31395586147200.0, "grad_norm": 2.1920658596501084, "language_loss": 0.61989069, "learning_rate": 3.9390543860302395e-06, "loss": 0.6421296, "num_input_tokens_seen": 38253890, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.203125, "step": 1770, "time_per_iteration": 2.560692310333252 }, { "auxiliary_loss_clip": 0.01069285, "auxiliary_loss_mlp": 0.01001674, "balance_loss_clip": 0.99833643, "balance_loss_mlp": 1.02629662, "epoch": 0.1064782804749737, "flos": 58552527784320.0, "grad_norm": 0.876288356373742, "language_loss": 0.57095963, "learning_rate": 3.9389589374955925e-06, "loss": 0.59166926, "num_input_tokens_seen": 38304290, "router_z_loss_clip": 0.03344727, "router_z_loss_mlp": 0.4296875, "step": 1771, "time_per_iteration": 2.997307538986206 }, { "auxiliary_loss_clip": 0.01174903, "auxiliary_loss_mlp": 0.01056701, "balance_loss_clip": 1.03644729, "balance_loss_mlp": 1.05683184, "epoch": 0.10653840372764166, "flos": 23987825516160.0, "grad_norm": 1.7359519616904622, "language_loss": 0.88134181, "learning_rate": 3.938863415435429e-06, "loss": 0.90365779, "num_input_tokens_seen": 38324725, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1796875, "step": 1772, "time_per_iteration": 2.4749629497528076 }, { "auxiliary_loss_clip": 0.01177821, "auxiliary_loss_mlp": 0.01054264, "balance_loss_clip": 1.03178072, "balance_loss_mlp": 1.0535872, "epoch": 0.10659852698030964, "flos": 18294655668480.0, "grad_norm": 2.9440925588103832, "language_loss": 0.76151395, "learning_rate": 3.93876781985337e-06, "loss": 0.78383476, "num_input_tokens_seen": 38340735, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.2421875, "step": 1773, "time_per_iteration": 2.4407458305358887 }, { "auxiliary_loss_clip": 0.01172881, "auxiliary_loss_mlp": 0.01056406, "balance_loss_clip": 1.03442371, "balance_loss_mlp": 1.05446327, "epoch": 0.1066586502329776, "flos": 32160591031680.0, "grad_norm": 2.4232936765349726, "language_loss": 0.82973635, "learning_rate": 3.938672150753041e-06, "loss": 0.8520292, "num_input_tokens_seen": 38361315, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.1875, "step": 1774, "time_per_iteration": 2.554908275604248 }, { "auxiliary_loss_clip": 0.01178923, "auxiliary_loss_mlp": 0.01052284, "balance_loss_clip": 1.03123164, "balance_loss_mlp": 1.05573773, "epoch": 0.10671877348564557, "flos": 17785155202560.0, "grad_norm": 2.398442425191819, "language_loss": 0.76524717, "learning_rate": 3.9385764081380704e-06, "loss": 0.78755921, "num_input_tokens_seen": 38377425, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.234375, "step": 1775, "time_per_iteration": 2.437965154647827 }, { "auxiliary_loss_clip": 0.01063388, "auxiliary_loss_mlp": 0.01010493, "balance_loss_clip": 1.006845, "balance_loss_mlp": 1.02061272, "epoch": 0.10677889673831355, "flos": 63510177813120.0, "grad_norm": 0.9578552514076172, "language_loss": 0.57471049, "learning_rate": 3.9384805920120876e-06, "loss": 0.59544933, "num_input_tokens_seen": 38440275, "router_z_loss_clip": 0.03637695, "router_z_loss_mlp": 0.42773438, "step": 1776, "time_per_iteration": 3.1149580478668213 }, { "auxiliary_loss_clip": 0.01174143, "auxiliary_loss_mlp": 0.01056206, "balance_loss_clip": 1.03434265, "balance_loss_mlp": 1.05411303, "epoch": 0.10683901999098151, "flos": 22017694400640.0, "grad_norm": 1.5380384435225245, "language_loss": 0.83489871, "learning_rate": 3.938384702378727e-06, "loss": 0.85720223, "num_input_tokens_seen": 38461820, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.203125, "step": 1777, "time_per_iteration": 2.511692762374878 }, { "auxiliary_loss_clip": 0.01171146, "auxiliary_loss_mlp": 0.01052424, "balance_loss_clip": 1.03200316, "balance_loss_mlp": 1.05479586, "epoch": 0.10689914324364948, "flos": 25042952551680.0, "grad_norm": 1.770175918242284, "language_loss": 0.87222123, "learning_rate": 3.938288739241625e-06, "loss": 0.89445692, "num_input_tokens_seen": 38482235, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1640625, "step": 1778, "time_per_iteration": 2.512995719909668 }, { "auxiliary_loss_clip": 0.01174342, "auxiliary_loss_mlp": 0.01051924, "balance_loss_clip": 1.030478, "balance_loss_mlp": 1.05410063, "epoch": 0.10695926649631746, "flos": 16435129507200.0, "grad_norm": 2.292249396886851, "language_loss": 0.8430087, "learning_rate": 3.938192702604417e-06, "loss": 0.86527133, "num_input_tokens_seen": 38500690, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.203125, "step": 1779, "time_per_iteration": 2.4553160667419434 }, { "auxiliary_loss_clip": 0.01168793, "auxiliary_loss_mlp": 0.01049055, "balance_loss_clip": 1.02909899, "balance_loss_mlp": 1.0520792, "epoch": 0.10701938974898542, "flos": 16979211792000.0, "grad_norm": 2.5181409066900455, "language_loss": 0.67024231, "learning_rate": 3.9380965924707495e-06, "loss": 0.69242084, "num_input_tokens_seen": 38518405, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.171875, "step": 1780, "time_per_iteration": 2.449369192123413 }, { "auxiliary_loss_clip": 0.01175838, "auxiliary_loss_mlp": 0.0105089, "balance_loss_clip": 1.03004038, "balance_loss_mlp": 1.05586171, "epoch": 0.10707951300165339, "flos": 15888102307200.0, "grad_norm": 2.358076529381374, "language_loss": 0.91604054, "learning_rate": 3.938000408844265e-06, "loss": 0.93830776, "num_input_tokens_seen": 38535060, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1953125, "step": 1781, "time_per_iteration": 2.4790565967559814 }, { "auxiliary_loss_clip": 0.01172333, "auxiliary_loss_mlp": 0.01048744, "balance_loss_clip": 1.02837074, "balance_loss_mlp": 1.05271316, "epoch": 0.10713963625432135, "flos": 14247164361600.0, "grad_norm": 2.567456831507455, "language_loss": 0.7938596, "learning_rate": 3.9379041517286105e-06, "loss": 0.81607044, "num_input_tokens_seen": 38552855, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.1953125, "step": 1782, "time_per_iteration": 2.4446237087249756 }, { "auxiliary_loss_clip": 0.0117636, "auxiliary_loss_mlp": 0.01053149, "balance_loss_clip": 1.03182209, "balance_loss_mlp": 1.05426574, "epoch": 0.10719975950698933, "flos": 16756780821120.0, "grad_norm": 2.113380197485372, "language_loss": 0.79352474, "learning_rate": 3.937807821127436e-06, "loss": 0.81581986, "num_input_tokens_seen": 38570075, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.21875, "step": 1783, "time_per_iteration": 2.4443318843841553 }, { "auxiliary_loss_clip": 0.01175314, "auxiliary_loss_mlp": 0.01051822, "balance_loss_clip": 1.03042388, "balance_loss_mlp": 1.05444932, "epoch": 0.1072598827596573, "flos": 22710626645760.0, "grad_norm": 2.1043696427613967, "language_loss": 0.86318821, "learning_rate": 3.937711417044395e-06, "loss": 0.8854596, "num_input_tokens_seen": 38587970, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.203125, "step": 1784, "time_per_iteration": 2.4754436016082764 }, { "auxiliary_loss_clip": 0.01175467, "auxiliary_loss_mlp": 0.01052707, "balance_loss_clip": 1.03080797, "balance_loss_mlp": 1.0541048, "epoch": 0.10732000601232526, "flos": 23258264376960.0, "grad_norm": 2.1234467237421577, "language_loss": 1.00969982, "learning_rate": 3.937614939483143e-06, "loss": 1.03198147, "num_input_tokens_seen": 38605840, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.2109375, "step": 1785, "time_per_iteration": 2.514253854751587 }, { "auxiliary_loss_clip": 0.01172116, "auxiliary_loss_mlp": 0.01053905, "balance_loss_clip": 1.03315115, "balance_loss_mlp": 1.05556083, "epoch": 0.10738012926499324, "flos": 24207060176640.0, "grad_norm": 1.4599932469428756, "language_loss": 0.84890032, "learning_rate": 3.937518388447339e-06, "loss": 0.87116057, "num_input_tokens_seen": 38627070, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1640625, "step": 1786, "time_per_iteration": 2.4934909343719482 }, { "auxiliary_loss_clip": 0.01175227, "auxiliary_loss_mlp": 0.01052686, "balance_loss_clip": 1.02963138, "balance_loss_mlp": 1.0540483, "epoch": 0.1074402525176612, "flos": 20923065383040.0, "grad_norm": 2.412713202985583, "language_loss": 0.78569072, "learning_rate": 3.937421763940642e-06, "loss": 0.80796981, "num_input_tokens_seen": 38645840, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.2109375, "step": 1787, "time_per_iteration": 2.4872114658355713 }, { "auxiliary_loss_clip": 0.01177896, "auxiliary_loss_mlp": 0.01044307, "balance_loss_clip": 1.02269387, "balance_loss_mlp": 1.0555197, "epoch": 0.10750037577032917, "flos": 16946928443520.0, "grad_norm": 1.9988183030353477, "language_loss": 0.82797599, "learning_rate": 3.937325065966719e-06, "loss": 0.85019797, "num_input_tokens_seen": 38664770, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.2265625, "step": 1788, "time_per_iteration": 2.4338669776916504 }, { "auxiliary_loss_clip": 0.01173949, "auxiliary_loss_mlp": 0.01062197, "balance_loss_clip": 1.04213357, "balance_loss_mlp": 1.05525935, "epoch": 0.10756049902299715, "flos": 20266546550400.0, "grad_norm": 3.440323403009602, "language_loss": 0.78345633, "learning_rate": 3.9372282945292335e-06, "loss": 0.80581784, "num_input_tokens_seen": 38683865, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.1875, "step": 1789, "time_per_iteration": 2.463095188140869 }, { "auxiliary_loss_clip": 0.011744, "auxiliary_loss_mlp": 0.01061666, "balance_loss_clip": 1.03749049, "balance_loss_mlp": 1.05439186, "epoch": 0.10762062227566511, "flos": 23586523793280.0, "grad_norm": 7.590047465604604, "language_loss": 0.74666494, "learning_rate": 3.937131449631859e-06, "loss": 0.76902562, "num_input_tokens_seen": 38702485, "router_z_loss_clip": 0.24121094, "router_z_loss_mlp": 1.203125, "step": 1790, "time_per_iteration": 2.4682211875915527 }, { "auxiliary_loss_clip": 0.01179208, "auxiliary_loss_mlp": 0.01064266, "balance_loss_clip": 1.04036486, "balance_loss_mlp": 1.05805206, "epoch": 0.10768074552833308, "flos": 24310626065280.0, "grad_norm": 2.1240422932278835, "language_loss": 0.78450078, "learning_rate": 3.9370345312782645e-06, "loss": 0.80693549, "num_input_tokens_seen": 38722475, "router_z_loss_clip": 0.23925781, "router_z_loss_mlp": 1.2109375, "step": 1791, "time_per_iteration": 2.5041706562042236 }, { "auxiliary_loss_clip": 0.01171137, "auxiliary_loss_mlp": 0.01054479, "balance_loss_clip": 1.03401113, "balance_loss_mlp": 1.05486655, "epoch": 0.10774086878100106, "flos": 25299965341440.0, "grad_norm": 1.8099479553018363, "language_loss": 0.70486546, "learning_rate": 3.936937539472126e-06, "loss": 0.72712159, "num_input_tokens_seen": 38743285, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1640625, "step": 1792, "time_per_iteration": 2.497739553451538 }, { "auxiliary_loss_clip": 0.01177475, "auxiliary_loss_mlp": 0.01047892, "balance_loss_clip": 1.02465832, "balance_loss_mlp": 1.05481565, "epoch": 0.10780099203366902, "flos": 22054035985920.0, "grad_norm": 1.8661482465879056, "language_loss": 0.76386654, "learning_rate": 3.9368404742171236e-06, "loss": 0.78612018, "num_input_tokens_seen": 38763035, "router_z_loss_clip": 0.23242188, "router_z_loss_mlp": 1.2265625, "step": 1793, "time_per_iteration": 2.490036725997925 }, { "auxiliary_loss_clip": 0.01175682, "auxiliary_loss_mlp": 0.01058516, "balance_loss_clip": 1.03711808, "balance_loss_mlp": 1.05682385, "epoch": 0.10786111528633699, "flos": 22747471021440.0, "grad_norm": 1.5089158622880876, "language_loss": 0.85175622, "learning_rate": 3.936743335516936e-06, "loss": 0.87409818, "num_input_tokens_seen": 38784900, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.1875, "step": 1794, "time_per_iteration": 2.509146213531494 }, { "auxiliary_loss_clip": 0.01181333, "auxiliary_loss_mlp": 0.01048962, "balance_loss_clip": 1.02610993, "balance_loss_mlp": 1.05646086, "epoch": 0.10792123853900495, "flos": 20851064570880.0, "grad_norm": 1.981997779271117, "language_loss": 0.74879122, "learning_rate": 3.936646123375246e-06, "loss": 0.77109408, "num_input_tokens_seen": 38804695, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.25, "step": 1795, "time_per_iteration": 2.481605052947998 }, { "auxiliary_loss_clip": 0.01177477, "auxiliary_loss_mlp": 0.01056426, "balance_loss_clip": 1.03394294, "balance_loss_mlp": 1.05524051, "epoch": 0.10798136179167293, "flos": 17748705876480.0, "grad_norm": 2.308776802427806, "language_loss": 0.81371516, "learning_rate": 3.936548837795741e-06, "loss": 0.83605421, "num_input_tokens_seen": 38822395, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.21875, "step": 1796, "time_per_iteration": 2.4330804347991943 }, { "auxiliary_loss_clip": 0.01181029, "auxiliary_loss_mlp": 0.01067535, "balance_loss_clip": 1.04474223, "balance_loss_mlp": 1.0576731, "epoch": 0.1080414850443409, "flos": 13589639948160.0, "grad_norm": 2.679008957435587, "language_loss": 0.74061632, "learning_rate": 3.936451478782111e-06, "loss": 0.76310199, "num_input_tokens_seen": 38839865, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.234375, "step": 1797, "time_per_iteration": 2.4674015045166016 }, { "auxiliary_loss_clip": 0.0117761, "auxiliary_loss_mlp": 0.01051566, "balance_loss_clip": 1.03094292, "balance_loss_mlp": 1.05686104, "epoch": 0.10810160829700886, "flos": 16253421580800.0, "grad_norm": 2.048959882617233, "language_loss": 0.82003766, "learning_rate": 3.936354046338046e-06, "loss": 0.84232944, "num_input_tokens_seen": 38857300, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.203125, "step": 1798, "time_per_iteration": 2.454651117324829 }, { "auxiliary_loss_clip": 0.01175026, "auxiliary_loss_mlp": 0.01053101, "balance_loss_clip": 1.03054667, "balance_loss_mlp": 1.05428028, "epoch": 0.10816173154967684, "flos": 15158002464000.0, "grad_norm": 2.290622538183642, "language_loss": 0.85957968, "learning_rate": 3.936256540467242e-06, "loss": 0.88186097, "num_input_tokens_seen": 38874960, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 1.203125, "step": 1799, "time_per_iteration": 2.450127124786377 }, { "auxiliary_loss_clip": 0.01173433, "auxiliary_loss_mlp": 0.01061698, "balance_loss_clip": 1.04065788, "balance_loss_mlp": 1.05673075, "epoch": 0.10822185480234481, "flos": 17785334770560.0, "grad_norm": 1.8632007255318783, "language_loss": 0.77036262, "learning_rate": 3.9361589611733955e-06, "loss": 0.79271394, "num_input_tokens_seen": 38893610, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.171875, "step": 1800, "time_per_iteration": 2.4336495399475098 }, { "auxiliary_loss_clip": 0.0117047, "auxiliary_loss_mlp": 0.0105329, "balance_loss_clip": 1.03319156, "balance_loss_mlp": 1.05349374, "epoch": 0.10828197805501277, "flos": 25556654908800.0, "grad_norm": 1.6383915117429948, "language_loss": 0.73059201, "learning_rate": 3.9360613084602075e-06, "loss": 0.75282967, "num_input_tokens_seen": 38913485, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.171875, "step": 1801, "time_per_iteration": 2.5120935440063477 }, { "auxiliary_loss_clip": 0.01182759, "auxiliary_loss_mlp": 0.01051033, "balance_loss_clip": 1.03102922, "balance_loss_mlp": 1.05811775, "epoch": 0.10834210130768075, "flos": 28984435845120.0, "grad_norm": 2.0075840282724573, "language_loss": 0.65940905, "learning_rate": 3.935963582331381e-06, "loss": 0.68174696, "num_input_tokens_seen": 38935650, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.25, "step": 1802, "time_per_iteration": 2.5145792961120605 }, { "auxiliary_loss_clip": 0.01174092, "auxiliary_loss_mlp": 0.01060846, "balance_loss_clip": 1.03963912, "balance_loss_mlp": 1.05529785, "epoch": 0.10840222456034872, "flos": 20264212166400.0, "grad_norm": 1.811083606236644, "language_loss": 0.81513643, "learning_rate": 3.935865782790621e-06, "loss": 0.83748579, "num_input_tokens_seen": 38954130, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.1875, "step": 1803, "time_per_iteration": 2.4899702072143555 }, { "auxiliary_loss_clip": 0.01169819, "auxiliary_loss_mlp": 0.01053493, "balance_loss_clip": 1.03220248, "balance_loss_mlp": 1.05321097, "epoch": 0.10846234781301668, "flos": 19863054097920.0, "grad_norm": 1.569507464813705, "language_loss": 0.91045797, "learning_rate": 3.9357679098416365e-06, "loss": 0.93269122, "num_input_tokens_seen": 38972905, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.1640625, "step": 1804, "time_per_iteration": 2.452422618865967 }, { "auxiliary_loss_clip": 0.01176695, "auxiliary_loss_mlp": 0.01051573, "balance_loss_clip": 1.02962673, "balance_loss_mlp": 1.05695617, "epoch": 0.10852247106568465, "flos": 26469037296000.0, "grad_norm": 1.879609551190407, "language_loss": 0.76161879, "learning_rate": 3.935669963488139e-06, "loss": 0.78390151, "num_input_tokens_seen": 38993255, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.1953125, "step": 1805, "time_per_iteration": 4.043160915374756 }, { "auxiliary_loss_clip": 0.01173056, "auxiliary_loss_mlp": 0.01045558, "balance_loss_clip": 1.02612698, "balance_loss_mlp": 1.05612731, "epoch": 0.10858259431835263, "flos": 30081506987520.0, "grad_norm": 1.8299753692591647, "language_loss": 0.863747, "learning_rate": 3.935571943733843e-06, "loss": 0.8859331, "num_input_tokens_seen": 39012610, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.171875, "step": 1806, "time_per_iteration": 2.6117172241210938 }, { "auxiliary_loss_clip": 0.01172429, "auxiliary_loss_mlp": 0.01049193, "balance_loss_clip": 1.02928483, "balance_loss_mlp": 1.05319715, "epoch": 0.10864271757102059, "flos": 19063180085760.0, "grad_norm": 2.3942169324328955, "language_loss": 0.807859, "learning_rate": 3.9354738505824635e-06, "loss": 0.8300752, "num_input_tokens_seen": 39030120, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.1953125, "step": 1807, "time_per_iteration": 5.292227029800415 }, { "auxiliary_loss_clip": 0.01172928, "auxiliary_loss_mlp": 0.01050617, "balance_loss_clip": 1.03138852, "balance_loss_mlp": 1.0558486, "epoch": 0.10870284082368856, "flos": 24715052271360.0, "grad_norm": 1.823184881919295, "language_loss": 0.78899008, "learning_rate": 3.9353756840377225e-06, "loss": 0.81122553, "num_input_tokens_seen": 39049875, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.171875, "step": 1808, "time_per_iteration": 2.4804859161376953 }, { "auxiliary_loss_clip": 0.01173277, "auxiliary_loss_mlp": 0.01050417, "balance_loss_clip": 1.02941275, "balance_loss_mlp": 1.05558372, "epoch": 0.10876296407635654, "flos": 20627663932800.0, "grad_norm": 1.6610048315930313, "language_loss": 0.79276556, "learning_rate": 3.935277444103342e-06, "loss": 0.8150025, "num_input_tokens_seen": 39068935, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.1796875, "step": 1809, "time_per_iteration": 2.4791135787963867 }, { "auxiliary_loss_clip": 0.01172081, "auxiliary_loss_mlp": 0.01051613, "balance_loss_clip": 1.03137159, "balance_loss_mlp": 1.05272257, "epoch": 0.1088230873290245, "flos": 21579835610880.0, "grad_norm": 1.993465048184352, "language_loss": 0.85031056, "learning_rate": 3.935179130783046e-06, "loss": 0.87254751, "num_input_tokens_seen": 39087370, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.1953125, "step": 1810, "time_per_iteration": 2.4526326656341553 }, { "auxiliary_loss_clip": 0.01178296, "auxiliary_loss_mlp": 0.01051443, "balance_loss_clip": 1.02940106, "balance_loss_mlp": 1.0558852, "epoch": 0.10888321058169247, "flos": 26469037296000.0, "grad_norm": 1.8001459940703324, "language_loss": 0.63538915, "learning_rate": 3.935080744080564e-06, "loss": 0.65768659, "num_input_tokens_seen": 39106635, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.21875, "step": 1811, "time_per_iteration": 2.5301053524017334 }, { "auxiliary_loss_clip": 0.01172473, "auxiliary_loss_mlp": 0.01049891, "balance_loss_clip": 1.029006, "balance_loss_mlp": 1.05313182, "epoch": 0.10894333383436045, "flos": 25848608653440.0, "grad_norm": 2.2022155096839264, "language_loss": 0.74756157, "learning_rate": 3.934982283999626e-06, "loss": 0.76978517, "num_input_tokens_seen": 39126335, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1953125, "step": 1812, "time_per_iteration": 2.5064375400543213 }, { "auxiliary_loss_clip": 0.01170969, "auxiliary_loss_mlp": 0.01050305, "balance_loss_clip": 1.02949095, "balance_loss_mlp": 1.05366802, "epoch": 0.10900345708702841, "flos": 19537093152000.0, "grad_norm": 1.852260555697042, "language_loss": 0.72723544, "learning_rate": 3.934883750543966e-06, "loss": 0.74944818, "num_input_tokens_seen": 39144820, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.171875, "step": 1813, "time_per_iteration": 2.470933198928833 }, { "auxiliary_loss_clip": 0.0117286, "auxiliary_loss_mlp": 0.0105108, "balance_loss_clip": 1.03036094, "balance_loss_mlp": 1.05722988, "epoch": 0.10906358033969638, "flos": 23623296341760.0, "grad_norm": 1.8245667973776805, "language_loss": 0.83136547, "learning_rate": 3.93478514371732e-06, "loss": 0.85360479, "num_input_tokens_seen": 39165945, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.15625, "step": 1814, "time_per_iteration": 2.4711170196533203 }, { "auxiliary_loss_clip": 0.01177024, "auxiliary_loss_mlp": 0.01052124, "balance_loss_clip": 1.03249002, "balance_loss_mlp": 1.05675864, "epoch": 0.10912370359236434, "flos": 21214731818880.0, "grad_norm": 1.9837460875244786, "language_loss": 0.84180939, "learning_rate": 3.934686463523429e-06, "loss": 0.86410087, "num_input_tokens_seen": 39183520, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.203125, "step": 1815, "time_per_iteration": 2.475630044937134 }, { "auxiliary_loss_clip": 0.01171529, "auxiliary_loss_mlp": 0.01049564, "balance_loss_clip": 1.02851152, "balance_loss_mlp": 1.05512094, "epoch": 0.10918382684503232, "flos": 13553190622080.0, "grad_norm": 2.4756248263427056, "language_loss": 0.71319371, "learning_rate": 3.9345877099660315e-06, "loss": 0.73540461, "num_input_tokens_seen": 39201190, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1640625, "step": 1816, "time_per_iteration": 2.440561056137085 }, { "auxiliary_loss_clip": 0.01172126, "auxiliary_loss_mlp": 0.01054465, "balance_loss_clip": 1.033198, "balance_loss_mlp": 1.05281138, "epoch": 0.10924395009770028, "flos": 27964321591680.0, "grad_norm": 36.12923993266326, "language_loss": 0.72848231, "learning_rate": 3.9344888830488744e-06, "loss": 0.75074822, "num_input_tokens_seen": 39221210, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.1875, "step": 1817, "time_per_iteration": 2.5765151977539062 }, { "auxiliary_loss_clip": 0.01174878, "auxiliary_loss_mlp": 0.01057109, "balance_loss_clip": 1.03532994, "balance_loss_mlp": 1.05569971, "epoch": 0.10930407335036825, "flos": 25593750679680.0, "grad_norm": 1.694251493884063, "language_loss": 0.67358458, "learning_rate": 3.934389982775706e-06, "loss": 0.69590449, "num_input_tokens_seen": 39242025, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.1953125, "step": 1818, "time_per_iteration": 2.485342025756836 }, { "auxiliary_loss_clip": 0.0117654, "auxiliary_loss_mlp": 0.01061802, "balance_loss_clip": 1.04126203, "balance_loss_mlp": 1.05694962, "epoch": 0.10936419660303623, "flos": 18406194376320.0, "grad_norm": 2.2411961194482783, "language_loss": 0.73137093, "learning_rate": 3.934291009150275e-06, "loss": 0.75375432, "num_input_tokens_seen": 39259870, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1875, "step": 1819, "time_per_iteration": 2.4536354541778564 }, { "auxiliary_loss_clip": 0.01176012, "auxiliary_loss_mlp": 0.0105342, "balance_loss_clip": 1.03320193, "balance_loss_mlp": 1.05787373, "epoch": 0.1094243198557042, "flos": 23840052963840.0, "grad_norm": 3.241226027377946, "language_loss": 0.73936588, "learning_rate": 3.934191962176335e-06, "loss": 0.76166016, "num_input_tokens_seen": 39278500, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.1875, "step": 1820, "time_per_iteration": 2.4812026023864746 }, { "auxiliary_loss_clip": 0.01173967, "auxiliary_loss_mlp": 0.01056257, "balance_loss_clip": 1.03444219, "balance_loss_mlp": 1.05687237, "epoch": 0.10948444310837216, "flos": 14643940970880.0, "grad_norm": 2.006783771959961, "language_loss": 0.82315713, "learning_rate": 3.934092841857642e-06, "loss": 0.8454594, "num_input_tokens_seen": 39294800, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.171875, "step": 1821, "time_per_iteration": 2.4463624954223633 }, { "auxiliary_loss_clip": 0.01167861, "auxiliary_loss_mlp": 0.01050632, "balance_loss_clip": 1.0306282, "balance_loss_mlp": 1.05157888, "epoch": 0.10954456636104014, "flos": 27818811596160.0, "grad_norm": 1.9722042536983249, "language_loss": 0.76436812, "learning_rate": 3.933993648197955e-06, "loss": 0.78655303, "num_input_tokens_seen": 39314625, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.15625, "step": 1822, "time_per_iteration": 2.503995656967163 }, { "auxiliary_loss_clip": 0.01169625, "auxiliary_loss_mlp": 0.01052481, "balance_loss_clip": 1.03268075, "balance_loss_mlp": 1.05346847, "epoch": 0.1096046896137081, "flos": 33620934372480.0, "grad_norm": 1.691495675165464, "language_loss": 0.79421747, "learning_rate": 3.933894381201034e-06, "loss": 0.8164385, "num_input_tokens_seen": 39336465, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.1640625, "step": 1823, "time_per_iteration": 2.575000047683716 }, { "auxiliary_loss_clip": 0.01173802, "auxiliary_loss_mlp": 0.01044112, "balance_loss_clip": 1.02401364, "balance_loss_mlp": 1.05804324, "epoch": 0.10966481286637607, "flos": 26980010219520.0, "grad_norm": 1.872034758037251, "language_loss": 0.79615414, "learning_rate": 3.933795040870645e-06, "loss": 0.81833327, "num_input_tokens_seen": 39357930, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.15625, "step": 1824, "time_per_iteration": 2.514281749725342 }, { "auxiliary_loss_clip": 0.0116927, "auxiliary_loss_mlp": 0.01052753, "balance_loss_clip": 1.03292859, "balance_loss_mlp": 1.05415285, "epoch": 0.10972493611904403, "flos": 23036551678080.0, "grad_norm": 2.254617396454026, "language_loss": 0.88174951, "learning_rate": 3.933695627210554e-06, "loss": 0.90396976, "num_input_tokens_seen": 39376380, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.15625, "step": 1825, "time_per_iteration": 2.476954460144043 }, { "auxiliary_loss_clip": 0.01164915, "auxiliary_loss_mlp": 0.01049675, "balance_loss_clip": 1.02945697, "balance_loss_mlp": 1.04974079, "epoch": 0.10978505937171201, "flos": 38104632443520.0, "grad_norm": 1.7592212321911458, "language_loss": 0.76639229, "learning_rate": 3.933596140224532e-06, "loss": 0.78853822, "num_input_tokens_seen": 39399935, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.15625, "step": 1826, "time_per_iteration": 2.5891196727752686 }, { "auxiliary_loss_clip": 0.01070054, "auxiliary_loss_mlp": 0.0101772, "balance_loss_clip": 1.01452529, "balance_loss_mlp": 1.02689123, "epoch": 0.10984518262437998, "flos": 59849694616320.0, "grad_norm": 0.8351521304384418, "language_loss": 0.55017012, "learning_rate": 3.93349657991635e-06, "loss": 0.57104784, "num_input_tokens_seen": 39460685, "router_z_loss_clip": 0.03198242, "router_z_loss_mlp": 0.43164062, "step": 1827, "time_per_iteration": 3.0888707637786865 }, { "auxiliary_loss_clip": 0.01068223, "auxiliary_loss_mlp": 0.01008883, "balance_loss_clip": 1.00578392, "balance_loss_mlp": 1.02527976, "epoch": 0.10990530587704794, "flos": 66719837410560.0, "grad_norm": 0.7507437829616411, "language_loss": 0.55397785, "learning_rate": 3.933396946289784e-06, "loss": 0.57474887, "num_input_tokens_seen": 39524765, "router_z_loss_clip": 0.03088379, "router_z_loss_mlp": 0.4296875, "step": 1828, "time_per_iteration": 3.117234230041504 }, { "auxiliary_loss_clip": 0.01172395, "auxiliary_loss_mlp": 0.01054057, "balance_loss_clip": 1.03174114, "balance_loss_mlp": 1.05228281, "epoch": 0.10996542912971592, "flos": 25447199189760.0, "grad_norm": 3.621599246737033, "language_loss": 0.84524083, "learning_rate": 3.933297239348612e-06, "loss": 0.86750531, "num_input_tokens_seen": 39543640, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.203125, "step": 1829, "time_per_iteration": 2.601478099822998 }, { "auxiliary_loss_clip": 0.01172911, "auxiliary_loss_mlp": 0.01053526, "balance_loss_clip": 1.0321039, "balance_loss_mlp": 1.05373502, "epoch": 0.11002555238238389, "flos": 44018186186880.0, "grad_norm": 1.7922319913995564, "language_loss": 0.88925111, "learning_rate": 3.933197459096614e-06, "loss": 0.91151547, "num_input_tokens_seen": 39567525, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.1875, "step": 1830, "time_per_iteration": 2.655367374420166 }, { "auxiliary_loss_clip": 0.01063785, "auxiliary_loss_mlp": 0.01005841, "balance_loss_clip": 1.0027889, "balance_loss_mlp": 1.02170277, "epoch": 0.11008567563505185, "flos": 54065133590400.0, "grad_norm": 0.714024856984497, "language_loss": 0.55595672, "learning_rate": 3.9330976055375756e-06, "loss": 0.57665288, "num_input_tokens_seen": 39628470, "router_z_loss_clip": 0.03051758, "router_z_loss_mlp": 0.421875, "step": 1831, "time_per_iteration": 3.092912435531616 }, { "auxiliary_loss_clip": 0.01176189, "auxiliary_loss_mlp": 0.01072731, "balance_loss_clip": 1.05006933, "balance_loss_mlp": 1.05412626, "epoch": 0.11014579888771983, "flos": 24243150366720.0, "grad_norm": 2.1875482609912753, "language_loss": 0.90791261, "learning_rate": 3.932997678675282e-06, "loss": 0.9304018, "num_input_tokens_seen": 39646670, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.21875, "step": 1832, "time_per_iteration": 2.5013856887817383 }, { "auxiliary_loss_clip": 0.01062584, "auxiliary_loss_mlp": 0.01005596, "balance_loss_clip": 1.00244868, "balance_loss_mlp": 1.02071667, "epoch": 0.1102059221403878, "flos": 57743965658880.0, "grad_norm": 0.8742535995107065, "language_loss": 0.59859061, "learning_rate": 3.932897678513523e-06, "loss": 0.61927241, "num_input_tokens_seen": 39712915, "router_z_loss_clip": 0.03149414, "router_z_loss_mlp": 0.41796875, "step": 1833, "time_per_iteration": 3.115732192993164 }, { "auxiliary_loss_clip": 0.01168979, "auxiliary_loss_mlp": 0.01047322, "balance_loss_clip": 1.02641284, "balance_loss_mlp": 1.05069363, "epoch": 0.11026604539305576, "flos": 16795923667200.0, "grad_norm": 3.0526424703648734, "language_loss": 0.80747271, "learning_rate": 3.93279760505609e-06, "loss": 0.82963568, "num_input_tokens_seen": 39730650, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1875, "step": 1834, "time_per_iteration": 2.4528086185455322 }, { "auxiliary_loss_clip": 0.01173575, "auxiliary_loss_mlp": 0.01050559, "balance_loss_clip": 1.02829099, "balance_loss_mlp": 1.05557966, "epoch": 0.11032616864572373, "flos": 23988076911360.0, "grad_norm": 2.4226858533908975, "language_loss": 0.90685833, "learning_rate": 3.932697458306779e-06, "loss": 0.92909968, "num_input_tokens_seen": 39751065, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.1796875, "step": 1835, "time_per_iteration": 2.4849624633789062 }, { "auxiliary_loss_clip": 0.01169721, "auxiliary_loss_mlp": 0.01051008, "balance_loss_clip": 1.02953804, "balance_loss_mlp": 1.05228782, "epoch": 0.1103862918983917, "flos": 19683141851520.0, "grad_norm": 1.9624189623366626, "language_loss": 0.63954008, "learning_rate": 3.932597238269386e-06, "loss": 0.66174734, "num_input_tokens_seen": 39769245, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.171875, "step": 1836, "time_per_iteration": 2.482529878616333 }, { "auxiliary_loss_clip": 0.01168836, "auxiliary_loss_mlp": 0.01052993, "balance_loss_clip": 1.03291845, "balance_loss_mlp": 1.05099416, "epoch": 0.11044641515105967, "flos": 32160878340480.0, "grad_norm": 2.061454230290167, "language_loss": 0.7275368, "learning_rate": 3.932496944947711e-06, "loss": 0.74975508, "num_input_tokens_seen": 39790830, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1796875, "step": 1837, "time_per_iteration": 2.5555052757263184 }, { "auxiliary_loss_clip": 0.01173521, "auxiliary_loss_mlp": 0.0105346, "balance_loss_clip": 1.03319454, "balance_loss_mlp": 1.05630565, "epoch": 0.11050653840372764, "flos": 16689233295360.0, "grad_norm": 2.3004620863467427, "language_loss": 0.78423965, "learning_rate": 3.93239657834556e-06, "loss": 0.80650949, "num_input_tokens_seen": 39809475, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.171875, "step": 1838, "time_per_iteration": 2.456948757171631 }, { "auxiliary_loss_clip": 0.01170941, "auxiliary_loss_mlp": 0.01064259, "balance_loss_clip": 1.0428015, "balance_loss_mlp": 1.05453467, "epoch": 0.11056666165639562, "flos": 21208877902080.0, "grad_norm": 2.438621316711656, "language_loss": 0.71728402, "learning_rate": 3.932296138466736e-06, "loss": 0.739636, "num_input_tokens_seen": 39826355, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.1640625, "step": 1839, "time_per_iteration": 2.4562740325927734 }, { "auxiliary_loss_clip": 0.01177924, "auxiliary_loss_mlp": 0.01054021, "balance_loss_clip": 1.03205109, "balance_loss_mlp": 1.05735052, "epoch": 0.11062678490906358, "flos": 19165488998400.0, "grad_norm": 2.288166954075407, "language_loss": 0.78665382, "learning_rate": 3.93219562531505e-06, "loss": 0.80897325, "num_input_tokens_seen": 39845335, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.203125, "step": 1840, "time_per_iteration": 2.474290132522583 }, { "auxiliary_loss_clip": 0.01167056, "auxiliary_loss_mlp": 0.01046124, "balance_loss_clip": 1.02600193, "balance_loss_mlp": 1.05161047, "epoch": 0.11068690816173155, "flos": 24895287740160.0, "grad_norm": 1.7508159237076764, "language_loss": 0.88089085, "learning_rate": 3.932095038894311e-06, "loss": 0.90302265, "num_input_tokens_seen": 39865065, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.15625, "step": 1841, "time_per_iteration": 2.488999366760254 }, { "auxiliary_loss_clip": 0.01167135, "auxiliary_loss_mlp": 0.01054332, "balance_loss_clip": 1.03374505, "balance_loss_mlp": 1.05278921, "epoch": 0.11074703141439952, "flos": 16472368932480.0, "grad_norm": 1.923853385245871, "language_loss": 0.90260983, "learning_rate": 3.931994379208334e-06, "loss": 0.92482448, "num_input_tokens_seen": 39882780, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.140625, "step": 1842, "time_per_iteration": 2.4585072994232178 }, { "auxiliary_loss_clip": 0.01167169, "auxiliary_loss_mlp": 0.01059545, "balance_loss_clip": 1.03973198, "balance_loss_mlp": 1.050524, "epoch": 0.11080715466706749, "flos": 19172420323200.0, "grad_norm": 2.126704437319164, "language_loss": 0.85800529, "learning_rate": 3.931893646260937e-06, "loss": 0.88027239, "num_input_tokens_seen": 39900295, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.171875, "step": 1843, "time_per_iteration": 2.4555583000183105 }, { "auxiliary_loss_clip": 0.01172195, "auxiliary_loss_mlp": 0.01059863, "balance_loss_clip": 1.0378449, "balance_loss_mlp": 1.05626845, "epoch": 0.11086727791973545, "flos": 27704687109120.0, "grad_norm": 1.641558713672728, "language_loss": 0.74804533, "learning_rate": 3.931792840055941e-06, "loss": 0.77036595, "num_input_tokens_seen": 39922075, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.15625, "step": 1844, "time_per_iteration": 2.5133211612701416 }, { "auxiliary_loss_clip": 0.01173096, "auxiliary_loss_mlp": 0.01054388, "balance_loss_clip": 1.03253651, "balance_loss_mlp": 1.05500615, "epoch": 0.11092740117240343, "flos": 18514967736960.0, "grad_norm": 2.3958698337895212, "language_loss": 0.75517374, "learning_rate": 3.931691960597165e-06, "loss": 0.77744865, "num_input_tokens_seen": 39940115, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.1796875, "step": 1845, "time_per_iteration": 2.4281272888183594 }, { "auxiliary_loss_clip": 0.0116786, "auxiliary_loss_mlp": 0.01054994, "balance_loss_clip": 1.03524137, "balance_loss_mlp": 1.05237103, "epoch": 0.1109875244250714, "flos": 20522446018560.0, "grad_norm": 1.946008853208845, "language_loss": 0.76137543, "learning_rate": 3.9315910078884375e-06, "loss": 0.78360397, "num_input_tokens_seen": 39959920, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.15625, "step": 1846, "time_per_iteration": 2.4627673625946045 }, { "auxiliary_loss_clip": 0.01172371, "auxiliary_loss_mlp": 0.01052262, "balance_loss_clip": 1.03169835, "balance_loss_mlp": 1.05271316, "epoch": 0.11104764767773936, "flos": 14098601710080.0, "grad_norm": 4.13336705406101, "language_loss": 0.86438024, "learning_rate": 3.931489981933584e-06, "loss": 0.8866266, "num_input_tokens_seen": 39974755, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.1953125, "step": 1847, "time_per_iteration": 3.9796347618103027 }, { "auxiliary_loss_clip": 0.01170923, "auxiliary_loss_mlp": 0.01050046, "balance_loss_clip": 1.02935159, "balance_loss_mlp": 1.0516305, "epoch": 0.11110777093040733, "flos": 20594518657920.0, "grad_norm": 2.2630998470929438, "language_loss": 0.77073383, "learning_rate": 3.931388882736438e-06, "loss": 0.79294354, "num_input_tokens_seen": 39993355, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1875, "step": 1848, "time_per_iteration": 3.897719621658325 }, { "auxiliary_loss_clip": 0.01169036, "auxiliary_loss_mlp": 0.01054304, "balance_loss_clip": 1.03461027, "balance_loss_mlp": 1.05695379, "epoch": 0.11116789418307531, "flos": 21870065502720.0, "grad_norm": 1.6927528726953105, "language_loss": 0.77868593, "learning_rate": 3.931287710300832e-06, "loss": 0.80091929, "num_input_tokens_seen": 40012410, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.125, "step": 1849, "time_per_iteration": 3.836237668991089 }, { "auxiliary_loss_clip": 0.01171396, "auxiliary_loss_mlp": 0.01061922, "balance_loss_clip": 1.04181159, "balance_loss_mlp": 1.05124342, "epoch": 0.11122801743574327, "flos": 15523106256000.0, "grad_norm": 2.6283057789081004, "language_loss": 0.72006655, "learning_rate": 3.931186464630601e-06, "loss": 0.74239969, "num_input_tokens_seen": 40029315, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.203125, "step": 1850, "time_per_iteration": 2.432490825653076 }, { "auxiliary_loss_clip": 0.01172812, "auxiliary_loss_mlp": 0.01054902, "balance_loss_clip": 1.03407621, "balance_loss_mlp": 1.05434847, "epoch": 0.11128814068841124, "flos": 14392279307520.0, "grad_norm": 2.3760195508998585, "language_loss": 0.80823815, "learning_rate": 3.931085145729588e-06, "loss": 0.83051533, "num_input_tokens_seen": 40045765, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.1875, "step": 1851, "time_per_iteration": 2.4280736446380615 }, { "auxiliary_loss_clip": 0.01171348, "auxiliary_loss_mlp": 0.01053195, "balance_loss_clip": 1.03378737, "balance_loss_mlp": 1.05346835, "epoch": 0.11134826394107922, "flos": 16653933204480.0, "grad_norm": 2.5303682825132023, "language_loss": 0.8838793, "learning_rate": 3.930983753601631e-06, "loss": 0.90612477, "num_input_tokens_seen": 40061660, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.1796875, "step": 1852, "time_per_iteration": 2.430389165878296 }, { "auxiliary_loss_clip": 0.01173749, "auxiliary_loss_mlp": 0.01062202, "balance_loss_clip": 1.04082799, "balance_loss_mlp": 1.05436289, "epoch": 0.11140838719374718, "flos": 16690993061760.0, "grad_norm": 2.115956576572379, "language_loss": 0.72148585, "learning_rate": 3.930882288250578e-06, "loss": 0.74384534, "num_input_tokens_seen": 40080180, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.1953125, "step": 1853, "time_per_iteration": 2.4523141384124756 }, { "auxiliary_loss_clip": 0.01061178, "auxiliary_loss_mlp": 0.01040376, "balance_loss_clip": 1.03752697, "balance_loss_mlp": 1.02038455, "epoch": 0.11146851044641515, "flos": 60976355587200.0, "grad_norm": 0.7964147671660738, "language_loss": 0.53681421, "learning_rate": 3.930780749680273e-06, "loss": 0.55782974, "num_input_tokens_seen": 40138910, "router_z_loss_clip": 0.02844238, "router_z_loss_mlp": 0.40625, "step": 1854, "time_per_iteration": 3.034639358520508 }, { "auxiliary_loss_clip": 0.01178119, "auxiliary_loss_mlp": 0.01048204, "balance_loss_clip": 1.02621043, "balance_loss_mlp": 1.05315495, "epoch": 0.11152863369908313, "flos": 22193835719040.0, "grad_norm": 4.520630980231191, "language_loss": 0.84751904, "learning_rate": 3.9306791378945705e-06, "loss": 0.86978221, "num_input_tokens_seen": 40157745, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.25, "step": 1855, "time_per_iteration": 2.4911656379699707 }, { "auxiliary_loss_clip": 0.01170927, "auxiliary_loss_mlp": 0.01062051, "balance_loss_clip": 1.04185688, "balance_loss_mlp": 1.05224979, "epoch": 0.11158875695175109, "flos": 19537524115200.0, "grad_norm": 2.1726852418191167, "language_loss": 0.8153379, "learning_rate": 3.9305774528973205e-06, "loss": 0.8376677, "num_input_tokens_seen": 40175375, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.1875, "step": 1856, "time_per_iteration": 2.474006414413452 }, { "auxiliary_loss_clip": 0.01168208, "auxiliary_loss_mlp": 0.01043927, "balance_loss_clip": 1.02313697, "balance_loss_mlp": 1.05239272, "epoch": 0.11164888020441906, "flos": 25442709989760.0, "grad_norm": 2.1807876859203734, "language_loss": 0.83362561, "learning_rate": 3.93047569469238e-06, "loss": 0.85574698, "num_input_tokens_seen": 40195715, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.15625, "step": 1857, "time_per_iteration": 2.5083889961242676 }, { "auxiliary_loss_clip": 0.01169938, "auxiliary_loss_mlp": 0.01043924, "balance_loss_clip": 1.02468395, "balance_loss_mlp": 1.04963672, "epoch": 0.11170900345708702, "flos": 15632741543040.0, "grad_norm": 2.432170776591135, "language_loss": 0.82899743, "learning_rate": 3.930373863283608e-06, "loss": 0.85113603, "num_input_tokens_seen": 40213975, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.203125, "step": 1858, "time_per_iteration": 2.42877459526062 }, { "auxiliary_loss_clip": 0.01173078, "auxiliary_loss_mlp": 0.01047842, "balance_loss_clip": 1.02776682, "balance_loss_mlp": 1.05339456, "epoch": 0.111769126709755, "flos": 23039424766080.0, "grad_norm": 2.684442458951124, "language_loss": 0.91366082, "learning_rate": 3.930271958674866e-06, "loss": 0.93586999, "num_input_tokens_seen": 40233905, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1953125, "step": 1859, "time_per_iteration": 2.513679265975952 }, { "auxiliary_loss_clip": 0.01169254, "auxiliary_loss_mlp": 0.01048886, "balance_loss_clip": 1.02814412, "balance_loss_mlp": 1.0499624, "epoch": 0.11182924996242297, "flos": 20850705434880.0, "grad_norm": 2.2006720335922347, "language_loss": 0.8190105, "learning_rate": 3.930169980870018e-06, "loss": 0.84119189, "num_input_tokens_seen": 40252810, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.1953125, "step": 1860, "time_per_iteration": 2.45784330368042 }, { "auxiliary_loss_clip": 0.01166883, "auxiliary_loss_mlp": 0.01058851, "balance_loss_clip": 1.03858507, "balance_loss_mlp": 1.05158985, "epoch": 0.11188937321509093, "flos": 17455315587840.0, "grad_norm": 1.9861181255373248, "language_loss": 0.75166869, "learning_rate": 3.930067929872931e-06, "loss": 0.77392602, "num_input_tokens_seen": 40272000, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.15625, "step": 1861, "time_per_iteration": 2.4963722229003906 }, { "auxiliary_loss_clip": 0.01165284, "auxiliary_loss_mlp": 0.0105285, "balance_loss_clip": 1.03378797, "balance_loss_mlp": 1.0502336, "epoch": 0.11194949646775891, "flos": 24095916518400.0, "grad_norm": 1.8188047746757683, "language_loss": 0.88841164, "learning_rate": 3.929965805687474e-06, "loss": 0.91059303, "num_input_tokens_seen": 40290660, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.1484375, "step": 1862, "time_per_iteration": 2.5136749744415283 }, { "auxiliary_loss_clip": 0.01171251, "auxiliary_loss_mlp": 0.01057936, "balance_loss_clip": 1.03795683, "balance_loss_mlp": 1.05306065, "epoch": 0.11200961972042688, "flos": 25153880728320.0, "grad_norm": 2.3402758757847897, "language_loss": 0.87096262, "learning_rate": 3.92986360831752e-06, "loss": 0.89325446, "num_input_tokens_seen": 40307820, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.1796875, "step": 1863, "time_per_iteration": 2.530827283859253 }, { "auxiliary_loss_clip": 0.01169529, "auxiliary_loss_mlp": 0.01054147, "balance_loss_clip": 1.03202164, "balance_loss_mlp": 1.05077744, "epoch": 0.11206974297309484, "flos": 21288312829440.0, "grad_norm": 1.8940961814939037, "language_loss": 0.64027303, "learning_rate": 3.929761337766945e-06, "loss": 0.66250974, "num_input_tokens_seen": 40327430, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.1875, "step": 1864, "time_per_iteration": 2.467912435531616 }, { "auxiliary_loss_clip": 0.01168981, "auxiliary_loss_mlp": 0.01043415, "balance_loss_clip": 1.02493763, "balance_loss_mlp": 1.05326343, "epoch": 0.11212986622576282, "flos": 18915982151040.0, "grad_norm": 2.0279498366468127, "language_loss": 0.74107325, "learning_rate": 3.929658994039627e-06, "loss": 0.76319718, "num_input_tokens_seen": 40344545, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.15625, "step": 1865, "time_per_iteration": 2.452613592147827 }, { "auxiliary_loss_clip": 0.0116737, "auxiliary_loss_mlp": 0.01058303, "balance_loss_clip": 1.0359627, "balance_loss_mlp": 1.05046356, "epoch": 0.11218998947843078, "flos": 22054754257920.0, "grad_norm": 2.7806756059233564, "language_loss": 0.84308743, "learning_rate": 3.929556577139446e-06, "loss": 0.86534417, "num_input_tokens_seen": 40362300, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.171875, "step": 1866, "time_per_iteration": 2.4573733806610107 }, { "auxiliary_loss_clip": 0.01167914, "auxiliary_loss_mlp": 0.01048591, "balance_loss_clip": 1.02788472, "balance_loss_mlp": 1.05044007, "epoch": 0.11225011273109875, "flos": 24571697091840.0, "grad_norm": 1.5941631063230974, "language_loss": 0.81369805, "learning_rate": 3.929454087070286e-06, "loss": 0.83586305, "num_input_tokens_seen": 40384720, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.171875, "step": 1867, "time_per_iteration": 2.59070086479187 }, { "auxiliary_loss_clip": 0.01170055, "auxiliary_loss_mlp": 0.0105593, "balance_loss_clip": 1.03615308, "balance_loss_mlp": 1.05278182, "epoch": 0.11231023598376672, "flos": 28438665621120.0, "grad_norm": 3.5461525174953374, "language_loss": 0.87187904, "learning_rate": 3.929351523836035e-06, "loss": 0.89413893, "num_input_tokens_seen": 40404000, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.171875, "step": 1868, "time_per_iteration": 2.5187952518463135 }, { "auxiliary_loss_clip": 0.01167908, "auxiliary_loss_mlp": 0.01051258, "balance_loss_clip": 1.03165972, "balance_loss_mlp": 1.05259228, "epoch": 0.1123703592364347, "flos": 14426466076800.0, "grad_norm": 2.5204091418949512, "language_loss": 0.68218535, "learning_rate": 3.9292488874405795e-06, "loss": 0.704377, "num_input_tokens_seen": 40418665, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.15625, "step": 1869, "time_per_iteration": 2.4406583309173584 }, { "auxiliary_loss_clip": 0.01171813, "auxiliary_loss_mlp": 0.01053289, "balance_loss_clip": 1.0325346, "balance_loss_mlp": 1.05070972, "epoch": 0.11243048248910266, "flos": 22236282616320.0, "grad_norm": 1.6628010824407924, "language_loss": 0.77384847, "learning_rate": 3.929146177887814e-06, "loss": 0.79609948, "num_input_tokens_seen": 40437870, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.2109375, "step": 1870, "time_per_iteration": 2.462353229522705 }, { "auxiliary_loss_clip": 0.01171813, "auxiliary_loss_mlp": 0.01054962, "balance_loss_clip": 1.03350389, "balance_loss_mlp": 1.05135024, "epoch": 0.11249060574177062, "flos": 18584167288320.0, "grad_norm": 2.11547140054311, "language_loss": 0.76415229, "learning_rate": 3.929043395181631e-06, "loss": 0.78642005, "num_input_tokens_seen": 40455570, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.203125, "step": 1871, "time_per_iteration": 2.4600155353546143 }, { "auxiliary_loss_clip": 0.01169108, "auxiliary_loss_mlp": 0.01047007, "balance_loss_clip": 1.02773046, "balance_loss_mlp": 1.05258763, "epoch": 0.1125507289944386, "flos": 22856567604480.0, "grad_norm": 2.0333829475844634, "language_loss": 0.82257479, "learning_rate": 3.928940539325929e-06, "loss": 0.84473598, "num_input_tokens_seen": 40473600, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.1640625, "step": 1872, "time_per_iteration": 2.460831880569458 }, { "auxiliary_loss_clip": 0.01170399, "auxiliary_loss_mlp": 0.01052118, "balance_loss_clip": 1.03261566, "balance_loss_mlp": 1.05272245, "epoch": 0.11261085224710657, "flos": 19676390094720.0, "grad_norm": 4.486254656967786, "language_loss": 0.83378577, "learning_rate": 3.9288376103246095e-06, "loss": 0.85601091, "num_input_tokens_seen": 40490025, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.171875, "step": 1873, "time_per_iteration": 2.490093946456909 }, { "auxiliary_loss_clip": 0.01173146, "auxiliary_loss_mlp": 0.01051683, "balance_loss_clip": 1.03060722, "balance_loss_mlp": 1.05149388, "epoch": 0.11267097549977453, "flos": 26063246373120.0, "grad_norm": 1.8551884604972873, "language_loss": 0.91999012, "learning_rate": 3.928734608181575e-06, "loss": 0.94223839, "num_input_tokens_seen": 40511580, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.2109375, "step": 1874, "time_per_iteration": 2.521101236343384 }, { "auxiliary_loss_clip": 0.01165067, "auxiliary_loss_mlp": 0.01060007, "balance_loss_clip": 1.04131472, "balance_loss_mlp": 1.05066299, "epoch": 0.11273109875244251, "flos": 21068036674560.0, "grad_norm": 1.444862325215125, "language_loss": 0.7538774, "learning_rate": 3.928631532900729e-06, "loss": 0.77612817, "num_input_tokens_seen": 40530155, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.140625, "step": 1875, "time_per_iteration": 2.519561767578125 }, { "auxiliary_loss_clip": 0.01164113, "auxiliary_loss_mlp": 0.01058296, "balance_loss_clip": 1.0395205, "balance_loss_mlp": 1.05161643, "epoch": 0.11279122200511048, "flos": 27088999061760.0, "grad_norm": 2.3205977190354488, "language_loss": 0.71620941, "learning_rate": 3.928528384485984e-06, "loss": 0.73843348, "num_input_tokens_seen": 40549500, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.125, "step": 1876, "time_per_iteration": 2.5185086727142334 }, { "auxiliary_loss_clip": 0.01166401, "auxiliary_loss_mlp": 0.0104969, "balance_loss_clip": 1.02955556, "balance_loss_mlp": 1.05357528, "epoch": 0.11285134525777844, "flos": 20187901722240.0, "grad_norm": 1.7972936224299974, "language_loss": 0.76910233, "learning_rate": 3.9284251629412475e-06, "loss": 0.79126322, "num_input_tokens_seen": 40567475, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.125, "step": 1877, "time_per_iteration": 2.4744479656219482 }, { "auxiliary_loss_clip": 0.01171906, "auxiliary_loss_mlp": 0.0105794, "balance_loss_clip": 1.03617239, "balance_loss_mlp": 1.05298197, "epoch": 0.11291146851044641, "flos": 12458453863680.0, "grad_norm": 4.462523570388212, "language_loss": 0.8784852, "learning_rate": 3.928321868270436e-06, "loss": 0.90078366, "num_input_tokens_seen": 40583280, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.1875, "step": 1878, "time_per_iteration": 2.427349328994751 }, { "auxiliary_loss_clip": 0.01168746, "auxiliary_loss_mlp": 0.01047768, "balance_loss_clip": 1.02750242, "balance_loss_mlp": 1.05113792, "epoch": 0.11297159176311439, "flos": 23842315520640.0, "grad_norm": 2.318506592039202, "language_loss": 0.81520945, "learning_rate": 3.928218500477466e-06, "loss": 0.83737457, "num_input_tokens_seen": 40603080, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.1796875, "step": 1879, "time_per_iteration": 2.4887735843658447 }, { "auxiliary_loss_clip": 0.01171755, "auxiliary_loss_mlp": 0.01053775, "balance_loss_clip": 1.03296065, "balance_loss_mlp": 1.05412376, "epoch": 0.11303171501578235, "flos": 29930538124800.0, "grad_norm": 2.7327365006545334, "language_loss": 0.70792937, "learning_rate": 3.928115059566259e-06, "loss": 0.73018467, "num_input_tokens_seen": 40623255, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.171875, "step": 1880, "time_per_iteration": 2.5332579612731934 }, { "auxiliary_loss_clip": 0.01165205, "auxiliary_loss_mlp": 0.01052303, "balance_loss_clip": 1.03219235, "balance_loss_mlp": 1.05072713, "epoch": 0.11309183826845032, "flos": 16180558842240.0, "grad_norm": 1.8374477295995162, "language_loss": 0.72700894, "learning_rate": 3.928011545540734e-06, "loss": 0.74918401, "num_input_tokens_seen": 40641570, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1484375, "step": 1881, "time_per_iteration": 2.4789083003997803 }, { "auxiliary_loss_clip": 0.01168495, "auxiliary_loss_mlp": 0.01053132, "balance_loss_clip": 1.03130519, "balance_loss_mlp": 1.05014455, "epoch": 0.1131519615211183, "flos": 12020702814720.0, "grad_norm": 2.4780667536445757, "language_loss": 0.7475546, "learning_rate": 3.927907958404819e-06, "loss": 0.76977092, "num_input_tokens_seen": 40658775, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.1875, "step": 1882, "time_per_iteration": 2.439584732055664 }, { "auxiliary_loss_clip": 0.01166852, "auxiliary_loss_mlp": 0.01053455, "balance_loss_clip": 1.03214037, "balance_loss_mlp": 1.05213678, "epoch": 0.11321208477378626, "flos": 26250125857920.0, "grad_norm": 2.1087630465165375, "language_loss": 0.794617, "learning_rate": 3.92780429816244e-06, "loss": 0.81682003, "num_input_tokens_seen": 40679555, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.140625, "step": 1883, "time_per_iteration": 2.5053060054779053 }, { "auxiliary_loss_clip": 0.01168698, "auxiliary_loss_mlp": 0.01050789, "balance_loss_clip": 1.030249, "balance_loss_mlp": 1.04978204, "epoch": 0.11327220802645423, "flos": 13626376583040.0, "grad_norm": 1.992810903010165, "language_loss": 0.76765203, "learning_rate": 3.927700564817529e-06, "loss": 0.7898469, "num_input_tokens_seen": 40697295, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1875, "step": 1884, "time_per_iteration": 2.424125909805298 }, { "auxiliary_loss_clip": 0.01066111, "auxiliary_loss_mlp": 0.01027181, "balance_loss_clip": 1.02405822, "balance_loss_mlp": 1.02322006, "epoch": 0.1133323312791222, "flos": 57191802814080.0, "grad_norm": 0.9742839725115606, "language_loss": 0.55183017, "learning_rate": 3.927596758374019e-06, "loss": 0.57276309, "num_input_tokens_seen": 40758095, "router_z_loss_clip": 0.03112793, "router_z_loss_mlp": 0.4296875, "step": 1885, "time_per_iteration": 3.0035946369171143 }, { "auxiliary_loss_clip": 0.01160997, "auxiliary_loss_mlp": 0.01048662, "balance_loss_clip": 1.02940965, "balance_loss_mlp": 1.04939473, "epoch": 0.11339245453179017, "flos": 24351708245760.0, "grad_norm": 6.705551003413325, "language_loss": 0.90518177, "learning_rate": 3.927492878835848e-06, "loss": 0.9272784, "num_input_tokens_seen": 40777140, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.1171875, "step": 1886, "time_per_iteration": 2.496830701828003 }, { "auxiliary_loss_clip": 0.01165999, "auxiliary_loss_mlp": 0.01047148, "balance_loss_clip": 1.02760959, "balance_loss_mlp": 1.05126166, "epoch": 0.11345257778445814, "flos": 22670693700480.0, "grad_norm": 2.1315184993504506, "language_loss": 0.85167557, "learning_rate": 3.927388926206953e-06, "loss": 0.87380707, "num_input_tokens_seen": 40797505, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.1484375, "step": 1887, "time_per_iteration": 2.514277458190918 }, { "auxiliary_loss_clip": 0.0116892, "auxiliary_loss_mlp": 0.01054021, "balance_loss_clip": 1.03556764, "balance_loss_mlp": 1.05262995, "epoch": 0.11351270103712612, "flos": 20988242611200.0, "grad_norm": 4.089241551695448, "language_loss": 0.76000953, "learning_rate": 3.927284900491277e-06, "loss": 0.7822389, "num_input_tokens_seen": 40812970, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.1640625, "step": 1888, "time_per_iteration": 4.097553730010986 }, { "auxiliary_loss_clip": 0.01177736, "auxiliary_loss_mlp": 0.01057058, "balance_loss_clip": 1.03506374, "balance_loss_mlp": 1.05685639, "epoch": 0.11357282428979408, "flos": 37347923600640.0, "grad_norm": 1.6644906452334354, "language_loss": 0.68068165, "learning_rate": 3.927180801692764e-06, "loss": 0.70302957, "num_input_tokens_seen": 40837745, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.2109375, "step": 1889, "time_per_iteration": 2.6439504623413086 }, { "auxiliary_loss_clip": 0.01170006, "auxiliary_loss_mlp": 0.01050814, "balance_loss_clip": 1.03091764, "balance_loss_mlp": 1.05402136, "epoch": 0.11363294754246205, "flos": 21757018423680.0, "grad_norm": 1.5805458134863593, "language_loss": 0.83986032, "learning_rate": 3.927076629815362e-06, "loss": 0.86206853, "num_input_tokens_seen": 40856490, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.15625, "step": 1890, "time_per_iteration": 3.901899576187134 }, { "auxiliary_loss_clip": 0.01165735, "auxiliary_loss_mlp": 0.01054857, "balance_loss_clip": 1.0349009, "balance_loss_mlp": 1.05050492, "epoch": 0.11369307079513001, "flos": 22601637803520.0, "grad_norm": 2.2015890901875124, "language_loss": 0.64819026, "learning_rate": 3.926972384863022e-06, "loss": 0.67039621, "num_input_tokens_seen": 40874070, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.15625, "step": 1891, "time_per_iteration": 2.483767509460449 }, { "auxiliary_loss_clip": 0.0117229, "auxiliary_loss_mlp": 0.01045473, "balance_loss_clip": 1.02605414, "balance_loss_mlp": 1.05367255, "epoch": 0.11375319404779799, "flos": 21944257044480.0, "grad_norm": 2.0839041056530023, "language_loss": 0.88330257, "learning_rate": 3.9268680668396956e-06, "loss": 0.90548021, "num_input_tokens_seen": 40892425, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.1875, "step": 1892, "time_per_iteration": 2.4567110538482666 }, { "auxiliary_loss_clip": 0.01170254, "auxiliary_loss_mlp": 0.01063913, "balance_loss_clip": 1.04381442, "balance_loss_mlp": 1.05276108, "epoch": 0.11381331730046595, "flos": 26395456285440.0, "grad_norm": 2.141984300647526, "language_loss": 0.7315588, "learning_rate": 3.926763675749339e-06, "loss": 0.75390041, "num_input_tokens_seen": 40912190, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.171875, "step": 1893, "time_per_iteration": 2.5462212562561035 }, { "auxiliary_loss_clip": 0.0116601, "auxiliary_loss_mlp": 0.0106369, "balance_loss_clip": 1.04297149, "balance_loss_mlp": 1.04957604, "epoch": 0.11387344055313392, "flos": 23804716959360.0, "grad_norm": 1.9780282154483846, "language_loss": 0.79750645, "learning_rate": 3.92665921159591e-06, "loss": 0.81980348, "num_input_tokens_seen": 40928395, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1640625, "step": 1894, "time_per_iteration": 2.4739131927490234 }, { "auxiliary_loss_clip": 0.0117555, "auxiliary_loss_mlp": 0.01063671, "balance_loss_clip": 1.04282141, "balance_loss_mlp": 1.05386877, "epoch": 0.1139335638058019, "flos": 34522865902080.0, "grad_norm": 9.052801675371528, "language_loss": 0.79672259, "learning_rate": 3.926554674383371e-06, "loss": 0.8191148, "num_input_tokens_seen": 40946555, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.21875, "step": 1895, "time_per_iteration": 2.5436949729919434 }, { "auxiliary_loss_clip": 0.01069278, "auxiliary_loss_mlp": 0.01056305, "balance_loss_clip": 1.05374205, "balance_loss_mlp": 1.02731848, "epoch": 0.11399368705846986, "flos": 70587811520640.0, "grad_norm": 0.8149956182025605, "language_loss": 0.63428926, "learning_rate": 3.926450064115686e-06, "loss": 0.65554512, "num_input_tokens_seen": 41004910, "router_z_loss_clip": 0.02563477, "router_z_loss_mlp": 0.41796875, "step": 1896, "time_per_iteration": 3.1773431301116943 }, { "auxiliary_loss_clip": 0.01169662, "auxiliary_loss_mlp": 0.01053425, "balance_loss_clip": 1.0317167, "balance_loss_mlp": 1.05441618, "epoch": 0.11405381031113783, "flos": 21324259365120.0, "grad_norm": 1.768641383147301, "language_loss": 0.85135555, "learning_rate": 3.926345380796821e-06, "loss": 0.87358642, "num_input_tokens_seen": 41026385, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.1484375, "step": 1897, "time_per_iteration": 2.5261762142181396 }, { "auxiliary_loss_clip": 0.01171115, "auxiliary_loss_mlp": 0.01049357, "balance_loss_clip": 1.02849555, "balance_loss_mlp": 1.05349469, "epoch": 0.11411393356380581, "flos": 19719627091200.0, "grad_norm": 3.007916570428595, "language_loss": 0.79719812, "learning_rate": 3.9262406244307465e-06, "loss": 0.81940281, "num_input_tokens_seen": 41045315, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.171875, "step": 1898, "time_per_iteration": 2.521383047103882 }, { "auxiliary_loss_clip": 0.01173215, "auxiliary_loss_mlp": 0.01055328, "balance_loss_clip": 1.03361964, "balance_loss_mlp": 1.05370283, "epoch": 0.11417405681647377, "flos": 17530440883200.0, "grad_norm": 2.2245713772387736, "language_loss": 0.73299885, "learning_rate": 3.926135795021435e-06, "loss": 0.75528431, "num_input_tokens_seen": 41063390, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.1953125, "step": 1899, "time_per_iteration": 2.5065982341766357 }, { "auxiliary_loss_clip": 0.01066043, "auxiliary_loss_mlp": 0.01005177, "balance_loss_clip": 1.00261438, "balance_loss_mlp": 1.02453864, "epoch": 0.11423418006914174, "flos": 59674666619520.0, "grad_norm": 0.9012349118029712, "language_loss": 0.63340127, "learning_rate": 3.92603089257286e-06, "loss": 0.65411347, "num_input_tokens_seen": 41124180, "router_z_loss_clip": 0.02563477, "router_z_loss_mlp": 0.4140625, "step": 1900, "time_per_iteration": 3.0510520935058594 }, { "auxiliary_loss_clip": 0.01168976, "auxiliary_loss_mlp": 0.01050889, "balance_loss_clip": 1.0304327, "balance_loss_mlp": 1.05094254, "epoch": 0.1142943033218097, "flos": 22963114321920.0, "grad_norm": 1.8983240526198557, "language_loss": 0.7823655, "learning_rate": 3.925925917089001e-06, "loss": 0.80456412, "num_input_tokens_seen": 41143485, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1796875, "step": 1901, "time_per_iteration": 2.4906885623931885 }, { "auxiliary_loss_clip": 0.01169519, "auxiliary_loss_mlp": 0.01053041, "balance_loss_clip": 1.03260839, "balance_loss_mlp": 1.05325484, "epoch": 0.11435442657447768, "flos": 18256267008000.0, "grad_norm": 1.9472572872925304, "language_loss": 0.83815038, "learning_rate": 3.925820868573839e-06, "loss": 0.860376, "num_input_tokens_seen": 41161695, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.1640625, "step": 1902, "time_per_iteration": 2.420600652694702 }, { "auxiliary_loss_clip": 0.0117014, "auxiliary_loss_mlp": 0.01047073, "balance_loss_clip": 1.02587783, "balance_loss_mlp": 1.0521512, "epoch": 0.11441454982714565, "flos": 24061191045120.0, "grad_norm": 1.9166815742357464, "language_loss": 0.77645934, "learning_rate": 3.925715747031356e-06, "loss": 0.79863149, "num_input_tokens_seen": 41181715, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.1796875, "step": 1903, "time_per_iteration": 2.4929964542388916 }, { "auxiliary_loss_clip": 0.01169308, "auxiliary_loss_mlp": 0.01041348, "balance_loss_clip": 1.02318084, "balance_loss_mlp": 1.05356681, "epoch": 0.11447467307981361, "flos": 25337707557120.0, "grad_norm": 2.0902052103924174, "language_loss": 0.75830901, "learning_rate": 3.925610552465539e-06, "loss": 0.78041559, "num_input_tokens_seen": 41201770, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.15625, "step": 1904, "time_per_iteration": 2.4890975952148438 }, { "auxiliary_loss_clip": 0.011686, "auxiliary_loss_mlp": 0.01050538, "balance_loss_clip": 1.02934253, "balance_loss_mlp": 1.05299449, "epoch": 0.11453479633248159, "flos": 21726063878400.0, "grad_norm": 2.0996283013103176, "language_loss": 0.92069948, "learning_rate": 3.9255052848803764e-06, "loss": 0.94289076, "num_input_tokens_seen": 41220590, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.15625, "step": 1905, "time_per_iteration": 2.4841980934143066 }, { "auxiliary_loss_clip": 0.01174234, "auxiliary_loss_mlp": 0.01045499, "balance_loss_clip": 1.02418447, "balance_loss_mlp": 1.05000329, "epoch": 0.11459491958514956, "flos": 12969714096000.0, "grad_norm": 2.9109376765036843, "language_loss": 0.77568018, "learning_rate": 3.925399944279861e-06, "loss": 0.79787755, "num_input_tokens_seen": 41237250, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.2421875, "step": 1906, "time_per_iteration": 2.4309864044189453 }, { "auxiliary_loss_clip": 0.01169641, "auxiliary_loss_mlp": 0.01046505, "balance_loss_clip": 1.02592945, "balance_loss_mlp": 1.05218625, "epoch": 0.11465504283781752, "flos": 22711273090560.0, "grad_norm": 2.2513824264198834, "language_loss": 0.81973046, "learning_rate": 3.925294530667986e-06, "loss": 0.84189188, "num_input_tokens_seen": 41256680, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.171875, "step": 1907, "time_per_iteration": 2.4858434200286865 }, { "auxiliary_loss_clip": 0.01167599, "auxiliary_loss_mlp": 0.01053987, "balance_loss_clip": 1.034675, "balance_loss_mlp": 1.05284011, "epoch": 0.1147151660904855, "flos": 23398387332480.0, "grad_norm": 2.398176105675946, "language_loss": 0.84468555, "learning_rate": 3.92518904404875e-06, "loss": 0.8669014, "num_input_tokens_seen": 41270955, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.140625, "step": 1908, "time_per_iteration": 2.4546754360198975 }, { "auxiliary_loss_clip": 0.01065793, "auxiliary_loss_mlp": 0.01038019, "balance_loss_clip": 1.03561068, "balance_loss_mlp": 1.02423739, "epoch": 0.11477528934315347, "flos": 63011843498880.0, "grad_norm": 0.9327999716087884, "language_loss": 0.61066127, "learning_rate": 3.925083484426153e-06, "loss": 0.63169932, "num_input_tokens_seen": 41319180, "router_z_loss_clip": 0.02404785, "router_z_loss_mlp": 0.41601562, "step": 1909, "time_per_iteration": 2.8438148498535156 }, { "auxiliary_loss_clip": 0.01173412, "auxiliary_loss_mlp": 0.01046685, "balance_loss_clip": 1.02676487, "balance_loss_mlp": 1.05633795, "epoch": 0.11483541259582143, "flos": 16325601960960.0, "grad_norm": 2.1185749426494627, "language_loss": 0.78801507, "learning_rate": 3.924977851804197e-06, "loss": 0.81021607, "num_input_tokens_seen": 41337480, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.171875, "step": 1910, "time_per_iteration": 2.53202748298645 }, { "auxiliary_loss_clip": 0.01174754, "auxiliary_loss_mlp": 0.01046747, "balance_loss_clip": 1.02669644, "balance_loss_mlp": 1.05568767, "epoch": 0.1148955358484894, "flos": 21580410228480.0, "grad_norm": 2.3840535780388095, "language_loss": 0.77049899, "learning_rate": 3.9248721461868875e-06, "loss": 0.792714, "num_input_tokens_seen": 41354650, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.1875, "step": 1911, "time_per_iteration": 2.487924337387085 }, { "auxiliary_loss_clip": 0.01163014, "auxiliary_loss_mlp": 0.01046564, "balance_loss_clip": 1.02690697, "balance_loss_mlp": 1.05153441, "epoch": 0.11495565910115738, "flos": 27673696650240.0, "grad_norm": 1.6067806595286476, "language_loss": 0.79200596, "learning_rate": 3.9247663675782336e-06, "loss": 0.81410182, "num_input_tokens_seen": 41376935, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.1171875, "step": 1912, "time_per_iteration": 2.5373783111572266 }, { "auxiliary_loss_clip": 0.01168551, "auxiliary_loss_mlp": 0.01057708, "balance_loss_clip": 1.03702497, "balance_loss_mlp": 1.05312037, "epoch": 0.11501578235382534, "flos": 20632368614400.0, "grad_norm": 2.208010261705775, "language_loss": 0.78202474, "learning_rate": 3.924660515982246e-06, "loss": 0.80428737, "num_input_tokens_seen": 41396105, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.15625, "step": 1913, "time_per_iteration": 2.499854564666748 }, { "auxiliary_loss_clip": 0.01169102, "auxiliary_loss_mlp": 0.01047794, "balance_loss_clip": 1.02681279, "balance_loss_mlp": 1.05105293, "epoch": 0.1150759056064933, "flos": 19829046896640.0, "grad_norm": 2.0393397490555936, "language_loss": 0.69950765, "learning_rate": 3.924554591402939e-06, "loss": 0.72167665, "num_input_tokens_seen": 41415600, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.171875, "step": 1914, "time_per_iteration": 2.4917373657226562 }, { "auxiliary_loss_clip": 0.01062639, "auxiliary_loss_mlp": 0.01052191, "balance_loss_clip": 1.04996192, "balance_loss_mlp": 1.02164435, "epoch": 0.11513602885916129, "flos": 70045776311040.0, "grad_norm": 0.7765811281159201, "language_loss": 0.60993981, "learning_rate": 3.92444859384433e-06, "loss": 0.63108808, "num_input_tokens_seen": 41478760, "router_z_loss_clip": 0.02233887, "router_z_loss_mlp": 0.41015625, "step": 1915, "time_per_iteration": 3.2032155990600586 }, { "auxiliary_loss_clip": 0.01168868, "auxiliary_loss_mlp": 0.01056822, "balance_loss_clip": 1.03646147, "balance_loss_mlp": 1.05426276, "epoch": 0.11519615211182925, "flos": 15741730385280.0, "grad_norm": 2.065064724294272, "language_loss": 0.93336284, "learning_rate": 3.924342523310436e-06, "loss": 0.95561969, "num_input_tokens_seen": 41495720, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1484375, "step": 1916, "time_per_iteration": 2.4716315269470215 }, { "auxiliary_loss_clip": 0.01167205, "auxiliary_loss_mlp": 0.010623, "balance_loss_clip": 1.04032993, "balance_loss_mlp": 1.05106068, "epoch": 0.11525627536449722, "flos": 20667632791680.0, "grad_norm": 2.543943774560744, "language_loss": 0.724787, "learning_rate": 3.9242363798052806e-06, "loss": 0.74708205, "num_input_tokens_seen": 41513585, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.1640625, "step": 1917, "time_per_iteration": 2.487372875213623 }, { "auxiliary_loss_clip": 0.01166874, "auxiliary_loss_mlp": 0.01047675, "balance_loss_clip": 1.02698064, "balance_loss_mlp": 1.05234694, "epoch": 0.1153163986171652, "flos": 20303283185280.0, "grad_norm": 2.0110275594151483, "language_loss": 0.74276882, "learning_rate": 3.92413016333289e-06, "loss": 0.76491427, "num_input_tokens_seen": 41533390, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.140625, "step": 1918, "time_per_iteration": 2.49491548538208 }, { "auxiliary_loss_clip": 0.01170283, "auxiliary_loss_mlp": 0.01044798, "balance_loss_clip": 1.02473485, "balance_loss_mlp": 1.0508337, "epoch": 0.11537652186983316, "flos": 17639321984640.0, "grad_norm": 2.4084816581901385, "language_loss": 0.86552346, "learning_rate": 3.92402387389729e-06, "loss": 0.88767427, "num_input_tokens_seen": 41551015, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1953125, "step": 1919, "time_per_iteration": 2.4363045692443848 }, { "auxiliary_loss_clip": 0.01165707, "auxiliary_loss_mlp": 0.01056883, "balance_loss_clip": 1.03571153, "balance_loss_mlp": 1.05015326, "epoch": 0.11543664512250112, "flos": 21069401391360.0, "grad_norm": 1.975481330555711, "language_loss": 0.86419165, "learning_rate": 3.923917511502512e-06, "loss": 0.88641763, "num_input_tokens_seen": 41568055, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.15625, "step": 1920, "time_per_iteration": 2.4834983348846436 }, { "auxiliary_loss_clip": 0.01162947, "auxiliary_loss_mlp": 0.01047743, "balance_loss_clip": 1.02758467, "balance_loss_mlp": 1.05056667, "epoch": 0.11549676837516909, "flos": 22747542848640.0, "grad_norm": 2.0176191117876296, "language_loss": 0.79542488, "learning_rate": 3.923811076152589e-06, "loss": 0.81753182, "num_input_tokens_seen": 41587435, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.125, "step": 1921, "time_per_iteration": 2.484232187271118 }, { "auxiliary_loss_clip": 0.01171184, "auxiliary_loss_mlp": 0.01054866, "balance_loss_clip": 1.03343248, "balance_loss_mlp": 1.05202413, "epoch": 0.11555689162783707, "flos": 19168972617600.0, "grad_norm": 1.98206444662024, "language_loss": 0.78639853, "learning_rate": 3.923704567851557e-06, "loss": 0.80865908, "num_input_tokens_seen": 41604975, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.1875, "step": 1922, "time_per_iteration": 2.4560694694519043 }, { "auxiliary_loss_clip": 0.0116898, "auxiliary_loss_mlp": 0.0105625, "balance_loss_clip": 1.03593647, "balance_loss_mlp": 1.05178738, "epoch": 0.11561701488050503, "flos": 24572056227840.0, "grad_norm": 1.862655060574594, "language_loss": 0.8422457, "learning_rate": 3.923597986603456e-06, "loss": 0.86449802, "num_input_tokens_seen": 41626155, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.171875, "step": 1923, "time_per_iteration": 2.506255626678467 }, { "auxiliary_loss_clip": 0.01168258, "auxiliary_loss_mlp": 0.01049037, "balance_loss_clip": 1.02800822, "balance_loss_mlp": 1.05227149, "epoch": 0.115677138133173, "flos": 17092546179840.0, "grad_norm": 2.4035624347495466, "language_loss": 0.81004465, "learning_rate": 3.9234913324123264e-06, "loss": 0.83221763, "num_input_tokens_seen": 41644805, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.15625, "step": 1924, "time_per_iteration": 2.4488017559051514 }, { "auxiliary_loss_clip": 0.01060842, "auxiliary_loss_mlp": 0.01006033, "balance_loss_clip": 1.00361276, "balance_loss_mlp": 1.02034676, "epoch": 0.11573726138584098, "flos": 62703875266560.0, "grad_norm": 0.8230624470543655, "language_loss": 0.61193705, "learning_rate": 3.923384605282212e-06, "loss": 0.63260579, "num_input_tokens_seen": 41709345, "router_z_loss_clip": 0.02416992, "router_z_loss_mlp": 0.40429688, "step": 1925, "time_per_iteration": 3.116299629211426 }, { "auxiliary_loss_clip": 0.01168323, "auxiliary_loss_mlp": 0.01067405, "balance_loss_clip": 1.04576874, "balance_loss_mlp": 1.05056238, "epoch": 0.11579738463850894, "flos": 22601135013120.0, "grad_norm": 1.774762360012752, "language_loss": 0.74565679, "learning_rate": 3.923277805217161e-06, "loss": 0.76801407, "num_input_tokens_seen": 41730210, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.171875, "step": 1926, "time_per_iteration": 2.4930577278137207 }, { "auxiliary_loss_clip": 0.01167757, "auxiliary_loss_mlp": 0.01055201, "balance_loss_clip": 1.03175235, "balance_loss_mlp": 1.04958045, "epoch": 0.11585750789117691, "flos": 21726135705600.0, "grad_norm": 2.8817256111007614, "language_loss": 0.72015584, "learning_rate": 3.923170932221222e-06, "loss": 0.74238545, "num_input_tokens_seen": 41750270, "router_z_loss_clip": 0.234375, "router_z_loss_mlp": 1.1796875, "step": 1927, "time_per_iteration": 2.453888416290283 }, { "auxiliary_loss_clip": 0.01165066, "auxiliary_loss_mlp": 0.01052532, "balance_loss_clip": 1.0315156, "balance_loss_mlp": 1.05024874, "epoch": 0.11591763114384489, "flos": 26287544851200.0, "grad_norm": 1.6156506083672244, "language_loss": 0.86846882, "learning_rate": 3.92306398629845e-06, "loss": 0.89064479, "num_input_tokens_seen": 41772975, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.1484375, "step": 1928, "time_per_iteration": 2.5267672538757324 }, { "auxiliary_loss_clip": 0.01168672, "auxiliary_loss_mlp": 0.01058383, "balance_loss_clip": 1.03740191, "balance_loss_mlp": 1.05173266, "epoch": 0.11597775439651285, "flos": 23000461488000.0, "grad_norm": 1.7226594317563046, "language_loss": 0.77546275, "learning_rate": 3.922956967452898e-06, "loss": 0.79773331, "num_input_tokens_seen": 41791765, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.171875, "step": 1929, "time_per_iteration": 2.4734673500061035 }, { "auxiliary_loss_clip": 0.01163983, "auxiliary_loss_mlp": 0.01053853, "balance_loss_clip": 1.03456473, "balance_loss_mlp": 1.0502311, "epoch": 0.11603787764918082, "flos": 31941715507200.0, "grad_norm": 1.7559954226234833, "language_loss": 0.77240407, "learning_rate": 3.922849875688626e-06, "loss": 0.79458243, "num_input_tokens_seen": 41815615, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.140625, "step": 1930, "time_per_iteration": 4.078597068786621 }, { "auxiliary_loss_clip": 0.0116575, "auxiliary_loss_mlp": 0.01048606, "balance_loss_clip": 1.02808988, "balance_loss_mlp": 1.04988027, "epoch": 0.1160980009018488, "flos": 22271654534400.0, "grad_norm": 1.7768130935030162, "language_loss": 0.72246426, "learning_rate": 3.922742711009693e-06, "loss": 0.74460781, "num_input_tokens_seen": 41834810, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.15625, "step": 1931, "time_per_iteration": 2.4724926948547363 }, { "auxiliary_loss_clip": 0.01170351, "auxiliary_loss_mlp": 0.01053868, "balance_loss_clip": 1.03210032, "balance_loss_mlp": 1.05221808, "epoch": 0.11615812415451676, "flos": 22783633038720.0, "grad_norm": 2.420266232439111, "language_loss": 0.82191455, "learning_rate": 3.922635473420164e-06, "loss": 0.84415674, "num_input_tokens_seen": 41854975, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.171875, "step": 1932, "time_per_iteration": 5.386817693710327 }, { "auxiliary_loss_clip": 0.01058232, "auxiliary_loss_mlp": 0.01002617, "balance_loss_clip": 0.99997103, "balance_loss_mlp": 1.01806211, "epoch": 0.11621824740718473, "flos": 67146096107520.0, "grad_norm": 0.7741222782122514, "language_loss": 0.61121607, "learning_rate": 3.922528162924105e-06, "loss": 0.63182449, "num_input_tokens_seen": 41911105, "router_z_loss_clip": 0.02648926, "router_z_loss_mlp": 0.40234375, "step": 1933, "time_per_iteration": 2.9919662475585938 }, { "auxiliary_loss_clip": 0.01168217, "auxiliary_loss_mlp": 0.01050246, "balance_loss_clip": 1.0294205, "balance_loss_mlp": 1.04980695, "epoch": 0.11627837065985269, "flos": 20375930442240.0, "grad_norm": 2.4502849371627202, "language_loss": 0.85836613, "learning_rate": 3.922420779525586e-06, "loss": 0.8805508, "num_input_tokens_seen": 41931750, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1875, "step": 1934, "time_per_iteration": 2.5026350021362305 }, { "auxiliary_loss_clip": 0.01175065, "auxiliary_loss_mlp": 0.01057036, "balance_loss_clip": 1.03538752, "balance_loss_mlp": 1.05481076, "epoch": 0.11633849391252067, "flos": 21725812483200.0, "grad_norm": 2.365269609277234, "language_loss": 0.65465844, "learning_rate": 3.9223133232286776e-06, "loss": 0.67697942, "num_input_tokens_seen": 41949400, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.203125, "step": 1935, "time_per_iteration": 2.4541714191436768 }, { "auxiliary_loss_clip": 0.01171511, "auxiliary_loss_mlp": 0.01052013, "balance_loss_clip": 1.03246284, "balance_loss_mlp": 1.0525707, "epoch": 0.11639861716518864, "flos": 18805341283200.0, "grad_norm": 1.9167336497728114, "language_loss": 0.75753868, "learning_rate": 3.922205794037456e-06, "loss": 0.77977389, "num_input_tokens_seen": 41968100, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.1875, "step": 1936, "time_per_iteration": 2.4600772857666016 }, { "auxiliary_loss_clip": 0.01168688, "auxiliary_loss_mlp": 0.01048532, "balance_loss_clip": 1.02612054, "balance_loss_mlp": 1.04923058, "epoch": 0.1164587404178566, "flos": 21214983214080.0, "grad_norm": 1.9327996544367574, "language_loss": 0.84182489, "learning_rate": 3.922098191955998e-06, "loss": 0.8639971, "num_input_tokens_seen": 41986375, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.1953125, "step": 1937, "time_per_iteration": 2.464109182357788 }, { "auxiliary_loss_clip": 0.01161736, "auxiliary_loss_mlp": 0.01042556, "balance_loss_clip": 1.02220702, "balance_loss_mlp": 1.04807734, "epoch": 0.11651886367052458, "flos": 27818632028160.0, "grad_norm": 1.79529208841125, "language_loss": 0.76200092, "learning_rate": 3.921990516988384e-06, "loss": 0.78404379, "num_input_tokens_seen": 42006055, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.140625, "step": 1938, "time_per_iteration": 2.5239038467407227 }, { "auxiliary_loss_clip": 0.01172006, "auxiliary_loss_mlp": 0.01049169, "balance_loss_clip": 1.02795005, "balance_loss_mlp": 1.05199814, "epoch": 0.11657898692319255, "flos": 22889569224960.0, "grad_norm": 1.9332397848872416, "language_loss": 0.79242074, "learning_rate": 3.921882769138696e-06, "loss": 0.81463253, "num_input_tokens_seen": 42024995, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.203125, "step": 1939, "time_per_iteration": 2.4580037593841553 }, { "auxiliary_loss_clip": 0.0116748, "auxiliary_loss_mlp": 0.01056668, "balance_loss_clip": 1.03531766, "balance_loss_mlp": 1.0512296, "epoch": 0.11663911017586051, "flos": 24315905364480.0, "grad_norm": 3.082511435029379, "language_loss": 0.86317784, "learning_rate": 3.9217749484110215e-06, "loss": 0.88541931, "num_input_tokens_seen": 42042640, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.1640625, "step": 1940, "time_per_iteration": 2.49933123588562 }, { "auxiliary_loss_clip": 0.01166189, "auxiliary_loss_mlp": 0.01054386, "balance_loss_clip": 1.03511024, "balance_loss_mlp": 1.05226803, "epoch": 0.11669923342852849, "flos": 42340152470400.0, "grad_norm": 1.585265175331628, "language_loss": 0.75603735, "learning_rate": 3.921667054809449e-06, "loss": 0.77824306, "num_input_tokens_seen": 42067005, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.140625, "step": 1941, "time_per_iteration": 2.65682315826416 }, { "auxiliary_loss_clip": 0.01165178, "auxiliary_loss_mlp": 0.01063378, "balance_loss_clip": 1.04262364, "balance_loss_mlp": 1.04922783, "epoch": 0.11675935668119646, "flos": 14642288945280.0, "grad_norm": 2.4070041113902474, "language_loss": 0.88254714, "learning_rate": 3.921559088338068e-06, "loss": 0.90483272, "num_input_tokens_seen": 42082295, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.15625, "step": 1942, "time_per_iteration": 2.444000720977783 }, { "auxiliary_loss_clip": 0.01163022, "auxiliary_loss_mlp": 0.01046404, "balance_loss_clip": 1.02760446, "balance_loss_mlp": 1.04898977, "epoch": 0.11681947993386442, "flos": 35116470063360.0, "grad_norm": 1.7588767478372236, "language_loss": 0.67770886, "learning_rate": 3.921451049000975e-06, "loss": 0.69980317, "num_input_tokens_seen": 42105295, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.140625, "step": 1943, "time_per_iteration": 2.581491231918335 }, { "auxiliary_loss_clip": 0.01167183, "auxiliary_loss_mlp": 0.01047493, "balance_loss_clip": 1.02692914, "balance_loss_mlp": 1.05191469, "epoch": 0.11687960318653239, "flos": 38983259024640.0, "grad_norm": 3.2559912545339675, "language_loss": 0.69838524, "learning_rate": 3.921342936802265e-06, "loss": 0.72053194, "num_input_tokens_seen": 42125520, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.15625, "step": 1944, "time_per_iteration": 2.6392006874084473 }, { "auxiliary_loss_clip": 0.01165508, "auxiliary_loss_mlp": 0.01047772, "balance_loss_clip": 1.02822137, "balance_loss_mlp": 1.04959798, "epoch": 0.11693972643920036, "flos": 25994980575360.0, "grad_norm": 1.6890943193368573, "language_loss": 0.83009404, "learning_rate": 3.921234751746038e-06, "loss": 0.85222685, "num_input_tokens_seen": 42146335, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.15625, "step": 1945, "time_per_iteration": 2.5085463523864746 }, { "auxiliary_loss_clip": 0.01165311, "auxiliary_loss_mlp": 0.01056429, "balance_loss_clip": 1.03628278, "balance_loss_mlp": 1.04891205, "epoch": 0.11699984969186833, "flos": 27272107618560.0, "grad_norm": 2.0276924336854614, "language_loss": 0.76271904, "learning_rate": 3.9211264938363975e-06, "loss": 0.78493649, "num_input_tokens_seen": 42165320, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1640625, "step": 1946, "time_per_iteration": 2.563936471939087 }, { "auxiliary_loss_clip": 0.01165706, "auxiliary_loss_mlp": 0.0105213, "balance_loss_clip": 1.03254437, "balance_loss_mlp": 1.05256951, "epoch": 0.1170599729445363, "flos": 15267853232640.0, "grad_norm": 1.9179537084983356, "language_loss": 0.69238365, "learning_rate": 3.921018163077448e-06, "loss": 0.714562, "num_input_tokens_seen": 42182955, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.1328125, "step": 1947, "time_per_iteration": 2.4195284843444824 }, { "auxiliary_loss_clip": 0.0117123, "auxiliary_loss_mlp": 0.01062849, "balance_loss_clip": 1.04158235, "balance_loss_mlp": 1.05532944, "epoch": 0.11712009619720427, "flos": 17164439251200.0, "grad_norm": 2.2759607259963937, "language_loss": 0.84787595, "learning_rate": 3.920909759473295e-06, "loss": 0.87021673, "num_input_tokens_seen": 42200760, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.15625, "step": 1948, "time_per_iteration": 2.439720392227173 }, { "auxiliary_loss_clip": 0.01061427, "auxiliary_loss_mlp": 0.01021485, "balance_loss_clip": 1.01840901, "balance_loss_mlp": 1.02152538, "epoch": 0.11718021944987224, "flos": 70940991997440.0, "grad_norm": 0.8248782300769559, "language_loss": 0.65135396, "learning_rate": 3.920801283028054e-06, "loss": 0.67218304, "num_input_tokens_seen": 42265745, "router_z_loss_clip": 0.03076172, "router_z_loss_mlp": 0.3984375, "step": 1949, "time_per_iteration": 3.0919716358184814 }, { "auxiliary_loss_clip": 0.01166819, "auxiliary_loss_mlp": 0.01055455, "balance_loss_clip": 1.03449798, "balance_loss_mlp": 1.05333638, "epoch": 0.1172403427025402, "flos": 27453456408960.0, "grad_norm": 1.9197332188994582, "language_loss": 0.71928006, "learning_rate": 3.920692733745835e-06, "loss": 0.74150282, "num_input_tokens_seen": 42286245, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.140625, "step": 1950, "time_per_iteration": 2.528087615966797 }, { "auxiliary_loss_clip": 0.0117244, "auxiliary_loss_mlp": 0.01055708, "balance_loss_clip": 1.03441668, "balance_loss_mlp": 1.05407286, "epoch": 0.11730046595520818, "flos": 15668723992320.0, "grad_norm": 2.256434916786722, "language_loss": 0.76415199, "learning_rate": 3.920584111630755e-06, "loss": 0.78643346, "num_input_tokens_seen": 42302710, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.1875, "step": 1951, "time_per_iteration": 2.419050455093384 }, { "auxiliary_loss_clip": 0.01170353, "auxiliary_loss_mlp": 0.01056167, "balance_loss_clip": 1.03534126, "balance_loss_mlp": 1.05506837, "epoch": 0.11736058920787615, "flos": 25630164092160.0, "grad_norm": 1.801229313585266, "language_loss": 0.76541865, "learning_rate": 3.9204754166869325e-06, "loss": 0.78768384, "num_input_tokens_seen": 42324115, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.15625, "step": 1952, "time_per_iteration": 2.531975269317627 }, { "auxiliary_loss_clip": 0.01169158, "auxiliary_loss_mlp": 0.01052857, "balance_loss_clip": 1.03145933, "balance_loss_mlp": 1.0502429, "epoch": 0.11742071246054411, "flos": 21434289701760.0, "grad_norm": 1.9419012958356647, "language_loss": 0.72314608, "learning_rate": 3.920366648918491e-06, "loss": 0.74536633, "num_input_tokens_seen": 42342505, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.1875, "step": 1953, "time_per_iteration": 2.467849016189575 }, { "auxiliary_loss_clip": 0.01178167, "auxiliary_loss_mlp": 0.0105034, "balance_loss_clip": 1.02770257, "balance_loss_mlp": 1.05620909, "epoch": 0.11748083571321208, "flos": 15997845335040.0, "grad_norm": 2.325901369011954, "language_loss": 0.79309106, "learning_rate": 3.920257808329552e-06, "loss": 0.81537616, "num_input_tokens_seen": 42360525, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.21875, "step": 1954, "time_per_iteration": 2.4720804691314697 }, { "auxiliary_loss_clip": 0.01172729, "auxiliary_loss_mlp": 0.01056966, "balance_loss_clip": 1.03426838, "balance_loss_mlp": 1.05356431, "epoch": 0.11754095896588006, "flos": 16180056051840.0, "grad_norm": 8.958357012383363, "language_loss": 0.86016095, "learning_rate": 3.920148894924246e-06, "loss": 0.88245785, "num_input_tokens_seen": 42377045, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.1875, "step": 1955, "time_per_iteration": 2.4313769340515137 }, { "auxiliary_loss_clip": 0.01173299, "auxiliary_loss_mlp": 0.01052848, "balance_loss_clip": 1.03175938, "balance_loss_mlp": 1.05323958, "epoch": 0.11760108221854802, "flos": 13261596013440.0, "grad_norm": 2.8304813398610156, "language_loss": 0.78127098, "learning_rate": 3.920039908706701e-06, "loss": 0.80353248, "num_input_tokens_seen": 42393960, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.203125, "step": 1956, "time_per_iteration": 2.4500794410705566 }, { "auxiliary_loss_clip": 0.01167177, "auxiliary_loss_mlp": 0.01052614, "balance_loss_clip": 1.03002441, "balance_loss_mlp": 1.05276704, "epoch": 0.11766120547121599, "flos": 24498439303680.0, "grad_norm": 2.1491977940692224, "language_loss": 0.80381519, "learning_rate": 3.91993084968105e-06, "loss": 0.82601309, "num_input_tokens_seen": 42413160, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.140625, "step": 1957, "time_per_iteration": 2.509524345397949 }, { "auxiliary_loss_clip": 0.01177592, "auxiliary_loss_mlp": 0.01050369, "balance_loss_clip": 1.02857745, "balance_loss_mlp": 1.05676842, "epoch": 0.11772132872388397, "flos": 17784005967360.0, "grad_norm": 3.514054821301318, "language_loss": 0.78581512, "learning_rate": 3.919821717851428e-06, "loss": 0.80809474, "num_input_tokens_seen": 42432590, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.203125, "step": 1958, "time_per_iteration": 2.4644181728363037 }, { "auxiliary_loss_clip": 0.01171609, "auxiliary_loss_mlp": 0.01050819, "balance_loss_clip": 1.02868164, "balance_loss_mlp": 1.05392504, "epoch": 0.11778145197655193, "flos": 13217030213760.0, "grad_norm": 1.9629574806588568, "language_loss": 0.76542473, "learning_rate": 3.919712513221976e-06, "loss": 0.7876491, "num_input_tokens_seen": 42450135, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.1796875, "step": 1959, "time_per_iteration": 2.416642665863037 }, { "auxiliary_loss_clip": 0.01174408, "auxiliary_loss_mlp": 0.01049939, "balance_loss_clip": 1.02895796, "balance_loss_mlp": 1.05429089, "epoch": 0.1178415752292199, "flos": 20230204965120.0, "grad_norm": 4.332810245750005, "language_loss": 0.69985783, "learning_rate": 3.919603235796832e-06, "loss": 0.72210133, "num_input_tokens_seen": 42470050, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.203125, "step": 1960, "time_per_iteration": 2.4767067432403564 }, { "auxiliary_loss_clip": 0.01178596, "auxiliary_loss_mlp": 0.01050891, "balance_loss_clip": 1.02865851, "balance_loss_mlp": 1.05541849, "epoch": 0.11790169848188788, "flos": 13040134709760.0, "grad_norm": 2.4407690077098314, "language_loss": 0.81547046, "learning_rate": 3.9194938855801406e-06, "loss": 0.83776534, "num_input_tokens_seen": 42484335, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.234375, "step": 1961, "time_per_iteration": 2.40480375289917 }, { "auxiliary_loss_clip": 0.01165679, "auxiliary_loss_mlp": 0.0105831, "balance_loss_clip": 1.03699565, "balance_loss_mlp": 1.05181885, "epoch": 0.11796182173455584, "flos": 22265728790400.0, "grad_norm": 1.7470991565741882, "language_loss": 0.92109478, "learning_rate": 3.919384462576049e-06, "loss": 0.9433347, "num_input_tokens_seen": 42502720, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.140625, "step": 1962, "time_per_iteration": 2.470609664916992 }, { "auxiliary_loss_clip": 0.01174549, "auxiliary_loss_mlp": 0.01059466, "balance_loss_clip": 1.03806782, "balance_loss_mlp": 1.05502188, "epoch": 0.1180219449872238, "flos": 10635017892480.0, "grad_norm": 2.020089528902819, "language_loss": 0.87501907, "learning_rate": 3.919274966788707e-06, "loss": 0.89735925, "num_input_tokens_seen": 42519460, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.1953125, "step": 1963, "time_per_iteration": 2.4198830127716064 }, { "auxiliary_loss_clip": 0.01175718, "auxiliary_loss_mlp": 0.01046646, "balance_loss_clip": 1.02546263, "balance_loss_mlp": 1.05423999, "epoch": 0.11808206823989177, "flos": 20923532259840.0, "grad_norm": 1.8919166767137119, "language_loss": 0.83818501, "learning_rate": 3.919165398222265e-06, "loss": 0.86040866, "num_input_tokens_seen": 42539420, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.2109375, "step": 1964, "time_per_iteration": 2.501049280166626 }, { "auxiliary_loss_clip": 0.01177626, "auxiliary_loss_mlp": 0.01062404, "balance_loss_clip": 1.04083943, "balance_loss_mlp": 1.05909514, "epoch": 0.11814219149255975, "flos": 20777770869120.0, "grad_norm": 2.1126500390823324, "language_loss": 0.82891166, "learning_rate": 3.919055756880879e-06, "loss": 0.85131192, "num_input_tokens_seen": 42558225, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.1875, "step": 1965, "time_per_iteration": 2.463723659515381 }, { "auxiliary_loss_clip": 0.01175668, "auxiliary_loss_mlp": 0.0105018, "balance_loss_clip": 1.02818584, "balance_loss_mlp": 1.05605876, "epoch": 0.11820231474522772, "flos": 48759938542080.0, "grad_norm": 1.5442273781396556, "language_loss": 0.74486935, "learning_rate": 3.918946042768707e-06, "loss": 0.76712775, "num_input_tokens_seen": 42580790, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.1953125, "step": 1966, "time_per_iteration": 2.716155767440796 }, { "auxiliary_loss_clip": 0.01187056, "auxiliary_loss_mlp": 0.01058039, "balance_loss_clip": 1.03546095, "balance_loss_mlp": 1.06312656, "epoch": 0.11826243799789568, "flos": 16690598012160.0, "grad_norm": 3.0482527290750974, "language_loss": 0.7239809, "learning_rate": 3.918836255889908e-06, "loss": 0.74643183, "num_input_tokens_seen": 42597355, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.234375, "step": 1967, "time_per_iteration": 2.4529244899749756 }, { "auxiliary_loss_clip": 0.01174239, "auxiliary_loss_mlp": 0.01052218, "balance_loss_clip": 1.03021193, "balance_loss_mlp": 1.05464077, "epoch": 0.11832256125056366, "flos": 16909868586240.0, "grad_norm": 2.1546471363152215, "language_loss": 0.88848341, "learning_rate": 3.9187263962486456e-06, "loss": 0.910748, "num_input_tokens_seen": 42616060, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.1953125, "step": 1968, "time_per_iteration": 2.44956111907959 }, { "auxiliary_loss_clip": 0.01176278, "auxiliary_loss_mlp": 0.01052148, "balance_loss_clip": 1.03047538, "balance_loss_mlp": 1.05750084, "epoch": 0.11838268450323162, "flos": 22820405587200.0, "grad_norm": 1.8544420252728655, "language_loss": 0.67076278, "learning_rate": 3.918616463849087e-06, "loss": 0.69304705, "num_input_tokens_seen": 42636285, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.1875, "step": 1969, "time_per_iteration": 2.483344554901123 }, { "auxiliary_loss_clip": 0.01177993, "auxiliary_loss_mlp": 0.01061504, "balance_loss_clip": 1.03856778, "balance_loss_mlp": 1.06026638, "epoch": 0.11844280775589959, "flos": 33545844990720.0, "grad_norm": 1.9593927072627328, "language_loss": 0.80789745, "learning_rate": 3.918506458695399e-06, "loss": 0.83029234, "num_input_tokens_seen": 42658320, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 1.171875, "step": 1970, "time_per_iteration": 2.596120834350586 }, { "auxiliary_loss_clip": 0.01070799, "auxiliary_loss_mlp": 0.01010749, "balance_loss_clip": 1.00801909, "balance_loss_mlp": 1.03084707, "epoch": 0.11850293100856757, "flos": 66350998604160.0, "grad_norm": 0.799249768951571, "language_loss": 0.66114569, "learning_rate": 3.918396380791754e-06, "loss": 0.68196118, "num_input_tokens_seen": 42721500, "router_z_loss_clip": 0.02734375, "router_z_loss_mlp": 0.3984375, "step": 1971, "time_per_iteration": 4.6212921142578125 }, { "auxiliary_loss_clip": 0.01178, "auxiliary_loss_mlp": 0.01052094, "balance_loss_clip": 1.03070831, "balance_loss_mlp": 1.05616343, "epoch": 0.11856305426123553, "flos": 24681045070080.0, "grad_norm": 2.3154450503739934, "language_loss": 0.79405284, "learning_rate": 3.918286230142327e-06, "loss": 0.8163538, "num_input_tokens_seen": 42739825, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.21875, "step": 1972, "time_per_iteration": 2.4989418983459473 }, { "auxiliary_loss_clip": 0.01175834, "auxiliary_loss_mlp": 0.01048436, "balance_loss_clip": 1.02700269, "balance_loss_mlp": 1.05776024, "epoch": 0.1186231775139035, "flos": 24280102483200.0, "grad_norm": 2.2176163908964455, "language_loss": 0.72339559, "learning_rate": 3.918176006751292e-06, "loss": 0.74563825, "num_input_tokens_seen": 42758695, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.1796875, "step": 1973, "time_per_iteration": 2.508514165878296 }, { "auxiliary_loss_clip": 0.01172666, "auxiliary_loss_mlp": 0.01044552, "balance_loss_clip": 1.02316618, "balance_loss_mlp": 1.05591547, "epoch": 0.11868330076657148, "flos": 21757413473280.0, "grad_norm": 1.644968913400844, "language_loss": 0.72138309, "learning_rate": 3.918065710622832e-06, "loss": 0.74355531, "num_input_tokens_seen": 42778510, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.1640625, "step": 1974, "time_per_iteration": 3.9454314708709717 }, { "auxiliary_loss_clip": 0.0117424, "auxiliary_loss_mlp": 0.01046586, "balance_loss_clip": 1.02589095, "balance_loss_mlp": 1.05603635, "epoch": 0.11874342401923944, "flos": 17193274894080.0, "grad_norm": 2.686727616503895, "language_loss": 0.77844715, "learning_rate": 3.917955341761128e-06, "loss": 0.80065536, "num_input_tokens_seen": 42793995, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1875, "step": 1975, "time_per_iteration": 2.450004816055298 }, { "auxiliary_loss_clip": 0.0117524, "auxiliary_loss_mlp": 0.01049834, "balance_loss_clip": 1.0288533, "balance_loss_mlp": 1.05890536, "epoch": 0.11880354727190741, "flos": 15229572312960.0, "grad_norm": 2.1125060815014636, "language_loss": 0.75143981, "learning_rate": 3.917844900170364e-06, "loss": 0.77369052, "num_input_tokens_seen": 42809000, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.1640625, "step": 1976, "time_per_iteration": 2.454317569732666 }, { "auxiliary_loss_clip": 0.01171134, "auxiliary_loss_mlp": 0.01053118, "balance_loss_clip": 1.03247046, "balance_loss_mlp": 1.05480051, "epoch": 0.11886367052457537, "flos": 27309706179840.0, "grad_norm": 1.860224955818291, "language_loss": 0.75224864, "learning_rate": 3.91773438585473e-06, "loss": 0.77449113, "num_input_tokens_seen": 42831585, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1640625, "step": 1977, "time_per_iteration": 2.529923677444458 }, { "auxiliary_loss_clip": 0.0117475, "auxiliary_loss_mlp": 0.01051815, "balance_loss_clip": 1.03133488, "balance_loss_mlp": 1.05581617, "epoch": 0.11892379377724335, "flos": 21798280172160.0, "grad_norm": 2.416595033273154, "language_loss": 0.73665822, "learning_rate": 3.9176237988184165e-06, "loss": 0.75892383, "num_input_tokens_seen": 42848420, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1875, "step": 1978, "time_per_iteration": 2.4526443481445312 }, { "auxiliary_loss_clip": 0.01172781, "auxiliary_loss_mlp": 0.01052145, "balance_loss_clip": 1.03174853, "balance_loss_mlp": 1.0565691, "epoch": 0.11898391702991132, "flos": 13991013498240.0, "grad_norm": 1.7438622946732274, "language_loss": 0.73375058, "learning_rate": 3.917513139065616e-06, "loss": 0.75599986, "num_input_tokens_seen": 42866645, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.1640625, "step": 1979, "time_per_iteration": 2.4611587524414062 }, { "auxiliary_loss_clip": 0.01171862, "auxiliary_loss_mlp": 0.01052846, "balance_loss_clip": 1.03267574, "balance_loss_mlp": 1.05398405, "epoch": 0.11904404028257928, "flos": 32234567091840.0, "grad_norm": 1.8623932356770363, "language_loss": 0.98748791, "learning_rate": 3.917402406600525e-06, "loss": 1.00973499, "num_input_tokens_seen": 42888515, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1796875, "step": 1980, "time_per_iteration": 2.554178476333618 }, { "auxiliary_loss_clip": 0.01176548, "auxiliary_loss_mlp": 0.01050344, "balance_loss_clip": 1.02844548, "balance_loss_mlp": 1.05575335, "epoch": 0.11910416353524726, "flos": 23586272398080.0, "grad_norm": 3.156952789610185, "language_loss": 0.86020124, "learning_rate": 3.917291601427342e-06, "loss": 0.88247019, "num_input_tokens_seen": 42909035, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.203125, "step": 1981, "time_per_iteration": 2.5043256282806396 }, { "auxiliary_loss_clip": 0.01175197, "auxiliary_loss_mlp": 0.01062806, "balance_loss_clip": 1.04088366, "balance_loss_mlp": 1.05621874, "epoch": 0.11916428678791523, "flos": 25333038789120.0, "grad_norm": 1.9645982289559956, "language_loss": 0.84647441, "learning_rate": 3.91718072355027e-06, "loss": 0.86885446, "num_input_tokens_seen": 42927555, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.1875, "step": 1982, "time_per_iteration": 2.494204521179199 }, { "auxiliary_loss_clip": 0.01168006, "auxiliary_loss_mlp": 0.01049706, "balance_loss_clip": 1.0296309, "balance_loss_mlp": 1.05204093, "epoch": 0.11922441004058319, "flos": 19788431592960.0, "grad_norm": 1.9953347276697602, "language_loss": 0.85141611, "learning_rate": 3.917069772973513e-06, "loss": 0.87359321, "num_input_tokens_seen": 42945300, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.1640625, "step": 1983, "time_per_iteration": 2.456629514694214 }, { "auxiliary_loss_clip": 0.01172735, "auxiliary_loss_mlp": 0.01052825, "balance_loss_clip": 1.03177273, "balance_loss_mlp": 1.05392146, "epoch": 0.11928453329325117, "flos": 21536347219200.0, "grad_norm": 2.723291044980602, "language_loss": 0.76899147, "learning_rate": 3.916958749701277e-06, "loss": 0.79124701, "num_input_tokens_seen": 42961295, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1875, "step": 1984, "time_per_iteration": 2.42680025100708 }, { "auxiliary_loss_clip": 0.01168313, "auxiliary_loss_mlp": 0.0105326, "balance_loss_clip": 1.03338742, "balance_loss_mlp": 1.05189455, "epoch": 0.11934465654591914, "flos": 20815010294400.0, "grad_norm": 3.9139303253706865, "language_loss": 0.83442348, "learning_rate": 3.9168476537377745e-06, "loss": 0.85663921, "num_input_tokens_seen": 42980330, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.1640625, "step": 1985, "time_per_iteration": 2.464040994644165 }, { "auxiliary_loss_clip": 0.01162606, "auxiliary_loss_mlp": 0.01052464, "balance_loss_clip": 1.03229403, "balance_loss_mlp": 1.04936671, "epoch": 0.1194047797985871, "flos": 19060486565760.0, "grad_norm": 1.955311688039543, "language_loss": 0.74187768, "learning_rate": 3.916736485087216e-06, "loss": 0.76402843, "num_input_tokens_seen": 42996125, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1328125, "step": 1986, "time_per_iteration": 2.425355911254883 }, { "auxiliary_loss_clip": 0.01172694, "auxiliary_loss_mlp": 0.0105426, "balance_loss_clip": 1.03470933, "balance_loss_mlp": 1.05592072, "epoch": 0.11946490305125507, "flos": 27190805184000.0, "grad_norm": 1.9746016687935366, "language_loss": 0.72539902, "learning_rate": 3.916625243753819e-06, "loss": 0.74766856, "num_input_tokens_seen": 43014180, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.171875, "step": 1987, "time_per_iteration": 2.509443521499634 }, { "auxiliary_loss_clip": 0.01170158, "auxiliary_loss_mlp": 0.01051214, "balance_loss_clip": 1.0301137, "balance_loss_mlp": 1.05369639, "epoch": 0.11952502630392305, "flos": 21140791672320.0, "grad_norm": 3.195362650855132, "language_loss": 0.72169971, "learning_rate": 3.916513929741799e-06, "loss": 0.74391347, "num_input_tokens_seen": 43032120, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1640625, "step": 1988, "time_per_iteration": 2.4594216346740723 }, { "auxiliary_loss_clip": 0.01167076, "auxiliary_loss_mlp": 0.01057641, "balance_loss_clip": 1.03571868, "balance_loss_mlp": 1.05177951, "epoch": 0.11958514955659101, "flos": 22124241118080.0, "grad_norm": 1.779658162229123, "language_loss": 0.81271744, "learning_rate": 3.91640254305538e-06, "loss": 0.83496457, "num_input_tokens_seen": 43052215, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.15625, "step": 1989, "time_per_iteration": 2.508931875228882 }, { "auxiliary_loss_clip": 0.01171429, "auxiliary_loss_mlp": 0.01051163, "balance_loss_clip": 1.03087413, "balance_loss_mlp": 1.05411959, "epoch": 0.11964527280925898, "flos": 17421452040960.0, "grad_norm": 2.47373374214606, "language_loss": 0.7599442, "learning_rate": 3.916291083698784e-06, "loss": 0.78217006, "num_input_tokens_seen": 43069720, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.171875, "step": 1990, "time_per_iteration": 2.4576733112335205 }, { "auxiliary_loss_clip": 0.01071834, "auxiliary_loss_mlp": 0.01006712, "balance_loss_clip": 1.00398207, "balance_loss_mlp": 1.03127611, "epoch": 0.11970539606192696, "flos": 70679741402880.0, "grad_norm": 0.8561432023137796, "language_loss": 0.55233115, "learning_rate": 3.916179551676238e-06, "loss": 0.5731166, "num_input_tokens_seen": 43123130, "router_z_loss_clip": 0.02734375, "router_z_loss_mlp": 0.40625, "step": 1991, "time_per_iteration": 3.0988705158233643 }, { "auxiliary_loss_clip": 0.01166624, "auxiliary_loss_mlp": 0.01052502, "balance_loss_clip": 1.033023, "balance_loss_mlp": 1.0535078, "epoch": 0.11976551931459492, "flos": 21215019127680.0, "grad_norm": 2.744351146301977, "language_loss": 0.78389299, "learning_rate": 3.916067946991971e-06, "loss": 0.80608428, "num_input_tokens_seen": 43140015, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.1328125, "step": 1992, "time_per_iteration": 2.467116594314575 }, { "auxiliary_loss_clip": 0.01170718, "auxiliary_loss_mlp": 0.01048363, "balance_loss_clip": 1.02748919, "balance_loss_mlp": 1.05272377, "epoch": 0.11982564256726289, "flos": 25989306226560.0, "grad_norm": 2.2207228667801555, "language_loss": 0.79159641, "learning_rate": 3.915956269650216e-06, "loss": 0.81378722, "num_input_tokens_seen": 43160105, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1796875, "step": 1993, "time_per_iteration": 2.521677255630493 }, { "auxiliary_loss_clip": 0.0116549, "auxiliary_loss_mlp": 0.01053436, "balance_loss_clip": 1.03377819, "balance_loss_mlp": 1.05099714, "epoch": 0.11988576581993086, "flos": 21650866755840.0, "grad_norm": 1.8400741944044707, "language_loss": 0.82493854, "learning_rate": 3.915844519655208e-06, "loss": 0.8471278, "num_input_tokens_seen": 43179835, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.140625, "step": 1994, "time_per_iteration": 2.470430850982666 }, { "auxiliary_loss_clip": 0.0116907, "auxiliary_loss_mlp": 0.01055172, "balance_loss_clip": 1.03607523, "balance_loss_mlp": 1.05550933, "epoch": 0.11994588907259883, "flos": 17857407409920.0, "grad_norm": 2.3766347839890507, "language_loss": 0.88201928, "learning_rate": 3.915732697011183e-06, "loss": 0.90426171, "num_input_tokens_seen": 43197210, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.1328125, "step": 1995, "time_per_iteration": 2.5137641429901123 }, { "auxiliary_loss_clip": 0.01170345, "auxiliary_loss_mlp": 0.0105176, "balance_loss_clip": 1.03145885, "balance_loss_mlp": 1.05507123, "epoch": 0.1200060123252668, "flos": 24462744163200.0, "grad_norm": 9.960602080127076, "language_loss": 0.74196255, "learning_rate": 3.9156208017223825e-06, "loss": 0.76418364, "num_input_tokens_seen": 43215050, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.15625, "step": 1996, "time_per_iteration": 2.4940314292907715 }, { "auxiliary_loss_clip": 0.01168399, "auxiliary_loss_mlp": 0.01046875, "balance_loss_clip": 1.02597785, "balance_loss_mlp": 1.05328417, "epoch": 0.12006613557793476, "flos": 18732191235840.0, "grad_norm": 2.6370792382777495, "language_loss": 0.87771404, "learning_rate": 3.915508833793048e-06, "loss": 0.89986682, "num_input_tokens_seen": 43233900, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.15625, "step": 1997, "time_per_iteration": 2.4627881050109863 }, { "auxiliary_loss_clip": 0.01163899, "auxiliary_loss_mlp": 0.01063421, "balance_loss_clip": 1.04261947, "balance_loss_mlp": 1.05074537, "epoch": 0.12012625883060274, "flos": 22267739952000.0, "grad_norm": 1.8812832679538465, "language_loss": 0.78873384, "learning_rate": 3.915396793227428e-06, "loss": 0.81100708, "num_input_tokens_seen": 43252105, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.1328125, "step": 1998, "time_per_iteration": 2.4667210578918457 }, { "auxiliary_loss_clip": 0.01167325, "auxiliary_loss_mlp": 0.01051057, "balance_loss_clip": 1.03033853, "balance_loss_mlp": 1.05385804, "epoch": 0.1201863820832707, "flos": 21758885930880.0, "grad_norm": 1.715189202953438, "language_loss": 0.73316497, "learning_rate": 3.915284680029769e-06, "loss": 0.75534874, "num_input_tokens_seen": 43270315, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1328125, "step": 1999, "time_per_iteration": 2.4915432929992676 }, { "auxiliary_loss_clip": 0.01168293, "auxiliary_loss_mlp": 0.01062271, "balance_loss_clip": 1.04243481, "balance_loss_mlp": 1.05321693, "epoch": 0.12024650533593867, "flos": 21907987286400.0, "grad_norm": 2.7184904375653915, "language_loss": 0.75041145, "learning_rate": 3.915172494204323e-06, "loss": 0.77271712, "num_input_tokens_seen": 43289935, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.15625, "step": 2000, "time_per_iteration": 2.486593008041382 }, { "auxiliary_loss_clip": 0.01167918, "auxiliary_loss_mlp": 0.01049743, "balance_loss_clip": 1.02950096, "balance_loss_mlp": 1.05252957, "epoch": 0.12030662858860665, "flos": 21689219502720.0, "grad_norm": 2.073764908977131, "language_loss": 0.84949458, "learning_rate": 3.915060235755344e-06, "loss": 0.8716712, "num_input_tokens_seen": 43309325, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.15625, "step": 2001, "time_per_iteration": 2.4790232181549072 }, { "auxiliary_loss_clip": 0.0117094, "auxiliary_loss_mlp": 0.01050441, "balance_loss_clip": 1.03141499, "balance_loss_mlp": 1.05543542, "epoch": 0.12036675184127461, "flos": 12933228856320.0, "grad_norm": 2.116917976197669, "language_loss": 0.74110162, "learning_rate": 3.91494790468709e-06, "loss": 0.76331538, "num_input_tokens_seen": 43327010, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.15625, "step": 2002, "time_per_iteration": 2.441516637802124 }, { "auxiliary_loss_clip": 0.01173122, "auxiliary_loss_mlp": 0.01046155, "balance_loss_clip": 1.02506649, "balance_loss_mlp": 1.05446875, "epoch": 0.12042687509394258, "flos": 20851028657280.0, "grad_norm": 1.9891784813821471, "language_loss": 0.77644414, "learning_rate": 3.9148355010038185e-06, "loss": 0.79863691, "num_input_tokens_seen": 43345650, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1875, "step": 2003, "time_per_iteration": 2.4648044109344482 }, { "auxiliary_loss_clip": 0.01163351, "auxiliary_loss_mlp": 0.01048527, "balance_loss_clip": 1.02737975, "balance_loss_mlp": 1.05056405, "epoch": 0.12048699834661056, "flos": 23878513451520.0, "grad_norm": 1.6567235451337226, "language_loss": 0.72023511, "learning_rate": 3.914723024709793e-06, "loss": 0.74235392, "num_input_tokens_seen": 43365555, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.125, "step": 2004, "time_per_iteration": 2.490173101425171 }, { "auxiliary_loss_clip": 0.01173199, "auxiliary_loss_mlp": 0.01055076, "balance_loss_clip": 1.03336811, "balance_loss_mlp": 1.05491257, "epoch": 0.12054712159927852, "flos": 19756363726080.0, "grad_norm": 1.6772624260305202, "language_loss": 0.78381413, "learning_rate": 3.914610475809279e-06, "loss": 0.80609691, "num_input_tokens_seen": 43384990, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.1875, "step": 2005, "time_per_iteration": 2.4945597648620605 }, { "auxiliary_loss_clip": 0.0106671, "auxiliary_loss_mlp": 0.01025924, "balance_loss_clip": 1.02352798, "balance_loss_mlp": 1.02748096, "epoch": 0.12060724485194649, "flos": 51672763123200.0, "grad_norm": 0.938393941205858, "language_loss": 0.58096564, "learning_rate": 3.914497854306543e-06, "loss": 0.60189188, "num_input_tokens_seen": 43436335, "router_z_loss_clip": 0.02392578, "router_z_loss_mlp": 0.39257812, "step": 2006, "time_per_iteration": 2.8803038597106934 }, { "auxiliary_loss_clip": 0.01164985, "auxiliary_loss_mlp": 0.01047471, "balance_loss_clip": 1.02772963, "balance_loss_mlp": 1.05296659, "epoch": 0.12066736810461445, "flos": 18990425088000.0, "grad_norm": 1.7278845014250734, "language_loss": 0.76660955, "learning_rate": 3.9143851602058575e-06, "loss": 0.78873414, "num_input_tokens_seen": 43456495, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.1171875, "step": 2007, "time_per_iteration": 2.478444814682007 }, { "auxiliary_loss_clip": 0.01172026, "auxiliary_loss_mlp": 0.01058787, "balance_loss_clip": 1.03754377, "balance_loss_mlp": 1.05454957, "epoch": 0.12072749135728243, "flos": 16471973882880.0, "grad_norm": 2.830649998148492, "language_loss": 0.82805192, "learning_rate": 3.914272393511494e-06, "loss": 0.8503601, "num_input_tokens_seen": 43473085, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.171875, "step": 2008, "time_per_iteration": 2.4399096965789795 }, { "auxiliary_loss_clip": 0.01167703, "auxiliary_loss_mlp": 0.01047742, "balance_loss_clip": 1.02673697, "balance_loss_mlp": 1.05211306, "epoch": 0.1207876146099504, "flos": 18077108947200.0, "grad_norm": 4.270038513133501, "language_loss": 0.8427704, "learning_rate": 3.91415955422773e-06, "loss": 0.86492479, "num_input_tokens_seen": 43491135, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.15625, "step": 2009, "time_per_iteration": 2.4605798721313477 }, { "auxiliary_loss_clip": 0.01171466, "auxiliary_loss_mlp": 0.01053044, "balance_loss_clip": 1.03089452, "balance_loss_mlp": 1.05650532, "epoch": 0.12084773786261836, "flos": 21871573873920.0, "grad_norm": 1.695866278733535, "language_loss": 0.83954525, "learning_rate": 3.914046642358844e-06, "loss": 0.86179036, "num_input_tokens_seen": 43510440, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.1484375, "step": 2010, "time_per_iteration": 2.4567999839782715 }, { "auxiliary_loss_clip": 0.01173335, "auxiliary_loss_mlp": 0.01054571, "balance_loss_clip": 1.03348327, "balance_loss_mlp": 1.05688059, "epoch": 0.12090786111528634, "flos": 18333044328960.0, "grad_norm": 1.7112399188910359, "language_loss": 0.83897859, "learning_rate": 3.9139336579091174e-06, "loss": 0.86125767, "num_input_tokens_seen": 43530145, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1640625, "step": 2011, "time_per_iteration": 2.4953083992004395 }, { "auxiliary_loss_clip": 0.01173521, "auxiliary_loss_mlp": 0.0105501, "balance_loss_clip": 1.03411245, "balance_loss_mlp": 1.05628037, "epoch": 0.1209679843679543, "flos": 21105850717440.0, "grad_norm": 1.9048213232442628, "language_loss": 0.96216506, "learning_rate": 3.913820600882834e-06, "loss": 0.98445034, "num_input_tokens_seen": 43549315, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.171875, "step": 2012, "time_per_iteration": 2.5009703636169434 }, { "auxiliary_loss_clip": 0.01167191, "auxiliary_loss_mlp": 0.0105182, "balance_loss_clip": 1.03031421, "balance_loss_mlp": 1.05509114, "epoch": 0.12102810762062227, "flos": 29241053585280.0, "grad_norm": 2.74692739221979, "language_loss": 0.80757421, "learning_rate": 3.913707471284283e-06, "loss": 0.82976431, "num_input_tokens_seen": 43569240, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.125, "step": 2013, "time_per_iteration": 4.061115264892578 }, { "auxiliary_loss_clip": 0.01175569, "auxiliary_loss_mlp": 0.01047891, "balance_loss_clip": 1.02544379, "balance_loss_mlp": 1.0570116, "epoch": 0.12108823087329025, "flos": 17930701111680.0, "grad_norm": 2.9560709046065754, "language_loss": 0.76892292, "learning_rate": 3.9135942691177515e-06, "loss": 0.79115748, "num_input_tokens_seen": 43587710, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.1875, "step": 2014, "time_per_iteration": 2.4944891929626465 }, { "auxiliary_loss_clip": 0.01169759, "auxiliary_loss_mlp": 0.01047379, "balance_loss_clip": 1.02577817, "balance_loss_mlp": 1.05537701, "epoch": 0.12114835412595822, "flos": 22091850028800.0, "grad_norm": 1.9362673784246278, "language_loss": 0.86948609, "learning_rate": 3.913480994387535e-06, "loss": 0.89165753, "num_input_tokens_seen": 43606000, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.140625, "step": 2015, "time_per_iteration": 3.9243969917297363 }, { "auxiliary_loss_clip": 0.01163644, "auxiliary_loss_mlp": 0.01054143, "balance_loss_clip": 1.03305531, "balance_loss_mlp": 1.05098915, "epoch": 0.12120847737862618, "flos": 20412343854720.0, "grad_norm": 1.8892629014631455, "language_loss": 0.69321454, "learning_rate": 3.913367647097926e-06, "loss": 0.71539235, "num_input_tokens_seen": 43624815, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.125, "step": 2016, "time_per_iteration": 3.979656934738159 }, { "auxiliary_loss_clip": 0.01170232, "auxiliary_loss_mlp": 0.01051276, "balance_loss_clip": 1.0282445, "balance_loss_mlp": 1.0550406, "epoch": 0.12126860063129415, "flos": 22309037614080.0, "grad_norm": 6.5755976978920305, "language_loss": 0.80409777, "learning_rate": 3.913254227253225e-06, "loss": 0.8263129, "num_input_tokens_seen": 43643960, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.1484375, "step": 2017, "time_per_iteration": 2.4682538509368896 }, { "auxiliary_loss_clip": 0.01169845, "auxiliary_loss_mlp": 0.010509, "balance_loss_clip": 1.029037, "balance_loss_mlp": 1.0549736, "epoch": 0.12132872388396213, "flos": 13699275235200.0, "grad_norm": 2.54869224273993, "language_loss": 0.68509007, "learning_rate": 3.913140734857731e-06, "loss": 0.70729756, "num_input_tokens_seen": 43662650, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.1484375, "step": 2018, "time_per_iteration": 2.4219512939453125 }, { "auxiliary_loss_clip": 0.01171259, "auxiliary_loss_mlp": 0.01048641, "balance_loss_clip": 1.02850676, "balance_loss_mlp": 1.0572021, "epoch": 0.12138884713663009, "flos": 26466954307200.0, "grad_norm": 1.590877121660859, "language_loss": 0.72323197, "learning_rate": 3.91302716991575e-06, "loss": 0.74543101, "num_input_tokens_seen": 43684205, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.140625, "step": 2019, "time_per_iteration": 2.516350269317627 }, { "auxiliary_loss_clip": 0.0116519, "auxiliary_loss_mlp": 0.01055048, "balance_loss_clip": 1.03379321, "balance_loss_mlp": 1.0500071, "epoch": 0.12144897038929806, "flos": 26141603892480.0, "grad_norm": 2.1459109388814417, "language_loss": 0.92238015, "learning_rate": 3.912913532431586e-06, "loss": 0.94458246, "num_input_tokens_seen": 43706320, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.15625, "step": 2020, "time_per_iteration": 2.491710662841797 }, { "auxiliary_loss_clip": 0.01170379, "auxiliary_loss_mlp": 0.01051889, "balance_loss_clip": 1.03118253, "balance_loss_mlp": 1.05549407, "epoch": 0.12150909364196603, "flos": 24717530309760.0, "grad_norm": 2.029995580943345, "language_loss": 0.77570587, "learning_rate": 3.912799822409549e-06, "loss": 0.79792857, "num_input_tokens_seen": 43724805, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1484375, "step": 2021, "time_per_iteration": 2.5284409523010254 }, { "auxiliary_loss_clip": 0.01167407, "auxiliary_loss_mlp": 0.01048809, "balance_loss_clip": 1.02758956, "balance_loss_mlp": 1.0536046, "epoch": 0.121569216894634, "flos": 25186990089600.0, "grad_norm": 1.8909418789432664, "language_loss": 0.80471838, "learning_rate": 3.912686039853952e-06, "loss": 0.82688051, "num_input_tokens_seen": 43742320, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.140625, "step": 2022, "time_per_iteration": 2.488147497177124 }, { "auxiliary_loss_clip": 0.01172254, "auxiliary_loss_mlp": 0.01054924, "balance_loss_clip": 1.03359699, "balance_loss_mlp": 1.05734062, "epoch": 0.12162934014730196, "flos": 13444094039040.0, "grad_norm": 1.786687177856955, "language_loss": 0.85293484, "learning_rate": 3.912572184769108e-06, "loss": 0.87520665, "num_input_tokens_seen": 43760665, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.1484375, "step": 2023, "time_per_iteration": 2.4577839374542236 }, { "auxiliary_loss_clip": 0.01169224, "auxiliary_loss_mlp": 0.01053477, "balance_loss_clip": 1.03191185, "balance_loss_mlp": 1.05422378, "epoch": 0.12168946339996994, "flos": 16946138344320.0, "grad_norm": 2.202016503146866, "language_loss": 0.85336626, "learning_rate": 3.912458257159335e-06, "loss": 0.87559325, "num_input_tokens_seen": 43779020, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.1484375, "step": 2024, "time_per_iteration": 2.42579984664917 }, { "auxiliary_loss_clip": 0.01165844, "auxiliary_loss_mlp": 0.01053286, "balance_loss_clip": 1.03242445, "balance_loss_mlp": 1.05011463, "epoch": 0.12174958665263791, "flos": 29821585196160.0, "grad_norm": 1.947294545421231, "language_loss": 0.71963906, "learning_rate": 3.912344257028954e-06, "loss": 0.74183035, "num_input_tokens_seen": 43798850, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.15625, "step": 2025, "time_per_iteration": 2.5192291736602783 }, { "auxiliary_loss_clip": 0.01168866, "auxiliary_loss_mlp": 0.01048609, "balance_loss_clip": 1.02822399, "balance_loss_mlp": 1.05364835, "epoch": 0.12180970990530587, "flos": 24641902224000.0, "grad_norm": 1.828828060428882, "language_loss": 0.75947094, "learning_rate": 3.912230184382286e-06, "loss": 0.78164577, "num_input_tokens_seen": 43820130, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.15625, "step": 2026, "time_per_iteration": 2.5342493057250977 }, { "auxiliary_loss_clip": 0.01167551, "auxiliary_loss_mlp": 0.01045309, "balance_loss_clip": 1.02472186, "balance_loss_mlp": 1.05248737, "epoch": 0.12186983315797385, "flos": 20521691832960.0, "grad_norm": 2.0544302657462055, "language_loss": 0.88641894, "learning_rate": 3.912116039223659e-06, "loss": 0.90854752, "num_input_tokens_seen": 43838485, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.1484375, "step": 2027, "time_per_iteration": 2.4770708084106445 }, { "auxiliary_loss_clip": 0.01164662, "auxiliary_loss_mlp": 0.01051236, "balance_loss_clip": 1.03256822, "balance_loss_mlp": 1.05237746, "epoch": 0.12192995641064182, "flos": 27818344719360.0, "grad_norm": 1.5825506035961667, "language_loss": 0.75593609, "learning_rate": 3.912001821557399e-06, "loss": 0.77809507, "num_input_tokens_seen": 43859080, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.125, "step": 2028, "time_per_iteration": 2.525470495223999 }, { "auxiliary_loss_clip": 0.01165659, "auxiliary_loss_mlp": 0.01053865, "balance_loss_clip": 1.03269291, "balance_loss_mlp": 1.05222392, "epoch": 0.12199007966330978, "flos": 22017119783040.0, "grad_norm": 2.699663020345963, "language_loss": 0.77109134, "learning_rate": 3.911887531387839e-06, "loss": 0.79328656, "num_input_tokens_seen": 43879030, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.1328125, "step": 2029, "time_per_iteration": 2.4904110431671143 }, { "auxiliary_loss_clip": 0.01164465, "auxiliary_loss_mlp": 0.0105166, "balance_loss_clip": 1.03132296, "balance_loss_mlp": 1.05110812, "epoch": 0.12205020291597775, "flos": 23295216493440.0, "grad_norm": 1.7244611521670246, "language_loss": 0.78882563, "learning_rate": 3.911773168719313e-06, "loss": 0.81098688, "num_input_tokens_seen": 43898505, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1328125, "step": 2030, "time_per_iteration": 2.480605363845825 }, { "auxiliary_loss_clip": 0.01163481, "auxiliary_loss_mlp": 0.01049902, "balance_loss_clip": 1.02826512, "balance_loss_mlp": 1.05128551, "epoch": 0.12211032616864573, "flos": 26031609469440.0, "grad_norm": 2.271947890219538, "language_loss": 0.74360412, "learning_rate": 3.911658733556155e-06, "loss": 0.76573789, "num_input_tokens_seen": 43917945, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.125, "step": 2031, "time_per_iteration": 2.4905221462249756 }, { "auxiliary_loss_clip": 0.01166147, "auxiliary_loss_mlp": 0.01045668, "balance_loss_clip": 1.02674913, "balance_loss_mlp": 1.05321789, "epoch": 0.12217044942131369, "flos": 20410943224320.0, "grad_norm": 1.8117846929217913, "language_loss": 0.7520355, "learning_rate": 3.911544225902707e-06, "loss": 0.77415365, "num_input_tokens_seen": 43937385, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.1328125, "step": 2032, "time_per_iteration": 2.446179151535034 }, { "auxiliary_loss_clip": 0.011576, "auxiliary_loss_mlp": 0.01046097, "balance_loss_clip": 1.02654707, "balance_loss_mlp": 1.04744172, "epoch": 0.12223057267398166, "flos": 22857142222080.0, "grad_norm": 1.6319170632141349, "language_loss": 0.89050436, "learning_rate": 3.911429645763311e-06, "loss": 0.91254133, "num_input_tokens_seen": 43958130, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.1015625, "step": 2033, "time_per_iteration": 2.517761468887329 }, { "auxiliary_loss_clip": 0.01169729, "auxiliary_loss_mlp": 0.01047119, "balance_loss_clip": 1.02705598, "balance_loss_mlp": 1.05521822, "epoch": 0.12229069592664964, "flos": 20047563285120.0, "grad_norm": 2.02266198224394, "language_loss": 0.65808588, "learning_rate": 3.911314993142311e-06, "loss": 0.6802544, "num_input_tokens_seen": 43976800, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.140625, "step": 2034, "time_per_iteration": 2.4449775218963623 }, { "auxiliary_loss_clip": 0.01166517, "auxiliary_loss_mlp": 0.01056244, "balance_loss_clip": 1.03469062, "balance_loss_mlp": 1.05334401, "epoch": 0.1223508191793176, "flos": 22274240313600.0, "grad_norm": 1.5856402518609187, "language_loss": 0.7646656, "learning_rate": 3.911200268044055e-06, "loss": 0.78689325, "num_input_tokens_seen": 43996620, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.1328125, "step": 2035, "time_per_iteration": 2.482131004333496 }, { "auxiliary_loss_clip": 0.01168969, "auxiliary_loss_mlp": 0.01053195, "balance_loss_clip": 1.0316062, "balance_loss_mlp": 1.0519489, "epoch": 0.12241094243198557, "flos": 21285978445440.0, "grad_norm": 1.8747695203132946, "language_loss": 0.71535337, "learning_rate": 3.911085470472892e-06, "loss": 0.73757505, "num_input_tokens_seen": 44016175, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.171875, "step": 2036, "time_per_iteration": 2.460891008377075 }, { "auxiliary_loss_clip": 0.01166002, "auxiliary_loss_mlp": 0.01051148, "balance_loss_clip": 1.02958274, "balance_loss_mlp": 1.05404794, "epoch": 0.12247106568465355, "flos": 17382381022080.0, "grad_norm": 1.768025581027361, "language_loss": 0.83109343, "learning_rate": 3.910970600433178e-06, "loss": 0.85326493, "num_input_tokens_seen": 44035060, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.1171875, "step": 2037, "time_per_iteration": 2.481370210647583 }, { "auxiliary_loss_clip": 0.01169817, "auxiliary_loss_mlp": 0.01062155, "balance_loss_clip": 1.03980374, "balance_loss_mlp": 1.05386364, "epoch": 0.12253118893732151, "flos": 27045438842880.0, "grad_norm": 2.9636356827991093, "language_loss": 0.79506648, "learning_rate": 3.910855657929267e-06, "loss": 0.81738615, "num_input_tokens_seen": 44053330, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.15625, "step": 2038, "time_per_iteration": 2.483211040496826 }, { "auxiliary_loss_clip": 0.01060711, "auxiliary_loss_mlp": 0.01017645, "balance_loss_clip": 1.01459277, "balance_loss_mlp": 1.02106571, "epoch": 0.12259131218998948, "flos": 53861518368000.0, "grad_norm": 0.827172972736347, "language_loss": 0.58678937, "learning_rate": 3.910740642965518e-06, "loss": 0.60757291, "num_input_tokens_seen": 44107575, "router_z_loss_clip": 0.03051758, "router_z_loss_mlp": 0.39648438, "step": 2039, "time_per_iteration": 2.9753637313842773 }, { "auxiliary_loss_clip": 0.01169272, "auxiliary_loss_mlp": 0.01049651, "balance_loss_clip": 1.02805018, "balance_loss_mlp": 1.05392206, "epoch": 0.12265143544265744, "flos": 17891917401600.0, "grad_norm": 1.9073849611777545, "language_loss": 0.80423653, "learning_rate": 3.910625555546292e-06, "loss": 0.82642579, "num_input_tokens_seen": 44126075, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.15625, "step": 2040, "time_per_iteration": 2.4283862113952637 }, { "auxiliary_loss_clip": 0.01162129, "auxiliary_loss_mlp": 0.01050651, "balance_loss_clip": 1.02993262, "balance_loss_mlp": 1.05098772, "epoch": 0.12271155869532542, "flos": 21799932197760.0, "grad_norm": 1.7850544747777917, "language_loss": 0.83379853, "learning_rate": 3.910510395675953e-06, "loss": 0.85592639, "num_input_tokens_seen": 44145605, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.109375, "step": 2041, "time_per_iteration": 2.465449810028076 }, { "auxiliary_loss_clip": 0.01168138, "auxiliary_loss_mlp": 0.01051205, "balance_loss_clip": 1.0299499, "balance_loss_mlp": 1.05264091, "epoch": 0.12277168194799339, "flos": 19828759587840.0, "grad_norm": 1.4459068551607006, "language_loss": 0.67070508, "learning_rate": 3.9103951633588694e-06, "loss": 0.69289851, "num_input_tokens_seen": 44164770, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.15625, "step": 2042, "time_per_iteration": 2.459245204925537 }, { "auxiliary_loss_clip": 0.0116194, "auxiliary_loss_mlp": 0.0104745, "balance_loss_clip": 1.02729225, "balance_loss_mlp": 1.05027676, "epoch": 0.12283180520066135, "flos": 23221024951680.0, "grad_norm": 1.6837911705652393, "language_loss": 0.81596434, "learning_rate": 3.910279858599409e-06, "loss": 0.83805823, "num_input_tokens_seen": 44184025, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.109375, "step": 2043, "time_per_iteration": 2.4665768146514893 }, { "auxiliary_loss_clip": 0.01162933, "auxiliary_loss_mlp": 0.0104828, "balance_loss_clip": 1.0275259, "balance_loss_mlp": 1.04944801, "epoch": 0.12289192845332933, "flos": 18588476920320.0, "grad_norm": 1.8328170310530416, "language_loss": 0.80604231, "learning_rate": 3.910164481401946e-06, "loss": 0.82815444, "num_input_tokens_seen": 44202950, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.140625, "step": 2044, "time_per_iteration": 2.427650213241577 }, { "auxiliary_loss_clip": 0.01161394, "auxiliary_loss_mlp": 0.01048469, "balance_loss_clip": 1.02791739, "balance_loss_mlp": 1.05188131, "epoch": 0.1229520517059973, "flos": 25769532862080.0, "grad_norm": 1.7570542411422225, "language_loss": 0.78160089, "learning_rate": 3.910049031770853e-06, "loss": 0.80369949, "num_input_tokens_seen": 44221115, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.09375, "step": 2045, "time_per_iteration": 2.4899742603302 }, { "auxiliary_loss_clip": 0.01167828, "auxiliary_loss_mlp": 0.01062388, "balance_loss_clip": 1.04067993, "balance_loss_mlp": 1.05199456, "epoch": 0.12301217495866526, "flos": 20887154760960.0, "grad_norm": 1.9580517897108165, "language_loss": 0.6750375, "learning_rate": 3.90993350971051e-06, "loss": 0.69733965, "num_input_tokens_seen": 44240575, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.15625, "step": 2046, "time_per_iteration": 2.4609358310699463 }, { "auxiliary_loss_clip": 0.01167586, "auxiliary_loss_mlp": 0.01050879, "balance_loss_clip": 1.03046978, "balance_loss_mlp": 1.0546838, "epoch": 0.12307229821133324, "flos": 22378811783040.0, "grad_norm": 2.0813214490011362, "language_loss": 0.72360253, "learning_rate": 3.909817915225297e-06, "loss": 0.74578714, "num_input_tokens_seen": 44257145, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.1328125, "step": 2047, "time_per_iteration": 2.4606947898864746 }, { "auxiliary_loss_clip": 0.01162146, "auxiliary_loss_mlp": 0.01063327, "balance_loss_clip": 1.04164243, "balance_loss_mlp": 1.05069983, "epoch": 0.1231324214640012, "flos": 23367396873600.0, "grad_norm": 1.8407108152584664, "language_loss": 0.76708984, "learning_rate": 3.909702248319597e-06, "loss": 0.78934455, "num_input_tokens_seen": 44278035, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.109375, "step": 2048, "time_per_iteration": 2.522446870803833 }, { "auxiliary_loss_clip": 0.01161412, "auxiliary_loss_mlp": 0.01050157, "balance_loss_clip": 1.03130996, "balance_loss_mlp": 1.05168343, "epoch": 0.12319254471666917, "flos": 23767154311680.0, "grad_norm": 1.887857733827233, "language_loss": 0.8496421, "learning_rate": 3.909586508997797e-06, "loss": 0.87175786, "num_input_tokens_seen": 44296980, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.09375, "step": 2049, "time_per_iteration": 2.5066885948181152 }, { "auxiliary_loss_clip": 0.01163765, "auxiliary_loss_mlp": 0.01052671, "balance_loss_clip": 1.03179765, "balance_loss_mlp": 1.05060148, "epoch": 0.12325266796933713, "flos": 23550146294400.0, "grad_norm": 1.8555507914210396, "language_loss": 0.7571367, "learning_rate": 3.909470697264285e-06, "loss": 0.77930105, "num_input_tokens_seen": 44318005, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1328125, "step": 2050, "time_per_iteration": 2.504054546356201 }, { "auxiliary_loss_clip": 0.0116533, "auxiliary_loss_mlp": 0.01062773, "balance_loss_clip": 1.04248357, "balance_loss_mlp": 1.05215621, "epoch": 0.12331279122200511, "flos": 24423996366720.0, "grad_norm": 2.021328786477123, "language_loss": 0.81018317, "learning_rate": 3.909354813123452e-06, "loss": 0.83246422, "num_input_tokens_seen": 44335260, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1328125, "step": 2051, "time_per_iteration": 2.5037903785705566 }, { "auxiliary_loss_clip": 0.0116497, "auxiliary_loss_mlp": 0.01050462, "balance_loss_clip": 1.03006554, "balance_loss_mlp": 1.05424535, "epoch": 0.12337291447467308, "flos": 25484294960640.0, "grad_norm": 1.6170112093465445, "language_loss": 0.80152988, "learning_rate": 3.909238856579693e-06, "loss": 0.82368422, "num_input_tokens_seen": 44355315, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.1015625, "step": 2052, "time_per_iteration": 2.507132053375244 }, { "auxiliary_loss_clip": 0.01166577, "auxiliary_loss_mlp": 0.01060752, "balance_loss_clip": 1.03969944, "balance_loss_mlp": 1.0524416, "epoch": 0.12343303772734104, "flos": 23550002640000.0, "grad_norm": 4.256342844685933, "language_loss": 0.73469269, "learning_rate": 3.909122827637406e-06, "loss": 0.75696588, "num_input_tokens_seen": 44373020, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.140625, "step": 2053, "time_per_iteration": 2.5012660026550293 }, { "auxiliary_loss_clip": 0.01163873, "auxiliary_loss_mlp": 0.01053901, "balance_loss_clip": 1.03311062, "balance_loss_mlp": 1.04858446, "epoch": 0.12349316098000902, "flos": 47557074867840.0, "grad_norm": 1.541128527697542, "language_loss": 0.74319327, "learning_rate": 3.909006726300991e-06, "loss": 0.76537102, "num_input_tokens_seen": 44397525, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.15625, "step": 2054, "time_per_iteration": 2.6859445571899414 }, { "auxiliary_loss_clip": 0.01161277, "auxiliary_loss_mlp": 0.01045837, "balance_loss_clip": 1.02714527, "balance_loss_mlp": 1.05066478, "epoch": 0.12355328423267699, "flos": 25045969294080.0, "grad_norm": 1.8173604796620648, "language_loss": 0.85100436, "learning_rate": 3.908890552574849e-06, "loss": 0.87307549, "num_input_tokens_seen": 44415890, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.109375, "step": 2055, "time_per_iteration": 4.009137153625488 }, { "auxiliary_loss_clip": 0.01163022, "auxiliary_loss_mlp": 0.01051663, "balance_loss_clip": 1.03286362, "balance_loss_mlp": 1.05155373, "epoch": 0.12361340748534495, "flos": 27709140395520.0, "grad_norm": 1.76101373379586, "language_loss": 0.77192223, "learning_rate": 3.908774306463384e-06, "loss": 0.79406905, "num_input_tokens_seen": 44436625, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.109375, "step": 2056, "time_per_iteration": 2.511791944503784 }, { "auxiliary_loss_clip": 0.01163094, "auxiliary_loss_mlp": 0.01055725, "balance_loss_clip": 1.03535235, "balance_loss_mlp": 1.05018139, "epoch": 0.12367353073801293, "flos": 26140598311680.0, "grad_norm": 2.048101642034755, "language_loss": 0.82644343, "learning_rate": 3.908657987971009e-06, "loss": 0.84863168, "num_input_tokens_seen": 44455265, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.125, "step": 2057, "time_per_iteration": 3.9503231048583984 }, { "auxiliary_loss_clip": 0.01167375, "auxiliary_loss_mlp": 0.01056373, "balance_loss_clip": 1.03452194, "balance_loss_mlp": 1.05211377, "epoch": 0.1237336539906809, "flos": 25156035544320.0, "grad_norm": 1.4818650883906308, "language_loss": 0.78042638, "learning_rate": 3.90854159710213e-06, "loss": 0.80266386, "num_input_tokens_seen": 44475815, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.15625, "step": 2058, "time_per_iteration": 2.5200328826904297 }, { "auxiliary_loss_clip": 0.01165644, "auxiliary_loss_mlp": 0.01051382, "balance_loss_clip": 1.02949524, "balance_loss_mlp": 1.05021667, "epoch": 0.12379377724334886, "flos": 15304589867520.0, "grad_norm": 2.052491173729549, "language_loss": 0.836043, "learning_rate": 3.9084251338611624e-06, "loss": 0.85821331, "num_input_tokens_seen": 44494045, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.15625, "step": 2059, "time_per_iteration": 2.4298763275146484 }, { "auxiliary_loss_clip": 0.01168115, "auxiliary_loss_mlp": 0.01062768, "balance_loss_clip": 1.04010677, "balance_loss_mlp": 1.05195546, "epoch": 0.12385390049601683, "flos": 21316717509120.0, "grad_norm": 2.182430656205129, "language_loss": 0.80845833, "learning_rate": 3.908308598252523e-06, "loss": 0.83076721, "num_input_tokens_seen": 44509120, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.1640625, "step": 2060, "time_per_iteration": 2.4451465606689453 }, { "auxiliary_loss_clip": 0.01166255, "auxiliary_loss_mlp": 0.01056127, "balance_loss_clip": 1.03413308, "balance_loss_mlp": 1.05173612, "epoch": 0.1239140237486848, "flos": 15116309752320.0, "grad_norm": 10.305495792745203, "language_loss": 0.8589325, "learning_rate": 3.9081919902806306e-06, "loss": 0.88115633, "num_input_tokens_seen": 44525780, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.140625, "step": 2061, "time_per_iteration": 2.412540912628174 }, { "auxiliary_loss_clip": 0.01161826, "auxiliary_loss_mlp": 0.01045465, "balance_loss_clip": 1.02596271, "balance_loss_mlp": 1.05179513, "epoch": 0.12397414700135277, "flos": 21976791788160.0, "grad_norm": 1.805169386107288, "language_loss": 0.85016245, "learning_rate": 3.908075309949906e-06, "loss": 0.87223542, "num_input_tokens_seen": 44543125, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.1015625, "step": 2062, "time_per_iteration": 2.493722438812256 }, { "auxiliary_loss_clip": 0.01166936, "auxiliary_loss_mlp": 0.01052783, "balance_loss_clip": 1.03180194, "balance_loss_mlp": 1.05517328, "epoch": 0.12403427025402074, "flos": 13400892956160.0, "grad_norm": 1.6623907549554904, "language_loss": 0.78660667, "learning_rate": 3.907958557264774e-06, "loss": 0.8088038, "num_input_tokens_seen": 44560275, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.1171875, "step": 2063, "time_per_iteration": 2.427780866622925 }, { "auxiliary_loss_clip": 0.01164552, "auxiliary_loss_mlp": 0.0105385, "balance_loss_clip": 1.03186727, "balance_loss_mlp": 1.05214655, "epoch": 0.12409439350668872, "flos": 15304374385920.0, "grad_norm": 2.02430857447198, "language_loss": 0.79364651, "learning_rate": 3.907841732229663e-06, "loss": 0.81583059, "num_input_tokens_seen": 44577640, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.125, "step": 2064, "time_per_iteration": 2.4380996227264404 }, { "auxiliary_loss_clip": 0.01162983, "auxiliary_loss_mlp": 0.0105059, "balance_loss_clip": 1.0296092, "balance_loss_mlp": 1.05126762, "epoch": 0.12415451675935668, "flos": 25009376313600.0, "grad_norm": 2.1146168228913313, "language_loss": 0.92449206, "learning_rate": 3.907724834849002e-06, "loss": 0.9466278, "num_input_tokens_seen": 44594860, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.1171875, "step": 2065, "time_per_iteration": 2.4923899173736572 }, { "auxiliary_loss_clip": 0.01164319, "auxiliary_loss_mlp": 0.01044775, "balance_loss_clip": 1.02383018, "balance_loss_mlp": 1.05053115, "epoch": 0.12421464001202465, "flos": 23659673840640.0, "grad_norm": 1.6637815913841336, "language_loss": 0.81002587, "learning_rate": 3.907607865127225e-06, "loss": 0.83211678, "num_input_tokens_seen": 44614780, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.1328125, "step": 2066, "time_per_iteration": 2.486889362335205 }, { "auxiliary_loss_clip": 0.01063334, "auxiliary_loss_mlp": 0.01021813, "balance_loss_clip": 1.01896417, "balance_loss_mlp": 1.02518725, "epoch": 0.12427476326469263, "flos": 65732904345600.0, "grad_norm": 0.86689454501266, "language_loss": 0.63277149, "learning_rate": 3.907490823068766e-06, "loss": 0.65362298, "num_input_tokens_seen": 44671240, "router_z_loss_clip": 0.02844238, "router_z_loss_mlp": 0.38085938, "step": 2067, "time_per_iteration": 3.075272560119629 }, { "auxiliary_loss_clip": 0.01165703, "auxiliary_loss_mlp": 0.0105288, "balance_loss_clip": 1.032305, "balance_loss_mlp": 1.05264044, "epoch": 0.12433488651736059, "flos": 24535427333760.0, "grad_norm": 3.1153735035281414, "language_loss": 0.93580145, "learning_rate": 3.907373708678063e-06, "loss": 0.95798731, "num_input_tokens_seen": 44691050, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.125, "step": 2068, "time_per_iteration": 2.5011818408966064 }, { "auxiliary_loss_clip": 0.01170638, "auxiliary_loss_mlp": 0.0105051, "balance_loss_clip": 1.03094792, "balance_loss_mlp": 1.05733252, "epoch": 0.12439500977002856, "flos": 21031659175680.0, "grad_norm": 2.473001321657378, "language_loss": 0.81080079, "learning_rate": 3.9072565219595596e-06, "loss": 0.83301228, "num_input_tokens_seen": 44709850, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.1328125, "step": 2069, "time_per_iteration": 2.4800326824188232 }, { "auxiliary_loss_clip": 0.01167582, "auxiliary_loss_mlp": 0.01057807, "balance_loss_clip": 1.03652811, "balance_loss_mlp": 1.05411911, "epoch": 0.12445513302269653, "flos": 26830621555200.0, "grad_norm": 1.5995473862902287, "language_loss": 0.77459729, "learning_rate": 3.907139262917696e-06, "loss": 0.79685116, "num_input_tokens_seen": 44731475, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.125, "step": 2070, "time_per_iteration": 2.491102933883667 }, { "auxiliary_loss_clip": 0.01169442, "auxiliary_loss_mlp": 0.01054292, "balance_loss_clip": 1.03329968, "balance_loss_mlp": 1.05616295, "epoch": 0.1245152562753645, "flos": 18368919037440.0, "grad_norm": 2.0815712782856903, "language_loss": 0.80991113, "learning_rate": 3.907021931556922e-06, "loss": 0.83214843, "num_input_tokens_seen": 44749685, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.1328125, "step": 2071, "time_per_iteration": 2.4618241786956787 }, { "auxiliary_loss_clip": 0.01163725, "auxiliary_loss_mlp": 0.01053606, "balance_loss_clip": 1.03181434, "balance_loss_mlp": 1.05394304, "epoch": 0.12457537952803246, "flos": 33107986200960.0, "grad_norm": 1.6686014723558609, "language_loss": 0.78013968, "learning_rate": 3.906904527881684e-06, "loss": 0.80231297, "num_input_tokens_seen": 44772165, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.09375, "step": 2072, "time_per_iteration": 2.5567128658294678 }, { "auxiliary_loss_clip": 0.01167379, "auxiliary_loss_mlp": 0.01049392, "balance_loss_clip": 1.02882886, "balance_loss_mlp": 1.05692744, "epoch": 0.12463550278070043, "flos": 22270217990400.0, "grad_norm": 2.0739068487284, "language_loss": 0.75035512, "learning_rate": 3.9067870518964355e-06, "loss": 0.77252281, "num_input_tokens_seen": 44790580, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.109375, "step": 2073, "time_per_iteration": 2.4881272315979004 }, { "auxiliary_loss_clip": 0.01162364, "auxiliary_loss_mlp": 0.01051221, "balance_loss_clip": 1.02962017, "balance_loss_mlp": 1.05111122, "epoch": 0.12469562603336841, "flos": 14679025580160.0, "grad_norm": 1.8753745690063404, "language_loss": 0.90433627, "learning_rate": 3.906669503605631e-06, "loss": 0.92647219, "num_input_tokens_seen": 44806730, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.109375, "step": 2074, "time_per_iteration": 2.4225096702575684 }, { "auxiliary_loss_clip": 0.0116732, "auxiliary_loss_mlp": 0.01059578, "balance_loss_clip": 1.036654, "balance_loss_mlp": 1.05240834, "epoch": 0.12475574928603637, "flos": 24644775312000.0, "grad_norm": 2.5043309596440717, "language_loss": 0.83880663, "learning_rate": 3.906551883013728e-06, "loss": 0.86107564, "num_input_tokens_seen": 44825550, "router_z_loss_clip": 0.22949219, "router_z_loss_mlp": 1.1484375, "step": 2075, "time_per_iteration": 2.491792917251587 }, { "auxiliary_loss_clip": 0.01164455, "auxiliary_loss_mlp": 0.01056792, "balance_loss_clip": 1.03445232, "balance_loss_mlp": 1.05233765, "epoch": 0.12481587253870434, "flos": 21762980081280.0, "grad_norm": 2.795234707928854, "language_loss": 0.73645532, "learning_rate": 3.9064341901251865e-06, "loss": 0.75866777, "num_input_tokens_seen": 44844155, "router_z_loss_clip": 0.22363281, "router_z_loss_mlp": 1.1171875, "step": 2076, "time_per_iteration": 2.4651570320129395 }, { "auxiliary_loss_clip": 0.01162099, "auxiliary_loss_mlp": 0.01043591, "balance_loss_clip": 1.02411222, "balance_loss_mlp": 1.05504322, "epoch": 0.12487599579137232, "flos": 21432529935360.0, "grad_norm": 3.02133892007418, "language_loss": 0.7563076, "learning_rate": 3.906316424944469e-06, "loss": 0.77836442, "num_input_tokens_seen": 44863780, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.078125, "step": 2077, "time_per_iteration": 2.4701435565948486 }, { "auxiliary_loss_clip": 0.01166316, "auxiliary_loss_mlp": 0.01054188, "balance_loss_clip": 1.03262269, "balance_loss_mlp": 1.05396533, "epoch": 0.12493611904404028, "flos": 16107624276480.0, "grad_norm": 2.215852788610626, "language_loss": 0.82852781, "learning_rate": 3.906198587476043e-06, "loss": 0.8507328, "num_input_tokens_seen": 44881480, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.125, "step": 2078, "time_per_iteration": 2.4635345935821533 }, { "auxiliary_loss_clip": 0.0116661, "auxiliary_loss_mlp": 0.01051708, "balance_loss_clip": 1.03025079, "balance_loss_mlp": 1.05487013, "epoch": 0.12499624229670825, "flos": 21580266574080.0, "grad_norm": 1.5972440194872328, "language_loss": 0.75062847, "learning_rate": 3.906080677724374e-06, "loss": 0.77281165, "num_input_tokens_seen": 44900390, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.1171875, "step": 2079, "time_per_iteration": 2.4688479900360107 }, { "auxiliary_loss_clip": 0.01173603, "auxiliary_loss_mlp": 0.01055951, "balance_loss_clip": 1.03485107, "balance_loss_mlp": 1.05856514, "epoch": 0.1250563655493762, "flos": 25699040421120.0, "grad_norm": 2.9875903894501548, "language_loss": 0.83605683, "learning_rate": 3.905962695693935e-06, "loss": 0.85835242, "num_input_tokens_seen": 44920375, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1484375, "step": 2080, "time_per_iteration": 2.553163528442383 }, { "auxiliary_loss_clip": 0.01162332, "auxiliary_loss_mlp": 0.01055735, "balance_loss_clip": 1.03518355, "balance_loss_mlp": 1.05240953, "epoch": 0.12511648880204418, "flos": 16909509450240.0, "grad_norm": 2.0813889412256144, "language_loss": 0.8477248, "learning_rate": 3.9058446413892e-06, "loss": 0.86990547, "num_input_tokens_seen": 44938415, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1015625, "step": 2081, "time_per_iteration": 2.432741165161133 }, { "auxiliary_loss_clip": 0.01161611, "auxiliary_loss_mlp": 0.01044055, "balance_loss_clip": 1.02463531, "balance_loss_mlp": 1.05130935, "epoch": 0.12517661205471217, "flos": 17567500740480.0, "grad_norm": 1.6317984027942287, "language_loss": 0.77010822, "learning_rate": 3.905726514814646e-06, "loss": 0.79216492, "num_input_tokens_seen": 44957135, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.1015625, "step": 2082, "time_per_iteration": 2.467120409011841 }, { "auxiliary_loss_clip": 0.01176981, "auxiliary_loss_mlp": 0.01051223, "balance_loss_clip": 1.02862072, "balance_loss_mlp": 1.05780661, "epoch": 0.12523673530738014, "flos": 16033791870720.0, "grad_norm": 2.3909661971380927, "language_loss": 0.78917974, "learning_rate": 3.9056083159747495e-06, "loss": 0.81146181, "num_input_tokens_seen": 44974480, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.1953125, "step": 2083, "time_per_iteration": 2.4242286682128906 }, { "auxiliary_loss_clip": 0.0116693, "auxiliary_loss_mlp": 0.01047894, "balance_loss_clip": 1.02520823, "balance_loss_mlp": 1.05316424, "epoch": 0.1252968585600481, "flos": 18807747494400.0, "grad_norm": 2.0563749919400074, "language_loss": 0.89785075, "learning_rate": 3.9054900448739966e-06, "loss": 0.919999, "num_input_tokens_seen": 44990310, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.140625, "step": 2084, "time_per_iteration": 2.4739596843719482 }, { "auxiliary_loss_clip": 0.01166151, "auxiliary_loss_mlp": 0.01049809, "balance_loss_clip": 1.02962685, "balance_loss_mlp": 1.05429363, "epoch": 0.12535698181271607, "flos": 27271568914560.0, "grad_norm": 1.7942240257487891, "language_loss": 0.80038166, "learning_rate": 3.905371701516869e-06, "loss": 0.8225413, "num_input_tokens_seen": 45010720, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1171875, "step": 2085, "time_per_iteration": 2.511094570159912 }, { "auxiliary_loss_clip": 0.01163513, "auxiliary_loss_mlp": 0.01054207, "balance_loss_clip": 1.03297544, "balance_loss_mlp": 1.05290771, "epoch": 0.12541710506538403, "flos": 22054107813120.0, "grad_norm": 1.6597218676699157, "language_loss": 0.88017368, "learning_rate": 3.905253285907856e-06, "loss": 0.90235096, "num_input_tokens_seen": 45030360, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.109375, "step": 2086, "time_per_iteration": 2.4955410957336426 }, { "auxiliary_loss_clip": 0.01161462, "auxiliary_loss_mlp": 0.01045843, "balance_loss_clip": 1.02622139, "balance_loss_mlp": 1.05328369, "epoch": 0.125477228318052, "flos": 12603173760000.0, "grad_norm": 2.5524618531553984, "language_loss": 0.8715046, "learning_rate": 3.905134798051447e-06, "loss": 0.89357769, "num_input_tokens_seen": 45045085, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.078125, "step": 2087, "time_per_iteration": 2.4460718631744385 }, { "auxiliary_loss_clip": 0.011629, "auxiliary_loss_mlp": 0.01051797, "balance_loss_clip": 1.03041124, "balance_loss_mlp": 1.05278206, "epoch": 0.12553735157071996, "flos": 23878549365120.0, "grad_norm": 1.7957772845668363, "language_loss": 0.73133814, "learning_rate": 3.905016237952136e-06, "loss": 0.75348514, "num_input_tokens_seen": 45065145, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.109375, "step": 2088, "time_per_iteration": 2.5102503299713135 }, { "auxiliary_loss_clip": 0.01064358, "auxiliary_loss_mlp": 0.01006516, "balance_loss_clip": 1.00384581, "balance_loss_mlp": 1.02677786, "epoch": 0.12559747482338796, "flos": 69920841830400.0, "grad_norm": 0.761211960461337, "language_loss": 0.61765337, "learning_rate": 3.904897605614418e-06, "loss": 0.63836205, "num_input_tokens_seen": 45126230, "router_z_loss_clip": 0.0267334, "router_z_loss_mlp": 0.375, "step": 2089, "time_per_iteration": 3.0716748237609863 }, { "auxiliary_loss_clip": 0.01162639, "auxiliary_loss_mlp": 0.01054131, "balance_loss_clip": 1.0326736, "balance_loss_mlp": 1.05381227, "epoch": 0.12565759807605592, "flos": 24279563779200.0, "grad_norm": 3.6356876124538244, "language_loss": 0.77954513, "learning_rate": 3.904778901042793e-06, "loss": 0.80171275, "num_input_tokens_seen": 45145545, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.0859375, "step": 2090, "time_per_iteration": 2.4854319095611572 }, { "auxiliary_loss_clip": 0.01062791, "auxiliary_loss_mlp": 0.01010615, "balance_loss_clip": 1.00800407, "balance_loss_mlp": 1.02539253, "epoch": 0.12571772132872389, "flos": 56451180286080.0, "grad_norm": 0.7917356518606171, "language_loss": 0.59329808, "learning_rate": 3.90466012424176e-06, "loss": 0.61403215, "num_input_tokens_seen": 45206845, "router_z_loss_clip": 0.02612305, "router_z_loss_mlp": 0.375, "step": 2091, "time_per_iteration": 3.0310847759246826 }, { "auxiliary_loss_clip": 0.01164898, "auxiliary_loss_mlp": 0.01051324, "balance_loss_clip": 1.0324409, "balance_loss_mlp": 1.05505228, "epoch": 0.12577784458139185, "flos": 41245846675200.0, "grad_norm": 1.6846342919748756, "language_loss": 0.63086671, "learning_rate": 3.904541275215825e-06, "loss": 0.65302885, "num_input_tokens_seen": 45228495, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.1015625, "step": 2092, "time_per_iteration": 2.631211519241333 }, { "auxiliary_loss_clip": 0.01167957, "auxiliary_loss_mlp": 0.01061939, "balance_loss_clip": 1.04050565, "balance_loss_mlp": 1.05292296, "epoch": 0.12583796783405982, "flos": 19755501799680.0, "grad_norm": 1.9888370093743848, "language_loss": 0.80663472, "learning_rate": 3.904422353969493e-06, "loss": 0.82893372, "num_input_tokens_seen": 45245720, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.1484375, "step": 2093, "time_per_iteration": 2.4570910930633545 }, { "auxiliary_loss_clip": 0.01163622, "auxiliary_loss_mlp": 0.01062969, "balance_loss_clip": 1.04352641, "balance_loss_mlp": 1.05384827, "epoch": 0.12589809108672778, "flos": 22602104680320.0, "grad_norm": 1.9501437729947553, "language_loss": 0.76074386, "learning_rate": 3.904303360507276e-06, "loss": 0.78300977, "num_input_tokens_seen": 45265650, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.09375, "step": 2094, "time_per_iteration": 2.4557204246520996 }, { "auxiliary_loss_clip": 0.01159937, "auxiliary_loss_mlp": 0.0105139, "balance_loss_clip": 1.03181636, "balance_loss_mlp": 1.05109501, "epoch": 0.12595821433939577, "flos": 45222845541120.0, "grad_norm": 1.5208392202880403, "language_loss": 0.76739526, "learning_rate": 3.9041842948336835e-06, "loss": 0.78950858, "num_input_tokens_seen": 45287790, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0859375, "step": 2095, "time_per_iteration": 2.686441659927368 }, { "auxiliary_loss_clip": 0.01162588, "auxiliary_loss_mlp": 0.01058733, "balance_loss_clip": 1.03813314, "balance_loss_mlp": 1.04954112, "epoch": 0.12601833759206374, "flos": 14319811618560.0, "grad_norm": 2.230283757809849, "language_loss": 0.83119512, "learning_rate": 3.904065156953232e-06, "loss": 0.85340834, "num_input_tokens_seen": 45305720, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.1328125, "step": 2096, "time_per_iteration": 5.275973558425903 }, { "auxiliary_loss_clip": 0.01163668, "auxiliary_loss_mlp": 0.01054483, "balance_loss_clip": 1.03523111, "balance_loss_mlp": 1.05299807, "epoch": 0.1260784608447317, "flos": 21288241002240.0, "grad_norm": 1.8265530292600198, "language_loss": 0.75562024, "learning_rate": 3.903945946870439e-06, "loss": 0.77780175, "num_input_tokens_seen": 45325290, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.109375, "step": 2097, "time_per_iteration": 2.46077299118042 }, { "auxiliary_loss_clip": 0.0116163, "auxiliary_loss_mlp": 0.01062114, "balance_loss_clip": 1.04381549, "balance_loss_mlp": 1.05172515, "epoch": 0.12613858409739967, "flos": 26251311006720.0, "grad_norm": 2.9431696143556816, "language_loss": 0.87458861, "learning_rate": 3.9038266645898246e-06, "loss": 0.89682597, "num_input_tokens_seen": 45344465, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.1015625, "step": 2098, "time_per_iteration": 5.366294622421265 }, { "auxiliary_loss_clip": 0.01165801, "auxiliary_loss_mlp": 0.01059546, "balance_loss_clip": 1.03699112, "balance_loss_mlp": 1.05076742, "epoch": 0.12619870735006763, "flos": 21579979265280.0, "grad_norm": 1.9147246727876206, "language_loss": 0.69771349, "learning_rate": 3.903707310115912e-06, "loss": 0.71996689, "num_input_tokens_seen": 45362465, "router_z_loss_clip": 0.22558594, "router_z_loss_mlp": 1.1484375, "step": 2099, "time_per_iteration": 2.462928533554077 }, { "auxiliary_loss_clip": 0.01162546, "auxiliary_loss_mlp": 0.01055305, "balance_loss_clip": 1.03365684, "balance_loss_mlp": 1.0506773, "epoch": 0.1262588306027356, "flos": 23367037737600.0, "grad_norm": 2.0538261953254877, "language_loss": 0.81821924, "learning_rate": 3.903587883453228e-06, "loss": 0.84039772, "num_input_tokens_seen": 45382700, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.1171875, "step": 2100, "time_per_iteration": 2.4887142181396484 }, { "auxiliary_loss_clip": 0.01166178, "auxiliary_loss_mlp": 0.01055238, "balance_loss_clip": 1.03436422, "balance_loss_mlp": 1.05413294, "epoch": 0.12631895385540357, "flos": 23949185460480.0, "grad_norm": 2.4762255612282846, "language_loss": 0.80244523, "learning_rate": 3.903468384606302e-06, "loss": 0.82465935, "num_input_tokens_seen": 45401005, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1171875, "step": 2101, "time_per_iteration": 2.5072813034057617 }, { "auxiliary_loss_clip": 0.01058412, "auxiliary_loss_mlp": 0.01030937, "balance_loss_clip": 1.02817142, "balance_loss_mlp": 1.02118254, "epoch": 0.12637907710807156, "flos": 70282138780800.0, "grad_norm": 0.7173590793999786, "language_loss": 0.5709666, "learning_rate": 3.903348813579662e-06, "loss": 0.59186012, "num_input_tokens_seen": 45466555, "router_z_loss_clip": 0.02770996, "router_z_loss_mlp": 0.37109375, "step": 2102, "time_per_iteration": 3.133939504623413 }, { "auxiliary_loss_clip": 0.01165864, "auxiliary_loss_mlp": 0.01053973, "balance_loss_clip": 1.03447008, "balance_loss_mlp": 1.05360579, "epoch": 0.12643920036073952, "flos": 18915084311040.0, "grad_norm": 1.9978279594504096, "language_loss": 0.93408251, "learning_rate": 3.903229170377845e-06, "loss": 0.95628089, "num_input_tokens_seen": 45485165, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.125, "step": 2103, "time_per_iteration": 2.4427194595336914 }, { "auxiliary_loss_clip": 0.01155977, "auxiliary_loss_mlp": 0.01037784, "balance_loss_clip": 1.01876998, "balance_loss_mlp": 1.05099583, "epoch": 0.1264993236134075, "flos": 27782470010880.0, "grad_norm": 1.8048636869286359, "language_loss": 0.78091842, "learning_rate": 3.903109455005387e-06, "loss": 0.80285609, "num_input_tokens_seen": 45504630, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.046875, "step": 2104, "time_per_iteration": 2.5140538215637207 }, { "auxiliary_loss_clip": 0.0116666, "auxiliary_loss_mlp": 0.01055945, "balance_loss_clip": 1.03652585, "balance_loss_mlp": 1.05547523, "epoch": 0.12655944686607545, "flos": 24754697907840.0, "grad_norm": 2.3403340954864706, "language_loss": 0.81401551, "learning_rate": 3.902989667466828e-06, "loss": 0.83624148, "num_input_tokens_seen": 45524885, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.109375, "step": 2105, "time_per_iteration": 2.5065741539001465 }, { "auxiliary_loss_clip": 0.01166881, "auxiliary_loss_mlp": 0.01056319, "balance_loss_clip": 1.0345757, "balance_loss_mlp": 1.05314171, "epoch": 0.12661957011874342, "flos": 24133048202880.0, "grad_norm": 2.4822706865805593, "language_loss": 0.83091724, "learning_rate": 3.90286980776671e-06, "loss": 0.85314929, "num_input_tokens_seen": 45545000, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.140625, "step": 2106, "time_per_iteration": 2.49045729637146 }, { "auxiliary_loss_clip": 0.01161161, "auxiliary_loss_mlp": 0.01049132, "balance_loss_clip": 1.0290575, "balance_loss_mlp": 1.05272055, "epoch": 0.12667969337141138, "flos": 24569614103040.0, "grad_norm": 1.7115573915405602, "language_loss": 0.73505062, "learning_rate": 3.902749875909578e-06, "loss": 0.75715351, "num_input_tokens_seen": 45564210, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.0859375, "step": 2107, "time_per_iteration": 2.5082359313964844 }, { "auxiliary_loss_clip": 0.01157303, "auxiliary_loss_mlp": 0.01046707, "balance_loss_clip": 1.02781248, "balance_loss_mlp": 1.05023456, "epoch": 0.12673981662407935, "flos": 22961677777920.0, "grad_norm": 2.515738633491498, "language_loss": 0.79443932, "learning_rate": 3.90262987189998e-06, "loss": 0.81647944, "num_input_tokens_seen": 45583030, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0703125, "step": 2108, "time_per_iteration": 2.47874116897583 }, { "auxiliary_loss_clip": 0.01162455, "auxiliary_loss_mlp": 0.01053276, "balance_loss_clip": 1.03331995, "balance_loss_mlp": 1.05010343, "epoch": 0.12679993987674734, "flos": 17274864637440.0, "grad_norm": 2.1138461648346185, "language_loss": 0.75517035, "learning_rate": 3.902509795742467e-06, "loss": 0.77732766, "num_input_tokens_seen": 45602265, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.125, "step": 2109, "time_per_iteration": 2.4906575679779053 }, { "auxiliary_loss_clip": 0.01160347, "auxiliary_loss_mlp": 0.01049851, "balance_loss_clip": 1.03049183, "balance_loss_mlp": 1.05213296, "epoch": 0.1268600631294153, "flos": 17275080119040.0, "grad_norm": 1.6432426417522317, "language_loss": 0.82906055, "learning_rate": 3.902389647441592e-06, "loss": 0.85116255, "num_input_tokens_seen": 45620595, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.078125, "step": 2110, "time_per_iteration": 2.4386205673217773 }, { "auxiliary_loss_clip": 0.0115932, "auxiliary_loss_mlp": 0.01057956, "balance_loss_clip": 1.03716636, "balance_loss_mlp": 1.05060029, "epoch": 0.12692018638208327, "flos": 24061047390720.0, "grad_norm": 1.564749610849611, "language_loss": 0.78706533, "learning_rate": 3.90226942700191e-06, "loss": 0.80923814, "num_input_tokens_seen": 45641140, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.0859375, "step": 2111, "time_per_iteration": 2.4857468605041504 }, { "auxiliary_loss_clip": 0.01171258, "auxiliary_loss_mlp": 0.01068299, "balance_loss_clip": 1.04628146, "balance_loss_mlp": 1.05519009, "epoch": 0.12698030963475124, "flos": 31831900652160.0, "grad_norm": 1.9812518598220397, "language_loss": 0.76489121, "learning_rate": 3.902149134427982e-06, "loss": 0.78728676, "num_input_tokens_seen": 45662315, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.15625, "step": 2112, "time_per_iteration": 2.5302584171295166 }, { "auxiliary_loss_clip": 0.01162556, "auxiliary_loss_mlp": 0.01061536, "balance_loss_clip": 1.04146099, "balance_loss_mlp": 1.05212212, "epoch": 0.1270404328874192, "flos": 25187744275200.0, "grad_norm": 1.7830958075810732, "language_loss": 0.85368723, "learning_rate": 3.902028769724367e-06, "loss": 0.87592816, "num_input_tokens_seen": 45680335, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1015625, "step": 2113, "time_per_iteration": 2.4976186752319336 }, { "auxiliary_loss_clip": 0.01161321, "auxiliary_loss_mlp": 0.01063805, "balance_loss_clip": 1.04176366, "balance_loss_mlp": 1.05152583, "epoch": 0.12710055614008717, "flos": 15997342544640.0, "grad_norm": 2.351690094694992, "language_loss": 0.7397697, "learning_rate": 3.9019083328956315e-06, "loss": 0.76202095, "num_input_tokens_seen": 45696240, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.09375, "step": 2114, "time_per_iteration": 2.4165444374084473 }, { "auxiliary_loss_clip": 0.01162912, "auxiliary_loss_mlp": 0.01063099, "balance_loss_clip": 1.04197514, "balance_loss_mlp": 1.05369616, "epoch": 0.12716067939275516, "flos": 15085642515840.0, "grad_norm": 1.95908766461694, "language_loss": 0.8347289, "learning_rate": 3.901787823946341e-06, "loss": 0.85698903, "num_input_tokens_seen": 45713695, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.09375, "step": 2115, "time_per_iteration": 2.4429914951324463 }, { "auxiliary_loss_clip": 0.01163306, "auxiliary_loss_mlp": 0.01058181, "balance_loss_clip": 1.03767729, "balance_loss_mlp": 1.05270791, "epoch": 0.12722080264542313, "flos": 28366736636160.0, "grad_norm": 1.4506747239493727, "language_loss": 0.86840832, "learning_rate": 3.901667242881065e-06, "loss": 0.89062321, "num_input_tokens_seen": 45736655, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.109375, "step": 2116, "time_per_iteration": 2.5199084281921387 }, { "auxiliary_loss_clip": 0.01160078, "auxiliary_loss_mlp": 0.01053131, "balance_loss_clip": 1.03325891, "balance_loss_mlp": 1.05153894, "epoch": 0.1272809258980911, "flos": 32379897519360.0, "grad_norm": 1.7757501627492038, "language_loss": 0.70524269, "learning_rate": 3.9015465897043775e-06, "loss": 0.72737479, "num_input_tokens_seen": 45758195, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.078125, "step": 2117, "time_per_iteration": 2.5797576904296875 }, { "auxiliary_loss_clip": 0.01162683, "auxiliary_loss_mlp": 0.01054956, "balance_loss_clip": 1.03356981, "balance_loss_mlp": 1.05256796, "epoch": 0.12734104915075906, "flos": 16034402401920.0, "grad_norm": 2.227386008793452, "language_loss": 0.87061274, "learning_rate": 3.901425864420852e-06, "loss": 0.89278913, "num_input_tokens_seen": 45774280, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.1015625, "step": 2118, "time_per_iteration": 2.43100643157959 }, { "auxiliary_loss_clip": 0.01159502, "auxiliary_loss_mlp": 0.01050824, "balance_loss_clip": 1.03167892, "balance_loss_mlp": 1.05031681, "epoch": 0.12740117240342702, "flos": 18260325244800.0, "grad_norm": 2.866702009775126, "language_loss": 0.87333333, "learning_rate": 3.901305067035068e-06, "loss": 0.89543664, "num_input_tokens_seen": 45792760, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.09375, "step": 2119, "time_per_iteration": 2.4449045658111572 }, { "auxiliary_loss_clip": 0.0116343, "auxiliary_loss_mlp": 0.01050815, "balance_loss_clip": 1.03026366, "balance_loss_mlp": 1.05341697, "epoch": 0.127461295656095, "flos": 12121790664960.0, "grad_norm": 2.1266684238309246, "language_loss": 0.87548375, "learning_rate": 3.901184197551605e-06, "loss": 0.89762622, "num_input_tokens_seen": 45804300, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.09375, "step": 2120, "time_per_iteration": 2.3992741107940674 }, { "auxiliary_loss_clip": 0.01161558, "auxiliary_loss_mlp": 0.01045903, "balance_loss_clip": 1.02576828, "balance_loss_mlp": 1.05134094, "epoch": 0.12752141890876295, "flos": 23149095966720.0, "grad_norm": 2.1867329035247116, "language_loss": 0.75903809, "learning_rate": 3.901063255975046e-06, "loss": 0.78111279, "num_input_tokens_seen": 45823780, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1015625, "step": 2121, "time_per_iteration": 2.496480703353882 }, { "auxiliary_loss_clip": 0.01162771, "auxiliary_loss_mlp": 0.01047888, "balance_loss_clip": 1.0271697, "balance_loss_mlp": 1.052598, "epoch": 0.12758154216143094, "flos": 21615997628160.0, "grad_norm": 2.1693617907778275, "language_loss": 0.831833, "learning_rate": 3.900942242309978e-06, "loss": 0.85393965, "num_input_tokens_seen": 45840495, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1015625, "step": 2122, "time_per_iteration": 2.473764181137085 }, { "auxiliary_loss_clip": 0.01164326, "auxiliary_loss_mlp": 0.01051814, "balance_loss_clip": 1.03105974, "balance_loss_mlp": 1.05280089, "epoch": 0.1276416654140989, "flos": 15924874855680.0, "grad_norm": 1.8654586895940872, "language_loss": 0.79146492, "learning_rate": 3.90082115656099e-06, "loss": 0.81362629, "num_input_tokens_seen": 45857735, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.109375, "step": 2123, "time_per_iteration": 2.4480130672454834 }, { "auxiliary_loss_clip": 0.01166021, "auxiliary_loss_mlp": 0.01052449, "balance_loss_clip": 1.03134918, "balance_loss_mlp": 1.05425107, "epoch": 0.12770178866676687, "flos": 22382690451840.0, "grad_norm": 1.5406980847894374, "language_loss": 0.79380888, "learning_rate": 3.900699998732673e-06, "loss": 0.81599355, "num_input_tokens_seen": 45876485, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1171875, "step": 2124, "time_per_iteration": 2.509089231491089 }, { "auxiliary_loss_clip": 0.01165924, "auxiliary_loss_mlp": 0.01050229, "balance_loss_clip": 1.03034496, "balance_loss_mlp": 1.05267596, "epoch": 0.12776191191943484, "flos": 21652482867840.0, "grad_norm": 1.8620676642962268, "language_loss": 0.75791776, "learning_rate": 3.900578768829623e-06, "loss": 0.78007936, "num_input_tokens_seen": 45894645, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.1328125, "step": 2125, "time_per_iteration": 2.488746166229248 }, { "auxiliary_loss_clip": 0.01162433, "auxiliary_loss_mlp": 0.01045557, "balance_loss_clip": 1.02512455, "balance_loss_mlp": 1.05163956, "epoch": 0.1278220351721028, "flos": 25735561574400.0, "grad_norm": 3.073073377391845, "language_loss": 0.77466726, "learning_rate": 3.900457466856434e-06, "loss": 0.79674709, "num_input_tokens_seen": 45913755, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.109375, "step": 2126, "time_per_iteration": 2.5379161834716797 }, { "auxiliary_loss_clip": 0.01164641, "auxiliary_loss_mlp": 0.01050635, "balance_loss_clip": 1.03083467, "balance_loss_mlp": 1.05573392, "epoch": 0.12788215842477077, "flos": 41243224982400.0, "grad_norm": 1.3705234178555912, "language_loss": 0.69067979, "learning_rate": 3.9003360928177085e-06, "loss": 0.71283257, "num_input_tokens_seen": 45936095, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0859375, "step": 2127, "time_per_iteration": 2.6376280784606934 }, { "auxiliary_loss_clip": 0.01059339, "auxiliary_loss_mlp": 0.01013801, "balance_loss_clip": 1.01129746, "balance_loss_mlp": 1.02212167, "epoch": 0.12794228167743876, "flos": 70877430881280.0, "grad_norm": 0.8706719907033124, "language_loss": 0.62792885, "learning_rate": 3.900214646718047e-06, "loss": 0.6486603, "num_input_tokens_seen": 46004655, "router_z_loss_clip": 0.02502441, "router_z_loss_mlp": 0.37109375, "step": 2128, "time_per_iteration": 3.1594529151916504 }, { "auxiliary_loss_clip": 0.01163814, "auxiliary_loss_mlp": 0.01044366, "balance_loss_clip": 1.02272987, "balance_loss_mlp": 1.05194271, "epoch": 0.12800240493010673, "flos": 16289727252480.0, "grad_norm": 2.67636232935908, "language_loss": 0.76989472, "learning_rate": 3.900093128562056e-06, "loss": 0.79197651, "num_input_tokens_seen": 46023610, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.1171875, "step": 2129, "time_per_iteration": 2.4634830951690674 }, { "auxiliary_loss_clip": 0.01172215, "auxiliary_loss_mlp": 0.01053016, "balance_loss_clip": 1.03086686, "balance_loss_mlp": 1.05541348, "epoch": 0.1280625281827747, "flos": 20631542601600.0, "grad_norm": 2.3754049835338624, "language_loss": 0.79055685, "learning_rate": 3.899971538354343e-06, "loss": 0.81280911, "num_input_tokens_seen": 46041725, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.1640625, "step": 2130, "time_per_iteration": 2.49523663520813 }, { "auxiliary_loss_clip": 0.0116467, "auxiliary_loss_mlp": 0.01045461, "balance_loss_clip": 1.02577972, "balance_loss_mlp": 1.05210733, "epoch": 0.12812265143544266, "flos": 22638230784000.0, "grad_norm": 2.7565582447517447, "language_loss": 0.70893347, "learning_rate": 3.899849876099518e-06, "loss": 0.73103482, "num_input_tokens_seen": 46061095, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.125, "step": 2131, "time_per_iteration": 2.4509732723236084 }, { "auxiliary_loss_clip": 0.0116272, "auxiliary_loss_mlp": 0.01050313, "balance_loss_clip": 1.02954721, "balance_loss_mlp": 1.05268979, "epoch": 0.12818277468811062, "flos": 34714701463680.0, "grad_norm": 1.9344649240811989, "language_loss": 0.72395051, "learning_rate": 3.899728141802197e-06, "loss": 0.74608082, "num_input_tokens_seen": 46082670, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.1015625, "step": 2132, "time_per_iteration": 2.6090636253356934 }, { "auxiliary_loss_clip": 0.01157539, "auxiliary_loss_mlp": 0.01048177, "balance_loss_clip": 1.02812552, "balance_loss_mlp": 1.05129719, "epoch": 0.1282428979407786, "flos": 23112107936640.0, "grad_norm": 1.690206900687791, "language_loss": 0.81994909, "learning_rate": 3.8996063354669935e-06, "loss": 0.84200633, "num_input_tokens_seen": 46102410, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.0625, "step": 2133, "time_per_iteration": 2.4760043621063232 }, { "auxiliary_loss_clip": 0.01167763, "auxiliary_loss_mlp": 0.01059586, "balance_loss_clip": 1.03758013, "balance_loss_mlp": 1.0516994, "epoch": 0.12830302119344655, "flos": 20886508316160.0, "grad_norm": 2.255178824306691, "language_loss": 0.79561567, "learning_rate": 3.899484457098528e-06, "loss": 0.81788921, "num_input_tokens_seen": 46121145, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.15625, "step": 2134, "time_per_iteration": 2.493004322052002 }, { "auxiliary_loss_clip": 0.01166066, "auxiliary_loss_mlp": 0.01049287, "balance_loss_clip": 1.02873564, "balance_loss_mlp": 1.05369473, "epoch": 0.12836314444611455, "flos": 21397768548480.0, "grad_norm": 1.7112957668632665, "language_loss": 0.82608277, "learning_rate": 3.899362506701421e-06, "loss": 0.84823632, "num_input_tokens_seen": 46140740, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.125, "step": 2135, "time_per_iteration": 2.4672300815582275 }, { "auxiliary_loss_clip": 0.01160995, "auxiliary_loss_mlp": 0.01056075, "balance_loss_clip": 1.03492677, "balance_loss_mlp": 1.05119777, "epoch": 0.1284232676987825, "flos": 13662466773120.0, "grad_norm": 2.240907145805763, "language_loss": 0.77426201, "learning_rate": 3.899240484280298e-06, "loss": 0.79643267, "num_input_tokens_seen": 46156805, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.1015625, "step": 2136, "time_per_iteration": 2.4499571323394775 }, { "auxiliary_loss_clip": 0.01059507, "auxiliary_loss_mlp": 0.01005035, "balance_loss_clip": 1.00277042, "balance_loss_mlp": 1.0219121, "epoch": 0.12848339095145048, "flos": 59994737735040.0, "grad_norm": 0.899819221854555, "language_loss": 0.59202254, "learning_rate": 3.899118389839785e-06, "loss": 0.61266792, "num_input_tokens_seen": 46222085, "router_z_loss_clip": 0.02270508, "router_z_loss_mlp": 0.375, "step": 2137, "time_per_iteration": 3.2360012531280518 }, { "auxiliary_loss_clip": 0.0116019, "auxiliary_loss_mlp": 0.01052391, "balance_loss_clip": 1.03222024, "balance_loss_mlp": 1.04827213, "epoch": 0.12854351420411844, "flos": 13881378211200.0, "grad_norm": 6.816036452629976, "language_loss": 0.82335174, "learning_rate": 3.898996223384512e-06, "loss": 0.84547752, "num_input_tokens_seen": 46239970, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.1171875, "step": 2138, "time_per_iteration": 5.257770299911499 }, { "auxiliary_loss_clip": 0.01165016, "auxiliary_loss_mlp": 0.01052825, "balance_loss_clip": 1.03015184, "balance_loss_mlp": 1.05210185, "epoch": 0.1286036374567864, "flos": 22637943475200.0, "grad_norm": 2.3119000210933316, "language_loss": 0.78851449, "learning_rate": 3.898873984919113e-06, "loss": 0.81069291, "num_input_tokens_seen": 46257740, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.1328125, "step": 2139, "time_per_iteration": 2.4697070121765137 }, { "auxiliary_loss_clip": 0.01164334, "auxiliary_loss_mlp": 0.01046746, "balance_loss_clip": 1.02656412, "balance_loss_mlp": 1.05170405, "epoch": 0.12866376070945437, "flos": 16324775948160.0, "grad_norm": 2.2001217750219433, "language_loss": 0.85159481, "learning_rate": 3.8987516744482215e-06, "loss": 0.87370563, "num_input_tokens_seen": 46275445, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.125, "step": 2140, "time_per_iteration": 5.209266424179077 }, { "auxiliary_loss_clip": 0.01158766, "auxiliary_loss_mlp": 0.01048131, "balance_loss_clip": 1.02815139, "balance_loss_mlp": 1.04928744, "epoch": 0.12872388396212234, "flos": 11874546374400.0, "grad_norm": 1.8361151009599328, "language_loss": 0.85792696, "learning_rate": 3.898629291976476e-06, "loss": 0.87999594, "num_input_tokens_seen": 46291710, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.09375, "step": 2141, "time_per_iteration": 2.4261505603790283 }, { "auxiliary_loss_clip": 0.01165057, "auxiliary_loss_mlp": 0.01047577, "balance_loss_clip": 1.02673876, "balance_loss_mlp": 1.05018353, "epoch": 0.12878400721479033, "flos": 28366700722560.0, "grad_norm": 1.919453977459735, "language_loss": 0.68356168, "learning_rate": 3.898506837508518e-06, "loss": 0.705688, "num_input_tokens_seen": 46311335, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1484375, "step": 2142, "time_per_iteration": 2.545147180557251 }, { "auxiliary_loss_clip": 0.01169209, "auxiliary_loss_mlp": 0.01048554, "balance_loss_clip": 1.02753735, "balance_loss_mlp": 1.0545764, "epoch": 0.1288441304674583, "flos": 25885632597120.0, "grad_norm": 2.1778771480115977, "language_loss": 0.8277688, "learning_rate": 3.89838431104899e-06, "loss": 0.84994638, "num_input_tokens_seen": 46330985, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.1484375, "step": 2143, "time_per_iteration": 2.4998035430908203 }, { "auxiliary_loss_clip": 0.01167267, "auxiliary_loss_mlp": 0.0105174, "balance_loss_clip": 1.03177202, "balance_loss_mlp": 1.05456996, "epoch": 0.12890425372012626, "flos": 20813789232000.0, "grad_norm": 1.7355713186780808, "language_loss": 0.82333457, "learning_rate": 3.898261712602539e-06, "loss": 0.84552455, "num_input_tokens_seen": 46351295, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.125, "step": 2144, "time_per_iteration": 2.507720470428467 }, { "auxiliary_loss_clip": 0.01160959, "auxiliary_loss_mlp": 0.01050416, "balance_loss_clip": 1.028494, "balance_loss_mlp": 1.0480839, "epoch": 0.12896437697279423, "flos": 22565870835840.0, "grad_norm": 4.877170024746465, "language_loss": 0.78525704, "learning_rate": 3.898139042173813e-06, "loss": 0.80737078, "num_input_tokens_seen": 46368600, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.125, "step": 2145, "time_per_iteration": 2.4523632526397705 }, { "auxiliary_loss_clip": 0.01162531, "auxiliary_loss_mlp": 0.01052001, "balance_loss_clip": 1.03110409, "balance_loss_mlp": 1.04952145, "epoch": 0.1290245002254622, "flos": 17493776075520.0, "grad_norm": 1.8662824203477097, "language_loss": 0.82524079, "learning_rate": 3.898016299767465e-06, "loss": 0.84738612, "num_input_tokens_seen": 46387370, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.125, "step": 2146, "time_per_iteration": 2.450639009475708 }, { "auxiliary_loss_clip": 0.01162448, "auxiliary_loss_mlp": 0.01049824, "balance_loss_clip": 1.02866399, "balance_loss_mlp": 1.05180764, "epoch": 0.12908462347813016, "flos": 36315957859200.0, "grad_norm": 2.118776831637998, "language_loss": 0.70621979, "learning_rate": 3.897893485388149e-06, "loss": 0.72834247, "num_input_tokens_seen": 46409570, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.1015625, "step": 2147, "time_per_iteration": 2.590801954269409 }, { "auxiliary_loss_clip": 0.01160911, "auxiliary_loss_mlp": 0.01050346, "balance_loss_clip": 1.03042567, "balance_loss_mlp": 1.04944086, "epoch": 0.12914474673079815, "flos": 22528703237760.0, "grad_norm": 3.2525854046280083, "language_loss": 0.71442866, "learning_rate": 3.897770599040521e-06, "loss": 0.73654121, "num_input_tokens_seen": 46429320, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.109375, "step": 2148, "time_per_iteration": 2.4721457958221436 }, { "auxiliary_loss_clip": 0.01162754, "auxiliary_loss_mlp": 0.01049631, "balance_loss_clip": 1.02972281, "balance_loss_mlp": 1.05326807, "epoch": 0.12920486998346611, "flos": 21471888263040.0, "grad_norm": 1.7051108841045066, "language_loss": 0.78895986, "learning_rate": 3.897647640729242e-06, "loss": 0.81108367, "num_input_tokens_seen": 46450155, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.09375, "step": 2149, "time_per_iteration": 2.484978675842285 }, { "auxiliary_loss_clip": 0.01164129, "auxiliary_loss_mlp": 0.01049576, "balance_loss_clip": 1.02846384, "balance_loss_mlp": 1.05384302, "epoch": 0.12926499323613408, "flos": 27308556944640.0, "grad_norm": 2.4368893101565656, "language_loss": 0.7641139, "learning_rate": 3.897524610458975e-06, "loss": 0.78625095, "num_input_tokens_seen": 46470280, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1015625, "step": 2150, "time_per_iteration": 2.5256683826446533 }, { "auxiliary_loss_clip": 0.01162486, "auxiliary_loss_mlp": 0.01051502, "balance_loss_clip": 1.03109396, "balance_loss_mlp": 1.0504092, "epoch": 0.12932511648880204, "flos": 22091131756800.0, "grad_norm": 2.29614059000955, "language_loss": 0.70729911, "learning_rate": 3.8974015082343835e-06, "loss": 0.72943902, "num_input_tokens_seen": 46487605, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1171875, "step": 2151, "time_per_iteration": 2.4609110355377197 }, { "auxiliary_loss_clip": 0.01162505, "auxiliary_loss_mlp": 0.01045423, "balance_loss_clip": 1.02553868, "balance_loss_mlp": 1.05377638, "epoch": 0.12938523974147, "flos": 20302780394880.0, "grad_norm": 2.1954530551399007, "language_loss": 0.83944952, "learning_rate": 3.897278334060137e-06, "loss": 0.86152875, "num_input_tokens_seen": 46505100, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0859375, "step": 2152, "time_per_iteration": 2.469729423522949 }, { "auxiliary_loss_clip": 0.01164897, "auxiliary_loss_mlp": 0.01058512, "balance_loss_clip": 1.03850842, "balance_loss_mlp": 1.0524739, "epoch": 0.12944536299413797, "flos": 19499961467520.0, "grad_norm": 1.613204762472584, "language_loss": 0.78673637, "learning_rate": 3.897155087940906e-06, "loss": 0.80897051, "num_input_tokens_seen": 46524020, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.125, "step": 2153, "time_per_iteration": 2.448016881942749 }, { "auxiliary_loss_clip": 0.01164525, "auxiliary_loss_mlp": 0.01050518, "balance_loss_clip": 1.03090811, "balance_loss_mlp": 1.05337739, "epoch": 0.12950548624680594, "flos": 27707919333120.0, "grad_norm": 1.863506353466284, "language_loss": 0.80266607, "learning_rate": 3.897031769881364e-06, "loss": 0.82481647, "num_input_tokens_seen": 46544640, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.109375, "step": 2154, "time_per_iteration": 2.5407555103302 }, { "auxiliary_loss_clip": 0.01166857, "auxiliary_loss_mlp": 0.01051939, "balance_loss_clip": 1.03135145, "balance_loss_mlp": 1.054878, "epoch": 0.12956560949947393, "flos": 17565740974080.0, "grad_norm": 2.3984448910306138, "language_loss": 0.83768672, "learning_rate": 3.896908379886188e-06, "loss": 0.85987467, "num_input_tokens_seen": 46561395, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.1171875, "step": 2155, "time_per_iteration": 2.4348134994506836 }, { "auxiliary_loss_clip": 0.01164532, "auxiliary_loss_mlp": 0.01056195, "balance_loss_clip": 1.03619194, "balance_loss_mlp": 1.05124378, "epoch": 0.1296257327521419, "flos": 20740711011840.0, "grad_norm": 2.3439162051571616, "language_loss": 0.75925374, "learning_rate": 3.896784917960055e-06, "loss": 0.781461, "num_input_tokens_seen": 46579395, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.1328125, "step": 2156, "time_per_iteration": 2.4907310009002686 }, { "auxiliary_loss_clip": 0.01160447, "auxiliary_loss_mlp": 0.01050785, "balance_loss_clip": 1.03074574, "balance_loss_mlp": 1.05235529, "epoch": 0.12968585600480986, "flos": 16395735265920.0, "grad_norm": 1.756074454943518, "language_loss": 0.8647871, "learning_rate": 3.896661384107648e-06, "loss": 0.88689935, "num_input_tokens_seen": 46597090, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.078125, "step": 2157, "time_per_iteration": 2.4227404594421387 }, { "auxiliary_loss_clip": 0.01163642, "auxiliary_loss_mlp": 0.0105729, "balance_loss_clip": 1.03603458, "balance_loss_mlp": 1.04888797, "epoch": 0.12974597925747783, "flos": 28329533124480.0, "grad_norm": 2.4059641169995176, "language_loss": 0.80670685, "learning_rate": 3.896537778333651e-06, "loss": 0.82891607, "num_input_tokens_seen": 46617355, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.1484375, "step": 2158, "time_per_iteration": 2.554083824157715 }, { "auxiliary_loss_clip": 0.01166309, "auxiliary_loss_mlp": 0.01060178, "balance_loss_clip": 1.03992414, "balance_loss_mlp": 1.0531925, "epoch": 0.1298061025101458, "flos": 9683025782400.0, "grad_norm": 2.520883376958797, "language_loss": 0.75024921, "learning_rate": 3.896414100642752e-06, "loss": 0.7725141, "num_input_tokens_seen": 46633130, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1328125, "step": 2159, "time_per_iteration": 2.4435088634490967 }, { "auxiliary_loss_clip": 0.01158135, "auxiliary_loss_mlp": 0.01047054, "balance_loss_clip": 1.02651477, "balance_loss_mlp": 1.04961312, "epoch": 0.12986622576281376, "flos": 27709535445120.0, "grad_norm": 1.9149242700580063, "language_loss": 0.82876325, "learning_rate": 3.89629035103964e-06, "loss": 0.85081518, "num_input_tokens_seen": 46650575, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.0859375, "step": 2160, "time_per_iteration": 2.588585615158081 }, { "auxiliary_loss_clip": 0.01158186, "auxiliary_loss_mlp": 0.01047542, "balance_loss_clip": 1.02739596, "balance_loss_mlp": 1.05292892, "epoch": 0.12992634901548175, "flos": 18802719590400.0, "grad_norm": 1.564973341240782, "language_loss": 0.82110012, "learning_rate": 3.896166529529008e-06, "loss": 0.84315741, "num_input_tokens_seen": 46668780, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.046875, "step": 2161, "time_per_iteration": 2.4540035724639893 }, { "auxiliary_loss_clip": 0.0116162, "auxiliary_loss_mlp": 0.01051792, "balance_loss_clip": 1.03088236, "balance_loss_mlp": 1.05106521, "epoch": 0.12998647226814972, "flos": 29127575543040.0, "grad_norm": 2.0497210042560656, "language_loss": 0.82486176, "learning_rate": 3.896042636115551e-06, "loss": 0.84699595, "num_input_tokens_seen": 46687550, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1015625, "step": 2162, "time_per_iteration": 2.62939190864563 }, { "auxiliary_loss_clip": 0.01159813, "auxiliary_loss_mlp": 0.01050694, "balance_loss_clip": 1.03005874, "balance_loss_mlp": 1.04834175, "epoch": 0.13004659552081768, "flos": 19573686132480.0, "grad_norm": 2.482486827148142, "language_loss": 0.72928602, "learning_rate": 3.895918670803968e-06, "loss": 0.75139105, "num_input_tokens_seen": 46706730, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.109375, "step": 2163, "time_per_iteration": 2.4985339641571045 }, { "auxiliary_loss_clip": 0.01165026, "auxiliary_loss_mlp": 0.01055196, "balance_loss_clip": 1.03351164, "balance_loss_mlp": 1.05151975, "epoch": 0.13010671877348565, "flos": 22490709626880.0, "grad_norm": 2.1824663066646863, "language_loss": 0.81790102, "learning_rate": 3.895794633598958e-06, "loss": 0.84010327, "num_input_tokens_seen": 46724250, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.1328125, "step": 2164, "time_per_iteration": 2.5045340061187744 }, { "auxiliary_loss_clip": 0.01163812, "auxiliary_loss_mlp": 0.01045426, "balance_loss_clip": 1.02641177, "balance_loss_mlp": 1.05249321, "epoch": 0.1301668420261536, "flos": 23878226142720.0, "grad_norm": 2.334718784440337, "language_loss": 0.72797388, "learning_rate": 3.8956705245052256e-06, "loss": 0.75006628, "num_input_tokens_seen": 46744105, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.109375, "step": 2165, "time_per_iteration": 2.4705159664154053 }, { "auxiliary_loss_clip": 0.01164535, "auxiliary_loss_mlp": 0.01045817, "balance_loss_clip": 1.02438283, "balance_loss_mlp": 1.05209661, "epoch": 0.13022696527882158, "flos": 23150065633920.0, "grad_norm": 1.741608907173591, "language_loss": 0.74637347, "learning_rate": 3.8955463435274765e-06, "loss": 0.76847696, "num_input_tokens_seen": 46764250, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.125, "step": 2166, "time_per_iteration": 2.532825469970703 }, { "auxiliary_loss_clip": 0.01161976, "auxiliary_loss_mlp": 0.01042625, "balance_loss_clip": 1.02334893, "balance_loss_mlp": 1.05081356, "epoch": 0.13028708853148954, "flos": 26908548111360.0, "grad_norm": 1.510023946369332, "language_loss": 0.83217025, "learning_rate": 3.895422090670421e-06, "loss": 0.85421628, "num_input_tokens_seen": 46786865, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.109375, "step": 2167, "time_per_iteration": 2.576225757598877 }, { "auxiliary_loss_clip": 0.01162546, "auxiliary_loss_mlp": 0.01052199, "balance_loss_clip": 1.0319339, "balance_loss_mlp": 1.05161142, "epoch": 0.13034721178415754, "flos": 21251468453760.0, "grad_norm": 1.7765979086953843, "language_loss": 0.83242619, "learning_rate": 3.89529776593877e-06, "loss": 0.85457361, "num_input_tokens_seen": 46807030, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.109375, "step": 2168, "time_per_iteration": 2.5472376346588135 }, { "auxiliary_loss_clip": 0.01161978, "auxiliary_loss_mlp": 0.01050945, "balance_loss_clip": 1.02965438, "balance_loss_mlp": 1.04968238, "epoch": 0.1304073350368255, "flos": 18767239931520.0, "grad_norm": 1.9876826214603456, "language_loss": 0.80249172, "learning_rate": 3.8951733693372375e-06, "loss": 0.8246209, "num_input_tokens_seen": 46826280, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.125, "step": 2169, "time_per_iteration": 2.455095052719116 }, { "auxiliary_loss_clip": 0.01164031, "auxiliary_loss_mlp": 0.01046755, "balance_loss_clip": 1.02528584, "balance_loss_mlp": 1.05219555, "epoch": 0.13046745828949347, "flos": 28364653647360.0, "grad_norm": 2.183619688381841, "language_loss": 0.66269046, "learning_rate": 3.8950489008705406e-06, "loss": 0.68479836, "num_input_tokens_seen": 46846505, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.1171875, "step": 2170, "time_per_iteration": 2.530632257461548 }, { "auxiliary_loss_clip": 0.0116063, "auxiliary_loss_mlp": 0.01048544, "balance_loss_clip": 1.0286237, "balance_loss_mlp": 1.05059814, "epoch": 0.13052758154216143, "flos": 29605044055680.0, "grad_norm": 1.6446686404050757, "language_loss": 0.67090869, "learning_rate": 3.8949243605434e-06, "loss": 0.69300044, "num_input_tokens_seen": 46867380, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.1015625, "step": 2171, "time_per_iteration": 2.520770311355591 }, { "auxiliary_loss_clip": 0.01160293, "auxiliary_loss_mlp": 0.01047491, "balance_loss_clip": 1.02604496, "balance_loss_mlp": 1.04942214, "epoch": 0.1305877047948294, "flos": 19390864884480.0, "grad_norm": 2.06697447484932, "language_loss": 0.72399968, "learning_rate": 3.894799748360537e-06, "loss": 0.74607748, "num_input_tokens_seen": 46886810, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.109375, "step": 2172, "time_per_iteration": 2.4781289100646973 }, { "auxiliary_loss_clip": 0.01157841, "auxiliary_loss_mlp": 0.01040091, "balance_loss_clip": 1.0217452, "balance_loss_mlp": 1.05317068, "epoch": 0.13064782804749736, "flos": 16873527000960.0, "grad_norm": 1.6952779088396008, "language_loss": 0.75962138, "learning_rate": 3.894675064326678e-06, "loss": 0.78160071, "num_input_tokens_seen": 46905620, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.046875, "step": 2173, "time_per_iteration": 2.437164306640625 }, { "auxiliary_loss_clip": 0.01162083, "auxiliary_loss_mlp": 0.01056443, "balance_loss_clip": 1.03512812, "balance_loss_mlp": 1.05079961, "epoch": 0.13070795130016533, "flos": 24499085748480.0, "grad_norm": 3.0240118097290742, "language_loss": 0.70696044, "learning_rate": 3.894550308446551e-06, "loss": 0.72914571, "num_input_tokens_seen": 46925120, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.1171875, "step": 2174, "time_per_iteration": 2.511383295059204 }, { "auxiliary_loss_clip": 0.01061594, "auxiliary_loss_mlp": 0.01011219, "balance_loss_clip": 1.00873935, "balance_loss_mlp": 1.02349508, "epoch": 0.13076807455283332, "flos": 71054505953280.0, "grad_norm": 0.820214308220465, "language_loss": 0.59029639, "learning_rate": 3.894425480724886e-06, "loss": 0.6110245, "num_input_tokens_seen": 46988195, "router_z_loss_clip": 0.02478027, "router_z_loss_mlp": 0.38085938, "step": 2175, "time_per_iteration": 3.209921360015869 }, { "auxiliary_loss_clip": 0.01159186, "auxiliary_loss_mlp": 0.01050375, "balance_loss_clip": 1.03097951, "balance_loss_mlp": 1.05065966, "epoch": 0.13082819780550128, "flos": 20264499475200.0, "grad_norm": 1.9778977261171868, "language_loss": 0.80374372, "learning_rate": 3.894300581166417e-06, "loss": 0.82583928, "num_input_tokens_seen": 47004720, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.0859375, "step": 2176, "time_per_iteration": 2.4923179149627686 }, { "auxiliary_loss_clip": 0.01160423, "auxiliary_loss_mlp": 0.01052696, "balance_loss_clip": 1.03084493, "balance_loss_mlp": 1.05010581, "epoch": 0.13088832105816925, "flos": 34203441231360.0, "grad_norm": 1.907733590564295, "language_loss": 0.75014496, "learning_rate": 3.894175609775881e-06, "loss": 0.77227616, "num_input_tokens_seen": 47024255, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.1015625, "step": 2177, "time_per_iteration": 2.5529303550720215 }, { "auxiliary_loss_clip": 0.01157238, "auxiliary_loss_mlp": 0.01049104, "balance_loss_clip": 1.02664542, "balance_loss_mlp": 1.04899788, "epoch": 0.13094844431083721, "flos": 17894970057600.0, "grad_norm": 1.7983816806646555, "language_loss": 0.82524252, "learning_rate": 3.894050566558015e-06, "loss": 0.84730601, "num_input_tokens_seen": 47042465, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.0859375, "step": 2178, "time_per_iteration": 2.4607138633728027 }, { "auxiliary_loss_clip": 0.0116069, "auxiliary_loss_mlp": 0.01044085, "balance_loss_clip": 1.0240103, "balance_loss_mlp": 1.05175161, "epoch": 0.13100856756350518, "flos": 17311313963520.0, "grad_norm": 2.1722120050267164, "language_loss": 0.74713898, "learning_rate": 3.893925451517562e-06, "loss": 0.76918674, "num_input_tokens_seen": 47060370, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.09375, "step": 2179, "time_per_iteration": 3.9124531745910645 }, { "auxiliary_loss_clip": 0.01157003, "auxiliary_loss_mlp": 0.01050869, "balance_loss_clip": 1.03074658, "balance_loss_mlp": 1.05019701, "epoch": 0.13106869081617314, "flos": 22200551562240.0, "grad_norm": 2.050763901126851, "language_loss": 0.84854031, "learning_rate": 3.893800264659266e-06, "loss": 0.87061906, "num_input_tokens_seen": 47081415, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0703125, "step": 2180, "time_per_iteration": 3.829050302505493 }, { "auxiliary_loss_clip": 0.01160173, "auxiliary_loss_mlp": 0.0105497, "balance_loss_clip": 1.03527617, "balance_loss_mlp": 1.05299485, "epoch": 0.13112881406884114, "flos": 21763123735680.0, "grad_norm": 1.7457471013144685, "language_loss": 0.89915586, "learning_rate": 3.8936750059878746e-06, "loss": 0.92130727, "num_input_tokens_seen": 47099860, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0703125, "step": 2181, "time_per_iteration": 3.843846082687378 }, { "auxiliary_loss_clip": 0.01161155, "auxiliary_loss_mlp": 0.01048396, "balance_loss_clip": 1.02872622, "balance_loss_mlp": 1.05249023, "epoch": 0.1311889373215091, "flos": 23331091201920.0, "grad_norm": 2.7727090241789885, "language_loss": 0.69078708, "learning_rate": 3.893549675508137e-06, "loss": 0.71288252, "num_input_tokens_seen": 47118540, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0859375, "step": 2182, "time_per_iteration": 3.895420551300049 }, { "auxiliary_loss_clip": 0.01160551, "auxiliary_loss_mlp": 0.01053256, "balance_loss_clip": 1.03227532, "balance_loss_mlp": 1.05094266, "epoch": 0.13124906057417707, "flos": 21467363149440.0, "grad_norm": 1.9787585406924038, "language_loss": 0.78495228, "learning_rate": 3.893424273224806e-06, "loss": 0.8070904, "num_input_tokens_seen": 47136710, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.09375, "step": 2183, "time_per_iteration": 2.472257137298584 }, { "auxiliary_loss_clip": 0.01158136, "auxiliary_loss_mlp": 0.01041587, "balance_loss_clip": 1.02129793, "balance_loss_mlp": 1.04955721, "epoch": 0.13130918382684503, "flos": 23255319461760.0, "grad_norm": 1.6853615949318088, "language_loss": 0.85889208, "learning_rate": 3.893298799142636e-06, "loss": 0.8808893, "num_input_tokens_seen": 47157155, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.09375, "step": 2184, "time_per_iteration": 2.4863431453704834 }, { "auxiliary_loss_clip": 0.01163927, "auxiliary_loss_mlp": 0.01050979, "balance_loss_clip": 1.02946234, "balance_loss_mlp": 1.05236149, "epoch": 0.131369307079513, "flos": 20850274471680.0, "grad_norm": 1.9012715372043179, "language_loss": 0.81993878, "learning_rate": 3.893173253266387e-06, "loss": 0.84208786, "num_input_tokens_seen": 47176820, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.1171875, "step": 2185, "time_per_iteration": 2.5335445404052734 }, { "auxiliary_loss_clip": 0.01164815, "auxiliary_loss_mlp": 0.01054569, "balance_loss_clip": 1.03346884, "balance_loss_mlp": 1.05237174, "epoch": 0.13142943033218096, "flos": 17858341163520.0, "grad_norm": 1.8128950365719487, "language_loss": 0.72739291, "learning_rate": 3.893047635600818e-06, "loss": 0.74958676, "num_input_tokens_seen": 47195855, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.125, "step": 2186, "time_per_iteration": 2.431593179702759 }, { "auxiliary_loss_clip": 0.01163406, "auxiliary_loss_mlp": 0.01050853, "balance_loss_clip": 1.02899051, "balance_loss_mlp": 1.05332017, "epoch": 0.13148955358484893, "flos": 20996035862400.0, "grad_norm": 1.8516560075690078, "language_loss": 0.80240321, "learning_rate": 3.892921946150693e-06, "loss": 0.82454586, "num_input_tokens_seen": 47214535, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.1015625, "step": 2187, "time_per_iteration": 2.4616055488586426 }, { "auxiliary_loss_clip": 0.01067425, "auxiliary_loss_mlp": 0.01002787, "balance_loss_clip": 1.00008106, "balance_loss_mlp": 1.02917695, "epoch": 0.13154967683751692, "flos": 70172467580160.0, "grad_norm": 0.8314714203150085, "language_loss": 0.59038699, "learning_rate": 3.892796184920778e-06, "loss": 0.61108911, "num_input_tokens_seen": 47270300, "router_z_loss_clip": 0.02709961, "router_z_loss_mlp": 0.3828125, "step": 2188, "time_per_iteration": 3.1204593181610107 }, { "auxiliary_loss_clip": 0.01164219, "auxiliary_loss_mlp": 0.01052738, "balance_loss_clip": 1.03192425, "balance_loss_mlp": 1.05552149, "epoch": 0.1316098000901849, "flos": 20376145923840.0, "grad_norm": 1.8274675121167032, "language_loss": 0.74105215, "learning_rate": 3.892670351915842e-06, "loss": 0.76322174, "num_input_tokens_seen": 47290720, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.0859375, "step": 2189, "time_per_iteration": 2.479999542236328 }, { "auxiliary_loss_clip": 0.01162185, "auxiliary_loss_mlp": 0.01048611, "balance_loss_clip": 1.02810717, "balance_loss_mlp": 1.05397689, "epoch": 0.13166992334285285, "flos": 23221132692480.0, "grad_norm": 1.7637607584891513, "language_loss": 0.73373514, "learning_rate": 3.892544447140657e-06, "loss": 0.7558431, "num_input_tokens_seen": 47311820, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.0859375, "step": 2190, "time_per_iteration": 2.469414472579956 }, { "auxiliary_loss_clip": 0.01163879, "auxiliary_loss_mlp": 0.0104998, "balance_loss_clip": 1.03004789, "balance_loss_mlp": 1.05416954, "epoch": 0.13173004659552082, "flos": 23330947547520.0, "grad_norm": 4.262091098102157, "language_loss": 0.74480861, "learning_rate": 3.892418470599996e-06, "loss": 0.76694727, "num_input_tokens_seen": 47331605, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.09375, "step": 2191, "time_per_iteration": 2.4834678173065186 }, { "auxiliary_loss_clip": 0.01162487, "auxiliary_loss_mlp": 0.01049329, "balance_loss_clip": 1.02864599, "balance_loss_mlp": 1.05184484, "epoch": 0.13179016984818878, "flos": 21251504367360.0, "grad_norm": 2.0200954550145425, "language_loss": 0.79066658, "learning_rate": 3.892292422298637e-06, "loss": 0.81278473, "num_input_tokens_seen": 47350455, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.109375, "step": 2192, "time_per_iteration": 2.4424266815185547 }, { "auxiliary_loss_clip": 0.01163139, "auxiliary_loss_mlp": 0.01049896, "balance_loss_clip": 1.02974999, "balance_loss_mlp": 1.05204952, "epoch": 0.13185029310085675, "flos": 17778690754560.0, "grad_norm": 1.8048114607347474, "language_loss": 0.85467541, "learning_rate": 3.892166302241361e-06, "loss": 0.87680578, "num_input_tokens_seen": 47368225, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.109375, "step": 2193, "time_per_iteration": 2.4523890018463135 }, { "auxiliary_loss_clip": 0.01062028, "auxiliary_loss_mlp": 0.01008645, "balance_loss_clip": 1.00608242, "balance_loss_mlp": 1.02446914, "epoch": 0.1319104163535247, "flos": 69851785933440.0, "grad_norm": 0.7569977134895208, "language_loss": 0.5408054, "learning_rate": 3.8920401104329475e-06, "loss": 0.56151217, "num_input_tokens_seen": 47427125, "router_z_loss_clip": 0.02563477, "router_z_loss_mlp": 0.375, "step": 2194, "time_per_iteration": 3.0581517219543457 }, { "auxiliary_loss_clip": 0.01156422, "auxiliary_loss_mlp": 0.01050852, "balance_loss_clip": 1.03044355, "balance_loss_mlp": 1.04821169, "epoch": 0.1319705396061927, "flos": 25193095401600.0, "grad_norm": 1.7365390994121175, "language_loss": 0.72130805, "learning_rate": 3.891913846878185e-06, "loss": 0.74338078, "num_input_tokens_seen": 47450275, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.078125, "step": 2195, "time_per_iteration": 2.524395704269409 }, { "auxiliary_loss_clip": 0.0116392, "auxiliary_loss_mlp": 0.01046075, "balance_loss_clip": 1.02364039, "balance_loss_mlp": 1.04985499, "epoch": 0.13203066285886067, "flos": 20740459616640.0, "grad_norm": 1.8608200956451633, "language_loss": 0.78442961, "learning_rate": 3.891787511581859e-06, "loss": 0.80652958, "num_input_tokens_seen": 47469155, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.140625, "step": 2196, "time_per_iteration": 2.456822633743286 }, { "auxiliary_loss_clip": 0.01161303, "auxiliary_loss_mlp": 0.01046797, "balance_loss_clip": 1.02640045, "balance_loss_mlp": 1.04958737, "epoch": 0.13209078611152864, "flos": 22054395121920.0, "grad_norm": 1.9179834678721477, "language_loss": 0.75206584, "learning_rate": 3.89166110454876e-06, "loss": 0.77414685, "num_input_tokens_seen": 47488405, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.1171875, "step": 2197, "time_per_iteration": 2.479149341583252 }, { "auxiliary_loss_clip": 0.01166012, "auxiliary_loss_mlp": 0.01048422, "balance_loss_clip": 1.02728665, "balance_loss_mlp": 1.05183029, "epoch": 0.1321509093641966, "flos": 16284950743680.0, "grad_norm": 2.0683078173029408, "language_loss": 0.79907405, "learning_rate": 3.891534625783685e-06, "loss": 0.82121837, "num_input_tokens_seen": 47505650, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.140625, "step": 2198, "time_per_iteration": 2.4360384941101074 }, { "auxiliary_loss_clip": 0.01160934, "auxiliary_loss_mlp": 0.01057255, "balance_loss_clip": 1.03729939, "balance_loss_mlp": 1.05173683, "epoch": 0.13221103261686457, "flos": 16983018633600.0, "grad_norm": 2.65834207214582, "language_loss": 0.82617962, "learning_rate": 3.891408075291425e-06, "loss": 0.84836149, "num_input_tokens_seen": 47521540, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.09375, "step": 2199, "time_per_iteration": 2.433758497238159 }, { "auxiliary_loss_clip": 0.0116065, "auxiliary_loss_mlp": 0.01053627, "balance_loss_clip": 1.03290904, "balance_loss_mlp": 1.0504787, "epoch": 0.13227115586953253, "flos": 34233605677440.0, "grad_norm": 1.567097798430965, "language_loss": 0.69461012, "learning_rate": 3.8912814530767826e-06, "loss": 0.71675289, "num_input_tokens_seen": 47543625, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1015625, "step": 2200, "time_per_iteration": 2.5738027095794678 }, { "auxiliary_loss_clip": 0.01158685, "auxiliary_loss_mlp": 0.01053664, "balance_loss_clip": 1.03212261, "balance_loss_mlp": 1.04946005, "epoch": 0.13233127912220052, "flos": 20704656735360.0, "grad_norm": 1.840115577734617, "language_loss": 0.84700507, "learning_rate": 3.891154759144557e-06, "loss": 0.86912858, "num_input_tokens_seen": 47563740, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.09375, "step": 2201, "time_per_iteration": 2.4872448444366455 }, { "auxiliary_loss_clip": 0.0116264, "auxiliary_loss_mlp": 0.01054162, "balance_loss_clip": 1.0332768, "balance_loss_mlp": 1.05134726, "epoch": 0.1323914023748685, "flos": 25805048434560.0, "grad_norm": 1.7204619365593543, "language_loss": 0.86810267, "learning_rate": 3.891027993499554e-06, "loss": 0.89027065, "num_input_tokens_seen": 47582655, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.109375, "step": 2202, "time_per_iteration": 2.488598108291626 }, { "auxiliary_loss_clip": 0.01159908, "auxiliary_loss_mlp": 0.0104895, "balance_loss_clip": 1.02812421, "balance_loss_mlp": 1.05103564, "epoch": 0.13245152562753645, "flos": 21251540280960.0, "grad_norm": 1.8444724117429394, "language_loss": 0.72204, "learning_rate": 3.89090115614658e-06, "loss": 0.74412858, "num_input_tokens_seen": 47600875, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.09375, "step": 2203, "time_per_iteration": 2.497699737548828 }, { "auxiliary_loss_clip": 0.01159943, "auxiliary_loss_mlp": 0.01053267, "balance_loss_clip": 1.03347802, "balance_loss_mlp": 1.04837644, "epoch": 0.13251164888020442, "flos": 26610955931520.0, "grad_norm": 2.347843767431595, "language_loss": 0.74096918, "learning_rate": 3.890774247090444e-06, "loss": 0.76310128, "num_input_tokens_seen": 47619250, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.1171875, "step": 2204, "time_per_iteration": 2.5174403190612793 }, { "auxiliary_loss_clip": 0.01164073, "auxiliary_loss_mlp": 0.01051224, "balance_loss_clip": 1.02912259, "balance_loss_mlp": 1.05303669, "epoch": 0.13257177213287238, "flos": 29826541272960.0, "grad_norm": 1.7214052754844535, "language_loss": 0.78567135, "learning_rate": 3.89064726633596e-06, "loss": 0.80782431, "num_input_tokens_seen": 47639445, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.109375, "step": 2205, "time_per_iteration": 2.5393807888031006 }, { "auxiliary_loss_clip": 0.01159929, "auxiliary_loss_mlp": 0.01050653, "balance_loss_clip": 1.02982736, "balance_loss_mlp": 1.05233431, "epoch": 0.13263189538554035, "flos": 21288456483840.0, "grad_norm": 2.843332230841925, "language_loss": 0.79110414, "learning_rate": 3.890520213887941e-06, "loss": 0.81320995, "num_input_tokens_seen": 47658740, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.078125, "step": 2206, "time_per_iteration": 2.4566612243652344 }, { "auxiliary_loss_clip": 0.01163927, "auxiliary_loss_mlp": 0.01043278, "balance_loss_clip": 1.02391839, "balance_loss_mlp": 1.05301595, "epoch": 0.13269201863820831, "flos": 16874101618560.0, "grad_norm": 2.034669356588115, "language_loss": 0.74190605, "learning_rate": 3.890393089751208e-06, "loss": 0.76397806, "num_input_tokens_seen": 47676880, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.109375, "step": 2207, "time_per_iteration": 2.4351003170013428 }, { "auxiliary_loss_clip": 0.01154259, "auxiliary_loss_mlp": 0.01046484, "balance_loss_clip": 1.02559853, "balance_loss_mlp": 1.04849076, "epoch": 0.1327521418908763, "flos": 23768914078080.0, "grad_norm": 2.1961914149588595, "language_loss": 0.84023792, "learning_rate": 3.890265893930578e-06, "loss": 0.86224532, "num_input_tokens_seen": 47696635, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.0625, "step": 2208, "time_per_iteration": 2.4662368297576904 }, { "auxiliary_loss_clip": 0.01156835, "auxiliary_loss_mlp": 0.01048992, "balance_loss_clip": 1.03046703, "balance_loss_mlp": 1.05368423, "epoch": 0.13281226514354427, "flos": 26505594362880.0, "grad_norm": 1.680337903066163, "language_loss": 0.85567695, "learning_rate": 3.890138626430876e-06, "loss": 0.8777352, "num_input_tokens_seen": 47717760, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.03125, "step": 2209, "time_per_iteration": 2.5128393173217773 }, { "auxiliary_loss_clip": 0.0116178, "auxiliary_loss_mlp": 0.01045792, "balance_loss_clip": 1.0262413, "balance_loss_mlp": 1.05237186, "epoch": 0.13287238839621224, "flos": 24498762526080.0, "grad_norm": 2.351461686270181, "language_loss": 0.82245421, "learning_rate": 3.890011287256929e-06, "loss": 0.84452993, "num_input_tokens_seen": 47737685, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.09375, "step": 2210, "time_per_iteration": 2.476524591445923 }, { "auxiliary_loss_clip": 0.0106392, "auxiliary_loss_mlp": 0.01012048, "balance_loss_clip": 1.0093298, "balance_loss_mlp": 1.0263052, "epoch": 0.1329325116488802, "flos": 67694344369920.0, "grad_norm": 0.7767732509536515, "language_loss": 0.58040488, "learning_rate": 3.889883876413563e-06, "loss": 0.60116452, "num_input_tokens_seen": 47802415, "router_z_loss_clip": 0.02722168, "router_z_loss_mlp": 0.375, "step": 2211, "time_per_iteration": 3.204437255859375 }, { "auxiliary_loss_clip": 0.01062261, "auxiliary_loss_mlp": 0.01004065, "balance_loss_clip": 1.00143075, "balance_loss_mlp": 1.02503991, "epoch": 0.13299263490154817, "flos": 72261894741120.0, "grad_norm": 0.8002538191726832, "language_loss": 0.55324489, "learning_rate": 3.889756393905611e-06, "loss": 0.57390815, "num_input_tokens_seen": 47871485, "router_z_loss_clip": 0.02636719, "router_z_loss_mlp": 0.37109375, "step": 2212, "time_per_iteration": 3.1589722633361816 }, { "auxiliary_loss_clip": 0.01165221, "auxiliary_loss_mlp": 0.01056233, "balance_loss_clip": 1.03495455, "balance_loss_mlp": 1.05266905, "epoch": 0.13305275815421613, "flos": 17931275729280.0, "grad_norm": 2.3760944143176523, "language_loss": 0.75045156, "learning_rate": 3.889628839737908e-06, "loss": 0.7726661, "num_input_tokens_seen": 47888315, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.125, "step": 2213, "time_per_iteration": 2.4476852416992188 }, { "auxiliary_loss_clip": 0.01154083, "auxiliary_loss_mlp": 0.01047926, "balance_loss_clip": 1.02922177, "balance_loss_mlp": 1.04934788, "epoch": 0.13311288140688413, "flos": 22340889999360.0, "grad_norm": 1.6190042413228447, "language_loss": 0.79430926, "learning_rate": 3.889501213915291e-06, "loss": 0.81632936, "num_input_tokens_seen": 47906600, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.046875, "step": 2214, "time_per_iteration": 2.452087163925171 }, { "auxiliary_loss_clip": 0.01160701, "auxiliary_loss_mlp": 0.01052561, "balance_loss_clip": 1.03289115, "balance_loss_mlp": 1.05154157, "epoch": 0.1331730046595521, "flos": 31868888682240.0, "grad_norm": 1.7864450657703965, "language_loss": 0.69142932, "learning_rate": 3.889373516442597e-06, "loss": 0.71356189, "num_input_tokens_seen": 47927630, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.09375, "step": 2215, "time_per_iteration": 2.5280706882476807 }, { "auxiliary_loss_clip": 0.01159796, "auxiliary_loss_mlp": 0.0105032, "balance_loss_clip": 1.03030431, "balance_loss_mlp": 1.04999626, "epoch": 0.13323312791222006, "flos": 22566589107840.0, "grad_norm": 1.84480077504525, "language_loss": 0.81046462, "learning_rate": 3.889245747324671e-06, "loss": 0.83256578, "num_input_tokens_seen": 47947935, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.09375, "step": 2216, "time_per_iteration": 2.5083811283111572 }, { "auxiliary_loss_clip": 0.01158705, "auxiliary_loss_mlp": 0.01064933, "balance_loss_clip": 1.0444411, "balance_loss_mlp": 1.05031681, "epoch": 0.13329325116488802, "flos": 15085319293440.0, "grad_norm": 2.0915885304400463, "language_loss": 0.87012613, "learning_rate": 3.889117906566356e-06, "loss": 0.89236248, "num_input_tokens_seen": 47965515, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.0859375, "step": 2217, "time_per_iteration": 2.4386672973632812 }, { "auxiliary_loss_clip": 0.01159208, "auxiliary_loss_mlp": 0.01057293, "balance_loss_clip": 1.03640699, "balance_loss_mlp": 1.05033362, "epoch": 0.133353374417556, "flos": 27453671890560.0, "grad_norm": 2.268203834338839, "language_loss": 0.72899914, "learning_rate": 3.888989994172501e-06, "loss": 0.7511642, "num_input_tokens_seen": 47985675, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.0859375, "step": 2218, "time_per_iteration": 2.501819372177124 }, { "auxiliary_loss_clip": 0.01158201, "auxiliary_loss_mlp": 0.01052271, "balance_loss_clip": 1.03218389, "balance_loss_mlp": 1.04957867, "epoch": 0.13341349767022395, "flos": 24094695456000.0, "grad_norm": 1.7552170859304044, "language_loss": 0.87183344, "learning_rate": 3.8888620101479565e-06, "loss": 0.89393818, "num_input_tokens_seen": 48004985, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0859375, "step": 2219, "time_per_iteration": 2.494868040084839 }, { "auxiliary_loss_clip": 0.01158838, "auxiliary_loss_mlp": 0.01059104, "balance_loss_clip": 1.04014945, "balance_loss_mlp": 1.05103874, "epoch": 0.13347362092289192, "flos": 24133335511680.0, "grad_norm": 1.7298556178491775, "language_loss": 0.77194721, "learning_rate": 3.888733954497574e-06, "loss": 0.79412663, "num_input_tokens_seen": 48024965, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.078125, "step": 2220, "time_per_iteration": 2.473017930984497 }, { "auxiliary_loss_clip": 0.01153233, "auxiliary_loss_mlp": 0.01044882, "balance_loss_clip": 1.02642822, "balance_loss_mlp": 1.04688966, "epoch": 0.1335337441755599, "flos": 18436538390400.0, "grad_norm": 2.1431319834119944, "language_loss": 0.79156899, "learning_rate": 3.888605827226212e-06, "loss": 0.81355011, "num_input_tokens_seen": 48040890, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0625, "step": 2221, "time_per_iteration": 3.9519083499908447 }, { "auxiliary_loss_clip": 0.01060272, "auxiliary_loss_mlp": 0.01079371, "balance_loss_clip": 1.07670045, "balance_loss_mlp": 1.02315927, "epoch": 0.13359386742822787, "flos": 50611997652480.0, "grad_norm": 0.9894928893989768, "language_loss": 0.6901831, "learning_rate": 3.8884776283387275e-06, "loss": 0.7115795, "num_input_tokens_seen": 48091855, "router_z_loss_clip": 0.0267334, "router_z_loss_mlp": 0.37109375, "step": 2222, "time_per_iteration": 2.932283639907837 }, { "auxiliary_loss_clip": 0.01158788, "auxiliary_loss_mlp": 0.01045785, "balance_loss_clip": 1.02677071, "balance_loss_mlp": 1.05181634, "epoch": 0.13365399068089584, "flos": 22778569221120.0, "grad_norm": 1.8550874330743037, "language_loss": 0.66943276, "learning_rate": 3.888349357839982e-06, "loss": 0.69147849, "num_input_tokens_seen": 48111350, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0703125, "step": 2223, "time_per_iteration": 5.346913814544678 }, { "auxiliary_loss_clip": 0.01157422, "auxiliary_loss_mlp": 0.01054277, "balance_loss_clip": 1.03314114, "balance_loss_mlp": 1.04798424, "epoch": 0.1337141139335638, "flos": 12531603911040.0, "grad_norm": 2.24292872776228, "language_loss": 0.82651532, "learning_rate": 3.88822101573484e-06, "loss": 0.84863228, "num_input_tokens_seen": 48129840, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.09375, "step": 2224, "time_per_iteration": 2.4296810626983643 }, { "auxiliary_loss_clip": 0.0116115, "auxiliary_loss_mlp": 0.01044012, "balance_loss_clip": 1.02301955, "balance_loss_mlp": 1.04978561, "epoch": 0.13377423718623177, "flos": 23038957889280.0, "grad_norm": 2.0318401868066207, "language_loss": 0.65928996, "learning_rate": 3.888092602028167e-06, "loss": 0.68134165, "num_input_tokens_seen": 48149240, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.109375, "step": 2225, "time_per_iteration": 2.5000617504119873 }, { "auxiliary_loss_clip": 0.01159131, "auxiliary_loss_mlp": 0.0105076, "balance_loss_clip": 1.03038728, "balance_loss_mlp": 1.04921973, "epoch": 0.13383436043889974, "flos": 16216397637120.0, "grad_norm": 2.3936691226410605, "language_loss": 0.89056206, "learning_rate": 3.887964116724835e-06, "loss": 0.91266096, "num_input_tokens_seen": 48166330, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.09375, "step": 2226, "time_per_iteration": 2.4607136249542236 }, { "auxiliary_loss_clip": 0.01159684, "auxiliary_loss_mlp": 0.01053314, "balance_loss_clip": 1.03326297, "balance_loss_mlp": 1.04927778, "epoch": 0.1338944836915677, "flos": 24279671520000.0, "grad_norm": 1.836250958967486, "language_loss": 0.73741096, "learning_rate": 3.887835559829712e-06, "loss": 0.75954092, "num_input_tokens_seen": 48187600, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.109375, "step": 2227, "time_per_iteration": 2.496176242828369 }, { "auxiliary_loss_clip": 0.01155412, "auxiliary_loss_mlp": 0.01050032, "balance_loss_clip": 1.02912247, "balance_loss_mlp": 1.04751658, "epoch": 0.1339546069442357, "flos": 17598742594560.0, "grad_norm": 2.0015308232050115, "language_loss": 0.85111308, "learning_rate": 3.8877069313476764e-06, "loss": 0.87316751, "num_input_tokens_seen": 48204400, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.078125, "step": 2228, "time_per_iteration": 2.4187614917755127 }, { "auxiliary_loss_clip": 0.01155555, "auxiliary_loss_mlp": 0.01047122, "balance_loss_clip": 1.02667773, "balance_loss_mlp": 1.04903221, "epoch": 0.13401473019690366, "flos": 18990065952000.0, "grad_norm": 20.947253776481354, "language_loss": 0.81027913, "learning_rate": 3.8875782312836054e-06, "loss": 0.83230591, "num_input_tokens_seen": 48222180, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.0625, "step": 2229, "time_per_iteration": 2.452667474746704 }, { "auxiliary_loss_clip": 0.0116035, "auxiliary_loss_mlp": 0.01051894, "balance_loss_clip": 1.03196239, "balance_loss_mlp": 1.05158103, "epoch": 0.13407485344957162, "flos": 26943812288640.0, "grad_norm": 2.0336700286572436, "language_loss": 0.74140757, "learning_rate": 3.887449459642378e-06, "loss": 0.76353002, "num_input_tokens_seen": 48243245, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0859375, "step": 2230, "time_per_iteration": 2.5022032260894775 }, { "auxiliary_loss_clip": 0.01158822, "auxiliary_loss_mlp": 0.01063241, "balance_loss_clip": 1.04359531, "balance_loss_mlp": 1.04916143, "epoch": 0.1341349767022396, "flos": 20339373375360.0, "grad_norm": 2.517518631715877, "language_loss": 0.80094236, "learning_rate": 3.8873206164288785e-06, "loss": 0.82316297, "num_input_tokens_seen": 48262600, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.09375, "step": 2231, "time_per_iteration": 2.4672865867614746 }, { "auxiliary_loss_clip": 0.01160814, "auxiliary_loss_mlp": 0.0105639, "balance_loss_clip": 1.0358268, "balance_loss_mlp": 1.05231237, "epoch": 0.13419509995490755, "flos": 29862020931840.0, "grad_norm": 2.0869915583749448, "language_loss": 0.72088051, "learning_rate": 3.887191701647992e-06, "loss": 0.74305254, "num_input_tokens_seen": 48285075, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.0859375, "step": 2232, "time_per_iteration": 2.5080668926239014 }, { "auxiliary_loss_clip": 0.01165008, "auxiliary_loss_mlp": 0.01055649, "balance_loss_clip": 1.03353548, "balance_loss_mlp": 1.05251491, "epoch": 0.13425522320757552, "flos": 26942986275840.0, "grad_norm": 3.2387956794464676, "language_loss": 0.65920627, "learning_rate": 3.8870627153046066e-06, "loss": 0.68141288, "num_input_tokens_seen": 48301285, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.125, "step": 2233, "time_per_iteration": 2.516098976135254 }, { "auxiliary_loss_clip": 0.01158104, "auxiliary_loss_mlp": 0.01053647, "balance_loss_clip": 1.03271425, "balance_loss_mlp": 1.04708111, "epoch": 0.1343153464602435, "flos": 15777281871360.0, "grad_norm": 2.941057158013027, "language_loss": 0.81347072, "learning_rate": 3.886933657403615e-06, "loss": 0.83558822, "num_input_tokens_seen": 48317835, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.109375, "step": 2234, "time_per_iteration": 2.4262146949768066 }, { "auxiliary_loss_clip": 0.01161773, "auxiliary_loss_mlp": 0.01055622, "balance_loss_clip": 1.03503454, "balance_loss_mlp": 1.05084431, "epoch": 0.13437546971291148, "flos": 24314756129280.0, "grad_norm": 1.8868640899642812, "language_loss": 0.82392818, "learning_rate": 3.886804527949909e-06, "loss": 0.84610212, "num_input_tokens_seen": 48335670, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.109375, "step": 2235, "time_per_iteration": 2.48848819732666 }, { "auxiliary_loss_clip": 0.01158118, "auxiliary_loss_mlp": 0.01058211, "balance_loss_clip": 1.03631258, "balance_loss_mlp": 1.04882288, "epoch": 0.13443559296557944, "flos": 26650673395200.0, "grad_norm": 1.5904434403551555, "language_loss": 0.86386406, "learning_rate": 3.8866753269483864e-06, "loss": 0.88602734, "num_input_tokens_seen": 48357805, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.09375, "step": 2236, "time_per_iteration": 2.5158917903900146 }, { "auxiliary_loss_clip": 0.01162023, "auxiliary_loss_mlp": 0.01050439, "balance_loss_clip": 1.02988768, "balance_loss_mlp": 1.05225635, "epoch": 0.1344957162182474, "flos": 21796197183360.0, "grad_norm": 1.7509446232027353, "language_loss": 0.77686393, "learning_rate": 3.886546054403946e-06, "loss": 0.79898858, "num_input_tokens_seen": 48377845, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.09375, "step": 2237, "time_per_iteration": 2.4979398250579834 }, { "auxiliary_loss_clip": 0.01160084, "auxiliary_loss_mlp": 0.01048208, "balance_loss_clip": 1.02641654, "balance_loss_mlp": 1.05060911, "epoch": 0.13455583947091537, "flos": 19865568049920.0, "grad_norm": 1.8526202588726477, "language_loss": 0.78481758, "learning_rate": 3.886416710321491e-06, "loss": 0.8069005, "num_input_tokens_seen": 48394735, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.09375, "step": 2238, "time_per_iteration": 2.448596239089966 }, { "auxiliary_loss_clip": 0.01158672, "auxiliary_loss_mlp": 0.01050718, "balance_loss_clip": 1.02923644, "balance_loss_mlp": 1.05129051, "epoch": 0.13461596272358334, "flos": 30846835094400.0, "grad_norm": 3.000012001890803, "language_loss": 0.68098366, "learning_rate": 3.886287294705924e-06, "loss": 0.70307761, "num_input_tokens_seen": 48414200, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.078125, "step": 2239, "time_per_iteration": 2.5484447479248047 }, { "auxiliary_loss_clip": 0.01163382, "auxiliary_loss_mlp": 0.01048922, "balance_loss_clip": 1.02916884, "balance_loss_mlp": 1.05157542, "epoch": 0.1346760859762513, "flos": 12494436312960.0, "grad_norm": 2.7690470336450828, "language_loss": 0.81803393, "learning_rate": 3.8861578075621555e-06, "loss": 0.84015691, "num_input_tokens_seen": 48431065, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.1171875, "step": 2240, "time_per_iteration": 2.425859212875366 }, { "auxiliary_loss_clip": 0.01163308, "auxiliary_loss_mlp": 0.01047949, "balance_loss_clip": 1.0280652, "balance_loss_mlp": 1.05104291, "epoch": 0.1347362092289193, "flos": 21836022387840.0, "grad_norm": 1.9117977896899334, "language_loss": 0.77664125, "learning_rate": 3.886028248895093e-06, "loss": 0.7987538, "num_input_tokens_seen": 48450335, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.125, "step": 2241, "time_per_iteration": 2.4892592430114746 }, { "auxiliary_loss_clip": 0.01157808, "auxiliary_loss_mlp": 0.01040852, "balance_loss_clip": 1.02279162, "balance_loss_mlp": 1.0527488, "epoch": 0.13479633248158726, "flos": 23509459163520.0, "grad_norm": 1.5803364717304544, "language_loss": 0.83364737, "learning_rate": 3.88589861870965e-06, "loss": 0.85563397, "num_input_tokens_seen": 48468555, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.0546875, "step": 2242, "time_per_iteration": 2.4793906211853027 }, { "auxiliary_loss_clip": 0.01162121, "auxiliary_loss_mlp": 0.01054819, "balance_loss_clip": 1.03351593, "balance_loss_mlp": 1.05229771, "epoch": 0.13485645573425523, "flos": 29344332165120.0, "grad_norm": 3.1553964287839835, "language_loss": 0.6475811, "learning_rate": 3.885768917010744e-06, "loss": 0.66975045, "num_input_tokens_seen": 48488515, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.1015625, "step": 2243, "time_per_iteration": 2.5419938564300537 }, { "auxiliary_loss_clip": 0.01152117, "auxiliary_loss_mlp": 0.01045781, "balance_loss_clip": 1.02570653, "balance_loss_mlp": 1.04920781, "epoch": 0.1349165789869232, "flos": 28037112503040.0, "grad_norm": 1.5302974039590214, "language_loss": 0.72704238, "learning_rate": 3.8856391438032895e-06, "loss": 0.74902135, "num_input_tokens_seen": 48510515, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.03125, "step": 2244, "time_per_iteration": 2.512768268585205 }, { "auxiliary_loss_clip": 0.01156376, "auxiliary_loss_mlp": 0.0104907, "balance_loss_clip": 1.03038955, "balance_loss_mlp": 1.04938996, "epoch": 0.13497670223959116, "flos": 22853730430080.0, "grad_norm": 1.640496590222441, "language_loss": 0.86173809, "learning_rate": 3.88550929909221e-06, "loss": 0.88379258, "num_input_tokens_seen": 48529940, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0625, "step": 2245, "time_per_iteration": 2.4824347496032715 }, { "auxiliary_loss_clip": 0.01155897, "auxiliary_loss_mlp": 0.01051883, "balance_loss_clip": 1.03246355, "balance_loss_mlp": 1.05178118, "epoch": 0.13503682549225912, "flos": 16504580453760.0, "grad_norm": 1.756399190250715, "language_loss": 0.78678179, "learning_rate": 3.88537938288243e-06, "loss": 0.80885953, "num_input_tokens_seen": 48548190, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.0390625, "step": 2246, "time_per_iteration": 2.4483184814453125 }, { "auxiliary_loss_clip": 0.0106603, "auxiliary_loss_mlp": 0.01016571, "balance_loss_clip": 1.01410329, "balance_loss_mlp": 1.02866244, "epoch": 0.1350969487449271, "flos": 70756303242240.0, "grad_norm": 0.7589992045789662, "language_loss": 0.60521913, "learning_rate": 3.885249395178874e-06, "loss": 0.62604511, "num_input_tokens_seen": 48613165, "router_z_loss_clip": 0.0246582, "router_z_loss_mlp": 0.37304688, "step": 2247, "time_per_iteration": 3.1874871253967285 }, { "auxiliary_loss_clip": 0.01169866, "auxiliary_loss_mlp": 0.01054833, "balance_loss_clip": 1.03341103, "balance_loss_mlp": 1.05690682, "epoch": 0.13515707199759508, "flos": 23075981832960.0, "grad_norm": 1.8431182382272318, "language_loss": 0.81259573, "learning_rate": 3.885119335986473e-06, "loss": 0.83484268, "num_input_tokens_seen": 48631705, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.1328125, "step": 2248, "time_per_iteration": 2.4768407344818115 }, { "auxiliary_loss_clip": 0.01154166, "auxiliary_loss_mlp": 0.01048196, "balance_loss_clip": 1.02940869, "balance_loss_mlp": 1.04984236, "epoch": 0.13521719525026304, "flos": 23186371305600.0, "grad_norm": 1.9372896929414563, "language_loss": 0.76889658, "learning_rate": 3.884989205310157e-06, "loss": 0.7909202, "num_input_tokens_seen": 48649740, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.046875, "step": 2249, "time_per_iteration": 2.486095428466797 }, { "auxiliary_loss_clip": 0.01158835, "auxiliary_loss_mlp": 0.01058582, "balance_loss_clip": 1.039819, "balance_loss_mlp": 1.05290389, "epoch": 0.135277318502931, "flos": 24790931752320.0, "grad_norm": 1.4566661412088266, "language_loss": 0.84374309, "learning_rate": 3.884859003154862e-06, "loss": 0.86591721, "num_input_tokens_seen": 48671565, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0625, "step": 2250, "time_per_iteration": 2.5233314037323 }, { "auxiliary_loss_clip": 0.01158052, "auxiliary_loss_mlp": 0.01051503, "balance_loss_clip": 1.03059399, "balance_loss_mlp": 1.05051959, "epoch": 0.13533744175559898, "flos": 21908525990400.0, "grad_norm": 2.25124663295973, "language_loss": 0.82363409, "learning_rate": 3.884728729525524e-06, "loss": 0.84572959, "num_input_tokens_seen": 48690425, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.078125, "step": 2251, "time_per_iteration": 2.461790084838867 }, { "auxiliary_loss_clip": 0.01157525, "auxiliary_loss_mlp": 0.01057224, "balance_loss_clip": 1.03649306, "balance_loss_mlp": 1.04894876, "epoch": 0.13539756500826694, "flos": 21211643249280.0, "grad_norm": 1.6873552882658085, "language_loss": 0.85911644, "learning_rate": 3.884598384427084e-06, "loss": 0.88126391, "num_input_tokens_seen": 48707505, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.0859375, "step": 2252, "time_per_iteration": 2.4599952697753906 }, { "auxiliary_loss_clip": 0.01062528, "auxiliary_loss_mlp": 0.01014724, "balance_loss_clip": 1.01219714, "balance_loss_mlp": 1.02616453, "epoch": 0.1354576882609349, "flos": 63242103634560.0, "grad_norm": 1.175060184670831, "language_loss": 0.61832774, "learning_rate": 3.884467967864485e-06, "loss": 0.63910019, "num_input_tokens_seen": 48775895, "router_z_loss_clip": 0.02526855, "router_z_loss_mlp": 0.36328125, "step": 2253, "time_per_iteration": 3.193025827407837 }, { "auxiliary_loss_clip": 0.01157161, "auxiliary_loss_mlp": 0.01062718, "balance_loss_clip": 1.04370451, "balance_loss_mlp": 1.05079293, "epoch": 0.1355178115136029, "flos": 25483037984640.0, "grad_norm": 9.668558510985052, "language_loss": 0.89408356, "learning_rate": 3.884337479842671e-06, "loss": 0.91628236, "num_input_tokens_seen": 48798370, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0625, "step": 2254, "time_per_iteration": 2.5173611640930176 }, { "auxiliary_loss_clip": 0.0116311, "auxiliary_loss_mlp": 0.01055597, "balance_loss_clip": 1.03455639, "balance_loss_mlp": 1.05093241, "epoch": 0.13557793476627086, "flos": 21616967295360.0, "grad_norm": 2.3225633087791215, "language_loss": 0.8477633, "learning_rate": 3.884206920366591e-06, "loss": 0.86995029, "num_input_tokens_seen": 48817955, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.125, "step": 2255, "time_per_iteration": 2.457481861114502 }, { "auxiliary_loss_clip": 0.01156073, "auxiliary_loss_mlp": 0.01056588, "balance_loss_clip": 1.03672791, "balance_loss_mlp": 1.04873228, "epoch": 0.13563805801893883, "flos": 24928253447040.0, "grad_norm": 2.43390299269074, "language_loss": 0.74872684, "learning_rate": 3.884076289441196e-06, "loss": 0.7708534, "num_input_tokens_seen": 48836330, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.078125, "step": 2256, "time_per_iteration": 2.486414670944214 }, { "auxiliary_loss_clip": 0.01161751, "auxiliary_loss_mlp": 0.01054452, "balance_loss_clip": 1.03337538, "balance_loss_mlp": 1.04938519, "epoch": 0.1356981812716068, "flos": 14750272206720.0, "grad_norm": 1.9732180761199987, "language_loss": 0.83700901, "learning_rate": 3.88394558707144e-06, "loss": 0.85917103, "num_input_tokens_seen": 48851890, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.125, "step": 2257, "time_per_iteration": 2.413719654083252 }, { "auxiliary_loss_clip": 0.01164035, "auxiliary_loss_mlp": 0.01062257, "balance_loss_clip": 1.04074001, "balance_loss_mlp": 1.05067968, "epoch": 0.13575830452427476, "flos": 11108571822720.0, "grad_norm": 2.601539391990493, "language_loss": 0.82103622, "learning_rate": 3.883814813262277e-06, "loss": 0.84329915, "num_input_tokens_seen": 48865510, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.1328125, "step": 2258, "time_per_iteration": 2.414290189743042 }, { "auxiliary_loss_clip": 0.01157682, "auxiliary_loss_mlp": 0.0105528, "balance_loss_clip": 1.03254676, "balance_loss_mlp": 1.04898882, "epoch": 0.13581842777694272, "flos": 17960290940160.0, "grad_norm": 2.2404970608720767, "language_loss": 0.82827389, "learning_rate": 3.883683968018669e-06, "loss": 0.85040355, "num_input_tokens_seen": 48882360, "router_z_loss_clip": 0.2265625, "router_z_loss_mlp": 1.0859375, "step": 2259, "time_per_iteration": 2.4327898025512695 }, { "auxiliary_loss_clip": 0.01160331, "auxiliary_loss_mlp": 0.0106096, "balance_loss_clip": 1.0426966, "balance_loss_mlp": 1.05135858, "epoch": 0.1358785510296107, "flos": 22857142222080.0, "grad_norm": 1.8595046531943487, "language_loss": 0.73445684, "learning_rate": 3.8835530513455755e-06, "loss": 0.75666976, "num_input_tokens_seen": 48902700, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.09375, "step": 2260, "time_per_iteration": 2.476585865020752 }, { "auxiliary_loss_clip": 0.01156401, "auxiliary_loss_mlp": 0.01063563, "balance_loss_clip": 1.04378605, "balance_loss_mlp": 1.0495038, "epoch": 0.13593867428227868, "flos": 25739404329600.0, "grad_norm": 2.4361118802847175, "language_loss": 0.74766028, "learning_rate": 3.883422063247961e-06, "loss": 0.76985991, "num_input_tokens_seen": 48922525, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0703125, "step": 2261, "time_per_iteration": 2.5037500858306885 }, { "auxiliary_loss_clip": 0.01157538, "auxiliary_loss_mlp": 0.01056195, "balance_loss_clip": 1.03659737, "balance_loss_mlp": 1.04770803, "epoch": 0.13599879753494665, "flos": 31249214225280.0, "grad_norm": 2.04886244034359, "language_loss": 0.63356102, "learning_rate": 3.883291003730794e-06, "loss": 0.65569836, "num_input_tokens_seen": 48942510, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.09375, "step": 2262, "time_per_iteration": 2.5483639240264893 }, { "auxiliary_loss_clip": 0.01157877, "auxiliary_loss_mlp": 0.01047679, "balance_loss_clip": 1.02870107, "balance_loss_mlp": 1.04902601, "epoch": 0.1360589207876146, "flos": 23915034604800.0, "grad_norm": 2.321191338743384, "language_loss": 0.8240425, "learning_rate": 3.883159872799043e-06, "loss": 0.84609807, "num_input_tokens_seen": 48962625, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0859375, "step": 2263, "time_per_iteration": 5.379512310028076 }, { "auxiliary_loss_clip": 0.01161599, "auxiliary_loss_mlp": 0.01060588, "balance_loss_clip": 1.0377357, "balance_loss_mlp": 1.05077529, "epoch": 0.13611904404028258, "flos": 19974197756160.0, "grad_norm": 1.73484608379699, "language_loss": 0.87471735, "learning_rate": 3.8830286704576815e-06, "loss": 0.89693916, "num_input_tokens_seen": 48982525, "router_z_loss_clip": 0.22851562, "router_z_loss_mlp": 1.109375, "step": 2264, "time_per_iteration": 2.4830896854400635 }, { "auxiliary_loss_clip": 0.01160415, "auxiliary_loss_mlp": 0.01051764, "balance_loss_clip": 1.02976942, "balance_loss_mlp": 1.04912066, "epoch": 0.13617916729295054, "flos": 15340644144000.0, "grad_norm": 3.913871814221071, "language_loss": 0.71411741, "learning_rate": 3.882897396711683e-06, "loss": 0.73623919, "num_input_tokens_seen": 48997605, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.109375, "step": 2265, "time_per_iteration": 5.234049558639526 }, { "auxiliary_loss_clip": 0.01155956, "auxiliary_loss_mlp": 0.01045343, "balance_loss_clip": 1.02580464, "balance_loss_mlp": 1.04937124, "epoch": 0.1362392905456185, "flos": 27451445247360.0, "grad_norm": 2.2594237031623243, "language_loss": 0.66473925, "learning_rate": 3.882766051566027e-06, "loss": 0.6867522, "num_input_tokens_seen": 49018535, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0625, "step": 2266, "time_per_iteration": 2.5126960277557373 }, { "auxiliary_loss_clip": 0.01156229, "auxiliary_loss_mlp": 0.01054848, "balance_loss_clip": 1.03491664, "balance_loss_mlp": 1.04962003, "epoch": 0.1362994137982865, "flos": 25009017177600.0, "grad_norm": 1.6755016527004176, "language_loss": 0.76241565, "learning_rate": 3.882634635025694e-06, "loss": 0.78452647, "num_input_tokens_seen": 49038865, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0703125, "step": 2267, "time_per_iteration": 2.4860339164733887 }, { "auxiliary_loss_clip": 0.01155944, "auxiliary_loss_mlp": 0.01048593, "balance_loss_clip": 1.02787471, "balance_loss_mlp": 1.047328, "epoch": 0.13635953705095447, "flos": 20303031790080.0, "grad_norm": 1.8274414051095489, "language_loss": 0.81898797, "learning_rate": 3.882503147095667e-06, "loss": 0.84103334, "num_input_tokens_seen": 49058010, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.0859375, "step": 2268, "time_per_iteration": 2.4580910205841064 }, { "auxiliary_loss_clip": 0.01156696, "auxiliary_loss_mlp": 0.01046005, "balance_loss_clip": 1.02499998, "balance_loss_mlp": 1.05097413, "epoch": 0.13641966030362243, "flos": 31358418549120.0, "grad_norm": 1.6364532148536577, "language_loss": 0.7626217, "learning_rate": 3.882371587780931e-06, "loss": 0.78464878, "num_input_tokens_seen": 49080330, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.0546875, "step": 2269, "time_per_iteration": 2.5265955924987793 }, { "auxiliary_loss_clip": 0.01161161, "auxiliary_loss_mlp": 0.01044944, "balance_loss_clip": 1.02427292, "balance_loss_mlp": 1.05142105, "epoch": 0.1364797835562904, "flos": 20478095700480.0, "grad_norm": 2.045455095857732, "language_loss": 0.80997479, "learning_rate": 3.882239957086477e-06, "loss": 0.8320359, "num_input_tokens_seen": 49097035, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.09375, "step": 2270, "time_per_iteration": 2.4558379650115967 }, { "auxiliary_loss_clip": 0.0116217, "auxiliary_loss_mlp": 0.01053052, "balance_loss_clip": 1.03240466, "balance_loss_mlp": 1.05041909, "epoch": 0.13653990680895836, "flos": 13078343802240.0, "grad_norm": 2.7377769650571677, "language_loss": 0.7549535, "learning_rate": 3.882108255017295e-06, "loss": 0.77710575, "num_input_tokens_seen": 49113945, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.1171875, "step": 2271, "time_per_iteration": 2.4387991428375244 }, { "auxiliary_loss_clip": 0.01158729, "auxiliary_loss_mlp": 0.01056007, "balance_loss_clip": 1.03440666, "balance_loss_mlp": 1.04981065, "epoch": 0.13660003006162633, "flos": 16946712961920.0, "grad_norm": 2.0934510711364394, "language_loss": 0.80461782, "learning_rate": 3.881976481578379e-06, "loss": 0.82676518, "num_input_tokens_seen": 49132855, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.0859375, "step": 2272, "time_per_iteration": 2.45744252204895 }, { "auxiliary_loss_clip": 0.01059324, "auxiliary_loss_mlp": 0.01012047, "balance_loss_clip": 1.00967431, "balance_loss_mlp": 1.02342761, "epoch": 0.1366601533142943, "flos": 68682749892480.0, "grad_norm": 0.7049350320576564, "language_loss": 0.6062085, "learning_rate": 3.8818446367747255e-06, "loss": 0.62692219, "num_input_tokens_seen": 49198310, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.359375, "step": 2273, "time_per_iteration": 3.181760549545288 }, { "auxiliary_loss_clip": 0.01156111, "auxiliary_loss_mlp": 0.01044623, "balance_loss_clip": 1.02399957, "balance_loss_mlp": 1.04949331, "epoch": 0.13672027656696228, "flos": 19244241567360.0, "grad_norm": 1.5516093483846134, "language_loss": 0.77296054, "learning_rate": 3.881712720611336e-06, "loss": 0.79496789, "num_input_tokens_seen": 49217250, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.0625, "step": 2274, "time_per_iteration": 2.4548561573028564 }, { "auxiliary_loss_clip": 0.01157047, "auxiliary_loss_mlp": 0.01046945, "balance_loss_clip": 1.02650046, "balance_loss_mlp": 1.04858565, "epoch": 0.13678039981963025, "flos": 24534924543360.0, "grad_norm": 1.8200510562718524, "language_loss": 0.78683531, "learning_rate": 3.881580733093211e-06, "loss": 0.8088752, "num_input_tokens_seen": 49236615, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.0859375, "step": 2275, "time_per_iteration": 2.495257616043091 }, { "auxiliary_loss_clip": 0.01157611, "auxiliary_loss_mlp": 0.01040607, "balance_loss_clip": 1.02181995, "balance_loss_mlp": 1.04989338, "epoch": 0.13684052307229821, "flos": 15669334523520.0, "grad_norm": 2.4622220478922037, "language_loss": 0.81748801, "learning_rate": 3.881448674225356e-06, "loss": 0.83947021, "num_input_tokens_seen": 49253935, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.078125, "step": 2276, "time_per_iteration": 2.4997830390930176 }, { "auxiliary_loss_clip": 0.01163417, "auxiliary_loss_mlp": 0.01056431, "balance_loss_clip": 1.03335214, "balance_loss_mlp": 1.0500592, "epoch": 0.13690064632496618, "flos": 28364689560960.0, "grad_norm": 2.821254503277533, "language_loss": 0.69271988, "learning_rate": 3.881316544012779e-06, "loss": 0.71491838, "num_input_tokens_seen": 49273605, "router_z_loss_clip": 0.23046875, "router_z_loss_mlp": 1.1328125, "step": 2277, "time_per_iteration": 2.499929666519165 }, { "auxiliary_loss_clip": 0.0116021, "auxiliary_loss_mlp": 0.01054968, "balance_loss_clip": 1.03385568, "balance_loss_mlp": 1.05030012, "epoch": 0.13696076957763414, "flos": 23404779953280.0, "grad_norm": 2.5877120034937113, "language_loss": 0.80373198, "learning_rate": 3.88118434246049e-06, "loss": 0.82588375, "num_input_tokens_seen": 49291785, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1015625, "step": 2278, "time_per_iteration": 2.481302261352539 }, { "auxiliary_loss_clip": 0.01162721, "auxiliary_loss_mlp": 0.01049097, "balance_loss_clip": 1.02880788, "balance_loss_mlp": 1.05474854, "epoch": 0.1370208928303021, "flos": 37196595601920.0, "grad_norm": 4.474192703283581, "language_loss": 0.75375402, "learning_rate": 3.881052069573502e-06, "loss": 0.77587217, "num_input_tokens_seen": 49311405, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.078125, "step": 2279, "time_per_iteration": 2.595592975616455 }, { "auxiliary_loss_clip": 0.01165508, "auxiliary_loss_mlp": 0.01057041, "balance_loss_clip": 1.0366323, "balance_loss_mlp": 1.05320024, "epoch": 0.13708101608297008, "flos": 26976311118720.0, "grad_norm": 7.974245473868638, "language_loss": 0.76617503, "learning_rate": 3.880919725356831e-06, "loss": 0.78840053, "num_input_tokens_seen": 49331835, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.125, "step": 2280, "time_per_iteration": 2.516134262084961 }, { "auxiliary_loss_clip": 0.01155942, "auxiliary_loss_mlp": 0.01048781, "balance_loss_clip": 1.02950466, "balance_loss_mlp": 1.04986036, "epoch": 0.13714113933563807, "flos": 32556864850560.0, "grad_norm": 1.6666290098743013, "language_loss": 0.79734623, "learning_rate": 3.880787309815496e-06, "loss": 0.81939346, "num_input_tokens_seen": 49352290, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0625, "step": 2281, "time_per_iteration": 2.54382586479187 }, { "auxiliary_loss_clip": 0.01167039, "auxiliary_loss_mlp": 0.01060665, "balance_loss_clip": 1.04068601, "balance_loss_mlp": 1.05429268, "epoch": 0.13720126258830603, "flos": 16101267569280.0, "grad_norm": 1.7869223170085005, "language_loss": 0.83615845, "learning_rate": 3.880654822954518e-06, "loss": 0.85843551, "num_input_tokens_seen": 49370285, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.125, "step": 2282, "time_per_iteration": 2.444594621658325 }, { "auxiliary_loss_clip": 0.01162104, "auxiliary_loss_mlp": 0.01054298, "balance_loss_clip": 1.0351882, "balance_loss_mlp": 1.05456114, "epoch": 0.137261385840974, "flos": 18953544798720.0, "grad_norm": 1.7912854441667958, "language_loss": 0.73856622, "learning_rate": 3.8805222647789195e-06, "loss": 0.76073027, "num_input_tokens_seen": 49389610, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.078125, "step": 2283, "time_per_iteration": 2.4566128253936768 }, { "auxiliary_loss_clip": 0.01163754, "auxiliary_loss_mlp": 0.01050332, "balance_loss_clip": 1.03134179, "balance_loss_mlp": 1.05717146, "epoch": 0.13732150909364196, "flos": 23295360147840.0, "grad_norm": 3.460380539205792, "language_loss": 0.83952451, "learning_rate": 3.880389635293729e-06, "loss": 0.86166537, "num_input_tokens_seen": 49408390, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0625, "step": 2284, "time_per_iteration": 2.482832431793213 }, { "auxiliary_loss_clip": 0.01165343, "auxiliary_loss_mlp": 0.01050546, "balance_loss_clip": 1.02894557, "balance_loss_mlp": 1.05270219, "epoch": 0.13738163234630993, "flos": 29351263489920.0, "grad_norm": 1.8486971229592666, "language_loss": 0.74870491, "learning_rate": 3.880256934503974e-06, "loss": 0.77086377, "num_input_tokens_seen": 49427725, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.125, "step": 2285, "time_per_iteration": 2.527071237564087 }, { "auxiliary_loss_clip": 0.01161226, "auxiliary_loss_mlp": 0.01050191, "balance_loss_clip": 1.03111792, "balance_loss_mlp": 1.05543435, "epoch": 0.1374417555989779, "flos": 26651319840000.0, "grad_norm": 2.1949375495754375, "language_loss": 0.74694085, "learning_rate": 3.880124162414689e-06, "loss": 0.76905507, "num_input_tokens_seen": 49449000, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0546875, "step": 2286, "time_per_iteration": 2.527439832687378 }, { "auxiliary_loss_clip": 0.01165643, "auxiliary_loss_mlp": 0.01046274, "balance_loss_clip": 1.02485228, "balance_loss_mlp": 1.05544508, "epoch": 0.1375018788516459, "flos": 28403401443840.0, "grad_norm": 1.9702310723823109, "language_loss": 0.86332905, "learning_rate": 3.879991319030908e-06, "loss": 0.88544816, "num_input_tokens_seen": 49468360, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.109375, "step": 2287, "time_per_iteration": 2.501978635787964 }, { "auxiliary_loss_clip": 0.01162176, "auxiliary_loss_mlp": 0.01047109, "balance_loss_clip": 1.02784491, "balance_loss_mlp": 1.05320001, "epoch": 0.13756200210431385, "flos": 37413783187200.0, "grad_norm": 2.0571179551510563, "language_loss": 0.68520093, "learning_rate": 3.879858404357666e-06, "loss": 0.70729381, "num_input_tokens_seen": 49493450, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0859375, "step": 2288, "time_per_iteration": 2.650533676147461 }, { "auxiliary_loss_clip": 0.01162345, "auxiliary_loss_mlp": 0.0105488, "balance_loss_clip": 1.03521061, "balance_loss_mlp": 1.05425692, "epoch": 0.13762212535698182, "flos": 22711021695360.0, "grad_norm": 3.007505903559264, "language_loss": 0.8716163, "learning_rate": 3.879725418400005e-06, "loss": 0.89378858, "num_input_tokens_seen": 49511220, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.078125, "step": 2289, "time_per_iteration": 2.4605588912963867 }, { "auxiliary_loss_clip": 0.01154701, "auxiliary_loss_mlp": 0.01046295, "balance_loss_clip": 1.02738833, "balance_loss_mlp": 1.05136752, "epoch": 0.13768224860964978, "flos": 23952130375680.0, "grad_norm": 1.811617169009056, "language_loss": 0.7462064, "learning_rate": 3.879592361162969e-06, "loss": 0.76821637, "num_input_tokens_seen": 49529820, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.03125, "step": 2290, "time_per_iteration": 2.5014939308166504 }, { "auxiliary_loss_clip": 0.01061561, "auxiliary_loss_mlp": 0.01052222, "balance_loss_clip": 1.05004025, "balance_loss_mlp": 1.02568603, "epoch": 0.13774237186231775, "flos": 63590438753280.0, "grad_norm": 0.7140385299040152, "language_loss": 0.51628065, "learning_rate": 3.8794592326516015e-06, "loss": 0.53741848, "num_input_tokens_seen": 49595325, "router_z_loss_clip": 0.02185059, "router_z_loss_mlp": 0.359375, "step": 2291, "time_per_iteration": 3.1412088871002197 }, { "auxiliary_loss_clip": 0.01160412, "auxiliary_loss_mlp": 0.01048377, "balance_loss_clip": 1.02820718, "balance_loss_mlp": 1.05207133, "epoch": 0.1378024951149857, "flos": 24279456038400.0, "grad_norm": 2.030658466880208, "language_loss": 0.70942402, "learning_rate": 3.879326032870952e-06, "loss": 0.73151189, "num_input_tokens_seen": 49615850, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.078125, "step": 2292, "time_per_iteration": 2.494743585586548 }, { "auxiliary_loss_clip": 0.0115962, "auxiliary_loss_mlp": 0.01042065, "balance_loss_clip": 1.02315879, "balance_loss_mlp": 1.05279911, "epoch": 0.13786261836765368, "flos": 14021537080320.0, "grad_norm": 2.420932387207521, "language_loss": 0.79709566, "learning_rate": 3.879192761826071e-06, "loss": 0.81911254, "num_input_tokens_seen": 49631860, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0625, "step": 2293, "time_per_iteration": 2.406385660171509 }, { "auxiliary_loss_clip": 0.01162832, "auxiliary_loss_mlp": 0.01050375, "balance_loss_clip": 1.0302403, "balance_loss_mlp": 1.05350566, "epoch": 0.13792274162032167, "flos": 28878679226880.0, "grad_norm": 3.0355880745507378, "language_loss": 0.78285486, "learning_rate": 3.879059419522011e-06, "loss": 0.80498695, "num_input_tokens_seen": 49652145, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.09375, "step": 2294, "time_per_iteration": 2.5502376556396484 }, { "auxiliary_loss_clip": 0.01159598, "auxiliary_loss_mlp": 0.010462, "balance_loss_clip": 1.02788949, "balance_loss_mlp": 1.05383348, "epoch": 0.13798286487298964, "flos": 21141150808320.0, "grad_norm": 2.262630353130102, "language_loss": 0.80210876, "learning_rate": 3.878926005963831e-06, "loss": 0.82416672, "num_input_tokens_seen": 49669880, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0546875, "step": 2295, "time_per_iteration": 2.4571993350982666 }, { "auxiliary_loss_clip": 0.01155284, "auxiliary_loss_mlp": 0.01047496, "balance_loss_clip": 1.02765942, "balance_loss_mlp": 1.04831588, "epoch": 0.1380429881256576, "flos": 22487477402880.0, "grad_norm": 1.9044078706619072, "language_loss": 0.78500247, "learning_rate": 3.878792521156588e-06, "loss": 0.80703032, "num_input_tokens_seen": 49687255, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0703125, "step": 2296, "time_per_iteration": 2.4636952877044678 }, { "auxiliary_loss_clip": 0.01159477, "auxiliary_loss_mlp": 0.01060525, "balance_loss_clip": 1.04024792, "balance_loss_mlp": 1.05366611, "epoch": 0.13810311137832557, "flos": 21393674398080.0, "grad_norm": 2.0269846291506397, "language_loss": 0.78720021, "learning_rate": 3.8786589651053446e-06, "loss": 0.8094002, "num_input_tokens_seen": 49706650, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.0625, "step": 2297, "time_per_iteration": 2.4625461101531982 }, { "auxiliary_loss_clip": 0.01157794, "auxiliary_loss_mlp": 0.01054507, "balance_loss_clip": 1.03574383, "balance_loss_mlp": 1.0525403, "epoch": 0.13816323463099353, "flos": 25989844930560.0, "grad_norm": 2.3625380674680128, "language_loss": 0.69078362, "learning_rate": 3.878525337815164e-06, "loss": 0.7129066, "num_input_tokens_seen": 49725715, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.046875, "step": 2298, "time_per_iteration": 2.4959239959716797 }, { "auxiliary_loss_clip": 0.01162066, "auxiliary_loss_mlp": 0.01050265, "balance_loss_clip": 1.02999961, "balance_loss_mlp": 1.05222416, "epoch": 0.1382233578836615, "flos": 19244313394560.0, "grad_norm": 2.031928923877202, "language_loss": 0.86630058, "learning_rate": 3.878391639291116e-06, "loss": 0.8884238, "num_input_tokens_seen": 49744710, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.1015625, "step": 2299, "time_per_iteration": 2.441519021987915 }, { "auxiliary_loss_clip": 0.01159694, "auxiliary_loss_mlp": 0.01053456, "balance_loss_clip": 1.03298759, "balance_loss_mlp": 1.05155897, "epoch": 0.1382834811363295, "flos": 25666290195840.0, "grad_norm": 1.8164091624836318, "language_loss": 0.75365913, "learning_rate": 3.878257869538267e-06, "loss": 0.77579057, "num_input_tokens_seen": 49764300, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.078125, "step": 2300, "time_per_iteration": 2.4972076416015625 }, { "auxiliary_loss_clip": 0.01158567, "auxiliary_loss_mlp": 0.01045479, "balance_loss_clip": 1.02637005, "balance_loss_mlp": 1.05295861, "epoch": 0.13834360438899745, "flos": 19784193788160.0, "grad_norm": 3.0084896207450345, "language_loss": 0.82906151, "learning_rate": 3.878124028561692e-06, "loss": 0.85110188, "num_input_tokens_seen": 49778380, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0546875, "step": 2301, "time_per_iteration": 2.4255685806274414 }, { "auxiliary_loss_clip": 0.01152363, "auxiliary_loss_mlp": 0.01039873, "balance_loss_clip": 1.02072752, "balance_loss_mlp": 1.04805136, "epoch": 0.13840372764166542, "flos": 26651858544000.0, "grad_norm": 1.9567849789207379, "language_loss": 0.85987699, "learning_rate": 3.877990116366466e-06, "loss": 0.88179928, "num_input_tokens_seen": 49797460, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0390625, "step": 2302, "time_per_iteration": 2.5126547813415527 }, { "auxiliary_loss_clip": 0.01051276, "auxiliary_loss_mlp": 0.01010829, "balance_loss_clip": 1.00837374, "balance_loss_mlp": 1.01516926, "epoch": 0.13846385089433338, "flos": 70510998286080.0, "grad_norm": 0.7520762806665195, "language_loss": 0.65598857, "learning_rate": 3.877856132957667e-06, "loss": 0.67660964, "num_input_tokens_seen": 49868005, "router_z_loss_clip": 0.02453613, "router_z_loss_mlp": 0.36132812, "step": 2303, "time_per_iteration": 3.2176010608673096 }, { "auxiliary_loss_clip": 0.01152971, "auxiliary_loss_mlp": 0.01041714, "balance_loss_clip": 1.0232724, "balance_loss_mlp": 1.04854393, "epoch": 0.13852397414700135, "flos": 17348732956800.0, "grad_norm": 1.7300098105662192, "language_loss": 0.78742969, "learning_rate": 3.877722078340374e-06, "loss": 0.80937654, "num_input_tokens_seen": 49885825, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0390625, "step": 2304, "time_per_iteration": 3.95453143119812 }, { "auxiliary_loss_clip": 0.01161766, "auxiliary_loss_mlp": 0.01040533, "balance_loss_clip": 1.0214473, "balance_loss_mlp": 1.05348599, "epoch": 0.13858409739966931, "flos": 21543781334400.0, "grad_norm": 1.6784296715601574, "language_loss": 0.77734029, "learning_rate": 3.877587952519672e-06, "loss": 0.79936326, "num_input_tokens_seen": 49905975, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.078125, "step": 2305, "time_per_iteration": 3.8778176307678223 }, { "auxiliary_loss_clip": 0.01151954, "auxiliary_loss_mlp": 0.01044494, "balance_loss_clip": 1.02626717, "balance_loss_mlp": 1.04819703, "epoch": 0.13864422065233728, "flos": 21579907438080.0, "grad_norm": 1.8842160753424952, "language_loss": 0.87937796, "learning_rate": 3.877453755500647e-06, "loss": 0.90134251, "num_input_tokens_seen": 49925800, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0390625, "step": 2306, "time_per_iteration": 3.8689393997192383 }, { "auxiliary_loss_clip": 0.01049665, "auxiliary_loss_mlp": 0.01004363, "balance_loss_clip": 1.00187111, "balance_loss_mlp": 1.01435781, "epoch": 0.13870434390500527, "flos": 53371156872960.0, "grad_norm": 0.8717424716155538, "language_loss": 0.59027028, "learning_rate": 3.877319487288387e-06, "loss": 0.61081064, "num_input_tokens_seen": 49977620, "router_z_loss_clip": 0.02490234, "router_z_loss_mlp": 0.35351562, "step": 2307, "time_per_iteration": 4.513395309448242 }, { "auxiliary_loss_clip": 0.01164632, "auxiliary_loss_mlp": 0.01051398, "balance_loss_clip": 1.03081059, "balance_loss_mlp": 1.05359876, "epoch": 0.13876446715767324, "flos": 22565906749440.0, "grad_norm": 2.2481897333970173, "language_loss": 0.79588181, "learning_rate": 3.877185147887984e-06, "loss": 0.8180421, "num_input_tokens_seen": 49996650, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.109375, "step": 2308, "time_per_iteration": 2.4620420932769775 }, { "auxiliary_loss_clip": 0.01157095, "auxiliary_loss_mlp": 0.01053798, "balance_loss_clip": 1.03353238, "balance_loss_mlp": 1.05163598, "epoch": 0.1388245904103412, "flos": 20705231352960.0, "grad_norm": 2.159603620831374, "language_loss": 0.78005683, "learning_rate": 3.877050737304533e-06, "loss": 0.80216575, "num_input_tokens_seen": 50015640, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.0546875, "step": 2309, "time_per_iteration": 2.476048707962036 }, { "auxiliary_loss_clip": 0.01163544, "auxiliary_loss_mlp": 0.01049204, "balance_loss_clip": 1.02995133, "balance_loss_mlp": 1.05237293, "epoch": 0.13888471366300917, "flos": 20554729367040.0, "grad_norm": 1.738714710283402, "language_loss": 0.67492586, "learning_rate": 3.876916255543129e-06, "loss": 0.69705331, "num_input_tokens_seen": 50033500, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.1171875, "step": 2310, "time_per_iteration": 2.43538236618042 }, { "auxiliary_loss_clip": 0.01158839, "auxiliary_loss_mlp": 0.01057226, "balance_loss_clip": 1.03579211, "balance_loss_mlp": 1.05117774, "epoch": 0.13894483691567713, "flos": 13838033473920.0, "grad_norm": 1.7670166890302905, "language_loss": 0.83890319, "learning_rate": 3.8767817026088725e-06, "loss": 0.86106384, "num_input_tokens_seen": 50050075, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.078125, "step": 2311, "time_per_iteration": 2.430950164794922 }, { "auxiliary_loss_clip": 0.0116601, "auxiliary_loss_mlp": 0.01054232, "balance_loss_clip": 1.03431213, "balance_loss_mlp": 1.05380809, "epoch": 0.1390049601683451, "flos": 28031186759040.0, "grad_norm": 2.1668352778345246, "language_loss": 0.8238132, "learning_rate": 3.876647078506866e-06, "loss": 0.84601557, "num_input_tokens_seen": 50070080, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.125, "step": 2312, "time_per_iteration": 2.5057120323181152 }, { "auxiliary_loss_clip": 0.01164711, "auxiliary_loss_mlp": 0.01057175, "balance_loss_clip": 1.03853047, "balance_loss_mlp": 1.05359411, "epoch": 0.13906508342101306, "flos": 26756860976640.0, "grad_norm": 1.6503700715870955, "language_loss": 0.86754954, "learning_rate": 3.876512383242215e-06, "loss": 0.88976836, "num_input_tokens_seen": 50090040, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.109375, "step": 2313, "time_per_iteration": 2.5528669357299805 }, { "auxiliary_loss_clip": 0.01159078, "auxiliary_loss_mlp": 0.01056154, "balance_loss_clip": 1.03692603, "balance_loss_mlp": 1.05212975, "epoch": 0.13912520667368106, "flos": 24535104111360.0, "grad_norm": 2.0060285357045613, "language_loss": 0.79963267, "learning_rate": 3.876377616820024e-06, "loss": 0.82178497, "num_input_tokens_seen": 50110595, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0703125, "step": 2314, "time_per_iteration": 2.4891602993011475 }, { "auxiliary_loss_clip": 0.01158789, "auxiliary_loss_mlp": 0.01053015, "balance_loss_clip": 1.03394175, "balance_loss_mlp": 1.05118501, "epoch": 0.13918532992634902, "flos": 19383215287680.0, "grad_norm": 2.9919815123471674, "language_loss": 0.86066914, "learning_rate": 3.876242779245409e-06, "loss": 0.88278717, "num_input_tokens_seen": 50125430, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.078125, "step": 2315, "time_per_iteration": 2.4322662353515625 }, { "auxiliary_loss_clip": 0.01158578, "auxiliary_loss_mlp": 0.01057451, "balance_loss_clip": 1.03756702, "balance_loss_mlp": 1.05193448, "epoch": 0.139245453179017, "flos": 21323756574720.0, "grad_norm": 2.1481998474820703, "language_loss": 0.76650357, "learning_rate": 3.876107870523477e-06, "loss": 0.78866386, "num_input_tokens_seen": 50144120, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0703125, "step": 2316, "time_per_iteration": 2.482693910598755 }, { "auxiliary_loss_clip": 0.01159665, "auxiliary_loss_mlp": 0.01059363, "balance_loss_clip": 1.03838205, "balance_loss_mlp": 1.05290914, "epoch": 0.13930557643168495, "flos": 19500607912320.0, "grad_norm": 1.664189360593963, "language_loss": 0.76969814, "learning_rate": 3.875972890659349e-06, "loss": 0.79188848, "num_input_tokens_seen": 50162500, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.0625, "step": 2317, "time_per_iteration": 2.4682445526123047 }, { "auxiliary_loss_clip": 0.01163932, "auxiliary_loss_mlp": 0.01056659, "balance_loss_clip": 1.0372045, "balance_loss_mlp": 1.05411959, "epoch": 0.13936569968435292, "flos": 25410821690880.0, "grad_norm": 1.8247558899979786, "language_loss": 0.80484021, "learning_rate": 3.875837839658139e-06, "loss": 0.82704616, "num_input_tokens_seen": 50182415, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.1015625, "step": 2318, "time_per_iteration": 2.48799467086792 }, { "auxiliary_loss_clip": 0.010508, "auxiliary_loss_mlp": 0.01010866, "balance_loss_clip": 1.00863707, "balance_loss_mlp": 1.01643193, "epoch": 0.13942582293702088, "flos": 70771063731840.0, "grad_norm": 0.850308476251518, "language_loss": 0.59013689, "learning_rate": 3.87570271752497e-06, "loss": 0.61075354, "num_input_tokens_seen": 50245160, "router_z_loss_clip": 0.02233887, "router_z_loss_mlp": 0.34375, "step": 2319, "time_per_iteration": 3.143674612045288 }, { "auxiliary_loss_clip": 0.01164096, "auxiliary_loss_mlp": 0.01050568, "balance_loss_clip": 1.03087485, "balance_loss_mlp": 1.05417907, "epoch": 0.13948594618968888, "flos": 35590885920000.0, "grad_norm": 2.6653864769206987, "language_loss": 0.65285206, "learning_rate": 3.875567524264967e-06, "loss": 0.6749987, "num_input_tokens_seen": 50268215, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.09375, "step": 2320, "time_per_iteration": 2.5831942558288574 }, { "auxiliary_loss_clip": 0.01158796, "auxiliary_loss_mlp": 0.01046064, "balance_loss_clip": 1.02740753, "balance_loss_mlp": 1.05315232, "epoch": 0.13954606944235684, "flos": 21105204272640.0, "grad_norm": 1.5024218851057154, "language_loss": 0.70741504, "learning_rate": 3.875432259883256e-06, "loss": 0.72946364, "num_input_tokens_seen": 50288575, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.0546875, "step": 2321, "time_per_iteration": 2.4891421794891357 }, { "auxiliary_loss_clip": 0.01158527, "auxiliary_loss_mlp": 0.01052545, "balance_loss_clip": 1.03187442, "balance_loss_mlp": 1.05021036, "epoch": 0.1396061926950248, "flos": 25044425009280.0, "grad_norm": 1.9072369749327407, "language_loss": 0.85892904, "learning_rate": 3.875296924384965e-06, "loss": 0.88103974, "num_input_tokens_seen": 50308735, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.078125, "step": 2322, "time_per_iteration": 2.49067759513855 }, { "auxiliary_loss_clip": 0.01152284, "auxiliary_loss_mlp": 0.01050043, "balance_loss_clip": 1.03131533, "balance_loss_mlp": 1.04969907, "epoch": 0.13966631594769277, "flos": 37634023428480.0, "grad_norm": 1.5086007788529734, "language_loss": 0.66634059, "learning_rate": 3.875161517775226e-06, "loss": 0.68836385, "num_input_tokens_seen": 50331025, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.03125, "step": 2323, "time_per_iteration": 2.628993272781372 }, { "auxiliary_loss_clip": 0.01165939, "auxiliary_loss_mlp": 0.01050065, "balance_loss_clip": 1.03018117, "balance_loss_mlp": 1.0525707, "epoch": 0.13972643920036074, "flos": 16690993061760.0, "grad_norm": 1.8622452774955285, "language_loss": 0.89048409, "learning_rate": 3.875026040059175e-06, "loss": 0.91264415, "num_input_tokens_seen": 50349725, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.1328125, "step": 2324, "time_per_iteration": 2.42736554145813 }, { "auxiliary_loss_clip": 0.01162298, "auxiliary_loss_mlp": 0.01053285, "balance_loss_clip": 1.03216112, "balance_loss_mlp": 1.0520376, "epoch": 0.1397865624530287, "flos": 23331055288320.0, "grad_norm": 2.4277325360936124, "language_loss": 0.7073319, "learning_rate": 3.8748904912419485e-06, "loss": 0.72948766, "num_input_tokens_seen": 50367965, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1015625, "step": 2325, "time_per_iteration": 2.5208187103271484 }, { "auxiliary_loss_clip": 0.01161965, "auxiliary_loss_mlp": 0.01052153, "balance_loss_clip": 1.03261447, "balance_loss_mlp": 1.05395007, "epoch": 0.13984668570569667, "flos": 22778317825920.0, "grad_norm": 2.8561887854425145, "language_loss": 0.81774503, "learning_rate": 3.874754871328688e-06, "loss": 0.83988625, "num_input_tokens_seen": 50385605, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.078125, "step": 2326, "time_per_iteration": 2.4655709266662598 }, { "auxiliary_loss_clip": 0.01160478, "auxiliary_loss_mlp": 0.01043101, "balance_loss_clip": 1.02573252, "balance_loss_mlp": 1.05541003, "epoch": 0.13990680895836466, "flos": 19464553635840.0, "grad_norm": 1.7119942560307657, "language_loss": 0.89013904, "learning_rate": 3.874619180324534e-06, "loss": 0.91217482, "num_input_tokens_seen": 50403985, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.046875, "step": 2327, "time_per_iteration": 2.491482973098755 }, { "auxiliary_loss_clip": 0.01158822, "auxiliary_loss_mlp": 0.01056115, "balance_loss_clip": 1.0358609, "balance_loss_mlp": 1.05289912, "epoch": 0.13996693221103262, "flos": 20303283185280.0, "grad_norm": 2.2837596938729736, "language_loss": 0.8539238, "learning_rate": 3.874483418234632e-06, "loss": 0.87607312, "num_input_tokens_seen": 50421590, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.0625, "step": 2328, "time_per_iteration": 2.4484293460845947 }, { "auxiliary_loss_clip": 0.01160058, "auxiliary_loss_mlp": 0.01040786, "balance_loss_clip": 1.02090192, "balance_loss_mlp": 1.05269384, "epoch": 0.1400270554637006, "flos": 26617707688320.0, "grad_norm": 1.5769155929489274, "language_loss": 0.74271214, "learning_rate": 3.874347585064131e-06, "loss": 0.76472056, "num_input_tokens_seen": 50443945, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0703125, "step": 2329, "time_per_iteration": 2.5493712425231934 }, { "auxiliary_loss_clip": 0.01159023, "auxiliary_loss_mlp": 0.01044711, "balance_loss_clip": 1.02513647, "balance_loss_mlp": 1.04989552, "epoch": 0.14008717871636855, "flos": 19391475415680.0, "grad_norm": 2.4415357578626824, "language_loss": 0.78245455, "learning_rate": 3.874211680818183e-06, "loss": 0.80449188, "num_input_tokens_seen": 50462065, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.09375, "step": 2330, "time_per_iteration": 2.4420087337493896 }, { "auxiliary_loss_clip": 0.01158486, "auxiliary_loss_mlp": 0.01043772, "balance_loss_clip": 1.02465057, "balance_loss_mlp": 1.05150199, "epoch": 0.14014730196903652, "flos": 15304266645120.0, "grad_norm": 2.161535175890311, "language_loss": 0.72070217, "learning_rate": 3.87407570550194e-06, "loss": 0.74272478, "num_input_tokens_seen": 50479565, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0625, "step": 2331, "time_per_iteration": 2.4502665996551514 }, { "auxiliary_loss_clip": 0.01156607, "auxiliary_loss_mlp": 0.01050111, "balance_loss_clip": 1.03016746, "balance_loss_mlp": 1.05444825, "epoch": 0.14020742522170448, "flos": 14939701557120.0, "grad_norm": 1.8293109802397358, "language_loss": 0.72433424, "learning_rate": 3.873939659120557e-06, "loss": 0.74640143, "num_input_tokens_seen": 50497305, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0234375, "step": 2332, "time_per_iteration": 2.429006576538086 }, { "auxiliary_loss_clip": 0.0105652, "auxiliary_loss_mlp": 0.01018327, "balance_loss_clip": 1.01531112, "balance_loss_mlp": 1.02188313, "epoch": 0.14026754847437245, "flos": 48824580044160.0, "grad_norm": 0.9281849888048428, "language_loss": 0.561445, "learning_rate": 3.873803541679196e-06, "loss": 0.58219349, "num_input_tokens_seen": 50549735, "router_z_loss_clip": 0.03015137, "router_z_loss_mlp": 0.34765625, "step": 2333, "time_per_iteration": 2.9506473541259766 }, { "auxiliary_loss_clip": 0.01158543, "auxiliary_loss_mlp": 0.01040922, "balance_loss_clip": 1.02153826, "balance_loss_mlp": 1.05208373, "epoch": 0.14032767172704044, "flos": 25773267876480.0, "grad_norm": 1.7544616736462195, "language_loss": 0.82909524, "learning_rate": 3.873667353183016e-06, "loss": 0.85108989, "num_input_tokens_seen": 50570100, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0625, "step": 2334, "time_per_iteration": 2.4900574684143066 }, { "auxiliary_loss_clip": 0.01157026, "auxiliary_loss_mlp": 0.01044508, "balance_loss_clip": 1.02666247, "balance_loss_mlp": 1.05085051, "epoch": 0.1403877949797084, "flos": 21216312017280.0, "grad_norm": 1.7113254118330792, "language_loss": 0.81313658, "learning_rate": 3.8735310936371825e-06, "loss": 0.83515191, "num_input_tokens_seen": 50589185, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 1.0625, "step": 2335, "time_per_iteration": 2.516186475753784 }, { "auxiliary_loss_clip": 0.01165327, "auxiliary_loss_mlp": 0.01049458, "balance_loss_clip": 1.02727354, "balance_loss_mlp": 1.05480802, "epoch": 0.14044791823237637, "flos": 22747973811840.0, "grad_norm": 1.6109853134919552, "language_loss": 0.82003635, "learning_rate": 3.873394763046862e-06, "loss": 0.84218419, "num_input_tokens_seen": 50609645, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.109375, "step": 2336, "time_per_iteration": 2.4870765209198 }, { "auxiliary_loss_clip": 0.01159707, "auxiliary_loss_mlp": 0.01049549, "balance_loss_clip": 1.03001106, "balance_loss_mlp": 1.0527643, "epoch": 0.14050804148504434, "flos": 22964443125120.0, "grad_norm": 1.7045733588928182, "language_loss": 0.80406541, "learning_rate": 3.873258361417225e-06, "loss": 0.82615799, "num_input_tokens_seen": 50628385, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0703125, "step": 2337, "time_per_iteration": 2.483346939086914 }, { "auxiliary_loss_clip": 0.01157733, "auxiliary_loss_mlp": 0.01051776, "balance_loss_clip": 1.03232157, "balance_loss_mlp": 1.04966664, "epoch": 0.1405681647377123, "flos": 22200336080640.0, "grad_norm": 1.8065772581619253, "language_loss": 0.79538673, "learning_rate": 3.873121888753442e-06, "loss": 0.81748188, "num_input_tokens_seen": 50647260, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.078125, "step": 2338, "time_per_iteration": 2.460529327392578 }, { "auxiliary_loss_clip": 0.01166466, "auxiliary_loss_mlp": 0.01052326, "balance_loss_clip": 1.03196502, "balance_loss_mlp": 1.05605459, "epoch": 0.14062828799038027, "flos": 23732787974400.0, "grad_norm": 3.0435321151467036, "language_loss": 0.79699123, "learning_rate": 3.87298534506069e-06, "loss": 0.81917918, "num_input_tokens_seen": 50666130, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.1015625, "step": 2339, "time_per_iteration": 2.4713637828826904 }, { "auxiliary_loss_clip": 0.01159801, "auxiliary_loss_mlp": 0.01058371, "balance_loss_clip": 1.03890407, "balance_loss_mlp": 1.05285692, "epoch": 0.14068841124304826, "flos": 39202493685120.0, "grad_norm": 2.6754683246398074, "language_loss": 0.65871298, "learning_rate": 3.872848730344146e-06, "loss": 0.68089473, "num_input_tokens_seen": 50687440, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.0703125, "step": 2340, "time_per_iteration": 2.612902879714966 }, { "auxiliary_loss_clip": 0.01156036, "auxiliary_loss_mlp": 0.01047646, "balance_loss_clip": 1.02846491, "balance_loss_mlp": 1.0520575, "epoch": 0.14074853449571623, "flos": 20192283181440.0, "grad_norm": 2.4871885237449836, "language_loss": 0.78559017, "learning_rate": 3.87271204460899e-06, "loss": 0.80762696, "num_input_tokens_seen": 50704030, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0390625, "step": 2341, "time_per_iteration": 2.463714361190796 }, { "auxiliary_loss_clip": 0.01156091, "auxiliary_loss_mlp": 0.01054532, "balance_loss_clip": 1.03492224, "balance_loss_mlp": 1.05086422, "epoch": 0.1408086577483842, "flos": 18405871153920.0, "grad_norm": 2.9205203598621665, "language_loss": 0.80518365, "learning_rate": 3.8725752878604066e-06, "loss": 0.82728994, "num_input_tokens_seen": 50723305, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0546875, "step": 2342, "time_per_iteration": 2.4198827743530273 }, { "auxiliary_loss_clip": 0.01158825, "auxiliary_loss_mlp": 0.01050967, "balance_loss_clip": 1.03291905, "balance_loss_mlp": 1.05593753, "epoch": 0.14086878100105216, "flos": 25264593423360.0, "grad_norm": 1.8697380358797282, "language_loss": 0.77573681, "learning_rate": 3.87243846010358e-06, "loss": 0.79783475, "num_input_tokens_seen": 50743270, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.03125, "step": 2343, "time_per_iteration": 2.4927189350128174 }, { "auxiliary_loss_clip": 0.01058674, "auxiliary_loss_mlp": 0.01036056, "balance_loss_clip": 1.03362393, "balance_loss_mlp": 1.02365875, "epoch": 0.14092890425372012, "flos": 65978388869760.0, "grad_norm": 0.8448918961478195, "language_loss": 0.61515868, "learning_rate": 3.872301561343699e-06, "loss": 0.63610595, "num_input_tokens_seen": 50802710, "router_z_loss_clip": 0.02429199, "router_z_loss_mlp": 0.3515625, "step": 2344, "time_per_iteration": 3.0212767124176025 }, { "auxiliary_loss_clip": 0.01151911, "auxiliary_loss_mlp": 0.01043895, "balance_loss_clip": 1.02653766, "balance_loss_mlp": 1.04917717, "epoch": 0.1409890275063881, "flos": 23694973931520.0, "grad_norm": 1.7937644602765561, "language_loss": 0.64647782, "learning_rate": 3.872164591585956e-06, "loss": 0.66843581, "num_input_tokens_seen": 50822625, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.03125, "step": 2345, "time_per_iteration": 2.497201442718506 }, { "auxiliary_loss_clip": 0.01161358, "auxiliary_loss_mlp": 0.01047811, "balance_loss_clip": 1.02673471, "balance_loss_mlp": 1.04919636, "epoch": 0.14104915075905605, "flos": 23623152687360.0, "grad_norm": 3.1457943872775176, "language_loss": 0.7346108, "learning_rate": 3.8720275508355435e-06, "loss": 0.75670254, "num_input_tokens_seen": 50842330, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.125, "step": 2346, "time_per_iteration": 4.047670602798462 }, { "auxiliary_loss_clip": 0.01160832, "auxiliary_loss_mlp": 0.01045034, "balance_loss_clip": 1.0251019, "balance_loss_mlp": 1.05259216, "epoch": 0.14110927401172405, "flos": 20595165102720.0, "grad_norm": 1.722222669352832, "language_loss": 0.76998127, "learning_rate": 3.8718904390976585e-06, "loss": 0.79203999, "num_input_tokens_seen": 50861035, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.078125, "step": 2347, "time_per_iteration": 2.4573428630828857 }, { "auxiliary_loss_clip": 0.01157638, "auxiliary_loss_mlp": 0.01048273, "balance_loss_clip": 1.02964079, "balance_loss_mlp": 1.05010486, "epoch": 0.141169397264392, "flos": 28548049512960.0, "grad_norm": 1.9829400755300415, "language_loss": 0.76695788, "learning_rate": 3.8717532563775e-06, "loss": 0.78901696, "num_input_tokens_seen": 50880105, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.0703125, "step": 2348, "time_per_iteration": 5.331411600112915 }, { "auxiliary_loss_clip": 0.01157564, "auxiliary_loss_mlp": 0.01041672, "balance_loss_clip": 1.0225147, "balance_loss_mlp": 1.05168331, "epoch": 0.14122952051705998, "flos": 17092258871040.0, "grad_norm": 1.62366568734344, "language_loss": 0.86517847, "learning_rate": 3.871616002680272e-06, "loss": 0.88717091, "num_input_tokens_seen": 50897720, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0625, "step": 2349, "time_per_iteration": 2.4402904510498047 }, { "auxiliary_loss_clip": 0.01156878, "auxiliary_loss_mlp": 0.01046717, "balance_loss_clip": 1.02776301, "balance_loss_mlp": 1.0533309, "epoch": 0.14128964376972794, "flos": 28946801370240.0, "grad_norm": 1.6618055453233942, "language_loss": 0.88999033, "learning_rate": 3.871478678011177e-06, "loss": 0.91202629, "num_input_tokens_seen": 50918385, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.03125, "step": 2350, "time_per_iteration": 2.513446092605591 }, { "auxiliary_loss_clip": 0.01160289, "auxiliary_loss_mlp": 0.01045819, "balance_loss_clip": 1.02548218, "balance_loss_mlp": 1.05304372, "epoch": 0.1413497670223959, "flos": 18989778643200.0, "grad_norm": 1.6818722894659892, "language_loss": 0.80954486, "learning_rate": 3.871341282375423e-06, "loss": 0.83160597, "num_input_tokens_seen": 50938270, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.0703125, "step": 2351, "time_per_iteration": 2.470755100250244 }, { "auxiliary_loss_clip": 0.01160792, "auxiliary_loss_mlp": 0.01039455, "balance_loss_clip": 1.02078724, "balance_loss_mlp": 1.05295563, "epoch": 0.14140989027506387, "flos": 29862236413440.0, "grad_norm": 3.088437769252744, "language_loss": 0.83117527, "learning_rate": 3.871203815778219e-06, "loss": 0.85317779, "num_input_tokens_seen": 50958155, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.078125, "step": 2352, "time_per_iteration": 2.5221211910247803 }, { "auxiliary_loss_clip": 0.01056701, "auxiliary_loss_mlp": 0.01004412, "balance_loss_clip": 1.0019083, "balance_loss_mlp": 1.02253699, "epoch": 0.14147001352773186, "flos": 62079532041600.0, "grad_norm": 0.9094398899301644, "language_loss": 0.62004167, "learning_rate": 3.87106627822478e-06, "loss": 0.64065278, "num_input_tokens_seen": 51020705, "router_z_loss_clip": 0.02502441, "router_z_loss_mlp": 0.34179688, "step": 2353, "time_per_iteration": 3.033170223236084 }, { "auxiliary_loss_clip": 0.01155364, "auxiliary_loss_mlp": 0.01048612, "balance_loss_clip": 1.02982426, "balance_loss_mlp": 1.05156791, "epoch": 0.14153013678039983, "flos": 22017514832640.0, "grad_norm": 1.7155080251739778, "language_loss": 0.87182909, "learning_rate": 3.8709286697203196e-06, "loss": 0.89386892, "num_input_tokens_seen": 51039995, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0390625, "step": 2354, "time_per_iteration": 2.4823877811431885 }, { "auxiliary_loss_clip": 0.01157591, "auxiliary_loss_mlp": 0.01042332, "balance_loss_clip": 1.02287686, "balance_loss_mlp": 1.05097914, "epoch": 0.1415902600330678, "flos": 19720093968000.0, "grad_norm": 2.7608175261149652, "language_loss": 0.7455653, "learning_rate": 3.870790990270057e-06, "loss": 0.76756454, "num_input_tokens_seen": 51059075, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0625, "step": 2355, "time_per_iteration": 2.461642265319824 }, { "auxiliary_loss_clip": 0.01052046, "auxiliary_loss_mlp": 0.01003736, "balance_loss_clip": 1.00123262, "balance_loss_mlp": 1.01768279, "epoch": 0.14165038328573576, "flos": 65900929190400.0, "grad_norm": 0.6836343701425218, "language_loss": 0.51813811, "learning_rate": 3.870653239879212e-06, "loss": 0.53869593, "num_input_tokens_seen": 51120380, "router_z_loss_clip": 0.02502441, "router_z_loss_mlp": 0.34375, "step": 2356, "time_per_iteration": 3.00301456451416 }, { "auxiliary_loss_clip": 0.01157841, "auxiliary_loss_mlp": 0.01055205, "balance_loss_clip": 1.03534484, "balance_loss_mlp": 1.05233157, "epoch": 0.14171050653840372, "flos": 12130158533760.0, "grad_norm": 2.070644254414993, "language_loss": 0.70540017, "learning_rate": 3.8705154185530095e-06, "loss": 0.7275306, "num_input_tokens_seen": 51136950, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0546875, "step": 2357, "time_per_iteration": 2.4219679832458496 }, { "auxiliary_loss_clip": 0.01160091, "auxiliary_loss_mlp": 0.01047659, "balance_loss_clip": 1.02859747, "balance_loss_mlp": 1.0507791, "epoch": 0.1417706297910717, "flos": 20412487509120.0, "grad_norm": 3.622037375422834, "language_loss": 0.81955516, "learning_rate": 3.870377526296674e-06, "loss": 0.84163272, "num_input_tokens_seen": 51155175, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.09375, "step": 2358, "time_per_iteration": 2.444704294204712 }, { "auxiliary_loss_clip": 0.01162843, "auxiliary_loss_mlp": 0.01045652, "balance_loss_clip": 1.02576828, "balance_loss_mlp": 1.05189919, "epoch": 0.14183075304373965, "flos": 22380607463040.0, "grad_norm": 1.9029913862767744, "language_loss": 0.71711469, "learning_rate": 3.870239563115436e-06, "loss": 0.7391997, "num_input_tokens_seen": 51174500, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.109375, "step": 2359, "time_per_iteration": 2.476158618927002 }, { "auxiliary_loss_clip": 0.01157809, "auxiliary_loss_mlp": 0.01045798, "balance_loss_clip": 1.02635527, "balance_loss_mlp": 1.05169368, "epoch": 0.14189087629640765, "flos": 21580913018880.0, "grad_norm": 2.083730195621982, "language_loss": 0.75986457, "learning_rate": 3.870101529014526e-06, "loss": 0.78190064, "num_input_tokens_seen": 51194270, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.0625, "step": 2360, "time_per_iteration": 2.4574875831604004 }, { "auxiliary_loss_clip": 0.0115681, "auxiliary_loss_mlp": 0.01048667, "balance_loss_clip": 1.02815139, "balance_loss_mlp": 1.05238867, "epoch": 0.1419509995490756, "flos": 20008564093440.0, "grad_norm": 2.052443696339941, "language_loss": 0.82032341, "learning_rate": 3.869963423999178e-06, "loss": 0.84237814, "num_input_tokens_seen": 51211850, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.046875, "step": 2361, "time_per_iteration": 2.450551986694336 }, { "auxiliary_loss_clip": 0.01154216, "auxiliary_loss_mlp": 0.01055268, "balance_loss_clip": 1.03577781, "balance_loss_mlp": 1.05019045, "epoch": 0.14201112280174358, "flos": 31941464112000.0, "grad_norm": 1.8510897174218715, "language_loss": 0.74279583, "learning_rate": 3.86982524807463e-06, "loss": 0.76489067, "num_input_tokens_seen": 51233545, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.046875, "step": 2362, "time_per_iteration": 2.538818836212158 }, { "auxiliary_loss_clip": 0.01159531, "auxiliary_loss_mlp": 0.01050239, "balance_loss_clip": 1.03062868, "balance_loss_mlp": 1.05349541, "epoch": 0.14207124605441154, "flos": 41464147582080.0, "grad_norm": 2.0526833178384716, "language_loss": 0.73538238, "learning_rate": 3.869687001246122e-06, "loss": 0.75748003, "num_input_tokens_seen": 51257615, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0625, "step": 2363, "time_per_iteration": 2.6714255809783936 }, { "auxiliary_loss_clip": 0.01157361, "auxiliary_loss_mlp": 0.01044177, "balance_loss_clip": 1.02522218, "balance_loss_mlp": 1.05152833, "epoch": 0.1421313693070795, "flos": 31905086613120.0, "grad_norm": 1.6708163688570843, "language_loss": 0.73197258, "learning_rate": 3.8695486835188946e-06, "loss": 0.75398803, "num_input_tokens_seen": 51279645, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0546875, "step": 2364, "time_per_iteration": 2.544593334197998 }, { "auxiliary_loss_clip": 0.01151988, "auxiliary_loss_mlp": 0.01045022, "balance_loss_clip": 1.02739096, "balance_loss_mlp": 1.04952335, "epoch": 0.14219149255974747, "flos": 26871165031680.0, "grad_norm": 1.8943547417283042, "language_loss": 0.90550983, "learning_rate": 3.869410294898195e-06, "loss": 0.92747992, "num_input_tokens_seen": 51299775, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 1.0234375, "step": 2365, "time_per_iteration": 2.5039222240448 }, { "auxiliary_loss_clip": 0.01156303, "auxiliary_loss_mlp": 0.01049119, "balance_loss_clip": 1.02912772, "balance_loss_mlp": 1.04964042, "epoch": 0.14225161581241544, "flos": 27454426076160.0, "grad_norm": 1.7357033732341667, "language_loss": 0.65465128, "learning_rate": 3.869271835389268e-06, "loss": 0.67670548, "num_input_tokens_seen": 51319430, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.0625, "step": 2366, "time_per_iteration": 2.5162458419799805 }, { "auxiliary_loss_clip": 0.01155563, "auxiliary_loss_mlp": 0.01053296, "balance_loss_clip": 1.03329253, "balance_loss_mlp": 1.05081964, "epoch": 0.14231173906508343, "flos": 10561436881920.0, "grad_norm": 2.0021300867468277, "language_loss": 0.80697572, "learning_rate": 3.8691333049973665e-06, "loss": 0.82906425, "num_input_tokens_seen": 51336045, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.046875, "step": 2367, "time_per_iteration": 2.4258036613464355 }, { "auxiliary_loss_clip": 0.01159415, "auxiliary_loss_mlp": 0.01060202, "balance_loss_clip": 1.03917336, "balance_loss_mlp": 1.05163407, "epoch": 0.1423718623177514, "flos": 28360882719360.0, "grad_norm": 2.522803544715756, "language_loss": 0.82684147, "learning_rate": 3.868994703727742e-06, "loss": 0.84903765, "num_input_tokens_seen": 51357030, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.078125, "step": 2368, "time_per_iteration": 2.5132226943969727 }, { "auxiliary_loss_clip": 0.01158888, "auxiliary_loss_mlp": 0.01055315, "balance_loss_clip": 1.03426266, "balance_loss_mlp": 1.05249929, "epoch": 0.14243198557041936, "flos": 19354235990400.0, "grad_norm": 2.1292503226689945, "language_loss": 0.87104684, "learning_rate": 3.868856031585652e-06, "loss": 0.89318889, "num_input_tokens_seen": 51374890, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.0625, "step": 2369, "time_per_iteration": 2.4436073303222656 }, { "auxiliary_loss_clip": 0.01161925, "auxiliary_loss_mlp": 0.01048956, "balance_loss_clip": 1.0287261, "balance_loss_mlp": 1.05086446, "epoch": 0.14249210882308733, "flos": 28806857982720.0, "grad_norm": 1.4618813927731624, "language_loss": 0.75919098, "learning_rate": 3.868717288576354e-06, "loss": 0.78129977, "num_input_tokens_seen": 51398100, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.109375, "step": 2370, "time_per_iteration": 2.5516583919525146 }, { "auxiliary_loss_clip": 0.01155866, "auxiliary_loss_mlp": 0.01064817, "balance_loss_clip": 1.04402709, "balance_loss_mlp": 1.0495441, "epoch": 0.1425522320757553, "flos": 21835016807040.0, "grad_norm": 1.70735569050791, "language_loss": 0.83434469, "learning_rate": 3.868578474705109e-06, "loss": 0.85655153, "num_input_tokens_seen": 51418745, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.0625, "step": 2371, "time_per_iteration": 2.472449541091919 }, { "auxiliary_loss_clip": 0.01160248, "auxiliary_loss_mlp": 0.01055441, "balance_loss_clip": 1.03583145, "balance_loss_mlp": 1.05242527, "epoch": 0.14261235532842326, "flos": 17311457617920.0, "grad_norm": 2.483134510967119, "language_loss": 0.82812476, "learning_rate": 3.868439589977181e-06, "loss": 0.85028166, "num_input_tokens_seen": 51437455, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.078125, "step": 2372, "time_per_iteration": 2.443099021911621 }, { "auxiliary_loss_clip": 0.01158921, "auxiliary_loss_mlp": 0.0105176, "balance_loss_clip": 1.03205466, "balance_loss_mlp": 1.05230916, "epoch": 0.14267247858109125, "flos": 18806741913600.0, "grad_norm": 2.3730938453643446, "language_loss": 0.84842765, "learning_rate": 3.868300634397836e-06, "loss": 0.87053448, "num_input_tokens_seen": 51455710, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0703125, "step": 2373, "time_per_iteration": 2.4503173828125 }, { "auxiliary_loss_clip": 0.01156739, "auxiliary_loss_mlp": 0.01055623, "balance_loss_clip": 1.03756273, "balance_loss_mlp": 1.05183172, "epoch": 0.14273260183375922, "flos": 11358904682880.0, "grad_norm": 2.0235200955613677, "language_loss": 0.85789758, "learning_rate": 3.8681616079723445e-06, "loss": 0.88002121, "num_input_tokens_seen": 51471270, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.046875, "step": 2374, "time_per_iteration": 2.4505536556243896 }, { "auxiliary_loss_clip": 0.01163895, "auxiliary_loss_mlp": 0.01057766, "balance_loss_clip": 1.03690457, "balance_loss_mlp": 1.05277324, "epoch": 0.14279272508642718, "flos": 27567688636800.0, "grad_norm": 1.6466127026389465, "language_loss": 0.79264194, "learning_rate": 3.868022510705977e-06, "loss": 0.81485856, "num_input_tokens_seen": 51492705, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.109375, "step": 2375, "time_per_iteration": 2.584561824798584 }, { "auxiliary_loss_clip": 0.01159889, "auxiliary_loss_mlp": 0.01055811, "balance_loss_clip": 1.03624821, "balance_loss_mlp": 1.05472505, "epoch": 0.14285284833909515, "flos": 16252559654400.0, "grad_norm": 2.1691882289656523, "language_loss": 0.76692891, "learning_rate": 3.867883342604009e-06, "loss": 0.78908587, "num_input_tokens_seen": 51510780, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.046875, "step": 2376, "time_per_iteration": 2.4230947494506836 }, { "auxiliary_loss_clip": 0.01160659, "auxiliary_loss_mlp": 0.01050586, "balance_loss_clip": 1.03096461, "balance_loss_mlp": 1.05512881, "epoch": 0.1429129715917631, "flos": 19755609540480.0, "grad_norm": 2.0449768749277077, "language_loss": 0.93575156, "learning_rate": 3.867744103671717e-06, "loss": 0.95786393, "num_input_tokens_seen": 51531400, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0546875, "step": 2377, "time_per_iteration": 2.48992657661438 }, { "auxiliary_loss_clip": 0.01158428, "auxiliary_loss_mlp": 0.01050616, "balance_loss_clip": 1.0290277, "balance_loss_mlp": 1.05144179, "epoch": 0.14297309484443108, "flos": 21137092571520.0, "grad_norm": 1.751017877003418, "language_loss": 0.91414905, "learning_rate": 3.867604793914382e-06, "loss": 0.93623948, "num_input_tokens_seen": 51548215, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.0703125, "step": 2378, "time_per_iteration": 2.436847686767578 }, { "auxiliary_loss_clip": 0.01163529, "auxiliary_loss_mlp": 0.01050681, "balance_loss_clip": 1.0298903, "balance_loss_mlp": 1.05361581, "epoch": 0.14303321809709904, "flos": 23586667447680.0, "grad_norm": 1.7189596831419847, "language_loss": 0.73645782, "learning_rate": 3.8674654133372864e-06, "loss": 0.75859988, "num_input_tokens_seen": 51566820, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.1015625, "step": 2379, "time_per_iteration": 2.491917371749878 }, { "auxiliary_loss_clip": 0.01158904, "auxiliary_loss_mlp": 0.01055606, "balance_loss_clip": 1.03517318, "balance_loss_mlp": 1.05185175, "epoch": 0.14309334134976703, "flos": 15888281875200.0, "grad_norm": 1.8769268898844094, "language_loss": 0.78589129, "learning_rate": 3.867325961945714e-06, "loss": 0.80803633, "num_input_tokens_seen": 51585075, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.0703125, "step": 2380, "time_per_iteration": 2.4261155128479004 }, { "auxiliary_loss_clip": 0.01165217, "auxiliary_loss_mlp": 0.01051819, "balance_loss_clip": 1.0316968, "balance_loss_mlp": 1.05708098, "epoch": 0.143153464602435, "flos": 16325601960960.0, "grad_norm": 2.179865833073378, "language_loss": 0.87871611, "learning_rate": 3.867186439744955e-06, "loss": 0.90088648, "num_input_tokens_seen": 51603185, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.078125, "step": 2381, "time_per_iteration": 2.4425246715545654 }, { "auxiliary_loss_clip": 0.01160031, "auxiliary_loss_mlp": 0.01050395, "balance_loss_clip": 1.03031993, "balance_loss_mlp": 1.05536175, "epoch": 0.14321358785510296, "flos": 17092079303040.0, "grad_norm": 2.171405795226405, "language_loss": 0.76495773, "learning_rate": 3.867046846740299e-06, "loss": 0.78706193, "num_input_tokens_seen": 51620880, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.046875, "step": 2382, "time_per_iteration": 2.419217824935913 }, { "auxiliary_loss_clip": 0.01158726, "auxiliary_loss_mlp": 0.01049377, "balance_loss_clip": 1.02942169, "balance_loss_mlp": 1.0514046, "epoch": 0.14327371110777093, "flos": 26322916769280.0, "grad_norm": 3.005088139652753, "language_loss": 0.76746851, "learning_rate": 3.866907182937039e-06, "loss": 0.78954959, "num_input_tokens_seen": 51640170, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.078125, "step": 2383, "time_per_iteration": 2.5159544944763184 }, { "auxiliary_loss_clip": 0.01161485, "auxiliary_loss_mlp": 0.01050002, "balance_loss_clip": 1.02803159, "balance_loss_mlp": 1.05323887, "epoch": 0.1433338343604389, "flos": 18076462502400.0, "grad_norm": 2.1487500424253367, "language_loss": 0.87804234, "learning_rate": 3.866767448340471e-06, "loss": 0.90015715, "num_input_tokens_seen": 51656580, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.078125, "step": 2384, "time_per_iteration": 2.43546986579895 }, { "auxiliary_loss_clip": 0.01169404, "auxiliary_loss_mlp": 0.01049843, "balance_loss_clip": 1.02817082, "balance_loss_mlp": 1.05768406, "epoch": 0.14339395761310686, "flos": 15522783033600.0, "grad_norm": 2.2687232241043938, "language_loss": 0.80434078, "learning_rate": 3.866627642955895e-06, "loss": 0.8265332, "num_input_tokens_seen": 51674645, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.1171875, "step": 2385, "time_per_iteration": 2.4487626552581787 }, { "auxiliary_loss_clip": 0.01159589, "auxiliary_loss_mlp": 0.01042659, "balance_loss_clip": 1.02296519, "balance_loss_mlp": 1.05240285, "epoch": 0.14345408086577485, "flos": 28548767784960.0, "grad_norm": 1.6499799454438115, "language_loss": 0.75176561, "learning_rate": 3.866487766788612e-06, "loss": 0.77378809, "num_input_tokens_seen": 51695770, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0703125, "step": 2386, "time_per_iteration": 2.5149831771850586 }, { "auxiliary_loss_clip": 0.01159774, "auxiliary_loss_mlp": 0.01046981, "balance_loss_clip": 1.02714467, "balance_loss_mlp": 1.05325544, "epoch": 0.14351420411844282, "flos": 20230061310720.0, "grad_norm": 1.9896046677292822, "language_loss": 0.78631943, "learning_rate": 3.866347819843925e-06, "loss": 0.80838704, "num_input_tokens_seen": 51714165, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0625, "step": 2387, "time_per_iteration": 2.493290901184082 }, { "auxiliary_loss_clip": 0.01157142, "auxiliary_loss_mlp": 0.01054551, "balance_loss_clip": 1.03328395, "balance_loss_mlp": 1.05130291, "epoch": 0.14357432737111078, "flos": 19865029345920.0, "grad_norm": 2.021566993586691, "language_loss": 0.82450885, "learning_rate": 3.866207802127143e-06, "loss": 0.8466258, "num_input_tokens_seen": 51734440, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.0546875, "step": 2388, "time_per_iteration": 4.039836883544922 }, { "auxiliary_loss_clip": 0.01161214, "auxiliary_loss_mlp": 0.0104504, "balance_loss_clip": 1.02678943, "balance_loss_mlp": 1.05474472, "epoch": 0.14363445062377875, "flos": 28256814040320.0, "grad_norm": 1.8882430997608959, "language_loss": 0.81927454, "learning_rate": 3.866067713643573e-06, "loss": 0.84133708, "num_input_tokens_seen": 51753730, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.0703125, "step": 2389, "time_per_iteration": 2.5056724548339844 }, { "auxiliary_loss_clip": 0.01161689, "auxiliary_loss_mlp": 0.01050356, "balance_loss_clip": 1.02950597, "balance_loss_mlp": 1.05161929, "epoch": 0.1436945738764467, "flos": 18186672407040.0, "grad_norm": 2.7331394084669665, "language_loss": 0.83349919, "learning_rate": 3.8659275543985285e-06, "loss": 0.85561967, "num_input_tokens_seen": 51771195, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.1015625, "step": 2390, "time_per_iteration": 5.44156551361084 }, { "auxiliary_loss_clip": 0.01158291, "auxiliary_loss_mlp": 0.01049416, "balance_loss_clip": 1.0298655, "balance_loss_mlp": 1.0507015, "epoch": 0.14375469712911468, "flos": 27307910499840.0, "grad_norm": 1.6442310538483311, "language_loss": 0.75127804, "learning_rate": 3.865787324397324e-06, "loss": 0.77335513, "num_input_tokens_seen": 51792290, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.078125, "step": 2391, "time_per_iteration": 2.5446810722351074 }, { "auxiliary_loss_clip": 0.01053865, "auxiliary_loss_mlp": 0.01024436, "balance_loss_clip": 1.021909, "balance_loss_mlp": 1.01893508, "epoch": 0.14381482038178264, "flos": 56891445287040.0, "grad_norm": 0.9132791215929543, "language_loss": 0.61763084, "learning_rate": 3.865647023645277e-06, "loss": 0.63841391, "num_input_tokens_seen": 51843675, "router_z_loss_clip": 0.02526855, "router_z_loss_mlp": 0.34765625, "step": 2392, "time_per_iteration": 2.9322474002838135 }, { "auxiliary_loss_clip": 0.01162393, "auxiliary_loss_mlp": 0.01053574, "balance_loss_clip": 1.03202105, "balance_loss_mlp": 1.05090094, "epoch": 0.14387494363445064, "flos": 14282177143680.0, "grad_norm": 2.0884803397543004, "language_loss": 0.76965523, "learning_rate": 3.865506652147709e-06, "loss": 0.79181492, "num_input_tokens_seen": 51860285, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.1171875, "step": 2393, "time_per_iteration": 2.4345154762268066 }, { "auxiliary_loss_clip": 0.01160362, "auxiliary_loss_mlp": 0.01048646, "balance_loss_clip": 1.02923894, "balance_loss_mlp": 1.05305028, "epoch": 0.1439350668871186, "flos": 26761493831040.0, "grad_norm": 2.0936044315993483, "language_loss": 0.76847434, "learning_rate": 3.865366209909941e-06, "loss": 0.79056448, "num_input_tokens_seen": 51880105, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.078125, "step": 2394, "time_per_iteration": 2.503340721130371 }, { "auxiliary_loss_clip": 0.0115509, "auxiliary_loss_mlp": 0.01047083, "balance_loss_clip": 1.02750862, "balance_loss_mlp": 1.04901743, "epoch": 0.14399519013978657, "flos": 40700040537600.0, "grad_norm": 1.5928332209453713, "language_loss": 0.8622669, "learning_rate": 3.8652256969372994e-06, "loss": 0.88428861, "num_input_tokens_seen": 51905175, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0625, "step": 2395, "time_per_iteration": 2.6276938915252686 }, { "auxiliary_loss_clip": 0.01154873, "auxiliary_loss_mlp": 0.01048042, "balance_loss_clip": 1.02881348, "balance_loss_mlp": 1.05139685, "epoch": 0.14405531339245453, "flos": 20557530627840.0, "grad_norm": 1.571988839354263, "language_loss": 0.8317908, "learning_rate": 3.865085113235113e-06, "loss": 0.85381997, "num_input_tokens_seen": 51924490, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.03125, "step": 2396, "time_per_iteration": 2.442188024520874 }, { "auxiliary_loss_clip": 0.01152928, "auxiliary_loss_mlp": 0.01044352, "balance_loss_clip": 1.02569628, "balance_loss_mlp": 1.04967654, "epoch": 0.1441154366451225, "flos": 19572931946880.0, "grad_norm": 3.1566292432416265, "language_loss": 0.82509232, "learning_rate": 3.864944458808712e-06, "loss": 0.84706509, "num_input_tokens_seen": 51940490, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.03125, "step": 2397, "time_per_iteration": 2.4556267261505127 }, { "auxiliary_loss_clip": 0.01157645, "auxiliary_loss_mlp": 0.01045633, "balance_loss_clip": 1.02604723, "balance_loss_mlp": 1.05054307, "epoch": 0.14417555989779046, "flos": 18515721922560.0, "grad_norm": 1.6162391139696888, "language_loss": 0.79900944, "learning_rate": 3.86480373366343e-06, "loss": 0.82104218, "num_input_tokens_seen": 51957910, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0703125, "step": 2398, "time_per_iteration": 2.4286444187164307 }, { "auxiliary_loss_clip": 0.01156045, "auxiliary_loss_mlp": 0.01051703, "balance_loss_clip": 1.03239083, "balance_loss_mlp": 1.05211806, "epoch": 0.14423568315045843, "flos": 26031681296640.0, "grad_norm": 2.991800614621666, "language_loss": 0.65163124, "learning_rate": 3.864662937804603e-06, "loss": 0.67370868, "num_input_tokens_seen": 51978010, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0390625, "step": 2399, "time_per_iteration": 2.48531174659729 }, { "auxiliary_loss_clip": 0.011555, "auxiliary_loss_mlp": 0.01045672, "balance_loss_clip": 1.02531135, "balance_loss_mlp": 1.05070794, "epoch": 0.14429580640312642, "flos": 21288743792640.0, "grad_norm": 1.714130005833975, "language_loss": 0.82131159, "learning_rate": 3.864522071237571e-06, "loss": 0.84332335, "num_input_tokens_seen": 51998515, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.046875, "step": 2400, "time_per_iteration": 2.4627997875213623 }, { "auxiliary_loss_clip": 0.01159894, "auxiliary_loss_mlp": 0.01050667, "balance_loss_clip": 1.02955484, "balance_loss_mlp": 1.05234027, "epoch": 0.14435592965579438, "flos": 25627865621760.0, "grad_norm": 1.588397447869034, "language_loss": 0.74437702, "learning_rate": 3.864381133967676e-06, "loss": 0.76648259, "num_input_tokens_seen": 52019270, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.078125, "step": 2401, "time_per_iteration": 2.513181447982788 }, { "auxiliary_loss_clip": 0.01154631, "auxiliary_loss_mlp": 0.01044967, "balance_loss_clip": 1.02581048, "balance_loss_mlp": 1.05014086, "epoch": 0.14441605290846235, "flos": 22965053656320.0, "grad_norm": 1.5345407243723457, "language_loss": 0.80962503, "learning_rate": 3.86424012600026e-06, "loss": 0.83162105, "num_input_tokens_seen": 52039315, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.046875, "step": 2402, "time_per_iteration": 2.4623055458068848 }, { "auxiliary_loss_clip": 0.01156445, "auxiliary_loss_mlp": 0.01044177, "balance_loss_clip": 1.02420902, "balance_loss_mlp": 1.05130064, "epoch": 0.14447617616113032, "flos": 17347655548800.0, "grad_norm": 2.1389237038515367, "language_loss": 0.84130228, "learning_rate": 3.864099047340673e-06, "loss": 0.86330849, "num_input_tokens_seen": 52056555, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0546875, "step": 2403, "time_per_iteration": 2.4362738132476807 }, { "auxiliary_loss_clip": 0.01153804, "auxiliary_loss_mlp": 0.01052673, "balance_loss_clip": 1.03190649, "balance_loss_mlp": 1.04824686, "epoch": 0.14453629941379828, "flos": 24060185464320.0, "grad_norm": 1.5452174462279977, "language_loss": 0.70249796, "learning_rate": 3.863957897994262e-06, "loss": 0.72456276, "num_input_tokens_seen": 52075800, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.0546875, "step": 2404, "time_per_iteration": 2.469815254211426 }, { "auxiliary_loss_clip": 0.01151128, "auxiliary_loss_mlp": 0.01046738, "balance_loss_clip": 1.02818882, "balance_loss_mlp": 1.04814291, "epoch": 0.14459642266646625, "flos": 14429554646400.0, "grad_norm": 2.0263260341936693, "language_loss": 0.73129863, "learning_rate": 3.863816677966381e-06, "loss": 0.7532773, "num_input_tokens_seen": 52092585, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.03125, "step": 2405, "time_per_iteration": 2.4440882205963135 }, { "auxiliary_loss_clip": 0.01153005, "auxiliary_loss_mlp": 0.0104713, "balance_loss_clip": 1.02790141, "balance_loss_mlp": 1.05013406, "epoch": 0.14465654591913424, "flos": 9867032179200.0, "grad_norm": 2.221404306636624, "language_loss": 0.73395634, "learning_rate": 3.863675387262386e-06, "loss": 0.75595772, "num_input_tokens_seen": 52108990, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.03125, "step": 2406, "time_per_iteration": 2.420431137084961 }, { "auxiliary_loss_clip": 0.0115375, "auxiliary_loss_mlp": 0.01053161, "balance_loss_clip": 1.03214478, "balance_loss_mlp": 1.04872036, "epoch": 0.1447166691718022, "flos": 24972926987520.0, "grad_norm": 2.1839583935873526, "language_loss": 0.75539815, "learning_rate": 3.8635340258876325e-06, "loss": 0.77746731, "num_input_tokens_seen": 52125385, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.046875, "step": 2407, "time_per_iteration": 2.509359359741211 }, { "auxiliary_loss_clip": 0.01150975, "auxiliary_loss_mlp": 0.01045424, "balance_loss_clip": 1.02660096, "balance_loss_mlp": 1.04742897, "epoch": 0.14477679242447017, "flos": 21908023200000.0, "grad_norm": 1.5347149746465951, "language_loss": 0.79091477, "learning_rate": 3.8633925938474826e-06, "loss": 0.81287873, "num_input_tokens_seen": 52144985, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.03125, "step": 2408, "time_per_iteration": 2.4786264896392822 }, { "auxiliary_loss_clip": 0.01156126, "auxiliary_loss_mlp": 0.01054837, "balance_loss_clip": 1.03469098, "balance_loss_mlp": 1.0514797, "epoch": 0.14483691567713813, "flos": 20740746925440.0, "grad_norm": 2.7029884245391456, "language_loss": 0.82332337, "learning_rate": 3.863251091147299e-06, "loss": 0.845433, "num_input_tokens_seen": 52163885, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.046875, "step": 2409, "time_per_iteration": 2.4724483489990234 }, { "auxiliary_loss_clip": 0.01156559, "auxiliary_loss_mlp": 0.01058763, "balance_loss_clip": 1.03943884, "balance_loss_mlp": 1.05192614, "epoch": 0.1448970389298061, "flos": 35407705536000.0, "grad_norm": 1.884535904485516, "language_loss": 0.75348914, "learning_rate": 3.863109517792446e-06, "loss": 0.77564234, "num_input_tokens_seen": 52184325, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.046875, "step": 2410, "time_per_iteration": 2.566131830215454 }, { "auxiliary_loss_clip": 0.01154516, "auxiliary_loss_mlp": 0.01049334, "balance_loss_clip": 1.03066587, "balance_loss_mlp": 1.05092573, "epoch": 0.14495716218247406, "flos": 15414368808960.0, "grad_norm": 1.898752055940094, "language_loss": 0.81240368, "learning_rate": 3.8629678737882945e-06, "loss": 0.83444226, "num_input_tokens_seen": 52202740, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.03125, "step": 2411, "time_per_iteration": 2.4483823776245117 }, { "auxiliary_loss_clip": 0.01155108, "auxiliary_loss_mlp": 0.01056305, "balance_loss_clip": 1.03767264, "balance_loss_mlp": 1.05245411, "epoch": 0.14501728543514203, "flos": 33693222493440.0, "grad_norm": 3.951882284417422, "language_loss": 0.7023561, "learning_rate": 3.862826159140214e-06, "loss": 0.7244702, "num_input_tokens_seen": 52223100, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.0234375, "step": 2412, "time_per_iteration": 2.566767692565918 }, { "auxiliary_loss_clip": 0.01154618, "auxiliary_loss_mlp": 0.0104603, "balance_loss_clip": 1.02703965, "balance_loss_mlp": 1.05142713, "epoch": 0.14507740868781002, "flos": 15596112648960.0, "grad_norm": 2.9788246409146937, "language_loss": 0.76991117, "learning_rate": 3.862684373853579e-06, "loss": 0.79191756, "num_input_tokens_seen": 52239690, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.03125, "step": 2413, "time_per_iteration": 2.5195162296295166 }, { "auxiliary_loss_clip": 0.01059902, "auxiliary_loss_mlp": 0.01010111, "balance_loss_clip": 1.00781059, "balance_loss_mlp": 1.02472639, "epoch": 0.145137531940478, "flos": 66675343438080.0, "grad_norm": 0.9094080693438833, "language_loss": 0.58933342, "learning_rate": 3.8625425179337656e-06, "loss": 0.61003351, "num_input_tokens_seen": 52296705, "router_z_loss_clip": 0.02294922, "router_z_loss_mlp": 0.3515625, "step": 2414, "time_per_iteration": 3.005634069442749 }, { "auxiliary_loss_clip": 0.010583, "auxiliary_loss_mlp": 0.0100589, "balance_loss_clip": 1.00355339, "balance_loss_mlp": 1.02299058, "epoch": 0.14519765519314595, "flos": 67521578929920.0, "grad_norm": 0.8745079733215442, "language_loss": 0.6222769, "learning_rate": 3.862400591386154e-06, "loss": 0.64291871, "num_input_tokens_seen": 52361830, "router_z_loss_clip": 0.02331543, "router_z_loss_mlp": 0.3515625, "step": 2415, "time_per_iteration": 3.0913257598876953 }, { "auxiliary_loss_clip": 0.01152384, "auxiliary_loss_mlp": 0.01046482, "balance_loss_clip": 1.02658594, "balance_loss_mlp": 1.04877257, "epoch": 0.14525777844581392, "flos": 17198913329280.0, "grad_norm": 1.915191533258364, "language_loss": 0.72517562, "learning_rate": 3.8622585942161245e-06, "loss": 0.74716425, "num_input_tokens_seen": 52379420, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0390625, "step": 2416, "time_per_iteration": 2.443179130554199 }, { "auxiliary_loss_clip": 0.01056763, "auxiliary_loss_mlp": 0.01002959, "balance_loss_clip": 1.0004797, "balance_loss_mlp": 1.02180779, "epoch": 0.14531790169848188, "flos": 65404609015680.0, "grad_norm": 0.7151675041097094, "language_loss": 0.60386705, "learning_rate": 3.8621165264290635e-06, "loss": 0.62446427, "num_input_tokens_seen": 52446290, "router_z_loss_clip": 0.02478027, "router_z_loss_mlp": 0.34960938, "step": 2417, "time_per_iteration": 3.1222407817840576 }, { "auxiliary_loss_clip": 0.01156865, "auxiliary_loss_mlp": 0.01053979, "balance_loss_clip": 1.03415489, "balance_loss_mlp": 1.04987335, "epoch": 0.14537802495114985, "flos": 32562467372160.0, "grad_norm": 2.899851937052247, "language_loss": 0.78860456, "learning_rate": 3.861974388030356e-06, "loss": 0.81071305, "num_input_tokens_seen": 52467295, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0703125, "step": 2418, "time_per_iteration": 2.547027349472046 }, { "auxiliary_loss_clip": 0.01153034, "auxiliary_loss_mlp": 0.01047599, "balance_loss_clip": 1.02876425, "balance_loss_mlp": 1.0512315, "epoch": 0.1454381482038178, "flos": 20226685432320.0, "grad_norm": 1.9669025623614182, "language_loss": 0.7155, "learning_rate": 3.861832179025394e-06, "loss": 0.73750639, "num_input_tokens_seen": 52487295, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.015625, "step": 2419, "time_per_iteration": 2.441545009613037 }, { "auxiliary_loss_clip": 0.01153401, "auxiliary_loss_mlp": 0.01046832, "balance_loss_clip": 1.02719843, "balance_loss_mlp": 1.04995203, "epoch": 0.1454982714564858, "flos": 22893124671360.0, "grad_norm": 7.223238051565814, "language_loss": 0.9039855, "learning_rate": 3.861689899419569e-06, "loss": 0.92598784, "num_input_tokens_seen": 52504220, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.03125, "step": 2420, "time_per_iteration": 2.4690303802490234 }, { "auxiliary_loss_clip": 0.0115439, "auxiliary_loss_mlp": 0.01052969, "balance_loss_clip": 1.03384769, "balance_loss_mlp": 1.05072713, "epoch": 0.14555839470915377, "flos": 20229845829120.0, "grad_norm": 2.2155854081709325, "language_loss": 0.8285023, "learning_rate": 3.861547549218276e-06, "loss": 0.85057592, "num_input_tokens_seen": 52521900, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0390625, "step": 2421, "time_per_iteration": 2.4555609226226807 }, { "auxiliary_loss_clip": 0.01155004, "auxiliary_loss_mlp": 0.01050846, "balance_loss_clip": 1.03146291, "balance_loss_mlp": 1.04955745, "epoch": 0.14561851796182174, "flos": 22236282616320.0, "grad_norm": 1.5205587383838566, "language_loss": 0.81626558, "learning_rate": 3.861405128426914e-06, "loss": 0.83832407, "num_input_tokens_seen": 52540495, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.0546875, "step": 2422, "time_per_iteration": 2.478883743286133 }, { "auxiliary_loss_clip": 0.01054718, "auxiliary_loss_mlp": 0.0102732, "balance_loss_clip": 1.02487648, "balance_loss_mlp": 1.02035379, "epoch": 0.1456786412144897, "flos": 52636786289280.0, "grad_norm": 0.9186233493917866, "language_loss": 0.63373846, "learning_rate": 3.861262637050883e-06, "loss": 0.65455878, "num_input_tokens_seen": 52603305, "router_z_loss_clip": 0.02441406, "router_z_loss_mlp": 0.34375, "step": 2423, "time_per_iteration": 3.0990254878997803 }, { "auxiliary_loss_clip": 0.01156024, "auxiliary_loss_mlp": 0.01045866, "balance_loss_clip": 1.02767515, "balance_loss_mlp": 1.0528245, "epoch": 0.14573876446715767, "flos": 23221671396480.0, "grad_norm": 1.5761975741038965, "language_loss": 0.82565498, "learning_rate": 3.861120075095585e-06, "loss": 0.84767383, "num_input_tokens_seen": 52623435, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.03125, "step": 2424, "time_per_iteration": 2.5139689445495605 }, { "auxiliary_loss_clip": 0.01153981, "auxiliary_loss_mlp": 0.01048992, "balance_loss_clip": 1.02975178, "balance_loss_mlp": 1.05113125, "epoch": 0.14579888771982563, "flos": 18114384286080.0, "grad_norm": 2.3757579630144354, "language_loss": 0.79171598, "learning_rate": 3.860977442566429e-06, "loss": 0.81374568, "num_input_tokens_seen": 52642255, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.03125, "step": 2425, "time_per_iteration": 2.4677608013153076 }, { "auxiliary_loss_clip": 0.0115755, "auxiliary_loss_mlp": 0.01048976, "balance_loss_clip": 1.02974713, "balance_loss_mlp": 1.05350542, "epoch": 0.14585901097249362, "flos": 23001107932800.0, "grad_norm": 2.3248520894763676, "language_loss": 0.8342427, "learning_rate": 3.860834739468821e-06, "loss": 0.85630798, "num_input_tokens_seen": 52658700, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.046875, "step": 2426, "time_per_iteration": 2.472275972366333 }, { "auxiliary_loss_clip": 0.01156633, "auxiliary_loss_mlp": 0.01050169, "balance_loss_clip": 1.03108346, "balance_loss_mlp": 1.05357158, "epoch": 0.1459191342251616, "flos": 21908669644800.0, "grad_norm": 2.032351077705466, "language_loss": 0.87449318, "learning_rate": 3.860691965808173e-06, "loss": 0.89656121, "num_input_tokens_seen": 52678140, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.03125, "step": 2427, "time_per_iteration": 2.4648444652557373 }, { "auxiliary_loss_clip": 0.01158555, "auxiliary_loss_mlp": 0.01046951, "balance_loss_clip": 1.02669728, "balance_loss_mlp": 1.05075121, "epoch": 0.14597925747782955, "flos": 14975504438400.0, "grad_norm": 1.909154541264451, "language_loss": 0.67020214, "learning_rate": 3.8605491215899e-06, "loss": 0.69225717, "num_input_tokens_seen": 52696825, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.078125, "step": 2428, "time_per_iteration": 2.448857545852661 }, { "auxiliary_loss_clip": 0.01154249, "auxiliary_loss_mlp": 0.01053613, "balance_loss_clip": 1.03397918, "balance_loss_mlp": 1.05034065, "epoch": 0.14603938073049752, "flos": 21068898600960.0, "grad_norm": 1.6859999781575852, "language_loss": 0.83327377, "learning_rate": 3.860406206819417e-06, "loss": 0.8553524, "num_input_tokens_seen": 52715125, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0390625, "step": 2429, "time_per_iteration": 5.389102935791016 }, { "auxiliary_loss_clip": 0.01153132, "auxiliary_loss_mlp": 0.01048194, "balance_loss_clip": 1.02953792, "balance_loss_mlp": 1.04838789, "epoch": 0.14609950398316549, "flos": 19864777950720.0, "grad_norm": 1.6535366838533854, "language_loss": 0.78937513, "learning_rate": 3.860263221502145e-06, "loss": 0.81138843, "num_input_tokens_seen": 52734015, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.046875, "step": 2430, "time_per_iteration": 2.447310447692871 }, { "auxiliary_loss_clip": 0.01158481, "auxiliary_loss_mlp": 0.01051408, "balance_loss_clip": 1.03251362, "balance_loss_mlp": 1.05260801, "epoch": 0.14615962723583345, "flos": 22418852469120.0, "grad_norm": 2.39231620272722, "language_loss": 0.830468, "learning_rate": 3.860120165643504e-06, "loss": 0.8525669, "num_input_tokens_seen": 52753025, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0546875, "step": 2431, "time_per_iteration": 3.852787494659424 }, { "auxiliary_loss_clip": 0.01158304, "auxiliary_loss_mlp": 0.01052184, "balance_loss_clip": 1.03137028, "balance_loss_mlp": 1.0505383, "epoch": 0.14621975048850142, "flos": 22346241125760.0, "grad_norm": 1.7303512928643823, "language_loss": 0.79160148, "learning_rate": 3.859977039248921e-06, "loss": 0.81370634, "num_input_tokens_seen": 52773420, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.078125, "step": 2432, "time_per_iteration": 3.926273822784424 }, { "auxiliary_loss_clip": 0.01155235, "auxiliary_loss_mlp": 0.01053852, "balance_loss_clip": 1.03339553, "balance_loss_mlp": 1.05093062, "epoch": 0.1462798737411694, "flos": 24389163152640.0, "grad_norm": 1.8823089121054808, "language_loss": 0.7992, "learning_rate": 3.859833842323822e-06, "loss": 0.82129085, "num_input_tokens_seen": 52792870, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.046875, "step": 2433, "time_per_iteration": 2.48775053024292 }, { "auxiliary_loss_clip": 0.01153694, "auxiliary_loss_mlp": 0.01050043, "balance_loss_clip": 1.02975404, "balance_loss_mlp": 1.05250096, "epoch": 0.14633999699383737, "flos": 19244672530560.0, "grad_norm": 2.0353678926416476, "language_loss": 0.78420913, "learning_rate": 3.859690574873638e-06, "loss": 0.80624652, "num_input_tokens_seen": 52811615, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.0078125, "step": 2434, "time_per_iteration": 2.4619436264038086 }, { "auxiliary_loss_clip": 0.01049727, "auxiliary_loss_mlp": 0.01007267, "balance_loss_clip": 1.00508523, "balance_loss_mlp": 1.01530337, "epoch": 0.14640012024650534, "flos": 62660638270080.0, "grad_norm": 0.8413307813496385, "language_loss": 0.58489382, "learning_rate": 3.8595472369038e-06, "loss": 0.60546374, "num_input_tokens_seen": 52873230, "router_z_loss_clip": 0.02185059, "router_z_loss_mlp": 0.34375, "step": 2435, "time_per_iteration": 3.0500686168670654 }, { "auxiliary_loss_clip": 0.01149801, "auxiliary_loss_mlp": 0.01047363, "balance_loss_clip": 1.02890968, "balance_loss_mlp": 1.04879367, "epoch": 0.1464602434991733, "flos": 12276243146880.0, "grad_norm": 2.0070667289943582, "language_loss": 0.88768917, "learning_rate": 3.859403828419744e-06, "loss": 0.90966082, "num_input_tokens_seen": 52889325, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0078125, "step": 2436, "time_per_iteration": 2.4401838779449463 }, { "auxiliary_loss_clip": 0.011576, "auxiliary_loss_mlp": 0.01049336, "balance_loss_clip": 1.02992868, "balance_loss_mlp": 1.05109537, "epoch": 0.14652036675184127, "flos": 20922311197440.0, "grad_norm": 2.2045366329527547, "language_loss": 0.74433422, "learning_rate": 3.85926034942691e-06, "loss": 0.76640362, "num_input_tokens_seen": 52909705, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0625, "step": 2437, "time_per_iteration": 2.4858272075653076 }, { "auxiliary_loss_clip": 0.01156418, "auxiliary_loss_mlp": 0.0105014, "balance_loss_clip": 1.02887321, "balance_loss_mlp": 1.05018544, "epoch": 0.14658049000450923, "flos": 27703681528320.0, "grad_norm": 1.989122849015116, "language_loss": 0.73379534, "learning_rate": 3.859116799930736e-06, "loss": 0.75586092, "num_input_tokens_seen": 52930300, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.0625, "step": 2438, "time_per_iteration": 2.506373405456543 }, { "auxiliary_loss_clip": 0.01156092, "auxiliary_loss_mlp": 0.01038649, "balance_loss_clip": 1.02054119, "balance_loss_mlp": 1.05365014, "epoch": 0.14664061325717723, "flos": 24936513575040.0, "grad_norm": 2.1260754169184204, "language_loss": 0.74333876, "learning_rate": 3.858973179936668e-06, "loss": 0.76528621, "num_input_tokens_seen": 52949955, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0234375, "step": 2439, "time_per_iteration": 2.491363763809204 }, { "auxiliary_loss_clip": 0.01155638, "auxiliary_loss_mlp": 0.01046717, "balance_loss_clip": 1.02653503, "balance_loss_mlp": 1.05183959, "epoch": 0.1467007365098452, "flos": 40297661406720.0, "grad_norm": 1.8337559725801031, "language_loss": 0.74496174, "learning_rate": 3.85882948945015e-06, "loss": 0.7669853, "num_input_tokens_seen": 52972905, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0390625, "step": 2440, "time_per_iteration": 2.652282476425171 }, { "auxiliary_loss_clip": 0.0115015, "auxiliary_loss_mlp": 0.01048448, "balance_loss_clip": 1.02945852, "balance_loss_mlp": 1.04897594, "epoch": 0.14676085976251316, "flos": 26541074021760.0, "grad_norm": 1.5837814946803592, "language_loss": 0.83006603, "learning_rate": 3.85868572847663e-06, "loss": 0.85205197, "num_input_tokens_seen": 52994850, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.015625, "step": 2441, "time_per_iteration": 2.5305135250091553 }, { "auxiliary_loss_clip": 0.01160694, "auxiliary_loss_mlp": 0.01045298, "balance_loss_clip": 1.02474618, "balance_loss_mlp": 1.05066204, "epoch": 0.14682098301518112, "flos": 23550110380800.0, "grad_norm": 5.074976974777142, "language_loss": 0.7164768, "learning_rate": 3.858541897021563e-06, "loss": 0.73853672, "num_input_tokens_seen": 53014740, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1015625, "step": 2442, "time_per_iteration": 2.5035393238067627 }, { "auxiliary_loss_clip": 0.01159412, "auxiliary_loss_mlp": 0.01042178, "balance_loss_clip": 1.02165067, "balance_loss_mlp": 1.05001855, "epoch": 0.1468811062678491, "flos": 11651073909120.0, "grad_norm": 2.80827794434957, "language_loss": 0.80993795, "learning_rate": 3.8583979950904e-06, "loss": 0.83195382, "num_input_tokens_seen": 53029780, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.09375, "step": 2443, "time_per_iteration": 2.4386672973632812 }, { "auxiliary_loss_clip": 0.01156041, "auxiliary_loss_mlp": 0.01048292, "balance_loss_clip": 1.0282532, "balance_loss_mlp": 1.05173993, "epoch": 0.14694122952051705, "flos": 23002616304000.0, "grad_norm": 1.6705081407149092, "language_loss": 0.82940567, "learning_rate": 3.858254022688599e-06, "loss": 0.85144901, "num_input_tokens_seen": 53048620, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.046875, "step": 2444, "time_per_iteration": 2.484633207321167 }, { "auxiliary_loss_clip": 0.01156755, "auxiliary_loss_mlp": 0.01050986, "balance_loss_clip": 1.03180504, "balance_loss_mlp": 1.05064893, "epoch": 0.14700135277318502, "flos": 26502972670080.0, "grad_norm": 1.8052728454020046, "language_loss": 0.70826364, "learning_rate": 3.85810997982162e-06, "loss": 0.73034108, "num_input_tokens_seen": 53070055, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0625, "step": 2445, "time_per_iteration": 2.4902288913726807 }, { "auxiliary_loss_clip": 0.01046897, "auxiliary_loss_mlp": 0.0100933, "balance_loss_clip": 1.0070771, "balance_loss_mlp": 1.01271582, "epoch": 0.147061476025853, "flos": 59449434387840.0, "grad_norm": 0.8270730830952739, "language_loss": 0.63183415, "learning_rate": 3.857965866494923e-06, "loss": 0.65239638, "num_input_tokens_seen": 53126945, "router_z_loss_clip": 0.02258301, "router_z_loss_mlp": 0.34179688, "step": 2446, "time_per_iteration": 2.986478567123413 }, { "auxiliary_loss_clip": 0.0115917, "auxiliary_loss_mlp": 0.01042079, "balance_loss_clip": 1.02227807, "balance_loss_mlp": 1.05331635, "epoch": 0.14712159927852098, "flos": 28330897841280.0, "grad_norm": 5.369142936775846, "language_loss": 0.74552, "learning_rate": 3.857821682713975e-06, "loss": 0.76753247, "num_input_tokens_seen": 53149130, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0625, "step": 2447, "time_per_iteration": 2.5437090396881104 }, { "auxiliary_loss_clip": 0.01155995, "auxiliary_loss_mlp": 0.01041593, "balance_loss_clip": 1.02263856, "balance_loss_mlp": 1.05153823, "epoch": 0.14718172253118894, "flos": 27089825074560.0, "grad_norm": 1.8483913295032883, "language_loss": 0.85378224, "learning_rate": 3.857677428484242e-06, "loss": 0.87575811, "num_input_tokens_seen": 53167120, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.046875, "step": 2448, "time_per_iteration": 2.5033373832702637 }, { "auxiliary_loss_clip": 0.01046602, "auxiliary_loss_mlp": 0.01008394, "balance_loss_clip": 1.00620079, "balance_loss_mlp": 1.01249945, "epoch": 0.1472418457838569, "flos": 66706764860160.0, "grad_norm": 0.7658583558652418, "language_loss": 0.56876582, "learning_rate": 3.857533103811195e-06, "loss": 0.58931577, "num_input_tokens_seen": 53227945, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.33984375, "step": 2449, "time_per_iteration": 3.0305538177490234 }, { "auxiliary_loss_clip": 0.01150797, "auxiliary_loss_mlp": 0.01044686, "balance_loss_clip": 1.02457571, "balance_loss_mlp": 1.04846966, "epoch": 0.14730196903652487, "flos": 19573578391680.0, "grad_norm": 1.9301679976209998, "language_loss": 0.8533054, "learning_rate": 3.857388708700307e-06, "loss": 0.87526023, "num_input_tokens_seen": 53244615, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0234375, "step": 2450, "time_per_iteration": 2.436012029647827 }, { "auxiliary_loss_clip": 0.01157302, "auxiliary_loss_mlp": 0.01048218, "balance_loss_clip": 1.02789307, "balance_loss_mlp": 1.05091476, "epoch": 0.14736209228919284, "flos": 16071031296000.0, "grad_norm": 1.9794288463075929, "language_loss": 0.75197196, "learning_rate": 3.857244243157052e-06, "loss": 0.77402711, "num_input_tokens_seen": 53262205, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.0625, "step": 2451, "time_per_iteration": 2.4367260932922363 }, { "auxiliary_loss_clip": 0.01150131, "auxiliary_loss_mlp": 0.01041732, "balance_loss_clip": 1.02315974, "balance_loss_mlp": 1.05016732, "epoch": 0.1474222155418608, "flos": 23039460679680.0, "grad_norm": 1.614002604351864, "language_loss": 0.82457721, "learning_rate": 3.85709970718691e-06, "loss": 0.84649587, "num_input_tokens_seen": 53282445, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0, "step": 2452, "time_per_iteration": 2.473315715789795 }, { "auxiliary_loss_clip": 0.01155198, "auxiliary_loss_mlp": 0.01041814, "balance_loss_clip": 1.02368224, "balance_loss_mlp": 1.05226731, "epoch": 0.1474823387945288, "flos": 17018641946880.0, "grad_norm": 1.5632946670043821, "language_loss": 0.74264354, "learning_rate": 3.856955100795361e-06, "loss": 0.76461369, "num_input_tokens_seen": 53299060, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.03125, "step": 2453, "time_per_iteration": 2.4389803409576416 }, { "auxiliary_loss_clip": 0.01156488, "auxiliary_loss_mlp": 0.0104864, "balance_loss_clip": 1.02808797, "balance_loss_mlp": 1.05072606, "epoch": 0.14754246204719676, "flos": 17895041884800.0, "grad_norm": 2.028616344237974, "language_loss": 0.76443899, "learning_rate": 3.856810423987889e-06, "loss": 0.7864902, "num_input_tokens_seen": 53315970, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.0546875, "step": 2454, "time_per_iteration": 2.426121234893799 }, { "auxiliary_loss_clip": 0.0115641, "auxiliary_loss_mlp": 0.010402, "balance_loss_clip": 1.02123392, "balance_loss_mlp": 1.04986286, "epoch": 0.14760258529986472, "flos": 13079097987840.0, "grad_norm": 1.790391149092731, "language_loss": 0.83107769, "learning_rate": 3.856665676769979e-06, "loss": 0.85304379, "num_input_tokens_seen": 53332940, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0625, "step": 2455, "time_per_iteration": 2.4527711868286133 }, { "auxiliary_loss_clip": 0.01159666, "auxiliary_loss_mlp": 0.01046282, "balance_loss_clip": 1.02729201, "balance_loss_mlp": 1.05018485, "epoch": 0.1476627085525327, "flos": 30806399358720.0, "grad_norm": 1.8434015928774081, "language_loss": 0.84143448, "learning_rate": 3.85652085914712e-06, "loss": 0.86349404, "num_input_tokens_seen": 53353295, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.09375, "step": 2456, "time_per_iteration": 2.5175282955169678 }, { "auxiliary_loss_clip": 0.01153159, "auxiliary_loss_mlp": 0.01041787, "balance_loss_clip": 1.02295184, "balance_loss_mlp": 1.05114222, "epoch": 0.14772283180520066, "flos": 21689434984320.0, "grad_norm": 1.7002708284974002, "language_loss": 0.84457171, "learning_rate": 3.856375971124805e-06, "loss": 0.86652124, "num_input_tokens_seen": 53373410, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.015625, "step": 2457, "time_per_iteration": 2.4902005195617676 }, { "auxiliary_loss_clip": 0.01152427, "auxiliary_loss_mlp": 0.01041851, "balance_loss_clip": 1.02308774, "balance_loss_mlp": 1.05131423, "epoch": 0.14778295505786862, "flos": 18770400328320.0, "grad_norm": 1.820374938798203, "language_loss": 0.75414217, "learning_rate": 3.856231012708527e-06, "loss": 0.77608496, "num_input_tokens_seen": 53391430, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0078125, "step": 2458, "time_per_iteration": 2.44163179397583 }, { "auxiliary_loss_clip": 0.01161954, "auxiliary_loss_mlp": 0.01047707, "balance_loss_clip": 1.02665448, "balance_loss_mlp": 1.05193472, "epoch": 0.1478430783105366, "flos": 22893555634560.0, "grad_norm": 1.9674975381640045, "language_loss": 0.83427572, "learning_rate": 3.856085983903782e-06, "loss": 0.85637236, "num_input_tokens_seen": 53409960, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.1015625, "step": 2459, "time_per_iteration": 2.4725232124328613 }, { "auxiliary_loss_clip": 0.01150019, "auxiliary_loss_mlp": 0.01041372, "balance_loss_clip": 1.02257228, "balance_loss_mlp": 1.04864335, "epoch": 0.14790320156320458, "flos": 15085319293440.0, "grad_norm": 2.0681180762308267, "language_loss": 0.75393283, "learning_rate": 3.855940884716071e-06, "loss": 0.77584684, "num_input_tokens_seen": 53426160, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.015625, "step": 2460, "time_per_iteration": 2.422189474105835 }, { "auxiliary_loss_clip": 0.01159516, "auxiliary_loss_mlp": 0.01045544, "balance_loss_clip": 1.02679288, "balance_loss_mlp": 1.05177402, "epoch": 0.14796332481587254, "flos": 26504768350080.0, "grad_norm": 1.7401272185749623, "language_loss": 0.81685126, "learning_rate": 3.855795715150896e-06, "loss": 0.83890188, "num_input_tokens_seen": 53448530, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.078125, "step": 2461, "time_per_iteration": 2.5325546264648438 }, { "auxiliary_loss_clip": 0.01156586, "auxiliary_loss_mlp": 0.01050689, "balance_loss_clip": 1.03020847, "balance_loss_mlp": 1.05149984, "epoch": 0.1480234480685405, "flos": 17563191108480.0, "grad_norm": 3.082068075937344, "language_loss": 0.6593523, "learning_rate": 3.855650475213761e-06, "loss": 0.68142509, "num_input_tokens_seen": 53465915, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.046875, "step": 2462, "time_per_iteration": 2.423264980316162 }, { "auxiliary_loss_clip": 0.01154364, "auxiliary_loss_mlp": 0.01047686, "balance_loss_clip": 1.02783775, "balance_loss_mlp": 1.05008829, "epoch": 0.14808357132120847, "flos": 53582203232640.0, "grad_norm": 2.20564943329999, "language_loss": 0.67515349, "learning_rate": 3.8555051649101745e-06, "loss": 0.69717401, "num_input_tokens_seen": 53496055, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.046875, "step": 2463, "time_per_iteration": 2.7838001251220703 }, { "auxiliary_loss_clip": 0.0115639, "auxiliary_loss_mlp": 0.01054099, "balance_loss_clip": 1.0342741, "balance_loss_mlp": 1.05014086, "epoch": 0.14814369457387644, "flos": 19829190551040.0, "grad_norm": 1.6453764240373894, "language_loss": 0.76550692, "learning_rate": 3.855359784245646e-06, "loss": 0.78761184, "num_input_tokens_seen": 53513790, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0625, "step": 2464, "time_per_iteration": 2.4423861503601074 }, { "auxiliary_loss_clip": 0.01153794, "auxiliary_loss_mlp": 0.01048455, "balance_loss_clip": 1.03115737, "balance_loss_mlp": 1.05229568, "epoch": 0.1482038178265444, "flos": 23914962777600.0, "grad_norm": 1.7927828034118842, "language_loss": 0.79813933, "learning_rate": 3.855214333225688e-06, "loss": 0.82016188, "num_input_tokens_seen": 53533410, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 1.015625, "step": 2465, "time_per_iteration": 2.5050666332244873 }, { "auxiliary_loss_clip": 0.01163729, "auxiliary_loss_mlp": 0.01046043, "balance_loss_clip": 1.02603972, "balance_loss_mlp": 1.05466938, "epoch": 0.1482639410792124, "flos": 24170503109760.0, "grad_norm": 1.6881321796490227, "language_loss": 0.76073307, "learning_rate": 3.855068811855817e-06, "loss": 0.78283083, "num_input_tokens_seen": 53554775, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.09375, "step": 2466, "time_per_iteration": 2.4903509616851807 }, { "auxiliary_loss_clip": 0.0105241, "auxiliary_loss_mlp": 0.0101785, "balance_loss_clip": 1.01575172, "balance_loss_mlp": 1.017349, "epoch": 0.14832406433188036, "flos": 66191051341440.0, "grad_norm": 0.7859742878205059, "language_loss": 0.60110962, "learning_rate": 3.854923220141551e-06, "loss": 0.62181222, "num_input_tokens_seen": 53609675, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.3515625, "step": 2467, "time_per_iteration": 3.1048386096954346 }, { "auxiliary_loss_clip": 0.01156473, "auxiliary_loss_mlp": 0.01044449, "balance_loss_clip": 1.02512491, "balance_loss_mlp": 1.05253077, "epoch": 0.14838418758454833, "flos": 25411252654080.0, "grad_norm": 1.9657349364826882, "language_loss": 0.87495971, "learning_rate": 3.85477755808841e-06, "loss": 0.89696884, "num_input_tokens_seen": 53626950, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.046875, "step": 2468, "time_per_iteration": 2.5212619304656982 }, { "auxiliary_loss_clip": 0.01159485, "auxiliary_loss_mlp": 0.0105032, "balance_loss_clip": 1.02954197, "balance_loss_mlp": 1.05189598, "epoch": 0.1484443108372163, "flos": 23289901280640.0, "grad_norm": 1.9280313287808808, "language_loss": 0.76059067, "learning_rate": 3.854631825701919e-06, "loss": 0.78268874, "num_input_tokens_seen": 53644200, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.078125, "step": 2469, "time_per_iteration": 2.502923011779785 }, { "auxiliary_loss_clip": 0.01152581, "auxiliary_loss_mlp": 0.0104739, "balance_loss_clip": 1.02861428, "balance_loss_mlp": 1.04965317, "epoch": 0.14850443408988426, "flos": 14647675985280.0, "grad_norm": 3.4193579374881256, "language_loss": 0.76210201, "learning_rate": 3.854486022987603e-06, "loss": 0.78410172, "num_input_tokens_seen": 53659650, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.03125, "step": 2470, "time_per_iteration": 2.4377281665802 }, { "auxiliary_loss_clip": 0.01151611, "auxiliary_loss_mlp": 0.01045573, "balance_loss_clip": 1.02634406, "balance_loss_mlp": 1.05025721, "epoch": 0.14856455734255222, "flos": 23548314700800.0, "grad_norm": 1.875400808669048, "language_loss": 0.72121662, "learning_rate": 3.8543401499509905e-06, "loss": 0.7431885, "num_input_tokens_seen": 53680275, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.015625, "step": 2471, "time_per_iteration": 5.390460252761841 }, { "auxiliary_loss_clip": 0.01158338, "auxiliary_loss_mlp": 0.01045857, "balance_loss_clip": 1.02532911, "balance_loss_mlp": 1.04965222, "epoch": 0.1486246805952202, "flos": 18077288515200.0, "grad_norm": 2.318068262371856, "language_loss": 0.89767432, "learning_rate": 3.854194206597615e-06, "loss": 0.9197163, "num_input_tokens_seen": 53698270, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.0859375, "step": 2472, "time_per_iteration": 2.4507880210876465 }, { "auxiliary_loss_clip": 0.01156569, "auxiliary_loss_mlp": 0.01046891, "balance_loss_clip": 1.0266968, "balance_loss_mlp": 1.05055666, "epoch": 0.14868480384788818, "flos": 19353625459200.0, "grad_norm": 2.3345608346704974, "language_loss": 0.80656087, "learning_rate": 3.854048192933008e-06, "loss": 0.82859552, "num_input_tokens_seen": 53716845, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0625, "step": 2473, "time_per_iteration": 5.357048034667969 }, { "auxiliary_loss_clip": 0.01157298, "auxiliary_loss_mlp": 0.01054496, "balance_loss_clip": 1.03511286, "balance_loss_mlp": 1.04993176, "epoch": 0.14874492710055615, "flos": 22200192426240.0, "grad_norm": 2.9354181149023013, "language_loss": 0.77839744, "learning_rate": 3.853902108962709e-06, "loss": 0.80051529, "num_input_tokens_seen": 53734970, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.078125, "step": 2474, "time_per_iteration": 2.4475619792938232 }, { "auxiliary_loss_clip": 0.01157087, "auxiliary_loss_mlp": 0.01055688, "balance_loss_clip": 1.03606641, "balance_loss_mlp": 1.04853821, "epoch": 0.1488050503532241, "flos": 21103444506240.0, "grad_norm": 1.9727872319040531, "language_loss": 0.82405353, "learning_rate": 3.853755954692255e-06, "loss": 0.84618133, "num_input_tokens_seen": 53753415, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0859375, "step": 2475, "time_per_iteration": 2.4630448818206787 }, { "auxiliary_loss_clip": 0.01157425, "auxiliary_loss_mlp": 0.01055159, "balance_loss_clip": 1.0362525, "balance_loss_mlp": 1.05324686, "epoch": 0.14886517360589208, "flos": 12786569625600.0, "grad_norm": 1.8370114562067237, "language_loss": 0.80904371, "learning_rate": 3.85360973012719e-06, "loss": 0.83116949, "num_input_tokens_seen": 53770305, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.046875, "step": 2476, "time_per_iteration": 2.424292802810669 }, { "auxiliary_loss_clip": 0.01150812, "auxiliary_loss_mlp": 0.01044878, "balance_loss_clip": 1.02649617, "balance_loss_mlp": 1.05093384, "epoch": 0.14892529685856004, "flos": 29022860419200.0, "grad_norm": 3.0303395965856823, "language_loss": 0.77709556, "learning_rate": 3.853463435273058e-06, "loss": 0.79905242, "num_input_tokens_seen": 53788895, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0, "step": 2477, "time_per_iteration": 2.520366907119751 }, { "auxiliary_loss_clip": 0.01055744, "auxiliary_loss_mlp": 0.01007556, "balance_loss_clip": 1.0055536, "balance_loss_mlp": 1.02127445, "epoch": 0.148985420111228, "flos": 61926121054080.0, "grad_norm": 0.8055968584944826, "language_loss": 0.60195965, "learning_rate": 3.853317070135407e-06, "loss": 0.62259269, "num_input_tokens_seen": 53850260, "router_z_loss_clip": 0.02001953, "router_z_loss_mlp": 0.34375, "step": 2478, "time_per_iteration": 3.1216044425964355 }, { "auxiliary_loss_clip": 0.0115621, "auxiliary_loss_mlp": 0.01045712, "balance_loss_clip": 1.02780652, "balance_loss_mlp": 1.05152416, "epoch": 0.149045543363896, "flos": 23915106432000.0, "grad_norm": 2.2048007481960945, "language_loss": 0.71079701, "learning_rate": 3.853170634719787e-06, "loss": 0.73281622, "num_input_tokens_seen": 53867520, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 1.046875, "step": 2479, "time_per_iteration": 2.508500099182129 }, { "auxiliary_loss_clip": 0.01154961, "auxiliary_loss_mlp": 0.01046625, "balance_loss_clip": 1.02694392, "balance_loss_mlp": 1.05001795, "epoch": 0.14910566661656396, "flos": 23654394541440.0, "grad_norm": 1.5676985699155372, "language_loss": 0.80955672, "learning_rate": 3.853024129031751e-06, "loss": 0.83157265, "num_input_tokens_seen": 53886620, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.046875, "step": 2480, "time_per_iteration": 2.4697108268737793 }, { "auxiliary_loss_clip": 0.01156612, "auxiliary_loss_mlp": 0.01044242, "balance_loss_clip": 1.02520418, "balance_loss_mlp": 1.05023563, "epoch": 0.14916578986923193, "flos": 20515299212160.0, "grad_norm": 1.8824066533606119, "language_loss": 0.84439099, "learning_rate": 3.852877553076854e-06, "loss": 0.86639953, "num_input_tokens_seen": 53902230, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0625, "step": 2481, "time_per_iteration": 2.4738082885742188 }, { "auxiliary_loss_clip": 0.01153989, "auxiliary_loss_mlp": 0.01052188, "balance_loss_clip": 1.03127825, "balance_loss_mlp": 1.04858744, "epoch": 0.1492259131218999, "flos": 22491822948480.0, "grad_norm": 1.9627248967764146, "language_loss": 0.77612191, "learning_rate": 3.8527309068606546e-06, "loss": 0.79818368, "num_input_tokens_seen": 53919475, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.0546875, "step": 2482, "time_per_iteration": 2.4658284187316895 }, { "auxiliary_loss_clip": 0.01162388, "auxiliary_loss_mlp": 0.01039059, "balance_loss_clip": 1.01856661, "balance_loss_mlp": 1.0521363, "epoch": 0.14928603637456786, "flos": 23185868515200.0, "grad_norm": 2.6266112891551137, "language_loss": 0.79200906, "learning_rate": 3.852584190388713e-06, "loss": 0.81402349, "num_input_tokens_seen": 53939150, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.1015625, "step": 2483, "time_per_iteration": 2.4864985942840576 }, { "auxiliary_loss_clip": 0.01151727, "auxiliary_loss_mlp": 0.01040748, "balance_loss_clip": 1.02325964, "balance_loss_mlp": 1.05055475, "epoch": 0.14934615962723582, "flos": 21653237053440.0, "grad_norm": 1.973636653248875, "language_loss": 0.70429587, "learning_rate": 3.852437403666595e-06, "loss": 0.72622061, "num_input_tokens_seen": 53958735, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 1.015625, "step": 2484, "time_per_iteration": 2.468899726867676 }, { "auxiliary_loss_clip": 0.0115446, "auxiliary_loss_mlp": 0.01039356, "balance_loss_clip": 1.01923323, "balance_loss_mlp": 1.04842591, "epoch": 0.1494062828799038, "flos": 27010066924800.0, "grad_norm": 1.8218907045081472, "language_loss": 0.84902978, "learning_rate": 3.852290546699863e-06, "loss": 0.87096792, "num_input_tokens_seen": 53975065, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0625, "step": 2485, "time_per_iteration": 2.5008199214935303 }, { "auxiliary_loss_clip": 0.01158444, "auxiliary_loss_mlp": 0.01043351, "balance_loss_clip": 1.02418256, "balance_loss_mlp": 1.05154586, "epoch": 0.14946640613257178, "flos": 21214947300480.0, "grad_norm": 1.9705667614190372, "language_loss": 0.85088289, "learning_rate": 3.8521436194940894e-06, "loss": 0.87290084, "num_input_tokens_seen": 53993330, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0703125, "step": 2486, "time_per_iteration": 2.4669957160949707 }, { "auxiliary_loss_clip": 0.01151615, "auxiliary_loss_mlp": 0.01040759, "balance_loss_clip": 1.02342629, "balance_loss_mlp": 1.04932714, "epoch": 0.14952652938523975, "flos": 13370872164480.0, "grad_norm": 5.597866976745329, "language_loss": 0.75135446, "learning_rate": 3.851996622054842e-06, "loss": 0.77327824, "num_input_tokens_seen": 54010515, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.0234375, "step": 2487, "time_per_iteration": 2.44599986076355 }, { "auxiliary_loss_clip": 0.01153951, "auxiliary_loss_mlp": 0.0104526, "balance_loss_clip": 1.02648497, "balance_loss_mlp": 1.04992652, "epoch": 0.1495866526379077, "flos": 35517699959040.0, "grad_norm": 1.9811076463955777, "language_loss": 0.71858734, "learning_rate": 3.8518495543877e-06, "loss": 0.74057949, "num_input_tokens_seen": 54031315, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.046875, "step": 2488, "time_per_iteration": 2.576526403427124 }, { "auxiliary_loss_clip": 0.01157702, "auxiliary_loss_mlp": 0.01048052, "balance_loss_clip": 1.02914524, "balance_loss_mlp": 1.0512197, "epoch": 0.14964677589057568, "flos": 17632749795840.0, "grad_norm": 2.2305660260357936, "language_loss": 0.7061311, "learning_rate": 3.851702416498235e-06, "loss": 0.72818863, "num_input_tokens_seen": 54045965, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0625, "step": 2489, "time_per_iteration": 2.4336774349212646 }, { "auxiliary_loss_clip": 0.01156984, "auxiliary_loss_mlp": 0.01049548, "balance_loss_clip": 1.03085566, "balance_loss_mlp": 1.05042946, "epoch": 0.14970689914324364, "flos": 20185280029440.0, "grad_norm": 3.2520931180432626, "language_loss": 0.81423765, "learning_rate": 3.8515552083920295e-06, "loss": 0.83630294, "num_input_tokens_seen": 54059960, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0625, "step": 2490, "time_per_iteration": 2.403538942337036 }, { "auxiliary_loss_clip": 0.01160841, "auxiliary_loss_mlp": 0.01047566, "balance_loss_clip": 1.02911258, "balance_loss_mlp": 1.05300856, "epoch": 0.1497670223959116, "flos": 37228699382400.0, "grad_norm": 1.7642724385076278, "language_loss": 0.79902577, "learning_rate": 3.851407930074666e-06, "loss": 0.82110983, "num_input_tokens_seen": 54079330, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.078125, "step": 2491, "time_per_iteration": 2.583418369293213 }, { "auxiliary_loss_clip": 0.01158951, "auxiliary_loss_mlp": 0.01046675, "balance_loss_clip": 1.02651668, "balance_loss_mlp": 1.05002379, "epoch": 0.1498271456485796, "flos": 24455848752000.0, "grad_norm": 2.05038406912978, "language_loss": 0.91099977, "learning_rate": 3.851260581551727e-06, "loss": 0.933056, "num_input_tokens_seen": 54097555, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.09375, "step": 2492, "time_per_iteration": 2.4886295795440674 }, { "auxiliary_loss_clip": 0.01154851, "auxiliary_loss_mlp": 0.01055948, "balance_loss_clip": 1.03671932, "balance_loss_mlp": 1.0500952, "epoch": 0.14988726890124757, "flos": 16253601148800.0, "grad_norm": 8.796462674093396, "language_loss": 0.79092348, "learning_rate": 3.851113162828802e-06, "loss": 0.81303144, "num_input_tokens_seen": 54115600, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.046875, "step": 2493, "time_per_iteration": 2.4347612857818604 }, { "auxiliary_loss_clip": 0.0115597, "auxiliary_loss_mlp": 0.01046805, "balance_loss_clip": 1.0269568, "balance_loss_mlp": 1.05005264, "epoch": 0.14994739215391553, "flos": 20666555383680.0, "grad_norm": 2.088137308244446, "language_loss": 0.80174756, "learning_rate": 3.85096567391148e-06, "loss": 0.82377529, "num_input_tokens_seen": 54135220, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0625, "step": 2494, "time_per_iteration": 2.454745054244995 }, { "auxiliary_loss_clip": 0.01151593, "auxiliary_loss_mlp": 0.01048473, "balance_loss_clip": 1.02783823, "balance_loss_mlp": 1.04930365, "epoch": 0.1500075154065835, "flos": 70652375239680.0, "grad_norm": 1.7629388307452039, "language_loss": 0.65723759, "learning_rate": 3.850818114805354e-06, "loss": 0.67923826, "num_input_tokens_seen": 54161065, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.0234375, "step": 2495, "time_per_iteration": 2.8495445251464844 }, { "auxiliary_loss_clip": 0.01054451, "auxiliary_loss_mlp": 0.01003379, "balance_loss_clip": 1.00125682, "balance_loss_mlp": 1.02045083, "epoch": 0.15006763865925146, "flos": 68011937447040.0, "grad_norm": 0.883694808058064, "language_loss": 0.59538698, "learning_rate": 3.850670485516019e-06, "loss": 0.61596525, "num_input_tokens_seen": 54225095, "router_z_loss_clip": 0.02124023, "router_z_loss_mlp": 0.33984375, "step": 2496, "time_per_iteration": 3.095531463623047 }, { "auxiliary_loss_clip": 0.01155784, "auxiliary_loss_mlp": 0.01055081, "balance_loss_clip": 1.03483915, "balance_loss_mlp": 1.04846263, "epoch": 0.15012776191191943, "flos": 18916269459840.0, "grad_norm": 2.1701419045865147, "language_loss": 0.65237212, "learning_rate": 3.850522786049075e-06, "loss": 0.6744808, "num_input_tokens_seen": 54243750, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.0703125, "step": 2497, "time_per_iteration": 2.445136070251465 }, { "auxiliary_loss_clip": 0.01159799, "auxiliary_loss_mlp": 0.0105097, "balance_loss_clip": 1.03227842, "balance_loss_mlp": 1.0540148, "epoch": 0.1501878851645874, "flos": 23701330638720.0, "grad_norm": 1.4506796027448925, "language_loss": 0.75051415, "learning_rate": 3.850375016410121e-06, "loss": 0.77262187, "num_input_tokens_seen": 54266185, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.0546875, "step": 2498, "time_per_iteration": 2.549241065979004 }, { "auxiliary_loss_clip": 0.01164094, "auxiliary_loss_mlp": 0.01047191, "balance_loss_clip": 1.02677047, "balance_loss_mlp": 1.05528426, "epoch": 0.15024800841725539, "flos": 20412523422720.0, "grad_norm": 2.1692649779325675, "language_loss": 0.7241801, "learning_rate": 3.850227176604761e-06, "loss": 0.74629295, "num_input_tokens_seen": 54283940, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.0859375, "step": 2499, "time_per_iteration": 2.4593703746795654 }, { "auxiliary_loss_clip": 0.01158198, "auxiliary_loss_mlp": 0.01058142, "balance_loss_clip": 1.03854418, "balance_loss_mlp": 1.05164385, "epoch": 0.15030813166992335, "flos": 31831002812160.0, "grad_norm": 2.0778437915648915, "language_loss": 0.72226381, "learning_rate": 3.850079266638601e-06, "loss": 0.7444272, "num_input_tokens_seen": 54304830, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0625, "step": 2500, "time_per_iteration": 2.5676822662353516 }, { "auxiliary_loss_clip": 0.01158531, "auxiliary_loss_mlp": 0.01063493, "balance_loss_clip": 1.04363251, "balance_loss_mlp": 1.05329967, "epoch": 0.15036825492259132, "flos": 35657822914560.0, "grad_norm": 1.706302972413232, "language_loss": 0.65034962, "learning_rate": 3.849931286517249e-06, "loss": 0.67256987, "num_input_tokens_seen": 54325595, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0546875, "step": 2501, "time_per_iteration": 2.6041159629821777 }, { "auxiliary_loss_clip": 0.01155748, "auxiliary_loss_mlp": 0.01063696, "balance_loss_clip": 1.04288185, "balance_loss_mlp": 1.05036139, "epoch": 0.15042837817525928, "flos": 18838163335680.0, "grad_norm": 2.223116957232122, "language_loss": 0.83400077, "learning_rate": 3.849783236246318e-06, "loss": 0.85619521, "num_input_tokens_seen": 54342180, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.0546875, "step": 2502, "time_per_iteration": 2.442314386367798 }, { "auxiliary_loss_clip": 0.01152485, "auxiliary_loss_mlp": 0.01057085, "balance_loss_clip": 1.03846502, "balance_loss_mlp": 1.04757333, "epoch": 0.15048850142792725, "flos": 19535548867200.0, "grad_norm": 1.916112465112425, "language_loss": 0.772699, "learning_rate": 3.849635115831421e-06, "loss": 0.79479468, "num_input_tokens_seen": 54360255, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.046875, "step": 2503, "time_per_iteration": 2.4375407695770264 }, { "auxiliary_loss_clip": 0.01155151, "auxiliary_loss_mlp": 0.01047686, "balance_loss_clip": 1.0293045, "balance_loss_mlp": 1.05166721, "epoch": 0.1505486246805952, "flos": 22017550746240.0, "grad_norm": 1.863501088545275, "language_loss": 0.85531986, "learning_rate": 3.849486925278176e-06, "loss": 0.8773483, "num_input_tokens_seen": 54378260, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.03125, "step": 2504, "time_per_iteration": 2.4784739017486572 }, { "auxiliary_loss_clip": 0.01152268, "auxiliary_loss_mlp": 0.01045676, "balance_loss_clip": 1.0273416, "balance_loss_mlp": 1.04999721, "epoch": 0.15060874793326318, "flos": 20743153136640.0, "grad_norm": 1.6912301070978808, "language_loss": 0.82941341, "learning_rate": 3.8493386645922e-06, "loss": 0.85139287, "num_input_tokens_seen": 54399745, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0234375, "step": 2505, "time_per_iteration": 2.51930570602417 }, { "auxiliary_loss_clip": 0.01154029, "auxiliary_loss_mlp": 0.01054617, "balance_loss_clip": 1.03593731, "balance_loss_mlp": 1.04936886, "epoch": 0.15066887118593117, "flos": 16471902055680.0, "grad_norm": 1.9743602461778529, "language_loss": 0.76100147, "learning_rate": 3.849190333779117e-06, "loss": 0.78308797, "num_input_tokens_seen": 54417105, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.046875, "step": 2506, "time_per_iteration": 2.4532837867736816 }, { "auxiliary_loss_clip": 0.01159491, "auxiliary_loss_mlp": 0.01050878, "balance_loss_clip": 1.03139913, "balance_loss_mlp": 1.05168366, "epoch": 0.15072899443859913, "flos": 19859319083520.0, "grad_norm": 2.673645277242167, "language_loss": 0.75473028, "learning_rate": 3.849041932844552e-06, "loss": 0.77683401, "num_input_tokens_seen": 54433920, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.078125, "step": 2507, "time_per_iteration": 2.4052276611328125 }, { "auxiliary_loss_clip": 0.01150386, "auxiliary_loss_mlp": 0.01050468, "balance_loss_clip": 1.03195477, "balance_loss_mlp": 1.04840839, "epoch": 0.1507891176912671, "flos": 20776226584320.0, "grad_norm": 1.9907672806097225, "language_loss": 0.68836272, "learning_rate": 3.848893461794131e-06, "loss": 0.71037126, "num_input_tokens_seen": 54451540, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0234375, "step": 2508, "time_per_iteration": 2.474374771118164 }, { "auxiliary_loss_clip": 0.01160964, "auxiliary_loss_mlp": 0.01052037, "balance_loss_clip": 1.03305888, "balance_loss_mlp": 1.05507755, "epoch": 0.15084924094393506, "flos": 23586631534080.0, "grad_norm": 1.744827684976985, "language_loss": 0.77327365, "learning_rate": 3.8487449206334845e-06, "loss": 0.79540366, "num_input_tokens_seen": 54470800, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0625, "step": 2509, "time_per_iteration": 2.451415777206421 }, { "auxiliary_loss_clip": 0.01164304, "auxiliary_loss_mlp": 0.01060402, "balance_loss_clip": 1.03906393, "balance_loss_mlp": 1.05245471, "epoch": 0.15090936419660303, "flos": 18911313383040.0, "grad_norm": 2.1703665442653053, "language_loss": 0.80287981, "learning_rate": 3.848596309368246e-06, "loss": 0.82512689, "num_input_tokens_seen": 54486525, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.1171875, "step": 2510, "time_per_iteration": 2.435851573944092 }, { "auxiliary_loss_clip": 0.01159621, "auxiliary_loss_mlp": 0.01060003, "balance_loss_clip": 1.03885508, "balance_loss_mlp": 1.05289745, "epoch": 0.150969487449271, "flos": 17928223073280.0, "grad_norm": 1.8765582034908144, "language_loss": 0.74232942, "learning_rate": 3.8484476280040495e-06, "loss": 0.76452565, "num_input_tokens_seen": 54503795, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.0625, "step": 2511, "time_per_iteration": 2.412396192550659 }, { "auxiliary_loss_clip": 0.01155344, "auxiliary_loss_mlp": 0.01043232, "balance_loss_clip": 1.02482605, "balance_loss_mlp": 1.05106473, "epoch": 0.151029610701939, "flos": 24243078539520.0, "grad_norm": 1.906553145121316, "language_loss": 0.68827605, "learning_rate": 3.848298876546534e-06, "loss": 0.71026182, "num_input_tokens_seen": 54523025, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.046875, "step": 2512, "time_per_iteration": 4.014248371124268 }, { "auxiliary_loss_clip": 0.01157569, "auxiliary_loss_mlp": 0.01049314, "balance_loss_clip": 1.02989531, "balance_loss_mlp": 1.05280447, "epoch": 0.15108973395460695, "flos": 30262496641920.0, "grad_norm": 2.5756773772964454, "language_loss": 0.73746538, "learning_rate": 3.84815005500134e-06, "loss": 0.75953418, "num_input_tokens_seen": 54545025, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.046875, "step": 2513, "time_per_iteration": 3.9067981243133545 }, { "auxiliary_loss_clip": 0.01059626, "auxiliary_loss_mlp": 0.01020055, "balance_loss_clip": 1.01807594, "balance_loss_mlp": 1.02526796, "epoch": 0.15114985720727492, "flos": 60437624428800.0, "grad_norm": 0.8845365523038292, "language_loss": 0.64772874, "learning_rate": 3.84800116337411e-06, "loss": 0.66852558, "num_input_tokens_seen": 54604545, "router_z_loss_clip": 0.01977539, "router_z_loss_mlp": 0.34375, "step": 2514, "time_per_iteration": 3.046525716781616 }, { "auxiliary_loss_clip": 0.01153331, "auxiliary_loss_mlp": 0.01040164, "balance_loss_clip": 1.02180612, "balance_loss_mlp": 1.05084348, "epoch": 0.15120998045994288, "flos": 20521691832960.0, "grad_norm": 2.0559273414166777, "language_loss": 0.72842282, "learning_rate": 3.8478522016704916e-06, "loss": 0.75035775, "num_input_tokens_seen": 54620590, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0234375, "step": 2515, "time_per_iteration": 3.8717405796051025 }, { "auxiliary_loss_clip": 0.01153309, "auxiliary_loss_mlp": 0.01042666, "balance_loss_clip": 1.02313983, "balance_loss_mlp": 1.05068088, "epoch": 0.15127010371261085, "flos": 21178893024000.0, "grad_norm": 8.00856174102338, "language_loss": 0.77688295, "learning_rate": 3.8477031698961325e-06, "loss": 0.79884267, "num_input_tokens_seen": 54640410, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.03125, "step": 2516, "time_per_iteration": 2.481773853302002 }, { "auxiliary_loss_clip": 0.01054881, "auxiliary_loss_mlp": 0.01001627, "balance_loss_clip": 0.99958819, "balance_loss_mlp": 1.02090287, "epoch": 0.1513302269652788, "flos": 65320648974720.0, "grad_norm": 0.7448431599168, "language_loss": 0.54669034, "learning_rate": 3.8475540680566835e-06, "loss": 0.56725538, "num_input_tokens_seen": 54701430, "router_z_loss_clip": 0.02038574, "router_z_loss_mlp": 0.33984375, "step": 2517, "time_per_iteration": 3.112004280090332 }, { "auxiliary_loss_clip": 0.01156401, "auxiliary_loss_mlp": 0.01045949, "balance_loss_clip": 1.02539754, "balance_loss_mlp": 1.05040312, "epoch": 0.15139035021794678, "flos": 19135827342720.0, "grad_norm": 2.1456758864746646, "language_loss": 0.78659081, "learning_rate": 3.8474048961577995e-06, "loss": 0.80861431, "num_input_tokens_seen": 54720845, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.0625, "step": 2518, "time_per_iteration": 2.4755849838256836 }, { "auxiliary_loss_clip": 0.01161405, "auxiliary_loss_mlp": 0.01053355, "balance_loss_clip": 1.03275597, "balance_loss_mlp": 1.05337, "epoch": 0.15145047347061477, "flos": 26578564842240.0, "grad_norm": 2.646644096190234, "language_loss": 0.704144, "learning_rate": 3.847255654205137e-06, "loss": 0.7262916, "num_input_tokens_seen": 54740495, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.078125, "step": 2519, "time_per_iteration": 2.5189690589904785 }, { "auxiliary_loss_clip": 0.01155458, "auxiliary_loss_mlp": 0.01046823, "balance_loss_clip": 1.02693844, "balance_loss_mlp": 1.05010343, "epoch": 0.15151059672328274, "flos": 20302959962880.0, "grad_norm": 1.804332470255413, "language_loss": 0.78515649, "learning_rate": 3.847106342204354e-06, "loss": 0.80717927, "num_input_tokens_seen": 54758415, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0546875, "step": 2520, "time_per_iteration": 2.4707186222076416 }, { "auxiliary_loss_clip": 0.01161109, "auxiliary_loss_mlp": 0.01054748, "balance_loss_clip": 1.03394604, "balance_loss_mlp": 1.0525744, "epoch": 0.1515707199759507, "flos": 27228367831680.0, "grad_norm": 1.8203996888947886, "language_loss": 0.74737275, "learning_rate": 3.846956960161114e-06, "loss": 0.76953137, "num_input_tokens_seen": 54779355, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.0859375, "step": 2521, "time_per_iteration": 2.519932985305786 }, { "auxiliary_loss_clip": 0.01161146, "auxiliary_loss_mlp": 0.01049965, "balance_loss_clip": 1.02922285, "balance_loss_mlp": 1.05177951, "epoch": 0.15163084322861867, "flos": 23587349806080.0, "grad_norm": 2.4166607810541434, "language_loss": 0.82427758, "learning_rate": 3.84680750808108e-06, "loss": 0.8463887, "num_input_tokens_seen": 54799465, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.09375, "step": 2522, "time_per_iteration": 2.5789878368377686 }, { "auxiliary_loss_clip": 0.01052347, "auxiliary_loss_mlp": 0.01012443, "balance_loss_clip": 1.01039243, "balance_loss_mlp": 1.01789212, "epoch": 0.15169096648128663, "flos": 66889622021760.0, "grad_norm": 1.0153677381436839, "language_loss": 0.57928097, "learning_rate": 3.846657985969922e-06, "loss": 0.59992898, "num_input_tokens_seen": 54857665, "router_z_loss_clip": 0.02050781, "router_z_loss_mlp": 0.34375, "step": 2523, "time_per_iteration": 3.077829360961914 }, { "auxiliary_loss_clip": 0.01156021, "auxiliary_loss_mlp": 0.01055129, "balance_loss_clip": 1.0346849, "balance_loss_mlp": 1.05123949, "epoch": 0.1517510897339546, "flos": 29095435848960.0, "grad_norm": 1.6705791633210394, "language_loss": 0.75189054, "learning_rate": 3.8465083938333066e-06, "loss": 0.77400208, "num_input_tokens_seen": 54879895, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.046875, "step": 2524, "time_per_iteration": 2.564910650253296 }, { "auxiliary_loss_clip": 0.01155638, "auxiliary_loss_mlp": 0.01046541, "balance_loss_clip": 1.02658522, "balance_loss_mlp": 1.04985154, "epoch": 0.1518112129866226, "flos": 18406553512320.0, "grad_norm": 1.6705303059405412, "language_loss": 0.74607152, "learning_rate": 3.8463587316769085e-06, "loss": 0.76809329, "num_input_tokens_seen": 54898245, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0546875, "step": 2525, "time_per_iteration": 2.4330456256866455 }, { "auxiliary_loss_clip": 0.01157741, "auxiliary_loss_mlp": 0.01047914, "balance_loss_clip": 1.02688503, "balance_loss_mlp": 1.05093145, "epoch": 0.15187133623929056, "flos": 19425410789760.0, "grad_norm": 2.3201835827084816, "language_loss": 0.79827094, "learning_rate": 3.846208999506402e-06, "loss": 0.82032752, "num_input_tokens_seen": 54917060, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.0703125, "step": 2526, "time_per_iteration": 2.4471843242645264 }, { "auxiliary_loss_clip": 0.01151867, "auxiliary_loss_mlp": 0.01049559, "balance_loss_clip": 1.03071213, "balance_loss_mlp": 1.05057251, "epoch": 0.15193145949195852, "flos": 17566207850880.0, "grad_norm": 1.6499397563669738, "language_loss": 0.84671319, "learning_rate": 3.846059197327466e-06, "loss": 0.86872745, "num_input_tokens_seen": 54936365, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.015625, "step": 2527, "time_per_iteration": 2.4298338890075684 }, { "auxiliary_loss_clip": 0.01156048, "auxiliary_loss_mlp": 0.01041713, "balance_loss_clip": 1.02325976, "balance_loss_mlp": 1.05036128, "epoch": 0.15199158274462649, "flos": 36176265866880.0, "grad_norm": 1.7025500205497546, "language_loss": 0.69712722, "learning_rate": 3.845909325145779e-06, "loss": 0.71910489, "num_input_tokens_seen": 54961365, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0625, "step": 2528, "time_per_iteration": 2.630000591278076 }, { "auxiliary_loss_clip": 0.01154147, "auxiliary_loss_mlp": 0.01054021, "balance_loss_clip": 1.03482795, "balance_loss_mlp": 1.0500778, "epoch": 0.15205170599729445, "flos": 23074042498560.0, "grad_norm": 2.0831401721533314, "language_loss": 0.86659265, "learning_rate": 3.845759382967026e-06, "loss": 0.88867438, "num_input_tokens_seen": 54980750, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0390625, "step": 2529, "time_per_iteration": 2.4667928218841553 }, { "auxiliary_loss_clip": 0.01151302, "auxiliary_loss_mlp": 0.01041106, "balance_loss_clip": 1.02160299, "balance_loss_mlp": 1.04920793, "epoch": 0.15211182924996242, "flos": 21908382336000.0, "grad_norm": 2.0748992782834206, "language_loss": 0.83339083, "learning_rate": 3.845609370796893e-06, "loss": 0.85531491, "num_input_tokens_seen": 54999675, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0234375, "step": 2530, "time_per_iteration": 2.4857633113861084 }, { "auxiliary_loss_clip": 0.01156605, "auxiliary_loss_mlp": 0.01049949, "balance_loss_clip": 1.03010035, "balance_loss_mlp": 1.0507772, "epoch": 0.15217195250263038, "flos": 13881521865600.0, "grad_norm": 2.415447495351504, "language_loss": 0.80321199, "learning_rate": 3.845459288641066e-06, "loss": 0.82527757, "num_input_tokens_seen": 55018295, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0625, "step": 2531, "time_per_iteration": 2.4246363639831543 }, { "auxiliary_loss_clip": 0.01154697, "auxiliary_loss_mlp": 0.01048977, "balance_loss_clip": 1.03085685, "balance_loss_mlp": 1.05106783, "epoch": 0.15223207575529837, "flos": 24535319592960.0, "grad_norm": 1.7612595184383575, "language_loss": 0.79034364, "learning_rate": 3.8453091365052394e-06, "loss": 0.81238043, "num_input_tokens_seen": 55037975, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.03125, "step": 2532, "time_per_iteration": 2.5021345615386963 }, { "auxiliary_loss_clip": 0.01152814, "auxiliary_loss_mlp": 0.01054632, "balance_loss_clip": 1.03505754, "balance_loss_mlp": 1.0506804, "epoch": 0.15229219900796634, "flos": 25556798563200.0, "grad_norm": 1.7794662137227213, "language_loss": 0.87674636, "learning_rate": 3.845158914395105e-06, "loss": 0.89882082, "num_input_tokens_seen": 55057135, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0234375, "step": 2533, "time_per_iteration": 2.476041555404663 }, { "auxiliary_loss_clip": 0.01155755, "auxiliary_loss_mlp": 0.01053261, "balance_loss_clip": 1.03350759, "balance_loss_mlp": 1.05034649, "epoch": 0.1523523222606343, "flos": 18217806520320.0, "grad_norm": 2.3975497058405284, "language_loss": 0.78795719, "learning_rate": 3.84500862231636e-06, "loss": 0.81004739, "num_input_tokens_seen": 55075525, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0546875, "step": 2534, "time_per_iteration": 2.438730239868164 }, { "auxiliary_loss_clip": 0.01159067, "auxiliary_loss_mlp": 0.01054791, "balance_loss_clip": 1.03395355, "balance_loss_mlp": 1.05017805, "epoch": 0.15241244551330227, "flos": 13260087642240.0, "grad_norm": 2.406870050152661, "language_loss": 0.76752663, "learning_rate": 3.844858260274702e-06, "loss": 0.78966516, "num_input_tokens_seen": 55090845, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.0859375, "step": 2535, "time_per_iteration": 2.4079537391662598 }, { "auxiliary_loss_clip": 0.01160716, "auxiliary_loss_mlp": 0.01051605, "balance_loss_clip": 1.03265119, "balance_loss_mlp": 1.05156302, "epoch": 0.15247256876597023, "flos": 19715568854400.0, "grad_norm": 2.6711936676037915, "language_loss": 0.78375411, "learning_rate": 3.844707828275835e-06, "loss": 0.80587733, "num_input_tokens_seen": 55108750, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.09375, "step": 2536, "time_per_iteration": 2.486776113510132 }, { "auxiliary_loss_clip": 0.01153619, "auxiliary_loss_mlp": 0.01057098, "balance_loss_clip": 1.03813136, "balance_loss_mlp": 1.05240774, "epoch": 0.1525326920186382, "flos": 20375858615040.0, "grad_norm": 2.2895039780289146, "language_loss": 0.7590223, "learning_rate": 3.844557326325461e-06, "loss": 0.78112948, "num_input_tokens_seen": 55126750, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.015625, "step": 2537, "time_per_iteration": 2.4487149715423584 }, { "auxiliary_loss_clip": 0.01157497, "auxiliary_loss_mlp": 0.01054866, "balance_loss_clip": 1.03532755, "balance_loss_mlp": 1.05236876, "epoch": 0.15259281527130616, "flos": 13589963170560.0, "grad_norm": 1.9124623651329948, "language_loss": 0.77794373, "learning_rate": 3.8444067544292896e-06, "loss": 0.80006737, "num_input_tokens_seen": 55144690, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.046875, "step": 2538, "time_per_iteration": 2.4632530212402344 }, { "auxiliary_loss_clip": 0.01152294, "auxiliary_loss_mlp": 0.01042938, "balance_loss_clip": 1.02488971, "balance_loss_mlp": 1.04936588, "epoch": 0.15265293852397416, "flos": 22860374446080.0, "grad_norm": 1.6467786968224218, "language_loss": 0.89636946, "learning_rate": 3.844256112593029e-06, "loss": 0.91832179, "num_input_tokens_seen": 55166055, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.03125, "step": 2539, "time_per_iteration": 2.4815404415130615 }, { "auxiliary_loss_clip": 0.01156171, "auxiliary_loss_mlp": 0.01054458, "balance_loss_clip": 1.03478813, "balance_loss_mlp": 1.05164552, "epoch": 0.15271306177664212, "flos": 29238108670080.0, "grad_norm": 1.8148256811020012, "language_loss": 0.93284559, "learning_rate": 3.844105400822391e-06, "loss": 0.95495188, "num_input_tokens_seen": 55186285, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.046875, "step": 2540, "time_per_iteration": 2.5401699542999268 }, { "auxiliary_loss_clip": 0.01150216, "auxiliary_loss_mlp": 0.01044087, "balance_loss_clip": 1.02578855, "balance_loss_mlp": 1.04863834, "epoch": 0.1527731850293101, "flos": 31246269310080.0, "grad_norm": 1.7923724897159232, "language_loss": 0.75069022, "learning_rate": 3.843954619123092e-06, "loss": 0.77263331, "num_input_tokens_seen": 55207915, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.015625, "step": 2541, "time_per_iteration": 2.5273547172546387 }, { "auxiliary_loss_clip": 0.01153559, "auxiliary_loss_mlp": 0.01047987, "balance_loss_clip": 1.02894962, "balance_loss_mlp": 1.05044365, "epoch": 0.15283330828197805, "flos": 22382079920640.0, "grad_norm": 1.6105683476258512, "language_loss": 0.81356442, "learning_rate": 3.84380376750085e-06, "loss": 0.83557999, "num_input_tokens_seen": 55227860, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.03125, "step": 2542, "time_per_iteration": 2.4819626808166504 }, { "auxiliary_loss_clip": 0.01158104, "auxiliary_loss_mlp": 0.01055193, "balance_loss_clip": 1.03551126, "balance_loss_mlp": 1.05318654, "epoch": 0.15289343153464602, "flos": 25520133755520.0, "grad_norm": 2.9677477156885126, "language_loss": 0.77528954, "learning_rate": 3.843652845961383e-06, "loss": 0.79742247, "num_input_tokens_seen": 55247330, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.046875, "step": 2543, "time_per_iteration": 2.5252907276153564 }, { "auxiliary_loss_clip": 0.01154651, "auxiliary_loss_mlp": 0.01047792, "balance_loss_clip": 1.02875447, "balance_loss_mlp": 1.05142522, "epoch": 0.15295355478731398, "flos": 22710016114560.0, "grad_norm": 1.9443737537078964, "language_loss": 0.8621397, "learning_rate": 3.843501854510416e-06, "loss": 0.88416421, "num_input_tokens_seen": 55266195, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.03125, "step": 2544, "time_per_iteration": 2.486060857772827 }, { "auxiliary_loss_clip": 0.01157027, "auxiliary_loss_mlp": 0.01053238, "balance_loss_clip": 1.03244781, "balance_loss_mlp": 1.04939532, "epoch": 0.15301367803998198, "flos": 23251907669760.0, "grad_norm": 1.8575284313092295, "language_loss": 0.82430679, "learning_rate": 3.843350793153673e-06, "loss": 0.84640944, "num_input_tokens_seen": 55283305, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.078125, "step": 2545, "time_per_iteration": 2.435319423675537 }, { "auxiliary_loss_clip": 0.01157472, "auxiliary_loss_mlp": 0.01044993, "balance_loss_clip": 1.02538276, "balance_loss_mlp": 1.05380261, "epoch": 0.15307380129264994, "flos": 25886279041920.0, "grad_norm": 2.3899319867278304, "language_loss": 0.71258742, "learning_rate": 3.843199661896884e-06, "loss": 0.73461211, "num_input_tokens_seen": 55303035, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0390625, "step": 2546, "time_per_iteration": 2.5072193145751953 }, { "auxiliary_loss_clip": 0.01158702, "auxiliary_loss_mlp": 0.01045249, "balance_loss_clip": 1.02458978, "balance_loss_mlp": 1.05256331, "epoch": 0.1531339245453179, "flos": 46973239205760.0, "grad_norm": 1.5931408323351468, "language_loss": 0.77675307, "learning_rate": 3.843048460745779e-06, "loss": 0.79879254, "num_input_tokens_seen": 55327570, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.0625, "step": 2547, "time_per_iteration": 2.677779197692871 }, { "auxiliary_loss_clip": 0.01160209, "auxiliary_loss_mlp": 0.01055998, "balance_loss_clip": 1.0353272, "balance_loss_mlp": 1.05396056, "epoch": 0.15319404779798587, "flos": 35882049565440.0, "grad_norm": 2.0156807671015624, "language_loss": 0.74278527, "learning_rate": 3.842897189706092e-06, "loss": 0.7649473, "num_input_tokens_seen": 55351090, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.0625, "step": 2548, "time_per_iteration": 2.6196136474609375 }, { "auxiliary_loss_clip": 0.01156613, "auxiliary_loss_mlp": 0.01052046, "balance_loss_clip": 1.03197098, "balance_loss_mlp": 1.05141306, "epoch": 0.15325417105065384, "flos": 25664638170240.0, "grad_norm": 1.383018125785442, "language_loss": 0.80727518, "learning_rate": 3.842745848783558e-06, "loss": 0.8293618, "num_input_tokens_seen": 55371050, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.0546875, "step": 2549, "time_per_iteration": 2.5042660236358643 }, { "auxiliary_loss_clip": 0.01155897, "auxiliary_loss_mlp": 0.01056256, "balance_loss_clip": 1.03588331, "balance_loss_mlp": 1.0506047, "epoch": 0.1533142943033218, "flos": 18770831291520.0, "grad_norm": 1.4497767760997664, "language_loss": 0.74794918, "learning_rate": 3.842594437983917e-06, "loss": 0.77007067, "num_input_tokens_seen": 55390375, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.0546875, "step": 2550, "time_per_iteration": 2.4605531692504883 }, { "auxiliary_loss_clip": 0.01159802, "auxiliary_loss_mlp": 0.01040774, "balance_loss_clip": 1.02034116, "balance_loss_mlp": 1.0526104, "epoch": 0.15337441755598977, "flos": 23107367341440.0, "grad_norm": 2.208930902230845, "language_loss": 0.77161586, "learning_rate": 3.8424429573129115e-06, "loss": 0.79362166, "num_input_tokens_seen": 55408890, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.0703125, "step": 2551, "time_per_iteration": 2.4492292404174805 }, { "auxiliary_loss_clip": 0.01051462, "auxiliary_loss_mlp": 0.0102893, "balance_loss_clip": 1.0267123, "balance_loss_mlp": 1.01684856, "epoch": 0.15343454080865776, "flos": 59861079227520.0, "grad_norm": 0.9388311760755741, "language_loss": 0.56699359, "learning_rate": 3.842291406776283e-06, "loss": 0.58779752, "num_input_tokens_seen": 55463815, "router_z_loss_clip": 0.0222168, "router_z_loss_mlp": 0.34570312, "step": 2552, "time_per_iteration": 3.0214710235595703 }, { "auxiliary_loss_clip": 0.01157426, "auxiliary_loss_mlp": 0.01042432, "balance_loss_clip": 1.02248847, "balance_loss_mlp": 1.05135345, "epoch": 0.15349466406132573, "flos": 11910887959680.0, "grad_norm": 2.3714787333428964, "language_loss": 0.88281447, "learning_rate": 3.84213978637978e-06, "loss": 0.90481305, "num_input_tokens_seen": 55481050, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0625, "step": 2553, "time_per_iteration": 2.4271836280822754 }, { "auxiliary_loss_clip": 0.01161667, "auxiliary_loss_mlp": 0.01046557, "balance_loss_clip": 1.02570772, "balance_loss_mlp": 1.05326283, "epoch": 0.1535547873139937, "flos": 24096922099200.0, "grad_norm": 1.6532777899595428, "language_loss": 0.78403944, "learning_rate": 3.841988096129152e-06, "loss": 0.80612171, "num_input_tokens_seen": 55500050, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.0859375, "step": 2554, "time_per_iteration": 5.399699687957764 }, { "auxiliary_loss_clip": 0.01160266, "auxiliary_loss_mlp": 0.01054324, "balance_loss_clip": 1.03314054, "balance_loss_mlp": 1.05356896, "epoch": 0.15361491056666166, "flos": 17566459246080.0, "grad_norm": 2.0614137730084505, "language_loss": 0.7745738, "learning_rate": 3.841836336030151e-06, "loss": 0.79671967, "num_input_tokens_seen": 55518125, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.0703125, "step": 2555, "time_per_iteration": 2.4390392303466797 }, { "auxiliary_loss_clip": 0.01154588, "auxiliary_loss_mlp": 0.01044858, "balance_loss_clip": 1.02618968, "balance_loss_mlp": 1.05164301, "epoch": 0.15367503381932962, "flos": 25046041121280.0, "grad_norm": 1.5456240662255218, "language_loss": 0.77221483, "learning_rate": 3.8416845060885305e-06, "loss": 0.79420924, "num_input_tokens_seen": 55540960, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.03125, "step": 2556, "time_per_iteration": 5.550373315811157 }, { "auxiliary_loss_clip": 0.01150229, "auxiliary_loss_mlp": 0.01039868, "balance_loss_clip": 1.01988828, "balance_loss_mlp": 1.04924655, "epoch": 0.15373515707199759, "flos": 21507332008320.0, "grad_norm": 1.7384657146626665, "language_loss": 0.90523958, "learning_rate": 3.84153260631005e-06, "loss": 0.92714053, "num_input_tokens_seen": 55559210, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.0078125, "step": 2557, "time_per_iteration": 2.468400239944458 }, { "auxiliary_loss_clip": 0.01155688, "auxiliary_loss_mlp": 0.01046664, "balance_loss_clip": 1.02585053, "balance_loss_mlp": 1.05088186, "epoch": 0.15379528032466555, "flos": 25994729180160.0, "grad_norm": 2.137138634703475, "language_loss": 0.7043153, "learning_rate": 3.841380636700468e-06, "loss": 0.72633886, "num_input_tokens_seen": 55578925, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.046875, "step": 2558, "time_per_iteration": 2.5234475135803223 }, { "auxiliary_loss_clip": 0.01154542, "auxiliary_loss_mlp": 0.01045964, "balance_loss_clip": 1.02569795, "balance_loss_mlp": 1.0503248, "epoch": 0.15385540357733354, "flos": 19277315015040.0, "grad_norm": 1.944995032084835, "language_loss": 0.92358011, "learning_rate": 3.841228597265548e-06, "loss": 0.94558513, "num_input_tokens_seen": 55597255, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.0390625, "step": 2559, "time_per_iteration": 2.4719038009643555 }, { "auxiliary_loss_clip": 0.01159375, "auxiliary_loss_mlp": 0.01057138, "balance_loss_clip": 1.03601456, "balance_loss_mlp": 1.05361772, "epoch": 0.1539155268300015, "flos": 28549126920960.0, "grad_norm": 2.3355073603339975, "language_loss": 0.63757163, "learning_rate": 3.841076488011055e-06, "loss": 0.65973675, "num_input_tokens_seen": 55619515, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.0546875, "step": 2560, "time_per_iteration": 2.563213348388672 }, { "auxiliary_loss_clip": 0.01155121, "auxiliary_loss_mlp": 0.0105082, "balance_loss_clip": 1.03018486, "balance_loss_mlp": 1.04885411, "epoch": 0.15397565008266947, "flos": 23547883737600.0, "grad_norm": 1.7409370922352672, "language_loss": 0.87922144, "learning_rate": 3.8409243089427574e-06, "loss": 0.90128082, "num_input_tokens_seen": 55640050, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.0625, "step": 2561, "time_per_iteration": 2.5064005851745605 }, { "auxiliary_loss_clip": 0.01150382, "auxiliary_loss_mlp": 0.01042194, "balance_loss_clip": 1.02354956, "balance_loss_mlp": 1.05030191, "epoch": 0.15403577333533744, "flos": 17129821518720.0, "grad_norm": 2.4592682616027637, "language_loss": 0.82590407, "learning_rate": 3.840772060066425e-06, "loss": 0.84782982, "num_input_tokens_seen": 55658695, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.0, "step": 2562, "time_per_iteration": 2.440951347351074 }, { "auxiliary_loss_clip": 0.01162247, "auxiliary_loss_mlp": 0.01054162, "balance_loss_clip": 1.03217995, "balance_loss_mlp": 1.05263591, "epoch": 0.1540958965880054, "flos": 17894503180800.0, "grad_norm": 1.8831949851473242, "language_loss": 0.74662125, "learning_rate": 3.840619741387832e-06, "loss": 0.76878536, "num_input_tokens_seen": 55676340, "router_z_loss_clip": 0.21972656, "router_z_loss_mlp": 1.1015625, "step": 2563, "time_per_iteration": 2.4160146713256836 }, { "auxiliary_loss_clip": 0.01157152, "auxiliary_loss_mlp": 0.01047127, "balance_loss_clip": 1.02630115, "balance_loss_mlp": 1.04998815, "epoch": 0.15415601984067337, "flos": 32161057908480.0, "grad_norm": 1.8486861883774626, "language_loss": 0.75945842, "learning_rate": 3.8404673529127534e-06, "loss": 0.78150117, "num_input_tokens_seen": 55698890, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.0703125, "step": 2564, "time_per_iteration": 2.5666894912719727 }, { "auxiliary_loss_clip": 0.01152145, "auxiliary_loss_mlp": 0.01050925, "balance_loss_clip": 1.03158903, "balance_loss_mlp": 1.04905367, "epoch": 0.15421614309334136, "flos": 24024418496640.0, "grad_norm": 1.9391957985188755, "language_loss": 0.70730889, "learning_rate": 3.840314894646969e-06, "loss": 0.72933966, "num_input_tokens_seen": 55718535, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.03125, "step": 2565, "time_per_iteration": 2.4652442932128906 }, { "auxiliary_loss_clip": 0.01153955, "auxiliary_loss_mlp": 0.01050389, "balance_loss_clip": 1.03027844, "balance_loss_mlp": 1.05014968, "epoch": 0.15427626634600933, "flos": 24386290064640.0, "grad_norm": 2.306727067658613, "language_loss": 0.71642566, "learning_rate": 3.840162366596259e-06, "loss": 0.73846912, "num_input_tokens_seen": 55738970, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0390625, "step": 2566, "time_per_iteration": 2.5033371448516846 }, { "auxiliary_loss_clip": 0.01147993, "auxiliary_loss_mlp": 0.01040543, "balance_loss_clip": 1.02177954, "balance_loss_mlp": 1.04708171, "epoch": 0.1543363895986773, "flos": 23331522165120.0, "grad_norm": 1.9443983196365453, "language_loss": 0.85242772, "learning_rate": 3.840009768766408e-06, "loss": 0.87431312, "num_input_tokens_seen": 55759585, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0078125, "step": 2567, "time_per_iteration": 2.4898078441619873 }, { "auxiliary_loss_clip": 0.01153071, "auxiliary_loss_mlp": 0.0104466, "balance_loss_clip": 1.02627754, "balance_loss_mlp": 1.0510726, "epoch": 0.15439651285134526, "flos": 24274284480000.0, "grad_norm": 1.9707286886526316, "language_loss": 0.78262126, "learning_rate": 3.839857101163202e-06, "loss": 0.80459857, "num_input_tokens_seen": 55779250, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.015625, "step": 2568, "time_per_iteration": 2.534069538116455 }, { "auxiliary_loss_clip": 0.01154156, "auxiliary_loss_mlp": 0.01039462, "balance_loss_clip": 1.01963723, "balance_loss_mlp": 1.05131364, "epoch": 0.15445663610401322, "flos": 22456163721600.0, "grad_norm": 1.7152556182536935, "language_loss": 0.70378774, "learning_rate": 3.83970436379243e-06, "loss": 0.72572392, "num_input_tokens_seen": 55800470, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.03125, "step": 2569, "time_per_iteration": 2.4648470878601074 }, { "auxiliary_loss_clip": 0.011493, "auxiliary_loss_mlp": 0.01045275, "balance_loss_clip": 1.02611804, "balance_loss_mlp": 1.0482775, "epoch": 0.1545167593566812, "flos": 22049510872320.0, "grad_norm": 1.6204701668536643, "language_loss": 0.76354671, "learning_rate": 3.839551556659884e-06, "loss": 0.78549248, "num_input_tokens_seen": 55817795, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.015625, "step": 2570, "time_per_iteration": 2.491166591644287 }, { "auxiliary_loss_clip": 0.01152845, "auxiliary_loss_mlp": 0.0104279, "balance_loss_clip": 1.02365661, "balance_loss_mlp": 1.0509361, "epoch": 0.15457688260934915, "flos": 19318253541120.0, "grad_norm": 2.3363549077300436, "language_loss": 0.77823091, "learning_rate": 3.839398679771359e-06, "loss": 0.80018729, "num_input_tokens_seen": 55836125, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.015625, "step": 2571, "time_per_iteration": 2.4386324882507324 }, { "auxiliary_loss_clip": 0.01153831, "auxiliary_loss_mlp": 0.01045734, "balance_loss_clip": 1.02738762, "balance_loss_mlp": 1.05174184, "epoch": 0.15463700586201715, "flos": 24133981956480.0, "grad_norm": 1.997497993319257, "language_loss": 0.8272469, "learning_rate": 3.839245733132652e-06, "loss": 0.84924257, "num_input_tokens_seen": 55855280, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0234375, "step": 2572, "time_per_iteration": 2.542884111404419 }, { "auxiliary_loss_clip": 0.01159602, "auxiliary_loss_mlp": 0.01048793, "balance_loss_clip": 1.0290879, "balance_loss_mlp": 1.0529027, "epoch": 0.1546971291146851, "flos": 22420935457920.0, "grad_norm": 1.5183640722642786, "language_loss": 0.90405202, "learning_rate": 3.839092716749563e-06, "loss": 0.92613602, "num_input_tokens_seen": 55875695, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0625, "step": 2573, "time_per_iteration": 2.4730632305145264 }, { "auxiliary_loss_clip": 0.01156203, "auxiliary_loss_mlp": 0.01048337, "balance_loss_clip": 1.02932286, "balance_loss_mlp": 1.05209041, "epoch": 0.15475725236735308, "flos": 17530225401600.0, "grad_norm": 4.288909280190246, "language_loss": 0.70647073, "learning_rate": 3.838939630627893e-06, "loss": 0.7285161, "num_input_tokens_seen": 55894575, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.046875, "step": 2574, "time_per_iteration": 2.4597713947296143 }, { "auxiliary_loss_clip": 0.0115548, "auxiliary_loss_mlp": 0.01048303, "balance_loss_clip": 1.02750063, "balance_loss_mlp": 1.0506736, "epoch": 0.15481737562002104, "flos": 22561740771840.0, "grad_norm": 1.7838441767474893, "language_loss": 0.82743609, "learning_rate": 3.838786474773448e-06, "loss": 0.84947389, "num_input_tokens_seen": 55912855, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.046875, "step": 2575, "time_per_iteration": 2.451098918914795 }, { "auxiliary_loss_clip": 0.01153205, "auxiliary_loss_mlp": 0.01045293, "balance_loss_clip": 1.02711344, "balance_loss_mlp": 1.0486722, "epoch": 0.154877498872689, "flos": 24900567039360.0, "grad_norm": 1.8219675127778523, "language_loss": 0.84476668, "learning_rate": 3.838633249192036e-06, "loss": 0.86675167, "num_input_tokens_seen": 55932375, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.046875, "step": 2576, "time_per_iteration": 2.504429817199707 }, { "auxiliary_loss_clip": 0.01152662, "auxiliary_loss_mlp": 0.01043374, "balance_loss_clip": 1.02403843, "balance_loss_mlp": 1.04919243, "epoch": 0.15493762212535697, "flos": 28147501975680.0, "grad_norm": 2.0527575987893893, "language_loss": 0.82125747, "learning_rate": 3.838479953889465e-06, "loss": 0.84321791, "num_input_tokens_seen": 55953970, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.03125, "step": 2577, "time_per_iteration": 2.5191702842712402 }, { "auxiliary_loss_clip": 0.01157089, "auxiliary_loss_mlp": 0.01049729, "balance_loss_clip": 1.03096569, "balance_loss_mlp": 1.05341411, "epoch": 0.15499774537802496, "flos": 25411073086080.0, "grad_norm": 2.0233437334253885, "language_loss": 0.76514149, "learning_rate": 3.8383265888715525e-06, "loss": 0.78720969, "num_input_tokens_seen": 55973120, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0390625, "step": 2578, "time_per_iteration": 2.5103507041931152 }, { "auxiliary_loss_clip": 0.01153823, "auxiliary_loss_mlp": 0.01049141, "balance_loss_clip": 1.02953088, "balance_loss_mlp": 1.05035067, "epoch": 0.15505786863069293, "flos": 22091562720000.0, "grad_norm": 1.9483762861199498, "language_loss": 0.82785463, "learning_rate": 3.83817315414411e-06, "loss": 0.84988427, "num_input_tokens_seen": 55993260, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.03125, "step": 2579, "time_per_iteration": 2.4645774364471436 }, { "auxiliary_loss_clip": 0.01156583, "auxiliary_loss_mlp": 0.01050344, "balance_loss_clip": 1.03160429, "balance_loss_mlp": 1.05379319, "epoch": 0.1551179918833609, "flos": 18917131386240.0, "grad_norm": 1.5584050854725664, "language_loss": 0.80874717, "learning_rate": 3.838019649712958e-06, "loss": 0.83081645, "num_input_tokens_seen": 56012130, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0234375, "step": 2580, "time_per_iteration": 2.4652557373046875 }, { "auxiliary_loss_clip": 0.01053939, "auxiliary_loss_mlp": 0.01002734, "balance_loss_clip": 1.00052834, "balance_loss_mlp": 1.01982141, "epoch": 0.15517811513602886, "flos": 66239172587520.0, "grad_norm": 0.8450966677561939, "language_loss": 0.58896494, "learning_rate": 3.8378660755839166e-06, "loss": 0.6095317, "num_input_tokens_seen": 56079045, "router_z_loss_clip": 0.02209473, "router_z_loss_mlp": 0.34179688, "step": 2581, "time_per_iteration": 3.1968131065368652 }, { "auxiliary_loss_clip": 0.01158279, "auxiliary_loss_mlp": 0.01050366, "balance_loss_clip": 1.03137612, "balance_loss_mlp": 1.05289626, "epoch": 0.15523823838869683, "flos": 24021078531840.0, "grad_norm": 2.055593535728351, "language_loss": 0.85396594, "learning_rate": 3.8377124317628095e-06, "loss": 0.87605238, "num_input_tokens_seen": 56098745, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.046875, "step": 2582, "time_per_iteration": 2.491652011871338 }, { "auxiliary_loss_clip": 0.01157205, "auxiliary_loss_mlp": 0.01059618, "balance_loss_clip": 1.03975773, "balance_loss_mlp": 1.05282474, "epoch": 0.1552983616413648, "flos": 20485062938880.0, "grad_norm": 2.0155191835172515, "language_loss": 0.78423166, "learning_rate": 3.8375587182554625e-06, "loss": 0.80639982, "num_input_tokens_seen": 56117655, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.046875, "step": 2583, "time_per_iteration": 2.495054244995117 }, { "auxiliary_loss_clip": 0.01156947, "auxiliary_loss_mlp": 0.01057982, "balance_loss_clip": 1.03694129, "balance_loss_mlp": 1.05299878, "epoch": 0.15535848489403276, "flos": 32123710742400.0, "grad_norm": 1.6892750401326926, "language_loss": 0.7576493, "learning_rate": 3.837404935067705e-06, "loss": 0.77979857, "num_input_tokens_seen": 56141960, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.0390625, "step": 2584, "time_per_iteration": 2.56121563911438 }, { "auxiliary_loss_clip": 0.01155017, "auxiliary_loss_mlp": 0.01043551, "balance_loss_clip": 1.02397656, "balance_loss_mlp": 1.05100691, "epoch": 0.15541860814670075, "flos": 19098444263040.0, "grad_norm": 1.6291226691460954, "language_loss": 0.75790823, "learning_rate": 3.837251082205368e-06, "loss": 0.77989388, "num_input_tokens_seen": 56161430, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0390625, "step": 2585, "time_per_iteration": 2.606557607650757 }, { "auxiliary_loss_clip": 0.01152705, "auxiliary_loss_mlp": 0.01046651, "balance_loss_clip": 1.02767313, "balance_loss_mlp": 1.0518086, "epoch": 0.1554787313993687, "flos": 19172097100800.0, "grad_norm": 2.2879845897960687, "language_loss": 0.6134184, "learning_rate": 3.837097159674286e-06, "loss": 0.63541192, "num_input_tokens_seen": 56179390, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0078125, "step": 2586, "time_per_iteration": 2.4310669898986816 }, { "auxiliary_loss_clip": 0.01156355, "auxiliary_loss_mlp": 0.01048277, "balance_loss_clip": 1.02894115, "balance_loss_mlp": 1.05080473, "epoch": 0.15553885465203668, "flos": 16143822207360.0, "grad_norm": 1.6366001630553924, "language_loss": 0.80853516, "learning_rate": 3.836943167480296e-06, "loss": 0.83058149, "num_input_tokens_seen": 56198020, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0546875, "step": 2587, "time_per_iteration": 2.441445827484131 }, { "auxiliary_loss_clip": 0.0115985, "auxiliary_loss_mlp": 0.01059281, "balance_loss_clip": 1.03690493, "balance_loss_mlp": 1.05287051, "epoch": 0.15559897790470464, "flos": 25337779384320.0, "grad_norm": 1.9125296428534395, "language_loss": 0.88731277, "learning_rate": 3.836789105629236e-06, "loss": 0.90950406, "num_input_tokens_seen": 56218165, "router_z_loss_clip": 0.22460938, "router_z_loss_mlp": 1.0703125, "step": 2588, "time_per_iteration": 2.497626543045044 }, { "auxiliary_loss_clip": 0.01156394, "auxiliary_loss_mlp": 0.01057178, "balance_loss_clip": 1.03601789, "balance_loss_mlp": 1.05335128, "epoch": 0.1556591011573726, "flos": 23148772744320.0, "grad_norm": 2.2847028625647487, "language_loss": 0.64421868, "learning_rate": 3.83663497412695e-06, "loss": 0.66635442, "num_input_tokens_seen": 56237160, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.03125, "step": 2589, "time_per_iteration": 2.483165979385376 }, { "auxiliary_loss_clip": 0.01154971, "auxiliary_loss_mlp": 0.0104656, "balance_loss_clip": 1.02553189, "balance_loss_mlp": 1.05077159, "epoch": 0.15571922441004057, "flos": 25370888745600.0, "grad_norm": 1.624717679060388, "language_loss": 0.82840824, "learning_rate": 3.836480772979281e-06, "loss": 0.85042363, "num_input_tokens_seen": 56257610, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.0390625, "step": 2590, "time_per_iteration": 2.4898016452789307 }, { "auxiliary_loss_clip": 0.01155473, "auxiliary_loss_mlp": 0.01048029, "balance_loss_clip": 1.02815676, "balance_loss_mlp": 1.05026913, "epoch": 0.15577934766270854, "flos": 14501375890560.0, "grad_norm": 2.165425586046355, "language_loss": 0.79023099, "learning_rate": 3.836326502192077e-06, "loss": 0.81226599, "num_input_tokens_seen": 56275215, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0546875, "step": 2591, "time_per_iteration": 2.4332361221313477 }, { "auxiliary_loss_clip": 0.01153914, "auxiliary_loss_mlp": 0.01049784, "balance_loss_clip": 1.03137851, "balance_loss_mlp": 1.05177748, "epoch": 0.15583947091537653, "flos": 37414537372800.0, "grad_norm": 2.1043714713771364, "language_loss": 0.64812368, "learning_rate": 3.836172161771189e-06, "loss": 0.67016065, "num_input_tokens_seen": 56297130, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0234375, "step": 2592, "time_per_iteration": 2.5857911109924316 }, { "auxiliary_loss_clip": 0.0116195, "auxiliary_loss_mlp": 0.01054818, "balance_loss_clip": 1.03406382, "balance_loss_mlp": 1.0555675, "epoch": 0.1558995941680445, "flos": 21834729498240.0, "grad_norm": 2.074896063877349, "language_loss": 0.8175931, "learning_rate": 3.836017751722467e-06, "loss": 0.83976084, "num_input_tokens_seen": 56314995, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.0625, "step": 2593, "time_per_iteration": 2.4780194759368896 }, { "auxiliary_loss_clip": 0.01152718, "auxiliary_loss_mlp": 0.01049332, "balance_loss_clip": 1.02857721, "balance_loss_mlp": 1.05154395, "epoch": 0.15595971742071246, "flos": 19792633484160.0, "grad_norm": 1.930785236890048, "language_loss": 0.73052096, "learning_rate": 3.8358632720517695e-06, "loss": 0.75254148, "num_input_tokens_seen": 56334005, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.0078125, "step": 2594, "time_per_iteration": 2.457505464553833 }, { "auxiliary_loss_clip": 0.01150652, "auxiliary_loss_mlp": 0.01040109, "balance_loss_clip": 1.02117872, "balance_loss_mlp": 1.0506835, "epoch": 0.15601984067338043, "flos": 26722135503360.0, "grad_norm": 2.202870079686508, "language_loss": 0.81513613, "learning_rate": 3.835708722764952e-06, "loss": 0.83704376, "num_input_tokens_seen": 56353795, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0, "step": 2595, "time_per_iteration": 3.981987237930298 }, { "auxiliary_loss_clip": 0.01153292, "auxiliary_loss_mlp": 0.01045985, "balance_loss_clip": 1.02638674, "balance_loss_mlp": 1.04914117, "epoch": 0.1560799639260484, "flos": 18369278173440.0, "grad_norm": 1.841294998620032, "language_loss": 0.87034452, "learning_rate": 3.835554103867876e-06, "loss": 0.89233726, "num_input_tokens_seen": 56373195, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.046875, "step": 2596, "time_per_iteration": 3.827545642852783 }, { "auxiliary_loss_clip": 0.0115125, "auxiliary_loss_mlp": 0.01042917, "balance_loss_clip": 1.02461863, "balance_loss_mlp": 1.0516659, "epoch": 0.15614008717871636, "flos": 22598980197120.0, "grad_norm": 2.2630575987711756, "language_loss": 0.68797886, "learning_rate": 3.835399415366404e-06, "loss": 0.70992053, "num_input_tokens_seen": 56391525, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.99609375, "step": 2597, "time_per_iteration": 2.4741158485412598 }, { "auxiliary_loss_clip": 0.0114917, "auxiliary_loss_mlp": 0.01044619, "balance_loss_clip": 1.026618, "balance_loss_mlp": 1.05126786, "epoch": 0.15620021043138435, "flos": 22746860490240.0, "grad_norm": 1.6630200166282623, "language_loss": 0.79813564, "learning_rate": 3.8352446572664035e-06, "loss": 0.8200736, "num_input_tokens_seen": 56410715, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9765625, "step": 2598, "time_per_iteration": 5.35629415512085 }, { "auxiliary_loss_clip": 0.01148393, "auxiliary_loss_mlp": 0.01042348, "balance_loss_clip": 1.02310801, "balance_loss_mlp": 1.0489707, "epoch": 0.15626033368405232, "flos": 13114936782720.0, "grad_norm": 1.9463930934864095, "language_loss": 0.82767135, "learning_rate": 3.8350898295737405e-06, "loss": 0.8495788, "num_input_tokens_seen": 56429170, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.99609375, "step": 2599, "time_per_iteration": 2.4531867504119873 }, { "auxiliary_loss_clip": 0.01159848, "auxiliary_loss_mlp": 0.01054723, "balance_loss_clip": 1.03306282, "balance_loss_mlp": 1.05219007, "epoch": 0.15632045693672028, "flos": 16472297105280.0, "grad_norm": 2.1282026082925563, "language_loss": 0.82217848, "learning_rate": 3.834934932294287e-06, "loss": 0.84432423, "num_input_tokens_seen": 56445685, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.078125, "step": 2600, "time_per_iteration": 2.4123473167419434 }, { "auxiliary_loss_clip": 0.0115935, "auxiliary_loss_mlp": 0.01049944, "balance_loss_clip": 1.0309782, "balance_loss_mlp": 1.05535364, "epoch": 0.15638058018938825, "flos": 20850346298880.0, "grad_norm": 2.2870227795224336, "language_loss": 0.88103312, "learning_rate": 3.834779965433917e-06, "loss": 0.90312612, "num_input_tokens_seen": 56465900, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.046875, "step": 2601, "time_per_iteration": 2.4855501651763916 }, { "auxiliary_loss_clip": 0.01161666, "auxiliary_loss_mlp": 0.01069529, "balance_loss_clip": 1.04796433, "balance_loss_mlp": 1.05694556, "epoch": 0.1564407034420562, "flos": 21872220318720.0, "grad_norm": 1.8533384871212988, "language_loss": 0.78440803, "learning_rate": 3.834624928998508e-06, "loss": 0.80672002, "num_input_tokens_seen": 56485020, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.046875, "step": 2602, "time_per_iteration": 2.463022232055664 }, { "auxiliary_loss_clip": 0.01157655, "auxiliary_loss_mlp": 0.01048995, "balance_loss_clip": 1.02974296, "balance_loss_mlp": 1.05429792, "epoch": 0.15650082669472418, "flos": 21834549930240.0, "grad_norm": 1.8579160907606889, "language_loss": 0.74205506, "learning_rate": 3.8344698229939376e-06, "loss": 0.76412153, "num_input_tokens_seen": 56505205, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.03125, "step": 2603, "time_per_iteration": 2.488152503967285 }, { "auxiliary_loss_clip": 0.01159194, "auxiliary_loss_mlp": 0.01053774, "balance_loss_clip": 1.03329372, "balance_loss_mlp": 1.05508769, "epoch": 0.15656094994739214, "flos": 13800542653440.0, "grad_norm": 3.0234585060106998, "language_loss": 0.87823355, "learning_rate": 3.8343146474260865e-06, "loss": 0.90036321, "num_input_tokens_seen": 56521495, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.0390625, "step": 2604, "time_per_iteration": 2.45455002784729 }, { "auxiliary_loss_clip": 0.01161076, "auxiliary_loss_mlp": 0.01049409, "balance_loss_clip": 1.03012109, "balance_loss_mlp": 1.05504763, "epoch": 0.15662107320006013, "flos": 27308197808640.0, "grad_norm": 1.9956075629136996, "language_loss": 0.8505187, "learning_rate": 3.834159402300841e-06, "loss": 0.87262356, "num_input_tokens_seen": 56540665, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0625, "step": 2605, "time_per_iteration": 2.528876304626465 }, { "auxiliary_loss_clip": 0.01165402, "auxiliary_loss_mlp": 0.01051196, "balance_loss_clip": 1.03034592, "balance_loss_mlp": 1.05739224, "epoch": 0.1566811964527281, "flos": 26685075646080.0, "grad_norm": 2.7895962404370382, "language_loss": 0.72976923, "learning_rate": 3.834004087624087e-06, "loss": 0.75193524, "num_input_tokens_seen": 56560805, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.078125, "step": 2606, "time_per_iteration": 2.4988701343536377 }, { "auxiliary_loss_clip": 0.01162072, "auxiliary_loss_mlp": 0.01047769, "balance_loss_clip": 1.02954173, "balance_loss_mlp": 1.05890381, "epoch": 0.15674131970539606, "flos": 16103422385280.0, "grad_norm": 3.9156365565637903, "language_loss": 0.76122797, "learning_rate": 3.8338487034017145e-06, "loss": 0.78332639, "num_input_tokens_seen": 56576335, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.03125, "step": 2607, "time_per_iteration": 2.4435033798217773 }, { "auxiliary_loss_clip": 0.01158995, "auxiliary_loss_mlp": 0.01044426, "balance_loss_clip": 1.02582908, "balance_loss_mlp": 1.0575974, "epoch": 0.15680144295806403, "flos": 19169690889600.0, "grad_norm": 1.862964545616024, "language_loss": 0.81664264, "learning_rate": 3.833693249639615e-06, "loss": 0.83867681, "num_input_tokens_seen": 56595880, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.015625, "step": 2608, "time_per_iteration": 2.4623801708221436 }, { "auxiliary_loss_clip": 0.0116423, "auxiliary_loss_mlp": 0.01051721, "balance_loss_clip": 1.02958345, "balance_loss_mlp": 1.05793536, "epoch": 0.156861566210732, "flos": 20813430096000.0, "grad_norm": 1.7015282181758666, "language_loss": 0.7249499, "learning_rate": 3.833537726343684e-06, "loss": 0.74710941, "num_input_tokens_seen": 56615130, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.0625, "step": 2609, "time_per_iteration": 2.4922850131988525 }, { "auxiliary_loss_clip": 0.01160837, "auxiliary_loss_mlp": 0.01041513, "balance_loss_clip": 1.02196288, "balance_loss_mlp": 1.05436254, "epoch": 0.15692168946339996, "flos": 20047922421120.0, "grad_norm": 1.741619806152399, "language_loss": 0.71642518, "learning_rate": 3.833382133519818e-06, "loss": 0.73844874, "num_input_tokens_seen": 56634005, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0625, "step": 2610, "time_per_iteration": 2.4775590896606445 }, { "auxiliary_loss_clip": 0.01161431, "auxiliary_loss_mlp": 0.01052604, "balance_loss_clip": 1.03121781, "balance_loss_mlp": 1.05473614, "epoch": 0.15698181271606793, "flos": 21398019943680.0, "grad_norm": 2.009582014693558, "language_loss": 0.72637409, "learning_rate": 3.833226471173919e-06, "loss": 0.74851447, "num_input_tokens_seen": 56653480, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.0703125, "step": 2611, "time_per_iteration": 2.4781506061553955 }, { "auxiliary_loss_clip": 0.01157748, "auxiliary_loss_mlp": 0.01044636, "balance_loss_clip": 1.02528834, "balance_loss_mlp": 1.05446374, "epoch": 0.15704193596873592, "flos": 20845785271680.0, "grad_norm": 1.9175866373149804, "language_loss": 0.70602363, "learning_rate": 3.833070739311887e-06, "loss": 0.72804749, "num_input_tokens_seen": 56672270, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.03125, "step": 2612, "time_per_iteration": 2.4495127201080322 }, { "auxiliary_loss_clip": 0.01164427, "auxiliary_loss_mlp": 0.01056243, "balance_loss_clip": 1.03616762, "balance_loss_mlp": 1.05829394, "epoch": 0.15710205922140388, "flos": 21762908254080.0, "grad_norm": 1.842010308917219, "language_loss": 0.75813651, "learning_rate": 3.83291493793963e-06, "loss": 0.78034317, "num_input_tokens_seen": 56691510, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0625, "step": 2613, "time_per_iteration": 2.477426528930664 }, { "auxiliary_loss_clip": 0.0115829, "auxiliary_loss_mlp": 0.0106175, "balance_loss_clip": 1.04202044, "balance_loss_mlp": 1.05364406, "epoch": 0.15716218247407185, "flos": 25007760201600.0, "grad_norm": 1.7068215309951336, "language_loss": 0.65874302, "learning_rate": 3.832759067063055e-06, "loss": 0.68094343, "num_input_tokens_seen": 56712230, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.046875, "step": 2614, "time_per_iteration": 2.4954662322998047 }, { "auxiliary_loss_clip": 0.01164077, "auxiliary_loss_mlp": 0.01048533, "balance_loss_clip": 1.02790964, "balance_loss_mlp": 1.05723333, "epoch": 0.1572223057267398, "flos": 20191780391040.0, "grad_norm": 2.1804230887699174, "language_loss": 0.75309318, "learning_rate": 3.832603126688072e-06, "loss": 0.77521932, "num_input_tokens_seen": 56727490, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.0703125, "step": 2615, "time_per_iteration": 2.4561586380004883 }, { "auxiliary_loss_clip": 0.01155896, "auxiliary_loss_mlp": 0.0105009, "balance_loss_clip": 1.03125525, "balance_loss_mlp": 1.05551302, "epoch": 0.15728242897940778, "flos": 20959514709120.0, "grad_norm": 1.5646507157374, "language_loss": 0.7288996, "learning_rate": 3.832447116820594e-06, "loss": 0.75095952, "num_input_tokens_seen": 56747385, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.0078125, "step": 2616, "time_per_iteration": 2.461153507232666 }, { "auxiliary_loss_clip": 0.01160311, "auxiliary_loss_mlp": 0.01049399, "balance_loss_clip": 1.02903855, "balance_loss_mlp": 1.05481923, "epoch": 0.15734255223207574, "flos": 23038275530880.0, "grad_norm": 2.605944578142777, "language_loss": 0.7278465, "learning_rate": 3.832291037466539e-06, "loss": 0.74994361, "num_input_tokens_seen": 56768055, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.0546875, "step": 2617, "time_per_iteration": 2.4990673065185547 }, { "auxiliary_loss_clip": 0.01158882, "auxiliary_loss_mlp": 0.01043725, "balance_loss_clip": 1.02435315, "balance_loss_mlp": 1.05631614, "epoch": 0.15740267548474374, "flos": 20551281661440.0, "grad_norm": 2.129571183478307, "language_loss": 0.73927051, "learning_rate": 3.8321348886318235e-06, "loss": 0.76129657, "num_input_tokens_seen": 56785110, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0234375, "step": 2618, "time_per_iteration": 2.4477126598358154 }, { "auxiliary_loss_clip": 0.01164074, "auxiliary_loss_mlp": 0.01051657, "balance_loss_clip": 1.03013933, "balance_loss_mlp": 1.05568993, "epoch": 0.1574627987374117, "flos": 22666922772480.0, "grad_norm": 1.8955972884889678, "language_loss": 0.78760266, "learning_rate": 3.8319786703223695e-06, "loss": 0.80976003, "num_input_tokens_seen": 56804975, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.0859375, "step": 2619, "time_per_iteration": 2.5075528621673584 }, { "auxiliary_loss_clip": 0.01159569, "auxiliary_loss_mlp": 0.01058151, "balance_loss_clip": 1.03869653, "balance_loss_mlp": 1.05663788, "epoch": 0.15752292199007967, "flos": 16800664262400.0, "grad_norm": 1.745868247548471, "language_loss": 0.76982528, "learning_rate": 3.831822382544101e-06, "loss": 0.7920025, "num_input_tokens_seen": 56822470, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.03125, "step": 2620, "time_per_iteration": 2.432739496231079 }, { "auxiliary_loss_clip": 0.01162267, "auxiliary_loss_mlp": 0.01053479, "balance_loss_clip": 1.03180671, "balance_loss_mlp": 1.05622065, "epoch": 0.15758304524274763, "flos": 29826002568960.0, "grad_norm": 1.7854886614687573, "language_loss": 0.70963198, "learning_rate": 3.831666025302944e-06, "loss": 0.73178947, "num_input_tokens_seen": 56842100, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.0625, "step": 2621, "time_per_iteration": 2.5497074127197266 }, { "auxiliary_loss_clip": 0.01161524, "auxiliary_loss_mlp": 0.01050395, "balance_loss_clip": 1.02927089, "balance_loss_mlp": 1.05552852, "epoch": 0.1576431684954156, "flos": 53577426723840.0, "grad_norm": 3.207498953693197, "language_loss": 0.72226143, "learning_rate": 3.831509598604828e-06, "loss": 0.74438059, "num_input_tokens_seen": 56865920, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.0625, "step": 2622, "time_per_iteration": 2.7459447383880615 }, { "auxiliary_loss_clip": 0.01157125, "auxiliary_loss_mlp": 0.01042835, "balance_loss_clip": 1.02380908, "balance_loss_mlp": 1.05345535, "epoch": 0.15770329174808356, "flos": 20813609664000.0, "grad_norm": 1.7139206826289257, "language_loss": 0.87879908, "learning_rate": 3.831353102455684e-06, "loss": 0.90079868, "num_input_tokens_seen": 56885265, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0390625, "step": 2623, "time_per_iteration": 2.4791016578674316 }, { "auxiliary_loss_clip": 0.01157064, "auxiliary_loss_mlp": 0.01047073, "balance_loss_clip": 1.02865517, "balance_loss_mlp": 1.05396545, "epoch": 0.15776341500075153, "flos": 24974004395520.0, "grad_norm": 2.631288615493795, "language_loss": 0.82163447, "learning_rate": 3.831196536861448e-06, "loss": 0.84367579, "num_input_tokens_seen": 56906710, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.03125, "step": 2624, "time_per_iteration": 2.4921939373016357 }, { "auxiliary_loss_clip": 0.01160844, "auxiliary_loss_mlp": 0.0104816, "balance_loss_clip": 1.02745318, "balance_loss_mlp": 1.05455315, "epoch": 0.15782353825341952, "flos": 21907915459200.0, "grad_norm": 2.404356888056872, "language_loss": 0.79922938, "learning_rate": 3.831039901828054e-06, "loss": 0.8213194, "num_input_tokens_seen": 56924275, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.0625, "step": 2625, "time_per_iteration": 2.4712345600128174 }, { "auxiliary_loss_clip": 0.01156096, "auxiliary_loss_mlp": 0.01045565, "balance_loss_clip": 1.02789843, "balance_loss_mlp": 1.05398345, "epoch": 0.15788366150608749, "flos": 26177191292160.0, "grad_norm": 2.6837699414315077, "language_loss": 0.80274242, "learning_rate": 3.830883197361445e-06, "loss": 0.82475901, "num_input_tokens_seen": 56941525, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 1.015625, "step": 2626, "time_per_iteration": 2.492702007293701 }, { "auxiliary_loss_clip": 0.01160142, "auxiliary_loss_mlp": 0.01046572, "balance_loss_clip": 1.0266763, "balance_loss_mlp": 1.05704093, "epoch": 0.15794378475875545, "flos": 27709822753920.0, "grad_norm": 1.6177363046208173, "language_loss": 0.73714489, "learning_rate": 3.830726423467561e-06, "loss": 0.75921202, "num_input_tokens_seen": 56962145, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.03125, "step": 2627, "time_per_iteration": 2.534917116165161 }, { "auxiliary_loss_clip": 0.01157422, "auxiliary_loss_mlp": 0.01048852, "balance_loss_clip": 1.02913499, "balance_loss_mlp": 1.05382478, "epoch": 0.15800390801142342, "flos": 12130158533760.0, "grad_norm": 1.9269332340468581, "language_loss": 0.84868872, "learning_rate": 3.830569580152348e-06, "loss": 0.87075138, "num_input_tokens_seen": 56977505, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.03125, "step": 2628, "time_per_iteration": 2.4314873218536377 }, { "auxiliary_loss_clip": 0.01152374, "auxiliary_loss_mlp": 0.01042078, "balance_loss_clip": 1.02461386, "balance_loss_mlp": 1.05099785, "epoch": 0.15806403126409138, "flos": 20704728562560.0, "grad_norm": 1.8575841092145873, "language_loss": 0.76991558, "learning_rate": 3.830412667421752e-06, "loss": 0.7918601, "num_input_tokens_seen": 56996770, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 1.015625, "step": 2629, "time_per_iteration": 2.485165596008301 }, { "auxiliary_loss_clip": 0.0115842, "auxiliary_loss_mlp": 0.01049756, "balance_loss_clip": 1.02997899, "balance_loss_mlp": 1.05420685, "epoch": 0.15812415451675935, "flos": 17821712269440.0, "grad_norm": 2.974545920842858, "language_loss": 0.73662597, "learning_rate": 3.8302556852817245e-06, "loss": 0.75870776, "num_input_tokens_seen": 57014970, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0390625, "step": 2630, "time_per_iteration": 2.415400266647339 }, { "auxiliary_loss_clip": 0.01159349, "auxiliary_loss_mlp": 0.01046567, "balance_loss_clip": 1.0271244, "balance_loss_mlp": 1.05316329, "epoch": 0.15818427776942734, "flos": 20084048524800.0, "grad_norm": 2.0767927782734343, "language_loss": 0.8381846, "learning_rate": 3.8300986337382184e-06, "loss": 0.8602438, "num_input_tokens_seen": 57034045, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.0625, "step": 2631, "time_per_iteration": 2.482954740524292 }, { "auxiliary_loss_clip": 0.01154716, "auxiliary_loss_mlp": 0.01045352, "balance_loss_clip": 1.02668405, "balance_loss_mlp": 1.05127025, "epoch": 0.1582444010220953, "flos": 21214911386880.0, "grad_norm": 1.7166952319556545, "language_loss": 0.78284091, "learning_rate": 3.8299415127971895e-06, "loss": 0.80484164, "num_input_tokens_seen": 57053695, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.03125, "step": 2632, "time_per_iteration": 2.474118232727051 }, { "auxiliary_loss_clip": 0.01160632, "auxiliary_loss_mlp": 0.01053733, "balance_loss_clip": 1.03394401, "balance_loss_mlp": 1.05597126, "epoch": 0.15830452427476327, "flos": 17858341163520.0, "grad_norm": 1.9331974673791723, "language_loss": 0.83263332, "learning_rate": 3.829784322464594e-06, "loss": 0.85477698, "num_input_tokens_seen": 57071290, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.046875, "step": 2633, "time_per_iteration": 2.4969892501831055 }, { "auxiliary_loss_clip": 0.0116158, "auxiliary_loss_mlp": 0.01046502, "balance_loss_clip": 1.02754807, "balance_loss_mlp": 1.05616641, "epoch": 0.15836464752743123, "flos": 24534960456960.0, "grad_norm": 1.8067709070719344, "language_loss": 0.77213019, "learning_rate": 3.829627062746394e-06, "loss": 0.79421103, "num_input_tokens_seen": 57091465, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0546875, "step": 2634, "time_per_iteration": 2.5180869102478027 }, { "auxiliary_loss_clip": 0.01160356, "auxiliary_loss_mlp": 0.01045687, "balance_loss_clip": 1.02581501, "balance_loss_mlp": 1.05338025, "epoch": 0.1584247707800992, "flos": 20120821073280.0, "grad_norm": 2.115947578829429, "language_loss": 0.88897645, "learning_rate": 3.829469733648552e-06, "loss": 0.91103685, "num_input_tokens_seen": 57110075, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0703125, "step": 2635, "time_per_iteration": 2.491065502166748 }, { "auxiliary_loss_clip": 0.01158151, "auxiliary_loss_mlp": 0.01056552, "balance_loss_clip": 1.0358696, "balance_loss_mlp": 1.05203247, "epoch": 0.15848489403276717, "flos": 20375966355840.0, "grad_norm": 2.6207357040725716, "language_loss": 0.75662947, "learning_rate": 3.829312335177034e-06, "loss": 0.77877653, "num_input_tokens_seen": 57128945, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.0625, "step": 2636, "time_per_iteration": 2.4698915481567383 }, { "auxiliary_loss_clip": 0.0116125, "auxiliary_loss_mlp": 0.01045848, "balance_loss_clip": 1.02492642, "balance_loss_mlp": 1.05388391, "epoch": 0.15854501728543513, "flos": 39346890359040.0, "grad_norm": 1.9341741707104352, "language_loss": 0.72252929, "learning_rate": 3.82915486733781e-06, "loss": 0.74460024, "num_input_tokens_seen": 57152385, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.078125, "step": 2637, "time_per_iteration": 4.134963512420654 }, { "auxiliary_loss_clip": 0.01154767, "auxiliary_loss_mlp": 0.01044016, "balance_loss_clip": 1.02525234, "balance_loss_mlp": 1.05214632, "epoch": 0.15860514053810312, "flos": 24864225454080.0, "grad_norm": 1.908535491880599, "language_loss": 0.77609724, "learning_rate": 3.82899733013685e-06, "loss": 0.79808509, "num_input_tokens_seen": 57172620, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.03125, "step": 2638, "time_per_iteration": 3.875924587249756 }, { "auxiliary_loss_clip": 0.01158253, "auxiliary_loss_mlp": 0.01059835, "balance_loss_clip": 1.03809106, "balance_loss_mlp": 1.05202007, "epoch": 0.1586652637907711, "flos": 26177694082560.0, "grad_norm": 1.906055026886879, "language_loss": 0.7581923, "learning_rate": 3.828839723580128e-06, "loss": 0.78037322, "num_input_tokens_seen": 57194680, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.0625, "step": 2639, "time_per_iteration": 3.8812568187713623 }, { "auxiliary_loss_clip": 0.01159216, "auxiliary_loss_mlp": 0.01056686, "balance_loss_clip": 1.03726709, "balance_loss_mlp": 1.05298078, "epoch": 0.15872538704343905, "flos": 19792058866560.0, "grad_norm": 2.2337184840431594, "language_loss": 0.81437314, "learning_rate": 3.82868204767362e-06, "loss": 0.83653212, "num_input_tokens_seen": 57214675, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0625, "step": 2640, "time_per_iteration": 3.884284019470215 }, { "auxiliary_loss_clip": 0.01154506, "auxiliary_loss_mlp": 0.01053361, "balance_loss_clip": 1.03402531, "balance_loss_mlp": 1.05209041, "epoch": 0.15878551029610702, "flos": 28475366342400.0, "grad_norm": 1.3834838026718277, "language_loss": 0.67051518, "learning_rate": 3.828524302423306e-06, "loss": 0.69259393, "num_input_tokens_seen": 57235830, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0234375, "step": 2641, "time_per_iteration": 2.537442922592163 }, { "auxiliary_loss_clip": 0.01164871, "auxiliary_loss_mlp": 0.01056831, "balance_loss_clip": 1.03612447, "balance_loss_mlp": 1.05477035, "epoch": 0.15884563354877498, "flos": 24206701040640.0, "grad_norm": 2.1379235281835256, "language_loss": 0.75265217, "learning_rate": 3.828366487835167e-06, "loss": 0.7748692, "num_input_tokens_seen": 57255970, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.1015625, "step": 2642, "time_per_iteration": 2.4739224910736084 }, { "auxiliary_loss_clip": 0.01153084, "auxiliary_loss_mlp": 0.01049762, "balance_loss_clip": 1.03015232, "balance_loss_mlp": 1.05187583, "epoch": 0.15890575680144295, "flos": 23949795991680.0, "grad_norm": 2.4407086792515136, "language_loss": 0.70283502, "learning_rate": 3.828208603915186e-06, "loss": 0.72486341, "num_input_tokens_seen": 57274435, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.015625, "step": 2643, "time_per_iteration": 2.4852025508880615 }, { "auxiliary_loss_clip": 0.01154664, "auxiliary_loss_mlp": 0.01046643, "balance_loss_clip": 1.0287025, "balance_loss_mlp": 1.05309129, "epoch": 0.15896588005411091, "flos": 21215019127680.0, "grad_norm": 2.0210399737704363, "language_loss": 0.78635806, "learning_rate": 3.828050650669353e-06, "loss": 0.80837113, "num_input_tokens_seen": 57293115, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.015625, "step": 2644, "time_per_iteration": 2.491943597793579 }, { "auxiliary_loss_clip": 0.01156729, "auxiliary_loss_mlp": 0.01050239, "balance_loss_clip": 1.03121281, "balance_loss_mlp": 1.05343628, "epoch": 0.1590260033067789, "flos": 24352390604160.0, "grad_norm": 1.7912517762796933, "language_loss": 0.8230834, "learning_rate": 3.827892628103657e-06, "loss": 0.84515309, "num_input_tokens_seen": 57312565, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.03125, "step": 2645, "time_per_iteration": 2.535111665725708 }, { "auxiliary_loss_clip": 0.01157897, "auxiliary_loss_mlp": 0.01049811, "balance_loss_clip": 1.02914047, "balance_loss_mlp": 1.05157685, "epoch": 0.15908612655944687, "flos": 32048944583040.0, "grad_norm": 2.7165429279862416, "language_loss": 0.70077443, "learning_rate": 3.827734536224087e-06, "loss": 0.72285151, "num_input_tokens_seen": 57333360, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.0625, "step": 2646, "time_per_iteration": 2.537173271179199 }, { "auxiliary_loss_clip": 0.0115252, "auxiliary_loss_mlp": 0.01047947, "balance_loss_clip": 1.02902889, "balance_loss_mlp": 1.05165315, "epoch": 0.15914624981211484, "flos": 17785370684160.0, "grad_norm": 2.20540658205476, "language_loss": 0.62517345, "learning_rate": 3.827576375036642e-06, "loss": 0.64717817, "num_input_tokens_seen": 57350575, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0078125, "step": 2647, "time_per_iteration": 2.4806230068206787 }, { "auxiliary_loss_clip": 0.01158404, "auxiliary_loss_mlp": 0.01048695, "balance_loss_clip": 1.02891862, "balance_loss_mlp": 1.05541408, "epoch": 0.1592063730647828, "flos": 17712507945600.0, "grad_norm": 3.032685690423668, "language_loss": 0.89491665, "learning_rate": 3.827418144547318e-06, "loss": 0.91698766, "num_input_tokens_seen": 57367570, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.03125, "step": 2648, "time_per_iteration": 2.4308419227600098 }, { "auxiliary_loss_clip": 0.01153699, "auxiliary_loss_mlp": 0.01047621, "balance_loss_clip": 1.0293467, "balance_loss_mlp": 1.0538013, "epoch": 0.15926649631745077, "flos": 18803545603200.0, "grad_norm": 2.0135370747699834, "language_loss": 0.91242707, "learning_rate": 3.827259844762114e-06, "loss": 0.93444026, "num_input_tokens_seen": 57383980, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.0, "step": 2649, "time_per_iteration": 2.465911388397217 }, { "auxiliary_loss_clip": 0.01165889, "auxiliary_loss_mlp": 0.01048862, "balance_loss_clip": 1.02763104, "balance_loss_mlp": 1.05324101, "epoch": 0.15932661957011873, "flos": 17566243764480.0, "grad_norm": 2.3320662471076177, "language_loss": 0.70985866, "learning_rate": 3.827101475687033e-06, "loss": 0.73200619, "num_input_tokens_seen": 57400840, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.125, "step": 2650, "time_per_iteration": 2.4228885173797607 }, { "auxiliary_loss_clip": 0.01153505, "auxiliary_loss_mlp": 0.01041188, "balance_loss_clip": 1.02333045, "balance_loss_mlp": 1.0528903, "epoch": 0.15938674282278673, "flos": 13334351011200.0, "grad_norm": 2.1737188635009987, "language_loss": 0.70743185, "learning_rate": 3.826943037328082e-06, "loss": 0.72937882, "num_input_tokens_seen": 57419230, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 1.0078125, "step": 2651, "time_per_iteration": 2.488342046737671 }, { "auxiliary_loss_clip": 0.01157888, "auxiliary_loss_mlp": 0.01048129, "balance_loss_clip": 1.02833998, "balance_loss_mlp": 1.05280471, "epoch": 0.1594468660754547, "flos": 22488842119680.0, "grad_norm": 1.913362562311664, "language_loss": 0.8026408, "learning_rate": 3.8267845296912674e-06, "loss": 0.82470089, "num_input_tokens_seen": 57439315, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.046875, "step": 2652, "time_per_iteration": 2.4654979705810547 }, { "auxiliary_loss_clip": 0.01153006, "auxiliary_loss_mlp": 0.01046444, "balance_loss_clip": 1.02775228, "balance_loss_mlp": 1.05297923, "epoch": 0.15950698932812266, "flos": 15007320910080.0, "grad_norm": 2.525184977156922, "language_loss": 0.69977784, "learning_rate": 3.826625952782601e-06, "loss": 0.72177243, "num_input_tokens_seen": 57454635, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0, "step": 2653, "time_per_iteration": 2.4306554794311523 }, { "auxiliary_loss_clip": 0.01154263, "auxiliary_loss_mlp": 0.01038889, "balance_loss_clip": 1.01970792, "balance_loss_mlp": 1.05139565, "epoch": 0.15956711258079062, "flos": 30155052084480.0, "grad_norm": 4.855247292511209, "language_loss": 0.76383686, "learning_rate": 3.826467306608095e-06, "loss": 0.78576839, "num_input_tokens_seen": 57476805, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.03125, "step": 2654, "time_per_iteration": 2.537017583847046 }, { "auxiliary_loss_clip": 0.01152017, "auxiliary_loss_mlp": 0.01041195, "balance_loss_clip": 1.02234769, "balance_loss_mlp": 1.04924226, "epoch": 0.1596272358334586, "flos": 21032700670080.0, "grad_norm": 2.0143593158748256, "language_loss": 0.82076603, "learning_rate": 3.826308591173765e-06, "loss": 0.84269816, "num_input_tokens_seen": 57496400, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.0234375, "step": 2655, "time_per_iteration": 2.4774982929229736 }, { "auxiliary_loss_clip": 0.0115481, "auxiliary_loss_mlp": 0.01046361, "balance_loss_clip": 1.02824163, "balance_loss_mlp": 1.05099511, "epoch": 0.15968735908612655, "flos": 15268032800640.0, "grad_norm": 1.9425160494875415, "language_loss": 0.736175, "learning_rate": 3.826149806485631e-06, "loss": 0.7581867, "num_input_tokens_seen": 57513700, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0390625, "step": 2656, "time_per_iteration": 2.4435086250305176 }, { "auxiliary_loss_clip": 0.01150686, "auxiliary_loss_mlp": 0.01042758, "balance_loss_clip": 1.02429271, "balance_loss_mlp": 1.05076694, "epoch": 0.15974748233879452, "flos": 52665726695040.0, "grad_norm": 1.7970460301062994, "language_loss": 0.77674866, "learning_rate": 3.825990952549713e-06, "loss": 0.79868311, "num_input_tokens_seen": 57536180, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0, "step": 2657, "time_per_iteration": 2.8031957149505615 }, { "auxiliary_loss_clip": 0.01153568, "auxiliary_loss_mlp": 0.01048323, "balance_loss_clip": 1.02938056, "balance_loss_mlp": 1.05313814, "epoch": 0.1598076055914625, "flos": 18733232730240.0, "grad_norm": 1.6748445858285197, "language_loss": 0.74939358, "learning_rate": 3.825832029372035e-06, "loss": 0.77141249, "num_input_tokens_seen": 57555025, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0, "step": 2658, "time_per_iteration": 2.477531671524048 }, { "auxiliary_loss_clip": 0.01156034, "auxiliary_loss_mlp": 0.01046722, "balance_loss_clip": 1.02553821, "balance_loss_mlp": 1.05155945, "epoch": 0.15986772884413047, "flos": 34349238535680.0, "grad_norm": 1.916183903305714, "language_loss": 0.75505614, "learning_rate": 3.825673036958624e-06, "loss": 0.77708369, "num_input_tokens_seen": 57577660, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.046875, "step": 2659, "time_per_iteration": 2.587114095687866 }, { "auxiliary_loss_clip": 0.01155375, "auxiliary_loss_mlp": 0.01048112, "balance_loss_clip": 1.02804887, "balance_loss_mlp": 1.05142522, "epoch": 0.15992785209679844, "flos": 22054969739520.0, "grad_norm": 2.1246448567238265, "language_loss": 0.90646577, "learning_rate": 3.825513975315508e-06, "loss": 0.92850065, "num_input_tokens_seen": 57596335, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0390625, "step": 2660, "time_per_iteration": 2.4884066581726074 }, { "auxiliary_loss_clip": 0.01158439, "auxiliary_loss_mlp": 0.01045585, "balance_loss_clip": 1.02603507, "balance_loss_mlp": 1.05378819, "epoch": 0.1599879753494664, "flos": 33066652625280.0, "grad_norm": 1.9758693614319227, "language_loss": 0.78205556, "learning_rate": 3.82535484444872e-06, "loss": 0.8040958, "num_input_tokens_seen": 57616830, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.046875, "step": 2661, "time_per_iteration": 2.5758426189422607 }, { "auxiliary_loss_clip": 0.01156658, "auxiliary_loss_mlp": 0.01043793, "balance_loss_clip": 1.02467179, "balance_loss_mlp": 1.05217683, "epoch": 0.16004809860213437, "flos": 28038010343040.0, "grad_norm": 1.7626103269751705, "language_loss": 0.74711043, "learning_rate": 3.825195644364292e-06, "loss": 0.76911497, "num_input_tokens_seen": 57635515, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.046875, "step": 2662, "time_per_iteration": 2.5065319538116455 }, { "auxiliary_loss_clip": 0.01155802, "auxiliary_loss_mlp": 0.01046527, "balance_loss_clip": 1.02735853, "balance_loss_mlp": 1.05117345, "epoch": 0.16010822185480234, "flos": 22780113505920.0, "grad_norm": 1.769371470455651, "language_loss": 0.8206687, "learning_rate": 3.825036375068263e-06, "loss": 0.84269196, "num_input_tokens_seen": 57654250, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.046875, "step": 2663, "time_per_iteration": 2.478372097015381 }, { "auxiliary_loss_clip": 0.01160037, "auxiliary_loss_mlp": 0.01051746, "balance_loss_clip": 1.03224289, "balance_loss_mlp": 1.0545491, "epoch": 0.16016834510747033, "flos": 20084012611200.0, "grad_norm": 2.237442693922041, "language_loss": 0.79635143, "learning_rate": 3.824877036566672e-06, "loss": 0.81846923, "num_input_tokens_seen": 57672645, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0546875, "step": 2664, "time_per_iteration": 2.443310260772705 }, { "auxiliary_loss_clip": 0.01152507, "auxiliary_loss_mlp": 0.01049065, "balance_loss_clip": 1.03019452, "balance_loss_mlp": 1.0498327, "epoch": 0.1602284683601383, "flos": 21173829206400.0, "grad_norm": 1.905558456375048, "language_loss": 0.93814099, "learning_rate": 3.824717628865561e-06, "loss": 0.96015674, "num_input_tokens_seen": 57691055, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.03125, "step": 2665, "time_per_iteration": 2.461466073989868 }, { "auxiliary_loss_clip": 0.0115412, "auxiliary_loss_mlp": 0.01044407, "balance_loss_clip": 1.02529764, "balance_loss_mlp": 1.04961252, "epoch": 0.16028859161280626, "flos": 14647568244480.0, "grad_norm": 2.122251831764075, "language_loss": 0.84776747, "learning_rate": 3.824558151970974e-06, "loss": 0.86975271, "num_input_tokens_seen": 57707235, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.046875, "step": 2666, "time_per_iteration": 2.40644907951355 }, { "auxiliary_loss_clip": 0.01155031, "auxiliary_loss_mlp": 0.01047047, "balance_loss_clip": 1.02849793, "balance_loss_mlp": 1.05180824, "epoch": 0.16034871486547422, "flos": 20990325600000.0, "grad_norm": 1.9650676172701036, "language_loss": 0.81366253, "learning_rate": 3.8243986058889595e-06, "loss": 0.83568329, "num_input_tokens_seen": 57724190, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.03125, "step": 2667, "time_per_iteration": 2.472419500350952 }, { "auxiliary_loss_clip": 0.01156072, "auxiliary_loss_mlp": 0.0105053, "balance_loss_clip": 1.03098011, "balance_loss_mlp": 1.05378318, "epoch": 0.1604088381181422, "flos": 21397732634880.0, "grad_norm": 1.7272970812896848, "language_loss": 0.73546654, "learning_rate": 3.824238990625567e-06, "loss": 0.7575326, "num_input_tokens_seen": 57743620, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0234375, "step": 2668, "time_per_iteration": 2.44401216506958 }, { "auxiliary_loss_clip": 0.01154972, "auxiliary_loss_mlp": 0.01055265, "balance_loss_clip": 1.03623939, "balance_loss_mlp": 1.0510844, "epoch": 0.16046896137081015, "flos": 23877040993920.0, "grad_norm": 1.558853626187972, "language_loss": 0.77623761, "learning_rate": 3.824079306186848e-06, "loss": 0.79833996, "num_input_tokens_seen": 57764810, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0390625, "step": 2669, "time_per_iteration": 2.4907584190368652 }, { "auxiliary_loss_clip": 0.01056539, "auxiliary_loss_mlp": 0.01044513, "balance_loss_clip": 1.04167581, "balance_loss_mlp": 1.02145934, "epoch": 0.16052908462347812, "flos": 59806709015040.0, "grad_norm": 0.815792970515426, "language_loss": 0.55531633, "learning_rate": 3.823919552578861e-06, "loss": 0.57632685, "num_input_tokens_seen": 57824390, "router_z_loss_clip": 0.02832031, "router_z_loss_mlp": 0.3515625, "step": 2670, "time_per_iteration": 2.9558537006378174 }, { "auxiliary_loss_clip": 0.01152648, "auxiliary_loss_mlp": 0.01042872, "balance_loss_clip": 1.0237273, "balance_loss_mlp": 1.04807568, "epoch": 0.1605892078761461, "flos": 18296559089280.0, "grad_norm": 2.1803761321758346, "language_loss": 0.77559328, "learning_rate": 3.82375972980766e-06, "loss": 0.79754853, "num_input_tokens_seen": 57843665, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.046875, "step": 2671, "time_per_iteration": 2.450568437576294 }, { "auxiliary_loss_clip": 0.01156939, "auxiliary_loss_mlp": 0.01042739, "balance_loss_clip": 1.02435708, "balance_loss_mlp": 1.05285442, "epoch": 0.16064933112881408, "flos": 32160734686080.0, "grad_norm": 1.944667349626632, "language_loss": 0.65124261, "learning_rate": 3.8235998378793086e-06, "loss": 0.67323935, "num_input_tokens_seen": 57863305, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0390625, "step": 2672, "time_per_iteration": 2.575141191482544 }, { "auxiliary_loss_clip": 0.01157654, "auxiliary_loss_mlp": 0.01040883, "balance_loss_clip": 1.02008128, "balance_loss_mlp": 1.0514977, "epoch": 0.16070945438148204, "flos": 19828795501440.0, "grad_norm": 1.8557433860071957, "language_loss": 0.85739887, "learning_rate": 3.8234398767998675e-06, "loss": 0.87938422, "num_input_tokens_seen": 57883025, "router_z_loss_clip": 0.20800781, "router_z_loss_mlp": 1.0625, "step": 2673, "time_per_iteration": 2.4700403213500977 }, { "auxiliary_loss_clip": 0.01155947, "auxiliary_loss_mlp": 0.01049265, "balance_loss_clip": 1.03094268, "balance_loss_mlp": 1.0522871, "epoch": 0.16076957763415, "flos": 18913144976640.0, "grad_norm": 2.160428594531425, "language_loss": 0.7246393, "learning_rate": 3.823279846575403e-06, "loss": 0.74669135, "num_input_tokens_seen": 57901430, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.03125, "step": 2674, "time_per_iteration": 2.429490089416504 }, { "auxiliary_loss_clip": 0.01154407, "auxiliary_loss_mlp": 0.01040199, "balance_loss_clip": 1.01945686, "balance_loss_mlp": 1.05043101, "epoch": 0.16082970088681797, "flos": 16764358590720.0, "grad_norm": 1.566733382424915, "language_loss": 0.84090483, "learning_rate": 3.823119747211986e-06, "loss": 0.86285096, "num_input_tokens_seen": 57919550, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.0390625, "step": 2675, "time_per_iteration": 2.4539644718170166 }, { "auxiliary_loss_clip": 0.01157046, "auxiliary_loss_mlp": 0.01045135, "balance_loss_clip": 1.0250721, "balance_loss_mlp": 1.05307984, "epoch": 0.16088982413948594, "flos": 35150261783040.0, "grad_norm": 1.98691281328486, "language_loss": 0.82533234, "learning_rate": 3.822959578715685e-06, "loss": 0.84735417, "num_input_tokens_seen": 57939890, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0390625, "step": 2676, "time_per_iteration": 2.566404342651367 }, { "auxiliary_loss_clip": 0.01153992, "auxiliary_loss_mlp": 0.01046338, "balance_loss_clip": 1.02870727, "balance_loss_mlp": 1.05271184, "epoch": 0.1609499473921539, "flos": 18625105814400.0, "grad_norm": 2.313450494229165, "language_loss": 0.73683923, "learning_rate": 3.822799341092573e-06, "loss": 0.75884259, "num_input_tokens_seen": 57957410, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 1.015625, "step": 2677, "time_per_iteration": 2.4794669151306152 }, { "auxiliary_loss_clip": 0.01152478, "auxiliary_loss_mlp": 0.01039142, "balance_loss_clip": 1.02071214, "balance_loss_mlp": 1.05069542, "epoch": 0.1610100706448219, "flos": 33145728416640.0, "grad_norm": 1.9218486738137002, "language_loss": 0.76328766, "learning_rate": 3.822639034348728e-06, "loss": 0.78520381, "num_input_tokens_seen": 57977900, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.015625, "step": 2678, "time_per_iteration": 4.0306243896484375 }, { "auxiliary_loss_clip": 0.01155028, "auxiliary_loss_mlp": 0.01041236, "balance_loss_clip": 1.02080357, "balance_loss_mlp": 1.05023551, "epoch": 0.16107019389748986, "flos": 34676707852800.0, "grad_norm": 1.9995578424850255, "language_loss": 0.70692277, "learning_rate": 3.822478658490228e-06, "loss": 0.72888541, "num_input_tokens_seen": 57998210, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.046875, "step": 2679, "time_per_iteration": 3.9816246032714844 }, { "auxiliary_loss_clip": 0.01055342, "auxiliary_loss_mlp": 0.01015461, "balance_loss_clip": 1.0130415, "balance_loss_mlp": 1.02083886, "epoch": 0.16113031715015783, "flos": 65713403260800.0, "grad_norm": 0.795079428542787, "language_loss": 0.51794434, "learning_rate": 3.822318213523154e-06, "loss": 0.53865242, "num_input_tokens_seen": 58059420, "router_z_loss_clip": 0.02416992, "router_z_loss_mlp": 0.34570312, "step": 2680, "time_per_iteration": 3.126101493835449 }, { "auxiliary_loss_clip": 0.01153935, "auxiliary_loss_mlp": 0.01048074, "balance_loss_clip": 1.02719998, "balance_loss_mlp": 1.04853559, "epoch": 0.1611904404028258, "flos": 20810413353600.0, "grad_norm": 1.6158576945774519, "language_loss": 0.8021841, "learning_rate": 3.8221576994535925e-06, "loss": 0.82420421, "num_input_tokens_seen": 58078370, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.0546875, "step": 2681, "time_per_iteration": 5.366640090942383 }, { "auxiliary_loss_clip": 0.01155439, "auxiliary_loss_mlp": 0.01052151, "balance_loss_clip": 1.03330374, "balance_loss_mlp": 1.05383801, "epoch": 0.16125056365549376, "flos": 27013335062400.0, "grad_norm": 1.8286300714600034, "language_loss": 0.68959659, "learning_rate": 3.821997116287627e-06, "loss": 0.71167248, "num_input_tokens_seen": 58097395, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.015625, "step": 2682, "time_per_iteration": 2.521822214126587 }, { "auxiliary_loss_clip": 0.01158193, "auxiliary_loss_mlp": 0.01050659, "balance_loss_clip": 1.03047645, "balance_loss_mlp": 1.05417061, "epoch": 0.16131068690816172, "flos": 19276524915840.0, "grad_norm": 2.0844016423206404, "language_loss": 0.87408876, "learning_rate": 3.821836464031348e-06, "loss": 0.89617723, "num_input_tokens_seen": 58115630, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0390625, "step": 2683, "time_per_iteration": 2.4392104148864746 }, { "auxiliary_loss_clip": 0.01158065, "auxiliary_loss_mlp": 0.01055107, "balance_loss_clip": 1.03424489, "balance_loss_mlp": 1.05327308, "epoch": 0.16137081016082971, "flos": 35337931367040.0, "grad_norm": 1.7055622111715463, "language_loss": 0.74630696, "learning_rate": 3.821675742690849e-06, "loss": 0.7684387, "num_input_tokens_seen": 58138655, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.046875, "step": 2684, "time_per_iteration": 2.5998449325561523 }, { "auxiliary_loss_clip": 0.01161598, "auxiliary_loss_mlp": 0.0104819, "balance_loss_clip": 1.02805495, "balance_loss_mlp": 1.05565405, "epoch": 0.16143093341349768, "flos": 34235257703040.0, "grad_norm": 1.699174771560321, "language_loss": 0.70539606, "learning_rate": 3.821514952272223e-06, "loss": 0.72749394, "num_input_tokens_seen": 58157440, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0546875, "step": 2685, "time_per_iteration": 2.5582494735717773 }, { "auxiliary_loss_clip": 0.01152429, "auxiliary_loss_mlp": 0.0104911, "balance_loss_clip": 1.03066778, "balance_loss_mlp": 1.05139983, "epoch": 0.16149105666616564, "flos": 27999262546560.0, "grad_norm": 1.7869616128544763, "language_loss": 0.71885252, "learning_rate": 3.821354092781567e-06, "loss": 0.74086791, "num_input_tokens_seen": 58176660, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0078125, "step": 2686, "time_per_iteration": 2.5250539779663086 }, { "auxiliary_loss_clip": 0.01158328, "auxiliary_loss_mlp": 0.01055367, "balance_loss_clip": 1.03537595, "balance_loss_mlp": 1.05358028, "epoch": 0.1615511799188336, "flos": 19422214479360.0, "grad_norm": 2.473944021735258, "language_loss": 0.81855518, "learning_rate": 3.821193164224981e-06, "loss": 0.84069216, "num_input_tokens_seen": 58195085, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.046875, "step": 2687, "time_per_iteration": 2.4413039684295654 }, { "auxiliary_loss_clip": 0.01158474, "auxiliary_loss_mlp": 0.01052787, "balance_loss_clip": 1.03141272, "balance_loss_mlp": 1.04898667, "epoch": 0.16161130317150157, "flos": 22854915578880.0, "grad_norm": 4.473621416787062, "language_loss": 0.71464545, "learning_rate": 3.821032166608568e-06, "loss": 0.73675805, "num_input_tokens_seen": 58213540, "router_z_loss_clip": 0.21386719, "router_z_loss_mlp": 1.09375, "step": 2688, "time_per_iteration": 2.4766409397125244 }, { "auxiliary_loss_clip": 0.01156284, "auxiliary_loss_mlp": 0.01048924, "balance_loss_clip": 1.02968335, "balance_loss_mlp": 1.05165601, "epoch": 0.16167142642416954, "flos": 26110577520000.0, "grad_norm": 1.5626884115616866, "language_loss": 0.75776458, "learning_rate": 3.8208710999384325e-06, "loss": 0.77981663, "num_input_tokens_seen": 58236995, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.046875, "step": 2689, "time_per_iteration": 2.5467114448547363 }, { "auxiliary_loss_clip": 0.01157081, "auxiliary_loss_mlp": 0.01056477, "balance_loss_clip": 1.03645027, "balance_loss_mlp": 1.05433095, "epoch": 0.1617315496768375, "flos": 22779646629120.0, "grad_norm": 1.7190943744896372, "language_loss": 0.8794651, "learning_rate": 3.820709964220683e-06, "loss": 0.90160072, "num_input_tokens_seen": 58257230, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.03125, "step": 2690, "time_per_iteration": 2.5002942085266113 }, { "auxiliary_loss_clip": 0.01154576, "auxiliary_loss_mlp": 0.01050203, "balance_loss_clip": 1.03244042, "balance_loss_mlp": 1.05283785, "epoch": 0.1617916729295055, "flos": 22017299351040.0, "grad_norm": 1.519731091283635, "language_loss": 0.87975013, "learning_rate": 3.8205487594614284e-06, "loss": 0.90179795, "num_input_tokens_seen": 58277080, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.015625, "step": 2691, "time_per_iteration": 2.4699316024780273 }, { "auxiliary_loss_clip": 0.01160567, "auxiliary_loss_mlp": 0.01056446, "balance_loss_clip": 1.03370094, "balance_loss_mlp": 1.05185497, "epoch": 0.16185179618217346, "flos": 23438248450560.0, "grad_norm": 2.105969180844147, "language_loss": 0.82289934, "learning_rate": 3.820387485666784e-06, "loss": 0.84506947, "num_input_tokens_seen": 58294815, "router_z_loss_clip": 0.22753906, "router_z_loss_mlp": 1.0859375, "step": 2692, "time_per_iteration": 2.488847494125366 }, { "auxiliary_loss_clip": 0.01162435, "auxiliary_loss_mlp": 0.01057473, "balance_loss_clip": 1.03644419, "balance_loss_mlp": 1.05273426, "epoch": 0.16191191943484143, "flos": 25666110627840.0, "grad_norm": 3.7145633907261777, "language_loss": 0.81401145, "learning_rate": 3.820226142842862e-06, "loss": 0.83621049, "num_input_tokens_seen": 58313215, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.09375, "step": 2693, "time_per_iteration": 2.479830026626587 }, { "auxiliary_loss_clip": 0.01153937, "auxiliary_loss_mlp": 0.01053634, "balance_loss_clip": 1.03574109, "balance_loss_mlp": 1.05289507, "epoch": 0.1619720426875094, "flos": 23477355383040.0, "grad_norm": 1.5622046174878028, "language_loss": 0.83864689, "learning_rate": 3.820064730995783e-06, "loss": 0.8607226, "num_input_tokens_seen": 58333215, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 1.0078125, "step": 2694, "time_per_iteration": 2.4953553676605225 }, { "auxiliary_loss_clip": 0.01161051, "auxiliary_loss_mlp": 0.01057455, "balance_loss_clip": 1.03678393, "balance_loss_mlp": 1.05263782, "epoch": 0.16203216594017736, "flos": 24133658734080.0, "grad_norm": 1.9013550894863724, "language_loss": 0.69556153, "learning_rate": 3.819903250131667e-06, "loss": 0.71774656, "num_input_tokens_seen": 58351160, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.0859375, "step": 2695, "time_per_iteration": 2.470301389694214 }, { "auxiliary_loss_clip": 0.01163419, "auxiliary_loss_mlp": 0.01050147, "balance_loss_clip": 1.02970278, "balance_loss_mlp": 1.0557735, "epoch": 0.16209228919284532, "flos": 22340889999360.0, "grad_norm": 1.9913333343102682, "language_loss": 0.82444239, "learning_rate": 3.819741700256637e-06, "loss": 0.84657812, "num_input_tokens_seen": 58368505, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.078125, "step": 2696, "time_per_iteration": 2.462599277496338 }, { "auxiliary_loss_clip": 0.01164508, "auxiliary_loss_mlp": 0.01061601, "balance_loss_clip": 1.03940415, "balance_loss_mlp": 1.05339527, "epoch": 0.1621524124455133, "flos": 15815131827840.0, "grad_norm": 2.1361517494345645, "language_loss": 0.88102096, "learning_rate": 3.8195800813768194e-06, "loss": 0.90328205, "num_input_tokens_seen": 58385085, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.109375, "step": 2697, "time_per_iteration": 2.425832509994507 }, { "auxiliary_loss_clip": 0.01148554, "auxiliary_loss_mlp": 0.01045595, "balance_loss_clip": 1.02707016, "balance_loss_mlp": 1.0485121, "epoch": 0.16221253569818128, "flos": 30186688988160.0, "grad_norm": 1.409154151127896, "language_loss": 0.81143045, "learning_rate": 3.819418393498343e-06, "loss": 0.833372, "num_input_tokens_seen": 58406985, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0, "step": 2698, "time_per_iteration": 2.540053129196167 }, { "auxiliary_loss_clip": 0.01152906, "auxiliary_loss_mlp": 0.01047674, "balance_loss_clip": 1.02793264, "balance_loss_mlp": 1.05099988, "epoch": 0.16227265895084925, "flos": 24605991601920.0, "grad_norm": 1.7483173131565397, "language_loss": 0.77628362, "learning_rate": 3.819256636627339e-06, "loss": 0.79828936, "num_input_tokens_seen": 58426205, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.015625, "step": 2699, "time_per_iteration": 2.4760963916778564 }, { "auxiliary_loss_clip": 0.0115166, "auxiliary_loss_mlp": 0.01042072, "balance_loss_clip": 1.02380908, "balance_loss_mlp": 1.0479933, "epoch": 0.1623327822035172, "flos": 19573326996480.0, "grad_norm": 2.000572958407467, "language_loss": 0.86472714, "learning_rate": 3.81909481076994e-06, "loss": 0.88666451, "num_input_tokens_seen": 58443830, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.0390625, "step": 2700, "time_per_iteration": 2.458867311477661 }, { "auxiliary_loss_clip": 0.01150427, "auxiliary_loss_mlp": 0.01047754, "balance_loss_clip": 1.02676105, "balance_loss_mlp": 1.0484221, "epoch": 0.16239290545618518, "flos": 26468462678400.0, "grad_norm": 1.942824783615158, "language_loss": 0.80522424, "learning_rate": 3.818932915932284e-06, "loss": 0.82720602, "num_input_tokens_seen": 58464405, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.015625, "step": 2701, "time_per_iteration": 2.4960505962371826 }, { "auxiliary_loss_clip": 0.01155532, "auxiliary_loss_mlp": 0.01047044, "balance_loss_clip": 1.0275054, "balance_loss_mlp": 1.05203605, "epoch": 0.16245302870885314, "flos": 15851940289920.0, "grad_norm": 1.606606037667999, "language_loss": 0.73140097, "learning_rate": 3.818770952120511e-06, "loss": 0.75342679, "num_input_tokens_seen": 58483295, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.03125, "step": 2702, "time_per_iteration": 2.449903726577759 }, { "auxiliary_loss_clip": 0.0115645, "auxiliary_loss_mlp": 0.01047696, "balance_loss_clip": 1.0261308, "balance_loss_mlp": 1.05106223, "epoch": 0.1625131519615211, "flos": 14756521173120.0, "grad_norm": 1.8334911651612595, "language_loss": 0.73070562, "learning_rate": 3.81860891934076e-06, "loss": 0.75274706, "num_input_tokens_seen": 58501205, "router_z_loss_clip": 0.21582031, "router_z_loss_mlp": 1.0546875, "step": 2703, "time_per_iteration": 2.42636775970459 }, { "auxiliary_loss_clip": 0.01153807, "auxiliary_loss_mlp": 0.01047207, "balance_loss_clip": 1.02627397, "balance_loss_mlp": 1.04816937, "epoch": 0.1625732752141891, "flos": 28220508368640.0, "grad_norm": 1.9160137384470213, "language_loss": 0.70733523, "learning_rate": 3.818446817599176e-06, "loss": 0.72934538, "num_input_tokens_seen": 58522315, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.0625, "step": 2704, "time_per_iteration": 2.527200937271118 }, { "auxiliary_loss_clip": 0.01054464, "auxiliary_loss_mlp": 0.01006229, "balance_loss_clip": 1.00359464, "balance_loss_mlp": 1.01852703, "epoch": 0.16263339846685707, "flos": 67327947688320.0, "grad_norm": 0.7806490820368859, "language_loss": 0.533723, "learning_rate": 3.818284646901907e-06, "loss": 0.55432993, "num_input_tokens_seen": 58586695, "router_z_loss_clip": 0.02636719, "router_z_loss_mlp": 0.359375, "step": 2705, "time_per_iteration": 3.0812971591949463 }, { "auxiliary_loss_clip": 0.01158656, "auxiliary_loss_mlp": 0.01049154, "balance_loss_clip": 1.02931774, "balance_loss_mlp": 1.05164313, "epoch": 0.16269352171952503, "flos": 14319165173760.0, "grad_norm": 2.84441296432746, "language_loss": 0.75810403, "learning_rate": 3.818122407255102e-06, "loss": 0.78018212, "num_input_tokens_seen": 58602435, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0703125, "step": 2706, "time_per_iteration": 2.4347732067108154 }, { "auxiliary_loss_clip": 0.01157413, "auxiliary_loss_mlp": 0.0104492, "balance_loss_clip": 1.02566755, "balance_loss_mlp": 1.05262184, "epoch": 0.162753644972193, "flos": 28361205941760.0, "grad_norm": 1.7454723061036967, "language_loss": 0.72623187, "learning_rate": 3.817960098664914e-06, "loss": 0.74825519, "num_input_tokens_seen": 58621275, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.046875, "step": 2707, "time_per_iteration": 2.5085179805755615 }, { "auxiliary_loss_clip": 0.01157125, "auxiliary_loss_mlp": 0.0104511, "balance_loss_clip": 1.02647758, "balance_loss_mlp": 1.05321097, "epoch": 0.16281376822486096, "flos": 19937856170880.0, "grad_norm": 36.196409111176706, "language_loss": 0.83501512, "learning_rate": 3.817797721137495e-06, "loss": 0.85703743, "num_input_tokens_seen": 58637550, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.0390625, "step": 2708, "time_per_iteration": 2.4554994106292725 }, { "auxiliary_loss_clip": 0.01162559, "auxiliary_loss_mlp": 0.01042924, "balance_loss_clip": 1.02071536, "balance_loss_mlp": 1.05256379, "epoch": 0.16287389147752893, "flos": 21251719848960.0, "grad_norm": 2.6430265550343246, "language_loss": 0.85878801, "learning_rate": 3.817635274679006e-06, "loss": 0.8808428, "num_input_tokens_seen": 58654135, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.09375, "step": 2709, "time_per_iteration": 2.4779605865478516 }, { "auxiliary_loss_clip": 0.01154601, "auxiliary_loss_mlp": 0.01047282, "balance_loss_clip": 1.02714753, "balance_loss_mlp": 1.04919374, "epoch": 0.1629340147301969, "flos": 19244672530560.0, "grad_norm": 1.6906905800551137, "language_loss": 0.91405261, "learning_rate": 3.817472759295605e-06, "loss": 0.93607146, "num_input_tokens_seen": 58674320, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0546875, "step": 2710, "time_per_iteration": 2.5141451358795166 }, { "auxiliary_loss_clip": 0.01156308, "auxiliary_loss_mlp": 0.01055694, "balance_loss_clip": 1.03625107, "balance_loss_mlp": 1.05338585, "epoch": 0.16299413798286488, "flos": 21249816428160.0, "grad_norm": 2.2890200268029033, "language_loss": 0.81373751, "learning_rate": 3.817310174993453e-06, "loss": 0.83585751, "num_input_tokens_seen": 58691000, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.03125, "step": 2711, "time_per_iteration": 2.433678150177002 }, { "auxiliary_loss_clip": 0.01158624, "auxiliary_loss_mlp": 0.01042058, "balance_loss_clip": 1.02274656, "balance_loss_mlp": 1.04936564, "epoch": 0.16305426123553285, "flos": 18770579896320.0, "grad_norm": 2.4402136041517837, "language_loss": 0.8084175, "learning_rate": 3.817147521778719e-06, "loss": 0.83042431, "num_input_tokens_seen": 58710230, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.09375, "step": 2712, "time_per_iteration": 2.4608991146087646 }, { "auxiliary_loss_clip": 0.01162201, "auxiliary_loss_mlp": 0.01055452, "balance_loss_clip": 1.03575873, "balance_loss_mlp": 1.05306888, "epoch": 0.16311438448820081, "flos": 22087648137600.0, "grad_norm": 1.8350013110907772, "language_loss": 0.76926816, "learning_rate": 3.816984799657568e-06, "loss": 0.7914446, "num_input_tokens_seen": 58728610, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.09375, "step": 2713, "time_per_iteration": 2.4608943462371826 }, { "auxiliary_loss_clip": 0.01155938, "auxiliary_loss_mlp": 0.01057776, "balance_loss_clip": 1.03797555, "balance_loss_mlp": 1.05480075, "epoch": 0.16317450774086878, "flos": 16467700164480.0, "grad_norm": 2.012337589621599, "language_loss": 0.79523408, "learning_rate": 3.8168220086361715e-06, "loss": 0.81737113, "num_input_tokens_seen": 58744385, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.015625, "step": 2714, "time_per_iteration": 2.4385926723480225 }, { "auxiliary_loss_clip": 0.01158883, "auxiliary_loss_mlp": 0.01054637, "balance_loss_clip": 1.0354321, "balance_loss_mlp": 1.05395174, "epoch": 0.16323463099353674, "flos": 24352929308160.0, "grad_norm": 1.7637905750522531, "language_loss": 0.7820912, "learning_rate": 3.816659148720702e-06, "loss": 0.8042264, "num_input_tokens_seen": 58763905, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0546875, "step": 2715, "time_per_iteration": 2.4894649982452393 }, { "auxiliary_loss_clip": 0.01154975, "auxiliary_loss_mlp": 0.01045856, "balance_loss_clip": 1.02717578, "balance_loss_mlp": 1.0515871, "epoch": 0.1632947542462047, "flos": 24900782520960.0, "grad_norm": 2.0888028639877176, "language_loss": 0.81960839, "learning_rate": 3.816496219917336e-06, "loss": 0.84161669, "num_input_tokens_seen": 58785580, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.03125, "step": 2716, "time_per_iteration": 2.506131649017334 }, { "auxiliary_loss_clip": 0.01160953, "auxiliary_loss_mlp": 0.01054968, "balance_loss_clip": 1.03616846, "balance_loss_mlp": 1.0562886, "epoch": 0.1633548774988727, "flos": 24900279730560.0, "grad_norm": 2.367777584598157, "language_loss": 0.86534166, "learning_rate": 3.816333222232251e-06, "loss": 0.88750088, "num_input_tokens_seen": 58806075, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.046875, "step": 2717, "time_per_iteration": 2.491067409515381 }, { "auxiliary_loss_clip": 0.01155684, "auxiliary_loss_mlp": 0.01046646, "balance_loss_clip": 1.02784646, "balance_loss_mlp": 1.05311489, "epoch": 0.16341500075154067, "flos": 30441798357120.0, "grad_norm": 1.804090591493782, "language_loss": 0.76286352, "learning_rate": 3.816170155671629e-06, "loss": 0.78488684, "num_input_tokens_seen": 58827405, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.0234375, "step": 2718, "time_per_iteration": 2.5427322387695312 }, { "auxiliary_loss_clip": 0.01160804, "auxiliary_loss_mlp": 0.01043792, "balance_loss_clip": 1.02550507, "balance_loss_mlp": 1.05333543, "epoch": 0.16347512400420863, "flos": 22784530878720.0, "grad_norm": 1.9171258721595914, "language_loss": 0.7366575, "learning_rate": 3.816007020241652e-06, "loss": 0.75870347, "num_input_tokens_seen": 58847205, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.078125, "step": 2719, "time_per_iteration": 2.460622549057007 }, { "auxiliary_loss_clip": 0.01156633, "auxiliary_loss_mlp": 0.01046364, "balance_loss_clip": 1.02712321, "balance_loss_mlp": 1.05174136, "epoch": 0.1635352472568766, "flos": 22633274707200.0, "grad_norm": 2.0260362299254253, "language_loss": 0.72255617, "learning_rate": 3.815843815948507e-06, "loss": 0.74458617, "num_input_tokens_seen": 58866865, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.046875, "step": 2720, "time_per_iteration": 3.9384422302246094 }, { "auxiliary_loss_clip": 0.01156135, "auxiliary_loss_mlp": 0.01048475, "balance_loss_clip": 1.0276376, "balance_loss_mlp": 1.05386961, "epoch": 0.16359537050954456, "flos": 15522998515200.0, "grad_norm": 2.168076280498307, "language_loss": 0.74978316, "learning_rate": 3.8156805427983824e-06, "loss": 0.77182925, "num_input_tokens_seen": 58885200, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.0234375, "step": 2721, "time_per_iteration": 3.864372730255127 }, { "auxiliary_loss_clip": 0.01160734, "auxiliary_loss_mlp": 0.01048646, "balance_loss_clip": 1.02876186, "balance_loss_mlp": 1.05293584, "epoch": 0.16365549376221253, "flos": 22090162089600.0, "grad_norm": 1.8734974182188047, "language_loss": 0.789415, "learning_rate": 3.8155172007974695e-06, "loss": 0.81150883, "num_input_tokens_seen": 58906385, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.078125, "step": 2722, "time_per_iteration": 3.886448860168457 }, { "auxiliary_loss_clip": 0.01163166, "auxiliary_loss_mlp": 0.01053385, "balance_loss_clip": 1.03112853, "balance_loss_mlp": 1.05397081, "epoch": 0.1637156170148805, "flos": 24060400945920.0, "grad_norm": 2.130000408106711, "language_loss": 0.85099018, "learning_rate": 3.8153537899519624e-06, "loss": 0.87315571, "num_input_tokens_seen": 58925040, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.09375, "step": 2723, "time_per_iteration": 3.8802053928375244 }, { "auxiliary_loss_clip": 0.01152926, "auxiliary_loss_mlp": 0.01039869, "balance_loss_clip": 1.02048576, "balance_loss_mlp": 1.05224085, "epoch": 0.1637757402675485, "flos": 26685362954880.0, "grad_norm": 2.0641420568988686, "language_loss": 0.7106331, "learning_rate": 3.815190310268058e-06, "loss": 0.73256105, "num_input_tokens_seen": 58944790, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0078125, "step": 2724, "time_per_iteration": 2.5347092151641846 }, { "auxiliary_loss_clip": 0.01155107, "auxiliary_loss_mlp": 0.01044961, "balance_loss_clip": 1.02715123, "balance_loss_mlp": 1.05385375, "epoch": 0.16383586352021645, "flos": 16106941918080.0, "grad_norm": 1.9514354149160258, "language_loss": 0.70923114, "learning_rate": 3.815026761751955e-06, "loss": 0.73123181, "num_input_tokens_seen": 58962500, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.015625, "step": 2725, "time_per_iteration": 2.4396212100982666 }, { "auxiliary_loss_clip": 0.01153333, "auxiliary_loss_mlp": 0.01043524, "balance_loss_clip": 1.02485609, "balance_loss_mlp": 1.05370831, "epoch": 0.16389598677288442, "flos": 19165991788800.0, "grad_norm": 2.1385783391057704, "language_loss": 0.88709128, "learning_rate": 3.814863144409855e-06, "loss": 0.90905988, "num_input_tokens_seen": 58980355, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.0, "step": 2726, "time_per_iteration": 2.4808883666992188 }, { "auxiliary_loss_clip": 0.01159292, "auxiliary_loss_mlp": 0.01044202, "balance_loss_clip": 1.02446055, "balance_loss_mlp": 1.05533326, "epoch": 0.16395611002555238, "flos": 21507008785920.0, "grad_norm": 1.8434914674456448, "language_loss": 0.74325782, "learning_rate": 3.814699458247963e-06, "loss": 0.76529276, "num_input_tokens_seen": 58999505, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0390625, "step": 2727, "time_per_iteration": 2.4640188217163086 }, { "auxiliary_loss_clip": 0.01155141, "auxiliary_loss_mlp": 0.01047352, "balance_loss_clip": 1.02990007, "balance_loss_mlp": 1.05453134, "epoch": 0.16401623327822035, "flos": 21470918595840.0, "grad_norm": 1.5747937062658008, "language_loss": 0.82592726, "learning_rate": 3.8145357032724855e-06, "loss": 0.84795225, "num_input_tokens_seen": 59017930, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 1.0078125, "step": 2728, "time_per_iteration": 2.4882309436798096 }, { "auxiliary_loss_clip": 0.01160089, "auxiliary_loss_mlp": 0.01047091, "balance_loss_clip": 1.02731371, "balance_loss_mlp": 1.05428159, "epoch": 0.1640763565308883, "flos": 13626232928640.0, "grad_norm": 1.9875653376045916, "language_loss": 0.85025746, "learning_rate": 3.814371879489633e-06, "loss": 0.87232924, "num_input_tokens_seen": 59035130, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0625, "step": 2729, "time_per_iteration": 2.438154697418213 }, { "auxiliary_loss_clip": 0.01157895, "auxiliary_loss_mlp": 0.01043398, "balance_loss_clip": 1.02521896, "balance_loss_mlp": 1.05384851, "epoch": 0.16413647978355628, "flos": 15451464579840.0, "grad_norm": 1.8370198898464438, "language_loss": 0.72309041, "learning_rate": 3.814207986905616e-06, "loss": 0.74510336, "num_input_tokens_seen": 59053080, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.046875, "step": 2730, "time_per_iteration": 2.4430551528930664 }, { "auxiliary_loss_clip": 0.011593, "auxiliary_loss_mlp": 0.01049641, "balance_loss_clip": 1.02880311, "balance_loss_mlp": 1.05217016, "epoch": 0.16419660303622427, "flos": 45878682015360.0, "grad_norm": 1.8302780626774913, "language_loss": 0.74482393, "learning_rate": 3.814044025526651e-06, "loss": 0.76691341, "num_input_tokens_seen": 59075610, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.0703125, "step": 2731, "time_per_iteration": 2.6733994483947754 }, { "auxiliary_loss_clip": 0.01161243, "auxiliary_loss_mlp": 0.01045199, "balance_loss_clip": 1.02484953, "balance_loss_mlp": 1.05638993, "epoch": 0.16425672628889224, "flos": 18952826526720.0, "grad_norm": 2.2968058932516677, "language_loss": 0.78541481, "learning_rate": 3.8138799953589548e-06, "loss": 0.80747926, "num_input_tokens_seen": 59094555, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.046875, "step": 2732, "time_per_iteration": 2.4809200763702393 }, { "auxiliary_loss_clip": 0.01158894, "auxiliary_loss_mlp": 0.01048746, "balance_loss_clip": 1.02917194, "balance_loss_mlp": 1.05400634, "epoch": 0.1643168495415602, "flos": 24312996362880.0, "grad_norm": 1.9944010697348118, "language_loss": 0.69567549, "learning_rate": 3.8137158964087473e-06, "loss": 0.71775192, "num_input_tokens_seen": 59113515, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.046875, "step": 2733, "time_per_iteration": 2.4828670024871826 }, { "auxiliary_loss_clip": 0.01158147, "auxiliary_loss_mlp": 0.01047186, "balance_loss_clip": 1.02609801, "balance_loss_mlp": 1.05409551, "epoch": 0.16437697279422817, "flos": 26428421992320.0, "grad_norm": 2.6937949877558096, "language_loss": 0.80708724, "learning_rate": 3.8135517286822508e-06, "loss": 0.8291406, "num_input_tokens_seen": 59133275, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.0390625, "step": 2734, "time_per_iteration": 2.5361030101776123 }, { "auxiliary_loss_clip": 0.01156106, "auxiliary_loss_mlp": 0.0104796, "balance_loss_clip": 1.02875543, "balance_loss_mlp": 1.05358481, "epoch": 0.16443709604689613, "flos": 34532239351680.0, "grad_norm": 2.099936614204124, "language_loss": 0.8198626, "learning_rate": 3.8133874921856914e-06, "loss": 0.84190321, "num_input_tokens_seen": 59154095, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.03125, "step": 2735, "time_per_iteration": 2.5677990913391113 }, { "auxiliary_loss_clip": 0.01156071, "auxiliary_loss_mlp": 0.01043883, "balance_loss_clip": 1.02587032, "balance_loss_mlp": 1.05485511, "epoch": 0.1644972192995641, "flos": 23258048895360.0, "grad_norm": 2.5883167503045703, "language_loss": 0.78775311, "learning_rate": 3.813223186925296e-06, "loss": 0.80975258, "num_input_tokens_seen": 59173795, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.015625, "step": 2736, "time_per_iteration": 2.503192186355591 }, { "auxiliary_loss_clip": 0.01160556, "auxiliary_loss_mlp": 0.01050868, "balance_loss_clip": 1.0324738, "balance_loss_mlp": 1.05640936, "epoch": 0.1645573425522321, "flos": 26979543342720.0, "grad_norm": 1.8621884004402862, "language_loss": 0.81758642, "learning_rate": 3.8130588129072964e-06, "loss": 0.83970064, "num_input_tokens_seen": 59191610, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0390625, "step": 2737, "time_per_iteration": 2.491391658782959 }, { "auxiliary_loss_clip": 0.01161039, "auxiliary_loss_mlp": 0.01048013, "balance_loss_clip": 1.02979767, "balance_loss_mlp": 1.05541456, "epoch": 0.16461746580490005, "flos": 28731768600960.0, "grad_norm": 1.6703106920638708, "language_loss": 0.87298769, "learning_rate": 3.8128943701379246e-06, "loss": 0.89507818, "num_input_tokens_seen": 59213000, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0546875, "step": 2738, "time_per_iteration": 2.5476179122924805 }, { "auxiliary_loss_clip": 0.01159815, "auxiliary_loss_mlp": 0.01057429, "balance_loss_clip": 1.03841472, "balance_loss_mlp": 1.05457878, "epoch": 0.16467758905756802, "flos": 24930156867840.0, "grad_norm": 2.110416605934226, "language_loss": 0.71982521, "learning_rate": 3.8127298586234167e-06, "loss": 0.74199766, "num_input_tokens_seen": 59232340, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0546875, "step": 2739, "time_per_iteration": 2.493223190307617 }, { "auxiliary_loss_clip": 0.01157559, "auxiliary_loss_mlp": 0.0104669, "balance_loss_clip": 1.02727079, "balance_loss_mlp": 1.05469441, "epoch": 0.16473771231023598, "flos": 24826519152000.0, "grad_norm": 1.6425757444818576, "language_loss": 0.81603813, "learning_rate": 3.8125652783700104e-06, "loss": 0.83808064, "num_input_tokens_seen": 59253950, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.03125, "step": 2740, "time_per_iteration": 2.56040358543396 }, { "auxiliary_loss_clip": 0.01166087, "auxiliary_loss_mlp": 0.01061805, "balance_loss_clip": 1.03957224, "balance_loss_mlp": 1.05876112, "epoch": 0.16479783556290395, "flos": 39896072375040.0, "grad_norm": 2.216148890226781, "language_loss": 0.69313169, "learning_rate": 3.8124006293839475e-06, "loss": 0.71541059, "num_input_tokens_seen": 59275545, "router_z_loss_clip": 0.22167969, "router_z_loss_mlp": 1.078125, "step": 2741, "time_per_iteration": 2.63344407081604 }, { "auxiliary_loss_clip": 0.0116139, "auxiliary_loss_mlp": 0.0104643, "balance_loss_clip": 1.02801228, "balance_loss_mlp": 1.05719852, "epoch": 0.16485795881557191, "flos": 19897061299200.0, "grad_norm": 5.610041849774293, "language_loss": 0.79783475, "learning_rate": 3.812235911671472e-06, "loss": 0.81991297, "num_input_tokens_seen": 59293480, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.046875, "step": 2742, "time_per_iteration": 2.4761011600494385 }, { "auxiliary_loss_clip": 0.01159506, "auxiliary_loss_mlp": 0.01054575, "balance_loss_clip": 1.03449988, "balance_loss_mlp": 1.05657041, "epoch": 0.16491808206823988, "flos": 20556129997440.0, "grad_norm": 1.8374670820960513, "language_loss": 0.84841073, "learning_rate": 3.8120711252388274e-06, "loss": 0.87055153, "num_input_tokens_seen": 59313435, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.03125, "step": 2743, "time_per_iteration": 2.4806888103485107 }, { "auxiliary_loss_clip": 0.01155563, "auxiliary_loss_mlp": 0.01052368, "balance_loss_clip": 1.03302026, "balance_loss_mlp": 1.05456197, "epoch": 0.16497820532090787, "flos": 23800802376960.0, "grad_norm": 1.561855029115077, "language_loss": 0.85877895, "learning_rate": 3.811906270092265e-06, "loss": 0.88085824, "num_input_tokens_seen": 59331535, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0078125, "step": 2744, "time_per_iteration": 2.5031282901763916 }, { "auxiliary_loss_clip": 0.01155379, "auxiliary_loss_mlp": 0.01049857, "balance_loss_clip": 1.03217828, "balance_loss_mlp": 1.05609751, "epoch": 0.16503832857357584, "flos": 25482642935040.0, "grad_norm": 1.729391619351045, "language_loss": 0.8289541, "learning_rate": 3.811741346238036e-06, "loss": 0.85100645, "num_input_tokens_seen": 59350680, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9921875, "step": 2745, "time_per_iteration": 2.5141282081604004 }, { "auxiliary_loss_clip": 0.01163667, "auxiliary_loss_mlp": 0.01053056, "balance_loss_clip": 1.03403044, "balance_loss_mlp": 1.05740857, "epoch": 0.1650984518262438, "flos": 17676058619520.0, "grad_norm": 1.8098711366020284, "language_loss": 0.76866758, "learning_rate": 3.8115763536823923e-06, "loss": 0.79083484, "num_input_tokens_seen": 59367020, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0625, "step": 2746, "time_per_iteration": 2.470499038696289 }, { "auxiliary_loss_clip": 0.01159612, "auxiliary_loss_mlp": 0.01052064, "balance_loss_clip": 1.03296685, "balance_loss_mlp": 1.05617094, "epoch": 0.16515857507891177, "flos": 18698327688960.0, "grad_norm": 1.6239296377657877, "language_loss": 0.80711412, "learning_rate": 3.811411292431592e-06, "loss": 0.8292309, "num_input_tokens_seen": 59386075, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.03125, "step": 2747, "time_per_iteration": 2.4649837017059326 }, { "auxiliary_loss_clip": 0.01164146, "auxiliary_loss_mlp": 0.01046026, "balance_loss_clip": 1.02671409, "balance_loss_mlp": 1.0596416, "epoch": 0.16521869833157973, "flos": 15010481306880.0, "grad_norm": 2.13280151411277, "language_loss": 0.6954028, "learning_rate": 3.8112461624918945e-06, "loss": 0.7175045, "num_input_tokens_seen": 59402690, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.046875, "step": 2748, "time_per_iteration": 2.4619109630584717 }, { "auxiliary_loss_clip": 0.01164966, "auxiliary_loss_mlp": 0.01053989, "balance_loss_clip": 1.03477287, "balance_loss_mlp": 1.06052577, "epoch": 0.1652788215842477, "flos": 22121152548480.0, "grad_norm": 2.2489972801467846, "language_loss": 0.87861896, "learning_rate": 3.811080963869561e-06, "loss": 0.90080845, "num_input_tokens_seen": 59421130, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.046875, "step": 2749, "time_per_iteration": 2.4972727298736572 }, { "auxiliary_loss_clip": 0.01159947, "auxiliary_loss_mlp": 0.01043339, "balance_loss_clip": 1.02374125, "balance_loss_mlp": 1.05442917, "epoch": 0.16533894483691566, "flos": 18333080242560.0, "grad_norm": 1.8457960630237167, "language_loss": 0.7906698, "learning_rate": 3.8109156965708557e-06, "loss": 0.81270272, "num_input_tokens_seen": 59438970, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0546875, "step": 2750, "time_per_iteration": 2.4602112770080566 }, { "auxiliary_loss_clip": 0.01162107, "auxiliary_loss_mlp": 0.01044425, "balance_loss_clip": 1.02482748, "balance_loss_mlp": 1.05884266, "epoch": 0.16539906808958366, "flos": 22382115834240.0, "grad_norm": 1.6873414201045844, "language_loss": 0.95076621, "learning_rate": 3.8107503606020455e-06, "loss": 0.97283149, "num_input_tokens_seen": 59458510, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.03125, "step": 2751, "time_per_iteration": 2.4884214401245117 }, { "auxiliary_loss_clip": 0.01160882, "auxiliary_loss_mlp": 0.01048912, "balance_loss_clip": 1.02977943, "balance_loss_mlp": 1.05982542, "epoch": 0.16545919134225162, "flos": 22711093522560.0, "grad_norm": 1.9791631550921125, "language_loss": 0.70977432, "learning_rate": 3.8105849559693997e-06, "loss": 0.73187226, "num_input_tokens_seen": 59477110, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0078125, "step": 2752, "time_per_iteration": 2.503852128982544 }, { "auxiliary_loss_clip": 0.01086116, "auxiliary_loss_mlp": 0.01010404, "balance_loss_clip": 1.00682783, "balance_loss_mlp": 1.04680216, "epoch": 0.1655193145949196, "flos": 67802974076160.0, "grad_norm": 0.7593736170968695, "language_loss": 0.54137373, "learning_rate": 3.810419482679192e-06, "loss": 0.56233895, "num_input_tokens_seen": 59541155, "router_z_loss_clip": 0.03564453, "router_z_loss_mlp": 0.39453125, "step": 2753, "time_per_iteration": 3.1737191677093506 }, { "auxiliary_loss_clip": 0.01159887, "auxiliary_loss_mlp": 0.01040076, "balance_loss_clip": 1.02022731, "balance_loss_mlp": 1.05580544, "epoch": 0.16557943784758755, "flos": 24280389792000.0, "grad_norm": 1.6151413750471149, "language_loss": 0.75176978, "learning_rate": 3.8102539407376954e-06, "loss": 0.77376944, "num_input_tokens_seen": 59561155, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0390625, "step": 2754, "time_per_iteration": 2.513165235519409 }, { "auxiliary_loss_clip": 0.01170658, "auxiliary_loss_mlp": 0.01065468, "balance_loss_clip": 1.04343843, "balance_loss_mlp": 1.05983055, "epoch": 0.16563956110025552, "flos": 20083617561600.0, "grad_norm": 2.6219684698976984, "language_loss": 0.86519825, "learning_rate": 3.810088330151188e-06, "loss": 0.88755953, "num_input_tokens_seen": 59580460, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.109375, "step": 2755, "time_per_iteration": 2.4826786518096924 }, { "auxiliary_loss_clip": 0.01157775, "auxiliary_loss_mlp": 0.01052403, "balance_loss_clip": 1.0325551, "balance_loss_mlp": 1.05467176, "epoch": 0.16569968435292348, "flos": 28034454896640.0, "grad_norm": 1.7480367749718464, "language_loss": 0.73355192, "learning_rate": 3.80992265092595e-06, "loss": 0.75565368, "num_input_tokens_seen": 59600025, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.03125, "step": 2756, "time_per_iteration": 2.533125877380371 }, { "auxiliary_loss_clip": 0.01156532, "auxiliary_loss_mlp": 0.01046955, "balance_loss_clip": 1.02760768, "balance_loss_mlp": 1.05777133, "epoch": 0.16575980760559147, "flos": 26250233598720.0, "grad_norm": 1.5985752244721352, "language_loss": 0.75110406, "learning_rate": 3.8097569030682636e-06, "loss": 0.77313888, "num_input_tokens_seen": 59620600, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.98828125, "step": 2757, "time_per_iteration": 2.5311484336853027 }, { "auxiliary_loss_clip": 0.01160216, "auxiliary_loss_mlp": 0.01045995, "balance_loss_clip": 1.02671862, "balance_loss_mlp": 1.05676901, "epoch": 0.16581993085825944, "flos": 26943955943040.0, "grad_norm": 1.9188964694835902, "language_loss": 0.84796751, "learning_rate": 3.8095910865844137e-06, "loss": 0.87002969, "num_input_tokens_seen": 59641385, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.03125, "step": 2758, "time_per_iteration": 2.536609649658203 }, { "auxiliary_loss_clip": 0.01162401, "auxiliary_loss_mlp": 0.01051242, "balance_loss_clip": 1.03343236, "balance_loss_mlp": 1.05941355, "epoch": 0.1658800541109274, "flos": 21653632103040.0, "grad_norm": 2.0812461928196977, "language_loss": 0.79424351, "learning_rate": 3.809425201480689e-06, "loss": 0.8163799, "num_input_tokens_seen": 59659865, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.03125, "step": 2759, "time_per_iteration": 2.491689682006836 }, { "auxiliary_loss_clip": 0.01158549, "auxiliary_loss_mlp": 0.01047061, "balance_loss_clip": 1.0279752, "balance_loss_mlp": 1.05533862, "epoch": 0.16594017736359537, "flos": 16435488643200.0, "grad_norm": 1.9192616562556892, "language_loss": 0.74920738, "learning_rate": 3.8092592477633793e-06, "loss": 0.77126348, "num_input_tokens_seen": 59678780, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.03125, "step": 2760, "time_per_iteration": 2.463043212890625 }, { "auxiliary_loss_clip": 0.01163608, "auxiliary_loss_mlp": 0.01039355, "balance_loss_clip": 1.01979303, "balance_loss_mlp": 1.05709124, "epoch": 0.16600030061626334, "flos": 22637297030400.0, "grad_norm": 3.924679487729901, "language_loss": 0.73295367, "learning_rate": 3.8090932254387774e-06, "loss": 0.75498331, "num_input_tokens_seen": 59698795, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0625, "step": 2761, "time_per_iteration": 3.993438959121704 }, { "auxiliary_loss_clip": 0.01157817, "auxiliary_loss_mlp": 0.01040477, "balance_loss_clip": 1.02133262, "balance_loss_mlp": 1.05523801, "epoch": 0.1660604238689313, "flos": 26396569607040.0, "grad_norm": 1.745910908135781, "language_loss": 0.89082241, "learning_rate": 3.8089271345131788e-06, "loss": 0.91280532, "num_input_tokens_seen": 59718795, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0234375, "step": 2762, "time_per_iteration": 2.513598918914795 }, { "auxiliary_loss_clip": 0.01160373, "auxiliary_loss_mlp": 0.010495, "balance_loss_clip": 1.02988958, "balance_loss_mlp": 1.05620682, "epoch": 0.16612054712159927, "flos": 23039999383680.0, "grad_norm": 1.6455044109252157, "language_loss": 0.88064152, "learning_rate": 3.8087609749928822e-06, "loss": 0.90274024, "num_input_tokens_seen": 59737555, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0390625, "step": 2763, "time_per_iteration": 3.852184534072876 }, { "auxiliary_loss_clip": 0.010808, "auxiliary_loss_mlp": 0.01003944, "balance_loss_clip": 1.00098729, "balance_loss_mlp": 1.0421747, "epoch": 0.16618067037426726, "flos": 59241225202560.0, "grad_norm": 0.7736966525486099, "language_loss": 0.59804952, "learning_rate": 3.8085947468841885e-06, "loss": 0.61889696, "num_input_tokens_seen": 59800915, "router_z_loss_clip": 0.02954102, "router_z_loss_mlp": 0.38671875, "step": 2764, "time_per_iteration": 4.553582191467285 }, { "auxiliary_loss_clip": 0.011628, "auxiliary_loss_mlp": 0.01048232, "balance_loss_clip": 1.02732301, "balance_loss_mlp": 1.05839014, "epoch": 0.16624079362693522, "flos": 27198813916800.0, "grad_norm": 1.6619066366441928, "language_loss": 0.82098264, "learning_rate": 3.808428450193401e-06, "loss": 0.84309292, "num_input_tokens_seen": 59822910, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.046875, "step": 2765, "time_per_iteration": 3.982786178588867 }, { "auxiliary_loss_clip": 0.01168583, "auxiliary_loss_mlp": 0.01052886, "balance_loss_clip": 1.03110623, "balance_loss_mlp": 1.05852258, "epoch": 0.1663009168796032, "flos": 10925068216320.0, "grad_norm": 2.1219463767585047, "language_loss": 0.69469512, "learning_rate": 3.8082620849268244e-06, "loss": 0.71690977, "num_input_tokens_seen": 59838805, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.1015625, "step": 2766, "time_per_iteration": 2.434603691101074 }, { "auxiliary_loss_clip": 0.01160221, "auxiliary_loss_mlp": 0.01047163, "balance_loss_clip": 1.02788711, "balance_loss_mlp": 1.05921674, "epoch": 0.16636104013227115, "flos": 17894431353600.0, "grad_norm": 2.13336924058475, "language_loss": 0.88609582, "learning_rate": 3.808095651090769e-06, "loss": 0.90816963, "num_input_tokens_seen": 59855345, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.015625, "step": 2767, "time_per_iteration": 2.4934329986572266 }, { "auxiliary_loss_clip": 0.01079299, "auxiliary_loss_mlp": 0.01005745, "balance_loss_clip": 1.00269306, "balance_loss_mlp": 1.04210067, "epoch": 0.16642116338493912, "flos": 66726050463360.0, "grad_norm": 0.7887401922615601, "language_loss": 0.52917051, "learning_rate": 3.8079291486915447e-06, "loss": 0.55002093, "num_input_tokens_seen": 59917710, "router_z_loss_clip": 0.03051758, "router_z_loss_mlp": 0.37109375, "step": 2768, "time_per_iteration": 3.2052440643310547 }, { "auxiliary_loss_clip": 0.0116315, "auxiliary_loss_mlp": 0.01048357, "balance_loss_clip": 1.0276742, "balance_loss_mlp": 1.05563331, "epoch": 0.16648128663760708, "flos": 19026048401280.0, "grad_norm": 2.490662963282855, "language_loss": 0.85093129, "learning_rate": 3.8077625777354667e-06, "loss": 0.8730464, "num_input_tokens_seen": 59935105, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.078125, "step": 2769, "time_per_iteration": 2.481990098953247 }, { "auxiliary_loss_clip": 0.01078704, "auxiliary_loss_mlp": 0.01007078, "balance_loss_clip": 1.00415695, "balance_loss_mlp": 1.04150462, "epoch": 0.16654140989027508, "flos": 70134976759680.0, "grad_norm": 0.813441516988102, "language_loss": 0.57556707, "learning_rate": 3.80759593822885e-06, "loss": 0.59642494, "num_input_tokens_seen": 59984085, "router_z_loss_clip": 0.0291748, "router_z_loss_mlp": 0.37109375, "step": 2770, "time_per_iteration": 2.9528470039367676 }, { "auxiliary_loss_clip": 0.01076338, "auxiliary_loss_mlp": 0.01008247, "balance_loss_clip": 1.00527847, "balance_loss_mlp": 1.03940296, "epoch": 0.16660153314294304, "flos": 70272406195200.0, "grad_norm": 0.8680078261145846, "language_loss": 0.56267083, "learning_rate": 3.807429230178015e-06, "loss": 0.58351672, "num_input_tokens_seen": 60043470, "router_z_loss_clip": 0.02966309, "router_z_loss_mlp": 0.36914062, "step": 2771, "time_per_iteration": 2.9435083866119385 }, { "auxiliary_loss_clip": 0.01159767, "auxiliary_loss_mlp": 0.01053796, "balance_loss_clip": 1.03349471, "balance_loss_mlp": 1.05616105, "epoch": 0.166661656395611, "flos": 23075048079360.0, "grad_norm": 2.281989334846615, "language_loss": 0.7081393, "learning_rate": 3.8072624535892817e-06, "loss": 0.73027503, "num_input_tokens_seen": 60063045, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.03125, "step": 2772, "time_per_iteration": 2.5541481971740723 }, { "auxiliary_loss_clip": 0.01159194, "auxiliary_loss_mlp": 0.01047878, "balance_loss_clip": 1.02805328, "balance_loss_mlp": 1.056126, "epoch": 0.16672177964827897, "flos": 28366341586560.0, "grad_norm": 2.56968632633647, "language_loss": 0.86059201, "learning_rate": 3.807095608468975e-06, "loss": 0.88266265, "num_input_tokens_seen": 60081945, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.03125, "step": 2773, "time_per_iteration": 2.537869453430176 }, { "auxiliary_loss_clip": 0.01159668, "auxiliary_loss_mlp": 0.01040315, "balance_loss_clip": 1.02202785, "balance_loss_mlp": 1.05634379, "epoch": 0.16678190290094694, "flos": 19091010147840.0, "grad_norm": 2.0152675208620976, "language_loss": 0.82389307, "learning_rate": 3.8069286948234224e-06, "loss": 0.84589291, "num_input_tokens_seen": 60096820, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.03125, "step": 2774, "time_per_iteration": 2.4580013751983643 }, { "auxiliary_loss_clip": 0.01162001, "auxiliary_loss_mlp": 0.0104687, "balance_loss_clip": 1.02693844, "balance_loss_mlp": 1.05658221, "epoch": 0.1668420261536149, "flos": 21799106184960.0, "grad_norm": 2.5955248137910236, "language_loss": 0.8324247, "learning_rate": 3.806761712658952e-06, "loss": 0.85451341, "num_input_tokens_seen": 60116140, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0546875, "step": 2775, "time_per_iteration": 2.4702670574188232 }, { "auxiliary_loss_clip": 0.01157806, "auxiliary_loss_mlp": 0.01049486, "balance_loss_clip": 1.03143752, "balance_loss_mlp": 1.05639315, "epoch": 0.16690214940628287, "flos": 19062533640960.0, "grad_norm": 1.6624449653549203, "language_loss": 0.80497104, "learning_rate": 3.806594661981897e-06, "loss": 0.82704401, "num_input_tokens_seen": 60134235, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.015625, "step": 2776, "time_per_iteration": 2.481884717941284 }, { "auxiliary_loss_clip": 0.01153816, "auxiliary_loss_mlp": 0.01046566, "balance_loss_clip": 1.02771926, "balance_loss_mlp": 1.0559082, "epoch": 0.16696227265895086, "flos": 18588548747520.0, "grad_norm": 1.9830596227970054, "language_loss": 0.80561048, "learning_rate": 3.8064275427985906e-06, "loss": 0.82761431, "num_input_tokens_seen": 60153275, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.9765625, "step": 2777, "time_per_iteration": 2.4512124061584473 }, { "auxiliary_loss_clip": 0.01157441, "auxiliary_loss_mlp": 0.01042471, "balance_loss_clip": 1.02338576, "balance_loss_mlp": 1.05390549, "epoch": 0.16702239591161883, "flos": 23294139085440.0, "grad_norm": 3.5962361205184386, "language_loss": 0.85500014, "learning_rate": 3.806260355115371e-06, "loss": 0.87699926, "num_input_tokens_seen": 60173215, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.03125, "step": 2778, "time_per_iteration": 2.4988226890563965 }, { "auxiliary_loss_clip": 0.01162024, "auxiliary_loss_mlp": 0.01040716, "balance_loss_clip": 1.02171433, "balance_loss_mlp": 1.05663216, "epoch": 0.1670825191642868, "flos": 24425648392320.0, "grad_norm": 1.9149894010772448, "language_loss": 0.74651253, "learning_rate": 3.8060930989385778e-06, "loss": 0.76853991, "num_input_tokens_seen": 60190515, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0546875, "step": 2779, "time_per_iteration": 2.4926929473876953 }, { "auxiliary_loss_clip": 0.01159124, "auxiliary_loss_mlp": 0.01045829, "balance_loss_clip": 1.02636266, "balance_loss_mlp": 1.05590105, "epoch": 0.16714264241695476, "flos": 26797512193920.0, "grad_norm": 2.273981941595231, "language_loss": 0.65199041, "learning_rate": 3.805925774274554e-06, "loss": 0.67403996, "num_input_tokens_seen": 60211655, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.03125, "step": 2780, "time_per_iteration": 2.544177293777466 }, { "auxiliary_loss_clip": 0.01155812, "auxiliary_loss_mlp": 0.01044614, "balance_loss_clip": 1.02507615, "balance_loss_mlp": 1.05413556, "epoch": 0.16720276566962272, "flos": 21835304115840.0, "grad_norm": 1.9809122191581308, "language_loss": 0.78586549, "learning_rate": 3.805758381129643e-06, "loss": 0.80786979, "num_input_tokens_seen": 60230860, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.015625, "step": 2781, "time_per_iteration": 2.4613871574401855 }, { "auxiliary_loss_clip": 0.01159925, "auxiliary_loss_mlp": 0.01048646, "balance_loss_clip": 1.02998972, "balance_loss_mlp": 1.05459464, "epoch": 0.1672628889222907, "flos": 21470415805440.0, "grad_norm": 1.5926237705044237, "language_loss": 0.75273931, "learning_rate": 3.805590919510193e-06, "loss": 0.77482498, "num_input_tokens_seen": 60250535, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.0546875, "step": 2782, "time_per_iteration": 2.4846994876861572 }, { "auxiliary_loss_clip": 0.01165498, "auxiliary_loss_mlp": 0.01049376, "balance_loss_clip": 1.02917016, "balance_loss_mlp": 1.05666447, "epoch": 0.16732301217495865, "flos": 30774008269440.0, "grad_norm": 1.8725351643056545, "language_loss": 0.67691469, "learning_rate": 3.8054233894225547e-06, "loss": 0.69906342, "num_input_tokens_seen": 60269530, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0859375, "step": 2783, "time_per_iteration": 2.5281898975372314 }, { "auxiliary_loss_clip": 0.01159077, "auxiliary_loss_mlp": 0.01050759, "balance_loss_clip": 1.03185213, "balance_loss_mlp": 1.05550778, "epoch": 0.16738313542762664, "flos": 23474625949440.0, "grad_norm": 1.6200484645898705, "language_loss": 0.70189297, "learning_rate": 3.805255790873081e-06, "loss": 0.72399133, "num_input_tokens_seen": 60289900, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.03125, "step": 2784, "time_per_iteration": 2.4951813220977783 }, { "auxiliary_loss_clip": 0.01159533, "auxiliary_loss_mlp": 0.0105024, "balance_loss_clip": 1.02921104, "balance_loss_mlp": 1.05346894, "epoch": 0.1674432586802946, "flos": 29789086366080.0, "grad_norm": 2.0606466942747548, "language_loss": 0.60858405, "learning_rate": 3.805088123868126e-06, "loss": 0.63068175, "num_input_tokens_seen": 60310025, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.0625, "step": 2785, "time_per_iteration": 2.5310311317443848 }, { "auxiliary_loss_clip": 0.01067025, "auxiliary_loss_mlp": 0.01001299, "balance_loss_clip": 0.99887908, "balance_loss_mlp": 1.03038192, "epoch": 0.16750338193296258, "flos": 66136073575680.0, "grad_norm": 0.7830117786179084, "language_loss": 0.58843613, "learning_rate": 3.8049203884140492e-06, "loss": 0.60911936, "num_input_tokens_seen": 60377800, "router_z_loss_clip": 0.02416992, "router_z_loss_mlp": 0.3671875, "step": 2786, "time_per_iteration": 3.1266539096832275 }, { "auxiliary_loss_clip": 0.01158407, "auxiliary_loss_mlp": 0.01046054, "balance_loss_clip": 1.02634907, "balance_loss_mlp": 1.0532198, "epoch": 0.16756350518563054, "flos": 25696777864320.0, "grad_norm": 1.7993667778364286, "language_loss": 0.76033294, "learning_rate": 3.80475258451721e-06, "loss": 0.7823776, "num_input_tokens_seen": 60398215, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0546875, "step": 2787, "time_per_iteration": 2.5130598545074463 }, { "auxiliary_loss_clip": 0.01158571, "auxiliary_loss_mlp": 0.01045899, "balance_loss_clip": 1.02738607, "balance_loss_mlp": 1.05457544, "epoch": 0.1676236284382985, "flos": 23836102467840.0, "grad_norm": 1.8197956088296916, "language_loss": 0.77415293, "learning_rate": 3.804584712183972e-06, "loss": 0.79619765, "num_input_tokens_seen": 60416910, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0390625, "step": 2788, "time_per_iteration": 2.493352174758911 }, { "auxiliary_loss_clip": 0.01065972, "auxiliary_loss_mlp": 0.01000413, "balance_loss_clip": 0.99781477, "balance_loss_mlp": 1.029392, "epoch": 0.16768375169096647, "flos": 59874902985600.0, "grad_norm": 0.8665695569491069, "language_loss": 0.59363198, "learning_rate": 3.8044167714207013e-06, "loss": 0.61429584, "num_input_tokens_seen": 60468660, "router_z_loss_clip": 0.02600098, "router_z_loss_mlp": 0.3671875, "step": 2789, "time_per_iteration": 2.9500162601470947 }, { "auxiliary_loss_clip": 0.01158807, "auxiliary_loss_mlp": 0.0105504, "balance_loss_clip": 1.03544188, "balance_loss_mlp": 1.05380177, "epoch": 0.16774387494363446, "flos": 38435657207040.0, "grad_norm": 1.5786955250078725, "language_loss": 0.70223689, "learning_rate": 3.804248762233765e-06, "loss": 0.72437537, "num_input_tokens_seen": 60492370, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.046875, "step": 2790, "time_per_iteration": 2.632406711578369 }, { "auxiliary_loss_clip": 0.01158468, "auxiliary_loss_mlp": 0.01053197, "balance_loss_clip": 1.03479147, "balance_loss_mlp": 1.05461609, "epoch": 0.16780399819630243, "flos": 22637620252800.0, "grad_norm": 1.6261653280564705, "language_loss": 0.79240012, "learning_rate": 3.8040806846295356e-06, "loss": 0.81451678, "num_input_tokens_seen": 60512655, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0390625, "step": 2791, "time_per_iteration": 2.504794120788574 }, { "auxiliary_loss_clip": 0.0115813, "auxiliary_loss_mlp": 0.01049749, "balance_loss_clip": 1.03084266, "balance_loss_mlp": 1.05394936, "epoch": 0.1678641214489704, "flos": 32891516887680.0, "grad_norm": 1.6889916101309794, "language_loss": 0.71568733, "learning_rate": 3.8039125386143853e-06, "loss": 0.73776615, "num_input_tokens_seen": 60533090, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.046875, "step": 2792, "time_per_iteration": 2.586531162261963 }, { "auxiliary_loss_clip": 0.01159392, "auxiliary_loss_mlp": 0.01043809, "balance_loss_clip": 1.02533126, "balance_loss_mlp": 1.05457723, "epoch": 0.16792424470163836, "flos": 19974916028160.0, "grad_norm": 1.8395590816392493, "language_loss": 0.71416658, "learning_rate": 3.803744324194691e-06, "loss": 0.73619854, "num_input_tokens_seen": 60553190, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.046875, "step": 2793, "time_per_iteration": 2.4690585136413574 }, { "auxiliary_loss_clip": 0.01160063, "auxiliary_loss_mlp": 0.01048711, "balance_loss_clip": 1.02958965, "balance_loss_mlp": 1.05634499, "epoch": 0.16798436795430632, "flos": 19719878486400.0, "grad_norm": 2.4102892058601393, "language_loss": 0.76944602, "learning_rate": 3.803576041376831e-06, "loss": 0.79153383, "num_input_tokens_seen": 60571995, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0390625, "step": 2794, "time_per_iteration": 2.491971254348755 }, { "auxiliary_loss_clip": 0.01159225, "auxiliary_loss_mlp": 0.01047598, "balance_loss_clip": 1.02809548, "balance_loss_mlp": 1.05536556, "epoch": 0.1680444912069743, "flos": 28104839596800.0, "grad_norm": 16.053845455618774, "language_loss": 0.71588135, "learning_rate": 3.803407690167187e-06, "loss": 0.73794961, "num_input_tokens_seen": 60591275, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0390625, "step": 2795, "time_per_iteration": 2.508572816848755 }, { "auxiliary_loss_clip": 0.01155882, "auxiliary_loss_mlp": 0.01042777, "balance_loss_clip": 1.0247407, "balance_loss_mlp": 1.05248022, "epoch": 0.16810461445964225, "flos": 18075205526400.0, "grad_norm": 1.7575510424168805, "language_loss": 0.84061724, "learning_rate": 3.803239270572142e-06, "loss": 0.86260384, "num_input_tokens_seen": 60609235, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.03125, "step": 2796, "time_per_iteration": 2.4962713718414307 }, { "auxiliary_loss_clip": 0.01160683, "auxiliary_loss_mlp": 0.0104593, "balance_loss_clip": 1.02660644, "balance_loss_mlp": 1.05543113, "epoch": 0.16816473771231025, "flos": 23878657105920.0, "grad_norm": 1.691150030105644, "language_loss": 0.81427574, "learning_rate": 3.8030707825980838e-06, "loss": 0.83634192, "num_input_tokens_seen": 60629880, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.046875, "step": 2797, "time_per_iteration": 2.5046207904815674 }, { "auxiliary_loss_clip": 0.0115349, "auxiliary_loss_mlp": 0.0104056, "balance_loss_clip": 1.02437115, "balance_loss_mlp": 1.05490375, "epoch": 0.1682248609649782, "flos": 22783597125120.0, "grad_norm": 1.7113406168539724, "language_loss": 0.75261384, "learning_rate": 3.802902226251401e-06, "loss": 0.77455437, "num_input_tokens_seen": 60651175, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.984375, "step": 2798, "time_per_iteration": 2.5072150230407715 }, { "auxiliary_loss_clip": 0.01160686, "auxiliary_loss_mlp": 0.01048427, "balance_loss_clip": 1.0308795, "balance_loss_mlp": 1.05721974, "epoch": 0.16828498421764618, "flos": 20705123612160.0, "grad_norm": 1.4871249902041503, "language_loss": 0.79761302, "learning_rate": 3.8027336015384845e-06, "loss": 0.81970417, "num_input_tokens_seen": 60670210, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 1.03125, "step": 2799, "time_per_iteration": 2.4557130336761475 }, { "auxiliary_loss_clip": 0.01159323, "auxiliary_loss_mlp": 0.01042134, "balance_loss_clip": 1.02312016, "balance_loss_mlp": 1.05305028, "epoch": 0.16834510747031414, "flos": 29420606695680.0, "grad_norm": 1.9063551977150635, "language_loss": 0.71056926, "learning_rate": 3.8025649084657296e-06, "loss": 0.73258376, "num_input_tokens_seen": 60690895, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0625, "step": 2800, "time_per_iteration": 2.558145761489868 }, { "auxiliary_loss_clip": 0.01155488, "auxiliary_loss_mlp": 0.01040044, "balance_loss_clip": 1.01998091, "balance_loss_mlp": 1.05314302, "epoch": 0.1684052307229821, "flos": 18145374744960.0, "grad_norm": 1.971785006061995, "language_loss": 0.83727193, "learning_rate": 3.8023961470395326e-06, "loss": 0.85922718, "num_input_tokens_seen": 60708280, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.0234375, "step": 2801, "time_per_iteration": 2.4424352645874023 }, { "auxiliary_loss_clip": 0.01157299, "auxiliary_loss_mlp": 0.01052436, "balance_loss_clip": 1.03414941, "balance_loss_mlp": 1.05303824, "epoch": 0.16846535397565007, "flos": 16574929240320.0, "grad_norm": 2.347045451699567, "language_loss": 0.82197309, "learning_rate": 3.8022273172662933e-06, "loss": 0.84407043, "num_input_tokens_seen": 60724150, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.046875, "step": 2802, "time_per_iteration": 2.4460268020629883 }, { "auxiliary_loss_clip": 0.01160238, "auxiliary_loss_mlp": 0.01044937, "balance_loss_clip": 1.02555358, "balance_loss_mlp": 1.05459142, "epoch": 0.16852547722831807, "flos": 30408868563840.0, "grad_norm": 1.574548557019479, "language_loss": 0.80961955, "learning_rate": 3.802058419152413e-06, "loss": 0.8316713, "num_input_tokens_seen": 60746485, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0546875, "step": 2803, "time_per_iteration": 4.048713445663452 }, { "auxiliary_loss_clip": 0.01155249, "auxiliary_loss_mlp": 0.01047541, "balance_loss_clip": 1.02959991, "balance_loss_mlp": 1.05362546, "epoch": 0.16858560048098603, "flos": 33507420416640.0, "grad_norm": 2.4515679665934393, "language_loss": 0.76470369, "learning_rate": 3.801889452704297e-06, "loss": 0.78673166, "num_input_tokens_seen": 60762875, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.015625, "step": 2804, "time_per_iteration": 3.9240002632141113 }, { "auxiliary_loss_clip": 0.01063603, "auxiliary_loss_mlp": 0.01013238, "balance_loss_clip": 1.01075876, "balance_loss_mlp": 1.02729678, "epoch": 0.168645723733654, "flos": 67370502326400.0, "grad_norm": 0.8261353798771426, "language_loss": 0.55417454, "learning_rate": 3.8017204179283526e-06, "loss": 0.57494295, "num_input_tokens_seen": 60825510, "router_z_loss_clip": 0.02478027, "router_z_loss_mlp": 0.36328125, "step": 2805, "time_per_iteration": 3.0411593914031982 }, { "auxiliary_loss_clip": 0.0115109, "auxiliary_loss_mlp": 0.01042233, "balance_loss_clip": 1.02510262, "balance_loss_mlp": 1.0500145, "epoch": 0.16870584698632196, "flos": 21324618501120.0, "grad_norm": 2.0711953377854626, "language_loss": 0.73021811, "learning_rate": 3.8015513148309892e-06, "loss": 0.75215137, "num_input_tokens_seen": 60844440, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 1.015625, "step": 2806, "time_per_iteration": 5.323753356933594 }, { "auxiliary_loss_clip": 0.01152024, "auxiliary_loss_mlp": 0.01047218, "balance_loss_clip": 1.0289669, "balance_loss_mlp": 1.05056751, "epoch": 0.16876597023898993, "flos": 20740746925440.0, "grad_norm": 1.8467081381970067, "language_loss": 0.69640702, "learning_rate": 3.80138214341862e-06, "loss": 0.71839947, "num_input_tokens_seen": 60863210, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.015625, "step": 2807, "time_per_iteration": 2.468653917312622 }, { "auxiliary_loss_clip": 0.01156137, "auxiliary_loss_mlp": 0.01048454, "balance_loss_clip": 1.02916622, "balance_loss_mlp": 1.0520246, "epoch": 0.1688260934916579, "flos": 20303498666880.0, "grad_norm": 2.5038038311897846, "language_loss": 0.70588863, "learning_rate": 3.8012129036976587e-06, "loss": 0.7279346, "num_input_tokens_seen": 60882510, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0390625, "step": 2808, "time_per_iteration": 2.461148262023926 }, { "auxiliary_loss_clip": 0.0115886, "auxiliary_loss_mlp": 0.01045683, "balance_loss_clip": 1.0256561, "balance_loss_mlp": 1.05233073, "epoch": 0.16888621674432586, "flos": 20340702178560.0, "grad_norm": 2.24186815600959, "language_loss": 0.80190253, "learning_rate": 3.8010435956745236e-06, "loss": 0.82394803, "num_input_tokens_seen": 60901105, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.0625, "step": 2809, "time_per_iteration": 2.486299514770508 }, { "auxiliary_loss_clip": 0.01160306, "auxiliary_loss_mlp": 0.01045955, "balance_loss_clip": 1.02685797, "balance_loss_mlp": 1.05175614, "epoch": 0.16894633999699385, "flos": 16244802316800.0, "grad_norm": 2.315510043808443, "language_loss": 0.8828485, "learning_rate": 3.8008742193556358e-06, "loss": 0.90491116, "num_input_tokens_seen": 60915340, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.078125, "step": 2810, "time_per_iteration": 2.408038377761841 }, { "auxiliary_loss_clip": 0.01159655, "auxiliary_loss_mlp": 0.01050605, "balance_loss_clip": 1.03123355, "balance_loss_mlp": 1.0533582, "epoch": 0.16900646324966181, "flos": 19610171372160.0, "grad_norm": 1.8813646753982225, "language_loss": 0.92562866, "learning_rate": 3.800704774747416e-06, "loss": 0.94773114, "num_input_tokens_seen": 60933735, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0625, "step": 2811, "time_per_iteration": 2.4617016315460205 }, { "auxiliary_loss_clip": 0.01158295, "auxiliary_loss_mlp": 0.0104486, "balance_loss_clip": 1.02708578, "balance_loss_mlp": 1.05388772, "epoch": 0.16906658650232978, "flos": 22018089450240.0, "grad_norm": 1.982605814897745, "language_loss": 0.78768456, "learning_rate": 3.800535261856291e-06, "loss": 0.80971611, "num_input_tokens_seen": 60953105, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.0390625, "step": 2812, "time_per_iteration": 2.4708447456359863 }, { "auxiliary_loss_clip": 0.01154937, "auxiliary_loss_mlp": 0.01048976, "balance_loss_clip": 1.03108239, "balance_loss_mlp": 1.05337572, "epoch": 0.16912670975499774, "flos": 11763690024960.0, "grad_norm": 2.5572252414788927, "language_loss": 0.74663359, "learning_rate": 3.8003656806886887e-06, "loss": 0.76867276, "num_input_tokens_seen": 60969150, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 1.015625, "step": 2813, "time_per_iteration": 2.4120612144470215 }, { "auxiliary_loss_clip": 0.0115966, "auxiliary_loss_mlp": 0.0105026, "balance_loss_clip": 1.03131807, "balance_loss_mlp": 1.05352521, "epoch": 0.1691868330076657, "flos": 17161386595200.0, "grad_norm": 2.2927451655967466, "language_loss": 0.68951452, "learning_rate": 3.8001960312510396e-06, "loss": 0.71161371, "num_input_tokens_seen": 60982825, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0625, "step": 2814, "time_per_iteration": 2.4140825271606445 }, { "auxiliary_loss_clip": 0.01157318, "auxiliary_loss_mlp": 0.0104443, "balance_loss_clip": 1.02598822, "balance_loss_mlp": 1.05356443, "epoch": 0.16924695626033368, "flos": 22416553998720.0, "grad_norm": 1.868237903319789, "language_loss": 0.61669838, "learning_rate": 3.800026313549776e-06, "loss": 0.63871586, "num_input_tokens_seen": 61000875, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0390625, "step": 2815, "time_per_iteration": 2.4514334201812744 }, { "auxiliary_loss_clip": 0.0115452, "auxiliary_loss_mlp": 0.01041865, "balance_loss_clip": 1.02342296, "balance_loss_mlp": 1.05192924, "epoch": 0.16930707951300164, "flos": 25739655724800.0, "grad_norm": 1.8266517467334642, "language_loss": 0.82153106, "learning_rate": 3.7998565275913342e-06, "loss": 0.84349483, "num_input_tokens_seen": 61021940, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.03125, "step": 2816, "time_per_iteration": 2.493565797805786 }, { "auxiliary_loss_clip": 0.01160687, "auxiliary_loss_mlp": 0.01049907, "balance_loss_clip": 1.03027284, "balance_loss_mlp": 1.05583954, "epoch": 0.16936720276566963, "flos": 22747040058240.0, "grad_norm": 1.9571724319156472, "language_loss": 0.87001204, "learning_rate": 3.799686673382153e-06, "loss": 0.89211798, "num_input_tokens_seen": 61040285, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.046875, "step": 2817, "time_per_iteration": 2.474421262741089 }, { "auxiliary_loss_clip": 0.01158279, "auxiliary_loss_mlp": 0.01048233, "balance_loss_clip": 1.02867103, "balance_loss_mlp": 1.05542266, "epoch": 0.1694273260183376, "flos": 19573973441280.0, "grad_norm": 1.7074569482802753, "language_loss": 0.81247675, "learning_rate": 3.799516750928672e-06, "loss": 0.83454192, "num_input_tokens_seen": 61059020, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.03125, "step": 2818, "time_per_iteration": 2.4986565113067627 }, { "auxiliary_loss_clip": 0.0115694, "auxiliary_loss_mlp": 0.01046655, "balance_loss_clip": 1.02685404, "balance_loss_mlp": 1.05350578, "epoch": 0.16948744927100556, "flos": 12457843332480.0, "grad_norm": 3.0088319653288385, "language_loss": 0.81080401, "learning_rate": 3.799346760237336e-06, "loss": 0.83283997, "num_input_tokens_seen": 61074245, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.03125, "step": 2819, "time_per_iteration": 2.4271233081817627 }, { "auxiliary_loss_clip": 0.01058974, "auxiliary_loss_mlp": 0.01027855, "balance_loss_clip": 1.02522039, "balance_loss_mlp": 1.02278304, "epoch": 0.16954757252367353, "flos": 71291694435840.0, "grad_norm": 0.935821393367143, "language_loss": 0.6113987, "learning_rate": 3.7991767013145902e-06, "loss": 0.63226694, "num_input_tokens_seen": 61127080, "router_z_loss_clip": 0.02636719, "router_z_loss_mlp": 0.36328125, "step": 2820, "time_per_iteration": 2.99143648147583 }, { "auxiliary_loss_clip": 0.01156243, "auxiliary_loss_mlp": 0.01049676, "balance_loss_clip": 1.0308888, "balance_loss_mlp": 1.05203664, "epoch": 0.1696076957763415, "flos": 29606516513280.0, "grad_norm": 1.9757482609094021, "language_loss": 0.78626758, "learning_rate": 3.7990065741668844e-06, "loss": 0.80832678, "num_input_tokens_seen": 61146955, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0390625, "step": 2821, "time_per_iteration": 2.562117576599121 }, { "auxiliary_loss_clip": 0.01158412, "auxiliary_loss_mlp": 0.01054394, "balance_loss_clip": 1.03365135, "balance_loss_mlp": 1.05404651, "epoch": 0.16966781902900946, "flos": 24388588535040.0, "grad_norm": 4.226038125728723, "language_loss": 0.78894973, "learning_rate": 3.7988363788006685e-06, "loss": 0.81107777, "num_input_tokens_seen": 61166605, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.046875, "step": 2822, "time_per_iteration": 2.495764970779419 }, { "auxiliary_loss_clip": 0.01152586, "auxiliary_loss_mlp": 0.01051208, "balance_loss_clip": 1.03265929, "balance_loss_mlp": 1.05062521, "epoch": 0.16972794228167745, "flos": 23038814234880.0, "grad_norm": 3.987752208368644, "language_loss": 0.74754798, "learning_rate": 3.7986661152223967e-06, "loss": 0.76958591, "num_input_tokens_seen": 61186535, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.015625, "step": 2823, "time_per_iteration": 2.504422664642334 }, { "auxiliary_loss_clip": 0.01158749, "auxiliary_loss_mlp": 0.01055395, "balance_loss_clip": 1.0350343, "balance_loss_mlp": 1.05395544, "epoch": 0.16978806553434542, "flos": 35228691129600.0, "grad_norm": 1.7696644830623613, "language_loss": 0.60318434, "learning_rate": 3.7984957834385257e-06, "loss": 0.6253258, "num_input_tokens_seen": 61208965, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.046875, "step": 2824, "time_per_iteration": 2.6076087951660156 }, { "auxiliary_loss_clip": 0.01158955, "auxiliary_loss_mlp": 0.01043379, "balance_loss_clip": 1.02318454, "balance_loss_mlp": 1.05426705, "epoch": 0.16984818878701338, "flos": 32014290936960.0, "grad_norm": 1.5550210540369427, "language_loss": 0.72873259, "learning_rate": 3.7983253834555144e-06, "loss": 0.75075591, "num_input_tokens_seen": 61230670, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.046875, "step": 2825, "time_per_iteration": 2.5668623447418213 }, { "auxiliary_loss_clip": 0.01163871, "auxiliary_loss_mlp": 0.01051352, "balance_loss_clip": 1.02988291, "balance_loss_mlp": 1.05444074, "epoch": 0.16990831203968135, "flos": 22818609907200.0, "grad_norm": 1.9725022784546553, "language_loss": 0.8557508, "learning_rate": 3.7981549152798245e-06, "loss": 0.87790304, "num_input_tokens_seen": 61249510, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.09375, "step": 2826, "time_per_iteration": 2.4570393562316895 }, { "auxiliary_loss_clip": 0.01161816, "auxiliary_loss_mlp": 0.01047682, "balance_loss_clip": 1.02829838, "balance_loss_mlp": 1.05479717, "epoch": 0.1699684352923493, "flos": 23039604334080.0, "grad_norm": 2.2343125981130427, "language_loss": 0.82363999, "learning_rate": 3.7979843789179196e-06, "loss": 0.84573489, "num_input_tokens_seen": 61269440, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.0703125, "step": 2827, "time_per_iteration": 2.489644765853882 }, { "auxiliary_loss_clip": 0.01159687, "auxiliary_loss_mlp": 0.01046129, "balance_loss_clip": 1.02562535, "balance_loss_mlp": 1.05217803, "epoch": 0.17002855854501728, "flos": 21434110133760.0, "grad_norm": 1.7637100216293902, "language_loss": 0.73868781, "learning_rate": 3.797813774376267e-06, "loss": 0.760746, "num_input_tokens_seen": 61288195, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.0703125, "step": 2828, "time_per_iteration": 2.4619383811950684 }, { "auxiliary_loss_clip": 0.01057909, "auxiliary_loss_mlp": 0.01003443, "balance_loss_clip": 1.00017679, "balance_loss_mlp": 1.02021456, "epoch": 0.17008868179768524, "flos": 71453509205760.0, "grad_norm": 0.7680695189684728, "language_loss": 0.56488746, "learning_rate": 3.797643101661336e-06, "loss": 0.58550096, "num_input_tokens_seen": 61350850, "router_z_loss_clip": 0.03271484, "router_z_loss_mlp": 0.37695312, "step": 2829, "time_per_iteration": 3.1224288940429688 }, { "auxiliary_loss_clip": 0.01157171, "auxiliary_loss_mlp": 0.01055189, "balance_loss_clip": 1.03505456, "balance_loss_mlp": 1.05213892, "epoch": 0.17014880505035324, "flos": 24900315644160.0, "grad_norm": 3.806908161441301, "language_loss": 0.83604127, "learning_rate": 3.7974723607795983e-06, "loss": 0.85816485, "num_input_tokens_seen": 61370765, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0546875, "step": 2830, "time_per_iteration": 2.4868621826171875 }, { "auxiliary_loss_clip": 0.01156595, "auxiliary_loss_mlp": 0.01048381, "balance_loss_clip": 1.0279845, "balance_loss_mlp": 1.05016088, "epoch": 0.1702089283030212, "flos": 29862415981440.0, "grad_norm": 3.7397980975538387, "language_loss": 0.78945333, "learning_rate": 3.797301551737529e-06, "loss": 0.81150311, "num_input_tokens_seen": 61388935, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.0625, "step": 2831, "time_per_iteration": 2.5233020782470703 }, { "auxiliary_loss_clip": 0.01162162, "auxiliary_loss_mlp": 0.01052111, "balance_loss_clip": 1.03087974, "balance_loss_mlp": 1.05419433, "epoch": 0.17026905155568917, "flos": 17744180762880.0, "grad_norm": 2.298019726908818, "language_loss": 0.79130995, "learning_rate": 3.7971306745416044e-06, "loss": 0.81345272, "num_input_tokens_seen": 61407350, "router_z_loss_clip": 0.21191406, "router_z_loss_mlp": 1.078125, "step": 2832, "time_per_iteration": 2.4279465675354004 }, { "auxiliary_loss_clip": 0.01161411, "auxiliary_loss_mlp": 0.01049764, "balance_loss_clip": 1.03039205, "balance_loss_mlp": 1.0541985, "epoch": 0.17032917480835713, "flos": 23148665003520.0, "grad_norm": 1.6165490309842274, "language_loss": 0.8901751, "learning_rate": 3.7969597291983046e-06, "loss": 0.91228688, "num_input_tokens_seen": 61429010, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0703125, "step": 2833, "time_per_iteration": 2.499150276184082 }, { "auxiliary_loss_clip": 0.0115553, "auxiliary_loss_mlp": 0.01049058, "balance_loss_clip": 1.02971005, "balance_loss_mlp": 1.05046678, "epoch": 0.1703892980610251, "flos": 39202565512320.0, "grad_norm": 2.2233788743669827, "language_loss": 0.7239486, "learning_rate": 3.7967887157141115e-06, "loss": 0.74599445, "num_input_tokens_seen": 61450040, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.046875, "step": 2834, "time_per_iteration": 2.6006462574005127 }, { "auxiliary_loss_clip": 0.01162157, "auxiliary_loss_mlp": 0.01057215, "balance_loss_clip": 1.03870153, "balance_loss_mlp": 1.05518031, "epoch": 0.17044942131369306, "flos": 23039101543680.0, "grad_norm": 2.777621702486136, "language_loss": 0.86204106, "learning_rate": 3.7966176340955106e-06, "loss": 0.88423479, "num_input_tokens_seen": 61468585, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0703125, "step": 2835, "time_per_iteration": 2.496579647064209 }, { "auxiliary_loss_clip": 0.01162352, "auxiliary_loss_mlp": 0.01055771, "balance_loss_clip": 1.03371704, "balance_loss_mlp": 1.05357814, "epoch": 0.17050954456636103, "flos": 17054983532160.0, "grad_norm": 2.2504233405309493, "language_loss": 0.74099422, "learning_rate": 3.796446484348989e-06, "loss": 0.76317537, "num_input_tokens_seen": 61486330, "router_z_loss_clip": 0.22070312, "router_z_loss_mlp": 1.0859375, "step": 2836, "time_per_iteration": 2.43367862701416 }, { "auxiliary_loss_clip": 0.01163256, "auxiliary_loss_mlp": 0.01047076, "balance_loss_clip": 1.0254041, "balance_loss_mlp": 1.0536387, "epoch": 0.17056966781902902, "flos": 16836969934080.0, "grad_norm": 2.845309499453628, "language_loss": 0.80365682, "learning_rate": 3.796275266481036e-06, "loss": 0.82576013, "num_input_tokens_seen": 61503950, "router_z_loss_clip": 0.21679688, "router_z_loss_mlp": 1.1015625, "step": 2837, "time_per_iteration": 2.4515275955200195 }, { "auxiliary_loss_clip": 0.01155713, "auxiliary_loss_mlp": 0.01044732, "balance_loss_clip": 1.02534842, "balance_loss_mlp": 1.05359721, "epoch": 0.17062979107169698, "flos": 17712543859200.0, "grad_norm": 2.7405537186870617, "language_loss": 0.83268619, "learning_rate": 3.7961039804981456e-06, "loss": 0.85469055, "num_input_tokens_seen": 61523550, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.0234375, "step": 2838, "time_per_iteration": 2.438690423965454 }, { "auxiliary_loss_clip": 0.011549, "auxiliary_loss_mlp": 0.01051516, "balance_loss_clip": 1.03237081, "balance_loss_mlp": 1.05193114, "epoch": 0.17068991432436495, "flos": 22525040050560.0, "grad_norm": 2.0934053362820326, "language_loss": 0.93389702, "learning_rate": 3.795932626406812e-06, "loss": 0.95596117, "num_input_tokens_seen": 61542720, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.03125, "step": 2839, "time_per_iteration": 2.4970805644989014 }, { "auxiliary_loss_clip": 0.01158512, "auxiliary_loss_mlp": 0.01050567, "balance_loss_clip": 1.02862072, "balance_loss_mlp": 1.05143774, "epoch": 0.17075003757703291, "flos": 25882939077120.0, "grad_norm": 1.8450745528677344, "language_loss": 0.83690739, "learning_rate": 3.7957612042135336e-06, "loss": 0.85899818, "num_input_tokens_seen": 61563040, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.0703125, "step": 2840, "time_per_iteration": 2.4858219623565674 }, { "auxiliary_loss_clip": 0.01160095, "auxiliary_loss_mlp": 0.01051175, "balance_loss_clip": 1.03059995, "balance_loss_mlp": 1.0529685, "epoch": 0.17081016082970088, "flos": 20120713332480.0, "grad_norm": 1.7672205443146152, "language_loss": 0.76043022, "learning_rate": 3.79558971392481e-06, "loss": 0.782543, "num_input_tokens_seen": 61581890, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.0703125, "step": 2841, "time_per_iteration": 2.4505622386932373 }, { "auxiliary_loss_clip": 0.01157424, "auxiliary_loss_mlp": 0.01053379, "balance_loss_clip": 1.03353119, "balance_loss_mlp": 1.05078459, "epoch": 0.17087028408236885, "flos": 24936477661440.0, "grad_norm": 1.8276525852075935, "language_loss": 0.76832747, "learning_rate": 3.7954181555471443e-06, "loss": 0.79043555, "num_input_tokens_seen": 61602095, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0625, "step": 2842, "time_per_iteration": 2.477867364883423 }, { "auxiliary_loss_clip": 0.01153146, "auxiliary_loss_mlp": 0.01048335, "balance_loss_clip": 1.02821267, "balance_loss_mlp": 1.05002952, "epoch": 0.17093040733503684, "flos": 19057864872960.0, "grad_norm": 1.9041140459280668, "language_loss": 0.85529256, "learning_rate": 3.795246529087043e-06, "loss": 0.87730742, "num_input_tokens_seen": 61620400, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.03125, "step": 2843, "time_per_iteration": 2.5010716915130615 }, { "auxiliary_loss_clip": 0.01153098, "auxiliary_loss_mlp": 0.01050297, "balance_loss_clip": 1.03085375, "balance_loss_mlp": 1.05095506, "epoch": 0.1709905305877048, "flos": 13078954333440.0, "grad_norm": 1.680570377422845, "language_loss": 0.67744756, "learning_rate": 3.7950748345510126e-06, "loss": 0.69948155, "num_input_tokens_seen": 61637680, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.0234375, "step": 2844, "time_per_iteration": 3.9526708126068115 }, { "auxiliary_loss_clip": 0.01155269, "auxiliary_loss_mlp": 0.01049154, "balance_loss_clip": 1.02851915, "balance_loss_mlp": 1.05067468, "epoch": 0.17105065384037277, "flos": 19209336526080.0, "grad_norm": 1.7170848226080588, "language_loss": 0.78344631, "learning_rate": 3.7949030719455646e-06, "loss": 0.80549055, "num_input_tokens_seen": 61655630, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.046875, "step": 2845, "time_per_iteration": 2.443195104598999 }, { "auxiliary_loss_clip": 0.01153559, "auxiliary_loss_mlp": 0.01048454, "balance_loss_clip": 1.02964234, "balance_loss_mlp": 1.04986811, "epoch": 0.17111077709304073, "flos": 18515183218560.0, "grad_norm": 2.0569929922205077, "language_loss": 0.78061056, "learning_rate": 3.7947312412772127e-06, "loss": 0.8026306, "num_input_tokens_seen": 61673475, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.0390625, "step": 2846, "time_per_iteration": 3.921104669570923 }, { "auxiliary_loss_clip": 0.01154249, "auxiliary_loss_mlp": 0.01047308, "balance_loss_clip": 1.02899766, "balance_loss_mlp": 1.05023909, "epoch": 0.1711709003457087, "flos": 25082670015360.0, "grad_norm": 1.7459110321593925, "language_loss": 0.79910648, "learning_rate": 3.794559342552472e-06, "loss": 0.82112205, "num_input_tokens_seen": 61693370, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0390625, "step": 2847, "time_per_iteration": 3.9876344203948975 }, { "auxiliary_loss_clip": 0.01151391, "auxiliary_loss_mlp": 0.01051318, "balance_loss_clip": 1.0314101, "balance_loss_mlp": 1.04517007, "epoch": 0.17123102359837666, "flos": 17566387418880.0, "grad_norm": 3.279600064711171, "language_loss": 0.86429363, "learning_rate": 3.7943873757778614e-06, "loss": 0.88632071, "num_input_tokens_seen": 61710820, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0625, "step": 2848, "time_per_iteration": 3.8687684535980225 }, { "auxiliary_loss_clip": 0.01153266, "auxiliary_loss_mlp": 0.01045576, "balance_loss_clip": 1.02622795, "balance_loss_mlp": 1.04936814, "epoch": 0.17129114685104463, "flos": 26173635845760.0, "grad_norm": 1.9123600666872969, "language_loss": 0.7522732, "learning_rate": 3.794215340959902e-06, "loss": 0.77426159, "num_input_tokens_seen": 61729855, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0390625, "step": 2849, "time_per_iteration": 2.4820401668548584 }, { "auxiliary_loss_clip": 0.01052529, "auxiliary_loss_mlp": 0.01004991, "balance_loss_clip": 1.00235641, "balance_loss_mlp": 1.01607156, "epoch": 0.17135127010371262, "flos": 69269710037760.0, "grad_norm": 0.8518059449243964, "language_loss": 0.57477605, "learning_rate": 3.7940432381051163e-06, "loss": 0.59535122, "num_input_tokens_seen": 61790290, "router_z_loss_clip": 0.02636719, "router_z_loss_mlp": 0.36328125, "step": 2850, "time_per_iteration": 3.0674867630004883 }, { "auxiliary_loss_clip": 0.01148689, "auxiliary_loss_mlp": 0.01045442, "balance_loss_clip": 1.0263803, "balance_loss_mlp": 1.04797602, "epoch": 0.1714113933563806, "flos": 23550110380800.0, "grad_norm": 2.235912794811754, "language_loss": 0.81481153, "learning_rate": 3.793871067220031e-06, "loss": 0.83675283, "num_input_tokens_seen": 61809265, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0078125, "step": 2851, "time_per_iteration": 2.512850761413574 }, { "auxiliary_loss_clip": 0.01150158, "auxiliary_loss_mlp": 0.0104226, "balance_loss_clip": 1.02453399, "balance_loss_mlp": 1.04849017, "epoch": 0.17147151660904855, "flos": 21142443697920.0, "grad_norm": 2.1869621882997867, "language_loss": 0.93226862, "learning_rate": 3.7936988283111764e-06, "loss": 0.95419288, "num_input_tokens_seen": 61828980, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.015625, "step": 2852, "time_per_iteration": 2.436737537384033 }, { "auxiliary_loss_clip": 0.01152752, "auxiliary_loss_mlp": 0.01049957, "balance_loss_clip": 1.03041863, "balance_loss_mlp": 1.0474683, "epoch": 0.17153163986171652, "flos": 18624890332800.0, "grad_norm": 1.8872671067626494, "language_loss": 0.69279957, "learning_rate": 3.7935265213850817e-06, "loss": 0.7148267, "num_input_tokens_seen": 61847915, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0546875, "step": 2853, "time_per_iteration": 2.4419970512390137 }, { "auxiliary_loss_clip": 0.01156981, "auxiliary_loss_mlp": 0.01046083, "balance_loss_clip": 1.02729583, "balance_loss_mlp": 1.05160904, "epoch": 0.17159176311438448, "flos": 18223265387520.0, "grad_norm": 2.0919854458877376, "language_loss": 0.6673249, "learning_rate": 3.7933541464482815e-06, "loss": 0.68935555, "num_input_tokens_seen": 61865570, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0546875, "step": 2854, "time_per_iteration": 2.428541660308838 }, { "auxiliary_loss_clip": 0.01148652, "auxiliary_loss_mlp": 0.01044084, "balance_loss_clip": 1.02585697, "balance_loss_mlp": 1.0466795, "epoch": 0.17165188636705245, "flos": 20738987159040.0, "grad_norm": 1.5848138842545947, "language_loss": 0.8917464, "learning_rate": 3.7931817035073124e-06, "loss": 0.91367376, "num_input_tokens_seen": 61883340, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.0234375, "step": 2855, "time_per_iteration": 2.4938888549804688 }, { "auxiliary_loss_clip": 0.01154139, "auxiliary_loss_mlp": 0.01047437, "balance_loss_clip": 1.02966332, "balance_loss_mlp": 1.04972291, "epoch": 0.17171200961972044, "flos": 24899884680960.0, "grad_norm": 2.205910977208941, "language_loss": 0.83804214, "learning_rate": 3.7930091925687134e-06, "loss": 0.86005795, "num_input_tokens_seen": 61900610, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.046875, "step": 2856, "time_per_iteration": 2.477811813354492 }, { "auxiliary_loss_clip": 0.01155607, "auxiliary_loss_mlp": 0.01050594, "balance_loss_clip": 1.03149652, "balance_loss_mlp": 1.0511322, "epoch": 0.1717721328723884, "flos": 20157234485760.0, "grad_norm": 1.9272016483035141, "language_loss": 0.86943138, "learning_rate": 3.792836613639026e-06, "loss": 0.89149344, "num_input_tokens_seen": 61916795, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.046875, "step": 2857, "time_per_iteration": 2.4838998317718506 }, { "auxiliary_loss_clip": 0.01152169, "auxiliary_loss_mlp": 0.01052327, "balance_loss_clip": 1.03263366, "balance_loss_mlp": 1.04875684, "epoch": 0.17183225612505637, "flos": 23361650697600.0, "grad_norm": 1.9997173371339072, "language_loss": 0.77843934, "learning_rate": 3.7926639667247947e-06, "loss": 0.8004843, "num_input_tokens_seen": 61936665, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.03125, "step": 2858, "time_per_iteration": 2.468126058578491 }, { "auxiliary_loss_clip": 0.0116317, "auxiliary_loss_mlp": 0.01052288, "balance_loss_clip": 1.0305084, "balance_loss_mlp": 1.05080891, "epoch": 0.17189237937772434, "flos": 18114240631680.0, "grad_norm": 1.8032969426619083, "language_loss": 0.76837647, "learning_rate": 3.7924912518325663e-06, "loss": 0.79053104, "num_input_tokens_seen": 61954415, "router_z_loss_clip": 0.21777344, "router_z_loss_mlp": 1.125, "step": 2859, "time_per_iteration": 2.4526424407958984 }, { "auxiliary_loss_clip": 0.01152312, "auxiliary_loss_mlp": 0.01044478, "balance_loss_clip": 1.02453411, "balance_loss_mlp": 1.05004406, "epoch": 0.1719525026303923, "flos": 23258408031360.0, "grad_norm": 1.689746746306892, "language_loss": 0.76990414, "learning_rate": 3.7923184689688902e-06, "loss": 0.79187208, "num_input_tokens_seen": 61973940, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0234375, "step": 2860, "time_per_iteration": 2.4607863426208496 }, { "auxiliary_loss_clip": 0.01154965, "auxiliary_loss_mlp": 0.01051848, "balance_loss_clip": 1.0326432, "balance_loss_mlp": 1.04894769, "epoch": 0.17201262588306027, "flos": 20810413353600.0, "grad_norm": 1.7985589862067957, "language_loss": 0.81621277, "learning_rate": 3.792145618140317e-06, "loss": 0.83828092, "num_input_tokens_seen": 61991845, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0625, "step": 2861, "time_per_iteration": 2.4535703659057617 }, { "auxiliary_loss_clip": 0.01154289, "auxiliary_loss_mlp": 0.01045903, "balance_loss_clip": 1.02719879, "balance_loss_mlp": 1.04936707, "epoch": 0.17207274913572823, "flos": 20375858615040.0, "grad_norm": 1.8886094827339435, "language_loss": 0.85158372, "learning_rate": 3.7919726993534038e-06, "loss": 0.87358564, "num_input_tokens_seen": 62009395, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.046875, "step": 2862, "time_per_iteration": 2.436856269836426 }, { "auxiliary_loss_clip": 0.0115074, "auxiliary_loss_mlp": 0.01046488, "balance_loss_clip": 1.0289644, "balance_loss_mlp": 1.0499264, "epoch": 0.17213287238839622, "flos": 26797727675520.0, "grad_norm": 4.146871910991224, "language_loss": 0.78199184, "learning_rate": 3.7917997126147054e-06, "loss": 0.80396408, "num_input_tokens_seen": 62029005, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 1.0078125, "step": 2863, "time_per_iteration": 2.5117712020874023 }, { "auxiliary_loss_clip": 0.01151436, "auxiliary_loss_mlp": 0.0104908, "balance_loss_clip": 1.03059042, "balance_loss_mlp": 1.04855418, "epoch": 0.1721929956410642, "flos": 26030819370240.0, "grad_norm": 1.7433309786963054, "language_loss": 0.72134757, "learning_rate": 3.7916266579307823e-06, "loss": 0.74335277, "num_input_tokens_seen": 62048730, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.03125, "step": 2864, "time_per_iteration": 2.4934048652648926 }, { "auxiliary_loss_clip": 0.0115753, "auxiliary_loss_mlp": 0.01052716, "balance_loss_clip": 1.03408384, "balance_loss_mlp": 1.05220926, "epoch": 0.17225311889373215, "flos": 22273091078400.0, "grad_norm": 1.75794773129489, "language_loss": 0.72725618, "learning_rate": 3.7914535353081973e-06, "loss": 0.74935865, "num_input_tokens_seen": 62069000, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.0546875, "step": 2865, "time_per_iteration": 2.4777615070343018 }, { "auxiliary_loss_clip": 0.01156584, "auxiliary_loss_mlp": 0.01049093, "balance_loss_clip": 1.03031778, "balance_loss_mlp": 1.05329061, "epoch": 0.17231324214640012, "flos": 21287774125440.0, "grad_norm": 4.725936166148123, "language_loss": 0.78677243, "learning_rate": 3.7912803447535145e-06, "loss": 0.80882919, "num_input_tokens_seen": 62086750, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.03125, "step": 2866, "time_per_iteration": 2.4534130096435547 }, { "auxiliary_loss_clip": 0.01154117, "auxiliary_loss_mlp": 0.01051442, "balance_loss_clip": 1.03123569, "balance_loss_mlp": 1.05003786, "epoch": 0.17237336539906808, "flos": 19680735640320.0, "grad_norm": 1.7346529654458098, "language_loss": 0.79693496, "learning_rate": 3.7911070862733016e-06, "loss": 0.81899059, "num_input_tokens_seen": 62106240, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.0390625, "step": 2867, "time_per_iteration": 2.4735963344573975 }, { "auxiliary_loss_clip": 0.01152756, "auxiliary_loss_mlp": 0.01046718, "balance_loss_clip": 1.02734661, "balance_loss_mlp": 1.04868305, "epoch": 0.17243348865173605, "flos": 17529650784000.0, "grad_norm": 1.671085800738078, "language_loss": 0.79425335, "learning_rate": 3.7909337598741276e-06, "loss": 0.81624806, "num_input_tokens_seen": 62124895, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0390625, "step": 2868, "time_per_iteration": 2.4347586631774902 }, { "auxiliary_loss_clip": 0.01160689, "auxiliary_loss_mlp": 0.0104305, "balance_loss_clip": 1.0249064, "balance_loss_mlp": 1.05387485, "epoch": 0.17249361190440402, "flos": 18259858368000.0, "grad_norm": 2.0620139980335725, "language_loss": 0.84209234, "learning_rate": 3.7907603655625674e-06, "loss": 0.86412972, "num_input_tokens_seen": 62143510, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0703125, "step": 2869, "time_per_iteration": 2.445697069168091 }, { "auxiliary_loss_clip": 0.01156055, "auxiliary_loss_mlp": 0.01052072, "balance_loss_clip": 1.03186619, "balance_loss_mlp": 1.05035448, "epoch": 0.172553735157072, "flos": 21174367910400.0, "grad_norm": 1.7675625434182614, "language_loss": 0.77613163, "learning_rate": 3.7905869033451932e-06, "loss": 0.79821289, "num_input_tokens_seen": 62162285, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.0625, "step": 2870, "time_per_iteration": 2.4483678340911865 }, { "auxiliary_loss_clip": 0.01148818, "auxiliary_loss_mlp": 0.01045304, "balance_loss_clip": 1.02785206, "balance_loss_mlp": 1.04959249, "epoch": 0.17261385840973997, "flos": 22273270646400.0, "grad_norm": 1.6269943168387848, "language_loss": 0.77117383, "learning_rate": 3.7904133732285857e-06, "loss": 0.79311508, "num_input_tokens_seen": 62180970, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9921875, "step": 2871, "time_per_iteration": 2.4723451137542725 }, { "auxiliary_loss_clip": 0.01156171, "auxiliary_loss_mlp": 0.01044716, "balance_loss_clip": 1.02517796, "balance_loss_mlp": 1.05186605, "epoch": 0.17267398166240794, "flos": 27922233830400.0, "grad_norm": 2.954658409521069, "language_loss": 0.74764019, "learning_rate": 3.7902397752193228e-06, "loss": 0.76964909, "num_input_tokens_seen": 62198965, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.046875, "step": 2872, "time_per_iteration": 2.494645357131958 }, { "auxiliary_loss_clip": 0.01148782, "auxiliary_loss_mlp": 0.01043232, "balance_loss_clip": 1.02383685, "balance_loss_mlp": 1.04896855, "epoch": 0.1727341049150759, "flos": 21945118970880.0, "grad_norm": 1.888948302303171, "language_loss": 0.82783735, "learning_rate": 3.790066109323988e-06, "loss": 0.84975749, "num_input_tokens_seen": 62219890, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0, "step": 2873, "time_per_iteration": 2.488631010055542 }, { "auxiliary_loss_clip": 0.01151891, "auxiliary_loss_mlp": 0.01044631, "balance_loss_clip": 1.02436531, "balance_loss_mlp": 1.04851937, "epoch": 0.17279422816774387, "flos": 18107883924480.0, "grad_norm": 1.872107602537694, "language_loss": 0.75024736, "learning_rate": 3.7898923755491678e-06, "loss": 0.77221262, "num_input_tokens_seen": 62237140, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.03125, "step": 2874, "time_per_iteration": 2.4217963218688965 }, { "auxiliary_loss_clip": 0.01154394, "auxiliary_loss_mlp": 0.01047413, "balance_loss_clip": 1.02727818, "balance_loss_mlp": 1.0510242, "epoch": 0.17285435142041183, "flos": 21835447770240.0, "grad_norm": 5.165589487839717, "language_loss": 0.81004345, "learning_rate": 3.7897185739014487e-06, "loss": 0.83206159, "num_input_tokens_seen": 62255405, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0390625, "step": 2875, "time_per_iteration": 2.4765894412994385 }, { "auxiliary_loss_clip": 0.01158972, "auxiliary_loss_mlp": 0.01053214, "balance_loss_clip": 1.03288913, "balance_loss_mlp": 1.05235922, "epoch": 0.17291447467307983, "flos": 18368452160640.0, "grad_norm": 2.6296870844736535, "language_loss": 0.86903191, "learning_rate": 3.7895447043874217e-06, "loss": 0.89115375, "num_input_tokens_seen": 62271280, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.0625, "step": 2876, "time_per_iteration": 2.4206228256225586 }, { "auxiliary_loss_clip": 0.01154708, "auxiliary_loss_mlp": 0.01044548, "balance_loss_clip": 1.02552223, "balance_loss_mlp": 1.05266345, "epoch": 0.1729745979257478, "flos": 18624638937600.0, "grad_norm": 4.8500413406970235, "language_loss": 0.84804386, "learning_rate": 3.789370767013681e-06, "loss": 0.87003648, "num_input_tokens_seen": 62289140, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0234375, "step": 2877, "time_per_iteration": 2.4338760375976562 }, { "auxiliary_loss_clip": 0.01155606, "auxiliary_loss_mlp": 0.01043915, "balance_loss_clip": 1.02430487, "balance_loss_mlp": 1.05262804, "epoch": 0.17303472117841576, "flos": 22998234844800.0, "grad_norm": 2.860316922987389, "language_loss": 0.79669505, "learning_rate": 3.7891967617868204e-06, "loss": 0.8186903, "num_input_tokens_seen": 62307490, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.03125, "step": 2878, "time_per_iteration": 2.451728582382202 }, { "auxiliary_loss_clip": 0.01150905, "auxiliary_loss_mlp": 0.01043252, "balance_loss_clip": 1.02445269, "balance_loss_mlp": 1.05001307, "epoch": 0.17309484443108372, "flos": 25664386775040.0, "grad_norm": 1.5855252086441876, "language_loss": 0.70512182, "learning_rate": 3.78902268871344e-06, "loss": 0.72706342, "num_input_tokens_seen": 62328570, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.0078125, "step": 2879, "time_per_iteration": 2.5106468200683594 }, { "auxiliary_loss_clip": 0.01153189, "auxiliary_loss_mlp": 0.01048711, "balance_loss_clip": 1.03007841, "balance_loss_mlp": 1.04968572, "epoch": 0.1731549676837517, "flos": 13552903313280.0, "grad_norm": 2.0740099111280785, "language_loss": 0.83431482, "learning_rate": 3.78884854780014e-06, "loss": 0.85633385, "num_input_tokens_seen": 62345735, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.03125, "step": 2880, "time_per_iteration": 2.4252493381500244 }, { "auxiliary_loss_clip": 0.01159353, "auxiliary_loss_mlp": 0.0104489, "balance_loss_clip": 1.02506542, "balance_loss_mlp": 1.05347586, "epoch": 0.17321509093641965, "flos": 22857070394880.0, "grad_norm": 1.9707627321761616, "language_loss": 0.81013608, "learning_rate": 3.7886743390535236e-06, "loss": 0.83217853, "num_input_tokens_seen": 62365525, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0625, "step": 2881, "time_per_iteration": 2.4866034984588623 }, { "auxiliary_loss_clip": 0.01153297, "auxiliary_loss_mlp": 0.01044443, "balance_loss_clip": 1.02608514, "balance_loss_mlp": 1.05077755, "epoch": 0.17327521418908762, "flos": 24352785653760.0, "grad_norm": 1.7312519663031902, "language_loss": 0.77341008, "learning_rate": 3.788500062480197e-06, "loss": 0.79538745, "num_input_tokens_seen": 62385160, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0234375, "step": 2882, "time_per_iteration": 2.4768857955932617 }, { "auxiliary_loss_clip": 0.01150985, "auxiliary_loss_mlp": 0.0105546, "balance_loss_clip": 1.03673196, "balance_loss_mlp": 1.04948854, "epoch": 0.1733353374417556, "flos": 33105651816960.0, "grad_norm": 1.7924687555546461, "language_loss": 0.76110983, "learning_rate": 3.788325718086769e-06, "loss": 0.78317428, "num_input_tokens_seen": 62405280, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.015625, "step": 2883, "time_per_iteration": 2.5637049674987793 }, { "auxiliary_loss_clip": 0.01151647, "auxiliary_loss_mlp": 0.01045812, "balance_loss_clip": 1.02718008, "balance_loss_mlp": 1.05003333, "epoch": 0.17339546069442358, "flos": 24388947671040.0, "grad_norm": 2.746780317044735, "language_loss": 0.85833085, "learning_rate": 3.7881513058798503e-06, "loss": 0.88030547, "num_input_tokens_seen": 62423665, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.015625, "step": 2884, "time_per_iteration": 2.479116201400757 }, { "auxiliary_loss_clip": 0.01154228, "auxiliary_loss_mlp": 0.01046587, "balance_loss_clip": 1.02812171, "balance_loss_mlp": 1.05102634, "epoch": 0.17345558394709154, "flos": 27454174680960.0, "grad_norm": 1.608743479141867, "language_loss": 0.7412914, "learning_rate": 3.787976825866055e-06, "loss": 0.76329952, "num_input_tokens_seen": 62445170, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.03125, "step": 2885, "time_per_iteration": 2.5193073749542236 }, { "auxiliary_loss_clip": 0.01149488, "auxiliary_loss_mlp": 0.0104674, "balance_loss_clip": 1.02929938, "balance_loss_mlp": 1.0506475, "epoch": 0.1735157071997595, "flos": 24682158391680.0, "grad_norm": 1.5238654853271378, "language_loss": 0.70339692, "learning_rate": 3.7878022780519998e-06, "loss": 0.7253592, "num_input_tokens_seen": 62466135, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.984375, "step": 2886, "time_per_iteration": 4.007513761520386 }, { "auxiliary_loss_clip": 0.0115322, "auxiliary_loss_mlp": 0.0104319, "balance_loss_clip": 1.0243547, "balance_loss_mlp": 1.04986143, "epoch": 0.17357583045242747, "flos": 21688932193920.0, "grad_norm": 1.8915609380688887, "language_loss": 0.69299096, "learning_rate": 3.7876276624443024e-06, "loss": 0.71495509, "num_input_tokens_seen": 62483910, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.03125, "step": 2887, "time_per_iteration": 3.8328123092651367 }, { "auxiliary_loss_clip": 0.01154551, "auxiliary_loss_mlp": 0.01050254, "balance_loss_clip": 1.03194332, "balance_loss_mlp": 1.05221665, "epoch": 0.17363595370509544, "flos": 15375728753280.0, "grad_norm": 1.9814065898580338, "language_loss": 0.85347891, "learning_rate": 3.787452979049585e-06, "loss": 0.87552691, "num_input_tokens_seen": 62501530, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0234375, "step": 2888, "time_per_iteration": 2.4486804008483887 }, { "auxiliary_loss_clip": 0.01153106, "auxiliary_loss_mlp": 0.01046193, "balance_loss_clip": 1.02633238, "balance_loss_mlp": 1.04984665, "epoch": 0.1736960769577634, "flos": 23440941970560.0, "grad_norm": 2.2252361225842225, "language_loss": 0.78531396, "learning_rate": 3.7872782278744718e-06, "loss": 0.80730695, "num_input_tokens_seen": 62521295, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.03125, "step": 2889, "time_per_iteration": 3.883702278137207 }, { "auxiliary_loss_clip": 0.01150396, "auxiliary_loss_mlp": 0.01045802, "balance_loss_clip": 1.02718127, "balance_loss_mlp": 1.05196142, "epoch": 0.1737562002104314, "flos": 18587830475520.0, "grad_norm": 2.224889950705253, "language_loss": 0.84036493, "learning_rate": 3.7871034089255883e-06, "loss": 0.86232686, "num_input_tokens_seen": 62539615, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.984375, "step": 2890, "time_per_iteration": 3.883732557296753 }, { "auxiliary_loss_clip": 0.01156359, "auxiliary_loss_mlp": 0.01051455, "balance_loss_clip": 1.03266811, "balance_loss_mlp": 1.05212772, "epoch": 0.17381632346309936, "flos": 15998060816640.0, "grad_norm": 2.5216806121732036, "language_loss": 0.82732517, "learning_rate": 3.7869285222095653e-06, "loss": 0.84940332, "num_input_tokens_seen": 62556820, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.046875, "step": 2891, "time_per_iteration": 2.442460298538208 }, { "auxiliary_loss_clip": 0.0115382, "auxiliary_loss_mlp": 0.01045099, "balance_loss_clip": 1.0258224, "balance_loss_mlp": 1.04942822, "epoch": 0.17387644671576732, "flos": 13369830670080.0, "grad_norm": 2.0502681010450265, "language_loss": 0.81227815, "learning_rate": 3.7867535677330334e-06, "loss": 0.83426738, "num_input_tokens_seen": 62572450, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.046875, "step": 2892, "time_per_iteration": 2.4261937141418457 }, { "auxiliary_loss_clip": 0.01159435, "auxiliary_loss_mlp": 0.01057673, "balance_loss_clip": 1.0366323, "balance_loss_mlp": 1.05453885, "epoch": 0.1739365699684353, "flos": 26615516958720.0, "grad_norm": 1.8992526832874852, "language_loss": 0.74362761, "learning_rate": 3.786578545502627e-06, "loss": 0.76579869, "num_input_tokens_seen": 62592580, "router_z_loss_clip": 0.2109375, "router_z_loss_mlp": 1.046875, "step": 2893, "time_per_iteration": 2.538857936859131 }, { "auxiliary_loss_clip": 0.01155435, "auxiliary_loss_mlp": 0.01049373, "balance_loss_clip": 1.02983499, "balance_loss_mlp": 1.05128419, "epoch": 0.17399669322110325, "flos": 23367971491200.0, "grad_norm": 2.0888018724663593, "language_loss": 0.82610875, "learning_rate": 3.7864034555249828e-06, "loss": 0.84815681, "num_input_tokens_seen": 62611220, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0390625, "step": 2894, "time_per_iteration": 2.466202974319458 }, { "auxiliary_loss_clip": 0.01155195, "auxiliary_loss_mlp": 0.01052108, "balance_loss_clip": 1.02980423, "balance_loss_mlp": 1.05120671, "epoch": 0.17405681647377122, "flos": 22054107813120.0, "grad_norm": 1.8045756629215302, "language_loss": 0.73814976, "learning_rate": 3.786228297806741e-06, "loss": 0.76022285, "num_input_tokens_seen": 62629185, "router_z_loss_clip": 0.22265625, "router_z_loss_mlp": 1.0390625, "step": 2895, "time_per_iteration": 2.5162148475646973 }, { "auxiliary_loss_clip": 0.01060501, "auxiliary_loss_mlp": 0.01012772, "balance_loss_clip": 1.0101254, "balance_loss_mlp": 1.02470028, "epoch": 0.1741169397264392, "flos": 61457559114240.0, "grad_norm": 1.0187806372534554, "language_loss": 0.62841654, "learning_rate": 3.7860530723545435e-06, "loss": 0.6491493, "num_input_tokens_seen": 62691895, "router_z_loss_clip": 0.02648926, "router_z_loss_mlp": 0.35742188, "step": 2896, "time_per_iteration": 3.175096035003662 }, { "auxiliary_loss_clip": 0.0115051, "auxiliary_loss_mlp": 0.01043472, "balance_loss_clip": 1.02401733, "balance_loss_mlp": 1.04775202, "epoch": 0.17417706297910718, "flos": 27017680608000.0, "grad_norm": 1.7401603082787171, "language_loss": 0.75774872, "learning_rate": 3.785877779175034e-06, "loss": 0.77968848, "num_input_tokens_seen": 62713790, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.03125, "step": 2897, "time_per_iteration": 2.5262091159820557 }, { "auxiliary_loss_clip": 0.01150112, "auxiliary_loss_mlp": 0.01041044, "balance_loss_clip": 1.02235198, "balance_loss_mlp": 1.05013227, "epoch": 0.17423718623177514, "flos": 33508856960640.0, "grad_norm": 2.3334918641689537, "language_loss": 0.69270593, "learning_rate": 3.7857024182748606e-06, "loss": 0.71461743, "num_input_tokens_seen": 62736285, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.0, "step": 2898, "time_per_iteration": 2.5844290256500244 }, { "auxiliary_loss_clip": 0.01158045, "auxiliary_loss_mlp": 0.01046275, "balance_loss_clip": 1.02678394, "balance_loss_mlp": 1.05277634, "epoch": 0.1742973094844431, "flos": 27198634348800.0, "grad_norm": 2.1722885622025, "language_loss": 0.76312715, "learning_rate": 3.7855269896606717e-06, "loss": 0.78517032, "num_input_tokens_seen": 62756240, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0546875, "step": 2899, "time_per_iteration": 2.5304245948791504 }, { "auxiliary_loss_clip": 0.01149401, "auxiliary_loss_mlp": 0.01046285, "balance_loss_clip": 1.02723503, "balance_loss_mlp": 1.04946864, "epoch": 0.17435743273711107, "flos": 22710734386560.0, "grad_norm": 1.7638821824962825, "language_loss": 0.72933704, "learning_rate": 3.785351493339121e-06, "loss": 0.7512939, "num_input_tokens_seen": 62775910, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0, "step": 2900, "time_per_iteration": 2.4703521728515625 }, { "auxiliary_loss_clip": 0.01154109, "auxiliary_loss_mlp": 0.0105133, "balance_loss_clip": 1.03337741, "balance_loss_mlp": 1.05187249, "epoch": 0.17441755598977904, "flos": 41646466039680.0, "grad_norm": 1.4666236080288333, "language_loss": 0.69662881, "learning_rate": 3.785175929316863e-06, "loss": 0.71868324, "num_input_tokens_seen": 62799385, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.0234375, "step": 2901, "time_per_iteration": 2.666377305984497 }, { "auxiliary_loss_clip": 0.01158204, "auxiliary_loss_mlp": 0.01050072, "balance_loss_clip": 1.03108203, "balance_loss_mlp": 1.05360413, "epoch": 0.174477679242447, "flos": 26287077974400.0, "grad_norm": 2.0336831509450053, "language_loss": 0.76205218, "learning_rate": 3.7850002976005543e-06, "loss": 0.78413498, "num_input_tokens_seen": 62819380, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.046875, "step": 2902, "time_per_iteration": 2.5051636695861816 }, { "auxiliary_loss_clip": 0.01153634, "auxiliary_loss_mlp": 0.01051588, "balance_loss_clip": 1.03311074, "balance_loss_mlp": 1.05008459, "epoch": 0.174537802495115, "flos": 17858412990720.0, "grad_norm": 2.0435742516584545, "language_loss": 0.81368226, "learning_rate": 3.7848245981968558e-06, "loss": 0.83573449, "num_input_tokens_seen": 62836205, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.03125, "step": 2903, "time_per_iteration": 2.445859909057617 }, { "auxiliary_loss_clip": 0.01154695, "auxiliary_loss_mlp": 0.01044017, "balance_loss_clip": 1.02544379, "balance_loss_mlp": 1.05247188, "epoch": 0.17459792574778296, "flos": 16940715390720.0, "grad_norm": 1.9154350012275418, "language_loss": 0.73380846, "learning_rate": 3.784648831112429e-06, "loss": 0.75579554, "num_input_tokens_seen": 62854045, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0234375, "step": 2904, "time_per_iteration": 2.444530487060547 }, { "auxiliary_loss_clip": 0.01153188, "auxiliary_loss_mlp": 0.01042888, "balance_loss_clip": 1.02475595, "balance_loss_mlp": 1.04966688, "epoch": 0.17465804900045093, "flos": 25520026014720.0, "grad_norm": 1.8199600631322368, "language_loss": 0.64101177, "learning_rate": 3.7844729963539406e-06, "loss": 0.66297257, "num_input_tokens_seen": 62873075, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.03125, "step": 2905, "time_per_iteration": 2.5247836112976074 }, { "auxiliary_loss_clip": 0.01164659, "auxiliary_loss_mlp": 0.01048111, "balance_loss_clip": 1.02835822, "balance_loss_mlp": 1.05598056, "epoch": 0.1747181722531189, "flos": 24129708238080.0, "grad_norm": 1.8529370290603293, "language_loss": 0.79322231, "learning_rate": 3.7842970939280566e-06, "loss": 0.81535006, "num_input_tokens_seen": 62892675, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0859375, "step": 2906, "time_per_iteration": 2.4898953437805176 }, { "auxiliary_loss_clip": 0.01158212, "auxiliary_loss_mlp": 0.01052148, "balance_loss_clip": 1.03264499, "balance_loss_mlp": 1.0554049, "epoch": 0.17477829550578686, "flos": 17748813617280.0, "grad_norm": 1.8165264994793675, "language_loss": 0.81298143, "learning_rate": 3.784121123841449e-06, "loss": 0.83508503, "num_input_tokens_seen": 62910675, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.03125, "step": 2907, "time_per_iteration": 2.4495954513549805 }, { "auxiliary_loss_clip": 0.01157071, "auxiliary_loss_mlp": 0.01047029, "balance_loss_clip": 1.02806318, "balance_loss_mlp": 1.05328274, "epoch": 0.17483841875845482, "flos": 15377344865280.0, "grad_norm": 2.3623057519695174, "language_loss": 0.81111646, "learning_rate": 3.7839450861007886e-06, "loss": 0.83315742, "num_input_tokens_seen": 62928130, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0390625, "step": 2908, "time_per_iteration": 2.449033737182617 }, { "auxiliary_loss_clip": 0.01156562, "auxiliary_loss_mlp": 0.01049243, "balance_loss_clip": 1.02898967, "balance_loss_mlp": 1.05304408, "epoch": 0.17489854201112282, "flos": 17163254102400.0, "grad_norm": 2.9059271471172194, "language_loss": 0.80179822, "learning_rate": 3.7837689807127518e-06, "loss": 0.82385623, "num_input_tokens_seen": 62944290, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.03125, "step": 2909, "time_per_iteration": 2.4295482635498047 }, { "auxiliary_loss_clip": 0.01155763, "auxiliary_loss_mlp": 0.01049668, "balance_loss_clip": 1.02917552, "balance_loss_mlp": 1.05134273, "epoch": 0.17495866526379078, "flos": 19755286318080.0, "grad_norm": 1.6807893604547242, "language_loss": 0.76684588, "learning_rate": 3.783592807684017e-06, "loss": 0.78890014, "num_input_tokens_seen": 62963505, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.046875, "step": 2910, "time_per_iteration": 2.447885274887085 }, { "auxiliary_loss_clip": 0.01156026, "auxiliary_loss_mlp": 0.01050086, "balance_loss_clip": 1.02972555, "balance_loss_mlp": 1.05220914, "epoch": 0.17501878851645875, "flos": 28511133310080.0, "grad_norm": 1.7370600862552028, "language_loss": 0.87129784, "learning_rate": 3.7834165670212645e-06, "loss": 0.89335895, "num_input_tokens_seen": 62985020, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.0390625, "step": 2911, "time_per_iteration": 2.5374507904052734 }, { "auxiliary_loss_clip": 0.01152807, "auxiliary_loss_mlp": 0.01047761, "balance_loss_clip": 1.0276264, "balance_loss_mlp": 1.04941881, "epoch": 0.1750789117691267, "flos": 17931203902080.0, "grad_norm": 2.045017817636764, "language_loss": 0.89483869, "learning_rate": 3.7832402587311764e-06, "loss": 0.91684437, "num_input_tokens_seen": 63001745, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.03125, "step": 2912, "time_per_iteration": 2.4475514888763428 }, { "auxiliary_loss_clip": 0.01158145, "auxiliary_loss_mlp": 0.01044363, "balance_loss_clip": 1.02464569, "balance_loss_mlp": 1.05228674, "epoch": 0.17513903502179468, "flos": 18259427404800.0, "grad_norm": 1.884827624372521, "language_loss": 0.72808027, "learning_rate": 3.783063882820439e-06, "loss": 0.75010526, "num_input_tokens_seen": 63019750, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0546875, "step": 2913, "time_per_iteration": 2.4751691818237305 }, { "auxiliary_loss_clip": 0.01153618, "auxiliary_loss_mlp": 0.01042965, "balance_loss_clip": 1.02430916, "balance_loss_mlp": 1.05153191, "epoch": 0.17519915827446264, "flos": 20704728562560.0, "grad_norm": 1.743234540794534, "language_loss": 0.69071573, "learning_rate": 3.782887439295741e-06, "loss": 0.71268153, "num_input_tokens_seen": 63039500, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.015625, "step": 2914, "time_per_iteration": 2.478670835494995 }, { "auxiliary_loss_clip": 0.01154105, "auxiliary_loss_mlp": 0.0105237, "balance_loss_clip": 1.03256893, "balance_loss_mlp": 1.05146313, "epoch": 0.1752592815271306, "flos": 20523415685760.0, "grad_norm": 1.7943896142355844, "language_loss": 0.93398315, "learning_rate": 3.782710928163772e-06, "loss": 0.95604795, "num_input_tokens_seen": 63059785, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.03125, "step": 2915, "time_per_iteration": 2.4903666973114014 }, { "auxiliary_loss_clip": 0.01145595, "auxiliary_loss_mlp": 0.01047612, "balance_loss_clip": 1.02833557, "balance_loss_mlp": 1.04714787, "epoch": 0.1753194047797986, "flos": 21799178012160.0, "grad_norm": 1.6451985232946866, "language_loss": 0.80759478, "learning_rate": 3.782534349431226e-06, "loss": 0.82952684, "num_input_tokens_seen": 63079385, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.984375, "step": 2916, "time_per_iteration": 2.4713518619537354 }, { "auxiliary_loss_clip": 0.01151594, "auxiliary_loss_mlp": 0.01055708, "balance_loss_clip": 1.03620505, "balance_loss_mlp": 1.04785252, "epoch": 0.17537952803246656, "flos": 20668351063680.0, "grad_norm": 2.121035116412246, "language_loss": 0.73998439, "learning_rate": 3.782357703104799e-06, "loss": 0.76205742, "num_input_tokens_seen": 63098970, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0390625, "step": 2917, "time_per_iteration": 2.5052571296691895 }, { "auxiliary_loss_clip": 0.01148532, "auxiliary_loss_mlp": 0.01052783, "balance_loss_clip": 1.03341174, "balance_loss_mlp": 1.04994333, "epoch": 0.17543965128513453, "flos": 23295072839040.0, "grad_norm": 1.884388762612316, "language_loss": 0.77426404, "learning_rate": 3.7821809891911897e-06, "loss": 0.79627717, "num_input_tokens_seen": 63118750, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.984375, "step": 2918, "time_per_iteration": 2.5041580200195312 }, { "auxiliary_loss_clip": 0.01157259, "auxiliary_loss_mlp": 0.01045127, "balance_loss_clip": 1.02579093, "balance_loss_mlp": 1.0511837, "epoch": 0.1754997745378025, "flos": 29095615416960.0, "grad_norm": 2.315724947254189, "language_loss": 0.74247879, "learning_rate": 3.782004207697098e-06, "loss": 0.76450264, "num_input_tokens_seen": 63136865, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0625, "step": 2919, "time_per_iteration": 2.5355682373046875 }, { "auxiliary_loss_clip": 0.01159954, "auxiliary_loss_mlp": 0.01048541, "balance_loss_clip": 1.02957451, "balance_loss_mlp": 1.05232525, "epoch": 0.17555989779047046, "flos": 30371844620160.0, "grad_norm": 1.852937911181439, "language_loss": 0.74764144, "learning_rate": 3.781827358629228e-06, "loss": 0.7697264, "num_input_tokens_seen": 63158325, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.078125, "step": 2920, "time_per_iteration": 2.523024559020996 }, { "auxiliary_loss_clip": 0.01147607, "auxiliary_loss_mlp": 0.01044777, "balance_loss_clip": 1.02737272, "balance_loss_mlp": 1.04765916, "epoch": 0.17562002104313842, "flos": 23287746464640.0, "grad_norm": 2.164405215476684, "language_loss": 0.79280317, "learning_rate": 3.7816504419942873e-06, "loss": 0.81472707, "num_input_tokens_seen": 63173115, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.0, "step": 2921, "time_per_iteration": 2.4464833736419678 }, { "auxiliary_loss_clip": 0.01156555, "auxiliary_loss_mlp": 0.01048059, "balance_loss_clip": 1.02853286, "balance_loss_mlp": 1.05001545, "epoch": 0.1756801442958064, "flos": 24790500789120.0, "grad_norm": 1.5706871442946537, "language_loss": 0.87546813, "learning_rate": 3.7814734577989823e-06, "loss": 0.89751428, "num_input_tokens_seen": 63192880, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0625, "step": 2922, "time_per_iteration": 2.4811277389526367 }, { "auxiliary_loss_clip": 0.01152617, "auxiliary_loss_mlp": 0.01049858, "balance_loss_clip": 1.03061807, "balance_loss_mlp": 1.04890382, "epoch": 0.17574026754847438, "flos": 25771651764480.0, "grad_norm": 2.8415066622968292, "language_loss": 0.62316239, "learning_rate": 3.7812964060500253e-06, "loss": 0.64518708, "num_input_tokens_seen": 63214395, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0390625, "step": 2923, "time_per_iteration": 2.5082080364227295 }, { "auxiliary_loss_clip": 0.01158577, "auxiliary_loss_mlp": 0.01048375, "balance_loss_clip": 1.02812111, "balance_loss_mlp": 1.05377507, "epoch": 0.17580039080114235, "flos": 17456608477440.0, "grad_norm": 2.5170917581626577, "language_loss": 0.80509716, "learning_rate": 3.78111928675413e-06, "loss": 0.82716674, "num_input_tokens_seen": 63231020, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.046875, "step": 2924, "time_per_iteration": 2.455935478210449 }, { "auxiliary_loss_clip": 0.01155099, "auxiliary_loss_mlp": 0.01055835, "balance_loss_clip": 1.03455591, "balance_loss_mlp": 1.04876125, "epoch": 0.1758605140538103, "flos": 14864648088960.0, "grad_norm": 1.963528941741656, "language_loss": 0.70783198, "learning_rate": 3.7809420999180126e-06, "loss": 0.72994137, "num_input_tokens_seen": 63246245, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.0625, "step": 2925, "time_per_iteration": 2.4589996337890625 }, { "auxiliary_loss_clip": 0.0115218, "auxiliary_loss_mlp": 0.01045049, "balance_loss_clip": 1.02688146, "balance_loss_mlp": 1.05129528, "epoch": 0.17592063730647828, "flos": 23004268329600.0, "grad_norm": 1.6694727861069643, "language_loss": 0.72044384, "learning_rate": 3.7807648455483934e-06, "loss": 0.74241614, "num_input_tokens_seen": 63267790, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0078125, "step": 2926, "time_per_iteration": 2.496962547302246 }, { "auxiliary_loss_clip": 0.01153893, "auxiliary_loss_mlp": 0.01041237, "balance_loss_clip": 1.02020884, "balance_loss_mlp": 1.04957342, "epoch": 0.17598076055914624, "flos": 20741501111040.0, "grad_norm": 1.7463027787953866, "language_loss": 0.84671009, "learning_rate": 3.7805875236519918e-06, "loss": 0.8686614, "num_input_tokens_seen": 63286830, "router_z_loss_clip": 0.20996094, "router_z_loss_mlp": 1.046875, "step": 2927, "time_per_iteration": 3.9275834560394287 }, { "auxiliary_loss_clip": 0.01151903, "auxiliary_loss_mlp": 0.01050341, "balance_loss_clip": 1.03206587, "balance_loss_mlp": 1.05081165, "epoch": 0.1760408838118142, "flos": 34092441227520.0, "grad_norm": 1.917357063461344, "language_loss": 0.71990681, "learning_rate": 3.7804101342355336e-06, "loss": 0.74192917, "num_input_tokens_seen": 63308870, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.0078125, "step": 2928, "time_per_iteration": 2.5969839096069336 }, { "auxiliary_loss_clip": 0.01151978, "auxiliary_loss_mlp": 0.0104696, "balance_loss_clip": 1.02785087, "balance_loss_mlp": 1.05184054, "epoch": 0.1761010070644822, "flos": 24168384207360.0, "grad_norm": 1.8686472037978583, "language_loss": 0.83375347, "learning_rate": 3.780232677305744e-06, "loss": 0.85574287, "num_input_tokens_seen": 63329005, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0, "step": 2929, "time_per_iteration": 3.8348240852355957 }, { "auxiliary_loss_clip": 0.01151672, "auxiliary_loss_mlp": 0.01043766, "balance_loss_clip": 1.02536035, "balance_loss_mlp": 1.04895151, "epoch": 0.17616113031715017, "flos": 26576697335040.0, "grad_norm": 1.8310107714772086, "language_loss": 0.79095733, "learning_rate": 3.7800551528693535e-06, "loss": 0.81291175, "num_input_tokens_seen": 63349390, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0234375, "step": 2930, "time_per_iteration": 2.5222318172454834 }, { "auxiliary_loss_clip": 0.01153902, "auxiliary_loss_mlp": 0.01045393, "balance_loss_clip": 1.02612829, "balance_loss_mlp": 1.05206442, "epoch": 0.17622125356981813, "flos": 25666685245440.0, "grad_norm": 1.92197090703856, "language_loss": 0.76434892, "learning_rate": 3.7798775609330927e-06, "loss": 0.78634185, "num_input_tokens_seen": 63368835, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.015625, "step": 2931, "time_per_iteration": 5.408959627151489 }, { "auxiliary_loss_clip": 0.01151994, "auxiliary_loss_mlp": 0.01042076, "balance_loss_clip": 1.02401543, "balance_loss_mlp": 1.05000758, "epoch": 0.1762813768224861, "flos": 16508530949760.0, "grad_norm": 2.4524415294429107, "language_loss": 0.75832546, "learning_rate": 3.779699901503696e-06, "loss": 0.78026617, "num_input_tokens_seen": 63385220, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.015625, "step": 2932, "time_per_iteration": 2.448913812637329 }, { "auxiliary_loss_clip": 0.01157833, "auxiliary_loss_mlp": 0.01049927, "balance_loss_clip": 1.02989984, "balance_loss_mlp": 1.04964876, "epoch": 0.17634150007515406, "flos": 11211850402560.0, "grad_norm": 2.6122952364278724, "language_loss": 0.89952695, "learning_rate": 3.7795221745879016e-06, "loss": 0.92160451, "num_input_tokens_seen": 63400865, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.078125, "step": 2933, "time_per_iteration": 2.409208297729492 }, { "auxiliary_loss_clip": 0.01151664, "auxiliary_loss_mlp": 0.01050015, "balance_loss_clip": 1.03244352, "balance_loss_mlp": 1.05187893, "epoch": 0.17640162332782203, "flos": 23659925235840.0, "grad_norm": 1.6497634256407019, "language_loss": 0.88132095, "learning_rate": 3.779344380192448e-06, "loss": 0.90333772, "num_input_tokens_seen": 63421390, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 1.0, "step": 2934, "time_per_iteration": 2.5172276496887207 }, { "auxiliary_loss_clip": 0.01150373, "auxiliary_loss_mlp": 0.01043247, "balance_loss_clip": 1.02525842, "balance_loss_mlp": 1.05157161, "epoch": 0.17646174658049, "flos": 53796984606720.0, "grad_norm": 1.7340075609442893, "language_loss": 0.70593089, "learning_rate": 3.779166518324077e-06, "loss": 0.72786707, "num_input_tokens_seen": 63444715, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.98828125, "step": 2935, "time_per_iteration": 2.752985715866089 }, { "auxiliary_loss_clip": 0.01157284, "auxiliary_loss_mlp": 0.01041921, "balance_loss_clip": 1.02374148, "balance_loss_mlp": 1.05192685, "epoch": 0.17652186983315798, "flos": 24243868638720.0, "grad_norm": 2.5729667336593534, "language_loss": 0.69584346, "learning_rate": 3.7789885889895325e-06, "loss": 0.71783555, "num_input_tokens_seen": 63465525, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0546875, "step": 2936, "time_per_iteration": 2.527264356613159 }, { "auxiliary_loss_clip": 0.01156017, "auxiliary_loss_mlp": 0.01043159, "balance_loss_clip": 1.02513409, "balance_loss_mlp": 1.05518293, "epoch": 0.17658199308582595, "flos": 27454282421760.0, "grad_norm": 2.229111953964968, "language_loss": 0.71765316, "learning_rate": 3.7788105921955634e-06, "loss": 0.73964489, "num_input_tokens_seen": 63485815, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.0078125, "step": 2937, "time_per_iteration": 2.5052101612091064 }, { "auxiliary_loss_clip": 0.01157874, "auxiliary_loss_mlp": 0.01042567, "balance_loss_clip": 1.02307582, "balance_loss_mlp": 1.05220461, "epoch": 0.17664211633849392, "flos": 22418672901120.0, "grad_norm": 2.8390902818854284, "language_loss": 0.7587074, "learning_rate": 3.7786325279489184e-06, "loss": 0.78071189, "num_input_tokens_seen": 63503905, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0546875, "step": 2938, "time_per_iteration": 2.4907374382019043 }, { "auxiliary_loss_clip": 0.01156516, "auxiliary_loss_mlp": 0.01042548, "balance_loss_clip": 1.02410674, "balance_loss_mlp": 1.05364716, "epoch": 0.17670223959116188, "flos": 24715124098560.0, "grad_norm": 2.2711022017495606, "language_loss": 0.70779252, "learning_rate": 3.7784543962563495e-06, "loss": 0.72978318, "num_input_tokens_seen": 63521985, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.03125, "step": 2939, "time_per_iteration": 2.483142614364624 }, { "auxiliary_loss_clip": 0.01156612, "auxiliary_loss_mlp": 0.0104824, "balance_loss_clip": 1.02967906, "balance_loss_mlp": 1.05612266, "epoch": 0.17676236284382985, "flos": 22527051212160.0, "grad_norm": 2.1931273169849885, "language_loss": 0.74014014, "learning_rate": 3.7782761971246115e-06, "loss": 0.76218867, "num_input_tokens_seen": 63539830, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0078125, "step": 2940, "time_per_iteration": 2.479787588119507 }, { "auxiliary_loss_clip": 0.0115673, "auxiliary_loss_mlp": 0.01049665, "balance_loss_clip": 1.02939963, "balance_loss_mlp": 1.0531919, "epoch": 0.1768224860964978, "flos": 12385160161920.0, "grad_norm": 2.111065798901183, "language_loss": 0.8539964, "learning_rate": 3.7780979305604616e-06, "loss": 0.87606031, "num_input_tokens_seen": 63555495, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.03125, "step": 2941, "time_per_iteration": 2.4269227981567383 }, { "auxiliary_loss_clip": 0.01155155, "auxiliary_loss_mlp": 0.01042632, "balance_loss_clip": 1.02384496, "balance_loss_mlp": 1.05292869, "epoch": 0.1768826093491658, "flos": 24353360271360.0, "grad_norm": 2.120655734682025, "language_loss": 0.76653588, "learning_rate": 3.7779195965706607e-06, "loss": 0.78851378, "num_input_tokens_seen": 63575290, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0234375, "step": 2942, "time_per_iteration": 2.50944447517395 }, { "auxiliary_loss_clip": 0.01159571, "auxiliary_loss_mlp": 0.01042646, "balance_loss_clip": 1.02358472, "balance_loss_mlp": 1.05459118, "epoch": 0.17694273260183377, "flos": 23587062497280.0, "grad_norm": 1.9294378993822345, "language_loss": 0.79847074, "learning_rate": 3.77774119516197e-06, "loss": 0.82049292, "num_input_tokens_seen": 63594670, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.046875, "step": 2943, "time_per_iteration": 2.4806926250457764 }, { "auxiliary_loss_clip": 0.01158302, "auxiliary_loss_mlp": 0.01049448, "balance_loss_clip": 1.02852702, "balance_loss_mlp": 1.05182266, "epoch": 0.17700285585450173, "flos": 26760991040640.0, "grad_norm": 1.7116361521351429, "language_loss": 0.80674922, "learning_rate": 3.777562726341155e-06, "loss": 0.82882667, "num_input_tokens_seen": 63614780, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.0703125, "step": 2944, "time_per_iteration": 2.5145552158355713 }, { "auxiliary_loss_clip": 0.01156202, "auxiliary_loss_mlp": 0.01055117, "balance_loss_clip": 1.03699768, "balance_loss_mlp": 1.05315471, "epoch": 0.1770629791071697, "flos": 42776323320960.0, "grad_norm": 1.9558341611103802, "language_loss": 0.73578238, "learning_rate": 3.7773841901149835e-06, "loss": 0.75789565, "num_input_tokens_seen": 63637190, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.03125, "step": 2945, "time_per_iteration": 2.6495511531829834 }, { "auxiliary_loss_clip": 0.01158125, "auxiliary_loss_mlp": 0.01049832, "balance_loss_clip": 1.03166413, "balance_loss_mlp": 1.05647707, "epoch": 0.17712310235983766, "flos": 17345572560000.0, "grad_norm": 2.8401776178191236, "language_loss": 0.78049886, "learning_rate": 3.7772055864902256e-06, "loss": 0.80257845, "num_input_tokens_seen": 63652140, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.015625, "step": 2946, "time_per_iteration": 2.4518728256225586 }, { "auxiliary_loss_clip": 0.01151368, "auxiliary_loss_mlp": 0.0104991, "balance_loss_clip": 1.03120601, "balance_loss_mlp": 1.05067205, "epoch": 0.17718322561250563, "flos": 23878477537920.0, "grad_norm": 1.8880857896057366, "language_loss": 0.76318681, "learning_rate": 3.7770269154736535e-06, "loss": 0.78519958, "num_input_tokens_seen": 63671700, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0, "step": 2947, "time_per_iteration": 2.4735918045043945 }, { "auxiliary_loss_clip": 0.01152669, "auxiliary_loss_mlp": 0.01046958, "balance_loss_clip": 1.02750266, "balance_loss_mlp": 1.05102205, "epoch": 0.1772433488651736, "flos": 36466352104320.0, "grad_norm": 2.289888117349003, "language_loss": 0.72632468, "learning_rate": 3.7768481770720424e-06, "loss": 0.74832094, "num_input_tokens_seen": 63691685, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.015625, "step": 2948, "time_per_iteration": 2.605689764022827 }, { "auxiliary_loss_clip": 0.01154491, "auxiliary_loss_mlp": 0.010484, "balance_loss_clip": 1.02983856, "balance_loss_mlp": 1.05438924, "epoch": 0.1773034721178416, "flos": 26684716510080.0, "grad_norm": 1.7706817461614364, "language_loss": 0.82298338, "learning_rate": 3.776669371292171e-06, "loss": 0.84501225, "num_input_tokens_seen": 63711720, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0, "step": 2949, "time_per_iteration": 2.5342390537261963 }, { "auxiliary_loss_clip": 0.01061766, "auxiliary_loss_mlp": 0.0100157, "balance_loss_clip": 0.99881595, "balance_loss_mlp": 1.02615356, "epoch": 0.17736359537050955, "flos": 57117467617920.0, "grad_norm": 0.8703712064325158, "language_loss": 0.6496346, "learning_rate": 3.7764904981408186e-06, "loss": 0.670268, "num_input_tokens_seen": 63776280, "router_z_loss_clip": 0.02758789, "router_z_loss_mlp": 0.35546875, "step": 2950, "time_per_iteration": 3.155900239944458 }, { "auxiliary_loss_clip": 0.01147105, "auxiliary_loss_mlp": 0.01044611, "balance_loss_clip": 1.02618122, "balance_loss_mlp": 1.04765058, "epoch": 0.17742371862317752, "flos": 27198203385600.0, "grad_norm": 1.9754763461656097, "language_loss": 0.83650112, "learning_rate": 3.7763115576247686e-06, "loss": 0.85841835, "num_input_tokens_seen": 63797535, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.99609375, "step": 2951, "time_per_iteration": 2.5262610912323 }, { "auxiliary_loss_clip": 0.01157513, "auxiliary_loss_mlp": 0.01050766, "balance_loss_clip": 1.03258634, "balance_loss_mlp": 1.05362403, "epoch": 0.17748384187584548, "flos": 20959694277120.0, "grad_norm": 3.6187976343104133, "language_loss": 0.80164987, "learning_rate": 3.776132549750806e-06, "loss": 0.82373267, "num_input_tokens_seen": 63817045, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0390625, "step": 2952, "time_per_iteration": 2.4785213470458984 }, { "auxiliary_loss_clip": 0.01152179, "auxiliary_loss_mlp": 0.01047277, "balance_loss_clip": 1.02754807, "balance_loss_mlp": 1.05053639, "epoch": 0.17754396512851345, "flos": 25009986844800.0, "grad_norm": 2.242989873564779, "language_loss": 0.7933476, "learning_rate": 3.7759534745257194e-06, "loss": 0.81534219, "num_input_tokens_seen": 63837665, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.015625, "step": 2953, "time_per_iteration": 2.4872210025787354 }, { "auxiliary_loss_clip": 0.01154356, "auxiliary_loss_mlp": 0.01045204, "balance_loss_clip": 1.0267148, "balance_loss_mlp": 1.05239475, "epoch": 0.1776040883811814, "flos": 32051566275840.0, "grad_norm": 1.8562630749261486, "language_loss": 0.87940121, "learning_rate": 3.7757743319562994e-06, "loss": 0.90139687, "num_input_tokens_seen": 63858455, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.015625, "step": 2954, "time_per_iteration": 2.556302309036255 }, { "auxiliary_loss_clip": 0.01153084, "auxiliary_loss_mlp": 0.01052083, "balance_loss_clip": 1.03267574, "balance_loss_mlp": 1.05040371, "epoch": 0.17766421163384938, "flos": 21574125348480.0, "grad_norm": 1.7737719255634785, "language_loss": 0.85115528, "learning_rate": 3.7755951220493386e-06, "loss": 0.87320697, "num_input_tokens_seen": 63876935, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0234375, "step": 2955, "time_per_iteration": 2.4809107780456543 }, { "auxiliary_loss_clip": 0.01149505, "auxiliary_loss_mlp": 0.01042874, "balance_loss_clip": 1.02457547, "balance_loss_mlp": 1.04849017, "epoch": 0.17772433488651737, "flos": 22419319345920.0, "grad_norm": 1.6650126741387667, "language_loss": 0.70991361, "learning_rate": 3.7754158448116327e-06, "loss": 0.73183739, "num_input_tokens_seen": 63896815, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0078125, "step": 2956, "time_per_iteration": 2.5121960639953613 }, { "auxiliary_loss_clip": 0.01149142, "auxiliary_loss_mlp": 0.01047096, "balance_loss_clip": 1.02786732, "balance_loss_mlp": 1.04877567, "epoch": 0.17778445813918534, "flos": 25629445820160.0, "grad_norm": 2.4006455443271597, "language_loss": 0.83266032, "learning_rate": 3.7752365002499795e-06, "loss": 0.85462272, "num_input_tokens_seen": 63916140, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0, "step": 2957, "time_per_iteration": 2.4867300987243652 }, { "auxiliary_loss_clip": 0.01151549, "auxiliary_loss_mlp": 0.01046355, "balance_loss_clip": 1.02781844, "balance_loss_mlp": 1.05161011, "epoch": 0.1778445813918533, "flos": 25628871202560.0, "grad_norm": 1.628322555728115, "language_loss": 0.74934685, "learning_rate": 3.7750570883711807e-06, "loss": 0.77132589, "num_input_tokens_seen": 63935220, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.99609375, "step": 2958, "time_per_iteration": 2.507188320159912 }, { "auxiliary_loss_clip": 0.01157668, "auxiliary_loss_mlp": 0.01045812, "balance_loss_clip": 1.02738261, "balance_loss_mlp": 1.05409825, "epoch": 0.17790470464452127, "flos": 22345522853760.0, "grad_norm": 2.4387493588251568, "language_loss": 0.8032918, "learning_rate": 3.7748776091820397e-06, "loss": 0.82532662, "num_input_tokens_seen": 63954550, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.0390625, "step": 2959, "time_per_iteration": 2.455470561981201 }, { "auxiliary_loss_clip": 0.01157967, "auxiliary_loss_mlp": 0.01049571, "balance_loss_clip": 1.02968717, "balance_loss_mlp": 1.05178702, "epoch": 0.17796482789718923, "flos": 18765875214720.0, "grad_norm": 1.7907259133817202, "language_loss": 0.5177331, "learning_rate": 3.774698062689362e-06, "loss": 0.53980851, "num_input_tokens_seen": 63972425, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0625, "step": 2960, "time_per_iteration": 2.4576480388641357 }, { "auxiliary_loss_clip": 0.01156109, "auxiliary_loss_mlp": 0.0105608, "balance_loss_clip": 1.03739977, "balance_loss_mlp": 1.05206621, "epoch": 0.1780249511498572, "flos": 23440941970560.0, "grad_norm": 1.8115384735427529, "language_loss": 0.89225423, "learning_rate": 3.7745184488999548e-06, "loss": 0.91437614, "num_input_tokens_seen": 63992165, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.0390625, "step": 2961, "time_per_iteration": 2.482792377471924 }, { "auxiliary_loss_clip": 0.01156334, "auxiliary_loss_mlp": 0.0105207, "balance_loss_clip": 1.03178024, "balance_loss_mlp": 1.05082643, "epoch": 0.1780850744025252, "flos": 23367468700800.0, "grad_norm": 1.6456730683306122, "language_loss": 0.7894063, "learning_rate": 3.774338767820631e-06, "loss": 0.81149036, "num_input_tokens_seen": 64013470, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.0546875, "step": 2962, "time_per_iteration": 2.5190906524658203 }, { "auxiliary_loss_clip": 0.01155574, "auxiliary_loss_mlp": 0.01051106, "balance_loss_clip": 1.03078115, "balance_loss_mlp": 1.0516839, "epoch": 0.17814519765519315, "flos": 13771994319360.0, "grad_norm": 1.7450703638408154, "language_loss": 0.74806869, "learning_rate": 3.774159019458203e-06, "loss": 0.77013552, "num_input_tokens_seen": 64030975, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.0390625, "step": 2963, "time_per_iteration": 2.4471051692962646 }, { "auxiliary_loss_clip": 0.01159511, "auxiliary_loss_mlp": 0.01044506, "balance_loss_clip": 1.02390623, "balance_loss_mlp": 1.05279636, "epoch": 0.17820532090786112, "flos": 21976396738560.0, "grad_norm": 1.5282943816018923, "language_loss": 0.78871357, "learning_rate": 3.7739792038194877e-06, "loss": 0.8107537, "num_input_tokens_seen": 64050075, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.0703125, "step": 2964, "time_per_iteration": 2.4829213619232178 }, { "auxiliary_loss_clip": 0.01156359, "auxiliary_loss_mlp": 0.01054724, "balance_loss_clip": 1.03555501, "balance_loss_mlp": 1.05463982, "epoch": 0.17826544416052909, "flos": 24790752184320.0, "grad_norm": 2.3768033779414584, "language_loss": 0.81062388, "learning_rate": 3.7737993209113027e-06, "loss": 0.8327347, "num_input_tokens_seen": 64071920, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.015625, "step": 2965, "time_per_iteration": 2.4807093143463135 }, { "auxiliary_loss_clip": 0.01151857, "auxiliary_loss_mlp": 0.01051271, "balance_loss_clip": 1.03334165, "balance_loss_mlp": 1.05023456, "epoch": 0.17832556741319705, "flos": 13879582531200.0, "grad_norm": 2.27635157246476, "language_loss": 0.94683921, "learning_rate": 3.7736193707404698e-06, "loss": 0.96887046, "num_input_tokens_seen": 64086835, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.015625, "step": 2966, "time_per_iteration": 2.432828426361084 }, { "auxiliary_loss_clip": 0.01155253, "auxiliary_loss_mlp": 0.01053652, "balance_loss_clip": 1.03276646, "balance_loss_mlp": 1.05232811, "epoch": 0.17838569066586502, "flos": 36641703323520.0, "grad_norm": 2.192847802290189, "language_loss": 0.73059207, "learning_rate": 3.7734393533138127e-06, "loss": 0.75268108, "num_input_tokens_seen": 64107360, "router_z_loss_clip": 0.20898438, "router_z_loss_mlp": 1.03125, "step": 2967, "time_per_iteration": 2.588275671005249 }, { "auxiliary_loss_clip": 0.0115036, "auxiliary_loss_mlp": 0.01049151, "balance_loss_clip": 1.03014922, "balance_loss_mlp": 1.05191696, "epoch": 0.17844581391853298, "flos": 18727271072640.0, "grad_norm": 1.893638044218161, "language_loss": 0.77379698, "learning_rate": 3.773259268638157e-06, "loss": 0.7957921, "num_input_tokens_seen": 64124690, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.984375, "step": 2968, "time_per_iteration": 2.5451788902282715 }, { "auxiliary_loss_clip": 0.01153148, "auxiliary_loss_mlp": 0.01043824, "balance_loss_clip": 1.02531052, "balance_loss_mlp": 1.05219936, "epoch": 0.17850593717120097, "flos": 27378259286400.0, "grad_norm": 1.673830973114944, "language_loss": 0.75902843, "learning_rate": 3.7730791167203333e-06, "loss": 0.78099817, "num_input_tokens_seen": 64146315, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0078125, "step": 2969, "time_per_iteration": 3.9968528747558594 }, { "auxiliary_loss_clip": 0.01055566, "auxiliary_loss_mlp": 0.01018173, "balance_loss_clip": 1.01555061, "balance_loss_mlp": 1.01987767, "epoch": 0.17856606042386894, "flos": 66996025084800.0, "grad_norm": 0.8458682535604904, "language_loss": 0.69039893, "learning_rate": 3.772898897567171e-06, "loss": 0.71113634, "num_input_tokens_seen": 64210875, "router_z_loss_clip": 0.02624512, "router_z_loss_mlp": 0.35546875, "step": 2970, "time_per_iteration": 4.548672437667847 }, { "auxiliary_loss_clip": 0.01155976, "auxiliary_loss_mlp": 0.01042822, "balance_loss_clip": 1.02388, "balance_loss_mlp": 1.04996562, "epoch": 0.1786261836765369, "flos": 36977001805440.0, "grad_norm": 1.7966752287905023, "language_loss": 0.67140436, "learning_rate": 3.772718611185505e-06, "loss": 0.6933924, "num_input_tokens_seen": 64230740, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0625, "step": 2971, "time_per_iteration": 2.6027514934539795 }, { "auxiliary_loss_clip": 0.01152623, "auxiliary_loss_mlp": 0.01048765, "balance_loss_clip": 1.02822566, "balance_loss_mlp": 1.04904556, "epoch": 0.17868630692920487, "flos": 24825441744000.0, "grad_norm": 2.3673638313076304, "language_loss": 0.89981979, "learning_rate": 3.7725382575821717e-06, "loss": 0.92183363, "num_input_tokens_seen": 64252300, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.03125, "step": 2972, "time_per_iteration": 3.894383192062378 }, { "auxiliary_loss_clip": 0.01153575, "auxiliary_loss_mlp": 0.01054045, "balance_loss_clip": 1.03516233, "balance_loss_mlp": 1.05199718, "epoch": 0.17874643018187283, "flos": 16981977139200.0, "grad_norm": 2.0783873583414936, "language_loss": 0.88441879, "learning_rate": 3.77235783676401e-06, "loss": 0.90649498, "num_input_tokens_seen": 64270105, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.015625, "step": 2973, "time_per_iteration": 3.889965534210205 }, { "auxiliary_loss_clip": 0.01153836, "auxiliary_loss_mlp": 0.01051435, "balance_loss_clip": 1.03151524, "balance_loss_mlp": 1.05228448, "epoch": 0.1788065534345408, "flos": 21032233793280.0, "grad_norm": 1.9443449198904967, "language_loss": 0.76340044, "learning_rate": 3.7721773487378615e-06, "loss": 0.78545314, "num_input_tokens_seen": 64287250, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.015625, "step": 2974, "time_per_iteration": 2.4902923107147217 }, { "auxiliary_loss_clip": 0.01153731, "auxiliary_loss_mlp": 0.01049953, "balance_loss_clip": 1.03053415, "balance_loss_mlp": 1.05150342, "epoch": 0.17886667668720876, "flos": 23987717775360.0, "grad_norm": 2.1654513638862194, "language_loss": 0.7460078, "learning_rate": 3.7719967935105705e-06, "loss": 0.76804465, "num_input_tokens_seen": 64307140, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0234375, "step": 2975, "time_per_iteration": 2.4927444458007812 }, { "auxiliary_loss_clip": 0.01151598, "auxiliary_loss_mlp": 0.01046713, "balance_loss_clip": 1.02754402, "balance_loss_mlp": 1.05178356, "epoch": 0.17892679993987676, "flos": 25739476156800.0, "grad_norm": 1.6005008259812508, "language_loss": 0.73430181, "learning_rate": 3.7718161710889833e-06, "loss": 0.75628489, "num_input_tokens_seen": 64328760, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0, "step": 2976, "time_per_iteration": 2.504312753677368 }, { "auxiliary_loss_clip": 0.01147713, "auxiliary_loss_mlp": 0.01041104, "balance_loss_clip": 1.02527297, "balance_loss_mlp": 1.0520761, "epoch": 0.17898692319254472, "flos": 25699686865920.0, "grad_norm": 1.498666644384812, "language_loss": 0.77224916, "learning_rate": 3.7716354814799495e-06, "loss": 0.79413736, "num_input_tokens_seen": 64348800, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.953125, "step": 2977, "time_per_iteration": 2.5160818099975586 }, { "auxiliary_loss_clip": 0.0115566, "auxiliary_loss_mlp": 0.01053141, "balance_loss_clip": 1.03508067, "balance_loss_mlp": 1.05510569, "epoch": 0.1790470464452127, "flos": 19317786664320.0, "grad_norm": 3.1171030603195264, "language_loss": 0.79925203, "learning_rate": 3.7714547246903203e-06, "loss": 0.82134002, "num_input_tokens_seen": 64367955, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.0078125, "step": 2978, "time_per_iteration": 2.4932825565338135 }, { "auxiliary_loss_clip": 0.01157406, "auxiliary_loss_mlp": 0.0104626, "balance_loss_clip": 1.02673399, "balance_loss_mlp": 1.05258203, "epoch": 0.17910716969788065, "flos": 30044267562240.0, "grad_norm": 1.4565168253838714, "language_loss": 0.76308328, "learning_rate": 3.7712739007269508e-06, "loss": 0.78511995, "num_input_tokens_seen": 64389805, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.046875, "step": 2979, "time_per_iteration": 2.552807092666626 }, { "auxiliary_loss_clip": 0.01150802, "auxiliary_loss_mlp": 0.01044258, "balance_loss_clip": 1.02576828, "balance_loss_mlp": 1.05046177, "epoch": 0.17916729295054862, "flos": 19427709260160.0, "grad_norm": 1.8567591456966412, "language_loss": 0.6930806, "learning_rate": 3.7710930095966976e-06, "loss": 0.71503121, "num_input_tokens_seen": 64408220, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0078125, "step": 2980, "time_per_iteration": 2.44909930229187 }, { "auxiliary_loss_clip": 0.01156412, "auxiliary_loss_mlp": 0.01049028, "balance_loss_clip": 1.02765369, "balance_loss_mlp": 1.0542686, "epoch": 0.17922741620321658, "flos": 14611549881600.0, "grad_norm": 1.9636789533304397, "language_loss": 0.71047014, "learning_rate": 3.7709120513064196e-06, "loss": 0.73252451, "num_input_tokens_seen": 64426380, "router_z_loss_clip": 0.21289062, "router_z_loss_mlp": 1.0234375, "step": 2981, "time_per_iteration": 2.4510245323181152 }, { "auxiliary_loss_clip": 0.01158679, "auxiliary_loss_mlp": 0.01059509, "balance_loss_clip": 1.0394702, "balance_loss_mlp": 1.0543716, "epoch": 0.17928753945588458, "flos": 17165301177600.0, "grad_norm": 2.1163947674802133, "language_loss": 0.8227638, "learning_rate": 3.7707310258629796e-06, "loss": 0.84494567, "num_input_tokens_seen": 64444355, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0390625, "step": 2982, "time_per_iteration": 2.4388134479522705 }, { "auxiliary_loss_clip": 0.01152601, "auxiliary_loss_mlp": 0.01043409, "balance_loss_clip": 1.02472854, "balance_loss_mlp": 1.05116606, "epoch": 0.17934766270855254, "flos": 31395622060800.0, "grad_norm": 1.6305052932143533, "language_loss": 0.82787609, "learning_rate": 3.7705499332732413e-06, "loss": 0.84983623, "num_input_tokens_seen": 64467800, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.015625, "step": 2983, "time_per_iteration": 2.5632057189941406 }, { "auxiliary_loss_clip": 0.01156995, "auxiliary_loss_mlp": 0.01053651, "balance_loss_clip": 1.03312325, "balance_loss_mlp": 1.05158544, "epoch": 0.1794077859612205, "flos": 20814184281600.0, "grad_norm": 1.9918253278643134, "language_loss": 0.85000068, "learning_rate": 3.7703687735440718e-06, "loss": 0.87210715, "num_input_tokens_seen": 64487230, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 1.0546875, "step": 2984, "time_per_iteration": 2.4637489318847656 }, { "auxiliary_loss_clip": 0.01153557, "auxiliary_loss_mlp": 0.01049786, "balance_loss_clip": 1.02997398, "balance_loss_mlp": 1.0504353, "epoch": 0.17946790921388847, "flos": 28986447006720.0, "grad_norm": 1.4527932151119956, "language_loss": 0.89544833, "learning_rate": 3.7701875466823416e-06, "loss": 0.91748178, "num_input_tokens_seen": 64509165, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.03125, "step": 2985, "time_per_iteration": 2.5436031818389893 }, { "auxiliary_loss_clip": 0.0115036, "auxiliary_loss_mlp": 0.01045029, "balance_loss_clip": 1.02798188, "balance_loss_mlp": 1.05145621, "epoch": 0.17952803246655644, "flos": 20737406960640.0, "grad_norm": 2.497983488227169, "language_loss": 0.69386029, "learning_rate": 3.770006252694922e-06, "loss": 0.71581417, "num_input_tokens_seen": 64527940, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.98828125, "step": 2986, "time_per_iteration": 2.4538230895996094 }, { "auxiliary_loss_clip": 0.01149159, "auxiliary_loss_mlp": 0.01044981, "balance_loss_clip": 1.02562118, "balance_loss_mlp": 1.05021119, "epoch": 0.1795881557192244, "flos": 28255988027520.0, "grad_norm": 2.4975206970747643, "language_loss": 0.78159314, "learning_rate": 3.769824891588688e-06, "loss": 0.80353451, "num_input_tokens_seen": 64545230, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.98828125, "step": 2987, "time_per_iteration": 2.539416790008545 }, { "auxiliary_loss_clip": 0.01155853, "auxiliary_loss_mlp": 0.01043297, "balance_loss_clip": 1.02287626, "balance_loss_mlp": 1.05043387, "epoch": 0.17964827897189237, "flos": 18552027594240.0, "grad_norm": 1.9302523371657154, "language_loss": 0.7807281, "learning_rate": 3.7696434633705164e-06, "loss": 0.80271959, "num_input_tokens_seen": 64563820, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.046875, "step": 2988, "time_per_iteration": 2.440072536468506 }, { "auxiliary_loss_clip": 0.01061746, "auxiliary_loss_mlp": 0.0100323, "balance_loss_clip": 1.00013053, "balance_loss_mlp": 1.02570581, "epoch": 0.17970840222456036, "flos": 58165088711040.0, "grad_norm": 2.866396154665574, "language_loss": 0.62719458, "learning_rate": 3.7694619680472875e-06, "loss": 0.64784431, "num_input_tokens_seen": 64621315, "router_z_loss_clip": 0.03088379, "router_z_loss_mlp": 0.359375, "step": 2989, "time_per_iteration": 3.022050619125366 }, { "auxiliary_loss_clip": 0.01152796, "auxiliary_loss_mlp": 0.0104352, "balance_loss_clip": 1.02549601, "balance_loss_mlp": 1.05065584, "epoch": 0.17976852547722832, "flos": 20300805146880.0, "grad_norm": 2.112354036389122, "language_loss": 0.70804667, "learning_rate": 3.7692804056258837e-06, "loss": 0.73000979, "num_input_tokens_seen": 64639885, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.0234375, "step": 2990, "time_per_iteration": 2.4666080474853516 }, { "auxiliary_loss_clip": 0.01156359, "auxiliary_loss_mlp": 0.01044918, "balance_loss_clip": 1.02604747, "balance_loss_mlp": 1.05227065, "epoch": 0.1798286487298963, "flos": 39669367685760.0, "grad_norm": 2.0062255280697707, "language_loss": 0.69036323, "learning_rate": 3.7690987761131893e-06, "loss": 0.712376, "num_input_tokens_seen": 64661220, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.0390625, "step": 2991, "time_per_iteration": 2.6439454555511475 }, { "auxiliary_loss_clip": 0.01153483, "auxiliary_loss_mlp": 0.01042974, "balance_loss_clip": 1.02356684, "balance_loss_mlp": 1.05131412, "epoch": 0.17988877198256426, "flos": 25520313323520.0, "grad_norm": 1.5665207196634918, "language_loss": 0.82633781, "learning_rate": 3.7689170795160924e-06, "loss": 0.84830236, "num_input_tokens_seen": 64682530, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.015625, "step": 2992, "time_per_iteration": 2.5048115253448486 }, { "auxiliary_loss_clip": 0.01145908, "auxiliary_loss_mlp": 0.01041795, "balance_loss_clip": 1.02421141, "balance_loss_mlp": 1.0473851, "epoch": 0.17994889523523222, "flos": 18807496099200.0, "grad_norm": 1.8622065327709825, "language_loss": 0.82051259, "learning_rate": 3.7687353158414822e-06, "loss": 0.84238958, "num_input_tokens_seen": 64701025, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.984375, "step": 2993, "time_per_iteration": 2.491331100463867 }, { "auxiliary_loss_clip": 0.01148966, "auxiliary_loss_mlp": 0.0104415, "balance_loss_clip": 1.02507615, "balance_loss_mlp": 1.04709566, "epoch": 0.18000901848790019, "flos": 21104450087040.0, "grad_norm": 1.6156440470715523, "language_loss": 0.78616822, "learning_rate": 3.7685534850962517e-06, "loss": 0.80809939, "num_input_tokens_seen": 64719570, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.015625, "step": 2994, "time_per_iteration": 2.5078299045562744 }, { "auxiliary_loss_clip": 0.01153197, "auxiliary_loss_mlp": 0.01047934, "balance_loss_clip": 1.02946818, "balance_loss_mlp": 1.0504148, "epoch": 0.18006914174056818, "flos": 19646441130240.0, "grad_norm": 2.1154884018286544, "language_loss": 0.80319965, "learning_rate": 3.768371587287296e-06, "loss": 0.82521093, "num_input_tokens_seen": 64738110, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 1.03125, "step": 2995, "time_per_iteration": 2.4674596786499023 }, { "auxiliary_loss_clip": 0.0115282, "auxiliary_loss_mlp": 0.01051088, "balance_loss_clip": 1.03375483, "balance_loss_mlp": 1.05152571, "epoch": 0.18012926499323614, "flos": 19499889640320.0, "grad_norm": 2.2283915075487886, "language_loss": 0.84359449, "learning_rate": 3.768189622421512e-06, "loss": 0.86563355, "num_input_tokens_seen": 64756345, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.015625, "step": 2996, "time_per_iteration": 2.453640937805176 }, { "auxiliary_loss_clip": 0.01146096, "auxiliary_loss_mlp": 0.01039539, "balance_loss_clip": 1.02188396, "balance_loss_mlp": 1.04815412, "epoch": 0.1801893882459041, "flos": 19464553635840.0, "grad_norm": 1.6365308261291116, "language_loss": 0.88411736, "learning_rate": 3.7680075905058006e-06, "loss": 0.90597379, "num_input_tokens_seen": 64776375, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9765625, "step": 2997, "time_per_iteration": 2.489611864089966 }, { "auxiliary_loss_clip": 0.01155806, "auxiliary_loss_mlp": 0.01052368, "balance_loss_clip": 1.03280616, "balance_loss_mlp": 1.04940462, "epoch": 0.18024951149857207, "flos": 26870590414080.0, "grad_norm": 1.6962043616795708, "language_loss": 0.8509469, "learning_rate": 3.7678254915470643e-06, "loss": 0.87302864, "num_input_tokens_seen": 64796210, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0625, "step": 2998, "time_per_iteration": 2.4991931915283203 }, { "auxiliary_loss_clip": 0.01151121, "auxiliary_loss_mlp": 0.01047213, "balance_loss_clip": 1.02939117, "balance_loss_mlp": 1.05285394, "epoch": 0.18030963475124004, "flos": 30226621933440.0, "grad_norm": 1.6873747531303556, "language_loss": 0.84103835, "learning_rate": 3.7676433255522084e-06, "loss": 0.86302173, "num_input_tokens_seen": 64818590, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.984375, "step": 2999, "time_per_iteration": 2.539829730987549 }, { "auxiliary_loss_clip": 0.01151013, "auxiliary_loss_mlp": 0.01054979, "balance_loss_clip": 1.03564274, "balance_loss_mlp": 1.04998565, "epoch": 0.180369758003908, "flos": 22307493329280.0, "grad_norm": 2.118226347443719, "language_loss": 0.74808443, "learning_rate": 3.76746109252814e-06, "loss": 0.77014434, "num_input_tokens_seen": 64838350, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0078125, "step": 3000, "time_per_iteration": 2.466625928878784 }, { "auxiliary_loss_clip": 0.01151726, "auxiliary_loss_mlp": 0.01067395, "balance_loss_clip": 1.04902482, "balance_loss_mlp": 1.05203176, "epoch": 0.18042988125657597, "flos": 23732033788800.0, "grad_norm": 2.384103592632781, "language_loss": 0.70804381, "learning_rate": 3.76727879248177e-06, "loss": 0.73023504, "num_input_tokens_seen": 64858065, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0, "step": 3001, "time_per_iteration": 2.5036659240722656 }, { "auxiliary_loss_clip": 0.01157125, "auxiliary_loss_mlp": 0.01051897, "balance_loss_clip": 1.03263295, "balance_loss_mlp": 1.05260551, "epoch": 0.18049000450924396, "flos": 24093582134400.0, "grad_norm": 1.9338850929464548, "language_loss": 0.8809967, "learning_rate": 3.767096425420011e-06, "loss": 0.90308696, "num_input_tokens_seen": 64877305, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.046875, "step": 3002, "time_per_iteration": 2.4630470275878906 }, { "auxiliary_loss_clip": 0.0115361, "auxiliary_loss_mlp": 0.01055971, "balance_loss_clip": 1.03804159, "balance_loss_mlp": 1.05217028, "epoch": 0.18055012776191193, "flos": 22163168482560.0, "grad_norm": 2.185398921908184, "language_loss": 0.80450082, "learning_rate": 3.7669139913497788e-06, "loss": 0.82659662, "num_input_tokens_seen": 64896955, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.015625, "step": 3003, "time_per_iteration": 2.46163010597229 }, { "auxiliary_loss_clip": 0.01153743, "auxiliary_loss_mlp": 0.01049618, "balance_loss_clip": 1.03061557, "balance_loss_mlp": 1.05079865, "epoch": 0.1806102510145799, "flos": 28913512440960.0, "grad_norm": 2.6012924419332784, "language_loss": 0.67023695, "learning_rate": 3.7667314902779907e-06, "loss": 0.69227052, "num_input_tokens_seen": 64917080, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.03125, "step": 3004, "time_per_iteration": 2.5052297115325928 }, { "auxiliary_loss_clip": 0.01155013, "auxiliary_loss_mlp": 0.01058694, "balance_loss_clip": 1.03928638, "balance_loss_mlp": 1.05341792, "epoch": 0.18067037426724786, "flos": 19025689265280.0, "grad_norm": 1.6812836799333497, "language_loss": 0.85373652, "learning_rate": 3.7665489222115677e-06, "loss": 0.87587357, "num_input_tokens_seen": 64935215, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.015625, "step": 3005, "time_per_iteration": 2.531026840209961 }, { "auxiliary_loss_clip": 0.01148001, "auxiliary_loss_mlp": 0.01044813, "balance_loss_clip": 1.02731287, "balance_loss_mlp": 1.04981399, "epoch": 0.18073049751991582, "flos": 27453635976960.0, "grad_norm": 1.9401290278888619, "language_loss": 0.83207238, "learning_rate": 3.766366287157432e-06, "loss": 0.85400045, "num_input_tokens_seen": 64956275, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.984375, "step": 3006, "time_per_iteration": 2.504532814025879 }, { "auxiliary_loss_clip": 0.01150859, "auxiliary_loss_mlp": 0.01058009, "balance_loss_clip": 1.03789878, "balance_loss_mlp": 1.04964614, "epoch": 0.1807906207725838, "flos": 28729039167360.0, "grad_norm": 1.5824324273816563, "language_loss": 0.77214682, "learning_rate": 3.7661835851225103e-06, "loss": 0.79423553, "num_input_tokens_seen": 64979390, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0078125, "step": 3007, "time_per_iteration": 2.562089204788208 }, { "auxiliary_loss_clip": 0.01058933, "auxiliary_loss_mlp": 0.01001092, "balance_loss_clip": 0.99813557, "balance_loss_mlp": 1.02377558, "epoch": 0.18085074402525175, "flos": 64466515468800.0, "grad_norm": 0.8270310707865766, "language_loss": 0.57017004, "learning_rate": 3.7660008161137294e-06, "loss": 0.5907703, "num_input_tokens_seen": 65043135, "router_z_loss_clip": 0.02954102, "router_z_loss_mlp": 0.3515625, "step": 3008, "time_per_iteration": 3.2299253940582275 }, { "auxiliary_loss_clip": 0.0115234, "auxiliary_loss_mlp": 0.01053053, "balance_loss_clip": 1.03364527, "balance_loss_mlp": 1.05208135, "epoch": 0.18091086727791975, "flos": 23476960333440.0, "grad_norm": 1.6922149210408337, "language_loss": 0.67004764, "learning_rate": 3.765817980138021e-06, "loss": 0.6921016, "num_input_tokens_seen": 65062845, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0, "step": 3009, "time_per_iteration": 2.5115387439727783 }, { "auxiliary_loss_clip": 0.01154003, "auxiliary_loss_mlp": 0.01048315, "balance_loss_clip": 1.03064811, "balance_loss_mlp": 1.05259824, "epoch": 0.1809709905305877, "flos": 24170467196160.0, "grad_norm": 2.16873098594063, "language_loss": 0.75491828, "learning_rate": 3.7656350772023177e-06, "loss": 0.77694142, "num_input_tokens_seen": 65082110, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 1.015625, "step": 3010, "time_per_iteration": 3.9530446529388428 }, { "auxiliary_loss_clip": 0.01144413, "auxiliary_loss_mlp": 0.01043288, "balance_loss_clip": 1.02651477, "balance_loss_mlp": 1.04886472, "epoch": 0.18103111378325568, "flos": 21650902669440.0, "grad_norm": 1.5723366536667172, "language_loss": 0.6742087, "learning_rate": 3.7654521073135553e-06, "loss": 0.69608569, "num_input_tokens_seen": 65101985, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.95703125, "step": 3011, "time_per_iteration": 2.4712488651275635 }, { "auxiliary_loss_clip": 0.01145986, "auxiliary_loss_mlp": 0.01052218, "balance_loss_clip": 1.03391886, "balance_loss_mlp": 1.04803813, "epoch": 0.18109123703592364, "flos": 53686918356480.0, "grad_norm": 1.6352234176041456, "language_loss": 0.71724957, "learning_rate": 3.7652690704786723e-06, "loss": 0.73923159, "num_input_tokens_seen": 65129295, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.98046875, "step": 3012, "time_per_iteration": 4.188018560409546 }, { "auxiliary_loss_clip": 0.0115051, "auxiliary_loss_mlp": 0.01050744, "balance_loss_clip": 1.03258848, "balance_loss_mlp": 1.05419433, "epoch": 0.1811513602885916, "flos": 35845564325760.0, "grad_norm": 1.935962943479968, "language_loss": 0.6239621, "learning_rate": 3.765085966704609e-06, "loss": 0.64597464, "num_input_tokens_seen": 65150625, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.96484375, "step": 3013, "time_per_iteration": 2.608985185623169 }, { "auxiliary_loss_clip": 0.01151774, "auxiliary_loss_mlp": 0.01052914, "balance_loss_clip": 1.03553331, "balance_loss_mlp": 1.05127144, "epoch": 0.18121148354125957, "flos": 23732572492800.0, "grad_norm": 1.5291890991181232, "language_loss": 0.75882828, "learning_rate": 3.764902795998309e-06, "loss": 0.78087515, "num_input_tokens_seen": 65170880, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.0, "step": 3014, "time_per_iteration": 3.9008312225341797 }, { "auxiliary_loss_clip": 0.01154588, "auxiliary_loss_mlp": 0.01046552, "balance_loss_clip": 1.02641773, "balance_loss_mlp": 1.05188334, "epoch": 0.18127160679392756, "flos": 28728320895360.0, "grad_norm": 1.6057072105152281, "language_loss": 0.66008651, "learning_rate": 3.7647195583667184e-06, "loss": 0.68209791, "num_input_tokens_seen": 65192530, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.03125, "step": 3015, "time_per_iteration": 3.9709203243255615 }, { "auxiliary_loss_clip": 0.01148847, "auxiliary_loss_mlp": 0.01043529, "balance_loss_clip": 1.02521896, "balance_loss_mlp": 1.05169213, "epoch": 0.18133173004659553, "flos": 20485062938880.0, "grad_norm": 1.7768779376435597, "language_loss": 0.7778464, "learning_rate": 3.764536253816785e-06, "loss": 0.79977012, "num_input_tokens_seen": 65211675, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.96875, "step": 3016, "time_per_iteration": 2.4646098613739014 }, { "auxiliary_loss_clip": 0.01157519, "auxiliary_loss_mlp": 0.0105903, "balance_loss_clip": 1.04009962, "balance_loss_mlp": 1.05550706, "epoch": 0.1813918532992635, "flos": 22852078404480.0, "grad_norm": 1.6366513104578184, "language_loss": 0.83416843, "learning_rate": 3.7643528823554602e-06, "loss": 0.85633397, "num_input_tokens_seen": 65231185, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0234375, "step": 3017, "time_per_iteration": 2.478405237197876 }, { "auxiliary_loss_clip": 0.01148608, "auxiliary_loss_mlp": 0.01039985, "balance_loss_clip": 1.02204394, "balance_loss_mlp": 1.0523293, "epoch": 0.18145197655193146, "flos": 36065122208640.0, "grad_norm": 20.701943555909875, "language_loss": 0.67829311, "learning_rate": 3.764169443989697e-06, "loss": 0.70017898, "num_input_tokens_seen": 65251645, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9609375, "step": 3018, "time_per_iteration": 2.6030988693237305 }, { "auxiliary_loss_clip": 0.01148861, "auxiliary_loss_mlp": 0.01041032, "balance_loss_clip": 1.02275753, "balance_loss_mlp": 1.04947829, "epoch": 0.18151209980459942, "flos": 24023951619840.0, "grad_norm": 2.463701588834583, "language_loss": 0.76119334, "learning_rate": 3.7639859387264518e-06, "loss": 0.78309226, "num_input_tokens_seen": 65271125, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.9921875, "step": 3019, "time_per_iteration": 2.4925897121429443 }, { "auxiliary_loss_clip": 0.01152657, "auxiliary_loss_mlp": 0.01042926, "balance_loss_clip": 1.02367353, "balance_loss_mlp": 1.05239177, "epoch": 0.1815722230572674, "flos": 23951627585280.0, "grad_norm": 2.2981948656783997, "language_loss": 0.81782329, "learning_rate": 3.7638023665726834e-06, "loss": 0.83977914, "num_input_tokens_seen": 65290600, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0, "step": 3020, "time_per_iteration": 2.4815986156463623 }, { "auxiliary_loss_clip": 0.01150511, "auxiliary_loss_mlp": 0.01043979, "balance_loss_clip": 1.02490497, "balance_loss_mlp": 1.05171716, "epoch": 0.18163234630993536, "flos": 24386469632640.0, "grad_norm": 1.7895626519035437, "language_loss": 0.77123642, "learning_rate": 3.763618727535352e-06, "loss": 0.7931813, "num_input_tokens_seen": 65311040, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.98828125, "step": 3021, "time_per_iteration": 2.49031662940979 }, { "auxiliary_loss_clip": 0.01143937, "auxiliary_loss_mlp": 0.0104373, "balance_loss_clip": 1.02466869, "balance_loss_mlp": 1.04579139, "epoch": 0.18169246956260335, "flos": 24681332378880.0, "grad_norm": 1.6759074888657914, "language_loss": 0.84887516, "learning_rate": 3.763435021621422e-06, "loss": 0.87075186, "num_input_tokens_seen": 65332115, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.98046875, "step": 3022, "time_per_iteration": 2.495826005935669 }, { "auxiliary_loss_clip": 0.01150332, "auxiliary_loss_mlp": 0.01046789, "balance_loss_clip": 1.02678609, "balance_loss_mlp": 1.04886961, "epoch": 0.1817525928152713, "flos": 24243294021120.0, "grad_norm": 1.7602419530341402, "language_loss": 0.69364297, "learning_rate": 3.763251248837859e-06, "loss": 0.7156142, "num_input_tokens_seen": 65352210, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.015625, "step": 3023, "time_per_iteration": 2.5049710273742676 }, { "auxiliary_loss_clip": 0.01148069, "auxiliary_loss_mlp": 0.01044596, "balance_loss_clip": 1.0258801, "balance_loss_mlp": 1.04937041, "epoch": 0.18181271606793928, "flos": 16472081623680.0, "grad_norm": 1.8341788339369474, "language_loss": 0.74018753, "learning_rate": 3.7630674091916317e-06, "loss": 0.76211417, "num_input_tokens_seen": 65370600, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.984375, "step": 3024, "time_per_iteration": 2.468960762023926 }, { "auxiliary_loss_clip": 0.01149671, "auxiliary_loss_mlp": 0.01041037, "balance_loss_clip": 1.02220225, "balance_loss_mlp": 1.05032396, "epoch": 0.18187283932060724, "flos": 18581042805120.0, "grad_norm": 2.5164319788678307, "language_loss": 0.88315421, "learning_rate": 3.7628835026897123e-06, "loss": 0.90506124, "num_input_tokens_seen": 65387270, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.9921875, "step": 3025, "time_per_iteration": 2.449169635772705 }, { "auxiliary_loss_clip": 0.01149201, "auxiliary_loss_mlp": 0.01052407, "balance_loss_clip": 1.03316689, "balance_loss_mlp": 1.05151892, "epoch": 0.1819329625732752, "flos": 20266833859200.0, "grad_norm": 1.695844326243581, "language_loss": 0.79300594, "learning_rate": 3.7626995293390735e-06, "loss": 0.81502199, "num_input_tokens_seen": 65406550, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.9765625, "step": 3026, "time_per_iteration": 2.448146343231201 }, { "auxiliary_loss_clip": 0.01150893, "auxiliary_loss_mlp": 0.01050146, "balance_loss_clip": 1.03088176, "balance_loss_mlp": 1.05054545, "epoch": 0.18199308582594317, "flos": 25915186512000.0, "grad_norm": 1.610497777106447, "language_loss": 0.75864333, "learning_rate": 3.762515489146692e-06, "loss": 0.78065372, "num_input_tokens_seen": 65425955, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0078125, "step": 3027, "time_per_iteration": 2.552729606628418 }, { "auxiliary_loss_clip": 0.01153126, "auxiliary_loss_mlp": 0.01048191, "balance_loss_clip": 1.02855754, "balance_loss_mlp": 1.05100095, "epoch": 0.18205320907861114, "flos": 15377524433280.0, "grad_norm": 2.0548273255782314, "language_loss": 0.85912442, "learning_rate": 3.762331382119546e-06, "loss": 0.88113761, "num_input_tokens_seen": 65442820, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0234375, "step": 3028, "time_per_iteration": 2.4399003982543945 }, { "auxiliary_loss_clip": 0.01147026, "auxiliary_loss_mlp": 0.01041592, "balance_loss_clip": 1.02349639, "balance_loss_mlp": 1.05054867, "epoch": 0.18211333233127913, "flos": 25624310175360.0, "grad_norm": 1.6268698090063665, "language_loss": 0.82517135, "learning_rate": 3.7621472082646183e-06, "loss": 0.84705758, "num_input_tokens_seen": 65461825, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.96484375, "step": 3029, "time_per_iteration": 2.5109686851501465 }, { "auxiliary_loss_clip": 0.01150795, "auxiliary_loss_mlp": 0.01047148, "balance_loss_clip": 1.02772927, "balance_loss_mlp": 1.05075443, "epoch": 0.1821734555839471, "flos": 14976007228800.0, "grad_norm": 2.079907227513302, "language_loss": 0.77275729, "learning_rate": 3.761962967588891e-06, "loss": 0.79473674, "num_input_tokens_seen": 65479480, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.0, "step": 3030, "time_per_iteration": 2.4335949420928955 }, { "auxiliary_loss_clip": 0.01151946, "auxiliary_loss_mlp": 0.01040752, "balance_loss_clip": 1.02153516, "balance_loss_mlp": 1.05014682, "epoch": 0.18223357883661506, "flos": 20194007034240.0, "grad_norm": 1.8560703478892857, "language_loss": 0.84848285, "learning_rate": 3.761778660099352e-06, "loss": 0.87040979, "num_input_tokens_seen": 65497775, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.015625, "step": 3031, "time_per_iteration": 2.5330874919891357 }, { "auxiliary_loss_clip": 0.01148655, "auxiliary_loss_mlp": 0.0104489, "balance_loss_clip": 1.02690136, "balance_loss_mlp": 1.04867959, "epoch": 0.18229370208928303, "flos": 15231978524160.0, "grad_norm": 1.750048514022885, "language_loss": 0.79894352, "learning_rate": 3.76159428580299e-06, "loss": 0.82087892, "num_input_tokens_seen": 65516505, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 1.0, "step": 3032, "time_per_iteration": 2.46872878074646 }, { "auxiliary_loss_clip": 0.0115642, "auxiliary_loss_mlp": 0.01045317, "balance_loss_clip": 1.02629077, "balance_loss_mlp": 1.05351937, "epoch": 0.182353825341951, "flos": 23840483927040.0, "grad_norm": 1.902840562120219, "language_loss": 0.81474149, "learning_rate": 3.761409844706795e-06, "loss": 0.83675885, "num_input_tokens_seen": 65536160, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.03125, "step": 3033, "time_per_iteration": 2.499819040298462 }, { "auxiliary_loss_clip": 0.01064002, "auxiliary_loss_mlp": 0.0100738, "balance_loss_clip": 1.00445938, "balance_loss_mlp": 1.0291357, "epoch": 0.18241394859461896, "flos": 61190957393280.0, "grad_norm": 0.9119924250748906, "language_loss": 0.634983, "learning_rate": 3.7612253368177625e-06, "loss": 0.65569681, "num_input_tokens_seen": 65589375, "router_z_loss_clip": 0.0291748, "router_z_loss_mlp": 0.34765625, "step": 3034, "time_per_iteration": 3.004518985748291 }, { "auxiliary_loss_clip": 0.01150975, "auxiliary_loss_mlp": 0.01045436, "balance_loss_clip": 1.02750659, "balance_loss_mlp": 1.05151629, "epoch": 0.18247407184728695, "flos": 18471694826880.0, "grad_norm": 1.8684878713011024, "language_loss": 0.79118264, "learning_rate": 3.7610407621428893e-06, "loss": 0.81314671, "num_input_tokens_seen": 65606720, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.9921875, "step": 3035, "time_per_iteration": 2.432671308517456 }, { "auxiliary_loss_clip": 0.01147796, "auxiliary_loss_mlp": 0.01042139, "balance_loss_clip": 1.02380419, "balance_loss_mlp": 1.05171752, "epoch": 0.18253419509995492, "flos": 21795191602560.0, "grad_norm": 2.042926655365678, "language_loss": 0.84600925, "learning_rate": 3.7608561206891735e-06, "loss": 0.8679086, "num_input_tokens_seen": 65625495, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9609375, "step": 3036, "time_per_iteration": 2.510718822479248 }, { "auxiliary_loss_clip": 0.01145595, "auxiliary_loss_mlp": 0.01041467, "balance_loss_clip": 1.0236448, "balance_loss_mlp": 1.05026555, "epoch": 0.18259431835262288, "flos": 20149764456960.0, "grad_norm": 1.9128645508036664, "language_loss": 0.79547215, "learning_rate": 3.760671412463617e-06, "loss": 0.81734276, "num_input_tokens_seen": 65643515, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.953125, "step": 3037, "time_per_iteration": 2.4462826251983643 }, { "auxiliary_loss_clip": 0.01152797, "auxiliary_loss_mlp": 0.01052187, "balance_loss_clip": 1.0318377, "balance_loss_mlp": 1.05278969, "epoch": 0.18265444160529085, "flos": 16981653916800.0, "grad_norm": 2.737478654153437, "language_loss": 0.79400235, "learning_rate": 3.7604866374732246e-06, "loss": 0.8160522, "num_input_tokens_seen": 65658155, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.0, "step": 3038, "time_per_iteration": 2.459890842437744 }, { "auxiliary_loss_clip": 0.01147441, "auxiliary_loss_mlp": 0.01040012, "balance_loss_clip": 1.02197599, "balance_loss_mlp": 1.04994738, "epoch": 0.1827145648579588, "flos": 34423250509440.0, "grad_norm": 1.8491478817202123, "language_loss": 0.67551827, "learning_rate": 3.7603017957250023e-06, "loss": 0.69739282, "num_input_tokens_seen": 65679310, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9765625, "step": 3039, "time_per_iteration": 2.57161808013916 }, { "auxiliary_loss_clip": 0.01151209, "auxiliary_loss_mlp": 0.01049781, "balance_loss_clip": 1.03026676, "balance_loss_mlp": 1.05116057, "epoch": 0.18277468811062678, "flos": 53287017264000.0, "grad_norm": 1.926914180186522, "language_loss": 0.73828626, "learning_rate": 3.7601168872259593e-06, "loss": 0.76029611, "num_input_tokens_seen": 65705235, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0, "step": 3040, "time_per_iteration": 2.7892491817474365 }, { "auxiliary_loss_clip": 0.01148895, "auxiliary_loss_mlp": 0.01041999, "balance_loss_clip": 1.02286553, "balance_loss_mlp": 1.05007565, "epoch": 0.18283481136329474, "flos": 31650659602560.0, "grad_norm": 1.9540729226887903, "language_loss": 0.6011278, "learning_rate": 3.7599319119831075e-06, "loss": 0.62303674, "num_input_tokens_seen": 65727575, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.98828125, "step": 3041, "time_per_iteration": 2.534269332885742 }, { "auxiliary_loss_clip": 0.01148874, "auxiliary_loss_mlp": 0.01057075, "balance_loss_clip": 1.0375483, "balance_loss_mlp": 1.04966497, "epoch": 0.18289493461596273, "flos": 53137664513280.0, "grad_norm": 1.643593902948238, "language_loss": 0.60028362, "learning_rate": 3.7597468700034616e-06, "loss": 0.62234312, "num_input_tokens_seen": 65751370, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.9921875, "step": 3042, "time_per_iteration": 2.7697017192840576 }, { "auxiliary_loss_clip": 0.01149534, "auxiliary_loss_mlp": 0.01043376, "balance_loss_clip": 1.02536368, "balance_loss_mlp": 1.05182767, "epoch": 0.1829550578686307, "flos": 25589369220480.0, "grad_norm": 1.524539228742805, "language_loss": 0.87415093, "learning_rate": 3.7595617612940374e-06, "loss": 0.89608008, "num_input_tokens_seen": 65771040, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9765625, "step": 3043, "time_per_iteration": 2.538015127182007 }, { "auxiliary_loss_clip": 0.01149933, "auxiliary_loss_mlp": 0.01047392, "balance_loss_clip": 1.02836585, "balance_loss_mlp": 1.04973066, "epoch": 0.18301518112129866, "flos": 22601422321920.0, "grad_norm": 1.8875078005780308, "language_loss": 0.70533621, "learning_rate": 3.7593765858618552e-06, "loss": 0.72730947, "num_input_tokens_seen": 65789345, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0, "step": 3044, "time_per_iteration": 2.478945732116699 }, { "auxiliary_loss_clip": 0.01153545, "auxiliary_loss_mlp": 0.01052236, "balance_loss_clip": 1.03163719, "balance_loss_mlp": 1.05058241, "epoch": 0.18307530437396663, "flos": 34020799551360.0, "grad_norm": 2.8563130108661277, "language_loss": 0.63764483, "learning_rate": 3.7591913437139365e-06, "loss": 0.65970266, "num_input_tokens_seen": 65810990, "router_z_loss_clip": 0.20703125, "router_z_loss_mlp": 1.03125, "step": 3045, "time_per_iteration": 2.5657060146331787 }, { "auxiliary_loss_clip": 0.01146978, "auxiliary_loss_mlp": 0.01048797, "balance_loss_clip": 1.03085577, "balance_loss_mlp": 1.04985309, "epoch": 0.1831354276266346, "flos": 21279765392640.0, "grad_norm": 2.6788360056449245, "language_loss": 0.79371834, "learning_rate": 3.7590060348573066e-06, "loss": 0.81567609, "num_input_tokens_seen": 65827230, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.96875, "step": 3046, "time_per_iteration": 2.4580368995666504 }, { "auxiliary_loss_clip": 0.01147847, "auxiliary_loss_mlp": 0.01047385, "balance_loss_clip": 1.02803755, "balance_loss_mlp": 1.04669356, "epoch": 0.18319555087930256, "flos": 21032952065280.0, "grad_norm": 1.8574795134901942, "language_loss": 0.78926295, "learning_rate": 3.7588206592989903e-06, "loss": 0.81121528, "num_input_tokens_seen": 65845900, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0078125, "step": 3047, "time_per_iteration": 2.4573686122894287 }, { "auxiliary_loss_clip": 0.01146551, "auxiliary_loss_mlp": 0.01043558, "balance_loss_clip": 1.02598643, "balance_loss_mlp": 1.05070281, "epoch": 0.18325567413197055, "flos": 34382958428160.0, "grad_norm": 1.5248298754396896, "language_loss": 0.80713987, "learning_rate": 3.7586352170460194e-06, "loss": 0.829041, "num_input_tokens_seen": 65868730, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.95703125, "step": 3048, "time_per_iteration": 2.5785257816314697 }, { "auxiliary_loss_clip": 0.0114919, "auxiliary_loss_mlp": 0.0104339, "balance_loss_clip": 1.02465069, "balance_loss_mlp": 1.04903412, "epoch": 0.18331579738463852, "flos": 20558464381440.0, "grad_norm": 3.0847834711519773, "language_loss": 0.86646962, "learning_rate": 3.758449708105424e-06, "loss": 0.88839543, "num_input_tokens_seen": 65888420, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0, "step": 3049, "time_per_iteration": 2.4613938331604004 }, { "auxiliary_loss_clip": 0.011549, "auxiliary_loss_mlp": 0.0104191, "balance_loss_clip": 1.02262211, "balance_loss_mlp": 1.04920077, "epoch": 0.18337592063730648, "flos": 19607872901760.0, "grad_norm": 3.416418802007065, "language_loss": 0.77679312, "learning_rate": 3.75826413248424e-06, "loss": 0.79876125, "num_input_tokens_seen": 65905840, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0546875, "step": 3050, "time_per_iteration": 2.467433214187622 }, { "auxiliary_loss_clip": 0.01143942, "auxiliary_loss_mlp": 0.01045409, "balance_loss_clip": 1.02683592, "balance_loss_mlp": 1.04553795, "epoch": 0.18343604388997445, "flos": 20850885002880.0, "grad_norm": 2.1928298517648934, "language_loss": 0.99540472, "learning_rate": 3.7580784901895035e-06, "loss": 1.01729822, "num_input_tokens_seen": 65922845, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.984375, "step": 3051, "time_per_iteration": 2.475233316421509 }, { "auxiliary_loss_clip": 0.01145453, "auxiliary_loss_mlp": 0.01038393, "balance_loss_clip": 1.02007031, "balance_loss_mlp": 1.04886067, "epoch": 0.1834961671426424, "flos": 24394370624640.0, "grad_norm": 1.6120209387464723, "language_loss": 0.86293966, "learning_rate": 3.7578927812282542e-06, "loss": 0.88477814, "num_input_tokens_seen": 65945555, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.96484375, "step": 3052, "time_per_iteration": 4.023207187652588 }, { "auxiliary_loss_clip": 0.01143761, "auxiliary_loss_mlp": 0.01045555, "balance_loss_clip": 1.02770972, "balance_loss_mlp": 1.04672909, "epoch": 0.18355629039531038, "flos": 21251612108160.0, "grad_norm": 1.7433669215609613, "language_loss": 0.73314136, "learning_rate": 3.7577070056075356e-06, "loss": 0.75503451, "num_input_tokens_seen": 65963965, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.96875, "step": 3053, "time_per_iteration": 2.464771270751953 }, { "auxiliary_loss_clip": 0.01150959, "auxiliary_loss_mlp": 0.01053354, "balance_loss_clip": 1.03342187, "balance_loss_mlp": 1.04984844, "epoch": 0.18361641364797834, "flos": 28656499651200.0, "grad_norm": 1.7191055530648234, "language_loss": 0.6167053, "learning_rate": 3.7575211633343902e-06, "loss": 0.63874841, "num_input_tokens_seen": 65985965, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0078125, "step": 3054, "time_per_iteration": 3.8861782550811768 }, { "auxiliary_loss_clip": 0.01148233, "auxiliary_loss_mlp": 0.01042105, "balance_loss_clip": 1.0243783, "balance_loss_mlp": 1.04891074, "epoch": 0.18367653690064634, "flos": 20918827578240.0, "grad_norm": 2.1230059875026046, "language_loss": 0.784455, "learning_rate": 3.7573352544158663e-06, "loss": 0.80635834, "num_input_tokens_seen": 66005645, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.99609375, "step": 3055, "time_per_iteration": 2.4929752349853516 }, { "auxiliary_loss_clip": 0.01144004, "auxiliary_loss_mlp": 0.01056033, "balance_loss_clip": 1.03772211, "balance_loss_mlp": 1.04754615, "epoch": 0.1837366601533143, "flos": 28765596234240.0, "grad_norm": 1.9467430993353538, "language_loss": 0.70343041, "learning_rate": 3.757149278859014e-06, "loss": 0.72543073, "num_input_tokens_seen": 66025675, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.96484375, "step": 3056, "time_per_iteration": 5.345003366470337 }, { "auxiliary_loss_clip": 0.01148058, "auxiliary_loss_mlp": 0.01045038, "balance_loss_clip": 1.02632236, "balance_loss_mlp": 1.04801822, "epoch": 0.18379678340598227, "flos": 21251432540160.0, "grad_norm": 1.5494509791906843, "language_loss": 0.8065207, "learning_rate": 3.7569632366708842e-06, "loss": 0.82845169, "num_input_tokens_seen": 66046125, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0, "step": 3057, "time_per_iteration": 2.452690601348877 }, { "auxiliary_loss_clip": 0.01154633, "auxiliary_loss_mlp": 0.01049574, "balance_loss_clip": 1.02763975, "balance_loss_mlp": 1.048015, "epoch": 0.18385690665865023, "flos": 20449619193600.0, "grad_norm": 2.0645953206432806, "language_loss": 0.8271032, "learning_rate": 3.756777127858533e-06, "loss": 0.84914523, "num_input_tokens_seen": 66064375, "router_z_loss_clip": 0.21875, "router_z_loss_mlp": 1.0625, "step": 3058, "time_per_iteration": 2.470489501953125 }, { "auxiliary_loss_clip": 0.01149934, "auxiliary_loss_mlp": 0.01052805, "balance_loss_clip": 1.03357649, "balance_loss_mlp": 1.04830182, "epoch": 0.1839170299113182, "flos": 26140562398080.0, "grad_norm": 2.0431582787140243, "language_loss": 0.8570832, "learning_rate": 3.756590952429017e-06, "loss": 0.87911057, "num_input_tokens_seen": 66084590, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.015625, "step": 3059, "time_per_iteration": 2.4919393062591553 }, { "auxiliary_loss_clip": 0.01146399, "auxiliary_loss_mlp": 0.01043285, "balance_loss_clip": 1.02542698, "balance_loss_mlp": 1.04769945, "epoch": 0.18397715316398616, "flos": 31758032332800.0, "grad_norm": 1.7628543772442022, "language_loss": 0.72780669, "learning_rate": 3.756404710389396e-06, "loss": 0.74970353, "num_input_tokens_seen": 66107105, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.98828125, "step": 3060, "time_per_iteration": 2.560713529586792 }, { "auxiliary_loss_clip": 0.01150682, "auxiliary_loss_mlp": 0.01043936, "balance_loss_clip": 1.02457643, "balance_loss_mlp": 1.05038023, "epoch": 0.18403727641665413, "flos": 24611989173120.0, "grad_norm": 1.5699795843335458, "language_loss": 0.72858632, "learning_rate": 3.7562184017467323e-06, "loss": 0.75053245, "num_input_tokens_seen": 66129295, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0, "step": 3061, "time_per_iteration": 2.5312225818634033 }, { "auxiliary_loss_clip": 0.01149457, "auxiliary_loss_mlp": 0.01049576, "balance_loss_clip": 1.0302639, "balance_loss_mlp": 1.05060363, "epoch": 0.18409739966932212, "flos": 23439900476160.0, "grad_norm": 1.6050716234594677, "language_loss": 0.81611705, "learning_rate": 3.7560320265080906e-06, "loss": 0.83810735, "num_input_tokens_seen": 66146910, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.98828125, "step": 3062, "time_per_iteration": 2.5131757259368896 }, { "auxiliary_loss_clip": 0.01154096, "auxiliary_loss_mlp": 0.01043396, "balance_loss_clip": 1.0246923, "balance_loss_mlp": 1.05087423, "epoch": 0.18415752292199009, "flos": 21872112577920.0, "grad_norm": 1.9621314411319142, "language_loss": 0.73587406, "learning_rate": 3.7558455846805383e-06, "loss": 0.75784898, "num_input_tokens_seen": 66165370, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.03125, "step": 3063, "time_per_iteration": 2.473113536834717 }, { "auxiliary_loss_clip": 0.01146556, "auxiliary_loss_mlp": 0.01045281, "balance_loss_clip": 1.02840114, "balance_loss_mlp": 1.04707336, "epoch": 0.18421764617465805, "flos": 25410678036480.0, "grad_norm": 1.7538008191795482, "language_loss": 0.66069829, "learning_rate": 3.7556590762711463e-06, "loss": 0.68261665, "num_input_tokens_seen": 66186210, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.99609375, "step": 3064, "time_per_iteration": 2.537590503692627 }, { "auxiliary_loss_clip": 0.01151779, "auxiliary_loss_mlp": 0.01045385, "balance_loss_clip": 1.0263592, "balance_loss_mlp": 1.05164361, "epoch": 0.18427776942732602, "flos": 27198131558400.0, "grad_norm": 1.7125805257032753, "language_loss": 0.686046, "learning_rate": 3.7554725012869853e-06, "loss": 0.70801771, "num_input_tokens_seen": 66204800, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0, "step": 3065, "time_per_iteration": 2.528850555419922 }, { "auxiliary_loss_clip": 0.01153647, "auxiliary_loss_mlp": 0.01045008, "balance_loss_clip": 1.0261848, "balance_loss_mlp": 1.05251241, "epoch": 0.18433789267999398, "flos": 27852351920640.0, "grad_norm": 2.5327202670395574, "language_loss": 0.72434157, "learning_rate": 3.7552858597351318e-06, "loss": 0.74632812, "num_input_tokens_seen": 66222195, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.015625, "step": 3066, "time_per_iteration": 2.5374269485473633 }, { "auxiliary_loss_clip": 0.01149552, "auxiliary_loss_mlp": 0.01042877, "balance_loss_clip": 1.02485251, "balance_loss_mlp": 1.04977226, "epoch": 0.18439801593266195, "flos": 17856940533120.0, "grad_norm": 2.1176890401709434, "language_loss": 0.82001019, "learning_rate": 3.7550991516226622e-06, "loss": 0.8419345, "num_input_tokens_seen": 66239505, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.99609375, "step": 3067, "time_per_iteration": 2.436671257019043 }, { "auxiliary_loss_clip": 0.01057278, "auxiliary_loss_mlp": 0.01017833, "balance_loss_clip": 1.01482856, "balance_loss_mlp": 1.02261066, "epoch": 0.18445813918532994, "flos": 56389522590720.0, "grad_norm": 1.0331637440949066, "language_loss": 0.5977875, "learning_rate": 3.754912376956657e-06, "loss": 0.61853862, "num_input_tokens_seen": 66295695, "router_z_loss_clip": 0.0300293, "router_z_loss_mlp": 0.34765625, "step": 3068, "time_per_iteration": 2.9521379470825195 }, { "auxiliary_loss_clip": 0.01151477, "auxiliary_loss_mlp": 0.01046462, "balance_loss_clip": 1.02823436, "balance_loss_mlp": 1.05300331, "epoch": 0.1845182624379979, "flos": 20957180325120.0, "grad_norm": 1.6319656394276747, "language_loss": 0.7604804, "learning_rate": 3.7547255357441987e-06, "loss": 0.78245986, "num_input_tokens_seen": 66315315, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.984375, "step": 3069, "time_per_iteration": 2.4742488861083984 }, { "auxiliary_loss_clip": 0.01149006, "auxiliary_loss_mlp": 0.01042466, "balance_loss_clip": 1.02398825, "balance_loss_mlp": 1.04938769, "epoch": 0.18457838569066587, "flos": 20485170679680.0, "grad_norm": 1.7086325036791818, "language_loss": 0.84922028, "learning_rate": 3.7545386279923718e-06, "loss": 0.871135, "num_input_tokens_seen": 66333675, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.99609375, "step": 3070, "time_per_iteration": 2.476768732070923 }, { "auxiliary_loss_clip": 0.01153664, "auxiliary_loss_mlp": 0.01045193, "balance_loss_clip": 1.02613187, "balance_loss_mlp": 1.05236161, "epoch": 0.18463850894333383, "flos": 25010022758400.0, "grad_norm": 3.10391819336588, "language_loss": 0.77655929, "learning_rate": 3.754351653708265e-06, "loss": 0.79854786, "num_input_tokens_seen": 66354075, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.015625, "step": 3071, "time_per_iteration": 2.4964075088500977 }, { "auxiliary_loss_clip": 0.01157518, "auxiliary_loss_mlp": 0.01051125, "balance_loss_clip": 1.03176522, "balance_loss_mlp": 1.05506456, "epoch": 0.1846986321960018, "flos": 16800628348800.0, "grad_norm": 2.3626693506757546, "language_loss": 0.7726804, "learning_rate": 3.7541646128989674e-06, "loss": 0.79476678, "num_input_tokens_seen": 66372520, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0234375, "step": 3072, "time_per_iteration": 2.455862283706665 }, { "auxiliary_loss_clip": 0.01153021, "auxiliary_loss_mlp": 0.01044082, "balance_loss_clip": 1.02424526, "balance_loss_mlp": 1.0497967, "epoch": 0.18475875544866976, "flos": 20814327936000.0, "grad_norm": 1.905286602687363, "language_loss": 0.86237514, "learning_rate": 3.7539775055715715e-06, "loss": 0.88434613, "num_input_tokens_seen": 66390745, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.03125, "step": 3073, "time_per_iteration": 2.4846999645233154 }, { "auxiliary_loss_clip": 0.0115374, "auxiliary_loss_mlp": 0.01044841, "balance_loss_clip": 1.02763927, "balance_loss_mlp": 1.05388927, "epoch": 0.18481887870133773, "flos": 22601422321920.0, "grad_norm": 2.267005652581109, "language_loss": 0.92122602, "learning_rate": 3.7537903317331732e-06, "loss": 0.94321185, "num_input_tokens_seen": 66410525, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.99609375, "step": 3074, "time_per_iteration": 2.4856925010681152 }, { "auxiliary_loss_clip": 0.01150268, "auxiliary_loss_mlp": 0.01046476, "balance_loss_clip": 1.02603137, "balance_loss_mlp": 1.05156696, "epoch": 0.18487900195400572, "flos": 29458815788160.0, "grad_norm": 1.7931726770061527, "language_loss": 0.64313102, "learning_rate": 3.75360309139087e-06, "loss": 0.66509843, "num_input_tokens_seen": 66432535, "router_z_loss_clip": 0.20507812, "router_z_loss_mlp": 0.98828125, "step": 3075, "time_per_iteration": 2.531700849533081 }, { "auxiliary_loss_clip": 0.01151594, "auxiliary_loss_mlp": 0.01049241, "balance_loss_clip": 1.03067982, "balance_loss_mlp": 1.05293536, "epoch": 0.1849391252066737, "flos": 20628777254400.0, "grad_norm": 1.923896754667692, "language_loss": 0.72596443, "learning_rate": 3.753415784551761e-06, "loss": 0.74797279, "num_input_tokens_seen": 66450620, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.984375, "step": 3076, "time_per_iteration": 2.490614891052246 }, { "auxiliary_loss_clip": 0.01153885, "auxiliary_loss_mlp": 0.01048748, "balance_loss_clip": 1.03065169, "balance_loss_mlp": 1.05217266, "epoch": 0.18499924845934165, "flos": 14428549065600.0, "grad_norm": 3.508598197138216, "language_loss": 0.80882168, "learning_rate": 3.7532284112229507e-06, "loss": 0.83084798, "num_input_tokens_seen": 66467865, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.015625, "step": 3077, "time_per_iteration": 2.4244306087493896 }, { "auxiliary_loss_clip": 0.01149983, "auxiliary_loss_mlp": 0.01043694, "balance_loss_clip": 1.02587175, "balance_loss_mlp": 1.05248475, "epoch": 0.18505937171200962, "flos": 23727652329600.0, "grad_norm": 1.9616549273707267, "language_loss": 0.7906003, "learning_rate": 3.7530409714115424e-06, "loss": 0.81253707, "num_input_tokens_seen": 66486245, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9765625, "step": 3078, "time_per_iteration": 2.5170419216156006 }, { "auxiliary_loss_clip": 0.01154265, "auxiliary_loss_mlp": 0.01043699, "balance_loss_clip": 1.0266875, "balance_loss_mlp": 1.05468845, "epoch": 0.18511949496467758, "flos": 25957489754880.0, "grad_norm": 2.3881315567587817, "language_loss": 0.77554017, "learning_rate": 3.7528534651246453e-06, "loss": 0.7975198, "num_input_tokens_seen": 66506510, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9921875, "step": 3079, "time_per_iteration": 2.5125720500946045 }, { "auxiliary_loss_clip": 0.01147424, "auxiliary_loss_mlp": 0.01036636, "balance_loss_clip": 1.01839733, "balance_loss_mlp": 1.04942179, "epoch": 0.18517961821734555, "flos": 42413553912960.0, "grad_norm": 1.9179264509922966, "language_loss": 0.81820375, "learning_rate": 3.752665892369369e-06, "loss": 0.84004438, "num_input_tokens_seen": 66530960, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.98046875, "step": 3080, "time_per_iteration": 2.6660304069519043 }, { "auxiliary_loss_clip": 0.01152775, "auxiliary_loss_mlp": 0.01042037, "balance_loss_clip": 1.02274895, "balance_loss_mlp": 1.0503875, "epoch": 0.18523974147001354, "flos": 24097568544000.0, "grad_norm": 1.710308051016827, "language_loss": 0.74114132, "learning_rate": 3.7524782531528266e-06, "loss": 0.76308948, "num_input_tokens_seen": 66550275, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0234375, "step": 3081, "time_per_iteration": 2.481168508529663 }, { "auxiliary_loss_clip": 0.01153705, "auxiliary_loss_mlp": 0.01050927, "balance_loss_clip": 1.03132892, "balance_loss_mlp": 1.0537684, "epoch": 0.1852998647226815, "flos": 27375278457600.0, "grad_norm": 1.9434422098230817, "language_loss": 0.7200399, "learning_rate": 3.7522905474821334e-06, "loss": 0.74208617, "num_input_tokens_seen": 66569040, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0, "step": 3082, "time_per_iteration": 2.51120924949646 }, { "auxiliary_loss_clip": 0.01155589, "auxiliary_loss_mlp": 0.01047523, "balance_loss_clip": 1.02840209, "balance_loss_mlp": 1.05424857, "epoch": 0.18535998797534947, "flos": 18332757020160.0, "grad_norm": 2.281090178907325, "language_loss": 0.70112681, "learning_rate": 3.752102775364407e-06, "loss": 0.72315788, "num_input_tokens_seen": 66587775, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.015625, "step": 3083, "time_per_iteration": 2.4558258056640625 }, { "auxiliary_loss_clip": 0.01151921, "auxiliary_loss_mlp": 0.01047294, "balance_loss_clip": 1.02905476, "balance_loss_mlp": 1.05493426, "epoch": 0.18542011122801744, "flos": 37845859887360.0, "grad_norm": 2.46683852040148, "language_loss": 0.68677926, "learning_rate": 3.751914936806767e-06, "loss": 0.70877141, "num_input_tokens_seen": 66610800, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.96875, "step": 3084, "time_per_iteration": 2.635845422744751 }, { "auxiliary_loss_clip": 0.0114998, "auxiliary_loss_mlp": 0.01036163, "balance_loss_clip": 1.01853216, "balance_loss_mlp": 1.05160332, "epoch": 0.1854802344806854, "flos": 25186128163200.0, "grad_norm": 1.6418488729688052, "language_loss": 0.78098398, "learning_rate": 3.7517270318163377e-06, "loss": 0.80284548, "num_input_tokens_seen": 66630960, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.984375, "step": 3085, "time_per_iteration": 2.5128519535064697 }, { "auxiliary_loss_clip": 0.01147868, "auxiliary_loss_mlp": 0.01053097, "balance_loss_clip": 1.03525186, "balance_loss_mlp": 1.05007839, "epoch": 0.18554035773335337, "flos": 26684788337280.0, "grad_norm": 5.2036396267265825, "language_loss": 0.73526061, "learning_rate": 3.751539060400244e-06, "loss": 0.75727022, "num_input_tokens_seen": 66650585, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.9765625, "step": 3086, "time_per_iteration": 2.5280518531799316 }, { "auxiliary_loss_clip": 0.01152688, "auxiliary_loss_mlp": 0.01052393, "balance_loss_clip": 1.03378487, "balance_loss_mlp": 1.05451655, "epoch": 0.18560048098602133, "flos": 22346887570560.0, "grad_norm": 2.439295567087923, "language_loss": 0.70091796, "learning_rate": 3.7513510225656132e-06, "loss": 0.72296882, "num_input_tokens_seen": 66670045, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.984375, "step": 3087, "time_per_iteration": 2.4795238971710205 }, { "auxiliary_loss_clip": 0.01154392, "auxiliary_loss_mlp": 0.01047239, "balance_loss_clip": 1.02754605, "balance_loss_mlp": 1.05337083, "epoch": 0.18566060423868933, "flos": 17748526308480.0, "grad_norm": 1.9902011835586397, "language_loss": 0.72657716, "learning_rate": 3.7511629183195764e-06, "loss": 0.74859351, "num_input_tokens_seen": 66688790, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0078125, "step": 3088, "time_per_iteration": 2.475595712661743 }, { "auxiliary_loss_clip": 0.01148145, "auxiliary_loss_mlp": 0.0104509, "balance_loss_clip": 1.02688634, "balance_loss_mlp": 1.05055285, "epoch": 0.1857207274913573, "flos": 24677274142080.0, "grad_norm": 1.8960180866354317, "language_loss": 0.91585422, "learning_rate": 3.7509747476692663e-06, "loss": 0.93778658, "num_input_tokens_seen": 66708090, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.9765625, "step": 3089, "time_per_iteration": 2.4941043853759766 }, { "auxiliary_loss_clip": 0.01150493, "auxiliary_loss_mlp": 0.01043587, "balance_loss_clip": 1.02550244, "balance_loss_mlp": 1.05334306, "epoch": 0.18578085074402526, "flos": 28147825198080.0, "grad_norm": 2.403908765356713, "language_loss": 0.58004856, "learning_rate": 3.7507865106218176e-06, "loss": 0.60198939, "num_input_tokens_seen": 66727320, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.97265625, "step": 3090, "time_per_iteration": 2.5752341747283936 }, { "auxiliary_loss_clip": 0.01149931, "auxiliary_loss_mlp": 0.01043263, "balance_loss_clip": 1.02569151, "balance_loss_mlp": 1.05220473, "epoch": 0.18584097399669322, "flos": 23951878980480.0, "grad_norm": 1.7073681888312984, "language_loss": 0.81580234, "learning_rate": 3.7505982071843695e-06, "loss": 0.83773428, "num_input_tokens_seen": 66747505, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.98046875, "step": 3091, "time_per_iteration": 2.4873220920562744 }, { "auxiliary_loss_clip": 0.01153237, "auxiliary_loss_mlp": 0.01052663, "balance_loss_clip": 1.03344607, "balance_loss_mlp": 1.05265975, "epoch": 0.18590109724936119, "flos": 17201678676480.0, "grad_norm": 2.293487773999821, "language_loss": 0.84631544, "learning_rate": 3.7504098373640617e-06, "loss": 0.86837447, "num_input_tokens_seen": 66766425, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0078125, "step": 3092, "time_per_iteration": 2.442911148071289 }, { "auxiliary_loss_clip": 0.01153582, "auxiliary_loss_mlp": 0.01047334, "balance_loss_clip": 1.02851057, "balance_loss_mlp": 1.05029178, "epoch": 0.18596122050202915, "flos": 17234644383360.0, "grad_norm": 2.1558961724504773, "language_loss": 0.93236613, "learning_rate": 3.750221401168038e-06, "loss": 0.95437527, "num_input_tokens_seen": 66781130, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.03125, "step": 3093, "time_per_iteration": 3.924614191055298 }, { "auxiliary_loss_clip": 0.01154744, "auxiliary_loss_mlp": 0.01040665, "balance_loss_clip": 1.02281952, "balance_loss_mlp": 1.05543971, "epoch": 0.18602134375469712, "flos": 19020733188480.0, "grad_norm": 1.7432419813543247, "language_loss": 0.77152628, "learning_rate": 3.750032898603443e-06, "loss": 0.79348034, "num_input_tokens_seen": 66797535, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9921875, "step": 3094, "time_per_iteration": 2.4336225986480713 }, { "auxiliary_loss_clip": 0.0115221, "auxiliary_loss_mlp": 0.01046739, "balance_loss_clip": 1.0295732, "balance_loss_mlp": 1.05301642, "epoch": 0.1860814670073651, "flos": 50950094417280.0, "grad_norm": 1.5917609956787513, "language_loss": 0.69838983, "learning_rate": 3.749844329677425e-06, "loss": 0.72037935, "num_input_tokens_seen": 66821720, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9921875, "step": 3095, "time_per_iteration": 4.100503444671631 }, { "auxiliary_loss_clip": 0.0115444, "auxiliary_loss_mlp": 0.01046759, "balance_loss_clip": 1.02765, "balance_loss_mlp": 1.05353856, "epoch": 0.18614159026003307, "flos": 19390972625280.0, "grad_norm": 2.21679231535304, "language_loss": 0.80802464, "learning_rate": 3.749655694397135e-06, "loss": 0.83003658, "num_input_tokens_seen": 66839060, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0078125, "step": 3096, "time_per_iteration": 2.46443510055542 }, { "auxiliary_loss_clip": 0.01151789, "auxiliary_loss_mlp": 0.01046745, "balance_loss_clip": 1.02768373, "balance_loss_mlp": 1.0513438, "epoch": 0.18620171351270104, "flos": 21798782962560.0, "grad_norm": 2.410609613657071, "language_loss": 0.74931031, "learning_rate": 3.7494669927697255e-06, "loss": 0.77129567, "num_input_tokens_seen": 66857760, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0, "step": 3097, "time_per_iteration": 3.926905393600464 }, { "auxiliary_loss_clip": 0.01152766, "auxiliary_loss_mlp": 0.01049378, "balance_loss_clip": 1.03099549, "balance_loss_mlp": 1.05471134, "epoch": 0.186261836765369, "flos": 16362877299840.0, "grad_norm": 2.360048342700289, "language_loss": 0.65554249, "learning_rate": 3.749278224802352e-06, "loss": 0.67756391, "num_input_tokens_seen": 66876460, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9765625, "step": 3098, "time_per_iteration": 3.893728733062744 }, { "auxiliary_loss_clip": 0.01154316, "auxiliary_loss_mlp": 0.01051904, "balance_loss_clip": 1.03128028, "balance_loss_mlp": 1.05308342, "epoch": 0.18632196001803697, "flos": 23370054480000.0, "grad_norm": 1.5641617323877801, "language_loss": 0.69848347, "learning_rate": 3.7490893905021733e-06, "loss": 0.72054559, "num_input_tokens_seen": 66897960, "router_z_loss_clip": 0.20605469, "router_z_loss_mlp": 1.015625, "step": 3099, "time_per_iteration": 2.5214531421661377 }, { "auxiliary_loss_clip": 0.01152468, "auxiliary_loss_mlp": 0.01053764, "balance_loss_clip": 1.03420162, "balance_loss_mlp": 1.0513835, "epoch": 0.18638208327070493, "flos": 22492002516480.0, "grad_norm": 1.5976978904025578, "language_loss": 0.71828926, "learning_rate": 3.7489004898763494e-06, "loss": 0.74035162, "num_input_tokens_seen": 66917675, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.015625, "step": 3100, "time_per_iteration": 2.5045385360717773 }, { "auxiliary_loss_clip": 0.01152677, "auxiliary_loss_mlp": 0.01053819, "balance_loss_clip": 1.03474498, "balance_loss_mlp": 1.0517652, "epoch": 0.18644220652337293, "flos": 29165245931520.0, "grad_norm": 1.751133525373335, "language_loss": 0.80100483, "learning_rate": 3.7487115229320444e-06, "loss": 0.82306981, "num_input_tokens_seen": 66936000, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0078125, "step": 3101, "time_per_iteration": 2.5751442909240723 }, { "auxiliary_loss_clip": 0.0114815, "auxiliary_loss_mlp": 0.01044816, "balance_loss_clip": 1.02698255, "balance_loss_mlp": 1.05174029, "epoch": 0.1865023297760409, "flos": 24243796811520.0, "grad_norm": 1.7726186003353377, "language_loss": 0.76532215, "learning_rate": 3.7485224896764222e-06, "loss": 0.78725183, "num_input_tokens_seen": 66955700, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9609375, "step": 3102, "time_per_iteration": 2.5086748600006104 }, { "auxiliary_loss_clip": 0.01153106, "auxiliary_loss_mlp": 0.01038526, "balance_loss_clip": 1.01990509, "balance_loss_mlp": 1.05187595, "epoch": 0.18656245302870886, "flos": 19128716449920.0, "grad_norm": 2.1432823025551473, "language_loss": 0.76644146, "learning_rate": 3.7483333901166525e-06, "loss": 0.78835779, "num_input_tokens_seen": 66972815, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.015625, "step": 3103, "time_per_iteration": 2.478790044784546 }, { "auxiliary_loss_clip": 0.01151308, "auxiliary_loss_mlp": 0.01046027, "balance_loss_clip": 1.0275023, "balance_loss_mlp": 1.05140996, "epoch": 0.18662257628137682, "flos": 17786088956160.0, "grad_norm": 2.460155948912239, "language_loss": 0.78981423, "learning_rate": 3.7481442242599054e-06, "loss": 0.81178761, "num_input_tokens_seen": 66992280, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0, "step": 3104, "time_per_iteration": 2.4353342056274414 }, { "auxiliary_loss_clip": 0.01152175, "auxiliary_loss_mlp": 0.01043574, "balance_loss_clip": 1.0261457, "balance_loss_mlp": 1.05450583, "epoch": 0.1866826995340448, "flos": 24024382583040.0, "grad_norm": 1.92220370815928, "language_loss": 0.84992188, "learning_rate": 3.747954992113354e-06, "loss": 0.87187934, "num_input_tokens_seen": 67012220, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9765625, "step": 3105, "time_per_iteration": 2.513829469680786 }, { "auxiliary_loss_clip": 0.01156408, "auxiliary_loss_mlp": 0.01047191, "balance_loss_clip": 1.02699733, "balance_loss_mlp": 1.05084872, "epoch": 0.18674282278671275, "flos": 26141244756480.0, "grad_norm": 1.8116797969652476, "language_loss": 0.87101877, "learning_rate": 3.7477656936841742e-06, "loss": 0.89305478, "num_input_tokens_seen": 67032030, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.0546875, "step": 3106, "time_per_iteration": 2.5709431171417236 }, { "auxiliary_loss_clip": 0.01157486, "auxiliary_loss_mlp": 0.01047778, "balance_loss_clip": 1.02921724, "balance_loss_mlp": 1.05307651, "epoch": 0.18680294603938072, "flos": 19201938324480.0, "grad_norm": 1.8865859943575345, "language_loss": 0.77930021, "learning_rate": 3.7475763289795445e-06, "loss": 0.80135286, "num_input_tokens_seen": 67048920, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.046875, "step": 3107, "time_per_iteration": 2.469219446182251 }, { "auxiliary_loss_clip": 0.01153181, "auxiliary_loss_mlp": 0.01053033, "balance_loss_clip": 1.03348255, "balance_loss_mlp": 1.05195022, "epoch": 0.1868630692920487, "flos": 28544889116160.0, "grad_norm": 1.9507703195092292, "language_loss": 0.73955727, "learning_rate": 3.7473868980066446e-06, "loss": 0.76161945, "num_input_tokens_seen": 67068645, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0078125, "step": 3108, "time_per_iteration": 2.5101871490478516 }, { "auxiliary_loss_clip": 0.01152751, "auxiliary_loss_mlp": 0.01047109, "balance_loss_clip": 1.02762985, "balance_loss_mlp": 1.05220366, "epoch": 0.18692319254471668, "flos": 17238020261760.0, "grad_norm": 2.152650690023936, "language_loss": 0.7431578, "learning_rate": 3.747197400772658e-06, "loss": 0.76515639, "num_input_tokens_seen": 67087075, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0, "step": 3109, "time_per_iteration": 2.46213436126709 }, { "auxiliary_loss_clip": 0.01152811, "auxiliary_loss_mlp": 0.01049098, "balance_loss_clip": 1.02983332, "balance_loss_mlp": 1.05260634, "epoch": 0.18698331579738464, "flos": 23185186156800.0, "grad_norm": 1.5137540432757097, "language_loss": 0.84387285, "learning_rate": 3.747007837284772e-06, "loss": 0.86589193, "num_input_tokens_seen": 67108040, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0, "step": 3110, "time_per_iteration": 2.4795708656311035 }, { "auxiliary_loss_clip": 0.01160109, "auxiliary_loss_mlp": 0.01047061, "balance_loss_clip": 1.0279038, "balance_loss_mlp": 1.05856824, "epoch": 0.1870434390500526, "flos": 25516721963520.0, "grad_norm": 1.6464151197411514, "language_loss": 0.8445121, "learning_rate": 3.7468182075501737e-06, "loss": 0.86658382, "num_input_tokens_seen": 67127605, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.015625, "step": 3111, "time_per_iteration": 2.5129995346069336 }, { "auxiliary_loss_clip": 0.01151524, "auxiliary_loss_mlp": 0.01040871, "balance_loss_clip": 1.02322841, "balance_loss_mlp": 1.05303967, "epoch": 0.18710356230272057, "flos": 19500823393920.0, "grad_norm": 1.8179806223601065, "language_loss": 0.76737678, "learning_rate": 3.7466285115760536e-06, "loss": 0.78930074, "num_input_tokens_seen": 67145785, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.984375, "step": 3112, "time_per_iteration": 2.4636878967285156 }, { "auxiliary_loss_clip": 0.01155368, "auxiliary_loss_mlp": 0.01047047, "balance_loss_clip": 1.02896309, "balance_loss_mlp": 1.05413795, "epoch": 0.18716368555538854, "flos": 26760847386240.0, "grad_norm": 1.8090461063500975, "language_loss": 0.64284563, "learning_rate": 3.7464387493696046e-06, "loss": 0.66486979, "num_input_tokens_seen": 67165930, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.015625, "step": 3113, "time_per_iteration": 2.54286789894104 }, { "auxiliary_loss_clip": 0.01158975, "auxiliary_loss_mlp": 0.01042831, "balance_loss_clip": 1.0237937, "balance_loss_mlp": 1.05253887, "epoch": 0.1872238088080565, "flos": 25189827264000.0, "grad_norm": 2.822479721805275, "language_loss": 0.81575179, "learning_rate": 3.746248920938024e-06, "loss": 0.83776987, "num_input_tokens_seen": 67185830, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0625, "step": 3114, "time_per_iteration": 2.515132188796997 }, { "auxiliary_loss_clip": 0.01152442, "auxiliary_loss_mlp": 0.010463, "balance_loss_clip": 1.02627325, "balance_loss_mlp": 1.051157, "epoch": 0.1872839320607245, "flos": 24134305178880.0, "grad_norm": 2.1273755305467135, "language_loss": 0.57731068, "learning_rate": 3.74605902628851e-06, "loss": 0.59929812, "num_input_tokens_seen": 67206930, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 1.015625, "step": 3115, "time_per_iteration": 2.5168004035949707 }, { "auxiliary_loss_clip": 0.01153322, "auxiliary_loss_mlp": 0.01050518, "balance_loss_clip": 1.03285086, "balance_loss_mlp": 1.05486941, "epoch": 0.18734405531339246, "flos": 21173793292800.0, "grad_norm": 1.9353769287918718, "language_loss": 0.71518385, "learning_rate": 3.745869065428261e-06, "loss": 0.73722231, "num_input_tokens_seen": 67226290, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.984375, "step": 3116, "time_per_iteration": 2.4633114337921143 }, { "auxiliary_loss_clip": 0.01144569, "auxiliary_loss_mlp": 0.01036107, "balance_loss_clip": 1.01828492, "balance_loss_mlp": 1.04781985, "epoch": 0.18740417856606043, "flos": 17237697039360.0, "grad_norm": 2.125579466231959, "language_loss": 0.78677988, "learning_rate": 3.7456790383644833e-06, "loss": 0.8085866, "num_input_tokens_seen": 67244410, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.96875, "step": 3117, "time_per_iteration": 2.4549829959869385 }, { "auxiliary_loss_clip": 0.01150966, "auxiliary_loss_mlp": 0.01044136, "balance_loss_clip": 1.02547979, "balance_loss_mlp": 1.05227172, "epoch": 0.1874643018187284, "flos": 32558049999360.0, "grad_norm": 1.7655467444074753, "language_loss": 0.84176958, "learning_rate": 3.745488945104381e-06, "loss": 0.86372066, "num_input_tokens_seen": 67264470, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.98828125, "step": 3118, "time_per_iteration": 2.5534918308258057 }, { "auxiliary_loss_clip": 0.01151841, "auxiliary_loss_mlp": 0.01048078, "balance_loss_clip": 1.03056622, "balance_loss_mlp": 1.05255294, "epoch": 0.18752442507139636, "flos": 23258156636160.0, "grad_norm": 2.1348499751656074, "language_loss": 0.76614839, "learning_rate": 3.7452987856551636e-06, "loss": 0.78814757, "num_input_tokens_seen": 67284315, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9921875, "step": 3119, "time_per_iteration": 2.4925501346588135 }, { "auxiliary_loss_clip": 0.01149753, "auxiliary_loss_mlp": 0.01051659, "balance_loss_clip": 1.03344393, "balance_loss_mlp": 1.04958797, "epoch": 0.18758454832406432, "flos": 21760933006080.0, "grad_norm": 1.7664021969500743, "language_loss": 0.81714237, "learning_rate": 3.7451085600240406e-06, "loss": 0.83915651, "num_input_tokens_seen": 67302780, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 1.0, "step": 3120, "time_per_iteration": 2.4635581970214844 }, { "auxiliary_loss_clip": 0.01146883, "auxiliary_loss_mlp": 0.01040897, "balance_loss_clip": 1.0238384, "balance_loss_mlp": 1.04979205, "epoch": 0.1876446715767323, "flos": 29570210841600.0, "grad_norm": 2.0977645993640563, "language_loss": 0.84830511, "learning_rate": 3.7449182682182263e-06, "loss": 0.87018293, "num_input_tokens_seen": 67323405, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.96875, "step": 3121, "time_per_iteration": 2.5450336933135986 }, { "auxiliary_loss_clip": 0.01150623, "auxiliary_loss_mlp": 0.01043454, "balance_loss_clip": 1.02591825, "balance_loss_mlp": 1.05224073, "epoch": 0.18770479482940028, "flos": 30339992234880.0, "grad_norm": 1.9106868850624539, "language_loss": 0.70388734, "learning_rate": 3.744727910244937e-06, "loss": 0.72582817, "num_input_tokens_seen": 67345800, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.984375, "step": 3122, "time_per_iteration": 2.555929660797119 }, { "auxiliary_loss_clip": 0.01146203, "auxiliary_loss_mlp": 0.01040464, "balance_loss_clip": 1.02079439, "balance_loss_mlp": 1.04885936, "epoch": 0.18776491808206824, "flos": 14465357527680.0, "grad_norm": 2.0610242069453806, "language_loss": 0.70704651, "learning_rate": 3.7445374861113905e-06, "loss": 0.72891319, "num_input_tokens_seen": 67363575, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.97265625, "step": 3123, "time_per_iteration": 2.447218179702759 }, { "auxiliary_loss_clip": 0.01147335, "auxiliary_loss_mlp": 0.01044419, "balance_loss_clip": 1.02701473, "balance_loss_mlp": 1.05024123, "epoch": 0.1878250413347362, "flos": 24498547044480.0, "grad_norm": 1.8373579610920805, "language_loss": 0.73899716, "learning_rate": 3.7443469958248066e-06, "loss": 0.76091474, "num_input_tokens_seen": 67381765, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.96875, "step": 3124, "time_per_iteration": 2.5184199810028076 }, { "auxiliary_loss_clip": 0.01150326, "auxiliary_loss_mlp": 0.01051149, "balance_loss_clip": 1.03118122, "balance_loss_mlp": 1.05169439, "epoch": 0.18788516458740417, "flos": 39786185692800.0, "grad_norm": 1.6593621790802415, "language_loss": 0.805718, "learning_rate": 3.7441564393924106e-06, "loss": 0.8277328, "num_input_tokens_seen": 67405000, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.984375, "step": 3125, "time_per_iteration": 2.6519694328308105 }, { "auxiliary_loss_clip": 0.01068472, "auxiliary_loss_mlp": 0.01008796, "balance_loss_clip": 1.00541043, "balance_loss_mlp": 1.03182387, "epoch": 0.18794528784007214, "flos": 64699250664960.0, "grad_norm": 0.9350958855775658, "language_loss": 0.63599342, "learning_rate": 3.7439658168214273e-06, "loss": 0.65676612, "num_input_tokens_seen": 67467140, "router_z_loss_clip": 0.03393555, "router_z_loss_mlp": 0.3671875, "step": 3126, "time_per_iteration": 3.1358776092529297 }, { "auxiliary_loss_clip": 0.01146996, "auxiliary_loss_mlp": 0.0104188, "balance_loss_clip": 1.0239749, "balance_loss_mlp": 1.05249214, "epoch": 0.1880054110927401, "flos": 28622061486720.0, "grad_norm": 1.5659630628389833, "language_loss": 0.81625009, "learning_rate": 3.7437751281190857e-06, "loss": 0.83813882, "num_input_tokens_seen": 67487980, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.9453125, "step": 3127, "time_per_iteration": 2.559163808822632 }, { "auxiliary_loss_clip": 0.01067416, "auxiliary_loss_mlp": 0.0100548, "balance_loss_clip": 1.00270259, "balance_loss_mlp": 1.03094375, "epoch": 0.1880655343454081, "flos": 64488958490880.0, "grad_norm": 0.7676214681644409, "language_loss": 0.61910951, "learning_rate": 3.7435843732926164e-06, "loss": 0.63983858, "num_input_tokens_seen": 67552500, "router_z_loss_clip": 0.02783203, "router_z_loss_mlp": 0.36523438, "step": 3128, "time_per_iteration": 3.171990394592285 }, { "auxiliary_loss_clip": 0.01154413, "auxiliary_loss_mlp": 0.01040258, "balance_loss_clip": 1.02223349, "balance_loss_mlp": 1.05214572, "epoch": 0.18812565759807606, "flos": 32124464928000.0, "grad_norm": 2.0700950723159557, "language_loss": 0.71073961, "learning_rate": 3.7433935523492536e-06, "loss": 0.73268628, "num_input_tokens_seen": 67573295, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.0234375, "step": 3129, "time_per_iteration": 2.5706887245178223 }, { "auxiliary_loss_clip": 0.01150504, "auxiliary_loss_mlp": 0.01046666, "balance_loss_clip": 1.02798605, "balance_loss_mlp": 1.0513761, "epoch": 0.18818578085074403, "flos": 20624539449600.0, "grad_norm": 1.8057591857563566, "language_loss": 0.85218, "learning_rate": 3.7432026652962314e-06, "loss": 0.87415171, "num_input_tokens_seen": 67590010, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.9921875, "step": 3130, "time_per_iteration": 2.4599008560180664 }, { "auxiliary_loss_clip": 0.01149551, "auxiliary_loss_mlp": 0.01046325, "balance_loss_clip": 1.02737045, "balance_loss_mlp": 1.04948342, "epoch": 0.188245904103412, "flos": 28840506048000.0, "grad_norm": 2.2643723604716706, "language_loss": 0.76882499, "learning_rate": 3.7430117121407897e-06, "loss": 0.79078376, "num_input_tokens_seen": 67611110, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0, "step": 3131, "time_per_iteration": 2.5421576499938965 }, { "auxiliary_loss_clip": 0.01150793, "auxiliary_loss_mlp": 0.01047812, "balance_loss_clip": 1.0284524, "balance_loss_mlp": 1.05407596, "epoch": 0.18830602735607996, "flos": 29420319386880.0, "grad_norm": 1.8846199367728946, "language_loss": 0.81285793, "learning_rate": 3.74282069289017e-06, "loss": 0.83484393, "num_input_tokens_seen": 67631990, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.96484375, "step": 3132, "time_per_iteration": 2.533928871154785 }, { "auxiliary_loss_clip": 0.0115685, "auxiliary_loss_mlp": 0.01048589, "balance_loss_clip": 1.02996814, "balance_loss_mlp": 1.05550933, "epoch": 0.18836615060874792, "flos": 28872933050880.0, "grad_norm": 2.01910711372175, "language_loss": 0.79529423, "learning_rate": 3.742629607551614e-06, "loss": 0.8173486, "num_input_tokens_seen": 67650490, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.015625, "step": 3133, "time_per_iteration": 2.5293681621551514 }, { "auxiliary_loss_clip": 0.01152535, "auxiliary_loss_mlp": 0.0104914, "balance_loss_clip": 1.03060246, "balance_loss_mlp": 1.05256867, "epoch": 0.18842627386141592, "flos": 22601673717120.0, "grad_norm": 2.131161874227553, "language_loss": 0.83038169, "learning_rate": 3.7424384561323698e-06, "loss": 0.8523984, "num_input_tokens_seen": 67668860, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 1.0, "step": 3134, "time_per_iteration": 2.474693775177002 }, { "auxiliary_loss_clip": 0.0114847, "auxiliary_loss_mlp": 0.0104637, "balance_loss_clip": 1.02816641, "balance_loss_mlp": 1.05102158, "epoch": 0.18848639711408388, "flos": 24573600512640.0, "grad_norm": 1.4407064846708555, "language_loss": 0.82912081, "learning_rate": 3.742247238639684e-06, "loss": 0.85106921, "num_input_tokens_seen": 67690220, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9765625, "step": 3135, "time_per_iteration": 4.009828329086304 }, { "auxiliary_loss_clip": 0.01151189, "auxiliary_loss_mlp": 0.01050468, "balance_loss_clip": 1.03234851, "balance_loss_mlp": 1.0501883, "epoch": 0.18854652036675185, "flos": 34166920078080.0, "grad_norm": 1.979347318884326, "language_loss": 0.7840941, "learning_rate": 3.7420559550808083e-06, "loss": 0.80611068, "num_input_tokens_seen": 67709820, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 1.0078125, "step": 3136, "time_per_iteration": 2.582486629486084 }, { "auxiliary_loss_clip": 0.0115096, "auxiliary_loss_mlp": 0.01045726, "balance_loss_clip": 1.02710593, "balance_loss_mlp": 1.0524497, "epoch": 0.1886066436194198, "flos": 24200236592640.0, "grad_norm": 1.9524126023045492, "language_loss": 0.81373942, "learning_rate": 3.741864605462996e-06, "loss": 0.83570623, "num_input_tokens_seen": 67729490, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.984375, "step": 3137, "time_per_iteration": 3.850097417831421 }, { "auxiliary_loss_clip": 0.01152385, "auxiliary_loss_mlp": 0.01052313, "balance_loss_clip": 1.03542125, "balance_loss_mlp": 1.05376983, "epoch": 0.18866676687208778, "flos": 21251109317760.0, "grad_norm": 1.6206905335890842, "language_loss": 0.80927384, "learning_rate": 3.741673189793504e-06, "loss": 0.83132082, "num_input_tokens_seen": 67749665, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.984375, "step": 3138, "time_per_iteration": 2.4799458980560303 }, { "auxiliary_loss_clip": 0.01152592, "auxiliary_loss_mlp": 0.0105697, "balance_loss_clip": 1.0381943, "balance_loss_mlp": 1.05193233, "epoch": 0.18872689012475574, "flos": 37308673013760.0, "grad_norm": 1.8699215926328108, "language_loss": 0.63940114, "learning_rate": 3.7414817080795896e-06, "loss": 0.66149676, "num_input_tokens_seen": 67776230, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 1.0078125, "step": 3139, "time_per_iteration": 4.049190998077393 }, { "auxiliary_loss_clip": 0.01148071, "auxiliary_loss_mlp": 0.01043896, "balance_loss_clip": 1.02433324, "balance_loss_mlp": 1.04923368, "epoch": 0.1887870133774237, "flos": 21652303299840.0, "grad_norm": 2.0520489465338017, "language_loss": 0.71387827, "learning_rate": 3.741290160328514e-06, "loss": 0.735798, "num_input_tokens_seen": 67795080, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.98828125, "step": 3140, "time_per_iteration": 3.913012981414795 }, { "auxiliary_loss_clip": 0.01151518, "auxiliary_loss_mlp": 0.01044899, "balance_loss_clip": 1.02586102, "balance_loss_mlp": 1.05102003, "epoch": 0.1888471366300917, "flos": 15924659374080.0, "grad_norm": 2.30365366529606, "language_loss": 0.87324786, "learning_rate": 3.7410985465475412e-06, "loss": 0.89521205, "num_input_tokens_seen": 67813110, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 1.0, "step": 3141, "time_per_iteration": 2.4305419921875 }, { "auxiliary_loss_clip": 0.01155383, "auxiliary_loss_mlp": 0.01041883, "balance_loss_clip": 1.02291703, "balance_loss_mlp": 1.05205226, "epoch": 0.18890725988275966, "flos": 18551955767040.0, "grad_norm": 1.7922700621115564, "language_loss": 0.77401519, "learning_rate": 3.7409068667439378e-06, "loss": 0.79598784, "num_input_tokens_seen": 67831070, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.03125, "step": 3142, "time_per_iteration": 2.4688639640808105 }, { "auxiliary_loss_clip": 0.01148339, "auxiliary_loss_mlp": 0.0104514, "balance_loss_clip": 1.02864099, "balance_loss_mlp": 1.05101991, "epoch": 0.18896738313542763, "flos": 28840865184000.0, "grad_norm": 2.030329367419712, "language_loss": 0.7862711, "learning_rate": 3.740715120924971e-06, "loss": 0.80820584, "num_input_tokens_seen": 67852170, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.97265625, "step": 3143, "time_per_iteration": 2.5264768600463867 }, { "auxiliary_loss_clip": 0.01151334, "auxiliary_loss_mlp": 0.01048486, "balance_loss_clip": 1.03075993, "balance_loss_mlp": 1.05235648, "epoch": 0.1890275063880956, "flos": 22412747157120.0, "grad_norm": 2.1565042796954863, "language_loss": 0.71755004, "learning_rate": 3.740523309097912e-06, "loss": 0.73954821, "num_input_tokens_seen": 67869945, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9921875, "step": 3144, "time_per_iteration": 2.4710381031036377 }, { "auxiliary_loss_clip": 0.01152065, "auxiliary_loss_mlp": 0.0104388, "balance_loss_clip": 1.025033, "balance_loss_mlp": 1.0510428, "epoch": 0.18908762964076356, "flos": 24243904552320.0, "grad_norm": 2.3287413910208885, "language_loss": 0.73189634, "learning_rate": 3.7403314312700356e-06, "loss": 0.75385571, "num_input_tokens_seen": 67890240, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 1.0078125, "step": 3145, "time_per_iteration": 2.496408700942993 }, { "auxiliary_loss_clip": 0.01145622, "auxiliary_loss_mlp": 0.0103985, "balance_loss_clip": 1.02311277, "balance_loss_mlp": 1.04865885, "epoch": 0.18914775289343153, "flos": 16982910892800.0, "grad_norm": 2.584886752112733, "language_loss": 0.75877869, "learning_rate": 3.740139487448616e-06, "loss": 0.78063345, "num_input_tokens_seen": 67907825, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.96875, "step": 3146, "time_per_iteration": 2.4658172130584717 }, { "auxiliary_loss_clip": 0.01149228, "auxiliary_loss_mlp": 0.01046235, "balance_loss_clip": 1.02823472, "balance_loss_mlp": 1.05017662, "epoch": 0.1892078761460995, "flos": 21543781334400.0, "grad_norm": 2.6856432654672022, "language_loss": 0.78893423, "learning_rate": 3.7399474776409326e-06, "loss": 0.81088883, "num_input_tokens_seen": 67926670, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.98828125, "step": 3147, "time_per_iteration": 2.4659509658813477 }, { "auxiliary_loss_clip": 0.01150195, "auxiliary_loss_mlp": 0.01046159, "balance_loss_clip": 1.02836108, "balance_loss_mlp": 1.05200779, "epoch": 0.18926799939876748, "flos": 23001538896000.0, "grad_norm": 2.647461218268807, "language_loss": 0.66835558, "learning_rate": 3.739755401854267e-06, "loss": 0.69031906, "num_input_tokens_seen": 67943645, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.98046875, "step": 3148, "time_per_iteration": 2.511420726776123 }, { "auxiliary_loss_clip": 0.01146685, "auxiliary_loss_mlp": 0.01035279, "balance_loss_clip": 1.01730204, "balance_loss_mlp": 1.04923701, "epoch": 0.18932812265143545, "flos": 22273019251200.0, "grad_norm": 2.3337575453591346, "language_loss": 0.7572884, "learning_rate": 3.739563260095902e-06, "loss": 0.77910805, "num_input_tokens_seen": 67962345, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9765625, "step": 3149, "time_per_iteration": 2.457821846008301 }, { "auxiliary_loss_clip": 0.01145415, "auxiliary_loss_mlp": 0.01038339, "balance_loss_clip": 1.0216254, "balance_loss_mlp": 1.05214143, "epoch": 0.1893882459041034, "flos": 18624423456000.0, "grad_norm": 2.3082816946167304, "language_loss": 0.80534869, "learning_rate": 3.7393710523731245e-06, "loss": 0.82718623, "num_input_tokens_seen": 67979760, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.93359375, "step": 3150, "time_per_iteration": 2.4700427055358887 }, { "auxiliary_loss_clip": 0.01151781, "auxiliary_loss_mlp": 0.01045225, "balance_loss_clip": 1.0277729, "balance_loss_mlp": 1.0526334, "epoch": 0.18944836915677138, "flos": 22892981016960.0, "grad_norm": 2.3824934336405375, "language_loss": 0.85342324, "learning_rate": 3.7391787786932215e-06, "loss": 0.87539333, "num_input_tokens_seen": 67996895, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9921875, "step": 3151, "time_per_iteration": 2.5126876831054688 }, { "auxiliary_loss_clip": 0.01150023, "auxiliary_loss_mlp": 0.01048597, "balance_loss_clip": 1.03112066, "balance_loss_mlp": 1.05196083, "epoch": 0.18950849240943934, "flos": 26796542526720.0, "grad_norm": 1.7705851441818294, "language_loss": 0.74775183, "learning_rate": 3.7389864390634857e-06, "loss": 0.76973808, "num_input_tokens_seen": 68018365, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.98046875, "step": 3152, "time_per_iteration": 2.566460132598877 }, { "auxiliary_loss_clip": 0.0114865, "auxiliary_loss_mlp": 0.01044455, "balance_loss_clip": 1.02559638, "balance_loss_mlp": 1.05127561, "epoch": 0.1895686156621073, "flos": 24971239048320.0, "grad_norm": 7.170580796420915, "language_loss": 0.7560876, "learning_rate": 3.738794033491209e-06, "loss": 0.77801865, "num_input_tokens_seen": 68037985, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.97265625, "step": 3153, "time_per_iteration": 2.4809963703155518 }, { "auxiliary_loss_clip": 0.0114962, "auxiliary_loss_mlp": 0.01043774, "balance_loss_clip": 1.02596378, "balance_loss_mlp": 1.05097556, "epoch": 0.1896287389147753, "flos": 21944544353280.0, "grad_norm": 2.1901151194797697, "language_loss": 0.79464626, "learning_rate": 3.7386015619836887e-06, "loss": 0.81658018, "num_input_tokens_seen": 68057975, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.984375, "step": 3154, "time_per_iteration": 2.4959819316864014 }, { "auxiliary_loss_clip": 0.01153248, "auxiliary_loss_mlp": 0.01045745, "balance_loss_clip": 1.02711225, "balance_loss_mlp": 1.05119956, "epoch": 0.18968886216744327, "flos": 18179058723840.0, "grad_norm": 3.8812166002470185, "language_loss": 0.72767842, "learning_rate": 3.738409024548223e-06, "loss": 0.74966836, "num_input_tokens_seen": 68074175, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.0234375, "step": 3155, "time_per_iteration": 2.4333627223968506 }, { "auxiliary_loss_clip": 0.01146506, "auxiliary_loss_mlp": 0.01039694, "balance_loss_clip": 1.02228904, "balance_loss_mlp": 1.05115366, "epoch": 0.18974898542011123, "flos": 20412487509120.0, "grad_norm": 1.930951595040746, "language_loss": 0.7394954, "learning_rate": 3.7382164211921136e-06, "loss": 0.76135737, "num_input_tokens_seen": 68095230, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.953125, "step": 3156, "time_per_iteration": 2.5054993629455566 }, { "auxiliary_loss_clip": 0.01150891, "auxiliary_loss_mlp": 0.01039563, "balance_loss_clip": 1.02285004, "balance_loss_mlp": 1.05340672, "epoch": 0.1898091086727792, "flos": 23985024255360.0, "grad_norm": 1.7846693102022704, "language_loss": 0.68044341, "learning_rate": 3.7380237519226623e-06, "loss": 0.70234793, "num_input_tokens_seen": 68113805, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.9765625, "step": 3157, "time_per_iteration": 2.5053446292877197 }, { "auxiliary_loss_clip": 0.01148152, "auxiliary_loss_mlp": 0.01039432, "balance_loss_clip": 1.02165818, "balance_loss_mlp": 1.05060339, "epoch": 0.18986923192544716, "flos": 27637067756160.0, "grad_norm": 2.022269741424224, "language_loss": 0.79946011, "learning_rate": 3.737831016747176e-06, "loss": 0.82133597, "num_input_tokens_seen": 68133190, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9765625, "step": 3158, "time_per_iteration": 2.5764658451080322 }, { "auxiliary_loss_clip": 0.01156451, "auxiliary_loss_mlp": 0.01040245, "balance_loss_clip": 1.02114749, "balance_loss_mlp": 1.05459285, "epoch": 0.18992935517811513, "flos": 25484151306240.0, "grad_norm": 2.0058725959290684, "language_loss": 0.72372836, "learning_rate": 3.737638215672964e-06, "loss": 0.74569529, "num_input_tokens_seen": 68152330, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 1.0234375, "step": 3159, "time_per_iteration": 2.5042030811309814 }, { "auxiliary_loss_clip": 0.01152741, "auxiliary_loss_mlp": 0.01045316, "balance_loss_clip": 1.02712488, "balance_loss_mlp": 1.05505562, "epoch": 0.1899894784307831, "flos": 17420805596160.0, "grad_norm": 1.9798133284080275, "language_loss": 0.85448277, "learning_rate": 3.7374453487073366e-06, "loss": 0.87646329, "num_input_tokens_seen": 68170185, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9765625, "step": 3160, "time_per_iteration": 2.4976696968078613 }, { "auxiliary_loss_clip": 0.01145544, "auxiliary_loss_mlp": 0.01043105, "balance_loss_clip": 1.02642739, "balance_loss_mlp": 1.0523423, "epoch": 0.19004960168345109, "flos": 27492240119040.0, "grad_norm": 1.7856839243856781, "language_loss": 0.73496377, "learning_rate": 3.7372524158576074e-06, "loss": 0.75685024, "num_input_tokens_seen": 68191665, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.9296875, "step": 3161, "time_per_iteration": 2.538550853729248 }, { "auxiliary_loss_clip": 0.01151804, "auxiliary_loss_mlp": 0.01044115, "balance_loss_clip": 1.02650762, "balance_loss_mlp": 1.05504096, "epoch": 0.19010972493611905, "flos": 38654676385920.0, "grad_norm": 1.7342513395066372, "language_loss": 0.80916405, "learning_rate": 3.7370594171310926e-06, "loss": 0.83112323, "num_input_tokens_seen": 68214635, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.96875, "step": 3162, "time_per_iteration": 2.671325206756592 }, { "auxiliary_loss_clip": 0.01150852, "auxiliary_loss_mlp": 0.01040807, "balance_loss_clip": 1.0229615, "balance_loss_mlp": 1.0529387, "epoch": 0.19016984818878702, "flos": 19244744357760.0, "grad_norm": 1.9324326806496026, "language_loss": 0.75051516, "learning_rate": 3.73686635253511e-06, "loss": 0.77243173, "num_input_tokens_seen": 68232150, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.98046875, "step": 3163, "time_per_iteration": 2.4964730739593506 }, { "auxiliary_loss_clip": 0.01151787, "auxiliary_loss_mlp": 0.01040444, "balance_loss_clip": 1.02275324, "balance_loss_mlp": 1.05683708, "epoch": 0.19022997144145498, "flos": 37596891744000.0, "grad_norm": 1.6820047652557382, "language_loss": 0.74424881, "learning_rate": 3.736673222076982e-06, "loss": 0.7661711, "num_input_tokens_seen": 68253370, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.94921875, "step": 3164, "time_per_iteration": 2.6443610191345215 }, { "auxiliary_loss_clip": 0.01151245, "auxiliary_loss_mlp": 0.01035147, "balance_loss_clip": 1.01670527, "balance_loss_mlp": 1.05469489, "epoch": 0.19029009469412295, "flos": 61530921665280.0, "grad_norm": 1.608963656085341, "language_loss": 0.66663098, "learning_rate": 3.7364800257640313e-06, "loss": 0.68849492, "num_input_tokens_seen": 68278895, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.96484375, "step": 3165, "time_per_iteration": 2.8274381160736084 }, { "auxiliary_loss_clip": 0.01151932, "auxiliary_loss_mlp": 0.01045068, "balance_loss_clip": 1.02588749, "balance_loss_mlp": 1.05448627, "epoch": 0.1903502179467909, "flos": 13954851480960.0, "grad_norm": 2.1166121681744725, "language_loss": 0.74428242, "learning_rate": 3.7362867636035835e-06, "loss": 0.7662524, "num_input_tokens_seen": 68294880, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.9765625, "step": 3166, "time_per_iteration": 2.437974214553833 }, { "auxiliary_loss_clip": 0.01075548, "auxiliary_loss_mlp": 0.01022264, "balance_loss_clip": 1.01945055, "balance_loss_mlp": 1.04075432, "epoch": 0.1904103411994589, "flos": 66899641916160.0, "grad_norm": 0.7835375146115401, "language_loss": 0.50419807, "learning_rate": 3.736093435602968e-06, "loss": 0.52517623, "num_input_tokens_seen": 68359665, "router_z_loss_clip": 0.02807617, "router_z_loss_mlp": 0.34765625, "step": 3167, "time_per_iteration": 3.1197216510772705 }, { "auxiliary_loss_clip": 0.01149196, "auxiliary_loss_mlp": 0.01049222, "balance_loss_clip": 1.03139997, "balance_loss_mlp": 1.05367374, "epoch": 0.19047046445212687, "flos": 21908741472000.0, "grad_norm": 1.7110144501730002, "language_loss": 0.74538147, "learning_rate": 3.7359000417695156e-06, "loss": 0.76736569, "num_input_tokens_seen": 68378950, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.953125, "step": 3168, "time_per_iteration": 2.48474383354187 }, { "auxiliary_loss_clip": 0.01069402, "auxiliary_loss_mlp": 0.01010032, "balance_loss_clip": 1.00758791, "balance_loss_mlp": 1.03503203, "epoch": 0.19053058770479483, "flos": 59255156701440.0, "grad_norm": 0.8683824976345667, "language_loss": 0.60058868, "learning_rate": 3.73570658211056e-06, "loss": 0.62138307, "num_input_tokens_seen": 68434235, "router_z_loss_clip": 0.02441406, "router_z_loss_mlp": 0.34375, "step": 3169, "time_per_iteration": 2.989609718322754 }, { "auxiliary_loss_clip": 0.01153086, "auxiliary_loss_mlp": 0.01044552, "balance_loss_clip": 1.02711129, "balance_loss_mlp": 1.05174875, "epoch": 0.1905907109574628, "flos": 23951304362880.0, "grad_norm": 1.7158349160760298, "language_loss": 0.78507531, "learning_rate": 3.735513056633436e-06, "loss": 0.80705166, "num_input_tokens_seen": 68453830, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 1.015625, "step": 3170, "time_per_iteration": 2.497584342956543 }, { "auxiliary_loss_clip": 0.01147304, "auxiliary_loss_mlp": 0.01039854, "balance_loss_clip": 1.0214957, "balance_loss_mlp": 1.05168355, "epoch": 0.19065083421013077, "flos": 20812316774400.0, "grad_norm": 1.9664524313434355, "language_loss": 0.78399354, "learning_rate": 3.7353194653454834e-06, "loss": 0.80586511, "num_input_tokens_seen": 68473005, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.95703125, "step": 3171, "time_per_iteration": 2.475539207458496 }, { "auxiliary_loss_clip": 0.01149602, "auxiliary_loss_mlp": 0.01038446, "balance_loss_clip": 1.02075481, "balance_loss_mlp": 1.04957867, "epoch": 0.19071095746279873, "flos": 31284981192960.0, "grad_norm": 2.2034384887901894, "language_loss": 0.78348649, "learning_rate": 3.7351258082540426e-06, "loss": 0.80536699, "num_input_tokens_seen": 68493470, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 1.0, "step": 3172, "time_per_iteration": 2.5749285221099854 }, { "auxiliary_loss_clip": 0.01148392, "auxiliary_loss_mlp": 0.01048354, "balance_loss_clip": 1.03096128, "balance_loss_mlp": 1.05066943, "epoch": 0.1907710807154667, "flos": 14356117290240.0, "grad_norm": 1.6581979914587046, "language_loss": 0.80209804, "learning_rate": 3.7349320853664576e-06, "loss": 0.82406545, "num_input_tokens_seen": 68511290, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9765625, "step": 3173, "time_per_iteration": 2.4515511989593506 }, { "auxiliary_loss_clip": 0.01149721, "auxiliary_loss_mlp": 0.01055247, "balance_loss_clip": 1.03709126, "balance_loss_mlp": 1.05166054, "epoch": 0.1908312039681347, "flos": 26907039740160.0, "grad_norm": 2.2022815003947236, "language_loss": 0.78855997, "learning_rate": 3.7347382966900735e-06, "loss": 0.81060964, "num_input_tokens_seen": 68532575, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.98046875, "step": 3174, "time_per_iteration": 2.5268118381500244 }, { "auxiliary_loss_clip": 0.01150618, "auxiliary_loss_mlp": 0.01039968, "balance_loss_clip": 1.02250385, "balance_loss_mlp": 1.05264235, "epoch": 0.19089132722080265, "flos": 14494695960960.0, "grad_norm": 1.7730857567870504, "language_loss": 0.80848479, "learning_rate": 3.7345444422322395e-06, "loss": 0.83039069, "num_input_tokens_seen": 68548760, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.98046875, "step": 3175, "time_per_iteration": 2.4301459789276123 }, { "auxiliary_loss_clip": 0.01151662, "auxiliary_loss_mlp": 0.01055535, "balance_loss_clip": 1.03691459, "balance_loss_mlp": 1.05181754, "epoch": 0.19095145047347062, "flos": 13952876232960.0, "grad_norm": 2.161511921933932, "language_loss": 0.85482287, "learning_rate": 3.7343505220003067e-06, "loss": 0.87689483, "num_input_tokens_seen": 68563100, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 1.0, "step": 3176, "time_per_iteration": 2.440180540084839 }, { "auxiliary_loss_clip": 0.01154418, "auxiliary_loss_mlp": 0.01049934, "balance_loss_clip": 1.02990675, "balance_loss_mlp": 1.05344999, "epoch": 0.19101157372613858, "flos": 25301832848640.0, "grad_norm": 2.6534116840399826, "language_loss": 0.81194305, "learning_rate": 3.7341565360016285e-06, "loss": 0.83398658, "num_input_tokens_seen": 68581650, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0078125, "step": 3177, "time_per_iteration": 4.0795512199401855 }, { "auxiliary_loss_clip": 0.01145284, "auxiliary_loss_mlp": 0.01042295, "balance_loss_clip": 1.02399683, "balance_loss_mlp": 1.04861164, "epoch": 0.19107169697880655, "flos": 20558212986240.0, "grad_norm": 2.1926691747925817, "language_loss": 0.74258149, "learning_rate": 3.73396248424356e-06, "loss": 0.76445729, "num_input_tokens_seen": 68600360, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.96875, "step": 3178, "time_per_iteration": 2.4631295204162598 }, { "auxiliary_loss_clip": 0.01147418, "auxiliary_loss_mlp": 0.01037415, "balance_loss_clip": 1.02072549, "balance_loss_mlp": 1.04891777, "epoch": 0.19113182023147451, "flos": 22163204396160.0, "grad_norm": 2.1486139257089336, "language_loss": 0.81477666, "learning_rate": 3.7337683667334606e-06, "loss": 0.83662498, "num_input_tokens_seen": 68617885, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.984375, "step": 3179, "time_per_iteration": 3.889183282852173 }, { "auxiliary_loss_clip": 0.01149078, "auxiliary_loss_mlp": 0.01043958, "balance_loss_clip": 1.02627921, "balance_loss_mlp": 1.0503118, "epoch": 0.19119194348414248, "flos": 18581796990720.0, "grad_norm": 2.4919426933757856, "language_loss": 0.79246193, "learning_rate": 3.733574183478691e-06, "loss": 0.81439227, "num_input_tokens_seen": 68634550, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.98828125, "step": 3180, "time_per_iteration": 3.876464605331421 }, { "auxiliary_loss_clip": 0.01145692, "auxiliary_loss_mlp": 0.01047068, "balance_loss_clip": 1.02856636, "balance_loss_mlp": 1.04831636, "epoch": 0.19125206673681047, "flos": 19026623018880.0, "grad_norm": 2.4240010820626456, "language_loss": 0.79360193, "learning_rate": 3.733379934486615e-06, "loss": 0.81552953, "num_input_tokens_seen": 68651895, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.97265625, "step": 3181, "time_per_iteration": 3.9667999744415283 }, { "auxiliary_loss_clip": 0.01150418, "auxiliary_loss_mlp": 0.01051318, "balance_loss_clip": 1.03357911, "balance_loss_mlp": 1.04947078, "epoch": 0.19131218998947844, "flos": 21690153256320.0, "grad_norm": 1.9407152830776222, "language_loss": 0.73976612, "learning_rate": 3.7331856197645973e-06, "loss": 0.76178348, "num_input_tokens_seen": 68671500, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 1.015625, "step": 3182, "time_per_iteration": 2.4660255908966064 }, { "auxiliary_loss_clip": 0.01148531, "auxiliary_loss_mlp": 0.01046799, "balance_loss_clip": 1.0285238, "balance_loss_mlp": 1.04993045, "epoch": 0.1913723132421464, "flos": 18442500048000.0, "grad_norm": 1.6937843582219858, "language_loss": 0.64926827, "learning_rate": 3.7329912393200084e-06, "loss": 0.67122155, "num_input_tokens_seen": 68690570, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.98828125, "step": 3183, "time_per_iteration": 2.454503059387207 }, { "auxiliary_loss_clip": 0.0114809, "auxiliary_loss_mlp": 0.01046713, "balance_loss_clip": 1.02816367, "balance_loss_mlp": 1.04889894, "epoch": 0.19143243649481437, "flos": 27160102033920.0, "grad_norm": 1.6326293695011567, "language_loss": 0.73440486, "learning_rate": 3.7327967931602173e-06, "loss": 0.7563529, "num_input_tokens_seen": 68709735, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9921875, "step": 3184, "time_per_iteration": 2.504800319671631 }, { "auxiliary_loss_clip": 0.01150782, "auxiliary_loss_mlp": 0.01048455, "balance_loss_clip": 1.02885652, "balance_loss_mlp": 1.05035138, "epoch": 0.19149255974748233, "flos": 21718952985600.0, "grad_norm": 2.1772863050356133, "language_loss": 0.88001502, "learning_rate": 3.732602281292598e-06, "loss": 0.90200746, "num_input_tokens_seen": 68727565, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0, "step": 3185, "time_per_iteration": 2.4814553260803223 }, { "auxiliary_loss_clip": 0.01147586, "auxiliary_loss_mlp": 0.0104259, "balance_loss_clip": 1.02470875, "balance_loss_mlp": 1.04997659, "epoch": 0.1915526830001503, "flos": 22963293889920.0, "grad_norm": 2.162555816081298, "language_loss": 0.73130858, "learning_rate": 3.7324077037245267e-06, "loss": 0.75321031, "num_input_tokens_seen": 68748110, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9765625, "step": 3186, "time_per_iteration": 2.4629974365234375 }, { "auxiliary_loss_clip": 0.01153982, "auxiliary_loss_mlp": 0.01044964, "balance_loss_clip": 1.02468669, "balance_loss_mlp": 1.05253315, "epoch": 0.1916128062528183, "flos": 26140741966080.0, "grad_norm": 1.898711330970055, "language_loss": 0.83489311, "learning_rate": 3.7322130604633825e-06, "loss": 0.85688263, "num_input_tokens_seen": 68769765, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.015625, "step": 3187, "time_per_iteration": 2.531708002090454 }, { "auxiliary_loss_clip": 0.01067448, "auxiliary_loss_mlp": 0.01030961, "balance_loss_clip": 1.02844572, "balance_loss_mlp": 1.03261948, "epoch": 0.19167292950548626, "flos": 54925767457920.0, "grad_norm": 0.8679678244749262, "language_loss": 0.55868083, "learning_rate": 3.732018351516544e-06, "loss": 0.57966495, "num_input_tokens_seen": 68826815, "router_z_loss_clip": 0.02514648, "router_z_loss_mlp": 0.34765625, "step": 3188, "time_per_iteration": 3.1304664611816406 }, { "auxiliary_loss_clip": 0.0114789, "auxiliary_loss_mlp": 0.01057277, "balance_loss_clip": 1.0390501, "balance_loss_mlp": 1.04980743, "epoch": 0.19173305275815422, "flos": 29935601942400.0, "grad_norm": 1.7811291238184215, "language_loss": 0.69884908, "learning_rate": 3.731823576891397e-06, "loss": 0.72090071, "num_input_tokens_seen": 68847585, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.984375, "step": 3189, "time_per_iteration": 2.5643346309661865 }, { "auxiliary_loss_clip": 0.01142485, "auxiliary_loss_mlp": 0.01043121, "balance_loss_clip": 1.02603841, "balance_loss_mlp": 1.04699337, "epoch": 0.1917931760108222, "flos": 24752471264640.0, "grad_norm": 1.860248100361372, "language_loss": 0.74005276, "learning_rate": 3.7316287365953266e-06, "loss": 0.76190877, "num_input_tokens_seen": 68866620, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.953125, "step": 3190, "time_per_iteration": 2.5101966857910156 }, { "auxiliary_loss_clip": 0.01146768, "auxiliary_loss_mlp": 0.01063189, "balance_loss_clip": 1.04603529, "balance_loss_mlp": 1.05003715, "epoch": 0.19185329926349015, "flos": 18843550375680.0, "grad_norm": 1.944186359775389, "language_loss": 0.84358323, "learning_rate": 3.73143383063572e-06, "loss": 0.86568284, "num_input_tokens_seen": 68885515, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.96875, "step": 3191, "time_per_iteration": 2.4893345832824707 }, { "auxiliary_loss_clip": 0.01144183, "auxiliary_loss_mlp": 0.01053147, "balance_loss_clip": 1.03530145, "balance_loss_mlp": 1.04809022, "epoch": 0.19191342251615812, "flos": 22086858038400.0, "grad_norm": 1.7834112694160658, "language_loss": 0.89419711, "learning_rate": 3.73123885901997e-06, "loss": 0.91617036, "num_input_tokens_seen": 68903225, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9609375, "step": 3192, "time_per_iteration": 2.493067979812622 }, { "auxiliary_loss_clip": 0.01154147, "auxiliary_loss_mlp": 0.01065, "balance_loss_clip": 1.04534197, "balance_loss_mlp": 1.05284667, "epoch": 0.19197354576882608, "flos": 22199115018240.0, "grad_norm": 1.690244707104225, "language_loss": 0.74654746, "learning_rate": 3.7310438217554687e-06, "loss": 0.76873899, "num_input_tokens_seen": 68922860, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0078125, "step": 3193, "time_per_iteration": 2.5256662368774414 }, { "auxiliary_loss_clip": 0.01151365, "auxiliary_loss_mlp": 0.01054407, "balance_loss_clip": 1.03498816, "balance_loss_mlp": 1.04963624, "epoch": 0.19203366902149407, "flos": 24896185580160.0, "grad_norm": 1.7188398497260684, "language_loss": 0.74874187, "learning_rate": 3.730848718849612e-06, "loss": 0.77079958, "num_input_tokens_seen": 68943000, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.015625, "step": 3194, "time_per_iteration": 2.5067548751831055 }, { "auxiliary_loss_clip": 0.01065705, "auxiliary_loss_mlp": 0.01023314, "balance_loss_clip": 1.02088261, "balance_loss_mlp": 1.03157973, "epoch": 0.19209379227416204, "flos": 68416722789120.0, "grad_norm": 0.7839582617196582, "language_loss": 0.68479341, "learning_rate": 3.7306535503097985e-06, "loss": 0.70568365, "num_input_tokens_seen": 69000255, "router_z_loss_clip": 0.02429199, "router_z_loss_mlp": 0.34179688, "step": 3195, "time_per_iteration": 3.0762057304382324 }, { "auxiliary_loss_clip": 0.0115022, "auxiliary_loss_mlp": 0.01057209, "balance_loss_clip": 1.03908873, "balance_loss_mlp": 1.05108166, "epoch": 0.19215391552683, "flos": 22055185221120.0, "grad_norm": 3.130598874691938, "language_loss": 0.73202145, "learning_rate": 3.730458316143429e-06, "loss": 0.75409579, "num_input_tokens_seen": 69019665, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9921875, "step": 3196, "time_per_iteration": 2.489497423171997 }, { "auxiliary_loss_clip": 0.01153771, "auxiliary_loss_mlp": 0.01051308, "balance_loss_clip": 1.03302097, "balance_loss_mlp": 1.05606902, "epoch": 0.19221403877949797, "flos": 20302959962880.0, "grad_norm": 1.9111476493735549, "language_loss": 0.83857322, "learning_rate": 3.7302630163579068e-06, "loss": 0.86062407, "num_input_tokens_seen": 69039055, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.9765625, "step": 3197, "time_per_iteration": 2.4828131198883057 }, { "auxiliary_loss_clip": 0.01152134, "auxiliary_loss_mlp": 0.01049046, "balance_loss_clip": 1.02977014, "balance_loss_mlp": 1.05172539, "epoch": 0.19227416203216594, "flos": 23185329811200.0, "grad_norm": 2.0607643815173335, "language_loss": 0.801126, "learning_rate": 3.7300676509606373e-06, "loss": 0.82313782, "num_input_tokens_seen": 69056370, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 1.0078125, "step": 3198, "time_per_iteration": 2.4836816787719727 }, { "auxiliary_loss_clip": 0.01151327, "auxiliary_loss_mlp": 0.0105729, "balance_loss_clip": 1.03775167, "balance_loss_mlp": 1.05028725, "epoch": 0.1923342852848339, "flos": 25776607841280.0, "grad_norm": 1.933939880424593, "language_loss": 0.78754687, "learning_rate": 3.729872219959029e-06, "loss": 0.80963302, "num_input_tokens_seen": 69075915, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 1.0078125, "step": 3199, "time_per_iteration": 2.5186877250671387 }, { "auxiliary_loss_clip": 0.01150118, "auxiliary_loss_mlp": 0.01049044, "balance_loss_clip": 1.03130615, "balance_loss_mlp": 1.05197692, "epoch": 0.19239440853750187, "flos": 17128349061120.0, "grad_norm": 2.664645988798084, "language_loss": 0.84532988, "learning_rate": 3.7296767233604934e-06, "loss": 0.86732149, "num_input_tokens_seen": 69094145, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.98046875, "step": 3200, "time_per_iteration": 2.468291997909546 }, { "auxiliary_loss_clip": 0.01151762, "auxiliary_loss_mlp": 0.01056978, "balance_loss_clip": 1.03872693, "balance_loss_mlp": 1.05314946, "epoch": 0.19245453179016986, "flos": 16435093593600.0, "grad_norm": 1.8028623523565215, "language_loss": 0.79248005, "learning_rate": 3.729481161172443e-06, "loss": 0.81456745, "num_input_tokens_seen": 69111110, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.984375, "step": 3201, "time_per_iteration": 2.467686891555786 }, { "auxiliary_loss_clip": 0.01150093, "auxiliary_loss_mlp": 0.01045637, "balance_loss_clip": 1.02750492, "balance_loss_mlp": 1.04966176, "epoch": 0.19251465504283782, "flos": 20230276792320.0, "grad_norm": 2.0623798735149568, "language_loss": 0.69697464, "learning_rate": 3.7292855334022927e-06, "loss": 0.71893191, "num_input_tokens_seen": 69130280, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 1.0, "step": 3202, "time_per_iteration": 2.467052459716797 }, { "auxiliary_loss_clip": 0.01147724, "auxiliary_loss_mlp": 0.01038371, "balance_loss_clip": 1.02000058, "balance_loss_mlp": 1.05117679, "epoch": 0.1925747782955058, "flos": 19464374067840.0, "grad_norm": 1.8634995479148577, "language_loss": 0.91071618, "learning_rate": 3.7290898400574627e-06, "loss": 0.93257713, "num_input_tokens_seen": 69149570, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.96875, "step": 3203, "time_per_iteration": 2.469686269760132 }, { "auxiliary_loss_clip": 0.01148038, "auxiliary_loss_mlp": 0.01051817, "balance_loss_clip": 1.03181362, "balance_loss_mlp": 1.04910803, "epoch": 0.19263490154817375, "flos": 17785586165760.0, "grad_norm": 2.056860739827216, "language_loss": 0.81516421, "learning_rate": 3.7288940811453725e-06, "loss": 0.83716273, "num_input_tokens_seen": 69168190, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.98828125, "step": 3204, "time_per_iteration": 2.4498131275177 }, { "auxiliary_loss_clip": 0.01146233, "auxiliary_loss_mlp": 0.01046989, "balance_loss_clip": 1.02860677, "balance_loss_mlp": 1.04939044, "epoch": 0.19269502480084172, "flos": 17457075354240.0, "grad_norm": 2.209823426667601, "language_loss": 0.76078629, "learning_rate": 3.7286982566734454e-06, "loss": 0.78271854, "num_input_tokens_seen": 69186950, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.96875, "step": 3205, "time_per_iteration": 2.463472604751587 }, { "auxiliary_loss_clip": 0.01151817, "auxiliary_loss_mlp": 0.01051176, "balance_loss_clip": 1.03274667, "balance_loss_mlp": 1.05264449, "epoch": 0.19275514805350968, "flos": 21506901045120.0, "grad_norm": 2.745200460682043, "language_loss": 0.83480865, "learning_rate": 3.728502366649107e-06, "loss": 0.85683858, "num_input_tokens_seen": 69204850, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.9921875, "step": 3206, "time_per_iteration": 2.449070453643799 }, { "auxiliary_loss_clip": 0.01065406, "auxiliary_loss_mlp": 0.01029454, "balance_loss_clip": 1.02650976, "balance_loss_mlp": 1.03026462, "epoch": 0.19281527130617768, "flos": 47695979738880.0, "grad_norm": 0.8621807983983307, "language_loss": 0.60779536, "learning_rate": 3.728306411079786e-06, "loss": 0.62874401, "num_input_tokens_seen": 69259200, "router_z_loss_clip": 0.02941895, "router_z_loss_mlp": 0.3515625, "step": 3207, "time_per_iteration": 2.9302284717559814 }, { "auxiliary_loss_clip": 0.01150783, "auxiliary_loss_mlp": 0.01046699, "balance_loss_clip": 1.02809072, "balance_loss_mlp": 1.05180335, "epoch": 0.19287539455884564, "flos": 11801252672640.0, "grad_norm": 2.284910196013241, "language_loss": 0.75615484, "learning_rate": 3.7281103899729125e-06, "loss": 0.77812964, "num_input_tokens_seen": 69275835, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9921875, "step": 3208, "time_per_iteration": 2.443941593170166 }, { "auxiliary_loss_clip": 0.01152123, "auxiliary_loss_mlp": 0.01046669, "balance_loss_clip": 1.02660573, "balance_loss_mlp": 1.05041206, "epoch": 0.1929355178115136, "flos": 20631434860800.0, "grad_norm": 2.0768696796494144, "language_loss": 0.6111908, "learning_rate": 3.7279143033359195e-06, "loss": 0.63317871, "num_input_tokens_seen": 69294810, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.015625, "step": 3209, "time_per_iteration": 2.4678256511688232 }, { "auxiliary_loss_clip": 0.01150235, "auxiliary_loss_mlp": 0.01049696, "balance_loss_clip": 1.02929926, "balance_loss_mlp": 1.04854774, "epoch": 0.19299564106418157, "flos": 40807916058240.0, "grad_norm": 2.07386021028888, "language_loss": 0.79550093, "learning_rate": 3.727718151176243e-06, "loss": 0.81750023, "num_input_tokens_seen": 69316065, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 1.015625, "step": 3210, "time_per_iteration": 2.6179921627044678 }, { "auxiliary_loss_clip": 0.01142024, "auxiliary_loss_mlp": 0.0104488, "balance_loss_clip": 1.0275346, "balance_loss_mlp": 1.04637527, "epoch": 0.19305576431684954, "flos": 11361418634880.0, "grad_norm": 2.1728330935100395, "language_loss": 0.82387543, "learning_rate": 3.7275219335013217e-06, "loss": 0.84574437, "num_input_tokens_seen": 69332900, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.95703125, "step": 3211, "time_per_iteration": 2.431209087371826 }, { "auxiliary_loss_clip": 0.01060777, "auxiliary_loss_mlp": 0.01004552, "balance_loss_clip": 1.00171518, "balance_loss_mlp": 1.02697182, "epoch": 0.1931158875695175, "flos": 54511895975040.0, "grad_norm": 1.3941291124823894, "language_loss": 0.63748085, "learning_rate": 3.7273256503185953e-06, "loss": 0.6581341, "num_input_tokens_seen": 69382535, "router_z_loss_clip": 0.02832031, "router_z_loss_mlp": 0.33789062, "step": 3212, "time_per_iteration": 2.9488449096679688 }, { "auxiliary_loss_clip": 0.01148476, "auxiliary_loss_mlp": 0.01045205, "balance_loss_clip": 1.02769351, "balance_loss_mlp": 1.0517695, "epoch": 0.19317601082218547, "flos": 19828436365440.0, "grad_norm": 1.6437219517621218, "language_loss": 0.76377648, "learning_rate": 3.7271293016355074e-06, "loss": 0.78571326, "num_input_tokens_seen": 69400600, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.96875, "step": 3213, "time_per_iteration": 2.4816951751708984 }, { "auxiliary_loss_clip": 0.01150369, "auxiliary_loss_mlp": 0.01047107, "balance_loss_clip": 1.02748513, "balance_loss_mlp": 1.04967058, "epoch": 0.19323613407485346, "flos": 13152068467200.0, "grad_norm": 2.0896913768960355, "language_loss": 0.71206903, "learning_rate": 3.726932887459503e-06, "loss": 0.73404372, "num_input_tokens_seen": 69417350, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0078125, "step": 3214, "time_per_iteration": 2.4275567531585693 }, { "auxiliary_loss_clip": 0.0114502, "auxiliary_loss_mlp": 0.01046989, "balance_loss_clip": 1.02815437, "balance_loss_mlp": 1.04714847, "epoch": 0.19329625732752143, "flos": 14027247342720.0, "grad_norm": 6.0489243579151575, "language_loss": 0.75312471, "learning_rate": 3.72673640779803e-06, "loss": 0.7750448, "num_input_tokens_seen": 69431845, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.9765625, "step": 3215, "time_per_iteration": 2.4529800415039062 }, { "auxiliary_loss_clip": 0.01144239, "auxiliary_loss_mlp": 0.01050348, "balance_loss_clip": 1.03308618, "balance_loss_mlp": 1.0482614, "epoch": 0.1933563805801894, "flos": 23441732069760.0, "grad_norm": 2.2067554123506956, "language_loss": 0.88549703, "learning_rate": 3.72653986265854e-06, "loss": 0.90744293, "num_input_tokens_seen": 69453275, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.9609375, "step": 3216, "time_per_iteration": 2.5071725845336914 }, { "auxiliary_loss_clip": 0.01145726, "auxiliary_loss_mlp": 0.01052038, "balance_loss_clip": 1.03522968, "balance_loss_mlp": 1.04949403, "epoch": 0.19341650383285736, "flos": 20485314334080.0, "grad_norm": 1.6453542029567751, "language_loss": 0.79796714, "learning_rate": 3.726343252048485e-06, "loss": 0.8199448, "num_input_tokens_seen": 69471830, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9609375, "step": 3217, "time_per_iteration": 2.476911783218384 }, { "auxiliary_loss_clip": 0.01151079, "auxiliary_loss_mlp": 0.0105422, "balance_loss_clip": 1.03446722, "balance_loss_mlp": 1.05015516, "epoch": 0.19347662708552532, "flos": 17858484817920.0, "grad_norm": 2.2001564982500184, "language_loss": 0.61448085, "learning_rate": 3.7261465759753206e-06, "loss": 0.63653386, "num_input_tokens_seen": 69489320, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 1.0078125, "step": 3218, "time_per_iteration": 3.9446442127227783 }, { "auxiliary_loss_clip": 0.01149804, "auxiliary_loss_mlp": 0.01044372, "balance_loss_clip": 1.02700281, "balance_loss_mlp": 1.05242383, "epoch": 0.1935367503381933, "flos": 18187247024640.0, "grad_norm": 1.7059366572738608, "language_loss": 0.80240977, "learning_rate": 3.7259498344465053e-06, "loss": 0.82435155, "num_input_tokens_seen": 69506665, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.97265625, "step": 3219, "time_per_iteration": 2.476681709289551 }, { "auxiliary_loss_clip": 0.01148483, "auxiliary_loss_mlp": 0.01045932, "balance_loss_clip": 1.02740645, "balance_loss_mlp": 1.05185521, "epoch": 0.19359687359086128, "flos": 15957122290560.0, "grad_norm": 2.24647636301509, "language_loss": 0.85823172, "learning_rate": 3.7257530274694993e-06, "loss": 0.88017589, "num_input_tokens_seen": 69523835, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.96875, "step": 3220, "time_per_iteration": 3.790590763092041 }, { "auxiliary_loss_clip": 0.01138903, "auxiliary_loss_mlp": 0.01037956, "balance_loss_clip": 1.02215481, "balance_loss_mlp": 1.04757679, "epoch": 0.19365699684352924, "flos": 21215198695680.0, "grad_norm": 2.117322353060283, "language_loss": 0.83917081, "learning_rate": 3.725556155051766e-06, "loss": 0.86093938, "num_input_tokens_seen": 69542620, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.9140625, "step": 3221, "time_per_iteration": 2.4526350498199463 }, { "auxiliary_loss_clip": 0.01145878, "auxiliary_loss_mlp": 0.01043325, "balance_loss_clip": 1.02720821, "balance_loss_mlp": 1.05173063, "epoch": 0.1937171200961972, "flos": 17311098481920.0, "grad_norm": 3.390935268993888, "language_loss": 0.85679299, "learning_rate": 3.7253592172007702e-06, "loss": 0.878685, "num_input_tokens_seen": 69561130, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.94140625, "step": 3222, "time_per_iteration": 3.908271074295044 }, { "auxiliary_loss_clip": 0.01145045, "auxiliary_loss_mlp": 0.01040463, "balance_loss_clip": 1.02286792, "balance_loss_mlp": 1.0478127, "epoch": 0.19377724334886517, "flos": 22635968227200.0, "grad_norm": 2.342316866652252, "language_loss": 0.78559047, "learning_rate": 3.72516221392398e-06, "loss": 0.80744553, "num_input_tokens_seen": 69580425, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.97265625, "step": 3223, "time_per_iteration": 3.9193129539489746 }, { "auxiliary_loss_clip": 0.01142888, "auxiliary_loss_mlp": 0.01046511, "balance_loss_clip": 1.02926731, "balance_loss_mlp": 1.04883826, "epoch": 0.19383736660153314, "flos": 15077813351040.0, "grad_norm": 2.0590293412271046, "language_loss": 0.75432706, "learning_rate": 3.7249651452288653e-06, "loss": 0.77622104, "num_input_tokens_seen": 69597085, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.94140625, "step": 3224, "time_per_iteration": 2.4472532272338867 }, { "auxiliary_loss_clip": 0.01145876, "auxiliary_loss_mlp": 0.01043427, "balance_loss_clip": 1.02597451, "balance_loss_mlp": 1.05100465, "epoch": 0.1938974898542011, "flos": 47119934350080.0, "grad_norm": 2.412389845640228, "language_loss": 0.71072304, "learning_rate": 3.7247680111229e-06, "loss": 0.73261607, "num_input_tokens_seen": 69618885, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.94921875, "step": 3225, "time_per_iteration": 2.677018165588379 }, { "auxiliary_loss_clip": 0.01144533, "auxiliary_loss_mlp": 0.01050436, "balance_loss_clip": 1.03367484, "balance_loss_mlp": 1.04805446, "epoch": 0.19395761310686907, "flos": 25812554376960.0, "grad_norm": 2.100055569383, "language_loss": 0.69320738, "learning_rate": 3.7245708116135585e-06, "loss": 0.71515703, "num_input_tokens_seen": 69638200, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.96484375, "step": 3226, "time_per_iteration": 2.5355584621429443 }, { "auxiliary_loss_clip": 0.01145094, "auxiliary_loss_mlp": 0.01041277, "balance_loss_clip": 1.02283585, "balance_loss_mlp": 1.05224597, "epoch": 0.19401773635953706, "flos": 23039604334080.0, "grad_norm": 1.9043211839507181, "language_loss": 0.75903261, "learning_rate": 3.7243735467083193e-06, "loss": 0.78089631, "num_input_tokens_seen": 69657550, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.9296875, "step": 3227, "time_per_iteration": 2.4698219299316406 }, { "auxiliary_loss_clip": 0.01144329, "auxiliary_loss_mlp": 0.01047092, "balance_loss_clip": 1.02986681, "balance_loss_mlp": 1.04775023, "epoch": 0.19407785961220503, "flos": 15920780705280.0, "grad_norm": 2.285645536238987, "language_loss": 0.69573176, "learning_rate": 3.724176216414662e-06, "loss": 0.717646, "num_input_tokens_seen": 69675005, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.96484375, "step": 3228, "time_per_iteration": 2.449232816696167 }, { "auxiliary_loss_clip": 0.01146603, "auxiliary_loss_mlp": 0.01044021, "balance_loss_clip": 1.0267117, "balance_loss_mlp": 1.05123329, "epoch": 0.194137982864873, "flos": 25921722787200.0, "grad_norm": 1.991496406848578, "language_loss": 0.74223679, "learning_rate": 3.72397882074007e-06, "loss": 0.76414299, "num_input_tokens_seen": 69696455, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.953125, "step": 3229, "time_per_iteration": 2.5111277103424072 }, { "auxiliary_loss_clip": 0.01144002, "auxiliary_loss_mlp": 0.01042777, "balance_loss_clip": 1.02471709, "balance_loss_mlp": 1.04935539, "epoch": 0.19419810611754096, "flos": 13261344618240.0, "grad_norm": 1.9432987998704654, "language_loss": 0.65688258, "learning_rate": 3.7237813596920285e-06, "loss": 0.6787504, "num_input_tokens_seen": 69714245, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.9453125, "step": 3230, "time_per_iteration": 2.457453966140747 }, { "auxiliary_loss_clip": 0.01140067, "auxiliary_loss_mlp": 0.01047603, "balance_loss_clip": 1.02932823, "balance_loss_mlp": 1.0468545, "epoch": 0.19425822937020892, "flos": 15705568368000.0, "grad_norm": 1.9221431500157617, "language_loss": 0.81919611, "learning_rate": 3.7235838332780254e-06, "loss": 0.8410728, "num_input_tokens_seen": 69731515, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.93359375, "step": 3231, "time_per_iteration": 2.4412856101989746 }, { "auxiliary_loss_clip": 0.01147296, "auxiliary_loss_mlp": 0.01044233, "balance_loss_clip": 1.02519536, "balance_loss_mlp": 1.0503664, "epoch": 0.1943183526228769, "flos": 23105392093440.0, "grad_norm": 1.9167134228368292, "language_loss": 0.8689779, "learning_rate": 3.72338624150555e-06, "loss": 0.89089322, "num_input_tokens_seen": 69748885, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.96875, "step": 3232, "time_per_iteration": 2.492764472961426 }, { "auxiliary_loss_clip": 0.01144882, "auxiliary_loss_mlp": 0.01045612, "balance_loss_clip": 1.02811241, "balance_loss_mlp": 1.05038869, "epoch": 0.19437847587554485, "flos": 24712610146560.0, "grad_norm": 2.2760274050724365, "language_loss": 0.85095793, "learning_rate": 3.723188584382096e-06, "loss": 0.87286282, "num_input_tokens_seen": 69767540, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9453125, "step": 3233, "time_per_iteration": 2.4919826984405518 }, { "auxiliary_loss_clip": 0.01147694, "auxiliary_loss_mlp": 0.01049237, "balance_loss_clip": 1.03185654, "balance_loss_mlp": 1.04965556, "epoch": 0.19443859912821285, "flos": 23116130259840.0, "grad_norm": 1.854873724188816, "language_loss": 0.89239448, "learning_rate": 3.722990861915158e-06, "loss": 0.91436374, "num_input_tokens_seen": 69789340, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.98046875, "step": 3234, "time_per_iteration": 2.528677463531494 }, { "auxiliary_loss_clip": 0.01146557, "auxiliary_loss_mlp": 0.01045104, "balance_loss_clip": 1.02673411, "balance_loss_mlp": 1.04764998, "epoch": 0.1944987223808808, "flos": 15084385539840.0, "grad_norm": 3.380348999215381, "language_loss": 0.78592414, "learning_rate": 3.722793074112234e-06, "loss": 0.80784076, "num_input_tokens_seen": 69806470, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.98828125, "step": 3235, "time_per_iteration": 2.43924617767334 }, { "auxiliary_loss_clip": 0.01149536, "auxiliary_loss_mlp": 0.0104686, "balance_loss_clip": 1.03031349, "balance_loss_mlp": 1.05397141, "epoch": 0.19455884563354878, "flos": 17126876603520.0, "grad_norm": 1.8873898743412323, "language_loss": 0.79703224, "learning_rate": 3.7225952209808233e-06, "loss": 0.81899619, "num_input_tokens_seen": 69822655, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.95703125, "step": 3236, "time_per_iteration": 2.4616756439208984 }, { "auxiliary_loss_clip": 0.01145441, "auxiliary_loss_mlp": 0.01040688, "balance_loss_clip": 1.02215111, "balance_loss_mlp": 1.05087519, "epoch": 0.19461896888621674, "flos": 20193396503040.0, "grad_norm": 1.5125819741980406, "language_loss": 0.75434589, "learning_rate": 3.72239730252843e-06, "loss": 0.77620715, "num_input_tokens_seen": 69841895, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9453125, "step": 3237, "time_per_iteration": 2.496307134628296 }, { "auxiliary_loss_clip": 0.01149201, "auxiliary_loss_mlp": 0.01051243, "balance_loss_clip": 1.03419662, "balance_loss_mlp": 1.05069721, "epoch": 0.1946790921388847, "flos": 25301365971840.0, "grad_norm": 1.6376438044674047, "language_loss": 0.75144219, "learning_rate": 3.7221993187625583e-06, "loss": 0.77344662, "num_input_tokens_seen": 69862220, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.984375, "step": 3238, "time_per_iteration": 2.5301246643066406 }, { "auxiliary_loss_clip": 0.01146154, "auxiliary_loss_mlp": 0.01048268, "balance_loss_clip": 1.02994573, "balance_loss_mlp": 1.0498271, "epoch": 0.19473921539155267, "flos": 20193396503040.0, "grad_norm": 1.9307245117403302, "language_loss": 0.73514777, "learning_rate": 3.7220012696907155e-06, "loss": 0.757092, "num_input_tokens_seen": 69881830, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9609375, "step": 3239, "time_per_iteration": 2.4515674114227295 }, { "auxiliary_loss_clip": 0.01144506, "auxiliary_loss_mlp": 0.01043674, "balance_loss_clip": 1.02568567, "balance_loss_mlp": 1.04842901, "epoch": 0.19479933864422067, "flos": 20887549810560.0, "grad_norm": 2.019761749575361, "language_loss": 0.73292667, "learning_rate": 3.721803155320412e-06, "loss": 0.75480849, "num_input_tokens_seen": 69900515, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9609375, "step": 3240, "time_per_iteration": 2.483609914779663 }, { "auxiliary_loss_clip": 0.01145559, "auxiliary_loss_mlp": 0.01044231, "balance_loss_clip": 1.02663577, "balance_loss_mlp": 1.05004621, "epoch": 0.19485946189688863, "flos": 23295072839040.0, "grad_norm": 1.8169885085420656, "language_loss": 0.66610563, "learning_rate": 3.7216049756591606e-06, "loss": 0.68800348, "num_input_tokens_seen": 69920060, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.953125, "step": 3241, "time_per_iteration": 2.4957053661346436 }, { "auxiliary_loss_clip": 0.01144304, "auxiliary_loss_mlp": 0.01046736, "balance_loss_clip": 1.02902126, "balance_loss_mlp": 1.05007625, "epoch": 0.1949195851495566, "flos": 23295036925440.0, "grad_norm": 1.3290402295888604, "language_loss": 0.82856548, "learning_rate": 3.7214067307144754e-06, "loss": 0.85047591, "num_input_tokens_seen": 69939820, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9453125, "step": 3242, "time_per_iteration": 2.4981937408447266 }, { "auxiliary_loss_clip": 0.01065503, "auxiliary_loss_mlp": 0.01060697, "balance_loss_clip": 1.05793083, "balance_loss_mlp": 1.03222191, "epoch": 0.19497970840222456, "flos": 64962871557120.0, "grad_norm": 0.8157000311369076, "language_loss": 0.57453251, "learning_rate": 3.721208420493875e-06, "loss": 0.59579444, "num_input_tokens_seen": 70002145, "router_z_loss_clip": 0.02770996, "router_z_loss_mlp": 0.33203125, "step": 3243, "time_per_iteration": 3.103098154067993 }, { "auxiliary_loss_clip": 0.01144108, "auxiliary_loss_mlp": 0.01042668, "balance_loss_clip": 1.02446461, "balance_loss_mlp": 1.04809737, "epoch": 0.19503983165489253, "flos": 19644717277440.0, "grad_norm": 1.964179798243553, "language_loss": 0.83223057, "learning_rate": 3.7210100450048784e-06, "loss": 0.85409832, "num_input_tokens_seen": 70020510, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9609375, "step": 3244, "time_per_iteration": 2.461907148361206 }, { "auxiliary_loss_clip": 0.01147136, "auxiliary_loss_mlp": 0.0104796, "balance_loss_clip": 1.03031754, "balance_loss_mlp": 1.0520103, "epoch": 0.1950999549075605, "flos": 21141976821120.0, "grad_norm": 1.6572460253130694, "language_loss": 0.76886731, "learning_rate": 3.7208116042550088e-06, "loss": 0.79081821, "num_input_tokens_seen": 70040760, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.953125, "step": 3245, "time_per_iteration": 2.4657533168792725 }, { "auxiliary_loss_clip": 0.01145449, "auxiliary_loss_mlp": 0.01039251, "balance_loss_clip": 1.02094078, "balance_loss_mlp": 1.04954076, "epoch": 0.19516007816022846, "flos": 20884820376960.0, "grad_norm": 2.228952094669208, "language_loss": 0.8434304, "learning_rate": 3.7206130982517906e-06, "loss": 0.86527741, "num_input_tokens_seen": 70058720, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.95703125, "step": 3246, "time_per_iteration": 2.4664363861083984 }, { "auxiliary_loss_clip": 0.01147701, "auxiliary_loss_mlp": 0.01046506, "balance_loss_clip": 1.02850533, "balance_loss_mlp": 1.04965281, "epoch": 0.19522020141289645, "flos": 16910515031040.0, "grad_norm": 1.9828444419991869, "language_loss": 0.75256687, "learning_rate": 3.7204145270027514e-06, "loss": 0.77450895, "num_input_tokens_seen": 70076470, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.9765625, "step": 3247, "time_per_iteration": 2.429638147354126 }, { "auxiliary_loss_clip": 0.01149725, "auxiliary_loss_mlp": 0.0104362, "balance_loss_clip": 1.02613187, "balance_loss_mlp": 1.05315733, "epoch": 0.19528032466556441, "flos": 26724829023360.0, "grad_norm": 1.5689355354628314, "language_loss": 0.75651407, "learning_rate": 3.720215890515421e-06, "loss": 0.77844751, "num_input_tokens_seen": 70096220, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.96484375, "step": 3248, "time_per_iteration": 2.5247976779937744 }, { "auxiliary_loss_clip": 0.01145983, "auxiliary_loss_mlp": 0.01052557, "balance_loss_clip": 1.03492558, "balance_loss_mlp": 1.049582, "epoch": 0.19534044791823238, "flos": 21032808410880.0, "grad_norm": 5.664224622830337, "language_loss": 0.78275299, "learning_rate": 3.7200171887973316e-06, "loss": 0.80473834, "num_input_tokens_seen": 70114800, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.96484375, "step": 3249, "time_per_iteration": 2.464052438735962 }, { "auxiliary_loss_clip": 0.01146686, "auxiliary_loss_mlp": 0.0104951, "balance_loss_clip": 1.03186679, "balance_loss_mlp": 1.05025268, "epoch": 0.19540057117090034, "flos": 22344050396160.0, "grad_norm": 1.4707656415289707, "language_loss": 0.73205632, "learning_rate": 3.7198184218560176e-06, "loss": 0.75401831, "num_input_tokens_seen": 70134930, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.96484375, "step": 3250, "time_per_iteration": 2.57793927192688 }, { "auxiliary_loss_clip": 0.01141922, "auxiliary_loss_mlp": 0.01041355, "balance_loss_clip": 1.0246774, "balance_loss_mlp": 1.0488019, "epoch": 0.1954606944235683, "flos": 20301631159680.0, "grad_norm": 2.0473182009381032, "language_loss": 0.79516387, "learning_rate": 3.719619589699017e-06, "loss": 0.81699669, "num_input_tokens_seen": 70152045, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9296875, "step": 3251, "time_per_iteration": 2.4506916999816895 }, { "auxiliary_loss_clip": 0.01144757, "auxiliary_loss_mlp": 0.01046557, "balance_loss_clip": 1.02885413, "balance_loss_mlp": 1.04834914, "epoch": 0.19552081767623627, "flos": 17346865449600.0, "grad_norm": 2.9990160122866034, "language_loss": 0.83561045, "learning_rate": 3.7194206923338695e-06, "loss": 0.85752356, "num_input_tokens_seen": 70169240, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.96484375, "step": 3252, "time_per_iteration": 2.4532904624938965 }, { "auxiliary_loss_clip": 0.01149878, "auxiliary_loss_mlp": 0.01045816, "balance_loss_clip": 1.02588391, "balance_loss_mlp": 1.04931283, "epoch": 0.19558094092890424, "flos": 31977626129280.0, "grad_norm": 1.7137838251629534, "language_loss": 0.73488402, "learning_rate": 3.719221729768117e-06, "loss": 0.75684094, "num_input_tokens_seen": 70192690, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 1.0078125, "step": 3253, "time_per_iteration": 2.5580642223358154 }, { "auxiliary_loss_clip": 0.01147137, "auxiliary_loss_mlp": 0.01044651, "balance_loss_clip": 1.02659106, "balance_loss_mlp": 1.04747176, "epoch": 0.19564106418157223, "flos": 22268889187200.0, "grad_norm": 2.151466436565409, "language_loss": 0.7669028, "learning_rate": 3.7190227020093037e-06, "loss": 0.78882062, "num_input_tokens_seen": 70209685, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.99609375, "step": 3254, "time_per_iteration": 2.473896026611328 }, { "auxiliary_loss_clip": 0.01059188, "auxiliary_loss_mlp": 0.01013613, "balance_loss_clip": 1.01103842, "balance_loss_mlp": 1.02605343, "epoch": 0.1957011874342402, "flos": 54364554385920.0, "grad_norm": 0.7636070672890102, "language_loss": 0.55392694, "learning_rate": 3.7188236090649774e-06, "loss": 0.57465494, "num_input_tokens_seen": 70265050, "router_z_loss_clip": 0.02575684, "router_z_loss_mlp": 0.33203125, "step": 3255, "time_per_iteration": 3.0624334812164307 }, { "auxiliary_loss_clip": 0.0114783, "auxiliary_loss_mlp": 0.01044376, "balance_loss_clip": 1.02688766, "balance_loss_mlp": 1.05033827, "epoch": 0.19576131068690816, "flos": 16506699356160.0, "grad_norm": 3.7537584967723165, "language_loss": 0.70463306, "learning_rate": 3.718624450942688e-06, "loss": 0.72655511, "num_input_tokens_seen": 70281830, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9765625, "step": 3256, "time_per_iteration": 2.4797887802124023 }, { "auxiliary_loss_clip": 0.01141725, "auxiliary_loss_mlp": 0.01042672, "balance_loss_clip": 1.02507687, "balance_loss_mlp": 1.04697347, "epoch": 0.19582143393957613, "flos": 14719676797440.0, "grad_norm": 2.4013482392703356, "language_loss": 0.80316144, "learning_rate": 3.718425227649987e-06, "loss": 0.82500541, "num_input_tokens_seen": 70297420, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9453125, "step": 3257, "time_per_iteration": 2.4265434741973877 }, { "auxiliary_loss_clip": 0.01144883, "auxiliary_loss_mlp": 0.01043919, "balance_loss_clip": 1.02693176, "balance_loss_mlp": 1.04916072, "epoch": 0.1958815571922441, "flos": 24425504737920.0, "grad_norm": 1.666639657817641, "language_loss": 0.75234628, "learning_rate": 3.7182259391944292e-06, "loss": 0.77423429, "num_input_tokens_seen": 70319210, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9609375, "step": 3258, "time_per_iteration": 2.5035181045532227 }, { "auxiliary_loss_clip": 0.01145869, "auxiliary_loss_mlp": 0.01040694, "balance_loss_clip": 1.02296805, "balance_loss_mlp": 1.04765153, "epoch": 0.19594168044491206, "flos": 24900279730560.0, "grad_norm": 1.8029851693758574, "language_loss": 0.73894787, "learning_rate": 3.7180265855835714e-06, "loss": 0.76081347, "num_input_tokens_seen": 70339045, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.984375, "step": 3259, "time_per_iteration": 2.492037534713745 }, { "auxiliary_loss_clip": 0.01147898, "auxiliary_loss_mlp": 0.01045677, "balance_loss_clip": 1.02682972, "balance_loss_mlp": 1.04924738, "epoch": 0.19600180369758005, "flos": 12057008486400.0, "grad_norm": 2.8315556780521747, "language_loss": 0.77529871, "learning_rate": 3.7178271668249735e-06, "loss": 0.79723442, "num_input_tokens_seen": 70356505, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.98828125, "step": 3260, "time_per_iteration": 3.9303715229034424 }, { "auxiliary_loss_clip": 0.01144323, "auxiliary_loss_mlp": 0.0105027, "balance_loss_clip": 1.03184056, "balance_loss_mlp": 1.04662132, "epoch": 0.19606192695024802, "flos": 20850202644480.0, "grad_norm": 3.483604972060301, "language_loss": 0.81786263, "learning_rate": 3.7176276829261975e-06, "loss": 0.83980858, "num_input_tokens_seen": 70375410, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.9765625, "step": 3261, "time_per_iteration": 2.4611728191375732 }, { "auxiliary_loss_clip": 0.01145953, "auxiliary_loss_mlp": 0.01051762, "balance_loss_clip": 1.03357077, "balance_loss_mlp": 1.0496223, "epoch": 0.19612205020291598, "flos": 28475509996800.0, "grad_norm": 2.001978585216225, "language_loss": 0.76463711, "learning_rate": 3.717428133894807e-06, "loss": 0.7866143, "num_input_tokens_seen": 70396315, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9609375, "step": 3262, "time_per_iteration": 3.9432151317596436 }, { "auxiliary_loss_clip": 0.01145941, "auxiliary_loss_mlp": 0.01058888, "balance_loss_clip": 1.04173374, "balance_loss_mlp": 1.05124736, "epoch": 0.19618217345558395, "flos": 25556618995200.0, "grad_norm": 1.698587986651839, "language_loss": 0.8623991, "learning_rate": 3.71722851973837e-06, "loss": 0.88444746, "num_input_tokens_seen": 70417945, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9453125, "step": 3263, "time_per_iteration": 2.53239369392395 }, { "auxiliary_loss_clip": 0.01144432, "auxiliary_loss_mlp": 0.01050981, "balance_loss_clip": 1.03383851, "balance_loss_mlp": 1.04882908, "epoch": 0.1962422967082519, "flos": 25264413855360.0, "grad_norm": 1.6016367714097517, "language_loss": 0.73766315, "learning_rate": 3.717028840464455e-06, "loss": 0.75961727, "num_input_tokens_seen": 70438690, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.95703125, "step": 3264, "time_per_iteration": 3.902057647705078 }, { "auxiliary_loss_clip": 0.01142438, "auxiliary_loss_mlp": 0.01058791, "balance_loss_clip": 1.04231596, "balance_loss_mlp": 1.04970264, "epoch": 0.19630241996091988, "flos": 18807352444800.0, "grad_norm": 3.101649851232967, "language_loss": 0.78705055, "learning_rate": 3.7168290960806344e-06, "loss": 0.80906284, "num_input_tokens_seen": 70455385, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.9296875, "step": 3265, "time_per_iteration": 3.896080732345581 }, { "auxiliary_loss_clip": 0.01049601, "auxiliary_loss_mlp": 0.01064561, "balance_loss_clip": 1.06162882, "balance_loss_mlp": 1.01747918, "epoch": 0.19636254321358784, "flos": 62321137896960.0, "grad_norm": 0.8074212583042685, "language_loss": 0.53485888, "learning_rate": 3.716629286594483e-06, "loss": 0.55600053, "num_input_tokens_seen": 70514280, "router_z_loss_clip": 0.02929688, "router_z_loss_mlp": 0.3203125, "step": 3266, "time_per_iteration": 3.0913610458374023 }, { "auxiliary_loss_clip": 0.01145769, "auxiliary_loss_mlp": 0.01055761, "balance_loss_clip": 1.0362823, "balance_loss_mlp": 1.0469377, "epoch": 0.19642266646625584, "flos": 21069329564160.0, "grad_norm": 1.92721057279587, "language_loss": 0.79737359, "learning_rate": 3.7164294120135767e-06, "loss": 0.81938893, "num_input_tokens_seen": 70531800, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.98828125, "step": 3267, "time_per_iteration": 2.481070041656494 }, { "auxiliary_loss_clip": 0.01138052, "auxiliary_loss_mlp": 0.01045991, "balance_loss_clip": 1.02907538, "balance_loss_mlp": 1.04577279, "epoch": 0.1964827897189238, "flos": 14538651229440.0, "grad_norm": 2.2049714663915387, "language_loss": 0.86261749, "learning_rate": 3.7162294723454953e-06, "loss": 0.88445795, "num_input_tokens_seen": 70550615, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.921875, "step": 3268, "time_per_iteration": 2.457596778869629 }, { "auxiliary_loss_clip": 0.01143878, "auxiliary_loss_mlp": 0.0104783, "balance_loss_clip": 1.03072381, "balance_loss_mlp": 1.05041778, "epoch": 0.19654291297159177, "flos": 19244636616960.0, "grad_norm": 2.204105503440973, "language_loss": 0.69305545, "learning_rate": 3.7160294675978197e-06, "loss": 0.7149725, "num_input_tokens_seen": 70568690, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.93359375, "step": 3269, "time_per_iteration": 2.4585342407226562 }, { "auxiliary_loss_clip": 0.01147615, "auxiliary_loss_mlp": 0.01054666, "balance_loss_clip": 1.03639174, "balance_loss_mlp": 1.05052888, "epoch": 0.19660303622425973, "flos": 25775710001280.0, "grad_norm": 1.8472865836757675, "language_loss": 0.80673635, "learning_rate": 3.715829397778135e-06, "loss": 0.82875913, "num_input_tokens_seen": 70588665, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.96875, "step": 3270, "time_per_iteration": 2.527696371078491 }, { "auxiliary_loss_clip": 0.01141087, "auxiliary_loss_mlp": 0.01047274, "balance_loss_clip": 1.03100181, "balance_loss_mlp": 1.04611492, "epoch": 0.1966631594769277, "flos": 20595093275520.0, "grad_norm": 2.308455131534149, "language_loss": 0.84276539, "learning_rate": 3.715629262894028e-06, "loss": 0.86464906, "num_input_tokens_seen": 70606900, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.953125, "step": 3271, "time_per_iteration": 2.465884208679199 }, { "auxiliary_loss_clip": 0.01138768, "auxiliary_loss_mlp": 0.01048941, "balance_loss_clip": 1.0312264, "balance_loss_mlp": 1.04711866, "epoch": 0.19672328272959566, "flos": 23623188600960.0, "grad_norm": 1.9399335323645646, "language_loss": 0.80705202, "learning_rate": 3.715429062953087e-06, "loss": 0.82892907, "num_input_tokens_seen": 70625955, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9140625, "step": 3272, "time_per_iteration": 2.56066632270813 }, { "auxiliary_loss_clip": 0.01144673, "auxiliary_loss_mlp": 0.01044833, "balance_loss_clip": 1.02646303, "balance_loss_mlp": 1.04863906, "epoch": 0.19678340598226365, "flos": 23110922787840.0, "grad_norm": 1.7201844911898556, "language_loss": 0.80426359, "learning_rate": 3.7152287979629043e-06, "loss": 0.82615864, "num_input_tokens_seen": 70646090, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9609375, "step": 3273, "time_per_iteration": 2.4963607788085938 }, { "auxiliary_loss_clip": 0.0114469, "auxiliary_loss_mlp": 0.01052946, "balance_loss_clip": 1.03583956, "balance_loss_mlp": 1.04824257, "epoch": 0.19684352923493162, "flos": 24534852716160.0, "grad_norm": 1.8199148571086863, "language_loss": 0.77767724, "learning_rate": 3.7150284679310735e-06, "loss": 0.79965359, "num_input_tokens_seen": 70666065, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9609375, "step": 3274, "time_per_iteration": 2.535538673400879 }, { "auxiliary_loss_clip": 0.01145005, "auxiliary_loss_mlp": 0.01042486, "balance_loss_clip": 1.0240922, "balance_loss_mlp": 1.04744697, "epoch": 0.19690365248759958, "flos": 21796448578560.0, "grad_norm": 2.5830275603604744, "language_loss": 0.80889618, "learning_rate": 3.7148280728651914e-06, "loss": 0.83077109, "num_input_tokens_seen": 70681580, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9765625, "step": 3275, "time_per_iteration": 2.439413547515869 }, { "auxiliary_loss_clip": 0.01144369, "auxiliary_loss_mlp": 0.01043727, "balance_loss_clip": 1.02557111, "balance_loss_mlp": 1.04668105, "epoch": 0.19696377574026755, "flos": 19056643810560.0, "grad_norm": 1.9432186451848452, "language_loss": 0.80746627, "learning_rate": 3.7146276127728563e-06, "loss": 0.82934725, "num_input_tokens_seen": 70697745, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9765625, "step": 3276, "time_per_iteration": 2.4395346641540527 }, { "auxiliary_loss_clip": 0.01143066, "auxiliary_loss_mlp": 0.01042341, "balance_loss_clip": 1.02435279, "balance_loss_mlp": 1.04770923, "epoch": 0.19702389899293551, "flos": 22820656982400.0, "grad_norm": 2.2838015026144256, "language_loss": 0.8919512, "learning_rate": 3.7144270876616713e-06, "loss": 0.91380525, "num_input_tokens_seen": 70715110, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.953125, "step": 3277, "time_per_iteration": 2.483245372772217 }, { "auxiliary_loss_clip": 0.01148526, "auxiliary_loss_mlp": 0.01050141, "balance_loss_clip": 1.03055525, "balance_loss_mlp": 1.04839182, "epoch": 0.19708402224560348, "flos": 22894237992960.0, "grad_norm": 2.2274695796133845, "language_loss": 0.62425637, "learning_rate": 3.714226497539239e-06, "loss": 0.64624304, "num_input_tokens_seen": 70734715, "router_z_loss_clip": 0.19628906, "router_z_loss_mlp": 1.0, "step": 3278, "time_per_iteration": 2.490159034729004 }, { "auxiliary_loss_clip": 0.01145915, "auxiliary_loss_mlp": 0.01053699, "balance_loss_clip": 1.0354718, "balance_loss_mlp": 1.04879045, "epoch": 0.19714414549827144, "flos": 25662519267840.0, "grad_norm": 2.436958463091841, "language_loss": 0.73708349, "learning_rate": 3.714025842413166e-06, "loss": 0.75907958, "num_input_tokens_seen": 70752650, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.97265625, "step": 3279, "time_per_iteration": 2.5015270709991455 }, { "auxiliary_loss_clip": 0.01145492, "auxiliary_loss_mlp": 0.01043552, "balance_loss_clip": 1.02650511, "balance_loss_mlp": 1.04753923, "epoch": 0.19720426875093944, "flos": 23915824704000.0, "grad_norm": 1.62305445286632, "language_loss": 0.82458544, "learning_rate": 3.713825122291061e-06, "loss": 0.8464759, "num_input_tokens_seen": 70772365, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9765625, "step": 3280, "time_per_iteration": 2.4881441593170166 }, { "auxiliary_loss_clip": 0.01144871, "auxiliary_loss_mlp": 0.01043052, "balance_loss_clip": 1.02629113, "balance_loss_mlp": 1.04776967, "epoch": 0.1972643920036074, "flos": 13881952828800.0, "grad_norm": 1.789959777930888, "language_loss": 0.77634275, "learning_rate": 3.713624337180536e-06, "loss": 0.79822201, "num_input_tokens_seen": 70790340, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.97265625, "step": 3281, "time_per_iteration": 2.4232521057128906 }, { "auxiliary_loss_clip": 0.01141648, "auxiliary_loss_mlp": 0.01053529, "balance_loss_clip": 1.03686357, "balance_loss_mlp": 1.048522, "epoch": 0.19732451525627537, "flos": 19863592801920.0, "grad_norm": 1.5953951347829711, "language_loss": 0.79695153, "learning_rate": 3.7134234870892045e-06, "loss": 0.81890327, "num_input_tokens_seen": 70809295, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9296875, "step": 3282, "time_per_iteration": 2.45591139793396 }, { "auxiliary_loss_clip": 0.01150539, "auxiliary_loss_mlp": 0.01043263, "balance_loss_clip": 1.02561963, "balance_loss_mlp": 1.05167198, "epoch": 0.19738463850894333, "flos": 24973429777920.0, "grad_norm": 1.9731453780424457, "language_loss": 0.71621525, "learning_rate": 3.7132225720246826e-06, "loss": 0.73815322, "num_input_tokens_seen": 70828765, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.98828125, "step": 3283, "time_per_iteration": 2.489757776260376 }, { "auxiliary_loss_clip": 0.01146959, "auxiliary_loss_mlp": 0.01046016, "balance_loss_clip": 1.0288496, "balance_loss_mlp": 1.05017591, "epoch": 0.1974447617616113, "flos": 18368883123840.0, "grad_norm": 1.6709234953542802, "language_loss": 0.79050863, "learning_rate": 3.7130215919945886e-06, "loss": 0.81243837, "num_input_tokens_seen": 70846805, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.96875, "step": 3284, "time_per_iteration": 2.464425802230835 }, { "auxiliary_loss_clip": 0.01148917, "auxiliary_loss_mlp": 0.01042759, "balance_loss_clip": 1.02411497, "balance_loss_mlp": 1.05039358, "epoch": 0.19750488501427926, "flos": 22892945103360.0, "grad_norm": 2.3252644057129883, "language_loss": 0.86338115, "learning_rate": 3.7128205470065445e-06, "loss": 0.88529789, "num_input_tokens_seen": 70863805, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.984375, "step": 3285, "time_per_iteration": 2.464097738265991 }, { "auxiliary_loss_clip": 0.0114525, "auxiliary_loss_mlp": 0.01045183, "balance_loss_clip": 1.02783847, "balance_loss_mlp": 1.05127859, "epoch": 0.19756500826694723, "flos": 21871502046720.0, "grad_norm": 2.492356518776629, "language_loss": 0.87688464, "learning_rate": 3.712619437068174e-06, "loss": 0.89878899, "num_input_tokens_seen": 70882660, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.9375, "step": 3286, "time_per_iteration": 2.465358018875122 }, { "auxiliary_loss_clip": 0.01150553, "auxiliary_loss_mlp": 0.01045525, "balance_loss_clip": 1.02572465, "balance_loss_mlp": 1.05215573, "epoch": 0.19762513151961522, "flos": 15158972131200.0, "grad_norm": 2.096956217720314, "language_loss": 0.7797429, "learning_rate": 3.712418262187102e-06, "loss": 0.80170369, "num_input_tokens_seen": 70898765, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.984375, "step": 3287, "time_per_iteration": 2.435394287109375 }, { "auxiliary_loss_clip": 0.01148172, "auxiliary_loss_mlp": 0.01045021, "balance_loss_clip": 1.0260067, "balance_loss_mlp": 1.049698, "epoch": 0.1976852547722832, "flos": 16979175878400.0, "grad_norm": 3.21583424533933, "language_loss": 0.80999452, "learning_rate": 3.7122170223709584e-06, "loss": 0.83192641, "num_input_tokens_seen": 70916370, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.984375, "step": 3288, "time_per_iteration": 2.443194627761841 }, { "auxiliary_loss_clip": 0.011411, "auxiliary_loss_mlp": 0.01047459, "balance_loss_clip": 1.02974474, "balance_loss_mlp": 1.04859209, "epoch": 0.19774537802495115, "flos": 20302924049280.0, "grad_norm": 1.6729819607982717, "language_loss": 0.72793758, "learning_rate": 3.712015717627374e-06, "loss": 0.74982321, "num_input_tokens_seen": 70934870, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.92578125, "step": 3289, "time_per_iteration": 2.470118999481201 }, { "auxiliary_loss_clip": 0.01147146, "auxiliary_loss_mlp": 0.01045356, "balance_loss_clip": 1.02733123, "balance_loss_mlp": 1.05115652, "epoch": 0.19780550127761912, "flos": 27235478724480.0, "grad_norm": 2.9927509656980136, "language_loss": 0.79379028, "learning_rate": 3.7118143479639813e-06, "loss": 0.81571531, "num_input_tokens_seen": 70955140, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9609375, "step": 3290, "time_per_iteration": 2.503899335861206 }, { "auxiliary_loss_clip": 0.01052506, "auxiliary_loss_mlp": 0.01035406, "balance_loss_clip": 1.03270042, "balance_loss_mlp": 1.01749134, "epoch": 0.19786562453028708, "flos": 63550972684800.0, "grad_norm": 0.9164195085825904, "language_loss": 0.6041069, "learning_rate": 3.711612913388418e-06, "loss": 0.62498599, "num_input_tokens_seen": 71012005, "router_z_loss_clip": 0.02709961, "router_z_loss_mlp": 0.34960938, "step": 3291, "time_per_iteration": 3.1165144443511963 }, { "auxiliary_loss_clip": 0.01147371, "auxiliary_loss_mlp": 0.01049143, "balance_loss_clip": 1.02993882, "balance_loss_mlp": 1.04759908, "epoch": 0.19792574778295505, "flos": 26286647011200.0, "grad_norm": 2.0936699685331233, "language_loss": 0.811566, "learning_rate": 3.7114114139083204e-06, "loss": 0.83353114, "num_input_tokens_seen": 71031140, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 1.0, "step": 3292, "time_per_iteration": 2.5200107097625732 }, { "auxiliary_loss_clip": 0.01137774, "auxiliary_loss_mlp": 0.01056205, "balance_loss_clip": 1.03901482, "balance_loss_mlp": 1.04635715, "epoch": 0.19798587103562304, "flos": 19938107566080.0, "grad_norm": 1.686755287288585, "language_loss": 0.81489235, "learning_rate": 3.7112098495313313e-06, "loss": 0.83683217, "num_input_tokens_seen": 71050250, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9140625, "step": 3293, "time_per_iteration": 2.452224016189575 }, { "auxiliary_loss_clip": 0.01155099, "auxiliary_loss_mlp": 0.01057659, "balance_loss_clip": 1.03869319, "balance_loss_mlp": 1.05430508, "epoch": 0.198045994288291, "flos": 20120282369280.0, "grad_norm": 2.0358635770623743, "language_loss": 0.60995269, "learning_rate": 3.711008220265093e-06, "loss": 0.63208026, "num_input_tokens_seen": 71068665, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0078125, "step": 3294, "time_per_iteration": 2.477336883544922 }, { "auxiliary_loss_clip": 0.01144922, "auxiliary_loss_mlp": 0.01059219, "balance_loss_clip": 1.04210019, "balance_loss_mlp": 1.0494597, "epoch": 0.19810611754095897, "flos": 17967653228160.0, "grad_norm": 1.959985700560489, "language_loss": 0.86922944, "learning_rate": 3.710806526117251e-06, "loss": 0.89127082, "num_input_tokens_seen": 71085320, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.953125, "step": 3295, "time_per_iteration": 2.4375922679901123 }, { "auxiliary_loss_clip": 0.01142509, "auxiliary_loss_mlp": 0.01073452, "balance_loss_clip": 1.05729926, "balance_loss_mlp": 1.04845881, "epoch": 0.19816624079362694, "flos": 15084996071040.0, "grad_norm": 4.808877959373324, "language_loss": 0.80634689, "learning_rate": 3.7106047670954544e-06, "loss": 0.82850647, "num_input_tokens_seen": 71102020, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.94140625, "step": 3296, "time_per_iteration": 2.440845251083374 }, { "auxiliary_loss_clip": 0.01146683, "auxiliary_loss_mlp": 0.01057225, "balance_loss_clip": 1.03827047, "balance_loss_mlp": 1.04822838, "epoch": 0.1982263640462949, "flos": 24900315644160.0, "grad_norm": 1.8894313467825055, "language_loss": 0.68063635, "learning_rate": 3.710402943207354e-06, "loss": 0.70267534, "num_input_tokens_seen": 71123390, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.984375, "step": 3297, "time_per_iteration": 2.502390146255493 }, { "auxiliary_loss_clip": 0.0114174, "auxiliary_loss_mlp": 0.01063238, "balance_loss_clip": 1.04758573, "balance_loss_mlp": 1.05035865, "epoch": 0.19828648729896287, "flos": 20376181837440.0, "grad_norm": 1.6258660166346917, "language_loss": 0.81276828, "learning_rate": 3.7102010544606016e-06, "loss": 0.83481812, "num_input_tokens_seen": 71141800, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.9140625, "step": 3298, "time_per_iteration": 2.4734578132629395 }, { "auxiliary_loss_clip": 0.01149212, "auxiliary_loss_mlp": 0.0106634, "balance_loss_clip": 1.04696834, "balance_loss_mlp": 1.05068111, "epoch": 0.19834661055163083, "flos": 18880035615360.0, "grad_norm": 1.9391472278649433, "language_loss": 0.85039878, "learning_rate": 3.7099991008628544e-06, "loss": 0.8725543, "num_input_tokens_seen": 71159505, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.984375, "step": 3299, "time_per_iteration": 2.4561195373535156 }, { "auxiliary_loss_clip": 0.01053449, "auxiliary_loss_mlp": 0.01059295, "balance_loss_clip": 1.05687523, "balance_loss_mlp": 1.02038741, "epoch": 0.19840673380429882, "flos": 60259184640000.0, "grad_norm": 0.8093792061182172, "language_loss": 0.53322506, "learning_rate": 3.7097970824217706e-06, "loss": 0.55435252, "num_input_tokens_seen": 71223265, "router_z_loss_clip": 0.02416992, "router_z_loss_mlp": 0.33203125, "step": 3300, "time_per_iteration": 3.0479633808135986 }, { "auxiliary_loss_clip": 0.01143505, "auxiliary_loss_mlp": 0.01074879, "balance_loss_clip": 1.05625844, "balance_loss_mlp": 1.04855013, "epoch": 0.1984668570569668, "flos": 19902017376000.0, "grad_norm": 1.5639673778527075, "language_loss": 0.73613721, "learning_rate": 3.7095949991450093e-06, "loss": 0.75832105, "num_input_tokens_seen": 71242385, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.94921875, "step": 3301, "time_per_iteration": 2.4841814041137695 }, { "auxiliary_loss_clip": 0.01143101, "auxiliary_loss_mlp": 0.0104329, "balance_loss_clip": 1.0274229, "balance_loss_mlp": 1.0488044, "epoch": 0.19852698030963475, "flos": 15630766295040.0, "grad_norm": 2.3147823743819216, "language_loss": 0.8818171, "learning_rate": 3.709392851040235e-06, "loss": 0.90368104, "num_input_tokens_seen": 71258990, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.9453125, "step": 3302, "time_per_iteration": 3.9555442333221436 }, { "auxiliary_loss_clip": 0.01143085, "auxiliary_loss_mlp": 0.01057239, "balance_loss_clip": 1.0401684, "balance_loss_mlp": 1.04741955, "epoch": 0.19858710356230272, "flos": 43143007311360.0, "grad_norm": 2.256757135372996, "language_loss": 0.73814827, "learning_rate": 3.709190638115111e-06, "loss": 0.76015151, "num_input_tokens_seen": 71282770, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.95703125, "step": 3303, "time_per_iteration": 2.680234909057617 }, { "auxiliary_loss_clip": 0.01142091, "auxiliary_loss_mlp": 0.01049239, "balance_loss_clip": 1.0326457, "balance_loss_mlp": 1.04853475, "epoch": 0.19864722681497068, "flos": 35144084643840.0, "grad_norm": 1.988753306228714, "language_loss": 0.74674618, "learning_rate": 3.7089883603773084e-06, "loss": 0.76865947, "num_input_tokens_seen": 71301410, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9375, "step": 3304, "time_per_iteration": 3.985246419906616 }, { "auxiliary_loss_clip": 0.01143223, "auxiliary_loss_mlp": 0.0104144, "balance_loss_clip": 1.0253582, "balance_loss_mlp": 1.04938078, "epoch": 0.19870735006763865, "flos": 19426200888960.0, "grad_norm": 1.7882038933933493, "language_loss": 0.86209989, "learning_rate": 3.7087860178344955e-06, "loss": 0.88394654, "num_input_tokens_seen": 71319670, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.9375, "step": 3305, "time_per_iteration": 3.791156768798828 }, { "auxiliary_loss_clip": 0.01143018, "auxiliary_loss_mlp": 0.01049059, "balance_loss_clip": 1.03275096, "balance_loss_mlp": 1.04610944, "epoch": 0.19876747332030664, "flos": 23547380947200.0, "grad_norm": 1.4920155829392703, "language_loss": 0.68382108, "learning_rate": 3.7085836104943445e-06, "loss": 0.70574188, "num_input_tokens_seen": 71339850, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.96875, "step": 3306, "time_per_iteration": 3.985128402709961 }, { "auxiliary_loss_clip": 0.01138366, "auxiliary_loss_mlp": 0.01041438, "balance_loss_clip": 1.02609563, "balance_loss_mlp": 1.04466796, "epoch": 0.1988275965729746, "flos": 19829406032640.0, "grad_norm": 1.5141646467517553, "language_loss": 0.76349854, "learning_rate": 3.7083811383645332e-06, "loss": 0.78529656, "num_input_tokens_seen": 71359795, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.9375, "step": 3307, "time_per_iteration": 2.512373447418213 }, { "auxiliary_loss_clip": 0.01141743, "auxiliary_loss_mlp": 0.01042578, "balance_loss_clip": 1.02640092, "balance_loss_mlp": 1.04882705, "epoch": 0.19888771982564257, "flos": 23513625141120.0, "grad_norm": 1.8701478603564805, "language_loss": 0.75587451, "learning_rate": 3.708178601452737e-06, "loss": 0.77771771, "num_input_tokens_seen": 71378885, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.9296875, "step": 3308, "time_per_iteration": 2.483595609664917 }, { "auxiliary_loss_clip": 0.01143131, "auxiliary_loss_mlp": 0.01039936, "balance_loss_clip": 1.02349746, "balance_loss_mlp": 1.04790092, "epoch": 0.19894784307831054, "flos": 18150510389760.0, "grad_norm": 1.7133581706617782, "language_loss": 0.76051515, "learning_rate": 3.7079759997666374e-06, "loss": 0.78234583, "num_input_tokens_seen": 71397285, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.953125, "step": 3309, "time_per_iteration": 2.473759651184082 }, { "auxiliary_loss_clip": 0.01142504, "auxiliary_loss_mlp": 0.0104604, "balance_loss_clip": 1.02871931, "balance_loss_mlp": 1.04894185, "epoch": 0.1990079663309785, "flos": 24276044246400.0, "grad_norm": 1.5173938232641113, "language_loss": 0.87892962, "learning_rate": 3.707773333313917e-06, "loss": 0.90081501, "num_input_tokens_seen": 71415775, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9375, "step": 3310, "time_per_iteration": 2.4870429039001465 }, { "auxiliary_loss_clip": 0.01138844, "auxiliary_loss_mlp": 0.01036939, "balance_loss_clip": 1.01952279, "balance_loss_mlp": 1.04530263, "epoch": 0.19906808958364647, "flos": 34897666366080.0, "grad_norm": 2.131569244350195, "language_loss": 0.64231575, "learning_rate": 3.70757060210226e-06, "loss": 0.66407365, "num_input_tokens_seen": 71437315, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9375, "step": 3311, "time_per_iteration": 2.59529972076416 }, { "auxiliary_loss_clip": 0.01144926, "auxiliary_loss_mlp": 0.01040643, "balance_loss_clip": 1.02358389, "balance_loss_mlp": 1.04867578, "epoch": 0.19912821283631443, "flos": 24024885373440.0, "grad_norm": 3.02625516266971, "language_loss": 0.7352463, "learning_rate": 3.707367806139355e-06, "loss": 0.75710195, "num_input_tokens_seen": 71456320, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.96484375, "step": 3312, "time_per_iteration": 2.504673480987549 }, { "auxiliary_loss_clip": 0.01143077, "auxiliary_loss_mlp": 0.01040466, "balance_loss_clip": 1.02437294, "balance_loss_mlp": 1.04908037, "epoch": 0.19918833608898243, "flos": 19859031774720.0, "grad_norm": 1.9249493613525657, "language_loss": 0.83902901, "learning_rate": 3.7071649454328915e-06, "loss": 0.8608644, "num_input_tokens_seen": 71475360, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.9375, "step": 3313, "time_per_iteration": 2.4801502227783203 }, { "auxiliary_loss_clip": 0.01145225, "auxiliary_loss_mlp": 0.01042346, "balance_loss_clip": 1.02532339, "balance_loss_mlp": 1.05091798, "epoch": 0.1992484593416504, "flos": 29095794984960.0, "grad_norm": 2.157846237639418, "language_loss": 0.81201971, "learning_rate": 3.7069620199905625e-06, "loss": 0.83389544, "num_input_tokens_seen": 71496155, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.94140625, "step": 3314, "time_per_iteration": 2.5203349590301514 }, { "auxiliary_loss_clip": 0.01138793, "auxiliary_loss_mlp": 0.01043113, "balance_loss_clip": 1.02744865, "balance_loss_mlp": 1.04727066, "epoch": 0.19930858259431836, "flos": 23295001011840.0, "grad_norm": 1.4845872179821233, "language_loss": 0.87204862, "learning_rate": 3.7067590298200627e-06, "loss": 0.89386761, "num_input_tokens_seen": 71517295, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.9140625, "step": 3315, "time_per_iteration": 2.5165579319000244 }, { "auxiliary_loss_clip": 0.01143529, "auxiliary_loss_mlp": 0.01040941, "balance_loss_clip": 1.02419257, "balance_loss_mlp": 1.04925513, "epoch": 0.19936870584698632, "flos": 25378825651200.0, "grad_norm": 1.4624558570209258, "language_loss": 0.71199262, "learning_rate": 3.7065559749290892e-06, "loss": 0.73383737, "num_input_tokens_seen": 71540000, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9453125, "step": 3316, "time_per_iteration": 2.5338807106018066 }, { "auxiliary_loss_clip": 0.01056908, "auxiliary_loss_mlp": 0.01039158, "balance_loss_clip": 1.03666687, "balance_loss_mlp": 1.02437067, "epoch": 0.1994288290996543, "flos": 62168053109760.0, "grad_norm": 0.875172627060261, "language_loss": 0.6637609, "learning_rate": 3.706352855325342e-06, "loss": 0.68472159, "num_input_tokens_seen": 71607880, "router_z_loss_clip": 0.02490234, "router_z_loss_mlp": 0.32421875, "step": 3317, "time_per_iteration": 3.192793607711792 }, { "auxiliary_loss_clip": 0.01146703, "auxiliary_loss_mlp": 0.01044902, "balance_loss_clip": 1.02786732, "balance_loss_mlp": 1.04826319, "epoch": 0.19948895235232225, "flos": 19025832919680.0, "grad_norm": 3.8643990582878955, "language_loss": 0.74262023, "learning_rate": 3.7061496710165233e-06, "loss": 0.76453626, "num_input_tokens_seen": 71625695, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.984375, "step": 3318, "time_per_iteration": 2.4377336502075195 }, { "auxiliary_loss_clip": 0.01138268, "auxiliary_loss_mlp": 0.01043716, "balance_loss_clip": 1.0279268, "balance_loss_mlp": 1.04678059, "epoch": 0.19954907560499022, "flos": 37815803182080.0, "grad_norm": 2.056552336797205, "language_loss": 0.79156661, "learning_rate": 3.7059464220103385e-06, "loss": 0.81338644, "num_input_tokens_seen": 71648520, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.9140625, "step": 3319, "time_per_iteration": 2.6042306423187256 }, { "auxiliary_loss_clip": 0.01145897, "auxiliary_loss_mlp": 0.01045525, "balance_loss_clip": 1.02736974, "balance_loss_mlp": 1.04940784, "epoch": 0.1996091988576582, "flos": 49565199594240.0, "grad_norm": 2.316617894811262, "language_loss": 0.76098871, "learning_rate": 3.7057431083144945e-06, "loss": 0.7829029, "num_input_tokens_seen": 71672185, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.96484375, "step": 3320, "time_per_iteration": 2.7235984802246094 }, { "auxiliary_loss_clip": 0.01141522, "auxiliary_loss_mlp": 0.01048033, "balance_loss_clip": 1.03159392, "balance_loss_mlp": 1.04772973, "epoch": 0.19966932211032618, "flos": 22635788659200.0, "grad_norm": 1.6898963259185729, "language_loss": 0.80243528, "learning_rate": 3.705539729936701e-06, "loss": 0.82433081, "num_input_tokens_seen": 71692890, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9375, "step": 3321, "time_per_iteration": 2.4825096130371094 }, { "auxiliary_loss_clip": 0.01054574, "auxiliary_loss_mlp": 0.01014839, "balance_loss_clip": 1.01250267, "balance_loss_mlp": 1.02241373, "epoch": 0.19972944536299414, "flos": 54082117745280.0, "grad_norm": 0.8917590741315046, "language_loss": 0.65150321, "learning_rate": 3.7053362868846696e-06, "loss": 0.67219734, "num_input_tokens_seen": 71745815, "router_z_loss_clip": 0.02331543, "router_z_loss_mlp": 0.3203125, "step": 3322, "time_per_iteration": 2.9012115001678467 }, { "auxiliary_loss_clip": 0.01052328, "auxiliary_loss_mlp": 0.01013424, "balance_loss_clip": 1.01115859, "balance_loss_mlp": 1.02019143, "epoch": 0.1997895686156621, "flos": 69355031817600.0, "grad_norm": 0.7917863043676535, "language_loss": 0.56994981, "learning_rate": 3.7051327791661153e-06, "loss": 0.59060729, "num_input_tokens_seen": 71806915, "router_z_loss_clip": 0.02270508, "router_z_loss_mlp": 0.32226562, "step": 3323, "time_per_iteration": 3.2110209465026855 }, { "auxiliary_loss_clip": 0.01143217, "auxiliary_loss_mlp": 0.01050436, "balance_loss_clip": 1.03278136, "balance_loss_mlp": 1.05057287, "epoch": 0.19984969186833007, "flos": 18552063507840.0, "grad_norm": 2.758327056044599, "language_loss": 0.8051874, "learning_rate": 3.7049292067887555e-06, "loss": 0.82712394, "num_input_tokens_seen": 71824645, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.92578125, "step": 3324, "time_per_iteration": 2.460116386413574 }, { "auxiliary_loss_clip": 0.01139018, "auxiliary_loss_mlp": 0.01048946, "balance_loss_clip": 1.0311718, "balance_loss_mlp": 1.04615617, "epoch": 0.19990981512099804, "flos": 26429678968320.0, "grad_norm": 1.5748525496540875, "language_loss": 0.53704852, "learning_rate": 3.7047255697603092e-06, "loss": 0.55892813, "num_input_tokens_seen": 71845125, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9296875, "step": 3325, "time_per_iteration": 2.504155397415161 }, { "auxiliary_loss_clip": 0.01141031, "auxiliary_loss_mlp": 0.01048693, "balance_loss_clip": 1.03205156, "balance_loss_mlp": 1.04628849, "epoch": 0.19996993837366603, "flos": 16325997010560.0, "grad_norm": 2.0314665834279326, "language_loss": 0.86334896, "learning_rate": 3.7045218680884984e-06, "loss": 0.88524616, "num_input_tokens_seen": 71863500, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9453125, "step": 3326, "time_per_iteration": 2.4758787155151367 }, { "auxiliary_loss_clip": 0.0114042, "auxiliary_loss_mlp": 0.01038739, "balance_loss_clip": 1.02274108, "balance_loss_mlp": 1.04894614, "epoch": 0.200030061626334, "flos": 20844169159680.0, "grad_norm": 1.8829576543234903, "language_loss": 0.72003847, "learning_rate": 3.7043181017810476e-06, "loss": 0.74183005, "num_input_tokens_seen": 71881845, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.9140625, "step": 3327, "time_per_iteration": 2.4607443809509277 }, { "auxiliary_loss_clip": 0.01143282, "auxiliary_loss_mlp": 0.010438, "balance_loss_clip": 1.0253346, "balance_loss_mlp": 1.04661131, "epoch": 0.20009018487900196, "flos": 23762629198080.0, "grad_norm": 2.295729374915168, "language_loss": 0.76751041, "learning_rate": 3.7041142708456833e-06, "loss": 0.78938127, "num_input_tokens_seen": 71900940, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.96484375, "step": 3328, "time_per_iteration": 2.4932138919830322 }, { "auxiliary_loss_clip": 0.01134545, "auxiliary_loss_mlp": 0.01037754, "balance_loss_clip": 1.0216372, "balance_loss_mlp": 1.04524767, "epoch": 0.20015030813166992, "flos": 28111555440000.0, "grad_norm": 1.6910682809512518, "language_loss": 0.68910921, "learning_rate": 3.7039103752901353e-06, "loss": 0.71083218, "num_input_tokens_seen": 71921925, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.89453125, "step": 3329, "time_per_iteration": 2.5668275356292725 }, { "auxiliary_loss_clip": 0.0114522, "auxiliary_loss_mlp": 0.01048653, "balance_loss_clip": 1.02955544, "balance_loss_mlp": 1.04876494, "epoch": 0.2002104313843379, "flos": 26067160955520.0, "grad_norm": 1.85533399301593, "language_loss": 0.81172824, "learning_rate": 3.7037064151221353e-06, "loss": 0.83366692, "num_input_tokens_seen": 71941855, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.9609375, "step": 3330, "time_per_iteration": 2.4962053298950195 }, { "auxiliary_loss_clip": 0.0114139, "auxiliary_loss_mlp": 0.01039657, "balance_loss_clip": 1.02218115, "balance_loss_mlp": 1.04587781, "epoch": 0.20027055463700585, "flos": 22966633854720.0, "grad_norm": 3.655726658261671, "language_loss": 0.76592553, "learning_rate": 3.703502390349417e-06, "loss": 0.787736, "num_input_tokens_seen": 71960915, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.95703125, "step": 3331, "time_per_iteration": 2.4870433807373047 }, { "auxiliary_loss_clip": 0.0114317, "auxiliary_loss_mlp": 0.01045593, "balance_loss_clip": 1.02765179, "balance_loss_mlp": 1.04669416, "epoch": 0.20033067788967382, "flos": 17165660313600.0, "grad_norm": 1.696094759272095, "language_loss": 0.7906239, "learning_rate": 3.7032983009797176e-06, "loss": 0.81251156, "num_input_tokens_seen": 71979220, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9609375, "step": 3332, "time_per_iteration": 2.437427282333374 }, { "auxiliary_loss_clip": 0.01048769, "auxiliary_loss_mlp": 0.01001637, "balance_loss_clip": 0.99919289, "balance_loss_mlp": 1.01654887, "epoch": 0.2003908011423418, "flos": 60825566292480.0, "grad_norm": 0.9356882693148935, "language_loss": 0.61977386, "learning_rate": 3.703094147020776e-06, "loss": 0.64027792, "num_input_tokens_seen": 72033950, "router_z_loss_clip": 0.02441406, "router_z_loss_mlp": 0.32226562, "step": 3333, "time_per_iteration": 3.0008585453033447 }, { "auxiliary_loss_clip": 0.01140072, "auxiliary_loss_mlp": 0.01050366, "balance_loss_clip": 1.03296161, "balance_loss_mlp": 1.04482245, "epoch": 0.20045092439500978, "flos": 24206234163840.0, "grad_norm": 2.3592998486487073, "language_loss": 0.81175065, "learning_rate": 3.7028899284803334e-06, "loss": 0.83365506, "num_input_tokens_seen": 72051395, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.953125, "step": 3334, "time_per_iteration": 2.48812198638916 }, { "auxiliary_loss_clip": 0.01145947, "auxiliary_loss_mlp": 0.01047358, "balance_loss_clip": 1.02851069, "balance_loss_mlp": 1.04834294, "epoch": 0.20051104764767774, "flos": 29387605075200.0, "grad_norm": 2.0935270909217056, "language_loss": 0.73838031, "learning_rate": 3.702685645366134e-06, "loss": 0.76031333, "num_input_tokens_seen": 72071305, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.9765625, "step": 3335, "time_per_iteration": 2.558863401412964 }, { "auxiliary_loss_clip": 0.01148124, "auxiliary_loss_mlp": 0.01060203, "balance_loss_clip": 1.0422864, "balance_loss_mlp": 1.05143964, "epoch": 0.2005711709003457, "flos": 23513804709120.0, "grad_norm": 1.8280195887647472, "language_loss": 0.79820621, "learning_rate": 3.7024812976859243e-06, "loss": 0.82028943, "num_input_tokens_seen": 72090165, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.96484375, "step": 3336, "time_per_iteration": 2.505133628845215 }, { "auxiliary_loss_clip": 0.01146586, "auxiliary_loss_mlp": 0.010482, "balance_loss_clip": 1.02915025, "balance_loss_mlp": 1.04664183, "epoch": 0.20063129415301367, "flos": 22523388024960.0, "grad_norm": 2.1049267295577736, "language_loss": 0.77864969, "learning_rate": 3.7022768854474532e-06, "loss": 0.80059755, "num_input_tokens_seen": 72107210, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.99609375, "step": 3337, "time_per_iteration": 2.5241446495056152 }, { "auxiliary_loss_clip": 0.01148194, "auxiliary_loss_mlp": 0.01043882, "balance_loss_clip": 1.02486825, "balance_loss_mlp": 1.05241406, "epoch": 0.20069141740568164, "flos": 25958243940480.0, "grad_norm": 3.100549414683177, "language_loss": 0.68598413, "learning_rate": 3.7020724086584724e-06, "loss": 0.70790493, "num_input_tokens_seen": 72126315, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.95703125, "step": 3338, "time_per_iteration": 2.4912405014038086 }, { "auxiliary_loss_clip": 0.01145487, "auxiliary_loss_mlp": 0.01047964, "balance_loss_clip": 1.03059554, "balance_loss_mlp": 1.05038786, "epoch": 0.2007515406583496, "flos": 24790608529920.0, "grad_norm": 1.8485211469971676, "language_loss": 0.68915385, "learning_rate": 3.701867867326735e-06, "loss": 0.71108842, "num_input_tokens_seen": 72146470, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.94921875, "step": 3339, "time_per_iteration": 2.506314992904663 }, { "auxiliary_loss_clip": 0.01149965, "auxiliary_loss_mlp": 0.01037947, "balance_loss_clip": 1.02123415, "balance_loss_mlp": 1.05142486, "epoch": 0.2008116639110176, "flos": 37925582123520.0, "grad_norm": 2.315991082108775, "language_loss": 0.66385049, "learning_rate": 3.7016632614599974e-06, "loss": 0.68572962, "num_input_tokens_seen": 72166600, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.984375, "step": 3340, "time_per_iteration": 2.6246933937072754 }, { "auxiliary_loss_clip": 0.01145056, "auxiliary_loss_mlp": 0.01035045, "balance_loss_clip": 1.0167942, "balance_loss_mlp": 1.04847121, "epoch": 0.20087178716368556, "flos": 20740531443840.0, "grad_norm": 2.1418882774630745, "language_loss": 0.74283814, "learning_rate": 3.701458591066019e-06, "loss": 0.76463914, "num_input_tokens_seen": 72185160, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.96484375, "step": 3341, "time_per_iteration": 2.4775073528289795 }, { "auxiliary_loss_clip": 0.01142812, "auxiliary_loss_mlp": 0.01041899, "balance_loss_clip": 1.02499497, "balance_loss_mlp": 1.05078268, "epoch": 0.20093191041635353, "flos": 23842279607040.0, "grad_norm": 1.867504738419376, "language_loss": 0.72322273, "learning_rate": 3.70125385615256e-06, "loss": 0.74506986, "num_input_tokens_seen": 72205160, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.921875, "step": 3342, "time_per_iteration": 4.00927472114563 }, { "auxiliary_loss_clip": 0.01143636, "auxiliary_loss_mlp": 0.01043235, "balance_loss_clip": 1.02629542, "balance_loss_mlp": 1.04863453, "epoch": 0.2009920336690215, "flos": 21792067119360.0, "grad_norm": 2.2665008544983234, "language_loss": 0.72318459, "learning_rate": 3.701049056727384e-06, "loss": 0.74505335, "num_input_tokens_seen": 72223555, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.94921875, "step": 3343, "time_per_iteration": 2.4609992504119873 }, { "auxiliary_loss_clip": 0.01144669, "auxiliary_loss_mlp": 0.0104554, "balance_loss_clip": 1.02713358, "balance_loss_mlp": 1.04836011, "epoch": 0.20105215692168946, "flos": 26359222440960.0, "grad_norm": 1.9255396671169056, "language_loss": 0.80611515, "learning_rate": 3.7008441927982574e-06, "loss": 0.82801723, "num_input_tokens_seen": 72242465, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9609375, "step": 3344, "time_per_iteration": 3.8615095615386963 }, { "auxiliary_loss_clip": 0.01142564, "auxiliary_loss_mlp": 0.01041073, "balance_loss_clip": 1.02418089, "balance_loss_mlp": 1.04674125, "epoch": 0.20111228017435742, "flos": 18807280617600.0, "grad_norm": 2.6396481377311902, "language_loss": 0.83433068, "learning_rate": 3.700639264372948e-06, "loss": 0.85616708, "num_input_tokens_seen": 72260655, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.9609375, "step": 3345, "time_per_iteration": 2.4421770572662354 }, { "auxiliary_loss_clip": 0.01138389, "auxiliary_loss_mlp": 0.01038986, "balance_loss_clip": 1.02403152, "balance_loss_mlp": 1.04952383, "epoch": 0.20117240342702541, "flos": 19975059682560.0, "grad_norm": 1.8714788212418727, "language_loss": 0.67568547, "learning_rate": 3.7004342714592283e-06, "loss": 0.69745922, "num_input_tokens_seen": 72279055, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.88671875, "step": 3346, "time_per_iteration": 3.8386752605438232 }, { "auxiliary_loss_clip": 0.01142129, "auxiliary_loss_mlp": 0.01040285, "balance_loss_clip": 1.02400649, "balance_loss_mlp": 1.04921877, "epoch": 0.20123252667969338, "flos": 23142703345920.0, "grad_norm": 2.1767164705486772, "language_loss": 0.7351079, "learning_rate": 3.70022921406487e-06, "loss": 0.75693202, "num_input_tokens_seen": 72297895, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.9296875, "step": 3347, "time_per_iteration": 3.9205048084259033 }, { "auxiliary_loss_clip": 0.01141399, "auxiliary_loss_mlp": 0.0104378, "balance_loss_clip": 1.02829516, "balance_loss_mlp": 1.04924273, "epoch": 0.20129264993236134, "flos": 23221671396480.0, "grad_norm": 1.6116053153951133, "language_loss": 0.86806357, "learning_rate": 3.70002409219765e-06, "loss": 0.88991535, "num_input_tokens_seen": 72318385, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.921875, "step": 3348, "time_per_iteration": 2.4777939319610596 }, { "auxiliary_loss_clip": 0.01137318, "auxiliary_loss_mlp": 0.01039002, "balance_loss_clip": 1.02091825, "balance_loss_mlp": 1.04696345, "epoch": 0.2013527731850293, "flos": 21871466133120.0, "grad_norm": 1.9143627457821908, "language_loss": 0.71117085, "learning_rate": 3.699818905865346e-06, "loss": 0.73293406, "num_input_tokens_seen": 72338235, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.90234375, "step": 3349, "time_per_iteration": 2.4831435680389404 }, { "auxiliary_loss_clip": 0.01142625, "auxiliary_loss_mlp": 0.01046522, "balance_loss_clip": 1.02803278, "balance_loss_mlp": 1.04917371, "epoch": 0.20141289643769728, "flos": 18040803275520.0, "grad_norm": 1.6741489763229245, "language_loss": 0.71297848, "learning_rate": 3.6996136550757377e-06, "loss": 0.7348699, "num_input_tokens_seen": 72357825, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.93359375, "step": 3350, "time_per_iteration": 2.4470372200012207 }, { "auxiliary_loss_clip": 0.01144349, "auxiliary_loss_mlp": 0.01044964, "balance_loss_clip": 1.02596188, "balance_loss_mlp": 1.04954243, "epoch": 0.20147301969036524, "flos": 23951412103680.0, "grad_norm": 2.367984271473367, "language_loss": 0.7606411, "learning_rate": 3.69940833983661e-06, "loss": 0.78253424, "num_input_tokens_seen": 72376335, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.9453125, "step": 3351, "time_per_iteration": 2.4909465312957764 }, { "auxiliary_loss_clip": 0.0114859, "auxiliary_loss_mlp": 0.01043496, "balance_loss_clip": 1.02526939, "balance_loss_mlp": 1.0521841, "epoch": 0.2015331429430332, "flos": 25588471380480.0, "grad_norm": 1.621834597755387, "language_loss": 0.80302799, "learning_rate": 3.699202960155748e-06, "loss": 0.82494891, "num_input_tokens_seen": 72395440, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.96484375, "step": 3352, "time_per_iteration": 2.4968016147613525 }, { "auxiliary_loss_clip": 0.0114443, "auxiliary_loss_mlp": 0.01042033, "balance_loss_clip": 1.02440143, "balance_loss_mlp": 1.05158472, "epoch": 0.2015932661957012, "flos": 26724972677760.0, "grad_norm": 1.9207287452051456, "language_loss": 0.8024416, "learning_rate": 3.6989975160409396e-06, "loss": 0.82430625, "num_input_tokens_seen": 72414670, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9296875, "step": 3353, "time_per_iteration": 2.522233247756958 }, { "auxiliary_loss_clip": 0.01139189, "auxiliary_loss_mlp": 0.01042086, "balance_loss_clip": 1.0256952, "balance_loss_mlp": 1.04929745, "epoch": 0.20165338944836916, "flos": 15633136592640.0, "grad_norm": 2.873425471873632, "language_loss": 0.89998567, "learning_rate": 3.6987920074999747e-06, "loss": 0.92179847, "num_input_tokens_seen": 72432210, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8984375, "step": 3354, "time_per_iteration": 2.4521515369415283 }, { "auxiliary_loss_clip": 0.01058902, "auxiliary_loss_mlp": 0.01001867, "balance_loss_clip": 0.99917239, "balance_loss_mlp": 1.02672005, "epoch": 0.20171351270103713, "flos": 57912529207680.0, "grad_norm": 0.8813765987650118, "language_loss": 0.55867898, "learning_rate": 3.6985864345406465e-06, "loss": 0.57928669, "num_input_tokens_seen": 72489225, "router_z_loss_clip": 0.02697754, "router_z_loss_mlp": 0.32226562, "step": 3355, "time_per_iteration": 3.0578553676605225 }, { "auxiliary_loss_clip": 0.01139741, "auxiliary_loss_mlp": 0.01043325, "balance_loss_clip": 1.02717185, "balance_loss_mlp": 1.05042207, "epoch": 0.2017736359537051, "flos": 20814363849600.0, "grad_norm": 2.957571768153121, "language_loss": 0.84286106, "learning_rate": 3.698380797170751e-06, "loss": 0.86469173, "num_input_tokens_seen": 72508715, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.890625, "step": 3356, "time_per_iteration": 2.475437879562378 }, { "auxiliary_loss_clip": 0.0115252, "auxiliary_loss_mlp": 0.01044472, "balance_loss_clip": 1.02471876, "balance_loss_mlp": 1.0528121, "epoch": 0.20183375920637306, "flos": 17092043389440.0, "grad_norm": 3.4998159935481166, "language_loss": 0.69326091, "learning_rate": 3.698175095398085e-06, "loss": 0.71523076, "num_input_tokens_seen": 72525135, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.99609375, "step": 3357, "time_per_iteration": 2.4902968406677246 }, { "auxiliary_loss_clip": 0.01146915, "auxiliary_loss_mlp": 0.01041605, "balance_loss_clip": 1.0236398, "balance_loss_mlp": 1.05038166, "epoch": 0.20189388245904102, "flos": 18661339658880.0, "grad_norm": 1.7742012436003491, "language_loss": 0.72027707, "learning_rate": 3.6979693292304493e-06, "loss": 0.74216229, "num_input_tokens_seen": 72543690, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.96484375, "step": 3358, "time_per_iteration": 2.457174301147461 }, { "auxiliary_loss_clip": 0.01139069, "auxiliary_loss_mlp": 0.01050447, "balance_loss_clip": 1.03491402, "balance_loss_mlp": 1.04873848, "epoch": 0.20195400571170902, "flos": 16797539779200.0, "grad_norm": 1.7991046579669427, "language_loss": 0.82894838, "learning_rate": 3.6977634986756463e-06, "loss": 0.85084355, "num_input_tokens_seen": 72560725, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.90234375, "step": 3359, "time_per_iteration": 2.4419612884521484 }, { "auxiliary_loss_clip": 0.01055818, "auxiliary_loss_mlp": 0.01005188, "balance_loss_clip": 1.00267243, "balance_loss_mlp": 1.0238831, "epoch": 0.20201412896437698, "flos": 67174716268800.0, "grad_norm": 0.8137933539136325, "language_loss": 0.59003621, "learning_rate": 3.697557603741482e-06, "loss": 0.61064631, "num_input_tokens_seen": 72621940, "router_z_loss_clip": 0.02514648, "router_z_loss_mlp": 0.3203125, "step": 3360, "time_per_iteration": 3.042719602584839 }, { "auxiliary_loss_clip": 0.01148752, "auxiliary_loss_mlp": 0.01049339, "balance_loss_clip": 1.03188658, "balance_loss_mlp": 1.05344939, "epoch": 0.20207425221704495, "flos": 21325013550720.0, "grad_norm": 6.596897881195141, "language_loss": 0.61656427, "learning_rate": 3.697351644435763e-06, "loss": 0.63854516, "num_input_tokens_seen": 72639135, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.953125, "step": 3361, "time_per_iteration": 2.4719812870025635 }, { "auxiliary_loss_clip": 0.01145491, "auxiliary_loss_mlp": 0.0105537, "balance_loss_clip": 1.0390625, "balance_loss_mlp": 1.05296946, "epoch": 0.2021343754697129, "flos": 22527158952960.0, "grad_norm": 2.1244885038268753, "language_loss": 0.75927514, "learning_rate": 3.6971456207662993e-06, "loss": 0.78128386, "num_input_tokens_seen": 72658525, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.92578125, "step": 3362, "time_per_iteration": 2.528261184692383 }, { "auxiliary_loss_clip": 0.0114279, "auxiliary_loss_mlp": 0.01044846, "balance_loss_clip": 1.0279783, "balance_loss_mlp": 1.05027366, "epoch": 0.20219449872238088, "flos": 19062785036160.0, "grad_norm": 1.6798497732891329, "language_loss": 0.76630056, "learning_rate": 3.6969395327409035e-06, "loss": 0.78817689, "num_input_tokens_seen": 72678085, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.92578125, "step": 3363, "time_per_iteration": 2.456159830093384 }, { "auxiliary_loss_clip": 0.01139727, "auxiliary_loss_mlp": 0.01047462, "balance_loss_clip": 1.03167832, "balance_loss_mlp": 1.04660201, "epoch": 0.20225462197504884, "flos": 24717027519360.0, "grad_norm": 1.8249627363354137, "language_loss": 0.75143522, "learning_rate": 3.696733380367391e-06, "loss": 0.77330714, "num_input_tokens_seen": 72698695, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.9296875, "step": 3364, "time_per_iteration": 2.5274930000305176 }, { "auxiliary_loss_clip": 0.01144942, "auxiliary_loss_mlp": 0.01046671, "balance_loss_clip": 1.02831268, "balance_loss_mlp": 1.0495851, "epoch": 0.2023147452277168, "flos": 22018304931840.0, "grad_norm": 2.342172028804345, "language_loss": 0.71661067, "learning_rate": 3.6965271636535783e-06, "loss": 0.73852682, "num_input_tokens_seen": 72717880, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.953125, "step": 3365, "time_per_iteration": 2.4666757583618164 }, { "auxiliary_loss_clip": 0.01144124, "auxiliary_loss_mlp": 0.01045504, "balance_loss_clip": 1.02851701, "balance_loss_mlp": 1.0503726, "epoch": 0.2023748684803848, "flos": 17745365911680.0, "grad_norm": 2.0251157814020257, "language_loss": 0.85934442, "learning_rate": 3.696320882607286e-06, "loss": 0.88124073, "num_input_tokens_seen": 72736410, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9375, "step": 3366, "time_per_iteration": 2.4918112754821777 }, { "auxiliary_loss_clip": 0.01140729, "auxiliary_loss_mlp": 0.01039372, "balance_loss_clip": 1.02275419, "balance_loss_mlp": 1.04837763, "epoch": 0.20243499173305277, "flos": 31138932493440.0, "grad_norm": 1.6193337811000996, "language_loss": 0.69633532, "learning_rate": 3.696114537236335e-06, "loss": 0.71813631, "num_input_tokens_seen": 72758295, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.921875, "step": 3367, "time_per_iteration": 2.558323383331299 }, { "auxiliary_loss_clip": 0.01145937, "auxiliary_loss_mlp": 0.0104121, "balance_loss_clip": 1.02115941, "balance_loss_mlp": 1.04829288, "epoch": 0.20249511498572073, "flos": 33839235279360.0, "grad_norm": 1.9545010860930367, "language_loss": 0.68057215, "learning_rate": 3.6959081275485512e-06, "loss": 0.7024436, "num_input_tokens_seen": 72782495, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.9765625, "step": 3368, "time_per_iteration": 2.592759132385254 }, { "auxiliary_loss_clip": 0.01142386, "auxiliary_loss_mlp": 0.01047534, "balance_loss_clip": 1.02989113, "balance_loss_mlp": 1.05030751, "epoch": 0.2025552382383887, "flos": 21215629658880.0, "grad_norm": 2.187188729485791, "language_loss": 0.77752078, "learning_rate": 3.6957016535517615e-06, "loss": 0.79942, "num_input_tokens_seen": 72801885, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.921875, "step": 3369, "time_per_iteration": 2.486771821975708 }, { "auxiliary_loss_clip": 0.01144763, "auxiliary_loss_mlp": 0.01052073, "balance_loss_clip": 1.03478813, "balance_loss_mlp": 1.04877496, "epoch": 0.20261536149105666, "flos": 14647388676480.0, "grad_norm": 3.133913120089089, "language_loss": 0.64377493, "learning_rate": 3.695495115253795e-06, "loss": 0.66574329, "num_input_tokens_seen": 72816990, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.9609375, "step": 3370, "time_per_iteration": 2.4596588611602783 }, { "auxiliary_loss_clip": 0.01058366, "auxiliary_loss_mlp": 0.01003559, "balance_loss_clip": 1.00124598, "balance_loss_mlp": 1.02673888, "epoch": 0.20267548474372463, "flos": 66783649921920.0, "grad_norm": 0.7786482119844468, "language_loss": 0.58081496, "learning_rate": 3.6952885126624834e-06, "loss": 0.60143423, "num_input_tokens_seen": 72879240, "router_z_loss_clip": 0.02307129, "router_z_loss_mlp": 0.31640625, "step": 3371, "time_per_iteration": 3.165849447250366 }, { "auxiliary_loss_clip": 0.01140584, "auxiliary_loss_mlp": 0.0103789, "balance_loss_clip": 1.02050889, "balance_loss_mlp": 1.04703736, "epoch": 0.2027356079963926, "flos": 24680793674880.0, "grad_norm": 1.6194595960142713, "language_loss": 0.91790432, "learning_rate": 3.6950818457856617e-06, "loss": 0.9396891, "num_input_tokens_seen": 72899030, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9375, "step": 3372, "time_per_iteration": 2.5376601219177246 }, { "auxiliary_loss_clip": 0.01145798, "auxiliary_loss_mlp": 0.01044734, "balance_loss_clip": 1.02653098, "balance_loss_mlp": 1.05118346, "epoch": 0.20279573124906058, "flos": 26392762765440.0, "grad_norm": 1.6037825759111843, "language_loss": 0.78877252, "learning_rate": 3.694875114631167e-06, "loss": 0.81067777, "num_input_tokens_seen": 72919190, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9453125, "step": 3373, "time_per_iteration": 2.514941453933716 }, { "auxiliary_loss_clip": 0.01137779, "auxiliary_loss_mlp": 0.01037796, "balance_loss_clip": 1.02060628, "balance_loss_mlp": 1.04741716, "epoch": 0.20285585450172855, "flos": 33799984692480.0, "grad_norm": 1.8307564085833044, "language_loss": 0.71168745, "learning_rate": 3.6946683192068377e-06, "loss": 0.73344314, "num_input_tokens_seen": 72939720, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.90234375, "step": 3374, "time_per_iteration": 2.6207778453826904 }, { "auxiliary_loss_clip": 0.01055897, "auxiliary_loss_mlp": 0.01010605, "balance_loss_clip": 1.00850725, "balance_loss_mlp": 1.0250349, "epoch": 0.20291597775439651, "flos": 71164823598720.0, "grad_norm": 0.996106291451316, "language_loss": 0.62535894, "learning_rate": 3.694461459520516e-06, "loss": 0.64602393, "num_input_tokens_seen": 73000015, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.30859375, "step": 3375, "time_per_iteration": 3.0717735290527344 }, { "auxiliary_loss_clip": 0.01140496, "auxiliary_loss_mlp": 0.01041681, "balance_loss_clip": 1.02452672, "balance_loss_mlp": 1.04789925, "epoch": 0.20297610100706448, "flos": 19494287118720.0, "grad_norm": 1.6727507291471932, "language_loss": 0.82452804, "learning_rate": 3.6942545355800463e-06, "loss": 0.84634984, "num_input_tokens_seen": 73017675, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.92578125, "step": 3376, "time_per_iteration": 2.465221643447876 }, { "auxiliary_loss_clip": 0.01145856, "auxiliary_loss_mlp": 0.01036976, "balance_loss_clip": 1.01839125, "balance_loss_mlp": 1.04971826, "epoch": 0.20303622425973245, "flos": 25044245441280.0, "grad_norm": 1.894695235124889, "language_loss": 0.81826591, "learning_rate": 3.6940475473932743e-06, "loss": 0.84009421, "num_input_tokens_seen": 73036135, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9609375, "step": 3377, "time_per_iteration": 2.4962456226348877 }, { "auxiliary_loss_clip": 0.01140701, "auxiliary_loss_mlp": 0.01048948, "balance_loss_clip": 1.03074479, "balance_loss_mlp": 1.04888153, "epoch": 0.2030963475124004, "flos": 21979988098560.0, "grad_norm": 2.218204192949955, "language_loss": 0.76610446, "learning_rate": 3.69384049496805e-06, "loss": 0.78800088, "num_input_tokens_seen": 73054075, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.91796875, "step": 3378, "time_per_iteration": 2.4990062713623047 }, { "auxiliary_loss_clip": 0.01143287, "auxiliary_loss_mlp": 0.01041616, "balance_loss_clip": 1.02317441, "balance_loss_mlp": 1.04939413, "epoch": 0.2031564707650684, "flos": 19500392430720.0, "grad_norm": 1.9475649728803233, "language_loss": 0.80308712, "learning_rate": 3.6936333783122242e-06, "loss": 0.82493615, "num_input_tokens_seen": 73073530, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9375, "step": 3379, "time_per_iteration": 2.4763295650482178 }, { "auxiliary_loss_clip": 0.0113798, "auxiliary_loss_mlp": 0.01040826, "balance_loss_clip": 1.0240891, "balance_loss_mlp": 1.04878807, "epoch": 0.20321659401773637, "flos": 22747075971840.0, "grad_norm": 1.8126155514507882, "language_loss": 0.86503661, "learning_rate": 3.6934261974336505e-06, "loss": 0.88682473, "num_input_tokens_seen": 73092820, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.890625, "step": 3380, "time_per_iteration": 2.492082357406616 }, { "auxiliary_loss_clip": 0.01144498, "auxiliary_loss_mlp": 0.01044791, "balance_loss_clip": 1.02713597, "balance_loss_mlp": 1.05214369, "epoch": 0.20327671727040433, "flos": 22455840499200.0, "grad_norm": 2.0835025821730357, "language_loss": 0.74684292, "learning_rate": 3.693218952340186e-06, "loss": 0.76873583, "num_input_tokens_seen": 73113385, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.92578125, "step": 3381, "time_per_iteration": 2.496070384979248 }, { "auxiliary_loss_clip": 0.01145509, "auxiliary_loss_mlp": 0.01045688, "balance_loss_clip": 1.02743697, "balance_loss_mlp": 1.0496465, "epoch": 0.2033368405230723, "flos": 19535010163200.0, "grad_norm": 1.8087063887383041, "language_loss": 0.79156107, "learning_rate": 3.6930116430396895e-06, "loss": 0.81347305, "num_input_tokens_seen": 73131195, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.95703125, "step": 3382, "time_per_iteration": 2.4668569564819336 }, { "auxiliary_loss_clip": 0.01146542, "auxiliary_loss_mlp": 0.01037618, "balance_loss_clip": 1.01834214, "balance_loss_mlp": 1.04934502, "epoch": 0.20339696377574026, "flos": 13809233744640.0, "grad_norm": 2.2300127841195985, "language_loss": 0.79803383, "learning_rate": 3.6928042695400214e-06, "loss": 0.81987536, "num_input_tokens_seen": 73148850, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.97265625, "step": 3383, "time_per_iteration": 2.446655035018921 }, { "auxiliary_loss_clip": 0.01140626, "auxiliary_loss_mlp": 0.01041743, "balance_loss_clip": 1.02398038, "balance_loss_mlp": 1.04863882, "epoch": 0.20345708702840823, "flos": 20339409288960.0, "grad_norm": 2.5760471501138262, "language_loss": 0.74186969, "learning_rate": 3.6925968318490464e-06, "loss": 0.76369333, "num_input_tokens_seen": 73166775, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.921875, "step": 3384, "time_per_iteration": 3.9863345623016357 }, { "auxiliary_loss_clip": 0.01148502, "auxiliary_loss_mlp": 0.01044523, "balance_loss_clip": 1.02475858, "balance_loss_mlp": 1.04971111, "epoch": 0.2035172102810762, "flos": 20333950421760.0, "grad_norm": 2.502414502488306, "language_loss": 0.76608717, "learning_rate": 3.6923893299746293e-06, "loss": 0.78801739, "num_input_tokens_seen": 73183215, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.984375, "step": 3385, "time_per_iteration": 2.5245683193206787 }, { "auxiliary_loss_clip": 0.01141965, "auxiliary_loss_mlp": 0.01053113, "balance_loss_clip": 1.03456402, "balance_loss_mlp": 1.04746866, "epoch": 0.2035773335337442, "flos": 23330983461120.0, "grad_norm": 1.8107759506323153, "language_loss": 0.68184143, "learning_rate": 3.692181763924639e-06, "loss": 0.70379221, "num_input_tokens_seen": 73203290, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9453125, "step": 3386, "time_per_iteration": 3.897573232650757 }, { "auxiliary_loss_clip": 0.01143684, "auxiliary_loss_mlp": 0.01059566, "balance_loss_clip": 1.04057586, "balance_loss_mlp": 1.04891849, "epoch": 0.20363745678641215, "flos": 28330287310080.0, "grad_norm": 29.151304166426705, "language_loss": 0.81274378, "learning_rate": 3.691974133706947e-06, "loss": 0.83477628, "num_input_tokens_seen": 73226185, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.9453125, "step": 3387, "time_per_iteration": 2.5495340824127197 }, { "auxiliary_loss_clip": 0.01139253, "auxiliary_loss_mlp": 0.01049341, "balance_loss_clip": 1.03153682, "balance_loss_mlp": 1.04964089, "epoch": 0.20369758003908012, "flos": 18915658928640.0, "grad_norm": 2.262228557783067, "language_loss": 0.79939634, "learning_rate": 3.6917664393294262e-06, "loss": 0.82128227, "num_input_tokens_seen": 73243300, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.8984375, "step": 3388, "time_per_iteration": 3.9106333255767822 }, { "auxiliary_loss_clip": 0.01146379, "auxiliary_loss_mlp": 0.01038547, "balance_loss_clip": 1.02054632, "balance_loss_mlp": 1.05161595, "epoch": 0.20375770329174808, "flos": 19206499351680.0, "grad_norm": 2.8084181248393643, "language_loss": 0.71769488, "learning_rate": 3.6915586807999527e-06, "loss": 0.73954415, "num_input_tokens_seen": 73261490, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.9453125, "step": 3389, "time_per_iteration": 3.9378623962402344 }, { "auxiliary_loss_clip": 0.01143442, "auxiliary_loss_mlp": 0.01046552, "balance_loss_clip": 1.0290997, "balance_loss_mlp": 1.05124688, "epoch": 0.20381782654441605, "flos": 19391008538880.0, "grad_norm": 1.993697034469386, "language_loss": 0.87739277, "learning_rate": 3.691350858126404e-06, "loss": 0.89929271, "num_input_tokens_seen": 73280180, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.921875, "step": 3390, "time_per_iteration": 2.4825661182403564 }, { "auxiliary_loss_clip": 0.01145022, "auxiliary_loss_mlp": 0.01046584, "balance_loss_clip": 1.02929842, "balance_loss_mlp": 1.05297351, "epoch": 0.203877949797084, "flos": 24827704300800.0, "grad_norm": 2.049440271434278, "language_loss": 0.71265757, "learning_rate": 3.691142971316662e-06, "loss": 0.73457366, "num_input_tokens_seen": 73300680, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.91796875, "step": 3391, "time_per_iteration": 2.561927318572998 }, { "auxiliary_loss_clip": 0.01142135, "auxiliary_loss_mlp": 0.01047997, "balance_loss_clip": 1.03088474, "balance_loss_mlp": 1.04984498, "epoch": 0.20393807304975198, "flos": 18003707504640.0, "grad_norm": 2.2122904226873876, "language_loss": 0.86491305, "learning_rate": 3.6909350203786086e-06, "loss": 0.88681442, "num_input_tokens_seen": 73316760, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.921875, "step": 3392, "time_per_iteration": 2.456801414489746 }, { "auxiliary_loss_clip": 0.01145808, "auxiliary_loss_mlp": 0.01047971, "balance_loss_clip": 1.03096032, "balance_loss_mlp": 1.04952478, "epoch": 0.20399819630241997, "flos": 24206988349440.0, "grad_norm": 1.4694882333506798, "language_loss": 0.80712306, "learning_rate": 3.69072700532013e-06, "loss": 0.82906091, "num_input_tokens_seen": 73339385, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9609375, "step": 3393, "time_per_iteration": 2.535311698913574 }, { "auxiliary_loss_clip": 0.01142265, "auxiliary_loss_mlp": 0.01039797, "balance_loss_clip": 1.02338219, "balance_loss_mlp": 1.04959369, "epoch": 0.20405831955508794, "flos": 20777124424320.0, "grad_norm": 1.6459430377545825, "language_loss": 0.86100125, "learning_rate": 3.6905189261491137e-06, "loss": 0.88282186, "num_input_tokens_seen": 73357235, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9296875, "step": 3394, "time_per_iteration": 2.469787836074829 }, { "auxiliary_loss_clip": 0.01144948, "auxiliary_loss_mlp": 0.01048668, "balance_loss_clip": 1.03197849, "balance_loss_mlp": 1.05191767, "epoch": 0.2041184428077559, "flos": 15486908325120.0, "grad_norm": 2.6905684589569274, "language_loss": 0.83864415, "learning_rate": 3.69031078287345e-06, "loss": 0.86058033, "num_input_tokens_seen": 73374435, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.9296875, "step": 3395, "time_per_iteration": 2.4663913249969482 }, { "auxiliary_loss_clip": 0.01145858, "auxiliary_loss_mlp": 0.0103946, "balance_loss_clip": 1.02113807, "balance_loss_mlp": 1.0493989, "epoch": 0.20417856606042387, "flos": 15588463052160.0, "grad_norm": 2.2183713611268336, "language_loss": 0.83457774, "learning_rate": 3.690102575501033e-06, "loss": 0.85643089, "num_input_tokens_seen": 73391025, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.96484375, "step": 3396, "time_per_iteration": 2.4443514347076416 }, { "auxiliary_loss_clip": 0.01144864, "auxiliary_loss_mlp": 0.01043606, "balance_loss_clip": 1.02547431, "balance_loss_mlp": 1.05202973, "epoch": 0.20423868931309183, "flos": 24279348297600.0, "grad_norm": 2.0497470243991796, "language_loss": 0.77270502, "learning_rate": 3.6898943040397556e-06, "loss": 0.79458976, "num_input_tokens_seen": 73409270, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9296875, "step": 3397, "time_per_iteration": 2.517610549926758 }, { "auxiliary_loss_clip": 0.01143714, "auxiliary_loss_mlp": 0.01045357, "balance_loss_clip": 1.02882266, "balance_loss_mlp": 1.05176592, "epoch": 0.2042988125657598, "flos": 18614870438400.0, "grad_norm": 3.262698241708705, "language_loss": 0.88010907, "learning_rate": 3.689685968497518e-06, "loss": 0.90199983, "num_input_tokens_seen": 73425225, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.921875, "step": 3398, "time_per_iteration": 2.439789295196533 }, { "auxiliary_loss_clip": 0.01147529, "auxiliary_loss_mlp": 0.01047295, "balance_loss_clip": 1.02961636, "balance_loss_mlp": 1.05330014, "epoch": 0.2043589358184278, "flos": 17851230270720.0, "grad_norm": 14.590709769718396, "language_loss": 0.77720773, "learning_rate": 3.6894775688822186e-06, "loss": 0.79915595, "num_input_tokens_seen": 73440940, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9453125, "step": 3399, "time_per_iteration": 2.4315266609191895 }, { "auxiliary_loss_clip": 0.0114614, "auxiliary_loss_mlp": 0.01040201, "balance_loss_clip": 1.02277231, "balance_loss_mlp": 1.05174029, "epoch": 0.20441905907109575, "flos": 21435223455360.0, "grad_norm": 3.853340239835007, "language_loss": 0.76227474, "learning_rate": 3.6892691052017603e-06, "loss": 0.78413814, "num_input_tokens_seen": 73458805, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.94140625, "step": 3400, "time_per_iteration": 2.489750862121582 }, { "auxiliary_loss_clip": 0.01145317, "auxiliary_loss_mlp": 0.01043295, "balance_loss_clip": 1.02703524, "balance_loss_mlp": 1.05340111, "epoch": 0.20447918232376372, "flos": 27707703851520.0, "grad_norm": 1.615737015545974, "language_loss": 0.79439366, "learning_rate": 3.6890605774640487e-06, "loss": 0.81627977, "num_input_tokens_seen": 73479380, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.921875, "step": 3401, "time_per_iteration": 2.5214428901672363 }, { "auxiliary_loss_clip": 0.01144146, "auxiliary_loss_mlp": 0.01040914, "balance_loss_clip": 1.02313995, "balance_loss_mlp": 1.05031264, "epoch": 0.20453930557643168, "flos": 30524214113280.0, "grad_norm": 2.2417799730057864, "language_loss": 0.69644827, "learning_rate": 3.688851985676991e-06, "loss": 0.71829885, "num_input_tokens_seen": 73505105, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9375, "step": 3402, "time_per_iteration": 2.6327743530273438 }, { "auxiliary_loss_clip": 0.01150047, "auxiliary_loss_mlp": 0.01043528, "balance_loss_clip": 1.02539647, "balance_loss_mlp": 1.0548501, "epoch": 0.20459942882909965, "flos": 18987767481600.0, "grad_norm": 3.3744487856371066, "language_loss": 0.81737274, "learning_rate": 3.688643329848496e-06, "loss": 0.8393085, "num_input_tokens_seen": 73523700, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.953125, "step": 3403, "time_per_iteration": 2.5002455711364746 }, { "auxiliary_loss_clip": 0.01149038, "auxiliary_loss_mlp": 0.01043401, "balance_loss_clip": 1.02603197, "balance_loss_mlp": 1.05402052, "epoch": 0.20465955208176762, "flos": 20339050152960.0, "grad_norm": 1.9555534445142377, "language_loss": 0.83281076, "learning_rate": 3.6884346099864772e-06, "loss": 0.8547352, "num_input_tokens_seen": 73542625, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.94921875, "step": 3404, "time_per_iteration": 2.4744954109191895 }, { "auxiliary_loss_clip": 0.01146458, "auxiliary_loss_mlp": 0.01046957, "balance_loss_clip": 1.02983856, "balance_loss_mlp": 1.05099893, "epoch": 0.20471967533443558, "flos": 21251288885760.0, "grad_norm": 1.7673862083180745, "language_loss": 0.85886031, "learning_rate": 3.6882258260988487e-06, "loss": 0.88079441, "num_input_tokens_seen": 73561450, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.953125, "step": 3405, "time_per_iteration": 2.477987766265869 }, { "auxiliary_loss_clip": 0.01140031, "auxiliary_loss_mlp": 0.01042413, "balance_loss_clip": 1.02469862, "balance_loss_mlp": 1.04865706, "epoch": 0.20477979858710357, "flos": 14501555458560.0, "grad_norm": 3.274752384263055, "language_loss": 0.84117639, "learning_rate": 3.6880169781935276e-06, "loss": 0.86300081, "num_input_tokens_seen": 73577155, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9140625, "step": 3406, "time_per_iteration": 2.4417595863342285 }, { "auxiliary_loss_clip": 0.01143267, "auxiliary_loss_mlp": 0.01036927, "balance_loss_clip": 1.02053547, "balance_loss_mlp": 1.05200267, "epoch": 0.20483992183977154, "flos": 11400310085760.0, "grad_norm": 1.9764051349221303, "language_loss": 0.68301338, "learning_rate": 3.6878080662784336e-06, "loss": 0.70481539, "num_input_tokens_seen": 73594900, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9140625, "step": 3407, "time_per_iteration": 2.437732458114624 }, { "auxiliary_loss_clip": 0.0114098, "auxiliary_loss_mlp": 0.01039553, "balance_loss_clip": 1.02247047, "balance_loss_mlp": 1.05016398, "epoch": 0.2049000450924395, "flos": 19060271084160.0, "grad_norm": 2.4740009502221207, "language_loss": 0.84143424, "learning_rate": 3.6875990903614886e-06, "loss": 0.86323953, "num_input_tokens_seen": 73613810, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.90625, "step": 3408, "time_per_iteration": 2.466826915740967 }, { "auxiliary_loss_clip": 0.01146067, "auxiliary_loss_mlp": 0.01041074, "balance_loss_clip": 1.02414656, "balance_loss_mlp": 1.05205822, "epoch": 0.20496016834510747, "flos": 14574561851520.0, "grad_norm": 3.1916399462450453, "language_loss": 0.64583564, "learning_rate": 3.6873900504506166e-06, "loss": 0.66770709, "num_input_tokens_seen": 73631495, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.9375, "step": 3409, "time_per_iteration": 2.4426429271698 }, { "auxiliary_loss_clip": 0.01143145, "auxiliary_loss_mlp": 0.01043046, "balance_loss_clip": 1.02598691, "balance_loss_mlp": 1.04939032, "epoch": 0.20502029159777543, "flos": 22126647329280.0, "grad_norm": 1.5871694210133724, "language_loss": 0.80585247, "learning_rate": 3.687180946553745e-06, "loss": 0.82771438, "num_input_tokens_seen": 73652840, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9375, "step": 3410, "time_per_iteration": 2.5037198066711426 }, { "auxiliary_loss_clip": 0.01144057, "auxiliary_loss_mlp": 0.01042599, "balance_loss_clip": 1.02610028, "balance_loss_mlp": 1.0527066, "epoch": 0.2050804148504434, "flos": 25367907916800.0, "grad_norm": 3.119247957631983, "language_loss": 0.76292419, "learning_rate": 3.686971778678803e-06, "loss": 0.78479075, "num_input_tokens_seen": 73672150, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.9140625, "step": 3411, "time_per_iteration": 2.5174927711486816 }, { "auxiliary_loss_clip": 0.01146779, "auxiliary_loss_mlp": 0.01042039, "balance_loss_clip": 1.02577901, "balance_loss_mlp": 1.05398059, "epoch": 0.2051405381031114, "flos": 23620171858560.0, "grad_norm": 1.9006264614163761, "language_loss": 0.73407876, "learning_rate": 3.686762546833722e-06, "loss": 0.75596696, "num_input_tokens_seen": 73691940, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.9296875, "step": 3412, "time_per_iteration": 2.491511344909668 }, { "auxiliary_loss_clip": 0.01147669, "auxiliary_loss_mlp": 0.01048069, "balance_loss_clip": 1.03029466, "balance_loss_mlp": 1.05136478, "epoch": 0.20520066135577936, "flos": 19565533745280.0, "grad_norm": 2.2327824436655628, "language_loss": 0.77307534, "learning_rate": 3.6865532510264362e-06, "loss": 0.79503268, "num_input_tokens_seen": 73709080, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9609375, "step": 3413, "time_per_iteration": 2.444530487060547 }, { "auxiliary_loss_clip": 0.01143004, "auxiliary_loss_mlp": 0.01044389, "balance_loss_clip": 1.02639997, "balance_loss_mlp": 1.05349314, "epoch": 0.20526078460844732, "flos": 17676345928320.0, "grad_norm": 1.9628002672594662, "language_loss": 0.8483699, "learning_rate": 3.6863438912648823e-06, "loss": 0.87024379, "num_input_tokens_seen": 73727670, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.89453125, "step": 3414, "time_per_iteration": 2.457789421081543 }, { "auxiliary_loss_clip": 0.01142546, "auxiliary_loss_mlp": 0.01035558, "balance_loss_clip": 1.01835632, "balance_loss_mlp": 1.04917431, "epoch": 0.2053209078611153, "flos": 21500328856320.0, "grad_norm": 1.8685879074689105, "language_loss": 0.80952919, "learning_rate": 3.6861344675569986e-06, "loss": 0.83131021, "num_input_tokens_seen": 73747170, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9296875, "step": 3415, "time_per_iteration": 2.4729223251342773 }, { "auxiliary_loss_clip": 0.01143574, "auxiliary_loss_mlp": 0.01039662, "balance_loss_clip": 1.02412903, "balance_loss_mlp": 1.05293822, "epoch": 0.20538103111378325, "flos": 25663524848640.0, "grad_norm": 2.182440485316328, "language_loss": 0.72962075, "learning_rate": 3.6859249799107275e-06, "loss": 0.7514531, "num_input_tokens_seen": 73767690, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.90625, "step": 3416, "time_per_iteration": 2.5387158393859863 }, { "auxiliary_loss_clip": 0.01145488, "auxiliary_loss_mlp": 0.01041779, "balance_loss_clip": 1.02445841, "balance_loss_mlp": 1.05090547, "epoch": 0.20544115436645122, "flos": 23148952312320.0, "grad_norm": 1.9075441474395392, "language_loss": 0.78938067, "learning_rate": 3.6857154283340115e-06, "loss": 0.81125331, "num_input_tokens_seen": 73786900, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9453125, "step": 3417, "time_per_iteration": 2.489668130874634 }, { "auxiliary_loss_clip": 0.01147135, "auxiliary_loss_mlp": 0.01043417, "balance_loss_clip": 1.02499878, "balance_loss_mlp": 1.0535996, "epoch": 0.20550127761911918, "flos": 19390433921280.0, "grad_norm": 2.535338551880854, "language_loss": 0.87028134, "learning_rate": 3.685505812834798e-06, "loss": 0.89218688, "num_input_tokens_seen": 73804515, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.9375, "step": 3418, "time_per_iteration": 2.495079517364502 }, { "auxiliary_loss_clip": 0.01142881, "auxiliary_loss_mlp": 0.01038027, "balance_loss_clip": 1.01995468, "balance_loss_mlp": 1.04952466, "epoch": 0.20556140087178718, "flos": 22893124671360.0, "grad_norm": 8.101474815865293, "language_loss": 0.61863798, "learning_rate": 3.685296133421035e-06, "loss": 0.64044702, "num_input_tokens_seen": 73822910, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.93359375, "step": 3419, "time_per_iteration": 2.4813573360443115 }, { "auxiliary_loss_clip": 0.01150124, "auxiliary_loss_mlp": 0.01049018, "balance_loss_clip": 1.02993202, "balance_loss_mlp": 1.05401635, "epoch": 0.20562152412445514, "flos": 19789652655360.0, "grad_norm": 1.861078999306858, "language_loss": 0.86437881, "learning_rate": 3.685086390100674e-06, "loss": 0.88637024, "num_input_tokens_seen": 73841160, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.9609375, "step": 3420, "time_per_iteration": 2.4895434379577637 }, { "auxiliary_loss_clip": 0.0114425, "auxiliary_loss_mlp": 0.0104458, "balance_loss_clip": 1.02663875, "balance_loss_mlp": 1.05182922, "epoch": 0.2056816473771231, "flos": 31501989210240.0, "grad_norm": 4.357283119661838, "language_loss": 0.70856494, "learning_rate": 3.684876582881668e-06, "loss": 0.73045325, "num_input_tokens_seen": 73862795, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.921875, "step": 3421, "time_per_iteration": 2.5475685596466064 }, { "auxiliary_loss_clip": 0.01145033, "auxiliary_loss_mlp": 0.01040146, "balance_loss_clip": 1.022205, "balance_loss_mlp": 1.05279577, "epoch": 0.20574177062979107, "flos": 23258372117760.0, "grad_norm": 2.019199407692615, "language_loss": 0.70641267, "learning_rate": 3.6846667117719732e-06, "loss": 0.72826445, "num_input_tokens_seen": 73881525, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.921875, "step": 3422, "time_per_iteration": 2.485342025756836 }, { "auxiliary_loss_clip": 0.01065463, "auxiliary_loss_mlp": 0.01007166, "balance_loss_clip": 1.00510406, "balance_loss_mlp": 1.03352499, "epoch": 0.20580189388245904, "flos": 70312518708480.0, "grad_norm": 0.7570846225858799, "language_loss": 0.55591679, "learning_rate": 3.684456776779548e-06, "loss": 0.57664311, "num_input_tokens_seen": 73937775, "router_z_loss_clip": 0.02062988, "router_z_loss_mlp": 0.3203125, "step": 3423, "time_per_iteration": 3.141878128051758 }, { "auxiliary_loss_clip": 0.01148158, "auxiliary_loss_mlp": 0.0104132, "balance_loss_clip": 1.02283049, "balance_loss_mlp": 1.05325413, "epoch": 0.205862017135127, "flos": 30737846252160.0, "grad_norm": 1.704923797806032, "language_loss": 0.71215743, "learning_rate": 3.684246777912353e-06, "loss": 0.73405218, "num_input_tokens_seen": 73958250, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.94921875, "step": 3424, "time_per_iteration": 2.5518150329589844 }, { "auxiliary_loss_clip": 0.01148052, "auxiliary_loss_mlp": 0.01045274, "balance_loss_clip": 1.02763093, "balance_loss_mlp": 1.05661952, "epoch": 0.20592214038779497, "flos": 21324546673920.0, "grad_norm": 1.4389586323721733, "language_loss": 0.75029796, "learning_rate": 3.684036715178351e-06, "loss": 0.77223122, "num_input_tokens_seen": 73977775, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9140625, "step": 3425, "time_per_iteration": 2.5022342205047607 }, { "auxiliary_loss_clip": 0.01149621, "auxiliary_loss_mlp": 0.01054784, "balance_loss_clip": 1.03703403, "balance_loss_mlp": 1.05714607, "epoch": 0.20598226364046296, "flos": 22891652213760.0, "grad_norm": 1.8884839283782509, "language_loss": 0.88061905, "learning_rate": 3.683826588585508e-06, "loss": 0.90266311, "num_input_tokens_seen": 73996590, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.92578125, "step": 3426, "time_per_iteration": 3.9163947105407715 }, { "auxiliary_loss_clip": 0.0114813, "auxiliary_loss_mlp": 0.01045546, "balance_loss_clip": 1.02798629, "balance_loss_mlp": 1.05773854, "epoch": 0.20604238689313092, "flos": 23878549365120.0, "grad_norm": 1.48607812634942, "language_loss": 0.76528472, "learning_rate": 3.6836163981417926e-06, "loss": 0.78722143, "num_input_tokens_seen": 74015935, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.90625, "step": 3427, "time_per_iteration": 2.522843360900879 }, { "auxiliary_loss_clip": 0.01151489, "auxiliary_loss_mlp": 0.01047427, "balance_loss_clip": 1.02910471, "balance_loss_mlp": 1.0560087, "epoch": 0.2061025101457989, "flos": 22491535639680.0, "grad_norm": 2.02935010710152, "language_loss": 0.73742598, "learning_rate": 3.683406143855174e-06, "loss": 0.75941515, "num_input_tokens_seen": 74036575, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.953125, "step": 3428, "time_per_iteration": 3.8896708488464355 }, { "auxiliary_loss_clip": 0.01147596, "auxiliary_loss_mlp": 0.01047746, "balance_loss_clip": 1.02918482, "balance_loss_mlp": 1.05249393, "epoch": 0.20616263339846685, "flos": 22778928357120.0, "grad_norm": 2.2420650675398814, "language_loss": 0.73493791, "learning_rate": 3.6831958257336256e-06, "loss": 0.75689137, "num_input_tokens_seen": 74055365, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.94921875, "step": 3429, "time_per_iteration": 2.4823803901672363 }, { "auxiliary_loss_clip": 0.01156218, "auxiliary_loss_mlp": 0.01043447, "balance_loss_clip": 1.02527952, "balance_loss_mlp": 1.06016052, "epoch": 0.20622275665113482, "flos": 20882198684160.0, "grad_norm": 1.848075061134517, "language_loss": 0.85396689, "learning_rate": 3.6829854437851237e-06, "loss": 0.87596351, "num_input_tokens_seen": 74074875, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9609375, "step": 3430, "time_per_iteration": 3.8895978927612305 }, { "auxiliary_loss_clip": 0.01151413, "auxiliary_loss_mlp": 0.01047365, "balance_loss_clip": 1.02909005, "balance_loss_mlp": 1.05566573, "epoch": 0.20628287990380278, "flos": 19354415558400.0, "grad_norm": 1.6189141403223404, "language_loss": 0.68908387, "learning_rate": 3.6827749980176444e-06, "loss": 0.71107167, "num_input_tokens_seen": 74094505, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.95703125, "step": 3431, "time_per_iteration": 3.907038450241089 }, { "auxiliary_loss_clip": 0.01062635, "auxiliary_loss_mlp": 0.01000522, "balance_loss_clip": 0.99831676, "balance_loss_mlp": 1.03005815, "epoch": 0.20634300315647078, "flos": 71517932248320.0, "grad_norm": 0.9008148223156608, "language_loss": 0.60279167, "learning_rate": 3.6825644884391693e-06, "loss": 0.62342322, "num_input_tokens_seen": 74158500, "router_z_loss_clip": 0.02209473, "router_z_loss_mlp": 0.32617188, "step": 3432, "time_per_iteration": 3.2205822467803955 }, { "auxiliary_loss_clip": 0.01151589, "auxiliary_loss_mlp": 0.01047678, "balance_loss_clip": 1.03015423, "balance_loss_mlp": 1.0581156, "epoch": 0.20640312640913874, "flos": 21723944976000.0, "grad_norm": 2.1541962925190368, "language_loss": 0.72381234, "learning_rate": 3.682353915057679e-06, "loss": 0.74580497, "num_input_tokens_seen": 74176685, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.93359375, "step": 3433, "time_per_iteration": 2.505516290664673 }, { "auxiliary_loss_clip": 0.01150799, "auxiliary_loss_mlp": 0.01043879, "balance_loss_clip": 1.02631903, "balance_loss_mlp": 1.05417705, "epoch": 0.2064632496618067, "flos": 20554621626240.0, "grad_norm": 2.0410983870655715, "language_loss": 0.86793625, "learning_rate": 3.6821432778811604e-06, "loss": 0.88988292, "num_input_tokens_seen": 74194935, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.96875, "step": 3434, "time_per_iteration": 2.4641215801239014 }, { "auxiliary_loss_clip": 0.01151584, "auxiliary_loss_mlp": 0.01041453, "balance_loss_clip": 1.02364278, "balance_loss_mlp": 1.05359268, "epoch": 0.20652337291447467, "flos": 29823273135360.0, "grad_norm": 1.8009695223581512, "language_loss": 0.69180822, "learning_rate": 3.6819325769176004e-06, "loss": 0.71373868, "num_input_tokens_seen": 74215400, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.98046875, "step": 3435, "time_per_iteration": 2.5292909145355225 }, { "auxiliary_loss_clip": 0.011458, "auxiliary_loss_mlp": 0.01041224, "balance_loss_clip": 1.02268672, "balance_loss_mlp": 1.05340314, "epoch": 0.20658349616714264, "flos": 26213640618240.0, "grad_norm": 1.7311838491620684, "language_loss": 0.89145219, "learning_rate": 3.681721812174988e-06, "loss": 0.91332239, "num_input_tokens_seen": 74234090, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.921875, "step": 3436, "time_per_iteration": 2.5203781127929688 }, { "auxiliary_loss_clip": 0.01149242, "auxiliary_loss_mlp": 0.01038345, "balance_loss_clip": 1.02060628, "balance_loss_mlp": 1.05509186, "epoch": 0.2066436194198106, "flos": 25994370044160.0, "grad_norm": 1.5794692490465987, "language_loss": 0.76237673, "learning_rate": 3.6815109836613163e-06, "loss": 0.78425252, "num_input_tokens_seen": 74253345, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.94140625, "step": 3437, "time_per_iteration": 2.5123279094696045 }, { "auxiliary_loss_clip": 0.01145235, "auxiliary_loss_mlp": 0.01040791, "balance_loss_clip": 1.0237205, "balance_loss_mlp": 1.05094373, "epoch": 0.20670374267247857, "flos": 21361067827200.0, "grad_norm": 1.91978374746587, "language_loss": 0.77637076, "learning_rate": 3.6813000913845795e-06, "loss": 0.79823101, "num_input_tokens_seen": 74271615, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9453125, "step": 3438, "time_per_iteration": 2.4639670848846436 }, { "auxiliary_loss_clip": 0.01058122, "auxiliary_loss_mlp": 0.01017284, "balance_loss_clip": 1.01509082, "balance_loss_mlp": 1.02611291, "epoch": 0.20676386592514656, "flos": 66383281952640.0, "grad_norm": 0.8315601938098842, "language_loss": 0.67165112, "learning_rate": 3.6810891353527747e-06, "loss": 0.69240522, "num_input_tokens_seen": 74331390, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.3203125, "step": 3439, "time_per_iteration": 3.052882671356201 }, { "auxiliary_loss_clip": 0.01147508, "auxiliary_loss_mlp": 0.01039726, "balance_loss_clip": 1.02164173, "balance_loss_mlp": 1.0510267, "epoch": 0.20682398917781453, "flos": 17274577328640.0, "grad_norm": 2.085309408634347, "language_loss": 0.8381784, "learning_rate": 3.6808781155739014e-06, "loss": 0.86005074, "num_input_tokens_seen": 74347335, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.96484375, "step": 3440, "time_per_iteration": 2.426562786102295 }, { "auxiliary_loss_clip": 0.01148301, "auxiliary_loss_mlp": 0.01041625, "balance_loss_clip": 1.02476859, "balance_loss_mlp": 1.05366731, "epoch": 0.2068841124304825, "flos": 18077288515200.0, "grad_norm": 2.017883735505245, "language_loss": 0.84953159, "learning_rate": 3.6806670320559614e-06, "loss": 0.87143087, "num_input_tokens_seen": 74366310, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.9453125, "step": 3441, "time_per_iteration": 2.457714796066284 }, { "auxiliary_loss_clip": 0.01144295, "auxiliary_loss_mlp": 0.01045621, "balance_loss_clip": 1.02777505, "balance_loss_mlp": 1.0525372, "epoch": 0.20694423568315046, "flos": 27347017432320.0, "grad_norm": 1.6984707396565144, "language_loss": 0.86134458, "learning_rate": 3.680455884806959e-06, "loss": 0.8832438, "num_input_tokens_seen": 74387100, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.91796875, "step": 3442, "time_per_iteration": 2.544130325317383 }, { "auxiliary_loss_clip": 0.01149129, "auxiliary_loss_mlp": 0.01044613, "balance_loss_clip": 1.02617145, "balance_loss_mlp": 1.05371094, "epoch": 0.20700435893581842, "flos": 20229845829120.0, "grad_norm": 2.2771775249195203, "language_loss": 0.72895384, "learning_rate": 3.6802446738349014e-06, "loss": 0.75089121, "num_input_tokens_seen": 74404460, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.953125, "step": 3443, "time_per_iteration": 2.4792420864105225 }, { "auxiliary_loss_clip": 0.01143636, "auxiliary_loss_mlp": 0.01048114, "balance_loss_clip": 1.03126943, "balance_loss_mlp": 1.05045843, "epoch": 0.2070644821884864, "flos": 20631111638400.0, "grad_norm": 5.8337236679246685, "language_loss": 0.85323691, "learning_rate": 3.680033399147797e-06, "loss": 0.8751545, "num_input_tokens_seen": 74423790, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9296875, "step": 3444, "time_per_iteration": 2.4723758697509766 }, { "auxiliary_loss_clip": 0.01055117, "auxiliary_loss_mlp": 0.01013922, "balance_loss_clip": 1.01182353, "balance_loss_mlp": 1.02397847, "epoch": 0.20712460544115438, "flos": 65941077617280.0, "grad_norm": 0.6933469557553174, "language_loss": 0.57140571, "learning_rate": 3.6798220607536585e-06, "loss": 0.59209609, "num_input_tokens_seen": 74488130, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.3125, "step": 3445, "time_per_iteration": 3.090169668197632 }, { "auxiliary_loss_clip": 0.01143195, "auxiliary_loss_mlp": 0.01045379, "balance_loss_clip": 1.02739036, "balance_loss_mlp": 1.05053473, "epoch": 0.20718472869382235, "flos": 19425734012160.0, "grad_norm": 2.297860603109898, "language_loss": 0.78239918, "learning_rate": 3.6796106586604987e-06, "loss": 0.80428493, "num_input_tokens_seen": 74506720, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.92578125, "step": 3446, "time_per_iteration": 2.4777421951293945 }, { "auxiliary_loss_clip": 0.01150507, "auxiliary_loss_mlp": 0.01048828, "balance_loss_clip": 1.02888477, "balance_loss_mlp": 1.05152154, "epoch": 0.2072448519464903, "flos": 24499049834880.0, "grad_norm": 3.6962777278904864, "language_loss": 0.6217978, "learning_rate": 3.679399192876334e-06, "loss": 0.64379114, "num_input_tokens_seen": 74525330, "router_z_loss_clip": 0.19921875, "router_z_loss_mlp": 0.9921875, "step": 3447, "time_per_iteration": 2.496793270111084 }, { "auxiliary_loss_clip": 0.01145696, "auxiliary_loss_mlp": 0.01051776, "balance_loss_clip": 1.03367972, "balance_loss_mlp": 1.04987264, "epoch": 0.20730497519915828, "flos": 23075694524160.0, "grad_norm": 1.6738683230700968, "language_loss": 0.85786176, "learning_rate": 3.679187663409184e-06, "loss": 0.87983644, "num_input_tokens_seen": 74544535, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.95703125, "step": 3448, "time_per_iteration": 2.498732805252075 }, { "auxiliary_loss_clip": 0.01143395, "auxiliary_loss_mlp": 0.01045635, "balance_loss_clip": 1.02595341, "balance_loss_mlp": 1.04866576, "epoch": 0.20736509845182624, "flos": 21069042255360.0, "grad_norm": 1.9561899828668203, "language_loss": 0.75177872, "learning_rate": 3.6789760702670696e-06, "loss": 0.77366894, "num_input_tokens_seen": 74562300, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.94921875, "step": 3449, "time_per_iteration": 2.464247941970825 }, { "auxiliary_loss_clip": 0.01147808, "auxiliary_loss_mlp": 0.01049157, "balance_loss_clip": 1.03059626, "balance_loss_mlp": 1.04966736, "epoch": 0.2074252217044942, "flos": 17633288499840.0, "grad_norm": 2.153896101080447, "language_loss": 0.7690345, "learning_rate": 3.6787644134580134e-06, "loss": 0.79100418, "num_input_tokens_seen": 74580080, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.98046875, "step": 3450, "time_per_iteration": 2.4652669429779053 }, { "auxiliary_loss_clip": 0.01145766, "auxiliary_loss_mlp": 0.01045399, "balance_loss_clip": 1.02748179, "balance_loss_mlp": 1.04831815, "epoch": 0.20748534495716217, "flos": 23546985897600.0, "grad_norm": 2.150165676488678, "language_loss": 0.82426763, "learning_rate": 3.6785526929900436e-06, "loss": 0.84617931, "num_input_tokens_seen": 74598980, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9765625, "step": 3451, "time_per_iteration": 2.4717960357666016 }, { "auxiliary_loss_clip": 0.01051914, "auxiliary_loss_mlp": 0.01004946, "balance_loss_clip": 1.00271654, "balance_loss_mlp": 1.01995158, "epoch": 0.20754546820983016, "flos": 52252935598080.0, "grad_norm": 0.7933962215326386, "language_loss": 0.56631541, "learning_rate": 3.6783409088711875e-06, "loss": 0.58688402, "num_input_tokens_seen": 74655275, "router_z_loss_clip": 0.02233887, "router_z_loss_mlp": 0.3203125, "step": 3452, "time_per_iteration": 2.9783737659454346 }, { "auxiliary_loss_clip": 0.01146474, "auxiliary_loss_mlp": 0.01046647, "balance_loss_clip": 1.02819383, "balance_loss_mlp": 1.04946494, "epoch": 0.20760559146249813, "flos": 20412379768320.0, "grad_norm": 2.1034087571824003, "language_loss": 0.88026577, "learning_rate": 3.6781290611094755e-06, "loss": 0.902197, "num_input_tokens_seen": 74674560, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.96875, "step": 3453, "time_per_iteration": 2.458775758743286 }, { "auxiliary_loss_clip": 0.01147936, "auxiliary_loss_mlp": 0.01043801, "balance_loss_clip": 1.02481151, "balance_loss_mlp": 1.05216789, "epoch": 0.2076657147151661, "flos": 23186012169600.0, "grad_norm": 1.6155110819203802, "language_loss": 0.79916549, "learning_rate": 3.6779171497129407e-06, "loss": 0.82108289, "num_input_tokens_seen": 74694500, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.95703125, "step": 3454, "time_per_iteration": 2.5089683532714844 }, { "auxiliary_loss_clip": 0.01144444, "auxiliary_loss_mlp": 0.01042614, "balance_loss_clip": 1.02455342, "balance_loss_mlp": 1.04901147, "epoch": 0.20772583796783406, "flos": 18293219124480.0, "grad_norm": 3.507449059518793, "language_loss": 0.76479256, "learning_rate": 3.6777051746896202e-06, "loss": 0.78666317, "num_input_tokens_seen": 74710485, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.953125, "step": 3455, "time_per_iteration": 2.4360008239746094 }, { "auxiliary_loss_clip": 0.01142701, "auxiliary_loss_mlp": 0.01046182, "balance_loss_clip": 1.02881289, "balance_loss_mlp": 1.04825377, "epoch": 0.20778596122050202, "flos": 17602800831360.0, "grad_norm": 1.8190179966420021, "language_loss": 0.80502951, "learning_rate": 3.6774931360475516e-06, "loss": 0.82691836, "num_input_tokens_seen": 74727450, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9453125, "step": 3456, "time_per_iteration": 2.475078582763672 }, { "auxiliary_loss_clip": 0.01147737, "auxiliary_loss_mlp": 0.01044097, "balance_loss_clip": 1.02489269, "balance_loss_mlp": 1.05196416, "epoch": 0.20784608447317, "flos": 23805578885760.0, "grad_norm": 1.4938982435524442, "language_loss": 0.78034616, "learning_rate": 3.6772810337947745e-06, "loss": 0.80226445, "num_input_tokens_seen": 74746725, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.95703125, "step": 3457, "time_per_iteration": 2.522665023803711 }, { "auxiliary_loss_clip": 0.01146459, "auxiliary_loss_mlp": 0.01051886, "balance_loss_clip": 1.03147745, "balance_loss_mlp": 1.04904151, "epoch": 0.20790620772583795, "flos": 17639286071040.0, "grad_norm": 1.7702767128760282, "language_loss": 0.83265901, "learning_rate": 3.677068867939333e-06, "loss": 0.85464245, "num_input_tokens_seen": 74765255, "router_z_loss_clip": 0.20410156, "router_z_loss_mlp": 0.97265625, "step": 3458, "time_per_iteration": 2.436004638671875 }, { "auxiliary_loss_clip": 0.01140146, "auxiliary_loss_mlp": 0.01033329, "balance_loss_clip": 1.01606727, "balance_loss_mlp": 1.04754114, "epoch": 0.20796633097850595, "flos": 27673481168640.0, "grad_norm": 2.2028059333248264, "language_loss": 0.75752115, "learning_rate": 3.676856638489272e-06, "loss": 0.77925587, "num_input_tokens_seen": 74785710, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.92578125, "step": 3459, "time_per_iteration": 2.503916025161743 }, { "auxiliary_loss_clip": 0.0113639, "auxiliary_loss_mlp": 0.01033457, "balance_loss_clip": 1.0165534, "balance_loss_mlp": 1.04512143, "epoch": 0.2080264542311739, "flos": 19245606284160.0, "grad_norm": 2.0065577088367834, "language_loss": 0.76949495, "learning_rate": 3.6766443454526382e-06, "loss": 0.79119349, "num_input_tokens_seen": 74804490, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.9140625, "step": 3460, "time_per_iteration": 2.469381809234619 }, { "auxiliary_loss_clip": 0.01142873, "auxiliary_loss_mlp": 0.01041558, "balance_loss_clip": 1.02320004, "balance_loss_mlp": 1.04684067, "epoch": 0.20808657748384188, "flos": 27525924097920.0, "grad_norm": 2.0433153932371626, "language_loss": 0.75742304, "learning_rate": 3.6764319888374836e-06, "loss": 0.77926737, "num_input_tokens_seen": 74826340, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9609375, "step": 3461, "time_per_iteration": 2.5256187915802 }, { "auxiliary_loss_clip": 0.0114337, "auxiliary_loss_mlp": 0.01041927, "balance_loss_clip": 1.02346182, "balance_loss_mlp": 1.04474068, "epoch": 0.20814670073650984, "flos": 26906931999360.0, "grad_norm": 1.7905350153707944, "language_loss": 0.88402873, "learning_rate": 3.6762195686518604e-06, "loss": 0.9058817, "num_input_tokens_seen": 74844960, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.984375, "step": 3462, "time_per_iteration": 2.526872396469116 }, { "auxiliary_loss_clip": 0.01052971, "auxiliary_loss_mlp": 0.01012884, "balance_loss_clip": 1.01084518, "balance_loss_mlp": 1.02002978, "epoch": 0.2082068239891778, "flos": 70175735717760.0, "grad_norm": 0.7855419547843473, "language_loss": 0.5912317, "learning_rate": 3.6760070849038226e-06, "loss": 0.61189026, "num_input_tokens_seen": 74909075, "router_z_loss_clip": 0.02038574, "router_z_loss_mlp": 0.33007812, "step": 3463, "time_per_iteration": 3.203125476837158 }, { "auxiliary_loss_clip": 0.01142623, "auxiliary_loss_mlp": 0.01047959, "balance_loss_clip": 1.02912414, "balance_loss_mlp": 1.04573739, "epoch": 0.20826694724184577, "flos": 24608074590720.0, "grad_norm": 2.616588100602417, "language_loss": 0.66117632, "learning_rate": 3.675794537601429e-06, "loss": 0.68308222, "num_input_tokens_seen": 74928125, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.96875, "step": 3464, "time_per_iteration": 2.4984071254730225 }, { "auxiliary_loss_clip": 0.01147818, "auxiliary_loss_mlp": 0.01052498, "balance_loss_clip": 1.03272057, "balance_loss_mlp": 1.04889774, "epoch": 0.20832707049451377, "flos": 12892829034240.0, "grad_norm": 1.8539256933402664, "language_loss": 0.83645403, "learning_rate": 3.6755819267527373e-06, "loss": 0.85845721, "num_input_tokens_seen": 74945090, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 0.984375, "step": 3465, "time_per_iteration": 2.4207570552825928 }, { "auxiliary_loss_clip": 0.01142922, "auxiliary_loss_mlp": 0.01047174, "balance_loss_clip": 1.02903008, "balance_loss_mlp": 1.04695439, "epoch": 0.20838719374718173, "flos": 22198827709440.0, "grad_norm": 2.0557658477474905, "language_loss": 0.81376076, "learning_rate": 3.6753692523658113e-06, "loss": 0.83566177, "num_input_tokens_seen": 74963630, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9609375, "step": 3466, "time_per_iteration": 2.4915719032287598 }, { "auxiliary_loss_clip": 0.0114265, "auxiliary_loss_mlp": 0.01040355, "balance_loss_clip": 1.02421439, "balance_loss_mlp": 1.04887533, "epoch": 0.2084473169998497, "flos": 15158648908800.0, "grad_norm": 1.985310287953193, "language_loss": 0.82146275, "learning_rate": 3.675156514448716e-06, "loss": 0.84329283, "num_input_tokens_seen": 74981875, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.9375, "step": 3467, "time_per_iteration": 2.436784505844116 }, { "auxiliary_loss_clip": 0.01138425, "auxiliary_loss_mlp": 0.01045345, "balance_loss_clip": 1.02821434, "balance_loss_mlp": 1.04799569, "epoch": 0.20850744025251766, "flos": 17456788045440.0, "grad_norm": 2.0835585563558836, "language_loss": 0.81463695, "learning_rate": 3.674943713009518e-06, "loss": 0.8364746, "num_input_tokens_seen": 74999155, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.90625, "step": 3468, "time_per_iteration": 3.9654765129089355 }, { "auxiliary_loss_clip": 0.01145075, "auxiliary_loss_mlp": 0.01050992, "balance_loss_clip": 1.03097677, "balance_loss_mlp": 1.04691124, "epoch": 0.20856756350518563, "flos": 25698968593920.0, "grad_norm": 1.7604334618766821, "language_loss": 0.90014315, "learning_rate": 3.6747308480562856e-06, "loss": 0.92210382, "num_input_tokens_seen": 75017850, "router_z_loss_clip": 0.20019531, "router_z_loss_mlp": 0.98046875, "step": 3469, "time_per_iteration": 2.528637170791626 }, { "auxiliary_loss_clip": 0.01145407, "auxiliary_loss_mlp": 0.01053898, "balance_loss_clip": 1.03542066, "balance_loss_mlp": 1.04992819, "epoch": 0.2086276867578536, "flos": 37889060970240.0, "grad_norm": 1.9401750128908204, "language_loss": 0.76692784, "learning_rate": 3.674517919597092e-06, "loss": 0.78892088, "num_input_tokens_seen": 75039270, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.953125, "step": 3470, "time_per_iteration": 3.947301149368286 }, { "auxiliary_loss_clip": 0.0114311, "auxiliary_loss_mlp": 0.01047504, "balance_loss_clip": 1.02926469, "balance_loss_mlp": 1.04854715, "epoch": 0.20868781001052156, "flos": 25557049958400.0, "grad_norm": 1.7960539067999604, "language_loss": 0.75556266, "learning_rate": 3.674304927640011e-06, "loss": 0.7774688, "num_input_tokens_seen": 75059350, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.9453125, "step": 3471, "time_per_iteration": 2.5118257999420166 }, { "auxiliary_loss_clip": 0.01145453, "auxiliary_loss_mlp": 0.01054841, "balance_loss_clip": 1.03544569, "balance_loss_mlp": 1.04537082, "epoch": 0.20874793326318955, "flos": 27529192235520.0, "grad_norm": 1.785776461227901, "language_loss": 0.75518346, "learning_rate": 3.67409187219312e-06, "loss": 0.77718645, "num_input_tokens_seen": 75080150, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 1.0, "step": 3472, "time_per_iteration": 5.420573472976685 }, { "auxiliary_loss_clip": 0.0114094, "auxiliary_loss_mlp": 0.01048526, "balance_loss_clip": 1.03085911, "balance_loss_mlp": 1.04697871, "epoch": 0.20880805651585752, "flos": 18548795370240.0, "grad_norm": 2.170824314229911, "language_loss": 0.84500551, "learning_rate": 3.6738787532644966e-06, "loss": 0.86690021, "num_input_tokens_seen": 75097920, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9375, "step": 3473, "time_per_iteration": 2.4431381225585938 }, { "auxiliary_loss_clip": 0.01051314, "auxiliary_loss_mlp": 0.01018942, "balance_loss_clip": 1.01705873, "balance_loss_mlp": 1.01880765, "epoch": 0.20886817976852548, "flos": 65946644225280.0, "grad_norm": 0.8810234541222428, "language_loss": 0.63669407, "learning_rate": 3.6736655708622235e-06, "loss": 0.65739667, "num_input_tokens_seen": 75152410, "router_z_loss_clip": 0.01879883, "router_z_loss_mlp": 0.32421875, "step": 3474, "time_per_iteration": 3.009323835372925 }, { "auxiliary_loss_clip": 0.01146331, "auxiliary_loss_mlp": 0.01043116, "balance_loss_clip": 1.02491331, "balance_loss_mlp": 1.04920197, "epoch": 0.20892830302119345, "flos": 36539178929280.0, "grad_norm": 2.5501022564958276, "language_loss": 0.7041657, "learning_rate": 3.6734523249943844e-06, "loss": 0.72606015, "num_input_tokens_seen": 75173265, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.96875, "step": 3475, "time_per_iteration": 2.6072826385498047 }, { "auxiliary_loss_clip": 0.01144927, "auxiliary_loss_mlp": 0.01047832, "balance_loss_clip": 1.02986681, "balance_loss_mlp": 1.0492506, "epoch": 0.2089884262738614, "flos": 20956749361920.0, "grad_norm": 1.5455414258641955, "language_loss": 0.70262849, "learning_rate": 3.673239015669065e-06, "loss": 0.72455609, "num_input_tokens_seen": 75193640, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.953125, "step": 3476, "time_per_iteration": 2.4609973430633545 }, { "auxiliary_loss_clip": 0.01142288, "auxiliary_loss_mlp": 0.01048005, "balance_loss_clip": 1.03048122, "balance_loss_mlp": 1.04850566, "epoch": 0.20904854952652938, "flos": 22784028088320.0, "grad_norm": 2.046691081184699, "language_loss": 0.89383841, "learning_rate": 3.6730256428943544e-06, "loss": 0.91574132, "num_input_tokens_seen": 75212545, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9375, "step": 3477, "time_per_iteration": 2.4917027950286865 }, { "auxiliary_loss_clip": 0.01140659, "auxiliary_loss_mlp": 0.01045057, "balance_loss_clip": 1.02708077, "balance_loss_mlp": 1.04641199, "epoch": 0.20910867277919734, "flos": 27303277645440.0, "grad_norm": 2.378608933115297, "language_loss": 0.6805923, "learning_rate": 3.672812206678344e-06, "loss": 0.7024495, "num_input_tokens_seen": 75230865, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.94140625, "step": 3478, "time_per_iteration": 2.5027921199798584 }, { "auxiliary_loss_clip": 0.01140793, "auxiliary_loss_mlp": 0.01046988, "balance_loss_clip": 1.02815342, "balance_loss_mlp": 1.0467217, "epoch": 0.20916879603186533, "flos": 14319237000960.0, "grad_norm": 3.2561614257734024, "language_loss": 0.84668297, "learning_rate": 3.672598707029127e-06, "loss": 0.86856073, "num_input_tokens_seen": 75248285, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.9375, "step": 3479, "time_per_iteration": 2.4945197105407715 }, { "auxiliary_loss_clip": 0.01146193, "auxiliary_loss_mlp": 0.01049771, "balance_loss_clip": 1.03169894, "balance_loss_mlp": 1.04982948, "epoch": 0.2092289192845333, "flos": 22273019251200.0, "grad_norm": 2.4845849948899135, "language_loss": 0.73846495, "learning_rate": 3.6723851439548003e-06, "loss": 0.76042461, "num_input_tokens_seen": 75266310, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.9609375, "step": 3480, "time_per_iteration": 2.456313371658325 }, { "auxiliary_loss_clip": 0.01139602, "auxiliary_loss_mlp": 0.01045402, "balance_loss_clip": 1.02898073, "balance_loss_mlp": 1.04759455, "epoch": 0.20928904253720126, "flos": 14830712714880.0, "grad_norm": 2.1701416461849217, "language_loss": 0.75628138, "learning_rate": 3.67217151746346e-06, "loss": 0.77813148, "num_input_tokens_seen": 75284175, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.921875, "step": 3481, "time_per_iteration": 2.4525797367095947 }, { "auxiliary_loss_clip": 0.0114304, "auxiliary_loss_mlp": 0.01038444, "balance_loss_clip": 1.02041984, "balance_loss_mlp": 1.04897833, "epoch": 0.20934916578986923, "flos": 23259162216960.0, "grad_norm": 1.9147808075690411, "language_loss": 0.85047889, "learning_rate": 3.671957827563209e-06, "loss": 0.87229371, "num_input_tokens_seen": 75303465, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.94140625, "step": 3482, "time_per_iteration": 2.479706287384033 }, { "auxiliary_loss_clip": 0.01143218, "auxiliary_loss_mlp": 0.0104167, "balance_loss_clip": 1.02366912, "balance_loss_mlp": 1.04955149, "epoch": 0.2094092890425372, "flos": 32014398677760.0, "grad_norm": 2.391971408277807, "language_loss": 0.70502257, "learning_rate": 3.6717440742621494e-06, "loss": 0.72687143, "num_input_tokens_seen": 75325290, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9375, "step": 3483, "time_per_iteration": 2.605017900466919 }, { "auxiliary_loss_clip": 0.01145874, "auxiliary_loss_mlp": 0.01051591, "balance_loss_clip": 1.03357804, "balance_loss_mlp": 1.04945302, "epoch": 0.20946941229520516, "flos": 20010647082240.0, "grad_norm": 1.6690784064889512, "language_loss": 0.74921048, "learning_rate": 3.6715302575683865e-06, "loss": 0.7711851, "num_input_tokens_seen": 75343895, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.96484375, "step": 3484, "time_per_iteration": 2.4560837745666504 }, { "auxiliary_loss_clip": 0.0114273, "auxiliary_loss_mlp": 0.01043309, "balance_loss_clip": 1.02483189, "balance_loss_mlp": 1.04949427, "epoch": 0.20952953554787315, "flos": 30740072895360.0, "grad_norm": 1.743762438103056, "language_loss": 0.70630562, "learning_rate": 3.6713163774900292e-06, "loss": 0.72816598, "num_input_tokens_seen": 75367100, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.93359375, "step": 3485, "time_per_iteration": 2.5761983394622803 }, { "auxiliary_loss_clip": 0.01145363, "auxiliary_loss_mlp": 0.01043823, "balance_loss_clip": 1.02560782, "balance_loss_mlp": 1.04999232, "epoch": 0.20958965880054112, "flos": 27049209770880.0, "grad_norm": 1.9899438852459597, "language_loss": 0.82876098, "learning_rate": 3.6711024340351875e-06, "loss": 0.85065281, "num_input_tokens_seen": 75389925, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.953125, "step": 3486, "time_per_iteration": 2.521205425262451 }, { "auxiliary_loss_clip": 0.01141692, "auxiliary_loss_mlp": 0.01048222, "balance_loss_clip": 1.03097224, "balance_loss_mlp": 1.04737389, "epoch": 0.20964978205320908, "flos": 34204123589760.0, "grad_norm": 2.7789107018935266, "language_loss": 0.87291014, "learning_rate": 3.6708884272119737e-06, "loss": 0.89480925, "num_input_tokens_seen": 75408575, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.94140625, "step": 3487, "time_per_iteration": 2.5841050148010254 }, { "auxiliary_loss_clip": 0.01140881, "auxiliary_loss_mlp": 0.01042173, "balance_loss_clip": 1.02388644, "balance_loss_mlp": 1.04710352, "epoch": 0.20970990530587705, "flos": 23477391296640.0, "grad_norm": 2.402450668867188, "language_loss": 0.72514963, "learning_rate": 3.670674357028504e-06, "loss": 0.74698019, "num_input_tokens_seen": 75427155, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9375, "step": 3488, "time_per_iteration": 2.4870691299438477 }, { "auxiliary_loss_clip": 0.01144122, "auxiliary_loss_mlp": 0.01040722, "balance_loss_clip": 1.02393711, "balance_loss_mlp": 1.05081117, "epoch": 0.209770028558545, "flos": 18551452976640.0, "grad_norm": 2.61336896012205, "language_loss": 0.80677879, "learning_rate": 3.6704602234928945e-06, "loss": 0.82862723, "num_input_tokens_seen": 75444450, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.93359375, "step": 3489, "time_per_iteration": 2.4630377292633057 }, { "auxiliary_loss_clip": 0.0114238, "auxiliary_loss_mlp": 0.01043447, "balance_loss_clip": 1.02672172, "balance_loss_mlp": 1.04873931, "epoch": 0.20983015181121298, "flos": 21617003208960.0, "grad_norm": 3.3796795891511535, "language_loss": 0.72728491, "learning_rate": 3.670246026613266e-06, "loss": 0.74914324, "num_input_tokens_seen": 75462625, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9375, "step": 3490, "time_per_iteration": 2.469637393951416 }, { "auxiliary_loss_clip": 0.01138996, "auxiliary_loss_mlp": 0.01051058, "balance_loss_clip": 1.0344758, "balance_loss_mlp": 1.0496763, "epoch": 0.20989027506388094, "flos": 16614718531200.0, "grad_norm": 1.9332484110123274, "language_loss": 0.70731962, "learning_rate": 3.6700317663977415e-06, "loss": 0.72922015, "num_input_tokens_seen": 75480640, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.89453125, "step": 3491, "time_per_iteration": 2.4549078941345215 }, { "auxiliary_loss_clip": 0.01142477, "auxiliary_loss_mlp": 0.01042227, "balance_loss_clip": 1.02378559, "balance_loss_mlp": 1.0475843, "epoch": 0.20995039831654894, "flos": 23216823060480.0, "grad_norm": 2.4044223282612123, "language_loss": 0.79488534, "learning_rate": 3.669817442854444e-06, "loss": 0.81673241, "num_input_tokens_seen": 75494900, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.94921875, "step": 3492, "time_per_iteration": 2.449737310409546 }, { "auxiliary_loss_clip": 0.01144495, "auxiliary_loss_mlp": 0.0104114, "balance_loss_clip": 1.02366376, "balance_loss_mlp": 1.0517745, "epoch": 0.2100105215692169, "flos": 18147493647360.0, "grad_norm": 1.8055488746368744, "language_loss": 0.86449969, "learning_rate": 3.669603055991502e-06, "loss": 0.88635612, "num_input_tokens_seen": 75513370, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9296875, "step": 3493, "time_per_iteration": 2.483384132385254 }, { "auxiliary_loss_clip": 0.01139444, "auxiliary_loss_mlp": 0.01036798, "balance_loss_clip": 1.02072859, "balance_loss_mlp": 1.04919624, "epoch": 0.21007064482188487, "flos": 15961611490560.0, "grad_norm": 1.6573522158092904, "language_loss": 0.68255848, "learning_rate": 3.6693886058170455e-06, "loss": 0.70432091, "num_input_tokens_seen": 75532480, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.90234375, "step": 3494, "time_per_iteration": 2.437438726425171 }, { "auxiliary_loss_clip": 0.01148613, "auxiliary_loss_mlp": 0.0104149, "balance_loss_clip": 1.02437139, "balance_loss_mlp": 1.05329657, "epoch": 0.21013076807455283, "flos": 32234315696640.0, "grad_norm": 2.2142113057337727, "language_loss": 0.7881155, "learning_rate": 3.6691740923392053e-06, "loss": 0.81001651, "num_input_tokens_seen": 75552745, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.953125, "step": 3495, "time_per_iteration": 2.599248170852661 }, { "auxiliary_loss_clip": 0.0114133, "auxiliary_loss_mlp": 0.01044996, "balance_loss_clip": 1.02805614, "balance_loss_mlp": 1.04813862, "epoch": 0.2101908913272208, "flos": 23696625957120.0, "grad_norm": 1.6780215705173278, "language_loss": 0.77279109, "learning_rate": 3.668959515566116e-06, "loss": 0.79465437, "num_input_tokens_seen": 75574355, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9296875, "step": 3496, "time_per_iteration": 2.4820523262023926 }, { "auxiliary_loss_clip": 0.01147565, "auxiliary_loss_mlp": 0.01050367, "balance_loss_clip": 1.03209233, "balance_loss_mlp": 1.0514257, "epoch": 0.21025101457988876, "flos": 20375786787840.0, "grad_norm": 2.2878869013813765, "language_loss": 0.82417333, "learning_rate": 3.668744875505915e-06, "loss": 0.84615266, "num_input_tokens_seen": 75592215, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9609375, "step": 3497, "time_per_iteration": 2.475620746612549 }, { "auxiliary_loss_clip": 0.0115071, "auxiliary_loss_mlp": 0.01048617, "balance_loss_clip": 1.03126001, "balance_loss_mlp": 1.05447686, "epoch": 0.21031113783255675, "flos": 25775638174080.0, "grad_norm": 1.6997559820466428, "language_loss": 0.67204869, "learning_rate": 3.668530172166741e-06, "loss": 0.69404197, "num_input_tokens_seen": 75610740, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9609375, "step": 3498, "time_per_iteration": 2.5078234672546387 }, { "auxiliary_loss_clip": 0.01149967, "auxiliary_loss_mlp": 0.01044525, "balance_loss_clip": 1.02638185, "balance_loss_mlp": 1.05365896, "epoch": 0.21037126108522472, "flos": 22018197191040.0, "grad_norm": 2.5741904753581246, "language_loss": 0.80801171, "learning_rate": 3.6683154055567352e-06, "loss": 0.82995665, "num_input_tokens_seen": 75631005, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9609375, "step": 3499, "time_per_iteration": 2.5072243213653564 }, { "auxiliary_loss_clip": 0.01147838, "auxiliary_loss_mlp": 0.01040333, "balance_loss_clip": 1.02398968, "balance_loss_mlp": 1.05505872, "epoch": 0.21043138433789269, "flos": 25334403505920.0, "grad_norm": 2.500993462488415, "language_loss": 0.78480041, "learning_rate": 3.668100575684043e-06, "loss": 0.80668211, "num_input_tokens_seen": 75650655, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9296875, "step": 3500, "time_per_iteration": 2.528205633163452 }, { "auxiliary_loss_clip": 0.01148423, "auxiliary_loss_mlp": 0.01041425, "balance_loss_clip": 1.0241282, "balance_loss_mlp": 1.05481553, "epoch": 0.21049150759056065, "flos": 25556654908800.0, "grad_norm": 1.6789282009352116, "language_loss": 0.7441358, "learning_rate": 3.6678856825568094e-06, "loss": 0.76603425, "num_input_tokens_seen": 75669895, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.9375, "step": 3501, "time_per_iteration": 2.528432846069336 }, { "auxiliary_loss_clip": 0.01143977, "auxiliary_loss_mlp": 0.01040736, "balance_loss_clip": 1.02404702, "balance_loss_mlp": 1.0526371, "epoch": 0.21055163084322862, "flos": 24495602129280.0, "grad_norm": 1.9296423247596974, "language_loss": 0.75439203, "learning_rate": 3.667670726183183e-06, "loss": 0.77623922, "num_input_tokens_seen": 75689535, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.9140625, "step": 3502, "time_per_iteration": 2.5033962726593018 }, { "auxiliary_loss_clip": 0.01145482, "auxiliary_loss_mlp": 0.01041076, "balance_loss_clip": 1.02435124, "balance_loss_mlp": 1.05301523, "epoch": 0.21061175409589658, "flos": 25739045193600.0, "grad_norm": 1.799759001206579, "language_loss": 0.77572274, "learning_rate": 3.667455706571316e-06, "loss": 0.79758835, "num_input_tokens_seen": 75709265, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.921875, "step": 3503, "time_per_iteration": 2.542069673538208 }, { "auxiliary_loss_clip": 0.01151125, "auxiliary_loss_mlp": 0.01050877, "balance_loss_clip": 1.03110027, "balance_loss_mlp": 1.05265212, "epoch": 0.21067187734856455, "flos": 18989168112000.0, "grad_norm": 2.403552093119709, "language_loss": 0.77879643, "learning_rate": 3.6672406237293617e-06, "loss": 0.80081648, "num_input_tokens_seen": 75727050, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.984375, "step": 3504, "time_per_iteration": 2.4449968338012695 }, { "auxiliary_loss_clip": 0.01151215, "auxiliary_loss_mlp": 0.01043022, "balance_loss_clip": 1.02473569, "balance_loss_mlp": 1.05260301, "epoch": 0.21073200060123254, "flos": 24681368292480.0, "grad_norm": 1.6046564672310266, "language_loss": 0.77157354, "learning_rate": 3.6670254776654754e-06, "loss": 0.79351586, "num_input_tokens_seen": 75747175, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.98828125, "step": 3505, "time_per_iteration": 2.5590286254882812 }, { "auxiliary_loss_clip": 0.01141925, "auxiliary_loss_mlp": 0.01045065, "balance_loss_clip": 1.02787471, "balance_loss_mlp": 1.05128527, "epoch": 0.2107921238539005, "flos": 28549342402560.0, "grad_norm": 1.953068988119208, "language_loss": 0.63622701, "learning_rate": 3.6668102683878163e-06, "loss": 0.65809691, "num_input_tokens_seen": 75767690, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.90625, "step": 3506, "time_per_iteration": 2.5331294536590576 }, { "auxiliary_loss_clip": 0.01145524, "auxiliary_loss_mlp": 0.0104196, "balance_loss_clip": 1.02475786, "balance_loss_mlp": 1.05279982, "epoch": 0.21085224710656847, "flos": 25885848078720.0, "grad_norm": 2.438463936858016, "language_loss": 0.82066292, "learning_rate": 3.6665949959045443e-06, "loss": 0.84253776, "num_input_tokens_seen": 75787255, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.92578125, "step": 3507, "time_per_iteration": 2.5448062419891357 }, { "auxiliary_loss_clip": 0.01144369, "auxiliary_loss_mlp": 0.0104361, "balance_loss_clip": 1.0255146, "balance_loss_mlp": 1.0503602, "epoch": 0.21091237035923643, "flos": 14976294537600.0, "grad_norm": 1.6619038281522915, "language_loss": 0.75448108, "learning_rate": 3.666379660223824e-06, "loss": 0.77636081, "num_input_tokens_seen": 75805890, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.94140625, "step": 3508, "time_per_iteration": 2.4403417110443115 }, { "auxiliary_loss_clip": 0.01149069, "auxiliary_loss_mlp": 0.0103968, "balance_loss_clip": 1.02188206, "balance_loss_mlp": 1.05334926, "epoch": 0.2109724936119044, "flos": 16362518163840.0, "grad_norm": 2.8687449900086026, "language_loss": 0.84739113, "learning_rate": 3.6661642613538192e-06, "loss": 0.86927855, "num_input_tokens_seen": 75821620, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.95703125, "step": 3509, "time_per_iteration": 3.896456241607666 }, { "auxiliary_loss_clip": 0.01148731, "auxiliary_loss_mlp": 0.01042102, "balance_loss_clip": 1.02340961, "balance_loss_mlp": 1.05246544, "epoch": 0.21103261686457236, "flos": 31502492000640.0, "grad_norm": 1.9037438441515648, "language_loss": 0.67803782, "learning_rate": 3.6659487993026987e-06, "loss": 0.69994617, "num_input_tokens_seen": 75842490, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.9609375, "step": 3510, "time_per_iteration": 2.5471572875976562 }, { "auxiliary_loss_clip": 0.01145923, "auxiliary_loss_mlp": 0.01041937, "balance_loss_clip": 1.02462816, "balance_loss_mlp": 1.04985523, "epoch": 0.21109274011724033, "flos": 27344072517120.0, "grad_norm": 1.7892971603578736, "language_loss": 0.72244668, "learning_rate": 3.6657332740786327e-06, "loss": 0.74432528, "num_input_tokens_seen": 75865985, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.9609375, "step": 3511, "time_per_iteration": 4.017989873886108 }, { "auxiliary_loss_clip": 0.01151675, "auxiliary_loss_mlp": 0.01039866, "balance_loss_clip": 1.02055371, "balance_loss_mlp": 1.05402696, "epoch": 0.21115286336990832, "flos": 17820383466240.0, "grad_norm": 2.4527063390544197, "language_loss": 0.69864905, "learning_rate": 3.665517685689794e-06, "loss": 0.72056448, "num_input_tokens_seen": 75882745, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.9765625, "step": 3512, "time_per_iteration": 2.453972101211548 }, { "auxiliary_loss_clip": 0.01145802, "auxiliary_loss_mlp": 0.01045551, "balance_loss_clip": 1.02676356, "balance_loss_mlp": 1.05064094, "epoch": 0.2112129866225763, "flos": 27197987904000.0, "grad_norm": 2.069099285220911, "language_loss": 0.73352969, "learning_rate": 3.6653020341443584e-06, "loss": 0.75544322, "num_input_tokens_seen": 75904305, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.953125, "step": 3513, "time_per_iteration": 3.9663779735565186 }, { "auxiliary_loss_clip": 0.01142733, "auxiliary_loss_mlp": 0.01036985, "balance_loss_clip": 1.02021182, "balance_loss_mlp": 1.05174506, "epoch": 0.21127310987524425, "flos": 23731279603200.0, "grad_norm": 1.8583472982335658, "language_loss": 0.74312955, "learning_rate": 3.665086319450502e-06, "loss": 0.76492679, "num_input_tokens_seen": 75923710, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.91015625, "step": 3514, "time_per_iteration": 3.998167037963867 }, { "auxiliary_loss_clip": 0.0114705, "auxiliary_loss_mlp": 0.01039077, "balance_loss_clip": 1.02144563, "balance_loss_mlp": 1.05085468, "epoch": 0.21133323312791222, "flos": 18332505624960.0, "grad_norm": 1.9183578023439471, "language_loss": 0.76410556, "learning_rate": 3.6648705416164062e-06, "loss": 0.78596681, "num_input_tokens_seen": 75942625, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9609375, "step": 3515, "time_per_iteration": 2.4602530002593994 }, { "auxiliary_loss_clip": 0.0114548, "auxiliary_loss_mlp": 0.01041181, "balance_loss_clip": 1.02375269, "balance_loss_mlp": 1.05183351, "epoch": 0.21139335638058018, "flos": 17931203902080.0, "grad_norm": 3.7392782285032267, "language_loss": 0.68137503, "learning_rate": 3.6646547006502518e-06, "loss": 0.70324171, "num_input_tokens_seen": 75959930, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.93359375, "step": 3516, "time_per_iteration": 2.4431674480438232 }, { "auxiliary_loss_clip": 0.01150386, "auxiliary_loss_mlp": 0.01045482, "balance_loss_clip": 1.02686131, "balance_loss_mlp": 1.05349886, "epoch": 0.21145347963324815, "flos": 24572092141440.0, "grad_norm": 2.0239714373352458, "language_loss": 0.85461688, "learning_rate": 3.664438796560225e-06, "loss": 0.87657559, "num_input_tokens_seen": 75980335, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.96875, "step": 3517, "time_per_iteration": 2.547074317932129 }, { "auxiliary_loss_clip": 0.0114368, "auxiliary_loss_mlp": 0.01040864, "balance_loss_clip": 1.02328074, "balance_loss_mlp": 1.0494945, "epoch": 0.21151360288591614, "flos": 35845959375360.0, "grad_norm": 1.8949780652280503, "language_loss": 0.62677944, "learning_rate": 3.664222829354512e-06, "loss": 0.6486249, "num_input_tokens_seen": 76002095, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9375, "step": 3518, "time_per_iteration": 2.5984857082366943 }, { "auxiliary_loss_clip": 0.01142707, "auxiliary_loss_mlp": 0.01051199, "balance_loss_clip": 1.03446198, "balance_loss_mlp": 1.04939389, "epoch": 0.2115737261385841, "flos": 24641579001600.0, "grad_norm": 2.1645567423585113, "language_loss": 0.89301145, "learning_rate": 3.664006799041303e-06, "loss": 0.91495049, "num_input_tokens_seen": 76020425, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.93359375, "step": 3519, "time_per_iteration": 2.509235382080078 }, { "auxiliary_loss_clip": 0.01147442, "auxiliary_loss_mlp": 0.01045008, "balance_loss_clip": 1.02687633, "balance_loss_mlp": 1.05171227, "epoch": 0.21163384939125207, "flos": 25226887121280.0, "grad_norm": 1.6011452030968079, "language_loss": 0.80804771, "learning_rate": 3.6637907056287886e-06, "loss": 0.82997221, "num_input_tokens_seen": 76041210, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.95703125, "step": 3520, "time_per_iteration": 2.5072901248931885 }, { "auxiliary_loss_clip": 0.01140977, "auxiliary_loss_mlp": 0.01044125, "balance_loss_clip": 1.02699423, "balance_loss_mlp": 1.04991961, "epoch": 0.21169397264392004, "flos": 26067520091520.0, "grad_norm": 1.5594023821369019, "language_loss": 0.75732553, "learning_rate": 3.6635745491251642e-06, "loss": 0.77917659, "num_input_tokens_seen": 76062685, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.91015625, "step": 3521, "time_per_iteration": 2.559811592102051 }, { "auxiliary_loss_clip": 0.01143548, "auxiliary_loss_mlp": 0.01036431, "balance_loss_clip": 1.02056384, "balance_loss_mlp": 1.04988599, "epoch": 0.211754095896588, "flos": 23108265181440.0, "grad_norm": 2.008220003574389, "language_loss": 0.75439262, "learning_rate": 3.663358329538626e-06, "loss": 0.77619243, "num_input_tokens_seen": 76082300, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.9375, "step": 3522, "time_per_iteration": 2.4853782653808594 }, { "auxiliary_loss_clip": 0.01144264, "auxiliary_loss_mlp": 0.01047465, "balance_loss_clip": 1.02966666, "balance_loss_mlp": 1.05062723, "epoch": 0.21181421914925597, "flos": 27922341571200.0, "grad_norm": 1.9141317472773587, "language_loss": 0.70288134, "learning_rate": 3.663142046877374e-06, "loss": 0.72479862, "num_input_tokens_seen": 76101135, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9375, "step": 3523, "time_per_iteration": 2.5486843585968018 }, { "auxiliary_loss_clip": 0.01145354, "auxiliary_loss_mlp": 0.01045845, "balance_loss_clip": 1.02906024, "balance_loss_mlp": 1.05221987, "epoch": 0.21187434240192393, "flos": 17128636369920.0, "grad_norm": 2.3849530416499825, "language_loss": 0.77496457, "learning_rate": 3.6629257011496085e-06, "loss": 0.79687655, "num_input_tokens_seen": 76119320, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9296875, "step": 3524, "time_per_iteration": 2.4432520866394043 }, { "auxiliary_loss_clip": 0.01143759, "auxiliary_loss_mlp": 0.01038658, "balance_loss_clip": 1.02134871, "balance_loss_mlp": 1.04734206, "epoch": 0.21193446565459192, "flos": 22347318533760.0, "grad_norm": 1.7953453276179026, "language_loss": 0.81513649, "learning_rate": 3.6627092923635338e-06, "loss": 0.83696067, "num_input_tokens_seen": 76137445, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.96484375, "step": 3525, "time_per_iteration": 2.4941959381103516 }, { "auxiliary_loss_clip": 0.01143054, "auxiliary_loss_mlp": 0.01040082, "balance_loss_clip": 1.02317798, "balance_loss_mlp": 1.05028713, "epoch": 0.2119945889072599, "flos": 27199316707200.0, "grad_norm": 2.6790119500304796, "language_loss": 0.75246298, "learning_rate": 3.662492820527356e-06, "loss": 0.77429432, "num_input_tokens_seen": 76159500, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.92578125, "step": 3526, "time_per_iteration": 2.5129032135009766 }, { "auxiliary_loss_clip": 0.0114613, "auxiliary_loss_mlp": 0.01040654, "balance_loss_clip": 1.02338076, "balance_loss_mlp": 1.05019021, "epoch": 0.21205471215992786, "flos": 20991869884800.0, "grad_norm": 2.0006317345068094, "language_loss": 0.76961505, "learning_rate": 3.662276285649284e-06, "loss": 0.79148293, "num_input_tokens_seen": 76177990, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.9609375, "step": 3527, "time_per_iteration": 2.504422903060913 }, { "auxiliary_loss_clip": 0.01143123, "auxiliary_loss_mlp": 0.01047661, "balance_loss_clip": 1.02912402, "balance_loss_mlp": 1.04990864, "epoch": 0.21211483541259582, "flos": 20777663128320.0, "grad_norm": 1.7000544997575437, "language_loss": 0.77811289, "learning_rate": 3.662059687737528e-06, "loss": 0.80002069, "num_input_tokens_seen": 76197125, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9296875, "step": 3528, "time_per_iteration": 2.460934638977051 }, { "auxiliary_loss_clip": 0.01142936, "auxiliary_loss_mlp": 0.01045714, "balance_loss_clip": 1.0287385, "balance_loss_mlp": 1.05003464, "epoch": 0.21217495866526379, "flos": 18989994124800.0, "grad_norm": 1.7887422053696096, "language_loss": 0.81555295, "learning_rate": 3.6618430268003024e-06, "loss": 0.83743942, "num_input_tokens_seen": 76216215, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9296875, "step": 3529, "time_per_iteration": 2.46358323097229 }, { "auxiliary_loss_clip": 0.0114627, "auxiliary_loss_mlp": 0.01046396, "balance_loss_clip": 1.02895617, "balance_loss_mlp": 1.04971671, "epoch": 0.21223508191793175, "flos": 20667309569280.0, "grad_norm": 4.096822097973581, "language_loss": 0.7686553, "learning_rate": 3.6616263028458235e-06, "loss": 0.79058194, "num_input_tokens_seen": 76237010, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.96484375, "step": 3530, "time_per_iteration": 2.5035290718078613 }, { "auxiliary_loss_clip": 0.01143573, "auxiliary_loss_mlp": 0.01043523, "balance_loss_clip": 1.02702463, "balance_loss_mlp": 1.05121326, "epoch": 0.21229520517059972, "flos": 21616464504960.0, "grad_norm": 2.093634836689587, "language_loss": 0.83077449, "learning_rate": 3.661409515882308e-06, "loss": 0.8526454, "num_input_tokens_seen": 76255965, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.921875, "step": 3531, "time_per_iteration": 2.510071039199829 }, { "auxiliary_loss_clip": 0.01148077, "auxiliary_loss_mlp": 0.0104319, "balance_loss_clip": 1.02460492, "balance_loss_mlp": 1.05265307, "epoch": 0.2123553284232677, "flos": 13991049411840.0, "grad_norm": 2.4190524924181913, "language_loss": 0.73189527, "learning_rate": 3.661192665917977e-06, "loss": 0.7538079, "num_input_tokens_seen": 76272150, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.95703125, "step": 3532, "time_per_iteration": 2.4520323276519775 }, { "auxiliary_loss_clip": 0.01146606, "auxiliary_loss_mlp": 0.01043295, "balance_loss_clip": 1.02506757, "balance_loss_mlp": 1.05234432, "epoch": 0.21241545167593567, "flos": 18296774570880.0, "grad_norm": 2.7305467005887505, "language_loss": 0.73940933, "learning_rate": 3.660975752961054e-06, "loss": 0.76130831, "num_input_tokens_seen": 76291425, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.94140625, "step": 3533, "time_per_iteration": 2.4747474193573 }, { "auxiliary_loss_clip": 0.01148453, "auxiliary_loss_mlp": 0.01046302, "balance_loss_clip": 1.02930284, "balance_loss_mlp": 1.05274487, "epoch": 0.21247557492860364, "flos": 34713121265280.0, "grad_norm": 2.5325499341031166, "language_loss": 0.71013618, "learning_rate": 3.6607587770197634e-06, "loss": 0.73208374, "num_input_tokens_seen": 76313975, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.95703125, "step": 3534, "time_per_iteration": 2.586634397506714 }, { "auxiliary_loss_clip": 0.01149055, "auxiliary_loss_mlp": 0.0104294, "balance_loss_clip": 1.02502251, "balance_loss_mlp": 1.05359983, "epoch": 0.2125356981812716, "flos": 22053820504320.0, "grad_norm": 2.2731207396338555, "language_loss": 0.71977526, "learning_rate": 3.6605417381023346e-06, "loss": 0.74169523, "num_input_tokens_seen": 76330955, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.953125, "step": 3535, "time_per_iteration": 2.4813952445983887 }, { "auxiliary_loss_clip": 0.011441, "auxiliary_loss_mlp": 0.01054897, "balance_loss_clip": 1.03763604, "balance_loss_mlp": 1.05164051, "epoch": 0.21259582143393957, "flos": 28548336821760.0, "grad_norm": 2.0042451369840073, "language_loss": 0.70454574, "learning_rate": 3.660324636216996e-06, "loss": 0.72653568, "num_input_tokens_seen": 76352680, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.921875, "step": 3536, "time_per_iteration": 2.5248258113861084 }, { "auxiliary_loss_clip": 0.01147357, "auxiliary_loss_mlp": 0.01047897, "balance_loss_clip": 1.03034925, "balance_loss_mlp": 1.05128694, "epoch": 0.21265594468660753, "flos": 20120892900480.0, "grad_norm": 1.902420921838629, "language_loss": 0.87934649, "learning_rate": 3.660107471371981e-06, "loss": 0.901299, "num_input_tokens_seen": 76370750, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9609375, "step": 3537, "time_per_iteration": 2.4910836219787598 }, { "auxiliary_loss_clip": 0.01141833, "auxiliary_loss_mlp": 0.01042268, "balance_loss_clip": 1.02512538, "balance_loss_mlp": 1.04932094, "epoch": 0.21271606793927553, "flos": 23076161400960.0, "grad_norm": 1.7586815262534654, "language_loss": 0.80555475, "learning_rate": 3.659890243575524e-06, "loss": 0.8273958, "num_input_tokens_seen": 76390610, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.92578125, "step": 3538, "time_per_iteration": 2.478022336959839 }, { "auxiliary_loss_clip": 0.01142438, "auxiliary_loss_mlp": 0.01047316, "balance_loss_clip": 1.03087735, "balance_loss_mlp": 1.05037642, "epoch": 0.2127761911919435, "flos": 26388201738240.0, "grad_norm": 1.9262443955716804, "language_loss": 0.86948997, "learning_rate": 3.659672952835863e-06, "loss": 0.89138746, "num_input_tokens_seen": 76408860, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.921875, "step": 3539, "time_per_iteration": 2.519976854324341 }, { "auxiliary_loss_clip": 0.0114562, "auxiliary_loss_mlp": 0.01049002, "balance_loss_clip": 1.03158593, "balance_loss_mlp": 1.05119514, "epoch": 0.21283631444461146, "flos": 20228265630720.0, "grad_norm": 2.064649937758424, "language_loss": 0.57861346, "learning_rate": 3.659455599161237e-06, "loss": 0.60055965, "num_input_tokens_seen": 76424980, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9453125, "step": 3540, "time_per_iteration": 2.4485085010528564 }, { "auxiliary_loss_clip": 0.01146712, "auxiliary_loss_mlp": 0.01047427, "balance_loss_clip": 1.03017712, "balance_loss_mlp": 1.05280817, "epoch": 0.21289643769727942, "flos": 13516992691200.0, "grad_norm": 2.153222925532116, "language_loss": 0.75115097, "learning_rate": 3.659238182559888e-06, "loss": 0.77309233, "num_input_tokens_seen": 76443135, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.94140625, "step": 3541, "time_per_iteration": 2.474578619003296 }, { "auxiliary_loss_clip": 0.01144221, "auxiliary_loss_mlp": 0.01049847, "balance_loss_clip": 1.03260946, "balance_loss_mlp": 1.05234683, "epoch": 0.2129565609499474, "flos": 24827021942400.0, "grad_norm": 1.7980667626837712, "language_loss": 0.6907112, "learning_rate": 3.6590207030400615e-06, "loss": 0.71265185, "num_input_tokens_seen": 76462470, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.91796875, "step": 3542, "time_per_iteration": 2.519599676132202 }, { "auxiliary_loss_clip": 0.01140805, "auxiliary_loss_mlp": 0.01042633, "balance_loss_clip": 1.02657592, "balance_loss_mlp": 1.04890251, "epoch": 0.21301668420261535, "flos": 23659242877440.0, "grad_norm": 1.9106571513912651, "language_loss": 0.75335586, "learning_rate": 3.658803160610004e-06, "loss": 0.77519023, "num_input_tokens_seen": 76481995, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.91796875, "step": 3543, "time_per_iteration": 2.5182621479034424 }, { "auxiliary_loss_clip": 0.01145251, "auxiliary_loss_mlp": 0.0104138, "balance_loss_clip": 1.02465451, "balance_loss_mlp": 1.05372763, "epoch": 0.21307680745528332, "flos": 16362805472640.0, "grad_norm": 1.7610162177946203, "language_loss": 0.6698519, "learning_rate": 3.6585855552779634e-06, "loss": 0.69171816, "num_input_tokens_seen": 76500245, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9140625, "step": 3544, "time_per_iteration": 2.473414659500122 }, { "auxiliary_loss_clip": 0.01143194, "auxiliary_loss_mlp": 0.01044171, "balance_loss_clip": 1.02770805, "balance_loss_mlp": 1.0500325, "epoch": 0.2131369307079513, "flos": 19099054794240.0, "grad_norm": 1.730196779968166, "language_loss": 0.71306056, "learning_rate": 3.6583678870521934e-06, "loss": 0.73493421, "num_input_tokens_seen": 76519535, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9296875, "step": 3545, "time_per_iteration": 2.4676270484924316 }, { "auxiliary_loss_clip": 0.01148399, "auxiliary_loss_mlp": 0.01046945, "balance_loss_clip": 1.02958786, "balance_loss_mlp": 1.05291152, "epoch": 0.21319705396061928, "flos": 30372275583360.0, "grad_norm": 1.6355862577847118, "language_loss": 0.72104704, "learning_rate": 3.658150155940946e-06, "loss": 0.74300051, "num_input_tokens_seen": 76542065, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.953125, "step": 3546, "time_per_iteration": 2.5312368869781494 }, { "auxiliary_loss_clip": 0.01146774, "auxiliary_loss_mlp": 0.01046188, "balance_loss_clip": 1.02893829, "balance_loss_mlp": 1.05270338, "epoch": 0.21325717721328724, "flos": 21756192410880.0, "grad_norm": 1.8824456930807274, "language_loss": 0.80484486, "learning_rate": 3.657932361952479e-06, "loss": 0.82677448, "num_input_tokens_seen": 76560540, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.94140625, "step": 3547, "time_per_iteration": 2.4782981872558594 }, { "auxiliary_loss_clip": 0.01147721, "auxiliary_loss_mlp": 0.0104762, "balance_loss_clip": 1.02972698, "balance_loss_mlp": 1.05069876, "epoch": 0.2133173004659552, "flos": 28730870760960.0, "grad_norm": 2.381135067275243, "language_loss": 0.74955654, "learning_rate": 3.6577145050950504e-06, "loss": 0.77150989, "num_input_tokens_seen": 76581760, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.96875, "step": 3548, "time_per_iteration": 2.5220673084259033 }, { "auxiliary_loss_clip": 0.01149613, "auxiliary_loss_mlp": 0.01052113, "balance_loss_clip": 1.03295636, "balance_loss_mlp": 1.05276799, "epoch": 0.21337742371862317, "flos": 16837077674880.0, "grad_norm": 1.878665351961248, "language_loss": 0.74476647, "learning_rate": 3.657496585376922e-06, "loss": 0.76678365, "num_input_tokens_seen": 76599940, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.96875, "step": 3549, "time_per_iteration": 2.4586429595947266 }, { "auxiliary_loss_clip": 0.01148499, "auxiliary_loss_mlp": 0.01043806, "balance_loss_clip": 1.02689028, "balance_loss_mlp": 1.05386329, "epoch": 0.21343754697129114, "flos": 24424930120320.0, "grad_norm": 1.8213913336760303, "language_loss": 0.80819499, "learning_rate": 3.657278602806357e-06, "loss": 0.83011806, "num_input_tokens_seen": 76619580, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.9453125, "step": 3550, "time_per_iteration": 2.5021402835845947 }, { "auxiliary_loss_clip": 0.01141968, "auxiliary_loss_mlp": 0.01044757, "balance_loss_clip": 1.02786469, "balance_loss_mlp": 1.05151772, "epoch": 0.21349767022395913, "flos": 19277817805440.0, "grad_norm": 1.6003596728323586, "language_loss": 0.87943029, "learning_rate": 3.657060557391621e-06, "loss": 0.90129751, "num_input_tokens_seen": 76638195, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.90625, "step": 3551, "time_per_iteration": 3.9370503425598145 }, { "auxiliary_loss_clip": 0.01143925, "auxiliary_loss_mlp": 0.01045252, "balance_loss_clip": 1.02743018, "balance_loss_mlp": 1.05008912, "epoch": 0.2135577934766271, "flos": 17347547808000.0, "grad_norm": 1.876048119773943, "language_loss": 0.83260179, "learning_rate": 3.656842449140983e-06, "loss": 0.8544935, "num_input_tokens_seen": 76656695, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9375, "step": 3552, "time_per_iteration": 2.4644899368286133 }, { "auxiliary_loss_clip": 0.011432, "auxiliary_loss_mlp": 0.0104743, "balance_loss_clip": 1.02961969, "balance_loss_mlp": 1.05033672, "epoch": 0.21361791672929506, "flos": 24057204635520.0, "grad_norm": 2.2279158702827635, "language_loss": 0.76721311, "learning_rate": 3.656624278062713e-06, "loss": 0.78911936, "num_input_tokens_seen": 76677430, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9296875, "step": 3553, "time_per_iteration": 3.8768880367279053 }, { "auxiliary_loss_clip": 0.01142906, "auxiliary_loss_mlp": 0.01037144, "balance_loss_clip": 1.02149177, "balance_loss_mlp": 1.05063272, "epoch": 0.21367803998196302, "flos": 22162306556160.0, "grad_norm": 1.6914565077620023, "language_loss": 0.72710419, "learning_rate": 3.6564060441650843e-06, "loss": 0.74890471, "num_input_tokens_seen": 76697615, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.921875, "step": 3554, "time_per_iteration": 2.5230181217193604 }, { "auxiliary_loss_clip": 0.01144856, "auxiliary_loss_mlp": 0.01037194, "balance_loss_clip": 1.020648, "balance_loss_mlp": 1.05152011, "epoch": 0.213738163234631, "flos": 20886867452160.0, "grad_norm": 1.9453719626430084, "language_loss": 0.67250288, "learning_rate": 3.6561877474563724e-06, "loss": 0.69432342, "num_input_tokens_seen": 76715685, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.93359375, "step": 3555, "time_per_iteration": 3.844257116317749 }, { "auxiliary_loss_clip": 0.01146231, "auxiliary_loss_mlp": 0.01036516, "balance_loss_clip": 1.01870596, "balance_loss_mlp": 1.05023932, "epoch": 0.21379828648729896, "flos": 28403114135040.0, "grad_norm": 1.810141839448053, "language_loss": 0.64938188, "learning_rate": 3.6559693879448553e-06, "loss": 0.67120934, "num_input_tokens_seen": 76735405, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9609375, "step": 3556, "time_per_iteration": 3.969425916671753 }, { "auxiliary_loss_clip": 0.01144825, "auxiliary_loss_mlp": 0.01049832, "balance_loss_clip": 1.03103232, "balance_loss_mlp": 1.0501579, "epoch": 0.21385840973996692, "flos": 25479662106240.0, "grad_norm": 1.6305723879707046, "language_loss": 0.7264604, "learning_rate": 3.6557509656388125e-06, "loss": 0.74840695, "num_input_tokens_seen": 76754395, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.9453125, "step": 3557, "time_per_iteration": 2.522451877593994 }, { "auxiliary_loss_clip": 0.01149211, "auxiliary_loss_mlp": 0.01043592, "balance_loss_clip": 1.02448297, "balance_loss_mlp": 1.0516597, "epoch": 0.2139185329926349, "flos": 28074280101120.0, "grad_norm": 1.6388769832530974, "language_loss": 0.67129838, "learning_rate": 3.655532480546528e-06, "loss": 0.69322646, "num_input_tokens_seen": 76777210, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.97265625, "step": 3558, "time_per_iteration": 2.5343923568725586 }, { "auxiliary_loss_clip": 0.01149775, "auxiliary_loss_mlp": 0.010386, "balance_loss_clip": 1.02089787, "balance_loss_mlp": 1.05036056, "epoch": 0.21397865624530288, "flos": 19608698914560.0, "grad_norm": 1.7558018207359118, "language_loss": 0.80037296, "learning_rate": 3.655313932676286e-06, "loss": 0.82225668, "num_input_tokens_seen": 76795830, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9921875, "step": 3559, "time_per_iteration": 2.47464919090271 }, { "auxiliary_loss_clip": 0.01142405, "auxiliary_loss_mlp": 0.01045365, "balance_loss_clip": 1.02840126, "balance_loss_mlp": 1.04857469, "epoch": 0.21403877949797084, "flos": 24681476033280.0, "grad_norm": 1.676117873752331, "language_loss": 0.67622256, "learning_rate": 3.655095322036373e-06, "loss": 0.69810027, "num_input_tokens_seen": 76814700, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9375, "step": 3560, "time_per_iteration": 2.500725030899048 }, { "auxiliary_loss_clip": 0.01149404, "auxiliary_loss_mlp": 0.01048651, "balance_loss_clip": 1.03043568, "balance_loss_mlp": 1.0520829, "epoch": 0.2140989027506388, "flos": 19861150677120.0, "grad_norm": 2.041015258211157, "language_loss": 0.73355997, "learning_rate": 3.65487664863508e-06, "loss": 0.75554055, "num_input_tokens_seen": 76833400, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.97265625, "step": 3561, "time_per_iteration": 2.5151164531707764 }, { "auxiliary_loss_clip": 0.01149736, "auxiliary_loss_mlp": 0.01047744, "balance_loss_clip": 1.02987492, "balance_loss_mlp": 1.05335903, "epoch": 0.21415902600330677, "flos": 19135324552320.0, "grad_norm": 2.5556850740918966, "language_loss": 0.77485418, "learning_rate": 3.654657912480698e-06, "loss": 0.79682899, "num_input_tokens_seen": 76850645, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.96484375, "step": 3562, "time_per_iteration": 2.4827659130096436 }, { "auxiliary_loss_clip": 0.0114398, "auxiliary_loss_mlp": 0.01041392, "balance_loss_clip": 1.02358246, "balance_loss_mlp": 1.05006266, "epoch": 0.21421914925597474, "flos": 22272624201600.0, "grad_norm": 1.50161483524611, "language_loss": 0.84616727, "learning_rate": 3.6544391135815237e-06, "loss": 0.86802101, "num_input_tokens_seen": 76870135, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.94140625, "step": 3563, "time_per_iteration": 2.4815804958343506 }, { "auxiliary_loss_clip": 0.01145323, "auxiliary_loss_mlp": 0.01041679, "balance_loss_clip": 1.02465606, "balance_loss_mlp": 1.05071259, "epoch": 0.2142792725086427, "flos": 33875109987840.0, "grad_norm": 1.6610323331814534, "language_loss": 0.76664031, "learning_rate": 3.6542202519458507e-06, "loss": 0.78851032, "num_input_tokens_seen": 76893905, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9453125, "step": 3564, "time_per_iteration": 2.5778067111968994 }, { "auxiliary_loss_clip": 0.01141578, "auxiliary_loss_mlp": 0.01044652, "balance_loss_clip": 1.02710414, "balance_loss_mlp": 1.0491097, "epoch": 0.2143393957613107, "flos": 19860216923520.0, "grad_norm": 1.7685668756307966, "language_loss": 0.88463557, "learning_rate": 3.654001327581981e-06, "loss": 0.90649796, "num_input_tokens_seen": 76914205, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.92578125, "step": 3565, "time_per_iteration": 2.5082437992095947 }, { "auxiliary_loss_clip": 0.01058637, "auxiliary_loss_mlp": 0.01009138, "balance_loss_clip": 1.00655162, "balance_loss_mlp": 1.02620149, "epoch": 0.21439951901397866, "flos": 68530093090560.0, "grad_norm": 0.8376627807814973, "language_loss": 0.52283978, "learning_rate": 3.653782340498215e-06, "loss": 0.54351747, "num_input_tokens_seen": 76975650, "router_z_loss_clip": 0.02587891, "router_z_loss_mlp": 0.32421875, "step": 3566, "time_per_iteration": 3.058013439178467 }, { "auxiliary_loss_clip": 0.0114205, "auxiliary_loss_mlp": 0.0104178, "balance_loss_clip": 1.02522182, "balance_loss_mlp": 1.05110765, "epoch": 0.21445964226664663, "flos": 19682998197120.0, "grad_norm": 1.8212449757147318, "language_loss": 0.6698885, "learning_rate": 3.6535632907028566e-06, "loss": 0.6917268, "num_input_tokens_seen": 76992615, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.91015625, "step": 3567, "time_per_iteration": 2.485316038131714 }, { "auxiliary_loss_clip": 0.01141052, "auxiliary_loss_mlp": 0.01041599, "balance_loss_clip": 1.02551806, "balance_loss_mlp": 1.05000806, "epoch": 0.2145197655193146, "flos": 31107259676160.0, "grad_norm": 1.6866846620293254, "language_loss": 0.74533772, "learning_rate": 3.6533441782042126e-06, "loss": 0.76716423, "num_input_tokens_seen": 77017005, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.91015625, "step": 3568, "time_per_iteration": 2.5850019454956055 }, { "auxiliary_loss_clip": 0.01144938, "auxiliary_loss_mlp": 0.01049729, "balance_loss_clip": 1.03258657, "balance_loss_mlp": 1.05114269, "epoch": 0.21457988877198256, "flos": 20120785159680.0, "grad_norm": 1.607530321449931, "language_loss": 0.77719516, "learning_rate": 3.6531250030105917e-06, "loss": 0.79914182, "num_input_tokens_seen": 77034990, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9375, "step": 3569, "time_per_iteration": 2.4554431438446045 }, { "auxiliary_loss_clip": 0.01155268, "auxiliary_loss_mlp": 0.01048024, "balance_loss_clip": 1.02796125, "balance_loss_mlp": 1.05481076, "epoch": 0.21464001202465052, "flos": 18588045957120.0, "grad_norm": 3.5631171357210323, "language_loss": 0.70051485, "learning_rate": 3.6529057651303053e-06, "loss": 0.72254777, "num_input_tokens_seen": 77052610, "router_z_loss_clip": 0.20117188, "router_z_loss_mlp": 1.0, "step": 3570, "time_per_iteration": 2.443368434906006 }, { "auxiliary_loss_clip": 0.01149791, "auxiliary_loss_mlp": 0.01057184, "balance_loss_clip": 1.03925538, "balance_loss_mlp": 1.05203021, "epoch": 0.21470013527731852, "flos": 21835160461440.0, "grad_norm": 2.0290789763220887, "language_loss": 0.78210878, "learning_rate": 3.6526864645716666e-06, "loss": 0.8041786, "num_input_tokens_seen": 77072475, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9765625, "step": 3571, "time_per_iteration": 2.472045421600342 }, { "auxiliary_loss_clip": 0.01148242, "auxiliary_loss_mlp": 0.01044344, "balance_loss_clip": 1.02517462, "balance_loss_mlp": 1.05278504, "epoch": 0.21476025852998648, "flos": 17603195880960.0, "grad_norm": 2.4974006455726334, "language_loss": 0.82805264, "learning_rate": 3.652467101342991e-06, "loss": 0.84997845, "num_input_tokens_seen": 77089930, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.953125, "step": 3572, "time_per_iteration": 2.4290127754211426 }, { "auxiliary_loss_clip": 0.01151173, "auxiliary_loss_mlp": 0.01044321, "balance_loss_clip": 1.02596283, "balance_loss_mlp": 1.05093896, "epoch": 0.21482038178265445, "flos": 24828135264000.0, "grad_norm": 2.3809034203900614, "language_loss": 0.65197051, "learning_rate": 3.652247675452598e-06, "loss": 0.67392546, "num_input_tokens_seen": 77108970, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 1.0, "step": 3573, "time_per_iteration": 2.5046639442443848 }, { "auxiliary_loss_clip": 0.01138861, "auxiliary_loss_mlp": 0.01047575, "balance_loss_clip": 1.03045702, "balance_loss_mlp": 1.04701424, "epoch": 0.2148805050353224, "flos": 23258228463360.0, "grad_norm": 2.3976471236708963, "language_loss": 0.75522542, "learning_rate": 3.652028186908807e-06, "loss": 0.77708977, "num_input_tokens_seen": 77126045, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.91796875, "step": 3574, "time_per_iteration": 2.4613213539123535 }, { "auxiliary_loss_clip": 0.01141463, "auxiliary_loss_mlp": 0.01043206, "balance_loss_clip": 1.024979, "balance_loss_mlp": 1.0479939, "epoch": 0.21494062828799038, "flos": 21321098968320.0, "grad_norm": 1.96064834845036, "language_loss": 0.71897131, "learning_rate": 3.6518086357199416e-06, "loss": 0.74081802, "num_input_tokens_seen": 77144600, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.9375, "step": 3575, "time_per_iteration": 2.4831526279449463 }, { "auxiliary_loss_clip": 0.0114499, "auxiliary_loss_mlp": 0.01039728, "balance_loss_clip": 1.02248991, "balance_loss_mlp": 1.05170941, "epoch": 0.21500075154065834, "flos": 18843334894080.0, "grad_norm": 1.6833036658207832, "language_loss": 0.67785537, "learning_rate": 3.6515890218943277e-06, "loss": 0.69970256, "num_input_tokens_seen": 77162965, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.93359375, "step": 3576, "time_per_iteration": 2.4558374881744385 }, { "auxiliary_loss_clip": 0.01149612, "auxiliary_loss_mlp": 0.01048159, "balance_loss_clip": 1.02974081, "balance_loss_mlp": 1.05075336, "epoch": 0.2150608747933263, "flos": 18441997257600.0, "grad_norm": 2.0101840738629364, "language_loss": 0.88546002, "learning_rate": 3.651369345440292e-06, "loss": 0.90743768, "num_input_tokens_seen": 77179960, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.98828125, "step": 3577, "time_per_iteration": 2.453063726425171 }, { "auxiliary_loss_clip": 0.01052783, "auxiliary_loss_mlp": 0.01017712, "balance_loss_clip": 1.01531553, "balance_loss_mlp": 1.02178478, "epoch": 0.2151209980459943, "flos": 66598242894720.0, "grad_norm": 0.8079921680765004, "language_loss": 0.56241691, "learning_rate": 3.6511496063661654e-06, "loss": 0.58312184, "num_input_tokens_seen": 77239500, "router_z_loss_clip": 0.02392578, "router_z_loss_mlp": 0.30859375, "step": 3578, "time_per_iteration": 3.037062883377075 }, { "auxiliary_loss_clip": 0.01146021, "auxiliary_loss_mlp": 0.01046057, "balance_loss_clip": 1.02892661, "balance_loss_mlp": 1.05138946, "epoch": 0.21518112129866226, "flos": 21575885114880.0, "grad_norm": 1.7700325202793863, "language_loss": 0.88509774, "learning_rate": 3.6509298046802807e-06, "loss": 0.90701854, "num_input_tokens_seen": 77254680, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9453125, "step": 3579, "time_per_iteration": 2.44657826423645 }, { "auxiliary_loss_clip": 0.01145434, "auxiliary_loss_mlp": 0.01049697, "balance_loss_clip": 1.03115964, "balance_loss_mlp": 1.04880536, "epoch": 0.21524124455133023, "flos": 20047635112320.0, "grad_norm": 1.7528567687278847, "language_loss": 0.77901149, "learning_rate": 3.650709940390972e-06, "loss": 0.80096281, "num_input_tokens_seen": 77274060, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.96484375, "step": 3580, "time_per_iteration": 2.490586280822754 }, { "auxiliary_loss_clip": 0.01143119, "auxiliary_loss_mlp": 0.01042559, "balance_loss_clip": 1.0246892, "balance_loss_mlp": 1.05037582, "epoch": 0.2153013678039982, "flos": 23951807153280.0, "grad_norm": 2.2803153540122922, "language_loss": 0.72832328, "learning_rate": 3.6504900135065775e-06, "loss": 0.75018007, "num_input_tokens_seen": 77293255, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.9296875, "step": 3581, "time_per_iteration": 2.517043352127075 }, { "auxiliary_loss_clip": 0.01143023, "auxiliary_loss_mlp": 0.01045708, "balance_loss_clip": 1.02552557, "balance_loss_mlp": 1.04977775, "epoch": 0.21536149105666616, "flos": 20594841880320.0, "grad_norm": 4.442121329911587, "language_loss": 0.7050451, "learning_rate": 3.6502700240354357e-06, "loss": 0.72693241, "num_input_tokens_seen": 77312390, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 0.9296875, "step": 3582, "time_per_iteration": 2.523569107055664 }, { "auxiliary_loss_clip": 0.01141598, "auxiliary_loss_mlp": 0.01043787, "balance_loss_clip": 1.0253098, "balance_loss_mlp": 1.04703736, "epoch": 0.21542161430933413, "flos": 12860042895360.0, "grad_norm": 2.4955415298486785, "language_loss": 0.83637029, "learning_rate": 3.650049971985889e-06, "loss": 0.85822409, "num_input_tokens_seen": 77330985, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.9453125, "step": 3583, "time_per_iteration": 2.43802809715271 }, { "auxiliary_loss_clip": 0.0114952, "auxiliary_loss_mlp": 0.01052118, "balance_loss_clip": 1.03448737, "balance_loss_mlp": 1.05169475, "epoch": 0.21548173756200212, "flos": 26103933504000.0, "grad_norm": 2.8705602618064066, "language_loss": 0.83255553, "learning_rate": 3.6498298573662824e-06, "loss": 0.85457182, "num_input_tokens_seen": 77350770, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9765625, "step": 3584, "time_per_iteration": 2.5178744792938232 }, { "auxiliary_loss_clip": 0.01144074, "auxiliary_loss_mlp": 0.0104839, "balance_loss_clip": 1.02869678, "balance_loss_mlp": 1.04960918, "epoch": 0.21554186081467008, "flos": 22163779013760.0, "grad_norm": 2.783282601147922, "language_loss": 0.90193373, "learning_rate": 3.6496096801849625e-06, "loss": 0.9238584, "num_input_tokens_seen": 77370510, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.9453125, "step": 3585, "time_per_iteration": 2.4757955074310303 }, { "auxiliary_loss_clip": 0.01145618, "auxiliary_loss_mlp": 0.01047497, "balance_loss_clip": 1.02986574, "balance_loss_mlp": 1.05146742, "epoch": 0.21560198406733805, "flos": 22966741595520.0, "grad_norm": 2.1667632828964924, "language_loss": 0.74153709, "learning_rate": 3.649389440450277e-06, "loss": 0.76346827, "num_input_tokens_seen": 77390645, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9453125, "step": 3586, "time_per_iteration": 2.4842753410339355 }, { "auxiliary_loss_clip": 0.01145771, "auxiliary_loss_mlp": 0.01045278, "balance_loss_clip": 1.02863598, "balance_loss_mlp": 1.04969585, "epoch": 0.215662107320006, "flos": 22784064001920.0, "grad_norm": 1.70291087860618, "language_loss": 0.82933378, "learning_rate": 3.6491691381705804e-06, "loss": 0.85124433, "num_input_tokens_seen": 77409655, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9609375, "step": 3587, "time_per_iteration": 2.4715330600738525 }, { "auxiliary_loss_clip": 0.01143654, "auxiliary_loss_mlp": 0.01039223, "balance_loss_clip": 1.02073419, "balance_loss_mlp": 1.04863787, "epoch": 0.21572223057267398, "flos": 30883859038080.0, "grad_norm": 1.72979547924022, "language_loss": 0.75880909, "learning_rate": 3.648948773354224e-06, "loss": 0.78063786, "num_input_tokens_seen": 77430560, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.94921875, "step": 3588, "time_per_iteration": 2.5583887100219727 }, { "auxiliary_loss_clip": 0.01142757, "auxiliary_loss_mlp": 0.01040751, "balance_loss_clip": 1.02286947, "balance_loss_mlp": 1.0475564, "epoch": 0.21578235382534194, "flos": 26910487445760.0, "grad_norm": 1.9056310786509005, "language_loss": 0.80832976, "learning_rate": 3.6487283460095643e-06, "loss": 0.83016479, "num_input_tokens_seen": 77455000, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.953125, "step": 3589, "time_per_iteration": 2.5384745597839355 }, { "auxiliary_loss_clip": 0.01147175, "auxiliary_loss_mlp": 0.01038321, "balance_loss_clip": 1.02125061, "balance_loss_mlp": 1.05055535, "epoch": 0.2158424770780099, "flos": 24425720219520.0, "grad_norm": 2.253306228433722, "language_loss": 0.72481948, "learning_rate": 3.648507856144961e-06, "loss": 0.74667454, "num_input_tokens_seen": 77475075, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.96875, "step": 3590, "time_per_iteration": 2.517580270767212 }, { "auxiliary_loss_clip": 0.01149211, "auxiliary_loss_mlp": 0.01046761, "balance_loss_clip": 1.02690125, "balance_loss_mlp": 1.04927266, "epoch": 0.2159026003306779, "flos": 23949975559680.0, "grad_norm": 1.7020889410285884, "language_loss": 0.83844137, "learning_rate": 3.648287303768775e-06, "loss": 0.86040103, "num_input_tokens_seen": 77495945, "router_z_loss_clip": 0.19824219, "router_z_loss_mlp": 1.0, "step": 3591, "time_per_iteration": 2.4802746772766113 }, { "auxiliary_loss_clip": 0.01152807, "auxiliary_loss_mlp": 0.01049403, "balance_loss_clip": 1.02793348, "balance_loss_mlp": 1.0518831, "epoch": 0.21596272358334587, "flos": 30040963511040.0, "grad_norm": 1.7904572745692102, "language_loss": 0.68861133, "learning_rate": 3.6480666888893686e-06, "loss": 0.7106334, "num_input_tokens_seen": 77517140, "router_z_loss_clip": 0.21484375, "router_z_loss_mlp": 1.0078125, "step": 3592, "time_per_iteration": 2.5376124382019043 }, { "auxiliary_loss_clip": 0.01148725, "auxiliary_loss_mlp": 0.0104855, "balance_loss_clip": 1.02970326, "balance_loss_mlp": 1.05187595, "epoch": 0.21602284683601383, "flos": 20376217751040.0, "grad_norm": 2.8009168758448344, "language_loss": 0.83134979, "learning_rate": 3.647846011515108e-06, "loss": 0.85332257, "num_input_tokens_seen": 77536085, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.96875, "step": 3593, "time_per_iteration": 4.00252890586853 }, { "auxiliary_loss_clip": 0.01149, "auxiliary_loss_mlp": 0.01047265, "balance_loss_clip": 1.02765524, "balance_loss_mlp": 1.05034292, "epoch": 0.2160829700886818, "flos": 20777339905920.0, "grad_norm": 2.4369171186483056, "language_loss": 0.74930114, "learning_rate": 3.6476252716543625e-06, "loss": 0.77126378, "num_input_tokens_seen": 77553675, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.984375, "step": 3594, "time_per_iteration": 2.487159013748169 }, { "auxiliary_loss_clip": 0.0114306, "auxiliary_loss_mlp": 0.01046852, "balance_loss_clip": 1.02925658, "balance_loss_mlp": 1.04960811, "epoch": 0.21614309334134976, "flos": 22309755886080.0, "grad_norm": 1.925901922700349, "language_loss": 0.8077302, "learning_rate": 3.6474044693155007e-06, "loss": 0.8296293, "num_input_tokens_seen": 77573360, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9375, "step": 3595, "time_per_iteration": 3.9562859535217285 }, { "auxiliary_loss_clip": 0.01148776, "auxiliary_loss_mlp": 0.01042589, "balance_loss_clip": 1.02408743, "balance_loss_mlp": 1.04937291, "epoch": 0.21620321659401773, "flos": 19609524927360.0, "grad_norm": 2.115330708841751, "language_loss": 0.78726572, "learning_rate": 3.647183604506897e-06, "loss": 0.80917943, "num_input_tokens_seen": 77591865, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.9921875, "step": 3596, "time_per_iteration": 2.4537434577941895 }, { "auxiliary_loss_clip": 0.01143569, "auxiliary_loss_mlp": 0.01044486, "balance_loss_clip": 1.02768922, "balance_loss_mlp": 1.04990554, "epoch": 0.2162633398466857, "flos": 18844555956480.0, "grad_norm": 3.513138316973973, "language_loss": 0.82952386, "learning_rate": 3.6469626772369253e-06, "loss": 0.85140443, "num_input_tokens_seen": 77611600, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9375, "step": 3597, "time_per_iteration": 5.387464284896851 }, { "auxiliary_loss_clip": 0.01146667, "auxiliary_loss_mlp": 0.01045365, "balance_loss_clip": 1.02692389, "balance_loss_mlp": 1.05024123, "epoch": 0.21632346309935369, "flos": 18768820129920.0, "grad_norm": 1.6361357290717213, "language_loss": 0.80536246, "learning_rate": 3.6467416875139642e-06, "loss": 0.82728273, "num_input_tokens_seen": 77630665, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.96484375, "step": 3598, "time_per_iteration": 2.479151487350464 }, { "auxiliary_loss_clip": 0.01149497, "auxiliary_loss_mlp": 0.01052552, "balance_loss_clip": 1.03307343, "balance_loss_mlp": 1.05102515, "epoch": 0.21638358635202165, "flos": 26324173745280.0, "grad_norm": 1.7326991462664338, "language_loss": 0.81925476, "learning_rate": 3.6465206353463934e-06, "loss": 0.84127522, "num_input_tokens_seen": 77650835, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.984375, "step": 3599, "time_per_iteration": 2.518336296081543 }, { "auxiliary_loss_clip": 0.01141554, "auxiliary_loss_mlp": 0.01040444, "balance_loss_clip": 1.02387393, "balance_loss_mlp": 1.04861462, "epoch": 0.21644370960468962, "flos": 20740854666240.0, "grad_norm": 1.7039219840218804, "language_loss": 0.76507306, "learning_rate": 3.6462995207425947e-06, "loss": 0.78689301, "num_input_tokens_seen": 77669000, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9296875, "step": 3600, "time_per_iteration": 2.4731385707855225 }, { "auxiliary_loss_clip": 0.01144526, "auxiliary_loss_mlp": 0.01042516, "balance_loss_clip": 1.02610087, "balance_loss_mlp": 1.04975963, "epoch": 0.21650383285735758, "flos": 23952238116480.0, "grad_norm": 2.130967795942025, "language_loss": 0.79907501, "learning_rate": 3.6460783437109533e-06, "loss": 0.82094544, "num_input_tokens_seen": 77688745, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.94921875, "step": 3601, "time_per_iteration": 2.4847893714904785 }, { "auxiliary_loss_clip": 0.01146764, "auxiliary_loss_mlp": 0.01048565, "balance_loss_clip": 1.03147078, "balance_loss_mlp": 1.05144072, "epoch": 0.21656395611002555, "flos": 23696087253120.0, "grad_norm": 2.51006557059249, "language_loss": 0.82783473, "learning_rate": 3.6458571042598565e-06, "loss": 0.84978807, "num_input_tokens_seen": 77708445, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.953125, "step": 3602, "time_per_iteration": 2.49224853515625 }, { "auxiliary_loss_clip": 0.01145126, "auxiliary_loss_mlp": 0.01045605, "balance_loss_clip": 1.02842653, "balance_loss_mlp": 1.04956126, "epoch": 0.2166240793626935, "flos": 20666052593280.0, "grad_norm": 2.034836361122223, "language_loss": 0.74692917, "learning_rate": 3.645635802397693e-06, "loss": 0.76883644, "num_input_tokens_seen": 77728465, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.953125, "step": 3603, "time_per_iteration": 2.471083641052246 }, { "auxiliary_loss_clip": 0.01145103, "auxiliary_loss_mlp": 0.01041887, "balance_loss_clip": 1.02526951, "balance_loss_mlp": 1.05256522, "epoch": 0.2166842026153615, "flos": 21580410228480.0, "grad_norm": 1.590601947818951, "language_loss": 0.73788416, "learning_rate": 3.645414438132855e-06, "loss": 0.75975406, "num_input_tokens_seen": 77746735, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.92578125, "step": 3604, "time_per_iteration": 2.483905792236328 }, { "auxiliary_loss_clip": 0.01141981, "auxiliary_loss_mlp": 0.01045317, "balance_loss_clip": 1.02780557, "balance_loss_mlp": 1.04994321, "epoch": 0.21674432586802947, "flos": 25629948610560.0, "grad_norm": 1.6520904863710753, "language_loss": 0.7995373, "learning_rate": 3.6451930114737366e-06, "loss": 0.82141018, "num_input_tokens_seen": 77768105, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.921875, "step": 3605, "time_per_iteration": 2.49764347076416 }, { "auxiliary_loss_clip": 0.0105721, "auxiliary_loss_mlp": 0.01001609, "balance_loss_clip": 0.99969023, "balance_loss_mlp": 1.02616596, "epoch": 0.21680444912069743, "flos": 56417783616000.0, "grad_norm": 0.8748949656489844, "language_loss": 0.58362788, "learning_rate": 3.6449715224287347e-06, "loss": 0.60421604, "num_input_tokens_seen": 77833750, "router_z_loss_clip": 0.01916504, "router_z_loss_mlp": 0.31054688, "step": 3606, "time_per_iteration": 3.183675527572632 }, { "auxiliary_loss_clip": 0.01146612, "auxiliary_loss_mlp": 0.01044959, "balance_loss_clip": 1.02677977, "balance_loss_mlp": 1.05006671, "epoch": 0.2168645723733654, "flos": 23878944414720.0, "grad_norm": 1.7945519072431888, "language_loss": 0.72903943, "learning_rate": 3.644749971006248e-06, "loss": 0.7509551, "num_input_tokens_seen": 77853780, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.96484375, "step": 3607, "time_per_iteration": 2.4967780113220215 }, { "auxiliary_loss_clip": 0.01147959, "auxiliary_loss_mlp": 0.01046791, "balance_loss_clip": 1.0283618, "balance_loss_mlp": 1.04995072, "epoch": 0.21692469562603336, "flos": 16946174257920.0, "grad_norm": 1.8677942071435472, "language_loss": 0.76413828, "learning_rate": 3.6445283572146765e-06, "loss": 0.78608584, "num_input_tokens_seen": 77872575, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.9765625, "step": 3608, "time_per_iteration": 2.4671854972839355 }, { "auxiliary_loss_clip": 0.01147519, "auxiliary_loss_mlp": 0.01045008, "balance_loss_clip": 1.02816355, "balance_loss_mlp": 1.05116606, "epoch": 0.21698481887870133, "flos": 25119047514240.0, "grad_norm": 1.931849330165976, "language_loss": 0.74262506, "learning_rate": 3.6443066810624255e-06, "loss": 0.76455033, "num_input_tokens_seen": 77892700, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.96484375, "step": 3609, "time_per_iteration": 2.5019099712371826 }, { "auxiliary_loss_clip": 0.01146817, "auxiliary_loss_mlp": 0.01048046, "balance_loss_clip": 1.03064132, "balance_loss_mlp": 1.05137706, "epoch": 0.2170449421313693, "flos": 17894682748800.0, "grad_norm": 1.8518577398089655, "language_loss": 0.89058232, "learning_rate": 3.6440849425579e-06, "loss": 0.91253102, "num_input_tokens_seen": 77911060, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.953125, "step": 3610, "time_per_iteration": 2.465941905975342 }, { "auxiliary_loss_clip": 0.01146992, "auxiliary_loss_mlp": 0.01040322, "balance_loss_clip": 1.02291751, "balance_loss_mlp": 1.05165458, "epoch": 0.2171050653840373, "flos": 22638446265600.0, "grad_norm": 1.6623593724987544, "language_loss": 0.7792474, "learning_rate": 3.6438631417095095e-06, "loss": 0.80112052, "num_input_tokens_seen": 77929930, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.953125, "step": 3611, "time_per_iteration": 2.4710662364959717 }, { "auxiliary_loss_clip": 0.01140756, "auxiliary_loss_mlp": 0.01047755, "balance_loss_clip": 1.03043413, "balance_loss_mlp": 1.04873228, "epoch": 0.21716518863670525, "flos": 19499997381120.0, "grad_norm": 2.17043770277042, "language_loss": 0.63360083, "learning_rate": 3.6436412785256637e-06, "loss": 0.65548593, "num_input_tokens_seen": 77949060, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.921875, "step": 3612, "time_per_iteration": 2.4791007041931152 }, { "auxiliary_loss_clip": 0.01144622, "auxiliary_loss_mlp": 0.01040714, "balance_loss_clip": 1.02425122, "balance_loss_mlp": 1.04986167, "epoch": 0.21722531188937322, "flos": 19792022952960.0, "grad_norm": 1.710304556958193, "language_loss": 0.75663972, "learning_rate": 3.643419353014776e-06, "loss": 0.77849305, "num_input_tokens_seen": 77967920, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.94921875, "step": 3613, "time_per_iteration": 2.468435525894165 }, { "auxiliary_loss_clip": 0.01139377, "auxiliary_loss_mlp": 0.01043247, "balance_loss_clip": 1.02586627, "balance_loss_mlp": 1.04749119, "epoch": 0.21728543514204118, "flos": 13334386924800.0, "grad_norm": 2.725137549766752, "language_loss": 0.70938694, "learning_rate": 3.643197365185261e-06, "loss": 0.73121321, "num_input_tokens_seen": 77985330, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.91796875, "step": 3614, "time_per_iteration": 2.4638309478759766 }, { "auxiliary_loss_clip": 0.01144313, "auxiliary_loss_mlp": 0.01048315, "balance_loss_clip": 1.03111339, "balance_loss_mlp": 1.05060673, "epoch": 0.21734555839470915, "flos": 15231870783360.0, "grad_norm": 1.6908221191893877, "language_loss": 0.72884268, "learning_rate": 3.6429753150455378e-06, "loss": 0.75076902, "num_input_tokens_seen": 78003105, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9375, "step": 3615, "time_per_iteration": 2.4391252994537354 }, { "auxiliary_loss_clip": 0.01147746, "auxiliary_loss_mlp": 0.01043692, "balance_loss_clip": 1.02542901, "balance_loss_mlp": 1.04962635, "epoch": 0.2174056816473771, "flos": 19973982274560.0, "grad_norm": 2.0845325397558003, "language_loss": 0.8973186, "learning_rate": 3.6427532026040263e-06, "loss": 0.91923302, "num_input_tokens_seen": 78019655, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.98046875, "step": 3616, "time_per_iteration": 2.4861371517181396 }, { "auxiliary_loss_clip": 0.01144871, "auxiliary_loss_mlp": 0.0104019, "balance_loss_clip": 1.02284455, "balance_loss_mlp": 1.05032587, "epoch": 0.21746580490004508, "flos": 16687293960960.0, "grad_norm": 2.532530587718748, "language_loss": 0.81328559, "learning_rate": 3.642531027869148e-06, "loss": 0.83513618, "num_input_tokens_seen": 78036025, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9453125, "step": 3617, "time_per_iteration": 2.4435417652130127 }, { "auxiliary_loss_clip": 0.01145061, "auxiliary_loss_mlp": 0.01040357, "balance_loss_clip": 1.02334619, "balance_loss_mlp": 1.0494802, "epoch": 0.21752592815271307, "flos": 25772298209280.0, "grad_norm": 1.6704808358896843, "language_loss": 0.75190145, "learning_rate": 3.642308790849329e-06, "loss": 0.77375555, "num_input_tokens_seen": 78055645, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.95703125, "step": 3618, "time_per_iteration": 2.522794246673584 }, { "auxiliary_loss_clip": 0.01143899, "auxiliary_loss_mlp": 0.01047967, "balance_loss_clip": 1.03019309, "balance_loss_mlp": 1.04881406, "epoch": 0.21758605140538104, "flos": 11254692349440.0, "grad_norm": 9.700616571580893, "language_loss": 0.69170034, "learning_rate": 3.642086491552996e-06, "loss": 0.71361899, "num_input_tokens_seen": 78071660, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.94921875, "step": 3619, "time_per_iteration": 2.4313876628875732 }, { "auxiliary_loss_clip": 0.01147858, "auxiliary_loss_mlp": 0.01043876, "balance_loss_clip": 1.02655482, "balance_loss_mlp": 1.05076945, "epoch": 0.217646174658049, "flos": 19242625455360.0, "grad_norm": 1.8898493245114354, "language_loss": 0.78639066, "learning_rate": 3.641864129988579e-06, "loss": 0.80830801, "num_input_tokens_seen": 78091265, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.97265625, "step": 3620, "time_per_iteration": 2.4896883964538574 }, { "auxiliary_loss_clip": 0.01139467, "auxiliary_loss_mlp": 0.01038047, "balance_loss_clip": 1.02173948, "balance_loss_mlp": 1.04835081, "epoch": 0.21770629791071697, "flos": 21945083057280.0, "grad_norm": 1.8203229505956489, "language_loss": 0.79809952, "learning_rate": 3.641641706164509e-06, "loss": 0.8198747, "num_input_tokens_seen": 78110095, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.91015625, "step": 3621, "time_per_iteration": 2.4742586612701416 }, { "auxiliary_loss_clip": 0.01140594, "auxiliary_loss_mlp": 0.01040864, "balance_loss_clip": 1.02450871, "balance_loss_mlp": 1.0473851, "epoch": 0.21776642116338493, "flos": 24936764970240.0, "grad_norm": 1.927826351227982, "language_loss": 0.87455332, "learning_rate": 3.641419220089221e-06, "loss": 0.89636791, "num_input_tokens_seen": 78129475, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.93359375, "step": 3622, "time_per_iteration": 2.520951986312866 }, { "auxiliary_loss_clip": 0.01143905, "auxiliary_loss_mlp": 0.01035851, "balance_loss_clip": 1.01678991, "balance_loss_mlp": 1.04881525, "epoch": 0.2178265444160529, "flos": 17821317219840.0, "grad_norm": 1.8159898996070263, "language_loss": 0.76968312, "learning_rate": 3.641196671771152e-06, "loss": 0.79148066, "num_input_tokens_seen": 78146880, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.953125, "step": 3623, "time_per_iteration": 2.441681146621704 }, { "auxiliary_loss_clip": 0.01147217, "auxiliary_loss_mlp": 0.01049381, "balance_loss_clip": 1.03129745, "balance_loss_mlp": 1.05077195, "epoch": 0.2178866676687209, "flos": 17712902995200.0, "grad_norm": 1.9382360576218265, "language_loss": 0.84221625, "learning_rate": 3.640974061218741e-06, "loss": 0.86418223, "num_input_tokens_seen": 78165065, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.96484375, "step": 3624, "time_per_iteration": 2.45865797996521 }, { "auxiliary_loss_clip": 0.01148003, "auxiliary_loss_mlp": 0.01053652, "balance_loss_clip": 1.03612816, "balance_loss_mlp": 1.0524404, "epoch": 0.21794679092138886, "flos": 16945851035520.0, "grad_norm": 2.363839644627453, "language_loss": 0.77470672, "learning_rate": 3.640751388440429e-06, "loss": 0.79672325, "num_input_tokens_seen": 78180005, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.95703125, "step": 3625, "time_per_iteration": 2.444514036178589 }, { "auxiliary_loss_clip": 0.01058655, "auxiliary_loss_mlp": 0.01005409, "balance_loss_clip": 1.00347793, "balance_loss_mlp": 1.02718711, "epoch": 0.21800691417405682, "flos": 63718566566400.0, "grad_norm": 0.8132621846479342, "language_loss": 0.60723758, "learning_rate": 3.64052865344466e-06, "loss": 0.62787825, "num_input_tokens_seen": 78245350, "router_z_loss_clip": 0.01928711, "router_z_loss_mlp": 0.31445312, "step": 3626, "time_per_iteration": 3.190155267715454 }, { "auxiliary_loss_clip": 0.0114601, "auxiliary_loss_mlp": 0.0104336, "balance_loss_clip": 1.02468002, "balance_loss_mlp": 1.04918528, "epoch": 0.21806703742672479, "flos": 21616392677760.0, "grad_norm": 13.128371151688196, "language_loss": 0.90482986, "learning_rate": 3.6403058562398795e-06, "loss": 0.9267236, "num_input_tokens_seen": 78264165, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.96875, "step": 3627, "time_per_iteration": 2.481536626815796 }, { "auxiliary_loss_clip": 0.01141286, "auxiliary_loss_mlp": 0.01041302, "balance_loss_clip": 1.02374244, "balance_loss_mlp": 1.04733932, "epoch": 0.21812716067939275, "flos": 19354882435200.0, "grad_norm": 2.5320396758082255, "language_loss": 0.73403406, "learning_rate": 3.6400829968345365e-06, "loss": 0.75585997, "num_input_tokens_seen": 78283745, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9375, "step": 3628, "time_per_iteration": 2.4715020656585693 }, { "auxiliary_loss_clip": 0.01141442, "auxiliary_loss_mlp": 0.01038622, "balance_loss_clip": 1.02183712, "balance_loss_mlp": 1.04764438, "epoch": 0.21818728393206072, "flos": 23548063305600.0, "grad_norm": 1.9156057262728625, "language_loss": 0.77113801, "learning_rate": 3.6398600752370826e-06, "loss": 0.79293865, "num_input_tokens_seen": 78302900, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9375, "step": 3629, "time_per_iteration": 2.4826674461364746 }, { "auxiliary_loss_clip": 0.01144191, "auxiliary_loss_mlp": 0.01037117, "balance_loss_clip": 1.02022481, "balance_loss_mlp": 1.05085874, "epoch": 0.21824740718472868, "flos": 30225652266240.0, "grad_norm": 1.6668240206129643, "language_loss": 0.71209669, "learning_rate": 3.63963709145597e-06, "loss": 0.73390973, "num_input_tokens_seen": 78326470, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.93359375, "step": 3630, "time_per_iteration": 2.5550537109375 }, { "auxiliary_loss_clip": 0.01138059, "auxiliary_loss_mlp": 0.01039329, "balance_loss_clip": 1.02423728, "balance_loss_mlp": 1.04895008, "epoch": 0.21830753043739667, "flos": 26134672567680.0, "grad_norm": 1.8533452354298952, "language_loss": 0.76693219, "learning_rate": 3.6394140454996544e-06, "loss": 0.78870606, "num_input_tokens_seen": 78345810, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.890625, "step": 3631, "time_per_iteration": 2.5063252449035645 }, { "auxiliary_loss_clip": 0.01142078, "auxiliary_loss_mlp": 0.01041019, "balance_loss_clip": 1.02460384, "balance_loss_mlp": 1.04873252, "epoch": 0.21836765369006464, "flos": 21720712752000.0, "grad_norm": 2.071201212822213, "language_loss": 0.75280201, "learning_rate": 3.639190937376594e-06, "loss": 0.77463293, "num_input_tokens_seen": 78364085, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.93359375, "step": 3632, "time_per_iteration": 2.497148275375366 }, { "auxiliary_loss_clip": 0.01139785, "auxiliary_loss_mlp": 0.0103725, "balance_loss_clip": 1.02122831, "balance_loss_mlp": 1.04706156, "epoch": 0.2184277769427326, "flos": 19937604775680.0, "grad_norm": 1.9358825925279877, "language_loss": 0.83569181, "learning_rate": 3.638967767095249e-06, "loss": 0.85746217, "num_input_tokens_seen": 78381385, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.92578125, "step": 3633, "time_per_iteration": 2.4512972831726074 }, { "auxiliary_loss_clip": 0.01142586, "auxiliary_loss_mlp": 0.01048735, "balance_loss_clip": 1.03226018, "balance_loss_mlp": 1.0504328, "epoch": 0.21848790019540057, "flos": 20340235301760.0, "grad_norm": 1.6451880420502412, "language_loss": 0.81631148, "learning_rate": 3.6387445346640823e-06, "loss": 0.83822477, "num_input_tokens_seen": 78400500, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.921875, "step": 3634, "time_per_iteration": 3.941770315170288 }, { "auxiliary_loss_clip": 0.01147616, "auxiliary_loss_mlp": 0.01040289, "balance_loss_clip": 1.02319455, "balance_loss_mlp": 1.05170536, "epoch": 0.21854802344806853, "flos": 15450818135040.0, "grad_norm": 1.798714765303193, "language_loss": 0.75250709, "learning_rate": 3.638521240091558e-06, "loss": 0.77438611, "num_input_tokens_seen": 78418340, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9609375, "step": 3635, "time_per_iteration": 2.4588232040405273 }, { "auxiliary_loss_clip": 0.01141987, "auxiliary_loss_mlp": 0.01046854, "balance_loss_clip": 1.03035593, "balance_loss_mlp": 1.05063283, "epoch": 0.2186081467007365, "flos": 16320717711360.0, "grad_norm": 1.9832423990796486, "language_loss": 0.88396287, "learning_rate": 3.6382978833861445e-06, "loss": 0.90585124, "num_input_tokens_seen": 78434375, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.9140625, "step": 3636, "time_per_iteration": 3.813763380050659 }, { "auxiliary_loss_clip": 0.01143248, "auxiliary_loss_mlp": 0.01046511, "balance_loss_clip": 1.02951169, "balance_loss_mlp": 1.04955184, "epoch": 0.2186682699534045, "flos": 21689255416320.0, "grad_norm": 2.3114329984956288, "language_loss": 0.75702161, "learning_rate": 3.638074464556311e-06, "loss": 0.77891922, "num_input_tokens_seen": 78451735, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9375, "step": 3637, "time_per_iteration": 2.513801336288452 }, { "auxiliary_loss_clip": 0.01148248, "auxiliary_loss_mlp": 0.01041774, "balance_loss_clip": 1.0241909, "balance_loss_mlp": 1.0506494, "epoch": 0.21872839320607246, "flos": 17739260599680.0, "grad_norm": 2.5139209244103413, "language_loss": 0.89873165, "learning_rate": 3.63785098361053e-06, "loss": 0.92063189, "num_input_tokens_seen": 78462730, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9765625, "step": 3638, "time_per_iteration": 3.826472282409668 }, { "auxiliary_loss_clip": 0.01144837, "auxiliary_loss_mlp": 0.01049855, "balance_loss_clip": 1.03262949, "balance_loss_mlp": 1.05179572, "epoch": 0.21878851645874042, "flos": 18652289431680.0, "grad_norm": 2.5183371032028847, "language_loss": 0.89690542, "learning_rate": 3.637627440557275e-06, "loss": 0.91885227, "num_input_tokens_seen": 78476300, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9296875, "step": 3639, "time_per_iteration": 3.837825298309326 }, { "auxiliary_loss_clip": 0.0114594, "auxiliary_loss_mlp": 0.01041079, "balance_loss_clip": 1.02466381, "balance_loss_mlp": 1.05218041, "epoch": 0.2188486397114084, "flos": 25557301353600.0, "grad_norm": 1.709724373566246, "language_loss": 0.79270327, "learning_rate": 3.637403835405024e-06, "loss": 0.81457347, "num_input_tokens_seen": 78496135, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9375, "step": 3640, "time_per_iteration": 2.525104284286499 }, { "auxiliary_loss_clip": 0.01145625, "auxiliary_loss_mlp": 0.01049927, "balance_loss_clip": 1.03105664, "balance_loss_mlp": 1.05294275, "epoch": 0.21890876296407635, "flos": 17892061056000.0, "grad_norm": 2.2225262190120434, "language_loss": 0.7225095, "learning_rate": 3.637180168162255e-06, "loss": 0.74446499, "num_input_tokens_seen": 78513855, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.9296875, "step": 3641, "time_per_iteration": 2.43376088142395 }, { "auxiliary_loss_clip": 0.01146105, "auxiliary_loss_mlp": 0.01041923, "balance_loss_clip": 1.02554381, "balance_loss_mlp": 1.05464506, "epoch": 0.21896888621674432, "flos": 17749100926080.0, "grad_norm": 2.030019751850636, "language_loss": 0.80627084, "learning_rate": 3.63695643883745e-06, "loss": 0.82815111, "num_input_tokens_seen": 78531740, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9140625, "step": 3642, "time_per_iteration": 2.4765424728393555 }, { "auxiliary_loss_clip": 0.0114958, "auxiliary_loss_mlp": 0.01042077, "balance_loss_clip": 1.02371883, "balance_loss_mlp": 1.05376923, "epoch": 0.21902900946941228, "flos": 23076161400960.0, "grad_norm": 1.8374898644029638, "language_loss": 0.71810317, "learning_rate": 3.6367326474390928e-06, "loss": 0.74001974, "num_input_tokens_seen": 78549600, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.953125, "step": 3643, "time_per_iteration": 2.483795404434204 }, { "auxiliary_loss_clip": 0.01148375, "auxiliary_loss_mlp": 0.0104626, "balance_loss_clip": 1.02830696, "balance_loss_mlp": 1.05338323, "epoch": 0.21908913272208028, "flos": 48178545004800.0, "grad_norm": 2.0406962697804314, "language_loss": 0.68092835, "learning_rate": 3.6365087939756696e-06, "loss": 0.70287472, "num_input_tokens_seen": 78573350, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.94921875, "step": 3644, "time_per_iteration": 2.713334321975708 }, { "auxiliary_loss_clip": 0.01150413, "auxiliary_loss_mlp": 0.01043152, "balance_loss_clip": 1.02603352, "balance_loss_mlp": 1.05313182, "epoch": 0.21914925597474824, "flos": 22236749493120.0, "grad_norm": 2.2829473981765935, "language_loss": 0.7772764, "learning_rate": 3.636284878455669e-06, "loss": 0.79921204, "num_input_tokens_seen": 78591005, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.97265625, "step": 3645, "time_per_iteration": 2.5022952556610107 }, { "auxiliary_loss_clip": 0.0114576, "auxiliary_loss_mlp": 0.01050254, "balance_loss_clip": 1.03362453, "balance_loss_mlp": 1.0537138, "epoch": 0.2192093792274162, "flos": 22125605834880.0, "grad_norm": 1.563831984290412, "language_loss": 0.82444644, "learning_rate": 3.636060900887582e-06, "loss": 0.84640658, "num_input_tokens_seen": 78610645, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.91796875, "step": 3646, "time_per_iteration": 2.4960408210754395 }, { "auxiliary_loss_clip": 0.01144568, "auxiliary_loss_mlp": 0.01038617, "balance_loss_clip": 1.02210617, "balance_loss_mlp": 1.05328739, "epoch": 0.21926950248008417, "flos": 15669442264320.0, "grad_norm": 4.85036222292203, "language_loss": 0.82190681, "learning_rate": 3.635836861279901e-06, "loss": 0.84373856, "num_input_tokens_seen": 78628340, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.9140625, "step": 3647, "time_per_iteration": 2.4506752490997314 }, { "auxiliary_loss_clip": 0.01142161, "auxiliary_loss_mlp": 0.01046295, "balance_loss_clip": 1.02974844, "balance_loss_mlp": 1.04939592, "epoch": 0.21932962573275214, "flos": 30262496641920.0, "grad_norm": 1.6246101633908578, "language_loss": 0.72502953, "learning_rate": 3.635612759641123e-06, "loss": 0.74691403, "num_input_tokens_seen": 78649355, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9296875, "step": 3648, "time_per_iteration": 2.567897319793701 }, { "auxiliary_loss_clip": 0.0114548, "auxiliary_loss_mlp": 0.01043936, "balance_loss_clip": 1.02517247, "balance_loss_mlp": 1.04953015, "epoch": 0.2193897489854201, "flos": 10780132838400.0, "grad_norm": 3.0201132396923374, "language_loss": 0.74562955, "learning_rate": 3.635388595979745e-06, "loss": 0.76752371, "num_input_tokens_seen": 78664915, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.9609375, "step": 3649, "time_per_iteration": 2.447448492050171 }, { "auxiliary_loss_clip": 0.01138237, "auxiliary_loss_mlp": 0.01042251, "balance_loss_clip": 1.02588344, "balance_loss_mlp": 1.04891825, "epoch": 0.21944987223808807, "flos": 19133313390720.0, "grad_norm": 1.9803500677800379, "language_loss": 0.86170495, "learning_rate": 3.635164370304267e-06, "loss": 0.88350987, "num_input_tokens_seen": 78681475, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.89453125, "step": 3650, "time_per_iteration": 2.4618756771087646 }, { "auxiliary_loss_clip": 0.01141725, "auxiliary_loss_mlp": 0.01042329, "balance_loss_clip": 1.02408957, "balance_loss_mlp": 1.04817271, "epoch": 0.21950999549075606, "flos": 22711093522560.0, "grad_norm": 2.5548960574917077, "language_loss": 0.83520174, "learning_rate": 3.6349400826231927e-06, "loss": 0.85704225, "num_input_tokens_seen": 78702300, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.93359375, "step": 3651, "time_per_iteration": 2.498929500579834 }, { "auxiliary_loss_clip": 0.01139039, "auxiliary_loss_mlp": 0.01045421, "balance_loss_clip": 1.02868378, "balance_loss_mlp": 1.04700971, "epoch": 0.21957011874342403, "flos": 10561329141120.0, "grad_norm": 1.8457845826012673, "language_loss": 0.74156588, "learning_rate": 3.634715732945027e-06, "loss": 0.76341045, "num_input_tokens_seen": 78720230, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.921875, "step": 3652, "time_per_iteration": 2.470423698425293 }, { "auxiliary_loss_clip": 0.01060054, "auxiliary_loss_mlp": 0.01007264, "balance_loss_clip": 1.00474834, "balance_loss_mlp": 1.02900076, "epoch": 0.219630241996092, "flos": 65747913252480.0, "grad_norm": 0.7462818197462179, "language_loss": 0.51611871, "learning_rate": 3.6344913212782764e-06, "loss": 0.53679192, "num_input_tokens_seen": 78780200, "router_z_loss_clip": 0.02514648, "router_z_loss_mlp": 0.31054688, "step": 3653, "time_per_iteration": 3.102757692337036 }, { "auxiliary_loss_clip": 0.0114469, "auxiliary_loss_mlp": 0.01052735, "balance_loss_clip": 1.03584301, "balance_loss_mlp": 1.05241537, "epoch": 0.21969036524875996, "flos": 23696518216320.0, "grad_norm": 1.9216240295261209, "language_loss": 0.75231087, "learning_rate": 3.6342668476314514e-06, "loss": 0.77428514, "num_input_tokens_seen": 78800575, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.921875, "step": 3654, "time_per_iteration": 2.4946677684783936 }, { "auxiliary_loss_clip": 0.01146627, "auxiliary_loss_mlp": 0.01043833, "balance_loss_clip": 1.02712011, "balance_loss_mlp": 1.0511167, "epoch": 0.21975048850142792, "flos": 19640910435840.0, "grad_norm": 1.9469209674510415, "language_loss": 0.72504675, "learning_rate": 3.634042312013064e-06, "loss": 0.7469514, "num_input_tokens_seen": 78819585, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.953125, "step": 3655, "time_per_iteration": 2.4748759269714355 }, { "auxiliary_loss_clip": 0.01141575, "auxiliary_loss_mlp": 0.01044035, "balance_loss_clip": 1.02739382, "balance_loss_mlp": 1.04839385, "epoch": 0.21981061175409589, "flos": 22448550038400.0, "grad_norm": 1.614652637087909, "language_loss": 0.80729222, "learning_rate": 3.6338177144316276e-06, "loss": 0.82914829, "num_input_tokens_seen": 78837330, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9296875, "step": 3656, "time_per_iteration": 2.482865571975708 }, { "auxiliary_loss_clip": 0.01143675, "auxiliary_loss_mlp": 0.01035171, "balance_loss_clip": 1.01876819, "balance_loss_mlp": 1.05168152, "epoch": 0.21987073500676388, "flos": 18151049093760.0, "grad_norm": 2.2298524192170337, "language_loss": 0.85244489, "learning_rate": 3.63359305489566e-06, "loss": 0.87423337, "num_input_tokens_seen": 78854955, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.921875, "step": 3657, "time_per_iteration": 2.482724189758301 }, { "auxiliary_loss_clip": 0.01140985, "auxiliary_loss_mlp": 0.01035948, "balance_loss_clip": 1.01921058, "balance_loss_mlp": 1.04731679, "epoch": 0.21993085825943184, "flos": 25626177682560.0, "grad_norm": 1.593714119942887, "language_loss": 0.8024711, "learning_rate": 3.6333683334136803e-06, "loss": 0.82424045, "num_input_tokens_seen": 78874965, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9375, "step": 3658, "time_per_iteration": 2.5224077701568604 }, { "auxiliary_loss_clip": 0.01056484, "auxiliary_loss_mlp": 0.0100364, "balance_loss_clip": 1.00124407, "balance_loss_mlp": 1.02552271, "epoch": 0.2199909815120998, "flos": 70923217743360.0, "grad_norm": 0.7900246713165853, "language_loss": 0.58273327, "learning_rate": 3.6331435499942095e-06, "loss": 0.60333455, "num_input_tokens_seen": 78937740, "router_z_loss_clip": 0.02392578, "router_z_loss_mlp": 0.30859375, "step": 3659, "time_per_iteration": 3.1907103061676025 }, { "auxiliary_loss_clip": 0.01139608, "auxiliary_loss_mlp": 0.01038551, "balance_loss_clip": 1.02139711, "balance_loss_mlp": 1.04802346, "epoch": 0.22005110476476777, "flos": 21543529939200.0, "grad_norm": 2.696780289267304, "language_loss": 0.74779725, "learning_rate": 3.632918704645772e-06, "loss": 0.76957881, "num_input_tokens_seen": 78955055, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9140625, "step": 3660, "time_per_iteration": 2.4659106731414795 }, { "auxiliary_loss_clip": 0.01142099, "auxiliary_loss_mlp": 0.01036919, "balance_loss_clip": 1.0202539, "balance_loss_mlp": 1.04845166, "epoch": 0.22011122801743574, "flos": 22054502862720.0, "grad_norm": 3.1856760480098596, "language_loss": 0.81429476, "learning_rate": 3.632693797376893e-06, "loss": 0.8360849, "num_input_tokens_seen": 78974895, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.9375, "step": 3661, "time_per_iteration": 2.490460157394409 }, { "auxiliary_loss_clip": 0.01140223, "auxiliary_loss_mlp": 0.01037095, "balance_loss_clip": 1.02156186, "balance_loss_mlp": 1.04879379, "epoch": 0.2201713512701037, "flos": 26687589598080.0, "grad_norm": 1.8064672549138183, "language_loss": 0.73478425, "learning_rate": 3.632468828196102e-06, "loss": 0.75655746, "num_input_tokens_seen": 78994990, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.9140625, "step": 3662, "time_per_iteration": 2.4958136081695557 }, { "auxiliary_loss_clip": 0.01141665, "auxiliary_loss_mlp": 0.01046504, "balance_loss_clip": 1.03128076, "balance_loss_mlp": 1.05183005, "epoch": 0.22023147452277167, "flos": 22162198815360.0, "grad_norm": 1.5224027796327866, "language_loss": 0.78394896, "learning_rate": 3.632243797111929e-06, "loss": 0.8058306, "num_input_tokens_seen": 79014405, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8984375, "step": 3663, "time_per_iteration": 2.5077242851257324 }, { "auxiliary_loss_clip": 0.0114471, "auxiliary_loss_mlp": 0.01043132, "balance_loss_clip": 1.02620459, "balance_loss_mlp": 1.05108988, "epoch": 0.22029159777543966, "flos": 22523280284160.0, "grad_norm": 2.7084428543002894, "language_loss": 0.80384552, "learning_rate": 3.632018704132908e-06, "loss": 0.82572401, "num_input_tokens_seen": 79032375, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9375, "step": 3664, "time_per_iteration": 2.4723453521728516 }, { "auxiliary_loss_clip": 0.01147557, "auxiliary_loss_mlp": 0.01042802, "balance_loss_clip": 1.02400255, "balance_loss_mlp": 1.04995108, "epoch": 0.22035172102810763, "flos": 13042469093760.0, "grad_norm": 3.0486953401049157, "language_loss": 0.7663306, "learning_rate": 3.6317935492675742e-06, "loss": 0.78823417, "num_input_tokens_seen": 79049635, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.9765625, "step": 3665, "time_per_iteration": 2.448838710784912 }, { "auxiliary_loss_clip": 0.0114115, "auxiliary_loss_mlp": 0.01044565, "balance_loss_clip": 1.02795887, "balance_loss_mlp": 1.04885221, "epoch": 0.2204118442807756, "flos": 12165817760640.0, "grad_norm": 6.116396510208304, "language_loss": 0.9792217, "learning_rate": 3.631568332524466e-06, "loss": 1.00107884, "num_input_tokens_seen": 79062890, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.921875, "step": 3666, "time_per_iteration": 2.439445972442627 }, { "auxiliary_loss_clip": 0.0113987, "auxiliary_loss_mlp": 0.01039501, "balance_loss_clip": 1.02220345, "balance_loss_mlp": 1.04725361, "epoch": 0.22047196753344356, "flos": 40108806673920.0, "grad_norm": 1.6330381886257763, "language_loss": 0.80579484, "learning_rate": 3.631343053912122e-06, "loss": 0.8275885, "num_input_tokens_seen": 79085495, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.92578125, "step": 3667, "time_per_iteration": 2.6320183277130127 }, { "auxiliary_loss_clip": 0.01147423, "auxiliary_loss_mlp": 0.0104222, "balance_loss_clip": 1.02388561, "balance_loss_mlp": 1.05143595, "epoch": 0.22053209078611152, "flos": 20701137202560.0, "grad_norm": 1.8777696066668195, "language_loss": 0.77532154, "learning_rate": 3.631117713439087e-06, "loss": 0.79721797, "num_input_tokens_seen": 79101820, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9609375, "step": 3668, "time_per_iteration": 2.4619507789611816 }, { "auxiliary_loss_clip": 0.01145385, "auxiliary_loss_mlp": 0.01042848, "balance_loss_clip": 1.02602732, "balance_loss_mlp": 1.0519855, "epoch": 0.2205922140387795, "flos": 24716309247360.0, "grad_norm": 1.569287312341001, "language_loss": 0.70834434, "learning_rate": 3.630892311113904e-06, "loss": 0.7302267, "num_input_tokens_seen": 79123320, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.93359375, "step": 3669, "time_per_iteration": 2.549187421798706 }, { "auxiliary_loss_clip": 0.01142422, "auxiliary_loss_mlp": 0.0104068, "balance_loss_clip": 1.02399051, "balance_loss_mlp": 1.04939771, "epoch": 0.22065233729144745, "flos": 23477247642240.0, "grad_norm": 1.8146824316071914, "language_loss": 0.85552239, "learning_rate": 3.6306668469451215e-06, "loss": 0.87735343, "num_input_tokens_seen": 79141615, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.9296875, "step": 3670, "time_per_iteration": 2.4990475177764893 }, { "auxiliary_loss_clip": 0.0114929, "auxiliary_loss_mlp": 0.01036347, "balance_loss_clip": 1.01957417, "balance_loss_mlp": 1.05288315, "epoch": 0.22071246054411545, "flos": 35225566646400.0, "grad_norm": 2.1291781836743158, "language_loss": 0.76536262, "learning_rate": 3.6304413209412886e-06, "loss": 0.78721905, "num_input_tokens_seen": 79164910, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.96484375, "step": 3671, "time_per_iteration": 2.6145853996276855 }, { "auxiliary_loss_clip": 0.01144796, "auxiliary_loss_mlp": 0.01041223, "balance_loss_clip": 1.02474844, "balance_loss_mlp": 1.0518136, "epoch": 0.2207725837967834, "flos": 18150294908160.0, "grad_norm": 2.3503532740269657, "language_loss": 0.8092314, "learning_rate": 3.6302157331109573e-06, "loss": 0.83109158, "num_input_tokens_seen": 79179685, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.9296875, "step": 3672, "time_per_iteration": 2.4619386196136475 }, { "auxiliary_loss_clip": 0.01145072, "auxiliary_loss_mlp": 0.0105105, "balance_loss_clip": 1.0342176, "balance_loss_mlp": 1.05122495, "epoch": 0.22083270704945138, "flos": 20479675898880.0, "grad_norm": 2.3034004383442435, "language_loss": 0.73609316, "learning_rate": 3.629990083462682e-06, "loss": 0.75805432, "num_input_tokens_seen": 79196285, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9375, "step": 3673, "time_per_iteration": 2.5093424320220947 }, { "auxiliary_loss_clip": 0.01149887, "auxiliary_loss_mlp": 0.01037157, "balance_loss_clip": 1.019454, "balance_loss_mlp": 1.05529356, "epoch": 0.22089283030211934, "flos": 34125801984000.0, "grad_norm": 2.2949680228327387, "language_loss": 0.76352382, "learning_rate": 3.6297643720050203e-06, "loss": 0.78539425, "num_input_tokens_seen": 79216060, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9453125, "step": 3674, "time_per_iteration": 2.579538583755493 }, { "auxiliary_loss_clip": 0.01146634, "auxiliary_loss_mlp": 0.0104245, "balance_loss_clip": 1.02410352, "balance_loss_mlp": 1.05316186, "epoch": 0.2209529535547873, "flos": 18077216688000.0, "grad_norm": 1.894215358627427, "language_loss": 0.74819696, "learning_rate": 3.6295385987465293e-06, "loss": 0.77008778, "num_input_tokens_seen": 79235145, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.93359375, "step": 3675, "time_per_iteration": 2.4678127765655518 }, { "auxiliary_loss_clip": 0.01147333, "auxiliary_loss_mlp": 0.01043317, "balance_loss_clip": 1.02668738, "balance_loss_mlp": 1.05281138, "epoch": 0.22101307680745527, "flos": 27235335070080.0, "grad_norm": 1.7417854274428934, "language_loss": 0.8036139, "learning_rate": 3.629312763695772e-06, "loss": 0.8255204, "num_input_tokens_seen": 79256960, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9453125, "step": 3676, "time_per_iteration": 3.993628740310669 }, { "auxiliary_loss_clip": 0.0114399, "auxiliary_loss_mlp": 0.0105294, "balance_loss_clip": 1.03688216, "balance_loss_mlp": 1.05007839, "epoch": 0.22107320006012326, "flos": 16543256423040.0, "grad_norm": 2.1028858011637044, "language_loss": 0.7478261, "learning_rate": 3.6290868668613107e-06, "loss": 0.76979542, "num_input_tokens_seen": 79274860, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.9375, "step": 3677, "time_per_iteration": 2.4271187782287598 }, { "auxiliary_loss_clip": 0.01144462, "auxiliary_loss_mlp": 0.01047655, "balance_loss_clip": 1.03104877, "balance_loss_mlp": 1.05059886, "epoch": 0.22113332331279123, "flos": 22054466949120.0, "grad_norm": 1.9586398980863564, "language_loss": 0.83289063, "learning_rate": 3.628860908251712e-06, "loss": 0.85481173, "num_input_tokens_seen": 79294005, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9375, "step": 3678, "time_per_iteration": 3.858799695968628 }, { "auxiliary_loss_clip": 0.01148664, "auxiliary_loss_mlp": 0.01045174, "balance_loss_clip": 1.02799582, "balance_loss_mlp": 1.05585194, "epoch": 0.2211934465654592, "flos": 26612787525120.0, "grad_norm": 1.6854689128131881, "language_loss": 0.89197218, "learning_rate": 3.6286348878755452e-06, "loss": 0.91391051, "num_input_tokens_seen": 79314005, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9296875, "step": 3679, "time_per_iteration": 2.513601064682007 }, { "auxiliary_loss_clip": 0.01148758, "auxiliary_loss_mlp": 0.01048387, "balance_loss_clip": 1.03058934, "balance_loss_mlp": 1.053038, "epoch": 0.22125356981812716, "flos": 16360363347840.0, "grad_norm": 2.1682293397480197, "language_loss": 0.86875153, "learning_rate": 3.6284088057413803e-06, "loss": 0.89072299, "num_input_tokens_seen": 79331030, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.95703125, "step": 3680, "time_per_iteration": 3.865264892578125 }, { "auxiliary_loss_clip": 0.01145312, "auxiliary_loss_mlp": 0.01045334, "balance_loss_clip": 1.02808475, "balance_loss_mlp": 1.05418384, "epoch": 0.22131369307079513, "flos": 21651118151040.0, "grad_norm": 1.8531043000338605, "language_loss": 0.80983686, "learning_rate": 3.6281826618577894e-06, "loss": 0.83174336, "num_input_tokens_seen": 79348560, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.91015625, "step": 3681, "time_per_iteration": 3.8877694606781006 }, { "auxiliary_loss_clip": 0.01140827, "auxiliary_loss_mlp": 0.01039926, "balance_loss_clip": 1.0238204, "balance_loss_mlp": 1.05273151, "epoch": 0.2213738163234631, "flos": 19609524927360.0, "grad_norm": 2.402608127636083, "language_loss": 0.79009694, "learning_rate": 3.62795645623335e-06, "loss": 0.81190443, "num_input_tokens_seen": 79367175, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8828125, "step": 3682, "time_per_iteration": 2.462883234024048 }, { "auxiliary_loss_clip": 0.01145412, "auxiliary_loss_mlp": 0.01043668, "balance_loss_clip": 1.02597713, "balance_loss_mlp": 1.05126786, "epoch": 0.22143393957613106, "flos": 23623404082560.0, "grad_norm": 1.6709490669346292, "language_loss": 0.7757749, "learning_rate": 3.627730188876638e-06, "loss": 0.79766572, "num_input_tokens_seen": 79388435, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.94140625, "step": 3683, "time_per_iteration": 2.5260422229766846 }, { "auxiliary_loss_clip": 0.01146471, "auxiliary_loss_mlp": 0.01045966, "balance_loss_clip": 1.02910984, "balance_loss_mlp": 1.05078614, "epoch": 0.22149406282879905, "flos": 26177801823360.0, "grad_norm": 1.7436645663998651, "language_loss": 0.71987033, "learning_rate": 3.627503859796234e-06, "loss": 0.74179465, "num_input_tokens_seen": 79407910, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.95703125, "step": 3684, "time_per_iteration": 2.5098743438720703 }, { "auxiliary_loss_clip": 0.01147407, "auxiliary_loss_mlp": 0.01045172, "balance_loss_clip": 1.02681375, "balance_loss_mlp": 1.05373001, "epoch": 0.221554186081467, "flos": 14538758970240.0, "grad_norm": 2.014940004976963, "language_loss": 0.79955041, "learning_rate": 3.6272774690007207e-06, "loss": 0.82147616, "num_input_tokens_seen": 79424020, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9375, "step": 3685, "time_per_iteration": 2.45466685295105 }, { "auxiliary_loss_clip": 0.01141289, "auxiliary_loss_mlp": 0.01043735, "balance_loss_clip": 1.02770185, "balance_loss_mlp": 1.05174458, "epoch": 0.22161430933413498, "flos": 22238257864320.0, "grad_norm": 1.460559051763859, "language_loss": 0.87351525, "learning_rate": 3.6270510164986823e-06, "loss": 0.8953656, "num_input_tokens_seen": 79445605, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8984375, "step": 3686, "time_per_iteration": 2.512413501739502 }, { "auxiliary_loss_clip": 0.01141087, "auxiliary_loss_mlp": 0.0104366, "balance_loss_clip": 1.02692246, "balance_loss_mlp": 1.04958034, "epoch": 0.22167443258680294, "flos": 23476529370240.0, "grad_norm": 1.8454285457021515, "language_loss": 0.77676046, "learning_rate": 3.626824502298707e-06, "loss": 0.79860795, "num_input_tokens_seen": 79463850, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9140625, "step": 3687, "time_per_iteration": 2.5025110244750977 }, { "auxiliary_loss_clip": 0.01150485, "auxiliary_loss_mlp": 0.01048743, "balance_loss_clip": 1.03001475, "balance_loss_mlp": 1.052894, "epoch": 0.2217345558394709, "flos": 23221132692480.0, "grad_norm": 1.7656213793162465, "language_loss": 0.84786153, "learning_rate": 3.626597926409383e-06, "loss": 0.86985373, "num_input_tokens_seen": 79482845, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.9765625, "step": 3688, "time_per_iteration": 2.490243911743164 }, { "auxiliary_loss_clip": 0.01150094, "auxiliary_loss_mlp": 0.01045729, "balance_loss_clip": 1.02833652, "balance_loss_mlp": 1.05262768, "epoch": 0.22179467909213887, "flos": 20011078045440.0, "grad_norm": 1.8662833406547925, "language_loss": 0.81484002, "learning_rate": 3.6263712888393027e-06, "loss": 0.83679825, "num_input_tokens_seen": 79501550, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9765625, "step": 3689, "time_per_iteration": 2.4656248092651367 }, { "auxiliary_loss_clip": 0.01145363, "auxiliary_loss_mlp": 0.01045806, "balance_loss_clip": 1.02829432, "balance_loss_mlp": 1.05160832, "epoch": 0.22185480234480687, "flos": 19683034110720.0, "grad_norm": 1.7573168730945004, "language_loss": 0.70149481, "learning_rate": 3.626144589597061e-06, "loss": 0.72340655, "num_input_tokens_seen": 79519680, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9375, "step": 3690, "time_per_iteration": 2.4605987071990967 }, { "auxiliary_loss_clip": 0.01150821, "auxiliary_loss_mlp": 0.01034849, "balance_loss_clip": 1.01688433, "balance_loss_mlp": 1.05291915, "epoch": 0.22191492559747483, "flos": 21981316901760.0, "grad_norm": 2.380317988330153, "language_loss": 0.71913499, "learning_rate": 3.6259178286912528e-06, "loss": 0.74099165, "num_input_tokens_seen": 79539000, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.98046875, "step": 3691, "time_per_iteration": 2.499126672744751 }, { "auxiliary_loss_clip": 0.01146752, "auxiliary_loss_mlp": 0.01048003, "balance_loss_clip": 1.02990675, "balance_loss_mlp": 1.05370879, "epoch": 0.2219750488501428, "flos": 23222066446080.0, "grad_norm": 1.9075207206380853, "language_loss": 0.71115589, "learning_rate": 3.625691006130477e-06, "loss": 0.73310345, "num_input_tokens_seen": 79559695, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.9296875, "step": 3692, "time_per_iteration": 2.5016849040985107 }, { "auxiliary_loss_clip": 0.01151242, "auxiliary_loss_mlp": 0.01047084, "balance_loss_clip": 1.02976263, "balance_loss_mlp": 1.05302072, "epoch": 0.22203517210281076, "flos": 22453685683200.0, "grad_norm": 2.1123716258581475, "language_loss": 0.87749833, "learning_rate": 3.6254641219233362e-06, "loss": 0.89948165, "num_input_tokens_seen": 79579095, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.98046875, "step": 3693, "time_per_iteration": 2.515249729156494 }, { "auxiliary_loss_clip": 0.01141521, "auxiliary_loss_mlp": 0.0103962, "balance_loss_clip": 1.02424228, "balance_loss_mlp": 1.0509944, "epoch": 0.22209529535547873, "flos": 17564555825280.0, "grad_norm": 2.0242585478664936, "language_loss": 0.85407394, "learning_rate": 3.6252371760784325e-06, "loss": 0.87588537, "num_input_tokens_seen": 79596430, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.90625, "step": 3694, "time_per_iteration": 2.4541285037994385 }, { "auxiliary_loss_clip": 0.01149483, "auxiliary_loss_mlp": 0.01038081, "balance_loss_clip": 1.02059257, "balance_loss_mlp": 1.05045676, "epoch": 0.2221554186081467, "flos": 21469015175040.0, "grad_norm": 3.3586843476040937, "language_loss": 0.68968511, "learning_rate": 3.6250101686043725e-06, "loss": 0.71156073, "num_input_tokens_seen": 79615825, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9921875, "step": 3695, "time_per_iteration": 2.493748426437378 }, { "auxiliary_loss_clip": 0.01143723, "auxiliary_loss_mlp": 0.01041513, "balance_loss_clip": 1.02580094, "balance_loss_mlp": 1.05276489, "epoch": 0.22221554186081466, "flos": 27673445255040.0, "grad_norm": 1.47421382366915, "language_loss": 0.7192471, "learning_rate": 3.6247830995097637e-06, "loss": 0.74109948, "num_input_tokens_seen": 79637875, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.91015625, "step": 3696, "time_per_iteration": 2.5276834964752197 }, { "auxiliary_loss_clip": 0.01145648, "auxiliary_loss_mlp": 0.01039328, "balance_loss_clip": 1.02226949, "balance_loss_mlp": 1.05087078, "epoch": 0.22227566511348265, "flos": 25958926298880.0, "grad_norm": 1.6818172623123477, "language_loss": 0.87839991, "learning_rate": 3.624555968803217e-06, "loss": 0.90024966, "num_input_tokens_seen": 79656970, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.94921875, "step": 3697, "time_per_iteration": 2.52862286567688 }, { "auxiliary_loss_clip": 0.01139461, "auxiliary_loss_mlp": 0.01044774, "balance_loss_clip": 1.0291214, "balance_loss_mlp": 1.04986525, "epoch": 0.22233578836615062, "flos": 39203678833920.0, "grad_norm": 1.669859697950233, "language_loss": 0.65910578, "learning_rate": 3.624328776493346e-06, "loss": 0.68094814, "num_input_tokens_seen": 79680275, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.89453125, "step": 3698, "time_per_iteration": 2.6353225708007812 }, { "auxiliary_loss_clip": 0.01147765, "auxiliary_loss_mlp": 0.0104111, "balance_loss_clip": 1.0234195, "balance_loss_mlp": 1.05194044, "epoch": 0.22239591161881858, "flos": 36283782251520.0, "grad_norm": 1.9755363262102776, "language_loss": 0.83273733, "learning_rate": 3.6241015225887637e-06, "loss": 0.85462606, "num_input_tokens_seen": 79701255, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.95703125, "step": 3699, "time_per_iteration": 2.5937366485595703 }, { "auxiliary_loss_clip": 0.01146613, "auxiliary_loss_mlp": 0.01045472, "balance_loss_clip": 1.02782893, "balance_loss_mlp": 1.05308998, "epoch": 0.22245603487148655, "flos": 19719591177600.0, "grad_norm": 1.451525337132977, "language_loss": 0.79560864, "learning_rate": 3.62387420709809e-06, "loss": 0.8175295, "num_input_tokens_seen": 79721315, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9375, "step": 3700, "time_per_iteration": 2.4647536277770996 }, { "auxiliary_loss_clip": 0.01153599, "auxiliary_loss_mlp": 0.01048479, "balance_loss_clip": 1.03010917, "balance_loss_mlp": 1.05552959, "epoch": 0.2225161581241545, "flos": 46280450615040.0, "grad_norm": 1.9434004159447367, "language_loss": 0.72122073, "learning_rate": 3.623646830029943e-06, "loss": 0.74324155, "num_input_tokens_seen": 79742705, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.98046875, "step": 3701, "time_per_iteration": 2.677196502685547 }, { "auxiliary_loss_clip": 0.01144906, "auxiliary_loss_mlp": 0.01041323, "balance_loss_clip": 1.02421606, "balance_loss_mlp": 1.05053234, "epoch": 0.22257628137682248, "flos": 23696194993920.0, "grad_norm": 2.000403203863157, "language_loss": 0.80033457, "learning_rate": 3.6234193913929454e-06, "loss": 0.8221969, "num_input_tokens_seen": 79763000, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9453125, "step": 3702, "time_per_iteration": 2.4826481342315674 }, { "auxiliary_loss_clip": 0.01137456, "auxiliary_loss_mlp": 0.01039336, "balance_loss_clip": 1.02319455, "balance_loss_mlp": 1.04761505, "epoch": 0.22263640462949044, "flos": 19353984595200.0, "grad_norm": 3.603707379738109, "language_loss": 0.78478861, "learning_rate": 3.623191891195723e-06, "loss": 0.80655658, "num_input_tokens_seen": 79781335, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8984375, "step": 3703, "time_per_iteration": 2.4564783573150635 }, { "auxiliary_loss_clip": 0.01147878, "auxiliary_loss_mlp": 0.01043577, "balance_loss_clip": 1.02459943, "balance_loss_mlp": 1.05185843, "epoch": 0.22269652788215843, "flos": 20776047016320.0, "grad_norm": 2.4046512751941864, "language_loss": 0.74596488, "learning_rate": 3.6229643294469005e-06, "loss": 0.76787937, "num_input_tokens_seen": 79800150, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.9609375, "step": 3704, "time_per_iteration": 2.4687483310699463 }, { "auxiliary_loss_clip": 0.0114323, "auxiliary_loss_mlp": 0.01042611, "balance_loss_clip": 1.02649426, "balance_loss_mlp": 1.05317414, "epoch": 0.2227566511348264, "flos": 47958843467520.0, "grad_norm": 2.320293356566321, "language_loss": 0.64611721, "learning_rate": 3.6227367061551074e-06, "loss": 0.6679756, "num_input_tokens_seen": 79822390, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8984375, "step": 3705, "time_per_iteration": 2.700981616973877 }, { "auxiliary_loss_clip": 0.01068027, "auxiliary_loss_mlp": 0.01025169, "balance_loss_clip": 1.02229619, "balance_loss_mlp": 1.03523946, "epoch": 0.22281677438749437, "flos": 66218953230720.0, "grad_norm": 1.373900303096401, "language_loss": 0.65280473, "learning_rate": 3.6225090213289766e-06, "loss": 0.67373669, "num_input_tokens_seen": 79873350, "router_z_loss_clip": 0.02868652, "router_z_loss_mlp": 0.328125, "step": 3706, "time_per_iteration": 2.996877908706665 }, { "auxiliary_loss_clip": 0.01144159, "auxiliary_loss_mlp": 0.01040476, "balance_loss_clip": 1.02309489, "balance_loss_mlp": 1.04944241, "epoch": 0.22287689764016233, "flos": 21871609787520.0, "grad_norm": 4.391374962328407, "language_loss": 0.80710435, "learning_rate": 3.622281274977141e-06, "loss": 0.8289507, "num_input_tokens_seen": 79891715, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9453125, "step": 3707, "time_per_iteration": 2.4728705883026123 }, { "auxiliary_loss_clip": 0.01146069, "auxiliary_loss_mlp": 0.01040862, "balance_loss_clip": 1.02309966, "balance_loss_mlp": 1.05234289, "epoch": 0.2229370208928303, "flos": 27672475587840.0, "grad_norm": 1.8827616910312943, "language_loss": 0.77881944, "learning_rate": 3.6220534671082367e-06, "loss": 0.80068874, "num_input_tokens_seen": 79911175, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9375, "step": 3708, "time_per_iteration": 2.5489320755004883 }, { "auxiliary_loss_clip": 0.01148839, "auxiliary_loss_mlp": 0.01041367, "balance_loss_clip": 1.02371264, "balance_loss_mlp": 1.05207324, "epoch": 0.22299714414549826, "flos": 30154657034880.0, "grad_norm": 2.220610467030409, "language_loss": 0.80269277, "learning_rate": 3.6218255977309024e-06, "loss": 0.82459486, "num_input_tokens_seen": 79931875, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.96875, "step": 3709, "time_per_iteration": 2.5538697242736816 }, { "auxiliary_loss_clip": 0.01147618, "auxiliary_loss_mlp": 0.01047679, "balance_loss_clip": 1.03022671, "balance_loss_mlp": 1.05111599, "epoch": 0.22305726739816625, "flos": 23143134309120.0, "grad_norm": 2.32561482452976, "language_loss": 0.68473357, "learning_rate": 3.6215976668537787e-06, "loss": 0.70668656, "num_input_tokens_seen": 79952445, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.96484375, "step": 3710, "time_per_iteration": 2.5554051399230957 }, { "auxiliary_loss_clip": 0.01152782, "auxiliary_loss_mlp": 0.01045473, "balance_loss_clip": 1.02754378, "balance_loss_mlp": 1.05305624, "epoch": 0.22311739065083422, "flos": 19172061187200.0, "grad_norm": 2.2900144944465475, "language_loss": 0.90114224, "learning_rate": 3.6213696744855096e-06, "loss": 0.92312479, "num_input_tokens_seen": 79971030, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.99609375, "step": 3711, "time_per_iteration": 2.4577832221984863 }, { "auxiliary_loss_clip": 0.01145967, "auxiliary_loss_mlp": 0.01056066, "balance_loss_clip": 1.03677773, "balance_loss_mlp": 1.05031848, "epoch": 0.22317751390350218, "flos": 13617757319040.0, "grad_norm": 2.409864764562277, "language_loss": 0.89163744, "learning_rate": 3.6211416206347395e-06, "loss": 0.91365778, "num_input_tokens_seen": 79982085, "router_z_loss_clip": 0.19335938, "router_z_loss_mlp": 0.95703125, "step": 3712, "time_per_iteration": 2.418992280960083 }, { "auxiliary_loss_clip": 0.01149599, "auxiliary_loss_mlp": 0.01045787, "balance_loss_clip": 1.02789402, "balance_loss_mlp": 1.05584049, "epoch": 0.22323763715617015, "flos": 11029065068160.0, "grad_norm": 3.031745674609478, "language_loss": 0.75667822, "learning_rate": 3.620913505310117e-06, "loss": 0.77863204, "num_input_tokens_seen": 79997460, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9375, "step": 3713, "time_per_iteration": 2.432886838912964 }, { "auxiliary_loss_clip": 0.01145778, "auxiliary_loss_mlp": 0.01047615, "balance_loss_clip": 1.03027022, "balance_loss_mlp": 1.05141473, "epoch": 0.22329776040883811, "flos": 41351531466240.0, "grad_norm": 2.0478914624680478, "language_loss": 0.62716836, "learning_rate": 3.6206853285202917e-06, "loss": 0.64910233, "num_input_tokens_seen": 80022450, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9453125, "step": 3714, "time_per_iteration": 2.6669058799743652 }, { "auxiliary_loss_clip": 0.01145231, "auxiliary_loss_mlp": 0.01037546, "balance_loss_clip": 1.0206902, "balance_loss_mlp": 1.05142832, "epoch": 0.22335788366150608, "flos": 25119478477440.0, "grad_norm": 2.1521685053991733, "language_loss": 0.79212368, "learning_rate": 3.6204570902739164e-06, "loss": 0.81395143, "num_input_tokens_seen": 80042100, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9375, "step": 3715, "time_per_iteration": 2.4889943599700928 }, { "auxiliary_loss_clip": 0.01149484, "auxiliary_loss_mlp": 0.01052724, "balance_loss_clip": 1.03466344, "balance_loss_mlp": 1.05420613, "epoch": 0.22341800691417404, "flos": 16983377769600.0, "grad_norm": 2.51568706931423, "language_loss": 0.76929051, "learning_rate": 3.620228790579645e-06, "loss": 0.79131258, "num_input_tokens_seen": 80059690, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.953125, "step": 3716, "time_per_iteration": 2.493096113204956 }, { "auxiliary_loss_clip": 0.01149025, "auxiliary_loss_mlp": 0.01048199, "balance_loss_clip": 1.03069901, "balance_loss_mlp": 1.05233729, "epoch": 0.22347813016684204, "flos": 14136738975360.0, "grad_norm": 2.970491362940292, "language_loss": 0.78931403, "learning_rate": 3.6200004294461367e-06, "loss": 0.81128627, "num_input_tokens_seen": 80076060, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.96875, "step": 3717, "time_per_iteration": 2.4514658451080322 }, { "auxiliary_loss_clip": 0.01150206, "auxiliary_loss_mlp": 0.01043554, "balance_loss_clip": 1.0251596, "balance_loss_mlp": 1.05318189, "epoch": 0.22353825341951, "flos": 23583147914880.0, "grad_norm": 2.140174347387616, "language_loss": 0.67732906, "learning_rate": 3.6197720068820497e-06, "loss": 0.69926661, "num_input_tokens_seen": 80094760, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.96875, "step": 3718, "time_per_iteration": 4.0651535987854 }, { "auxiliary_loss_clip": 0.01149688, "auxiliary_loss_mlp": 0.01039353, "balance_loss_clip": 1.02051759, "balance_loss_mlp": 1.05227637, "epoch": 0.22359837667217797, "flos": 29824206888960.0, "grad_norm": 1.8175159317809746, "language_loss": 0.80696416, "learning_rate": 3.619543522896045e-06, "loss": 0.82885456, "num_input_tokens_seen": 80114475, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.97265625, "step": 3719, "time_per_iteration": 2.530877113342285 }, { "auxiliary_loss_clip": 0.01153315, "auxiliary_loss_mlp": 0.01054047, "balance_loss_clip": 1.03512859, "balance_loss_mlp": 1.05225849, "epoch": 0.22365849992484593, "flos": 17603088140160.0, "grad_norm": 2.0413056576709936, "language_loss": 0.86886346, "learning_rate": 3.6193149774967885e-06, "loss": 0.89093709, "num_input_tokens_seen": 80132920, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 1.0078125, "step": 3720, "time_per_iteration": 3.8375890254974365 }, { "auxiliary_loss_clip": 0.01148974, "auxiliary_loss_mlp": 0.01041147, "balance_loss_clip": 1.02334905, "balance_loss_mlp": 1.05610657, "epoch": 0.2237186231775139, "flos": 22710949868160.0, "grad_norm": 1.9344655338544474, "language_loss": 0.75021207, "learning_rate": 3.619086370692945e-06, "loss": 0.77211326, "num_input_tokens_seen": 80152845, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9296875, "step": 3721, "time_per_iteration": 2.491048812866211 }, { "auxiliary_loss_clip": 0.01152996, "auxiliary_loss_mlp": 0.0104473, "balance_loss_clip": 1.02632427, "balance_loss_mlp": 1.05476606, "epoch": 0.22377874643018186, "flos": 13371518609280.0, "grad_norm": 2.1541300426128904, "language_loss": 0.79333848, "learning_rate": 3.6188577024931844e-06, "loss": 0.81531572, "num_input_tokens_seen": 80170680, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.984375, "step": 3722, "time_per_iteration": 5.255339860916138 }, { "auxiliary_loss_clip": 0.01148635, "auxiliary_loss_mlp": 0.01041755, "balance_loss_clip": 1.02528, "balance_loss_mlp": 1.05549264, "epoch": 0.22383886968284986, "flos": 17894970057600.0, "grad_norm": 2.0902499806623807, "language_loss": 0.82231128, "learning_rate": 3.618628972906178e-06, "loss": 0.84421515, "num_input_tokens_seen": 80189030, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.9296875, "step": 3723, "time_per_iteration": 2.4416961669921875 }, { "auxiliary_loss_clip": 0.01151555, "auxiliary_loss_mlp": 0.01049824, "balance_loss_clip": 1.03171611, "balance_loss_mlp": 1.05473256, "epoch": 0.22389899293551782, "flos": 23879123982720.0, "grad_norm": 1.791485936686604, "language_loss": 0.85028291, "learning_rate": 3.6184001819405984e-06, "loss": 0.87229675, "num_input_tokens_seen": 80208365, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.96875, "step": 3724, "time_per_iteration": 2.5059454441070557 }, { "auxiliary_loss_clip": 0.01148572, "auxiliary_loss_mlp": 0.01041273, "balance_loss_clip": 1.0241425, "balance_loss_mlp": 1.05428553, "epoch": 0.2239591161881858, "flos": 27272430840960.0, "grad_norm": 1.9578565337938896, "language_loss": 0.79037762, "learning_rate": 3.618171329605121e-06, "loss": 0.81227607, "num_input_tokens_seen": 80228685, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.94140625, "step": 3725, "time_per_iteration": 2.5258426666259766 }, { "auxiliary_loss_clip": 0.01146487, "auxiliary_loss_mlp": 0.01040287, "balance_loss_clip": 1.02333534, "balance_loss_mlp": 1.05291438, "epoch": 0.22401923944085375, "flos": 22236857233920.0, "grad_norm": 1.7265876759586163, "language_loss": 0.7744844, "learning_rate": 3.6179424159084254e-06, "loss": 0.79635215, "num_input_tokens_seen": 80247635, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9375, "step": 3726, "time_per_iteration": 2.514009475708008 }, { "auxiliary_loss_clip": 0.01158521, "auxiliary_loss_mlp": 0.01049589, "balance_loss_clip": 1.02931142, "balance_loss_mlp": 1.05574679, "epoch": 0.22407936269352172, "flos": 12053668521600.0, "grad_norm": 2.5603947947595183, "language_loss": 0.72031391, "learning_rate": 3.6177134408591914e-06, "loss": 0.74239492, "num_input_tokens_seen": 80260045, "router_z_loss_clip": 0.203125, "router_z_loss_mlp": 1.03125, "step": 3727, "time_per_iteration": 2.4262595176696777 }, { "auxiliary_loss_clip": 0.01154069, "auxiliary_loss_mlp": 0.0104116, "balance_loss_clip": 1.02096629, "balance_loss_mlp": 1.05414844, "epoch": 0.22413948594618968, "flos": 19353553632000.0, "grad_norm": 2.1435431935305105, "language_loss": 0.86664379, "learning_rate": 3.6174844044661013e-06, "loss": 0.88859612, "num_input_tokens_seen": 80277680, "router_z_loss_clip": 0.20214844, "router_z_loss_mlp": 1.0, "step": 3728, "time_per_iteration": 2.479055643081665 }, { "auxiliary_loss_clip": 0.01151631, "auxiliary_loss_mlp": 0.01045583, "balance_loss_clip": 1.02581811, "balance_loss_mlp": 1.05661964, "epoch": 0.22419960919885765, "flos": 24170000319360.0, "grad_norm": 2.301691964047213, "language_loss": 0.80374575, "learning_rate": 3.6172553067378406e-06, "loss": 0.82571787, "num_input_tokens_seen": 80294795, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.953125, "step": 3729, "time_per_iteration": 2.546344757080078 }, { "auxiliary_loss_clip": 0.01147148, "auxiliary_loss_mlp": 0.01049014, "balance_loss_clip": 1.03265822, "balance_loss_mlp": 1.05450249, "epoch": 0.22425973245152564, "flos": 27378977558400.0, "grad_norm": 1.5204158529212857, "language_loss": 0.86573738, "learning_rate": 3.6170261476830964e-06, "loss": 0.88769895, "num_input_tokens_seen": 80315425, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.92578125, "step": 3730, "time_per_iteration": 2.554219961166382 }, { "auxiliary_loss_clip": 0.01143599, "auxiliary_loss_mlp": 0.01040453, "balance_loss_clip": 1.0232985, "balance_loss_mlp": 1.05254841, "epoch": 0.2243198557041936, "flos": 13735652734080.0, "grad_norm": 1.6836828981418084, "language_loss": 0.73020375, "learning_rate": 3.616796927310559e-06, "loss": 0.7520442, "num_input_tokens_seen": 80333905, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.91015625, "step": 3731, "time_per_iteration": 2.4643397331237793 }, { "auxiliary_loss_clip": 0.01149595, "auxiliary_loss_mlp": 0.01041745, "balance_loss_clip": 1.02484059, "balance_loss_mlp": 1.05424213, "epoch": 0.22437997895686157, "flos": 19530700531200.0, "grad_norm": 1.744294600684333, "language_loss": 0.75290155, "learning_rate": 3.6165676456289195e-06, "loss": 0.7748149, "num_input_tokens_seen": 80352165, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.953125, "step": 3732, "time_per_iteration": 2.485269069671631 }, { "auxiliary_loss_clip": 0.01150233, "auxiliary_loss_mlp": 0.01054421, "balance_loss_clip": 1.03699303, "balance_loss_mlp": 1.0556159, "epoch": 0.22444010220952954, "flos": 23696230907520.0, "grad_norm": 1.7237079521563352, "language_loss": 0.87843704, "learning_rate": 3.616338302646873e-06, "loss": 0.90048361, "num_input_tokens_seen": 80371305, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9453125, "step": 3733, "time_per_iteration": 2.4895823001861572 }, { "auxiliary_loss_clip": 0.01146855, "auxiliary_loss_mlp": 0.01043839, "balance_loss_clip": 1.02605307, "balance_loss_mlp": 1.05231953, "epoch": 0.2245002254621975, "flos": 22382905933440.0, "grad_norm": 1.523094300556258, "language_loss": 0.84879023, "learning_rate": 3.6161088983731166e-06, "loss": 0.87069714, "num_input_tokens_seen": 80391020, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9453125, "step": 3734, "time_per_iteration": 2.5046606063842773 }, { "auxiliary_loss_clip": 0.01148686, "auxiliary_loss_mlp": 0.01050807, "balance_loss_clip": 1.03350925, "balance_loss_mlp": 1.05464196, "epoch": 0.22456034871486547, "flos": 26942303917440.0, "grad_norm": 1.9071457808772514, "language_loss": 0.76704371, "learning_rate": 3.6158794328163482e-06, "loss": 0.78903866, "num_input_tokens_seen": 80411365, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.94140625, "step": 3735, "time_per_iteration": 2.5094211101531982 }, { "auxiliary_loss_clip": 0.01142735, "auxiliary_loss_mlp": 0.01045612, "balance_loss_clip": 1.02932799, "balance_loss_mlp": 1.05406833, "epoch": 0.22462047196753343, "flos": 28983538005120.0, "grad_norm": 1.8007652046754945, "language_loss": 0.84617877, "learning_rate": 3.6156499059852702e-06, "loss": 0.86806226, "num_input_tokens_seen": 80431075, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.88671875, "step": 3736, "time_per_iteration": 2.542078733444214 }, { "auxiliary_loss_clip": 0.01145573, "auxiliary_loss_mlp": 0.01042823, "balance_loss_clip": 1.02643192, "balance_loss_mlp": 1.05265939, "epoch": 0.22468059522020142, "flos": 20011329440640.0, "grad_norm": 1.7088417648534848, "language_loss": 0.86357021, "learning_rate": 3.615420317888586e-06, "loss": 0.88545418, "num_input_tokens_seen": 80449240, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9296875, "step": 3737, "time_per_iteration": 2.4655988216400146 }, { "auxiliary_loss_clip": 0.01147967, "auxiliary_loss_mlp": 0.01051576, "balance_loss_clip": 1.03345561, "balance_loss_mlp": 1.05307126, "epoch": 0.2247407184728694, "flos": 29314239546240.0, "grad_norm": 6.394013948519083, "language_loss": 0.79037571, "learning_rate": 3.6151906685350006e-06, "loss": 0.81237113, "num_input_tokens_seen": 80467900, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.94921875, "step": 3738, "time_per_iteration": 2.5471558570861816 }, { "auxiliary_loss_clip": 0.01145106, "auxiliary_loss_mlp": 0.01040501, "balance_loss_clip": 1.02440155, "balance_loss_mlp": 1.05173981, "epoch": 0.22480084172553735, "flos": 22310366417280.0, "grad_norm": 1.5292402951189834, "language_loss": 0.76056468, "learning_rate": 3.614960957933224e-06, "loss": 0.78242075, "num_input_tokens_seen": 80487100, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.93359375, "step": 3739, "time_per_iteration": 2.4855587482452393 }, { "auxiliary_loss_clip": 0.01144507, "auxiliary_loss_mlp": 0.01044899, "balance_loss_clip": 1.02748239, "balance_loss_mlp": 1.05169499, "epoch": 0.22486096497820532, "flos": 25591272641280.0, "grad_norm": 2.0747798960609876, "language_loss": 0.74296063, "learning_rate": 3.6147311860919655e-06, "loss": 0.76485467, "num_input_tokens_seen": 80508625, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9296875, "step": 3740, "time_per_iteration": 2.5343406200408936 }, { "auxiliary_loss_clip": 0.01142548, "auxiliary_loss_mlp": 0.01040575, "balance_loss_clip": 1.02356362, "balance_loss_mlp": 1.05079722, "epoch": 0.22492108823087328, "flos": 17639824775040.0, "grad_norm": 2.037841451386636, "language_loss": 0.75821769, "learning_rate": 3.614501353019939e-06, "loss": 0.78004897, "num_input_tokens_seen": 80527345, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.91796875, "step": 3741, "time_per_iteration": 2.446380138397217 }, { "auxiliary_loss_clip": 0.01146416, "auxiliary_loss_mlp": 0.01037361, "balance_loss_clip": 1.02070737, "balance_loss_mlp": 1.0545516, "epoch": 0.22498121148354125, "flos": 16034653797120.0, "grad_norm": 1.8784571387325737, "language_loss": 0.87143487, "learning_rate": 3.6142714587258592e-06, "loss": 0.89327264, "num_input_tokens_seen": 80545545, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.91796875, "step": 3742, "time_per_iteration": 2.4719290733337402 }, { "auxiliary_loss_clip": 0.01143044, "auxiliary_loss_mlp": 0.01045924, "balance_loss_clip": 1.0278523, "balance_loss_mlp": 1.05257273, "epoch": 0.22504133473620924, "flos": 24023772051840.0, "grad_norm": 2.129164997699333, "language_loss": 0.81759846, "learning_rate": 3.614041503218444e-06, "loss": 0.83948821, "num_input_tokens_seen": 80565040, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.90625, "step": 3743, "time_per_iteration": 2.4882960319519043 }, { "auxiliary_loss_clip": 0.01145547, "auxiliary_loss_mlp": 0.01036427, "balance_loss_clip": 1.01986909, "balance_loss_mlp": 1.05258763, "epoch": 0.2251014579888772, "flos": 16763963541120.0, "grad_norm": 2.2370006284317054, "language_loss": 0.63460398, "learning_rate": 3.6138114865064134e-06, "loss": 0.65642375, "num_input_tokens_seen": 80582815, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9296875, "step": 3744, "time_per_iteration": 2.473238229751587 }, { "auxiliary_loss_clip": 0.01142075, "auxiliary_loss_mlp": 0.01042487, "balance_loss_clip": 1.02583385, "balance_loss_mlp": 1.04882228, "epoch": 0.22516158124154517, "flos": 13991013498240.0, "grad_norm": 3.2353996146698374, "language_loss": 0.76261175, "learning_rate": 3.613581408598489e-06, "loss": 0.78445733, "num_input_tokens_seen": 80600865, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.93359375, "step": 3745, "time_per_iteration": 2.46761417388916 }, { "auxiliary_loss_clip": 0.01142966, "auxiliary_loss_mlp": 0.01039105, "balance_loss_clip": 1.02251148, "balance_loss_mlp": 1.05081737, "epoch": 0.22522170449421314, "flos": 14390016750720.0, "grad_norm": 1.8031279148852415, "language_loss": 0.80663961, "learning_rate": 3.6133512695033965e-06, "loss": 0.82846034, "num_input_tokens_seen": 80617455, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.921875, "step": 3746, "time_per_iteration": 2.4726667404174805 }, { "auxiliary_loss_clip": 0.01144822, "auxiliary_loss_mlp": 0.01044333, "balance_loss_clip": 1.02729833, "balance_loss_mlp": 1.05066156, "epoch": 0.2252818277468811, "flos": 23805542972160.0, "grad_norm": 2.500448639645975, "language_loss": 0.85988522, "learning_rate": 3.613121069229862e-06, "loss": 0.88177681, "num_input_tokens_seen": 80635125, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.94140625, "step": 3747, "time_per_iteration": 2.5330967903137207 }, { "auxiliary_loss_clip": 0.01142422, "auxiliary_loss_mlp": 0.01033592, "balance_loss_clip": 1.01762986, "balance_loss_mlp": 1.04935443, "epoch": 0.22534195099954907, "flos": 24718033100160.0, "grad_norm": 1.8286245404500574, "language_loss": 0.76485628, "learning_rate": 3.6128908077866145e-06, "loss": 0.78661638, "num_input_tokens_seen": 80656370, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.9296875, "step": 3748, "time_per_iteration": 2.5658774375915527 }, { "auxiliary_loss_clip": 0.01145704, "auxiliary_loss_mlp": 0.01041432, "balance_loss_clip": 1.02510023, "balance_loss_mlp": 1.05198729, "epoch": 0.22540207425221703, "flos": 21032341534080.0, "grad_norm": 1.589880718510697, "language_loss": 0.79625857, "learning_rate": 3.6126604851823864e-06, "loss": 0.81812996, "num_input_tokens_seen": 80676495, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.9375, "step": 3749, "time_per_iteration": 2.4867939949035645 }, { "auxiliary_loss_clip": 0.01137812, "auxiliary_loss_mlp": 0.01040787, "balance_loss_clip": 1.02470565, "balance_loss_mlp": 1.0489409, "epoch": 0.22546219750488503, "flos": 19390362094080.0, "grad_norm": 1.5775909093323732, "language_loss": 0.79593003, "learning_rate": 3.6124301014259108e-06, "loss": 0.81771606, "num_input_tokens_seen": 80694755, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.890625, "step": 3750, "time_per_iteration": 2.4919381141662598 }, { "auxiliary_loss_clip": 0.011435, "auxiliary_loss_mlp": 0.01041468, "balance_loss_clip": 1.02426624, "balance_loss_mlp": 1.05058241, "epoch": 0.225522320757553, "flos": 25192628524800.0, "grad_norm": 1.7790766233019943, "language_loss": 0.82149136, "learning_rate": 3.6121996565259244e-06, "loss": 0.84334099, "num_input_tokens_seen": 80713670, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9296875, "step": 3751, "time_per_iteration": 2.5348052978515625 }, { "auxiliary_loss_clip": 0.01146192, "auxiliary_loss_mlp": 0.01044035, "balance_loss_clip": 1.02677321, "balance_loss_mlp": 1.05171072, "epoch": 0.22558244401022096, "flos": 17163110448000.0, "grad_norm": 1.8039284872374655, "language_loss": 0.84048533, "learning_rate": 3.611969150491165e-06, "loss": 0.8623876, "num_input_tokens_seen": 80731450, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.9453125, "step": 3752, "time_per_iteration": 2.4641730785369873 }, { "auxiliary_loss_clip": 0.0114055, "auxiliary_loss_mlp": 0.01034505, "balance_loss_clip": 1.01850736, "balance_loss_mlp": 1.04923558, "epoch": 0.22564256726288892, "flos": 15231008856960.0, "grad_norm": 1.7329783395150555, "language_loss": 0.78971446, "learning_rate": 3.611738583330375e-06, "loss": 0.81146502, "num_input_tokens_seen": 80748415, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.9140625, "step": 3753, "time_per_iteration": 2.437626600265503 }, { "auxiliary_loss_clip": 0.01138854, "auxiliary_loss_mlp": 0.01041388, "balance_loss_clip": 1.02401972, "balance_loss_mlp": 1.04840422, "epoch": 0.2257026905155569, "flos": 34568652764160.0, "grad_norm": 1.8549069732956112, "language_loss": 0.78265977, "learning_rate": 3.611507955052295e-06, "loss": 0.80446219, "num_input_tokens_seen": 80770835, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.90625, "step": 3754, "time_per_iteration": 2.6030919551849365 }, { "auxiliary_loss_clip": 0.01142399, "auxiliary_loss_mlp": 0.01045647, "balance_loss_clip": 1.02862406, "balance_loss_mlp": 1.05250812, "epoch": 0.22576281376822485, "flos": 19938430788480.0, "grad_norm": 1.8174641143642385, "language_loss": 0.70445085, "learning_rate": 3.6112772656656727e-06, "loss": 0.72633129, "num_input_tokens_seen": 80787840, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8984375, "step": 3755, "time_per_iteration": 2.454359769821167 }, { "auxiliary_loss_clip": 0.01149527, "auxiliary_loss_mlp": 0.01046171, "balance_loss_clip": 1.02920794, "balance_loss_mlp": 1.05413175, "epoch": 0.22582293702089282, "flos": 24602005192320.0, "grad_norm": 2.126872733698487, "language_loss": 0.77207291, "learning_rate": 3.6110465151792547e-06, "loss": 0.79402989, "num_input_tokens_seen": 80806335, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.953125, "step": 3756, "time_per_iteration": 2.497143030166626 }, { "auxiliary_loss_clip": 0.0114704, "auxiliary_loss_mlp": 0.01045867, "balance_loss_clip": 1.02872503, "balance_loss_mlp": 1.05290508, "epoch": 0.2258830602735608, "flos": 23035438356480.0, "grad_norm": 1.7806164640992803, "language_loss": 0.82440841, "learning_rate": 3.6108157036017916e-06, "loss": 0.84633756, "num_input_tokens_seen": 80825355, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.94140625, "step": 3757, "time_per_iteration": 2.4888768196105957 }, { "auxiliary_loss_clip": 0.01143716, "auxiliary_loss_mlp": 0.01047219, "balance_loss_clip": 1.02977836, "balance_loss_mlp": 1.05052376, "epoch": 0.22594318352622877, "flos": 22158427887360.0, "grad_norm": 1.7845505705425593, "language_loss": 0.72952461, "learning_rate": 3.6105848309420358e-06, "loss": 0.75143397, "num_input_tokens_seen": 80842570, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9296875, "step": 3758, "time_per_iteration": 2.4747960567474365 }, { "auxiliary_loss_clip": 0.01145087, "auxiliary_loss_mlp": 0.01047324, "balance_loss_clip": 1.02904952, "balance_loss_mlp": 1.05136836, "epoch": 0.22600330677889674, "flos": 20594303176320.0, "grad_norm": 2.0887878133401916, "language_loss": 0.7730751, "learning_rate": 3.6103538972087412e-06, "loss": 0.79499924, "num_input_tokens_seen": 80858745, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.9375, "step": 3759, "time_per_iteration": 2.4446780681610107 }, { "auxiliary_loss_clip": 0.01143758, "auxiliary_loss_mlp": 0.01043565, "balance_loss_clip": 1.02554071, "balance_loss_mlp": 1.04911923, "epoch": 0.2260634300315647, "flos": 35659798162560.0, "grad_norm": 1.7350156265916372, "language_loss": 0.78873885, "learning_rate": 3.6101229024106655e-06, "loss": 0.81061208, "num_input_tokens_seen": 80880085, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.9453125, "step": 3760, "time_per_iteration": 4.031078338623047 }, { "auxiliary_loss_clip": 0.01064709, "auxiliary_loss_mlp": 0.01013834, "balance_loss_clip": 1.01094937, "balance_loss_mlp": 1.03228486, "epoch": 0.22612355328423267, "flos": 72090455126400.0, "grad_norm": 0.9433739534870522, "language_loss": 0.60082626, "learning_rate": 3.609891846556569e-06, "loss": 0.62161171, "num_input_tokens_seen": 80937660, "router_z_loss_clip": 0.02880859, "router_z_loss_mlp": 0.32421875, "step": 3761, "time_per_iteration": 4.449028015136719 }, { "auxiliary_loss_clip": 0.01145269, "auxiliary_loss_mlp": 0.0104454, "balance_loss_clip": 1.02698016, "balance_loss_mlp": 1.04873288, "epoch": 0.22618367653690064, "flos": 22783776693120.0, "grad_norm": 2.150795773598875, "language_loss": 0.7740311, "learning_rate": 3.609660729655211e-06, "loss": 0.79592919, "num_input_tokens_seen": 80956265, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.96875, "step": 3762, "time_per_iteration": 2.4788243770599365 }, { "auxiliary_loss_clip": 0.01144513, "auxiliary_loss_mlp": 0.01040386, "balance_loss_clip": 1.02305317, "balance_loss_mlp": 1.05030465, "epoch": 0.22624379978956863, "flos": 20448254476800.0, "grad_norm": 1.9637881498042107, "language_loss": 0.79118937, "learning_rate": 3.6094295517153573e-06, "loss": 0.81303835, "num_input_tokens_seen": 80975185, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.94140625, "step": 3763, "time_per_iteration": 3.925650119781494 }, { "auxiliary_loss_clip": 0.01145838, "auxiliary_loss_mlp": 0.01051656, "balance_loss_clip": 1.03327346, "balance_loss_mlp": 1.04962254, "epoch": 0.2263039230422366, "flos": 17494314779520.0, "grad_norm": 1.6106221895592063, "language_loss": 0.91116643, "learning_rate": 3.6091983127457743e-06, "loss": 0.93314135, "num_input_tokens_seen": 80992830, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9609375, "step": 3764, "time_per_iteration": 3.941519260406494 }, { "auxiliary_loss_clip": 0.01141491, "auxiliary_loss_mlp": 0.01053668, "balance_loss_clip": 1.03610826, "balance_loss_mlp": 1.04954624, "epoch": 0.22636404629490456, "flos": 28329748606080.0, "grad_norm": 1.9648437019656757, "language_loss": 0.75109166, "learning_rate": 3.6089670127552293e-06, "loss": 0.77304322, "num_input_tokens_seen": 81013675, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.91796875, "step": 3765, "time_per_iteration": 2.5244264602661133 }, { "auxiliary_loss_clip": 0.01140797, "auxiliary_loss_mlp": 0.01045888, "balance_loss_clip": 1.02940106, "balance_loss_mlp": 1.04942632, "epoch": 0.22642416954757252, "flos": 17489143221120.0, "grad_norm": 2.1033664225916713, "language_loss": 0.89537859, "learning_rate": 3.608735651752494e-06, "loss": 0.91724545, "num_input_tokens_seen": 81030345, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9140625, "step": 3766, "time_per_iteration": 2.4539778232574463 }, { "auxiliary_loss_clip": 0.0113944, "auxiliary_loss_mlp": 0.01043436, "balance_loss_clip": 1.02661574, "balance_loss_mlp": 1.05060518, "epoch": 0.2264842928002405, "flos": 24384530298240.0, "grad_norm": 1.5312900800867268, "language_loss": 0.74727404, "learning_rate": 3.6085042297463417e-06, "loss": 0.76910281, "num_input_tokens_seen": 81051000, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.88671875, "step": 3767, "time_per_iteration": 2.502889394760132 }, { "auxiliary_loss_clip": 0.01142132, "auxiliary_loss_mlp": 0.01043209, "balance_loss_clip": 1.02599549, "balance_loss_mlp": 1.04825461, "epoch": 0.22654441605290845, "flos": 19830519354240.0, "grad_norm": 1.5794634569504102, "language_loss": 0.71861488, "learning_rate": 3.6082727467455477e-06, "loss": 0.74046832, "num_input_tokens_seen": 81071205, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9375, "step": 3768, "time_per_iteration": 2.5062062740325928 }, { "auxiliary_loss_clip": 0.0114653, "auxiliary_loss_mlp": 0.01053816, "balance_loss_clip": 1.03555298, "balance_loss_mlp": 1.05414271, "epoch": 0.22660453930557642, "flos": 27454569730560.0, "grad_norm": 1.7046375484399263, "language_loss": 0.78210497, "learning_rate": 3.6080412027588905e-06, "loss": 0.80410838, "num_input_tokens_seen": 81091880, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.921875, "step": 3769, "time_per_iteration": 2.531702995300293 }, { "auxiliary_loss_clip": 0.01146389, "auxiliary_loss_mlp": 0.01040093, "balance_loss_clip": 1.02217615, "balance_loss_mlp": 1.05036592, "epoch": 0.2266646625582444, "flos": 23988148738560.0, "grad_norm": 2.717380597310113, "language_loss": 0.68468416, "learning_rate": 3.6078095977951488e-06, "loss": 0.70654905, "num_input_tokens_seen": 81113290, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9609375, "step": 3770, "time_per_iteration": 2.525883913040161 }, { "auxiliary_loss_clip": 0.01146007, "auxiliary_loss_mlp": 0.01041868, "balance_loss_clip": 1.02457106, "balance_loss_mlp": 1.05125737, "epoch": 0.22672478581091238, "flos": 26028054023040.0, "grad_norm": 1.532883077572624, "language_loss": 0.80691355, "learning_rate": 3.6075779318631067e-06, "loss": 0.82879233, "num_input_tokens_seen": 81133535, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.94921875, "step": 3771, "time_per_iteration": 2.513721227645874 }, { "auxiliary_loss_clip": 0.01134945, "auxiliary_loss_mlp": 0.01048749, "balance_loss_clip": 1.03205955, "balance_loss_mlp": 1.04653049, "epoch": 0.22678490906358034, "flos": 23841812730240.0, "grad_norm": 1.5566274797892232, "language_loss": 0.78857291, "learning_rate": 3.6073462049715486e-06, "loss": 0.81040978, "num_input_tokens_seen": 81154650, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8828125, "step": 3772, "time_per_iteration": 2.5212740898132324 }, { "auxiliary_loss_clip": 0.01059102, "auxiliary_loss_mlp": 0.01007897, "balance_loss_clip": 1.00502396, "balance_loss_mlp": 1.0272572, "epoch": 0.2268450323162483, "flos": 65048088574080.0, "grad_norm": 0.6566932697682215, "language_loss": 0.54394305, "learning_rate": 3.607114417129261e-06, "loss": 0.56461298, "num_input_tokens_seen": 81221240, "router_z_loss_clip": 0.02868652, "router_z_loss_mlp": 0.31835938, "step": 3773, "time_per_iteration": 3.184884786605835 }, { "auxiliary_loss_clip": 0.01139362, "auxiliary_loss_mlp": 0.01037003, "balance_loss_clip": 1.01951492, "balance_loss_mlp": 1.04871321, "epoch": 0.22690515556891627, "flos": 22526081544960.0, "grad_norm": 2.1207485482808255, "language_loss": 0.70671856, "learning_rate": 3.6068825683450334e-06, "loss": 0.72848219, "num_input_tokens_seen": 81241520, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.90625, "step": 3774, "time_per_iteration": 2.4756317138671875 }, { "auxiliary_loss_clip": 0.0113912, "auxiliary_loss_mlp": 0.01043588, "balance_loss_clip": 1.02646971, "balance_loss_mlp": 1.04784679, "epoch": 0.22696527882158424, "flos": 18223444955520.0, "grad_norm": 2.0198724764576275, "language_loss": 0.74883151, "learning_rate": 3.606650658627658e-06, "loss": 0.77065861, "num_input_tokens_seen": 81256825, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9140625, "step": 3775, "time_per_iteration": 2.4631528854370117 }, { "auxiliary_loss_clip": 0.01140371, "auxiliary_loss_mlp": 0.01039339, "balance_loss_clip": 1.02357996, "balance_loss_mlp": 1.04844356, "epoch": 0.22702540207425223, "flos": 17019252478080.0, "grad_norm": 2.175791350084697, "language_loss": 0.81688499, "learning_rate": 3.606418687985928e-06, "loss": 0.83868206, "num_input_tokens_seen": 81275695, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.921875, "step": 3776, "time_per_iteration": 2.4413795471191406 }, { "auxiliary_loss_clip": 0.01143243, "auxiliary_loss_mlp": 0.01043995, "balance_loss_clip": 1.02750874, "balance_loss_mlp": 1.04965746, "epoch": 0.2270855253269202, "flos": 21325731822720.0, "grad_norm": 2.2050904201899497, "language_loss": 0.82539207, "learning_rate": 3.606186656428641e-06, "loss": 0.84726441, "num_input_tokens_seen": 81294920, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.9375, "step": 3777, "time_per_iteration": 2.5051093101501465 }, { "auxiliary_loss_clip": 0.01142667, "auxiliary_loss_mlp": 0.01040435, "balance_loss_clip": 1.02319717, "balance_loss_mlp": 1.0492413, "epoch": 0.22714564857958816, "flos": 23550469516800.0, "grad_norm": 1.8702863457014975, "language_loss": 0.72608888, "learning_rate": 3.6059545639645955e-06, "loss": 0.74791992, "num_input_tokens_seen": 81314275, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.93359375, "step": 3778, "time_per_iteration": 2.477700710296631 }, { "auxiliary_loss_clip": 0.01142657, "auxiliary_loss_mlp": 0.01038858, "balance_loss_clip": 1.02139401, "balance_loss_mlp": 1.04836857, "epoch": 0.22720577183225613, "flos": 25989880844160.0, "grad_norm": 2.2217497680931326, "language_loss": 0.64189589, "learning_rate": 3.605722410602591e-06, "loss": 0.66371107, "num_input_tokens_seen": 81333890, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9453125, "step": 3779, "time_per_iteration": 2.543538808822632 }, { "auxiliary_loss_clip": 0.01139935, "auxiliary_loss_mlp": 0.01046832, "balance_loss_clip": 1.0306673, "balance_loss_mlp": 1.04991436, "epoch": 0.2272658950849241, "flos": 20814076540800.0, "grad_norm": 1.659606831428131, "language_loss": 0.70448661, "learning_rate": 3.6054901963514323e-06, "loss": 0.72635436, "num_input_tokens_seen": 81353640, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8984375, "step": 3780, "time_per_iteration": 2.4533581733703613 }, { "auxiliary_loss_clip": 0.01142944, "auxiliary_loss_mlp": 0.01043549, "balance_loss_clip": 1.0260725, "balance_loss_mlp": 1.05120695, "epoch": 0.22732601833759206, "flos": 23909324342400.0, "grad_norm": 1.611781251376716, "language_loss": 0.89610302, "learning_rate": 3.6052579212199246e-06, "loss": 0.91796792, "num_input_tokens_seen": 81371595, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.91796875, "step": 3781, "time_per_iteration": 2.5037286281585693 }, { "auxiliary_loss_clip": 0.0114272, "auxiliary_loss_mlp": 0.01042906, "balance_loss_clip": 1.02572787, "balance_loss_mlp": 1.04835463, "epoch": 0.22738614159026002, "flos": 15924407978880.0, "grad_norm": 2.389629331668866, "language_loss": 0.74701846, "learning_rate": 3.6050255852168753e-06, "loss": 0.76887476, "num_input_tokens_seen": 81388435, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9453125, "step": 3782, "time_per_iteration": 2.4253599643707275 }, { "auxiliary_loss_clip": 0.01139264, "auxiliary_loss_mlp": 0.01042741, "balance_loss_clip": 1.0268743, "balance_loss_mlp": 1.04727137, "epoch": 0.22744626484292801, "flos": 24205515891840.0, "grad_norm": 1.4346944939360413, "language_loss": 0.82502806, "learning_rate": 3.604793188351095e-06, "loss": 0.84684813, "num_input_tokens_seen": 81410195, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.921875, "step": 3783, "time_per_iteration": 2.515467882156372 }, { "auxiliary_loss_clip": 0.01141366, "auxiliary_loss_mlp": 0.01045747, "balance_loss_clip": 1.02745998, "balance_loss_mlp": 1.0483532, "epoch": 0.22750638809559598, "flos": 24791614110720.0, "grad_norm": 2.272300436461767, "language_loss": 0.75946468, "learning_rate": 3.6045607306313964e-06, "loss": 0.78133583, "num_input_tokens_seen": 81430060, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.9296875, "step": 3784, "time_per_iteration": 2.524325370788574 }, { "auxiliary_loss_clip": 0.01137181, "auxiliary_loss_mlp": 0.01045165, "balance_loss_clip": 1.02815354, "balance_loss_mlp": 1.04586279, "epoch": 0.22756651134826394, "flos": 22236498097920.0, "grad_norm": 1.6651755550909122, "language_loss": 0.70710468, "learning_rate": 3.604328212066594e-06, "loss": 0.72892815, "num_input_tokens_seen": 81447375, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9140625, "step": 3785, "time_per_iteration": 2.495887041091919 }, { "auxiliary_loss_clip": 0.01062434, "auxiliary_loss_mlp": 0.01004997, "balance_loss_clip": 1.00199318, "balance_loss_mlp": 1.03035784, "epoch": 0.2276266346009319, "flos": 62707466626560.0, "grad_norm": 0.8214829949422767, "language_loss": 0.61960161, "learning_rate": 3.6040956326655047e-06, "loss": 0.64027596, "num_input_tokens_seen": 81505235, "router_z_loss_clip": 0.0300293, "router_z_loss_mlp": 0.3203125, "step": 3786, "time_per_iteration": 3.0658440589904785 }, { "auxiliary_loss_clip": 0.01148694, "auxiliary_loss_mlp": 0.01041435, "balance_loss_clip": 1.02354169, "balance_loss_mlp": 1.05210066, "epoch": 0.22768675785359987, "flos": 18613936684800.0, "grad_norm": 2.5325895711420165, "language_loss": 0.86354452, "learning_rate": 3.6038629924369486e-06, "loss": 0.88544583, "num_input_tokens_seen": 81518685, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.96484375, "step": 3787, "time_per_iteration": 2.5012753009796143 }, { "auxiliary_loss_clip": 0.01140439, "auxiliary_loss_mlp": 0.01040704, "balance_loss_clip": 1.0245626, "balance_loss_mlp": 1.04981518, "epoch": 0.22774688110626784, "flos": 26870195364480.0, "grad_norm": 1.4958763709871286, "language_loss": 0.72413027, "learning_rate": 3.6036302913897474e-06, "loss": 0.74594176, "num_input_tokens_seen": 81538940, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.90625, "step": 3788, "time_per_iteration": 2.5145907402038574 }, { "auxiliary_loss_clip": 0.01142674, "auxiliary_loss_mlp": 0.01031796, "balance_loss_clip": 1.01546407, "balance_loss_mlp": 1.05093503, "epoch": 0.2278070043589358, "flos": 15553593924480.0, "grad_norm": 2.605086925372986, "language_loss": 0.67548752, "learning_rate": 3.6033975295327243e-06, "loss": 0.69723225, "num_input_tokens_seen": 81555525, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.91796875, "step": 3789, "time_per_iteration": 2.4470176696777344 }, { "auxiliary_loss_clip": 0.01142063, "auxiliary_loss_mlp": 0.01036301, "balance_loss_clip": 1.01939738, "balance_loss_mlp": 1.04975843, "epoch": 0.2278671276116038, "flos": 22416805393920.0, "grad_norm": 2.2608336985588866, "language_loss": 0.7591126, "learning_rate": 3.6031647068747065e-06, "loss": 0.78089625, "num_input_tokens_seen": 81576305, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.92578125, "step": 3790, "time_per_iteration": 2.4971706867218018 }, { "auxiliary_loss_clip": 0.01137528, "auxiliary_loss_mlp": 0.01040018, "balance_loss_clip": 1.0233885, "balance_loss_mlp": 1.04657471, "epoch": 0.22792725086427176, "flos": 20631363033600.0, "grad_norm": 1.8972888634199538, "language_loss": 0.90998828, "learning_rate": 3.602931823424522e-06, "loss": 0.93176377, "num_input_tokens_seen": 81594115, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.90625, "step": 3791, "time_per_iteration": 2.4714484214782715 }, { "auxiliary_loss_clip": 0.01142243, "auxiliary_loss_mlp": 0.01037273, "balance_loss_clip": 1.02024984, "balance_loss_mlp": 1.04749084, "epoch": 0.22798737411693973, "flos": 31428946903680.0, "grad_norm": 1.623764593994017, "language_loss": 0.82271355, "learning_rate": 3.6026988791910026e-06, "loss": 0.84450877, "num_input_tokens_seen": 81615355, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.9453125, "step": 3792, "time_per_iteration": 2.5486273765563965 }, { "auxiliary_loss_clip": 0.01060076, "auxiliary_loss_mlp": 0.01004841, "balance_loss_clip": 1.00194383, "balance_loss_mlp": 1.02827537, "epoch": 0.2280474973696077, "flos": 52396685827200.0, "grad_norm": 1.142175327687419, "language_loss": 0.65640676, "learning_rate": 3.602465874182981e-06, "loss": 0.67705595, "num_input_tokens_seen": 81662075, "router_z_loss_clip": 0.02893066, "router_z_loss_mlp": 0.31835938, "step": 3793, "time_per_iteration": 2.8529915809631348 }, { "auxiliary_loss_clip": 0.01146848, "auxiliary_loss_mlp": 0.01049333, "balance_loss_clip": 1.0314393, "balance_loss_mlp": 1.05020785, "epoch": 0.22810762062227566, "flos": 26396066816640.0, "grad_norm": 2.2173721070114834, "language_loss": 0.77340162, "learning_rate": 3.602232808409293e-06, "loss": 0.79536337, "num_input_tokens_seen": 81681625, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.96875, "step": 3794, "time_per_iteration": 2.5065202713012695 }, { "auxiliary_loss_clip": 0.01142253, "auxiliary_loss_mlp": 0.01040757, "balance_loss_clip": 1.02356696, "balance_loss_mlp": 1.04916, "epoch": 0.22816774387494362, "flos": 25630271832960.0, "grad_norm": 3.550926202434462, "language_loss": 0.80773306, "learning_rate": 3.6019996818787755e-06, "loss": 0.8295632, "num_input_tokens_seen": 81701170, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9296875, "step": 3795, "time_per_iteration": 2.5263593196868896 }, { "auxiliary_loss_clip": 0.01139482, "auxiliary_loss_mlp": 0.01044421, "balance_loss_clip": 1.02787471, "balance_loss_mlp": 1.04832673, "epoch": 0.22822786712761162, "flos": 22451602694400.0, "grad_norm": 1.9032383555189785, "language_loss": 0.77228558, "learning_rate": 3.6017664946002704e-06, "loss": 0.79412454, "num_input_tokens_seen": 81721265, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.9140625, "step": 3796, "time_per_iteration": 2.4684410095214844 }, { "auxiliary_loss_clip": 0.01138999, "auxiliary_loss_mlp": 0.01033836, "balance_loss_clip": 1.01743293, "balance_loss_mlp": 1.04671454, "epoch": 0.22828799038027958, "flos": 12202554395520.0, "grad_norm": 2.2397266787385055, "language_loss": 0.9573195, "learning_rate": 3.6015332465826188e-06, "loss": 0.97904783, "num_input_tokens_seen": 81736565, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.921875, "step": 3797, "time_per_iteration": 2.4536428451538086 }, { "auxiliary_loss_clip": 0.01141693, "auxiliary_loss_mlp": 0.01039314, "balance_loss_clip": 1.02304161, "balance_loss_mlp": 1.04979289, "epoch": 0.22834811363294755, "flos": 22085708803200.0, "grad_norm": 1.6349549294335768, "language_loss": 0.81551516, "learning_rate": 3.601299937834666e-06, "loss": 0.83732522, "num_input_tokens_seen": 81756240, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.91796875, "step": 3798, "time_per_iteration": 2.4940571784973145 }, { "auxiliary_loss_clip": 0.01140351, "auxiliary_loss_mlp": 0.01038917, "balance_loss_clip": 1.02103519, "balance_loss_mlp": 1.04633498, "epoch": 0.2284082368856155, "flos": 24860634094080.0, "grad_norm": 1.852596531196744, "language_loss": 0.79055333, "learning_rate": 3.6010665683652596e-06, "loss": 0.81234598, "num_input_tokens_seen": 81775720, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9375, "step": 3799, "time_per_iteration": 2.5178029537200928 }, { "auxiliary_loss_clip": 0.01141554, "auxiliary_loss_mlp": 0.01050331, "balance_loss_clip": 1.03334355, "balance_loss_mlp": 1.04904866, "epoch": 0.22846836013828348, "flos": 23292882109440.0, "grad_norm": 1.727150681078651, "language_loss": 0.75177896, "learning_rate": 3.6008331381832484e-06, "loss": 0.77369779, "num_input_tokens_seen": 81795830, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.921875, "step": 3800, "time_per_iteration": 2.494016647338867 }, { "auxiliary_loss_clip": 0.0114112, "auxiliary_loss_mlp": 0.01039571, "balance_loss_clip": 1.02418089, "balance_loss_mlp": 1.04995513, "epoch": 0.22852848339095144, "flos": 27416288810880.0, "grad_norm": 1.765461583281549, "language_loss": 0.64105272, "learning_rate": 3.600599647297484e-06, "loss": 0.66285962, "num_input_tokens_seen": 81815745, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.9140625, "step": 3801, "time_per_iteration": 2.539137840270996 }, { "auxiliary_loss_clip": 0.0113746, "auxiliary_loss_mlp": 0.01036963, "balance_loss_clip": 1.02101278, "balance_loss_mlp": 1.04838383, "epoch": 0.2285886066436194, "flos": 26321157002880.0, "grad_norm": 1.7252415684085467, "language_loss": 0.81245035, "learning_rate": 3.60036609571682e-06, "loss": 0.8341946, "num_input_tokens_seen": 81835155, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.890625, "step": 3802, "time_per_iteration": 4.026008605957031 }, { "auxiliary_loss_clip": 0.01140728, "auxiliary_loss_mlp": 0.01048499, "balance_loss_clip": 1.03166687, "balance_loss_mlp": 1.04882586, "epoch": 0.2286487298962874, "flos": 29716475022720.0, "grad_norm": 1.652461827834478, "language_loss": 0.7882247, "learning_rate": 3.600132483450114e-06, "loss": 0.81011701, "num_input_tokens_seen": 81855655, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.921875, "step": 3803, "time_per_iteration": 3.971120834350586 }, { "auxiliary_loss_clip": 0.0113733, "auxiliary_loss_mlp": 0.01041124, "balance_loss_clip": 1.02438724, "balance_loss_mlp": 1.04465175, "epoch": 0.22870885314895537, "flos": 21287199507840.0, "grad_norm": 1.6603573816675787, "language_loss": 0.85285091, "learning_rate": 3.5998988105062235e-06, "loss": 0.87463546, "num_input_tokens_seen": 81876385, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.92578125, "step": 3804, "time_per_iteration": 2.4726722240448 }, { "auxiliary_loss_clip": 0.01141496, "auxiliary_loss_mlp": 0.01040041, "balance_loss_clip": 1.02419853, "balance_loss_mlp": 1.04696929, "epoch": 0.22876897640162333, "flos": 14939450161920.0, "grad_norm": 5.492244628028297, "language_loss": 0.76282132, "learning_rate": 3.59966507689401e-06, "loss": 0.78463674, "num_input_tokens_seen": 81893225, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.9453125, "step": 3805, "time_per_iteration": 3.834423065185547 }, { "auxiliary_loss_clip": 0.01141243, "auxiliary_loss_mlp": 0.01042035, "balance_loss_clip": 1.02455878, "balance_loss_mlp": 1.04620624, "epoch": 0.2288290996542913, "flos": 18113917409280.0, "grad_norm": 2.2541529506121893, "language_loss": 0.78802192, "learning_rate": 3.5994312826223363e-06, "loss": 0.80985463, "num_input_tokens_seen": 81911350, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.953125, "step": 3806, "time_per_iteration": 3.906050682067871 }, { "auxiliary_loss_clip": 0.01141162, "auxiliary_loss_mlp": 0.01046997, "balance_loss_clip": 1.03033113, "balance_loss_mlp": 1.04831529, "epoch": 0.22888922290695926, "flos": 39855457071360.0, "grad_norm": 2.7532976262152946, "language_loss": 0.70008928, "learning_rate": 3.5991974277000684e-06, "loss": 0.72197086, "num_input_tokens_seen": 81935420, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.9296875, "step": 3807, "time_per_iteration": 2.6643049716949463 }, { "auxiliary_loss_clip": 0.01147283, "auxiliary_loss_mlp": 0.01052289, "balance_loss_clip": 1.03469384, "balance_loss_mlp": 1.05117655, "epoch": 0.22894934615962723, "flos": 23403774372480.0, "grad_norm": 2.8897350321861137, "language_loss": 0.65497482, "learning_rate": 3.5989635121360733e-06, "loss": 0.67697054, "num_input_tokens_seen": 81953845, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9609375, "step": 3808, "time_per_iteration": 2.491262197494507 }, { "auxiliary_loss_clip": 0.01141624, "auxiliary_loss_mlp": 0.01046721, "balance_loss_clip": 1.03061569, "balance_loss_mlp": 1.04964042, "epoch": 0.22900946941229522, "flos": 18843011671680.0, "grad_norm": 1.8780214227379721, "language_loss": 0.7512365, "learning_rate": 3.598729535939222e-06, "loss": 0.77311993, "num_input_tokens_seen": 81972100, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.91796875, "step": 3809, "time_per_iteration": 2.4627346992492676 }, { "auxiliary_loss_clip": 0.0114068, "auxiliary_loss_mlp": 0.01042969, "balance_loss_clip": 1.02713788, "balance_loss_mlp": 1.05093634, "epoch": 0.22906959266496318, "flos": 22929394429440.0, "grad_norm": 1.591118628950878, "language_loss": 0.81481934, "learning_rate": 3.5984954991183862e-06, "loss": 0.83665586, "num_input_tokens_seen": 81992760, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8984375, "step": 3810, "time_per_iteration": 2.491070508956909 }, { "auxiliary_loss_clip": 0.01137041, "auxiliary_loss_mlp": 0.0103836, "balance_loss_clip": 1.02293468, "balance_loss_mlp": 1.04759479, "epoch": 0.22912971591763115, "flos": 19354523299200.0, "grad_norm": 2.059677745172372, "language_loss": 0.78771317, "learning_rate": 3.598261401682441e-06, "loss": 0.8094672, "num_input_tokens_seen": 82009080, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.89453125, "step": 3811, "time_per_iteration": 2.4747867584228516 }, { "auxiliary_loss_clip": 0.01140179, "auxiliary_loss_mlp": 0.01045575, "balance_loss_clip": 1.02889764, "balance_loss_mlp": 1.04866982, "epoch": 0.22918983917029911, "flos": 19933546538880.0, "grad_norm": 1.7221236672909976, "language_loss": 0.82648832, "learning_rate": 3.5980272436402632e-06, "loss": 0.84834588, "num_input_tokens_seen": 82026705, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9140625, "step": 3812, "time_per_iteration": 2.4383721351623535 }, { "auxiliary_loss_clip": 0.01147014, "auxiliary_loss_mlp": 0.01048114, "balance_loss_clip": 1.03172266, "balance_loss_mlp": 1.05175161, "epoch": 0.22924996242296708, "flos": 16690885320960.0, "grad_norm": 2.681252892547636, "language_loss": 0.83150911, "learning_rate": 3.5977930250007324e-06, "loss": 0.85346043, "num_input_tokens_seen": 82043245, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.953125, "step": 3813, "time_per_iteration": 2.505434989929199 }, { "auxiliary_loss_clip": 0.01141934, "auxiliary_loss_mlp": 0.01043715, "balance_loss_clip": 1.02809858, "balance_loss_mlp": 1.0489682, "epoch": 0.22931008567563504, "flos": 33036164956800.0, "grad_norm": 1.9203510676517872, "language_loss": 0.70085204, "learning_rate": 3.5975587457727298e-06, "loss": 0.72270858, "num_input_tokens_seen": 82066870, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.9296875, "step": 3814, "time_per_iteration": 2.554291009902954 }, { "auxiliary_loss_clip": 0.01138584, "auxiliary_loss_mlp": 0.01045849, "balance_loss_clip": 1.02943397, "balance_loss_mlp": 1.04697764, "epoch": 0.229370208928303, "flos": 23330696152320.0, "grad_norm": 3.1871160639541496, "language_loss": 0.67329043, "learning_rate": 3.597324405965139e-06, "loss": 0.69513476, "num_input_tokens_seen": 82083180, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9140625, "step": 3815, "time_per_iteration": 2.4789533615112305 }, { "auxiliary_loss_clip": 0.01142082, "auxiliary_loss_mlp": 0.01044556, "balance_loss_clip": 1.02822399, "balance_loss_mlp": 1.04939759, "epoch": 0.229430332180971, "flos": 28617213150720.0, "grad_norm": 1.6132183050914477, "language_loss": 0.83506107, "learning_rate": 3.597090005586848e-06, "loss": 0.85692739, "num_input_tokens_seen": 82102950, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.92578125, "step": 3816, "time_per_iteration": 2.5163795948028564 }, { "auxiliary_loss_clip": 0.01143699, "auxiliary_loss_mlp": 0.0103684, "balance_loss_clip": 1.02047276, "balance_loss_mlp": 1.05129778, "epoch": 0.22949045543363897, "flos": 17238199829760.0, "grad_norm": 3.0560563199613755, "language_loss": 0.87261951, "learning_rate": 3.596855544646742e-06, "loss": 0.89442492, "num_input_tokens_seen": 82119510, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.921875, "step": 3817, "time_per_iteration": 2.4487740993499756 }, { "auxiliary_loss_clip": 0.01142876, "auxiliary_loss_mlp": 0.01046787, "balance_loss_clip": 1.03025258, "balance_loss_mlp": 1.05004144, "epoch": 0.22955057868630693, "flos": 27489438858240.0, "grad_norm": 1.5806373951356316, "language_loss": 0.75013942, "learning_rate": 3.5966210231537154e-06, "loss": 0.77203608, "num_input_tokens_seen": 82140095, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.9296875, "step": 3818, "time_per_iteration": 2.5315351486206055 }, { "auxiliary_loss_clip": 0.01142493, "auxiliary_loss_mlp": 0.0104314, "balance_loss_clip": 1.02574718, "balance_loss_mlp": 1.04909372, "epoch": 0.2296107019389749, "flos": 23476421629440.0, "grad_norm": 1.6189976204403758, "language_loss": 0.74435014, "learning_rate": 3.596386441116659e-06, "loss": 0.76620638, "num_input_tokens_seen": 82159510, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.93359375, "step": 3819, "time_per_iteration": 2.4995837211608887 }, { "auxiliary_loss_clip": 0.01140676, "auxiliary_loss_mlp": 0.01042057, "balance_loss_clip": 1.0257138, "balance_loss_mlp": 1.04933095, "epoch": 0.22967082519164286, "flos": 31285160760960.0, "grad_norm": 1.9416712337850468, "language_loss": 0.80806929, "learning_rate": 3.5961517985444684e-06, "loss": 0.82989663, "num_input_tokens_seen": 82179580, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.9140625, "step": 3820, "time_per_iteration": 2.532374620437622 }, { "auxiliary_loss_clip": 0.01147089, "auxiliary_loss_mlp": 0.01043227, "balance_loss_clip": 1.02488089, "balance_loss_mlp": 1.0511595, "epoch": 0.22973094844431083, "flos": 14642935390080.0, "grad_norm": 2.2395642741011907, "language_loss": 0.69394183, "learning_rate": 3.595917095446042e-06, "loss": 0.71584499, "num_input_tokens_seen": 82195585, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.95703125, "step": 3821, "time_per_iteration": 2.4714033603668213 }, { "auxiliary_loss_clip": 0.01141137, "auxiliary_loss_mlp": 0.01032897, "balance_loss_clip": 1.01632667, "balance_loss_mlp": 1.04943359, "epoch": 0.2297910716969788, "flos": 22823853292800.0, "grad_norm": 3.770816813658393, "language_loss": 0.8285839, "learning_rate": 3.5956823318302796e-06, "loss": 0.85032427, "num_input_tokens_seen": 82217530, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.91796875, "step": 3822, "time_per_iteration": 2.522010087966919 }, { "auxiliary_loss_clip": 0.01141536, "auxiliary_loss_mlp": 0.01044392, "balance_loss_clip": 1.02676082, "balance_loss_mlp": 1.04937506, "epoch": 0.2298511949496468, "flos": 23039029716480.0, "grad_norm": 1.7383753619129751, "language_loss": 0.66328043, "learning_rate": 3.5954475077060833e-06, "loss": 0.68513972, "num_input_tokens_seen": 82237980, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.921875, "step": 3823, "time_per_iteration": 2.509443521499634 }, { "auxiliary_loss_clip": 0.01063255, "auxiliary_loss_mlp": 0.0101087, "balance_loss_clip": 1.00822341, "balance_loss_mlp": 1.03087282, "epoch": 0.22991131820231475, "flos": 66890914911360.0, "grad_norm": 0.8212113152229649, "language_loss": 0.56740868, "learning_rate": 3.595212623082357e-06, "loss": 0.58814996, "num_input_tokens_seen": 82301785, "router_z_loss_clip": 0.02648926, "router_z_loss_mlp": 0.32421875, "step": 3824, "time_per_iteration": 3.1910150051116943 }, { "auxiliary_loss_clip": 0.01137658, "auxiliary_loss_mlp": 0.0103776, "balance_loss_clip": 1.02242947, "balance_loss_mlp": 1.0482955, "epoch": 0.22997144145498272, "flos": 17887248633600.0, "grad_norm": 2.116026198568313, "language_loss": 0.73158908, "learning_rate": 3.594977677968009e-06, "loss": 0.75334322, "num_input_tokens_seen": 82317355, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.89453125, "step": 3825, "time_per_iteration": 2.4186367988586426 }, { "auxiliary_loss_clip": 0.01145406, "auxiliary_loss_mlp": 0.01042915, "balance_loss_clip": 1.02524841, "balance_loss_mlp": 1.05263805, "epoch": 0.23003156470765068, "flos": 24676843178880.0, "grad_norm": 1.8410682189738636, "language_loss": 0.87458098, "learning_rate": 3.5947426723719473e-06, "loss": 0.89646417, "num_input_tokens_seen": 82336645, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9296875, "step": 3826, "time_per_iteration": 2.5176613330841064 }, { "auxiliary_loss_clip": 0.01146486, "auxiliary_loss_mlp": 0.01044633, "balance_loss_clip": 1.02644157, "balance_loss_mlp": 1.05012012, "epoch": 0.23009168796031865, "flos": 15814126247040.0, "grad_norm": 2.7141658320884305, "language_loss": 0.81710285, "learning_rate": 3.594507606303083e-06, "loss": 0.83901405, "num_input_tokens_seen": 82354225, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.96484375, "step": 3827, "time_per_iteration": 2.430938482284546 }, { "auxiliary_loss_clip": 0.01141494, "auxiliary_loss_mlp": 0.01043725, "balance_loss_clip": 1.02738142, "balance_loss_mlp": 1.05030704, "epoch": 0.2301518112129866, "flos": 16212842190720.0, "grad_norm": 1.9751340020467025, "language_loss": 0.86717385, "learning_rate": 3.5942724797703314e-06, "loss": 0.88902605, "num_input_tokens_seen": 82370240, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9140625, "step": 3828, "time_per_iteration": 2.456181526184082 }, { "auxiliary_loss_clip": 0.01142163, "auxiliary_loss_mlp": 0.01048161, "balance_loss_clip": 1.03056622, "balance_loss_mlp": 1.04899776, "epoch": 0.2302119344656546, "flos": 20595452411520.0, "grad_norm": 2.702448964518725, "language_loss": 0.7096352, "learning_rate": 3.594037292782607e-06, "loss": 0.73153841, "num_input_tokens_seen": 82389145, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.93359375, "step": 3829, "time_per_iteration": 2.459882974624634 }, { "auxiliary_loss_clip": 0.01140894, "auxiliary_loss_mlp": 0.01040766, "balance_loss_clip": 1.02496445, "balance_loss_mlp": 1.05167603, "epoch": 0.23027205771832257, "flos": 26796901662720.0, "grad_norm": 1.549577637464306, "language_loss": 0.84011352, "learning_rate": 3.5938020453488293e-06, "loss": 0.86193019, "num_input_tokens_seen": 82409185, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.890625, "step": 3830, "time_per_iteration": 2.5353152751922607 }, { "auxiliary_loss_clip": 0.01140612, "auxiliary_loss_mlp": 0.01055128, "balance_loss_clip": 1.03780675, "balance_loss_mlp": 1.04815245, "epoch": 0.23033218097099054, "flos": 43873143068160.0, "grad_norm": 1.6741859117511557, "language_loss": 0.67033261, "learning_rate": 3.5935667374779177e-06, "loss": 0.69229007, "num_input_tokens_seen": 82432070, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.92578125, "step": 3831, "time_per_iteration": 2.6661062240600586 }, { "auxiliary_loss_clip": 0.01144175, "auxiliary_loss_mlp": 0.01052054, "balance_loss_clip": 1.03459597, "balance_loss_mlp": 1.05039907, "epoch": 0.2303923042236585, "flos": 26067663745920.0, "grad_norm": 2.5035242826385775, "language_loss": 0.75889176, "learning_rate": 3.5933313691787957e-06, "loss": 0.78085405, "num_input_tokens_seen": 82450625, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9375, "step": 3832, "time_per_iteration": 2.504793882369995 }, { "auxiliary_loss_clip": 0.01143565, "auxiliary_loss_mlp": 0.01050451, "balance_loss_clip": 1.03204536, "balance_loss_mlp": 1.05040371, "epoch": 0.23045242747632647, "flos": 18296379521280.0, "grad_norm": 1.7936537228790101, "language_loss": 0.87557948, "learning_rate": 3.593095940460389e-06, "loss": 0.89751959, "num_input_tokens_seen": 82468575, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.9296875, "step": 3833, "time_per_iteration": 2.4286067485809326 }, { "auxiliary_loss_clip": 0.01142146, "auxiliary_loss_mlp": 0.01049031, "balance_loss_clip": 1.03168607, "balance_loss_mlp": 1.04949403, "epoch": 0.23051255072899443, "flos": 25520528805120.0, "grad_norm": 1.9718912084989362, "language_loss": 0.74635273, "learning_rate": 3.592860451331624e-06, "loss": 0.76826453, "num_input_tokens_seen": 82488655, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.92578125, "step": 3834, "time_per_iteration": 2.5284528732299805 }, { "auxiliary_loss_clip": 0.01141755, "auxiliary_loss_mlp": 0.01050864, "balance_loss_clip": 1.03326917, "balance_loss_mlp": 1.05016017, "epoch": 0.2305726739816624, "flos": 21215198695680.0, "grad_norm": 2.2333355779254864, "language_loss": 0.86094487, "learning_rate": 3.592624901801432e-06, "loss": 0.88287109, "num_input_tokens_seen": 82507220, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9140625, "step": 3835, "time_per_iteration": 2.4663779735565186 }, { "auxiliary_loss_clip": 0.01148601, "auxiliary_loss_mlp": 0.01054121, "balance_loss_clip": 1.03645456, "balance_loss_mlp": 1.0513097, "epoch": 0.2306327972343304, "flos": 23331127115520.0, "grad_norm": 3.4003041509781187, "language_loss": 0.82386899, "learning_rate": 3.5923892918787432e-06, "loss": 0.84589624, "num_input_tokens_seen": 82527920, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.97265625, "step": 3836, "time_per_iteration": 2.517117500305176 }, { "auxiliary_loss_clip": 0.01146475, "auxiliary_loss_mlp": 0.01045132, "balance_loss_clip": 1.02827597, "balance_loss_mlp": 1.0531019, "epoch": 0.23069292048699835, "flos": 20666734951680.0, "grad_norm": 1.559265949107669, "language_loss": 0.7916609, "learning_rate": 3.5921536215724934e-06, "loss": 0.81357694, "num_input_tokens_seen": 82549040, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.93359375, "step": 3837, "time_per_iteration": 2.4776663780212402 }, { "auxiliary_loss_clip": 0.01060051, "auxiliary_loss_mlp": 0.01026808, "balance_loss_clip": 1.02401888, "balance_loss_mlp": 1.02801991, "epoch": 0.23075304373966632, "flos": 70454832393600.0, "grad_norm": 0.912766394497269, "language_loss": 0.65461057, "learning_rate": 3.5919178908916184e-06, "loss": 0.67547917, "num_input_tokens_seen": 82604070, "router_z_loss_clip": 0.0279541, "router_z_loss_mlp": 0.3203125, "step": 3838, "time_per_iteration": 3.2839622497558594 }, { "auxiliary_loss_clip": 0.0114192, "auxiliary_loss_mlp": 0.01051431, "balance_loss_clip": 1.03490829, "balance_loss_mlp": 1.0497818, "epoch": 0.23081316699233428, "flos": 16617986668800.0, "grad_norm": 5.101928083078731, "language_loss": 0.75530744, "learning_rate": 3.591682099845058e-06, "loss": 0.77724099, "num_input_tokens_seen": 82619665, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.921875, "step": 3839, "time_per_iteration": 2.4276270866394043 }, { "auxiliary_loss_clip": 0.01144768, "auxiliary_loss_mlp": 0.01042005, "balance_loss_clip": 1.0250771, "balance_loss_mlp": 1.05017149, "epoch": 0.23087329024500225, "flos": 13298081253120.0, "grad_norm": 1.8937687223680755, "language_loss": 0.68575513, "learning_rate": 3.591446248441752e-06, "loss": 0.70762283, "num_input_tokens_seen": 82637530, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.9453125, "step": 3840, "time_per_iteration": 2.4603428840637207 }, { "auxiliary_loss_clip": 0.01145123, "auxiliary_loss_mlp": 0.01046329, "balance_loss_clip": 1.02725589, "balance_loss_mlp": 1.05184031, "epoch": 0.23093341349767021, "flos": 17785729820160.0, "grad_norm": 1.8459834958821786, "language_loss": 0.79221344, "learning_rate": 3.591210336690645e-06, "loss": 0.81412798, "num_input_tokens_seen": 82656130, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.93359375, "step": 3841, "time_per_iteration": 2.4445559978485107 }, { "auxiliary_loss_clip": 0.01141935, "auxiliary_loss_mlp": 0.01042558, "balance_loss_clip": 1.02708459, "balance_loss_mlp": 1.04989052, "epoch": 0.23099353675033818, "flos": 23988076911360.0, "grad_norm": 1.7884083522897205, "language_loss": 0.82989627, "learning_rate": 3.590974364600683e-06, "loss": 0.85174119, "num_input_tokens_seen": 82675295, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.921875, "step": 3842, "time_per_iteration": 2.524271011352539 }, { "auxiliary_loss_clip": 0.01140929, "auxiliary_loss_mlp": 0.01044546, "balance_loss_clip": 1.02705169, "balance_loss_mlp": 1.04844177, "epoch": 0.23105366000300617, "flos": 35995168471680.0, "grad_norm": 1.5083534706415045, "language_loss": 0.66638505, "learning_rate": 3.5907383321808135e-06, "loss": 0.68823981, "num_input_tokens_seen": 82703260, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.921875, "step": 3843, "time_per_iteration": 4.240300178527832 }, { "auxiliary_loss_clip": 0.01138534, "auxiliary_loss_mlp": 0.01040535, "balance_loss_clip": 1.02377391, "balance_loss_mlp": 1.04902744, "epoch": 0.23111378325567414, "flos": 31245335556480.0, "grad_norm": 1.6430761430604675, "language_loss": 0.77082574, "learning_rate": 3.590502239439987e-06, "loss": 0.79261637, "num_input_tokens_seen": 82725060, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.89453125, "step": 3844, "time_per_iteration": 2.543577194213867 }, { "auxiliary_loss_clip": 0.01144138, "auxiliary_loss_mlp": 0.01041694, "balance_loss_clip": 1.02396727, "balance_loss_mlp": 1.05036449, "epoch": 0.2311739065083421, "flos": 19208223204480.0, "grad_norm": 1.6398405207036997, "language_loss": 0.78454804, "learning_rate": 3.590266086387156e-06, "loss": 0.80640638, "num_input_tokens_seen": 82742960, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9375, "step": 3845, "time_per_iteration": 3.821563959121704 }, { "auxiliary_loss_clip": 0.0113583, "auxiliary_loss_mlp": 0.01033524, "balance_loss_clip": 1.01834846, "balance_loss_mlp": 1.0487833, "epoch": 0.23123402976101007, "flos": 23360178240000.0, "grad_norm": 2.6310094541423523, "language_loss": 0.7646395, "learning_rate": 3.590029873031276e-06, "loss": 0.78633302, "num_input_tokens_seen": 82760205, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.87109375, "step": 3846, "time_per_iteration": 2.476886749267578 }, { "auxiliary_loss_clip": 0.01141862, "auxiliary_loss_mlp": 0.01045897, "balance_loss_clip": 1.02876687, "balance_loss_mlp": 1.04949832, "epoch": 0.23129415301367803, "flos": 13735365425280.0, "grad_norm": 2.165352134545719, "language_loss": 0.69579184, "learning_rate": 3.589793599381304e-06, "loss": 0.71766943, "num_input_tokens_seen": 82778590, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.921875, "step": 3847, "time_per_iteration": 5.312956094741821 }, { "auxiliary_loss_clip": 0.01057627, "auxiliary_loss_mlp": 0.01002592, "balance_loss_clip": 0.9999097, "balance_loss_mlp": 1.02565765, "epoch": 0.231354276266346, "flos": 69737015001600.0, "grad_norm": 0.7870726500548657, "language_loss": 0.60987103, "learning_rate": 3.589557265446198e-06, "loss": 0.63047326, "num_input_tokens_seen": 82833925, "router_z_loss_clip": 0.02685547, "router_z_loss_mlp": 0.3203125, "step": 3848, "time_per_iteration": 3.0114665031433105 }, { "auxiliary_loss_clip": 0.01140586, "auxiliary_loss_mlp": 0.01043049, "balance_loss_clip": 1.02600265, "balance_loss_mlp": 1.04901254, "epoch": 0.231414399519014, "flos": 18835900778880.0, "grad_norm": 2.3355145433679847, "language_loss": 0.77743661, "learning_rate": 3.589320871234923e-06, "loss": 0.79927301, "num_input_tokens_seen": 82850625, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9140625, "step": 3849, "time_per_iteration": 2.457279682159424 }, { "auxiliary_loss_clip": 0.011409, "auxiliary_loss_mlp": 0.01034976, "balance_loss_clip": 1.01844192, "balance_loss_mlp": 1.0480783, "epoch": 0.23147452277168196, "flos": 36135470995200.0, "grad_norm": 2.479188714480854, "language_loss": 0.71602857, "learning_rate": 3.5890844167564405e-06, "loss": 0.73778737, "num_input_tokens_seen": 82872105, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.9296875, "step": 3850, "time_per_iteration": 2.6168932914733887 }, { "auxiliary_loss_clip": 0.01136947, "auxiliary_loss_mlp": 0.01038339, "balance_loss_clip": 1.02185214, "balance_loss_mlp": 1.04639745, "epoch": 0.23153464602434992, "flos": 20812927305600.0, "grad_norm": 2.6271865114060233, "language_loss": 0.76548934, "learning_rate": 3.588847902019718e-06, "loss": 0.78724217, "num_input_tokens_seen": 82890595, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.90625, "step": 3851, "time_per_iteration": 2.4774720668792725 }, { "auxiliary_loss_clip": 0.01137687, "auxiliary_loss_mlp": 0.01043787, "balance_loss_clip": 1.02622771, "balance_loss_mlp": 1.04779649, "epoch": 0.2315947692770179, "flos": 19939256801280.0, "grad_norm": 2.5256997850736544, "language_loss": 0.6998266, "learning_rate": 3.588611327033723e-06, "loss": 0.7216413, "num_input_tokens_seen": 82908910, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.8984375, "step": 3852, "time_per_iteration": 2.4849374294281006 }, { "auxiliary_loss_clip": 0.01140521, "auxiliary_loss_mlp": 0.01040062, "balance_loss_clip": 1.02337289, "balance_loss_mlp": 1.04879582, "epoch": 0.23165489252968585, "flos": 12855553695360.0, "grad_norm": 2.5346236463063443, "language_loss": 0.67734349, "learning_rate": 3.588374691807428e-06, "loss": 0.69914937, "num_input_tokens_seen": 82925405, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.91796875, "step": 3853, "time_per_iteration": 2.426400899887085 }, { "auxiliary_loss_clip": 0.01141457, "auxiliary_loss_mlp": 0.01032695, "balance_loss_clip": 1.01554036, "balance_loss_mlp": 1.04881477, "epoch": 0.23171501578235382, "flos": 30628282792320.0, "grad_norm": 1.6250669213373403, "language_loss": 0.79966307, "learning_rate": 3.5881379963498053e-06, "loss": 0.82140458, "num_input_tokens_seen": 82945615, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.92578125, "step": 3854, "time_per_iteration": 2.5514519214630127 }, { "auxiliary_loss_clip": 0.01145809, "auxiliary_loss_mlp": 0.01038874, "balance_loss_clip": 1.02102816, "balance_loss_mlp": 1.04829156, "epoch": 0.23177513903502178, "flos": 23842782397440.0, "grad_norm": 2.0511072582149388, "language_loss": 0.65545201, "learning_rate": 3.587901240669831e-06, "loss": 0.6772989, "num_input_tokens_seen": 82967570, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.9765625, "step": 3855, "time_per_iteration": 2.4980320930480957 }, { "auxiliary_loss_clip": 0.01140511, "auxiliary_loss_mlp": 0.0104281, "balance_loss_clip": 1.02618074, "balance_loss_mlp": 1.04687631, "epoch": 0.23183526228768978, "flos": 29570282668800.0, "grad_norm": 2.30012831211227, "language_loss": 0.70869339, "learning_rate": 3.5876644247764815e-06, "loss": 0.73052657, "num_input_tokens_seen": 82987435, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.93359375, "step": 3856, "time_per_iteration": 2.5305869579315186 }, { "auxiliary_loss_clip": 0.01139731, "auxiliary_loss_mlp": 0.01035158, "balance_loss_clip": 1.01993537, "balance_loss_mlp": 1.04897952, "epoch": 0.23189538554035774, "flos": 34458694254720.0, "grad_norm": 1.735820234879577, "language_loss": 0.77080292, "learning_rate": 3.5874275486787387e-06, "loss": 0.79255176, "num_input_tokens_seen": 83010505, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.90625, "step": 3857, "time_per_iteration": 2.5866823196411133 }, { "auxiliary_loss_clip": 0.01144826, "auxiliary_loss_mlp": 0.01045322, "balance_loss_clip": 1.02673745, "balance_loss_mlp": 1.0495106, "epoch": 0.2319555087930257, "flos": 18003815245440.0, "grad_norm": 2.374185909561831, "language_loss": 0.91817927, "learning_rate": 3.587190612385584e-06, "loss": 0.94008076, "num_input_tokens_seen": 83026705, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.953125, "step": 3858, "time_per_iteration": 2.4532933235168457 }, { "auxiliary_loss_clip": 0.01136068, "auxiliary_loss_mlp": 0.01039142, "balance_loss_clip": 1.02275693, "balance_loss_mlp": 1.04774094, "epoch": 0.23201563204569367, "flos": 23143852581120.0, "grad_norm": 2.079204252408466, "language_loss": 0.7625826, "learning_rate": 3.5869536159060026e-06, "loss": 0.78433478, "num_input_tokens_seen": 83046500, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8828125, "step": 3859, "time_per_iteration": 2.4774491786956787 }, { "auxiliary_loss_clip": 0.0113565, "auxiliary_loss_mlp": 0.01035721, "balance_loss_clip": 1.01932335, "balance_loss_mlp": 1.04524052, "epoch": 0.23207575529836164, "flos": 20667991927680.0, "grad_norm": 1.7480786587319432, "language_loss": 0.84172213, "learning_rate": 3.58671655924898e-06, "loss": 0.8634358, "num_input_tokens_seen": 83065280, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.90625, "step": 3860, "time_per_iteration": 2.4892523288726807 }, { "auxiliary_loss_clip": 0.01137636, "auxiliary_loss_mlp": 0.01039338, "balance_loss_clip": 1.02260101, "balance_loss_mlp": 1.04789352, "epoch": 0.2321358785510296, "flos": 16472189364480.0, "grad_norm": 2.134680288835568, "language_loss": 0.82989979, "learning_rate": 3.586479442423508e-06, "loss": 0.85166955, "num_input_tokens_seen": 83082310, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8984375, "step": 3861, "time_per_iteration": 2.439852237701416 }, { "auxiliary_loss_clip": 0.01140771, "auxiliary_loss_mlp": 0.01043783, "balance_loss_clip": 1.02748704, "balance_loss_mlp": 1.04822111, "epoch": 0.2321960018036976, "flos": 21616320850560.0, "grad_norm": 1.7529717747984928, "language_loss": 0.85414535, "learning_rate": 3.586242265438576e-06, "loss": 0.87599087, "num_input_tokens_seen": 83102065, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.92578125, "step": 3862, "time_per_iteration": 2.5251784324645996 }, { "auxiliary_loss_clip": 0.01135052, "auxiliary_loss_mlp": 0.01040011, "balance_loss_clip": 1.02534866, "balance_loss_mlp": 1.04705191, "epoch": 0.23225612505636556, "flos": 22271474966400.0, "grad_norm": 1.564704802638278, "language_loss": 0.75201893, "learning_rate": 3.5860050283031773e-06, "loss": 0.77376962, "num_input_tokens_seen": 83121445, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.87890625, "step": 3863, "time_per_iteration": 2.467568874359131 }, { "auxiliary_loss_clip": 0.01137916, "auxiliary_loss_mlp": 0.01043098, "balance_loss_clip": 1.02831614, "balance_loss_mlp": 1.05123568, "epoch": 0.23231624830903352, "flos": 17052325925760.0, "grad_norm": 1.8680201032155854, "language_loss": 0.74057639, "learning_rate": 3.58576773102631e-06, "loss": 0.7623865, "num_input_tokens_seen": 83138175, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8671875, "step": 3864, "time_per_iteration": 2.4378421306610107 }, { "auxiliary_loss_clip": 0.01136265, "auxiliary_loss_mlp": 0.01036708, "balance_loss_clip": 1.02113926, "balance_loss_mlp": 1.04606533, "epoch": 0.2323763715617015, "flos": 34640043045120.0, "grad_norm": 2.7416466798183023, "language_loss": 0.70445943, "learning_rate": 3.5855303736169714e-06, "loss": 0.72618914, "num_input_tokens_seen": 83161975, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.90234375, "step": 3865, "time_per_iteration": 2.5766425132751465 }, { "auxiliary_loss_clip": 0.01146896, "auxiliary_loss_mlp": 0.01051125, "balance_loss_clip": 1.03316021, "balance_loss_mlp": 1.05008733, "epoch": 0.23243649481436945, "flos": 25551698832000.0, "grad_norm": 1.6256467839466064, "language_loss": 0.94573206, "learning_rate": 3.5852929560841617e-06, "loss": 0.96771222, "num_input_tokens_seen": 83180905, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.96875, "step": 3866, "time_per_iteration": 2.5211191177368164 }, { "auxiliary_loss_clip": 0.01135624, "auxiliary_loss_mlp": 0.0104217, "balance_loss_clip": 1.02630329, "balance_loss_mlp": 1.04737973, "epoch": 0.23249661806703742, "flos": 20483482740480.0, "grad_norm": 2.458536620801291, "language_loss": 0.73355782, "learning_rate": 3.5850554784368846e-06, "loss": 0.75533581, "num_input_tokens_seen": 83196390, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8828125, "step": 3867, "time_per_iteration": 2.4340367317199707 }, { "auxiliary_loss_clip": 0.0113763, "auxiliary_loss_mlp": 0.01041278, "balance_loss_clip": 1.0249939, "balance_loss_mlp": 1.04692268, "epoch": 0.23255674131970538, "flos": 20376612800640.0, "grad_norm": 2.040641254640148, "language_loss": 0.82228827, "learning_rate": 3.584817940684145e-06, "loss": 0.84407735, "num_input_tokens_seen": 83216165, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.90625, "step": 3868, "time_per_iteration": 2.5256593227386475 }, { "auxiliary_loss_clip": 0.01135548, "auxiliary_loss_mlp": 0.01038591, "balance_loss_clip": 1.02334428, "balance_loss_mlp": 1.04832304, "epoch": 0.23261686457237338, "flos": 17056096853760.0, "grad_norm": 1.8760626362597206, "language_loss": 0.73345244, "learning_rate": 3.58458034283495e-06, "loss": 0.75519383, "num_input_tokens_seen": 83233845, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.875, "step": 3869, "time_per_iteration": 2.4304311275482178 }, { "auxiliary_loss_clip": 0.01139479, "auxiliary_loss_mlp": 0.01044473, "balance_loss_clip": 1.02862978, "balance_loss_mlp": 1.05055928, "epoch": 0.23267698782504134, "flos": 29169878785920.0, "grad_norm": 1.7292728076248929, "language_loss": 0.79928046, "learning_rate": 3.5843426848983097e-06, "loss": 0.82112002, "num_input_tokens_seen": 83254930, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.890625, "step": 3870, "time_per_iteration": 2.542407989501953 }, { "auxiliary_loss_clip": 0.01141789, "auxiliary_loss_mlp": 0.01042973, "balance_loss_clip": 1.02581882, "balance_loss_mlp": 1.04966986, "epoch": 0.2327371110777093, "flos": 21174655219200.0, "grad_norm": 1.9047410950404113, "language_loss": 0.70478112, "learning_rate": 3.5841049668832357e-06, "loss": 0.72662878, "num_input_tokens_seen": 83272095, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.921875, "step": 3871, "time_per_iteration": 2.4544219970703125 }, { "auxiliary_loss_clip": 0.01144054, "auxiliary_loss_mlp": 0.01052536, "balance_loss_clip": 1.03401089, "balance_loss_mlp": 1.05062127, "epoch": 0.23279723433037727, "flos": 24863112132480.0, "grad_norm": 2.836907616866857, "language_loss": 0.68890458, "learning_rate": 3.5838671887987433e-06, "loss": 0.7108705, "num_input_tokens_seen": 83290980, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.93359375, "step": 3872, "time_per_iteration": 2.4970076084136963 }, { "auxiliary_loss_clip": 0.01146815, "auxiliary_loss_mlp": 0.01042906, "balance_loss_clip": 1.02546549, "balance_loss_mlp": 1.05118036, "epoch": 0.23285735758304524, "flos": 38800617344640.0, "grad_norm": 1.672684882477217, "language_loss": 0.77751243, "learning_rate": 3.5836293506538474e-06, "loss": 0.79940963, "num_input_tokens_seen": 83315175, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.95703125, "step": 3873, "time_per_iteration": 2.6193504333496094 }, { "auxiliary_loss_clip": 0.01056773, "auxiliary_loss_mlp": 0.01001309, "balance_loss_clip": 0.99873394, "balance_loss_mlp": 1.02578092, "epoch": 0.2329174808357132, "flos": 53944113692160.0, "grad_norm": 1.1689777781454427, "language_loss": 0.60540795, "learning_rate": 3.5833914524575687e-06, "loss": 0.62598872, "num_input_tokens_seen": 83372060, "router_z_loss_clip": 0.02575684, "router_z_loss_mlp": 0.30859375, "step": 3874, "time_per_iteration": 3.0353944301605225 }, { "auxiliary_loss_clip": 0.01138977, "auxiliary_loss_mlp": 0.01043925, "balance_loss_clip": 1.02712822, "balance_loss_mlp": 1.04921067, "epoch": 0.23297760408838117, "flos": 21216024708480.0, "grad_norm": 2.6277655526850032, "language_loss": 0.80481827, "learning_rate": 3.583153494218927e-06, "loss": 0.8266474, "num_input_tokens_seen": 83389795, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8984375, "step": 3875, "time_per_iteration": 2.485226631164551 }, { "auxiliary_loss_clip": 0.01138258, "auxiliary_loss_mlp": 0.0103916, "balance_loss_clip": 1.02392483, "balance_loss_mlp": 1.04905701, "epoch": 0.23303772734104916, "flos": 28403006394240.0, "grad_norm": 1.6853381214517749, "language_loss": 0.61576331, "learning_rate": 3.5829154759469464e-06, "loss": 0.63753748, "num_input_tokens_seen": 83410005, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.890625, "step": 3876, "time_per_iteration": 2.536346912384033 }, { "auxiliary_loss_clip": 0.01141642, "auxiliary_loss_mlp": 0.01048797, "balance_loss_clip": 1.03128493, "balance_loss_mlp": 1.05050385, "epoch": 0.23309785059371713, "flos": 24314720215680.0, "grad_norm": 1.6985794808355865, "language_loss": 0.70356828, "learning_rate": 3.5826773976506523e-06, "loss": 0.72547269, "num_input_tokens_seen": 83430250, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9140625, "step": 3877, "time_per_iteration": 2.534437894821167 }, { "auxiliary_loss_clip": 0.011398, "auxiliary_loss_mlp": 0.01050241, "balance_loss_clip": 1.0328362, "balance_loss_mlp": 1.04890609, "epoch": 0.2331579738463851, "flos": 15992925171840.0, "grad_norm": 2.1714397946726067, "language_loss": 0.81068015, "learning_rate": 3.582439259339073e-06, "loss": 0.83258057, "num_input_tokens_seen": 83447950, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.90625, "step": 3878, "time_per_iteration": 2.465212345123291 }, { "auxiliary_loss_clip": 0.01143656, "auxiliary_loss_mlp": 0.01046563, "balance_loss_clip": 1.02855039, "balance_loss_mlp": 1.04863083, "epoch": 0.23321809709905306, "flos": 36426957863040.0, "grad_norm": 1.6037705352642662, "language_loss": 0.75057268, "learning_rate": 3.5822010610212374e-06, "loss": 0.77247488, "num_input_tokens_seen": 83467785, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.953125, "step": 3879, "time_per_iteration": 2.5989508628845215 }, { "auxiliary_loss_clip": 0.01138101, "auxiliary_loss_mlp": 0.01041038, "balance_loss_clip": 1.02449131, "balance_loss_mlp": 1.04645097, "epoch": 0.23327822035172102, "flos": 21324762155520.0, "grad_norm": 2.2462974434482486, "language_loss": 0.89797312, "learning_rate": 3.5819628027061795e-06, "loss": 0.91976452, "num_input_tokens_seen": 83485390, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.91796875, "step": 3880, "time_per_iteration": 2.483361005783081 }, { "auxiliary_loss_clip": 0.01142857, "auxiliary_loss_mlp": 0.01046325, "balance_loss_clip": 1.02980256, "balance_loss_mlp": 1.04999447, "epoch": 0.233338343604389, "flos": 19171881619200.0, "grad_norm": 1.6620705529013815, "language_loss": 0.72041571, "learning_rate": 3.5817244844029334e-06, "loss": 0.74230754, "num_input_tokens_seen": 83504890, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.9296875, "step": 3881, "time_per_iteration": 2.498307943344116 }, { "auxiliary_loss_clip": 0.01136969, "auxiliary_loss_mlp": 0.01045805, "balance_loss_clip": 1.02950907, "balance_loss_mlp": 1.04594922, "epoch": 0.23339846685705698, "flos": 26908368543360.0, "grad_norm": 1.5791640800027342, "language_loss": 0.67771339, "learning_rate": 3.581486106120537e-06, "loss": 0.69954115, "num_input_tokens_seen": 83526475, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.91015625, "step": 3882, "time_per_iteration": 2.5511155128479004 }, { "auxiliary_loss_clip": 0.01140384, "auxiliary_loss_mlp": 0.01054301, "balance_loss_clip": 1.03750455, "balance_loss_mlp": 1.04691851, "epoch": 0.23345859010972494, "flos": 32343160884480.0, "grad_norm": 1.9285207808270142, "language_loss": 0.7706449, "learning_rate": 3.5812476678680287e-06, "loss": 0.79259181, "num_input_tokens_seen": 83546620, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.93359375, "step": 3883, "time_per_iteration": 2.581169366836548 }, { "auxiliary_loss_clip": 0.01048267, "auxiliary_loss_mlp": 0.0101228, "balance_loss_clip": 1.00988436, "balance_loss_mlp": 1.01710343, "epoch": 0.2335187133623929, "flos": 58484229050880.0, "grad_norm": 0.7829750611551594, "language_loss": 0.59136397, "learning_rate": 3.58100916965445e-06, "loss": 0.61196947, "num_input_tokens_seen": 83616160, "router_z_loss_clip": 0.02392578, "router_z_loss_mlp": 0.3125, "step": 3884, "time_per_iteration": 4.723447561264038 }, { "auxiliary_loss_clip": 0.01138772, "auxiliary_loss_mlp": 0.01040771, "balance_loss_clip": 1.02486897, "balance_loss_mlp": 1.04691803, "epoch": 0.23357883661506088, "flos": 24502317972480.0, "grad_norm": 1.7572443028085434, "language_loss": 0.80123103, "learning_rate": 3.5807706114888455e-06, "loss": 0.82302648, "num_input_tokens_seen": 83636795, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.91796875, "step": 3885, "time_per_iteration": 2.5103328227996826 }, { "auxiliary_loss_clip": 0.01137116, "auxiliary_loss_mlp": 0.01038277, "balance_loss_clip": 1.02206421, "balance_loss_mlp": 1.04637575, "epoch": 0.23363895986772884, "flos": 18948516894720.0, "grad_norm": 3.1936562239829733, "language_loss": 0.87671596, "learning_rate": 3.580531993380261e-06, "loss": 0.89846992, "num_input_tokens_seen": 83654050, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.90625, "step": 3886, "time_per_iteration": 3.885406732559204 }, { "auxiliary_loss_clip": 0.01141547, "auxiliary_loss_mlp": 0.01041764, "balance_loss_clip": 1.02524137, "balance_loss_mlp": 1.0499208, "epoch": 0.2336990831203968, "flos": 31686821619840.0, "grad_norm": 1.8412773078932787, "language_loss": 0.73235929, "learning_rate": 3.5802933153377445e-06, "loss": 0.75419235, "num_input_tokens_seen": 83673720, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.9140625, "step": 3887, "time_per_iteration": 2.5670831203460693 }, { "auxiliary_loss_clip": 0.01140141, "auxiliary_loss_mlp": 0.01039883, "balance_loss_clip": 1.02354538, "balance_loss_mlp": 1.04831398, "epoch": 0.23375920637306477, "flos": 27709750926720.0, "grad_norm": 2.127421224223921, "language_loss": 0.8443743, "learning_rate": 3.5800545773703475e-06, "loss": 0.86617458, "num_input_tokens_seen": 83693470, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.91796875, "step": 3888, "time_per_iteration": 3.891103982925415 }, { "auxiliary_loss_clip": 0.01136098, "auxiliary_loss_mlp": 0.010496, "balance_loss_clip": 1.03320837, "balance_loss_mlp": 1.04672337, "epoch": 0.23381932962573276, "flos": 17675627656320.0, "grad_norm": 2.1541866264489764, "language_loss": 0.87523293, "learning_rate": 3.5798157794871225e-06, "loss": 0.89708996, "num_input_tokens_seen": 83711620, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.89453125, "step": 3889, "time_per_iteration": 3.9016480445861816 }, { "auxiliary_loss_clip": 0.01141431, "auxiliary_loss_mlp": 0.01043115, "balance_loss_clip": 1.02693796, "balance_loss_mlp": 1.04795372, "epoch": 0.23387945287840073, "flos": 14390842763520.0, "grad_norm": 2.6326985098687343, "language_loss": 0.77065277, "learning_rate": 3.579576921697125e-06, "loss": 0.79249829, "num_input_tokens_seen": 83727890, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.93359375, "step": 3890, "time_per_iteration": 2.4231884479522705 }, { "auxiliary_loss_clip": 0.01140473, "auxiliary_loss_mlp": 0.0104134, "balance_loss_clip": 1.02505565, "balance_loss_mlp": 1.0494082, "epoch": 0.2339395761310687, "flos": 46097988503040.0, "grad_norm": 1.9054751782421078, "language_loss": 0.73347116, "learning_rate": 3.579338004009412e-06, "loss": 0.75528932, "num_input_tokens_seen": 83749370, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.91015625, "step": 3891, "time_per_iteration": 2.675495147705078 }, { "auxiliary_loss_clip": 0.0113353, "auxiliary_loss_mlp": 0.01036143, "balance_loss_clip": 1.01957333, "balance_loss_mlp": 1.04576969, "epoch": 0.23399969938373666, "flos": 22382044007040.0, "grad_norm": 1.5461773667785577, "language_loss": 0.82806522, "learning_rate": 3.5790990264330433e-06, "loss": 0.84976196, "num_input_tokens_seen": 83769560, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.87890625, "step": 3892, "time_per_iteration": 2.481219530105591 }, { "auxiliary_loss_clip": 0.01140694, "auxiliary_loss_mlp": 0.01040324, "balance_loss_clip": 1.02330089, "balance_loss_mlp": 1.04788804, "epoch": 0.23405982263640462, "flos": 43508542066560.0, "grad_norm": 1.720690245422527, "language_loss": 0.65039945, "learning_rate": 3.578859988977082e-06, "loss": 0.67220962, "num_input_tokens_seen": 83795635, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.92578125, "step": 3893, "time_per_iteration": 2.6804397106170654 }, { "auxiliary_loss_clip": 0.01138301, "auxiliary_loss_mlp": 0.01040285, "balance_loss_clip": 1.02293956, "balance_loss_mlp": 1.0496676, "epoch": 0.2341199458890726, "flos": 22564685687040.0, "grad_norm": 2.0364959493641352, "language_loss": 0.7936303, "learning_rate": 3.5786208916505916e-06, "loss": 0.8154161, "num_input_tokens_seen": 83814090, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.88671875, "step": 3894, "time_per_iteration": 2.481437921524048 }, { "auxiliary_loss_clip": 0.01137133, "auxiliary_loss_mlp": 0.01035601, "balance_loss_clip": 1.019889, "balance_loss_mlp": 1.04764712, "epoch": 0.23418006914174055, "flos": 25633970933760.0, "grad_norm": 1.458486154604844, "language_loss": 0.81793237, "learning_rate": 3.5783817344626383e-06, "loss": 0.83965969, "num_input_tokens_seen": 83836870, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.89453125, "step": 3895, "time_per_iteration": 2.5715560913085938 }, { "auxiliary_loss_clip": 0.01137814, "auxiliary_loss_mlp": 0.0104193, "balance_loss_clip": 1.02556217, "balance_loss_mlp": 1.04715276, "epoch": 0.23424019239440855, "flos": 13545936074880.0, "grad_norm": 1.9769652745745885, "language_loss": 0.80589288, "learning_rate": 3.578142517422292e-06, "loss": 0.82769036, "num_input_tokens_seen": 83853275, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.90625, "step": 3896, "time_per_iteration": 2.436908006668091 }, { "auxiliary_loss_clip": 0.01142672, "auxiliary_loss_mlp": 0.01043541, "balance_loss_clip": 1.02647007, "balance_loss_mlp": 1.04979098, "epoch": 0.2343003156470765, "flos": 22419498913920.0, "grad_norm": 1.6244565057673777, "language_loss": 0.83231145, "learning_rate": 3.577903240538623e-06, "loss": 0.85417354, "num_input_tokens_seen": 83872340, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9296875, "step": 3897, "time_per_iteration": 2.489732503890991 }, { "auxiliary_loss_clip": 0.01143983, "auxiliary_loss_mlp": 0.01044087, "balance_loss_clip": 1.02718341, "balance_loss_mlp": 1.05006433, "epoch": 0.23436043889974448, "flos": 14790815683200.0, "grad_norm": 1.8328462533289243, "language_loss": 0.7941401, "learning_rate": 3.577663903820705e-06, "loss": 0.81602079, "num_input_tokens_seen": 83888795, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.9375, "step": 3898, "time_per_iteration": 2.4647743701934814 }, { "auxiliary_loss_clip": 0.0113725, "auxiliary_loss_mlp": 0.01043116, "balance_loss_clip": 1.02671325, "balance_loss_mlp": 1.04949903, "epoch": 0.23442056215241244, "flos": 22965700101120.0, "grad_norm": 1.9410393175299077, "language_loss": 0.74091434, "learning_rate": 3.577424507277614e-06, "loss": 0.76271796, "num_input_tokens_seen": 83906820, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.87890625, "step": 3899, "time_per_iteration": 2.4941747188568115 }, { "auxiliary_loss_clip": 0.01139195, "auxiliary_loss_mlp": 0.01045957, "balance_loss_clip": 1.0284692, "balance_loss_mlp": 1.04753602, "epoch": 0.2344806854050804, "flos": 23071887682560.0, "grad_norm": 1.8650509345712998, "language_loss": 0.75300896, "learning_rate": 3.5771850509184277e-06, "loss": 0.77486044, "num_input_tokens_seen": 83926370, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.91796875, "step": 3900, "time_per_iteration": 2.4813058376312256 }, { "auxiliary_loss_clip": 0.01140615, "auxiliary_loss_mlp": 0.0104822, "balance_loss_clip": 1.03128076, "balance_loss_mlp": 1.04993057, "epoch": 0.23454080865774837, "flos": 16327074418560.0, "grad_norm": 1.6681834044481456, "language_loss": 0.67143226, "learning_rate": 3.5769455347522256e-06, "loss": 0.69332063, "num_input_tokens_seen": 83944600, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.90625, "step": 3901, "time_per_iteration": 2.4504616260528564 }, { "auxiliary_loss_clip": 0.01050604, "auxiliary_loss_mlp": 0.01007714, "balance_loss_clip": 1.00496066, "balance_loss_mlp": 1.01949203, "epoch": 0.23460093191041637, "flos": 67760958142080.0, "grad_norm": 0.8001935585197626, "language_loss": 0.58227837, "learning_rate": 3.576705958788091e-06, "loss": 0.60286152, "num_input_tokens_seen": 84005100, "router_z_loss_clip": 0.02758789, "router_z_loss_mlp": 0.3125, "step": 3902, "time_per_iteration": 3.065045118331909 }, { "auxiliary_loss_clip": 0.01142157, "auxiliary_loss_mlp": 0.01047638, "balance_loss_clip": 1.02976847, "balance_loss_mlp": 1.05072904, "epoch": 0.23466105516308433, "flos": 20077619990400.0, "grad_norm": 1.8213488801254738, "language_loss": 0.80515373, "learning_rate": 3.576466323035108e-06, "loss": 0.82705164, "num_input_tokens_seen": 84023775, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.9140625, "step": 3903, "time_per_iteration": 2.4547183513641357 }, { "auxiliary_loss_clip": 0.01142017, "auxiliary_loss_mlp": 0.01038033, "balance_loss_clip": 1.02062857, "balance_loss_mlp": 1.04921222, "epoch": 0.2347211784157523, "flos": 24535714642560.0, "grad_norm": 7.138695730647568, "language_loss": 0.82007992, "learning_rate": 3.5762266275023645e-06, "loss": 0.84188044, "num_input_tokens_seen": 84042605, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9296875, "step": 3904, "time_per_iteration": 2.5136892795562744 }, { "auxiliary_loss_clip": 0.01141154, "auxiliary_loss_mlp": 0.0104485, "balance_loss_clip": 1.02801764, "balance_loss_mlp": 1.05031919, "epoch": 0.23478130166842026, "flos": 23805040181760.0, "grad_norm": 1.916148565901912, "language_loss": 0.7112627, "learning_rate": 3.57598687219895e-06, "loss": 0.73312277, "num_input_tokens_seen": 84061520, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.91015625, "step": 3905, "time_per_iteration": 2.4774937629699707 }, { "auxiliary_loss_clip": 0.01137806, "auxiliary_loss_mlp": 0.01036648, "balance_loss_clip": 1.02025723, "balance_loss_mlp": 1.0490067, "epoch": 0.23484142492108823, "flos": 24093618048000.0, "grad_norm": 1.8983864017640657, "language_loss": 0.71217763, "learning_rate": 3.5757470571339543e-06, "loss": 0.73392224, "num_input_tokens_seen": 84081800, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.88671875, "step": 3906, "time_per_iteration": 2.522651433944702 }, { "auxiliary_loss_clip": 0.01145365, "auxiliary_loss_mlp": 0.01040217, "balance_loss_clip": 1.02097678, "balance_loss_mlp": 1.0479939, "epoch": 0.2349015481737562, "flos": 29095830898560.0, "grad_norm": 1.893808304748174, "language_loss": 0.73377323, "learning_rate": 3.575507182316473e-06, "loss": 0.75562906, "num_input_tokens_seen": 84102340, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.9765625, "step": 3907, "time_per_iteration": 2.515697717666626 }, { "auxiliary_loss_clip": 0.01141802, "auxiliary_loss_mlp": 0.0105483, "balance_loss_clip": 1.03820062, "balance_loss_mlp": 1.0493679, "epoch": 0.23496167142642416, "flos": 18916305373440.0, "grad_norm": 1.7014428253444835, "language_loss": 0.7304846, "learning_rate": 3.575267247755601e-06, "loss": 0.75245094, "num_input_tokens_seen": 84120370, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.92578125, "step": 3908, "time_per_iteration": 2.468299388885498 }, { "auxiliary_loss_clip": 0.01054662, "auxiliary_loss_mlp": 0.01011091, "balance_loss_clip": 1.00799108, "balance_loss_mlp": 1.02380228, "epoch": 0.23502179467909215, "flos": 55868062896000.0, "grad_norm": 1.018578457523391, "language_loss": 0.73377752, "learning_rate": 3.5750272534604367e-06, "loss": 0.75443506, "num_input_tokens_seen": 84165515, "router_z_loss_clip": 0.03100586, "router_z_loss_mlp": 0.30859375, "step": 3909, "time_per_iteration": 2.8235323429107666 }, { "auxiliary_loss_clip": 0.01140549, "auxiliary_loss_mlp": 0.01042584, "balance_loss_clip": 1.02597845, "balance_loss_mlp": 1.0481267, "epoch": 0.23508191793176011, "flos": 23401763210880.0, "grad_norm": 1.732458368307781, "language_loss": 0.88237053, "learning_rate": 3.5747871994400822e-06, "loss": 0.90420187, "num_input_tokens_seen": 84184540, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.921875, "step": 3910, "time_per_iteration": 2.513522148132324 }, { "auxiliary_loss_clip": 0.01142453, "auxiliary_loss_mlp": 0.01047484, "balance_loss_clip": 1.03134334, "balance_loss_mlp": 1.04961431, "epoch": 0.23514204118442808, "flos": 20047671025920.0, "grad_norm": 2.0271397646745686, "language_loss": 0.76440597, "learning_rate": 3.5745470857036386e-06, "loss": 0.78630537, "num_input_tokens_seen": 84202025, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.9296875, "step": 3911, "time_per_iteration": 2.479975700378418 }, { "auxiliary_loss_clip": 0.01135819, "auxiliary_loss_mlp": 0.01044657, "balance_loss_clip": 1.02912378, "balance_loss_mlp": 1.04835916, "epoch": 0.23520216443709605, "flos": 21580589796480.0, "grad_norm": 1.630799436186817, "language_loss": 0.81790316, "learning_rate": 3.5743069122602122e-06, "loss": 0.83970797, "num_input_tokens_seen": 84221895, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.875, "step": 3912, "time_per_iteration": 2.503821611404419 }, { "auxiliary_loss_clip": 0.01135728, "auxiliary_loss_mlp": 0.01050813, "balance_loss_clip": 1.03363526, "balance_loss_mlp": 1.04719067, "epoch": 0.235262287689764, "flos": 23185796688000.0, "grad_norm": 2.3027068895250578, "language_loss": 0.71685147, "learning_rate": 3.574066679118909e-06, "loss": 0.73871684, "num_input_tokens_seen": 84240455, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8828125, "step": 3913, "time_per_iteration": 2.4711244106292725 }, { "auxiliary_loss_clip": 0.01147371, "auxiliary_loss_mlp": 0.01047528, "balance_loss_clip": 1.02970648, "balance_loss_mlp": 1.05085874, "epoch": 0.23532241094243198, "flos": 23185222070400.0, "grad_norm": 2.053895647179321, "language_loss": 0.76081491, "learning_rate": 3.57382638628884e-06, "loss": 0.78276384, "num_input_tokens_seen": 84261605, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.96484375, "step": 3914, "time_per_iteration": 2.5263497829437256 }, { "auxiliary_loss_clip": 0.01141186, "auxiliary_loss_mlp": 0.01039038, "balance_loss_clip": 1.02146697, "balance_loss_mlp": 1.04934502, "epoch": 0.23538253419509997, "flos": 17019324305280.0, "grad_norm": 2.752238866441083, "language_loss": 0.90003806, "learning_rate": 3.5735860337791174e-06, "loss": 0.92184031, "num_input_tokens_seen": 84278675, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.91796875, "step": 3915, "time_per_iteration": 2.4246578216552734 }, { "auxiliary_loss_clip": 0.0105122, "auxiliary_loss_mlp": 0.01002659, "balance_loss_clip": 0.99977463, "balance_loss_mlp": 1.02036059, "epoch": 0.23544265744776793, "flos": 63448588967040.0, "grad_norm": 0.8087117048195201, "language_loss": 0.59384465, "learning_rate": 3.573345621598854e-06, "loss": 0.6143834, "num_input_tokens_seen": 84329765, "router_z_loss_clip": 0.02880859, "router_z_loss_mlp": 0.30859375, "step": 3916, "time_per_iteration": 3.021982192993164 }, { "auxiliary_loss_clip": 0.0104853, "auxiliary_loss_mlp": 0.01002426, "balance_loss_clip": 0.99950582, "balance_loss_mlp": 1.01781464, "epoch": 0.2355027807004359, "flos": 70515343831680.0, "grad_norm": 0.7651836993342482, "language_loss": 0.49435413, "learning_rate": 3.5731051497571675e-06, "loss": 0.51486367, "num_input_tokens_seen": 84393680, "router_z_loss_clip": 0.0291748, "router_z_loss_mlp": 0.30664062, "step": 3917, "time_per_iteration": 3.1082370281219482 }, { "auxiliary_loss_clip": 0.01142394, "auxiliary_loss_mlp": 0.01051428, "balance_loss_clip": 1.03482187, "balance_loss_mlp": 1.04877341, "epoch": 0.23556290395310386, "flos": 21434289701760.0, "grad_norm": 1.8422079680119803, "language_loss": 0.76346481, "learning_rate": 3.5728646182631756e-06, "loss": 0.78540301, "num_input_tokens_seen": 84412640, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.93359375, "step": 3918, "time_per_iteration": 2.512970447540283 }, { "auxiliary_loss_clip": 0.01145452, "auxiliary_loss_mlp": 0.01043075, "balance_loss_clip": 1.02649283, "balance_loss_mlp": 1.0496546, "epoch": 0.23562302720577183, "flos": 18186421011840.0, "grad_norm": 1.9238291024351235, "language_loss": 0.69336665, "learning_rate": 3.5726240271259995e-06, "loss": 0.71525198, "num_input_tokens_seen": 84431605, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.95703125, "step": 3919, "time_per_iteration": 2.449821949005127 }, { "auxiliary_loss_clip": 0.01137305, "auxiliary_loss_mlp": 0.01040993, "balance_loss_clip": 1.02412462, "balance_loss_mlp": 1.04861355, "epoch": 0.2356831504584398, "flos": 33730497832320.0, "grad_norm": 1.8761885870499866, "language_loss": 0.70557648, "learning_rate": 3.5723833763547634e-06, "loss": 0.72735947, "num_input_tokens_seen": 84454210, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.88671875, "step": 3920, "time_per_iteration": 2.57745099067688 }, { "auxiliary_loss_clip": 0.01138906, "auxiliary_loss_mlp": 0.01047484, "balance_loss_clip": 1.03127182, "balance_loss_mlp": 1.04933357, "epoch": 0.23574327371110776, "flos": 24932778560640.0, "grad_norm": 1.5703195511197419, "language_loss": 0.7719717, "learning_rate": 3.5721426659585916e-06, "loss": 0.79383564, "num_input_tokens_seen": 84475540, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8984375, "step": 3921, "time_per_iteration": 2.488485336303711 }, { "auxiliary_loss_clip": 0.01139325, "auxiliary_loss_mlp": 0.01041997, "balance_loss_clip": 1.02461672, "balance_loss_mlp": 1.04796314, "epoch": 0.23580339696377575, "flos": 17822107319040.0, "grad_norm": 2.236090375826057, "language_loss": 0.75034499, "learning_rate": 3.571901895946612e-06, "loss": 0.77215821, "num_input_tokens_seen": 84494580, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9140625, "step": 3922, "time_per_iteration": 2.469052314758301 }, { "auxiliary_loss_clip": 0.01137575, "auxiliary_loss_mlp": 0.0103911, "balance_loss_clip": 1.02254009, "balance_loss_mlp": 1.04701293, "epoch": 0.23586352021644372, "flos": 26286611097600.0, "grad_norm": 1.9433217657802515, "language_loss": 0.80318928, "learning_rate": 3.571661066327956e-06, "loss": 0.82495618, "num_input_tokens_seen": 84513850, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.90625, "step": 3923, "time_per_iteration": 2.4922537803649902 }, { "auxiliary_loss_clip": 0.01137609, "auxiliary_loss_mlp": 0.0104963, "balance_loss_clip": 1.03290439, "balance_loss_mlp": 1.04728293, "epoch": 0.23592364346911168, "flos": 14246697484800.0, "grad_norm": 2.061213698897614, "language_loss": 0.74842989, "learning_rate": 3.571420177111754e-06, "loss": 0.7703023, "num_input_tokens_seen": 84532315, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.90234375, "step": 3924, "time_per_iteration": 2.4794678688049316 }, { "auxiliary_loss_clip": 0.011397, "auxiliary_loss_mlp": 0.01045032, "balance_loss_clip": 1.02916551, "balance_loss_mlp": 1.04910183, "epoch": 0.23598376672177965, "flos": 18587938216320.0, "grad_norm": 2.1023057837312176, "language_loss": 0.82502615, "learning_rate": 3.5711792283071416e-06, "loss": 0.84687352, "num_input_tokens_seen": 84550970, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.90625, "step": 3925, "time_per_iteration": 2.4691638946533203 }, { "auxiliary_loss_clip": 0.01140517, "auxiliary_loss_mlp": 0.01047864, "balance_loss_clip": 1.03057921, "balance_loss_mlp": 1.0481565, "epoch": 0.2360438899744476, "flos": 22675542036480.0, "grad_norm": 1.7883318590633175, "language_loss": 0.5955925, "learning_rate": 3.5709382199232564e-06, "loss": 0.61747628, "num_input_tokens_seen": 84571655, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.921875, "step": 3926, "time_per_iteration": 3.948566198348999 }, { "auxiliary_loss_clip": 0.0113274, "auxiliary_loss_mlp": 0.01045333, "balance_loss_clip": 1.02947235, "balance_loss_mlp": 1.04554796, "epoch": 0.23610401322711558, "flos": 29570139014400.0, "grad_norm": 1.8143228164806104, "language_loss": 0.71519744, "learning_rate": 3.570697151969235e-06, "loss": 0.73697817, "num_input_tokens_seen": 84593130, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.87109375, "step": 3927, "time_per_iteration": 2.5328779220581055 }, { "auxiliary_loss_clip": 0.01136721, "auxiliary_loss_mlp": 0.01046375, "balance_loss_clip": 1.03098464, "balance_loss_mlp": 1.04623508, "epoch": 0.23616413647978354, "flos": 17858520731520.0, "grad_norm": 2.545564948643491, "language_loss": 0.74907982, "learning_rate": 3.570456024454221e-06, "loss": 0.77091086, "num_input_tokens_seen": 84612410, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.90625, "step": 3928, "time_per_iteration": 3.8292877674102783 }, { "auxiliary_loss_clip": 0.01139067, "auxiliary_loss_mlp": 0.01047448, "balance_loss_clip": 1.02976918, "balance_loss_mlp": 1.04752469, "epoch": 0.23622425973245154, "flos": 11034847157760.0, "grad_norm": 2.256731628894427, "language_loss": 0.8140347, "learning_rate": 3.5702148373873576e-06, "loss": 0.83589989, "num_input_tokens_seen": 84627610, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.91796875, "step": 3929, "time_per_iteration": 2.452659845352173 }, { "auxiliary_loss_clip": 0.01147413, "auxiliary_loss_mlp": 0.01045388, "balance_loss_clip": 1.02747095, "balance_loss_mlp": 1.05065298, "epoch": 0.2362843829851195, "flos": 23404061681280.0, "grad_norm": 1.7462434159651674, "language_loss": 0.71962023, "learning_rate": 3.569973590777789e-06, "loss": 0.74154824, "num_input_tokens_seen": 84648415, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.96875, "step": 3930, "time_per_iteration": 3.964031934738159 }, { "auxiliary_loss_clip": 0.0113976, "auxiliary_loss_mlp": 0.01038141, "balance_loss_clip": 1.02072477, "balance_loss_mlp": 1.0475049, "epoch": 0.23634450623778747, "flos": 39529855261440.0, "grad_norm": 1.8580834409379015, "language_loss": 0.7394855, "learning_rate": 3.569732284634665e-06, "loss": 0.76126444, "num_input_tokens_seen": 84670080, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.921875, "step": 3931, "time_per_iteration": 4.095070838928223 }, { "auxiliary_loss_clip": 0.01140891, "auxiliary_loss_mlp": 0.01044377, "balance_loss_clip": 1.02706778, "balance_loss_mlp": 1.04965174, "epoch": 0.23640462949045543, "flos": 24207167917440.0, "grad_norm": 1.8976224586096047, "language_loss": 0.80191481, "learning_rate": 3.569490918967136e-06, "loss": 0.82376742, "num_input_tokens_seen": 84686465, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.9140625, "step": 3932, "time_per_iteration": 2.4973556995391846 }, { "auxiliary_loss_clip": 0.01139979, "auxiliary_loss_mlp": 0.01038948, "balance_loss_clip": 1.02396381, "balance_loss_mlp": 1.05092621, "epoch": 0.2364647527431234, "flos": 26177622255360.0, "grad_norm": 1.6163705759353606, "language_loss": 0.85352945, "learning_rate": 3.5692494937843537e-06, "loss": 0.87531865, "num_input_tokens_seen": 84708825, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.890625, "step": 3933, "time_per_iteration": 2.528136968612671 }, { "auxiliary_loss_clip": 0.01145461, "auxiliary_loss_mlp": 0.01037581, "balance_loss_clip": 1.02006936, "balance_loss_mlp": 1.05252266, "epoch": 0.23652487599579136, "flos": 22637009721600.0, "grad_norm": 2.182236270210386, "language_loss": 0.83170092, "learning_rate": 3.5690080090954727e-06, "loss": 0.85353136, "num_input_tokens_seen": 84726165, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9296875, "step": 3934, "time_per_iteration": 2.5364127159118652 }, { "auxiliary_loss_clip": 0.01141216, "auxiliary_loss_mlp": 0.01040847, "balance_loss_clip": 1.02426493, "balance_loss_mlp": 1.04952478, "epoch": 0.23658499924845935, "flos": 21762261809280.0, "grad_norm": 1.6205813050637048, "language_loss": 0.78775108, "learning_rate": 3.5687664649096515e-06, "loss": 0.80957168, "num_input_tokens_seen": 84745815, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9140625, "step": 3935, "time_per_iteration": 2.6684787273406982 }, { "auxiliary_loss_clip": 0.01139244, "auxiliary_loss_mlp": 0.01037077, "balance_loss_clip": 1.02054286, "balance_loss_mlp": 1.05053043, "epoch": 0.23664512250112732, "flos": 21798998444160.0, "grad_norm": 2.388919586933395, "language_loss": 0.79404444, "learning_rate": 3.5685248612360487e-06, "loss": 0.81580764, "num_input_tokens_seen": 84765415, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.88671875, "step": 3936, "time_per_iteration": 2.4872429370880127 }, { "auxiliary_loss_clip": 0.01139687, "auxiliary_loss_mlp": 0.01041289, "balance_loss_clip": 1.02423, "balance_loss_mlp": 1.04887772, "epoch": 0.23670524575379528, "flos": 22637871648000.0, "grad_norm": 1.4842627305867546, "language_loss": 0.79028368, "learning_rate": 3.568283198083826e-06, "loss": 0.81209338, "num_input_tokens_seen": 84787080, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.90625, "step": 3937, "time_per_iteration": 2.4900200366973877 }, { "auxiliary_loss_clip": 0.0113683, "auxiliary_loss_mlp": 0.0103931, "balance_loss_clip": 1.02399135, "balance_loss_mlp": 1.05026221, "epoch": 0.23676536900646325, "flos": 16725000263040.0, "grad_norm": 1.8297871874424303, "language_loss": 0.8571865, "learning_rate": 3.568041475462147e-06, "loss": 0.87894785, "num_input_tokens_seen": 84805395, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8671875, "step": 3938, "time_per_iteration": 2.4186174869537354 }, { "auxiliary_loss_clip": 0.01136785, "auxiliary_loss_mlp": 0.01043648, "balance_loss_clip": 1.02749515, "balance_loss_mlp": 1.04913139, "epoch": 0.23682549225913122, "flos": 11135611785600.0, "grad_norm": 3.397432783826968, "language_loss": 0.93718779, "learning_rate": 3.5677996933801785e-06, "loss": 0.95899212, "num_input_tokens_seen": 84818090, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.875, "step": 3939, "time_per_iteration": 2.39170503616333 }, { "auxiliary_loss_clip": 0.01142144, "auxiliary_loss_mlp": 0.01041576, "balance_loss_clip": 1.02383757, "balance_loss_mlp": 1.0491488, "epoch": 0.23688561551179918, "flos": 22559226819840.0, "grad_norm": 1.6402071361574484, "language_loss": 0.82345641, "learning_rate": 3.567557851847088e-06, "loss": 0.84529364, "num_input_tokens_seen": 84837695, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.9296875, "step": 3940, "time_per_iteration": 2.4751064777374268 }, { "auxiliary_loss_clip": 0.01145405, "auxiliary_loss_mlp": 0.01042581, "balance_loss_clip": 1.0253669, "balance_loss_mlp": 1.05100298, "epoch": 0.23694573876446715, "flos": 18514895909760.0, "grad_norm": 2.5032698185442164, "language_loss": 0.89317167, "learning_rate": 3.5673159508720464e-06, "loss": 0.91505146, "num_input_tokens_seen": 84854630, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9453125, "step": 3941, "time_per_iteration": 2.4235877990722656 }, { "auxiliary_loss_clip": 0.01138672, "auxiliary_loss_mlp": 0.01044586, "balance_loss_clip": 1.02694249, "balance_loss_mlp": 1.04688692, "epoch": 0.23700586201713514, "flos": 15335723980800.0, "grad_norm": 2.0149054123520354, "language_loss": 0.84831798, "learning_rate": 3.5670739904642274e-06, "loss": 0.87015057, "num_input_tokens_seen": 84871805, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.91796875, "step": 3942, "time_per_iteration": 2.4413068294525146 }, { "auxiliary_loss_clip": 0.01141622, "auxiliary_loss_mlp": 0.01042509, "balance_loss_clip": 1.02401924, "balance_loss_mlp": 1.05033088, "epoch": 0.2370659852698031, "flos": 23947605262080.0, "grad_norm": 2.3475563282965446, "language_loss": 0.80695939, "learning_rate": 3.5668319706328065e-06, "loss": 0.82880068, "num_input_tokens_seen": 84889815, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.9140625, "step": 3943, "time_per_iteration": 2.485149621963501 }, { "auxiliary_loss_clip": 0.01146253, "auxiliary_loss_mlp": 0.01041858, "balance_loss_clip": 1.02334464, "balance_loss_mlp": 1.0494616, "epoch": 0.23712610852247107, "flos": 15332527670400.0, "grad_norm": 3.0363949070963425, "language_loss": 0.68336964, "learning_rate": 3.566589891386959e-06, "loss": 0.70525068, "num_input_tokens_seen": 84904380, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.96875, "step": 3944, "time_per_iteration": 2.433305501937866 }, { "auxiliary_loss_clip": 0.01143092, "auxiliary_loss_mlp": 0.0104207, "balance_loss_clip": 1.02464151, "balance_loss_mlp": 1.05021644, "epoch": 0.23718623177513903, "flos": 19682567233920.0, "grad_norm": 1.6782983896946988, "language_loss": 0.75348294, "learning_rate": 3.566347752735866e-06, "loss": 0.77533454, "num_input_tokens_seen": 84922935, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9296875, "step": 3945, "time_per_iteration": 2.4688780307769775 }, { "auxiliary_loss_clip": 0.01142854, "auxiliary_loss_mlp": 0.01037145, "balance_loss_clip": 1.02095652, "balance_loss_mlp": 1.05215609, "epoch": 0.237246355027807, "flos": 24973322037120.0, "grad_norm": 5.074352142804752, "language_loss": 0.63578582, "learning_rate": 3.5661055546887094e-06, "loss": 0.65758586, "num_input_tokens_seen": 84943685, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.90625, "step": 3946, "time_per_iteration": 2.497145175933838 }, { "auxiliary_loss_clip": 0.01141591, "auxiliary_loss_mlp": 0.01039417, "balance_loss_clip": 1.02183366, "balance_loss_mlp": 1.05051029, "epoch": 0.23730647828047496, "flos": 15377416692480.0, "grad_norm": 2.1949623900238815, "language_loss": 0.77132422, "learning_rate": 3.5658632972546734e-06, "loss": 0.79313427, "num_input_tokens_seen": 84959505, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.91015625, "step": 3947, "time_per_iteration": 2.445823907852173 }, { "auxiliary_loss_clip": 0.01147264, "auxiliary_loss_mlp": 0.01038549, "balance_loss_clip": 1.02147794, "balance_loss_mlp": 1.05529022, "epoch": 0.23736660153314296, "flos": 28150662372480.0, "grad_norm": 1.5971272677834116, "language_loss": 0.80679178, "learning_rate": 3.565620980442944e-06, "loss": 0.82864988, "num_input_tokens_seen": 84982130, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.921875, "step": 3948, "time_per_iteration": 2.5474438667297363 }, { "auxiliary_loss_clip": 0.01142966, "auxiliary_loss_mlp": 0.01040697, "balance_loss_clip": 1.02335215, "balance_loss_mlp": 1.05101395, "epoch": 0.23742672478581092, "flos": 22086570729600.0, "grad_norm": 2.823507288278894, "language_loss": 0.80223703, "learning_rate": 3.5653786042627107e-06, "loss": 0.82407367, "num_input_tokens_seen": 85000640, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.921875, "step": 3949, "time_per_iteration": 2.4812347888946533 }, { "auxiliary_loss_clip": 0.01143528, "auxiliary_loss_mlp": 0.01038447, "balance_loss_clip": 1.0210427, "balance_loss_mlp": 1.04910231, "epoch": 0.2374868480384789, "flos": 19537093152000.0, "grad_norm": 1.8102651179446572, "language_loss": 0.72998846, "learning_rate": 3.565136168723163e-06, "loss": 0.75180829, "num_input_tokens_seen": 85018970, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9453125, "step": 3950, "time_per_iteration": 2.452219247817993 }, { "auxiliary_loss_clip": 0.01139911, "auxiliary_loss_mlp": 0.01037543, "balance_loss_clip": 1.02178335, "balance_loss_mlp": 1.04995847, "epoch": 0.23754697129114685, "flos": 19422501788160.0, "grad_norm": 1.8666215482168278, "language_loss": 0.72963095, "learning_rate": 3.564893673833495e-06, "loss": 0.75140548, "num_input_tokens_seen": 85035905, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8984375, "step": 3951, "time_per_iteration": 2.477597236633301 }, { "auxiliary_loss_clip": 0.01145948, "auxiliary_loss_mlp": 0.01037052, "balance_loss_clip": 1.01980257, "balance_loss_mlp": 1.0529089, "epoch": 0.23760709454381482, "flos": 19501002961920.0, "grad_norm": 1.815378487055948, "language_loss": 0.73925626, "learning_rate": 3.564651119602903e-06, "loss": 0.76108629, "num_input_tokens_seen": 85054560, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9296875, "step": 3952, "time_per_iteration": 2.448608160018921 }, { "auxiliary_loss_clip": 0.01142812, "auxiliary_loss_mlp": 0.01040801, "balance_loss_clip": 1.02444553, "balance_loss_mlp": 1.04986143, "epoch": 0.23766721779648278, "flos": 27636600879360.0, "grad_norm": 1.7442604742771743, "language_loss": 0.71120048, "learning_rate": 3.564408506040583e-06, "loss": 0.73303664, "num_input_tokens_seen": 85074425, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.92578125, "step": 3953, "time_per_iteration": 2.5318973064422607 }, { "auxiliary_loss_clip": 0.01143735, "auxiliary_loss_mlp": 0.01042166, "balance_loss_clip": 1.02398694, "balance_loss_mlp": 1.05054033, "epoch": 0.23772734104915075, "flos": 23404348990080.0, "grad_norm": 2.1595141958886948, "language_loss": 0.81395143, "learning_rate": 3.5641658331557356e-06, "loss": 0.83581042, "num_input_tokens_seen": 85092865, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9296875, "step": 3954, "time_per_iteration": 2.477665662765503 }, { "auxiliary_loss_clip": 0.01145098, "auxiliary_loss_mlp": 0.0104064, "balance_loss_clip": 1.0218761, "balance_loss_mlp": 1.0517329, "epoch": 0.23778746430181874, "flos": 15705496540800.0, "grad_norm": 2.5805000593326017, "language_loss": 0.65784132, "learning_rate": 3.5639231009575634e-06, "loss": 0.67969871, "num_input_tokens_seen": 85110175, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.93359375, "step": 3955, "time_per_iteration": 2.46594500541687 }, { "auxiliary_loss_clip": 0.01140781, "auxiliary_loss_mlp": 0.01049972, "balance_loss_clip": 1.03294921, "balance_loss_mlp": 1.05019832, "epoch": 0.2378475875544867, "flos": 19426452284160.0, "grad_norm": 1.328022084464102, "language_loss": 0.83903825, "learning_rate": 3.5636803094552704e-06, "loss": 0.86094582, "num_input_tokens_seen": 85129925, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.90625, "step": 3956, "time_per_iteration": 2.497138261795044 }, { "auxiliary_loss_clip": 0.01137173, "auxiliary_loss_mlp": 0.01036753, "balance_loss_clip": 1.0203141, "balance_loss_mlp": 1.0494777, "epoch": 0.23790771080715467, "flos": 22268565964800.0, "grad_norm": 3.413898872465083, "language_loss": 0.85249853, "learning_rate": 3.5634374586580635e-06, "loss": 0.87423784, "num_input_tokens_seen": 85147755, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.875, "step": 3957, "time_per_iteration": 2.4723591804504395 }, { "auxiliary_loss_clip": 0.01138687, "auxiliary_loss_mlp": 0.01039436, "balance_loss_clip": 1.02340221, "balance_loss_mlp": 1.04856527, "epoch": 0.23796783405982264, "flos": 20047311889920.0, "grad_norm": 2.007039573848824, "language_loss": 0.70248044, "learning_rate": 3.563194548575151e-06, "loss": 0.72426164, "num_input_tokens_seen": 85165270, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.90234375, "step": 3958, "time_per_iteration": 2.4631295204162598 }, { "auxiliary_loss_clip": 0.01140501, "auxiliary_loss_mlp": 0.01041678, "balance_loss_clip": 1.02417839, "balance_loss_mlp": 1.04856992, "epoch": 0.2380279573124906, "flos": 14245943299200.0, "grad_norm": 2.3975994239428897, "language_loss": 0.65777755, "learning_rate": 3.562951579215745e-06, "loss": 0.6795994, "num_input_tokens_seen": 85181555, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.921875, "step": 3959, "time_per_iteration": 2.45035982131958 }, { "auxiliary_loss_clip": 0.01142162, "auxiliary_loss_mlp": 0.01041088, "balance_loss_clip": 1.02473235, "balance_loss_mlp": 1.05071712, "epoch": 0.23808808056515857, "flos": 21179180332800.0, "grad_norm": 2.0060329720712513, "language_loss": 0.72367537, "learning_rate": 3.5627085505890586e-06, "loss": 0.74550784, "num_input_tokens_seen": 85199455, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9140625, "step": 3960, "time_per_iteration": 2.452486515045166 }, { "auxiliary_loss_clip": 0.0113909, "auxiliary_loss_mlp": 0.01039038, "balance_loss_clip": 1.02199078, "balance_loss_mlp": 1.04818082, "epoch": 0.23814820381782653, "flos": 22528308188160.0, "grad_norm": 1.6964748910259586, "language_loss": 0.74333221, "learning_rate": 3.562465462704307e-06, "loss": 0.76511347, "num_input_tokens_seen": 85219170, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.91015625, "step": 3961, "time_per_iteration": 2.5136148929595947 }, { "auxiliary_loss_clip": 0.01139688, "auxiliary_loss_mlp": 0.01048669, "balance_loss_clip": 1.02985823, "balance_loss_mlp": 1.04612982, "epoch": 0.23820832707049452, "flos": 22304332932480.0, "grad_norm": 2.832241513732003, "language_loss": 0.65052652, "learning_rate": 3.5622223155707085e-06, "loss": 0.67241007, "num_input_tokens_seen": 85238480, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.93359375, "step": 3962, "time_per_iteration": 2.4691953659057617 }, { "auxiliary_loss_clip": 0.01139765, "auxiliary_loss_mlp": 0.01045065, "balance_loss_clip": 1.02822089, "balance_loss_mlp": 1.04872751, "epoch": 0.2382684503231625, "flos": 24864225454080.0, "grad_norm": 1.714322698704314, "language_loss": 0.74619544, "learning_rate": 3.561979109197483e-06, "loss": 0.7680437, "num_input_tokens_seen": 85259180, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.91015625, "step": 3963, "time_per_iteration": 2.5245327949523926 }, { "auxiliary_loss_clip": 0.01142719, "auxiliary_loss_mlp": 0.01042494, "balance_loss_clip": 1.02504134, "balance_loss_mlp": 1.05021596, "epoch": 0.23832857357583045, "flos": 21871609787520.0, "grad_norm": 1.9594363541144182, "language_loss": 0.76955175, "learning_rate": 3.5617358435938538e-06, "loss": 0.79140389, "num_input_tokens_seen": 85278550, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.921875, "step": 3964, "time_per_iteration": 2.481938600540161 }, { "auxiliary_loss_clip": 0.01136778, "auxiliary_loss_mlp": 0.01038394, "balance_loss_clip": 1.02214539, "balance_loss_mlp": 1.04791629, "epoch": 0.23838869682849842, "flos": 21288061434240.0, "grad_norm": 2.419066314932413, "language_loss": 0.71960402, "learning_rate": 3.561492518769045e-06, "loss": 0.74135578, "num_input_tokens_seen": 85297345, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.88671875, "step": 3965, "time_per_iteration": 2.472059965133667 }, { "auxiliary_loss_clip": 0.01136777, "auxiliary_loss_mlp": 0.01048009, "balance_loss_clip": 1.03154612, "balance_loss_mlp": 1.04872739, "epoch": 0.23844882008116638, "flos": 16180594755840.0, "grad_norm": 1.9911051483563766, "language_loss": 0.78424752, "learning_rate": 3.561249134732282e-06, "loss": 0.80609542, "num_input_tokens_seen": 85315105, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8828125, "step": 3966, "time_per_iteration": 2.441378355026245 }, { "auxiliary_loss_clip": 0.01136348, "auxiliary_loss_mlp": 0.01043996, "balance_loss_clip": 1.02779543, "balance_loss_mlp": 1.04732788, "epoch": 0.23850894333383435, "flos": 21069724613760.0, "grad_norm": 4.9692328656745275, "language_loss": 0.68777585, "learning_rate": 3.561005691492797e-06, "loss": 0.70957935, "num_input_tokens_seen": 85334735, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.890625, "step": 3967, "time_per_iteration": 2.4646878242492676 }, { "auxiliary_loss_clip": 0.01138159, "auxiliary_loss_mlp": 0.01051939, "balance_loss_clip": 1.03483272, "balance_loss_mlp": 1.04808927, "epoch": 0.23856906658650234, "flos": 17201606849280.0, "grad_norm": 1.8435598392310026, "language_loss": 0.67899531, "learning_rate": 3.5607621890598185e-06, "loss": 0.70089626, "num_input_tokens_seen": 85352875, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.90234375, "step": 3968, "time_per_iteration": 3.9481637477874756 }, { "auxiliary_loss_clip": 0.01136585, "auxiliary_loss_mlp": 0.01046357, "balance_loss_clip": 1.03007329, "balance_loss_mlp": 1.04685426, "epoch": 0.2386291898391703, "flos": 29494223619840.0, "grad_norm": 1.9558433743833874, "language_loss": 0.76996201, "learning_rate": 3.5605186274425823e-06, "loss": 0.79179138, "num_input_tokens_seen": 85372205, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8984375, "step": 3969, "time_per_iteration": 2.571903705596924 }, { "auxiliary_loss_clip": 0.01132186, "auxiliary_loss_mlp": 0.01036617, "balance_loss_clip": 1.02094114, "balance_loss_mlp": 1.04533362, "epoch": 0.23868931309183827, "flos": 21142443697920.0, "grad_norm": 2.5196209282103395, "language_loss": 0.76462293, "learning_rate": 3.5602750066503225e-06, "loss": 0.78631097, "num_input_tokens_seen": 85389705, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8671875, "step": 3970, "time_per_iteration": 3.89278244972229 }, { "auxiliary_loss_clip": 0.01136704, "auxiliary_loss_mlp": 0.0104455, "balance_loss_clip": 1.02671623, "balance_loss_mlp": 1.04549873, "epoch": 0.23874943634450624, "flos": 25659394784640.0, "grad_norm": 2.829029785386079, "language_loss": 0.85254788, "learning_rate": 3.5600313266922793e-06, "loss": 0.87436044, "num_input_tokens_seen": 85407855, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.9140625, "step": 3971, "time_per_iteration": 2.527083396911621 }, { "auxiliary_loss_clip": 0.01061553, "auxiliary_loss_mlp": 0.01032116, "balance_loss_clip": 1.02950501, "balance_loss_mlp": 1.03046513, "epoch": 0.2388095595971742, "flos": 58986618624000.0, "grad_norm": 0.7466750360280583, "language_loss": 0.62789834, "learning_rate": 3.5597875875776915e-06, "loss": 0.64883506, "num_input_tokens_seen": 85470885, "router_z_loss_clip": 0.02612305, "router_z_loss_mlp": 0.31054688, "step": 3972, "time_per_iteration": 5.947951078414917 }, { "auxiliary_loss_clip": 0.01137177, "auxiliary_loss_mlp": 0.01038769, "balance_loss_clip": 1.02292562, "balance_loss_mlp": 1.04796362, "epoch": 0.23886968284984217, "flos": 16800341040000.0, "grad_norm": 1.9256348001755261, "language_loss": 0.82074296, "learning_rate": 3.5595437893158013e-06, "loss": 0.84250242, "num_input_tokens_seen": 85488460, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.890625, "step": 3973, "time_per_iteration": 2.4738667011260986 }, { "auxiliary_loss_clip": 0.01136849, "auxiliary_loss_mlp": 0.01042778, "balance_loss_clip": 1.02586246, "balance_loss_mlp": 1.04824185, "epoch": 0.23892980610251013, "flos": 22382654538240.0, "grad_norm": 1.5211989477123078, "language_loss": 0.79570699, "learning_rate": 3.5592999319158546e-06, "loss": 0.81750321, "num_input_tokens_seen": 85508590, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.88671875, "step": 3974, "time_per_iteration": 2.477973222732544 }, { "auxiliary_loss_clip": 0.01137543, "auxiliary_loss_mlp": 0.01038452, "balance_loss_clip": 1.02123821, "balance_loss_mlp": 1.04680228, "epoch": 0.23898992935517813, "flos": 12823198519680.0, "grad_norm": 1.9301674092197851, "language_loss": 0.8474136, "learning_rate": 3.5590560153870984e-06, "loss": 0.86917353, "num_input_tokens_seen": 85525970, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.90625, "step": 3975, "time_per_iteration": 2.4412105083465576 }, { "auxiliary_loss_clip": 0.01134056, "auxiliary_loss_mlp": 0.01036509, "balance_loss_clip": 1.02061868, "balance_loss_mlp": 1.04637599, "epoch": 0.2390500526078461, "flos": 22345666508160.0, "grad_norm": 2.134308958551566, "language_loss": 0.83328235, "learning_rate": 3.5588120397387816e-06, "loss": 0.85498804, "num_input_tokens_seen": 85543700, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.87890625, "step": 3976, "time_per_iteration": 2.452361583709717 }, { "auxiliary_loss_clip": 0.01132305, "auxiliary_loss_mlp": 0.01028625, "balance_loss_clip": 1.01381922, "balance_loss_mlp": 1.04614353, "epoch": 0.23911017586051406, "flos": 22635142214400.0, "grad_norm": 1.8860395157023286, "language_loss": 0.74449527, "learning_rate": 3.5585680049801566e-06, "loss": 0.76610458, "num_input_tokens_seen": 85562765, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.86328125, "step": 3977, "time_per_iteration": 2.4739456176757812 }, { "auxiliary_loss_clip": 0.01137233, "auxiliary_loss_mlp": 0.0104661, "balance_loss_clip": 1.02914619, "balance_loss_mlp": 1.0471127, "epoch": 0.23917029911318202, "flos": 23653281219840.0, "grad_norm": 1.664046830161543, "language_loss": 0.71941131, "learning_rate": 3.5583239111204764e-06, "loss": 0.74124974, "num_input_tokens_seen": 85581755, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.90234375, "step": 3978, "time_per_iteration": 2.4662177562713623 }, { "auxiliary_loss_clip": 0.01141823, "auxiliary_loss_mlp": 0.01043432, "balance_loss_clip": 1.02681398, "balance_loss_mlp": 1.05093622, "epoch": 0.23923042236585, "flos": 22783597125120.0, "grad_norm": 2.3660999159247846, "language_loss": 0.78796947, "learning_rate": 3.558079758168997e-06, "loss": 0.80982196, "num_input_tokens_seen": 85599455, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.91015625, "step": 3979, "time_per_iteration": 2.495063066482544 }, { "auxiliary_loss_clip": 0.0113617, "auxiliary_loss_mlp": 0.01045959, "balance_loss_clip": 1.02898359, "balance_loss_mlp": 1.04747653, "epoch": 0.23929054561851795, "flos": 28147717457280.0, "grad_norm": 1.6605348406115603, "language_loss": 0.81941289, "learning_rate": 3.557835546134977e-06, "loss": 0.84123421, "num_input_tokens_seen": 85619970, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8828125, "step": 3980, "time_per_iteration": 2.520268201828003 }, { "auxiliary_loss_clip": 0.01135043, "auxiliary_loss_mlp": 0.01034767, "balance_loss_clip": 1.01817322, "balance_loss_mlp": 1.0474534, "epoch": 0.23935066887118592, "flos": 21686525982720.0, "grad_norm": 1.6224649275307341, "language_loss": 0.83710909, "learning_rate": 3.5575912750276775e-06, "loss": 0.85880721, "num_input_tokens_seen": 85638850, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.875, "step": 3981, "time_per_iteration": 2.4701154232025146 }, { "auxiliary_loss_clip": 0.01140566, "auxiliary_loss_mlp": 0.01041616, "balance_loss_clip": 1.02475977, "balance_loss_mlp": 1.04866493, "epoch": 0.2394107921238539, "flos": 32122274198400.0, "grad_norm": 1.9078441834076623, "language_loss": 0.7674858, "learning_rate": 3.5573469448563607e-06, "loss": 0.78930759, "num_input_tokens_seen": 85656285, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.91796875, "step": 3982, "time_per_iteration": 2.5302188396453857 }, { "auxiliary_loss_clip": 0.01134836, "auxiliary_loss_mlp": 0.01039886, "balance_loss_clip": 1.02411413, "balance_loss_mlp": 1.04822445, "epoch": 0.23947091537652188, "flos": 17019180650880.0, "grad_norm": 1.7727430092289242, "language_loss": 0.78074133, "learning_rate": 3.5571025556302915e-06, "loss": 0.80248857, "num_input_tokens_seen": 85673020, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8671875, "step": 3983, "time_per_iteration": 2.4654898643493652 }, { "auxiliary_loss_clip": 0.0113695, "auxiliary_loss_mlp": 0.01046138, "balance_loss_clip": 1.02915716, "balance_loss_mlp": 1.04787898, "epoch": 0.23953103862918984, "flos": 20593584904320.0, "grad_norm": 1.639254639375331, "language_loss": 0.73171169, "learning_rate": 3.556858107358737e-06, "loss": 0.75354254, "num_input_tokens_seen": 85692565, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.890625, "step": 3984, "time_per_iteration": 2.44163179397583 }, { "auxiliary_loss_clip": 0.01138325, "auxiliary_loss_mlp": 0.01043282, "balance_loss_clip": 1.02671218, "balance_loss_mlp": 1.04701936, "epoch": 0.2395911618818578, "flos": 20704405340160.0, "grad_norm": 1.9095998085832677, "language_loss": 0.78782797, "learning_rate": 3.5566136000509674e-06, "loss": 0.80964398, "num_input_tokens_seen": 85709730, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9140625, "step": 3985, "time_per_iteration": 2.4814839363098145 }, { "auxiliary_loss_clip": 0.01141461, "auxiliary_loss_mlp": 0.01045093, "balance_loss_clip": 1.02799869, "balance_loss_mlp": 1.05112171, "epoch": 0.23965128513452577, "flos": 27053519402880.0, "grad_norm": 4.908616376625626, "language_loss": 0.73345089, "learning_rate": 3.556369033716254e-06, "loss": 0.75531644, "num_input_tokens_seen": 85730045, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.90234375, "step": 3986, "time_per_iteration": 2.517117738723755 }, { "auxiliary_loss_clip": 0.0114098, "auxiliary_loss_mlp": 0.01047655, "balance_loss_clip": 1.03144205, "balance_loss_mlp": 1.0478617, "epoch": 0.23971140838719374, "flos": 23144319457920.0, "grad_norm": 1.8545304696474336, "language_loss": 0.87701595, "learning_rate": 3.556124408363871e-06, "loss": 0.8989023, "num_input_tokens_seen": 85747590, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.9296875, "step": 3987, "time_per_iteration": 2.4876015186309814 }, { "auxiliary_loss_clip": 0.01130497, "auxiliary_loss_mlp": 0.01037688, "balance_loss_clip": 1.02281094, "balance_loss_mlp": 1.04693699, "epoch": 0.23977153163986173, "flos": 18034554309120.0, "grad_norm": 2.373854888345927, "language_loss": 0.82931048, "learning_rate": 3.5558797240030945e-06, "loss": 0.85099232, "num_input_tokens_seen": 85763460, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8359375, "step": 3988, "time_per_iteration": 2.41373872756958 }, { "auxiliary_loss_clip": 0.01135521, "auxiliary_loss_mlp": 0.01043392, "balance_loss_clip": 1.02684534, "balance_loss_mlp": 1.04639196, "epoch": 0.2398316548925297, "flos": 18113378705280.0, "grad_norm": 1.7521141535360507, "language_loss": 0.84783864, "learning_rate": 3.5556349806432035e-06, "loss": 0.86962777, "num_input_tokens_seen": 85782050, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.890625, "step": 3989, "time_per_iteration": 2.4745383262634277 }, { "auxiliary_loss_clip": 0.01135227, "auxiliary_loss_mlp": 0.01040487, "balance_loss_clip": 1.02421451, "balance_loss_mlp": 1.04692364, "epoch": 0.23989177814519766, "flos": 12567730014720.0, "grad_norm": 1.9697368931617802, "language_loss": 0.84787071, "learning_rate": 3.555390178293477e-06, "loss": 0.86962783, "num_input_tokens_seen": 85797400, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8828125, "step": 3990, "time_per_iteration": 2.43513560295105 }, { "auxiliary_loss_clip": 0.01135093, "auxiliary_loss_mlp": 0.01041673, "balance_loss_clip": 1.02636635, "balance_loss_mlp": 1.04736102, "epoch": 0.23995190139786562, "flos": 25264593423360.0, "grad_norm": 1.8229457699983245, "language_loss": 0.75786585, "learning_rate": 3.5551453169631994e-06, "loss": 0.77963352, "num_input_tokens_seen": 85818995, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.87890625, "step": 3991, "time_per_iteration": 2.5203909873962402 }, { "auxiliary_loss_clip": 0.01068414, "auxiliary_loss_mlp": 0.01006875, "balance_loss_clip": 1.00413346, "balance_loss_mlp": 1.03619683, "epoch": 0.2400120246505336, "flos": 61960379650560.0, "grad_norm": 0.9117369820175281, "language_loss": 0.63759255, "learning_rate": 3.554900396661656e-06, "loss": 0.65834546, "num_input_tokens_seen": 85876695, "router_z_loss_clip": 0.02746582, "router_z_loss_mlp": 0.32226562, "step": 3992, "time_per_iteration": 3.018669366836548 }, { "auxiliary_loss_clip": 0.01069123, "auxiliary_loss_mlp": 0.0100298, "balance_loss_clip": 1.00011909, "balance_loss_mlp": 1.03703213, "epoch": 0.24007214790320155, "flos": 66708560540160.0, "grad_norm": 0.7707499201329443, "language_loss": 0.62959009, "learning_rate": 3.5546554173981334e-06, "loss": 0.65031111, "num_input_tokens_seen": 85940990, "router_z_loss_clip": 0.02856445, "router_z_loss_mlp": 0.3203125, "step": 3993, "time_per_iteration": 3.2032968997955322 }, { "auxiliary_loss_clip": 0.01143752, "auxiliary_loss_mlp": 0.01045809, "balance_loss_clip": 1.02865493, "balance_loss_mlp": 1.05249727, "epoch": 0.24013227115586952, "flos": 25809070757760.0, "grad_norm": 1.8171469543381615, "language_loss": 0.76800859, "learning_rate": 3.5544103791819218e-06, "loss": 0.78990424, "num_input_tokens_seen": 85961165, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9140625, "step": 3994, "time_per_iteration": 2.505934476852417 }, { "auxiliary_loss_clip": 0.01139491, "auxiliary_loss_mlp": 0.0105256, "balance_loss_clip": 1.03382039, "balance_loss_mlp": 1.04892695, "epoch": 0.2401923944085375, "flos": 25557480921600.0, "grad_norm": 1.5692411716200685, "language_loss": 0.78287238, "learning_rate": 3.5541652820223124e-06, "loss": 0.80479288, "num_input_tokens_seen": 85982710, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.90625, "step": 3995, "time_per_iteration": 2.53275990486145 }, { "auxiliary_loss_clip": 0.01066285, "auxiliary_loss_mlp": 0.01002623, "balance_loss_clip": 0.99959469, "balance_loss_mlp": 1.03406787, "epoch": 0.24025251766120548, "flos": 54941138478720.0, "grad_norm": 0.9174261599935211, "language_loss": 0.63477373, "learning_rate": 3.5539201259286006e-06, "loss": 0.65546286, "num_input_tokens_seen": 86046935, "router_z_loss_clip": 0.03027344, "router_z_loss_mlp": 0.32226562, "step": 3996, "time_per_iteration": 3.1554665565490723 }, { "auxiliary_loss_clip": 0.01141255, "auxiliary_loss_mlp": 0.0104385, "balance_loss_clip": 1.02694619, "balance_loss_mlp": 1.04796433, "epoch": 0.24031264091387344, "flos": 20631075724800.0, "grad_norm": 2.4624761840933913, "language_loss": 0.69674158, "learning_rate": 3.5536749109100808e-06, "loss": 0.71859258, "num_input_tokens_seen": 86064355, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.9296875, "step": 3997, "time_per_iteration": 2.4571852684020996 }, { "auxiliary_loss_clip": 0.01136713, "auxiliary_loss_mlp": 0.01040637, "balance_loss_clip": 1.02419829, "balance_loss_mlp": 1.04811811, "epoch": 0.2403727641665414, "flos": 20886256920960.0, "grad_norm": 1.8165339035744112, "language_loss": 0.87067366, "learning_rate": 3.5534296369760535e-06, "loss": 0.89244711, "num_input_tokens_seen": 86081340, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.88671875, "step": 3998, "time_per_iteration": 2.4480056762695312 }, { "auxiliary_loss_clip": 0.01139987, "auxiliary_loss_mlp": 0.01037591, "balance_loss_clip": 1.02144969, "balance_loss_mlp": 1.04512143, "epoch": 0.24043288741920937, "flos": 22820046451200.0, "grad_norm": 2.625815702329025, "language_loss": 0.75828671, "learning_rate": 3.5531843041358183e-06, "loss": 0.78006244, "num_input_tokens_seen": 86102260, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.94921875, "step": 3999, "time_per_iteration": 2.4692041873931885 }, { "auxiliary_loss_clip": 0.01137577, "auxiliary_loss_mlp": 0.01037018, "balance_loss_clip": 1.02138984, "balance_loss_mlp": 1.04904616, "epoch": 0.24049301067187734, "flos": 27959652823680.0, "grad_norm": 2.506204996050225, "language_loss": 0.72493315, "learning_rate": 3.552938912398679e-06, "loss": 0.74667907, "num_input_tokens_seen": 86123400, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8828125, "step": 4000, "time_per_iteration": 2.509265184402466 }, { "auxiliary_loss_clip": 0.01144377, "auxiliary_loss_mlp": 0.01042367, "balance_loss_clip": 1.02560663, "balance_loss_mlp": 1.05054688, "epoch": 0.24055313392454533, "flos": 27451409333760.0, "grad_norm": 2.484234752238978, "language_loss": 0.66680908, "learning_rate": 3.5526934617739397e-06, "loss": 0.68867654, "num_input_tokens_seen": 86144060, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.94140625, "step": 4001, "time_per_iteration": 2.5130362510681152 }, { "auxiliary_loss_clip": 0.01138359, "auxiliary_loss_mlp": 0.01041049, "balance_loss_clip": 1.02358437, "balance_loss_mlp": 1.04753757, "epoch": 0.2406132571772133, "flos": 25556618995200.0, "grad_norm": 2.4164660077841673, "language_loss": 0.83005917, "learning_rate": 3.5524479522709095e-06, "loss": 0.85185319, "num_input_tokens_seen": 86163005, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.90625, "step": 4002, "time_per_iteration": 2.4853460788726807 }, { "auxiliary_loss_clip": 0.01140245, "auxiliary_loss_mlp": 0.01043302, "balance_loss_clip": 1.02706563, "balance_loss_mlp": 1.05009103, "epoch": 0.24067338042988126, "flos": 24791398629120.0, "grad_norm": 1.8855679377089574, "language_loss": 0.83113748, "learning_rate": 3.552202383898897e-06, "loss": 0.85297292, "num_input_tokens_seen": 86182580, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.90234375, "step": 4003, "time_per_iteration": 2.481126546859741 }, { "auxiliary_loss_clip": 0.01139379, "auxiliary_loss_mlp": 0.01038338, "balance_loss_clip": 1.02150559, "balance_loss_mlp": 1.04859936, "epoch": 0.24073350368254923, "flos": 21177923356800.0, "grad_norm": 2.0157681374715373, "language_loss": 0.87196654, "learning_rate": 3.551956756667215e-06, "loss": 0.89374363, "num_input_tokens_seen": 86200665, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.90625, "step": 4004, "time_per_iteration": 2.4361188411712646 }, { "auxiliary_loss_clip": 0.01140697, "auxiliary_loss_mlp": 0.01050107, "balance_loss_clip": 1.0338707, "balance_loss_mlp": 1.04746139, "epoch": 0.2407936269352172, "flos": 22494300986880.0, "grad_norm": 1.9151056220164917, "language_loss": 0.77814651, "learning_rate": 3.551711070585177e-06, "loss": 0.80005455, "num_input_tokens_seen": 86221640, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.93359375, "step": 4005, "time_per_iteration": 2.520220994949341 }, { "auxiliary_loss_clip": 0.01134992, "auxiliary_loss_mlp": 0.0104128, "balance_loss_clip": 1.02504337, "balance_loss_mlp": 1.0473398, "epoch": 0.24085375018788516, "flos": 18551129754240.0, "grad_norm": 1.5859267998666626, "language_loss": 0.79048336, "learning_rate": 3.5514653256620995e-06, "loss": 0.81224608, "num_input_tokens_seen": 86240795, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.875, "step": 4006, "time_per_iteration": 2.4443840980529785 }, { "auxiliary_loss_clip": 0.01142805, "auxiliary_loss_mlp": 0.01042127, "balance_loss_clip": 1.02378106, "balance_loss_mlp": 1.04730976, "epoch": 0.24091387344055312, "flos": 24170539023360.0, "grad_norm": 1.7717677634358056, "language_loss": 0.71664584, "learning_rate": 3.551219521907302e-06, "loss": 0.73849511, "num_input_tokens_seen": 86262000, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.953125, "step": 4007, "time_per_iteration": 2.5071864128112793 }, { "auxiliary_loss_clip": 0.01136435, "auxiliary_loss_mlp": 0.01046941, "balance_loss_clip": 1.03124166, "balance_loss_mlp": 1.04801679, "epoch": 0.24097399669322112, "flos": 11036319615360.0, "grad_norm": 1.7214033724700089, "language_loss": 0.75532359, "learning_rate": 3.5509736593301042e-06, "loss": 0.77715731, "num_input_tokens_seen": 86279680, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8828125, "step": 4008, "time_per_iteration": 2.4445674419403076 }, { "auxiliary_loss_clip": 0.0113846, "auxiliary_loss_mlp": 0.01036544, "balance_loss_clip": 1.01994967, "balance_loss_mlp": 1.04816914, "epoch": 0.24103411994588908, "flos": 17165085696000.0, "grad_norm": 2.3031269236369405, "language_loss": 0.74468368, "learning_rate": 3.5507277379398295e-06, "loss": 0.76643372, "num_input_tokens_seen": 86297180, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.90234375, "step": 4009, "time_per_iteration": 3.9632134437561035 }, { "auxiliary_loss_clip": 0.01139336, "auxiliary_loss_mlp": 0.01046905, "balance_loss_clip": 1.03094244, "balance_loss_mlp": 1.05020785, "epoch": 0.24109424319855705, "flos": 20667956014080.0, "grad_norm": 1.6979809062265927, "language_loss": 0.8008495, "learning_rate": 3.550481757745804e-06, "loss": 0.82271194, "num_input_tokens_seen": 86317660, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.890625, "step": 4010, "time_per_iteration": 2.5193350315093994 }, { "auxiliary_loss_clip": 0.01138566, "auxiliary_loss_mlp": 0.01049528, "balance_loss_clip": 1.03083563, "balance_loss_mlp": 1.0458076, "epoch": 0.241154366451225, "flos": 28181796485760.0, "grad_norm": 2.071542368828239, "language_loss": 0.70992875, "learning_rate": 3.5502357187573555e-06, "loss": 0.73180968, "num_input_tokens_seen": 86338325, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.9296875, "step": 4011, "time_per_iteration": 3.890289068222046 }, { "auxiliary_loss_clip": 0.01136381, "auxiliary_loss_mlp": 0.01037187, "balance_loss_clip": 1.02118874, "balance_loss_mlp": 1.04618669, "epoch": 0.24121448970389298, "flos": 21689722293120.0, "grad_norm": 1.5833672057936394, "language_loss": 0.68968737, "learning_rate": 3.5499896209838118e-06, "loss": 0.71142304, "num_input_tokens_seen": 86357615, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.90234375, "step": 4012, "time_per_iteration": 2.47371506690979 }, { "auxiliary_loss_clip": 0.01141271, "auxiliary_loss_mlp": 0.0104084, "balance_loss_clip": 1.02229118, "balance_loss_mlp": 1.04912853, "epoch": 0.24127461295656094, "flos": 39676191269760.0, "grad_norm": 1.57219625585838, "language_loss": 0.73478413, "learning_rate": 3.5497434644345073e-06, "loss": 0.75660521, "num_input_tokens_seen": 86380355, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.921875, "step": 4013, "time_per_iteration": 4.027489900588989 }, { "auxiliary_loss_clip": 0.01138881, "auxiliary_loss_mlp": 0.0103808, "balance_loss_clip": 1.02236795, "balance_loss_mlp": 1.04773545, "epoch": 0.2413347362092289, "flos": 19135863256320.0, "grad_norm": 1.8922714284321955, "language_loss": 0.88267845, "learning_rate": 3.5494972491187753e-06, "loss": 0.90444803, "num_input_tokens_seen": 86399125, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.91015625, "step": 4014, "time_per_iteration": 3.910104513168335 }, { "auxiliary_loss_clip": 0.01143656, "auxiliary_loss_mlp": 0.01041916, "balance_loss_clip": 1.02455962, "balance_loss_mlp": 1.04959679, "epoch": 0.2413948594618969, "flos": 26939430829440.0, "grad_norm": 2.0787102611897463, "language_loss": 0.94615471, "learning_rate": 3.549250975045952e-06, "loss": 0.96801043, "num_input_tokens_seen": 86418625, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.9375, "step": 4015, "time_per_iteration": 2.5011236667633057 }, { "auxiliary_loss_clip": 0.01138793, "auxiliary_loss_mlp": 0.01037079, "balance_loss_clip": 1.0203774, "balance_loss_mlp": 1.04723287, "epoch": 0.24145498271456486, "flos": 25228108183680.0, "grad_norm": 1.7187333379497716, "language_loss": 0.82547224, "learning_rate": 3.5490046422253768e-06, "loss": 0.84723091, "num_input_tokens_seen": 86438375, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.9140625, "step": 4016, "time_per_iteration": 2.512253761291504 }, { "auxiliary_loss_clip": 0.01133672, "auxiliary_loss_mlp": 0.01039702, "balance_loss_clip": 1.02373946, "balance_loss_mlp": 1.04827762, "epoch": 0.24151510596723283, "flos": 40661759617920.0, "grad_norm": 2.3675526535557436, "language_loss": 0.69431508, "learning_rate": 3.54875825066639e-06, "loss": 0.71604884, "num_input_tokens_seen": 86463230, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8515625, "step": 4017, "time_per_iteration": 2.6196069717407227 }, { "auxiliary_loss_clip": 0.01142133, "auxiliary_loss_mlp": 0.01044605, "balance_loss_clip": 1.02743936, "balance_loss_mlp": 1.04939866, "epoch": 0.2415752292199008, "flos": 18146667634560.0, "grad_norm": 2.0168052506565792, "language_loss": 0.84802401, "learning_rate": 3.5485118003783353e-06, "loss": 0.86989141, "num_input_tokens_seen": 86481230, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9296875, "step": 4018, "time_per_iteration": 2.4456775188446045 }, { "auxiliary_loss_clip": 0.0106632, "auxiliary_loss_mlp": 0.01013281, "balance_loss_clip": 1.01001501, "balance_loss_mlp": 1.03343785, "epoch": 0.24163535247256876, "flos": 67288409792640.0, "grad_norm": 0.8663931180035782, "language_loss": 0.60664678, "learning_rate": 3.548265291370558e-06, "loss": 0.62744284, "num_input_tokens_seen": 86541260, "router_z_loss_clip": 0.03271484, "router_z_loss_mlp": 0.328125, "step": 4019, "time_per_iteration": 3.155238389968872 }, { "auxiliary_loss_clip": 0.01137867, "auxiliary_loss_mlp": 0.01035443, "balance_loss_clip": 1.0200417, "balance_loss_mlp": 1.04750574, "epoch": 0.24169547572523672, "flos": 24929941386240.0, "grad_norm": 2.359788645487031, "language_loss": 0.73339701, "learning_rate": 3.5480187236524055e-06, "loss": 0.75513005, "num_input_tokens_seen": 86559580, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.90234375, "step": 4020, "time_per_iteration": 2.502662420272827 }, { "auxiliary_loss_clip": 0.01139294, "auxiliary_loss_mlp": 0.01039899, "balance_loss_clip": 1.02373433, "balance_loss_mlp": 1.05080891, "epoch": 0.24175559897790472, "flos": 18728312567040.0, "grad_norm": 1.9768458246863325, "language_loss": 0.81792641, "learning_rate": 3.5477720972332285e-06, "loss": 0.83971834, "num_input_tokens_seen": 86577560, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8828125, "step": 4021, "time_per_iteration": 2.4518306255340576 }, { "auxiliary_loss_clip": 0.01143618, "auxiliary_loss_mlp": 0.01048488, "balance_loss_clip": 1.030774, "balance_loss_mlp": 1.05078673, "epoch": 0.24181572223057268, "flos": 23039281111680.0, "grad_norm": 2.05570486676068, "language_loss": 0.7639665, "learning_rate": 3.547525412122378e-06, "loss": 0.7858876, "num_input_tokens_seen": 86595350, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.92578125, "step": 4022, "time_per_iteration": 2.4873530864715576 }, { "auxiliary_loss_clip": 0.01144153, "auxiliary_loss_mlp": 0.01046434, "balance_loss_clip": 1.02939904, "balance_loss_mlp": 1.04968894, "epoch": 0.24187584548324065, "flos": 20376145923840.0, "grad_norm": 1.9875679826483217, "language_loss": 0.75400615, "learning_rate": 3.5472786683292083e-06, "loss": 0.77591205, "num_input_tokens_seen": 86614805, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9453125, "step": 4023, "time_per_iteration": 2.453672409057617 }, { "auxiliary_loss_clip": 0.01140754, "auxiliary_loss_mlp": 0.01046744, "balance_loss_clip": 1.03121066, "balance_loss_mlp": 1.05193782, "epoch": 0.2419359687359086, "flos": 21397517153280.0, "grad_norm": 2.114505645837123, "language_loss": 0.82224846, "learning_rate": 3.5470318658630766e-06, "loss": 0.84412348, "num_input_tokens_seen": 86633700, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.890625, "step": 4024, "time_per_iteration": 2.497727632522583 }, { "auxiliary_loss_clip": 0.01137683, "auxiliary_loss_mlp": 0.01048786, "balance_loss_clip": 1.03219187, "balance_loss_mlp": 1.04944491, "epoch": 0.24199609198857658, "flos": 18369385914240.0, "grad_norm": 1.7496607399330797, "language_loss": 0.85934615, "learning_rate": 3.5467850047333424e-06, "loss": 0.88121086, "num_input_tokens_seen": 86650905, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8828125, "step": 4025, "time_per_iteration": 2.443627119064331 }, { "auxiliary_loss_clip": 0.01142177, "auxiliary_loss_mlp": 0.01052093, "balance_loss_clip": 1.03566551, "balance_loss_mlp": 1.04909801, "epoch": 0.24205621524124454, "flos": 19463871277440.0, "grad_norm": 1.8229256029068253, "language_loss": 0.7112931, "learning_rate": 3.546538084949365e-06, "loss": 0.73323584, "num_input_tokens_seen": 86669185, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9296875, "step": 4026, "time_per_iteration": 2.46136474609375 }, { "auxiliary_loss_clip": 0.01137134, "auxiliary_loss_mlp": 0.01043838, "balance_loss_clip": 1.02887678, "balance_loss_mlp": 1.04906118, "epoch": 0.2421163384939125, "flos": 14976330451200.0, "grad_norm": 1.7495904446395052, "language_loss": 0.64138669, "learning_rate": 3.546291106520509e-06, "loss": 0.66319644, "num_input_tokens_seen": 86686805, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.87890625, "step": 4027, "time_per_iteration": 2.426593065261841 }, { "auxiliary_loss_clip": 0.01142523, "auxiliary_loss_mlp": 0.0104826, "balance_loss_clip": 1.03343022, "balance_loss_mlp": 1.05068684, "epoch": 0.2421764617465805, "flos": 18662057930880.0, "grad_norm": 2.360022711660216, "language_loss": 0.70774424, "learning_rate": 3.5460440694561388e-06, "loss": 0.72965205, "num_input_tokens_seen": 86705520, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.921875, "step": 4028, "time_per_iteration": 2.452448844909668 }, { "auxiliary_loss_clip": 0.01059892, "auxiliary_loss_mlp": 0.01051486, "balance_loss_clip": 1.04861355, "balance_loss_mlp": 1.02817714, "epoch": 0.24223658499924847, "flos": 64347327164160.0, "grad_norm": 0.878518679788975, "language_loss": 0.55416369, "learning_rate": 3.545796973765623e-06, "loss": 0.57527745, "num_input_tokens_seen": 86767320, "router_z_loss_clip": 0.02868652, "router_z_loss_mlp": 0.31640625, "step": 4029, "time_per_iteration": 3.0769143104553223 }, { "auxiliary_loss_clip": 0.0114017, "auxiliary_loss_mlp": 0.01040281, "balance_loss_clip": 1.02331734, "balance_loss_mlp": 1.04940057, "epoch": 0.24229670825191643, "flos": 25775243124480.0, "grad_norm": 1.809499603653329, "language_loss": 0.7423811, "learning_rate": 3.54554981945833e-06, "loss": 0.76418555, "num_input_tokens_seen": 86788110, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.90625, "step": 4030, "time_per_iteration": 2.5213863849639893 }, { "auxiliary_loss_clip": 0.01139374, "auxiliary_loss_mlp": 0.01049528, "balance_loss_clip": 1.03262448, "balance_loss_mlp": 1.04818726, "epoch": 0.2423568315045844, "flos": 20667094087680.0, "grad_norm": 1.9114087950746785, "language_loss": 0.76333964, "learning_rate": 3.5453026065436343e-06, "loss": 0.78522861, "num_input_tokens_seen": 86807640, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.91015625, "step": 4031, "time_per_iteration": 2.4732439517974854 }, { "auxiliary_loss_clip": 0.01142659, "auxiliary_loss_mlp": 0.01042421, "balance_loss_clip": 1.02589858, "balance_loss_mlp": 1.04864502, "epoch": 0.24241695475725236, "flos": 22416805393920.0, "grad_norm": 9.864055672115125, "language_loss": 0.65701401, "learning_rate": 3.5450553350309083e-06, "loss": 0.67886484, "num_input_tokens_seen": 86826795, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.94140625, "step": 4032, "time_per_iteration": 2.4698197841644287 }, { "auxiliary_loss_clip": 0.01136217, "auxiliary_loss_mlp": 0.01043106, "balance_loss_clip": 1.0269177, "balance_loss_mlp": 1.04686618, "epoch": 0.24247707800992033, "flos": 17128995505920.0, "grad_norm": 1.9823951926086707, "language_loss": 0.8145631, "learning_rate": 3.5448080049295286e-06, "loss": 0.83635634, "num_input_tokens_seen": 86843175, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.89453125, "step": 4033, "time_per_iteration": 2.4242241382598877 }, { "auxiliary_loss_clip": 0.01134762, "auxiliary_loss_mlp": 0.01037914, "balance_loss_clip": 1.02196956, "balance_loss_mlp": 1.04680347, "epoch": 0.2425372012625883, "flos": 31613743399680.0, "grad_norm": 2.059133417146216, "language_loss": 0.69210446, "learning_rate": 3.5445606162488754e-06, "loss": 0.71383119, "num_input_tokens_seen": 86863185, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.875, "step": 4034, "time_per_iteration": 2.554236888885498 }, { "auxiliary_loss_clip": 0.01137766, "auxiliary_loss_mlp": 0.01038168, "balance_loss_clip": 1.02087092, "balance_loss_mlp": 1.0473752, "epoch": 0.24259732451525629, "flos": 16326032924160.0, "grad_norm": 2.1118081569800844, "language_loss": 0.96589363, "learning_rate": 3.5443131689983283e-06, "loss": 0.9876529, "num_input_tokens_seen": 86880040, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.90625, "step": 4035, "time_per_iteration": 2.4275107383728027 }, { "auxiliary_loss_clip": 0.01131902, "auxiliary_loss_mlp": 0.01047797, "balance_loss_clip": 1.03286576, "balance_loss_mlp": 1.04572725, "epoch": 0.24265744776792425, "flos": 22856639431680.0, "grad_norm": 1.6439710611721405, "language_loss": 0.77736443, "learning_rate": 3.5440656631872715e-06, "loss": 0.79916137, "num_input_tokens_seen": 86900610, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.86328125, "step": 4036, "time_per_iteration": 2.4847378730773926 }, { "auxiliary_loss_clip": 0.01139944, "auxiliary_loss_mlp": 0.01043557, "balance_loss_clip": 1.02678394, "balance_loss_mlp": 1.04948533, "epoch": 0.24271757102059222, "flos": 21871573873920.0, "grad_norm": 1.6469766585788734, "language_loss": 0.74463308, "learning_rate": 3.5438180988250898e-06, "loss": 0.76646805, "num_input_tokens_seen": 86919385, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.90234375, "step": 4037, "time_per_iteration": 2.451775074005127 }, { "auxiliary_loss_clip": 0.01139752, "auxiliary_loss_mlp": 0.0104062, "balance_loss_clip": 1.0238955, "balance_loss_mlp": 1.04858065, "epoch": 0.24277769427326018, "flos": 19208582340480.0, "grad_norm": 2.246972033295773, "language_loss": 0.76664799, "learning_rate": 3.543570475921171e-06, "loss": 0.78845167, "num_input_tokens_seen": 86938885, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.9140625, "step": 4038, "time_per_iteration": 2.4636266231536865 }, { "auxiliary_loss_clip": 0.01138436, "auxiliary_loss_mlp": 0.01045143, "balance_loss_clip": 1.02794147, "balance_loss_mlp": 1.0478878, "epoch": 0.24283781752592815, "flos": 19499889640320.0, "grad_norm": 2.6188263338597744, "language_loss": 0.72135538, "learning_rate": 3.543322794484905e-06, "loss": 0.74319118, "num_input_tokens_seen": 86957705, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.90625, "step": 4039, "time_per_iteration": 2.4310665130615234 }, { "auxiliary_loss_clip": 0.01136141, "auxiliary_loss_mlp": 0.0104262, "balance_loss_clip": 1.02613306, "balance_loss_mlp": 1.04678738, "epoch": 0.2428979407785961, "flos": 19902196944000.0, "grad_norm": 1.7444580805893588, "language_loss": 0.78360045, "learning_rate": 3.5430750545256843e-06, "loss": 0.80538797, "num_input_tokens_seen": 86975845, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.890625, "step": 4040, "time_per_iteration": 2.4592251777648926 }, { "auxiliary_loss_clip": 0.01131997, "auxiliary_loss_mlp": 0.01037372, "balance_loss_clip": 1.02260792, "balance_loss_mlp": 1.04556763, "epoch": 0.2429580640312641, "flos": 24715878284160.0, "grad_norm": 2.6310216486297535, "language_loss": 0.80486047, "learning_rate": 3.5428272560529027e-06, "loss": 0.82655418, "num_input_tokens_seen": 86994800, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.86328125, "step": 4041, "time_per_iteration": 2.476301670074463 }, { "auxiliary_loss_clip": 0.01136728, "auxiliary_loss_mlp": 0.01040752, "balance_loss_clip": 1.02504027, "balance_loss_mlp": 1.04810846, "epoch": 0.24301818728393207, "flos": 25630343660160.0, "grad_norm": 2.282259658551061, "language_loss": 0.76753813, "learning_rate": 3.542579399075957e-06, "loss": 0.78931296, "num_input_tokens_seen": 87016845, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.88671875, "step": 4042, "time_per_iteration": 2.5149056911468506 }, { "auxiliary_loss_clip": 0.01133798, "auxiliary_loss_mlp": 0.01036918, "balance_loss_clip": 1.02257752, "balance_loss_mlp": 1.0466764, "epoch": 0.24307831053660003, "flos": 26141388410880.0, "grad_norm": 1.7637867610036688, "language_loss": 0.8133074, "learning_rate": 3.542331483604246e-06, "loss": 0.83501458, "num_input_tokens_seen": 87036270, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.87109375, "step": 4043, "time_per_iteration": 2.4873154163360596 }, { "auxiliary_loss_clip": 0.01137191, "auxiliary_loss_mlp": 0.01040037, "balance_loss_clip": 1.0231216, "balance_loss_mlp": 1.04432464, "epoch": 0.243138433789268, "flos": 14972415868800.0, "grad_norm": 2.1509043816604785, "language_loss": 0.73233163, "learning_rate": 3.5420835096471706e-06, "loss": 0.7541039, "num_input_tokens_seen": 87049920, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.9296875, "step": 4044, "time_per_iteration": 2.4235780239105225 }, { "auxiliary_loss_clip": 0.01140309, "auxiliary_loss_mlp": 0.01038501, "balance_loss_clip": 1.02231216, "balance_loss_mlp": 1.05030537, "epoch": 0.24319855704193596, "flos": 25191694771200.0, "grad_norm": 2.8451621229440485, "language_loss": 0.83558333, "learning_rate": 3.5418354772141337e-06, "loss": 0.85737145, "num_input_tokens_seen": 87068230, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8984375, "step": 4045, "time_per_iteration": 2.4864866733551025 }, { "auxiliary_loss_clip": 0.01138557, "auxiliary_loss_mlp": 0.01041362, "balance_loss_clip": 1.0253284, "balance_loss_mlp": 1.04856563, "epoch": 0.24325868029460393, "flos": 22127221946880.0, "grad_norm": 1.821537583769903, "language_loss": 0.86747873, "learning_rate": 3.541587386314541e-06, "loss": 0.88927794, "num_input_tokens_seen": 87086435, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8984375, "step": 4046, "time_per_iteration": 2.486362934112549 }, { "auxiliary_loss_clip": 0.01132203, "auxiliary_loss_mlp": 0.0103968, "balance_loss_clip": 1.02346706, "balance_loss_mlp": 1.04492712, "epoch": 0.2433188035472719, "flos": 23582106420480.0, "grad_norm": 1.9192628022174583, "language_loss": 0.72702831, "learning_rate": 3.5413392369578e-06, "loss": 0.74874711, "num_input_tokens_seen": 87105340, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.87109375, "step": 4047, "time_per_iteration": 2.4669559001922607 }, { "auxiliary_loss_clip": 0.01135916, "auxiliary_loss_mlp": 0.01041457, "balance_loss_clip": 1.0247556, "balance_loss_mlp": 1.0457046, "epoch": 0.2433789267999399, "flos": 24462815990400.0, "grad_norm": 5.697449461316768, "language_loss": 0.73322058, "learning_rate": 3.5410910291533213e-06, "loss": 0.75499427, "num_input_tokens_seen": 87125780, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.90234375, "step": 4048, "time_per_iteration": 2.53613543510437 }, { "auxiliary_loss_clip": 0.01135602, "auxiliary_loss_mlp": 0.01041108, "balance_loss_clip": 1.02532506, "balance_loss_mlp": 1.04708469, "epoch": 0.24343905005260785, "flos": 16727909264640.0, "grad_norm": 2.6248426886139065, "language_loss": 0.72820157, "learning_rate": 3.5408427629105155e-06, "loss": 0.74996865, "num_input_tokens_seen": 87144470, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.88671875, "step": 4049, "time_per_iteration": 2.4777519702911377 }, { "auxiliary_loss_clip": 0.01132618, "auxiliary_loss_mlp": 0.0103845, "balance_loss_clip": 1.02309632, "balance_loss_mlp": 1.04463613, "epoch": 0.24349917330527582, "flos": 20043756443520.0, "grad_norm": 1.6276326373015744, "language_loss": 0.73530889, "learning_rate": 3.5405944382387985e-06, "loss": 0.75701952, "num_input_tokens_seen": 87162830, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.87890625, "step": 4050, "time_per_iteration": 2.459789276123047 }, { "auxiliary_loss_clip": 0.0113232, "auxiliary_loss_mlp": 0.01039294, "balance_loss_clip": 1.02421427, "balance_loss_mlp": 1.04559028, "epoch": 0.24355929655794378, "flos": 17420554200960.0, "grad_norm": 2.1705735361507257, "language_loss": 0.75452745, "learning_rate": 3.5403460551475854e-06, "loss": 0.77624363, "num_input_tokens_seen": 87180905, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8671875, "step": 4051, "time_per_iteration": 3.895613193511963 }, { "auxiliary_loss_clip": 0.01132301, "auxiliary_loss_mlp": 0.01037272, "balance_loss_clip": 1.02159572, "balance_loss_mlp": 1.04424596, "epoch": 0.24361941981061175, "flos": 25410929431680.0, "grad_norm": 3.0676387916664503, "language_loss": 0.7027396, "learning_rate": 3.540097613646296e-06, "loss": 0.72443533, "num_input_tokens_seen": 87202290, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8828125, "step": 4052, "time_per_iteration": 2.4962563514709473 }, { "auxiliary_loss_clip": 0.01136615, "auxiliary_loss_mlp": 0.01046365, "balance_loss_clip": 1.03015232, "balance_loss_mlp": 1.04705977, "epoch": 0.2436795430632797, "flos": 22820800636800.0, "grad_norm": 1.6534481317750906, "language_loss": 0.81419563, "learning_rate": 3.539849113744351e-06, "loss": 0.83602548, "num_input_tokens_seen": 87221650, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.89453125, "step": 4053, "time_per_iteration": 3.848836898803711 }, { "auxiliary_loss_clip": 0.01140763, "auxiliary_loss_mlp": 0.01034143, "balance_loss_clip": 1.01781154, "balance_loss_mlp": 1.04834402, "epoch": 0.2437396663159477, "flos": 15157786982400.0, "grad_norm": 1.4912288182510745, "language_loss": 0.77769941, "learning_rate": 3.539600555451172e-06, "loss": 0.79944849, "num_input_tokens_seen": 87238515, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.92578125, "step": 4054, "time_per_iteration": 2.4377007484436035 }, { "auxiliary_loss_clip": 0.01134458, "auxiliary_loss_mlp": 0.01043352, "balance_loss_clip": 1.02814126, "balance_loss_mlp": 1.04503155, "epoch": 0.24379978956861567, "flos": 22091131756800.0, "grad_norm": 1.9397013614202052, "language_loss": 0.83960235, "learning_rate": 3.5393519387761866e-06, "loss": 0.86138046, "num_input_tokens_seen": 87256290, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.890625, "step": 4055, "time_per_iteration": 3.852353811264038 }, { "auxiliary_loss_clip": 0.01141319, "auxiliary_loss_mlp": 0.0104162, "balance_loss_clip": 1.02453756, "balance_loss_mlp": 1.04580224, "epoch": 0.24385991282128364, "flos": 31467766527360.0, "grad_norm": 4.4800963952960675, "language_loss": 0.5493229, "learning_rate": 3.5391032637288217e-06, "loss": 0.57115227, "num_input_tokens_seen": 87277085, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.95703125, "step": 4056, "time_per_iteration": 3.9575254917144775 }, { "auxiliary_loss_clip": 0.011397, "auxiliary_loss_mlp": 0.01043652, "balance_loss_clip": 1.02685571, "balance_loss_mlp": 1.04760122, "epoch": 0.2439200360739516, "flos": 23838795987840.0, "grad_norm": 2.459315683197236, "language_loss": 0.80207765, "learning_rate": 3.538854530318506e-06, "loss": 0.82391113, "num_input_tokens_seen": 87293020, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.921875, "step": 4057, "time_per_iteration": 2.4474027156829834 }, { "auxiliary_loss_clip": 0.01135572, "auxiliary_loss_mlp": 0.01041529, "balance_loss_clip": 1.02579367, "balance_loss_mlp": 1.04702687, "epoch": 0.24398015932661957, "flos": 19169978198400.0, "grad_norm": 1.8630746170736825, "language_loss": 0.79213774, "learning_rate": 3.538605738554673e-06, "loss": 0.81390882, "num_input_tokens_seen": 87311445, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8828125, "step": 4058, "time_per_iteration": 2.4544289112091064 }, { "auxiliary_loss_clip": 0.01142285, "auxiliary_loss_mlp": 0.01041733, "balance_loss_clip": 1.02615285, "balance_loss_mlp": 1.04742622, "epoch": 0.24404028257928753, "flos": 25262474520960.0, "grad_norm": 1.600095120395256, "language_loss": 0.85666716, "learning_rate": 3.538356888446756e-06, "loss": 0.87850738, "num_input_tokens_seen": 87332055, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.94921875, "step": 4059, "time_per_iteration": 2.490020513534546 }, { "auxiliary_loss_clip": 0.01134348, "auxiliary_loss_mlp": 0.01034394, "balance_loss_clip": 1.01945758, "balance_loss_mlp": 1.04674125, "epoch": 0.2441004058319555, "flos": 26467600752000.0, "grad_norm": 1.8892113063149738, "language_loss": 0.74171263, "learning_rate": 3.5381079800041913e-06, "loss": 0.76340008, "num_input_tokens_seen": 87351295, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.875, "step": 4060, "time_per_iteration": 2.532970905303955 }, { "auxiliary_loss_clip": 0.01144365, "auxiliary_loss_mlp": 0.01051296, "balance_loss_clip": 1.03244948, "balance_loss_mlp": 1.04972601, "epoch": 0.2441605290846235, "flos": 26760524163840.0, "grad_norm": 1.9407742695780077, "language_loss": 0.73363197, "learning_rate": 3.5378590132364182e-06, "loss": 0.75558859, "num_input_tokens_seen": 87370650, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.9453125, "step": 4061, "time_per_iteration": 2.502943992614746 }, { "auxiliary_loss_clip": 0.01137266, "auxiliary_loss_mlp": 0.01039947, "balance_loss_clip": 1.02527285, "balance_loss_mlp": 1.04880667, "epoch": 0.24422065233729146, "flos": 21105850717440.0, "grad_norm": 1.6194004129683834, "language_loss": 0.76063454, "learning_rate": 3.5376099881528768e-06, "loss": 0.78240669, "num_input_tokens_seen": 87389020, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.88671875, "step": 4062, "time_per_iteration": 2.469365358352661 }, { "auxiliary_loss_clip": 0.01134905, "auxiliary_loss_mlp": 0.01037331, "balance_loss_clip": 1.02123773, "balance_loss_mlp": 1.04885554, "epoch": 0.24428077558995942, "flos": 25263156879360.0, "grad_norm": 1.6730340506265031, "language_loss": 0.84970325, "learning_rate": 3.537360904763011e-06, "loss": 0.87142563, "num_input_tokens_seen": 87409695, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.859375, "step": 4063, "time_per_iteration": 2.506002187728882 }, { "auxiliary_loss_clip": 0.01142222, "auxiliary_loss_mlp": 0.01038237, "balance_loss_clip": 1.02116656, "balance_loss_mlp": 1.04870045, "epoch": 0.24434089884262739, "flos": 20485278420480.0, "grad_norm": 2.66518728199237, "language_loss": 0.68352938, "learning_rate": 3.5371117630762656e-06, "loss": 0.70533395, "num_input_tokens_seen": 87428250, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.93359375, "step": 4064, "time_per_iteration": 2.470822334289551 }, { "auxiliary_loss_clip": 0.01143421, "auxiliary_loss_mlp": 0.01041958, "balance_loss_clip": 1.02525663, "balance_loss_mlp": 1.04944158, "epoch": 0.24440102209529535, "flos": 23621895711360.0, "grad_norm": 1.5454309516118314, "language_loss": 0.69904846, "learning_rate": 3.536862563102088e-06, "loss": 0.7209022, "num_input_tokens_seen": 87449380, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.94140625, "step": 4065, "time_per_iteration": 2.484794855117798 }, { "auxiliary_loss_clip": 0.01142463, "auxiliary_loss_mlp": 0.01051585, "balance_loss_clip": 1.03255892, "balance_loss_mlp": 1.04860425, "epoch": 0.24446114534796332, "flos": 20554729367040.0, "grad_norm": 1.7802221069446296, "language_loss": 0.84472269, "learning_rate": 3.5366133048499282e-06, "loss": 0.86666316, "num_input_tokens_seen": 87465365, "router_z_loss_clip": 0.19042969, "router_z_loss_mlp": 0.9375, "step": 4066, "time_per_iteration": 2.4629104137420654 }, { "auxiliary_loss_clip": 0.0106767, "auxiliary_loss_mlp": 0.01021871, "balance_loss_clip": 1.0191052, "balance_loss_mlp": 1.03568184, "epoch": 0.24452126860063128, "flos": 60389575009920.0, "grad_norm": 0.7446047766543488, "language_loss": 0.52333724, "learning_rate": 3.5363639883292374e-06, "loss": 0.54423261, "num_input_tokens_seen": 87522525, "router_z_loss_clip": 0.02770996, "router_z_loss_mlp": 0.3203125, "step": 4067, "time_per_iteration": 2.985053062438965 }, { "auxiliary_loss_clip": 0.01142297, "auxiliary_loss_mlp": 0.01044452, "balance_loss_clip": 1.02744675, "balance_loss_mlp": 1.04918861, "epoch": 0.24458139185329927, "flos": 15121660878720.0, "grad_norm": 34.38098016696533, "language_loss": 0.72736186, "learning_rate": 3.5361146135494706e-06, "loss": 0.74922931, "num_input_tokens_seen": 87539170, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.9296875, "step": 4068, "time_per_iteration": 2.4414100646972656 }, { "auxiliary_loss_clip": 0.01141237, "auxiliary_loss_mlp": 0.01041396, "balance_loss_clip": 1.02462339, "balance_loss_mlp": 1.05134571, "epoch": 0.24464151510596724, "flos": 27998723842560.0, "grad_norm": 1.43210436677185, "language_loss": 0.77544004, "learning_rate": 3.5358651805200835e-06, "loss": 0.79726636, "num_input_tokens_seen": 87558875, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8984375, "step": 4069, "time_per_iteration": 2.512798547744751 }, { "auxiliary_loss_clip": 0.01141182, "auxiliary_loss_mlp": 0.01048425, "balance_loss_clip": 1.03112769, "balance_loss_mlp": 1.05193329, "epoch": 0.2447016383586352, "flos": 19792884879360.0, "grad_norm": 1.8226621590424263, "language_loss": 0.80272418, "learning_rate": 3.5356156892505347e-06, "loss": 0.82462025, "num_input_tokens_seen": 87576485, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.890625, "step": 4070, "time_per_iteration": 2.469149112701416 }, { "auxiliary_loss_clip": 0.01140234, "auxiliary_loss_mlp": 0.01046835, "balance_loss_clip": 1.03121865, "balance_loss_mlp": 1.04959106, "epoch": 0.24476176161130317, "flos": 26067340523520.0, "grad_norm": 1.4300526426197795, "language_loss": 0.84138709, "learning_rate": 3.5353661397502854e-06, "loss": 0.86325777, "num_input_tokens_seen": 87598620, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.90625, "step": 4071, "time_per_iteration": 2.508418321609497 }, { "auxiliary_loss_clip": 0.01143825, "auxiliary_loss_mlp": 0.01046741, "balance_loss_clip": 1.02846646, "balance_loss_mlp": 1.04827952, "epoch": 0.24482188486397113, "flos": 18843550375680.0, "grad_norm": 1.8898736937091074, "language_loss": 0.79955626, "learning_rate": 3.535116532028798e-06, "loss": 0.82146192, "num_input_tokens_seen": 87616595, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.953125, "step": 4072, "time_per_iteration": 2.4743151664733887 }, { "auxiliary_loss_clip": 0.0113802, "auxiliary_loss_mlp": 0.01045417, "balance_loss_clip": 1.0299561, "balance_loss_mlp": 1.04973102, "epoch": 0.2448820081166391, "flos": 21251791676160.0, "grad_norm": 1.4795522934866072, "language_loss": 0.70538652, "learning_rate": 3.5348668660955382e-06, "loss": 0.72722089, "num_input_tokens_seen": 87635755, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8828125, "step": 4073, "time_per_iteration": 2.4402835369110107 }, { "auxiliary_loss_clip": 0.01138286, "auxiliary_loss_mlp": 0.0104153, "balance_loss_clip": 1.02647972, "balance_loss_mlp": 1.04988575, "epoch": 0.2449421313693071, "flos": 23950586090880.0, "grad_norm": 2.964440383414789, "language_loss": 0.67443877, "learning_rate": 3.5346171419599728e-06, "loss": 0.69623697, "num_input_tokens_seen": 87652885, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8828125, "step": 4074, "time_per_iteration": 2.4838645458221436 }, { "auxiliary_loss_clip": 0.01064746, "auxiliary_loss_mlp": 0.01004486, "balance_loss_clip": 1.00172067, "balance_loss_mlp": 1.03301358, "epoch": 0.24500225462197506, "flos": 60687669980160.0, "grad_norm": 0.9458971487566288, "language_loss": 0.68708861, "learning_rate": 3.5343673596315718e-06, "loss": 0.70778096, "num_input_tokens_seen": 87713220, "router_z_loss_clip": 0.02770996, "router_z_loss_mlp": 0.31640625, "step": 4075, "time_per_iteration": 3.1723291873931885 }, { "auxiliary_loss_clip": 0.01140115, "auxiliary_loss_mlp": 0.01043811, "balance_loss_clip": 1.02831411, "balance_loss_mlp": 1.05173886, "epoch": 0.24506237787464302, "flos": 26284204886400.0, "grad_norm": 1.8391468379660316, "language_loss": 0.79658312, "learning_rate": 3.5341175191198063e-06, "loss": 0.81842244, "num_input_tokens_seen": 87732680, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8828125, "step": 4076, "time_per_iteration": 2.584825038909912 }, { "auxiliary_loss_clip": 0.01142423, "auxiliary_loss_mlp": 0.01044286, "balance_loss_clip": 1.02665544, "balance_loss_mlp": 1.04806006, "epoch": 0.245122501127311, "flos": 20552287242240.0, "grad_norm": 1.7448386600850843, "language_loss": 0.81647831, "learning_rate": 3.533867620434151e-06, "loss": 0.83834547, "num_input_tokens_seen": 87751880, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9453125, "step": 4077, "time_per_iteration": 2.494725465774536 }, { "auxiliary_loss_clip": 0.01143173, "auxiliary_loss_mlp": 0.01052031, "balance_loss_clip": 1.03401899, "balance_loss_mlp": 1.05042064, "epoch": 0.24518262437997895, "flos": 29132603447040.0, "grad_norm": 2.020925474261981, "language_loss": 0.62357163, "learning_rate": 3.533617663584082e-06, "loss": 0.64552367, "num_input_tokens_seen": 87771795, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.9296875, "step": 4078, "time_per_iteration": 2.5168673992156982 }, { "auxiliary_loss_clip": 0.01138767, "auxiliary_loss_mlp": 0.01038235, "balance_loss_clip": 1.02190363, "balance_loss_mlp": 1.05067515, "epoch": 0.24524274763264692, "flos": 23476924419840.0, "grad_norm": 1.8200411817212252, "language_loss": 0.75671136, "learning_rate": 3.5333676485790765e-06, "loss": 0.77848136, "num_input_tokens_seen": 87793640, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8828125, "step": 4079, "time_per_iteration": 2.4849886894226074 }, { "auxiliary_loss_clip": 0.01137612, "auxiliary_loss_mlp": 0.01047147, "balance_loss_clip": 1.02990961, "balance_loss_mlp": 1.04865408, "epoch": 0.24530287088531488, "flos": 17201175886080.0, "grad_norm": 2.2931360785316164, "language_loss": 0.75349057, "learning_rate": 3.5331175754286173e-06, "loss": 0.77533817, "num_input_tokens_seen": 87812390, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.890625, "step": 4080, "time_per_iteration": 2.4441003799438477 }, { "auxiliary_loss_clip": 0.01133946, "auxiliary_loss_mlp": 0.01037174, "balance_loss_clip": 1.02097392, "balance_loss_mlp": 1.04747415, "epoch": 0.24536299413798288, "flos": 14867449349760.0, "grad_norm": 1.8048196251006043, "language_loss": 0.82769412, "learning_rate": 3.532867444142186e-06, "loss": 0.84940529, "num_input_tokens_seen": 87830640, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8671875, "step": 4081, "time_per_iteration": 2.433222532272339 }, { "auxiliary_loss_clip": 0.01136995, "auxiliary_loss_mlp": 0.01038919, "balance_loss_clip": 1.02404189, "balance_loss_mlp": 1.04931116, "epoch": 0.24542311739065084, "flos": 35262051886080.0, "grad_norm": 1.8522997211957157, "language_loss": 0.73161626, "learning_rate": 3.532617254729267e-06, "loss": 0.75337535, "num_input_tokens_seen": 87850450, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.87890625, "step": 4082, "time_per_iteration": 2.5775513648986816 }, { "auxiliary_loss_clip": 0.01134515, "auxiliary_loss_mlp": 0.0104216, "balance_loss_clip": 1.02738976, "balance_loss_mlp": 1.0468384, "epoch": 0.2454832406433188, "flos": 21503130117120.0, "grad_norm": 1.5402297556309827, "language_loss": 0.71965253, "learning_rate": 3.5323670071993485e-06, "loss": 0.74141932, "num_input_tokens_seen": 87868810, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.875, "step": 4083, "time_per_iteration": 2.4501304626464844 }, { "auxiliary_loss_clip": 0.01140659, "auxiliary_loss_mlp": 0.01045707, "balance_loss_clip": 1.02753973, "balance_loss_mlp": 1.04809427, "epoch": 0.24554336389598677, "flos": 14756664827520.0, "grad_norm": 1.9381086845247468, "language_loss": 0.74687231, "learning_rate": 3.532116701561919e-06, "loss": 0.76873595, "num_input_tokens_seen": 87885685, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.92578125, "step": 4084, "time_per_iteration": 2.4502248764038086 }, { "auxiliary_loss_clip": 0.01133711, "auxiliary_loss_mlp": 0.01037917, "balance_loss_clip": 1.02199113, "balance_loss_mlp": 1.04646969, "epoch": 0.24560348714865474, "flos": 14976402278400.0, "grad_norm": 1.865250680231567, "language_loss": 0.85333598, "learning_rate": 3.531866337826471e-06, "loss": 0.87505221, "num_input_tokens_seen": 87903715, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.87109375, "step": 4085, "time_per_iteration": 2.419264793395996 }, { "auxiliary_loss_clip": 0.01138423, "auxiliary_loss_mlp": 0.0104649, "balance_loss_clip": 1.0299319, "balance_loss_mlp": 1.04865062, "epoch": 0.2456636104013227, "flos": 22675326554880.0, "grad_norm": 1.713268661462722, "language_loss": 0.78403044, "learning_rate": 3.5316159160024982e-06, "loss": 0.80587959, "num_input_tokens_seen": 87923375, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8984375, "step": 4086, "time_per_iteration": 2.4861416816711426 }, { "auxiliary_loss_clip": 0.01135043, "auxiliary_loss_mlp": 0.01044039, "balance_loss_clip": 1.02833891, "balance_loss_mlp": 1.04861414, "epoch": 0.2457237336539907, "flos": 27417869009280.0, "grad_norm": 1.545981466030764, "language_loss": 0.74836206, "learning_rate": 3.531365436099496e-06, "loss": 0.77015293, "num_input_tokens_seen": 87943115, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.86328125, "step": 4087, "time_per_iteration": 2.5017342567443848 }, { "auxiliary_loss_clip": 0.01141999, "auxiliary_loss_mlp": 0.01044123, "balance_loss_clip": 1.02656364, "balance_loss_mlp": 1.05301726, "epoch": 0.24578385690665866, "flos": 20412379768320.0, "grad_norm": 3.283443743595779, "language_loss": 0.79976809, "learning_rate": 3.5311148981269635e-06, "loss": 0.82162929, "num_input_tokens_seen": 87959505, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.890625, "step": 4088, "time_per_iteration": 2.448570728302002 }, { "auxiliary_loss_clip": 0.01130331, "auxiliary_loss_mlp": 0.01033197, "balance_loss_clip": 1.01837337, "balance_loss_mlp": 1.04556406, "epoch": 0.24584398015932662, "flos": 23915393740800.0, "grad_norm": 1.6427124833382099, "language_loss": 0.77147615, "learning_rate": 3.5308643020944e-06, "loss": 0.79311138, "num_input_tokens_seen": 87979725, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.84765625, "step": 4089, "time_per_iteration": 2.4820361137390137 }, { "auxiliary_loss_clip": 0.01135437, "auxiliary_loss_mlp": 0.01043998, "balance_loss_clip": 1.02785742, "balance_loss_mlp": 1.04603422, "epoch": 0.2459041034119946, "flos": 41496359103360.0, "grad_norm": 1.8941020167731808, "language_loss": 0.81727433, "learning_rate": 3.530613648011309e-06, "loss": 0.83906865, "num_input_tokens_seen": 87998270, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.89453125, "step": 4090, "time_per_iteration": 2.6089696884155273 }, { "auxiliary_loss_clip": 0.01139823, "auxiliary_loss_mlp": 0.01047409, "balance_loss_clip": 1.03048158, "balance_loss_mlp": 1.04956567, "epoch": 0.24596422666466256, "flos": 19936814676480.0, "grad_norm": 1.7123550379275714, "language_loss": 0.73638427, "learning_rate": 3.5303629358871946e-06, "loss": 0.75825655, "num_input_tokens_seen": 88016760, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.90234375, "step": 4091, "time_per_iteration": 2.4320342540740967 }, { "auxiliary_loss_clip": 0.01139653, "auxiliary_loss_mlp": 0.01038742, "balance_loss_clip": 1.02310228, "balance_loss_mlp": 1.05259371, "epoch": 0.24602434991733052, "flos": 21544391865600.0, "grad_norm": 1.834197281047163, "language_loss": 0.76913041, "learning_rate": 3.5301121657315653e-06, "loss": 0.79091442, "num_input_tokens_seen": 88036465, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8671875, "step": 4092, "time_per_iteration": 2.4533281326293945 }, { "auxiliary_loss_clip": 0.01141496, "auxiliary_loss_mlp": 0.01039501, "balance_loss_clip": 1.02257311, "balance_loss_mlp": 1.04856396, "epoch": 0.24608447316999849, "flos": 23185078416000.0, "grad_norm": 2.388470905291148, "language_loss": 0.81346023, "learning_rate": 3.5298613375539287e-06, "loss": 0.83527023, "num_input_tokens_seen": 88053270, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.9296875, "step": 4093, "time_per_iteration": 3.9435291290283203 }, { "auxiliary_loss_clip": 0.01140672, "auxiliary_loss_mlp": 0.01041834, "balance_loss_clip": 1.02462018, "balance_loss_mlp": 1.04778075, "epoch": 0.24614459642266648, "flos": 19641951930240.0, "grad_norm": 1.9344439020196036, "language_loss": 0.86864609, "learning_rate": 3.529610451363797e-06, "loss": 0.89047116, "num_input_tokens_seen": 88072305, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9296875, "step": 4094, "time_per_iteration": 2.447977304458618 }, { "auxiliary_loss_clip": 0.01064532, "auxiliary_loss_mlp": 0.01010698, "balance_loss_clip": 1.00799227, "balance_loss_mlp": 1.03252172, "epoch": 0.24620471967533444, "flos": 61739816186880.0, "grad_norm": 0.7568676523378695, "language_loss": 0.57576215, "learning_rate": 3.5293595071706833e-06, "loss": 0.59651446, "num_input_tokens_seen": 88137995, "router_z_loss_clip": 0.02709961, "router_z_loss_mlp": 0.3203125, "step": 4095, "time_per_iteration": 4.571071624755859 }, { "auxiliary_loss_clip": 0.01063496, "auxiliary_loss_mlp": 0.01009493, "balance_loss_clip": 1.00666749, "balance_loss_mlp": 1.03167844, "epoch": 0.2462648429280024, "flos": 69154436315520.0, "grad_norm": 0.6456262592503494, "language_loss": 0.56278706, "learning_rate": 3.5291085049841042e-06, "loss": 0.58351696, "num_input_tokens_seen": 88208490, "router_z_loss_clip": 0.02819824, "router_z_loss_mlp": 0.31835938, "step": 4096, "time_per_iteration": 3.204697608947754 }, { "auxiliary_loss_clip": 0.01138997, "auxiliary_loss_mlp": 0.01037473, "balance_loss_clip": 1.02190447, "balance_loss_mlp": 1.05018425, "epoch": 0.24632496618067037, "flos": 29459605887360.0, "grad_norm": 1.6223691944935785, "language_loss": 0.77491939, "learning_rate": 3.5288574448135773e-06, "loss": 0.79668415, "num_input_tokens_seen": 88228050, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.88671875, "step": 4097, "time_per_iteration": 5.357360363006592 }, { "auxiliary_loss_clip": 0.011389, "auxiliary_loss_mlp": 0.0104567, "balance_loss_clip": 1.02681065, "balance_loss_mlp": 1.04731369, "epoch": 0.24638508943333834, "flos": 24316444068480.0, "grad_norm": 3.405726760751185, "language_loss": 0.75832605, "learning_rate": 3.5286063266686235e-06, "loss": 0.78017169, "num_input_tokens_seen": 88248090, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.9140625, "step": 4098, "time_per_iteration": 2.4868619441986084 }, { "auxiliary_loss_clip": 0.01138148, "auxiliary_loss_mlp": 0.01045354, "balance_loss_clip": 1.02963042, "balance_loss_mlp": 1.04768944, "epoch": 0.2464452126860063, "flos": 26613254401920.0, "grad_norm": 2.063260349728117, "language_loss": 0.67642742, "learning_rate": 3.528355150558764e-06, "loss": 0.69826245, "num_input_tokens_seen": 88267545, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.90625, "step": 4099, "time_per_iteration": 2.529012680053711 }, { "auxiliary_loss_clip": 0.01131114, "auxiliary_loss_mlp": 0.01043712, "balance_loss_clip": 1.0280602, "balance_loss_mlp": 1.04588151, "epoch": 0.24650533593867427, "flos": 31212405763200.0, "grad_norm": 1.938727500386324, "language_loss": 0.65603858, "learning_rate": 3.5281039164935237e-06, "loss": 0.67778677, "num_input_tokens_seen": 88289785, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8515625, "step": 4100, "time_per_iteration": 2.588988780975342 }, { "auxiliary_loss_clip": 0.01060368, "auxiliary_loss_mlp": 0.01010223, "balance_loss_clip": 1.00776732, "balance_loss_mlp": 1.02893472, "epoch": 0.24656545919134226, "flos": 68494002900480.0, "grad_norm": 0.7288538620007993, "language_loss": 0.61449778, "learning_rate": 3.5278526244824304e-06, "loss": 0.63520366, "num_input_tokens_seen": 88357320, "router_z_loss_clip": 0.02453613, "router_z_loss_mlp": 0.31445312, "step": 4101, "time_per_iteration": 3.19824481010437 }, { "auxiliary_loss_clip": 0.01133616, "auxiliary_loss_mlp": 0.0103899, "balance_loss_clip": 1.02145469, "balance_loss_mlp": 1.04597032, "epoch": 0.24662558244401023, "flos": 20084192179200.0, "grad_norm": 1.6323066091303808, "language_loss": 0.73056972, "learning_rate": 3.527601274535012e-06, "loss": 0.75229579, "num_input_tokens_seen": 88377040, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.875, "step": 4102, "time_per_iteration": 2.4850993156433105 }, { "auxiliary_loss_clip": 0.01137169, "auxiliary_loss_mlp": 0.01045346, "balance_loss_clip": 1.02872872, "balance_loss_mlp": 1.04607797, "epoch": 0.2466857056966782, "flos": 30701361012480.0, "grad_norm": 2.144018522191997, "language_loss": 0.76123977, "learning_rate": 3.5273498666608004e-06, "loss": 0.78306496, "num_input_tokens_seen": 88395085, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9140625, "step": 4103, "time_per_iteration": 2.5554981231689453 }, { "auxiliary_loss_clip": 0.01136571, "auxiliary_loss_mlp": 0.01043366, "balance_loss_clip": 1.02615249, "balance_loss_mlp": 1.04699659, "epoch": 0.24674582894934616, "flos": 22528523669760.0, "grad_norm": 2.48316974368977, "language_loss": 0.78191078, "learning_rate": 3.5270984008693288e-06, "loss": 0.80371016, "num_input_tokens_seen": 88413205, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.89453125, "step": 4104, "time_per_iteration": 2.465954065322876 }, { "auxiliary_loss_clip": 0.0113401, "auxiliary_loss_mlp": 0.01042034, "balance_loss_clip": 1.02430797, "balance_loss_mlp": 1.04685986, "epoch": 0.24680595220201412, "flos": 20704297599360.0, "grad_norm": 1.9795100162173453, "language_loss": 0.83584887, "learning_rate": 3.526846877170133e-06, "loss": 0.85760933, "num_input_tokens_seen": 88431525, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.87109375, "step": 4105, "time_per_iteration": 2.493771553039551 }, { "auxiliary_loss_clip": 0.0113923, "auxiliary_loss_mlp": 0.01044767, "balance_loss_clip": 1.02881694, "balance_loss_mlp": 1.05079126, "epoch": 0.2468660754546821, "flos": 21831174051840.0, "grad_norm": 1.8682827933949284, "language_loss": 0.76493382, "learning_rate": 3.52659529557275e-06, "loss": 0.78677386, "num_input_tokens_seen": 88451210, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8828125, "step": 4106, "time_per_iteration": 2.4635419845581055 }, { "auxiliary_loss_clip": 0.01133604, "auxiliary_loss_mlp": 0.01042682, "balance_loss_clip": 1.02521741, "balance_loss_mlp": 1.04612494, "epoch": 0.24692619870735008, "flos": 15267709578240.0, "grad_norm": 2.077973759852045, "language_loss": 0.72125113, "learning_rate": 3.5263436560867205e-06, "loss": 0.74301398, "num_input_tokens_seen": 88467790, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.875, "step": 4107, "time_per_iteration": 2.439735174179077 }, { "auxiliary_loss_clip": 0.01137867, "auxiliary_loss_mlp": 0.01047194, "balance_loss_clip": 1.03043342, "balance_loss_mlp": 1.0501008, "epoch": 0.24698632196001805, "flos": 29680097523840.0, "grad_norm": 1.5410911854115674, "language_loss": 0.65490222, "learning_rate": 3.526091958721587e-06, "loss": 0.67675281, "num_input_tokens_seen": 88490330, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.875, "step": 4108, "time_per_iteration": 2.548874855041504 }, { "auxiliary_loss_clip": 0.01136563, "auxiliary_loss_mlp": 0.01040054, "balance_loss_clip": 1.02295947, "balance_loss_mlp": 1.04641736, "epoch": 0.247046445212686, "flos": 39165469741440.0, "grad_norm": 1.6758207104140455, "language_loss": 0.72649539, "learning_rate": 3.5258402034868936e-06, "loss": 0.74826169, "num_input_tokens_seen": 88512435, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.90234375, "step": 4109, "time_per_iteration": 2.662191867828369 }, { "auxiliary_loss_clip": 0.01136432, "auxiliary_loss_mlp": 0.01049312, "balance_loss_clip": 1.03292036, "balance_loss_mlp": 1.04645455, "epoch": 0.24710656846535398, "flos": 22998845376000.0, "grad_norm": 1.7392706171490195, "language_loss": 0.79162216, "learning_rate": 3.5255883903921866e-06, "loss": 0.81347966, "num_input_tokens_seen": 88529780, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8984375, "step": 4110, "time_per_iteration": 2.5154004096984863 }, { "auxiliary_loss_clip": 0.01138393, "auxiliary_loss_mlp": 0.01039277, "balance_loss_clip": 1.0221827, "balance_loss_mlp": 1.04858494, "epoch": 0.24716669171802194, "flos": 26432803451520.0, "grad_norm": 5.762800884535763, "language_loss": 0.80437064, "learning_rate": 3.5253365194470144e-06, "loss": 0.82614732, "num_input_tokens_seen": 88547200, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.8984375, "step": 4111, "time_per_iteration": 2.5539050102233887 }, { "auxiliary_loss_clip": 0.01131997, "auxiliary_loss_mlp": 0.01041172, "balance_loss_clip": 1.02569914, "balance_loss_mlp": 1.04338241, "epoch": 0.2472268149706899, "flos": 23329870139520.0, "grad_norm": 1.8288394313436058, "language_loss": 0.74722904, "learning_rate": 3.5250845906609294e-06, "loss": 0.76896071, "num_input_tokens_seen": 88566415, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.88671875, "step": 4112, "time_per_iteration": 2.544296979904175 }, { "auxiliary_loss_clip": 0.01134956, "auxiliary_loss_mlp": 0.01043721, "balance_loss_clip": 1.02741957, "balance_loss_mlp": 1.04607558, "epoch": 0.24728693822335787, "flos": 23768734510080.0, "grad_norm": 3.4356536410933773, "language_loss": 0.8243717, "learning_rate": 3.5248326040434835e-06, "loss": 0.84615839, "num_input_tokens_seen": 88585225, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.88671875, "step": 4113, "time_per_iteration": 2.5113749504089355 }, { "auxiliary_loss_clip": 0.01132211, "auxiliary_loss_mlp": 0.01035406, "balance_loss_clip": 1.01901507, "balance_loss_mlp": 1.04434848, "epoch": 0.24734706147602586, "flos": 19317499355520.0, "grad_norm": 3.0417471672715535, "language_loss": 0.87562013, "learning_rate": 3.5245805596042322e-06, "loss": 0.89729631, "num_input_tokens_seen": 88603280, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.87890625, "step": 4114, "time_per_iteration": 2.4476752281188965 }, { "auxiliary_loss_clip": 0.01134047, "auxiliary_loss_mlp": 0.01039178, "balance_loss_clip": 1.02357388, "balance_loss_mlp": 1.04583478, "epoch": 0.24740718472869383, "flos": 28036932935040.0, "grad_norm": 5.295685154302838, "language_loss": 0.75272739, "learning_rate": 3.524328457352734e-06, "loss": 0.7744596, "num_input_tokens_seen": 88624925, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8828125, "step": 4115, "time_per_iteration": 2.4928290843963623 }, { "auxiliary_loss_clip": 0.01056036, "auxiliary_loss_mlp": 0.01015575, "balance_loss_clip": 1.01291704, "balance_loss_mlp": 1.02508521, "epoch": 0.2474673079813618, "flos": 68107569408000.0, "grad_norm": 0.8338452042551918, "language_loss": 0.58185357, "learning_rate": 3.5240762972985475e-06, "loss": 0.60256964, "num_input_tokens_seen": 88691475, "router_z_loss_clip": 0.02661133, "router_z_loss_mlp": 0.30859375, "step": 4116, "time_per_iteration": 3.1565306186676025 }, { "auxiliary_loss_clip": 0.0113411, "auxiliary_loss_mlp": 0.01038298, "balance_loss_clip": 1.02299154, "balance_loss_mlp": 1.04683065, "epoch": 0.24752743123402976, "flos": 29462119839360.0, "grad_norm": 1.5625448633516894, "language_loss": 0.83590049, "learning_rate": 3.523824079451235e-06, "loss": 0.85762453, "num_input_tokens_seen": 88713425, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.875, "step": 4117, "time_per_iteration": 2.5341148376464844 }, { "auxiliary_loss_clip": 0.01055427, "auxiliary_loss_mlp": 0.01006875, "balance_loss_clip": 1.00427616, "balance_loss_mlp": 1.02448988, "epoch": 0.24758755448669773, "flos": 58350459824640.0, "grad_norm": 0.9123680651022748, "language_loss": 0.63533372, "learning_rate": 3.5235718038203602e-06, "loss": 0.65595675, "num_input_tokens_seen": 88769995, "router_z_loss_clip": 0.02600098, "router_z_loss_mlp": 0.30859375, "step": 4118, "time_per_iteration": 2.950375556945801 }, { "auxiliary_loss_clip": 0.01133098, "auxiliary_loss_mlp": 0.0104021, "balance_loss_clip": 1.02446282, "balance_loss_mlp": 1.04592729, "epoch": 0.2476476777393657, "flos": 20484416494080.0, "grad_norm": 2.4873226192479656, "language_loss": 0.7943207, "learning_rate": 3.523319470415491e-06, "loss": 0.81605375, "num_input_tokens_seen": 88789970, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.87109375, "step": 4119, "time_per_iteration": 2.455875873565674 }, { "auxiliary_loss_clip": 0.01131863, "auxiliary_loss_mlp": 0.01037366, "balance_loss_clip": 1.02155876, "balance_loss_mlp": 1.04501939, "epoch": 0.24770780099203366, "flos": 20485853038080.0, "grad_norm": 1.8427978295052687, "language_loss": 0.74550653, "learning_rate": 3.5230670792461943e-06, "loss": 0.7671988, "num_input_tokens_seen": 88810000, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8671875, "step": 4120, "time_per_iteration": 2.4649267196655273 }, { "auxiliary_loss_clip": 0.01134902, "auxiliary_loss_mlp": 0.01044436, "balance_loss_clip": 1.02754426, "balance_loss_mlp": 1.04624057, "epoch": 0.24776792424470165, "flos": 15153405523200.0, "grad_norm": 2.182065880818938, "language_loss": 0.88365006, "learning_rate": 3.522814630322041e-06, "loss": 0.90544343, "num_input_tokens_seen": 88827515, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.88671875, "step": 4121, "time_per_iteration": 2.4269187450408936 }, { "auxiliary_loss_clip": 0.01134885, "auxiliary_loss_mlp": 0.01038116, "balance_loss_clip": 1.02135527, "balance_loss_mlp": 1.04520166, "epoch": 0.2478280474973696, "flos": 21725453347200.0, "grad_norm": 2.159037410109823, "language_loss": 0.69463098, "learning_rate": 3.5225621236526045e-06, "loss": 0.71636099, "num_input_tokens_seen": 88845025, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.89453125, "step": 4122, "time_per_iteration": 2.498807430267334 }, { "auxiliary_loss_clip": 0.01134942, "auxiliary_loss_mlp": 0.01038636, "balance_loss_clip": 1.02098072, "balance_loss_mlp": 1.04498637, "epoch": 0.24788817075003758, "flos": 20412200200320.0, "grad_norm": 1.998811382021973, "language_loss": 0.79991055, "learning_rate": 3.5223095592474596e-06, "loss": 0.82164633, "num_input_tokens_seen": 88861740, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.8984375, "step": 4123, "time_per_iteration": 2.43495774269104 }, { "auxiliary_loss_clip": 0.01135053, "auxiliary_loss_mlp": 0.01040293, "balance_loss_clip": 1.02492642, "balance_loss_mlp": 1.0473305, "epoch": 0.24794829400270554, "flos": 22594455083520.0, "grad_norm": 3.2761388808036664, "language_loss": 0.74523181, "learning_rate": 3.5220569371161846e-06, "loss": 0.76698524, "num_input_tokens_seen": 88879740, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.875, "step": 4124, "time_per_iteration": 2.4723997116088867 }, { "auxiliary_loss_clip": 0.01131765, "auxiliary_loss_mlp": 0.01035489, "balance_loss_clip": 1.02032542, "balance_loss_mlp": 1.04656541, "epoch": 0.2480084172553735, "flos": 39676047615360.0, "grad_norm": 1.3897683628639261, "language_loss": 0.73728502, "learning_rate": 3.521804257268357e-06, "loss": 0.75895751, "num_input_tokens_seen": 88904095, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8515625, "step": 4125, "time_per_iteration": 2.6128714084625244 }, { "auxiliary_loss_clip": 0.01138155, "auxiliary_loss_mlp": 0.01044408, "balance_loss_clip": 1.02771854, "balance_loss_mlp": 1.04626656, "epoch": 0.24806854050804147, "flos": 22053712763520.0, "grad_norm": 1.7210794423064408, "language_loss": 0.69306964, "learning_rate": 3.5215515197135595e-06, "loss": 0.71489525, "num_input_tokens_seen": 88920740, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.91796875, "step": 4126, "time_per_iteration": 2.458662271499634 }, { "auxiliary_loss_clip": 0.01133213, "auxiliary_loss_mlp": 0.01043275, "balance_loss_clip": 1.02723539, "balance_loss_mlp": 1.0453968, "epoch": 0.24812866376070947, "flos": 15486764670720.0, "grad_norm": 2.4455643340884565, "language_loss": 0.81533092, "learning_rate": 3.5212987244613764e-06, "loss": 0.83709586, "num_input_tokens_seen": 88938510, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.87890625, "step": 4127, "time_per_iteration": 2.428116798400879 }, { "auxiliary_loss_clip": 0.01137078, "auxiliary_loss_mlp": 0.01045655, "balance_loss_clip": 1.0298959, "balance_loss_mlp": 1.04758251, "epoch": 0.24818878701337743, "flos": 14757419013120.0, "grad_norm": 2.2617922051682156, "language_loss": 0.84331787, "learning_rate": 3.5210458715213927e-06, "loss": 0.86514521, "num_input_tokens_seen": 88955235, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.890625, "step": 4128, "time_per_iteration": 2.4427614212036133 }, { "auxiliary_loss_clip": 0.01134885, "auxiliary_loss_mlp": 0.01045944, "balance_loss_clip": 1.03039312, "balance_loss_mlp": 1.04664075, "epoch": 0.2482489102660454, "flos": 27089501852160.0, "grad_norm": 1.9977213139412209, "language_loss": 0.65618068, "learning_rate": 3.5207929609031973e-06, "loss": 0.67798901, "num_input_tokens_seen": 88975210, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8828125, "step": 4129, "time_per_iteration": 2.503971576690674 }, { "auxiliary_loss_clip": 0.01134311, "auxiliary_loss_mlp": 0.01046554, "balance_loss_clip": 1.02954245, "balance_loss_mlp": 1.04544759, "epoch": 0.24830903351871336, "flos": 26467528924800.0, "grad_norm": 1.7045115354924187, "language_loss": 0.75359976, "learning_rate": 3.5205399926163806e-06, "loss": 0.77540845, "num_input_tokens_seen": 88996120, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.890625, "step": 4130, "time_per_iteration": 2.5205657482147217 }, { "auxiliary_loss_clip": 0.01135868, "auxiliary_loss_mlp": 0.01044324, "balance_loss_clip": 1.02737236, "balance_loss_mlp": 1.04598248, "epoch": 0.24836915677138133, "flos": 10228436870400.0, "grad_norm": 3.7692582261319676, "language_loss": 0.76716638, "learning_rate": 3.520286966670535e-06, "loss": 0.78896832, "num_input_tokens_seen": 89008685, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8984375, "step": 4131, "time_per_iteration": 2.387798547744751 }, { "auxiliary_loss_clip": 0.01131024, "auxiliary_loss_mlp": 0.01038101, "balance_loss_clip": 1.02335501, "balance_loss_mlp": 1.04542875, "epoch": 0.2484292800240493, "flos": 30080429579520.0, "grad_norm": 1.5291615743670672, "language_loss": 0.83708346, "learning_rate": 3.520033883075255e-06, "loss": 0.85877466, "num_input_tokens_seen": 89031160, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.85546875, "step": 4132, "time_per_iteration": 2.565124034881592 }, { "auxiliary_loss_clip": 0.01133831, "auxiliary_loss_mlp": 0.01038792, "balance_loss_clip": 1.02257931, "balance_loss_mlp": 1.04569304, "epoch": 0.24848940327671726, "flos": 13442944803840.0, "grad_norm": 2.193663130064117, "language_loss": 0.71229583, "learning_rate": 3.5197807418401386e-06, "loss": 0.73402208, "num_input_tokens_seen": 89047235, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.87890625, "step": 4133, "time_per_iteration": 2.3985390663146973 }, { "auxiliary_loss_clip": 0.01142178, "auxiliary_loss_mlp": 0.01042925, "balance_loss_clip": 1.02395856, "balance_loss_mlp": 1.04767001, "epoch": 0.24854952652938525, "flos": 19970247260160.0, "grad_norm": 3.5055315120170607, "language_loss": 0.61182427, "learning_rate": 3.5195275429747834e-06, "loss": 0.63367528, "num_input_tokens_seen": 89064790, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.9453125, "step": 4134, "time_per_iteration": 3.953975200653076 }, { "auxiliary_loss_clip": 0.01137234, "auxiliary_loss_mlp": 0.01035686, "balance_loss_clip": 1.02018833, "balance_loss_mlp": 1.04667902, "epoch": 0.24860964978205322, "flos": 18150187167360.0, "grad_norm": 1.93554283831392, "language_loss": 0.78558326, "learning_rate": 3.5192742864887914e-06, "loss": 0.80731243, "num_input_tokens_seen": 89083250, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.90625, "step": 4135, "time_per_iteration": 2.431358575820923 }, { "auxiliary_loss_clip": 0.01140005, "auxiliary_loss_mlp": 0.01034724, "balance_loss_clip": 1.01970935, "balance_loss_mlp": 1.05107546, "epoch": 0.24866977303472118, "flos": 11728641329280.0, "grad_norm": 2.29881803158516, "language_loss": 0.82624, "learning_rate": 3.5190209723917662e-06, "loss": 0.84798723, "num_input_tokens_seen": 89100905, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.890625, "step": 4136, "time_per_iteration": 2.4477345943450928 }, { "auxiliary_loss_clip": 0.01139471, "auxiliary_loss_mlp": 0.01044306, "balance_loss_clip": 1.02857077, "balance_loss_mlp": 1.04851508, "epoch": 0.24872989628738915, "flos": 34823582565120.0, "grad_norm": 2.217537457400903, "language_loss": 0.71316165, "learning_rate": 3.518767600693314e-06, "loss": 0.73499948, "num_input_tokens_seen": 89122630, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.91015625, "step": 4137, "time_per_iteration": 3.8904759883880615 }, { "auxiliary_loss_clip": 0.01136134, "auxiliary_loss_mlp": 0.01044069, "balance_loss_clip": 1.02885747, "balance_loss_mlp": 1.04501224, "epoch": 0.2487900195400571, "flos": 13699347062400.0, "grad_norm": 1.766045216350489, "language_loss": 0.66476864, "learning_rate": 3.518514171403042e-06, "loss": 0.68657064, "num_input_tokens_seen": 89141050, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.91015625, "step": 4138, "time_per_iteration": 3.920454502105713 }, { "auxiliary_loss_clip": 0.01133156, "auxiliary_loss_mlp": 0.01037499, "balance_loss_clip": 1.02262175, "balance_loss_mlp": 1.04730868, "epoch": 0.24885014279272508, "flos": 25337815297920.0, "grad_norm": 1.769361083350018, "language_loss": 0.84024715, "learning_rate": 3.51826068453056e-06, "loss": 0.86195368, "num_input_tokens_seen": 89160810, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.859375, "step": 4139, "time_per_iteration": 3.9366371631622314 }, { "auxiliary_loss_clip": 0.01138749, "auxiliary_loss_mlp": 0.01042614, "balance_loss_clip": 1.02607942, "balance_loss_mlp": 1.0472734, "epoch": 0.24891026604539307, "flos": 20631434860800.0, "grad_norm": 1.5991495290196178, "language_loss": 0.78892207, "learning_rate": 3.518007140085481e-06, "loss": 0.8107357, "num_input_tokens_seen": 89180610, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.9140625, "step": 4140, "time_per_iteration": 2.489905834197998 }, { "auxiliary_loss_clip": 0.01066833, "auxiliary_loss_mlp": 0.01015497, "balance_loss_clip": 1.01271975, "balance_loss_mlp": 1.03531528, "epoch": 0.24897038929806103, "flos": 66960294030720.0, "grad_norm": 0.835255527780356, "language_loss": 0.61080587, "learning_rate": 3.51775353807742e-06, "loss": 0.63162911, "num_input_tokens_seen": 89241880, "router_z_loss_clip": 0.02783203, "router_z_loss_mlp": 0.31640625, "step": 4141, "time_per_iteration": 3.1490509510040283 }, { "auxiliary_loss_clip": 0.01140593, "auxiliary_loss_mlp": 0.0105144, "balance_loss_clip": 1.03539467, "balance_loss_mlp": 1.04959226, "epoch": 0.249030512550729, "flos": 36392555612160.0, "grad_norm": 1.8668805328904083, "language_loss": 0.72289711, "learning_rate": 3.5174998785159913e-06, "loss": 0.74481738, "num_input_tokens_seen": 89263340, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.91015625, "step": 4142, "time_per_iteration": 2.5804572105407715 }, { "auxiliary_loss_clip": 0.01136742, "auxiliary_loss_mlp": 0.01044221, "balance_loss_clip": 1.02848577, "balance_loss_mlp": 1.04826462, "epoch": 0.24909063580339696, "flos": 20154576879360.0, "grad_norm": 1.7925851204580667, "language_loss": 0.80945563, "learning_rate": 3.5172461614108157e-06, "loss": 0.83126521, "num_input_tokens_seen": 89282870, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.88671875, "step": 4143, "time_per_iteration": 2.440176486968994 }, { "auxiliary_loss_clip": 0.01132263, "auxiliary_loss_mlp": 0.01041459, "balance_loss_clip": 1.0267787, "balance_loss_mlp": 1.04568291, "epoch": 0.24915075905606493, "flos": 26396569607040.0, "grad_norm": 2.1581419578117162, "language_loss": 0.58885288, "learning_rate": 3.5169923867715137e-06, "loss": 0.6105901, "num_input_tokens_seen": 89303830, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8671875, "step": 4144, "time_per_iteration": 2.4857988357543945 }, { "auxiliary_loss_clip": 0.01133628, "auxiliary_loss_mlp": 0.01044381, "balance_loss_clip": 1.02858579, "balance_loss_mlp": 1.04573131, "epoch": 0.2492108823087329, "flos": 27527216987520.0, "grad_norm": 2.281258382833677, "language_loss": 0.78374982, "learning_rate": 3.516738554607708e-06, "loss": 0.80552995, "num_input_tokens_seen": 89324350, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.87890625, "step": 4145, "time_per_iteration": 2.50010347366333 }, { "auxiliary_loss_clip": 0.01145507, "auxiliary_loss_mlp": 0.01045957, "balance_loss_clip": 1.02702641, "balance_loss_mlp": 1.04864669, "epoch": 0.24927100556140086, "flos": 16691388111360.0, "grad_norm": 2.052127979293232, "language_loss": 0.64908004, "learning_rate": 3.5164846649290253e-06, "loss": 0.67099464, "num_input_tokens_seen": 89342875, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.96875, "step": 4146, "time_per_iteration": 2.4350132942199707 }, { "auxiliary_loss_clip": 0.01063905, "auxiliary_loss_mlp": 0.01005714, "balance_loss_clip": 1.0032109, "balance_loss_mlp": 1.03261697, "epoch": 0.24933112881406885, "flos": 62772464286720.0, "grad_norm": 0.9531508578075826, "language_loss": 0.67370272, "learning_rate": 3.5162307177450915e-06, "loss": 0.694399, "num_input_tokens_seen": 89404925, "router_z_loss_clip": 0.02502441, "router_z_loss_mlp": 0.3125, "step": 4147, "time_per_iteration": 3.2251107692718506 }, { "auxiliary_loss_clip": 0.01140392, "auxiliary_loss_mlp": 0.01048581, "balance_loss_clip": 1.03197479, "balance_loss_mlp": 1.05035698, "epoch": 0.24939125206673682, "flos": 26651894457600.0, "grad_norm": 1.746907068961737, "language_loss": 0.89238155, "learning_rate": 3.5159767130655366e-06, "loss": 0.9142713, "num_input_tokens_seen": 89425090, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8984375, "step": 4148, "time_per_iteration": 2.500751495361328 }, { "auxiliary_loss_clip": 0.01145366, "auxiliary_loss_mlp": 0.01053265, "balance_loss_clip": 1.03350067, "balance_loss_mlp": 1.05057216, "epoch": 0.24945137531940478, "flos": 20704333512960.0, "grad_norm": 1.9258421579499412, "language_loss": 0.67764956, "learning_rate": 3.5157226508999935e-06, "loss": 0.69963586, "num_input_tokens_seen": 89442615, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.94921875, "step": 4149, "time_per_iteration": 2.470442056655884 }, { "auxiliary_loss_clip": 0.01138122, "auxiliary_loss_mlp": 0.01047912, "balance_loss_clip": 1.03153312, "balance_loss_mlp": 1.04998696, "epoch": 0.24951149857207275, "flos": 23768662682880.0, "grad_norm": 1.912190989239639, "language_loss": 0.71309853, "learning_rate": 3.515468531258095e-06, "loss": 0.73495889, "num_input_tokens_seen": 89463025, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8828125, "step": 4150, "time_per_iteration": 2.4951517581939697 }, { "auxiliary_loss_clip": 0.0113772, "auxiliary_loss_mlp": 0.01050888, "balance_loss_clip": 1.03403223, "balance_loss_mlp": 1.04742658, "epoch": 0.2495716218247407, "flos": 15664881237120.0, "grad_norm": 1.7474259508498113, "language_loss": 0.72915947, "learning_rate": 3.515214354149478e-06, "loss": 0.75104547, "num_input_tokens_seen": 89480225, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.90234375, "step": 4151, "time_per_iteration": 2.4381375312805176 }, { "auxiliary_loss_clip": 0.0114556, "auxiliary_loss_mlp": 0.01056592, "balance_loss_clip": 1.03884172, "balance_loss_mlp": 1.04968703, "epoch": 0.24963174507740868, "flos": 24052499953920.0, "grad_norm": 3.3546524943280547, "language_loss": 0.63228846, "learning_rate": 3.514960119583781e-06, "loss": 0.65430999, "num_input_tokens_seen": 89496985, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9609375, "step": 4152, "time_per_iteration": 2.4764370918273926 }, { "auxiliary_loss_clip": 0.0113756, "auxiliary_loss_mlp": 0.01044984, "balance_loss_clip": 1.02871156, "balance_loss_mlp": 1.05012274, "epoch": 0.24969186833007664, "flos": 21799501234560.0, "grad_norm": 2.0244603188268737, "language_loss": 0.7734158, "learning_rate": 3.514705827570645e-06, "loss": 0.79524124, "num_input_tokens_seen": 89514420, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.875, "step": 4153, "time_per_iteration": 2.470973253250122 }, { "auxiliary_loss_clip": 0.0113755, "auxiliary_loss_mlp": 0.01041148, "balance_loss_clip": 1.02428544, "balance_loss_mlp": 1.04892111, "epoch": 0.24975199158274464, "flos": 19938143479680.0, "grad_norm": 1.865760363128039, "language_loss": 0.76502669, "learning_rate": 3.514451478119711e-06, "loss": 0.78681374, "num_input_tokens_seen": 89532925, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8828125, "step": 4154, "time_per_iteration": 2.451986789703369 }, { "auxiliary_loss_clip": 0.0114268, "auxiliary_loss_mlp": 0.01052193, "balance_loss_clip": 1.03339338, "balance_loss_mlp": 1.04781628, "epoch": 0.2498121148354126, "flos": 25338389915520.0, "grad_norm": 2.6817734073371646, "language_loss": 0.70719802, "learning_rate": 3.5141970712406258e-06, "loss": 0.72914672, "num_input_tokens_seen": 89552855, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.94921875, "step": 4155, "time_per_iteration": 2.497109889984131 }, { "auxiliary_loss_clip": 0.01143294, "auxiliary_loss_mlp": 0.01048637, "balance_loss_clip": 1.0315783, "balance_loss_mlp": 1.05075443, "epoch": 0.24987223808808057, "flos": 20558787603840.0, "grad_norm": 1.6084050246604336, "language_loss": 0.74715471, "learning_rate": 3.513942606943036e-06, "loss": 0.76907396, "num_input_tokens_seen": 89572830, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.92578125, "step": 4156, "time_per_iteration": 2.460038900375366 }, { "auxiliary_loss_clip": 0.01139968, "auxiliary_loss_mlp": 0.0104346, "balance_loss_clip": 1.02682424, "balance_loss_mlp": 1.04996419, "epoch": 0.24993236134074853, "flos": 19749037351680.0, "grad_norm": 1.8985512213082045, "language_loss": 0.76432747, "learning_rate": 3.513688085236591e-06, "loss": 0.78616178, "num_input_tokens_seen": 89590345, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8984375, "step": 4157, "time_per_iteration": 2.4519455432891846 }, { "auxiliary_loss_clip": 0.01141005, "auxiliary_loss_mlp": 0.01045755, "balance_loss_clip": 1.02869618, "balance_loss_mlp": 1.0492022, "epoch": 0.2499924845934165, "flos": 18770292587520.0, "grad_norm": 1.7668680141416644, "language_loss": 0.81271493, "learning_rate": 3.513433506130942e-06, "loss": 0.83458251, "num_input_tokens_seen": 89610295, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.91796875, "step": 4158, "time_per_iteration": 2.4693708419799805 }, { "auxiliary_loss_clip": 0.01141119, "auxiliary_loss_mlp": 0.01037432, "balance_loss_clip": 1.02132642, "balance_loss_mlp": 1.05027866, "epoch": 0.25005260784608446, "flos": 16872198197760.0, "grad_norm": 3.7156259452614226, "language_loss": 0.75666922, "learning_rate": 3.5131788696357427e-06, "loss": 0.77845466, "num_input_tokens_seen": 89627795, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.90625, "step": 4159, "time_per_iteration": 2.4497740268707275 }, { "auxiliary_loss_clip": 0.01145293, "auxiliary_loss_mlp": 0.01039281, "balance_loss_clip": 1.02175701, "balance_loss_mlp": 1.05150247, "epoch": 0.2501127310987524, "flos": 22124923476480.0, "grad_norm": 1.8047055020471803, "language_loss": 0.71542376, "learning_rate": 3.512924175760649e-06, "loss": 0.73726952, "num_input_tokens_seen": 89648090, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9375, "step": 4160, "time_per_iteration": 2.496434211730957 }, { "auxiliary_loss_clip": 0.01065653, "auxiliary_loss_mlp": 0.01010544, "balance_loss_clip": 1.00761151, "balance_loss_mlp": 1.03379035, "epoch": 0.2501728543514204, "flos": 69458061980160.0, "grad_norm": 1.4093985330473309, "language_loss": 0.5676856, "learning_rate": 3.5126694245153186e-06, "loss": 0.58844757, "num_input_tokens_seen": 89710345, "router_z_loss_clip": 0.02929688, "router_z_loss_mlp": 0.31835938, "step": 4161, "time_per_iteration": 3.13490629196167 }, { "auxiliary_loss_clip": 0.01149452, "auxiliary_loss_mlp": 0.01043808, "balance_loss_clip": 1.0255568, "balance_loss_mlp": 1.05269098, "epoch": 0.25023297760408836, "flos": 16289978647680.0, "grad_norm": 1.8886294082541608, "language_loss": 0.80873513, "learning_rate": 3.5124146159094125e-06, "loss": 0.83066767, "num_input_tokens_seen": 89729390, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.96875, "step": 4162, "time_per_iteration": 2.4309563636779785 }, { "auxiliary_loss_clip": 0.01142699, "auxiliary_loss_mlp": 0.01047687, "balance_loss_clip": 1.02991295, "balance_loss_mlp": 1.04795563, "epoch": 0.2502931008567563, "flos": 12237998140800.0, "grad_norm": 2.28594224687545, "language_loss": 0.87970221, "learning_rate": 3.5121597499525927e-06, "loss": 0.90160608, "num_input_tokens_seen": 89742805, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9453125, "step": 4163, "time_per_iteration": 2.4407927989959717 }, { "auxiliary_loss_clip": 0.01142467, "auxiliary_loss_mlp": 0.01037191, "balance_loss_clip": 1.02019215, "balance_loss_mlp": 1.05085945, "epoch": 0.25035322410942434, "flos": 23181882105600.0, "grad_norm": 1.865401638695856, "language_loss": 0.83061886, "learning_rate": 3.5119048266545232e-06, "loss": 0.85241544, "num_input_tokens_seen": 89761145, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.91796875, "step": 4164, "time_per_iteration": 2.473572254180908 }, { "auxiliary_loss_clip": 0.0113967, "auxiliary_loss_mlp": 0.0104742, "balance_loss_clip": 1.03155351, "balance_loss_mlp": 1.05319333, "epoch": 0.2504133473620923, "flos": 20917534688640.0, "grad_norm": 1.9125283953142125, "language_loss": 0.74465358, "learning_rate": 3.5116498460248716e-06, "loss": 0.76652443, "num_input_tokens_seen": 89780905, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.86328125, "step": 4165, "time_per_iteration": 2.484860897064209 }, { "auxiliary_loss_clip": 0.01145828, "auxiliary_loss_mlp": 0.01047326, "balance_loss_clip": 1.02868199, "balance_loss_mlp": 1.05078995, "epoch": 0.2504734706147603, "flos": 20776549806720.0, "grad_norm": 2.0507911789421502, "language_loss": 0.74047416, "learning_rate": 3.5113948080733062e-06, "loss": 0.76240575, "num_input_tokens_seen": 89799230, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.94921875, "step": 4166, "time_per_iteration": 2.462557315826416 }, { "auxiliary_loss_clip": 0.01137918, "auxiliary_loss_mlp": 0.01043569, "balance_loss_clip": 1.02672434, "balance_loss_mlp": 1.04855919, "epoch": 0.25053359386742824, "flos": 24349373861760.0, "grad_norm": 2.09976412017585, "language_loss": 0.82060194, "learning_rate": 3.5111397128094973e-06, "loss": 0.84241682, "num_input_tokens_seen": 89818240, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.89453125, "step": 4167, "time_per_iteration": 2.4972617626190186 }, { "auxiliary_loss_clip": 0.01140849, "auxiliary_loss_mlp": 0.01038228, "balance_loss_clip": 1.02172971, "balance_loss_mlp": 1.05163801, "epoch": 0.2505937171200962, "flos": 21214336769280.0, "grad_norm": 2.054851954769205, "language_loss": 0.80056596, "learning_rate": 3.51088456024312e-06, "loss": 0.82235676, "num_input_tokens_seen": 89834485, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.890625, "step": 4168, "time_per_iteration": 2.438650131225586 }, { "auxiliary_loss_clip": 0.01146844, "auxiliary_loss_mlp": 0.01040599, "balance_loss_clip": 1.02117956, "balance_loss_mlp": 1.05052149, "epoch": 0.25065384037276417, "flos": 41427231379200.0, "grad_norm": 2.126656782524993, "language_loss": 0.69406831, "learning_rate": 3.510629350383849e-06, "loss": 0.71594274, "num_input_tokens_seen": 89855645, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.96484375, "step": 4169, "time_per_iteration": 2.6453967094421387 }, { "auxiliary_loss_clip": 0.01136376, "auxiliary_loss_mlp": 0.01044208, "balance_loss_clip": 1.02722025, "balance_loss_mlp": 1.04808378, "epoch": 0.25071396362543213, "flos": 26102389219200.0, "grad_norm": 1.9771147746937217, "language_loss": 0.77625197, "learning_rate": 3.510374083241361e-06, "loss": 0.79805779, "num_input_tokens_seen": 89874895, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8828125, "step": 4170, "time_per_iteration": 2.485753297805786 }, { "auxiliary_loss_clip": 0.01142444, "auxiliary_loss_mlp": 0.01038632, "balance_loss_clip": 1.02189541, "balance_loss_mlp": 1.0515573, "epoch": 0.2507740868781001, "flos": 19098982967040.0, "grad_norm": 3.611110537148921, "language_loss": 0.7630496, "learning_rate": 3.5101187588253368e-06, "loss": 0.78486037, "num_input_tokens_seen": 89891700, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.90625, "step": 4171, "time_per_iteration": 2.446475028991699 }, { "auxiliary_loss_clip": 0.01064769, "auxiliary_loss_mlp": 0.01001621, "balance_loss_clip": 0.99879533, "balance_loss_mlp": 1.03214395, "epoch": 0.25083421013076806, "flos": 64341868296960.0, "grad_norm": 0.857841162514146, "language_loss": 0.60078698, "learning_rate": 3.509863377145458e-06, "loss": 0.6214509, "num_input_tokens_seen": 89955775, "router_z_loss_clip": 0.02819824, "router_z_loss_mlp": 0.32617188, "step": 4172, "time_per_iteration": 3.09700083732605 }, { "auxiliary_loss_clip": 0.01142723, "auxiliary_loss_mlp": 0.01044628, "balance_loss_clip": 1.02682996, "balance_loss_mlp": 1.05039191, "epoch": 0.25089433338343603, "flos": 24279599692800.0, "grad_norm": 1.4102480689598935, "language_loss": 0.7921629, "learning_rate": 3.509607938211409e-06, "loss": 0.81403637, "num_input_tokens_seen": 89977150, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.92578125, "step": 4173, "time_per_iteration": 2.502424955368042 }, { "auxiliary_loss_clip": 0.01143599, "auxiliary_loss_mlp": 0.01042776, "balance_loss_clip": 1.0256815, "balance_loss_mlp": 1.05263257, "epoch": 0.250954456636104, "flos": 14721472477440.0, "grad_norm": 2.0044229646551286, "language_loss": 0.83234298, "learning_rate": 3.509352442032875e-06, "loss": 0.85420674, "num_input_tokens_seen": 89994925, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.91015625, "step": 4174, "time_per_iteration": 2.429774284362793 }, { "auxiliary_loss_clip": 0.01143749, "auxiliary_loss_mlp": 0.01042085, "balance_loss_clip": 1.02400124, "balance_loss_mlp": 1.05192494, "epoch": 0.25101457988877196, "flos": 22273593868800.0, "grad_norm": 2.1608847281794765, "language_loss": 0.70987308, "learning_rate": 3.509096888619545e-06, "loss": 0.73173141, "num_input_tokens_seen": 90013235, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.91796875, "step": 4175, "time_per_iteration": 2.4989845752716064 }, { "auxiliary_loss_clip": 0.01142065, "auxiliary_loss_mlp": 0.0103872, "balance_loss_clip": 1.02148223, "balance_loss_mlp": 1.0490185, "epoch": 0.2510747031414399, "flos": 25188929424000.0, "grad_norm": 1.8169542884227203, "language_loss": 0.80711234, "learning_rate": 3.50884127798111e-06, "loss": 0.82892025, "num_input_tokens_seen": 90032150, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9296875, "step": 4176, "time_per_iteration": 3.9752893447875977 }, { "auxiliary_loss_clip": 0.01141462, "auxiliary_loss_mlp": 0.01040742, "balance_loss_clip": 1.02236021, "balance_loss_mlp": 1.05044758, "epoch": 0.25113482639410795, "flos": 20704189858560.0, "grad_norm": 1.9224592461560206, "language_loss": 0.82283115, "learning_rate": 3.5085856101272623e-06, "loss": 0.84465325, "num_input_tokens_seen": 90049085, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.91015625, "step": 4177, "time_per_iteration": 2.473034620285034 }, { "auxiliary_loss_clip": 0.01141949, "auxiliary_loss_mlp": 0.01046338, "balance_loss_clip": 1.02939796, "balance_loss_mlp": 1.05204487, "epoch": 0.2511949496467759, "flos": 21506936958720.0, "grad_norm": 1.9924762754780125, "language_loss": 0.82722461, "learning_rate": 3.508329885067698e-06, "loss": 0.8491075, "num_input_tokens_seen": 90067695, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8984375, "step": 4178, "time_per_iteration": 2.466153860092163 }, { "auxiliary_loss_clip": 0.01135279, "auxiliary_loss_mlp": 0.0104161, "balance_loss_clip": 1.02644026, "balance_loss_mlp": 1.04768181, "epoch": 0.2512550728994439, "flos": 20701999128960.0, "grad_norm": 2.1561708476378585, "language_loss": 0.75757837, "learning_rate": 3.508074102812112e-06, "loss": 0.7793473, "num_input_tokens_seen": 90083890, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.875, "step": 4179, "time_per_iteration": 3.81021785736084 }, { "auxiliary_loss_clip": 0.01140715, "auxiliary_loss_mlp": 0.01054255, "balance_loss_clip": 1.03704143, "balance_loss_mlp": 1.04818749, "epoch": 0.25131519615211184, "flos": 18478626151680.0, "grad_norm": 1.9185456355355701, "language_loss": 0.69872034, "learning_rate": 3.507818263370206e-06, "loss": 0.72067004, "num_input_tokens_seen": 90100995, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.92578125, "step": 4180, "time_per_iteration": 5.278890371322632 }, { "auxiliary_loss_clip": 0.01138748, "auxiliary_loss_mlp": 0.01043631, "balance_loss_clip": 1.02760959, "balance_loss_mlp": 1.04974568, "epoch": 0.2513753194047798, "flos": 20484955198080.0, "grad_norm": 1.995425797384359, "language_loss": 0.85685432, "learning_rate": 3.5075623667516796e-06, "loss": 0.87867814, "num_input_tokens_seen": 90120365, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.890625, "step": 4181, "time_per_iteration": 2.474801540374756 }, { "auxiliary_loss_clip": 0.01139143, "auxiliary_loss_mlp": 0.01046045, "balance_loss_clip": 1.03013098, "balance_loss_mlp": 1.04999173, "epoch": 0.25143544265744777, "flos": 37670077704960.0, "grad_norm": 3.9749724867567418, "language_loss": 0.67747855, "learning_rate": 3.507306412966238e-06, "loss": 0.69933039, "num_input_tokens_seen": 90142610, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.890625, "step": 4182, "time_per_iteration": 2.5966238975524902 }, { "auxiliary_loss_clip": 0.01063349, "auxiliary_loss_mlp": 0.01026949, "balance_loss_clip": 1.0239327, "balance_loss_mlp": 1.03030455, "epoch": 0.25149556591011574, "flos": 69367457923200.0, "grad_norm": 0.8560227166837914, "language_loss": 0.70161653, "learning_rate": 3.5070504020235853e-06, "loss": 0.72251946, "num_input_tokens_seen": 90200555, "router_z_loss_clip": 0.03015137, "router_z_loss_mlp": 0.33007812, "step": 4183, "time_per_iteration": 3.0870320796966553 }, { "auxiliary_loss_clip": 0.01134507, "auxiliary_loss_mlp": 0.0104425, "balance_loss_clip": 1.02709532, "balance_loss_mlp": 1.04513431, "epoch": 0.2515556891627837, "flos": 13990402967040.0, "grad_norm": 1.6896042445699488, "language_loss": 0.74174297, "learning_rate": 3.506794333933431e-06, "loss": 0.76353055, "num_input_tokens_seen": 90218120, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.89453125, "step": 4184, "time_per_iteration": 2.4292690753936768 }, { "auxiliary_loss_clip": 0.01139644, "auxiliary_loss_mlp": 0.01043826, "balance_loss_clip": 1.02749419, "balance_loss_mlp": 1.05046868, "epoch": 0.25161581241545167, "flos": 22163527618560.0, "grad_norm": 1.7818869984981005, "language_loss": 0.83138132, "learning_rate": 3.506538208705484e-06, "loss": 0.85321593, "num_input_tokens_seen": 90236790, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.890625, "step": 4185, "time_per_iteration": 2.486410140991211 }, { "auxiliary_loss_clip": 0.01063517, "auxiliary_loss_mlp": 0.01008483, "balance_loss_clip": 1.00587213, "balance_loss_mlp": 1.03037906, "epoch": 0.25167593566811963, "flos": 69358407696000.0, "grad_norm": 0.7869294508285208, "language_loss": 0.61501241, "learning_rate": 3.5062820263494574e-06, "loss": 0.63573235, "num_input_tokens_seen": 90297070, "router_z_loss_clip": 0.02612305, "router_z_loss_mlp": 0.33203125, "step": 4186, "time_per_iteration": 2.9835898876190186 }, { "auxiliary_loss_clip": 0.01136558, "auxiliary_loss_mlp": 0.01039395, "balance_loss_clip": 1.02263379, "balance_loss_mlp": 1.04699922, "epoch": 0.2517360589207876, "flos": 13261452359040.0, "grad_norm": 2.0066865175613406, "language_loss": 0.7929405, "learning_rate": 3.5060257868750656e-06, "loss": 0.81470001, "num_input_tokens_seen": 90315255, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8984375, "step": 4187, "time_per_iteration": 2.4751923084259033 }, { "auxiliary_loss_clip": 0.01138664, "auxiliary_loss_mlp": 0.01052485, "balance_loss_clip": 1.03593898, "balance_loss_mlp": 1.05053771, "epoch": 0.25179618217345556, "flos": 20376828282240.0, "grad_norm": 1.8385430477000717, "language_loss": 0.79772449, "learning_rate": 3.5057694902920244e-06, "loss": 0.81963599, "num_input_tokens_seen": 90334990, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8828125, "step": 4188, "time_per_iteration": 2.4640276432037354 }, { "auxiliary_loss_clip": 0.01135285, "auxiliary_loss_mlp": 0.0104055, "balance_loss_clip": 1.02487385, "balance_loss_mlp": 1.04671311, "epoch": 0.25185630542612353, "flos": 27664718250240.0, "grad_norm": 2.031540834424975, "language_loss": 0.74576753, "learning_rate": 3.5055131366100534e-06, "loss": 0.76752591, "num_input_tokens_seen": 90351825, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.88671875, "step": 4189, "time_per_iteration": 2.5053956508636475 }, { "auxiliary_loss_clip": 0.01133757, "auxiliary_loss_mlp": 0.01039103, "balance_loss_clip": 1.02413058, "balance_loss_mlp": 1.04851651, "epoch": 0.25191642867879155, "flos": 20996430912000.0, "grad_norm": 1.9637885925217269, "language_loss": 0.84712291, "learning_rate": 3.5052567258388745e-06, "loss": 0.86885148, "num_input_tokens_seen": 90369860, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8515625, "step": 4190, "time_per_iteration": 2.4560749530792236 }, { "auxiliary_loss_clip": 0.01137342, "auxiliary_loss_mlp": 0.01046898, "balance_loss_clip": 1.0291121, "balance_loss_mlp": 1.04900908, "epoch": 0.2519765519314595, "flos": 21105671149440.0, "grad_norm": 2.202536247116072, "language_loss": 0.75739604, "learning_rate": 3.5050002579882082e-06, "loss": 0.77923846, "num_input_tokens_seen": 90389245, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.8828125, "step": 4191, "time_per_iteration": 2.470231294631958 }, { "auxiliary_loss_clip": 0.01059783, "auxiliary_loss_mlp": 0.01012123, "balance_loss_clip": 1.00958395, "balance_loss_mlp": 1.02758467, "epoch": 0.2520366751841275, "flos": 62744993360640.0, "grad_norm": 0.7534936057776785, "language_loss": 0.57155108, "learning_rate": 3.5047437330677823e-06, "loss": 0.59227014, "num_input_tokens_seen": 90456735, "router_z_loss_clip": 0.02539062, "router_z_loss_mlp": 0.32226562, "step": 4192, "time_per_iteration": 3.1769280433654785 }, { "auxiliary_loss_clip": 0.01136027, "auxiliary_loss_mlp": 0.01038005, "balance_loss_clip": 1.02149463, "balance_loss_mlp": 1.04902005, "epoch": 0.25209679843679544, "flos": 22230716008320.0, "grad_norm": 1.8323658179129558, "language_loss": 0.75855321, "learning_rate": 3.504487151087323e-06, "loss": 0.78029352, "num_input_tokens_seen": 90474165, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8671875, "step": 4193, "time_per_iteration": 2.4681785106658936 }, { "auxiliary_loss_clip": 0.01138503, "auxiliary_loss_mlp": 0.01044118, "balance_loss_clip": 1.02784634, "balance_loss_mlp": 1.04756474, "epoch": 0.2521569216894634, "flos": 12166643773440.0, "grad_norm": 2.3896342816784797, "language_loss": 0.84265482, "learning_rate": 3.5042305120565598e-06, "loss": 0.86448097, "num_input_tokens_seen": 90491660, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.91015625, "step": 4194, "time_per_iteration": 2.447232723236084 }, { "auxiliary_loss_clip": 0.01137137, "auxiliary_loss_mlp": 0.010472, "balance_loss_clip": 1.03268027, "balance_loss_mlp": 1.04684329, "epoch": 0.2522170449421314, "flos": 23699786353920.0, "grad_norm": 1.396932775015312, "language_loss": 0.88539612, "learning_rate": 3.5039738159852253e-06, "loss": 0.90723956, "num_input_tokens_seen": 90514025, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.90234375, "step": 4195, "time_per_iteration": 2.5419132709503174 }, { "auxiliary_loss_clip": 0.01138152, "auxiliary_loss_mlp": 0.01041596, "balance_loss_clip": 1.02333307, "balance_loss_mlp": 1.04839814, "epoch": 0.25227716819479934, "flos": 20955456472320.0, "grad_norm": 1.9735093684314593, "language_loss": 0.85445356, "learning_rate": 3.503717062883053e-06, "loss": 0.8762511, "num_input_tokens_seen": 90533530, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.8984375, "step": 4196, "time_per_iteration": 2.5253732204437256 }, { "auxiliary_loss_clip": 0.01139285, "auxiliary_loss_mlp": 0.0104558, "balance_loss_clip": 1.02954614, "balance_loss_mlp": 1.04881656, "epoch": 0.2523372914474673, "flos": 23331342597120.0, "grad_norm": 2.0790486736924, "language_loss": 0.83333433, "learning_rate": 3.5034602527597786e-06, "loss": 0.85518295, "num_input_tokens_seen": 90554025, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.90625, "step": 4197, "time_per_iteration": 2.4959659576416016 }, { "auxiliary_loss_clip": 0.01140421, "auxiliary_loss_mlp": 0.01047752, "balance_loss_clip": 1.02997828, "balance_loss_mlp": 1.04965854, "epoch": 0.25239741470013527, "flos": 36970321875840.0, "grad_norm": 2.0747589258228687, "language_loss": 0.73114145, "learning_rate": 3.5032033856251405e-06, "loss": 0.75302321, "num_input_tokens_seen": 90576930, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.90625, "step": 4198, "time_per_iteration": 2.620894193649292 }, { "auxiliary_loss_clip": 0.011404, "auxiliary_loss_mlp": 0.01048371, "balance_loss_clip": 1.03133643, "balance_loss_mlp": 1.04802895, "epoch": 0.25245753795280323, "flos": 18515757836160.0, "grad_norm": 3.5151171684970803, "language_loss": 0.76735646, "learning_rate": 3.50294646148888e-06, "loss": 0.78924417, "num_input_tokens_seen": 90595710, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.92578125, "step": 4199, "time_per_iteration": 2.434978485107422 }, { "auxiliary_loss_clip": 0.01139983, "auxiliary_loss_mlp": 0.01043251, "balance_loss_clip": 1.0275507, "balance_loss_mlp": 1.04898381, "epoch": 0.2525176612054712, "flos": 32344884737280.0, "grad_norm": 2.1192055170616775, "language_loss": 0.73023307, "learning_rate": 3.502689480360739e-06, "loss": 0.75206542, "num_input_tokens_seen": 90617945, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.91015625, "step": 4200, "time_per_iteration": 2.5768635272979736 }, { "auxiliary_loss_clip": 0.011385, "auxiliary_loss_mlp": 0.01046406, "balance_loss_clip": 1.03102779, "balance_loss_mlp": 1.04712224, "epoch": 0.25257778445813917, "flos": 45258217459200.0, "grad_norm": 1.5001810120049923, "language_loss": 0.82476807, "learning_rate": 3.5024324422504616e-06, "loss": 0.84661722, "num_input_tokens_seen": 90640855, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.9140625, "step": 4201, "time_per_iteration": 2.6686558723449707 }, { "auxiliary_loss_clip": 0.01142046, "auxiliary_loss_mlp": 0.01046506, "balance_loss_clip": 1.03034127, "balance_loss_mlp": 1.04998851, "epoch": 0.25263790771080713, "flos": 23367791923200.0, "grad_norm": 1.7214107686608102, "language_loss": 0.7541973, "learning_rate": 3.5021753471677965e-06, "loss": 0.77608275, "num_input_tokens_seen": 90661350, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.921875, "step": 4202, "time_per_iteration": 2.4822378158569336 }, { "auxiliary_loss_clip": 0.01136413, "auxiliary_loss_mlp": 0.01040575, "balance_loss_clip": 1.0249939, "balance_loss_mlp": 1.04878116, "epoch": 0.25269803096347515, "flos": 18515039564160.0, "grad_norm": 1.7511325239256903, "language_loss": 0.73413515, "learning_rate": 3.501918195122491e-06, "loss": 0.75590503, "num_input_tokens_seen": 90680540, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.875, "step": 4203, "time_per_iteration": 2.4302821159362793 }, { "auxiliary_loss_clip": 0.01138214, "auxiliary_loss_mlp": 0.0103596, "balance_loss_clip": 1.01983047, "balance_loss_mlp": 1.04800355, "epoch": 0.2527581542161431, "flos": 24610552629120.0, "grad_norm": 1.5472668683929487, "language_loss": 0.77644837, "learning_rate": 3.501660986124297e-06, "loss": 0.79819012, "num_input_tokens_seen": 90703460, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.90234375, "step": 4204, "time_per_iteration": 2.5121028423309326 }, { "auxiliary_loss_clip": 0.01139326, "auxiliary_loss_mlp": 0.01045362, "balance_loss_clip": 1.02925658, "balance_loss_mlp": 1.04817724, "epoch": 0.2528182774688111, "flos": 12641275111680.0, "grad_norm": 2.19102062835548, "language_loss": 0.72055447, "learning_rate": 3.5014037201829684e-06, "loss": 0.7424013, "num_input_tokens_seen": 90718815, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.9140625, "step": 4205, "time_per_iteration": 2.437669038772583 }, { "auxiliary_loss_clip": 0.01134108, "auxiliary_loss_mlp": 0.01038419, "balance_loss_clip": 1.02400088, "balance_loss_mlp": 1.0493623, "epoch": 0.25287840072147905, "flos": 46936789879680.0, "grad_norm": 1.403505911169174, "language_loss": 0.75744498, "learning_rate": 3.50114639730826e-06, "loss": 0.77917033, "num_input_tokens_seen": 90742125, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.84765625, "step": 4206, "time_per_iteration": 2.7025554180145264 }, { "auxiliary_loss_clip": 0.01138613, "auxiliary_loss_mlp": 0.010399, "balance_loss_clip": 1.02403307, "balance_loss_mlp": 1.04880166, "epoch": 0.252938523974147, "flos": 18879712392960.0, "grad_norm": 4.6215588214798515, "language_loss": 0.79163945, "learning_rate": 3.5008890175099296e-06, "loss": 0.81342459, "num_input_tokens_seen": 90760785, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8984375, "step": 4207, "time_per_iteration": 2.43373441696167 }, { "auxiliary_loss_clip": 0.01134826, "auxiliary_loss_mlp": 0.01044349, "balance_loss_clip": 1.02874446, "balance_loss_mlp": 1.04857302, "epoch": 0.252998647226815, "flos": 21434720664960.0, "grad_norm": 1.7895918267329247, "language_loss": 0.76200271, "learning_rate": 3.5006315807977375e-06, "loss": 0.7837944, "num_input_tokens_seen": 90780045, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.86328125, "step": 4208, "time_per_iteration": 2.4729163646698 }, { "auxiliary_loss_clip": 0.01135787, "auxiliary_loss_mlp": 0.0103902, "balance_loss_clip": 1.02295017, "balance_loss_mlp": 1.04868937, "epoch": 0.25305877047948294, "flos": 25442171285760.0, "grad_norm": 1.8590489646466555, "language_loss": 0.7008754, "learning_rate": 3.5003740871814456e-06, "loss": 0.72262353, "num_input_tokens_seen": 90797980, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8671875, "step": 4209, "time_per_iteration": 2.511843204498291 }, { "auxiliary_loss_clip": 0.01062785, "auxiliary_loss_mlp": 0.01004952, "balance_loss_clip": 1.00258017, "balance_loss_mlp": 1.03029263, "epoch": 0.2531188937321509, "flos": 60185603629440.0, "grad_norm": 0.8191499491510887, "language_loss": 0.55120933, "learning_rate": 3.5001165366708175e-06, "loss": 0.57188666, "num_input_tokens_seen": 90864865, "router_z_loss_clip": 0.02368164, "router_z_loss_mlp": 0.32421875, "step": 4210, "time_per_iteration": 3.170170307159424 }, { "auxiliary_loss_clip": 0.01139075, "auxiliary_loss_mlp": 0.01032988, "balance_loss_clip": 1.01718044, "balance_loss_mlp": 1.04859781, "epoch": 0.25317901698481887, "flos": 19682387665920.0, "grad_norm": 1.922388245782303, "language_loss": 0.80087245, "learning_rate": 3.4998589292756204e-06, "loss": 0.82259309, "num_input_tokens_seen": 90882885, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.90625, "step": 4211, "time_per_iteration": 2.4402592182159424 }, { "auxiliary_loss_clip": 0.01132742, "auxiliary_loss_mlp": 0.01037597, "balance_loss_clip": 1.02265966, "balance_loss_mlp": 1.04719043, "epoch": 0.25323914023748684, "flos": 24424355502720.0, "grad_norm": 1.4759722425328348, "language_loss": 0.78258979, "learning_rate": 3.499601265005622e-06, "loss": 0.80429316, "num_input_tokens_seen": 90902985, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.85546875, "step": 4212, "time_per_iteration": 2.5043137073516846 }, { "auxiliary_loss_clip": 0.01136982, "auxiliary_loss_mlp": 0.01036159, "balance_loss_clip": 1.01995873, "balance_loss_mlp": 1.04785132, "epoch": 0.2532992634901548, "flos": 25447450584960.0, "grad_norm": 2.2470724681967775, "language_loss": 0.53966498, "learning_rate": 3.4993435438705938e-06, "loss": 0.56139636, "num_input_tokens_seen": 90923550, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.890625, "step": 4213, "time_per_iteration": 2.4886763095855713 }, { "auxiliary_loss_clip": 0.01139737, "auxiliary_loss_mlp": 0.01042346, "balance_loss_clip": 1.0251441, "balance_loss_mlp": 1.04992712, "epoch": 0.25335938674282277, "flos": 18880538405760.0, "grad_norm": 1.9330683233623105, "language_loss": 0.64979219, "learning_rate": 3.499085765880308e-06, "loss": 0.67161304, "num_input_tokens_seen": 90943260, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8984375, "step": 4214, "time_per_iteration": 2.5021302700042725 }, { "auxiliary_loss_clip": 0.01058913, "auxiliary_loss_mlp": 0.01004698, "balance_loss_clip": 1.00238502, "balance_loss_mlp": 1.02683544, "epoch": 0.25341950999549073, "flos": 53062649936640.0, "grad_norm": 0.85323966422804, "language_loss": 0.58088517, "learning_rate": 3.4988279310445396e-06, "loss": 0.60152125, "num_input_tokens_seen": 90996295, "router_z_loss_clip": 0.02307129, "router_z_loss_mlp": 0.3203125, "step": 4215, "time_per_iteration": 2.8505637645721436 }, { "auxiliary_loss_clip": 0.01136663, "auxiliary_loss_mlp": 0.01041, "balance_loss_clip": 1.02406037, "balance_loss_mlp": 1.04920089, "epoch": 0.2534796332481587, "flos": 39020247054720.0, "grad_norm": 2.2628616981130776, "language_loss": 0.83589703, "learning_rate": 3.498570039373066e-06, "loss": 0.85767365, "num_input_tokens_seen": 91017545, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.875, "step": 4216, "time_per_iteration": 2.617138624191284 }, { "auxiliary_loss_clip": 0.01139132, "auxiliary_loss_mlp": 0.01035725, "balance_loss_clip": 1.01944065, "balance_loss_mlp": 1.05096459, "epoch": 0.2535397565008267, "flos": 23586990670080.0, "grad_norm": 1.7903775191197921, "language_loss": 0.80254912, "learning_rate": 3.498312090875666e-06, "loss": 0.82429773, "num_input_tokens_seen": 91037715, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8828125, "step": 4217, "time_per_iteration": 2.472261905670166 }, { "auxiliary_loss_clip": 0.01130774, "auxiliary_loss_mlp": 0.01035642, "balance_loss_clip": 1.02063346, "balance_loss_mlp": 1.04432857, "epoch": 0.2535998797534947, "flos": 19281373251840.0, "grad_norm": 2.3582900042299317, "language_loss": 0.75468767, "learning_rate": 3.4980540855621218e-06, "loss": 0.77635181, "num_input_tokens_seen": 91055295, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.86328125, "step": 4218, "time_per_iteration": 3.92753267288208 }, { "auxiliary_loss_clip": 0.01138893, "auxiliary_loss_mlp": 0.01043535, "balance_loss_clip": 1.02689338, "balance_loss_mlp": 1.04762805, "epoch": 0.25366000300616265, "flos": 24024382583040.0, "grad_norm": 1.6647040564466948, "language_loss": 0.74355841, "learning_rate": 3.4977960234422167e-06, "loss": 0.76538265, "num_input_tokens_seen": 91075485, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.91015625, "step": 4219, "time_per_iteration": 2.488271951675415 }, { "auxiliary_loss_clip": 0.01140345, "auxiliary_loss_mlp": 0.01045409, "balance_loss_clip": 1.02868438, "balance_loss_mlp": 1.05011225, "epoch": 0.2537201262588306, "flos": 16289368116480.0, "grad_norm": 1.8418155013225652, "language_loss": 0.81146717, "learning_rate": 3.497537904525736e-06, "loss": 0.83332467, "num_input_tokens_seen": 91093620, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.90234375, "step": 4220, "time_per_iteration": 3.7576334476470947 }, { "auxiliary_loss_clip": 0.01140574, "auxiliary_loss_mlp": 0.01043154, "balance_loss_clip": 1.02545118, "balance_loss_mlp": 1.05002916, "epoch": 0.2537802495114986, "flos": 23294677789440.0, "grad_norm": 3.514855227980344, "language_loss": 0.706536, "learning_rate": 3.497279728822468e-06, "loss": 0.72837329, "num_input_tokens_seen": 91114110, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.90625, "step": 4221, "time_per_iteration": 2.4882397651672363 }, { "auxiliary_loss_clip": 0.01137414, "auxiliary_loss_mlp": 0.01040459, "balance_loss_clip": 1.02386522, "balance_loss_mlp": 1.04817426, "epoch": 0.25384037276416654, "flos": 17639142416640.0, "grad_norm": 1.5523850397561294, "language_loss": 0.61403978, "learning_rate": 3.497021496342202e-06, "loss": 0.63581848, "num_input_tokens_seen": 91133135, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.890625, "step": 4222, "time_per_iteration": 5.326484441757202 }, { "auxiliary_loss_clip": 0.0113993, "auxiliary_loss_mlp": 0.01047993, "balance_loss_clip": 1.03111291, "balance_loss_mlp": 1.04873896, "epoch": 0.2539004960168345, "flos": 21507044699520.0, "grad_norm": 1.6498739260078286, "language_loss": 0.74132895, "learning_rate": 3.496763207094731e-06, "loss": 0.76320827, "num_input_tokens_seen": 91151805, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.91015625, "step": 4223, "time_per_iteration": 2.448963165283203 }, { "auxiliary_loss_clip": 0.01135084, "auxiliary_loss_mlp": 0.01034404, "balance_loss_clip": 1.01878798, "balance_loss_mlp": 1.04789233, "epoch": 0.2539606192695025, "flos": 23950909313280.0, "grad_norm": 1.8548371265608747, "language_loss": 0.80011797, "learning_rate": 3.49650486108985e-06, "loss": 0.82181287, "num_input_tokens_seen": 91172270, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.87109375, "step": 4224, "time_per_iteration": 2.489011526107788 }, { "auxiliary_loss_clip": 0.01133925, "auxiliary_loss_mlp": 0.01040864, "balance_loss_clip": 1.02447319, "balance_loss_mlp": 1.04649675, "epoch": 0.25402074252217044, "flos": 24169784837760.0, "grad_norm": 1.424822439188258, "language_loss": 0.77400506, "learning_rate": 3.496246458337354e-06, "loss": 0.79575288, "num_input_tokens_seen": 91192080, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.875, "step": 4225, "time_per_iteration": 2.4846835136413574 }, { "auxiliary_loss_clip": 0.01137136, "auxiliary_loss_mlp": 0.01052271, "balance_loss_clip": 1.03487802, "balance_loss_mlp": 1.04766536, "epoch": 0.2540808657748384, "flos": 22303758314880.0, "grad_norm": 1.7785850433381758, "language_loss": 0.84344417, "learning_rate": 3.4959879988470426e-06, "loss": 0.86533821, "num_input_tokens_seen": 91211450, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.89453125, "step": 4226, "time_per_iteration": 2.472200393676758 }, { "auxiliary_loss_clip": 0.01132076, "auxiliary_loss_mlp": 0.01044757, "balance_loss_clip": 1.02800822, "balance_loss_mlp": 1.04446745, "epoch": 0.25414098902750637, "flos": 27599541022080.0, "grad_norm": 1.4159335392411636, "language_loss": 0.7087267, "learning_rate": 3.4957294826287164e-06, "loss": 0.73049504, "num_input_tokens_seen": 91231835, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.875, "step": 4227, "time_per_iteration": 2.5187573432922363 }, { "auxiliary_loss_clip": 0.01056203, "auxiliary_loss_mlp": 0.01005752, "balance_loss_clip": 1.00327241, "balance_loss_mlp": 1.02503896, "epoch": 0.25420111228017434, "flos": 58170834887040.0, "grad_norm": 0.9951948683182206, "language_loss": 0.61819851, "learning_rate": 3.4954709096921785e-06, "loss": 0.63881803, "num_input_tokens_seen": 91288755, "router_z_loss_clip": 0.02478027, "router_z_loss_mlp": 0.3125, "step": 4228, "time_per_iteration": 2.9509875774383545 }, { "auxiliary_loss_clip": 0.01137933, "auxiliary_loss_mlp": 0.010427, "balance_loss_clip": 1.02487826, "balance_loss_mlp": 1.04751849, "epoch": 0.2542612355328423, "flos": 11464409905920.0, "grad_norm": 2.183892333931857, "language_loss": 0.86256766, "learning_rate": 3.4952122800472336e-06, "loss": 0.88437396, "num_input_tokens_seen": 91302485, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.90625, "step": 4229, "time_per_iteration": 2.417370557785034 }, { "auxiliary_loss_clip": 0.01138538, "auxiliary_loss_mlp": 0.01042819, "balance_loss_clip": 1.02556992, "balance_loss_mlp": 1.04851246, "epoch": 0.2543213587855103, "flos": 22965879669120.0, "grad_norm": 3.572373678070532, "language_loss": 0.76819456, "learning_rate": 3.4949535937036892e-06, "loss": 0.79000813, "num_input_tokens_seen": 91321120, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8984375, "step": 4230, "time_per_iteration": 2.457958221435547 }, { "auxiliary_loss_clip": 0.01135721, "auxiliary_loss_mlp": 0.01046315, "balance_loss_clip": 1.02901745, "balance_loss_mlp": 1.04769719, "epoch": 0.2543814820381783, "flos": 18253178438400.0, "grad_norm": 1.9593046960191776, "language_loss": 0.74668038, "learning_rate": 3.4946948506713544e-06, "loss": 0.76850069, "num_input_tokens_seen": 91338575, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.87890625, "step": 4231, "time_per_iteration": 2.4661478996276855 }, { "auxiliary_loss_clip": 0.01134455, "auxiliary_loss_mlp": 0.0104023, "balance_loss_clip": 1.0243988, "balance_loss_mlp": 1.04652882, "epoch": 0.25444160529084625, "flos": 15632705629440.0, "grad_norm": 1.6098879607743364, "language_loss": 0.74198204, "learning_rate": 3.4944360509600416e-06, "loss": 0.76372886, "num_input_tokens_seen": 91357355, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.87890625, "step": 4232, "time_per_iteration": 2.4673404693603516 }, { "auxiliary_loss_clip": 0.01137004, "auxiliary_loss_mlp": 0.01041947, "balance_loss_clip": 1.02450669, "balance_loss_mlp": 1.04817772, "epoch": 0.2545017285435142, "flos": 24601610142720.0, "grad_norm": 2.0190196821357156, "language_loss": 0.86538315, "learning_rate": 3.4941771945795637e-06, "loss": 0.88717258, "num_input_tokens_seen": 91376515, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.890625, "step": 4233, "time_per_iteration": 2.5025646686553955 }, { "auxiliary_loss_clip": 0.01128988, "auxiliary_loss_mlp": 0.01037746, "balance_loss_clip": 1.02249932, "balance_loss_mlp": 1.04527724, "epoch": 0.2545618517961822, "flos": 24679069822080.0, "grad_norm": 1.6354068501719654, "language_loss": 0.74583292, "learning_rate": 3.493918281539737e-06, "loss": 0.76750028, "num_input_tokens_seen": 91397595, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8359375, "step": 4234, "time_per_iteration": 2.5048000812530518 }, { "auxiliary_loss_clip": 0.01136507, "auxiliary_loss_mlp": 0.01040265, "balance_loss_clip": 1.0245887, "balance_loss_mlp": 1.0463773, "epoch": 0.25462197504885015, "flos": 23915106432000.0, "grad_norm": 1.601846199283511, "language_loss": 0.74560195, "learning_rate": 3.493659311850379e-06, "loss": 0.76736969, "num_input_tokens_seen": 91417775, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8984375, "step": 4235, "time_per_iteration": 2.483748435974121 }, { "auxiliary_loss_clip": 0.01144624, "auxiliary_loss_mlp": 0.0104397, "balance_loss_clip": 1.02416945, "balance_loss_mlp": 1.04824018, "epoch": 0.2546820983015181, "flos": 24789387467520.0, "grad_norm": 1.8492354655613923, "language_loss": 0.65181762, "learning_rate": 3.4934002855213106e-06, "loss": 0.67370355, "num_input_tokens_seen": 91437665, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.9609375, "step": 4236, "time_per_iteration": 2.475658416748047 }, { "auxiliary_loss_clip": 0.01133817, "auxiliary_loss_mlp": 0.01035151, "balance_loss_clip": 1.0199523, "balance_loss_mlp": 1.04667091, "epoch": 0.2547422215541861, "flos": 18734130570240.0, "grad_norm": 2.0799296297474923, "language_loss": 0.66636449, "learning_rate": 3.493141202562354e-06, "loss": 0.6880542, "num_input_tokens_seen": 91456705, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.875, "step": 4237, "time_per_iteration": 2.4438273906707764 }, { "auxiliary_loss_clip": 0.01137352, "auxiliary_loss_mlp": 0.01048271, "balance_loss_clip": 1.03167677, "balance_loss_mlp": 1.04753327, "epoch": 0.25480234480685404, "flos": 21032449274880.0, "grad_norm": 2.103237877249742, "language_loss": 0.75552005, "learning_rate": 3.492882062983333e-06, "loss": 0.77737629, "num_input_tokens_seen": 91475535, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8984375, "step": 4238, "time_per_iteration": 2.4849650859832764 }, { "auxiliary_loss_clip": 0.01139899, "auxiliary_loss_mlp": 0.01045377, "balance_loss_clip": 1.02739978, "balance_loss_mlp": 1.04998159, "epoch": 0.254862468059522, "flos": 25082167224960.0, "grad_norm": 1.8504438047526204, "language_loss": 0.80615807, "learning_rate": 3.492622866794074e-06, "loss": 0.8280108, "num_input_tokens_seen": 91499140, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.8984375, "step": 4239, "time_per_iteration": 2.51130747795105 }, { "auxiliary_loss_clip": 0.01135368, "auxiliary_loss_mlp": 0.01041671, "balance_loss_clip": 1.02434969, "balance_loss_mlp": 1.04819679, "epoch": 0.25492259131219, "flos": 20558392554240.0, "grad_norm": 1.673732674111367, "language_loss": 0.76899618, "learning_rate": 3.492363614004407e-06, "loss": 0.79076654, "num_input_tokens_seen": 91518335, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.87109375, "step": 4240, "time_per_iteration": 2.463571071624756 }, { "auxiliary_loss_clip": 0.01140483, "auxiliary_loss_mlp": 0.01037106, "balance_loss_clip": 1.0188427, "balance_loss_mlp": 1.046628, "epoch": 0.25498271456485794, "flos": 25042485674880.0, "grad_norm": 1.8979920965358281, "language_loss": 0.83587229, "learning_rate": 3.492104304624162e-06, "loss": 0.85764813, "num_input_tokens_seen": 91537655, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.9375, "step": 4241, "time_per_iteration": 2.484553337097168 }, { "auxiliary_loss_clip": 0.01135811, "auxiliary_loss_mlp": 0.0104233, "balance_loss_clip": 1.02533102, "balance_loss_mlp": 1.04639363, "epoch": 0.2550428378175259, "flos": 26178412354560.0, "grad_norm": 2.0982359015395526, "language_loss": 0.73478484, "learning_rate": 3.4918449386631725e-06, "loss": 0.75656629, "num_input_tokens_seen": 91557545, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.89453125, "step": 4242, "time_per_iteration": 2.5224456787109375 }, { "auxiliary_loss_clip": 0.01137722, "auxiliary_loss_mlp": 0.01037703, "balance_loss_clip": 1.02128792, "balance_loss_mlp": 1.04811943, "epoch": 0.2551029610701939, "flos": 15267170874240.0, "grad_norm": 2.5056840654881714, "language_loss": 0.7301774, "learning_rate": 3.491585516131273e-06, "loss": 0.75193167, "num_input_tokens_seen": 91574405, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.89453125, "step": 4243, "time_per_iteration": 2.424377202987671 }, { "auxiliary_loss_clip": 0.01137341, "auxiliary_loss_mlp": 0.01044327, "balance_loss_clip": 1.02689862, "balance_loss_mlp": 1.04797816, "epoch": 0.2551630843228619, "flos": 18112193556480.0, "grad_norm": 2.0622365826761957, "language_loss": 0.8177439, "learning_rate": 3.491326037038301e-06, "loss": 0.83956063, "num_input_tokens_seen": 91593755, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.89453125, "step": 4244, "time_per_iteration": 2.4709055423736572 }, { "auxiliary_loss_clip": 0.01062896, "auxiliary_loss_mlp": 0.0100328, "balance_loss_clip": 1.00043058, "balance_loss_mlp": 1.02962041, "epoch": 0.25522320757552985, "flos": 70520192167680.0, "grad_norm": 0.6932746418505709, "language_loss": 0.57772487, "learning_rate": 3.4910665013940967e-06, "loss": 0.59838659, "num_input_tokens_seen": 91660335, "router_z_loss_clip": 0.02844238, "router_z_loss_mlp": 0.33203125, "step": 4245, "time_per_iteration": 3.1797287464141846 }, { "auxiliary_loss_clip": 0.01135268, "auxiliary_loss_mlp": 0.01046856, "balance_loss_clip": 1.02991664, "balance_loss_mlp": 1.04513907, "epoch": 0.2552833308281978, "flos": 22893088757760.0, "grad_norm": 2.1392805170611275, "language_loss": 0.65342319, "learning_rate": 3.4908069092085015e-06, "loss": 0.67524445, "num_input_tokens_seen": 91678500, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.90234375, "step": 4246, "time_per_iteration": 2.460303544998169 }, { "auxiliary_loss_clip": 0.01130415, "auxiliary_loss_mlp": 0.01040647, "balance_loss_clip": 1.02512622, "balance_loss_mlp": 1.04657865, "epoch": 0.2553434540808658, "flos": 22053605022720.0, "grad_norm": 1.6791292835354428, "language_loss": 0.81431711, "learning_rate": 3.4905472604913585e-06, "loss": 0.83602768, "num_input_tokens_seen": 91696430, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8359375, "step": 4247, "time_per_iteration": 2.4865610599517822 }, { "auxiliary_loss_clip": 0.01141534, "auxiliary_loss_mlp": 0.01046135, "balance_loss_clip": 1.02727616, "balance_loss_mlp": 1.04551506, "epoch": 0.25540357733353375, "flos": 16544190176640.0, "grad_norm": 2.132513464251772, "language_loss": 0.83145237, "learning_rate": 3.490287555252514e-06, "loss": 0.853329, "num_input_tokens_seen": 91713270, "router_z_loss_clip": 0.18847656, "router_z_loss_mlp": 0.9609375, "step": 4248, "time_per_iteration": 2.425347089767456 }, { "auxiliary_loss_clip": 0.01137134, "auxiliary_loss_mlp": 0.0104548, "balance_loss_clip": 1.02870691, "balance_loss_mlp": 1.04752922, "epoch": 0.2554637005862017, "flos": 17565022702080.0, "grad_norm": 1.8644825409418084, "language_loss": 0.84337103, "learning_rate": 3.4900277935018166e-06, "loss": 0.86519718, "num_input_tokens_seen": 91728865, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8984375, "step": 4249, "time_per_iteration": 2.4509646892547607 }, { "auxiliary_loss_clip": 0.0105878, "auxiliary_loss_mlp": 0.01006177, "balance_loss_clip": 1.00348306, "balance_loss_mlp": 1.02613378, "epoch": 0.2555238238388697, "flos": 72244763953920.0, "grad_norm": 2.4383107167715954, "language_loss": 0.56350696, "learning_rate": 3.489767975249115e-06, "loss": 0.58415651, "num_input_tokens_seen": 91787470, "router_z_loss_clip": 0.02697754, "router_z_loss_mlp": 0.328125, "step": 4250, "time_per_iteration": 3.0712504386901855 }, { "auxiliary_loss_clip": 0.01135436, "auxiliary_loss_mlp": 0.0103962, "balance_loss_clip": 1.02154803, "balance_loss_mlp": 1.04505157, "epoch": 0.25558394709153764, "flos": 24389414547840.0, "grad_norm": 1.830347828400091, "language_loss": 0.80729854, "learning_rate": 3.4895081005042632e-06, "loss": 0.82904911, "num_input_tokens_seen": 91805640, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.90625, "step": 4251, "time_per_iteration": 2.510310649871826 }, { "auxiliary_loss_clip": 0.01058375, "auxiliary_loss_mlp": 0.01004944, "balance_loss_clip": 1.0021069, "balance_loss_mlp": 1.02526999, "epoch": 0.2556440703442056, "flos": 69231213636480.0, "grad_norm": 0.7948629884145384, "language_loss": 0.66157281, "learning_rate": 3.4892481692771146e-06, "loss": 0.68220603, "num_input_tokens_seen": 91869695, "router_z_loss_clip": 0.02832031, "router_z_loss_mlp": 0.33203125, "step": 4252, "time_per_iteration": 3.1356353759765625 }, { "auxiliary_loss_clip": 0.01132991, "auxiliary_loss_mlp": 0.0103425, "balance_loss_clip": 1.01899123, "balance_loss_mlp": 1.04605126, "epoch": 0.2557041935968736, "flos": 24863902231680.0, "grad_norm": 1.9664496917157692, "language_loss": 0.73673952, "learning_rate": 3.4889881815775267e-06, "loss": 0.75841194, "num_input_tokens_seen": 91889920, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8671875, "step": 4253, "time_per_iteration": 2.5172605514526367 }, { "auxiliary_loss_clip": 0.01135086, "auxiliary_loss_mlp": 0.01043679, "balance_loss_clip": 1.02740693, "balance_loss_mlp": 1.04672408, "epoch": 0.25576431684954154, "flos": 22492110257280.0, "grad_norm": 2.130779566459009, "language_loss": 0.72305417, "learning_rate": 3.488728137415357e-06, "loss": 0.74484181, "num_input_tokens_seen": 91908665, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.88671875, "step": 4254, "time_per_iteration": 2.453979730606079 }, { "auxiliary_loss_clip": 0.01135383, "auxiliary_loss_mlp": 0.01045618, "balance_loss_clip": 1.02861881, "balance_loss_mlp": 1.04652297, "epoch": 0.2558244401022095, "flos": 19826748426240.0, "grad_norm": 1.7157368860575104, "language_loss": 0.8121525, "learning_rate": 3.4884680368004675e-06, "loss": 0.83396256, "num_input_tokens_seen": 91927855, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.88671875, "step": 4255, "time_per_iteration": 2.4901280403137207 }, { "auxiliary_loss_clip": 0.01135842, "auxiliary_loss_mlp": 0.01040043, "balance_loss_clip": 1.02334142, "balance_loss_mlp": 1.04846764, "epoch": 0.2558845633548775, "flos": 23220486247680.0, "grad_norm": 1.492684052403872, "language_loss": 0.85212147, "learning_rate": 3.488207879742721e-06, "loss": 0.87388027, "num_input_tokens_seen": 91948500, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.875, "step": 4256, "time_per_iteration": 2.4608049392700195 }, { "auxiliary_loss_clip": 0.01139283, "auxiliary_loss_mlp": 0.01044864, "balance_loss_clip": 1.02697074, "balance_loss_mlp": 1.04739904, "epoch": 0.2559446866075455, "flos": 16837867774080.0, "grad_norm": 1.613033721984935, "language_loss": 0.75046527, "learning_rate": 3.4879476662519826e-06, "loss": 0.77230674, "num_input_tokens_seen": 91968375, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.91796875, "step": 4257, "time_per_iteration": 2.4668900966644287 }, { "auxiliary_loss_clip": 0.01054902, "auxiliary_loss_mlp": 0.0100183, "balance_loss_clip": 0.99918383, "balance_loss_mlp": 1.02336216, "epoch": 0.25600480986021346, "flos": 57593786895360.0, "grad_norm": 0.8476811476043847, "language_loss": 0.65326786, "learning_rate": 3.4876873963381196e-06, "loss": 0.67383516, "num_input_tokens_seen": 92028490, "router_z_loss_clip": 0.02648926, "router_z_loss_mlp": 0.31640625, "step": 4258, "time_per_iteration": 3.021071672439575 }, { "auxiliary_loss_clip": 0.01133085, "auxiliary_loss_mlp": 0.0103543, "balance_loss_clip": 1.0185616, "balance_loss_mlp": 1.04771686, "epoch": 0.2560649331128814, "flos": 27819529868160.0, "grad_norm": 2.33779245240124, "language_loss": 0.7649889, "learning_rate": 3.4874270700110013e-06, "loss": 0.78667402, "num_input_tokens_seen": 92048060, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8515625, "step": 4259, "time_per_iteration": 4.0569939613342285 }, { "auxiliary_loss_clip": 0.01055371, "auxiliary_loss_mlp": 0.01003344, "balance_loss_clip": 1.00087631, "balance_loss_mlp": 1.0236671, "epoch": 0.2561250563655494, "flos": 70950509101440.0, "grad_norm": 0.8508183057932225, "language_loss": 0.58509767, "learning_rate": 3.4871666872804994e-06, "loss": 0.60568482, "num_input_tokens_seen": 92118180, "router_z_loss_clip": 0.0246582, "router_z_loss_mlp": 0.31640625, "step": 4260, "time_per_iteration": 3.187411069869995 }, { "auxiliary_loss_clip": 0.01135288, "auxiliary_loss_mlp": 0.01042057, "balance_loss_clip": 1.02481914, "balance_loss_mlp": 1.0455811, "epoch": 0.25618517961821735, "flos": 27012329481600.0, "grad_norm": 2.136890547841966, "language_loss": 0.7650162, "learning_rate": 3.4869062481564875e-06, "loss": 0.78678966, "num_input_tokens_seen": 92137570, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8984375, "step": 4261, "time_per_iteration": 2.501803159713745 }, { "auxiliary_loss_clip": 0.01134, "auxiliary_loss_mlp": 0.01039349, "balance_loss_clip": 1.02404225, "balance_loss_mlp": 1.04707861, "epoch": 0.2562453028708853, "flos": 23068296322560.0, "grad_norm": 1.5724315535828246, "language_loss": 0.82934326, "learning_rate": 3.486645752648842e-06, "loss": 0.85107672, "num_input_tokens_seen": 92157625, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8671875, "step": 4262, "time_per_iteration": 3.8621058464050293 }, { "auxiliary_loss_clip": 0.01139431, "auxiliary_loss_mlp": 0.01045063, "balance_loss_clip": 1.0275389, "balance_loss_mlp": 1.04591799, "epoch": 0.2563054261235533, "flos": 15120942606720.0, "grad_norm": 2.369098557069217, "language_loss": 0.73770559, "learning_rate": 3.4863852007674405e-06, "loss": 0.75955045, "num_input_tokens_seen": 92175350, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.9375, "step": 4263, "time_per_iteration": 3.8181657791137695 }, { "auxiliary_loss_clip": 0.01134277, "auxiliary_loss_mlp": 0.01052523, "balance_loss_clip": 1.03578615, "balance_loss_mlp": 1.04826665, "epoch": 0.25636554937622125, "flos": 27854865872640.0, "grad_norm": 1.7049896879676416, "language_loss": 0.82727277, "learning_rate": 3.486124592522163e-06, "loss": 0.84914076, "num_input_tokens_seen": 92196070, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.859375, "step": 4264, "time_per_iteration": 4.003083229064941 }, { "auxiliary_loss_clip": 0.01137909, "auxiliary_loss_mlp": 0.01044494, "balance_loss_clip": 1.02724457, "balance_loss_mlp": 1.04895675, "epoch": 0.2564256726288892, "flos": 28906509288960.0, "grad_norm": 1.8748138981506162, "language_loss": 0.745974, "learning_rate": 3.4858639279228924e-06, "loss": 0.76779807, "num_input_tokens_seen": 92216310, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.890625, "step": 4265, "time_per_iteration": 2.5543737411499023 }, { "auxiliary_loss_clip": 0.01132291, "auxiliary_loss_mlp": 0.01038499, "balance_loss_clip": 1.02242935, "balance_loss_mlp": 1.04411197, "epoch": 0.2564857958815572, "flos": 18514931823360.0, "grad_norm": 1.6989385616284476, "language_loss": 0.81559753, "learning_rate": 3.485603206979513e-06, "loss": 0.83730537, "num_input_tokens_seen": 92234510, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8828125, "step": 4266, "time_per_iteration": 2.4520280361175537 }, { "auxiliary_loss_clip": 0.01130671, "auxiliary_loss_mlp": 0.01036648, "balance_loss_clip": 1.02023268, "balance_loss_mlp": 1.04496801, "epoch": 0.25654591913422514, "flos": 25808280658560.0, "grad_norm": 1.819529002477698, "language_loss": 0.7912935, "learning_rate": 3.4853424297019103e-06, "loss": 0.8129667, "num_input_tokens_seen": 92254070, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.85546875, "step": 4267, "time_per_iteration": 2.4838955402374268 }, { "auxiliary_loss_clip": 0.01129619, "auxiliary_loss_mlp": 0.01043319, "balance_loss_clip": 1.02673686, "balance_loss_mlp": 1.04548597, "epoch": 0.2566060423868931, "flos": 19099665325440.0, "grad_norm": 1.6818787845094527, "language_loss": 0.79083562, "learning_rate": 3.4850815960999736e-06, "loss": 0.81256497, "num_input_tokens_seen": 92275060, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.84375, "step": 4268, "time_per_iteration": 2.4785845279693604 }, { "auxiliary_loss_clip": 0.01135385, "auxiliary_loss_mlp": 0.01042152, "balance_loss_clip": 1.02636862, "balance_loss_mlp": 1.04756904, "epoch": 0.25666616563956113, "flos": 23842674656640.0, "grad_norm": 1.536640753243411, "language_loss": 0.67991567, "learning_rate": 3.484820706183595e-06, "loss": 0.70169103, "num_input_tokens_seen": 92293610, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.87890625, "step": 4269, "time_per_iteration": 2.488882541656494 }, { "auxiliary_loss_clip": 0.01135864, "auxiliary_loss_mlp": 0.01043008, "balance_loss_clip": 1.02583003, "balance_loss_mlp": 1.04704165, "epoch": 0.2567262888922291, "flos": 14604259420800.0, "grad_norm": 4.375279923508937, "language_loss": 0.79298359, "learning_rate": 3.484559759962666e-06, "loss": 0.81477231, "num_input_tokens_seen": 92308305, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.88671875, "step": 4270, "time_per_iteration": 2.4323253631591797 }, { "auxiliary_loss_clip": 0.01141173, "auxiliary_loss_mlp": 0.01040908, "balance_loss_clip": 1.02182245, "balance_loss_mlp": 1.04765499, "epoch": 0.25678641214489706, "flos": 32923117877760.0, "grad_norm": 1.924829213139971, "language_loss": 0.67804897, "learning_rate": 3.4842987574470816e-06, "loss": 0.69986975, "num_input_tokens_seen": 92329875, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.9375, "step": 4271, "time_per_iteration": 2.5441174507141113 }, { "auxiliary_loss_clip": 0.01136267, "auxiliary_loss_mlp": 0.01045188, "balance_loss_clip": 1.02740145, "balance_loss_mlp": 1.04524159, "epoch": 0.256846535397565, "flos": 24098933260800.0, "grad_norm": 1.4021856584798544, "language_loss": 0.87518334, "learning_rate": 3.4840376986467403e-06, "loss": 0.89699793, "num_input_tokens_seen": 92348780, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.9140625, "step": 4272, "time_per_iteration": 2.4871394634246826 }, { "auxiliary_loss_clip": 0.01137369, "auxiliary_loss_mlp": 0.01048591, "balance_loss_clip": 1.03125834, "balance_loss_mlp": 1.04731131, "epoch": 0.256906658650233, "flos": 19718441942400.0, "grad_norm": 1.7438206426915808, "language_loss": 0.81526935, "learning_rate": 3.483776583571541e-06, "loss": 0.83712894, "num_input_tokens_seen": 92368175, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.90234375, "step": 4273, "time_per_iteration": 2.4389808177948 }, { "auxiliary_loss_clip": 0.01130518, "auxiliary_loss_mlp": 0.01037662, "balance_loss_clip": 1.0213418, "balance_loss_mlp": 1.04648948, "epoch": 0.25696678190290095, "flos": 22926018551040.0, "grad_norm": 1.5280044014747844, "language_loss": 0.77509928, "learning_rate": 3.4835154122313846e-06, "loss": 0.79678106, "num_input_tokens_seen": 92387755, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.83984375, "step": 4274, "time_per_iteration": 2.4776999950408936 }, { "auxiliary_loss_clip": 0.01126859, "auxiliary_loss_mlp": 0.01035443, "balance_loss_clip": 1.01870561, "balance_loss_mlp": 1.04222298, "epoch": 0.2570269051555689, "flos": 27307838672640.0, "grad_norm": 1.9838227675327178, "language_loss": 0.8411442, "learning_rate": 3.4832541846361743e-06, "loss": 0.86276722, "num_input_tokens_seen": 92409850, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.84375, "step": 4275, "time_per_iteration": 2.5042099952697754 }, { "auxiliary_loss_clip": 0.0113439, "auxiliary_loss_mlp": 0.01038005, "balance_loss_clip": 1.02089834, "balance_loss_mlp": 1.04564142, "epoch": 0.2570870284082369, "flos": 27563414918400.0, "grad_norm": 2.3763590356810065, "language_loss": 0.78488755, "learning_rate": 3.4829929007958175e-06, "loss": 0.80661154, "num_input_tokens_seen": 92431250, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.88671875, "step": 4276, "time_per_iteration": 2.53149676322937 }, { "auxiliary_loss_clip": 0.01129966, "auxiliary_loss_mlp": 0.01038342, "balance_loss_clip": 1.02221262, "balance_loss_mlp": 1.04381704, "epoch": 0.25714715166090485, "flos": 28730834847360.0, "grad_norm": 2.012450835994405, "language_loss": 0.79562843, "learning_rate": 3.4827315607202214e-06, "loss": 0.81731147, "num_input_tokens_seen": 92452065, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.86328125, "step": 4277, "time_per_iteration": 2.563547372817993 }, { "auxiliary_loss_clip": 0.01132599, "auxiliary_loss_mlp": 0.01035618, "balance_loss_clip": 1.02027559, "balance_loss_mlp": 1.04521585, "epoch": 0.2572072749135728, "flos": 20116152305280.0, "grad_norm": 2.1371931467654695, "language_loss": 0.78866601, "learning_rate": 3.482470164419295e-06, "loss": 0.81034821, "num_input_tokens_seen": 92470025, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.87109375, "step": 4278, "time_per_iteration": 2.4560399055480957 }, { "auxiliary_loss_clip": 0.01136675, "auxiliary_loss_mlp": 0.01040166, "balance_loss_clip": 1.02411413, "balance_loss_mlp": 1.04707861, "epoch": 0.2572673981662408, "flos": 26030855283840.0, "grad_norm": 2.027621236867215, "language_loss": 0.74841547, "learning_rate": 3.482208711902952e-06, "loss": 0.77018386, "num_input_tokens_seen": 92489825, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8984375, "step": 4279, "time_per_iteration": 2.4732413291931152 }, { "auxiliary_loss_clip": 0.01132767, "auxiliary_loss_mlp": 0.01044969, "balance_loss_clip": 1.0283277, "balance_loss_mlp": 1.04373288, "epoch": 0.25732752141890874, "flos": 16106618695680.0, "grad_norm": 2.1548215560240624, "language_loss": 0.8528375, "learning_rate": 3.4819472031811065e-06, "loss": 0.87461483, "num_input_tokens_seen": 92507270, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.890625, "step": 4280, "time_per_iteration": 2.458425998687744 }, { "auxiliary_loss_clip": 0.0113403, "auxiliary_loss_mlp": 0.0103621, "balance_loss_clip": 1.01925814, "balance_loss_mlp": 1.04504454, "epoch": 0.2573876446715767, "flos": 22524429519360.0, "grad_norm": 2.5462788969277774, "language_loss": 0.79229486, "learning_rate": 3.4816856382636744e-06, "loss": 0.81399715, "num_input_tokens_seen": 92526300, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.890625, "step": 4281, "time_per_iteration": 2.4547972679138184 }, { "auxiliary_loss_clip": 0.01133308, "auxiliary_loss_mlp": 0.01039159, "balance_loss_clip": 1.0227977, "balance_loss_mlp": 1.04589701, "epoch": 0.2574477679242447, "flos": 23950837486080.0, "grad_norm": 1.7492453141727493, "language_loss": 0.87096459, "learning_rate": 3.4814240171605737e-06, "loss": 0.89268923, "num_input_tokens_seen": 92546465, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.875, "step": 4282, "time_per_iteration": 2.5159707069396973 }, { "auxiliary_loss_clip": 0.01134929, "auxiliary_loss_mlp": 0.01043356, "balance_loss_clip": 1.02756095, "balance_loss_mlp": 1.04576659, "epoch": 0.2575078911769127, "flos": 21981711951360.0, "grad_norm": 1.8725946837636442, "language_loss": 0.70466459, "learning_rate": 3.4811623398817267e-06, "loss": 0.72644746, "num_input_tokens_seen": 92567260, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.890625, "step": 4283, "time_per_iteration": 2.470532178878784 }, { "auxiliary_loss_clip": 0.01129831, "auxiliary_loss_mlp": 0.01041244, "balance_loss_clip": 1.02566385, "balance_loss_mlp": 1.04696465, "epoch": 0.25756801442958066, "flos": 21945406279680.0, "grad_norm": 1.751939536048035, "language_loss": 0.80130631, "learning_rate": 3.4809006064370553e-06, "loss": 0.823017, "num_input_tokens_seen": 92585425, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.828125, "step": 4284, "time_per_iteration": 2.495485305786133 }, { "auxiliary_loss_clip": 0.011327, "auxiliary_loss_mlp": 0.01041655, "balance_loss_clip": 1.0267067, "balance_loss_mlp": 1.04573846, "epoch": 0.2576281376822486, "flos": 35261980058880.0, "grad_norm": 2.1831331134324237, "language_loss": 0.70507252, "learning_rate": 3.4806388168364835e-06, "loss": 0.72681606, "num_input_tokens_seen": 92604770, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.87109375, "step": 4285, "time_per_iteration": 2.5675129890441895 }, { "auxiliary_loss_clip": 0.01134985, "auxiliary_loss_mlp": 0.01041475, "balance_loss_clip": 1.02604914, "balance_loss_mlp": 1.04825842, "epoch": 0.2576882609349166, "flos": 14132285688960.0, "grad_norm": 2.3283513469079264, "language_loss": 0.58626491, "learning_rate": 3.4803769710899402e-06, "loss": 0.60802948, "num_input_tokens_seen": 92622635, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8671875, "step": 4286, "time_per_iteration": 2.448521375656128 }, { "auxiliary_loss_clip": 0.01137652, "auxiliary_loss_mlp": 0.01046945, "balance_loss_clip": 1.03041053, "balance_loss_mlp": 1.047786, "epoch": 0.25774838418758456, "flos": 23258336204160.0, "grad_norm": 1.7706328248485288, "language_loss": 0.64002293, "learning_rate": 3.480115069207354e-06, "loss": 0.66186887, "num_input_tokens_seen": 92642960, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8984375, "step": 4287, "time_per_iteration": 2.46766996383667 }, { "auxiliary_loss_clip": 0.01134839, "auxiliary_loss_mlp": 0.01037596, "balance_loss_clip": 1.02078712, "balance_loss_mlp": 1.04419923, "epoch": 0.2578085074402525, "flos": 22601745544320.0, "grad_norm": 1.9084685124299732, "language_loss": 0.71812785, "learning_rate": 3.4798531111986557e-06, "loss": 0.73985219, "num_input_tokens_seen": 92662455, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.90625, "step": 4288, "time_per_iteration": 2.5148439407348633 }, { "auxiliary_loss_clip": 0.01130017, "auxiliary_loss_mlp": 0.01035049, "balance_loss_clip": 1.01955795, "balance_loss_mlp": 1.04507995, "epoch": 0.2578686306929205, "flos": 24571840746240.0, "grad_norm": 1.502597285515724, "language_loss": 0.76982141, "learning_rate": 3.4795910970737786e-06, "loss": 0.79147208, "num_input_tokens_seen": 92683520, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.84765625, "step": 4289, "time_per_iteration": 2.5107741355895996 }, { "auxiliary_loss_clip": 0.01132714, "auxiliary_loss_mlp": 0.01039761, "balance_loss_clip": 1.02309597, "balance_loss_mlp": 1.04514682, "epoch": 0.25792875394558845, "flos": 18113953322880.0, "grad_norm": 2.0369520516344677, "language_loss": 0.85463005, "learning_rate": 3.4793290268426592e-06, "loss": 0.87635481, "num_input_tokens_seen": 92701450, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.875, "step": 4290, "time_per_iteration": 2.461958885192871 }, { "auxiliary_loss_clip": 0.01134275, "auxiliary_loss_mlp": 0.0104496, "balance_loss_clip": 1.02701879, "balance_loss_mlp": 1.04507387, "epoch": 0.2579888771982564, "flos": 17712902995200.0, "grad_norm": 1.924979463577269, "language_loss": 0.72202563, "learning_rate": 3.4790669005152354e-06, "loss": 0.74381804, "num_input_tokens_seen": 92720355, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.890625, "step": 4291, "time_per_iteration": 2.4278018474578857 }, { "auxiliary_loss_clip": 0.01135541, "auxiliary_loss_mlp": 0.01036604, "balance_loss_clip": 1.01903296, "balance_loss_mlp": 1.04531217, "epoch": 0.2580490004509244, "flos": 16434878112000.0, "grad_norm": 3.1068899944259507, "language_loss": 0.81210399, "learning_rate": 3.4788047181014458e-06, "loss": 0.83382541, "num_input_tokens_seen": 92736755, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.8984375, "step": 4292, "time_per_iteration": 2.438563108444214 }, { "auxiliary_loss_clip": 0.01136564, "auxiliary_loss_mlp": 0.01043022, "balance_loss_clip": 1.02620196, "balance_loss_mlp": 1.04799855, "epoch": 0.25810912370359235, "flos": 33835141128960.0, "grad_norm": 1.855062650717093, "language_loss": 0.66972661, "learning_rate": 3.4785424796112337e-06, "loss": 0.69152242, "num_input_tokens_seen": 92757655, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.88671875, "step": 4293, "time_per_iteration": 2.5690979957580566 }, { "auxiliary_loss_clip": 0.01129519, "auxiliary_loss_mlp": 0.01040664, "balance_loss_clip": 1.02529788, "balance_loss_mlp": 1.04576516, "epoch": 0.2581692469562603, "flos": 25192197561600.0, "grad_norm": 1.8807730214662832, "language_loss": 0.75447571, "learning_rate": 3.478280185054542e-06, "loss": 0.77617759, "num_input_tokens_seen": 92776100, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8359375, "step": 4294, "time_per_iteration": 2.4974591732025146 }, { "auxiliary_loss_clip": 0.0113187, "auxiliary_loss_mlp": 0.01043433, "balance_loss_clip": 1.02661228, "balance_loss_mlp": 1.04451776, "epoch": 0.2582293702089283, "flos": 34932212271360.0, "grad_norm": 2.006076830528506, "language_loss": 0.80793488, "learning_rate": 3.478017834441318e-06, "loss": 0.82968789, "num_input_tokens_seen": 92798880, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.875, "step": 4295, "time_per_iteration": 2.5758984088897705 }, { "auxiliary_loss_clip": 0.01138216, "auxiliary_loss_mlp": 0.01046112, "balance_loss_clip": 1.02925587, "balance_loss_mlp": 1.04725373, "epoch": 0.2582894934615963, "flos": 26833746038400.0, "grad_norm": 1.7819490011210475, "language_loss": 0.72633302, "learning_rate": 3.4777554277815096e-06, "loss": 0.74817634, "num_input_tokens_seen": 92817750, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.91015625, "step": 4296, "time_per_iteration": 2.5128419399261475 }, { "auxiliary_loss_clip": 0.01136542, "auxiliary_loss_mlp": 0.0103771, "balance_loss_clip": 1.02091289, "balance_loss_mlp": 1.04786408, "epoch": 0.25834961671426426, "flos": 23515241253120.0, "grad_norm": 1.666221793227512, "language_loss": 0.86407053, "learning_rate": 3.477492965085067e-06, "loss": 0.885813, "num_input_tokens_seen": 92837995, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.88671875, "step": 4297, "time_per_iteration": 2.4850189685821533 }, { "auxiliary_loss_clip": 0.0113415, "auxiliary_loss_mlp": 0.0104824, "balance_loss_clip": 1.03310657, "balance_loss_mlp": 1.04635167, "epoch": 0.25840973996693223, "flos": 22451028076800.0, "grad_norm": 1.747703843573575, "language_loss": 0.8445121, "learning_rate": 3.477230446361943e-06, "loss": 0.86633599, "num_input_tokens_seen": 92857245, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.875, "step": 4298, "time_per_iteration": 2.4790091514587402 }, { "auxiliary_loss_clip": 0.01132663, "auxiliary_loss_mlp": 0.01032738, "balance_loss_clip": 1.01625133, "balance_loss_mlp": 1.04542542, "epoch": 0.2584698632196002, "flos": 11290854366720.0, "grad_norm": 1.9601557968787575, "language_loss": 0.83499813, "learning_rate": 3.4769678716220927e-06, "loss": 0.85665214, "num_input_tokens_seen": 92873265, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.87109375, "step": 4299, "time_per_iteration": 2.4059677124023438 }, { "auxiliary_loss_clip": 0.01131151, "auxiliary_loss_mlp": 0.01036609, "balance_loss_clip": 1.02163005, "balance_loss_mlp": 1.04613113, "epoch": 0.25852998647226816, "flos": 17929982839680.0, "grad_norm": 2.292336099835401, "language_loss": 0.82974982, "learning_rate": 3.4767052408754726e-06, "loss": 0.85142744, "num_input_tokens_seen": 92890880, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.84765625, "step": 4300, "time_per_iteration": 2.450148344039917 }, { "auxiliary_loss_clip": 0.01134564, "auxiliary_loss_mlp": 0.01037653, "balance_loss_clip": 1.02147675, "balance_loss_mlp": 1.0461024, "epoch": 0.2585901097249361, "flos": 33256117889280.0, "grad_norm": 2.328152708076687, "language_loss": 0.6704179, "learning_rate": 3.4764425541320417e-06, "loss": 0.6921401, "num_input_tokens_seen": 92910770, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8828125, "step": 4301, "time_per_iteration": 4.051277160644531 }, { "auxiliary_loss_clip": 0.01137894, "auxiliary_loss_mlp": 0.01042621, "balance_loss_clip": 1.02625406, "balance_loss_mlp": 1.04657769, "epoch": 0.2586502329776041, "flos": 18441278985600.0, "grad_norm": 4.0466807168540395, "language_loss": 0.82658559, "learning_rate": 3.4761798114017617e-06, "loss": 0.84839076, "num_input_tokens_seen": 92929520, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.9140625, "step": 4302, "time_per_iteration": 2.451101303100586 }, { "auxiliary_loss_clip": 0.01135475, "auxiliary_loss_mlp": 0.0104235, "balance_loss_clip": 1.02575636, "balance_loss_mlp": 1.04748285, "epoch": 0.25871035623027205, "flos": 17968120104960.0, "grad_norm": 1.7433857910620012, "language_loss": 0.92078978, "learning_rate": 3.475917012694595e-06, "loss": 0.94256806, "num_input_tokens_seen": 92947890, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.87890625, "step": 4303, "time_per_iteration": 2.4305458068847656 }, { "auxiliary_loss_clip": 0.01136015, "auxiliary_loss_mlp": 0.01037148, "balance_loss_clip": 1.02110243, "balance_loss_mlp": 1.04861426, "epoch": 0.25877047948294, "flos": 27777729415680.0, "grad_norm": 1.6804207237871753, "language_loss": 0.67244387, "learning_rate": 3.475654158020507e-06, "loss": 0.69417548, "num_input_tokens_seen": 92967690, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.875, "step": 4304, "time_per_iteration": 3.9038987159729004 }, { "auxiliary_loss_clip": 0.01137034, "auxiliary_loss_mlp": 0.01048477, "balance_loss_clip": 1.03160882, "balance_loss_mlp": 1.04722714, "epoch": 0.258830602735608, "flos": 27125843437440.0, "grad_norm": 2.829407371533267, "language_loss": 0.72276855, "learning_rate": 3.4753912473894657e-06, "loss": 0.74462366, "num_input_tokens_seen": 92986830, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8984375, "step": 4305, "time_per_iteration": 5.322906017303467 }, { "auxiliary_loss_clip": 0.01136729, "auxiliary_loss_mlp": 0.01043876, "balance_loss_clip": 1.0265187, "balance_loss_mlp": 1.04665709, "epoch": 0.25889072598827595, "flos": 17891486438400.0, "grad_norm": 2.053060677601327, "language_loss": 0.75500274, "learning_rate": 3.4751282808114403e-06, "loss": 0.77680886, "num_input_tokens_seen": 93002740, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.90234375, "step": 4306, "time_per_iteration": 2.4541256427764893 }, { "auxiliary_loss_clip": 0.01059388, "auxiliary_loss_mlp": 0.01012948, "balance_loss_clip": 1.01081395, "balance_loss_mlp": 1.02766275, "epoch": 0.2589508492409439, "flos": 53934955724160.0, "grad_norm": 0.861288668337506, "language_loss": 0.5721271, "learning_rate": 3.474865258296403e-06, "loss": 0.59285045, "num_input_tokens_seen": 93058645, "router_z_loss_clip": 0.0213623, "router_z_loss_mlp": 0.31640625, "step": 4307, "time_per_iteration": 3.011105537414551 }, { "auxiliary_loss_clip": 0.01133549, "auxiliary_loss_mlp": 0.0103912, "balance_loss_clip": 1.0229907, "balance_loss_mlp": 1.04740167, "epoch": 0.2590109724936119, "flos": 22125785402880.0, "grad_norm": 2.1239576152913906, "language_loss": 0.72101033, "learning_rate": 3.474602179854327e-06, "loss": 0.742737, "num_input_tokens_seen": 93077140, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.859375, "step": 4308, "time_per_iteration": 2.4674124717712402 }, { "auxiliary_loss_clip": 0.01137302, "auxiliary_loss_mlp": 0.01041627, "balance_loss_clip": 1.02540255, "balance_loss_mlp": 1.04749584, "epoch": 0.2590710957462799, "flos": 13474294398720.0, "grad_norm": 1.7880302295225878, "language_loss": 0.84511191, "learning_rate": 3.4743390454951886e-06, "loss": 0.86690128, "num_input_tokens_seen": 93093580, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8984375, "step": 4309, "time_per_iteration": 2.4120516777038574 }, { "auxiliary_loss_clip": 0.01136559, "auxiliary_loss_mlp": 0.01040739, "balance_loss_clip": 1.02540898, "balance_loss_mlp": 1.04951286, "epoch": 0.25913121899894787, "flos": 22307098279680.0, "grad_norm": 1.6154129750677344, "language_loss": 0.84941947, "learning_rate": 3.474075855228966e-06, "loss": 0.87119246, "num_input_tokens_seen": 93112345, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8671875, "step": 4310, "time_per_iteration": 2.4796388149261475 }, { "auxiliary_loss_clip": 0.0114016, "auxiliary_loss_mlp": 0.01042034, "balance_loss_clip": 1.02530909, "balance_loss_mlp": 1.05036378, "epoch": 0.25919134225161583, "flos": 25811728364160.0, "grad_norm": 1.9960668151718988, "language_loss": 0.77021629, "learning_rate": 3.473812609065639e-06, "loss": 0.7920382, "num_input_tokens_seen": 93131545, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8984375, "step": 4311, "time_per_iteration": 2.4923312664031982 }, { "auxiliary_loss_clip": 0.01137746, "auxiliary_loss_mlp": 0.01039317, "balance_loss_clip": 1.02271152, "balance_loss_mlp": 1.04847002, "epoch": 0.2592514655042838, "flos": 31212262108800.0, "grad_norm": 1.7455650062302737, "language_loss": 0.72432125, "learning_rate": 3.4735493070151904e-06, "loss": 0.74609196, "num_input_tokens_seen": 93150730, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.890625, "step": 4312, "time_per_iteration": 2.556934356689453 }, { "auxiliary_loss_clip": 0.01133491, "auxiliary_loss_mlp": 0.01040496, "balance_loss_clip": 1.02453446, "balance_loss_mlp": 1.04577851, "epoch": 0.25931158875695176, "flos": 18474998878080.0, "grad_norm": 1.807141271622409, "language_loss": 0.69854987, "learning_rate": 3.4732859490876044e-06, "loss": 0.72028971, "num_input_tokens_seen": 93167895, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.875, "step": 4313, "time_per_iteration": 2.4167723655700684 }, { "auxiliary_loss_clip": 0.01133998, "auxiliary_loss_mlp": 0.01044848, "balance_loss_clip": 1.02972043, "balance_loss_mlp": 1.04727793, "epoch": 0.2593717120096197, "flos": 19207935895680.0, "grad_norm": 1.7785550788658764, "language_loss": 0.8045038, "learning_rate": 3.473022535292867e-06, "loss": 0.82629234, "num_input_tokens_seen": 93187650, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8671875, "step": 4314, "time_per_iteration": 2.486570358276367 }, { "auxiliary_loss_clip": 0.01138866, "auxiliary_loss_mlp": 0.01049098, "balance_loss_clip": 1.03096664, "balance_loss_mlp": 1.0479424, "epoch": 0.2594318352622877, "flos": 31248100903680.0, "grad_norm": 2.929451098374261, "language_loss": 0.67458224, "learning_rate": 3.472759065640968e-06, "loss": 0.69646192, "num_input_tokens_seen": 93207370, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.91015625, "step": 4315, "time_per_iteration": 2.5643815994262695 }, { "auxiliary_loss_clip": 0.01132185, "auxiliary_loss_mlp": 0.01039167, "balance_loss_clip": 1.02389681, "balance_loss_mlp": 1.04536295, "epoch": 0.25949195851495566, "flos": 22237144542720.0, "grad_norm": 1.8668570401564877, "language_loss": 0.7972461, "learning_rate": 3.4724955401418976e-06, "loss": 0.81895959, "num_input_tokens_seen": 93227925, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8671875, "step": 4316, "time_per_iteration": 2.4815282821655273 }, { "auxiliary_loss_clip": 0.01135377, "auxiliary_loss_mlp": 0.01039622, "balance_loss_clip": 1.0230763, "balance_loss_mlp": 1.04605103, "epoch": 0.2595520817676236, "flos": 28075716645120.0, "grad_norm": 2.031442456971999, "language_loss": 0.77555966, "learning_rate": 3.4722319588056487e-06, "loss": 0.7973097, "num_input_tokens_seen": 93250020, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.890625, "step": 4317, "time_per_iteration": 2.5453474521636963 }, { "auxiliary_loss_clip": 0.01139693, "auxiliary_loss_mlp": 0.01049346, "balance_loss_clip": 1.03215575, "balance_loss_mlp": 1.05085802, "epoch": 0.2596122050202916, "flos": 20190954378240.0, "grad_norm": 2.2516043329072395, "language_loss": 0.77896023, "learning_rate": 3.4719683216422163e-06, "loss": 0.80085063, "num_input_tokens_seen": 93269070, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.88671875, "step": 4318, "time_per_iteration": 2.4778666496276855 }, { "auxiliary_loss_clip": 0.01133256, "auxiliary_loss_mlp": 0.01038927, "balance_loss_clip": 1.0215342, "balance_loss_mlp": 1.04585278, "epoch": 0.25967232827295955, "flos": 22527949052160.0, "grad_norm": 1.8111615898387874, "language_loss": 0.76416284, "learning_rate": 3.471704628661598e-06, "loss": 0.78588474, "num_input_tokens_seen": 93290250, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.875, "step": 4319, "time_per_iteration": 2.4874022006988525 }, { "auxiliary_loss_clip": 0.01134192, "auxiliary_loss_mlp": 0.01037896, "balance_loss_clip": 1.02162433, "balance_loss_mlp": 1.04837704, "epoch": 0.2597324515256275, "flos": 21068252156160.0, "grad_norm": 1.8612443561335603, "language_loss": 0.76610458, "learning_rate": 3.4714408798737925e-06, "loss": 0.78782547, "num_input_tokens_seen": 93310090, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.859375, "step": 4320, "time_per_iteration": 2.5127360820770264 }, { "auxiliary_loss_clip": 0.011381, "auxiliary_loss_mlp": 0.01036123, "balance_loss_clip": 1.01936221, "balance_loss_mlp": 1.04941034, "epoch": 0.2597925747782955, "flos": 22050013662720.0, "grad_norm": 1.4259979678362762, "language_loss": 0.71019328, "learning_rate": 3.471177075288801e-06, "loss": 0.73193556, "num_input_tokens_seen": 93329570, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.88671875, "step": 4321, "time_per_iteration": 2.4645566940307617 }, { "auxiliary_loss_clip": 0.01141332, "auxiliary_loss_mlp": 0.01045502, "balance_loss_clip": 1.02806163, "balance_loss_mlp": 1.04945862, "epoch": 0.2598526980309635, "flos": 19536949497600.0, "grad_norm": 2.132229335963818, "language_loss": 0.74961203, "learning_rate": 3.4709132149166277e-06, "loss": 0.77148032, "num_input_tokens_seen": 93347920, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.91796875, "step": 4322, "time_per_iteration": 2.4651262760162354 }, { "auxiliary_loss_clip": 0.01135885, "auxiliary_loss_mlp": 0.0104618, "balance_loss_clip": 1.02916932, "balance_loss_mlp": 1.04649282, "epoch": 0.25991282128363147, "flos": 24495207079680.0, "grad_norm": 2.3373172025705955, "language_loss": 0.73906636, "learning_rate": 3.470649298767278e-06, "loss": 0.76088703, "num_input_tokens_seen": 93367145, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.89453125, "step": 4323, "time_per_iteration": 2.5006093978881836 }, { "auxiliary_loss_clip": 0.01142966, "auxiliary_loss_mlp": 0.01044554, "balance_loss_clip": 1.02630293, "balance_loss_mlp": 1.04856622, "epoch": 0.25997294453629943, "flos": 24201457655040.0, "grad_norm": 1.9223238986432507, "language_loss": 0.67064184, "learning_rate": 3.4703853268507597e-06, "loss": 0.69251704, "num_input_tokens_seen": 93386555, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.9453125, "step": 4324, "time_per_iteration": 2.510576009750366 }, { "auxiliary_loss_clip": 0.01134775, "auxiliary_loss_mlp": 0.01040191, "balance_loss_clip": 1.025123, "balance_loss_mlp": 1.04811978, "epoch": 0.2600330677889674, "flos": 31431460855680.0, "grad_norm": 1.8769263066404027, "language_loss": 0.70734262, "learning_rate": 3.470121299177082e-06, "loss": 0.72909236, "num_input_tokens_seen": 93405590, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8671875, "step": 4325, "time_per_iteration": 2.5433919429779053 }, { "auxiliary_loss_clip": 0.01133581, "auxiliary_loss_mlp": 0.01034965, "balance_loss_clip": 1.0183351, "balance_loss_mlp": 1.04537439, "epoch": 0.26009319104163536, "flos": 32266527217920.0, "grad_norm": 1.671464634071501, "language_loss": 0.72967935, "learning_rate": 3.469857215756257e-06, "loss": 0.75136477, "num_input_tokens_seen": 93424750, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8828125, "step": 4326, "time_per_iteration": 2.5559964179992676 }, { "auxiliary_loss_clip": 0.01131124, "auxiliary_loss_mlp": 0.01037012, "balance_loss_clip": 1.02187228, "balance_loss_mlp": 1.0467701, "epoch": 0.26015331429430333, "flos": 26286754752000.0, "grad_norm": 7.25479713378289, "language_loss": 0.87177134, "learning_rate": 3.4695930765982997e-06, "loss": 0.89345264, "num_input_tokens_seen": 93443465, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.84375, "step": 4327, "time_per_iteration": 2.524651527404785 }, { "auxiliary_loss_clip": 0.01139147, "auxiliary_loss_mlp": 0.0104924, "balance_loss_clip": 1.03094149, "balance_loss_mlp": 1.04879713, "epoch": 0.2602134375469713, "flos": 21142335957120.0, "grad_norm": 1.4247193025406715, "language_loss": 0.80096036, "learning_rate": 3.4693288817132255e-06, "loss": 0.82284415, "num_input_tokens_seen": 93462580, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.90234375, "step": 4328, "time_per_iteration": 2.4611854553222656 }, { "auxiliary_loss_clip": 0.01135154, "auxiliary_loss_mlp": 0.0103643, "balance_loss_clip": 1.02044415, "balance_loss_mlp": 1.04884243, "epoch": 0.26027356079963926, "flos": 25921327737600.0, "grad_norm": 1.471978146756053, "language_loss": 0.87811702, "learning_rate": 3.4690646311110525e-06, "loss": 0.89983284, "num_input_tokens_seen": 93482790, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.86328125, "step": 4329, "time_per_iteration": 2.4858388900756836 }, { "auxiliary_loss_clip": 0.01132144, "auxiliary_loss_mlp": 0.01036866, "balance_loss_clip": 1.02068949, "balance_loss_mlp": 1.04628491, "epoch": 0.2603336840523072, "flos": 26359222440960.0, "grad_norm": 1.9148802551143955, "language_loss": 0.77761418, "learning_rate": 3.468800324801802e-06, "loss": 0.79930425, "num_input_tokens_seen": 93498795, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.859375, "step": 4330, "time_per_iteration": 2.5149054527282715 }, { "auxiliary_loss_clip": 0.01136881, "auxiliary_loss_mlp": 0.01049015, "balance_loss_clip": 1.03211069, "balance_loss_mlp": 1.04786849, "epoch": 0.2603938073049752, "flos": 23513661054720.0, "grad_norm": 1.8524472718268103, "language_loss": 0.75329524, "learning_rate": 3.4685359627954958e-06, "loss": 0.77515423, "num_input_tokens_seen": 93518335, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.890625, "step": 4331, "time_per_iteration": 2.473245143890381 }, { "auxiliary_loss_clip": 0.01136402, "auxiliary_loss_mlp": 0.01041056, "balance_loss_clip": 1.02551126, "balance_loss_mlp": 1.05042911, "epoch": 0.26045393055764315, "flos": 25374300537600.0, "grad_norm": 1.602698463200287, "language_loss": 0.69121569, "learning_rate": 3.4682715451021584e-06, "loss": 0.71299028, "num_input_tokens_seen": 93539170, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.859375, "step": 4332, "time_per_iteration": 2.506272792816162 }, { "auxiliary_loss_clip": 0.01137895, "auxiliary_loss_mlp": 0.01045797, "balance_loss_clip": 1.02873862, "balance_loss_mlp": 1.04811299, "epoch": 0.2605140538103111, "flos": 27635272076160.0, "grad_norm": 2.1471921313898887, "language_loss": 0.79663813, "learning_rate": 3.4680070717318174e-06, "loss": 0.81847513, "num_input_tokens_seen": 93558480, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.8984375, "step": 4333, "time_per_iteration": 2.5039665699005127 }, { "auxiliary_loss_clip": 0.01130908, "auxiliary_loss_mlp": 0.01041846, "balance_loss_clip": 1.02602649, "balance_loss_mlp": 1.04669058, "epoch": 0.2605741770629791, "flos": 13769839503360.0, "grad_norm": 1.8284446798052552, "language_loss": 0.80835962, "learning_rate": 3.467742542694501e-06, "loss": 0.83008718, "num_input_tokens_seen": 93575220, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83984375, "step": 4334, "time_per_iteration": 2.426701784133911 }, { "auxiliary_loss_clip": 0.01134329, "auxiliary_loss_mlp": 0.01041876, "balance_loss_clip": 1.02459085, "balance_loss_mlp": 1.0477165, "epoch": 0.26063430031564705, "flos": 26031681296640.0, "grad_norm": 1.8516734475257954, "language_loss": 0.80079341, "learning_rate": 3.46747795800024e-06, "loss": 0.82255542, "num_input_tokens_seen": 93597015, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.8671875, "step": 4335, "time_per_iteration": 2.497955083847046 }, { "auxiliary_loss_clip": 0.01056623, "auxiliary_loss_mlp": 0.01005406, "balance_loss_clip": 1.00322485, "balance_loss_mlp": 1.02562809, "epoch": 0.26069442356831507, "flos": 62443809820800.0, "grad_norm": 0.8384123116941329, "language_loss": 0.60807407, "learning_rate": 3.467213317659068e-06, "loss": 0.62869442, "num_input_tokens_seen": 93657775, "router_z_loss_clip": 0.02185059, "router_z_loss_mlp": 0.30859375, "step": 4336, "time_per_iteration": 3.0944931507110596 }, { "auxiliary_loss_clip": 0.01136182, "auxiliary_loss_mlp": 0.01051621, "balance_loss_clip": 1.03512251, "balance_loss_mlp": 1.04722643, "epoch": 0.26075454682098304, "flos": 13626376583040.0, "grad_norm": 1.9051023048609146, "language_loss": 0.7700913, "learning_rate": 3.46694862168102e-06, "loss": 0.7919693, "num_input_tokens_seen": 93676145, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.890625, "step": 4337, "time_per_iteration": 2.432136297225952 }, { "auxiliary_loss_clip": 0.01134951, "auxiliary_loss_mlp": 0.01046102, "balance_loss_clip": 1.02775538, "balance_loss_mlp": 1.04689336, "epoch": 0.260814670073651, "flos": 12126531260160.0, "grad_norm": 2.012743172427396, "language_loss": 0.74410355, "learning_rate": 3.4666838700761334e-06, "loss": 0.76591408, "num_input_tokens_seen": 93692480, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.8828125, "step": 4338, "time_per_iteration": 2.4367189407348633 }, { "auxiliary_loss_clip": 0.01138761, "auxiliary_loss_mlp": 0.01043368, "balance_loss_clip": 1.02589178, "balance_loss_mlp": 1.04776561, "epoch": 0.26087479332631897, "flos": 15122522805120.0, "grad_norm": 2.2507938375832475, "language_loss": 0.80115402, "learning_rate": 3.466419062854447e-06, "loss": 0.82297528, "num_input_tokens_seen": 93710165, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.91015625, "step": 4339, "time_per_iteration": 2.4377293586730957 }, { "auxiliary_loss_clip": 0.01134243, "auxiliary_loss_mlp": 0.01045829, "balance_loss_clip": 1.03077316, "balance_loss_mlp": 1.04764688, "epoch": 0.26093491657898693, "flos": 24680937329280.0, "grad_norm": 4.649915261675891, "language_loss": 0.769485, "learning_rate": 3.4661542000260033e-06, "loss": 0.79128575, "num_input_tokens_seen": 93730185, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8671875, "step": 4340, "time_per_iteration": 2.4971330165863037 }, { "auxiliary_loss_clip": 0.01137593, "auxiliary_loss_mlp": 0.01042648, "balance_loss_clip": 1.02636409, "balance_loss_mlp": 1.04783547, "epoch": 0.2609950398316549, "flos": 25116138512640.0, "grad_norm": 1.8415645070840558, "language_loss": 0.82703632, "learning_rate": 3.465889281600845e-06, "loss": 0.84883875, "num_input_tokens_seen": 93747690, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8984375, "step": 4341, "time_per_iteration": 2.5286595821380615 }, { "auxiliary_loss_clip": 0.01136328, "auxiliary_loss_mlp": 0.01040606, "balance_loss_clip": 1.02310646, "balance_loss_mlp": 1.04919434, "epoch": 0.26105516308432286, "flos": 28548588216960.0, "grad_norm": 1.953596256354727, "language_loss": 0.7619769, "learning_rate": 3.4656243075890183e-06, "loss": 0.78374624, "num_input_tokens_seen": 93767405, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.87109375, "step": 4342, "time_per_iteration": 2.5213069915771484 }, { "auxiliary_loss_clip": 0.01133567, "auxiliary_loss_mlp": 0.01038164, "balance_loss_clip": 1.02122426, "balance_loss_mlp": 1.04664183, "epoch": 0.2611152863369908, "flos": 39530609447040.0, "grad_norm": 1.9490281079783078, "language_loss": 0.66172934, "learning_rate": 3.4653592780005707e-06, "loss": 0.68344665, "num_input_tokens_seen": 93789950, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8671875, "step": 4343, "time_per_iteration": 4.0785863399505615 }, { "auxiliary_loss_clip": 0.01135399, "auxiliary_loss_mlp": 0.01041912, "balance_loss_clip": 1.0249486, "balance_loss_mlp": 1.04547203, "epoch": 0.2611754095896588, "flos": 13735329511680.0, "grad_norm": 2.1652425572053393, "language_loss": 0.74201989, "learning_rate": 3.465094192845553e-06, "loss": 0.76379299, "num_input_tokens_seen": 93807835, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8984375, "step": 4344, "time_per_iteration": 2.435943365097046 }, { "auxiliary_loss_clip": 0.01137379, "auxiliary_loss_mlp": 0.01047486, "balance_loss_clip": 1.03024888, "balance_loss_mlp": 1.04865766, "epoch": 0.26123553284232676, "flos": 21506649649920.0, "grad_norm": 2.110668367856936, "language_loss": 0.86401039, "learning_rate": 3.4648290521340165e-06, "loss": 0.88585907, "num_input_tokens_seen": 93825670, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.88671875, "step": 4345, "time_per_iteration": 3.831815481185913 }, { "auxiliary_loss_clip": 0.01131552, "auxiliary_loss_mlp": 0.0104142, "balance_loss_clip": 1.02474284, "balance_loss_mlp": 1.04711843, "epoch": 0.2612956560949947, "flos": 21139786091520.0, "grad_norm": 2.1651176876427836, "language_loss": 0.76247644, "learning_rate": 3.464563855876015e-06, "loss": 0.78420609, "num_input_tokens_seen": 93844045, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.84375, "step": 4346, "time_per_iteration": 2.4641966819763184 }, { "auxiliary_loss_clip": 0.0113475, "auxiliary_loss_mlp": 0.01046239, "balance_loss_clip": 1.02971661, "balance_loss_mlp": 1.04757154, "epoch": 0.2613557793476627, "flos": 25119011600640.0, "grad_norm": 1.4931742434468174, "language_loss": 0.75883955, "learning_rate": 3.464298604081606e-06, "loss": 0.78064942, "num_input_tokens_seen": 93864380, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.87109375, "step": 4347, "time_per_iteration": 5.333077907562256 }, { "auxiliary_loss_clip": 0.01133513, "auxiliary_loss_mlp": 0.01035985, "balance_loss_clip": 1.01946294, "balance_loss_mlp": 1.046435, "epoch": 0.26141590260033065, "flos": 26067699659520.0, "grad_norm": 1.3098027987371907, "language_loss": 0.7371816, "learning_rate": 3.4640332967608476e-06, "loss": 0.75887656, "num_input_tokens_seen": 93885475, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.87109375, "step": 4348, "time_per_iteration": 2.5034611225128174 }, { "auxiliary_loss_clip": 0.01138183, "auxiliary_loss_mlp": 0.01045454, "balance_loss_clip": 1.02857375, "balance_loss_mlp": 1.04931915, "epoch": 0.2614760258529987, "flos": 25701518459520.0, "grad_norm": 1.7375140142409682, "language_loss": 0.90708494, "learning_rate": 3.463767933923799e-06, "loss": 0.92892134, "num_input_tokens_seen": 93905545, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.890625, "step": 4349, "time_per_iteration": 2.5153303146362305 }, { "auxiliary_loss_clip": 0.01133163, "auxiliary_loss_mlp": 0.01044318, "balance_loss_clip": 1.02846289, "balance_loss_mlp": 1.04840541, "epoch": 0.26153614910566664, "flos": 17457147181440.0, "grad_norm": 1.7399911771752472, "language_loss": 0.79770863, "learning_rate": 3.463502515580524e-06, "loss": 0.81948352, "num_input_tokens_seen": 93924185, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84765625, "step": 4350, "time_per_iteration": 2.435309648513794 }, { "auxiliary_loss_clip": 0.01130714, "auxiliary_loss_mlp": 0.01040064, "balance_loss_clip": 1.02419138, "balance_loss_mlp": 1.0470736, "epoch": 0.2615962723583346, "flos": 17712831168000.0, "grad_norm": 1.8883406806372107, "language_loss": 0.62531912, "learning_rate": 3.4632370417410866e-06, "loss": 0.6470269, "num_input_tokens_seen": 93942825, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8359375, "step": 4351, "time_per_iteration": 2.4502627849578857 }, { "auxiliary_loss_clip": 0.01135714, "auxiliary_loss_mlp": 0.01041018, "balance_loss_clip": 1.02434099, "balance_loss_mlp": 1.0475111, "epoch": 0.26165639561100257, "flos": 23257725672960.0, "grad_norm": 2.1624006001693763, "language_loss": 0.83540523, "learning_rate": 3.462971512415555e-06, "loss": 0.85717249, "num_input_tokens_seen": 93962045, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8828125, "step": 4352, "time_per_iteration": 2.467902183532715 }, { "auxiliary_loss_clip": 0.01057089, "auxiliary_loss_mlp": 0.01002713, "balance_loss_clip": 1.00024545, "balance_loss_mlp": 1.02601695, "epoch": 0.26171651886367053, "flos": 66737970800640.0, "grad_norm": 0.8078403445579422, "language_loss": 0.70554298, "learning_rate": 3.462705927613996e-06, "loss": 0.72614098, "num_input_tokens_seen": 94021175, "router_z_loss_clip": 0.0246582, "router_z_loss_mlp": 0.31054688, "step": 4353, "time_per_iteration": 2.992595911026001 }, { "auxiliary_loss_clip": 0.01135216, "auxiliary_loss_mlp": 0.01049795, "balance_loss_clip": 1.03229475, "balance_loss_mlp": 1.04904222, "epoch": 0.2617766421163385, "flos": 22349581090560.0, "grad_norm": 1.8976879876419208, "language_loss": 0.77416956, "learning_rate": 3.4624402873464816e-06, "loss": 0.79601973, "num_input_tokens_seen": 94043370, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.86328125, "step": 4354, "time_per_iteration": 2.4995505809783936 }, { "auxiliary_loss_clip": 0.01139156, "auxiliary_loss_mlp": 0.01049358, "balance_loss_clip": 1.03255522, "balance_loss_mlp": 1.04780126, "epoch": 0.26183676536900646, "flos": 26067125041920.0, "grad_norm": 2.3789809099616157, "language_loss": 0.68172598, "learning_rate": 3.462174591623085e-06, "loss": 0.70361108, "num_input_tokens_seen": 94063510, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.9140625, "step": 4355, "time_per_iteration": 2.5177857875823975 }, { "auxiliary_loss_clip": 0.01134407, "auxiliary_loss_mlp": 0.01039949, "balance_loss_clip": 1.02197218, "balance_loss_mlp": 1.04786277, "epoch": 0.26189688862167443, "flos": 20996466825600.0, "grad_norm": 1.813033047681274, "language_loss": 0.67212266, "learning_rate": 3.4619088404538815e-06, "loss": 0.69386619, "num_input_tokens_seen": 94083865, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.8671875, "step": 4356, "time_per_iteration": 2.4530317783355713 }, { "auxiliary_loss_clip": 0.01055464, "auxiliary_loss_mlp": 0.01003698, "balance_loss_clip": 1.00150478, "balance_loss_mlp": 1.02496481, "epoch": 0.2619570118743424, "flos": 65798261141760.0, "grad_norm": 0.6828351518380799, "language_loss": 0.53151172, "learning_rate": 3.4616430338489487e-06, "loss": 0.5521034, "num_input_tokens_seen": 94144095, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.3046875, "step": 4357, "time_per_iteration": 3.0118649005889893 }, { "auxiliary_loss_clip": 0.01137739, "auxiliary_loss_mlp": 0.01046586, "balance_loss_clip": 1.02990866, "balance_loss_mlp": 1.04761243, "epoch": 0.26201713512701036, "flos": 28766817296640.0, "grad_norm": 3.3081410127788997, "language_loss": 0.84170145, "learning_rate": 3.4613771718183654e-06, "loss": 0.8635447, "num_input_tokens_seen": 94163035, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.90234375, "step": 4358, "time_per_iteration": 2.5208942890167236 }, { "auxiliary_loss_clip": 0.0114078, "auxiliary_loss_mlp": 0.01046877, "balance_loss_clip": 1.02813745, "balance_loss_mlp": 1.0478164, "epoch": 0.2620772583796783, "flos": 26432516142720.0, "grad_norm": 2.4053768845524672, "language_loss": 0.6754868, "learning_rate": 3.4611112543722127e-06, "loss": 0.69736338, "num_input_tokens_seen": 94182520, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.9296875, "step": 4359, "time_per_iteration": 2.5184507369995117 }, { "auxiliary_loss_clip": 0.0113616, "auxiliary_loss_mlp": 0.01044848, "balance_loss_clip": 1.02877831, "balance_loss_mlp": 1.04684997, "epoch": 0.2621373816323463, "flos": 20156552127360.0, "grad_norm": 1.9292416933035206, "language_loss": 0.78068674, "learning_rate": 3.4608452815205757e-06, "loss": 0.80249679, "num_input_tokens_seen": 94201795, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.890625, "step": 4360, "time_per_iteration": 2.453817367553711 }, { "auxiliary_loss_clip": 0.01130068, "auxiliary_loss_mlp": 0.01043691, "balance_loss_clip": 1.02813387, "balance_loss_mlp": 1.04485631, "epoch": 0.26219750488501425, "flos": 28621235473920.0, "grad_norm": 1.8066034289848854, "language_loss": 0.67957234, "learning_rate": 3.4605792532735387e-06, "loss": 0.70130992, "num_input_tokens_seen": 94222390, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8515625, "step": 4361, "time_per_iteration": 2.520914077758789 }, { "auxiliary_loss_clip": 0.01137816, "auxiliary_loss_mlp": 0.01051392, "balance_loss_clip": 1.03352213, "balance_loss_mlp": 1.04801321, "epoch": 0.2622576281376823, "flos": 15042549173760.0, "grad_norm": 1.8930153609100715, "language_loss": 0.84361494, "learning_rate": 3.46031316964119e-06, "loss": 0.86550707, "num_input_tokens_seen": 94239980, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.8984375, "step": 4362, "time_per_iteration": 2.420305013656616 }, { "auxiliary_loss_clip": 0.01135568, "auxiliary_loss_mlp": 0.01048365, "balance_loss_clip": 1.03099585, "balance_loss_mlp": 1.04901505, "epoch": 0.26231775139035024, "flos": 26396174557440.0, "grad_norm": 2.008374955171752, "language_loss": 0.65260983, "learning_rate": 3.4600470306336197e-06, "loss": 0.67444909, "num_input_tokens_seen": 94260715, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8671875, "step": 4363, "time_per_iteration": 2.5323779582977295 }, { "auxiliary_loss_clip": 0.01055453, "auxiliary_loss_mlp": 0.01002287, "balance_loss_clip": 0.99983168, "balance_loss_mlp": 1.02494538, "epoch": 0.2623778746430182, "flos": 65408918647680.0, "grad_norm": 0.9307274668232869, "language_loss": 0.611498, "learning_rate": 3.4597808362609194e-06, "loss": 0.63207543, "num_input_tokens_seen": 94321285, "router_z_loss_clip": 0.02453613, "router_z_loss_mlp": 0.3046875, "step": 4364, "time_per_iteration": 3.1833338737487793 }, { "auxiliary_loss_clip": 0.01139623, "auxiliary_loss_mlp": 0.01046348, "balance_loss_clip": 1.02777529, "balance_loss_mlp": 1.0491575, "epoch": 0.26243799789568617, "flos": 12604215254400.0, "grad_norm": 2.6533038428355833, "language_loss": 0.71551204, "learning_rate": 3.459514586533184e-06, "loss": 0.7373718, "num_input_tokens_seen": 94335420, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.90625, "step": 4365, "time_per_iteration": 2.4383201599121094 }, { "auxiliary_loss_clip": 0.01136676, "auxiliary_loss_mlp": 0.0104548, "balance_loss_clip": 1.02919579, "balance_loss_mlp": 1.04878926, "epoch": 0.26249812114835414, "flos": 28623821253120.0, "grad_norm": 1.634200267917839, "language_loss": 0.77261621, "learning_rate": 3.459248281460509e-06, "loss": 0.79443777, "num_input_tokens_seen": 94357440, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.87890625, "step": 4366, "time_per_iteration": 2.5230491161346436 }, { "auxiliary_loss_clip": 0.01137007, "auxiliary_loss_mlp": 0.01046785, "balance_loss_clip": 1.03051329, "balance_loss_mlp": 1.04950857, "epoch": 0.2625582444010221, "flos": 14465393441280.0, "grad_norm": 1.7736634348401512, "language_loss": 0.7609024, "learning_rate": 3.4589819210529927e-06, "loss": 0.78274035, "num_input_tokens_seen": 94375690, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.875, "step": 4367, "time_per_iteration": 2.4637184143066406 }, { "auxiliary_loss_clip": 0.01131882, "auxiliary_loss_mlp": 0.01044352, "balance_loss_clip": 1.02818704, "balance_loss_mlp": 1.04590714, "epoch": 0.26261836765369007, "flos": 16613174246400.0, "grad_norm": 1.4949113623947976, "language_loss": 0.69288307, "learning_rate": 3.458715505320736e-06, "loss": 0.71464539, "num_input_tokens_seen": 94393190, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.859375, "step": 4368, "time_per_iteration": 2.4052634239196777 }, { "auxiliary_loss_clip": 0.01131789, "auxiliary_loss_mlp": 0.01045322, "balance_loss_clip": 1.02807212, "balance_loss_mlp": 1.04509187, "epoch": 0.26267849090635803, "flos": 20519932066560.0, "grad_norm": 1.8463338272533485, "language_loss": 0.79133773, "learning_rate": 3.458449034273841e-06, "loss": 0.81310874, "num_input_tokens_seen": 94410975, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.8671875, "step": 4369, "time_per_iteration": 2.4713542461395264 }, { "auxiliary_loss_clip": 0.0113353, "auxiliary_loss_mlp": 0.01043684, "balance_loss_clip": 1.02694702, "balance_loss_mlp": 1.04681039, "epoch": 0.262738614159026, "flos": 21323936142720.0, "grad_norm": 1.9694195857894399, "language_loss": 0.83284676, "learning_rate": 3.4581825079224133e-06, "loss": 0.85461891, "num_input_tokens_seen": 94429985, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8671875, "step": 4370, "time_per_iteration": 2.4807357788085938 }, { "auxiliary_loss_clip": 0.01137896, "auxiliary_loss_mlp": 0.01053275, "balance_loss_clip": 1.03465497, "balance_loss_mlp": 1.04769802, "epoch": 0.26279873741169396, "flos": 17603590930560.0, "grad_norm": 1.7047614499356187, "language_loss": 0.7109977, "learning_rate": 3.4579159262765575e-06, "loss": 0.73290938, "num_input_tokens_seen": 94448660, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.90234375, "step": 4371, "time_per_iteration": 2.434968948364258 }, { "auxiliary_loss_clip": 0.01054277, "auxiliary_loss_mlp": 0.01002426, "balance_loss_clip": 0.99993402, "balance_loss_mlp": 1.02334547, "epoch": 0.2628588606643619, "flos": 60949746587520.0, "grad_norm": 0.6924481574463518, "language_loss": 0.56473207, "learning_rate": 3.457649289346384e-06, "loss": 0.58529913, "num_input_tokens_seen": 94515630, "router_z_loss_clip": 0.02490234, "router_z_loss_mlp": 0.30859375, "step": 4372, "time_per_iteration": 3.197979688644409 }, { "auxiliary_loss_clip": 0.01132318, "auxiliary_loss_mlp": 0.01040234, "balance_loss_clip": 1.02430749, "balance_loss_mlp": 1.04745936, "epoch": 0.2629189839170299, "flos": 27016315891200.0, "grad_norm": 2.035022706478539, "language_loss": 0.7758671, "learning_rate": 3.4573825971420042e-06, "loss": 0.79759264, "num_input_tokens_seen": 94535385, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.84765625, "step": 4373, "time_per_iteration": 2.506159543991089 }, { "auxiliary_loss_clip": 0.01132168, "auxiliary_loss_mlp": 0.01038854, "balance_loss_clip": 1.02312994, "balance_loss_mlp": 1.0460223, "epoch": 0.26297910716969786, "flos": 17019863009280.0, "grad_norm": 2.3511862308392675, "language_loss": 0.71493024, "learning_rate": 3.4571158496735294e-06, "loss": 0.73664051, "num_input_tokens_seen": 94552650, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.859375, "step": 4374, "time_per_iteration": 2.4254071712493896 }, { "auxiliary_loss_clip": 0.01134371, "auxiliary_loss_mlp": 0.01038647, "balance_loss_clip": 1.02125382, "balance_loss_mlp": 1.04837918, "epoch": 0.2630392304223659, "flos": 24897370728960.0, "grad_norm": 1.609419289734388, "language_loss": 0.8082782, "learning_rate": 3.4568490469510756e-06, "loss": 0.83000845, "num_input_tokens_seen": 94574075, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.859375, "step": 4375, "time_per_iteration": 2.4979186058044434 }, { "auxiliary_loss_clip": 0.01129913, "auxiliary_loss_mlp": 0.01040558, "balance_loss_clip": 1.02522171, "balance_loss_mlp": 1.04527855, "epoch": 0.26309935367503384, "flos": 32854026067200.0, "grad_norm": 1.8078354452936152, "language_loss": 0.66554046, "learning_rate": 3.4565821889847603e-06, "loss": 0.68724513, "num_input_tokens_seen": 94594255, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.84375, "step": 4376, "time_per_iteration": 2.576571464538574 }, { "auxiliary_loss_clip": 0.01135173, "auxiliary_loss_mlp": 0.01046234, "balance_loss_clip": 1.02978325, "balance_loss_mlp": 1.04712224, "epoch": 0.2631594769277018, "flos": 15887958652800.0, "grad_norm": 3.0077338320041647, "language_loss": 0.69066262, "learning_rate": 3.4563152757847026e-06, "loss": 0.71247673, "num_input_tokens_seen": 94611410, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8828125, "step": 4377, "time_per_iteration": 2.4835164546966553 }, { "auxiliary_loss_clip": 0.01134221, "auxiliary_loss_mlp": 0.01041054, "balance_loss_clip": 1.024436, "balance_loss_mlp": 1.04691839, "epoch": 0.2632196001803698, "flos": 50804943557760.0, "grad_norm": 1.6605753069230533, "language_loss": 0.79425752, "learning_rate": 3.4560483073610233e-06, "loss": 0.81601036, "num_input_tokens_seen": 94636575, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.875, "step": 4378, "time_per_iteration": 2.7319891452789307 }, { "auxiliary_loss_clip": 0.0113292, "auxiliary_loss_mlp": 0.01045756, "balance_loss_clip": 1.03084302, "balance_loss_mlp": 1.04674029, "epoch": 0.26327972343303774, "flos": 13733031041280.0, "grad_norm": 2.137327397799318, "language_loss": 0.76941848, "learning_rate": 3.455781283723846e-06, "loss": 0.79120523, "num_input_tokens_seen": 94654345, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.859375, "step": 4379, "time_per_iteration": 2.454977512359619 }, { "auxiliary_loss_clip": 0.01140501, "auxiliary_loss_mlp": 0.01041189, "balance_loss_clip": 1.02249706, "balance_loss_mlp": 1.04900861, "epoch": 0.2633398466857057, "flos": 23769057732480.0, "grad_norm": 2.1089604524561727, "language_loss": 0.77812994, "learning_rate": 3.4555142048832975e-06, "loss": 0.79994678, "num_input_tokens_seen": 94673985, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.9140625, "step": 4380, "time_per_iteration": 2.479898452758789 }, { "auxiliary_loss_clip": 0.01135032, "auxiliary_loss_mlp": 0.01039241, "balance_loss_clip": 1.02287352, "balance_loss_mlp": 1.04528928, "epoch": 0.26339996993837367, "flos": 27600223380480.0, "grad_norm": 1.826692734460203, "language_loss": 0.63945073, "learning_rate": 3.4552470708495036e-06, "loss": 0.66119349, "num_input_tokens_seen": 94693145, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8984375, "step": 4381, "time_per_iteration": 2.5132081508636475 }, { "auxiliary_loss_clip": 0.01131987, "auxiliary_loss_mlp": 0.01041809, "balance_loss_clip": 1.02579319, "balance_loss_mlp": 1.04511237, "epoch": 0.26346009319104163, "flos": 16946317912320.0, "grad_norm": 1.672110125342263, "language_loss": 0.82241035, "learning_rate": 3.454979881632595e-06, "loss": 0.84414828, "num_input_tokens_seen": 94710185, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8671875, "step": 4382, "time_per_iteration": 2.451850175857544 }, { "auxiliary_loss_clip": 0.01139758, "auxiliary_loss_mlp": 0.01048976, "balance_loss_clip": 1.03126132, "balance_loss_mlp": 1.04733467, "epoch": 0.2635202164437096, "flos": 37232218915200.0, "grad_norm": 1.9246858529885182, "language_loss": 0.69838762, "learning_rate": 3.4547126372427035e-06, "loss": 0.72027493, "num_input_tokens_seen": 94730280, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.92578125, "step": 4383, "time_per_iteration": 2.594892740249634 }, { "auxiliary_loss_clip": 0.0113506, "auxiliary_loss_mlp": 0.01039454, "balance_loss_clip": 1.02420712, "balance_loss_mlp": 1.04722643, "epoch": 0.26358033969637756, "flos": 20996359084800.0, "grad_norm": 2.3073686909801885, "language_loss": 0.69332898, "learning_rate": 3.4544453376899638e-06, "loss": 0.71507406, "num_input_tokens_seen": 94748560, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8828125, "step": 4384, "time_per_iteration": 3.921372890472412 }, { "auxiliary_loss_clip": 0.01132489, "auxiliary_loss_mlp": 0.01037715, "balance_loss_clip": 1.02187228, "balance_loss_mlp": 1.04610181, "epoch": 0.26364046294904553, "flos": 27746092512000.0, "grad_norm": 2.1278901095699942, "language_loss": 0.6978693, "learning_rate": 3.45417798298451e-06, "loss": 0.71957135, "num_input_tokens_seen": 94767570, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.86328125, "step": 4385, "time_per_iteration": 2.503485918045044 }, { "auxiliary_loss_clip": 0.01137576, "auxiliary_loss_mlp": 0.01042666, "balance_loss_clip": 1.02601218, "balance_loss_mlp": 1.05043864, "epoch": 0.2637005862017135, "flos": 22893088757760.0, "grad_norm": 1.8112342431470179, "language_loss": 0.85424697, "learning_rate": 3.453910573136482e-06, "loss": 0.8760494, "num_input_tokens_seen": 94784985, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.87109375, "step": 4386, "time_per_iteration": 2.4658007621765137 }, { "auxiliary_loss_clip": 0.01135896, "auxiliary_loss_mlp": 0.01042044, "balance_loss_clip": 1.02561712, "balance_loss_mlp": 1.04862571, "epoch": 0.26376070945438146, "flos": 15048834053760.0, "grad_norm": 2.1572991281446243, "language_loss": 0.77008593, "learning_rate": 3.4536431081560196e-06, "loss": 0.79186529, "num_input_tokens_seen": 94802545, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.87109375, "step": 4387, "time_per_iteration": 3.8914694786071777 }, { "auxiliary_loss_clip": 0.01135103, "auxiliary_loss_mlp": 0.01045887, "balance_loss_clip": 1.02971041, "balance_loss_mlp": 1.04950428, "epoch": 0.2638208327070494, "flos": 21141833166720.0, "grad_norm": 2.014564591774846, "language_loss": 0.76077414, "learning_rate": 3.453375588053264e-06, "loss": 0.78258407, "num_input_tokens_seen": 94820730, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.85546875, "step": 4388, "time_per_iteration": 3.8621022701263428 }, { "auxiliary_loss_clip": 0.0113214, "auxiliary_loss_mlp": 0.01036308, "balance_loss_clip": 1.01989269, "balance_loss_mlp": 1.0445447, "epoch": 0.26388095595971744, "flos": 21725597001600.0, "grad_norm": 2.027429257872538, "language_loss": 0.86134791, "learning_rate": 3.4531080128383617e-06, "loss": 0.88303244, "num_input_tokens_seen": 94839175, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.875, "step": 4389, "time_per_iteration": 2.4676401615142822 }, { "auxiliary_loss_clip": 0.0105522, "auxiliary_loss_mlp": 0.01001214, "balance_loss_clip": 0.99897271, "balance_loss_mlp": 1.02425861, "epoch": 0.2639410792123854, "flos": 65515537192320.0, "grad_norm": 0.8109249586961071, "language_loss": 0.60355461, "learning_rate": 3.452840382521457e-06, "loss": 0.62411898, "num_input_tokens_seen": 94898865, "router_z_loss_clip": 0.02246094, "router_z_loss_mlp": 0.30859375, "step": 4390, "time_per_iteration": 3.1128501892089844 }, { "auxiliary_loss_clip": 0.01134214, "auxiliary_loss_mlp": 0.01037908, "balance_loss_clip": 1.0209806, "balance_loss_mlp": 1.04484177, "epoch": 0.2640012024650534, "flos": 23948574929280.0, "grad_norm": 1.8923204836512224, "language_loss": 0.77629435, "learning_rate": 3.4525726971127e-06, "loss": 0.79801559, "num_input_tokens_seen": 94917490, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.89453125, "step": 4391, "time_per_iteration": 2.4837474822998047 }, { "auxiliary_loss_clip": 0.01054676, "auxiliary_loss_mlp": 0.01000458, "balance_loss_clip": 0.99810952, "balance_loss_mlp": 1.02382708, "epoch": 0.26406132571772134, "flos": 56441163369600.0, "grad_norm": 0.8403385594632397, "language_loss": 0.58728182, "learning_rate": 3.45230495662224e-06, "loss": 0.60783315, "num_input_tokens_seen": 94969065, "router_z_loss_clip": 0.0234375, "router_z_loss_mlp": 0.30859375, "step": 4392, "time_per_iteration": 3.0778396129608154 }, { "auxiliary_loss_clip": 0.01138477, "auxiliary_loss_mlp": 0.01049131, "balance_loss_clip": 1.03255463, "balance_loss_mlp": 1.04842925, "epoch": 0.2641214489703893, "flos": 22090557139200.0, "grad_norm": 1.7706368189852497, "language_loss": 0.6885466, "learning_rate": 3.4520371610602306e-06, "loss": 0.71042269, "num_input_tokens_seen": 94988540, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8984375, "step": 4393, "time_per_iteration": 2.4524474143981934 }, { "auxiliary_loss_clip": 0.01137978, "auxiliary_loss_mlp": 0.0104283, "balance_loss_clip": 1.02485299, "balance_loss_mlp": 1.04583359, "epoch": 0.26418157222305727, "flos": 16544764794240.0, "grad_norm": 1.904628253952847, "language_loss": 0.83799958, "learning_rate": 3.4517693104368267e-06, "loss": 0.85980761, "num_input_tokens_seen": 95004810, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.921875, "step": 4394, "time_per_iteration": 2.4440078735351562 }, { "auxiliary_loss_clip": 0.01143168, "auxiliary_loss_mlp": 0.01045072, "balance_loss_clip": 1.0255456, "balance_loss_mlp": 1.04979718, "epoch": 0.26424169547572524, "flos": 18002486442240.0, "grad_norm": 1.9864580449930078, "language_loss": 0.69995832, "learning_rate": 3.4515014047621856e-06, "loss": 0.72184074, "num_input_tokens_seen": 95024085, "router_z_loss_clip": 0.1953125, "router_z_loss_mlp": 0.93359375, "step": 4395, "time_per_iteration": 2.442099094390869 }, { "auxiliary_loss_clip": 0.01133979, "auxiliary_loss_mlp": 0.01039874, "balance_loss_clip": 1.02297044, "balance_loss_mlp": 1.04644156, "epoch": 0.2643018187283932, "flos": 16983162288000.0, "grad_norm": 1.8868427179142322, "language_loss": 0.86714417, "learning_rate": 3.4512334440464655e-06, "loss": 0.8888827, "num_input_tokens_seen": 95042515, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.875, "step": 4396, "time_per_iteration": 2.4451303482055664 }, { "auxiliary_loss_clip": 0.01052375, "auxiliary_loss_mlp": 0.0100328, "balance_loss_clip": 1.00093174, "balance_loss_mlp": 1.02172089, "epoch": 0.26436194198106117, "flos": 59664359416320.0, "grad_norm": 0.793693081836563, "language_loss": 0.54998028, "learning_rate": 3.4509654282998277e-06, "loss": 0.57053685, "num_input_tokens_seen": 95094835, "router_z_loss_clip": 0.0234375, "router_z_loss_mlp": 0.30664062, "step": 4397, "time_per_iteration": 2.8779022693634033 }, { "auxiliary_loss_clip": 0.01133606, "auxiliary_loss_mlp": 0.01051095, "balance_loss_clip": 1.03459632, "balance_loss_mlp": 1.04676914, "epoch": 0.26442206523372913, "flos": 32921322197760.0, "grad_norm": 1.878345212107727, "language_loss": 0.77683032, "learning_rate": 3.450697357532435e-06, "loss": 0.79867733, "num_input_tokens_seen": 95113480, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8671875, "step": 4398, "time_per_iteration": 2.5841195583343506 }, { "auxiliary_loss_clip": 0.01137948, "auxiliary_loss_mlp": 0.01043456, "balance_loss_clip": 1.02628946, "balance_loss_mlp": 1.04817665, "epoch": 0.2644821884863971, "flos": 21031300039680.0, "grad_norm": 1.6364149498277922, "language_loss": 0.6689086, "learning_rate": 3.4504292317544534e-06, "loss": 0.69072264, "num_input_tokens_seen": 95132580, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8984375, "step": 4399, "time_per_iteration": 2.461745500564575 }, { "auxiliary_loss_clip": 0.01128112, "auxiliary_loss_mlp": 0.01039763, "balance_loss_clip": 1.02473044, "balance_loss_mlp": 1.04560232, "epoch": 0.26454231173906506, "flos": 20776801201920.0, "grad_norm": 1.5377786329882752, "language_loss": 0.86423171, "learning_rate": 3.4501610509760504e-06, "loss": 0.88591039, "num_input_tokens_seen": 95152375, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.828125, "step": 4400, "time_per_iteration": 2.4703528881073 }, { "auxiliary_loss_clip": 0.01137073, "auxiliary_loss_mlp": 0.01044143, "balance_loss_clip": 1.02646422, "balance_loss_mlp": 1.0472306, "epoch": 0.264602434991733, "flos": 16618669027200.0, "grad_norm": 1.83724036816166, "language_loss": 0.76162869, "learning_rate": 3.4498928152073944e-06, "loss": 0.78344089, "num_input_tokens_seen": 95170265, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.8984375, "step": 4401, "time_per_iteration": 2.4367334842681885 }, { "auxiliary_loss_clip": 0.0113677, "auxiliary_loss_mlp": 0.01049932, "balance_loss_clip": 1.03128815, "balance_loss_mlp": 1.04487467, "epoch": 0.26466255824440105, "flos": 19062677295360.0, "grad_norm": 1.7982881929494308, "language_loss": 0.88127583, "learning_rate": 3.4496245244586577e-06, "loss": 0.90314287, "num_input_tokens_seen": 95188655, "router_z_loss_clip": 0.18554688, "router_z_loss_mlp": 0.91796875, "step": 4402, "time_per_iteration": 2.4655234813690186 }, { "auxiliary_loss_clip": 0.01134654, "auxiliary_loss_mlp": 0.0104123, "balance_loss_clip": 1.02436185, "balance_loss_mlp": 1.04519701, "epoch": 0.264722681497069, "flos": 22638554006400.0, "grad_norm": 1.550679912136371, "language_loss": 0.77957046, "learning_rate": 3.4493561787400137e-06, "loss": 0.80132926, "num_input_tokens_seen": 95209615, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.89453125, "step": 4403, "time_per_iteration": 2.4736294746398926 }, { "auxiliary_loss_clip": 0.01135645, "auxiliary_loss_mlp": 0.01039187, "balance_loss_clip": 1.02154422, "balance_loss_mlp": 1.04532146, "epoch": 0.264782804749737, "flos": 22492253911680.0, "grad_norm": 1.8916013182602422, "language_loss": 0.88302433, "learning_rate": 3.4490877780616387e-06, "loss": 0.90477264, "num_input_tokens_seen": 95227810, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.90234375, "step": 4404, "time_per_iteration": 2.5107991695404053 }, { "auxiliary_loss_clip": 0.01135569, "auxiliary_loss_mlp": 0.0103916, "balance_loss_clip": 1.02347195, "balance_loss_mlp": 1.04471278, "epoch": 0.26484292800240494, "flos": 16800269212800.0, "grad_norm": 2.051982127819295, "language_loss": 0.7620945, "learning_rate": 3.448819322433709e-06, "loss": 0.78384185, "num_input_tokens_seen": 95245890, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.91015625, "step": 4405, "time_per_iteration": 2.422884702682495 }, { "auxiliary_loss_clip": 0.01136752, "auxiliary_loss_mlp": 0.01039722, "balance_loss_clip": 1.02219832, "balance_loss_mlp": 1.04698753, "epoch": 0.2649030512550729, "flos": 20449583280000.0, "grad_norm": 1.891028267097336, "language_loss": 0.70291102, "learning_rate": 3.4485508118664066e-06, "loss": 0.72467577, "num_input_tokens_seen": 95264955, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.8984375, "step": 4406, "time_per_iteration": 2.468165159225464 }, { "auxiliary_loss_clip": 0.01132826, "auxiliary_loss_mlp": 0.01046143, "balance_loss_clip": 1.03052688, "balance_loss_mlp": 1.04550278, "epoch": 0.2649631745077409, "flos": 22416123035520.0, "grad_norm": 1.685096293836366, "language_loss": 0.83540988, "learning_rate": 3.448282246369912e-06, "loss": 0.85719967, "num_input_tokens_seen": 95284245, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.87109375, "step": 4407, "time_per_iteration": 2.4701218605041504 }, { "auxiliary_loss_clip": 0.01132306, "auxiliary_loss_mlp": 0.01029828, "balance_loss_clip": 1.01299572, "balance_loss_mlp": 1.04525447, "epoch": 0.26502329776040884, "flos": 35116110927360.0, "grad_norm": 1.7163866073149796, "language_loss": 0.76428127, "learning_rate": 3.4480136259544084e-06, "loss": 0.78590262, "num_input_tokens_seen": 95307125, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.87109375, "step": 4408, "time_per_iteration": 2.591057777404785 }, { "auxiliary_loss_clip": 0.0113343, "auxiliary_loss_mlp": 0.0103917, "balance_loss_clip": 1.02250457, "balance_loss_mlp": 1.0468868, "epoch": 0.2650834210130768, "flos": 38687498438400.0, "grad_norm": 1.9917818765000217, "language_loss": 0.71039724, "learning_rate": 3.447744950630084e-06, "loss": 0.73212326, "num_input_tokens_seen": 95329150, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8671875, "step": 4409, "time_per_iteration": 2.585909128189087 }, { "auxiliary_loss_clip": 0.01134261, "auxiliary_loss_mlp": 0.01035692, "balance_loss_clip": 1.0179894, "balance_loss_mlp": 1.04440045, "epoch": 0.26514354426574477, "flos": 24716847951360.0, "grad_norm": 1.8228530004865966, "language_loss": 0.73478806, "learning_rate": 3.4474762204071253e-06, "loss": 0.75648761, "num_input_tokens_seen": 95349880, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.8984375, "step": 4410, "time_per_iteration": 2.5123965740203857 }, { "auxiliary_loss_clip": 0.01139535, "auxiliary_loss_mlp": 0.01049563, "balance_loss_clip": 1.03215873, "balance_loss_mlp": 1.04732561, "epoch": 0.26520366751841273, "flos": 20340055733760.0, "grad_norm": 1.7590672881878984, "language_loss": 0.73282063, "learning_rate": 3.4472074352957244e-06, "loss": 0.75471163, "num_input_tokens_seen": 95368570, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.921875, "step": 4411, "time_per_iteration": 2.4550347328186035 }, { "auxiliary_loss_clip": 0.01134874, "auxiliary_loss_mlp": 0.01041878, "balance_loss_clip": 1.02494979, "balance_loss_mlp": 1.04665375, "epoch": 0.2652637907710807, "flos": 22343870828160.0, "grad_norm": 1.9820284631454241, "language_loss": 0.82164371, "learning_rate": 3.446938595306071e-06, "loss": 0.84341121, "num_input_tokens_seen": 95387065, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8828125, "step": 4412, "time_per_iteration": 2.4732792377471924 }, { "auxiliary_loss_clip": 0.01133034, "auxiliary_loss_mlp": 0.01043892, "balance_loss_clip": 1.02759552, "balance_loss_mlp": 1.04501891, "epoch": 0.26532391402374866, "flos": 19354235990400.0, "grad_norm": 1.6778362846963892, "language_loss": 0.74569327, "learning_rate": 3.4466697004483622e-06, "loss": 0.76746261, "num_input_tokens_seen": 95406345, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8828125, "step": 4413, "time_per_iteration": 2.4489879608154297 }, { "auxiliary_loss_clip": 0.01052106, "auxiliary_loss_mlp": 0.01002123, "balance_loss_clip": 0.99965519, "balance_loss_mlp": 1.02165842, "epoch": 0.26538403727641663, "flos": 44787611422080.0, "grad_norm": 0.8949703794376763, "language_loss": 0.56947482, "learning_rate": 3.446400750732793e-06, "loss": 0.59001714, "num_input_tokens_seen": 95463595, "router_z_loss_clip": 0.0246582, "router_z_loss_mlp": 0.3046875, "step": 4414, "time_per_iteration": 3.0585384368896484 }, { "auxiliary_loss_clip": 0.01128767, "auxiliary_loss_mlp": 0.01041151, "balance_loss_clip": 1.0254271, "balance_loss_mlp": 1.04434454, "epoch": 0.26544416052908465, "flos": 28182119708160.0, "grad_norm": 1.5794887496684533, "language_loss": 0.74471915, "learning_rate": 3.4461317461695625e-06, "loss": 0.76641834, "num_input_tokens_seen": 95484115, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.84375, "step": 4415, "time_per_iteration": 2.5182065963745117 }, { "auxiliary_loss_clip": 0.01137241, "auxiliary_loss_mlp": 0.01037583, "balance_loss_clip": 1.01856923, "balance_loss_mlp": 1.04614615, "epoch": 0.2655042837817526, "flos": 17565274097280.0, "grad_norm": 2.35836962513854, "language_loss": 0.86787558, "learning_rate": 3.4458626867688707e-06, "loss": 0.88962376, "num_input_tokens_seen": 95501435, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.91015625, "step": 4416, "time_per_iteration": 2.4728195667266846 }, { "auxiliary_loss_clip": 0.01135427, "auxiliary_loss_mlp": 0.01041271, "balance_loss_clip": 1.02411652, "balance_loss_mlp": 1.04712033, "epoch": 0.2655644070344206, "flos": 23404636298880.0, "grad_norm": 1.5378564488628352, "language_loss": 0.76066029, "learning_rate": 3.4455935725409217e-06, "loss": 0.78242725, "num_input_tokens_seen": 95520135, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8828125, "step": 4417, "time_per_iteration": 2.469188690185547 }, { "auxiliary_loss_clip": 0.01131882, "auxiliary_loss_mlp": 0.01040163, "balance_loss_clip": 1.02188861, "balance_loss_mlp": 1.04686546, "epoch": 0.26562453028708854, "flos": 26468462678400.0, "grad_norm": 1.850334748459897, "language_loss": 0.79820752, "learning_rate": 3.4453244034959196e-06, "loss": 0.81992793, "num_input_tokens_seen": 95541705, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.8515625, "step": 4418, "time_per_iteration": 2.5128252506256104 }, { "auxiliary_loss_clip": 0.011354, "auxiliary_loss_mlp": 0.01046733, "balance_loss_clip": 1.02951956, "balance_loss_mlp": 1.04684329, "epoch": 0.2656846535397565, "flos": 19207576759680.0, "grad_norm": 2.9478917069289245, "language_loss": 0.66995668, "learning_rate": 3.445055179644071e-06, "loss": 0.69177806, "num_input_tokens_seen": 95560300, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8828125, "step": 4419, "time_per_iteration": 2.465895175933838 }, { "auxiliary_loss_clip": 0.01136335, "auxiliary_loss_mlp": 0.0104375, "balance_loss_clip": 1.02539134, "balance_loss_mlp": 1.04711974, "epoch": 0.2657447767924245, "flos": 30551325903360.0, "grad_norm": 2.0087665618421284, "language_loss": 0.79492056, "learning_rate": 3.444785900995585e-06, "loss": 0.81672144, "num_input_tokens_seen": 95580150, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.890625, "step": 4420, "time_per_iteration": 2.527631998062134 }, { "auxiliary_loss_clip": 0.01138631, "auxiliary_loss_mlp": 0.01049958, "balance_loss_clip": 1.03076494, "balance_loss_mlp": 1.04736245, "epoch": 0.26580490004509244, "flos": 20922742160640.0, "grad_norm": 1.9127090208141435, "language_loss": 0.81940597, "learning_rate": 3.444516567560673e-06, "loss": 0.84129184, "num_input_tokens_seen": 95597570, "router_z_loss_clip": 0.19238281, "router_z_loss_mlp": 0.9140625, "step": 4421, "time_per_iteration": 2.444425106048584 }, { "auxiliary_loss_clip": 0.01131454, "auxiliary_loss_mlp": 0.01040587, "balance_loss_clip": 1.02357626, "balance_loss_mlp": 1.04619133, "epoch": 0.2658650232977604, "flos": 43945682584320.0, "grad_norm": 1.6161647199042413, "language_loss": 0.65968502, "learning_rate": 3.444247179349548e-06, "loss": 0.68140543, "num_input_tokens_seen": 95619415, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8515625, "step": 4422, "time_per_iteration": 2.6675240993499756 }, { "auxiliary_loss_clip": 0.01135982, "auxiliary_loss_mlp": 0.01042444, "balance_loss_clip": 1.02588606, "balance_loss_mlp": 1.04633951, "epoch": 0.26592514655042837, "flos": 29716439109120.0, "grad_norm": 2.3303351155595085, "language_loss": 0.74321425, "learning_rate": 3.4439777363724252e-06, "loss": 0.76499856, "num_input_tokens_seen": 95639155, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.89453125, "step": 4423, "time_per_iteration": 2.5114827156066895 }, { "auxiliary_loss_clip": 0.01135901, "auxiliary_loss_mlp": 0.01042787, "balance_loss_clip": 1.02573991, "balance_loss_mlp": 1.04578924, "epoch": 0.26598526980309634, "flos": 46677730014720.0, "grad_norm": 1.6090241407596841, "language_loss": 0.77980006, "learning_rate": 3.443708238639522e-06, "loss": 0.80158699, "num_input_tokens_seen": 95663320, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.90234375, "step": 4424, "time_per_iteration": 2.68935227394104 }, { "auxiliary_loss_clip": 0.01135531, "auxiliary_loss_mlp": 0.01040881, "balance_loss_clip": 1.02440631, "balance_loss_mlp": 1.04718769, "epoch": 0.2660453930557643, "flos": 11509442582400.0, "grad_norm": 1.8836175228529204, "language_loss": 0.78959823, "learning_rate": 3.4434386861610573e-06, "loss": 0.81136227, "num_input_tokens_seen": 95680260, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8828125, "step": 4425, "time_per_iteration": 2.4139957427978516 }, { "auxiliary_loss_clip": 0.0113015, "auxiliary_loss_mlp": 0.01042828, "balance_loss_clip": 1.02727103, "balance_loss_mlp": 1.04527903, "epoch": 0.26610551630843227, "flos": 24791578197120.0, "grad_norm": 1.7964269739641976, "language_loss": 0.80213284, "learning_rate": 3.4431690789472532e-06, "loss": 0.82386261, "num_input_tokens_seen": 95701140, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8515625, "step": 4426, "time_per_iteration": 3.975665807723999 }, { "auxiliary_loss_clip": 0.01139945, "auxiliary_loss_mlp": 0.01046101, "balance_loss_clip": 1.02926826, "balance_loss_mlp": 1.05145621, "epoch": 0.26616563956110023, "flos": 27636385397760.0, "grad_norm": 2.068400261967983, "language_loss": 0.76865125, "learning_rate": 3.442899417008333e-06, "loss": 0.79051173, "num_input_tokens_seen": 95722060, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8828125, "step": 4427, "time_per_iteration": 2.5139057636260986 }, { "auxiliary_loss_clip": 0.01131206, "auxiliary_loss_mlp": 0.01033643, "balance_loss_clip": 1.01789546, "balance_loss_mlp": 1.0458442, "epoch": 0.26622576281376825, "flos": 28362893880960.0, "grad_norm": 1.6952692981825217, "language_loss": 0.77060419, "learning_rate": 3.4426297003545227e-06, "loss": 0.79225266, "num_input_tokens_seen": 95742495, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8515625, "step": 4428, "time_per_iteration": 3.933774709701538 }, { "auxiliary_loss_clip": 0.01135723, "auxiliary_loss_mlp": 0.01033551, "balance_loss_clip": 1.01794648, "balance_loss_mlp": 1.04676044, "epoch": 0.2662858860664362, "flos": 18041341979520.0, "grad_norm": 2.4592273753539384, "language_loss": 0.83013546, "learning_rate": 3.4423599289960495e-06, "loss": 0.85182822, "num_input_tokens_seen": 95761510, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.890625, "step": 4429, "time_per_iteration": 2.461719512939453 }, { "auxiliary_loss_clip": 0.01132042, "auxiliary_loss_mlp": 0.01038983, "balance_loss_clip": 1.02297354, "balance_loss_mlp": 1.04529011, "epoch": 0.2663460093191042, "flos": 22745818995840.0, "grad_norm": 2.1823767386241033, "language_loss": 0.71914274, "learning_rate": 3.442090102943143e-06, "loss": 0.74085295, "num_input_tokens_seen": 95782385, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8671875, "step": 4430, "time_per_iteration": 3.9195423126220703 }, { "auxiliary_loss_clip": 0.0113646, "auxiliary_loss_mlp": 0.01046566, "balance_loss_clip": 1.02854156, "balance_loss_mlp": 1.04740763, "epoch": 0.26640613257177215, "flos": 16508782344960.0, "grad_norm": 1.8607893258429027, "language_loss": 0.81681693, "learning_rate": 3.441820222206035e-06, "loss": 0.83864719, "num_input_tokens_seen": 95800595, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.890625, "step": 4431, "time_per_iteration": 2.4371337890625 }, { "auxiliary_loss_clip": 0.01140995, "auxiliary_loss_mlp": 0.01048513, "balance_loss_clip": 1.03090596, "balance_loss_mlp": 1.04736936, "epoch": 0.2664662558244401, "flos": 23075945919360.0, "grad_norm": 2.3903125244734302, "language_loss": 0.76645029, "learning_rate": 3.44155028679496e-06, "loss": 0.78834534, "num_input_tokens_seen": 95818480, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9375, "step": 4432, "time_per_iteration": 2.4914894104003906 }, { "auxiliary_loss_clip": 0.01133896, "auxiliary_loss_mlp": 0.01040071, "balance_loss_clip": 1.02290535, "balance_loss_mlp": 1.04558706, "epoch": 0.2665263790771081, "flos": 23769273214080.0, "grad_norm": 2.0461595576500677, "language_loss": 0.83005869, "learning_rate": 3.441280296720154e-06, "loss": 0.85179836, "num_input_tokens_seen": 95837205, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8828125, "step": 4433, "time_per_iteration": 2.4777402877807617 }, { "auxiliary_loss_clip": 0.01135171, "auxiliary_loss_mlp": 0.01041782, "balance_loss_clip": 1.02524757, "balance_loss_mlp": 1.04785633, "epoch": 0.26658650232977604, "flos": 28001273708160.0, "grad_norm": 3.199174363521554, "language_loss": 0.76575482, "learning_rate": 3.441010251991854e-06, "loss": 0.7875244, "num_input_tokens_seen": 95858395, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.875, "step": 4434, "time_per_iteration": 2.5426814556121826 }, { "auxiliary_loss_clip": 0.01131591, "auxiliary_loss_mlp": 0.01039074, "balance_loss_clip": 1.02343345, "balance_loss_mlp": 1.04512107, "epoch": 0.266646625582444, "flos": 22163635359360.0, "grad_norm": 2.937938457865961, "language_loss": 0.8254832, "learning_rate": 3.440740152620301e-06, "loss": 0.84718984, "num_input_tokens_seen": 95877875, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.86328125, "step": 4435, "time_per_iteration": 2.4734201431274414 }, { "auxiliary_loss_clip": 0.01138601, "auxiliary_loss_mlp": 0.01052282, "balance_loss_clip": 1.03395915, "balance_loss_mlp": 1.04637182, "epoch": 0.266706748835112, "flos": 27853537069440.0, "grad_norm": 1.9505350548091456, "language_loss": 0.87434733, "learning_rate": 3.4404699986157376e-06, "loss": 0.89625609, "num_input_tokens_seen": 95895820, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.921875, "step": 4436, "time_per_iteration": 2.5144879817962646 }, { "auxiliary_loss_clip": 0.01136122, "auxiliary_loss_mlp": 0.0104101, "balance_loss_clip": 1.02488065, "balance_loss_mlp": 1.04679179, "epoch": 0.26676687208777994, "flos": 25812123413760.0, "grad_norm": 1.4290723409488262, "language_loss": 0.78764188, "learning_rate": 3.440199789988407e-06, "loss": 0.80941319, "num_input_tokens_seen": 95918025, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.89453125, "step": 4437, "time_per_iteration": 2.504765510559082 }, { "auxiliary_loss_clip": 0.01133623, "auxiliary_loss_mlp": 0.01042896, "balance_loss_clip": 1.02685082, "balance_loss_mlp": 1.04615164, "epoch": 0.2668269953404479, "flos": 36064583504640.0, "grad_norm": 2.8506749984247293, "language_loss": 0.64172232, "learning_rate": 3.439929526748556e-06, "loss": 0.66348749, "num_input_tokens_seen": 95937725, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.875, "step": 4438, "time_per_iteration": 2.584415912628174 }, { "auxiliary_loss_clip": 0.01133058, "auxiliary_loss_mlp": 0.01037317, "balance_loss_clip": 1.02179635, "balance_loss_mlp": 1.04523516, "epoch": 0.26688711859311587, "flos": 26570987072640.0, "grad_norm": 1.9000176880281914, "language_loss": 0.75672364, "learning_rate": 3.4396592089064334e-06, "loss": 0.77842736, "num_input_tokens_seen": 95956335, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.87890625, "step": 4439, "time_per_iteration": 2.4841136932373047 }, { "auxiliary_loss_clip": 0.01136144, "auxiliary_loss_mlp": 0.01035692, "balance_loss_clip": 1.01738119, "balance_loss_mlp": 1.04587185, "epoch": 0.26694724184578383, "flos": 26761565658240.0, "grad_norm": 1.7880272099052008, "language_loss": 0.7138195, "learning_rate": 3.4393888364722897e-06, "loss": 0.73553789, "num_input_tokens_seen": 95977135, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.90234375, "step": 4440, "time_per_iteration": 2.514580011367798 }, { "auxiliary_loss_clip": 0.01137471, "auxiliary_loss_mlp": 0.01042898, "balance_loss_clip": 1.02555263, "balance_loss_mlp": 1.04660153, "epoch": 0.2670073650984518, "flos": 20959586536320.0, "grad_norm": 2.3683622624742515, "language_loss": 0.67253649, "learning_rate": 3.439118409456376e-06, "loss": 0.69434011, "num_input_tokens_seen": 95995435, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.90625, "step": 4441, "time_per_iteration": 2.4575541019439697 }, { "auxiliary_loss_clip": 0.01135184, "auxiliary_loss_mlp": 0.01037222, "balance_loss_clip": 1.02022266, "balance_loss_mlp": 1.04688859, "epoch": 0.2670674883511198, "flos": 28366054277760.0, "grad_norm": 1.5921703670961977, "language_loss": 0.76596045, "learning_rate": 3.4388479278689486e-06, "loss": 0.78768456, "num_input_tokens_seen": 96016340, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8828125, "step": 4442, "time_per_iteration": 2.538790225982666 }, { "auxiliary_loss_clip": 0.01053483, "auxiliary_loss_mlp": 0.01015706, "balance_loss_clip": 1.01361966, "balance_loss_mlp": 1.02280092, "epoch": 0.2671276116037878, "flos": 58971319430400.0, "grad_norm": 0.936963993931264, "language_loss": 0.61263198, "learning_rate": 3.4385773917202637e-06, "loss": 0.63332385, "num_input_tokens_seen": 96071205, "router_z_loss_clip": 0.02087402, "router_z_loss_mlp": 0.30664062, "step": 4443, "time_per_iteration": 2.97869873046875 }, { "auxiliary_loss_clip": 0.01135915, "auxiliary_loss_mlp": 0.01038134, "balance_loss_clip": 1.02210045, "balance_loss_mlp": 1.04665709, "epoch": 0.26718773485645575, "flos": 43945072053120.0, "grad_norm": 1.5649999552478715, "language_loss": 0.764521, "learning_rate": 3.4383068010205793e-06, "loss": 0.78626144, "num_input_tokens_seen": 96094240, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.89453125, "step": 4444, "time_per_iteration": 2.660470485687256 }, { "auxiliary_loss_clip": 0.01132796, "auxiliary_loss_mlp": 0.01040516, "balance_loss_clip": 1.02318311, "balance_loss_mlp": 1.04499459, "epoch": 0.2672478581091237, "flos": 25228323665280.0, "grad_norm": 1.8134249082380376, "language_loss": 0.80822152, "learning_rate": 3.438036155780158e-06, "loss": 0.82995462, "num_input_tokens_seen": 96114105, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.87890625, "step": 4445, "time_per_iteration": 2.4832053184509277 }, { "auxiliary_loss_clip": 0.01134746, "auxiliary_loss_mlp": 0.01037767, "balance_loss_clip": 1.01984954, "balance_loss_mlp": 1.04472148, "epoch": 0.2673079813617917, "flos": 15268176455040.0, "grad_norm": 1.996826027025563, "language_loss": 0.89135188, "learning_rate": 3.43776545600926e-06, "loss": 0.913077, "num_input_tokens_seen": 96132140, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.90234375, "step": 4446, "time_per_iteration": 2.446345329284668 }, { "auxiliary_loss_clip": 0.01132126, "auxiliary_loss_mlp": 0.01042088, "balance_loss_clip": 1.02598298, "balance_loss_mlp": 1.04466605, "epoch": 0.26736810461445965, "flos": 25812733944960.0, "grad_norm": 1.634068297609999, "language_loss": 0.67965502, "learning_rate": 3.437494701718153e-06, "loss": 0.70139718, "num_input_tokens_seen": 96152090, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.875, "step": 4447, "time_per_iteration": 2.484032154083252 }, { "auxiliary_loss_clip": 0.01133362, "auxiliary_loss_mlp": 0.01034987, "balance_loss_clip": 1.01805902, "balance_loss_mlp": 1.04410148, "epoch": 0.2674282278671276, "flos": 24312709054080.0, "grad_norm": 2.0194208078523306, "language_loss": 0.82845271, "learning_rate": 3.4372238929171026e-06, "loss": 0.85013616, "num_input_tokens_seen": 96170015, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.890625, "step": 4448, "time_per_iteration": 2.497568130493164 }, { "auxiliary_loss_clip": 0.01130389, "auxiliary_loss_mlp": 0.01046978, "balance_loss_clip": 1.03009748, "balance_loss_mlp": 1.04454434, "epoch": 0.2674883511197956, "flos": 22815521337600.0, "grad_norm": 1.927339025547578, "language_loss": 0.84195495, "learning_rate": 3.436953029616378e-06, "loss": 0.86372864, "num_input_tokens_seen": 96188065, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.859375, "step": 4449, "time_per_iteration": 2.4583044052124023 }, { "auxiliary_loss_clip": 0.01140151, "auxiliary_loss_mlp": 0.01044097, "balance_loss_clip": 1.02565491, "balance_loss_mlp": 1.04530263, "epoch": 0.26754847437246354, "flos": 25370170473600.0, "grad_norm": 1.8051908734107982, "language_loss": 0.84214652, "learning_rate": 3.4366821118262506e-06, "loss": 0.863989, "num_input_tokens_seen": 96205780, "router_z_loss_clip": 0.18457031, "router_z_loss_mlp": 0.94921875, "step": 4450, "time_per_iteration": 2.5101914405822754 }, { "auxiliary_loss_clip": 0.01128869, "auxiliary_loss_mlp": 0.01041118, "balance_loss_clip": 1.02621698, "balance_loss_mlp": 1.04316378, "epoch": 0.2676085976251315, "flos": 20230420446720.0, "grad_norm": 2.013327536810982, "language_loss": 0.80978924, "learning_rate": 3.4364111395569937e-06, "loss": 0.83148909, "num_input_tokens_seen": 96224990, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.85546875, "step": 4451, "time_per_iteration": 2.461092710494995 }, { "auxiliary_loss_clip": 0.01133471, "auxiliary_loss_mlp": 0.01037352, "balance_loss_clip": 1.02154469, "balance_loss_mlp": 1.04682684, "epoch": 0.26766872087779947, "flos": 28038225824640.0, "grad_norm": 1.62764778878327, "language_loss": 0.8618347, "learning_rate": 3.436140112818882e-06, "loss": 0.8835429, "num_input_tokens_seen": 96245345, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8671875, "step": 4452, "time_per_iteration": 2.5395350456237793 }, { "auxiliary_loss_clip": 0.01136316, "auxiliary_loss_mlp": 0.01041014, "balance_loss_clip": 1.02416992, "balance_loss_mlp": 1.04668736, "epoch": 0.26772884413046744, "flos": 18325179250560.0, "grad_norm": 2.359166756892329, "language_loss": 0.83310664, "learning_rate": 3.435869031622194e-06, "loss": 0.85487998, "num_input_tokens_seen": 96259000, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8984375, "step": 4453, "time_per_iteration": 2.4129626750946045 }, { "auxiliary_loss_clip": 0.0113169, "auxiliary_loss_mlp": 0.01051997, "balance_loss_clip": 1.03455687, "balance_loss_mlp": 1.04435182, "epoch": 0.2677889673831354, "flos": 22127509255680.0, "grad_norm": 1.724063855901959, "language_loss": 0.79805887, "learning_rate": 3.435597895977208e-06, "loss": 0.81989574, "num_input_tokens_seen": 96277000, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.875, "step": 4454, "time_per_iteration": 2.5173137187957764 }, { "auxiliary_loss_clip": 0.01136136, "auxiliary_loss_mlp": 0.0103995, "balance_loss_clip": 1.02382708, "balance_loss_mlp": 1.0466367, "epoch": 0.2678490906358034, "flos": 23729699404800.0, "grad_norm": 1.6844859696127175, "language_loss": 0.72249997, "learning_rate": 3.435326705894206e-06, "loss": 0.74426085, "num_input_tokens_seen": 96297010, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.89453125, "step": 4455, "time_per_iteration": 2.490076780319214 }, { "auxiliary_loss_clip": 0.01132077, "auxiliary_loss_mlp": 0.01040184, "balance_loss_clip": 1.02440131, "balance_loss_mlp": 1.04727602, "epoch": 0.2679092138884714, "flos": 21762872340480.0, "grad_norm": 1.6195466068543625, "language_loss": 0.73906225, "learning_rate": 3.435055461383471e-06, "loss": 0.76078486, "num_input_tokens_seen": 96315780, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.84765625, "step": 4456, "time_per_iteration": 2.5066957473754883 }, { "auxiliary_loss_clip": 0.01135502, "auxiliary_loss_mlp": 0.01039031, "balance_loss_clip": 1.0217576, "balance_loss_mlp": 1.04563761, "epoch": 0.26796933714113935, "flos": 19861186590720.0, "grad_norm": 2.232233308914488, "language_loss": 0.70824397, "learning_rate": 3.4347841624552896e-06, "loss": 0.72998929, "num_input_tokens_seen": 96333465, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.8984375, "step": 4457, "time_per_iteration": 2.437558174133301 }, { "auxiliary_loss_clip": 0.01138856, "auxiliary_loss_mlp": 0.01045689, "balance_loss_clip": 1.02891612, "balance_loss_mlp": 1.05026937, "epoch": 0.2680294603938073, "flos": 20047886507520.0, "grad_norm": 1.663733752442136, "language_loss": 0.78927398, "learning_rate": 3.4345128091199493e-06, "loss": 0.8111195, "num_input_tokens_seen": 96352005, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.88671875, "step": 4458, "time_per_iteration": 2.480062484741211 }, { "auxiliary_loss_clip": 0.01054834, "auxiliary_loss_mlp": 0.01005963, "balance_loss_clip": 1.00386477, "balance_loss_mlp": 1.02338958, "epoch": 0.2680895836464753, "flos": 72113763052800.0, "grad_norm": 0.8564401803522023, "language_loss": 0.58723879, "learning_rate": 3.434241401387739e-06, "loss": 0.60784686, "num_input_tokens_seen": 96406265, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.31445312, "step": 4459, "time_per_iteration": 3.077289342880249 }, { "auxiliary_loss_clip": 0.01130235, "auxiliary_loss_mlp": 0.0103658, "balance_loss_clip": 1.02110088, "balance_loss_mlp": 1.04438305, "epoch": 0.26814970689914325, "flos": 20449044576000.0, "grad_norm": 1.9442154225510224, "language_loss": 0.85024744, "learning_rate": 3.4339699392689507e-06, "loss": 0.87191558, "num_input_tokens_seen": 96425225, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.859375, "step": 4460, "time_per_iteration": 2.476144790649414 }, { "auxiliary_loss_clip": 0.01131691, "auxiliary_loss_mlp": 0.01042906, "balance_loss_clip": 1.02618098, "balance_loss_mlp": 1.04542732, "epoch": 0.2682098301518112, "flos": 17566674727680.0, "grad_norm": 1.7168779623766486, "language_loss": 0.68439347, "learning_rate": 3.4336984227738796e-06, "loss": 0.70613939, "num_input_tokens_seen": 96443780, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.859375, "step": 4461, "time_per_iteration": 2.4315009117126465 }, { "auxiliary_loss_clip": 0.01133164, "auxiliary_loss_mlp": 0.01043805, "balance_loss_clip": 1.02721143, "balance_loss_mlp": 1.04621565, "epoch": 0.2682699534044792, "flos": 18333259810560.0, "grad_norm": 2.3451939456163493, "language_loss": 0.66936266, "learning_rate": 3.43342685191282e-06, "loss": 0.69113237, "num_input_tokens_seen": 96464530, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.87109375, "step": 4462, "time_per_iteration": 2.511976957321167 }, { "auxiliary_loss_clip": 0.01134364, "auxiliary_loss_mlp": 0.01042177, "balance_loss_clip": 1.02507067, "balance_loss_mlp": 1.04803717, "epoch": 0.26833007665714714, "flos": 25301294144640.0, "grad_norm": 1.5735121097819866, "language_loss": 0.69258094, "learning_rate": 3.4331552266960705e-06, "loss": 0.71434635, "num_input_tokens_seen": 96483345, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.86328125, "step": 4463, "time_per_iteration": 2.511164426803589 }, { "auxiliary_loss_clip": 0.0113545, "auxiliary_loss_mlp": 0.01036113, "balance_loss_clip": 1.01939964, "balance_loss_mlp": 1.04653895, "epoch": 0.2683901999098151, "flos": 16099759198080.0, "grad_norm": 3.273662903170675, "language_loss": 0.77834737, "learning_rate": 3.432883547133931e-06, "loss": 0.80006301, "num_input_tokens_seen": 96498305, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.890625, "step": 4464, "time_per_iteration": 2.4469563961029053 }, { "auxiliary_loss_clip": 0.01131637, "auxiliary_loss_mlp": 0.01038741, "balance_loss_clip": 1.02184892, "balance_loss_mlp": 1.04538012, "epoch": 0.2684503231624831, "flos": 27308054154240.0, "grad_norm": 2.0069322679637165, "language_loss": 0.7084372, "learning_rate": 3.432611813236704e-06, "loss": 0.73014092, "num_input_tokens_seen": 96519740, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.86328125, "step": 4465, "time_per_iteration": 2.5041544437408447 }, { "auxiliary_loss_clip": 0.01054276, "auxiliary_loss_mlp": 0.01005137, "balance_loss_clip": 1.00307488, "balance_loss_mlp": 1.02317047, "epoch": 0.26851044641515104, "flos": 71858007239040.0, "grad_norm": 0.6968314741479403, "language_loss": 0.53123236, "learning_rate": 3.4323400250146943e-06, "loss": 0.55182648, "num_input_tokens_seen": 96588870, "router_z_loss_clip": 0.02062988, "router_z_loss_mlp": 0.3125, "step": 4466, "time_per_iteration": 3.2394464015960693 }, { "auxiliary_loss_clip": 0.01131453, "auxiliary_loss_mlp": 0.01046165, "balance_loss_clip": 1.02918971, "balance_loss_mlp": 1.0459156, "epoch": 0.268570569667819, "flos": 18733771434240.0, "grad_norm": 1.890755355284894, "language_loss": 0.73568755, "learning_rate": 3.4320681824782057e-06, "loss": 0.75746375, "num_input_tokens_seen": 96605100, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.85546875, "step": 4467, "time_per_iteration": 2.4348206520080566 }, { "auxiliary_loss_clip": 0.01135754, "auxiliary_loss_mlp": 0.01041603, "balance_loss_clip": 1.02437735, "balance_loss_mlp": 1.04707944, "epoch": 0.268630692920487, "flos": 18178376365440.0, "grad_norm": 4.981042260305903, "language_loss": 0.8063519, "learning_rate": 3.4317962856375493e-06, "loss": 0.82812548, "num_input_tokens_seen": 96621410, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.88671875, "step": 4468, "time_per_iteration": 3.9129655361175537 }, { "auxiliary_loss_clip": 0.01052207, "auxiliary_loss_mlp": 0.01005272, "balance_loss_clip": 1.00304246, "balance_loss_mlp": 1.02143228, "epoch": 0.268690816173155, "flos": 68731768978560.0, "grad_norm": 0.8703904716961157, "language_loss": 0.59581596, "learning_rate": 3.4315243345030334e-06, "loss": 0.61639071, "num_input_tokens_seen": 96684810, "router_z_loss_clip": 0.02233887, "router_z_loss_mlp": 0.30859375, "step": 4469, "time_per_iteration": 3.179001569747925 }, { "auxiliary_loss_clip": 0.01134464, "auxiliary_loss_mlp": 0.0104429, "balance_loss_clip": 1.02638459, "balance_loss_mlp": 1.04687393, "epoch": 0.26875093942582295, "flos": 23293636295040.0, "grad_norm": 1.9500546033035722, "language_loss": 0.81915796, "learning_rate": 3.431252329084972e-06, "loss": 0.84094548, "num_input_tokens_seen": 96701920, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.875, "step": 4470, "time_per_iteration": 3.908431053161621 }, { "auxiliary_loss_clip": 0.01125968, "auxiliary_loss_mlp": 0.01035808, "balance_loss_clip": 1.01982176, "balance_loss_mlp": 1.04352975, "epoch": 0.2688110626784909, "flos": 21543458112000.0, "grad_norm": 1.623681387017176, "language_loss": 0.82678616, "learning_rate": 3.4309802693936786e-06, "loss": 0.84840387, "num_input_tokens_seen": 96721260, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.828125, "step": 4471, "time_per_iteration": 2.4778761863708496 }, { "auxiliary_loss_clip": 0.01130815, "auxiliary_loss_mlp": 0.01036603, "balance_loss_clip": 1.02116549, "balance_loss_mlp": 1.04690909, "epoch": 0.2688711859311589, "flos": 28400600183040.0, "grad_norm": 2.1691327257781827, "language_loss": 0.69347453, "learning_rate": 3.43070815543947e-06, "loss": 0.71514869, "num_input_tokens_seen": 96740385, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.83984375, "step": 4472, "time_per_iteration": 3.958587646484375 }, { "auxiliary_loss_clip": 0.01132434, "auxiliary_loss_mlp": 0.01039736, "balance_loss_clip": 1.02401257, "balance_loss_mlp": 1.04723597, "epoch": 0.26893130918382685, "flos": 25994944661760.0, "grad_norm": 1.4681094291967434, "language_loss": 0.67814219, "learning_rate": 3.4304359872326656e-06, "loss": 0.69986385, "num_input_tokens_seen": 96761860, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8515625, "step": 4473, "time_per_iteration": 2.515994071960449 }, { "auxiliary_loss_clip": 0.01130949, "auxiliary_loss_mlp": 0.01041643, "balance_loss_clip": 1.02606201, "balance_loss_mlp": 1.04741693, "epoch": 0.2689914324364948, "flos": 20339624770560.0, "grad_norm": 1.641293172043035, "language_loss": 0.82879007, "learning_rate": 3.4301637647835843e-06, "loss": 0.85051596, "num_input_tokens_seen": 96781890, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 4474, "time_per_iteration": 2.507427930831909 }, { "auxiliary_loss_clip": 0.01129625, "auxiliary_loss_mlp": 0.01043082, "balance_loss_clip": 1.02753735, "balance_loss_mlp": 1.04649305, "epoch": 0.2690515556891628, "flos": 19464553635840.0, "grad_norm": 1.904040319482147, "language_loss": 0.70662582, "learning_rate": 3.4298914881025494e-06, "loss": 0.72835284, "num_input_tokens_seen": 96800390, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.83203125, "step": 4475, "time_per_iteration": 2.450082302093506 }, { "auxiliary_loss_clip": 0.01132329, "auxiliary_loss_mlp": 0.01041691, "balance_loss_clip": 1.02557397, "balance_loss_mlp": 1.04596257, "epoch": 0.26911167894183075, "flos": 18146631720960.0, "grad_norm": 1.7369068766465128, "language_loss": 0.7305274, "learning_rate": 3.4296191571998863e-06, "loss": 0.75226754, "num_input_tokens_seen": 96816685, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.86328125, "step": 4476, "time_per_iteration": 2.438258409500122 }, { "auxiliary_loss_clip": 0.0112935, "auxiliary_loss_mlp": 0.01036284, "balance_loss_clip": 1.02084649, "balance_loss_mlp": 1.04601526, "epoch": 0.2691718021944987, "flos": 19975131509760.0, "grad_norm": 1.8722585325229495, "language_loss": 0.80441916, "learning_rate": 3.429346772085922e-06, "loss": 0.82607549, "num_input_tokens_seen": 96836285, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.83203125, "step": 4477, "time_per_iteration": 2.4471535682678223 }, { "auxiliary_loss_clip": 0.01132065, "auxiliary_loss_mlp": 0.0104217, "balance_loss_clip": 1.02567101, "balance_loss_mlp": 1.04469132, "epoch": 0.2692319254471667, "flos": 37447215770880.0, "grad_norm": 1.8068465549908073, "language_loss": 0.65133625, "learning_rate": 3.429074332770984e-06, "loss": 0.67307866, "num_input_tokens_seen": 96857745, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.875, "step": 4478, "time_per_iteration": 2.602989673614502 }, { "auxiliary_loss_clip": 0.01131567, "auxiliary_loss_mlp": 0.01041107, "balance_loss_clip": 1.02500129, "balance_loss_mlp": 1.04510784, "epoch": 0.26929204869983464, "flos": 22127796564480.0, "grad_norm": 1.700708858425864, "language_loss": 0.80772454, "learning_rate": 3.4288018392654047e-06, "loss": 0.82945126, "num_input_tokens_seen": 96877295, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.86328125, "step": 4479, "time_per_iteration": 2.4724135398864746 }, { "auxiliary_loss_clip": 0.01132969, "auxiliary_loss_mlp": 0.01046133, "balance_loss_clip": 1.02981305, "balance_loss_mlp": 1.04545176, "epoch": 0.2693521719525026, "flos": 19792813052160.0, "grad_norm": 1.896304496365282, "language_loss": 0.80687439, "learning_rate": 3.4285292915795166e-06, "loss": 0.82866538, "num_input_tokens_seen": 96896160, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.875, "step": 4480, "time_per_iteration": 2.487715482711792 }, { "auxiliary_loss_clip": 0.01125898, "auxiliary_loss_mlp": 0.01031421, "balance_loss_clip": 1.01665127, "balance_loss_mlp": 1.04424119, "epoch": 0.2694122952051706, "flos": 20994383836800.0, "grad_norm": 2.436575140678998, "language_loss": 0.77802223, "learning_rate": 3.4282566897236543e-06, "loss": 0.79959542, "num_input_tokens_seen": 96915410, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.81640625, "step": 4481, "time_per_iteration": 2.4573376178741455 }, { "auxiliary_loss_clip": 0.01134206, "auxiliary_loss_mlp": 0.01047288, "balance_loss_clip": 1.0306102, "balance_loss_mlp": 1.04635561, "epoch": 0.2694724184578386, "flos": 25849291011840.0, "grad_norm": 1.7859771917790284, "language_loss": 0.73889458, "learning_rate": 3.4279840337081547e-06, "loss": 0.76070952, "num_input_tokens_seen": 96937865, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.87890625, "step": 4482, "time_per_iteration": 2.521101236343384 }, { "auxiliary_loss_clip": 0.01133918, "auxiliary_loss_mlp": 0.01035023, "balance_loss_clip": 1.01850033, "balance_loss_mlp": 1.04760814, "epoch": 0.26953254171050656, "flos": 21726961718400.0, "grad_norm": 2.2148510306918436, "language_loss": 0.72242403, "learning_rate": 3.4277113235433584e-06, "loss": 0.7441135, "num_input_tokens_seen": 96957710, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.86328125, "step": 4483, "time_per_iteration": 2.461747646331787 }, { "auxiliary_loss_clip": 0.01133807, "auxiliary_loss_mlp": 0.01044447, "balance_loss_clip": 1.02686334, "balance_loss_mlp": 1.04421449, "epoch": 0.2695926649631745, "flos": 19682926369920.0, "grad_norm": 2.3120521400146785, "language_loss": 0.86616027, "learning_rate": 3.427438559239605e-06, "loss": 0.88794279, "num_input_tokens_seen": 96975890, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.89453125, "step": 4484, "time_per_iteration": 2.456489086151123 }, { "auxiliary_loss_clip": 0.0113266, "auxiliary_loss_mlp": 0.01038744, "balance_loss_clip": 1.02324736, "balance_loss_mlp": 1.04497385, "epoch": 0.2696527882158425, "flos": 32886596724480.0, "grad_norm": 1.5196953009618055, "language_loss": 0.66475618, "learning_rate": 3.427165740807239e-06, "loss": 0.68647021, "num_input_tokens_seen": 96998595, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.875, "step": 4485, "time_per_iteration": 2.5537586212158203 }, { "auxiliary_loss_clip": 0.0113309, "auxiliary_loss_mlp": 0.01041535, "balance_loss_clip": 1.02530456, "balance_loss_mlp": 1.04597104, "epoch": 0.26971291146851045, "flos": 12124843320960.0, "grad_norm": 2.125504934129669, "language_loss": 0.72858, "learning_rate": 3.426892868256604e-06, "loss": 0.75032622, "num_input_tokens_seen": 97013715, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.87109375, "step": 4486, "time_per_iteration": 2.452258586883545 }, { "auxiliary_loss_clip": 0.01137306, "auxiliary_loss_mlp": 0.01038815, "balance_loss_clip": 1.02232802, "balance_loss_mlp": 1.04852247, "epoch": 0.2697730347211784, "flos": 22634459856000.0, "grad_norm": 1.8744404073676186, "language_loss": 0.83635271, "learning_rate": 3.4266199415980495e-06, "loss": 0.85811388, "num_input_tokens_seen": 97031570, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.890625, "step": 4487, "time_per_iteration": 2.4684674739837646 }, { "auxiliary_loss_clip": 0.01135805, "auxiliary_loss_mlp": 0.0104552, "balance_loss_clip": 1.02796006, "balance_loss_mlp": 1.04700112, "epoch": 0.2698331579738464, "flos": 23513050523520.0, "grad_norm": 2.03147005709025, "language_loss": 0.71960688, "learning_rate": 3.4263469608419234e-06, "loss": 0.74142015, "num_input_tokens_seen": 97049815, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.890625, "step": 4488, "time_per_iteration": 2.47016978263855 }, { "auxiliary_loss_clip": 0.0113656, "auxiliary_loss_mlp": 0.01047112, "balance_loss_clip": 1.03041077, "balance_loss_mlp": 1.0489459, "epoch": 0.26989328122651435, "flos": 24641040297600.0, "grad_norm": 1.6044421258417176, "language_loss": 0.83872449, "learning_rate": 3.426073925998578e-06, "loss": 0.86056119, "num_input_tokens_seen": 97067570, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.875, "step": 4489, "time_per_iteration": 2.4867477416992188 }, { "auxiliary_loss_clip": 0.01136936, "auxiliary_loss_mlp": 0.01053158, "balance_loss_clip": 1.03543162, "balance_loss_mlp": 1.04765892, "epoch": 0.2699534044791823, "flos": 10772555068800.0, "grad_norm": 2.1547531606984154, "language_loss": 0.90044028, "learning_rate": 3.4258008370783656e-06, "loss": 0.92234123, "num_input_tokens_seen": 97082180, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.890625, "step": 4490, "time_per_iteration": 2.455303192138672 }, { "auxiliary_loss_clip": 0.01130965, "auxiliary_loss_mlp": 0.01040765, "balance_loss_clip": 1.0249933, "balance_loss_mlp": 1.04656577, "epoch": 0.2700135277318503, "flos": 36171597098880.0, "grad_norm": 1.9764664791951108, "language_loss": 0.73507702, "learning_rate": 3.4255276940916434e-06, "loss": 0.75679427, "num_input_tokens_seen": 97103470, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84375, "step": 4491, "time_per_iteration": 2.5791406631469727 }, { "auxiliary_loss_clip": 0.01139844, "auxiliary_loss_mlp": 0.01040714, "balance_loss_clip": 1.02353024, "balance_loss_mlp": 1.05170155, "epoch": 0.27007365098451824, "flos": 17418614866560.0, "grad_norm": 2.8361353936820315, "language_loss": 0.74176615, "learning_rate": 3.4252544970487676e-06, "loss": 0.76357174, "num_input_tokens_seen": 97118100, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8828125, "step": 4492, "time_per_iteration": 2.434933662414551 }, { "auxiliary_loss_clip": 0.01134174, "auxiliary_loss_mlp": 0.01041738, "balance_loss_clip": 1.02463174, "balance_loss_mlp": 1.047948, "epoch": 0.2701337742371862, "flos": 23185688947200.0, "grad_norm": 1.8472826097835011, "language_loss": 0.88937056, "learning_rate": 3.4249812459600986e-06, "loss": 0.91112965, "num_input_tokens_seen": 97136765, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.859375, "step": 4493, "time_per_iteration": 2.4738576412200928 }, { "auxiliary_loss_clip": 0.01135452, "auxiliary_loss_mlp": 0.0104161, "balance_loss_clip": 1.02562451, "balance_loss_mlp": 1.04807925, "epoch": 0.2701938974898542, "flos": 24389450461440.0, "grad_norm": 1.5272798836155599, "language_loss": 0.7136057, "learning_rate": 3.424707940835998e-06, "loss": 0.73537636, "num_input_tokens_seen": 97157470, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.87109375, "step": 4494, "time_per_iteration": 2.5200817584991455 }, { "auxiliary_loss_clip": 0.01130556, "auxiliary_loss_mlp": 0.01037467, "balance_loss_clip": 1.0217495, "balance_loss_mlp": 1.04662108, "epoch": 0.2702540207425222, "flos": 26214322976640.0, "grad_norm": 1.9929262314491847, "language_loss": 0.86222291, "learning_rate": 3.42443458168683e-06, "loss": 0.88390315, "num_input_tokens_seen": 97176905, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.83984375, "step": 4495, "time_per_iteration": 2.5080244541168213 }, { "auxiliary_loss_clip": 0.011337, "auxiliary_loss_mlp": 0.01048554, "balance_loss_clip": 1.031757, "balance_loss_mlp": 1.04766893, "epoch": 0.27031414399519016, "flos": 22926377687040.0, "grad_norm": 1.6136925832004694, "language_loss": 0.76982236, "learning_rate": 3.424161168522959e-06, "loss": 0.79164493, "num_input_tokens_seen": 97196380, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.859375, "step": 4496, "time_per_iteration": 2.497774124145508 }, { "auxiliary_loss_clip": 0.01053889, "auxiliary_loss_mlp": 0.01020633, "balance_loss_clip": 1.01836765, "balance_loss_mlp": 1.02394223, "epoch": 0.2703742672478581, "flos": 63019780404480.0, "grad_norm": 0.7493503153876653, "language_loss": 0.50180721, "learning_rate": 3.423887701354754e-06, "loss": 0.52255243, "num_input_tokens_seen": 97260100, "router_z_loss_clip": 0.02270508, "router_z_loss_mlp": 0.29882812, "step": 4497, "time_per_iteration": 3.117485523223877 }, { "auxiliary_loss_clip": 0.01135873, "auxiliary_loss_mlp": 0.01041869, "balance_loss_clip": 1.02552497, "balance_loss_mlp": 1.04976439, "epoch": 0.2704343905005261, "flos": 18840820942080.0, "grad_norm": 1.6947844386984008, "language_loss": 0.72821695, "learning_rate": 3.4236141801925847e-06, "loss": 0.74999434, "num_input_tokens_seen": 97277935, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.859375, "step": 4498, "time_per_iteration": 2.4921469688415527 }, { "auxiliary_loss_clip": 0.01052944, "auxiliary_loss_mlp": 0.0100983, "balance_loss_clip": 1.00763667, "balance_loss_mlp": 1.02313972, "epoch": 0.27049451375319405, "flos": 71233412618880.0, "grad_norm": 0.7646858720568477, "language_loss": 0.59203541, "learning_rate": 3.4233406050468237e-06, "loss": 0.61266315, "num_input_tokens_seen": 97338845, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.296875, "step": 4499, "time_per_iteration": 3.108187437057495 }, { "auxiliary_loss_clip": 0.0113302, "auxiliary_loss_mlp": 0.01036574, "balance_loss_clip": 1.02006364, "balance_loss_mlp": 1.04772568, "epoch": 0.270554637005862, "flos": 24278594112000.0, "grad_norm": 2.1201332504003307, "language_loss": 0.73298663, "learning_rate": 3.4230669759278438e-06, "loss": 0.7546826, "num_input_tokens_seen": 97356640, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8515625, "step": 4500, "time_per_iteration": 2.4929091930389404 }, { "auxiliary_loss_clip": 0.01130706, "auxiliary_loss_mlp": 0.01040348, "balance_loss_clip": 1.02370667, "balance_loss_mlp": 1.04372525, "epoch": 0.27061476025853, "flos": 17632318832640.0, "grad_norm": 2.7233723108406362, "language_loss": 0.81230032, "learning_rate": 3.4227932928460215e-06, "loss": 0.83401084, "num_input_tokens_seen": 97372585, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.87109375, "step": 4501, "time_per_iteration": 2.4246280193328857 }, { "auxiliary_loss_clip": 0.01136309, "auxiliary_loss_mlp": 0.01045477, "balance_loss_clip": 1.02825761, "balance_loss_mlp": 1.04752362, "epoch": 0.27067488351119795, "flos": 22710123855360.0, "grad_norm": 1.812207054460621, "language_loss": 0.72659326, "learning_rate": 3.422519555811735e-06, "loss": 0.74841112, "num_input_tokens_seen": 97393315, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.88671875, "step": 4502, "time_per_iteration": 2.4950778484344482 }, { "auxiliary_loss_clip": 0.01135259, "auxiliary_loss_mlp": 0.01040848, "balance_loss_clip": 1.02216828, "balance_loss_mlp": 1.04360592, "epoch": 0.2707350067638659, "flos": 41719616087040.0, "grad_norm": 1.791494056997056, "language_loss": 0.68339229, "learning_rate": 3.4222457648353642e-06, "loss": 0.70515329, "num_input_tokens_seen": 97417860, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.91796875, "step": 4503, "time_per_iteration": 2.6462838649749756 }, { "auxiliary_loss_clip": 0.01134479, "auxiliary_loss_mlp": 0.01043854, "balance_loss_clip": 1.02722442, "balance_loss_mlp": 1.04622531, "epoch": 0.2707951300165339, "flos": 20193037367040.0, "grad_norm": 2.054613132643784, "language_loss": 0.67850375, "learning_rate": 3.4219719199272918e-06, "loss": 0.7002871, "num_input_tokens_seen": 97436780, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8828125, "step": 4504, "time_per_iteration": 2.4632985591888428 }, { "auxiliary_loss_clip": 0.01134495, "auxiliary_loss_mlp": 0.01042868, "balance_loss_clip": 1.02723372, "balance_loss_mlp": 1.04881811, "epoch": 0.27085525326920185, "flos": 21433966479360.0, "grad_norm": 2.0689987185770646, "language_loss": 0.75738418, "learning_rate": 3.421698021097902e-06, "loss": 0.77915782, "num_input_tokens_seen": 97456190, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.859375, "step": 4505, "time_per_iteration": 2.498300075531006 }, { "auxiliary_loss_clip": 0.01138273, "auxiliary_loss_mlp": 0.01049905, "balance_loss_clip": 1.03186893, "balance_loss_mlp": 1.0463866, "epoch": 0.2709153765218698, "flos": 17675232606720.0, "grad_norm": 2.1497932242345414, "language_loss": 0.73312676, "learning_rate": 3.42142406835758e-06, "loss": 0.75500858, "num_input_tokens_seen": 97474545, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.91796875, "step": 4506, "time_per_iteration": 2.4734561443328857 }, { "auxiliary_loss_clip": 0.01137283, "auxiliary_loss_mlp": 0.01041198, "balance_loss_clip": 1.02250588, "balance_loss_mlp": 1.04710126, "epoch": 0.2709754997745378, "flos": 24456243801600.0, "grad_norm": 1.836943615367812, "language_loss": 0.80320871, "learning_rate": 3.421150061716715e-06, "loss": 0.82499355, "num_input_tokens_seen": 97494520, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.90234375, "step": 4507, "time_per_iteration": 2.520786762237549 }, { "auxiliary_loss_clip": 0.01048294, "auxiliary_loss_mlp": 0.01056238, "balance_loss_clip": 1.05399692, "balance_loss_mlp": 1.01836276, "epoch": 0.2710356230272058, "flos": 65210798206080.0, "grad_norm": 0.7694362762333541, "language_loss": 0.50923347, "learning_rate": 3.420876001185698e-06, "loss": 0.5302788, "num_input_tokens_seen": 97552455, "router_z_loss_clip": 0.02246094, "router_z_loss_mlp": 0.30078125, "step": 4508, "time_per_iteration": 3.014505386352539 }, { "auxiliary_loss_clip": 0.01131249, "auxiliary_loss_mlp": 0.01039567, "balance_loss_clip": 1.02408171, "balance_loss_mlp": 1.04657722, "epoch": 0.27109574627987376, "flos": 25484438615040.0, "grad_norm": 2.111522848287951, "language_loss": 0.74809122, "learning_rate": 3.4206018867749197e-06, "loss": 0.76979935, "num_input_tokens_seen": 97572650, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.84375, "step": 4509, "time_per_iteration": 3.9875450134277344 }, { "auxiliary_loss_clip": 0.01128733, "auxiliary_loss_mlp": 0.01036484, "balance_loss_clip": 1.02105808, "balance_loss_mlp": 1.04472888, "epoch": 0.2711558695325417, "flos": 19682782715520.0, "grad_norm": 1.7216038772281823, "language_loss": 0.71361291, "learning_rate": 3.4203277184947757e-06, "loss": 0.73526514, "num_input_tokens_seen": 97591150, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.83984375, "step": 4510, "time_per_iteration": 2.5063767433166504 }, { "auxiliary_loss_clip": 0.01133443, "auxiliary_loss_mlp": 0.01038429, "balance_loss_clip": 1.02222848, "balance_loss_mlp": 1.04705787, "epoch": 0.2712159927852097, "flos": 18587758648320.0, "grad_norm": 3.403490170924706, "language_loss": 0.6988374, "learning_rate": 3.4200534963556627e-06, "loss": 0.72055614, "num_input_tokens_seen": 97607410, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8671875, "step": 4511, "time_per_iteration": 2.44765305519104 }, { "auxiliary_loss_clip": 0.0113362, "auxiliary_loss_mlp": 0.01045603, "balance_loss_clip": 1.02882981, "balance_loss_mlp": 1.04502702, "epoch": 0.27127611603787766, "flos": 25630235919360.0, "grad_norm": 1.9080793685557247, "language_loss": 0.80677909, "learning_rate": 3.419779220367979e-06, "loss": 0.82857138, "num_input_tokens_seen": 97626870, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.88671875, "step": 4512, "time_per_iteration": 3.9211089611053467 }, { "auxiliary_loss_clip": 0.01131011, "auxiliary_loss_mlp": 0.0103935, "balance_loss_clip": 1.02440715, "balance_loss_mlp": 1.0463922, "epoch": 0.2713362392905456, "flos": 23148952312320.0, "grad_norm": 1.430485187586471, "language_loss": 0.80418915, "learning_rate": 3.419504890542124e-06, "loss": 0.82589281, "num_input_tokens_seen": 97646595, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.84765625, "step": 4513, "time_per_iteration": 3.975996255874634 }, { "auxiliary_loss_clip": 0.0113376, "auxiliary_loss_mlp": 0.01049444, "balance_loss_clip": 1.0335176, "balance_loss_mlp": 1.04513669, "epoch": 0.2713963625432136, "flos": 18366045949440.0, "grad_norm": 2.334520851297644, "language_loss": 0.88142931, "learning_rate": 3.4192305068885026e-06, "loss": 0.90326136, "num_input_tokens_seen": 97665485, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.88671875, "step": 4514, "time_per_iteration": 2.4503769874572754 }, { "auxiliary_loss_clip": 0.01133266, "auxiliary_loss_mlp": 0.01055101, "balance_loss_clip": 1.03885925, "balance_loss_mlp": 1.04706645, "epoch": 0.27145648579588155, "flos": 22491751121280.0, "grad_norm": 1.554522541509588, "language_loss": 0.91609871, "learning_rate": 3.418956069417517e-06, "loss": 0.93798238, "num_input_tokens_seen": 97683800, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.86328125, "step": 4515, "time_per_iteration": 2.463665723800659 }, { "auxiliary_loss_clip": 0.01141318, "auxiliary_loss_mlp": 0.01067802, "balance_loss_clip": 1.04812062, "balance_loss_mlp": 1.04930282, "epoch": 0.2715166090485495, "flos": 19239177749760.0, "grad_norm": 2.7451578477662646, "language_loss": 0.7333613, "learning_rate": 3.4186815781395756e-06, "loss": 0.75545245, "num_input_tokens_seen": 97700505, "router_z_loss_clip": 0.19726562, "router_z_loss_mlp": 0.921875, "step": 4516, "time_per_iteration": 2.467583417892456 }, { "auxiliary_loss_clip": 0.01133934, "auxiliary_loss_mlp": 0.01047806, "balance_loss_clip": 1.030604, "balance_loss_mlp": 1.04697859, "epoch": 0.2715767323012175, "flos": 17709598944000.0, "grad_norm": 2.2870885129489267, "language_loss": 0.76476532, "learning_rate": 3.4184070330650866e-06, "loss": 0.78658271, "num_input_tokens_seen": 97717410, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8671875, "step": 4517, "time_per_iteration": 2.4246950149536133 }, { "auxiliary_loss_clip": 0.01133425, "auxiliary_loss_mlp": 0.01052315, "balance_loss_clip": 1.03493452, "balance_loss_mlp": 1.04598355, "epoch": 0.27163685555388545, "flos": 22382834106240.0, "grad_norm": 2.372222550632514, "language_loss": 0.77671146, "learning_rate": 3.4181324342044607e-06, "loss": 0.79856884, "num_input_tokens_seen": 97734545, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.875, "step": 4518, "time_per_iteration": 2.481722354888916 }, { "auxiliary_loss_clip": 0.01134034, "auxiliary_loss_mlp": 0.01050823, "balance_loss_clip": 1.03518283, "balance_loss_mlp": 1.04599023, "epoch": 0.2716969788065534, "flos": 22346708002560.0, "grad_norm": 1.6212334987709804, "language_loss": 0.68420774, "learning_rate": 3.41785778156811e-06, "loss": 0.70605624, "num_input_tokens_seen": 97754000, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8828125, "step": 4519, "time_per_iteration": 2.455794095993042 }, { "auxiliary_loss_clip": 0.01131368, "auxiliary_loss_mlp": 0.01047483, "balance_loss_clip": 1.03209317, "balance_loss_mlp": 1.04451013, "epoch": 0.2717571020592214, "flos": 25228467319680.0, "grad_norm": 1.9584277545815256, "language_loss": 0.75739163, "learning_rate": 3.417583075166451e-06, "loss": 0.77918011, "num_input_tokens_seen": 97772080, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8671875, "step": 4520, "time_per_iteration": 2.529151439666748 }, { "auxiliary_loss_clip": 0.01136813, "auxiliary_loss_mlp": 0.0106243, "balance_loss_clip": 1.04423881, "balance_loss_mlp": 1.04765594, "epoch": 0.2718172253118894, "flos": 20189769229440.0, "grad_norm": 2.1146451925651273, "language_loss": 0.76435244, "learning_rate": 3.4173083150099e-06, "loss": 0.78634489, "num_input_tokens_seen": 97789370, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.890625, "step": 4521, "time_per_iteration": 2.437901735305786 }, { "auxiliary_loss_clip": 0.01137862, "auxiliary_loss_mlp": 0.01058956, "balance_loss_clip": 1.04125333, "balance_loss_mlp": 1.04735672, "epoch": 0.27187734856455736, "flos": 14319129260160.0, "grad_norm": 3.9913512880427455, "language_loss": 0.75335217, "learning_rate": 3.417033501108875e-06, "loss": 0.77532035, "num_input_tokens_seen": 97807385, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.90625, "step": 4522, "time_per_iteration": 2.4655120372772217 }, { "auxiliary_loss_clip": 0.01139252, "auxiliary_loss_mlp": 0.01048811, "balance_loss_clip": 1.03197908, "balance_loss_mlp": 1.04965436, "epoch": 0.27193747181722533, "flos": 21107682311040.0, "grad_norm": 2.0659216191700804, "language_loss": 0.7276448, "learning_rate": 3.416758633473798e-06, "loss": 0.74952543, "num_input_tokens_seen": 97827930, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.89453125, "step": 4523, "time_per_iteration": 2.4735543727874756 }, { "auxiliary_loss_clip": 0.01129957, "auxiliary_loss_mlp": 0.01045761, "balance_loss_clip": 1.02874994, "balance_loss_mlp": 1.04455543, "epoch": 0.2719975950698933, "flos": 19682782715520.0, "grad_norm": 1.4446352297525407, "language_loss": 0.74178195, "learning_rate": 3.4164837121150915e-06, "loss": 0.76353908, "num_input_tokens_seen": 97847440, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.85546875, "step": 4524, "time_per_iteration": 2.519047260284424 }, { "auxiliary_loss_clip": 0.01136139, "auxiliary_loss_mlp": 0.01050136, "balance_loss_clip": 1.03314865, "balance_loss_mlp": 1.0476408, "epoch": 0.27205771832256126, "flos": 24754482426240.0, "grad_norm": 1.7937474124848043, "language_loss": 0.76185799, "learning_rate": 3.4162087370431803e-06, "loss": 0.78372073, "num_input_tokens_seen": 97867620, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.88671875, "step": 4525, "time_per_iteration": 2.564070224761963 }, { "auxiliary_loss_clip": 0.01132906, "auxiliary_loss_mlp": 0.01057974, "balance_loss_clip": 1.04087985, "balance_loss_mlp": 1.04694307, "epoch": 0.2721178415752292, "flos": 21755581879680.0, "grad_norm": 1.8174603307248056, "language_loss": 0.8178674, "learning_rate": 3.4159337082684926e-06, "loss": 0.83977622, "num_input_tokens_seen": 97884345, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.859375, "step": 4526, "time_per_iteration": 2.505852222442627 }, { "auxiliary_loss_clip": 0.01139133, "auxiliary_loss_mlp": 0.01048912, "balance_loss_clip": 1.03064942, "balance_loss_mlp": 1.04656744, "epoch": 0.2721779648278972, "flos": 12676826597760.0, "grad_norm": 2.028992057716322, "language_loss": 0.76744032, "learning_rate": 3.4156586258014566e-06, "loss": 0.78932077, "num_input_tokens_seen": 97901500, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.92578125, "step": 4527, "time_per_iteration": 2.4834792613983154 }, { "auxiliary_loss_clip": 0.0113377, "auxiliary_loss_mlp": 0.01048756, "balance_loss_clip": 1.03108954, "balance_loss_mlp": 1.04606915, "epoch": 0.27223808808056515, "flos": 16253206099200.0, "grad_norm": 2.137968595103775, "language_loss": 0.82192433, "learning_rate": 3.415383489652503e-06, "loss": 0.84374952, "num_input_tokens_seen": 97917800, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.875, "step": 4528, "time_per_iteration": 2.504047155380249 }, { "auxiliary_loss_clip": 0.01133242, "auxiliary_loss_mlp": 0.01047456, "balance_loss_clip": 1.03101695, "balance_loss_mlp": 1.04689765, "epoch": 0.2722982113332331, "flos": 27745805203200.0, "grad_norm": 1.7959854124871708, "language_loss": 0.76980835, "learning_rate": 3.4151082998320666e-06, "loss": 0.79161537, "num_input_tokens_seen": 97937225, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.859375, "step": 4529, "time_per_iteration": 2.582016944885254 }, { "auxiliary_loss_clip": 0.0113605, "auxiliary_loss_mlp": 0.01042834, "balance_loss_clip": 1.02671683, "balance_loss_mlp": 1.04599571, "epoch": 0.2723583345859011, "flos": 21726243446400.0, "grad_norm": 1.9958450420762517, "language_loss": 0.81962132, "learning_rate": 3.4148330563505805e-06, "loss": 0.84141016, "num_input_tokens_seen": 97956845, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8984375, "step": 4530, "time_per_iteration": 2.5035455226898193 }, { "auxiliary_loss_clip": 0.01134139, "auxiliary_loss_mlp": 0.01039352, "balance_loss_clip": 1.02248383, "balance_loss_mlp": 1.04578352, "epoch": 0.27241845783856905, "flos": 17347260499200.0, "grad_norm": 2.3399823153507984, "language_loss": 0.91392994, "learning_rate": 3.4145577592184838e-06, "loss": 0.93566489, "num_input_tokens_seen": 97972465, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8828125, "step": 4531, "time_per_iteration": 2.527639865875244 }, { "auxiliary_loss_clip": 0.01136972, "auxiliary_loss_mlp": 0.01045172, "balance_loss_clip": 1.02794635, "balance_loss_mlp": 1.04604626, "epoch": 0.272478581091237, "flos": 24754302858240.0, "grad_norm": 2.010270778560973, "language_loss": 0.76374209, "learning_rate": 3.4142824084462155e-06, "loss": 0.78556353, "num_input_tokens_seen": 97990770, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.91015625, "step": 4532, "time_per_iteration": 2.566694736480713 }, { "auxiliary_loss_clip": 0.01130554, "auxiliary_loss_mlp": 0.01036226, "balance_loss_clip": 1.0199064, "balance_loss_mlp": 1.04488254, "epoch": 0.272538704343905, "flos": 17890624512000.0, "grad_norm": 2.4813344743454135, "language_loss": 0.88942873, "learning_rate": 3.4140070040442162e-06, "loss": 0.91109645, "num_input_tokens_seen": 98005775, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.85546875, "step": 4533, "time_per_iteration": 2.502898693084717 }, { "auxiliary_loss_clip": 0.01131778, "auxiliary_loss_mlp": 0.01038994, "balance_loss_clip": 1.02257931, "balance_loss_mlp": 1.04630351, "epoch": 0.272598827596573, "flos": 22932016122240.0, "grad_norm": 2.0965447354022793, "language_loss": 0.71254146, "learning_rate": 3.413731546022929e-06, "loss": 0.73424923, "num_input_tokens_seen": 98025750, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.85546875, "step": 4534, "time_per_iteration": 2.5189716815948486 }, { "auxiliary_loss_clip": 0.01134518, "auxiliary_loss_mlp": 0.01043168, "balance_loss_clip": 1.02514362, "balance_loss_mlp": 1.04502296, "epoch": 0.27265895084924097, "flos": 24238409771520.0, "grad_norm": 1.8372180279749224, "language_loss": 0.9122079, "learning_rate": 3.4134560343928005e-06, "loss": 0.93398476, "num_input_tokens_seen": 98044955, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.8984375, "step": 4535, "time_per_iteration": 2.5325028896331787 }, { "auxiliary_loss_clip": 0.01139675, "auxiliary_loss_mlp": 0.01040974, "balance_loss_clip": 1.02341449, "balance_loss_mlp": 1.05037284, "epoch": 0.27271907410190893, "flos": 27013155494400.0, "grad_norm": 1.7193548804221872, "language_loss": 0.73081309, "learning_rate": 3.4131804691642778e-06, "loss": 0.75261962, "num_input_tokens_seen": 98065860, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.89453125, "step": 4536, "time_per_iteration": 2.549884080886841 }, { "auxiliary_loss_clip": 0.01135189, "auxiliary_loss_mlp": 0.01041147, "balance_loss_clip": 1.02295589, "balance_loss_mlp": 1.04706407, "epoch": 0.2727791973545769, "flos": 34452588942720.0, "grad_norm": 2.277847281739677, "language_loss": 0.71621346, "learning_rate": 3.41290485034781e-06, "loss": 0.73797685, "num_input_tokens_seen": 98085450, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.87890625, "step": 4537, "time_per_iteration": 2.6100454330444336 }, { "auxiliary_loss_clip": 0.01134142, "auxiliary_loss_mlp": 0.01039952, "balance_loss_clip": 1.02258348, "balance_loss_mlp": 1.04514956, "epoch": 0.27283932060724486, "flos": 15041723160960.0, "grad_norm": 2.2771498325026354, "language_loss": 0.78167504, "learning_rate": 3.4126291779538485e-06, "loss": 0.80341601, "num_input_tokens_seen": 98099115, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.890625, "step": 4538, "time_per_iteration": 2.428267240524292 }, { "auxiliary_loss_clip": 0.01134011, "auxiliary_loss_mlp": 0.01046232, "balance_loss_clip": 1.02950668, "balance_loss_mlp": 1.04652429, "epoch": 0.2728994438599128, "flos": 21652411040640.0, "grad_norm": 1.425192569244219, "language_loss": 0.90423155, "learning_rate": 3.412353451992847e-06, "loss": 0.92603391, "num_input_tokens_seen": 98118415, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.875, "step": 4539, "time_per_iteration": 2.5085067749023438 }, { "auxiliary_loss_clip": 0.01134218, "auxiliary_loss_mlp": 0.01044258, "balance_loss_clip": 1.02647233, "balance_loss_mlp": 1.04686761, "epoch": 0.2729595671125808, "flos": 17488424949120.0, "grad_norm": 1.844457530065952, "language_loss": 0.87922788, "learning_rate": 3.4120776724752607e-06, "loss": 0.90101266, "num_input_tokens_seen": 98136300, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.875, "step": 4540, "time_per_iteration": 2.465928077697754 }, { "auxiliary_loss_clip": 0.01135935, "auxiliary_loss_mlp": 0.01036787, "balance_loss_clip": 1.01976442, "balance_loss_mlp": 1.04666734, "epoch": 0.27301969036524876, "flos": 19318145800320.0, "grad_norm": 2.3825066153611116, "language_loss": 0.82133216, "learning_rate": 3.4118018394115476e-06, "loss": 0.84305942, "num_input_tokens_seen": 98154580, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.89453125, "step": 4541, "time_per_iteration": 2.513720750808716 }, { "auxiliary_loss_clip": 0.0113461, "auxiliary_loss_mlp": 0.01041682, "balance_loss_clip": 1.02539766, "balance_loss_mlp": 1.04741848, "epoch": 0.2730798136179167, "flos": 21065666376960.0, "grad_norm": 1.8803055934048047, "language_loss": 0.79488873, "learning_rate": 3.4115259528121678e-06, "loss": 0.81665164, "num_input_tokens_seen": 98173115, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.87109375, "step": 4542, "time_per_iteration": 2.483837127685547 }, { "auxiliary_loss_clip": 0.01138569, "auxiliary_loss_mlp": 0.01035083, "balance_loss_clip": 1.01819146, "balance_loss_mlp": 1.05092573, "epoch": 0.2731399368705847, "flos": 19171737964800.0, "grad_norm": 1.8949736974276161, "language_loss": 0.89418852, "learning_rate": 3.411250012687582e-06, "loss": 0.91592509, "num_input_tokens_seen": 98190260, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.875, "step": 4543, "time_per_iteration": 2.4988725185394287 }, { "auxiliary_loss_clip": 0.01138362, "auxiliary_loss_mlp": 0.01041098, "balance_loss_clip": 1.02368736, "balance_loss_mlp": 1.04709506, "epoch": 0.27320006012325265, "flos": 18290130554880.0, "grad_norm": 2.034149102206107, "language_loss": 0.63689369, "learning_rate": 3.410974019048255e-06, "loss": 0.65868831, "num_input_tokens_seen": 98207115, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.91015625, "step": 4544, "time_per_iteration": 2.5010993480682373 }, { "auxiliary_loss_clip": 0.01135425, "auxiliary_loss_mlp": 0.01043668, "balance_loss_clip": 1.02603102, "balance_loss_mlp": 1.04805183, "epoch": 0.2732601833759206, "flos": 34860929731200.0, "grad_norm": 1.5807263799839293, "language_loss": 0.70152879, "learning_rate": 3.410697971904651e-06, "loss": 0.72331977, "num_input_tokens_seen": 98230610, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.875, "step": 4545, "time_per_iteration": 2.6295549869537354 }, { "auxiliary_loss_clip": 0.01053268, "auxiliary_loss_mlp": 0.01030257, "balance_loss_clip": 1.02825379, "balance_loss_mlp": 1.02363515, "epoch": 0.2733203066285886, "flos": 53910824762880.0, "grad_norm": 0.7293292716508629, "language_loss": 0.61675727, "learning_rate": 3.4104218712672383e-06, "loss": 0.63759255, "num_input_tokens_seen": 98293585, "router_z_loss_clip": 0.02001953, "router_z_loss_mlp": 0.296875, "step": 4546, "time_per_iteration": 3.1195716857910156 }, { "auxiliary_loss_clip": 0.01140405, "auxiliary_loss_mlp": 0.01049276, "balance_loss_clip": 1.03212225, "balance_loss_mlp": 1.05238104, "epoch": 0.2733804298812566, "flos": 20660378244480.0, "grad_norm": 3.905394443296496, "language_loss": 0.64940608, "learning_rate": 3.410145717146488e-06, "loss": 0.67130291, "num_input_tokens_seen": 98311680, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8828125, "step": 4547, "time_per_iteration": 2.475234031677246 }, { "auxiliary_loss_clip": 0.01134661, "auxiliary_loss_mlp": 0.01040846, "balance_loss_clip": 1.02483606, "balance_loss_mlp": 1.04916179, "epoch": 0.27344055313392457, "flos": 25884339707520.0, "grad_norm": 2.198511963583579, "language_loss": 0.77509344, "learning_rate": 3.4098695095528694e-06, "loss": 0.79684854, "num_input_tokens_seen": 98330770, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.85546875, "step": 4548, "time_per_iteration": 2.493037462234497 }, { "auxiliary_loss_clip": 0.01133561, "auxiliary_loss_mlp": 0.01042392, "balance_loss_clip": 1.02710915, "balance_loss_mlp": 1.04755318, "epoch": 0.27350067638659253, "flos": 22929753565440.0, "grad_norm": 1.912576373255411, "language_loss": 0.82793516, "learning_rate": 3.4095932484968585e-06, "loss": 0.84969467, "num_input_tokens_seen": 98349860, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.859375, "step": 4549, "time_per_iteration": 2.4980874061584473 }, { "auxiliary_loss_clip": 0.0113607, "auxiliary_loss_mlp": 0.01045693, "balance_loss_clip": 1.02658331, "balance_loss_mlp": 1.04653347, "epoch": 0.2735607996392605, "flos": 16574821499520.0, "grad_norm": 2.000172001813114, "language_loss": 0.71037185, "learning_rate": 3.4093169339889305e-06, "loss": 0.73218948, "num_input_tokens_seen": 98367040, "router_z_loss_clip": 0.19140625, "router_z_loss_mlp": 0.89453125, "step": 4550, "time_per_iteration": 2.4170401096343994 }, { "auxiliary_loss_clip": 0.01136571, "auxiliary_loss_mlp": 0.01037124, "balance_loss_clip": 1.02113783, "balance_loss_mlp": 1.05068707, "epoch": 0.27362092289192846, "flos": 19645291895040.0, "grad_norm": 2.179207603740516, "language_loss": 0.78682387, "learning_rate": 3.409040566039563e-06, "loss": 0.80856085, "num_input_tokens_seen": 98384010, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.859375, "step": 4551, "time_per_iteration": 3.9150991439819336 }, { "auxiliary_loss_clip": 0.01136329, "auxiliary_loss_mlp": 0.01039873, "balance_loss_clip": 1.02264726, "balance_loss_mlp": 1.04843521, "epoch": 0.27368104614459643, "flos": 17639142416640.0, "grad_norm": 3.2988581350396196, "language_loss": 0.70636439, "learning_rate": 3.4087641446592362e-06, "loss": 0.72812641, "num_input_tokens_seen": 98399625, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.87890625, "step": 4552, "time_per_iteration": 2.4587855339050293 }, { "auxiliary_loss_clip": 0.01138829, "auxiliary_loss_mlp": 0.01037539, "balance_loss_clip": 1.02065921, "balance_loss_mlp": 1.05076075, "epoch": 0.2737411693972644, "flos": 21580015178880.0, "grad_norm": 2.2001870194970405, "language_loss": 0.71733767, "learning_rate": 3.408487669858431e-06, "loss": 0.73910129, "num_input_tokens_seen": 98417310, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8828125, "step": 4553, "time_per_iteration": 3.913203239440918 }, { "auxiliary_loss_clip": 0.01135816, "auxiliary_loss_mlp": 0.01036815, "balance_loss_clip": 1.01958954, "balance_loss_mlp": 1.04922092, "epoch": 0.27380129264993236, "flos": 25484043565440.0, "grad_norm": 1.6222088934420227, "language_loss": 0.5904721, "learning_rate": 3.4082111416476337e-06, "loss": 0.61219847, "num_input_tokens_seen": 98438670, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8671875, "step": 4554, "time_per_iteration": 2.5157361030578613 }, { "auxiliary_loss_clip": 0.01141064, "auxiliary_loss_mlp": 0.01036901, "balance_loss_clip": 1.01937759, "balance_loss_mlp": 1.0498749, "epoch": 0.2738614159026003, "flos": 18661196004480.0, "grad_norm": 1.8048778681902888, "language_loss": 0.73818231, "learning_rate": 3.4079345600373275e-06, "loss": 0.75996196, "num_input_tokens_seen": 98456060, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9140625, "step": 4555, "time_per_iteration": 3.91890549659729 }, { "auxiliary_loss_clip": 0.01139159, "auxiliary_loss_mlp": 0.01038201, "balance_loss_clip": 1.02134478, "balance_loss_mlp": 1.05026579, "epoch": 0.2739215391552683, "flos": 23477139901440.0, "grad_norm": 2.0295278033400703, "language_loss": 0.77817547, "learning_rate": 3.407657925038002e-06, "loss": 0.79994905, "num_input_tokens_seen": 98473765, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.890625, "step": 4556, "time_per_iteration": 2.48006010055542 }, { "auxiliary_loss_clip": 0.01148668, "auxiliary_loss_mlp": 0.01048852, "balance_loss_clip": 1.03080368, "balance_loss_mlp": 1.05174685, "epoch": 0.27398166240793626, "flos": 17128636369920.0, "grad_norm": 2.3092389758684733, "language_loss": 0.82308888, "learning_rate": 3.4073812366601473e-06, "loss": 0.84506404, "num_input_tokens_seen": 98490590, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.96875, "step": 4557, "time_per_iteration": 2.437056064605713 }, { "auxiliary_loss_clip": 0.01137128, "auxiliary_loss_mlp": 0.01039233, "balance_loss_clip": 1.02279413, "balance_loss_mlp": 1.04937029, "epoch": 0.2740417856606042, "flos": 23404744039680.0, "grad_norm": 2.1755077170486974, "language_loss": 0.73102397, "learning_rate": 3.4071044949142547e-06, "loss": 0.75278759, "num_input_tokens_seen": 98510590, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.875, "step": 4558, "time_per_iteration": 2.466090679168701 }, { "auxiliary_loss_clip": 0.01138486, "auxiliary_loss_mlp": 0.01048936, "balance_loss_clip": 1.03229451, "balance_loss_mlp": 1.04960597, "epoch": 0.2741019089132722, "flos": 12780428400000.0, "grad_norm": 2.070459637288418, "language_loss": 0.68244147, "learning_rate": 3.406827699810819e-06, "loss": 0.70431572, "num_input_tokens_seen": 98527875, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.890625, "step": 4559, "time_per_iteration": 2.446929693222046 }, { "auxiliary_loss_clip": 0.01134343, "auxiliary_loss_mlp": 0.01048939, "balance_loss_clip": 1.03227329, "balance_loss_mlp": 1.04784644, "epoch": 0.27416203216594015, "flos": 20631542601600.0, "grad_norm": 2.3035048338008672, "language_loss": 0.72085547, "learning_rate": 3.4065508513603353e-06, "loss": 0.7426883, "num_input_tokens_seen": 98547575, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8671875, "step": 4560, "time_per_iteration": 2.447115421295166 }, { "auxiliary_loss_clip": 0.01136009, "auxiliary_loss_mlp": 0.01042221, "balance_loss_clip": 1.02529347, "balance_loss_mlp": 1.04733944, "epoch": 0.27422215541860817, "flos": 26541576812160.0, "grad_norm": 1.8328021548746425, "language_loss": 0.81741536, "learning_rate": 3.406273949573303e-06, "loss": 0.83919764, "num_input_tokens_seen": 98566290, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.88671875, "step": 4561, "time_per_iteration": 2.513230085372925 }, { "auxiliary_loss_clip": 0.01139087, "auxiliary_loss_mlp": 0.01040951, "balance_loss_clip": 1.02479851, "balance_loss_mlp": 1.04942358, "epoch": 0.27428227867127614, "flos": 23331163029120.0, "grad_norm": 1.6631405409144717, "language_loss": 0.75517094, "learning_rate": 3.4059969944602214e-06, "loss": 0.77697128, "num_input_tokens_seen": 98586255, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.89453125, "step": 4562, "time_per_iteration": 2.497772216796875 }, { "auxiliary_loss_clip": 0.01137349, "auxiliary_loss_mlp": 0.01035431, "balance_loss_clip": 1.0202558, "balance_loss_mlp": 1.05024946, "epoch": 0.2743424019239441, "flos": 23035115134080.0, "grad_norm": 1.478002416572373, "language_loss": 0.7414192, "learning_rate": 3.4057199860315928e-06, "loss": 0.763147, "num_input_tokens_seen": 98606030, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.87109375, "step": 4563, "time_per_iteration": 2.502514600753784 }, { "auxiliary_loss_clip": 0.0113979, "auxiliary_loss_mlp": 0.01046838, "balance_loss_clip": 1.02808678, "balance_loss_mlp": 1.04804015, "epoch": 0.27440252517661207, "flos": 21981101420160.0, "grad_norm": 2.23302614141156, "language_loss": 0.62527025, "learning_rate": 3.4054429242979213e-06, "loss": 0.64713651, "num_input_tokens_seen": 98625225, "router_z_loss_clip": 0.1875, "router_z_loss_mlp": 0.91796875, "step": 4564, "time_per_iteration": 2.5031139850616455 }, { "auxiliary_loss_clip": 0.01137666, "auxiliary_loss_mlp": 0.01037282, "balance_loss_clip": 1.02018762, "balance_loss_mlp": 1.0494144, "epoch": 0.27446264842928003, "flos": 40187451502080.0, "grad_norm": 2.836232348313694, "language_loss": 0.78230941, "learning_rate": 3.4051658092697135e-06, "loss": 0.80405891, "num_input_tokens_seen": 98649470, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.8828125, "step": 4565, "time_per_iteration": 2.6563615798950195 }, { "auxiliary_loss_clip": 0.0113559, "auxiliary_loss_mlp": 0.01044374, "balance_loss_clip": 1.02851903, "balance_loss_mlp": 1.04876149, "epoch": 0.274522771681948, "flos": 13479681438720.0, "grad_norm": 1.9236852047404116, "language_loss": 0.68851674, "learning_rate": 3.404888640957477e-06, "loss": 0.71031642, "num_input_tokens_seen": 98666915, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8671875, "step": 4566, "time_per_iteration": 2.441493511199951 }, { "auxiliary_loss_clip": 0.01134174, "auxiliary_loss_mlp": 0.01043571, "balance_loss_clip": 1.0286634, "balance_loss_mlp": 1.04991722, "epoch": 0.27458289493461596, "flos": 28622133313920.0, "grad_norm": 1.6503796768321621, "language_loss": 0.61022818, "learning_rate": 3.404611419371723e-06, "loss": 0.63200557, "num_input_tokens_seen": 98688240, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.84375, "step": 4567, "time_per_iteration": 2.569157361984253 }, { "auxiliary_loss_clip": 0.01136726, "auxiliary_loss_mlp": 0.01038326, "balance_loss_clip": 1.02156472, "balance_loss_mlp": 1.04932678, "epoch": 0.2746430181872839, "flos": 20119815492480.0, "grad_norm": 1.6885363360434804, "language_loss": 0.82458138, "learning_rate": 3.4043341445229627e-06, "loss": 0.84633189, "num_input_tokens_seen": 98708245, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.875, "step": 4568, "time_per_iteration": 2.4927494525909424 }, { "auxiliary_loss_clip": 0.01139452, "auxiliary_loss_mlp": 0.010349, "balance_loss_clip": 1.01884246, "balance_loss_mlp": 1.05130601, "epoch": 0.2747031414399519, "flos": 20193468330240.0, "grad_norm": 2.1991352942450755, "language_loss": 0.68204969, "learning_rate": 3.4040568164217117e-06, "loss": 0.70379317, "num_input_tokens_seen": 98724575, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8828125, "step": 4569, "time_per_iteration": 2.492509365081787 }, { "auxiliary_loss_clip": 0.0113717, "auxiliary_loss_mlp": 0.01038297, "balance_loss_clip": 1.02115417, "balance_loss_mlp": 1.04870653, "epoch": 0.27476326469261986, "flos": 13516346246400.0, "grad_norm": 9.007408380908185, "language_loss": 0.70312631, "learning_rate": 3.4037794350784848e-06, "loss": 0.72488105, "num_input_tokens_seen": 98740700, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.8828125, "step": 4570, "time_per_iteration": 2.4613230228424072 }, { "auxiliary_loss_clip": 0.01049002, "auxiliary_loss_mlp": 0.01007548, "balance_loss_clip": 1.00569975, "balance_loss_mlp": 1.01979113, "epoch": 0.2748233879452878, "flos": 65937127121280.0, "grad_norm": 0.942563165754217, "language_loss": 0.55777043, "learning_rate": 3.4035020005038014e-06, "loss": 0.57833588, "num_input_tokens_seen": 98803030, "router_z_loss_clip": 0.01843262, "router_z_loss_mlp": 0.29101562, "step": 4571, "time_per_iteration": 3.203378438949585 }, { "auxiliary_loss_clip": 0.01141869, "auxiliary_loss_mlp": 0.01043163, "balance_loss_clip": 1.02751076, "balance_loss_mlp": 1.05145311, "epoch": 0.2748835111979558, "flos": 17384212615680.0, "grad_norm": 2.068183950512689, "language_loss": 0.7749722, "learning_rate": 3.4032245127081812e-06, "loss": 0.79682249, "num_input_tokens_seen": 98820505, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.90625, "step": 4572, "time_per_iteration": 2.4626593589782715 }, { "auxiliary_loss_clip": 0.01132598, "auxiliary_loss_mlp": 0.01036873, "balance_loss_clip": 1.02263308, "balance_loss_mlp": 1.05023885, "epoch": 0.27494363445062375, "flos": 23587565287680.0, "grad_norm": 1.501407483636727, "language_loss": 0.81304598, "learning_rate": 3.402946971702147e-06, "loss": 0.8347407, "num_input_tokens_seen": 98842150, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.82421875, "step": 4573, "time_per_iteration": 2.5049312114715576 }, { "auxiliary_loss_clip": 0.01132046, "auxiliary_loss_mlp": 0.01036138, "balance_loss_clip": 1.02087915, "balance_loss_mlp": 1.04834378, "epoch": 0.2750037577032918, "flos": 17164582905600.0, "grad_norm": 1.7141275791767274, "language_loss": 0.79177904, "learning_rate": 3.402669377496223e-06, "loss": 0.81346095, "num_input_tokens_seen": 98861050, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8359375, "step": 4574, "time_per_iteration": 2.4765501022338867 }, { "auxiliary_loss_clip": 0.01136942, "auxiliary_loss_mlp": 0.01045437, "balance_loss_clip": 1.03008282, "balance_loss_mlp": 1.05014479, "epoch": 0.27506388095595974, "flos": 24491903028480.0, "grad_norm": 1.9186590239359997, "language_loss": 0.74363697, "learning_rate": 3.402391730100936e-06, "loss": 0.76546073, "num_input_tokens_seen": 98879695, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8671875, "step": 4575, "time_per_iteration": 2.4925880432128906 }, { "auxiliary_loss_clip": 0.01134581, "auxiliary_loss_mlp": 0.01037777, "balance_loss_clip": 1.0229001, "balance_loss_mlp": 1.04958606, "epoch": 0.2751240042086277, "flos": 38764706722560.0, "grad_norm": 2.732052768823713, "language_loss": 0.71694636, "learning_rate": 3.402114029526814e-06, "loss": 0.73866999, "num_input_tokens_seen": 98902035, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8515625, "step": 4576, "time_per_iteration": 2.6238465309143066 }, { "auxiliary_loss_clip": 0.01134804, "auxiliary_loss_mlp": 0.01033686, "balance_loss_clip": 1.01745009, "balance_loss_mlp": 1.04876637, "epoch": 0.27518412746129567, "flos": 26907039740160.0, "grad_norm": 1.5583287478991623, "language_loss": 0.73238248, "learning_rate": 3.4018362757843866e-06, "loss": 0.75406742, "num_input_tokens_seen": 98921835, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.86328125, "step": 4577, "time_per_iteration": 2.5132336616516113 }, { "auxiliary_loss_clip": 0.01137106, "auxiliary_loss_mlp": 0.01034216, "balance_loss_clip": 1.01780105, "balance_loss_mlp": 1.05023623, "epoch": 0.27524425071396363, "flos": 24900531125760.0, "grad_norm": 2.2249873432139275, "language_loss": 0.76289082, "learning_rate": 3.401558468884188e-06, "loss": 0.78460395, "num_input_tokens_seen": 98939610, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.87109375, "step": 4578, "time_per_iteration": 2.505601644515991 }, { "auxiliary_loss_clip": 0.01135907, "auxiliary_loss_mlp": 0.01048893, "balance_loss_clip": 1.02998614, "balance_loss_mlp": 1.04796696, "epoch": 0.2753043739666316, "flos": 26288047641600.0, "grad_norm": 2.1972983703964446, "language_loss": 0.66364121, "learning_rate": 3.4012806088367516e-06, "loss": 0.68548918, "num_input_tokens_seen": 98962250, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.87890625, "step": 4579, "time_per_iteration": 2.550537586212158 }, { "auxiliary_loss_clip": 0.01135821, "auxiliary_loss_mlp": 0.01046981, "balance_loss_clip": 1.02928996, "balance_loss_mlp": 1.046386, "epoch": 0.27536449721929956, "flos": 24206772867840.0, "grad_norm": 2.5320200695833113, "language_loss": 0.80215883, "learning_rate": 3.4010026956526137e-06, "loss": 0.82398689, "num_input_tokens_seen": 98981845, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.89453125, "step": 4580, "time_per_iteration": 2.496250629425049 }, { "auxiliary_loss_clip": 0.01134044, "auxiliary_loss_mlp": 0.01044916, "balance_loss_clip": 1.02727282, "balance_loss_mlp": 1.04756594, "epoch": 0.27542462047196753, "flos": 19537272720000.0, "grad_norm": 1.5139570505187923, "language_loss": 0.6707052, "learning_rate": 3.4007247293423137e-06, "loss": 0.69249481, "num_input_tokens_seen": 99001855, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.8671875, "step": 4581, "time_per_iteration": 2.4664371013641357 }, { "auxiliary_loss_clip": 0.01136224, "auxiliary_loss_mlp": 0.01043964, "balance_loss_clip": 1.0289433, "balance_loss_mlp": 1.04740608, "epoch": 0.2754847437246355, "flos": 14319165173760.0, "grad_norm": 1.6728912319234226, "language_loss": 0.78178883, "learning_rate": 3.400446709916392e-06, "loss": 0.80359066, "num_input_tokens_seen": 99019880, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.890625, "step": 4582, "time_per_iteration": 2.4569613933563232 }, { "auxiliary_loss_clip": 0.01132824, "auxiliary_loss_mlp": 0.01036115, "balance_loss_clip": 1.02153611, "balance_loss_mlp": 1.04833663, "epoch": 0.27554486697730346, "flos": 18838773866880.0, "grad_norm": 1.6159689151314314, "language_loss": 0.84426177, "learning_rate": 3.4001686373853895e-06, "loss": 0.86595118, "num_input_tokens_seen": 99037570, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.84375, "step": 4583, "time_per_iteration": 2.461517333984375 }, { "auxiliary_loss_clip": 0.0113665, "auxiliary_loss_mlp": 0.01041025, "balance_loss_clip": 1.02548647, "balance_loss_mlp": 1.04785728, "epoch": 0.2756049902299714, "flos": 22382295402240.0, "grad_norm": 1.7807271308862918, "language_loss": 0.66878927, "learning_rate": 3.3998905117598528e-06, "loss": 0.69056606, "num_input_tokens_seen": 99056875, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.890625, "step": 4584, "time_per_iteration": 2.490936756134033 }, { "auxiliary_loss_clip": 0.01130476, "auxiliary_loss_mlp": 0.01045287, "balance_loss_clip": 1.02997458, "balance_loss_mlp": 1.04617989, "epoch": 0.2756651134826394, "flos": 19573901614080.0, "grad_norm": 1.790713801037036, "language_loss": 0.77208102, "learning_rate": 3.399612333050327e-06, "loss": 0.79383862, "num_input_tokens_seen": 99074685, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.84375, "step": 4585, "time_per_iteration": 2.4676833152770996 }, { "auxiliary_loss_clip": 0.01137817, "auxiliary_loss_mlp": 0.01041711, "balance_loss_clip": 1.0253675, "balance_loss_mlp": 1.04920793, "epoch": 0.27572523673530736, "flos": 23586559706880.0, "grad_norm": 1.7011141515270383, "language_loss": 0.71570462, "learning_rate": 3.399334101267362e-06, "loss": 0.73749989, "num_input_tokens_seen": 99095300, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8828125, "step": 4586, "time_per_iteration": 2.5040640830993652 }, { "auxiliary_loss_clip": 0.0113486, "auxiliary_loss_mlp": 0.01039382, "balance_loss_clip": 1.02432632, "balance_loss_mlp": 1.04937541, "epoch": 0.2757853599879754, "flos": 22820118278400.0, "grad_norm": 1.432002778350927, "language_loss": 0.80630773, "learning_rate": 3.3990558164215073e-06, "loss": 0.82805014, "num_input_tokens_seen": 99115965, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8515625, "step": 4587, "time_per_iteration": 2.5126943588256836 }, { "auxiliary_loss_clip": 0.01130888, "auxiliary_loss_mlp": 0.01044379, "balance_loss_clip": 1.02853024, "balance_loss_mlp": 1.04575706, "epoch": 0.27584548324064334, "flos": 18551704371840.0, "grad_norm": 1.8149224819445304, "language_loss": 0.82794118, "learning_rate": 3.398777478523316e-06, "loss": 0.84969383, "num_input_tokens_seen": 99134265, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8515625, "step": 4588, "time_per_iteration": 2.461327314376831 }, { "auxiliary_loss_clip": 0.01130994, "auxiliary_loss_mlp": 0.01038042, "balance_loss_clip": 1.02262807, "balance_loss_mlp": 1.04643202, "epoch": 0.2759056064933113, "flos": 23769883745280.0, "grad_norm": 1.4085446921144915, "language_loss": 0.75447285, "learning_rate": 3.398499087583342e-06, "loss": 0.77616322, "num_input_tokens_seen": 99156185, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84765625, "step": 4589, "time_per_iteration": 2.553173780441284 }, { "auxiliary_loss_clip": 0.01127977, "auxiliary_loss_mlp": 0.01047273, "balance_loss_clip": 1.03208554, "balance_loss_mlp": 1.04425371, "epoch": 0.27596572974597927, "flos": 24281898163200.0, "grad_norm": 2.0634787727533084, "language_loss": 0.88573277, "learning_rate": 3.398220643612143e-06, "loss": 0.90748525, "num_input_tokens_seen": 99176735, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8359375, "step": 4590, "time_per_iteration": 2.5340561866760254 }, { "auxiliary_loss_clip": 0.01132416, "auxiliary_loss_mlp": 0.01043328, "balance_loss_clip": 1.02677011, "balance_loss_mlp": 1.04610705, "epoch": 0.27602585299864724, "flos": 35040985632000.0, "grad_norm": 3.4237737632953165, "language_loss": 0.71114272, "learning_rate": 3.397942146620277e-06, "loss": 0.7329002, "num_input_tokens_seen": 99199765, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.86328125, "step": 4591, "time_per_iteration": 2.637291669845581 }, { "auxiliary_loss_clip": 0.01134457, "auxiliary_loss_mlp": 0.01045856, "balance_loss_clip": 1.02938724, "balance_loss_mlp": 1.04801261, "epoch": 0.2760859762513152, "flos": 24309405002880.0, "grad_norm": 2.232678537375648, "language_loss": 0.79896331, "learning_rate": 3.3976635966183046e-06, "loss": 0.82076645, "num_input_tokens_seen": 99218435, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.86328125, "step": 4592, "time_per_iteration": 2.498731851577759 }, { "auxiliary_loss_clip": 0.01048412, "auxiliary_loss_mlp": 0.0101713, "balance_loss_clip": 1.01482964, "balance_loss_mlp": 1.01924181, "epoch": 0.27614609950398317, "flos": 71260739890560.0, "grad_norm": 0.7129621497084639, "language_loss": 0.61616051, "learning_rate": 3.3973849936167886e-06, "loss": 0.63681591, "num_input_tokens_seen": 99276200, "router_z_loss_clip": 0.02294922, "router_z_loss_mlp": 0.29296875, "step": 4593, "time_per_iteration": 4.492316961288452 }, { "auxiliary_loss_clip": 0.01134457, "auxiliary_loss_mlp": 0.0103791, "balance_loss_clip": 1.02247262, "balance_loss_mlp": 1.04787672, "epoch": 0.27620622275665113, "flos": 29674854138240.0, "grad_norm": 1.920824590972662, "language_loss": 0.77162486, "learning_rate": 3.3971063376262937e-06, "loss": 0.79334855, "num_input_tokens_seen": 99297625, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8671875, "step": 4594, "time_per_iteration": 2.576279401779175 }, { "auxiliary_loss_clip": 0.01132345, "auxiliary_loss_mlp": 0.01036536, "balance_loss_clip": 1.02109838, "balance_loss_mlp": 1.04820716, "epoch": 0.2762663460093191, "flos": 15378063137280.0, "grad_norm": 1.5315550399294706, "language_loss": 0.91426075, "learning_rate": 3.3968276286573866e-06, "loss": 0.93594956, "num_input_tokens_seen": 99315790, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.83984375, "step": 4595, "time_per_iteration": 3.827636957168579 }, { "auxiliary_loss_clip": 0.01136114, "auxiliary_loss_mlp": 0.0104464, "balance_loss_clip": 1.02791476, "balance_loss_mlp": 1.04879415, "epoch": 0.27632646926198706, "flos": 20704082117760.0, "grad_norm": 4.192918899918743, "language_loss": 0.69795585, "learning_rate": 3.3965488667206353e-06, "loss": 0.7197634, "num_input_tokens_seen": 99334615, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.875, "step": 4596, "time_per_iteration": 3.8693325519561768 }, { "auxiliary_loss_clip": 0.01141486, "auxiliary_loss_mlp": 0.01040704, "balance_loss_clip": 1.02425277, "balance_loss_mlp": 1.04971099, "epoch": 0.276386592514655, "flos": 32813374849920.0, "grad_norm": 1.8773563912365383, "language_loss": 0.6335355, "learning_rate": 3.3962700518266113e-06, "loss": 0.65535748, "num_input_tokens_seen": 99356685, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.91796875, "step": 4597, "time_per_iteration": 4.007552146911621 }, { "auxiliary_loss_clip": 0.01132355, "auxiliary_loss_mlp": 0.01039899, "balance_loss_clip": 1.02487826, "balance_loss_mlp": 1.04920948, "epoch": 0.276446715767323, "flos": 18551704371840.0, "grad_norm": 1.840628109046771, "language_loss": 0.8612287, "learning_rate": 3.395991183985887e-06, "loss": 0.8829512, "num_input_tokens_seen": 99374810, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.83203125, "step": 4598, "time_per_iteration": 2.438201904296875 }, { "auxiliary_loss_clip": 0.01135549, "auxiliary_loss_mlp": 0.01041898, "balance_loss_clip": 1.02549505, "balance_loss_mlp": 1.04862309, "epoch": 0.27650683901999096, "flos": 22819615488000.0, "grad_norm": 2.2165094479230283, "language_loss": 0.80011135, "learning_rate": 3.395712263209037e-06, "loss": 0.82188582, "num_input_tokens_seen": 99391290, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8671875, "step": 4599, "time_per_iteration": 2.4806160926818848 }, { "auxiliary_loss_clip": 0.01138887, "auxiliary_loss_mlp": 0.01042543, "balance_loss_clip": 1.02674139, "balance_loss_mlp": 1.0491581, "epoch": 0.276566962272659, "flos": 21361534704000.0, "grad_norm": 1.9762959438806968, "language_loss": 0.78983051, "learning_rate": 3.395433289506639e-06, "loss": 0.81164479, "num_input_tokens_seen": 99409120, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8984375, "step": 4600, "time_per_iteration": 2.4653170108795166 }, { "auxiliary_loss_clip": 0.01138893, "auxiliary_loss_mlp": 0.01042616, "balance_loss_clip": 1.02629614, "balance_loss_mlp": 1.04839945, "epoch": 0.27662708552532694, "flos": 17710604524800.0, "grad_norm": 1.7444942742048704, "language_loss": 0.7285068, "learning_rate": 3.3951542628892694e-06, "loss": 0.75032187, "num_input_tokens_seen": 99426180, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.90625, "step": 4601, "time_per_iteration": 2.4686474800109863 }, { "auxiliary_loss_clip": 0.01132798, "auxiliary_loss_mlp": 0.01039234, "balance_loss_clip": 1.02275932, "balance_loss_mlp": 1.04688692, "epoch": 0.2766872087779949, "flos": 21252725429760.0, "grad_norm": 1.708999000764837, "language_loss": 0.80188298, "learning_rate": 3.3948751833675113e-06, "loss": 0.82360339, "num_input_tokens_seen": 99447720, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.859375, "step": 4602, "time_per_iteration": 2.511312246322632 }, { "auxiliary_loss_clip": 0.01138989, "auxiliary_loss_mlp": 0.01047483, "balance_loss_clip": 1.03021002, "balance_loss_mlp": 1.04688752, "epoch": 0.2767473320306629, "flos": 12931900053120.0, "grad_norm": 2.13447140711144, "language_loss": 0.76816416, "learning_rate": 3.3945960509519455e-06, "loss": 0.79002881, "num_input_tokens_seen": 99464720, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.921875, "step": 4603, "time_per_iteration": 2.438870668411255 }, { "auxiliary_loss_clip": 0.01131126, "auxiliary_loss_mlp": 0.01040565, "balance_loss_clip": 1.02596235, "balance_loss_mlp": 1.04755402, "epoch": 0.27680745528333084, "flos": 15012851604480.0, "grad_norm": 1.602328345326561, "language_loss": 0.81437278, "learning_rate": 3.3943168656531585e-06, "loss": 0.83608973, "num_input_tokens_seen": 99482310, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8359375, "step": 4604, "time_per_iteration": 2.425107002258301 }, { "auxiliary_loss_clip": 0.0113816, "auxiliary_loss_mlp": 0.01030156, "balance_loss_clip": 1.01462293, "balance_loss_mlp": 1.04865098, "epoch": 0.2768675785359988, "flos": 22637835734400.0, "grad_norm": 1.6500592578607074, "language_loss": 0.69909453, "learning_rate": 3.3940376274817363e-06, "loss": 0.72077769, "num_input_tokens_seen": 99501255, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.89453125, "step": 4605, "time_per_iteration": 2.49198317527771 }, { "auxiliary_loss_clip": 0.01052537, "auxiliary_loss_mlp": 0.01006983, "balance_loss_clip": 1.00488496, "balance_loss_mlp": 1.02370477, "epoch": 0.27692770178866677, "flos": 66130542881280.0, "grad_norm": 0.7299985541976967, "language_loss": 0.572155, "learning_rate": 3.3937583364482673e-06, "loss": 0.59275019, "num_input_tokens_seen": 99568925, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.2890625, "step": 4606, "time_per_iteration": 3.184474229812622 }, { "auxiliary_loss_clip": 0.01138329, "auxiliary_loss_mlp": 0.01050128, "balance_loss_clip": 1.03345096, "balance_loss_mlp": 1.04978597, "epoch": 0.27698782504133473, "flos": 26464979059200.0, "grad_norm": 1.8458984425525677, "language_loss": 0.69815886, "learning_rate": 3.3934789925633424e-06, "loss": 0.72004342, "num_input_tokens_seen": 99588455, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8828125, "step": 4607, "time_per_iteration": 2.5114805698394775 }, { "auxiliary_loss_clip": 0.01131389, "auxiliary_loss_mlp": 0.01038998, "balance_loss_clip": 1.02457976, "balance_loss_mlp": 1.04782677, "epoch": 0.2770479482940027, "flos": 25884806584320.0, "grad_norm": 1.6081970818954272, "language_loss": 0.69798607, "learning_rate": 3.393199595837555e-06, "loss": 0.71968985, "num_input_tokens_seen": 99609355, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8359375, "step": 4608, "time_per_iteration": 2.5204861164093018 }, { "auxiliary_loss_clip": 0.0113714, "auxiliary_loss_mlp": 0.01035028, "balance_loss_clip": 1.01941204, "balance_loss_mlp": 1.04778123, "epoch": 0.27710807154667066, "flos": 22857249962880.0, "grad_norm": 1.972353890755828, "language_loss": 0.72875333, "learning_rate": 3.392920146281499e-06, "loss": 0.75047493, "num_input_tokens_seen": 99628780, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.89453125, "step": 4609, "time_per_iteration": 2.4687631130218506 }, { "auxiliary_loss_clip": 0.01136345, "auxiliary_loss_mlp": 0.01048988, "balance_loss_clip": 1.03259659, "balance_loss_mlp": 1.04715848, "epoch": 0.27716819479933863, "flos": 17711071401600.0, "grad_norm": 2.408845802618741, "language_loss": 0.83589554, "learning_rate": 3.3926406439057714e-06, "loss": 0.85774887, "num_input_tokens_seen": 99644545, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.890625, "step": 4610, "time_per_iteration": 2.449624538421631 }, { "auxiliary_loss_clip": 0.01139466, "auxiliary_loss_mlp": 0.01045142, "balance_loss_clip": 1.02897739, "balance_loss_mlp": 1.04844689, "epoch": 0.2772283180520066, "flos": 19646046080640.0, "grad_norm": 1.9213066496269768, "language_loss": 0.68998498, "learning_rate": 3.3923610887209705e-06, "loss": 0.71183103, "num_input_tokens_seen": 99663125, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.91015625, "step": 4611, "time_per_iteration": 2.43761944770813 }, { "auxiliary_loss_clip": 0.01132492, "auxiliary_loss_mlp": 0.01036424, "balance_loss_clip": 1.02102864, "balance_loss_mlp": 1.04899406, "epoch": 0.27728844130467456, "flos": 21032628842880.0, "grad_norm": 2.017142248449521, "language_loss": 0.73475802, "learning_rate": 3.392081480737698e-06, "loss": 0.75644714, "num_input_tokens_seen": 99682645, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8359375, "step": 4612, "time_per_iteration": 2.5138583183288574 }, { "auxiliary_loss_clip": 0.01137321, "auxiliary_loss_mlp": 0.01047604, "balance_loss_clip": 1.03110552, "balance_loss_mlp": 1.04812837, "epoch": 0.2773485645573425, "flos": 18989204025600.0, "grad_norm": 2.2523710539717676, "language_loss": 0.66650975, "learning_rate": 3.3918018199665563e-06, "loss": 0.68835902, "num_input_tokens_seen": 99700520, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.890625, "step": 4613, "time_per_iteration": 2.5125415325164795 }, { "auxiliary_loss_clip": 0.01133574, "auxiliary_loss_mlp": 0.0104357, "balance_loss_clip": 1.02677321, "balance_loss_mlp": 1.04712319, "epoch": 0.27740868781001055, "flos": 21468440557440.0, "grad_norm": 1.8977628211819404, "language_loss": 0.79769313, "learning_rate": 3.39152210641815e-06, "loss": 0.81946456, "num_input_tokens_seen": 99720355, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8671875, "step": 4614, "time_per_iteration": 2.49385929107666 }, { "auxiliary_loss_clip": 0.01135459, "auxiliary_loss_mlp": 0.01041659, "balance_loss_clip": 1.02510047, "balance_loss_mlp": 1.04695857, "epoch": 0.2774688110626785, "flos": 19827825834240.0, "grad_norm": 2.3926590683343276, "language_loss": 0.79429418, "learning_rate": 3.3912423401030865e-06, "loss": 0.81606537, "num_input_tokens_seen": 99736090, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8828125, "step": 4615, "time_per_iteration": 2.430574655532837 }, { "auxiliary_loss_clip": 0.01138612, "auxiliary_loss_mlp": 0.01044126, "balance_loss_clip": 1.02747238, "balance_loss_mlp": 1.04752231, "epoch": 0.2775289343153465, "flos": 18216226321920.0, "grad_norm": 3.720420433279169, "language_loss": 0.62649488, "learning_rate": 3.3909625210319735e-06, "loss": 0.64832234, "num_input_tokens_seen": 99751805, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.91015625, "step": 4616, "time_per_iteration": 2.4214630126953125 }, { "auxiliary_loss_clip": 0.01135591, "auxiliary_loss_mlp": 0.0104046, "balance_loss_clip": 1.02416444, "balance_loss_mlp": 1.0474236, "epoch": 0.27758905756801444, "flos": 16472476673280.0, "grad_norm": 2.2341739739399817, "language_loss": 0.82611156, "learning_rate": 3.3906826492154226e-06, "loss": 0.84787208, "num_input_tokens_seen": 99770610, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8828125, "step": 4617, "time_per_iteration": 2.4496500492095947 }, { "auxiliary_loss_clip": 0.01135363, "auxiliary_loss_mlp": 0.01044167, "balance_loss_clip": 1.02847958, "balance_loss_mlp": 1.04729736, "epoch": 0.2776491808206824, "flos": 18728240739840.0, "grad_norm": 2.1544750664592383, "language_loss": 0.77242184, "learning_rate": 3.3904027246640458e-06, "loss": 0.79421711, "num_input_tokens_seen": 99787305, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8828125, "step": 4618, "time_per_iteration": 2.4499518871307373 }, { "auxiliary_loss_clip": 0.01137892, "auxiliary_loss_mlp": 0.01035762, "balance_loss_clip": 1.02046168, "balance_loss_mlp": 1.04982305, "epoch": 0.27770930407335037, "flos": 28038189911040.0, "grad_norm": 2.37773122826289, "language_loss": 0.84655428, "learning_rate": 3.390122747388459e-06, "loss": 0.8682909, "num_input_tokens_seen": 99808940, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.87890625, "step": 4619, "time_per_iteration": 2.5177862644195557 }, { "auxiliary_loss_clip": 0.01131473, "auxiliary_loss_mlp": 0.01037864, "balance_loss_clip": 1.02368402, "balance_loss_mlp": 1.04664564, "epoch": 0.27776942732601834, "flos": 23549823072000.0, "grad_norm": 1.4548170800413518, "language_loss": 0.766186, "learning_rate": 3.3898427173992778e-06, "loss": 0.78787935, "num_input_tokens_seen": 99829575, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.84765625, "step": 4620, "time_per_iteration": 2.4922432899475098 }, { "auxiliary_loss_clip": 0.01132501, "auxiliary_loss_mlp": 0.01038563, "balance_loss_clip": 1.0229466, "balance_loss_mlp": 1.04697704, "epoch": 0.2778295505786863, "flos": 23908713811200.0, "grad_norm": 1.9086954474472708, "language_loss": 0.78203785, "learning_rate": 3.389562634707122e-06, "loss": 0.80374849, "num_input_tokens_seen": 99847575, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.85546875, "step": 4621, "time_per_iteration": 2.48516583442688 }, { "auxiliary_loss_clip": 0.01135726, "auxiliary_loss_mlp": 0.01048185, "balance_loss_clip": 1.03118563, "balance_loss_mlp": 1.04773724, "epoch": 0.27788967383135427, "flos": 25554571920000.0, "grad_norm": 2.6892484883373324, "language_loss": 0.87321627, "learning_rate": 3.389282499322611e-06, "loss": 0.89505535, "num_input_tokens_seen": 99864995, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8828125, "step": 4622, "time_per_iteration": 2.500290870666504 }, { "auxiliary_loss_clip": 0.0113539, "auxiliary_loss_mlp": 0.0104805, "balance_loss_clip": 1.03165841, "balance_loss_mlp": 1.04671729, "epoch": 0.27794979708402223, "flos": 16252631481600.0, "grad_norm": 2.4981290509802685, "language_loss": 0.81307638, "learning_rate": 3.389002311256369e-06, "loss": 0.83491075, "num_input_tokens_seen": 99881540, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.88671875, "step": 4623, "time_per_iteration": 2.4483935832977295 }, { "auxiliary_loss_clip": 0.01138352, "auxiliary_loss_mlp": 0.01043049, "balance_loss_clip": 1.02765942, "balance_loss_mlp": 1.05023789, "epoch": 0.2780099203366902, "flos": 20667632791680.0, "grad_norm": 1.8807988287576614, "language_loss": 0.81256026, "learning_rate": 3.3887220705190204e-06, "loss": 0.83437425, "num_input_tokens_seen": 99899595, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8828125, "step": 4624, "time_per_iteration": 2.500089645385742 }, { "auxiliary_loss_clip": 0.01135479, "auxiliary_loss_mlp": 0.01045102, "balance_loss_clip": 1.02953315, "balance_loss_mlp": 1.04980206, "epoch": 0.27807004358935816, "flos": 17739583822080.0, "grad_norm": 2.4221090331913104, "language_loss": 0.76574713, "learning_rate": 3.388441777121191e-06, "loss": 0.78755295, "num_input_tokens_seen": 99913020, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.859375, "step": 4625, "time_per_iteration": 2.4088611602783203 }, { "auxiliary_loss_clip": 0.011335, "auxiliary_loss_mlp": 0.01048861, "balance_loss_clip": 1.0328517, "balance_loss_mlp": 1.04790866, "epoch": 0.2781301668420261, "flos": 16727119165440.0, "grad_norm": 1.9638919907378065, "language_loss": 0.69876915, "learning_rate": 3.388161431073511e-06, "loss": 0.72059274, "num_input_tokens_seen": 99931405, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.85546875, "step": 4626, "time_per_iteration": 2.4396591186523438 }, { "auxiliary_loss_clip": 0.01142932, "auxiliary_loss_mlp": 0.010414, "balance_loss_clip": 1.02475786, "balance_loss_mlp": 1.05166233, "epoch": 0.27819029009469415, "flos": 13844749317120.0, "grad_norm": 2.1576220693030583, "language_loss": 0.92569131, "learning_rate": 3.38788103238661e-06, "loss": 0.94753462, "num_input_tokens_seen": 99948100, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.9140625, "step": 4627, "time_per_iteration": 2.4525160789489746 }, { "auxiliary_loss_clip": 0.01140116, "auxiliary_loss_mlp": 0.01047684, "balance_loss_clip": 1.03215671, "balance_loss_mlp": 1.05061054, "epoch": 0.2782504133473621, "flos": 27089286370560.0, "grad_norm": 1.771325674008576, "language_loss": 0.85447681, "learning_rate": 3.387600581071121e-06, "loss": 0.87635469, "num_input_tokens_seen": 99966470, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.89453125, "step": 4628, "time_per_iteration": 2.5035147666931152 }, { "auxiliary_loss_clip": 0.01134644, "auxiliary_loss_mlp": 0.010482, "balance_loss_clip": 1.03315556, "balance_loss_mlp": 1.04887438, "epoch": 0.2783105366000301, "flos": 21068826773760.0, "grad_norm": 1.670936955088362, "language_loss": 0.79214472, "learning_rate": 3.387320077137679e-06, "loss": 0.81397319, "num_input_tokens_seen": 99985930, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.859375, "step": 4629, "time_per_iteration": 2.4578633308410645 }, { "auxiliary_loss_clip": 0.01131093, "auxiliary_loss_mlp": 0.01039744, "balance_loss_clip": 1.02612448, "balance_loss_mlp": 1.0500648, "epoch": 0.27837065985269804, "flos": 26501823434880.0, "grad_norm": 1.497832332109914, "language_loss": 0.84271258, "learning_rate": 3.3870395205969208e-06, "loss": 0.86442095, "num_input_tokens_seen": 100006235, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.80859375, "step": 4630, "time_per_iteration": 2.518714189529419 }, { "auxiliary_loss_clip": 0.01140787, "auxiliary_loss_mlp": 0.01043951, "balance_loss_clip": 1.0272733, "balance_loss_mlp": 1.05204332, "epoch": 0.278430783105366, "flos": 20223201813120.0, "grad_norm": 1.9549841781803832, "language_loss": 0.80847609, "learning_rate": 3.386758911459485e-06, "loss": 0.8303234, "num_input_tokens_seen": 100023655, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.88671875, "step": 4631, "time_per_iteration": 2.47141432762146 }, { "auxiliary_loss_clip": 0.01142271, "auxiliary_loss_mlp": 0.01046675, "balance_loss_clip": 1.0311656, "balance_loss_mlp": 1.05402708, "epoch": 0.278490906358034, "flos": 25592888753280.0, "grad_norm": 2.0747228682446237, "language_loss": 0.70919967, "learning_rate": 3.3864782497360126e-06, "loss": 0.73108912, "num_input_tokens_seen": 100043280, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8828125, "step": 4632, "time_per_iteration": 2.504539966583252 }, { "auxiliary_loss_clip": 0.01134133, "auxiliary_loss_mlp": 0.01045889, "balance_loss_clip": 1.03116643, "balance_loss_mlp": 1.0519768, "epoch": 0.27855102961070194, "flos": 16171544528640.0, "grad_norm": 1.807767930057554, "language_loss": 0.82274908, "learning_rate": 3.386197535437145e-06, "loss": 0.8445493, "num_input_tokens_seen": 100057690, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8203125, "step": 4633, "time_per_iteration": 2.4443516731262207 }, { "auxiliary_loss_clip": 0.01139087, "auxiliary_loss_mlp": 0.01041495, "balance_loss_clip": 1.02411437, "balance_loss_mlp": 1.05067718, "epoch": 0.2786111528633699, "flos": 22927598749440.0, "grad_norm": 1.8050828777217738, "language_loss": 0.87717938, "learning_rate": 3.385916768573529e-06, "loss": 0.89898521, "num_input_tokens_seen": 100075875, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8828125, "step": 4634, "time_per_iteration": 3.9709975719451904 }, { "auxiliary_loss_clip": 0.01137798, "auxiliary_loss_mlp": 0.01038674, "balance_loss_clip": 1.02243757, "balance_loss_mlp": 1.05144477, "epoch": 0.27867127611603787, "flos": 23404205335680.0, "grad_norm": 1.59518622394602, "language_loss": 0.76611876, "learning_rate": 3.38563594915581e-06, "loss": 0.78788352, "num_input_tokens_seen": 100092930, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.86328125, "step": 4635, "time_per_iteration": 2.4802212715148926 }, { "auxiliary_loss_clip": 0.0113704, "auxiliary_loss_mlp": 0.01041878, "balance_loss_clip": 1.02565312, "balance_loss_mlp": 1.04923368, "epoch": 0.27873139936870583, "flos": 19829010983040.0, "grad_norm": 1.794915685729954, "language_loss": 0.64868551, "learning_rate": 3.385355077194637e-06, "loss": 0.67047471, "num_input_tokens_seen": 100110790, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.87890625, "step": 4636, "time_per_iteration": 2.469794750213623 }, { "auxiliary_loss_clip": 0.01141235, "auxiliary_loss_mlp": 0.0103852, "balance_loss_clip": 1.02160442, "balance_loss_mlp": 1.05248439, "epoch": 0.2787915226213738, "flos": 17707659609600.0, "grad_norm": 4.274519144435391, "language_loss": 0.83001739, "learning_rate": 3.3850741527006604e-06, "loss": 0.85181487, "num_input_tokens_seen": 100126970, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.88671875, "step": 4637, "time_per_iteration": 3.8785922527313232 }, { "auxiliary_loss_clip": 0.01133943, "auxiliary_loss_mlp": 0.01039326, "balance_loss_clip": 1.02479434, "balance_loss_mlp": 1.04989481, "epoch": 0.27885164587404176, "flos": 22090557139200.0, "grad_norm": 1.758961681170378, "language_loss": 0.75673717, "learning_rate": 3.384793175684533e-06, "loss": 0.77846986, "num_input_tokens_seen": 100146720, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.84375, "step": 4638, "time_per_iteration": 5.311922550201416 }, { "auxiliary_loss_clip": 0.01136754, "auxiliary_loss_mlp": 0.01045785, "balance_loss_clip": 1.03001356, "balance_loss_mlp": 1.04937911, "epoch": 0.27891176912670973, "flos": 19207684500480.0, "grad_norm": 1.8653470294793633, "language_loss": 0.71865749, "learning_rate": 3.38451214615691e-06, "loss": 0.74048293, "num_input_tokens_seen": 100165920, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.875, "step": 4639, "time_per_iteration": 2.4498205184936523 }, { "auxiliary_loss_clip": 0.01136152, "auxiliary_loss_mlp": 0.0103241, "balance_loss_clip": 1.01604223, "balance_loss_mlp": 1.0496676, "epoch": 0.27897189237937775, "flos": 27600007898880.0, "grad_norm": 2.1380822734677754, "language_loss": 0.65906382, "learning_rate": 3.384231064128447e-06, "loss": 0.68074942, "num_input_tokens_seen": 100185525, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.86328125, "step": 4640, "time_per_iteration": 2.5318596363067627 }, { "auxiliary_loss_clip": 0.01135477, "auxiliary_loss_mlp": 0.010349, "balance_loss_clip": 1.01931942, "balance_loss_mlp": 1.04885209, "epoch": 0.2790320156320457, "flos": 21178210665600.0, "grad_norm": 1.9278913985396178, "language_loss": 0.71259046, "learning_rate": 3.383949929609804e-06, "loss": 0.73429424, "num_input_tokens_seen": 100204850, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8671875, "step": 4641, "time_per_iteration": 2.4448297023773193 }, { "auxiliary_loss_clip": 0.01138124, "auxiliary_loss_mlp": 0.01042751, "balance_loss_clip": 1.02585912, "balance_loss_mlp": 1.04830003, "epoch": 0.2790921388847137, "flos": 22783920347520.0, "grad_norm": 1.957325918943367, "language_loss": 0.74633741, "learning_rate": 3.383668742611641e-06, "loss": 0.76814616, "num_input_tokens_seen": 100224520, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8984375, "step": 4642, "time_per_iteration": 2.4847025871276855 }, { "auxiliary_loss_clip": 0.01136596, "auxiliary_loss_mlp": 0.0104041, "balance_loss_clip": 1.02393484, "balance_loss_mlp": 1.04867053, "epoch": 0.27915226213738165, "flos": 23400649889280.0, "grad_norm": 2.1100799080283545, "language_loss": 0.85647136, "learning_rate": 3.3833875031446205e-06, "loss": 0.87824142, "num_input_tokens_seen": 100243935, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.875, "step": 4643, "time_per_iteration": 2.456735849380493 }, { "auxiliary_loss_clip": 0.01137087, "auxiliary_loss_mlp": 0.0104107, "balance_loss_clip": 1.02535868, "balance_loss_mlp": 1.05064344, "epoch": 0.2792123853900496, "flos": 22747794243840.0, "grad_norm": 1.8946945641504331, "language_loss": 0.82946539, "learning_rate": 3.383106211219407e-06, "loss": 0.85124689, "num_input_tokens_seen": 100262290, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.86328125, "step": 4644, "time_per_iteration": 2.494978666305542 }, { "auxiliary_loss_clip": 0.01134918, "auxiliary_loss_mlp": 0.01036251, "balance_loss_clip": 1.01990795, "balance_loss_mlp": 1.04775858, "epoch": 0.2792725086427176, "flos": 15049372757760.0, "grad_norm": 1.892878466725826, "language_loss": 0.79279178, "learning_rate": 3.3828248668466673e-06, "loss": 0.81450349, "num_input_tokens_seen": 100280015, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.87109375, "step": 4645, "time_per_iteration": 2.4478471279144287 }, { "auxiliary_loss_clip": 0.01054798, "auxiliary_loss_mlp": 0.01006665, "balance_loss_clip": 1.00500762, "balance_loss_mlp": 1.02667546, "epoch": 0.27933263189538554, "flos": 62544861757440.0, "grad_norm": 0.7915247945497649, "language_loss": 0.62312913, "learning_rate": 3.3825434700370705e-06, "loss": 0.64374375, "num_input_tokens_seen": 100338935, "router_z_loss_clip": 0.01660156, "router_z_loss_mlp": 0.28125, "step": 4646, "time_per_iteration": 3.079763174057007 }, { "auxiliary_loss_clip": 0.01128319, "auxiliary_loss_mlp": 0.01033401, "balance_loss_clip": 1.01925063, "balance_loss_mlp": 1.04661345, "epoch": 0.2793927551480535, "flos": 25118365155840.0, "grad_norm": 1.7608623144698794, "language_loss": 0.89532757, "learning_rate": 3.3822620208012865e-06, "loss": 0.9169448, "num_input_tokens_seen": 100359905, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.81640625, "step": 4647, "time_per_iteration": 2.489983320236206 }, { "auxiliary_loss_clip": 0.01133951, "auxiliary_loss_mlp": 0.010401, "balance_loss_clip": 1.02406621, "balance_loss_mlp": 1.0466013, "epoch": 0.27945287840072147, "flos": 21324582587520.0, "grad_norm": 1.9478249621395332, "language_loss": 0.87316859, "learning_rate": 3.381980519149988e-06, "loss": 0.89490914, "num_input_tokens_seen": 100376955, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.875, "step": 4648, "time_per_iteration": 2.4849448204040527 }, { "auxiliary_loss_clip": 0.01135597, "auxiliary_loss_mlp": 0.01038999, "balance_loss_clip": 1.0233345, "balance_loss_mlp": 1.04791689, "epoch": 0.27951300165338944, "flos": 27450547407360.0, "grad_norm": 2.736839534181503, "language_loss": 0.7321009, "learning_rate": 3.38169896509385e-06, "loss": 0.75384688, "num_input_tokens_seen": 100397545, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.87890625, "step": 4649, "time_per_iteration": 2.5066704750061035 }, { "auxiliary_loss_clip": 0.01132828, "auxiliary_loss_mlp": 0.01040465, "balance_loss_clip": 1.02331066, "balance_loss_mlp": 1.04653049, "epoch": 0.2795731249060574, "flos": 15159008044800.0, "grad_norm": 2.101797812487062, "language_loss": 0.8039723, "learning_rate": 3.381417358643549e-06, "loss": 0.82570517, "num_input_tokens_seen": 100415080, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.86328125, "step": 4650, "time_per_iteration": 2.462523937225342 }, { "auxiliary_loss_clip": 0.01050221, "auxiliary_loss_mlp": 0.01000866, "balance_loss_clip": 0.99922115, "balance_loss_mlp": 1.02170265, "epoch": 0.27963324815872537, "flos": 60120103178880.0, "grad_norm": 0.8230019284281413, "language_loss": 0.58858562, "learning_rate": 3.3811356998097624e-06, "loss": 0.60909647, "num_input_tokens_seen": 100471105, "router_z_loss_clip": 0.01647949, "router_z_loss_mlp": 0.28515625, "step": 4651, "time_per_iteration": 3.1186089515686035 }, { "auxiliary_loss_clip": 0.01134346, "auxiliary_loss_mlp": 0.01039804, "balance_loss_clip": 1.02224481, "balance_loss_mlp": 1.04502296, "epoch": 0.27969337141139333, "flos": 21765960910080.0, "grad_norm": 1.812836074479688, "language_loss": 0.73807573, "learning_rate": 3.3808539886031726e-06, "loss": 0.75981724, "num_input_tokens_seen": 100492520, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.89453125, "step": 4652, "time_per_iteration": 2.51096773147583 }, { "auxiliary_loss_clip": 0.01137738, "auxiliary_loss_mlp": 0.01043435, "balance_loss_clip": 1.02734184, "balance_loss_mlp": 1.05036414, "epoch": 0.27975349466406135, "flos": 39851398834560.0, "grad_norm": 2.502985430123954, "language_loss": 0.79445595, "learning_rate": 3.380572225034461e-06, "loss": 0.81626773, "num_input_tokens_seen": 100512870, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.875, "step": 4653, "time_per_iteration": 2.5960543155670166 }, { "auxiliary_loss_clip": 0.01132844, "auxiliary_loss_mlp": 0.01044168, "balance_loss_clip": 1.02861083, "balance_loss_mlp": 1.0471499, "epoch": 0.2798136179167293, "flos": 21579799697280.0, "grad_norm": 2.5208494292977224, "language_loss": 0.78539127, "learning_rate": 3.380290409114312e-06, "loss": 0.80716133, "num_input_tokens_seen": 100531655, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.859375, "step": 4654, "time_per_iteration": 2.486231565475464 }, { "auxiliary_loss_clip": 0.01136203, "auxiliary_loss_mlp": 0.01042487, "balance_loss_clip": 1.02590466, "balance_loss_mlp": 1.0479126, "epoch": 0.2798737411693973, "flos": 21537676022400.0, "grad_norm": 1.9872940548248765, "language_loss": 0.80891532, "learning_rate": 3.3800085408534127e-06, "loss": 0.83070219, "num_input_tokens_seen": 100548005, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8828125, "step": 4655, "time_per_iteration": 2.4547884464263916 }, { "auxiliary_loss_clip": 0.01132684, "auxiliary_loss_mlp": 0.01044243, "balance_loss_clip": 1.02822113, "balance_loss_mlp": 1.04590023, "epoch": 0.27993386442206525, "flos": 26981051713920.0, "grad_norm": 1.6474509075930468, "language_loss": 0.8139441, "learning_rate": 3.3797266202624506e-06, "loss": 0.83571333, "num_input_tokens_seen": 100567980, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8671875, "step": 4656, "time_per_iteration": 2.518514633178711 }, { "auxiliary_loss_clip": 0.01131487, "auxiliary_loss_mlp": 0.01037495, "balance_loss_clip": 1.02202153, "balance_loss_mlp": 1.04661679, "epoch": 0.2799939876747332, "flos": 24349876652160.0, "grad_norm": 1.6604523840892014, "language_loss": 0.8338995, "learning_rate": 3.3794446473521176e-06, "loss": 0.85558927, "num_input_tokens_seen": 100588630, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84765625, "step": 4657, "time_per_iteration": 2.514014482498169 }, { "auxiliary_loss_clip": 0.01133442, "auxiliary_loss_mlp": 0.01040034, "balance_loss_clip": 1.025491, "balance_loss_mlp": 1.0475688, "epoch": 0.2800541109274012, "flos": 33656988648960.0, "grad_norm": 2.4918422205157698, "language_loss": 0.63566917, "learning_rate": 3.379162622133105e-06, "loss": 0.65740395, "num_input_tokens_seen": 100608775, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.859375, "step": 4658, "time_per_iteration": 2.5853729248046875 }, { "auxiliary_loss_clip": 0.01135029, "auxiliary_loss_mlp": 0.01043335, "balance_loss_clip": 1.02761126, "balance_loss_mlp": 1.04754281, "epoch": 0.28011423418006914, "flos": 21614417429760.0, "grad_norm": 1.8807526319047856, "language_loss": 0.78358066, "learning_rate": 3.3788805446161073e-06, "loss": 0.80536425, "num_input_tokens_seen": 100627975, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.875, "step": 4659, "time_per_iteration": 2.465083360671997 }, { "auxiliary_loss_clip": 0.01140306, "auxiliary_loss_mlp": 0.01047816, "balance_loss_clip": 1.03217566, "balance_loss_mlp": 1.05222738, "epoch": 0.2801743574327371, "flos": 23112431159040.0, "grad_norm": 1.9774280767325765, "language_loss": 0.79141867, "learning_rate": 3.3785984148118215e-06, "loss": 0.81329989, "num_input_tokens_seen": 100645430, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8828125, "step": 4660, "time_per_iteration": 2.5049471855163574 }, { "auxiliary_loss_clip": 0.01132757, "auxiliary_loss_mlp": 0.01038396, "balance_loss_clip": 1.02390051, "balance_loss_mlp": 1.0484935, "epoch": 0.2802344806854051, "flos": 12641418766080.0, "grad_norm": 6.059393810413076, "language_loss": 0.80035114, "learning_rate": 3.3783162327309453e-06, "loss": 0.82206261, "num_input_tokens_seen": 100663775, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.84375, "step": 4661, "time_per_iteration": 2.4421238899230957 }, { "auxiliary_loss_clip": 0.01142377, "auxiliary_loss_mlp": 0.01061564, "balance_loss_clip": 1.04595995, "balance_loss_mlp": 1.05529571, "epoch": 0.28029460393807304, "flos": 37267878142080.0, "grad_norm": 1.4969033196095396, "language_loss": 0.79085261, "learning_rate": 3.3780339983841794e-06, "loss": 0.81289196, "num_input_tokens_seen": 100686085, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.87109375, "step": 4662, "time_per_iteration": 2.6454389095306396 }, { "auxiliary_loss_clip": 0.0114134, "auxiliary_loss_mlp": 0.01049158, "balance_loss_clip": 1.03169394, "balance_loss_mlp": 1.0507617, "epoch": 0.280354727190741, "flos": 20741106061440.0, "grad_norm": 1.6525757578697031, "language_loss": 0.69633472, "learning_rate": 3.377751711782227e-06, "loss": 0.71823967, "num_input_tokens_seen": 100705135, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.90625, "step": 4663, "time_per_iteration": 2.4704174995422363 }, { "auxiliary_loss_clip": 0.0113989, "auxiliary_loss_mlp": 0.01050691, "balance_loss_clip": 1.03298199, "balance_loss_mlp": 1.05206609, "epoch": 0.28041485044340897, "flos": 21471026336640.0, "grad_norm": 1.8941372347905725, "language_loss": 0.77440459, "learning_rate": 3.377469372935791e-06, "loss": 0.79631042, "num_input_tokens_seen": 100724960, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.875, "step": 4664, "time_per_iteration": 2.502960205078125 }, { "auxiliary_loss_clip": 0.01131817, "auxiliary_loss_mlp": 0.01040317, "balance_loss_clip": 1.02450943, "balance_loss_mlp": 1.04790795, "epoch": 0.28047497369607693, "flos": 14794263388800.0, "grad_norm": 1.825875473275072, "language_loss": 0.79251087, "learning_rate": 3.377186981855578e-06, "loss": 0.81423217, "num_input_tokens_seen": 100741995, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83984375, "step": 4665, "time_per_iteration": 2.431840658187866 }, { "auxiliary_loss_clip": 0.01133431, "auxiliary_loss_mlp": 0.01042347, "balance_loss_clip": 1.02681422, "balance_loss_mlp": 1.04857588, "epoch": 0.2805350969487449, "flos": 23070738447360.0, "grad_norm": 1.8281536181997462, "language_loss": 0.80704117, "learning_rate": 3.3769045385522968e-06, "loss": 0.82879895, "num_input_tokens_seen": 100758985, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.84765625, "step": 4666, "time_per_iteration": 2.494594097137451 }, { "auxiliary_loss_clip": 0.01139036, "auxiliary_loss_mlp": 0.01051374, "balance_loss_clip": 1.03437471, "balance_loss_mlp": 1.05119634, "epoch": 0.2805952202014129, "flos": 20479855466880.0, "grad_norm": 2.2980128590611093, "language_loss": 0.84344864, "learning_rate": 3.376622043036658e-06, "loss": 0.86535275, "num_input_tokens_seen": 100777820, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.87890625, "step": 4667, "time_per_iteration": 2.44889497756958 }, { "auxiliary_loss_clip": 0.01140408, "auxiliary_loss_mlp": 0.0104335, "balance_loss_clip": 1.02726865, "balance_loss_mlp": 1.05176044, "epoch": 0.2806553434540809, "flos": 27417330305280.0, "grad_norm": 1.928996573687587, "language_loss": 0.79086703, "learning_rate": 3.376339495319373e-06, "loss": 0.81270462, "num_input_tokens_seen": 100798205, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.88671875, "step": 4668, "time_per_iteration": 2.553072214126587 }, { "auxiliary_loss_clip": 0.01137158, "auxiliary_loss_mlp": 0.01042154, "balance_loss_clip": 1.02533364, "balance_loss_mlp": 1.04752767, "epoch": 0.28071546670674885, "flos": 26505019745280.0, "grad_norm": 1.5091640598919005, "language_loss": 0.76038778, "learning_rate": 3.3760568954111563e-06, "loss": 0.78218096, "num_input_tokens_seen": 100819800, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8984375, "step": 4669, "time_per_iteration": 2.497065782546997 }, { "auxiliary_loss_clip": 0.01135942, "auxiliary_loss_mlp": 0.01042237, "balance_loss_clip": 1.02620304, "balance_loss_mlp": 1.04823804, "epoch": 0.2807755899594168, "flos": 20558679863040.0, "grad_norm": 2.2696752445615753, "language_loss": 0.78690445, "learning_rate": 3.375774243322725e-06, "loss": 0.8086862, "num_input_tokens_seen": 100837880, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.875, "step": 4670, "time_per_iteration": 2.4826855659484863 }, { "auxiliary_loss_clip": 0.01138186, "auxiliary_loss_mlp": 0.0104167, "balance_loss_clip": 1.0246948, "balance_loss_mlp": 1.04864466, "epoch": 0.2808357132120848, "flos": 24313319585280.0, "grad_norm": 2.0356286995718262, "language_loss": 0.79217494, "learning_rate": 3.3754915390647955e-06, "loss": 0.81397355, "num_input_tokens_seen": 100856350, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.89453125, "step": 4671, "time_per_iteration": 2.4786062240600586 }, { "auxiliary_loss_clip": 0.01133258, "auxiliary_loss_mlp": 0.01036192, "balance_loss_clip": 1.02161288, "balance_loss_mlp": 1.05028105, "epoch": 0.28089583646475275, "flos": 26432408401920.0, "grad_norm": 1.6784567376005906, "language_loss": 0.75085711, "learning_rate": 3.37520878264809e-06, "loss": 0.77255166, "num_input_tokens_seen": 100876135, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.828125, "step": 4672, "time_per_iteration": 2.5482256412506104 }, { "auxiliary_loss_clip": 0.01136752, "auxiliary_loss_mlp": 0.01045472, "balance_loss_clip": 1.02722144, "balance_loss_mlp": 1.04885244, "epoch": 0.2809559597174207, "flos": 23111820627840.0, "grad_norm": 2.634300385260944, "language_loss": 0.75716639, "learning_rate": 3.3749259740833286e-06, "loss": 0.77898872, "num_input_tokens_seen": 100894790, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.87890625, "step": 4673, "time_per_iteration": 2.4507699012756348 }, { "auxiliary_loss_clip": 0.01134793, "auxiliary_loss_mlp": 0.01036419, "balance_loss_clip": 1.02011156, "balance_loss_mlp": 1.04711676, "epoch": 0.2810160829700887, "flos": 20923496346240.0, "grad_norm": 4.785705439553046, "language_loss": 0.72321594, "learning_rate": 3.374643113381237e-06, "loss": 0.74492806, "num_input_tokens_seen": 100915100, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.875, "step": 4674, "time_per_iteration": 2.5269148349761963 }, { "auxiliary_loss_clip": 0.0113859, "auxiliary_loss_mlp": 0.01040756, "balance_loss_clip": 1.02342343, "balance_loss_mlp": 1.04930913, "epoch": 0.28107620622275664, "flos": 14355901808640.0, "grad_norm": 1.858853739525625, "language_loss": 0.77281743, "learning_rate": 3.374360200552541e-06, "loss": 0.79461092, "num_input_tokens_seen": 100932795, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.89453125, "step": 4675, "time_per_iteration": 2.4232425689697266 }, { "auxiliary_loss_clip": 0.01137352, "auxiliary_loss_mlp": 0.01042789, "balance_loss_clip": 1.02550411, "balance_loss_mlp": 1.04757643, "epoch": 0.2811363294754246, "flos": 20919078973440.0, "grad_norm": 2.4808772889117425, "language_loss": 0.70596874, "learning_rate": 3.374077235607968e-06, "loss": 0.72777021, "num_input_tokens_seen": 100950505, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.8984375, "step": 4676, "time_per_iteration": 3.947171211242676 }, { "auxiliary_loss_clip": 0.01131647, "auxiliary_loss_mlp": 0.01039171, "balance_loss_clip": 1.02412677, "balance_loss_mlp": 1.04938483, "epoch": 0.28119645272809257, "flos": 20594841880320.0, "grad_norm": 2.3294083278064424, "language_loss": 0.70333344, "learning_rate": 3.3737942185582487e-06, "loss": 0.72504163, "num_input_tokens_seen": 100968790, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8203125, "step": 4677, "time_per_iteration": 2.475841522216797 }, { "auxiliary_loss_clip": 0.01136212, "auxiliary_loss_mlp": 0.01041463, "balance_loss_clip": 1.02352786, "balance_loss_mlp": 1.04954958, "epoch": 0.28125657598076054, "flos": 25337420248320.0, "grad_norm": 1.7081320259762658, "language_loss": 0.63821709, "learning_rate": 3.3735111494141153e-06, "loss": 0.65999377, "num_input_tokens_seen": 100990205, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.8671875, "step": 4678, "time_per_iteration": 3.977376937866211 }, { "auxiliary_loss_clip": 0.01131478, "auxiliary_loss_mlp": 0.01046183, "balance_loss_clip": 1.02992308, "balance_loss_mlp": 1.04508829, "epoch": 0.2813166992334285, "flos": 24827093769600.0, "grad_norm": 2.6783100389590193, "language_loss": 0.70292902, "learning_rate": 3.3732280281863013e-06, "loss": 0.72470564, "num_input_tokens_seen": 101009815, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.86328125, "step": 4679, "time_per_iteration": 2.4982869625091553 }, { "auxiliary_loss_clip": 0.01134404, "auxiliary_loss_mlp": 0.0104093, "balance_loss_clip": 1.02402616, "balance_loss_mlp": 1.04626298, "epoch": 0.2813768224860965, "flos": 21760753438080.0, "grad_norm": 2.0187722360658777, "language_loss": 0.7468375, "learning_rate": 3.3729448548855422e-06, "loss": 0.76859081, "num_input_tokens_seen": 101026780, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8828125, "step": 4680, "time_per_iteration": 5.311014175415039 }, { "auxiliary_loss_clip": 0.01136101, "auxiliary_loss_mlp": 0.01036522, "balance_loss_clip": 1.02082205, "balance_loss_mlp": 1.04826379, "epoch": 0.2814369457387645, "flos": 24316803204480.0, "grad_norm": 1.6024097917190927, "language_loss": 0.76807112, "learning_rate": 3.3726616295225774e-06, "loss": 0.78979731, "num_input_tokens_seen": 101046215, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.87890625, "step": 4681, "time_per_iteration": 2.49507999420166 }, { "auxiliary_loss_clip": 0.01134625, "auxiliary_loss_mlp": 0.01036597, "balance_loss_clip": 1.01977646, "balance_loss_mlp": 1.04732132, "epoch": 0.28149706899143245, "flos": 18515326872960.0, "grad_norm": 1.9789940742861796, "language_loss": 0.73860472, "learning_rate": 3.372378352108146e-06, "loss": 0.76031697, "num_input_tokens_seen": 101063365, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.875, "step": 4682, "time_per_iteration": 2.472379446029663 }, { "auxiliary_loss_clip": 0.01132025, "auxiliary_loss_mlp": 0.01040261, "balance_loss_clip": 1.02409637, "balance_loss_mlp": 1.04698801, "epoch": 0.2815571922441004, "flos": 24863255786880.0, "grad_norm": 1.4273151875720393, "language_loss": 0.80722851, "learning_rate": 3.3720950226529894e-06, "loss": 0.82895136, "num_input_tokens_seen": 101083835, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8515625, "step": 4683, "time_per_iteration": 2.5053915977478027 }, { "auxiliary_loss_clip": 0.01136627, "auxiliary_loss_mlp": 0.01043158, "balance_loss_clip": 1.02611113, "balance_loss_mlp": 1.04814434, "epoch": 0.2816173154967684, "flos": 19901622326400.0, "grad_norm": 1.659026564442993, "language_loss": 0.76461124, "learning_rate": 3.371811641167852e-06, "loss": 0.78640902, "num_input_tokens_seen": 101101740, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.8828125, "step": 4684, "time_per_iteration": 2.47290301322937 }, { "auxiliary_loss_clip": 0.01130605, "auxiliary_loss_mlp": 0.01035478, "balance_loss_clip": 1.01920629, "balance_loss_mlp": 1.04519558, "epoch": 0.28167743874943635, "flos": 17491333950720.0, "grad_norm": 1.6823195652100573, "language_loss": 0.75767767, "learning_rate": 3.3715282076634807e-06, "loss": 0.77933848, "num_input_tokens_seen": 101120480, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.85546875, "step": 4685, "time_per_iteration": 2.420778512954712 }, { "auxiliary_loss_clip": 0.0112924, "auxiliary_loss_mlp": 0.0103942, "balance_loss_clip": 1.02358854, "balance_loss_mlp": 1.04516852, "epoch": 0.2817375620021043, "flos": 25302120157440.0, "grad_norm": 1.5394203625869685, "language_loss": 0.7567457, "learning_rate": 3.3712447221506218e-06, "loss": 0.77843225, "num_input_tokens_seen": 101142910, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83984375, "step": 4686, "time_per_iteration": 2.518691062927246 }, { "auxiliary_loss_clip": 0.01136342, "auxiliary_loss_mlp": 0.01046542, "balance_loss_clip": 1.0294472, "balance_loss_mlp": 1.046767, "epoch": 0.2817976852547723, "flos": 18693227957760.0, "grad_norm": 2.539644538037921, "language_loss": 0.62803447, "learning_rate": 3.370961184640025e-06, "loss": 0.6498633, "num_input_tokens_seen": 101160030, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.8984375, "step": 4687, "time_per_iteration": 2.4222514629364014 }, { "auxiliary_loss_clip": 0.01138436, "auxiliary_loss_mlp": 0.01046815, "balance_loss_clip": 1.03016126, "balance_loss_mlp": 1.04987812, "epoch": 0.28185780850744024, "flos": 22742263549440.0, "grad_norm": 2.7094397140838304, "language_loss": 0.76364696, "learning_rate": 3.3706775951424433e-06, "loss": 0.78549945, "num_input_tokens_seen": 101177675, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8828125, "step": 4688, "time_per_iteration": 2.4920835494995117 }, { "auxiliary_loss_clip": 0.01131613, "auxiliary_loss_mlp": 0.01040008, "balance_loss_clip": 1.02411187, "balance_loss_mlp": 1.04632497, "epoch": 0.2819179317601082, "flos": 14933919467520.0, "grad_norm": 2.044095694164778, "language_loss": 0.78572249, "learning_rate": 3.37039395366863e-06, "loss": 0.80743873, "num_input_tokens_seen": 101192225, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8515625, "step": 4689, "time_per_iteration": 2.3981752395629883 }, { "auxiliary_loss_clip": 0.01134561, "auxiliary_loss_mlp": 0.01040096, "balance_loss_clip": 1.02306104, "balance_loss_mlp": 1.04711533, "epoch": 0.2819780550127762, "flos": 23145325038720.0, "grad_norm": 1.809451019127587, "language_loss": 0.78349173, "learning_rate": 3.37011026022934e-06, "loss": 0.80523837, "num_input_tokens_seen": 101210870, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.875, "step": 4690, "time_per_iteration": 2.4640517234802246 }, { "auxiliary_loss_clip": 0.01135545, "auxiliary_loss_mlp": 0.01043145, "balance_loss_clip": 1.02644372, "balance_loss_mlp": 1.04716718, "epoch": 0.28203817826544414, "flos": 21616356764160.0, "grad_norm": 1.8436194577083889, "language_loss": 0.88001931, "learning_rate": 3.369826514835332e-06, "loss": 0.90180624, "num_input_tokens_seen": 101229965, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8828125, "step": 4691, "time_per_iteration": 2.447051763534546 }, { "auxiliary_loss_clip": 0.01138927, "auxiliary_loss_mlp": 0.01037794, "balance_loss_clip": 1.02105677, "balance_loss_mlp": 1.04798508, "epoch": 0.2820983015181121, "flos": 24026788794240.0, "grad_norm": 1.707664412372289, "language_loss": 0.81836301, "learning_rate": 3.3695427174973654e-06, "loss": 0.84013021, "num_input_tokens_seen": 101250980, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.91015625, "step": 4692, "time_per_iteration": 2.5095040798187256 }, { "auxiliary_loss_clip": 0.0113517, "auxiliary_loss_mlp": 0.01039465, "balance_loss_clip": 1.02226353, "balance_loss_mlp": 1.04725289, "epoch": 0.2821584247707801, "flos": 30007925976960.0, "grad_norm": 1.4627436532560878, "language_loss": 0.74534678, "learning_rate": 3.3692588682262022e-06, "loss": 0.76709318, "num_input_tokens_seen": 101273335, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.875, "step": 4693, "time_per_iteration": 2.5320353507995605 }, { "auxiliary_loss_clip": 0.01135199, "auxiliary_loss_mlp": 0.01030841, "balance_loss_clip": 1.01453304, "balance_loss_mlp": 1.04714811, "epoch": 0.2822185480234481, "flos": 21396762967680.0, "grad_norm": 1.564350009376512, "language_loss": 0.77377886, "learning_rate": 3.3689749670326046e-06, "loss": 0.79543924, "num_input_tokens_seen": 101292110, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8828125, "step": 4694, "time_per_iteration": 2.472336769104004 }, { "auxiliary_loss_clip": 0.01129908, "auxiliary_loss_mlp": 0.01038441, "balance_loss_clip": 1.02252686, "balance_loss_mlp": 1.04513192, "epoch": 0.28227867127611606, "flos": 27452809964160.0, "grad_norm": 3.063117105800809, "language_loss": 0.66775811, "learning_rate": 3.3686910139273392e-06, "loss": 0.68944162, "num_input_tokens_seen": 101312815, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.84765625, "step": 4695, "time_per_iteration": 2.5332584381103516 }, { "auxiliary_loss_clip": 0.01139287, "auxiliary_loss_mlp": 0.01046802, "balance_loss_clip": 1.02851534, "balance_loss_mlp": 1.04896796, "epoch": 0.282338794528784, "flos": 22593736811520.0, "grad_norm": 2.0800048778759628, "language_loss": 0.7519868, "learning_rate": 3.3684070089211736e-06, "loss": 0.7738477, "num_input_tokens_seen": 101329045, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.90234375, "step": 4696, "time_per_iteration": 2.4623842239379883 }, { "auxiliary_loss_clip": 0.01139865, "auxiliary_loss_mlp": 0.01042291, "balance_loss_clip": 1.02629948, "balance_loss_mlp": 1.05123281, "epoch": 0.282398917781452, "flos": 42010923386880.0, "grad_norm": 1.7089259004937878, "language_loss": 0.62166047, "learning_rate": 3.368122952024877e-06, "loss": 0.64348203, "num_input_tokens_seen": 101352715, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.88671875, "step": 4697, "time_per_iteration": 2.642490863800049 }, { "auxiliary_loss_clip": 0.01130383, "auxiliary_loss_mlp": 0.01034831, "balance_loss_clip": 1.0200845, "balance_loss_mlp": 1.04581571, "epoch": 0.28245904103411995, "flos": 23224724052480.0, "grad_norm": 1.3628769521253323, "language_loss": 0.72929955, "learning_rate": 3.3678388432492214e-06, "loss": 0.75095177, "num_input_tokens_seen": 101374640, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.84375, "step": 4698, "time_per_iteration": 2.5442404747009277 }, { "auxiliary_loss_clip": 0.01128589, "auxiliary_loss_mlp": 0.01040047, "balance_loss_clip": 1.02472878, "balance_loss_mlp": 1.04443169, "epoch": 0.2825191642867879, "flos": 25374623760000.0, "grad_norm": 1.6165742193973889, "language_loss": 0.74727345, "learning_rate": 3.3675546826049788e-06, "loss": 0.76895982, "num_input_tokens_seen": 101393595, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.84375, "step": 4699, "time_per_iteration": 2.48295259475708 }, { "auxiliary_loss_clip": 0.011335, "auxiliary_loss_mlp": 0.01040644, "balance_loss_clip": 1.02289343, "balance_loss_mlp": 1.04529488, "epoch": 0.2825792875394559, "flos": 17236799199360.0, "grad_norm": 3.1663148127786225, "language_loss": 0.79763937, "learning_rate": 3.3672704701029265e-06, "loss": 0.81938088, "num_input_tokens_seen": 101409265, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.8828125, "step": 4700, "time_per_iteration": 2.4402315616607666 }, { "auxiliary_loss_clip": 0.0112765, "auxiliary_loss_mlp": 0.01040516, "balance_loss_clip": 1.02640176, "balance_loss_mlp": 1.04586601, "epoch": 0.28263941079212385, "flos": 26723967096960.0, "grad_norm": 1.680548655114273, "language_loss": 0.81107593, "learning_rate": 3.3669862057538402e-06, "loss": 0.83275759, "num_input_tokens_seen": 101428365, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.81640625, "step": 4701, "time_per_iteration": 2.497485399246216 }, { "auxiliary_loss_clip": 0.01132017, "auxiliary_loss_mlp": 0.01042387, "balance_loss_clip": 1.02671111, "balance_loss_mlp": 1.0467689, "epoch": 0.2826995340447918, "flos": 25921327737600.0, "grad_norm": 3.2387084831838613, "language_loss": 0.73055613, "learning_rate": 3.3667018895685004e-06, "loss": 0.7523002, "num_input_tokens_seen": 101447280, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8515625, "step": 4702, "time_per_iteration": 2.490112781524658 }, { "auxiliary_loss_clip": 0.01132058, "auxiliary_loss_mlp": 0.01038399, "balance_loss_clip": 1.02205563, "balance_loss_mlp": 1.04792786, "epoch": 0.2827596572974598, "flos": 22379709623040.0, "grad_norm": 2.430677406168262, "language_loss": 0.78453732, "learning_rate": 3.3664175215576886e-06, "loss": 0.80624187, "num_input_tokens_seen": 101465435, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.84375, "step": 4703, "time_per_iteration": 2.4634835720062256 }, { "auxiliary_loss_clip": 0.01130177, "auxiliary_loss_mlp": 0.01042811, "balance_loss_clip": 1.02628911, "balance_loss_mlp": 1.04469764, "epoch": 0.28281978055012774, "flos": 33547137880320.0, "grad_norm": 3.4428029687930426, "language_loss": 0.6942811, "learning_rate": 3.3661331017321867e-06, "loss": 0.71601099, "num_input_tokens_seen": 101486355, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.85546875, "step": 4704, "time_per_iteration": 2.5568742752075195 }, { "auxiliary_loss_clip": 0.01131363, "auxiliary_loss_mlp": 0.01040748, "balance_loss_clip": 1.02416599, "balance_loss_mlp": 1.04648542, "epoch": 0.2828799038027957, "flos": 23440870143360.0, "grad_norm": 2.0524254386113316, "language_loss": 0.7003684, "learning_rate": 3.3658486301027807e-06, "loss": 0.72208953, "num_input_tokens_seen": 101505875, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8515625, "step": 4705, "time_per_iteration": 2.4652371406555176 }, { "auxiliary_loss_clip": 0.01058365, "auxiliary_loss_mlp": 0.01011737, "balance_loss_clip": 1.00973403, "balance_loss_mlp": 1.0279808, "epoch": 0.2829400270554637, "flos": 69873690251520.0, "grad_norm": 0.7299933658844707, "language_loss": 0.59259087, "learning_rate": 3.3655641066802577e-06, "loss": 0.6132918, "num_input_tokens_seen": 101565045, "router_z_loss_clip": 0.02001953, "router_z_loss_mlp": 0.3046875, "step": 4706, "time_per_iteration": 3.135972261428833 }, { "auxiliary_loss_clip": 0.01129113, "auxiliary_loss_mlp": 0.01037109, "balance_loss_clip": 1.02357817, "balance_loss_mlp": 1.04742968, "epoch": 0.2830001503081317, "flos": 24789028331520.0, "grad_norm": 1.4913215959102017, "language_loss": 0.81970263, "learning_rate": 3.365279531475407e-06, "loss": 0.84136486, "num_input_tokens_seen": 101585825, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.81640625, "step": 4707, "time_per_iteration": 2.506624460220337 }, { "auxiliary_loss_clip": 0.01134599, "auxiliary_loss_mlp": 0.01037623, "balance_loss_clip": 1.02037358, "balance_loss_mlp": 1.04570174, "epoch": 0.28306027356079966, "flos": 27669387018240.0, "grad_norm": 12.385470300628992, "language_loss": 0.80639982, "learning_rate": 3.36499490449902e-06, "loss": 0.82812202, "num_input_tokens_seen": 101606105, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.88671875, "step": 4708, "time_per_iteration": 2.51481556892395 }, { "auxiliary_loss_clip": 0.01054627, "auxiliary_loss_mlp": 0.01005862, "balance_loss_clip": 1.00390685, "balance_loss_mlp": 1.02452946, "epoch": 0.2831203968134676, "flos": 60527938199040.0, "grad_norm": 2.721919206377024, "language_loss": 0.62865335, "learning_rate": 3.3647102257618895e-06, "loss": 0.6492582, "num_input_tokens_seen": 101656875, "router_z_loss_clip": 0.01953125, "router_z_loss_mlp": 0.30078125, "step": 4709, "time_per_iteration": 2.9783692359924316 }, { "auxiliary_loss_clip": 0.01131771, "auxiliary_loss_mlp": 0.01039479, "balance_loss_clip": 1.02360082, "balance_loss_mlp": 1.04779005, "epoch": 0.2831805200661356, "flos": 22054790171520.0, "grad_norm": 1.4144429776286562, "language_loss": 0.74253172, "learning_rate": 3.3644254952748103e-06, "loss": 0.7642442, "num_input_tokens_seen": 101676225, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8359375, "step": 4710, "time_per_iteration": 2.4740030765533447 }, { "auxiliary_loss_clip": 0.01132994, "auxiliary_loss_mlp": 0.01046886, "balance_loss_clip": 1.02968454, "balance_loss_mlp": 1.04654408, "epoch": 0.28324064331880355, "flos": 22600668136320.0, "grad_norm": 1.9063968460596288, "language_loss": 0.79028165, "learning_rate": 3.364140713048579e-06, "loss": 0.81208044, "num_input_tokens_seen": 101693710, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.86328125, "step": 4711, "time_per_iteration": 2.487467050552368 }, { "auxiliary_loss_clip": 0.01136432, "auxiliary_loss_mlp": 0.01038609, "balance_loss_clip": 1.02159834, "balance_loss_mlp": 1.04913926, "epoch": 0.2833007665714715, "flos": 30404127968640.0, "grad_norm": 1.9009144961443603, "language_loss": 0.70517015, "learning_rate": 3.363855879093996e-06, "loss": 0.7269206, "num_input_tokens_seen": 101714010, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.87109375, "step": 4712, "time_per_iteration": 2.5259032249450684 }, { "auxiliary_loss_clip": 0.01134578, "auxiliary_loss_mlp": 0.01047876, "balance_loss_clip": 1.03073347, "balance_loss_mlp": 1.04738808, "epoch": 0.2833608898241395, "flos": 23549499849600.0, "grad_norm": 6.38714358348936, "language_loss": 0.82537991, "learning_rate": 3.3635709934218605e-06, "loss": 0.84720445, "num_input_tokens_seen": 101732995, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.875, "step": 4713, "time_per_iteration": 2.4866795539855957 }, { "auxiliary_loss_clip": 0.0113339, "auxiliary_loss_mlp": 0.0103666, "balance_loss_clip": 1.01966119, "balance_loss_mlp": 1.04828835, "epoch": 0.28342101307680745, "flos": 20266726118400.0, "grad_norm": 1.752271317093026, "language_loss": 0.7556783, "learning_rate": 3.3632860560429766e-06, "loss": 0.7773788, "num_input_tokens_seen": 101751385, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8515625, "step": 4714, "time_per_iteration": 2.4572300910949707 }, { "auxiliary_loss_clip": 0.01130722, "auxiliary_loss_mlp": 0.0104148, "balance_loss_clip": 1.02564931, "balance_loss_mlp": 1.04593635, "epoch": 0.2834811363294754, "flos": 30847050576000.0, "grad_norm": 1.4814945967555986, "language_loss": 0.78107297, "learning_rate": 3.3630010669681494e-06, "loss": 0.80279493, "num_input_tokens_seen": 101773825, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84765625, "step": 4715, "time_per_iteration": 2.5582869052886963 }, { "auxiliary_loss_clip": 0.01131681, "auxiliary_loss_mlp": 0.01036252, "balance_loss_clip": 1.01996851, "balance_loss_mlp": 1.04684997, "epoch": 0.2835412595821434, "flos": 22711021695360.0, "grad_norm": 2.402417640287365, "language_loss": 0.73766732, "learning_rate": 3.3627160262081845e-06, "loss": 0.75934672, "num_input_tokens_seen": 101791920, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8515625, "step": 4716, "time_per_iteration": 2.466165065765381 }, { "auxiliary_loss_clip": 0.0113842, "auxiliary_loss_mlp": 0.01041026, "balance_loss_clip": 1.02338314, "balance_loss_mlp": 1.04701078, "epoch": 0.28360138283481134, "flos": 18077719478400.0, "grad_norm": 2.3993980527672782, "language_loss": 0.74473989, "learning_rate": 3.3624309337738917e-06, "loss": 0.76653433, "num_input_tokens_seen": 101809515, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9140625, "step": 4717, "time_per_iteration": 2.4536995887756348 }, { "auxiliary_loss_clip": 0.01136632, "auxiliary_loss_mlp": 0.01041125, "balance_loss_clip": 1.02445972, "balance_loss_mlp": 1.04832852, "epoch": 0.2836615060874793, "flos": 17854785717120.0, "grad_norm": 1.663937191529075, "language_loss": 0.66973209, "learning_rate": 3.3621457896760813e-06, "loss": 0.69150966, "num_input_tokens_seen": 101827735, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8828125, "step": 4718, "time_per_iteration": 3.9115090370178223 }, { "auxiliary_loss_clip": 0.01138024, "auxiliary_loss_mlp": 0.01042544, "balance_loss_clip": 1.02541399, "balance_loss_mlp": 1.04893422, "epoch": 0.2837216293401473, "flos": 25740302169600.0, "grad_norm": 2.0499590508269487, "language_loss": 0.7209022, "learning_rate": 3.361860593925566e-06, "loss": 0.74270797, "num_input_tokens_seen": 101845970, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.890625, "step": 4719, "time_per_iteration": 2.5109617710113525 }, { "auxiliary_loss_clip": 0.01132519, "auxiliary_loss_mlp": 0.01038234, "balance_loss_clip": 1.02199769, "balance_loss_mlp": 1.04881787, "epoch": 0.2837817525928153, "flos": 20923532259840.0, "grad_norm": 1.809507359981325, "language_loss": 0.80428976, "learning_rate": 3.3615753465331605e-06, "loss": 0.82599729, "num_input_tokens_seen": 101865040, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8359375, "step": 4720, "time_per_iteration": 3.87336802482605 }, { "auxiliary_loss_clip": 0.01136313, "auxiliary_loss_mlp": 0.0104503, "balance_loss_clip": 1.02813852, "balance_loss_mlp": 1.04937088, "epoch": 0.28384187584548326, "flos": 18916700423040.0, "grad_norm": 1.884075931729606, "language_loss": 0.79353535, "learning_rate": 3.3612900475096817e-06, "loss": 0.81534874, "num_input_tokens_seen": 101883735, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8671875, "step": 4721, "time_per_iteration": 3.861086368560791 }, { "auxiliary_loss_clip": 0.0113221, "auxiliary_loss_mlp": 0.01037711, "balance_loss_clip": 1.02111673, "balance_loss_mlp": 1.04687285, "epoch": 0.2839019990981512, "flos": 27343964776320.0, "grad_norm": 1.9426402753750536, "language_loss": 0.82818276, "learning_rate": 3.3610046968659474e-06, "loss": 0.84988189, "num_input_tokens_seen": 101903025, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.85546875, "step": 4722, "time_per_iteration": 3.9532032012939453 }, { "auxiliary_loss_clip": 0.01137738, "auxiliary_loss_mlp": 0.01034675, "balance_loss_clip": 1.0193212, "balance_loss_mlp": 1.05137491, "epoch": 0.2839621223508192, "flos": 18114312458880.0, "grad_norm": 2.556132044522662, "language_loss": 0.70060647, "learning_rate": 3.3607192946127785e-06, "loss": 0.72233057, "num_input_tokens_seen": 101922255, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.86328125, "step": 4723, "time_per_iteration": 2.453517198562622 }, { "auxiliary_loss_clip": 0.01134593, "auxiliary_loss_mlp": 0.01042057, "balance_loss_clip": 1.02503383, "balance_loss_mlp": 1.04932308, "epoch": 0.28402224560348716, "flos": 26358360514560.0, "grad_norm": 1.5058355567117057, "language_loss": 0.78531909, "learning_rate": 3.360433840760998e-06, "loss": 0.80708551, "num_input_tokens_seen": 101943100, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8515625, "step": 4724, "time_per_iteration": 2.5057272911071777 }, { "auxiliary_loss_clip": 0.01136482, "auxiliary_loss_mlp": 0.01042826, "balance_loss_clip": 1.02653062, "balance_loss_mlp": 1.05015254, "epoch": 0.2840823688561551, "flos": 24060795995520.0, "grad_norm": 28.10895659385587, "language_loss": 0.9228074, "learning_rate": 3.36014833532143e-06, "loss": 0.94460046, "num_input_tokens_seen": 101963160, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.86328125, "step": 4725, "time_per_iteration": 2.505505323410034 }, { "auxiliary_loss_clip": 0.01138863, "auxiliary_loss_mlp": 0.01040406, "balance_loss_clip": 1.02343035, "balance_loss_mlp": 1.05074155, "epoch": 0.2841424921088231, "flos": 29459821368960.0, "grad_norm": 1.5682154313137646, "language_loss": 0.88817018, "learning_rate": 3.3598627783049e-06, "loss": 0.90996289, "num_input_tokens_seen": 101984300, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.87890625, "step": 4726, "time_per_iteration": 2.5205931663513184 }, { "auxiliary_loss_clip": 0.01140751, "auxiliary_loss_mlp": 0.01040993, "balance_loss_clip": 1.02442265, "balance_loss_mlp": 1.05300379, "epoch": 0.28420261536149105, "flos": 48100367053440.0, "grad_norm": 1.9131774198179765, "language_loss": 0.78550082, "learning_rate": 3.359577169722238e-06, "loss": 0.80731827, "num_input_tokens_seen": 102005765, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.87890625, "step": 4727, "time_per_iteration": 2.702305793762207 }, { "auxiliary_loss_clip": 0.01134982, "auxiliary_loss_mlp": 0.01039549, "balance_loss_clip": 1.02495766, "balance_loss_mlp": 1.05193925, "epoch": 0.284262738614159, "flos": 25666146541440.0, "grad_norm": 3.1379058234082033, "language_loss": 0.66309613, "learning_rate": 3.3592915095842733e-06, "loss": 0.68484139, "num_input_tokens_seen": 102022755, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.828125, "step": 4728, "time_per_iteration": 2.4990599155426025 }, { "auxiliary_loss_clip": 0.01132246, "auxiliary_loss_mlp": 0.01044401, "balance_loss_clip": 1.02796221, "balance_loss_mlp": 1.04750478, "epoch": 0.284322861866827, "flos": 19718980646400.0, "grad_norm": 2.063108497694796, "language_loss": 0.76223814, "learning_rate": 3.3590057979018386e-06, "loss": 0.78400469, "num_input_tokens_seen": 102041850, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.84765625, "step": 4729, "time_per_iteration": 2.4794201850891113 }, { "auxiliary_loss_clip": 0.0114067, "auxiliary_loss_mlp": 0.0104673, "balance_loss_clip": 1.03019571, "balance_loss_mlp": 1.05185926, "epoch": 0.28438298511949495, "flos": 23915250086400.0, "grad_norm": 1.7184516731523425, "language_loss": 0.66588783, "learning_rate": 3.3587200346857674e-06, "loss": 0.68776178, "num_input_tokens_seen": 102059500, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.88671875, "step": 4730, "time_per_iteration": 2.480522394180298 }, { "auxiliary_loss_clip": 0.01138573, "auxiliary_loss_mlp": 0.01038106, "balance_loss_clip": 1.0219059, "balance_loss_mlp": 1.05130935, "epoch": 0.2844431083721629, "flos": 26067340523520.0, "grad_norm": 1.6456524848253418, "language_loss": 0.74268937, "learning_rate": 3.3584342199468965e-06, "loss": 0.76445609, "num_input_tokens_seen": 102080460, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.87109375, "step": 4731, "time_per_iteration": 2.5260608196258545 }, { "auxiliary_loss_clip": 0.01135157, "auxiliary_loss_mlp": 0.01036512, "balance_loss_clip": 1.02079999, "balance_loss_mlp": 1.04849172, "epoch": 0.2845032316248309, "flos": 25810435474560.0, "grad_norm": 1.6961116532717184, "language_loss": 0.83608443, "learning_rate": 3.3581483536960638e-06, "loss": 0.85780108, "num_input_tokens_seen": 102100950, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8671875, "step": 4732, "time_per_iteration": 2.4942800998687744 }, { "auxiliary_loss_clip": 0.01137472, "auxiliary_loss_mlp": 0.01046038, "balance_loss_clip": 1.02924097, "balance_loss_mlp": 1.05037546, "epoch": 0.2845633548774989, "flos": 19823192979840.0, "grad_norm": 1.8246031411029138, "language_loss": 0.78896689, "learning_rate": 3.357862435944109e-06, "loss": 0.81080198, "num_input_tokens_seen": 102119345, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.87109375, "step": 4733, "time_per_iteration": 2.472062587738037 }, { "auxiliary_loss_clip": 0.0114256, "auxiliary_loss_mlp": 0.01045989, "balance_loss_clip": 1.02935982, "balance_loss_mlp": 1.0521431, "epoch": 0.28462347813016686, "flos": 23182815859200.0, "grad_norm": 3.3004229928283757, "language_loss": 0.71081805, "learning_rate": 3.357576466701875e-06, "loss": 0.73270357, "num_input_tokens_seen": 102139050, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.90625, "step": 4734, "time_per_iteration": 2.497408628463745 }, { "auxiliary_loss_clip": 0.01131444, "auxiliary_loss_mlp": 0.01034341, "balance_loss_clip": 1.0189873, "balance_loss_mlp": 1.04670358, "epoch": 0.2846836013828348, "flos": 18660477732480.0, "grad_norm": 1.7426088739326626, "language_loss": 0.74127781, "learning_rate": 3.3572904459802056e-06, "loss": 0.76293564, "num_input_tokens_seen": 102157935, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.84765625, "step": 4735, "time_per_iteration": 2.4707870483398438 }, { "auxiliary_loss_clip": 0.01134153, "auxiliary_loss_mlp": 0.01046126, "balance_loss_clip": 1.03090322, "balance_loss_mlp": 1.04817796, "epoch": 0.2847437246355028, "flos": 14173511523840.0, "grad_norm": 2.1240895209337767, "language_loss": 0.79744029, "learning_rate": 3.357004373789946e-06, "loss": 0.81924307, "num_input_tokens_seen": 102175325, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.859375, "step": 4736, "time_per_iteration": 2.4379019737243652 }, { "auxiliary_loss_clip": 0.01136827, "auxiliary_loss_mlp": 0.01043253, "balance_loss_clip": 1.02727914, "balance_loss_mlp": 1.05037081, "epoch": 0.28480384788817076, "flos": 29278364837760.0, "grad_norm": 8.798450198103916, "language_loss": 0.60212672, "learning_rate": 3.3567182501419453e-06, "loss": 0.62392753, "num_input_tokens_seen": 102196625, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.86328125, "step": 4737, "time_per_iteration": 2.5652883052825928 }, { "auxiliary_loss_clip": 0.01130246, "auxiliary_loss_mlp": 0.0103611, "balance_loss_clip": 1.02149475, "balance_loss_mlp": 1.04727602, "epoch": 0.2848639711408387, "flos": 22601314581120.0, "grad_norm": 1.885323219328637, "language_loss": 0.8646127, "learning_rate": 3.356432075047052e-06, "loss": 0.88627625, "num_input_tokens_seen": 102214975, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.828125, "step": 4738, "time_per_iteration": 2.4835622310638428 }, { "auxiliary_loss_clip": 0.01136406, "auxiliary_loss_mlp": 0.01047261, "balance_loss_clip": 1.02955866, "balance_loss_mlp": 1.04765987, "epoch": 0.2849240943935067, "flos": 17599460866560.0, "grad_norm": 2.046325646361376, "language_loss": 0.89789557, "learning_rate": 3.356145848516118e-06, "loss": 0.91973221, "num_input_tokens_seen": 102231885, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.890625, "step": 4739, "time_per_iteration": 2.4538729190826416 }, { "auxiliary_loss_clip": 0.01135925, "auxiliary_loss_mlp": 0.01042032, "balance_loss_clip": 1.02620125, "balance_loss_mlp": 1.05142641, "epoch": 0.28498421764617465, "flos": 24862573428480.0, "grad_norm": 1.5038525496078459, "language_loss": 0.72297508, "learning_rate": 3.355859570559998e-06, "loss": 0.74475467, "num_input_tokens_seen": 102252725, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84375, "step": 4740, "time_per_iteration": 2.4934005737304688 }, { "auxiliary_loss_clip": 0.01133152, "auxiliary_loss_mlp": 0.01038099, "balance_loss_clip": 1.0233891, "balance_loss_mlp": 1.05004847, "epoch": 0.2850443408988426, "flos": 22782555630720.0, "grad_norm": 1.5795908885278795, "language_loss": 0.77761489, "learning_rate": 3.3555732411895477e-06, "loss": 0.79932743, "num_input_tokens_seen": 102271730, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.83203125, "step": 4741, "time_per_iteration": 2.5275063514709473 }, { "auxiliary_loss_clip": 0.01136714, "auxiliary_loss_mlp": 0.01047095, "balance_loss_clip": 1.03060877, "balance_loss_mlp": 1.04709005, "epoch": 0.2851044641515106, "flos": 18844053166080.0, "grad_norm": 1.6320010165161845, "language_loss": 0.76186025, "learning_rate": 3.3552868604156235e-06, "loss": 0.78369832, "num_input_tokens_seen": 102291325, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.89453125, "step": 4742, "time_per_iteration": 2.44331955909729 }, { "auxiliary_loss_clip": 0.01138391, "auxiliary_loss_mlp": 0.01047286, "balance_loss_clip": 1.02930903, "balance_loss_mlp": 1.04862285, "epoch": 0.28516458740417855, "flos": 18880502492160.0, "grad_norm": 2.252451339469594, "language_loss": 0.57738, "learning_rate": 3.355000428249086e-06, "loss": 0.59923673, "num_input_tokens_seen": 102309000, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.8984375, "step": 4743, "time_per_iteration": 2.4573957920074463 }, { "auxiliary_loss_clip": 0.01138878, "auxiliary_loss_mlp": 0.01047796, "balance_loss_clip": 1.03145218, "balance_loss_mlp": 1.05075824, "epoch": 0.2852247106568465, "flos": 25299821687040.0, "grad_norm": 1.9840499845163087, "language_loss": 0.74415946, "learning_rate": 3.354713944700797e-06, "loss": 0.7660262, "num_input_tokens_seen": 102329240, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8828125, "step": 4744, "time_per_iteration": 2.4875431060791016 }, { "auxiliary_loss_clip": 0.01132135, "auxiliary_loss_mlp": 0.0104134, "balance_loss_clip": 1.02614677, "balance_loss_mlp": 1.04792595, "epoch": 0.2852848339095145, "flos": 11655383541120.0, "grad_norm": 5.119617160920272, "language_loss": 0.77784628, "learning_rate": 3.3544274097816185e-06, "loss": 0.79958099, "num_input_tokens_seen": 102344440, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.84375, "step": 4745, "time_per_iteration": 2.4498236179351807 }, { "auxiliary_loss_clip": 0.01126237, "auxiliary_loss_mlp": 0.01037726, "balance_loss_clip": 1.02331376, "balance_loss_mlp": 1.0466888, "epoch": 0.2853449571621825, "flos": 12933228856320.0, "grad_norm": 1.8407971253029718, "language_loss": 0.82538152, "learning_rate": 3.3541408235024173e-06, "loss": 0.8470211, "num_input_tokens_seen": 102360985, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.796875, "step": 4746, "time_per_iteration": 2.423583984375 }, { "auxiliary_loss_clip": 0.01136966, "auxiliary_loss_mlp": 0.01039425, "balance_loss_clip": 1.02277088, "balance_loss_mlp": 1.04762411, "epoch": 0.28540508041485046, "flos": 20010575255040.0, "grad_norm": 1.8222464789267914, "language_loss": 0.79477608, "learning_rate": 3.3538541858740604e-06, "loss": 0.81653994, "num_input_tokens_seen": 102380320, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.89453125, "step": 4747, "time_per_iteration": 2.4590752124786377 }, { "auxiliary_loss_clip": 0.01061893, "auxiliary_loss_mlp": 0.01000419, "balance_loss_clip": 0.99838018, "balance_loss_mlp": 1.03259373, "epoch": 0.28546520366751843, "flos": 68139349966080.0, "grad_norm": 0.7724175750167275, "language_loss": 0.605003, "learning_rate": 3.3535674969074173e-06, "loss": 0.62562609, "num_input_tokens_seen": 102439140, "router_z_loss_clip": 0.02038574, "router_z_loss_mlp": 0.29296875, "step": 4748, "time_per_iteration": 3.0662858486175537 }, { "auxiliary_loss_clip": 0.01131096, "auxiliary_loss_mlp": 0.01039155, "balance_loss_clip": 1.02347875, "balance_loss_mlp": 1.04578471, "epoch": 0.2855253269201864, "flos": 13251540205440.0, "grad_norm": 2.3095719462878486, "language_loss": 0.80967367, "learning_rate": 3.3532807566133592e-06, "loss": 0.83137619, "num_input_tokens_seen": 102450990, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.85546875, "step": 4749, "time_per_iteration": 2.4127280712127686 }, { "auxiliary_loss_clip": 0.01131378, "auxiliary_loss_mlp": 0.0104033, "balance_loss_clip": 1.02472568, "balance_loss_mlp": 1.04631472, "epoch": 0.28558545017285436, "flos": 28620876337920.0, "grad_norm": 1.977526904920614, "language_loss": 0.70516992, "learning_rate": 3.3529939650027587e-06, "loss": 0.72688699, "num_input_tokens_seen": 102471820, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8515625, "step": 4750, "time_per_iteration": 2.5238823890686035 }, { "auxiliary_loss_clip": 0.01129817, "auxiliary_loss_mlp": 0.01036751, "balance_loss_clip": 1.02117586, "balance_loss_mlp": 1.04788446, "epoch": 0.2856455734255223, "flos": 34130470752000.0, "grad_norm": 1.6411250892725087, "language_loss": 0.81990552, "learning_rate": 3.3527071220864917e-06, "loss": 0.84157121, "num_input_tokens_seen": 102492625, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8203125, "step": 4751, "time_per_iteration": 2.597259283065796 }, { "auxiliary_loss_clip": 0.01129505, "auxiliary_loss_mlp": 0.010398, "balance_loss_clip": 1.02483964, "balance_loss_mlp": 1.04603076, "epoch": 0.2857056966781903, "flos": 39786149779200.0, "grad_norm": 3.619871516431003, "language_loss": 0.80092657, "learning_rate": 3.3524202278754353e-06, "loss": 0.82261956, "num_input_tokens_seen": 102514145, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8359375, "step": 4752, "time_per_iteration": 2.6151862144470215 }, { "auxiliary_loss_clip": 0.01131954, "auxiliary_loss_mlp": 0.01039837, "balance_loss_clip": 1.02354097, "balance_loss_mlp": 1.046139, "epoch": 0.28576581993085826, "flos": 21872292145920.0, "grad_norm": 1.7142032231260595, "language_loss": 0.78314412, "learning_rate": 3.3521332823804676e-06, "loss": 0.80486202, "num_input_tokens_seen": 102532365, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.859375, "step": 4753, "time_per_iteration": 2.548428535461426 }, { "auxiliary_loss_clip": 0.01136969, "auxiliary_loss_mlp": 0.01041854, "balance_loss_clip": 1.02487898, "balance_loss_mlp": 1.04831755, "epoch": 0.2858259431835262, "flos": 19091656592640.0, "grad_norm": 9.787856930838863, "language_loss": 0.89616466, "learning_rate": 3.3518462856124704e-06, "loss": 0.9179529, "num_input_tokens_seen": 102548425, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.88671875, "step": 4754, "time_per_iteration": 2.4473862648010254 }, { "auxiliary_loss_clip": 0.01128708, "auxiliary_loss_mlp": 0.01040417, "balance_loss_clip": 1.02534902, "balance_loss_mlp": 1.04665279, "epoch": 0.2858860664361942, "flos": 20334309557760.0, "grad_norm": 2.0100189704218874, "language_loss": 0.82720423, "learning_rate": 3.3515592375823267e-06, "loss": 0.84889555, "num_input_tokens_seen": 102566370, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8203125, "step": 4755, "time_per_iteration": 2.4995319843292236 }, { "auxiliary_loss_clip": 0.0113113, "auxiliary_loss_mlp": 0.01038286, "balance_loss_clip": 1.02302742, "balance_loss_mlp": 1.04554296, "epoch": 0.28594618968886215, "flos": 24461738582400.0, "grad_norm": 3.6864050419794654, "language_loss": 0.83865666, "learning_rate": 3.351272138300922e-06, "loss": 0.86035079, "num_input_tokens_seen": 102588715, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.85546875, "step": 4756, "time_per_iteration": 2.503872871398926 }, { "auxiliary_loss_clip": 0.01058734, "auxiliary_loss_mlp": 0.01004137, "balance_loss_clip": 1.00206244, "balance_loss_mlp": 1.02916121, "epoch": 0.2860063129415301, "flos": 71652850709760.0, "grad_norm": 0.8480753486925144, "language_loss": 0.60966909, "learning_rate": 3.350984987779142e-06, "loss": 0.63029778, "num_input_tokens_seen": 102656715, "router_z_loss_clip": 0.02075195, "router_z_loss_mlp": 0.296875, "step": 4757, "time_per_iteration": 3.234865665435791 }, { "auxiliary_loss_clip": 0.0113295, "auxiliary_loss_mlp": 0.01038185, "balance_loss_clip": 1.0230391, "balance_loss_mlp": 1.04894674, "epoch": 0.2860664361941981, "flos": 20558679863040.0, "grad_norm": 3.448185615087586, "language_loss": 0.66112512, "learning_rate": 3.3506977860278756e-06, "loss": 0.68283647, "num_input_tokens_seen": 102676545, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.83984375, "step": 4758, "time_per_iteration": 2.4770076274871826 }, { "auxiliary_loss_clip": 0.01132985, "auxiliary_loss_mlp": 0.01032786, "balance_loss_clip": 1.01722908, "balance_loss_mlp": 1.04678583, "epoch": 0.2861265594468661, "flos": 35996389534080.0, "grad_norm": 1.3577343200509662, "language_loss": 0.62828749, "learning_rate": 3.3504105330580143e-06, "loss": 0.64994526, "num_input_tokens_seen": 102702875, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.86328125, "step": 4759, "time_per_iteration": 4.194149494171143 }, { "auxiliary_loss_clip": 0.01134237, "auxiliary_loss_mlp": 0.01040717, "balance_loss_clip": 1.02440929, "balance_loss_mlp": 1.0497632, "epoch": 0.28618668269953407, "flos": 20047419630720.0, "grad_norm": 1.8753062707219743, "language_loss": 0.73982608, "learning_rate": 3.3501232288804496e-06, "loss": 0.76157564, "num_input_tokens_seen": 102723160, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.84375, "step": 4760, "time_per_iteration": 2.5073013305664062 }, { "auxiliary_loss_clip": 0.01128201, "auxiliary_loss_mlp": 0.01036865, "balance_loss_clip": 1.02223802, "balance_loss_mlp": 1.04756546, "epoch": 0.28624680595220203, "flos": 24971849579520.0, "grad_norm": 2.159831835700119, "language_loss": 0.7182337, "learning_rate": 3.3498358735060773e-06, "loss": 0.73988438, "num_input_tokens_seen": 102743855, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8046875, "step": 4761, "time_per_iteration": 2.496798515319824 }, { "auxiliary_loss_clip": 0.01131409, "auxiliary_loss_mlp": 0.01044662, "balance_loss_clip": 1.02915263, "balance_loss_mlp": 1.04595804, "epoch": 0.28630692920487, "flos": 22492253911680.0, "grad_norm": 1.9892160357474276, "language_loss": 0.74149626, "learning_rate": 3.349548466945793e-06, "loss": 0.76325697, "num_input_tokens_seen": 102761370, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.85546875, "step": 4762, "time_per_iteration": 3.8484153747558594 }, { "auxiliary_loss_clip": 0.01132493, "auxiliary_loss_mlp": 0.0104221, "balance_loss_clip": 1.02684379, "balance_loss_mlp": 1.04944587, "epoch": 0.28636705245753796, "flos": 21249888255360.0, "grad_norm": 1.424141140371822, "language_loss": 0.76091045, "learning_rate": 3.349261009210496e-06, "loss": 0.7826575, "num_input_tokens_seen": 102780885, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.828125, "step": 4763, "time_per_iteration": 5.277761220932007 }, { "auxiliary_loss_clip": 0.01130193, "auxiliary_loss_mlp": 0.01035476, "balance_loss_clip": 1.01923966, "balance_loss_mlp": 1.04552507, "epoch": 0.28642717571020593, "flos": 24095772864000.0, "grad_norm": 1.7134803429222991, "language_loss": 0.7697857, "learning_rate": 3.348973500311086e-06, "loss": 0.79144239, "num_input_tokens_seen": 102801000, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.84765625, "step": 4764, "time_per_iteration": 2.519638776779175 }, { "auxiliary_loss_clip": 0.01133686, "auxiliary_loss_mlp": 0.01041996, "balance_loss_clip": 1.02430534, "balance_loss_mlp": 1.04795349, "epoch": 0.2864872989628739, "flos": 22601386408320.0, "grad_norm": 2.520492397315918, "language_loss": 0.7088483, "learning_rate": 3.348685940258466e-06, "loss": 0.73060513, "num_input_tokens_seen": 102820230, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.85546875, "step": 4765, "time_per_iteration": 2.458803415298462 }, { "auxiliary_loss_clip": 0.01127894, "auxiliary_loss_mlp": 0.01032864, "balance_loss_clip": 1.01781344, "balance_loss_mlp": 1.04525959, "epoch": 0.28654742221554186, "flos": 32745073138560.0, "grad_norm": 1.4408628837244872, "language_loss": 0.76149237, "learning_rate": 3.3483983290635395e-06, "loss": 0.78309995, "num_input_tokens_seen": 102842670, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.82421875, "step": 4766, "time_per_iteration": 2.5803580284118652 }, { "auxiliary_loss_clip": 0.01128431, "auxiliary_loss_mlp": 0.0103751, "balance_loss_clip": 1.02209628, "balance_loss_mlp": 1.04606819, "epoch": 0.2866075454682098, "flos": 26981626331520.0, "grad_norm": 1.9720886533254751, "language_loss": 0.77938032, "learning_rate": 3.348110666737214e-06, "loss": 0.8010397, "num_input_tokens_seen": 102864480, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.82421875, "step": 4767, "time_per_iteration": 2.504606246948242 }, { "auxiliary_loss_clip": 0.01128933, "auxiliary_loss_mlp": 0.0103975, "balance_loss_clip": 1.02449107, "balance_loss_mlp": 1.04570222, "epoch": 0.2866676687208778, "flos": 23253847004160.0, "grad_norm": 2.4734501467104706, "language_loss": 0.65261769, "learning_rate": 3.3478229532903956e-06, "loss": 0.67430449, "num_input_tokens_seen": 102883740, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83203125, "step": 4768, "time_per_iteration": 2.479811906814575 }, { "auxiliary_loss_clip": 0.01132227, "auxiliary_loss_mlp": 0.0104284, "balance_loss_clip": 1.02644849, "balance_loss_mlp": 1.04546261, "epoch": 0.28672779197354575, "flos": 21579727870080.0, "grad_norm": 1.6409316490079922, "language_loss": 0.70554686, "learning_rate": 3.3475351887339967e-06, "loss": 0.72729748, "num_input_tokens_seen": 102902945, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8671875, "step": 4769, "time_per_iteration": 2.4825360774993896 }, { "auxiliary_loss_clip": 0.01131094, "auxiliary_loss_mlp": 0.01032415, "balance_loss_clip": 1.01787138, "balance_loss_mlp": 1.04602098, "epoch": 0.2867879152262137, "flos": 19865568049920.0, "grad_norm": 1.7926736964487817, "language_loss": 0.74748904, "learning_rate": 3.3472473730789288e-06, "loss": 0.76912415, "num_input_tokens_seen": 102922405, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8515625, "step": 4770, "time_per_iteration": 2.4741244316101074 }, { "auxiliary_loss_clip": 0.01133704, "auxiliary_loss_mlp": 0.01040117, "balance_loss_clip": 1.02422607, "balance_loss_mlp": 1.04743898, "epoch": 0.2868480384788817, "flos": 28213325648640.0, "grad_norm": 2.2053490753145133, "language_loss": 0.67395163, "learning_rate": 3.3469595063361045e-06, "loss": 0.69568986, "num_input_tokens_seen": 102938980, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.86328125, "step": 4771, "time_per_iteration": 2.4895763397216797 }, { "auxiliary_loss_clip": 0.01057571, "auxiliary_loss_mlp": 0.0101662, "balance_loss_clip": 1.0143429, "balance_loss_mlp": 1.02796412, "epoch": 0.2869081617315497, "flos": 65424286690560.0, "grad_norm": 0.7761956974151357, "language_loss": 0.56836599, "learning_rate": 3.3466715885164414e-06, "loss": 0.58910793, "num_input_tokens_seen": 103000405, "router_z_loss_clip": 0.02282715, "router_z_loss_mlp": 0.296875, "step": 4772, "time_per_iteration": 3.0528178215026855 }, { "auxiliary_loss_clip": 0.01133671, "auxiliary_loss_mlp": 0.01036227, "balance_loss_clip": 1.02074146, "balance_loss_mlp": 1.04755998, "epoch": 0.28696828498421767, "flos": 18660729127680.0, "grad_norm": 2.8232346191573043, "language_loss": 0.82734883, "learning_rate": 3.346383619630856e-06, "loss": 0.84904778, "num_input_tokens_seen": 103017970, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.859375, "step": 4773, "time_per_iteration": 2.468287706375122 }, { "auxiliary_loss_clip": 0.01130035, "auxiliary_loss_mlp": 0.01040929, "balance_loss_clip": 1.02403736, "balance_loss_mlp": 1.04365194, "epoch": 0.28702840823688563, "flos": 23659745667840.0, "grad_norm": 2.24701798867789, "language_loss": 0.77438426, "learning_rate": 3.34609559969027e-06, "loss": 0.79609388, "num_input_tokens_seen": 103036385, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.86328125, "step": 4774, "time_per_iteration": 2.4975733757019043 }, { "auxiliary_loss_clip": 0.01130928, "auxiliary_loss_mlp": 0.01036912, "balance_loss_clip": 1.02048516, "balance_loss_mlp": 1.04732323, "epoch": 0.2870885314895536, "flos": 13804744544640.0, "grad_norm": 2.2743057477061077, "language_loss": 0.73339474, "learning_rate": 3.3458075287056034e-06, "loss": 0.75507307, "num_input_tokens_seen": 103052170, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8359375, "step": 4775, "time_per_iteration": 2.4508376121520996 }, { "auxiliary_loss_clip": 0.0113512, "auxiliary_loss_mlp": 0.01042047, "balance_loss_clip": 1.02665734, "balance_loss_mlp": 1.04883528, "epoch": 0.28714865474222157, "flos": 17786771314560.0, "grad_norm": 2.4331166600404397, "language_loss": 0.88143575, "learning_rate": 3.34551940668778e-06, "loss": 0.90320742, "num_input_tokens_seen": 103070510, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.86328125, "step": 4776, "time_per_iteration": 2.470951795578003 }, { "auxiliary_loss_clip": 0.01131908, "auxiliary_loss_mlp": 0.01034589, "balance_loss_clip": 1.01950359, "balance_loss_mlp": 1.04736698, "epoch": 0.28720877799488953, "flos": 15997486199040.0, "grad_norm": 2.1407214870124744, "language_loss": 0.74156886, "learning_rate": 3.345231233647726e-06, "loss": 0.7632339, "num_input_tokens_seen": 103089590, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.84375, "step": 4777, "time_per_iteration": 2.4350719451904297 }, { "auxiliary_loss_clip": 0.01139836, "auxiliary_loss_mlp": 0.01037171, "balance_loss_clip": 1.01994586, "balance_loss_mlp": 1.05080521, "epoch": 0.2872689012475575, "flos": 20923137210240.0, "grad_norm": 3.3560301306029445, "language_loss": 0.79742837, "learning_rate": 3.3449430095963696e-06, "loss": 0.81919849, "num_input_tokens_seen": 103109080, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.890625, "step": 4778, "time_per_iteration": 2.482659101486206 }, { "auxiliary_loss_clip": 0.01130534, "auxiliary_loss_mlp": 0.01035894, "balance_loss_clip": 1.01928854, "balance_loss_mlp": 1.047369, "epoch": 0.28732902450022546, "flos": 21325121291520.0, "grad_norm": 1.5741669219200538, "language_loss": 0.73888582, "learning_rate": 3.3446547345446386e-06, "loss": 0.76055008, "num_input_tokens_seen": 103127755, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.83203125, "step": 4779, "time_per_iteration": 2.4675467014312744 }, { "auxiliary_loss_clip": 0.01133471, "auxiliary_loss_mlp": 0.01037421, "balance_loss_clip": 1.02027869, "balance_loss_mlp": 1.04778171, "epoch": 0.2873891477528934, "flos": 20850382212480.0, "grad_norm": 1.5580994803068493, "language_loss": 0.76257581, "learning_rate": 3.3443664085034656e-06, "loss": 0.78428471, "num_input_tokens_seen": 103147035, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.859375, "step": 4780, "time_per_iteration": 2.483325958251953 }, { "auxiliary_loss_clip": 0.01128636, "auxiliary_loss_mlp": 0.01032665, "balance_loss_clip": 1.01812732, "balance_loss_mlp": 1.04644501, "epoch": 0.2874492710055614, "flos": 17420051410560.0, "grad_norm": 1.8504626513481528, "language_loss": 0.81207275, "learning_rate": 3.344078031483784e-06, "loss": 0.83368576, "num_input_tokens_seen": 103165410, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8203125, "step": 4781, "time_per_iteration": 2.440098762512207 }, { "auxiliary_loss_clip": 0.01135949, "auxiliary_loss_mlp": 0.01038731, "balance_loss_clip": 1.02081442, "balance_loss_mlp": 1.04859757, "epoch": 0.28750939425822936, "flos": 13406818700160.0, "grad_norm": 2.2014263367024176, "language_loss": 0.86536855, "learning_rate": 3.3437896034965283e-06, "loss": 0.88711536, "num_input_tokens_seen": 103183710, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.875, "step": 4782, "time_per_iteration": 2.4627480506896973 }, { "auxiliary_loss_clip": 0.01135237, "auxiliary_loss_mlp": 0.01038657, "balance_loss_clip": 1.02109194, "balance_loss_mlp": 1.05092585, "epoch": 0.2875695175108973, "flos": 21870029589120.0, "grad_norm": 1.5504925759959818, "language_loss": 0.70897341, "learning_rate": 3.3435011245526357e-06, "loss": 0.73071241, "num_input_tokens_seen": 103203790, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.84375, "step": 4783, "time_per_iteration": 2.475428819656372 }, { "auxiliary_loss_clip": 0.01134204, "auxiliary_loss_mlp": 0.0103909, "balance_loss_clip": 1.02187598, "balance_loss_mlp": 1.04994845, "epoch": 0.2876296407635653, "flos": 26245457089920.0, "grad_norm": 2.7658984046398296, "language_loss": 0.76780492, "learning_rate": 3.343212594663047e-06, "loss": 0.78953785, "num_input_tokens_seen": 103223925, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.84375, "step": 4784, "time_per_iteration": 2.5379533767700195 }, { "auxiliary_loss_clip": 0.01131838, "auxiliary_loss_mlp": 0.01035925, "balance_loss_clip": 1.01991487, "balance_loss_mlp": 1.04992354, "epoch": 0.28768976401623325, "flos": 25373654092800.0, "grad_norm": 1.4415522101031355, "language_loss": 0.7612136, "learning_rate": 3.3429240138387015e-06, "loss": 0.78289127, "num_input_tokens_seen": 103244760, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8203125, "step": 4785, "time_per_iteration": 2.5074899196624756 }, { "auxiliary_loss_clip": 0.01134258, "auxiliary_loss_mlp": 0.01039383, "balance_loss_clip": 1.02361166, "balance_loss_mlp": 1.04927731, "epoch": 0.28774988726890127, "flos": 30664372982400.0, "grad_norm": 1.8860406449826974, "language_loss": 0.82884783, "learning_rate": 3.3426353820905425e-06, "loss": 0.85058427, "num_input_tokens_seen": 103261995, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84765625, "step": 4786, "time_per_iteration": 2.566514015197754 }, { "auxiliary_loss_clip": 0.01133591, "auxiliary_loss_mlp": 0.01034577, "balance_loss_clip": 1.01968217, "balance_loss_mlp": 1.04914284, "epoch": 0.28781001052156924, "flos": 20595452411520.0, "grad_norm": 1.7687225854062654, "language_loss": 0.79890788, "learning_rate": 3.342346699429516e-06, "loss": 0.82058954, "num_input_tokens_seen": 103279780, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.84375, "step": 4787, "time_per_iteration": 2.4819228649139404 }, { "auxiliary_loss_clip": 0.01134014, "auxiliary_loss_mlp": 0.01038308, "balance_loss_clip": 1.02204752, "balance_loss_mlp": 1.04832566, "epoch": 0.2878701337742372, "flos": 26542330997760.0, "grad_norm": 1.872360239207458, "language_loss": 0.83274508, "learning_rate": 3.3420579658665677e-06, "loss": 0.85446835, "num_input_tokens_seen": 103300580, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.859375, "step": 4788, "time_per_iteration": 2.5345325469970703 }, { "auxiliary_loss_clip": 0.01139311, "auxiliary_loss_mlp": 0.01045331, "balance_loss_clip": 1.02889192, "balance_loss_mlp": 1.05119789, "epoch": 0.28793025702690517, "flos": 28146855530880.0, "grad_norm": 2.158994788709427, "language_loss": 0.74031931, "learning_rate": 3.3417691814126468e-06, "loss": 0.76216573, "num_input_tokens_seen": 103320430, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8828125, "step": 4789, "time_per_iteration": 2.5233194828033447 }, { "auxiliary_loss_clip": 0.01128216, "auxiliary_loss_mlp": 0.01040794, "balance_loss_clip": 1.02535677, "balance_loss_mlp": 1.04588652, "epoch": 0.28799038027957313, "flos": 23805471144960.0, "grad_norm": 1.959133004982343, "language_loss": 0.83762527, "learning_rate": 3.341480346078704e-06, "loss": 0.8593154, "num_input_tokens_seen": 103337695, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.82421875, "step": 4790, "time_per_iteration": 2.506484031677246 }, { "auxiliary_loss_clip": 0.01134664, "auxiliary_loss_mlp": 0.01041714, "balance_loss_clip": 1.02587724, "balance_loss_mlp": 1.04956412, "epoch": 0.2880505035322411, "flos": 22344122223360.0, "grad_norm": 1.6677747862045547, "language_loss": 0.77428675, "learning_rate": 3.3411914598756922e-06, "loss": 0.79605055, "num_input_tokens_seen": 103357010, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8515625, "step": 4791, "time_per_iteration": 2.489596128463745 }, { "auxiliary_loss_clip": 0.01137902, "auxiliary_loss_mlp": 0.01039001, "balance_loss_clip": 1.02330148, "balance_loss_mlp": 1.04949307, "epoch": 0.28811062678490906, "flos": 18004246208640.0, "grad_norm": 3.3803553106229303, "language_loss": 0.70757496, "learning_rate": 3.3409025228145654e-06, "loss": 0.72934401, "num_input_tokens_seen": 103375600, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8828125, "step": 4792, "time_per_iteration": 2.4655561447143555 }, { "auxiliary_loss_clip": 0.01136085, "auxiliary_loss_mlp": 0.01038833, "balance_loss_clip": 1.02300215, "balance_loss_mlp": 1.04938173, "epoch": 0.28817075003757703, "flos": 22090880361600.0, "grad_norm": 2.0352166580668274, "language_loss": 0.79515672, "learning_rate": 3.3406135349062812e-06, "loss": 0.81690586, "num_input_tokens_seen": 103395225, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8671875, "step": 4793, "time_per_iteration": 2.489058494567871 }, { "auxiliary_loss_clip": 0.0113102, "auxiliary_loss_mlp": 0.01038379, "balance_loss_clip": 1.02283382, "balance_loss_mlp": 1.04991472, "epoch": 0.288230873290245, "flos": 41683130847360.0, "grad_norm": 1.8580401826731063, "language_loss": 0.77888823, "learning_rate": 3.340324496161797e-06, "loss": 0.80058217, "num_input_tokens_seen": 103417245, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8125, "step": 4794, "time_per_iteration": 2.6707656383514404 }, { "auxiliary_loss_clip": 0.0113524, "auxiliary_loss_mlp": 0.01045181, "balance_loss_clip": 1.02905834, "balance_loss_mlp": 1.0494957, "epoch": 0.28829099654291296, "flos": 18624423456000.0, "grad_norm": 2.310282954587087, "language_loss": 0.83163291, "learning_rate": 3.340035406592074e-06, "loss": 0.85343719, "num_input_tokens_seen": 103435500, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.859375, "step": 4795, "time_per_iteration": 2.450188636779785 }, { "auxiliary_loss_clip": 0.01129073, "auxiliary_loss_mlp": 0.01044464, "balance_loss_clip": 1.02885914, "balance_loss_mlp": 1.04827642, "epoch": 0.2883511197955809, "flos": 24674832017280.0, "grad_norm": 1.957089190341666, "language_loss": 0.74611402, "learning_rate": 3.339746266208074e-06, "loss": 0.76784933, "num_input_tokens_seen": 103451040, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.80859375, "step": 4796, "time_per_iteration": 2.5123722553253174 }, { "auxiliary_loss_clip": 0.01140614, "auxiliary_loss_mlp": 0.01044513, "balance_loss_clip": 1.0264051, "balance_loss_mlp": 1.05108058, "epoch": 0.2884112430482489, "flos": 23112143850240.0, "grad_norm": 4.309211624557082, "language_loss": 0.727139, "learning_rate": 3.3394570750207614e-06, "loss": 0.74899024, "num_input_tokens_seen": 103471330, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.89453125, "step": 4797, "time_per_iteration": 2.482208013534546 }, { "auxiliary_loss_clip": 0.011337, "auxiliary_loss_mlp": 0.01039819, "balance_loss_clip": 1.02356493, "balance_loss_mlp": 1.04911017, "epoch": 0.28847136630091685, "flos": 16873347432960.0, "grad_norm": 2.210913828957587, "language_loss": 0.74305964, "learning_rate": 3.3391678330411017e-06, "loss": 0.76479483, "num_input_tokens_seen": 103488060, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.84375, "step": 4798, "time_per_iteration": 2.476799488067627 }, { "auxiliary_loss_clip": 0.01134875, "auxiliary_loss_mlp": 0.01046708, "balance_loss_clip": 1.02886248, "balance_loss_mlp": 1.04747343, "epoch": 0.2885314895535849, "flos": 25657527277440.0, "grad_norm": 2.7489664304719548, "language_loss": 0.64876842, "learning_rate": 3.3388785402800642e-06, "loss": 0.67058426, "num_input_tokens_seen": 103503600, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.87109375, "step": 4799, "time_per_iteration": 2.506563663482666 }, { "auxiliary_loss_clip": 0.01136263, "auxiliary_loss_mlp": 0.01046983, "balance_loss_clip": 1.03018606, "balance_loss_mlp": 1.04938531, "epoch": 0.28859161280625284, "flos": 21107251347840.0, "grad_norm": 1.637506636841782, "language_loss": 0.82345998, "learning_rate": 3.3385891967486178e-06, "loss": 0.84529245, "num_input_tokens_seen": 103524195, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8671875, "step": 4800, "time_per_iteration": 2.5154666900634766 }, { "auxiliary_loss_clip": 0.01132291, "auxiliary_loss_mlp": 0.01042594, "balance_loss_clip": 1.02663159, "balance_loss_mlp": 1.05003607, "epoch": 0.2886517360589208, "flos": 26469540086400.0, "grad_norm": 1.6326910816204643, "language_loss": 0.91000038, "learning_rate": 3.3382998024577347e-06, "loss": 0.93174922, "num_input_tokens_seen": 103545235, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8203125, "step": 4801, "time_per_iteration": 4.054848909378052 }, { "auxiliary_loss_clip": 0.01133951, "auxiliary_loss_mlp": 0.01038535, "balance_loss_clip": 1.0224061, "balance_loss_mlp": 1.04932761, "epoch": 0.28871185931158877, "flos": 25265275781760.0, "grad_norm": 2.3541821679087374, "language_loss": 0.73633778, "learning_rate": 3.33801035741839e-06, "loss": 0.75806266, "num_input_tokens_seen": 103563305, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.84375, "step": 4802, "time_per_iteration": 2.5044822692871094 }, { "auxiliary_loss_clip": 0.01069246, "auxiliary_loss_mlp": 0.01001624, "balance_loss_clip": 0.99915648, "balance_loss_mlp": 1.03947926, "epoch": 0.28877198256425674, "flos": 66665431284480.0, "grad_norm": 0.8006768473509759, "language_loss": 0.62954712, "learning_rate": 3.337720861641558e-06, "loss": 0.6502558, "num_input_tokens_seen": 103625025, "router_z_loss_clip": 0.0246582, "router_z_loss_mlp": 0.296875, "step": 4803, "time_per_iteration": 4.508761167526245 }, { "auxiliary_loss_clip": 0.01131335, "auxiliary_loss_mlp": 0.0104239, "balance_loss_clip": 1.0263561, "balance_loss_mlp": 1.0473175, "epoch": 0.2888321058169247, "flos": 20303031790080.0, "grad_norm": 1.6978886446916257, "language_loss": 0.70615661, "learning_rate": 3.3374313151382165e-06, "loss": 0.72789383, "num_input_tokens_seen": 103644235, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83984375, "step": 4804, "time_per_iteration": 3.8376104831695557 }, { "auxiliary_loss_clip": 0.01136222, "auxiliary_loss_mlp": 0.01042444, "balance_loss_clip": 1.0248965, "balance_loss_mlp": 1.04704642, "epoch": 0.28889222906959267, "flos": 25516721963520.0, "grad_norm": 1.9641367142640125, "language_loss": 0.68220931, "learning_rate": 3.337141717919346e-06, "loss": 0.70399594, "num_input_tokens_seen": 103664700, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.890625, "step": 4805, "time_per_iteration": 3.9375267028808594 }, { "auxiliary_loss_clip": 0.01133818, "auxiliary_loss_mlp": 0.01039189, "balance_loss_clip": 1.02274966, "balance_loss_mlp": 1.04757142, "epoch": 0.28895235232226063, "flos": 32671312560000.0, "grad_norm": 1.7050356727116214, "language_loss": 0.69361347, "learning_rate": 3.3368520699959272e-06, "loss": 0.71534353, "num_input_tokens_seen": 103686595, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.859375, "step": 4806, "time_per_iteration": 2.571686029434204 }, { "auxiliary_loss_clip": 0.01130134, "auxiliary_loss_mlp": 0.01041884, "balance_loss_clip": 1.02602875, "balance_loss_mlp": 1.04752541, "epoch": 0.2890124755749286, "flos": 29714679342720.0, "grad_norm": 1.5588933392474662, "language_loss": 0.71359211, "learning_rate": 3.3365623713789443e-06, "loss": 0.73531234, "num_input_tokens_seen": 103707525, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.828125, "step": 4807, "time_per_iteration": 2.5516622066497803 }, { "auxiliary_loss_clip": 0.01134124, "auxiliary_loss_mlp": 0.01038233, "balance_loss_clip": 1.02182937, "balance_loss_mlp": 1.04883218, "epoch": 0.28907259882759656, "flos": 22674464628480.0, "grad_norm": 1.7765624507863376, "language_loss": 0.8142128, "learning_rate": 3.336272622079382e-06, "loss": 0.83593637, "num_input_tokens_seen": 103727905, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8515625, "step": 4808, "time_per_iteration": 2.4795382022857666 }, { "auxiliary_loss_clip": 0.0112926, "auxiliary_loss_mlp": 0.01045846, "balance_loss_clip": 1.02906144, "balance_loss_mlp": 1.04762673, "epoch": 0.2891327220802645, "flos": 22566050403840.0, "grad_norm": 2.097857671502111, "language_loss": 0.7800771, "learning_rate": 3.3359828221082276e-06, "loss": 0.80182815, "num_input_tokens_seen": 103748335, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.81640625, "step": 4809, "time_per_iteration": 2.50803542137146 }, { "auxiliary_loss_clip": 0.01135552, "auxiliary_loss_mlp": 0.01042046, "balance_loss_clip": 1.02374697, "balance_loss_mlp": 1.04639184, "epoch": 0.2891928453329325, "flos": 21652806090240.0, "grad_norm": 2.249228289429198, "language_loss": 0.78785145, "learning_rate": 3.3356929714764714e-06, "loss": 0.80962747, "num_input_tokens_seen": 103767020, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.890625, "step": 4810, "time_per_iteration": 2.48905348777771 }, { "auxiliary_loss_clip": 0.01130864, "auxiliary_loss_mlp": 0.01036246, "balance_loss_clip": 1.02089739, "balance_loss_mlp": 1.04854906, "epoch": 0.28925296858560046, "flos": 23222102359680.0, "grad_norm": 1.6660960630654527, "language_loss": 0.77135199, "learning_rate": 3.3354030701951032e-06, "loss": 0.79302311, "num_input_tokens_seen": 103786355, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.82421875, "step": 4811, "time_per_iteration": 2.5071613788604736 }, { "auxiliary_loss_clip": 0.01131435, "auxiliary_loss_mlp": 0.01045719, "balance_loss_clip": 1.02812326, "balance_loss_mlp": 1.04761589, "epoch": 0.2893130918382685, "flos": 28621666437120.0, "grad_norm": 1.4468512753021738, "language_loss": 0.77316141, "learning_rate": 3.335113118275117e-06, "loss": 0.79493302, "num_input_tokens_seen": 103809345, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.83984375, "step": 4812, "time_per_iteration": 2.5392842292785645 }, { "auxiliary_loss_clip": 0.01057946, "auxiliary_loss_mlp": 0.01005505, "balance_loss_clip": 1.00302505, "balance_loss_mlp": 1.02853632, "epoch": 0.28937321509093644, "flos": 72301288982400.0, "grad_norm": 0.8499728679908352, "language_loss": 0.60314351, "learning_rate": 3.3348231157275085e-06, "loss": 0.62377805, "num_input_tokens_seen": 103871180, "router_z_loss_clip": 0.02478027, "router_z_loss_mlp": 0.29296875, "step": 4813, "time_per_iteration": 3.254981517791748 }, { "auxiliary_loss_clip": 0.01130543, "auxiliary_loss_mlp": 0.01033975, "balance_loss_clip": 1.01756024, "balance_loss_mlp": 1.04668212, "epoch": 0.2894333383436044, "flos": 16216397637120.0, "grad_norm": 2.114872011486511, "language_loss": 0.8223474, "learning_rate": 3.3345330625632725e-06, "loss": 0.84399259, "num_input_tokens_seen": 103889040, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8359375, "step": 4814, "time_per_iteration": 2.449453592300415 }, { "auxiliary_loss_clip": 0.01135912, "auxiliary_loss_mlp": 0.01047746, "balance_loss_clip": 1.03084183, "balance_loss_mlp": 1.0478431, "epoch": 0.2894934615962724, "flos": 24828278918400.0, "grad_norm": 1.965780591770459, "language_loss": 0.72066939, "learning_rate": 3.3342429587934094e-06, "loss": 0.74250603, "num_input_tokens_seen": 103910380, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.87890625, "step": 4815, "time_per_iteration": 2.545984983444214 }, { "auxiliary_loss_clip": 0.01129077, "auxiliary_loss_mlp": 0.01045423, "balance_loss_clip": 1.03026569, "balance_loss_mlp": 1.04815507, "epoch": 0.28955358484894034, "flos": 20449978329600.0, "grad_norm": 1.6130475487271518, "language_loss": 0.70267874, "learning_rate": 3.3339528044289198e-06, "loss": 0.72442377, "num_input_tokens_seen": 103929955, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.80859375, "step": 4816, "time_per_iteration": 2.487440824508667 }, { "auxiliary_loss_clip": 0.01137669, "auxiliary_loss_mlp": 0.01046576, "balance_loss_clip": 1.02924275, "balance_loss_mlp": 1.04840279, "epoch": 0.2896137081016083, "flos": 22565188477440.0, "grad_norm": 2.2557386508036177, "language_loss": 0.74776685, "learning_rate": 3.3336625994808055e-06, "loss": 0.76960927, "num_input_tokens_seen": 103948020, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.890625, "step": 4817, "time_per_iteration": 2.4889261722564697 }, { "auxiliary_loss_clip": 0.01134364, "auxiliary_loss_mlp": 0.01045556, "balance_loss_clip": 1.02834249, "balance_loss_mlp": 1.04758811, "epoch": 0.28967383135427627, "flos": 26687948734080.0, "grad_norm": 1.8296301109712905, "language_loss": 0.76779127, "learning_rate": 3.3333723439600723e-06, "loss": 0.78959048, "num_input_tokens_seen": 103968740, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.8671875, "step": 4818, "time_per_iteration": 2.5151731967926025 }, { "auxiliary_loss_clip": 0.01133953, "auxiliary_loss_mlp": 0.01039126, "balance_loss_clip": 1.02223372, "balance_loss_mlp": 1.0473597, "epoch": 0.28973395460694423, "flos": 15558262692480.0, "grad_norm": 2.0066321350565457, "language_loss": 0.80099845, "learning_rate": 3.3330820378777263e-06, "loss": 0.82272923, "num_input_tokens_seen": 103986005, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8671875, "step": 4819, "time_per_iteration": 2.468355655670166 }, { "auxiliary_loss_clip": 0.0113773, "auxiliary_loss_mlp": 0.01041773, "balance_loss_clip": 1.0240947, "balance_loss_mlp": 1.04752195, "epoch": 0.2897940778596122, "flos": 18697465762560.0, "grad_norm": 2.577461393253605, "language_loss": 0.78371036, "learning_rate": 3.332791681244776e-06, "loss": 0.80550539, "num_input_tokens_seen": 104005070, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.90625, "step": 4820, "time_per_iteration": 2.4459478855133057 }, { "auxiliary_loss_clip": 0.0113795, "auxiliary_loss_mlp": 0.01033672, "balance_loss_clip": 1.01674438, "balance_loss_mlp": 1.04996073, "epoch": 0.28985420111228016, "flos": 18770292587520.0, "grad_norm": 2.23874030287569, "language_loss": 0.73218763, "learning_rate": 3.332501274072231e-06, "loss": 0.75390387, "num_input_tokens_seen": 104022945, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.87890625, "step": 4821, "time_per_iteration": 2.496685028076172 }, { "auxiliary_loss_clip": 0.0113115, "auxiliary_loss_mlp": 0.01042801, "balance_loss_clip": 1.02596903, "balance_loss_mlp": 1.04624677, "epoch": 0.28991432436494813, "flos": 23069840607360.0, "grad_norm": 2.070213449206115, "language_loss": 0.71908134, "learning_rate": 3.332210816371104e-06, "loss": 0.74082088, "num_input_tokens_seen": 104042080, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.84765625, "step": 4822, "time_per_iteration": 2.483287811279297 }, { "auxiliary_loss_clip": 0.01133308, "auxiliary_loss_mlp": 0.01047181, "balance_loss_clip": 1.03084946, "balance_loss_mlp": 1.04901564, "epoch": 0.2899744476176161, "flos": 17603195880960.0, "grad_norm": 1.799295352422266, "language_loss": 0.66154563, "learning_rate": 3.3319203081524102e-06, "loss": 0.68335044, "num_input_tokens_seen": 104060975, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.84375, "step": 4823, "time_per_iteration": 2.472933053970337 }, { "auxiliary_loss_clip": 0.01128024, "auxiliary_loss_mlp": 0.01035871, "balance_loss_clip": 1.02040994, "balance_loss_mlp": 1.04420614, "epoch": 0.29003457087028406, "flos": 22309360836480.0, "grad_norm": 1.7759435331069984, "language_loss": 0.80867624, "learning_rate": 3.331629749427164e-06, "loss": 0.83031523, "num_input_tokens_seen": 104081395, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8359375, "step": 4824, "time_per_iteration": 2.4974424839019775 }, { "auxiliary_loss_clip": 0.01132676, "auxiliary_loss_mlp": 0.01036574, "balance_loss_clip": 1.01943183, "balance_loss_mlp": 1.04609108, "epoch": 0.2900946941229521, "flos": 21944975316480.0, "grad_norm": 1.9653051835243993, "language_loss": 0.72104537, "learning_rate": 3.331339140206385e-06, "loss": 0.74273789, "num_input_tokens_seen": 104099995, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8671875, "step": 4825, "time_per_iteration": 2.4883131980895996 }, { "auxiliary_loss_clip": 0.01135572, "auxiliary_loss_mlp": 0.01038781, "balance_loss_clip": 1.02244949, "balance_loss_mlp": 1.04954648, "epoch": 0.29015481737562004, "flos": 17932173569280.0, "grad_norm": 2.4783579879306705, "language_loss": 0.73202938, "learning_rate": 3.331048480501092e-06, "loss": 0.75377291, "num_input_tokens_seen": 104118930, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.859375, "step": 4826, "time_per_iteration": 2.435349225997925 }, { "auxiliary_loss_clip": 0.01131447, "auxiliary_loss_mlp": 0.01037951, "balance_loss_clip": 1.02269804, "balance_loss_mlp": 1.04595518, "epoch": 0.290214940628288, "flos": 22783525297920.0, "grad_norm": 2.0403963243444774, "language_loss": 0.68672895, "learning_rate": 3.3307577703223073e-06, "loss": 0.70842296, "num_input_tokens_seen": 104136940, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.85546875, "step": 4827, "time_per_iteration": 2.4833121299743652 }, { "auxiliary_loss_clip": 0.01138599, "auxiliary_loss_mlp": 0.01034721, "balance_loss_clip": 1.01714945, "balance_loss_mlp": 1.05197084, "epoch": 0.290275063880956, "flos": 20006481104640.0, "grad_norm": 2.047941097087287, "language_loss": 0.7998358, "learning_rate": 3.3304670096810545e-06, "loss": 0.82156897, "num_input_tokens_seen": 104154280, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.8671875, "step": 4828, "time_per_iteration": 2.4618422985076904 }, { "auxiliary_loss_clip": 0.01132476, "auxiliary_loss_mlp": 0.01046634, "balance_loss_clip": 1.02993262, "balance_loss_mlp": 1.04817367, "epoch": 0.29033518713362394, "flos": 22053605022720.0, "grad_norm": 1.8897322067341553, "language_loss": 0.80412304, "learning_rate": 3.33017619858836e-06, "loss": 0.82591414, "num_input_tokens_seen": 104172605, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.84375, "step": 4829, "time_per_iteration": 2.4858477115631104 }, { "auxiliary_loss_clip": 0.01130965, "auxiliary_loss_mlp": 0.01036535, "balance_loss_clip": 1.02047789, "balance_loss_mlp": 1.04874444, "epoch": 0.2903953103862919, "flos": 25630056351360.0, "grad_norm": 7.298634511381947, "language_loss": 0.82487023, "learning_rate": 3.329885337055249e-06, "loss": 0.84654522, "num_input_tokens_seen": 104194120, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8203125, "step": 4830, "time_per_iteration": 2.5036845207214355 }, { "auxiliary_loss_clip": 0.01135841, "auxiliary_loss_mlp": 0.01045695, "balance_loss_clip": 1.02900612, "balance_loss_mlp": 1.0489893, "epoch": 0.29045543363895987, "flos": 16945851035520.0, "grad_norm": 2.7302227823298097, "language_loss": 0.79739934, "learning_rate": 3.3295944250927546e-06, "loss": 0.8192147, "num_input_tokens_seen": 104210875, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8671875, "step": 4831, "time_per_iteration": 2.461907148361206 }, { "auxiliary_loss_clip": 0.01132366, "auxiliary_loss_mlp": 0.01039698, "balance_loss_clip": 1.02504098, "balance_loss_mlp": 1.05016577, "epoch": 0.29051555689162784, "flos": 26395492199040.0, "grad_norm": 2.5945143221270226, "language_loss": 0.75043774, "learning_rate": 3.3293034627119055e-06, "loss": 0.77215838, "num_input_tokens_seen": 104229875, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8203125, "step": 4832, "time_per_iteration": 2.5195889472961426 }, { "auxiliary_loss_clip": 0.01126979, "auxiliary_loss_mlp": 0.01033304, "balance_loss_clip": 1.01929641, "balance_loss_mlp": 1.04437757, "epoch": 0.2905756801442958, "flos": 21103875469440.0, "grad_norm": 1.892831794657348, "language_loss": 0.76046032, "learning_rate": 3.329012449923736e-06, "loss": 0.78206313, "num_input_tokens_seen": 104250405, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.828125, "step": 4833, "time_per_iteration": 2.5252814292907715 }, { "auxiliary_loss_clip": 0.01129947, "auxiliary_loss_mlp": 0.01036193, "balance_loss_clip": 1.02127361, "balance_loss_mlp": 1.04734576, "epoch": 0.29063580339696377, "flos": 15706071158400.0, "grad_norm": 1.6785443288812059, "language_loss": 0.65050089, "learning_rate": 3.3287213867392813e-06, "loss": 0.67216229, "num_input_tokens_seen": 104269185, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.82421875, "step": 4834, "time_per_iteration": 2.4553632736206055 }, { "auxiliary_loss_clip": 0.01128341, "auxiliary_loss_mlp": 0.01029852, "balance_loss_clip": 1.01561904, "balance_loss_mlp": 1.04678524, "epoch": 0.29069592664963173, "flos": 24644990793600.0, "grad_norm": 1.760240464915807, "language_loss": 0.72017121, "learning_rate": 3.3284302731695783e-06, "loss": 0.7417531, "num_input_tokens_seen": 104289400, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.81640625, "step": 4835, "time_per_iteration": 2.5374553203582764 }, { "auxiliary_loss_clip": 0.0112918, "auxiliary_loss_mlp": 0.01038883, "balance_loss_clip": 1.02493584, "balance_loss_mlp": 1.04709959, "epoch": 0.2907560499022997, "flos": 24973753000320.0, "grad_norm": 1.6264698064936323, "language_loss": 0.79439491, "learning_rate": 3.3281391092256668e-06, "loss": 0.8160755, "num_input_tokens_seen": 104310485, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.8203125, "step": 4836, "time_per_iteration": 2.5080299377441406 }, { "auxiliary_loss_clip": 0.01127862, "auxiliary_loss_mlp": 0.01041959, "balance_loss_clip": 1.02643812, "balance_loss_mlp": 1.04637361, "epoch": 0.29081617315496766, "flos": 18657496903680.0, "grad_norm": 2.2021057653584046, "language_loss": 0.81135833, "learning_rate": 3.3278478949185865e-06, "loss": 0.83305657, "num_input_tokens_seen": 104327330, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8125, "step": 4837, "time_per_iteration": 2.4523489475250244 }, { "auxiliary_loss_clip": 0.01128699, "auxiliary_loss_mlp": 0.01041799, "balance_loss_clip": 1.0262785, "balance_loss_mlp": 1.04438984, "epoch": 0.2908762964076356, "flos": 35331035955840.0, "grad_norm": 1.8421027275082524, "language_loss": 0.67342675, "learning_rate": 3.327556630259381e-06, "loss": 0.69513172, "num_input_tokens_seen": 104350350, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.84375, "step": 4838, "time_per_iteration": 2.5748260021209717 }, { "auxiliary_loss_clip": 0.0113528, "auxiliary_loss_mlp": 0.01043616, "balance_loss_clip": 1.0272367, "balance_loss_mlp": 1.04851985, "epoch": 0.29093641966030365, "flos": 23076305055360.0, "grad_norm": 1.6941568434437493, "language_loss": 0.71070552, "learning_rate": 3.327265315259095e-06, "loss": 0.73249453, "num_input_tokens_seen": 104369995, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8671875, "step": 4839, "time_per_iteration": 2.495145320892334 }, { "auxiliary_loss_clip": 0.01131997, "auxiliary_loss_mlp": 0.01038491, "balance_loss_clip": 1.02355373, "balance_loss_mlp": 1.04752183, "epoch": 0.2909965429129716, "flos": 35955415094400.0, "grad_norm": 2.48106958081613, "language_loss": 0.76160663, "learning_rate": 3.326973949928776e-06, "loss": 0.78331149, "num_input_tokens_seen": 104392285, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.84375, "step": 4840, "time_per_iteration": 2.5902678966522217 }, { "auxiliary_loss_clip": 0.01132609, "auxiliary_loss_mlp": 0.01045931, "balance_loss_clip": 1.02998114, "balance_loss_mlp": 1.04841816, "epoch": 0.2910566661656396, "flos": 30880231764480.0, "grad_norm": 2.2589549137596174, "language_loss": 0.60363734, "learning_rate": 3.326682534279471e-06, "loss": 0.62542272, "num_input_tokens_seen": 104412640, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84375, "step": 4841, "time_per_iteration": 2.5507590770721436 }, { "auxiliary_loss_clip": 0.01130102, "auxiliary_loss_mlp": 0.01036664, "balance_loss_clip": 1.02024889, "balance_loss_mlp": 1.0466435, "epoch": 0.29111678941830754, "flos": 30010188533760.0, "grad_norm": 1.6884398798470666, "language_loss": 0.71262383, "learning_rate": 3.326391068322232e-06, "loss": 0.73429155, "num_input_tokens_seen": 104435245, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.83203125, "step": 4842, "time_per_iteration": 2.5432846546173096 }, { "auxiliary_loss_clip": 0.01127946, "auxiliary_loss_mlp": 0.0103534, "balance_loss_clip": 1.02124918, "balance_loss_mlp": 1.04495728, "epoch": 0.2911769126709755, "flos": 22857393617280.0, "grad_norm": 3.0734091651742417, "language_loss": 0.73286188, "learning_rate": 3.3260995520681098e-06, "loss": 0.75449479, "num_input_tokens_seen": 104455395, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.828125, "step": 4843, "time_per_iteration": 3.9459776878356934 }, { "auxiliary_loss_clip": 0.01132123, "auxiliary_loss_mlp": 0.01036263, "balance_loss_clip": 1.0212431, "balance_loss_mlp": 1.04736781, "epoch": 0.2912370359236435, "flos": 21650507619840.0, "grad_norm": 2.3633662191041234, "language_loss": 0.579, "learning_rate": 3.3258079855281602e-06, "loss": 0.60068393, "num_input_tokens_seen": 104473350, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.84765625, "step": 4844, "time_per_iteration": 2.4909212589263916 }, { "auxiliary_loss_clip": 0.0113948, "auxiliary_loss_mlp": 0.010363, "balance_loss_clip": 1.01971829, "balance_loss_mlp": 1.05228949, "epoch": 0.29129715917631144, "flos": 22893340152960.0, "grad_norm": 2.1368514814989843, "language_loss": 0.86559576, "learning_rate": 3.3255163687134396e-06, "loss": 0.88735354, "num_input_tokens_seen": 104492265, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.87109375, "step": 4845, "time_per_iteration": 3.8418354988098145 }, { "auxiliary_loss_clip": 0.01135545, "auxiliary_loss_mlp": 0.01051379, "balance_loss_clip": 1.03485608, "balance_loss_mlp": 1.0499171, "epoch": 0.2913572824289794, "flos": 22674464628480.0, "grad_norm": 1.804972674583486, "language_loss": 0.66953272, "learning_rate": 3.3252247016350046e-06, "loss": 0.69140196, "num_input_tokens_seen": 104510755, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.85546875, "step": 4846, "time_per_iteration": 3.8337509632110596 }, { "auxiliary_loss_clip": 0.01131732, "auxiliary_loss_mlp": 0.01036348, "balance_loss_clip": 1.02160752, "balance_loss_mlp": 1.04918361, "epoch": 0.29141740568164737, "flos": 23107403255040.0, "grad_norm": 2.2873570578597855, "language_loss": 0.70599318, "learning_rate": 3.3249329843039166e-06, "loss": 0.72767401, "num_input_tokens_seen": 104530830, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.82421875, "step": 4847, "time_per_iteration": 3.9555835723876953 }, { "auxiliary_loss_clip": 0.01131099, "auxiliary_loss_mlp": 0.01035216, "balance_loss_clip": 1.01979029, "balance_loss_mlp": 1.04740167, "epoch": 0.29147752893431533, "flos": 23587026583680.0, "grad_norm": 1.7306352642947105, "language_loss": 0.7405917, "learning_rate": 3.324641216731237e-06, "loss": 0.76225483, "num_input_tokens_seen": 104550115, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8359375, "step": 4848, "time_per_iteration": 2.4913296699523926 }, { "auxiliary_loss_clip": 0.01131921, "auxiliary_loss_mlp": 0.01039782, "balance_loss_clip": 1.02406394, "balance_loss_mlp": 1.04793644, "epoch": 0.2915376521869833, "flos": 20591968792320.0, "grad_norm": 2.360215712412421, "language_loss": 0.76765823, "learning_rate": 3.3243493989280295e-06, "loss": 0.78937531, "num_input_tokens_seen": 104566255, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.83984375, "step": 4849, "time_per_iteration": 2.471541166305542 }, { "auxiliary_loss_clip": 0.01134561, "auxiliary_loss_mlp": 0.01039307, "balance_loss_clip": 1.02421474, "balance_loss_mlp": 1.04836941, "epoch": 0.29159777543965126, "flos": 20811490761600.0, "grad_norm": 2.740416257869251, "language_loss": 0.78962028, "learning_rate": 3.3240575309053596e-06, "loss": 0.81135893, "num_input_tokens_seen": 104585235, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.859375, "step": 4850, "time_per_iteration": 2.4753527641296387 }, { "auxiliary_loss_clip": 0.01129854, "auxiliary_loss_mlp": 0.0103896, "balance_loss_clip": 1.02256846, "balance_loss_mlp": 1.04737985, "epoch": 0.29165789869231923, "flos": 24244155947520.0, "grad_norm": 3.6453335708346106, "language_loss": 0.75601715, "learning_rate": 3.323765612674296e-06, "loss": 0.77770525, "num_input_tokens_seen": 104605315, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.82421875, "step": 4851, "time_per_iteration": 2.5214614868164062 }, { "auxiliary_loss_clip": 0.01132019, "auxiliary_loss_mlp": 0.01043104, "balance_loss_clip": 1.02962089, "balance_loss_mlp": 1.05155587, "epoch": 0.29171802194498725, "flos": 28949925853440.0, "grad_norm": 2.5853530861979213, "language_loss": 0.77278018, "learning_rate": 3.3234736442459078e-06, "loss": 0.7945314, "num_input_tokens_seen": 104626055, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.8046875, "step": 4852, "time_per_iteration": 2.5334949493408203 }, { "auxiliary_loss_clip": 0.01132431, "auxiliary_loss_mlp": 0.01044697, "balance_loss_clip": 1.02959275, "balance_loss_mlp": 1.04852843, "epoch": 0.2917781451976552, "flos": 22598226011520.0, "grad_norm": 1.6918723790416355, "language_loss": 0.78145754, "learning_rate": 3.3231816256312665e-06, "loss": 0.80322886, "num_input_tokens_seen": 104646005, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8359375, "step": 4853, "time_per_iteration": 2.5323774814605713 }, { "auxiliary_loss_clip": 0.01135606, "auxiliary_loss_mlp": 0.01039905, "balance_loss_clip": 1.02415729, "balance_loss_mlp": 1.04976773, "epoch": 0.2918382684503232, "flos": 21574448570880.0, "grad_norm": 2.5030072292800747, "language_loss": 0.88322097, "learning_rate": 3.322889556841445e-06, "loss": 0.90497613, "num_input_tokens_seen": 104661620, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.859375, "step": 4854, "time_per_iteration": 2.441812753677368 }, { "auxiliary_loss_clip": 0.01132948, "auxiliary_loss_mlp": 0.01051465, "balance_loss_clip": 1.03454351, "balance_loss_mlp": 1.04945135, "epoch": 0.29189839170299114, "flos": 24353503925760.0, "grad_norm": 1.8655680090562305, "language_loss": 0.87096286, "learning_rate": 3.322597437887519e-06, "loss": 0.89280695, "num_input_tokens_seen": 104681445, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8359375, "step": 4855, "time_per_iteration": 2.5116677284240723 }, { "auxiliary_loss_clip": 0.01059947, "auxiliary_loss_mlp": 0.01022372, "balance_loss_clip": 1.02027416, "balance_loss_mlp": 1.0308702, "epoch": 0.2919585149556591, "flos": 71316726215040.0, "grad_norm": 0.8232894224592109, "language_loss": 0.60200286, "learning_rate": 3.322305268780566e-06, "loss": 0.6228261, "num_input_tokens_seen": 104747945, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.2890625, "step": 4856, "time_per_iteration": 3.1809098720550537 }, { "auxiliary_loss_clip": 0.01131163, "auxiliary_loss_mlp": 0.01043869, "balance_loss_clip": 1.02882457, "balance_loss_mlp": 1.04824901, "epoch": 0.2920186382083271, "flos": 15633208419840.0, "grad_norm": 2.2744285929959727, "language_loss": 0.68200409, "learning_rate": 3.322013049531664e-06, "loss": 0.70375443, "num_input_tokens_seen": 104766225, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.828125, "step": 4857, "time_per_iteration": 2.4546079635620117 }, { "auxiliary_loss_clip": 0.01132621, "auxiliary_loss_mlp": 0.01038409, "balance_loss_clip": 1.02385354, "balance_loss_mlp": 1.05016136, "epoch": 0.29207876146099504, "flos": 28366018364160.0, "grad_norm": 2.0649809584788206, "language_loss": 0.83931398, "learning_rate": 3.321720780151895e-06, "loss": 0.86102432, "num_input_tokens_seen": 104785345, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.82421875, "step": 4858, "time_per_iteration": 2.524287462234497 }, { "auxiliary_loss_clip": 0.01132611, "auxiliary_loss_mlp": 0.01040482, "balance_loss_clip": 1.02509212, "balance_loss_mlp": 1.05016685, "epoch": 0.292138884713663, "flos": 21870963342720.0, "grad_norm": 1.6777097158420438, "language_loss": 0.77611887, "learning_rate": 3.321428460652342e-06, "loss": 0.79784983, "num_input_tokens_seen": 104804560, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.82421875, "step": 4859, "time_per_iteration": 2.480877161026001 }, { "auxiliary_loss_clip": 0.01135067, "auxiliary_loss_mlp": 0.01039294, "balance_loss_clip": 1.02284312, "balance_loss_mlp": 1.04773986, "epoch": 0.29219900796633097, "flos": 20992552243200.0, "grad_norm": 2.4965391179793115, "language_loss": 0.68835258, "learning_rate": 3.3211360910440885e-06, "loss": 0.71009618, "num_input_tokens_seen": 104821105, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.875, "step": 4860, "time_per_iteration": 2.4537267684936523 }, { "auxiliary_loss_clip": 0.01127663, "auxiliary_loss_mlp": 0.01036154, "balance_loss_clip": 1.02239752, "balance_loss_mlp": 1.04735589, "epoch": 0.29225913121899894, "flos": 35004608133120.0, "grad_norm": 2.86142232828902, "language_loss": 0.75110745, "learning_rate": 3.320843671338222e-06, "loss": 0.77274561, "num_input_tokens_seen": 104841440, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.8046875, "step": 4861, "time_per_iteration": 2.5843069553375244 }, { "auxiliary_loss_clip": 0.01129205, "auxiliary_loss_mlp": 0.01038092, "balance_loss_clip": 1.02323866, "balance_loss_mlp": 1.0475843, "epoch": 0.2923192544716669, "flos": 13515663888000.0, "grad_norm": 1.6299665191724568, "language_loss": 0.91592836, "learning_rate": 3.320551201545832e-06, "loss": 0.93760139, "num_input_tokens_seen": 104858210, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.81640625, "step": 4862, "time_per_iteration": 2.4391062259674072 }, { "auxiliary_loss_clip": 0.01127737, "auxiliary_loss_mlp": 0.01030609, "balance_loss_clip": 1.01659048, "balance_loss_mlp": 1.04525161, "epoch": 0.29237937772433487, "flos": 19463512141440.0, "grad_norm": 2.456404159621355, "language_loss": 0.7343235, "learning_rate": 3.320258681678008e-06, "loss": 0.75590706, "num_input_tokens_seen": 104875620, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.82421875, "step": 4863, "time_per_iteration": 2.4535088539123535 }, { "auxiliary_loss_clip": 0.01126876, "auxiliary_loss_mlp": 0.01033125, "balance_loss_clip": 1.01986909, "balance_loss_mlp": 1.04786468, "epoch": 0.29243950097700283, "flos": 20850597694080.0, "grad_norm": 1.888418408542347, "language_loss": 0.77779078, "learning_rate": 3.319966111745842e-06, "loss": 0.79939079, "num_input_tokens_seen": 104894600, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7890625, "step": 4864, "time_per_iteration": 2.4713962078094482 }, { "auxiliary_loss_clip": 0.01133833, "auxiliary_loss_mlp": 0.01041127, "balance_loss_clip": 1.02512884, "balance_loss_mlp": 1.04949284, "epoch": 0.29249962422967085, "flos": 23584225322880.0, "grad_norm": 1.8596567295115196, "language_loss": 0.81863368, "learning_rate": 3.319673491760429e-06, "loss": 0.84038329, "num_input_tokens_seen": 104914530, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84375, "step": 4865, "time_per_iteration": 2.500652313232422 }, { "auxiliary_loss_clip": 0.01132035, "auxiliary_loss_mlp": 0.01040603, "balance_loss_clip": 1.02474809, "balance_loss_mlp": 1.04809213, "epoch": 0.2925597474823388, "flos": 22273342473600.0, "grad_norm": 2.567920847640952, "language_loss": 0.85175562, "learning_rate": 3.3193808217328645e-06, "loss": 0.87348199, "num_input_tokens_seen": 104933460, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83984375, "step": 4866, "time_per_iteration": 2.472916841506958 }, { "auxiliary_loss_clip": 0.01126315, "auxiliary_loss_mlp": 0.01033017, "balance_loss_clip": 1.01904619, "balance_loss_mlp": 1.04643166, "epoch": 0.2926198707350068, "flos": 34456108475520.0, "grad_norm": 1.7373885539371228, "language_loss": 0.75750148, "learning_rate": 3.3190881016742476e-06, "loss": 0.77909482, "num_input_tokens_seen": 104954495, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.796875, "step": 4867, "time_per_iteration": 2.5686209201812744 }, { "auxiliary_loss_clip": 0.01132104, "auxiliary_loss_mlp": 0.01041609, "balance_loss_clip": 1.02588511, "balance_loss_mlp": 1.0473634, "epoch": 0.29267999398767475, "flos": 20704153944960.0, "grad_norm": 1.9485372832553778, "language_loss": 0.73134696, "learning_rate": 3.3187953315956776e-06, "loss": 0.75308406, "num_input_tokens_seen": 104971915, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.84765625, "step": 4868, "time_per_iteration": 2.4874541759490967 }, { "auxiliary_loss_clip": 0.01130417, "auxiliary_loss_mlp": 0.01029791, "balance_loss_clip": 1.01527774, "balance_loss_mlp": 1.04852939, "epoch": 0.2927401172403427, "flos": 18368667642240.0, "grad_norm": 1.5445811199671797, "language_loss": 0.74522328, "learning_rate": 3.3185025115082566e-06, "loss": 0.76682544, "num_input_tokens_seen": 104991335, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8203125, "step": 4869, "time_per_iteration": 2.5411436557769775 }, { "auxiliary_loss_clip": 0.01132262, "auxiliary_loss_mlp": 0.01036305, "balance_loss_clip": 1.02105856, "balance_loss_mlp": 1.04946971, "epoch": 0.2928002404930107, "flos": 26104041244800.0, "grad_norm": 1.474807014098925, "language_loss": 0.76389611, "learning_rate": 3.318209641423088e-06, "loss": 0.78558183, "num_input_tokens_seen": 105012015, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.828125, "step": 4870, "time_per_iteration": 2.5022990703582764 }, { "auxiliary_loss_clip": 0.01138797, "auxiliary_loss_mlp": 0.01039959, "balance_loss_clip": 1.02338946, "balance_loss_mlp": 1.05247498, "epoch": 0.29286036374567864, "flos": 21324726241920.0, "grad_norm": 2.327706979024736, "language_loss": 0.67815018, "learning_rate": 3.3179167213512777e-06, "loss": 0.6999377, "num_input_tokens_seen": 105031460, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.86328125, "step": 4871, "time_per_iteration": 2.4731757640838623 }, { "auxiliary_loss_clip": 0.01126859, "auxiliary_loss_mlp": 0.01036476, "balance_loss_clip": 1.0221827, "balance_loss_mlp": 1.04573131, "epoch": 0.2929204869983466, "flos": 29569492569600.0, "grad_norm": 2.0139876573243143, "language_loss": 0.77481353, "learning_rate": 3.317623751303933e-06, "loss": 0.79644692, "num_input_tokens_seen": 105052965, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8125, "step": 4872, "time_per_iteration": 2.535935640335083 }, { "auxiliary_loss_clip": 0.01135234, "auxiliary_loss_mlp": 0.01039667, "balance_loss_clip": 1.02327561, "balance_loss_mlp": 1.05054951, "epoch": 0.2929806102510146, "flos": 19058259922560.0, "grad_norm": 3.2286867227872644, "language_loss": 0.73253739, "learning_rate": 3.317330731292164e-06, "loss": 0.75428635, "num_input_tokens_seen": 105071840, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.84375, "step": 4873, "time_per_iteration": 2.4888525009155273 }, { "auxiliary_loss_clip": 0.01134705, "auxiliary_loss_mlp": 0.01042011, "balance_loss_clip": 1.02557254, "balance_loss_mlp": 1.04874444, "epoch": 0.29304073350368254, "flos": 21944221130880.0, "grad_norm": 3.3308183937012736, "language_loss": 0.7846688, "learning_rate": 3.3170376613270812e-06, "loss": 0.80643594, "num_input_tokens_seen": 105089445, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.859375, "step": 4874, "time_per_iteration": 2.4717321395874023 }, { "auxiliary_loss_clip": 0.0113992, "auxiliary_loss_mlp": 0.01043071, "balance_loss_clip": 1.02725172, "balance_loss_mlp": 1.05068994, "epoch": 0.2931008567563505, "flos": 15450818135040.0, "grad_norm": 2.2659013654432543, "language_loss": 0.77084708, "learning_rate": 3.3167445414197985e-06, "loss": 0.79267699, "num_input_tokens_seen": 105106210, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.890625, "step": 4875, "time_per_iteration": 2.4474356174468994 }, { "auxiliary_loss_clip": 0.01138924, "auxiliary_loss_mlp": 0.0103743, "balance_loss_clip": 1.02192712, "balance_loss_mlp": 1.05371428, "epoch": 0.29316098000901847, "flos": 16983162288000.0, "grad_norm": 2.0264232405349043, "language_loss": 0.69524455, "learning_rate": 3.316451371581431e-06, "loss": 0.71700811, "num_input_tokens_seen": 105124200, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8515625, "step": 4876, "time_per_iteration": 2.453352689743042 }, { "auxiliary_loss_clip": 0.01131881, "auxiliary_loss_mlp": 0.01039126, "balance_loss_clip": 1.02430797, "balance_loss_mlp": 1.04970884, "epoch": 0.29322110326168643, "flos": 16357705741440.0, "grad_norm": 2.296742564591341, "language_loss": 0.82271945, "learning_rate": 3.316158151823096e-06, "loss": 0.84442955, "num_input_tokens_seen": 105140400, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 4877, "time_per_iteration": 2.457329034805298 }, { "auxiliary_loss_clip": 0.01139607, "auxiliary_loss_mlp": 0.01042768, "balance_loss_clip": 1.02743745, "balance_loss_mlp": 1.05236602, "epoch": 0.29328122651435445, "flos": 13990869843840.0, "grad_norm": 2.182325894869162, "language_loss": 0.67529196, "learning_rate": 3.315864882155911e-06, "loss": 0.69711572, "num_input_tokens_seen": 105157535, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.875, "step": 4878, "time_per_iteration": 2.4429469108581543 }, { "auxiliary_loss_clip": 0.01135573, "auxiliary_loss_mlp": 0.01038701, "balance_loss_clip": 1.02357888, "balance_loss_mlp": 1.05210996, "epoch": 0.2933413497670224, "flos": 25264593423360.0, "grad_norm": 1.7765999169428521, "language_loss": 0.73468673, "learning_rate": 3.3155715625909982e-06, "loss": 0.75642943, "num_input_tokens_seen": 105175185, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8359375, "step": 4879, "time_per_iteration": 2.5178909301757812 }, { "auxiliary_loss_clip": 0.01140385, "auxiliary_loss_mlp": 0.0104819, "balance_loss_clip": 1.03088093, "balance_loss_mlp": 1.05348361, "epoch": 0.2934014730196904, "flos": 32123746656000.0, "grad_norm": 1.9687675958783744, "language_loss": 0.66129899, "learning_rate": 3.3152781931394803e-06, "loss": 0.68318474, "num_input_tokens_seen": 105194540, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.8671875, "step": 4880, "time_per_iteration": 2.55429744720459 }, { "auxiliary_loss_clip": 0.0113501, "auxiliary_loss_mlp": 0.01049771, "balance_loss_clip": 1.03414249, "balance_loss_mlp": 1.05016458, "epoch": 0.29346159627235835, "flos": 24352498344960.0, "grad_norm": 1.7966605042657295, "language_loss": 0.7029177, "learning_rate": 3.314984773812481e-06, "loss": 0.72476548, "num_input_tokens_seen": 105213215, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8515625, "step": 4881, "time_per_iteration": 2.504812240600586 }, { "auxiliary_loss_clip": 0.01134688, "auxiliary_loss_mlp": 0.01040751, "balance_loss_clip": 1.02471125, "balance_loss_mlp": 1.04966974, "epoch": 0.2935217195250263, "flos": 22746752749440.0, "grad_norm": 5.790855948968629, "language_loss": 0.83421218, "learning_rate": 3.314691304621127e-06, "loss": 0.85596651, "num_input_tokens_seen": 105231585, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8515625, "step": 4882, "time_per_iteration": 2.4786713123321533 }, { "auxiliary_loss_clip": 0.01139242, "auxiliary_loss_mlp": 0.01039494, "balance_loss_clip": 1.02288842, "balance_loss_mlp": 1.05141664, "epoch": 0.2935818427776943, "flos": 21725561088000.0, "grad_norm": 2.12609409524607, "language_loss": 0.71358162, "learning_rate": 3.314397785576548e-06, "loss": 0.73536903, "num_input_tokens_seen": 105250120, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.875, "step": 4883, "time_per_iteration": 2.518608808517456 }, { "auxiliary_loss_clip": 0.01136501, "auxiliary_loss_mlp": 0.01040918, "balance_loss_clip": 1.02465796, "balance_loss_mlp": 1.05108976, "epoch": 0.29364196603036224, "flos": 23804968354560.0, "grad_norm": 2.090992929976852, "language_loss": 0.92265987, "learning_rate": 3.3141042166898726e-06, "loss": 0.94443399, "num_input_tokens_seen": 105266065, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.85546875, "step": 4884, "time_per_iteration": 4.031891107559204 }, { "auxiliary_loss_clip": 0.01141503, "auxiliary_loss_mlp": 0.010399, "balance_loss_clip": 1.02456343, "balance_loss_mlp": 1.05498743, "epoch": 0.2937020892830302, "flos": 23470064922240.0, "grad_norm": 2.1752804410578728, "language_loss": 0.73499191, "learning_rate": 3.313810597972234e-06, "loss": 0.7568059, "num_input_tokens_seen": 105282155, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.86328125, "step": 4885, "time_per_iteration": 2.488344192504883 }, { "auxiliary_loss_clip": 0.01133994, "auxiliary_loss_mlp": 0.01043805, "balance_loss_clip": 1.02826571, "balance_loss_mlp": 1.04996514, "epoch": 0.2937622125356982, "flos": 24272740195200.0, "grad_norm": 2.0159265207950505, "language_loss": 0.85223728, "learning_rate": 3.3135169294347655e-06, "loss": 0.87401527, "num_input_tokens_seen": 105299225, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.83984375, "step": 4886, "time_per_iteration": 2.5084521770477295 }, { "auxiliary_loss_clip": 0.01136678, "auxiliary_loss_mlp": 0.01042901, "balance_loss_clip": 1.02710545, "balance_loss_mlp": 1.04959345, "epoch": 0.29382233578836614, "flos": 20662461233280.0, "grad_norm": 2.196863549250748, "language_loss": 0.7685945, "learning_rate": 3.313223211088603e-06, "loss": 0.79039031, "num_input_tokens_seen": 105315710, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.87109375, "step": 4887, "time_per_iteration": 3.8469979763031006 }, { "auxiliary_loss_clip": 0.01137622, "auxiliary_loss_mlp": 0.01043605, "balance_loss_clip": 1.02873957, "balance_loss_mlp": 1.0514667, "epoch": 0.2938824590410341, "flos": 16545052103040.0, "grad_norm": 2.333773912487663, "language_loss": 0.79357696, "learning_rate": 3.3129294429448855e-06, "loss": 0.81538916, "num_input_tokens_seen": 105333505, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.859375, "step": 4888, "time_per_iteration": 3.8465335369110107 }, { "auxiliary_loss_clip": 0.01135656, "auxiliary_loss_mlp": 0.01034464, "balance_loss_clip": 1.01946759, "balance_loss_mlp": 1.05113316, "epoch": 0.29394258229370207, "flos": 37925474382720.0, "grad_norm": 1.386867549185229, "language_loss": 0.55015659, "learning_rate": 3.3126356250147517e-06, "loss": 0.57185781, "num_input_tokens_seen": 105355605, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.84375, "step": 4889, "time_per_iteration": 4.057069301605225 }, { "auxiliary_loss_clip": 0.01138236, "auxiliary_loss_mlp": 0.01040502, "balance_loss_clip": 1.02408648, "balance_loss_mlp": 1.05133462, "epoch": 0.29400270554637004, "flos": 20044690197120.0, "grad_norm": 1.757928683994084, "language_loss": 0.84396696, "learning_rate": 3.3123417573093434e-06, "loss": 0.86575437, "num_input_tokens_seen": 105374225, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.87109375, "step": 4890, "time_per_iteration": 2.513556480407715 }, { "auxiliary_loss_clip": 0.01140181, "auxiliary_loss_mlp": 0.0104497, "balance_loss_clip": 1.02932954, "balance_loss_mlp": 1.05289268, "epoch": 0.294062828799038, "flos": 15266380775040.0, "grad_norm": 1.82848981258474, "language_loss": 0.72382212, "learning_rate": 3.3120478398398046e-06, "loss": 0.7456736, "num_input_tokens_seen": 105391565, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.87109375, "step": 4891, "time_per_iteration": 2.4394683837890625 }, { "auxiliary_loss_clip": 0.01137838, "auxiliary_loss_mlp": 0.01046333, "balance_loss_clip": 1.02911925, "balance_loss_mlp": 1.05142593, "epoch": 0.294122952051706, "flos": 22747147799040.0, "grad_norm": 1.6861966344038206, "language_loss": 0.77029967, "learning_rate": 3.3117538726172797e-06, "loss": 0.79214132, "num_input_tokens_seen": 105409840, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.86328125, "step": 4892, "time_per_iteration": 2.5010199546813965 }, { "auxiliary_loss_clip": 0.01133572, "auxiliary_loss_mlp": 0.01033112, "balance_loss_clip": 1.01724505, "balance_loss_mlp": 1.04896319, "epoch": 0.294183075304374, "flos": 24972891073920.0, "grad_norm": 1.6849987141711933, "language_loss": 0.78319395, "learning_rate": 3.3114598556529164e-06, "loss": 0.80486077, "num_input_tokens_seen": 105428645, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84765625, "step": 4893, "time_per_iteration": 2.5101959705352783 }, { "auxiliary_loss_clip": 0.01136498, "auxiliary_loss_mlp": 0.01045186, "balance_loss_clip": 1.02960539, "balance_loss_mlp": 1.05070007, "epoch": 0.29424319855704195, "flos": 30952986762240.0, "grad_norm": 1.8849184656798634, "language_loss": 0.84928423, "learning_rate": 3.311165788957864e-06, "loss": 0.87110114, "num_input_tokens_seen": 105447480, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.859375, "step": 4894, "time_per_iteration": 2.5779967308044434 }, { "auxiliary_loss_clip": 0.01136781, "auxiliary_loss_mlp": 0.01031614, "balance_loss_clip": 1.01656389, "balance_loss_mlp": 1.05065024, "epoch": 0.2943033218097099, "flos": 15231583474560.0, "grad_norm": 2.78340934918678, "language_loss": 0.90477884, "learning_rate": 3.310871672543274e-06, "loss": 0.92646277, "num_input_tokens_seen": 105464600, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.859375, "step": 4895, "time_per_iteration": 2.441826105117798 }, { "auxiliary_loss_clip": 0.01140024, "auxiliary_loss_mlp": 0.01040072, "balance_loss_clip": 1.02394295, "balance_loss_mlp": 1.05195141, "epoch": 0.2943634450623779, "flos": 21725884310400.0, "grad_norm": 1.9166110698733088, "language_loss": 0.86396801, "learning_rate": 3.3105775064202982e-06, "loss": 0.88576901, "num_input_tokens_seen": 105481510, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8828125, "step": 4896, "time_per_iteration": 2.5023510456085205 }, { "auxiliary_loss_clip": 0.01137694, "auxiliary_loss_mlp": 0.0104364, "balance_loss_clip": 1.02758288, "balance_loss_mlp": 1.05161953, "epoch": 0.29442356831504585, "flos": 22602104680320.0, "grad_norm": 1.756966574473799, "language_loss": 0.73110127, "learning_rate": 3.3102832906000924e-06, "loss": 0.75291455, "num_input_tokens_seen": 105501390, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.859375, "step": 4897, "time_per_iteration": 2.4742443561553955 }, { "auxiliary_loss_clip": 0.0114129, "auxiliary_loss_mlp": 0.01039967, "balance_loss_clip": 1.02264595, "balance_loss_mlp": 1.0499028, "epoch": 0.2944836915677138, "flos": 20011401267840.0, "grad_norm": 2.084283412785124, "language_loss": 0.73736829, "learning_rate": 3.309989025093813e-06, "loss": 0.75918084, "num_input_tokens_seen": 105519600, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.9140625, "step": 4898, "time_per_iteration": 2.485342264175415 }, { "auxiliary_loss_clip": 0.0114533, "auxiliary_loss_mlp": 0.01040216, "balance_loss_clip": 1.02074909, "balance_loss_mlp": 1.05458272, "epoch": 0.2945438148203818, "flos": 20045875345920.0, "grad_norm": 2.7159033551073213, "language_loss": 0.69510221, "learning_rate": 3.309694709912618e-06, "loss": 0.71695769, "num_input_tokens_seen": 105535970, "router_z_loss_clip": 0.19433594, "router_z_loss_mlp": 0.90625, "step": 4899, "time_per_iteration": 2.451815366744995 }, { "auxiliary_loss_clip": 0.01138503, "auxiliary_loss_mlp": 0.01039767, "balance_loss_clip": 1.02390063, "balance_loss_mlp": 1.05203199, "epoch": 0.29460393807304974, "flos": 23733542160000.0, "grad_norm": 1.9580504876935247, "language_loss": 0.7869373, "learning_rate": 3.3094003450676685e-06, "loss": 0.80871999, "num_input_tokens_seen": 105556735, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8671875, "step": 4900, "time_per_iteration": 2.532167911529541 }, { "auxiliary_loss_clip": 0.01132428, "auxiliary_loss_mlp": 0.01037512, "balance_loss_clip": 1.02241993, "balance_loss_mlp": 1.04736066, "epoch": 0.2946640613257177, "flos": 14976079056000.0, "grad_norm": 1.876525046447065, "language_loss": 0.80777192, "learning_rate": 3.3091059305701268e-06, "loss": 0.82947135, "num_input_tokens_seen": 105574875, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8515625, "step": 4901, "time_per_iteration": 2.4419894218444824 }, { "auxiliary_loss_clip": 0.0113075, "auxiliary_loss_mlp": 0.01033399, "balance_loss_clip": 1.01918364, "balance_loss_mlp": 1.04967868, "epoch": 0.2947241845783857, "flos": 24243904552320.0, "grad_norm": 2.085110021223738, "language_loss": 0.57265037, "learning_rate": 3.308811466431157e-06, "loss": 0.59429187, "num_input_tokens_seen": 105594225, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8125, "step": 4902, "time_per_iteration": 2.5329506397247314 }, { "auxiliary_loss_clip": 0.01134334, "auxiliary_loss_mlp": 0.01036784, "balance_loss_clip": 1.02207959, "balance_loss_mlp": 1.05025601, "epoch": 0.29478430783105364, "flos": 19938394874880.0, "grad_norm": 1.8218960291639847, "language_loss": 0.76071405, "learning_rate": 3.308516952661925e-06, "loss": 0.78242528, "num_input_tokens_seen": 105614000, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.83984375, "step": 4903, "time_per_iteration": 2.4744396209716797 }, { "auxiliary_loss_clip": 0.01135367, "auxiliary_loss_mlp": 0.01043299, "balance_loss_clip": 1.02678847, "balance_loss_mlp": 1.05005789, "epoch": 0.2948444310837216, "flos": 27381347856000.0, "grad_norm": 3.2070603830284923, "language_loss": 0.62783343, "learning_rate": 3.3082223892736e-06, "loss": 0.64962012, "num_input_tokens_seen": 105634575, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8515625, "step": 4904, "time_per_iteration": 2.543907403945923 }, { "auxiliary_loss_clip": 0.01137321, "auxiliary_loss_mlp": 0.01038079, "balance_loss_clip": 1.02286816, "balance_loss_mlp": 1.04923832, "epoch": 0.2949045543363896, "flos": 23405462311680.0, "grad_norm": 1.485726197736973, "language_loss": 0.73466974, "learning_rate": 3.3079277762773496e-06, "loss": 0.75642371, "num_input_tokens_seen": 105654385, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.87890625, "step": 4905, "time_per_iteration": 2.4963507652282715 }, { "auxiliary_loss_clip": 0.01133431, "auxiliary_loss_mlp": 0.01037759, "balance_loss_clip": 1.02211881, "balance_loss_mlp": 1.04982781, "epoch": 0.2949646775890576, "flos": 23951483930880.0, "grad_norm": 1.8356813224476898, "language_loss": 0.81719887, "learning_rate": 3.3076331136843476e-06, "loss": 0.83891076, "num_input_tokens_seen": 105673570, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 4906, "time_per_iteration": 2.5124640464782715 }, { "auxiliary_loss_clip": 0.01131648, "auxiliary_loss_mlp": 0.01035714, "balance_loss_clip": 1.02096772, "balance_loss_mlp": 1.04999065, "epoch": 0.29502480084172555, "flos": 22784315397120.0, "grad_norm": 1.7552146271473796, "language_loss": 0.87092328, "learning_rate": 3.3073384015057667e-06, "loss": 0.89259696, "num_input_tokens_seen": 105691940, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.81640625, "step": 4907, "time_per_iteration": 2.4951422214508057 }, { "auxiliary_loss_clip": 0.01136255, "auxiliary_loss_mlp": 0.01039535, "balance_loss_clip": 1.02311957, "balance_loss_mlp": 1.04977846, "epoch": 0.2950849240943935, "flos": 19646656611840.0, "grad_norm": 2.040439268106513, "language_loss": 0.81669867, "learning_rate": 3.307043639752782e-06, "loss": 0.83845657, "num_input_tokens_seen": 105709825, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8671875, "step": 4908, "time_per_iteration": 2.497197151184082 }, { "auxiliary_loss_clip": 0.01061779, "auxiliary_loss_mlp": 0.01004873, "balance_loss_clip": 1.00196457, "balance_loss_mlp": 1.0329268, "epoch": 0.2951450473470615, "flos": 71002829260800.0, "grad_norm": 1.2605998354092713, "language_loss": 0.57306087, "learning_rate": 3.3067488284365728e-06, "loss": 0.59372741, "num_input_tokens_seen": 105766880, "router_z_loss_clip": 0.02905273, "router_z_loss_mlp": 0.2890625, "step": 4909, "time_per_iteration": 2.9580399990081787 }, { "auxiliary_loss_clip": 0.01134511, "auxiliary_loss_mlp": 0.01038624, "balance_loss_clip": 1.02406859, "balance_loss_mlp": 1.05236208, "epoch": 0.29520517059972945, "flos": 22966310632320.0, "grad_norm": 1.4809233219450977, "language_loss": 0.86807066, "learning_rate": 3.3064539675683163e-06, "loss": 0.88980204, "num_input_tokens_seen": 105786875, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8203125, "step": 4910, "time_per_iteration": 2.5266175270080566 }, { "auxiliary_loss_clip": 0.01128086, "auxiliary_loss_mlp": 0.01036668, "balance_loss_clip": 1.02216029, "balance_loss_mlp": 1.04789901, "epoch": 0.2952652938523974, "flos": 20485673470080.0, "grad_norm": 2.071714319290404, "language_loss": 0.72942346, "learning_rate": 3.3061590571591946e-06, "loss": 0.75107098, "num_input_tokens_seen": 105805315, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8046875, "step": 4911, "time_per_iteration": 2.469198226928711 }, { "auxiliary_loss_clip": 0.01133208, "auxiliary_loss_mlp": 0.01031481, "balance_loss_clip": 1.01714015, "balance_loss_mlp": 1.05136585, "epoch": 0.2953254171050654, "flos": 19646584784640.0, "grad_norm": 1.7662501515669584, "language_loss": 0.89742893, "learning_rate": 3.3058640972203904e-06, "loss": 0.91907579, "num_input_tokens_seen": 105825125, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8203125, "step": 4912, "time_per_iteration": 2.489701271057129 }, { "auxiliary_loss_clip": 0.01131591, "auxiliary_loss_mlp": 0.01045454, "balance_loss_clip": 1.03009963, "balance_loss_mlp": 1.04889047, "epoch": 0.29538554035773334, "flos": 22747973811840.0, "grad_norm": 1.9600176640318991, "language_loss": 0.83156312, "learning_rate": 3.3055690877630894e-06, "loss": 0.85333359, "num_input_tokens_seen": 105846085, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.828125, "step": 4913, "time_per_iteration": 2.486558675765991 }, { "auxiliary_loss_clip": 0.01133452, "auxiliary_loss_mlp": 0.01040223, "balance_loss_clip": 1.02551293, "balance_loss_mlp": 1.04911208, "epoch": 0.2954456636104013, "flos": 21871861182720.0, "grad_norm": 1.8313135315475813, "language_loss": 0.76651835, "learning_rate": 3.3052740287984765e-06, "loss": 0.7882551, "num_input_tokens_seen": 105865400, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.84375, "step": 4914, "time_per_iteration": 2.5021023750305176 }, { "auxiliary_loss_clip": 0.01129635, "auxiliary_loss_mlp": 0.01038398, "balance_loss_clip": 1.0232408, "balance_loss_mlp": 1.04760695, "epoch": 0.2955057868630693, "flos": 40442560871040.0, "grad_norm": 9.369469044327262, "language_loss": 0.81748664, "learning_rate": 3.3049789203377424e-06, "loss": 0.83916694, "num_input_tokens_seen": 105887920, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8203125, "step": 4915, "time_per_iteration": 2.634644031524658 }, { "auxiliary_loss_clip": 0.01135203, "auxiliary_loss_mlp": 0.0103729, "balance_loss_clip": 1.02287745, "balance_loss_mlp": 1.05020857, "epoch": 0.29556591011573724, "flos": 22564506119040.0, "grad_norm": 2.3076882767685536, "language_loss": 0.84375745, "learning_rate": 3.3046837623920772e-06, "loss": 0.86548245, "num_input_tokens_seen": 105904035, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8515625, "step": 4916, "time_per_iteration": 2.4952447414398193 }, { "auxiliary_loss_clip": 0.01126896, "auxiliary_loss_mlp": 0.01032738, "balance_loss_clip": 1.01865923, "balance_loss_mlp": 1.04536104, "epoch": 0.2956260333684052, "flos": 22089300163200.0, "grad_norm": 2.318720103257628, "language_loss": 0.69840914, "learning_rate": 3.3043885549726723e-06, "loss": 0.72000551, "num_input_tokens_seen": 105922685, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8125, "step": 4917, "time_per_iteration": 2.468273639678955 }, { "auxiliary_loss_clip": 0.01133955, "auxiliary_loss_mlp": 0.01036495, "balance_loss_clip": 1.02165318, "balance_loss_mlp": 1.05066347, "epoch": 0.2956861566210732, "flos": 16435488643200.0, "grad_norm": 2.064503804590439, "language_loss": 0.90887678, "learning_rate": 3.3040932980907226e-06, "loss": 0.93058127, "num_input_tokens_seen": 105940425, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.83203125, "step": 4918, "time_per_iteration": 2.4652562141418457 }, { "auxiliary_loss_clip": 0.01136094, "auxiliary_loss_mlp": 0.01033901, "balance_loss_clip": 1.0193572, "balance_loss_mlp": 1.05157959, "epoch": 0.2957462798737412, "flos": 25812087500160.0, "grad_norm": 1.987901578353923, "language_loss": 0.72373939, "learning_rate": 3.303797991757425e-06, "loss": 0.74543929, "num_input_tokens_seen": 105960550, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.84375, "step": 4919, "time_per_iteration": 2.510037660598755 }, { "auxiliary_loss_clip": 0.01130626, "auxiliary_loss_mlp": 0.01040327, "balance_loss_clip": 1.02491903, "balance_loss_mlp": 1.04865503, "epoch": 0.29580640312640916, "flos": 16690849407360.0, "grad_norm": 1.9226252210805546, "language_loss": 0.75916851, "learning_rate": 3.3035026359839763e-06, "loss": 0.78087807, "num_input_tokens_seen": 105978820, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8203125, "step": 4920, "time_per_iteration": 2.4769701957702637 }, { "auxiliary_loss_clip": 0.01135928, "auxiliary_loss_mlp": 0.01042622, "balance_loss_clip": 1.02767348, "balance_loss_mlp": 1.05083394, "epoch": 0.2958665263790771, "flos": 23945594100480.0, "grad_norm": 2.443896061406595, "language_loss": 0.68430394, "learning_rate": 3.3032072307815774e-06, "loss": 0.7060895, "num_input_tokens_seen": 105997545, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8515625, "step": 4921, "time_per_iteration": 2.4949378967285156 }, { "auxiliary_loss_clip": 0.01136187, "auxiliary_loss_mlp": 0.01042335, "balance_loss_clip": 1.02558601, "balance_loss_mlp": 1.05066144, "epoch": 0.2959266496317451, "flos": 18478410670080.0, "grad_norm": 3.0685296259848216, "language_loss": 0.74092937, "learning_rate": 3.3029117761614298e-06, "loss": 0.76271462, "num_input_tokens_seen": 106015320, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.85546875, "step": 4922, "time_per_iteration": 2.4921207427978516 }, { "auxiliary_loss_clip": 0.01139183, "auxiliary_loss_mlp": 0.01034866, "balance_loss_clip": 1.01864195, "balance_loss_mlp": 1.05031407, "epoch": 0.29598677288441305, "flos": 25957489754880.0, "grad_norm": 4.187391954908238, "language_loss": 0.76268029, "learning_rate": 3.302616272134737e-06, "loss": 0.78442079, "num_input_tokens_seen": 106034555, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.88671875, "step": 4923, "time_per_iteration": 2.509535551071167 }, { "auxiliary_loss_clip": 0.01132526, "auxiliary_loss_mlp": 0.01037341, "balance_loss_clip": 1.02218962, "balance_loss_mlp": 1.04953921, "epoch": 0.296046896137081, "flos": 25155999630720.0, "grad_norm": 1.9765896286859317, "language_loss": 0.86268592, "learning_rate": 3.3023207187127042e-06, "loss": 0.88438463, "num_input_tokens_seen": 106054200, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.828125, "step": 4924, "time_per_iteration": 2.535447835922241 }, { "auxiliary_loss_clip": 0.01132386, "auxiliary_loss_mlp": 0.01031769, "balance_loss_clip": 1.01653421, "balance_loss_mlp": 1.04969633, "epoch": 0.296107019389749, "flos": 21761148487680.0, "grad_norm": 1.5280176362613846, "language_loss": 0.8190788, "learning_rate": 3.3020251159065396e-06, "loss": 0.8407203, "num_input_tokens_seen": 106074700, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.828125, "step": 4925, "time_per_iteration": 2.477691411972046 }, { "auxiliary_loss_clip": 0.01131388, "auxiliary_loss_mlp": 0.01033346, "balance_loss_clip": 1.01849246, "balance_loss_mlp": 1.04914212, "epoch": 0.29616714264241695, "flos": 17960039544960.0, "grad_norm": 2.384659788899005, "language_loss": 0.86445153, "learning_rate": 3.301729463727452e-06, "loss": 0.88609886, "num_input_tokens_seen": 106091415, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 4926, "time_per_iteration": 3.9566633701324463 }, { "auxiliary_loss_clip": 0.01133011, "auxiliary_loss_mlp": 0.01030718, "balance_loss_clip": 1.01562023, "balance_loss_mlp": 1.0475229, "epoch": 0.2962272658950849, "flos": 15012779777280.0, "grad_norm": 2.552886901466749, "language_loss": 0.86188185, "learning_rate": 3.3014337621866527e-06, "loss": 0.88351917, "num_input_tokens_seen": 106109135, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8515625, "step": 4927, "time_per_iteration": 2.4549758434295654 }, { "auxiliary_loss_clip": 0.01130209, "auxiliary_loss_mlp": 0.01038511, "balance_loss_clip": 1.02341938, "balance_loss_mlp": 1.04805589, "epoch": 0.2962873891477529, "flos": 14720861946240.0, "grad_norm": 3.3803046949435718, "language_loss": 0.80388069, "learning_rate": 3.3011380112953553e-06, "loss": 0.8255679, "num_input_tokens_seen": 106125750, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8203125, "step": 4928, "time_per_iteration": 3.8300623893737793 }, { "auxiliary_loss_clip": 0.01142802, "auxiliary_loss_mlp": 0.01043352, "balance_loss_clip": 1.02511311, "balance_loss_mlp": 1.05269933, "epoch": 0.29634751240042084, "flos": 26723787528960.0, "grad_norm": 2.4949058982978407, "language_loss": 0.72462744, "learning_rate": 3.300842211064773e-06, "loss": 0.74648893, "num_input_tokens_seen": 106142835, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.8984375, "step": 4929, "time_per_iteration": 3.9365856647491455 }, { "auxiliary_loss_clip": 0.01136746, "auxiliary_loss_mlp": 0.01042755, "balance_loss_clip": 1.02602994, "balance_loss_mlp": 1.04992735, "epoch": 0.2964076356530888, "flos": 14571293713920.0, "grad_norm": 2.407092863572812, "language_loss": 0.72224295, "learning_rate": 3.3005463615061246e-06, "loss": 0.74403799, "num_input_tokens_seen": 106160680, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8671875, "step": 4930, "time_per_iteration": 2.4417147636413574 }, { "auxiliary_loss_clip": 0.01054517, "auxiliary_loss_mlp": 0.01004489, "balance_loss_clip": 1.00138938, "balance_loss_mlp": 1.02536416, "epoch": 0.29646775890575683, "flos": 63104315063040.0, "grad_norm": 0.8034984351863772, "language_loss": 0.6058948, "learning_rate": 3.3002504626306275e-06, "loss": 0.62648487, "num_input_tokens_seen": 106224415, "router_z_loss_clip": 0.03088379, "router_z_loss_mlp": 0.29101562, "step": 4931, "time_per_iteration": 4.481427907943726 }, { "auxiliary_loss_clip": 0.01053311, "auxiliary_loss_mlp": 0.01003923, "balance_loss_clip": 1.00068021, "balance_loss_mlp": 1.0239135, "epoch": 0.2965278821584248, "flos": 63067686168960.0, "grad_norm": 0.7572047051347746, "language_loss": 0.52384412, "learning_rate": 3.2999545144495023e-06, "loss": 0.54441649, "num_input_tokens_seen": 106279140, "router_z_loss_clip": 0.0324707, "router_z_loss_mlp": 0.29296875, "step": 4932, "time_per_iteration": 2.9796953201293945 }, { "auxiliary_loss_clip": 0.01130485, "auxiliary_loss_mlp": 0.01034381, "balance_loss_clip": 1.01899123, "balance_loss_mlp": 1.04837108, "epoch": 0.29658800541109276, "flos": 23768734510080.0, "grad_norm": 2.3919579583543715, "language_loss": 0.81501746, "learning_rate": 3.299658516973972e-06, "loss": 0.83666611, "num_input_tokens_seen": 106298190, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8203125, "step": 4933, "time_per_iteration": 2.4887821674346924 }, { "auxiliary_loss_clip": 0.01129149, "auxiliary_loss_mlp": 0.01028045, "balance_loss_clip": 1.01314425, "balance_loss_mlp": 1.04966712, "epoch": 0.2966481286637607, "flos": 23988543788160.0, "grad_norm": 1.9526233852114032, "language_loss": 0.75399303, "learning_rate": 3.299362470215261e-06, "loss": 0.77556503, "num_input_tokens_seen": 106319065, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.796875, "step": 4934, "time_per_iteration": 2.5272817611694336 }, { "auxiliary_loss_clip": 0.01136388, "auxiliary_loss_mlp": 0.0104447, "balance_loss_clip": 1.02823949, "balance_loss_mlp": 1.04943252, "epoch": 0.2967082519164287, "flos": 17165157523200.0, "grad_norm": 2.5600699684699446, "language_loss": 0.61807686, "learning_rate": 3.299066374184594e-06, "loss": 0.63988543, "num_input_tokens_seen": 106338040, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.87109375, "step": 4935, "time_per_iteration": 2.4467251300811768 }, { "auxiliary_loss_clip": 0.0113333, "auxiliary_loss_mlp": 0.01040785, "balance_loss_clip": 1.02476323, "balance_loss_mlp": 1.05011058, "epoch": 0.29676837516909665, "flos": 29387712816000.0, "grad_norm": 1.6258704728689664, "language_loss": 0.79857051, "learning_rate": 3.2987702288932e-06, "loss": 0.82031167, "num_input_tokens_seen": 106358900, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83203125, "step": 4936, "time_per_iteration": 2.5564796924591064 }, { "auxiliary_loss_clip": 0.01137519, "auxiliary_loss_mlp": 0.0104717, "balance_loss_clip": 1.03105307, "balance_loss_mlp": 1.05137658, "epoch": 0.2968284984217646, "flos": 34751222616960.0, "grad_norm": 1.8470604681918037, "language_loss": 0.74279904, "learning_rate": 3.298474034352309e-06, "loss": 0.76464593, "num_input_tokens_seen": 106381805, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.86328125, "step": 4937, "time_per_iteration": 2.5897505283355713 }, { "auxiliary_loss_clip": 0.01134656, "auxiliary_loss_mlp": 0.01037642, "balance_loss_clip": 1.02167964, "balance_loss_mlp": 1.05088818, "epoch": 0.2968886216744326, "flos": 21544104556800.0, "grad_norm": 1.9768000718396639, "language_loss": 0.77934164, "learning_rate": 3.2981777905731526e-06, "loss": 0.80106461, "num_input_tokens_seen": 106402365, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8359375, "step": 4938, "time_per_iteration": 2.4968409538269043 }, { "auxiliary_loss_clip": 0.0113713, "auxiliary_loss_mlp": 0.01041453, "balance_loss_clip": 1.02524018, "balance_loss_mlp": 1.05053115, "epoch": 0.29694874492710055, "flos": 12787323811200.0, "grad_norm": 2.3796001871899906, "language_loss": 0.77032131, "learning_rate": 3.297881497566964e-06, "loss": 0.79210716, "num_input_tokens_seen": 106419800, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8671875, "step": 4939, "time_per_iteration": 2.4371719360351562 }, { "auxiliary_loss_clip": 0.01137831, "auxiliary_loss_mlp": 0.01037636, "balance_loss_clip": 1.02162659, "balance_loss_mlp": 1.05056691, "epoch": 0.2970088681797685, "flos": 24569973239040.0, "grad_norm": 1.6459352115337862, "language_loss": 0.78099447, "learning_rate": 3.297585155344979e-06, "loss": 0.80274916, "num_input_tokens_seen": 106440300, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.875, "step": 4940, "time_per_iteration": 2.5301568508148193 }, { "auxiliary_loss_clip": 0.01137799, "auxiliary_loss_mlp": 0.01037768, "balance_loss_clip": 1.01976788, "balance_loss_mlp": 1.05091834, "epoch": 0.2970689914324365, "flos": 23659171050240.0, "grad_norm": 1.5400356515733777, "language_loss": 0.75575888, "learning_rate": 3.297288763918435e-06, "loss": 0.77751452, "num_input_tokens_seen": 106460035, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.8671875, "step": 4941, "time_per_iteration": 2.4902966022491455 }, { "auxiliary_loss_clip": 0.01140944, "auxiliary_loss_mlp": 0.0105285, "balance_loss_clip": 1.03592265, "balance_loss_mlp": 1.05216932, "epoch": 0.29712911468510445, "flos": 39670301439360.0, "grad_norm": 2.2077760274021694, "language_loss": 0.73758948, "learning_rate": 3.2969923232985712e-06, "loss": 0.75952744, "num_input_tokens_seen": 106481095, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.88671875, "step": 4942, "time_per_iteration": 2.6756656169891357 }, { "auxiliary_loss_clip": 0.01139874, "auxiliary_loss_mlp": 0.01041, "balance_loss_clip": 1.02416801, "balance_loss_mlp": 1.05087817, "epoch": 0.2971892379377724, "flos": 26395312631040.0, "grad_norm": 2.1113311971619466, "language_loss": 0.69969457, "learning_rate": 3.2966958334966287e-06, "loss": 0.72150338, "num_input_tokens_seen": 106501590, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.890625, "step": 4943, "time_per_iteration": 2.5319998264312744 }, { "auxiliary_loss_clip": 0.01139704, "auxiliary_loss_mlp": 0.01038369, "balance_loss_clip": 1.02197778, "balance_loss_mlp": 1.05178308, "epoch": 0.2972493611904404, "flos": 17603195880960.0, "grad_norm": 2.236091242903778, "language_loss": 0.79572487, "learning_rate": 3.2963992945238497e-06, "loss": 0.8175056, "num_input_tokens_seen": 106519430, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.87890625, "step": 4944, "time_per_iteration": 2.449038505554199 }, { "auxiliary_loss_clip": 0.01131168, "auxiliary_loss_mlp": 0.01037474, "balance_loss_clip": 1.02222097, "balance_loss_mlp": 1.04892802, "epoch": 0.2973094844431084, "flos": 20412774817920.0, "grad_norm": 2.687126348810588, "language_loss": 0.82974905, "learning_rate": 3.2961027063914795e-06, "loss": 0.85143548, "num_input_tokens_seen": 106535870, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.82421875, "step": 4945, "time_per_iteration": 2.4960265159606934 }, { "auxiliary_loss_clip": 0.01131986, "auxiliary_loss_mlp": 0.01036377, "balance_loss_clip": 1.02097571, "balance_loss_mlp": 1.05009055, "epoch": 0.29736960769577636, "flos": 17493488766720.0, "grad_norm": 3.886932501793515, "language_loss": 0.66672689, "learning_rate": 3.2958060691107654e-06, "loss": 0.68841058, "num_input_tokens_seen": 106553560, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8203125, "step": 4946, "time_per_iteration": 2.4407660961151123 }, { "auxiliary_loss_clip": 0.01138702, "auxiliary_loss_mlp": 0.01029796, "balance_loss_clip": 1.01454866, "balance_loss_mlp": 1.0529182, "epoch": 0.2974297309484443, "flos": 26103969417600.0, "grad_norm": 1.780954045879929, "language_loss": 0.73989969, "learning_rate": 3.2955093826929547e-06, "loss": 0.76158464, "num_input_tokens_seen": 106574115, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.859375, "step": 4947, "time_per_iteration": 2.5438690185546875 }, { "auxiliary_loss_clip": 0.01140362, "auxiliary_loss_mlp": 0.0104103, "balance_loss_clip": 1.0239594, "balance_loss_mlp": 1.05306184, "epoch": 0.2974898542011123, "flos": 25666433850240.0, "grad_norm": 1.9096219126154155, "language_loss": 0.7320891, "learning_rate": 3.2952126471492985e-06, "loss": 0.75390303, "num_input_tokens_seen": 106593070, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.875, "step": 4948, "time_per_iteration": 2.5128672122955322 }, { "auxiliary_loss_clip": 0.01129952, "auxiliary_loss_mlp": 0.01034102, "balance_loss_clip": 1.01930797, "balance_loss_mlp": 1.04677439, "epoch": 0.29754997745378026, "flos": 18661339658880.0, "grad_norm": 1.9499523325222476, "language_loss": 0.83634353, "learning_rate": 3.2949158624910497e-06, "loss": 0.85798407, "num_input_tokens_seen": 106610695, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.83203125, "step": 4949, "time_per_iteration": 2.4549221992492676 }, { "auxiliary_loss_clip": 0.01130868, "auxiliary_loss_mlp": 0.01037563, "balance_loss_clip": 1.02132678, "balance_loss_mlp": 1.04795539, "epoch": 0.2976101007064482, "flos": 22274599449600.0, "grad_norm": 1.8997591398200053, "language_loss": 0.71264124, "learning_rate": 3.2946190287294603e-06, "loss": 0.73432559, "num_input_tokens_seen": 106631300, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.828125, "step": 4950, "time_per_iteration": 2.490908622741699 }, { "auxiliary_loss_clip": 0.01130127, "auxiliary_loss_mlp": 0.01038255, "balance_loss_clip": 1.02325869, "balance_loss_mlp": 1.0498116, "epoch": 0.2976702239591162, "flos": 21945657674880.0, "grad_norm": 3.349450503549352, "language_loss": 0.82659131, "learning_rate": 3.294322145875789e-06, "loss": 0.84827507, "num_input_tokens_seen": 106650065, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8046875, "step": 4951, "time_per_iteration": 2.495171070098877 }, { "auxiliary_loss_clip": 0.01131655, "auxiliary_loss_mlp": 0.01030887, "balance_loss_clip": 1.01481724, "balance_loss_mlp": 1.04719925, "epoch": 0.29773034721178415, "flos": 24637197542400.0, "grad_norm": 15.765803135184223, "language_loss": 0.73949528, "learning_rate": 3.2940252139412912e-06, "loss": 0.76112062, "num_input_tokens_seen": 106668230, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84375, "step": 4952, "time_per_iteration": 2.496713638305664 }, { "auxiliary_loss_clip": 0.01135675, "auxiliary_loss_mlp": 0.01036099, "balance_loss_clip": 1.01971912, "balance_loss_mlp": 1.0510639, "epoch": 0.2977904704644521, "flos": 20557566541440.0, "grad_norm": 2.7697219667442052, "language_loss": 0.84252644, "learning_rate": 3.293728232937228e-06, "loss": 0.86424422, "num_input_tokens_seen": 106687785, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.84765625, "step": 4953, "time_per_iteration": 2.4734888076782227 }, { "auxiliary_loss_clip": 0.01136041, "auxiliary_loss_mlp": 0.01036372, "balance_loss_clip": 1.0213871, "balance_loss_mlp": 1.05075574, "epoch": 0.2978505937171201, "flos": 18916449027840.0, "grad_norm": 2.0128704022431676, "language_loss": 0.73601943, "learning_rate": 3.2934312028748597e-06, "loss": 0.75774354, "num_input_tokens_seen": 106706875, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8515625, "step": 4954, "time_per_iteration": 2.461686134338379 }, { "auxiliary_loss_clip": 0.01129717, "auxiliary_loss_mlp": 0.01032191, "balance_loss_clip": 1.01734948, "balance_loss_mlp": 1.0475769, "epoch": 0.29791071696978805, "flos": 19317750750720.0, "grad_norm": 2.6030912277397915, "language_loss": 0.75656223, "learning_rate": 3.293134123765452e-06, "loss": 0.77818131, "num_input_tokens_seen": 106725105, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 4955, "time_per_iteration": 2.4485929012298584 }, { "auxiliary_loss_clip": 0.01138067, "auxiliary_loss_mlp": 0.01036781, "balance_loss_clip": 1.0211761, "balance_loss_mlp": 1.05162215, "epoch": 0.297970840222456, "flos": 18806813740800.0, "grad_norm": 2.0994300518794224, "language_loss": 0.72511888, "learning_rate": 3.2928369956202684e-06, "loss": 0.74686742, "num_input_tokens_seen": 106744780, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.86328125, "step": 4956, "time_per_iteration": 2.4921281337738037 }, { "auxiliary_loss_clip": 0.01138959, "auxiliary_loss_mlp": 0.01040699, "balance_loss_clip": 1.02396214, "balance_loss_mlp": 1.05084896, "epoch": 0.298030963475124, "flos": 22852760762880.0, "grad_norm": 2.015260502157524, "language_loss": 0.78968501, "learning_rate": 3.2925398184505754e-06, "loss": 0.8114816, "num_input_tokens_seen": 106764670, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8828125, "step": 4957, "time_per_iteration": 2.4782888889312744 }, { "auxiliary_loss_clip": 0.01136211, "auxiliary_loss_mlp": 0.01041789, "balance_loss_clip": 1.0252068, "balance_loss_mlp": 1.05080211, "epoch": 0.298091086727792, "flos": 21868485304320.0, "grad_norm": 2.9945356935038743, "language_loss": 0.70565212, "learning_rate": 3.2922425922676437e-06, "loss": 0.72743213, "num_input_tokens_seen": 106783695, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8515625, "step": 4958, "time_per_iteration": 2.4903042316436768 }, { "auxiliary_loss_clip": 0.01134882, "auxiliary_loss_mlp": 0.01043343, "balance_loss_clip": 1.02738094, "balance_loss_mlp": 1.05250633, "epoch": 0.29815120998045996, "flos": 21175014355200.0, "grad_norm": 1.6013873499180915, "language_loss": 0.78771389, "learning_rate": 3.291945317082743e-06, "loss": 0.8094961, "num_input_tokens_seen": 106803150, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.82421875, "step": 4959, "time_per_iteration": 2.475919246673584 }, { "auxiliary_loss_clip": 0.01133842, "auxiliary_loss_mlp": 0.01046632, "balance_loss_clip": 1.03100419, "balance_loss_mlp": 1.0500561, "epoch": 0.29821133323312793, "flos": 19896271200000.0, "grad_norm": 3.060585672851324, "language_loss": 0.7942571, "learning_rate": 3.291647992907147e-06, "loss": 0.81606185, "num_input_tokens_seen": 106820705, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 4960, "time_per_iteration": 2.483855724334717 }, { "auxiliary_loss_clip": 0.01138721, "auxiliary_loss_mlp": 0.01042414, "balance_loss_clip": 1.02506948, "balance_loss_mlp": 1.05125475, "epoch": 0.2982714564857959, "flos": 12750766744320.0, "grad_norm": 2.5140256256200746, "language_loss": 0.74223185, "learning_rate": 3.291350619752129e-06, "loss": 0.76404321, "num_input_tokens_seen": 106837335, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.875, "step": 4961, "time_per_iteration": 2.455265522003174 }, { "auxiliary_loss_clip": 0.01137035, "auxiliary_loss_mlp": 0.01037961, "balance_loss_clip": 1.02274966, "balance_loss_mlp": 1.05182457, "epoch": 0.29833157973846386, "flos": 22271905929600.0, "grad_norm": 3.333511218015046, "language_loss": 0.61975825, "learning_rate": 3.291053197628967e-06, "loss": 0.64150822, "num_input_tokens_seen": 106856250, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8515625, "step": 4962, "time_per_iteration": 2.5195369720458984 }, { "auxiliary_loss_clip": 0.01137722, "auxiliary_loss_mlp": 0.01042683, "balance_loss_clip": 1.02643514, "balance_loss_mlp": 1.05283642, "epoch": 0.2983917029911318, "flos": 15372999319680.0, "grad_norm": 1.7292819068464367, "language_loss": 0.83248568, "learning_rate": 3.2907557265489375e-06, "loss": 0.85428971, "num_input_tokens_seen": 106873370, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.84765625, "step": 4963, "time_per_iteration": 2.4506096839904785 }, { "auxiliary_loss_clip": 0.01139063, "auxiliary_loss_mlp": 0.01035474, "balance_loss_clip": 1.01922607, "balance_loss_mlp": 1.05502129, "epoch": 0.2984518262437998, "flos": 15377632174080.0, "grad_norm": 2.6072586430959412, "language_loss": 0.66441607, "learning_rate": 3.290458206523322e-06, "loss": 0.68616152, "num_input_tokens_seen": 106890330, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.83984375, "step": 4964, "time_per_iteration": 2.470897674560547 }, { "auxiliary_loss_clip": 0.01133835, "auxiliary_loss_mlp": 0.01032994, "balance_loss_clip": 1.01892734, "balance_loss_mlp": 1.0505178, "epoch": 0.29851194949646775, "flos": 18108458542080.0, "grad_norm": 1.9014369584578257, "language_loss": 0.71086961, "learning_rate": 3.2901606375634015e-06, "loss": 0.73253793, "num_input_tokens_seen": 106909190, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8359375, "step": 4965, "time_per_iteration": 2.4716126918792725 }, { "auxiliary_loss_clip": 0.01143633, "auxiliary_loss_mlp": 0.0104506, "balance_loss_clip": 1.02862084, "balance_loss_mlp": 1.05732727, "epoch": 0.2985720727491357, "flos": 22018233104640.0, "grad_norm": 1.9606458244527527, "language_loss": 0.66310799, "learning_rate": 3.289863019680461e-06, "loss": 0.68499494, "num_input_tokens_seen": 106927825, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.86328125, "step": 4966, "time_per_iteration": 2.4969794750213623 }, { "auxiliary_loss_clip": 0.01141781, "auxiliary_loss_mlp": 0.01040339, "balance_loss_clip": 1.02499676, "balance_loss_mlp": 1.05580258, "epoch": 0.2986321960018037, "flos": 13041355772160.0, "grad_norm": 3.2052745044313, "language_loss": 0.73942691, "learning_rate": 3.289565352885785e-06, "loss": 0.76124817, "num_input_tokens_seen": 106943155, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.859375, "step": 4967, "time_per_iteration": 2.444897174835205 }, { "auxiliary_loss_clip": 0.01133351, "auxiliary_loss_mlp": 0.01033354, "balance_loss_clip": 1.01885247, "balance_loss_mlp": 1.04929352, "epoch": 0.29869231925447165, "flos": 14465034305280.0, "grad_norm": 3.1996541941619516, "language_loss": 0.71319234, "learning_rate": 3.2892676371906614e-06, "loss": 0.73485935, "num_input_tokens_seen": 106960295, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.83984375, "step": 4968, "time_per_iteration": 3.919342041015625 }, { "auxiliary_loss_clip": 0.01137291, "auxiliary_loss_mlp": 0.01034932, "balance_loss_clip": 1.01917243, "balance_loss_mlp": 1.0512774, "epoch": 0.2987524425071396, "flos": 31650228639360.0, "grad_norm": 1.773886004319386, "language_loss": 0.76375067, "learning_rate": 3.2889698726063805e-06, "loss": 0.78547293, "num_input_tokens_seen": 106982870, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.859375, "step": 4969, "time_per_iteration": 2.604611396789551 }, { "auxiliary_loss_clip": 0.01135732, "auxiliary_loss_mlp": 0.01034614, "balance_loss_clip": 1.0201304, "balance_loss_mlp": 1.05086327, "epoch": 0.2988125657598076, "flos": 21433427775360.0, "grad_norm": 2.0147271191338554, "language_loss": 0.70039082, "learning_rate": 3.2886720591442327e-06, "loss": 0.72209424, "num_input_tokens_seen": 107002405, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.84765625, "step": 4970, "time_per_iteration": 3.89534330368042 }, { "auxiliary_loss_clip": 0.01139811, "auxiliary_loss_mlp": 0.01043656, "balance_loss_clip": 1.02718174, "balance_loss_mlp": 1.05160975, "epoch": 0.2988726890124756, "flos": 18076965292800.0, "grad_norm": 2.466732945353511, "language_loss": 0.84848183, "learning_rate": 3.2883741968155103e-06, "loss": 0.87031651, "num_input_tokens_seen": 107017310, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8828125, "step": 4971, "time_per_iteration": 3.835563898086548 }, { "auxiliary_loss_clip": 0.01135059, "auxiliary_loss_mlp": 0.01043002, "balance_loss_clip": 1.02738547, "balance_loss_mlp": 1.05325437, "epoch": 0.29893281226514357, "flos": 21755653706880.0, "grad_norm": 2.873332851146429, "language_loss": 0.793401, "learning_rate": 3.2880762856315107e-06, "loss": 0.81518161, "num_input_tokens_seen": 107034645, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8203125, "step": 4972, "time_per_iteration": 4.025270462036133 }, { "auxiliary_loss_clip": 0.01135198, "auxiliary_loss_mlp": 0.01045367, "balance_loss_clip": 1.02991772, "balance_loss_mlp": 1.05067182, "epoch": 0.29899293551781153, "flos": 16836718538880.0, "grad_norm": 2.36283028834508, "language_loss": 0.85313916, "learning_rate": 3.2877783256035285e-06, "loss": 0.87494475, "num_input_tokens_seen": 107051125, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84375, "step": 4973, "time_per_iteration": 2.4575235843658447 }, { "auxiliary_loss_clip": 0.01129525, "auxiliary_loss_mlp": 0.01036685, "balance_loss_clip": 1.02229023, "balance_loss_mlp": 1.05152035, "epoch": 0.2990530587704795, "flos": 11729215946880.0, "grad_norm": 1.7724696075160449, "language_loss": 0.77683556, "learning_rate": 3.287480316742863e-06, "loss": 0.79849756, "num_input_tokens_seen": 107068815, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.78125, "step": 4974, "time_per_iteration": 2.4490790367126465 }, { "auxiliary_loss_clip": 0.01138335, "auxiliary_loss_mlp": 0.01043148, "balance_loss_clip": 1.02819335, "balance_loss_mlp": 1.05301666, "epoch": 0.29911318202314746, "flos": 28039877850240.0, "grad_norm": 2.0609128359115285, "language_loss": 0.71962965, "learning_rate": 3.287182259060815e-06, "loss": 0.74144447, "num_input_tokens_seen": 107090420, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8515625, "step": 4975, "time_per_iteration": 2.5601885318756104 }, { "auxiliary_loss_clip": 0.01134512, "auxiliary_loss_mlp": 0.01038381, "balance_loss_clip": 1.02202582, "balance_loss_mlp": 1.05184102, "epoch": 0.2991733052758154, "flos": 18733555952640.0, "grad_norm": 2.2671833970958852, "language_loss": 0.75621992, "learning_rate": 3.286884152568687e-06, "loss": 0.77794886, "num_input_tokens_seen": 107107255, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.828125, "step": 4976, "time_per_iteration": 2.460477352142334 }, { "auxiliary_loss_clip": 0.0113251, "auxiliary_loss_mlp": 0.01038817, "balance_loss_clip": 1.02411246, "balance_loss_mlp": 1.05086184, "epoch": 0.2992334285284834, "flos": 15559160532480.0, "grad_norm": 2.593808149119799, "language_loss": 0.8587873, "learning_rate": 3.2865859972777827e-06, "loss": 0.88050056, "num_input_tokens_seen": 107123840, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.81640625, "step": 4977, "time_per_iteration": 2.458617687225342 }, { "auxiliary_loss_clip": 0.01136274, "auxiliary_loss_mlp": 0.01038515, "balance_loss_clip": 1.02357793, "balance_loss_mlp": 1.05269384, "epoch": 0.29929355178115136, "flos": 21797561900160.0, "grad_norm": 1.583546828017816, "language_loss": 0.68318206, "learning_rate": 3.2862877931994088e-06, "loss": 0.70492995, "num_input_tokens_seen": 107143475, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8359375, "step": 4978, "time_per_iteration": 2.5031445026397705 }, { "auxiliary_loss_clip": 0.01140665, "auxiliary_loss_mlp": 0.0103279, "balance_loss_clip": 1.01745939, "balance_loss_mlp": 1.05616844, "epoch": 0.2993536750338193, "flos": 21178533888000.0, "grad_norm": 2.183143166794293, "language_loss": 0.75839329, "learning_rate": 3.2859895403448726e-06, "loss": 0.78012788, "num_input_tokens_seen": 107161725, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.84375, "step": 4979, "time_per_iteration": 2.5053913593292236 }, { "auxiliary_loss_clip": 0.0113373, "auxiliary_loss_mlp": 0.0103918, "balance_loss_clip": 1.02345037, "balance_loss_mlp": 1.04842246, "epoch": 0.2994137982864873, "flos": 32122130544000.0, "grad_norm": 2.7467613974567726, "language_loss": 0.68916059, "learning_rate": 3.285691238725484e-06, "loss": 0.7108897, "num_input_tokens_seen": 107183935, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8515625, "step": 4980, "time_per_iteration": 2.56997013092041 }, { "auxiliary_loss_clip": 0.01135417, "auxiliary_loss_mlp": 0.01039923, "balance_loss_clip": 1.02442622, "balance_loss_mlp": 1.05397391, "epoch": 0.29947392153915525, "flos": 21105419754240.0, "grad_norm": 3.973030444023802, "language_loss": 0.73507071, "learning_rate": 3.285392888352555e-06, "loss": 0.75682414, "num_input_tokens_seen": 107204285, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8125, "step": 4981, "time_per_iteration": 2.5300121307373047 }, { "auxiliary_loss_clip": 0.01137181, "auxiliary_loss_mlp": 0.01038552, "balance_loss_clip": 1.02369845, "balance_loss_mlp": 1.04943824, "epoch": 0.2995340447918232, "flos": 21542632099200.0, "grad_norm": 1.7059258802517956, "language_loss": 0.86470044, "learning_rate": 3.2850944892373987e-06, "loss": 0.88645774, "num_input_tokens_seen": 107225265, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.875, "step": 4982, "time_per_iteration": 2.502901315689087 }, { "auxiliary_loss_clip": 0.01141142, "auxiliary_loss_mlp": 0.01036706, "balance_loss_clip": 1.01968288, "balance_loss_mlp": 1.05318522, "epoch": 0.2995941680444912, "flos": 16725143917440.0, "grad_norm": 4.065261910494673, "language_loss": 0.87248844, "learning_rate": 3.2847960413913307e-06, "loss": 0.8942669, "num_input_tokens_seen": 107241335, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8828125, "step": 4983, "time_per_iteration": 2.463650941848755 }, { "auxiliary_loss_clip": 0.01136544, "auxiliary_loss_mlp": 0.01039229, "balance_loss_clip": 1.02444685, "balance_loss_mlp": 1.05231416, "epoch": 0.2996542912971592, "flos": 20923496346240.0, "grad_norm": 2.1501152792858105, "language_loss": 0.78644919, "learning_rate": 3.284497544825668e-06, "loss": 0.80820692, "num_input_tokens_seen": 107259375, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.84375, "step": 4984, "time_per_iteration": 2.491938829421997 }, { "auxiliary_loss_clip": 0.0113995, "auxiliary_loss_mlp": 0.01039977, "balance_loss_clip": 1.02402723, "balance_loss_mlp": 1.05445993, "epoch": 0.29971441454982717, "flos": 25079868754560.0, "grad_norm": 2.0392912918848034, "language_loss": 0.78614289, "learning_rate": 3.2841989995517303e-06, "loss": 0.80794215, "num_input_tokens_seen": 107279890, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.85546875, "step": 4985, "time_per_iteration": 2.534486770629883 }, { "auxiliary_loss_clip": 0.01142697, "auxiliary_loss_mlp": 0.01038838, "balance_loss_clip": 1.02167153, "balance_loss_mlp": 1.05407143, "epoch": 0.29977453780249513, "flos": 52555911840000.0, "grad_norm": 2.376834591381445, "language_loss": 0.71390378, "learning_rate": 3.283900405580837e-06, "loss": 0.73571914, "num_input_tokens_seen": 107303430, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.88671875, "step": 4986, "time_per_iteration": 2.7710399627685547 }, { "auxiliary_loss_clip": 0.01141215, "auxiliary_loss_mlp": 0.01038951, "balance_loss_clip": 1.02260792, "balance_loss_mlp": 1.05273032, "epoch": 0.2998346610551631, "flos": 22237144542720.0, "grad_norm": 2.0310356588508123, "language_loss": 0.73149323, "learning_rate": 3.283601762924312e-06, "loss": 0.75329489, "num_input_tokens_seen": 107323700, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.88671875, "step": 4987, "time_per_iteration": 2.499264717102051 }, { "auxiliary_loss_clip": 0.01133557, "auxiliary_loss_mlp": 0.01036503, "balance_loss_clip": 1.02158368, "balance_loss_mlp": 1.05064321, "epoch": 0.29989478430783106, "flos": 16873203778560.0, "grad_norm": 3.0655098707713266, "language_loss": 0.79889089, "learning_rate": 3.2833030715934793e-06, "loss": 0.82059145, "num_input_tokens_seen": 107341965, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.828125, "step": 4988, "time_per_iteration": 2.4536986351013184 }, { "auxiliary_loss_clip": 0.01135338, "auxiliary_loss_mlp": 0.01042406, "balance_loss_clip": 1.02668881, "balance_loss_mlp": 1.0517031, "epoch": 0.29995490756049903, "flos": 23768878164480.0, "grad_norm": 1.5386814657115147, "language_loss": 0.71009552, "learning_rate": 3.2830043315996658e-06, "loss": 0.73187292, "num_input_tokens_seen": 107362615, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8359375, "step": 4989, "time_per_iteration": 2.5255329608917236 }, { "auxiliary_loss_clip": 0.01140756, "auxiliary_loss_mlp": 0.01039653, "balance_loss_clip": 1.02314258, "balance_loss_mlp": 1.05370617, "epoch": 0.300015030813167, "flos": 14465321614080.0, "grad_norm": 2.4802473859014174, "language_loss": 0.84810346, "learning_rate": 3.282705542954199e-06, "loss": 0.86990756, "num_input_tokens_seen": 107378980, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8671875, "step": 4990, "time_per_iteration": 2.4430253505706787 }, { "auxiliary_loss_clip": 0.01141067, "auxiliary_loss_mlp": 0.01033585, "balance_loss_clip": 1.01677632, "balance_loss_mlp": 1.05288148, "epoch": 0.30007515406583496, "flos": 25191982080000.0, "grad_norm": 3.1810169423634402, "language_loss": 0.66715032, "learning_rate": 3.28240670566841e-06, "loss": 0.68889689, "num_input_tokens_seen": 107397640, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8828125, "step": 4991, "time_per_iteration": 2.541934013366699 }, { "auxiliary_loss_clip": 0.01144299, "auxiliary_loss_mlp": 0.01037341, "balance_loss_clip": 1.01924467, "balance_loss_mlp": 1.05433989, "epoch": 0.3001352773185029, "flos": 19391188106880.0, "grad_norm": 2.563569921965657, "language_loss": 0.78652477, "learning_rate": 3.28210781975363e-06, "loss": 0.80834126, "num_input_tokens_seen": 107416020, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.90234375, "step": 4992, "time_per_iteration": 2.471766710281372 }, { "auxiliary_loss_clip": 0.01138666, "auxiliary_loss_mlp": 0.01034641, "balance_loss_clip": 1.01869059, "balance_loss_mlp": 1.05326939, "epoch": 0.3001954005711709, "flos": 21543853161600.0, "grad_norm": 1.8509087030287523, "language_loss": 0.82239288, "learning_rate": 3.281808885221193e-06, "loss": 0.84412599, "num_input_tokens_seen": 107436340, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8515625, "step": 4993, "time_per_iteration": 2.5133614540100098 }, { "auxiliary_loss_clip": 0.01143686, "auxiliary_loss_mlp": 0.0105119, "balance_loss_clip": 1.03304672, "balance_loss_mlp": 1.05257154, "epoch": 0.30025552382383885, "flos": 17384320356480.0, "grad_norm": 5.156522322597039, "language_loss": 0.85914993, "learning_rate": 3.2815099020824345e-06, "loss": 0.88109869, "num_input_tokens_seen": 107454585, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.91015625, "step": 4994, "time_per_iteration": 2.45546293258667 }, { "auxiliary_loss_clip": 0.01141384, "auxiliary_loss_mlp": 0.01037639, "balance_loss_clip": 1.02137887, "balance_loss_mlp": 1.05486882, "epoch": 0.3003156470765068, "flos": 29533330552320.0, "grad_norm": 1.763734148280243, "language_loss": 0.81119603, "learning_rate": 3.2812108703486924e-06, "loss": 0.83298624, "num_input_tokens_seen": 107477180, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8671875, "step": 4995, "time_per_iteration": 2.582240104675293 }, { "auxiliary_loss_clip": 0.01138002, "auxiliary_loss_mlp": 0.01038914, "balance_loss_clip": 1.02288032, "balance_loss_mlp": 1.05413032, "epoch": 0.3003757703291748, "flos": 43646402465280.0, "grad_norm": 2.072686928139122, "language_loss": 0.675309, "learning_rate": 3.2809117900313055e-06, "loss": 0.69707811, "num_input_tokens_seen": 107500250, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83984375, "step": 4996, "time_per_iteration": 2.702237606048584 }, { "auxiliary_loss_clip": 0.0113905, "auxiliary_loss_mlp": 0.01040576, "balance_loss_clip": 1.02430439, "balance_loss_mlp": 1.05239427, "epoch": 0.30043589358184275, "flos": 22528380015360.0, "grad_norm": 1.9850599665304485, "language_loss": 0.75273323, "learning_rate": 3.280612661141615e-06, "loss": 0.77452946, "num_input_tokens_seen": 107520070, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8671875, "step": 4997, "time_per_iteration": 2.5086681842803955 }, { "auxiliary_loss_clip": 0.01136104, "auxiliary_loss_mlp": 0.01046666, "balance_loss_clip": 1.03100145, "balance_loss_mlp": 1.05241203, "epoch": 0.30049601683451077, "flos": 20995892208000.0, "grad_norm": 1.9882920841479848, "language_loss": 0.77891225, "learning_rate": 3.2803134836909646e-06, "loss": 0.80073988, "num_input_tokens_seen": 107539285, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 4998, "time_per_iteration": 2.4759037494659424 }, { "auxiliary_loss_clip": 0.01137224, "auxiliary_loss_mlp": 0.01038029, "balance_loss_clip": 1.02285385, "balance_loss_mlp": 1.05370235, "epoch": 0.30055614008717874, "flos": 23916004272000.0, "grad_norm": 1.584425407645564, "language_loss": 0.73644936, "learning_rate": 3.2800142576906985e-06, "loss": 0.75820196, "num_input_tokens_seen": 107560260, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8359375, "step": 4999, "time_per_iteration": 2.5337111949920654 }, { "auxiliary_loss_clip": 0.01141066, "auxiliary_loss_mlp": 0.01040195, "balance_loss_clip": 1.0241611, "balance_loss_mlp": 1.05469584, "epoch": 0.3006162633398467, "flos": 19169798630400.0, "grad_norm": 1.8808924473334812, "language_loss": 0.7547915, "learning_rate": 3.2797149831521626e-06, "loss": 0.77660406, "num_input_tokens_seen": 107579260, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.86328125, "step": 5000, "time_per_iteration": 2.459698438644409 }, { "auxiliary_loss_clip": 0.01135722, "auxiliary_loss_mlp": 0.01036319, "balance_loss_clip": 1.02197838, "balance_loss_mlp": 1.05347013, "epoch": 0.30067638659251467, "flos": 14679241061760.0, "grad_norm": 2.0062577165979, "language_loss": 0.82029575, "learning_rate": 3.2794156600867073e-06, "loss": 0.8420161, "num_input_tokens_seen": 107595245, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.82421875, "step": 5001, "time_per_iteration": 2.463869571685791 }, { "auxiliary_loss_clip": 0.01143521, "auxiliary_loss_mlp": 0.01045641, "balance_loss_clip": 1.02904737, "balance_loss_mlp": 1.05693102, "epoch": 0.30073650984518263, "flos": 23368007404800.0, "grad_norm": 2.9687683851599775, "language_loss": 0.80480123, "learning_rate": 3.2791162885056815e-06, "loss": 0.82669282, "num_input_tokens_seen": 107613985, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8671875, "step": 5002, "time_per_iteration": 2.504654884338379 }, { "auxiliary_loss_clip": 0.01147728, "auxiliary_loss_mlp": 0.01035818, "balance_loss_clip": 1.01961696, "balance_loss_mlp": 1.05755186, "epoch": 0.3007966330978506, "flos": 22966633854720.0, "grad_norm": 2.4701075673049244, "language_loss": 0.70701325, "learning_rate": 3.2788168684204376e-06, "loss": 0.72884876, "num_input_tokens_seen": 107631435, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.90234375, "step": 5003, "time_per_iteration": 2.512733221054077 }, { "auxiliary_loss_clip": 0.01144709, "auxiliary_loss_mlp": 0.01043638, "balance_loss_clip": 1.02729475, "balance_loss_mlp": 1.05578589, "epoch": 0.30085675635051856, "flos": 27818452460160.0, "grad_norm": 2.082073760631166, "language_loss": 0.70338261, "learning_rate": 3.27851739984233e-06, "loss": 0.72526604, "num_input_tokens_seen": 107650530, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.890625, "step": 5004, "time_per_iteration": 2.5407068729400635 }, { "auxiliary_loss_clip": 0.011425, "auxiliary_loss_mlp": 0.01041502, "balance_loss_clip": 1.02495575, "balance_loss_mlp": 1.05504596, "epoch": 0.3009168796031865, "flos": 10882729059840.0, "grad_norm": 2.662326474298553, "language_loss": 0.81396532, "learning_rate": 3.278217882782715e-06, "loss": 0.8358053, "num_input_tokens_seen": 107662240, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.875, "step": 5005, "time_per_iteration": 2.4465830326080322 }, { "auxiliary_loss_clip": 0.01139198, "auxiliary_loss_mlp": 0.01036507, "balance_loss_clip": 1.02126038, "balance_loss_mlp": 1.05395031, "epoch": 0.3009770028558545, "flos": 23805399317760.0, "grad_norm": 2.4090994129490775, "language_loss": 0.74348259, "learning_rate": 3.2779183172529497e-06, "loss": 0.7652396, "num_input_tokens_seen": 107680330, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8515625, "step": 5006, "time_per_iteration": 2.490290880203247 }, { "auxiliary_loss_clip": 0.01137053, "auxiliary_loss_mlp": 0.0104157, "balance_loss_clip": 1.02560782, "balance_loss_mlp": 1.05347252, "epoch": 0.30103712610852246, "flos": 26468211283200.0, "grad_norm": 7.430798949021306, "language_loss": 0.71722507, "learning_rate": 3.2776187032643932e-06, "loss": 0.73901129, "num_input_tokens_seen": 107700020, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8359375, "step": 5007, "time_per_iteration": 2.5331954956054688 }, { "auxiliary_loss_clip": 0.01140753, "auxiliary_loss_mlp": 0.01041971, "balance_loss_clip": 1.02469778, "balance_loss_mlp": 1.05505204, "epoch": 0.3010972493611904, "flos": 22856459863680.0, "grad_norm": 4.09547064371976, "language_loss": 0.7632218, "learning_rate": 3.2773190408284075e-06, "loss": 0.78504902, "num_input_tokens_seen": 107718575, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.859375, "step": 5008, "time_per_iteration": 2.489539861679077 }, { "auxiliary_loss_clip": 0.01141097, "auxiliary_loss_mlp": 0.01037934, "balance_loss_clip": 1.02187657, "balance_loss_mlp": 1.0552336, "epoch": 0.3011573726138584, "flos": 24053685102720.0, "grad_norm": 2.929283884906225, "language_loss": 0.84124219, "learning_rate": 3.2770193299563564e-06, "loss": 0.86303252, "num_input_tokens_seen": 107738635, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.859375, "step": 5009, "time_per_iteration": 3.981048107147217 }, { "auxiliary_loss_clip": 0.01143329, "auxiliary_loss_mlp": 0.01044605, "balance_loss_clip": 1.02642572, "balance_loss_mlp": 1.05343556, "epoch": 0.30121749586652635, "flos": 20259687052800.0, "grad_norm": 3.1794876332464757, "language_loss": 0.8381058, "learning_rate": 3.276719570659604e-06, "loss": 0.85998511, "num_input_tokens_seen": 107753415, "router_z_loss_clip": 0.18164062, "router_z_loss_mlp": 0.8984375, "step": 5010, "time_per_iteration": 2.469299554824829 }, { "auxiliary_loss_clip": 0.01138574, "auxiliary_loss_mlp": 0.0103909, "balance_loss_clip": 1.02411711, "balance_loss_mlp": 1.05247831, "epoch": 0.3012776191191944, "flos": 26943058103040.0, "grad_norm": 2.916589129412552, "language_loss": 0.84945631, "learning_rate": 3.2764197629495176e-06, "loss": 0.87123299, "num_input_tokens_seen": 107773840, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.859375, "step": 5011, "time_per_iteration": 2.5402541160583496 }, { "auxiliary_loss_clip": 0.01140207, "auxiliary_loss_mlp": 0.01038534, "balance_loss_clip": 1.0215112, "balance_loss_mlp": 1.05111599, "epoch": 0.30133774237186234, "flos": 20412307941120.0, "grad_norm": 3.5963247945684524, "language_loss": 0.72167683, "learning_rate": 3.2761199068374656e-06, "loss": 0.74346423, "num_input_tokens_seen": 107792020, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.890625, "step": 5012, "time_per_iteration": 5.40651798248291 }, { "auxiliary_loss_clip": 0.01141281, "auxiliary_loss_mlp": 0.01038886, "balance_loss_clip": 1.02304292, "balance_loss_mlp": 1.05341387, "epoch": 0.3013978656245303, "flos": 19792453916160.0, "grad_norm": 5.788468022336435, "language_loss": 0.87560827, "learning_rate": 3.275820002334819e-06, "loss": 0.89740992, "num_input_tokens_seen": 107809595, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.87890625, "step": 5013, "time_per_iteration": 2.4960031509399414 }, { "auxiliary_loss_clip": 0.01145214, "auxiliary_loss_mlp": 0.01043396, "balance_loss_clip": 1.02583671, "balance_loss_mlp": 1.05375528, "epoch": 0.30145798887719827, "flos": 16249650652800.0, "grad_norm": 2.17801624375623, "language_loss": 0.82983255, "learning_rate": 3.2755200494529496e-06, "loss": 0.85171866, "num_input_tokens_seen": 107827230, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.9140625, "step": 5014, "time_per_iteration": 3.8958942890167236 }, { "auxiliary_loss_clip": 0.01134832, "auxiliary_loss_mlp": 0.01039018, "balance_loss_clip": 1.02347875, "balance_loss_mlp": 1.051826, "epoch": 0.30151811212986623, "flos": 24571733005440.0, "grad_norm": 2.9440577051227814, "language_loss": 0.68334937, "learning_rate": 3.2752200482032323e-06, "loss": 0.7050879, "num_input_tokens_seen": 107847195, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.83203125, "step": 5015, "time_per_iteration": 2.5302953720092773 }, { "auxiliary_loss_clip": 0.01139985, "auxiliary_loss_mlp": 0.01040843, "balance_loss_clip": 1.02372432, "balance_loss_mlp": 1.05300641, "epoch": 0.3015782353825342, "flos": 21872076664320.0, "grad_norm": 2.2578052390793824, "language_loss": 0.74246973, "learning_rate": 3.2749199985970436e-06, "loss": 0.76427805, "num_input_tokens_seen": 107866420, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.87109375, "step": 5016, "time_per_iteration": 2.489347457885742 }, { "auxiliary_loss_clip": 0.01144127, "auxiliary_loss_mlp": 0.01038332, "balance_loss_clip": 1.02133226, "balance_loss_mlp": 1.05527997, "epoch": 0.30163835863520216, "flos": 28769331248640.0, "grad_norm": 1.5598782015093104, "language_loss": 0.65828502, "learning_rate": 3.2746199006457603e-06, "loss": 0.68010962, "num_input_tokens_seen": 107889090, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.88671875, "step": 5017, "time_per_iteration": 2.573620557785034 }, { "auxiliary_loss_clip": 0.01142837, "auxiliary_loss_mlp": 0.01045804, "balance_loss_clip": 1.02932906, "balance_loss_mlp": 1.05439675, "epoch": 0.30169848188787013, "flos": 22966202891520.0, "grad_norm": 2.7395754577399365, "language_loss": 0.68749493, "learning_rate": 3.2743197543607628e-06, "loss": 0.70938134, "num_input_tokens_seen": 107907520, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8828125, "step": 5018, "time_per_iteration": 2.492321491241455 }, { "auxiliary_loss_clip": 0.01134656, "auxiliary_loss_mlp": 0.01045665, "balance_loss_clip": 1.03032279, "balance_loss_mlp": 1.05161095, "epoch": 0.3017586051405381, "flos": 21835268202240.0, "grad_norm": 1.9853277801189864, "language_loss": 0.79087806, "learning_rate": 3.2740195597534327e-06, "loss": 0.81268126, "num_input_tokens_seen": 107925650, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.828125, "step": 5019, "time_per_iteration": 2.520611047744751 }, { "auxiliary_loss_clip": 0.0114209, "auxiliary_loss_mlp": 0.01042171, "balance_loss_clip": 1.02636421, "balance_loss_mlp": 1.05525422, "epoch": 0.30181872839320606, "flos": 22160403135360.0, "grad_norm": 2.2645569955945497, "language_loss": 0.69868046, "learning_rate": 3.2737193168351527e-06, "loss": 0.72052306, "num_input_tokens_seen": 107943975, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8671875, "step": 5020, "time_per_iteration": 2.4797821044921875 }, { "auxiliary_loss_clip": 0.01140504, "auxiliary_loss_mlp": 0.01042217, "balance_loss_clip": 1.02576602, "balance_loss_mlp": 1.0518949, "epoch": 0.301878851645874, "flos": 18114168804480.0, "grad_norm": 1.9175178117504794, "language_loss": 0.78259158, "learning_rate": 3.2734190256173085e-06, "loss": 0.8044188, "num_input_tokens_seen": 107962950, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.88671875, "step": 5021, "time_per_iteration": 2.504429340362549 }, { "auxiliary_loss_clip": 0.01140066, "auxiliary_loss_mlp": 0.01034245, "balance_loss_clip": 1.01819956, "balance_loss_mlp": 1.05298913, "epoch": 0.301938974898542, "flos": 17602226213760.0, "grad_norm": 2.604393874451412, "language_loss": 0.76143515, "learning_rate": 3.2731186861112877e-06, "loss": 0.78317821, "num_input_tokens_seen": 107979700, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.87109375, "step": 5022, "time_per_iteration": 2.4566495418548584 }, { "auxiliary_loss_clip": 0.01141681, "auxiliary_loss_mlp": 0.01043366, "balance_loss_clip": 1.02642608, "balance_loss_mlp": 1.05320072, "epoch": 0.30199909815120995, "flos": 11181219079680.0, "grad_norm": 2.76748469308177, "language_loss": 0.69823384, "learning_rate": 3.2728182983284793e-06, "loss": 0.72008431, "num_input_tokens_seen": 107996645, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8828125, "step": 5023, "time_per_iteration": 2.4842402935028076 }, { "auxiliary_loss_clip": 0.01144582, "auxiliary_loss_mlp": 0.01037058, "balance_loss_clip": 1.02160871, "balance_loss_mlp": 1.05369961, "epoch": 0.302059221403878, "flos": 21907843632000.0, "grad_norm": 3.755068675805925, "language_loss": 0.71507943, "learning_rate": 3.2725178622802724e-06, "loss": 0.7368958, "num_input_tokens_seen": 108015020, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.90625, "step": 5024, "time_per_iteration": 2.48915696144104 }, { "auxiliary_loss_clip": 0.01137261, "auxiliary_loss_mlp": 0.01041084, "balance_loss_clip": 1.02469325, "balance_loss_mlp": 1.05278122, "epoch": 0.30211934465654594, "flos": 26396390039040.0, "grad_norm": 1.8552012735551613, "language_loss": 0.74210882, "learning_rate": 3.272217377978061e-06, "loss": 0.76389229, "num_input_tokens_seen": 108036430, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.84375, "step": 5025, "time_per_iteration": 2.533207654953003 }, { "auxiliary_loss_clip": 0.01136419, "auxiliary_loss_mlp": 0.01044175, "balance_loss_clip": 1.02817726, "balance_loss_mlp": 1.05302215, "epoch": 0.3021794679092139, "flos": 23400470321280.0, "grad_norm": 2.136329821768025, "language_loss": 0.67298698, "learning_rate": 3.2719168454332387e-06, "loss": 0.69479293, "num_input_tokens_seen": 108054250, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8359375, "step": 5026, "time_per_iteration": 2.4848947525024414 }, { "auxiliary_loss_clip": 0.01141125, "auxiliary_loss_mlp": 0.01043998, "balance_loss_clip": 1.02732086, "balance_loss_mlp": 1.05503273, "epoch": 0.30223959116188187, "flos": 20260979942400.0, "grad_norm": 2.187704415817669, "language_loss": 0.85274088, "learning_rate": 3.2716162646572034e-06, "loss": 0.87459207, "num_input_tokens_seen": 108071495, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.859375, "step": 5027, "time_per_iteration": 2.497626543045044 }, { "auxiliary_loss_clip": 0.01134967, "auxiliary_loss_mlp": 0.0104359, "balance_loss_clip": 1.02859378, "balance_loss_mlp": 1.051355, "epoch": 0.30229971441454984, "flos": 26687840993280.0, "grad_norm": 2.2163610158902545, "language_loss": 0.78878552, "learning_rate": 3.271315635661351e-06, "loss": 0.81057107, "num_input_tokens_seen": 108092135, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8359375, "step": 5028, "time_per_iteration": 2.5267395973205566 }, { "auxiliary_loss_clip": 0.01140868, "auxiliary_loss_mlp": 0.01041831, "balance_loss_clip": 1.02506995, "balance_loss_mlp": 1.05438089, "epoch": 0.3023598376672178, "flos": 34345323953280.0, "grad_norm": 1.8050136338933478, "language_loss": 0.7726087, "learning_rate": 3.2710149584570826e-06, "loss": 0.79443568, "num_input_tokens_seen": 108112945, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8671875, "step": 5029, "time_per_iteration": 2.5885543823242188 }, { "auxiliary_loss_clip": 0.01141082, "auxiliary_loss_mlp": 0.01045264, "balance_loss_clip": 1.0270133, "balance_loss_mlp": 1.05284166, "epoch": 0.30241996091988577, "flos": 23112143850240.0, "grad_norm": 2.3973828004644386, "language_loss": 0.82601339, "learning_rate": 3.2707142330557993e-06, "loss": 0.84787679, "num_input_tokens_seen": 108130325, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.8828125, "step": 5030, "time_per_iteration": 2.5020036697387695 }, { "auxiliary_loss_clip": 0.01139589, "auxiliary_loss_mlp": 0.01041123, "balance_loss_clip": 1.02436185, "balance_loss_mlp": 1.05128908, "epoch": 0.30248008417255373, "flos": 19390002958080.0, "grad_norm": 1.786639409218717, "language_loss": 0.69723171, "learning_rate": 3.270413459468905e-06, "loss": 0.71903878, "num_input_tokens_seen": 108150300, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8828125, "step": 5031, "time_per_iteration": 2.4998433589935303 }, { "auxiliary_loss_clip": 0.01139974, "auxiliary_loss_mlp": 0.01036836, "balance_loss_clip": 1.02057624, "balance_loss_mlp": 1.05389798, "epoch": 0.3025402074252217, "flos": 23769704177280.0, "grad_norm": 1.7381050074597222, "language_loss": 0.82135737, "learning_rate": 3.2701126377078047e-06, "loss": 0.84312546, "num_input_tokens_seen": 108170330, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.859375, "step": 5032, "time_per_iteration": 2.497814178466797 }, { "auxiliary_loss_clip": 0.01146728, "auxiliary_loss_mlp": 0.01048859, "balance_loss_clip": 1.03095365, "balance_loss_mlp": 1.05525458, "epoch": 0.30260033067788966, "flos": 25994118648960.0, "grad_norm": 2.3615595986659894, "language_loss": 0.73537076, "learning_rate": 3.269811767783906e-06, "loss": 0.7573266, "num_input_tokens_seen": 108191265, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.9140625, "step": 5033, "time_per_iteration": 2.524729013442993 }, { "auxiliary_loss_clip": 0.01135682, "auxiliary_loss_mlp": 0.01049126, "balance_loss_clip": 1.03206682, "balance_loss_mlp": 1.05012739, "epoch": 0.3026604539305576, "flos": 25374551932800.0, "grad_norm": 2.3310412123134623, "language_loss": 0.74118388, "learning_rate": 3.2695108497086185e-06, "loss": 0.76303196, "num_input_tokens_seen": 108211615, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.85546875, "step": 5034, "time_per_iteration": 2.5396158695220947 }, { "auxiliary_loss_clip": 0.01140029, "auxiliary_loss_mlp": 0.01037691, "balance_loss_clip": 1.02095413, "balance_loss_mlp": 1.05233526, "epoch": 0.3027205771832256, "flos": 25812733944960.0, "grad_norm": 2.0367962755291966, "language_loss": 0.71917266, "learning_rate": 3.269209883493352e-06, "loss": 0.74094987, "num_input_tokens_seen": 108231080, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.875, "step": 5035, "time_per_iteration": 2.535703182220459 }, { "auxiliary_loss_clip": 0.01136112, "auxiliary_loss_mlp": 0.01037208, "balance_loss_clip": 1.02258134, "balance_loss_mlp": 1.0509671, "epoch": 0.30278070043589356, "flos": 27344539393920.0, "grad_norm": 1.8724407754692434, "language_loss": 0.8745904, "learning_rate": 3.2689088691495196e-06, "loss": 0.89632356, "num_input_tokens_seen": 108251125, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8515625, "step": 5036, "time_per_iteration": 2.5286266803741455 }, { "auxiliary_loss_clip": 0.01136183, "auxiliary_loss_mlp": 0.01052308, "balance_loss_clip": 1.03541613, "balance_loss_mlp": 1.05136895, "epoch": 0.3028408236885616, "flos": 24786227070720.0, "grad_norm": 1.9406243966074406, "language_loss": 0.77472782, "learning_rate": 3.268607806688536e-06, "loss": 0.79661274, "num_input_tokens_seen": 108272545, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.84765625, "step": 5037, "time_per_iteration": 2.540850877761841 }, { "auxiliary_loss_clip": 0.01140446, "auxiliary_loss_mlp": 0.01046682, "balance_loss_clip": 1.02977812, "balance_loss_mlp": 1.05221248, "epoch": 0.30290094694122954, "flos": 12932474670720.0, "grad_norm": 2.382433983908504, "language_loss": 0.77000058, "learning_rate": 3.268306696121816e-06, "loss": 0.79187191, "num_input_tokens_seen": 108289725, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8828125, "step": 5038, "time_per_iteration": 2.444394826889038 }, { "auxiliary_loss_clip": 0.01135509, "auxiliary_loss_mlp": 0.01038985, "balance_loss_clip": 1.02380943, "balance_loss_mlp": 1.05249882, "epoch": 0.3029610701938975, "flos": 25916443488000.0, "grad_norm": 1.8998906887153333, "language_loss": 0.73907816, "learning_rate": 3.2680055374607804e-06, "loss": 0.76082313, "num_input_tokens_seen": 108310690, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.83203125, "step": 5039, "time_per_iteration": 2.533855676651001 }, { "auxiliary_loss_clip": 0.01134602, "auxiliary_loss_mlp": 0.01039592, "balance_loss_clip": 1.02508402, "balance_loss_mlp": 1.052122, "epoch": 0.3030211934465655, "flos": 21980993679360.0, "grad_norm": 3.7748106731349798, "language_loss": 0.80046201, "learning_rate": 3.267704330716847e-06, "loss": 0.82220387, "num_input_tokens_seen": 108328905, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.82421875, "step": 5040, "time_per_iteration": 2.4898266792297363 }, { "auxiliary_loss_clip": 0.01137802, "auxiliary_loss_mlp": 0.0103912, "balance_loss_clip": 1.02429616, "balance_loss_mlp": 1.0537039, "epoch": 0.30308131669923344, "flos": 20991977625600.0, "grad_norm": 1.6708726575512707, "language_loss": 0.8158195, "learning_rate": 3.267403075901438e-06, "loss": 0.83758873, "num_input_tokens_seen": 108346680, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.84375, "step": 5041, "time_per_iteration": 2.519296407699585 }, { "auxiliary_loss_clip": 0.01064466, "auxiliary_loss_mlp": 0.01036699, "balance_loss_clip": 1.03395677, "balance_loss_mlp": 1.03309226, "epoch": 0.3031414399519014, "flos": 60548875827840.0, "grad_norm": 0.7673847084250672, "language_loss": 0.59474206, "learning_rate": 3.267101773025978e-06, "loss": 0.61575377, "num_input_tokens_seen": 108413885, "router_z_loss_clip": 0.02746582, "router_z_loss_mlp": 0.31445312, "step": 5042, "time_per_iteration": 3.2293474674224854 }, { "auxiliary_loss_clip": 0.01141338, "auxiliary_loss_mlp": 0.01038751, "balance_loss_clip": 1.02348077, "balance_loss_mlp": 1.0541048, "epoch": 0.30320156320456937, "flos": 21907664064000.0, "grad_norm": 6.27014667478997, "language_loss": 0.71558654, "learning_rate": 3.266800422101892e-06, "loss": 0.73738742, "num_input_tokens_seen": 108433640, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.875, "step": 5043, "time_per_iteration": 2.4857797622680664 }, { "auxiliary_loss_clip": 0.01139047, "auxiliary_loss_mlp": 0.01037231, "balance_loss_clip": 1.0215075, "balance_loss_mlp": 1.05251145, "epoch": 0.30326168645723733, "flos": 21652770176640.0, "grad_norm": 2.1044606276005817, "language_loss": 0.69783425, "learning_rate": 3.266499023140606e-06, "loss": 0.7195971, "num_input_tokens_seen": 108452640, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.86328125, "step": 5044, "time_per_iteration": 2.508591413497925 }, { "auxiliary_loss_clip": 0.01136217, "auxiliary_loss_mlp": 0.01036251, "balance_loss_clip": 1.02059901, "balance_loss_mlp": 1.05241287, "epoch": 0.3033218097099053, "flos": 21871286565120.0, "grad_norm": 1.4317228598115148, "language_loss": 0.77105737, "learning_rate": 3.2661975761535513e-06, "loss": 0.79278207, "num_input_tokens_seen": 108472470, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 5045, "time_per_iteration": 2.497934103012085 }, { "auxiliary_loss_clip": 0.01139129, "auxiliary_loss_mlp": 0.0103516, "balance_loss_clip": 1.01781511, "balance_loss_mlp": 1.05245829, "epoch": 0.30338193296257326, "flos": 27089717333760.0, "grad_norm": 1.7712299973943542, "language_loss": 0.72400606, "learning_rate": 3.2658960811521564e-06, "loss": 0.745749, "num_input_tokens_seen": 108493025, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8671875, "step": 5046, "time_per_iteration": 2.549562454223633 }, { "auxiliary_loss_clip": 0.01138855, "auxiliary_loss_mlp": 0.01037483, "balance_loss_clip": 1.01961398, "balance_loss_mlp": 1.05160785, "epoch": 0.30344205621524123, "flos": 19534363718400.0, "grad_norm": 2.499705396531316, "language_loss": 0.81023157, "learning_rate": 3.2655945381478564e-06, "loss": 0.83199495, "num_input_tokens_seen": 108513480, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.875, "step": 5047, "time_per_iteration": 2.5260214805603027 }, { "auxiliary_loss_clip": 0.01135057, "auxiliary_loss_mlp": 0.01039952, "balance_loss_clip": 1.02447903, "balance_loss_mlp": 1.05057657, "epoch": 0.3035021794679092, "flos": 23910976368000.0, "grad_norm": 1.8263011179969455, "language_loss": 0.7227335, "learning_rate": 3.265292947152084e-06, "loss": 0.74448365, "num_input_tokens_seen": 108533155, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84375, "step": 5048, "time_per_iteration": 2.5260586738586426 }, { "auxiliary_loss_clip": 0.01134853, "auxiliary_loss_mlp": 0.01030235, "balance_loss_clip": 1.01553667, "balance_loss_mlp": 1.0483644, "epoch": 0.30356230272057716, "flos": 16143606725760.0, "grad_norm": 2.207019439066471, "language_loss": 0.75355136, "learning_rate": 3.2649913081762763e-06, "loss": 0.77520227, "num_input_tokens_seen": 108551900, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.86328125, "step": 5049, "time_per_iteration": 2.4582972526550293 }, { "auxiliary_loss_clip": 0.01137551, "auxiliary_loss_mlp": 0.01033925, "balance_loss_clip": 1.01869035, "balance_loss_mlp": 1.04987204, "epoch": 0.3036224259732452, "flos": 28914697589760.0, "grad_norm": 4.314167680277507, "language_loss": 0.81868553, "learning_rate": 3.2646896212318717e-06, "loss": 0.84040028, "num_input_tokens_seen": 108574005, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.875, "step": 5050, "time_per_iteration": 2.5597782135009766 }, { "auxiliary_loss_clip": 0.01136525, "auxiliary_loss_mlp": 0.01039648, "balance_loss_clip": 1.02255309, "balance_loss_mlp": 1.04980171, "epoch": 0.30368254922591315, "flos": 21105599322240.0, "grad_norm": 2.865461118972273, "language_loss": 0.74044067, "learning_rate": 3.2643878863303106e-06, "loss": 0.76220244, "num_input_tokens_seen": 108592715, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.8671875, "step": 5051, "time_per_iteration": 3.992126941680908 }, { "auxiliary_loss_clip": 0.01135662, "auxiliary_loss_mlp": 0.01035058, "balance_loss_clip": 1.02004337, "balance_loss_mlp": 1.05004704, "epoch": 0.3037426724785811, "flos": 23002293081600.0, "grad_norm": 1.7192603678709257, "language_loss": 0.76678991, "learning_rate": 3.264086103483033e-06, "loss": 0.78849715, "num_input_tokens_seen": 108611770, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.859375, "step": 5052, "time_per_iteration": 2.4984302520751953 }, { "auxiliary_loss_clip": 0.01139143, "auxiliary_loss_mlp": 0.01037986, "balance_loss_clip": 1.0223701, "balance_loss_mlp": 1.05065751, "epoch": 0.3038027957312491, "flos": 15632705629440.0, "grad_norm": 2.238172943080475, "language_loss": 0.82570654, "learning_rate": 3.2637842727014836e-06, "loss": 0.84747785, "num_input_tokens_seen": 108629070, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8828125, "step": 5053, "time_per_iteration": 3.872917890548706 }, { "auxiliary_loss_clip": 0.01136063, "auxiliary_loss_mlp": 0.01036077, "balance_loss_clip": 1.02025199, "balance_loss_mlp": 1.05113685, "epoch": 0.30386291898391704, "flos": 12713994195840.0, "grad_norm": 1.842321315726828, "language_loss": 0.71021265, "learning_rate": 3.2634823939971083e-06, "loss": 0.73193407, "num_input_tokens_seen": 108646315, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8515625, "step": 5054, "time_per_iteration": 3.8836171627044678 }, { "auxiliary_loss_clip": 0.01138264, "auxiliary_loss_mlp": 0.01038613, "balance_loss_clip": 1.02224541, "balance_loss_mlp": 1.05129504, "epoch": 0.303923042236585, "flos": 26359437922560.0, "grad_norm": 1.911368187406643, "language_loss": 0.69184339, "learning_rate": 3.2631804673813545e-06, "loss": 0.71361214, "num_input_tokens_seen": 108665920, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.87109375, "step": 5055, "time_per_iteration": 3.972968578338623 }, { "auxiliary_loss_clip": 0.01137722, "auxiliary_loss_mlp": 0.01031137, "balance_loss_clip": 1.01473343, "balance_loss_mlp": 1.05054677, "epoch": 0.30398316548925297, "flos": 19719232041600.0, "grad_norm": 1.910747559150168, "language_loss": 0.6707046, "learning_rate": 3.2628784928656707e-06, "loss": 0.69239318, "num_input_tokens_seen": 108683485, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.87109375, "step": 5056, "time_per_iteration": 2.4921958446502686 }, { "auxiliary_loss_clip": 0.01135985, "auxiliary_loss_mlp": 0.01040173, "balance_loss_clip": 1.02455103, "balance_loss_mlp": 1.0518626, "epoch": 0.30404328874192094, "flos": 24239846315520.0, "grad_norm": 1.6929284700975873, "language_loss": 0.82227409, "learning_rate": 3.262576470461507e-06, "loss": 0.84403569, "num_input_tokens_seen": 108702700, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.83984375, "step": 5057, "time_per_iteration": 2.5084779262542725 }, { "auxiliary_loss_clip": 0.01134454, "auxiliary_loss_mlp": 0.01037922, "balance_loss_clip": 1.02218604, "balance_loss_mlp": 1.05000329, "epoch": 0.3041034119945889, "flos": 24498942094080.0, "grad_norm": 2.7056333937377346, "language_loss": 0.89257801, "learning_rate": 3.2622744001803176e-06, "loss": 0.91430175, "num_input_tokens_seen": 108721860, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.84375, "step": 5058, "time_per_iteration": 2.5237839221954346 }, { "auxiliary_loss_clip": 0.01140567, "auxiliary_loss_mlp": 0.01038608, "balance_loss_clip": 1.0220145, "balance_loss_mlp": 1.05391014, "epoch": 0.30416353524725687, "flos": 28288881907200.0, "grad_norm": 2.456688557288375, "language_loss": 0.71132344, "learning_rate": 3.2619722820335564e-06, "loss": 0.7331152, "num_input_tokens_seen": 108743215, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8671875, "step": 5059, "time_per_iteration": 2.5464134216308594 }, { "auxiliary_loss_clip": 0.01137173, "auxiliary_loss_mlp": 0.0103794, "balance_loss_clip": 1.02264535, "balance_loss_mlp": 1.05228436, "epoch": 0.30422365849992483, "flos": 23660392112640.0, "grad_norm": 1.6359908609889124, "language_loss": 0.72768062, "learning_rate": 3.26167011603268e-06, "loss": 0.74943173, "num_input_tokens_seen": 108765505, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.84765625, "step": 5060, "time_per_iteration": 2.552158832550049 }, { "auxiliary_loss_clip": 0.011395, "auxiliary_loss_mlp": 0.01033625, "balance_loss_clip": 1.01809239, "balance_loss_mlp": 1.05337608, "epoch": 0.3042837817525928, "flos": 22998773548800.0, "grad_norm": 1.8512203476850608, "language_loss": 0.7651425, "learning_rate": 3.2613679021891463e-06, "loss": 0.78687376, "num_input_tokens_seen": 108783370, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.859375, "step": 5061, "time_per_iteration": 2.4846200942993164 }, { "auxiliary_loss_clip": 0.01144069, "auxiliary_loss_mlp": 0.01039081, "balance_loss_clip": 1.02271378, "balance_loss_mlp": 1.05542898, "epoch": 0.30434390500526076, "flos": 22082332924800.0, "grad_norm": 4.134354513369343, "language_loss": 0.81603473, "learning_rate": 3.261065640514415e-06, "loss": 0.83786619, "num_input_tokens_seen": 108797430, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.88671875, "step": 5062, "time_per_iteration": 2.4753782749176025 }, { "auxiliary_loss_clip": 0.01134411, "auxiliary_loss_mlp": 0.01030818, "balance_loss_clip": 1.01619148, "balance_loss_mlp": 1.05035114, "epoch": 0.3044040282579287, "flos": 25483504861440.0, "grad_norm": 1.799905041129742, "language_loss": 0.74455577, "learning_rate": 3.2607633310199483e-06, "loss": 0.76620805, "num_input_tokens_seen": 108816945, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.84375, "step": 5063, "time_per_iteration": 2.507316827774048 }, { "auxiliary_loss_clip": 0.01138101, "auxiliary_loss_mlp": 0.01037741, "balance_loss_clip": 1.02058721, "balance_loss_mlp": 1.05365682, "epoch": 0.30446415151059675, "flos": 21945478106880.0, "grad_norm": 2.241212620115189, "language_loss": 0.83968043, "learning_rate": 3.26046097371721e-06, "loss": 0.86143887, "num_input_tokens_seen": 108836615, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.84375, "step": 5064, "time_per_iteration": 2.504645347595215 }, { "auxiliary_loss_clip": 0.01137196, "auxiliary_loss_mlp": 0.01034404, "balance_loss_clip": 1.01760757, "balance_loss_mlp": 1.05101705, "epoch": 0.3045242747632647, "flos": 16435416816000.0, "grad_norm": 1.9091301023213751, "language_loss": 0.7529183, "learning_rate": 3.2601585686176655e-06, "loss": 0.77463424, "num_input_tokens_seen": 108855165, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.86328125, "step": 5065, "time_per_iteration": 2.4551053047180176 }, { "auxiliary_loss_clip": 0.01140605, "auxiliary_loss_mlp": 0.01041015, "balance_loss_clip": 1.02399182, "balance_loss_mlp": 1.05198359, "epoch": 0.3045843980159327, "flos": 31540341957120.0, "grad_norm": 2.095834259399163, "language_loss": 0.62391454, "learning_rate": 3.2598561157327814e-06, "loss": 0.64573073, "num_input_tokens_seen": 108874690, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.88671875, "step": 5066, "time_per_iteration": 2.58196759223938 }, { "auxiliary_loss_clip": 0.01144158, "auxiliary_loss_mlp": 0.01041166, "balance_loss_clip": 1.02501369, "balance_loss_mlp": 1.05472827, "epoch": 0.30464452126860064, "flos": 17853636481920.0, "grad_norm": 2.820600167172984, "language_loss": 0.82693541, "learning_rate": 3.2595536150740265e-06, "loss": 0.84878868, "num_input_tokens_seen": 108893140, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.89453125, "step": 5067, "time_per_iteration": 2.4573726654052734 }, { "auxiliary_loss_clip": 0.01135068, "auxiliary_loss_mlp": 0.0103901, "balance_loss_clip": 1.02375102, "balance_loss_mlp": 1.05193889, "epoch": 0.3047046445212686, "flos": 20631398947200.0, "grad_norm": 2.0072662293879393, "language_loss": 0.62837601, "learning_rate": 3.259251066652873e-06, "loss": 0.6501168, "num_input_tokens_seen": 108911880, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83203125, "step": 5068, "time_per_iteration": 2.4954800605773926 }, { "auxiliary_loss_clip": 0.01134839, "auxiliary_loss_mlp": 0.01032058, "balance_loss_clip": 1.01665592, "balance_loss_mlp": 1.05036998, "epoch": 0.3047647677739366, "flos": 21287594557440.0, "grad_norm": 3.339319258668453, "language_loss": 0.74828374, "learning_rate": 3.258948470480793e-06, "loss": 0.76995271, "num_input_tokens_seen": 108930440, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84375, "step": 5069, "time_per_iteration": 2.480872869491577 }, { "auxiliary_loss_clip": 0.01132593, "auxiliary_loss_mlp": 0.01043095, "balance_loss_clip": 1.02694178, "balance_loss_mlp": 1.0502317, "epoch": 0.30482489102660454, "flos": 20995928121600.0, "grad_norm": 2.10093024190985, "language_loss": 0.75372493, "learning_rate": 3.258645826569261e-06, "loss": 0.77548182, "num_input_tokens_seen": 108949125, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8203125, "step": 5070, "time_per_iteration": 2.5094738006591797 }, { "auxiliary_loss_clip": 0.01141693, "auxiliary_loss_mlp": 0.01038963, "balance_loss_clip": 1.02214229, "balance_loss_mlp": 1.05251038, "epoch": 0.3048850142792725, "flos": 26290812988800.0, "grad_norm": 7.132948604396973, "language_loss": 0.81264055, "learning_rate": 3.2583431349297527e-06, "loss": 0.83444715, "num_input_tokens_seen": 108972190, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.890625, "step": 5071, "time_per_iteration": 2.5487618446350098 }, { "auxiliary_loss_clip": 0.01139395, "auxiliary_loss_mlp": 0.01042521, "balance_loss_clip": 1.0250802, "balance_loss_mlp": 1.0516336, "epoch": 0.30494513753194047, "flos": 22346241125760.0, "grad_norm": 3.3901865021409296, "language_loss": 0.75588393, "learning_rate": 3.2580403955737467e-06, "loss": 0.77770311, "num_input_tokens_seen": 108990325, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.875, "step": 5072, "time_per_iteration": 2.5052504539489746 }, { "auxiliary_loss_clip": 0.01136223, "auxiliary_loss_mlp": 0.01043747, "balance_loss_clip": 1.02749908, "balance_loss_mlp": 1.05165088, "epoch": 0.30500526078460843, "flos": 19537667769600.0, "grad_norm": 1.8374122119548726, "language_loss": 0.70748878, "learning_rate": 3.257737608512723e-06, "loss": 0.72928846, "num_input_tokens_seen": 109009505, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.84765625, "step": 5073, "time_per_iteration": 2.477994680404663 }, { "auxiliary_loss_clip": 0.0114124, "auxiliary_loss_mlp": 0.010461, "balance_loss_clip": 1.02983952, "balance_loss_mlp": 1.05242777, "epoch": 0.3050653840372764, "flos": 14465321614080.0, "grad_norm": 2.2749790721687653, "language_loss": 0.75984257, "learning_rate": 3.257434773758163e-06, "loss": 0.78171599, "num_input_tokens_seen": 109026350, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.88671875, "step": 5074, "time_per_iteration": 2.4613776206970215 }, { "auxiliary_loss_clip": 0.01136092, "auxiliary_loss_mlp": 0.01037756, "balance_loss_clip": 1.02228308, "balance_loss_mlp": 1.05199838, "epoch": 0.30512550728994436, "flos": 24243796811520.0, "grad_norm": 2.0199041088326926, "language_loss": 0.74765778, "learning_rate": 3.25713189132155e-06, "loss": 0.76939631, "num_input_tokens_seen": 109044165, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.83984375, "step": 5075, "time_per_iteration": 2.5020787715911865 }, { "auxiliary_loss_clip": 0.01141216, "auxiliary_loss_mlp": 0.01039353, "balance_loss_clip": 1.02128041, "balance_loss_mlp": 1.05286646, "epoch": 0.30518563054261233, "flos": 16360542915840.0, "grad_norm": 2.2050474359382415, "language_loss": 0.7544179, "learning_rate": 3.2568289612143703e-06, "loss": 0.77622354, "num_input_tokens_seen": 109060665, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.8828125, "step": 5076, "time_per_iteration": 2.4817607402801514 }, { "auxiliary_loss_clip": 0.0113898, "auxiliary_loss_mlp": 0.01035275, "balance_loss_clip": 1.01911068, "balance_loss_mlp": 1.05414772, "epoch": 0.30524575379528035, "flos": 21579584215680.0, "grad_norm": 9.223189623848523, "language_loss": 0.7931391, "learning_rate": 3.25652598344811e-06, "loss": 0.81488162, "num_input_tokens_seen": 109080035, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.84765625, "step": 5077, "time_per_iteration": 2.49175763130188 }, { "auxiliary_loss_clip": 0.01132668, "auxiliary_loss_mlp": 0.01032402, "balance_loss_clip": 1.01794231, "balance_loss_mlp": 1.0520587, "epoch": 0.3053058770479483, "flos": 16545231671040.0, "grad_norm": 1.787791349693063, "language_loss": 0.74769294, "learning_rate": 3.256222958034259e-06, "loss": 0.76934361, "num_input_tokens_seen": 109097385, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8046875, "step": 5078, "time_per_iteration": 2.4987001419067383 }, { "auxiliary_loss_clip": 0.01134849, "auxiliary_loss_mlp": 0.0104741, "balance_loss_clip": 1.03194821, "balance_loss_mlp": 1.05221796, "epoch": 0.3053660003006163, "flos": 12312907954560.0, "grad_norm": 2.0724463129149964, "language_loss": 0.66922396, "learning_rate": 3.255919884984307e-06, "loss": 0.69104654, "num_input_tokens_seen": 109115495, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.828125, "step": 5079, "time_per_iteration": 2.4608778953552246 }, { "auxiliary_loss_clip": 0.01136176, "auxiliary_loss_mlp": 0.01034448, "balance_loss_clip": 1.01958895, "balance_loss_mlp": 1.05242383, "epoch": 0.30542612355328425, "flos": 23112287504640.0, "grad_norm": 2.549130613057491, "language_loss": 0.80131125, "learning_rate": 3.2556167643097477e-06, "loss": 0.82301748, "num_input_tokens_seen": 109134235, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8359375, "step": 5080, "time_per_iteration": 2.5311031341552734 }, { "auxiliary_loss_clip": 0.01133587, "auxiliary_loss_mlp": 0.01042557, "balance_loss_clip": 1.02850819, "balance_loss_mlp": 1.05093443, "epoch": 0.3054862468059522, "flos": 24389450461440.0, "grad_norm": 8.318555139277395, "language_loss": 0.80358255, "learning_rate": 3.255313596022074e-06, "loss": 0.82534403, "num_input_tokens_seen": 109152760, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.828125, "step": 5081, "time_per_iteration": 2.505072832107544 }, { "auxiliary_loss_clip": 0.01131891, "auxiliary_loss_mlp": 0.01035903, "balance_loss_clip": 1.02051282, "balance_loss_mlp": 1.04951882, "epoch": 0.3055463700586202, "flos": 29386096704000.0, "grad_norm": 1.829341840614564, "language_loss": 0.71692538, "learning_rate": 3.255010380132783e-06, "loss": 0.73860329, "num_input_tokens_seen": 109173925, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.82421875, "step": 5082, "time_per_iteration": 2.5687758922576904 }, { "auxiliary_loss_clip": 0.01138206, "auxiliary_loss_mlp": 0.01041142, "balance_loss_clip": 1.02477479, "balance_loss_mlp": 1.05157232, "epoch": 0.30560649331128814, "flos": 25591775431680.0, "grad_norm": 1.943714994222683, "language_loss": 0.72905028, "learning_rate": 3.2547071166533736e-06, "loss": 0.75084376, "num_input_tokens_seen": 109192510, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8671875, "step": 5083, "time_per_iteration": 2.499420642852783 }, { "auxiliary_loss_clip": 0.01133756, "auxiliary_loss_mlp": 0.0103435, "balance_loss_clip": 1.01807213, "balance_loss_mlp": 1.04848742, "epoch": 0.3056666165639561, "flos": 19128321400320.0, "grad_norm": 2.0463088718756, "language_loss": 0.70942163, "learning_rate": 3.254403805595344e-06, "loss": 0.73110271, "num_input_tokens_seen": 109210885, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8515625, "step": 5084, "time_per_iteration": 2.489102363586426 }, { "auxiliary_loss_clip": 0.01138844, "auxiliary_loss_mlp": 0.01035544, "balance_loss_clip": 1.01971292, "balance_loss_mlp": 1.0529933, "epoch": 0.30572673981662407, "flos": 15523860441600.0, "grad_norm": 2.8183893397027737, "language_loss": 0.78343809, "learning_rate": 3.2541004469701962e-06, "loss": 0.80518198, "num_input_tokens_seen": 109229180, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.859375, "step": 5085, "time_per_iteration": 2.444143772125244 }, { "auxiliary_loss_clip": 0.01131508, "auxiliary_loss_mlp": 0.01037614, "balance_loss_clip": 1.02299285, "balance_loss_mlp": 1.05052555, "epoch": 0.30578686306929204, "flos": 21506541909120.0, "grad_norm": 1.7269779123799103, "language_loss": 0.78253281, "learning_rate": 3.2537970407894342e-06, "loss": 0.80422401, "num_input_tokens_seen": 109249510, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8125, "step": 5086, "time_per_iteration": 2.511042833328247 }, { "auxiliary_loss_clip": 0.01136664, "auxiliary_loss_mlp": 0.0104321, "balance_loss_clip": 1.02738464, "balance_loss_mlp": 1.05409241, "epoch": 0.30584698632196, "flos": 20954271323520.0, "grad_norm": 2.6558107426737365, "language_loss": 0.76771355, "learning_rate": 3.253493587064563e-06, "loss": 0.78951228, "num_input_tokens_seen": 109268200, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.82421875, "step": 5087, "time_per_iteration": 2.4718754291534424 }, { "auxiliary_loss_clip": 0.01137597, "auxiliary_loss_mlp": 0.01040378, "balance_loss_clip": 1.0247854, "balance_loss_mlp": 1.05197513, "epoch": 0.30590710957462797, "flos": 24681116897280.0, "grad_norm": 2.0846349538632083, "language_loss": 0.71619594, "learning_rate": 3.2531900858070885e-06, "loss": 0.73797566, "num_input_tokens_seen": 109288370, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.85546875, "step": 5088, "time_per_iteration": 2.543673276901245 }, { "auxiliary_loss_clip": 0.01141522, "auxiliary_loss_mlp": 0.01037323, "balance_loss_clip": 1.02206981, "balance_loss_mlp": 1.05195212, "epoch": 0.30596723282729593, "flos": 17086907744640.0, "grad_norm": 5.021980128923013, "language_loss": 0.78966486, "learning_rate": 3.252886537028521e-06, "loss": 0.81145334, "num_input_tokens_seen": 109306730, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8984375, "step": 5089, "time_per_iteration": 2.438856601715088 }, { "auxiliary_loss_clip": 0.01136743, "auxiliary_loss_mlp": 0.01039062, "balance_loss_clip": 1.02342129, "balance_loss_mlp": 1.05267298, "epoch": 0.30602735607996395, "flos": 22857106308480.0, "grad_norm": 2.208296502668681, "language_loss": 0.76985937, "learning_rate": 3.2525829407403703e-06, "loss": 0.79161745, "num_input_tokens_seen": 109327360, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.83984375, "step": 5090, "time_per_iteration": 2.5018229484558105 }, { "auxiliary_loss_clip": 0.0114048, "auxiliary_loss_mlp": 0.0104419, "balance_loss_clip": 1.02816856, "balance_loss_mlp": 1.05259776, "epoch": 0.3060874793326319, "flos": 29861482227840.0, "grad_norm": 2.075335214588602, "language_loss": 0.75878322, "learning_rate": 3.2522792969541488e-06, "loss": 0.78062987, "num_input_tokens_seen": 109348135, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.87890625, "step": 5091, "time_per_iteration": 2.5354299545288086 }, { "auxiliary_loss_clip": 0.01138888, "auxiliary_loss_mlp": 0.01039837, "balance_loss_clip": 1.02341056, "balance_loss_mlp": 1.05186045, "epoch": 0.3061476025852999, "flos": 20448577699200.0, "grad_norm": 2.2689907393125046, "language_loss": 0.71794355, "learning_rate": 3.2519756056813705e-06, "loss": 0.73973083, "num_input_tokens_seen": 109366220, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8671875, "step": 5092, "time_per_iteration": 4.050215482711792 }, { "auxiliary_loss_clip": 0.01136629, "auxiliary_loss_mlp": 0.01038851, "balance_loss_clip": 1.02480817, "balance_loss_mlp": 1.05239677, "epoch": 0.30620772583796785, "flos": 19391475415680.0, "grad_norm": 2.2786079445034573, "language_loss": 0.83067733, "learning_rate": 3.2516718669335522e-06, "loss": 0.85243213, "num_input_tokens_seen": 109385260, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.84375, "step": 5093, "time_per_iteration": 2.518906593322754 }, { "auxiliary_loss_clip": 0.01134478, "auxiliary_loss_mlp": 0.0103637, "balance_loss_clip": 1.02212477, "balance_loss_mlp": 1.05146933, "epoch": 0.3062678490906358, "flos": 24024562151040.0, "grad_norm": 2.4331784549422175, "language_loss": 0.74756104, "learning_rate": 3.2513680807222114e-06, "loss": 0.76926947, "num_input_tokens_seen": 109405025, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.828125, "step": 5094, "time_per_iteration": 2.5225510597229004 }, { "auxiliary_loss_clip": 0.0113512, "auxiliary_loss_mlp": 0.01040695, "balance_loss_clip": 1.02637815, "balance_loss_mlp": 1.05315745, "epoch": 0.3063279723433038, "flos": 19754639873280.0, "grad_norm": 11.44465773904738, "language_loss": 0.76416653, "learning_rate": 3.251064247058868e-06, "loss": 0.78592467, "num_input_tokens_seen": 109422465, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8203125, "step": 5095, "time_per_iteration": 3.833705425262451 }, { "auxiliary_loss_clip": 0.01132321, "auxiliary_loss_mlp": 0.01038036, "balance_loss_clip": 1.02309299, "balance_loss_mlp": 1.05121434, "epoch": 0.30638809559597174, "flos": 22450022496000.0, "grad_norm": 1.8534904210199874, "language_loss": 0.80715024, "learning_rate": 3.250760365955042e-06, "loss": 0.82885385, "num_input_tokens_seen": 109440575, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8125, "step": 5096, "time_per_iteration": 3.859358072280884 }, { "auxiliary_loss_clip": 0.01136761, "auxiliary_loss_mlp": 0.0103279, "balance_loss_clip": 1.01793671, "balance_loss_mlp": 1.0519613, "epoch": 0.3064482188486397, "flos": 17165157523200.0, "grad_norm": 3.465887174590767, "language_loss": 0.81384289, "learning_rate": 3.250456437422258e-06, "loss": 0.83553839, "num_input_tokens_seen": 109459050, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.84765625, "step": 5097, "time_per_iteration": 3.8773248195648193 }, { "auxiliary_loss_clip": 0.01135419, "auxiliary_loss_mlp": 0.01039812, "balance_loss_clip": 1.0237546, "balance_loss_mlp": 1.05086803, "epoch": 0.3065083421013077, "flos": 23768483114880.0, "grad_norm": 2.026794381014655, "language_loss": 0.78055495, "learning_rate": 3.250152461472041e-06, "loss": 0.80230725, "num_input_tokens_seen": 109475860, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84375, "step": 5098, "time_per_iteration": 2.493215560913086 }, { "auxiliary_loss_clip": 0.01132254, "auxiliary_loss_mlp": 0.01037085, "balance_loss_clip": 1.022089, "balance_loss_mlp": 1.05015659, "epoch": 0.30656846535397564, "flos": 26431833784320.0, "grad_norm": 2.173919269059902, "language_loss": 0.83836806, "learning_rate": 3.249848438115917e-06, "loss": 0.86006147, "num_input_tokens_seen": 109494760, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8203125, "step": 5099, "time_per_iteration": 2.509462594985962 }, { "auxiliary_loss_clip": 0.0113602, "auxiliary_loss_mlp": 0.01041073, "balance_loss_clip": 1.02543855, "balance_loss_mlp": 1.05023098, "epoch": 0.3066285886066436, "flos": 26651786716800.0, "grad_norm": 4.176344877985559, "language_loss": 0.85907066, "learning_rate": 3.2495443673654148e-06, "loss": 0.88084161, "num_input_tokens_seen": 109516480, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.859375, "step": 5100, "time_per_iteration": 2.5165743827819824 }, { "auxiliary_loss_clip": 0.01135136, "auxiliary_loss_mlp": 0.0103244, "balance_loss_clip": 1.01643026, "balance_loss_mlp": 1.0507617, "epoch": 0.30668871185931157, "flos": 15049947375360.0, "grad_norm": 1.878255325432686, "language_loss": 0.78840125, "learning_rate": 3.249240249232065e-06, "loss": 0.81007707, "num_input_tokens_seen": 109534615, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84375, "step": 5101, "time_per_iteration": 2.45942759513855 }, { "auxiliary_loss_clip": 0.01137339, "auxiliary_loss_mlp": 0.01043589, "balance_loss_clip": 1.02657223, "balance_loss_mlp": 1.05160582, "epoch": 0.30674883511197953, "flos": 20082109190400.0, "grad_norm": 4.012307892381807, "language_loss": 0.79790628, "learning_rate": 3.2489360837273998e-06, "loss": 0.8197155, "num_input_tokens_seen": 109554040, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.859375, "step": 5102, "time_per_iteration": 2.473597764968872 }, { "auxiliary_loss_clip": 0.01136856, "auxiliary_loss_mlp": 0.01039385, "balance_loss_clip": 1.02288675, "balance_loss_mlp": 1.05254674, "epoch": 0.30680895836464755, "flos": 22893807029760.0, "grad_norm": 2.567506665238322, "language_loss": 0.89052057, "learning_rate": 3.2486318708629532e-06, "loss": 0.91228294, "num_input_tokens_seen": 109574345, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.84375, "step": 5103, "time_per_iteration": 2.5079660415649414 }, { "auxiliary_loss_clip": 0.01133527, "auxiliary_loss_mlp": 0.01038884, "balance_loss_clip": 1.02313602, "balance_loss_mlp": 1.05059278, "epoch": 0.3068690816173155, "flos": 23696159080320.0, "grad_norm": 1.7404044079455745, "language_loss": 0.73555833, "learning_rate": 3.2483276106502607e-06, "loss": 0.75728244, "num_input_tokens_seen": 109593670, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.828125, "step": 5104, "time_per_iteration": 2.512503147125244 }, { "auxiliary_loss_clip": 0.01138415, "auxiliary_loss_mlp": 0.01039485, "balance_loss_clip": 1.02352238, "balance_loss_mlp": 1.05063915, "epoch": 0.3069292048699835, "flos": 23551044134400.0, "grad_norm": 1.854603323907984, "language_loss": 0.72992009, "learning_rate": 3.2480233031008605e-06, "loss": 0.75169909, "num_input_tokens_seen": 109613385, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.87890625, "step": 5105, "time_per_iteration": 2.5080926418304443 }, { "auxiliary_loss_clip": 0.01137695, "auxiliary_loss_mlp": 0.01041616, "balance_loss_clip": 1.02533209, "balance_loss_mlp": 1.05170274, "epoch": 0.30698932812265145, "flos": 24531656405760.0, "grad_norm": 2.8358727959353303, "language_loss": 0.87471855, "learning_rate": 3.2477189482262916e-06, "loss": 0.89651167, "num_input_tokens_seen": 109632395, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.859375, "step": 5106, "time_per_iteration": 2.521575689315796 }, { "auxiliary_loss_clip": 0.01139686, "auxiliary_loss_mlp": 0.01047415, "balance_loss_clip": 1.03074932, "balance_loss_mlp": 1.05097091, "epoch": 0.3070494513753194, "flos": 20996430912000.0, "grad_norm": 3.902204186353937, "language_loss": 0.71316844, "learning_rate": 3.2474145460380945e-06, "loss": 0.73503947, "num_input_tokens_seen": 109651380, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8828125, "step": 5107, "time_per_iteration": 2.485919237136841 }, { "auxiliary_loss_clip": 0.01132515, "auxiliary_loss_mlp": 0.01046341, "balance_loss_clip": 1.03026557, "balance_loss_mlp": 1.04934514, "epoch": 0.3071095746279874, "flos": 19025940660480.0, "grad_norm": 2.4854677581788827, "language_loss": 0.72346175, "learning_rate": 3.247110096547814e-06, "loss": 0.74525028, "num_input_tokens_seen": 109670240, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83203125, "step": 5108, "time_per_iteration": 2.474865436553955 }, { "auxiliary_loss_clip": 0.01134808, "auxiliary_loss_mlp": 0.01035021, "balance_loss_clip": 1.01921415, "balance_loss_mlp": 1.05060387, "epoch": 0.30716969788065535, "flos": 21215521918080.0, "grad_norm": 1.6531761750735017, "language_loss": 0.85784185, "learning_rate": 3.2468055997669926e-06, "loss": 0.87954009, "num_input_tokens_seen": 109690810, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84375, "step": 5109, "time_per_iteration": 2.542456865310669 }, { "auxiliary_loss_clip": 0.01133201, "auxiliary_loss_mlp": 0.0103487, "balance_loss_clip": 1.02021921, "balance_loss_mlp": 1.04898834, "epoch": 0.3072298211333233, "flos": 25772765086080.0, "grad_norm": 2.393649329976712, "language_loss": 0.67696571, "learning_rate": 3.2465010557071788e-06, "loss": 0.69864643, "num_input_tokens_seen": 109711145, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.83984375, "step": 5110, "time_per_iteration": 2.5393149852752686 }, { "auxiliary_loss_clip": 0.01129076, "auxiliary_loss_mlp": 0.01026808, "balance_loss_clip": 1.01319432, "balance_loss_mlp": 1.0480113, "epoch": 0.3072899443859913, "flos": 25848931875840.0, "grad_norm": 1.6031233224126078, "language_loss": 0.77080941, "learning_rate": 3.246196464379919e-06, "loss": 0.79236817, "num_input_tokens_seen": 109731425, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.8125, "step": 5111, "time_per_iteration": 2.5293102264404297 }, { "auxiliary_loss_clip": 0.01135844, "auxiliary_loss_mlp": 0.0104014, "balance_loss_clip": 1.02511346, "balance_loss_mlp": 1.05106783, "epoch": 0.30735006763865924, "flos": 25922800195200.0, "grad_norm": 2.06254726683273, "language_loss": 0.67269772, "learning_rate": 3.245891825796765e-06, "loss": 0.69445753, "num_input_tokens_seen": 109752720, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.84765625, "step": 5112, "time_per_iteration": 2.555216073989868 }, { "auxiliary_loss_clip": 0.01140754, "auxiliary_loss_mlp": 0.01042523, "balance_loss_clip": 1.0249989, "balance_loss_mlp": 1.05229115, "epoch": 0.3074101908913272, "flos": 30917004312960.0, "grad_norm": 2.6995564557516345, "language_loss": 0.79234445, "learning_rate": 3.2455871399692678e-06, "loss": 0.81417722, "num_input_tokens_seen": 109772840, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.8828125, "step": 5113, "time_per_iteration": 2.548447847366333 }, { "auxiliary_loss_clip": 0.01133195, "auxiliary_loss_mlp": 0.01037846, "balance_loss_clip": 1.02267075, "balance_loss_mlp": 1.04778993, "epoch": 0.30747031414399517, "flos": 18401058731520.0, "grad_norm": 3.5884663782213972, "language_loss": 0.76896369, "learning_rate": 3.2452824069089815e-06, "loss": 0.79067409, "num_input_tokens_seen": 109790150, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8515625, "step": 5114, "time_per_iteration": 2.4868531227111816 }, { "auxiliary_loss_clip": 0.01133218, "auxiliary_loss_mlp": 0.01034306, "balance_loss_clip": 1.0175575, "balance_loss_mlp": 1.04941285, "epoch": 0.30753043739666314, "flos": 22633166966400.0, "grad_norm": 2.058865574831365, "language_loss": 0.62120199, "learning_rate": 3.2449776266274623e-06, "loss": 0.64287728, "num_input_tokens_seen": 109807985, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8359375, "step": 5115, "time_per_iteration": 2.4933390617370605 }, { "auxiliary_loss_clip": 0.01133315, "auxiliary_loss_mlp": 0.01035238, "balance_loss_clip": 1.0197413, "balance_loss_mlp": 1.04809523, "epoch": 0.3075905606493311, "flos": 27344072517120.0, "grad_norm": 2.3461256650807414, "language_loss": 0.82920283, "learning_rate": 3.2446727991362657e-06, "loss": 0.85088837, "num_input_tokens_seen": 109825920, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8515625, "step": 5116, "time_per_iteration": 2.533574104309082 }, { "auxiliary_loss_clip": 0.01132441, "auxiliary_loss_mlp": 0.01038209, "balance_loss_clip": 1.02357006, "balance_loss_mlp": 1.04866803, "epoch": 0.3076506839019991, "flos": 22090808534400.0, "grad_norm": 1.973788082875569, "language_loss": 0.75596505, "learning_rate": 3.244367924446952e-06, "loss": 0.77767152, "num_input_tokens_seen": 109846220, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8359375, "step": 5117, "time_per_iteration": 2.491054058074951 }, { "auxiliary_loss_clip": 0.01136486, "auxiliary_loss_mlp": 0.01034129, "balance_loss_clip": 1.01826203, "balance_loss_mlp": 1.0512023, "epoch": 0.3077108071546671, "flos": 21289533891840.0, "grad_norm": 2.84861136475531, "language_loss": 0.71760583, "learning_rate": 3.2440630025710826e-06, "loss": 0.73931199, "num_input_tokens_seen": 109863870, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8515625, "step": 5118, "time_per_iteration": 2.4929380416870117 }, { "auxiliary_loss_clip": 0.01135192, "auxiliary_loss_mlp": 0.01038412, "balance_loss_clip": 1.02361774, "balance_loss_mlp": 1.05064297, "epoch": 0.30777093040733505, "flos": 21430985650560.0, "grad_norm": 1.8902109245825525, "language_loss": 0.74455279, "learning_rate": 3.243758033520219e-06, "loss": 0.76628882, "num_input_tokens_seen": 109883500, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.84375, "step": 5119, "time_per_iteration": 2.500114679336548 }, { "auxiliary_loss_clip": 0.01137342, "auxiliary_loss_mlp": 0.01045999, "balance_loss_clip": 1.02895248, "balance_loss_mlp": 1.05007267, "epoch": 0.307831053660003, "flos": 23149275534720.0, "grad_norm": 6.1041804895244605, "language_loss": 0.7976197, "learning_rate": 3.243453017305926e-06, "loss": 0.81945312, "num_input_tokens_seen": 109904620, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.87109375, "step": 5120, "time_per_iteration": 2.5382466316223145 }, { "auxiliary_loss_clip": 0.011299, "auxiliary_loss_mlp": 0.01043564, "balance_loss_clip": 1.02847815, "balance_loss_mlp": 1.04663038, "epoch": 0.307891176912671, "flos": 17019755268480.0, "grad_norm": 1.6941295792091313, "language_loss": 0.79897904, "learning_rate": 3.24314795393977e-06, "loss": 0.82071364, "num_input_tokens_seen": 109922275, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.83203125, "step": 5121, "time_per_iteration": 2.4377496242523193 }, { "auxiliary_loss_clip": 0.01132762, "auxiliary_loss_mlp": 0.01037277, "balance_loss_clip": 1.02254295, "balance_loss_mlp": 1.04969537, "epoch": 0.30795130016533895, "flos": 27705046245120.0, "grad_norm": 1.6842713522488417, "language_loss": 0.82411063, "learning_rate": 3.242842843433319e-06, "loss": 0.84581101, "num_input_tokens_seen": 109944265, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.828125, "step": 5122, "time_per_iteration": 2.5339434146881104 }, { "auxiliary_loss_clip": 0.01067401, "auxiliary_loss_mlp": 0.01012088, "balance_loss_clip": 1.00968027, "balance_loss_mlp": 1.03723741, "epoch": 0.3080114234180069, "flos": 69058699591680.0, "grad_norm": 0.7532025341550157, "language_loss": 0.58653599, "learning_rate": 3.242537685798143e-06, "loss": 0.60733086, "num_input_tokens_seen": 110014160, "router_z_loss_clip": 0.02404785, "router_z_loss_mlp": 0.30078125, "step": 5123, "time_per_iteration": 3.2626919746398926 }, { "auxiliary_loss_clip": 0.01139345, "auxiliary_loss_mlp": 0.01038793, "balance_loss_clip": 1.0215081, "balance_loss_mlp": 1.05073047, "epoch": 0.3080715466706749, "flos": 24060221377920.0, "grad_norm": 3.711833149736471, "language_loss": 0.82819784, "learning_rate": 3.242232481045813e-06, "loss": 0.84997928, "num_input_tokens_seen": 110034865, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.88671875, "step": 5124, "time_per_iteration": 2.515864849090576 }, { "auxiliary_loss_clip": 0.01138271, "auxiliary_loss_mlp": 0.01042525, "balance_loss_clip": 1.02707529, "balance_loss_mlp": 1.05147457, "epoch": 0.30813166992334284, "flos": 25848680480640.0, "grad_norm": 2.0210684963760013, "language_loss": 0.79246283, "learning_rate": 3.2419272291879035e-06, "loss": 0.81427079, "num_input_tokens_seen": 110052930, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8671875, "step": 5125, "time_per_iteration": 2.514693260192871 }, { "auxiliary_loss_clip": 0.01137519, "auxiliary_loss_mlp": 0.01040532, "balance_loss_clip": 1.0228411, "balance_loss_mlp": 1.04906178, "epoch": 0.3081917931760108, "flos": 20449619193600.0, "grad_norm": 2.372984447225798, "language_loss": 0.64279985, "learning_rate": 3.241621930235989e-06, "loss": 0.66458035, "num_input_tokens_seen": 110071765, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.8828125, "step": 5126, "time_per_iteration": 2.45477557182312 }, { "auxiliary_loss_clip": 0.01131847, "auxiliary_loss_mlp": 0.01036419, "balance_loss_clip": 1.02110088, "balance_loss_mlp": 1.05083203, "epoch": 0.3082519164286788, "flos": 22166257052160.0, "grad_norm": 1.7772910220384905, "language_loss": 0.8674227, "learning_rate": 3.241316584201646e-06, "loss": 0.88910532, "num_input_tokens_seen": 110092660, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.80859375, "step": 5127, "time_per_iteration": 2.5026485919952393 }, { "auxiliary_loss_clip": 0.01132474, "auxiliary_loss_mlp": 0.0103997, "balance_loss_clip": 1.02363849, "balance_loss_mlp": 1.04892921, "epoch": 0.30831203968134674, "flos": 28913404700160.0, "grad_norm": 1.6273932506199922, "language_loss": 0.68796432, "learning_rate": 3.2410111910964538e-06, "loss": 0.70968872, "num_input_tokens_seen": 110114960, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8359375, "step": 5128, "time_per_iteration": 2.5218870639801025 }, { "auxiliary_loss_clip": 0.01138499, "auxiliary_loss_mlp": 0.01040552, "balance_loss_clip": 1.02364779, "balance_loss_mlp": 1.05117202, "epoch": 0.3083721629340147, "flos": 25667726739840.0, "grad_norm": 1.9215966597433733, "language_loss": 0.71502596, "learning_rate": 3.240705750931993e-06, "loss": 0.73681647, "num_input_tokens_seen": 110135750, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.875, "step": 5129, "time_per_iteration": 2.5535082817077637 }, { "auxiliary_loss_clip": 0.01064761, "auxiliary_loss_mlp": 0.01002734, "balance_loss_clip": 1.0001353, "balance_loss_mlp": 1.03483152, "epoch": 0.3084322861866827, "flos": 68212679581440.0, "grad_norm": 0.825053945692798, "language_loss": 0.592363, "learning_rate": 3.240400263719846e-06, "loss": 0.61303794, "num_input_tokens_seen": 110189480, "router_z_loss_clip": 0.02600098, "router_z_loss_mlp": 0.29882812, "step": 5130, "time_per_iteration": 3.079982042312622 }, { "auxiliary_loss_clip": 0.01139033, "auxiliary_loss_mlp": 0.01036306, "balance_loss_clip": 1.0195812, "balance_loss_mlp": 1.05084777, "epoch": 0.3084924094393507, "flos": 20296495514880.0, "grad_norm": 2.19388693684468, "language_loss": 0.7287873, "learning_rate": 3.2400947294715957e-06, "loss": 0.75054067, "num_input_tokens_seen": 110206445, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8828125, "step": 5131, "time_per_iteration": 2.487323522567749 }, { "auxiliary_loss_clip": 0.01134334, "auxiliary_loss_mlp": 0.01037487, "balance_loss_clip": 1.02283597, "balance_loss_mlp": 1.05020642, "epoch": 0.30855253269201866, "flos": 23949831905280.0, "grad_norm": 1.5899223248430068, "language_loss": 0.71219909, "learning_rate": 3.2397891481988303e-06, "loss": 0.7339173, "num_input_tokens_seen": 110226845, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.84375, "step": 5132, "time_per_iteration": 2.5054078102111816 }, { "auxiliary_loss_clip": 0.01131458, "auxiliary_loss_mlp": 0.01037041, "balance_loss_clip": 1.02143681, "balance_loss_mlp": 1.05063093, "epoch": 0.3086126559446866, "flos": 19281876042240.0, "grad_norm": 2.3411579389946175, "language_loss": 0.89828503, "learning_rate": 3.239483519913136e-06, "loss": 0.91996992, "num_input_tokens_seen": 110244095, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.80859375, "step": 5133, "time_per_iteration": 2.4685728549957275 }, { "auxiliary_loss_clip": 0.01136512, "auxiliary_loss_mlp": 0.01044026, "balance_loss_clip": 1.02784932, "balance_loss_mlp": 1.04887366, "epoch": 0.3086727791973546, "flos": 33760770019200.0, "grad_norm": 1.936171309196177, "language_loss": 0.66577417, "learning_rate": 3.239177844626102e-06, "loss": 0.68757957, "num_input_tokens_seen": 110264240, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.875, "step": 5134, "time_per_iteration": 4.1329028606414795 }, { "auxiliary_loss_clip": 0.01136873, "auxiliary_loss_mlp": 0.01036827, "balance_loss_clip": 1.02037609, "balance_loss_mlp": 1.04935217, "epoch": 0.30873290245002255, "flos": 16034151006720.0, "grad_norm": 2.5672656086109633, "language_loss": 0.82710898, "learning_rate": 3.2388721223493197e-06, "loss": 0.84884596, "num_input_tokens_seen": 110282450, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.875, "step": 5135, "time_per_iteration": 2.4413511753082275 }, { "auxiliary_loss_clip": 0.01062835, "auxiliary_loss_mlp": 0.01001812, "balance_loss_clip": 0.99921304, "balance_loss_mlp": 1.03339672, "epoch": 0.3087930257026905, "flos": 65048304055680.0, "grad_norm": 0.7215883324270294, "language_loss": 0.55286938, "learning_rate": 3.2385663530943824e-06, "loss": 0.57351583, "num_input_tokens_seen": 110343715, "router_z_loss_clip": 0.02600098, "router_z_loss_mlp": 0.29296875, "step": 5136, "time_per_iteration": 3.137556552886963 }, { "auxiliary_loss_clip": 0.01134241, "auxiliary_loss_mlp": 0.01036341, "balance_loss_clip": 1.02048671, "balance_loss_mlp": 1.04917932, "epoch": 0.3088531489553585, "flos": 74738829824640.0, "grad_norm": 2.1001829781591197, "language_loss": 0.76358438, "learning_rate": 3.2382605368728852e-06, "loss": 0.78529024, "num_input_tokens_seen": 110368430, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8515625, "step": 5137, "time_per_iteration": 5.597553968429565 }, { "auxiliary_loss_clip": 0.01133732, "auxiliary_loss_mlp": 0.01034722, "balance_loss_clip": 1.01966536, "balance_loss_mlp": 1.04894209, "epoch": 0.30891327220802645, "flos": 21142300043520.0, "grad_norm": 7.587922524854496, "language_loss": 0.79449981, "learning_rate": 3.237954673696424e-06, "loss": 0.81618434, "num_input_tokens_seen": 110386735, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.84765625, "step": 5138, "time_per_iteration": 2.4696226119995117 }, { "auxiliary_loss_clip": 0.01133631, "auxiliary_loss_mlp": 0.01038304, "balance_loss_clip": 1.02165675, "balance_loss_mlp": 1.04722631, "epoch": 0.3089733954606944, "flos": 25664494515840.0, "grad_norm": 1.468745774180791, "language_loss": 0.81578517, "learning_rate": 3.2376487635765983e-06, "loss": 0.83750463, "num_input_tokens_seen": 110406820, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8671875, "step": 5139, "time_per_iteration": 3.9558486938476562 }, { "auxiliary_loss_clip": 0.01141402, "auxiliary_loss_mlp": 0.01038926, "balance_loss_clip": 1.02110386, "balance_loss_mlp": 1.0500015, "epoch": 0.3090335187133624, "flos": 19427350124160.0, "grad_norm": 2.8511136743720176, "language_loss": 0.76778591, "learning_rate": 3.2373428065250067e-06, "loss": 0.78958917, "num_input_tokens_seen": 110424225, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.9140625, "step": 5140, "time_per_iteration": 2.482848644256592 }, { "auxiliary_loss_clip": 0.01130298, "auxiliary_loss_mlp": 0.01043931, "balance_loss_clip": 1.02880311, "balance_loss_mlp": 1.049088, "epoch": 0.30909364196603034, "flos": 20011329440640.0, "grad_norm": 1.7924192170600488, "language_loss": 0.78379303, "learning_rate": 3.237036802553252e-06, "loss": 0.80553532, "num_input_tokens_seen": 110443310, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8125, "step": 5141, "time_per_iteration": 2.462111473083496 }, { "auxiliary_loss_clip": 0.01137562, "auxiliary_loss_mlp": 0.01041771, "balance_loss_clip": 1.02492714, "balance_loss_mlp": 1.04995108, "epoch": 0.3091537652186983, "flos": 19677575243520.0, "grad_norm": 2.068335538391424, "language_loss": 0.87302411, "learning_rate": 3.2367307516729377e-06, "loss": 0.89481741, "num_input_tokens_seen": 110460215, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.875, "step": 5142, "time_per_iteration": 2.474501371383667 }, { "auxiliary_loss_clip": 0.01133761, "auxiliary_loss_mlp": 0.01041454, "balance_loss_clip": 1.02574253, "balance_loss_mlp": 1.04813647, "epoch": 0.3092138884713663, "flos": 17020042577280.0, "grad_norm": 1.8151195099772348, "language_loss": 0.79130799, "learning_rate": 3.23642465389567e-06, "loss": 0.81306016, "num_input_tokens_seen": 110479385, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.85546875, "step": 5143, "time_per_iteration": 2.453160047531128 }, { "auxiliary_loss_clip": 0.01134356, "auxiliary_loss_mlp": 0.01042206, "balance_loss_clip": 1.0255053, "balance_loss_mlp": 1.04886532, "epoch": 0.3092740117240343, "flos": 25009986844800.0, "grad_norm": 1.7060924106108704, "language_loss": 0.71456164, "learning_rate": 3.236118509233055e-06, "loss": 0.73632729, "num_input_tokens_seen": 110499885, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.85546875, "step": 5144, "time_per_iteration": 2.534254789352417 }, { "auxiliary_loss_clip": 0.01135027, "auxiliary_loss_mlp": 0.01040246, "balance_loss_clip": 1.02358067, "balance_loss_mlp": 1.04745746, "epoch": 0.30933413497670226, "flos": 25590410714880.0, "grad_norm": 1.8721310724299614, "language_loss": 0.73945689, "learning_rate": 3.235812317696702e-06, "loss": 0.76120961, "num_input_tokens_seen": 110519690, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.875, "step": 5145, "time_per_iteration": 2.5000176429748535 }, { "auxiliary_loss_clip": 0.01133666, "auxiliary_loss_mlp": 0.01041523, "balance_loss_clip": 1.02541757, "balance_loss_mlp": 1.04873371, "epoch": 0.3093942582293702, "flos": 24389665943040.0, "grad_norm": 2.083515816289934, "language_loss": 0.75860262, "learning_rate": 3.2355060792982224e-06, "loss": 0.78035456, "num_input_tokens_seen": 110540520, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8515625, "step": 5146, "time_per_iteration": 2.529646158218384 }, { "auxiliary_loss_clip": 0.01133631, "auxiliary_loss_mlp": 0.01035, "balance_loss_clip": 1.019485, "balance_loss_mlp": 1.04946613, "epoch": 0.3094543814820382, "flos": 19646441130240.0, "grad_norm": 2.488216591772369, "language_loss": 0.66611093, "learning_rate": 3.2351997940492286e-06, "loss": 0.68779725, "num_input_tokens_seen": 110557950, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.84375, "step": 5147, "time_per_iteration": 2.458592414855957 }, { "auxiliary_loss_clip": 0.01139486, "auxiliary_loss_mlp": 0.01038616, "balance_loss_clip": 1.02330971, "balance_loss_mlp": 1.05280387, "epoch": 0.30951450473470615, "flos": 25663812157440.0, "grad_norm": 1.5855839671060143, "language_loss": 0.74610472, "learning_rate": 3.2348934619613346e-06, "loss": 0.7678858, "num_input_tokens_seen": 110578215, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8671875, "step": 5148, "time_per_iteration": 2.523735284805298 }, { "auxiliary_loss_clip": 0.01142709, "auxiliary_loss_mlp": 0.01046418, "balance_loss_clip": 1.02929986, "balance_loss_mlp": 1.05126369, "epoch": 0.3095746279873741, "flos": 12020415505920.0, "grad_norm": 6.4768131385948635, "language_loss": 0.72964382, "learning_rate": 3.2345870830461567e-06, "loss": 0.75153518, "num_input_tokens_seen": 110592990, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9140625, "step": 5149, "time_per_iteration": 2.412785291671753 }, { "auxiliary_loss_clip": 0.01136066, "auxiliary_loss_mlp": 0.01041199, "balance_loss_clip": 1.02404487, "balance_loss_mlp": 1.04845929, "epoch": 0.3096347512400421, "flos": 23623044946560.0, "grad_norm": 2.339662510476989, "language_loss": 0.84427691, "learning_rate": 3.2342806573153132e-06, "loss": 0.86604959, "num_input_tokens_seen": 110612130, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.875, "step": 5150, "time_per_iteration": 2.527120351791382 }, { "auxiliary_loss_clip": 0.01135948, "auxiliary_loss_mlp": 0.01039295, "balance_loss_clip": 1.02266574, "balance_loss_mlp": 1.04990888, "epoch": 0.30969487449271005, "flos": 22529313768960.0, "grad_norm": 2.223473967610299, "language_loss": 0.78587598, "learning_rate": 3.233974184780424e-06, "loss": 0.80762839, "num_input_tokens_seen": 110632045, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.859375, "step": 5151, "time_per_iteration": 2.489332675933838 }, { "auxiliary_loss_clip": 0.01139177, "auxiliary_loss_mlp": 0.01040457, "balance_loss_clip": 1.02315938, "balance_loss_mlp": 1.0506103, "epoch": 0.309754997745378, "flos": 15267925059840.0, "grad_norm": 2.014745019861368, "language_loss": 0.67111146, "learning_rate": 3.2336676654531084e-06, "loss": 0.69290775, "num_input_tokens_seen": 110649340, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.88671875, "step": 5152, "time_per_iteration": 2.4817514419555664 }, { "auxiliary_loss_clip": 0.01135272, "auxiliary_loss_mlp": 0.01037933, "balance_loss_clip": 1.02180409, "balance_loss_mlp": 1.04954672, "epoch": 0.309815120998046, "flos": 26979291947520.0, "grad_norm": 2.1097196875782553, "language_loss": 0.82447612, "learning_rate": 3.2333610993449926e-06, "loss": 0.84620821, "num_input_tokens_seen": 110668450, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.859375, "step": 5153, "time_per_iteration": 2.5277717113494873 }, { "auxiliary_loss_clip": 0.01138231, "auxiliary_loss_mlp": 0.01040031, "balance_loss_clip": 1.02371109, "balance_loss_mlp": 1.05176854, "epoch": 0.30987524425071394, "flos": 21143161969920.0, "grad_norm": 3.0195540544381507, "language_loss": 0.7390182, "learning_rate": 3.2330544864676997e-06, "loss": 0.76080084, "num_input_tokens_seen": 110689410, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.86328125, "step": 5154, "time_per_iteration": 2.5556864738464355 }, { "auxiliary_loss_clip": 0.01136438, "auxiliary_loss_mlp": 0.0103357, "balance_loss_clip": 1.01812029, "balance_loss_mlp": 1.05106974, "epoch": 0.3099353675033819, "flos": 15268284195840.0, "grad_norm": 2.3764234852407706, "language_loss": 0.75687891, "learning_rate": 3.232747826832858e-06, "loss": 0.778579, "num_input_tokens_seen": 110707350, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8515625, "step": 5155, "time_per_iteration": 2.4504764080047607 }, { "auxiliary_loss_clip": 0.01138438, "auxiliary_loss_mlp": 0.0103895, "balance_loss_clip": 1.02286923, "balance_loss_mlp": 1.05053091, "epoch": 0.30999549075604993, "flos": 15413794191360.0, "grad_norm": 2.9125616351252432, "language_loss": 0.78727233, "learning_rate": 3.232441120452094e-06, "loss": 0.80904627, "num_input_tokens_seen": 110724910, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.87890625, "step": 5156, "time_per_iteration": 2.4782421588897705 }, { "auxiliary_loss_clip": 0.0113969, "auxiliary_loss_mlp": 0.01043834, "balance_loss_clip": 1.02634645, "balance_loss_mlp": 1.05063105, "epoch": 0.3100556140087179, "flos": 23184539712000.0, "grad_norm": 2.1847054672269572, "language_loss": 0.75293958, "learning_rate": 3.23213436733704e-06, "loss": 0.77477479, "num_input_tokens_seen": 110744010, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.890625, "step": 5157, "time_per_iteration": 2.4856536388397217 }, { "auxiliary_loss_clip": 0.01134354, "auxiliary_loss_mlp": 0.01035035, "balance_loss_clip": 1.02022958, "balance_loss_mlp": 1.05020487, "epoch": 0.31011573726138586, "flos": 25742169676800.0, "grad_norm": 1.9906840093945892, "language_loss": 0.6926468, "learning_rate": 3.231827567499327e-06, "loss": 0.71434069, "num_input_tokens_seen": 110765835, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.84375, "step": 5158, "time_per_iteration": 2.5446841716766357 }, { "auxiliary_loss_clip": 0.0113402, "auxiliary_loss_mlp": 0.01036602, "balance_loss_clip": 1.02184391, "balance_loss_mlp": 1.05009484, "epoch": 0.3101758605140538, "flos": 20011329440640.0, "grad_norm": 2.4338127825763185, "language_loss": 0.85044706, "learning_rate": 3.2315207209505896e-06, "loss": 0.87215328, "num_input_tokens_seen": 110784655, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8359375, "step": 5159, "time_per_iteration": 2.4719650745391846 }, { "auxiliary_loss_clip": 0.01135748, "auxiliary_loss_mlp": 0.0103576, "balance_loss_clip": 1.01960683, "balance_loss_mlp": 1.04978538, "epoch": 0.3102359837667218, "flos": 19135683688320.0, "grad_norm": 2.705583781905566, "language_loss": 0.84716517, "learning_rate": 3.231213827702462e-06, "loss": 0.86888027, "num_input_tokens_seen": 110802545, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.859375, "step": 5160, "time_per_iteration": 2.483539581298828 }, { "auxiliary_loss_clip": 0.01136576, "auxiliary_loss_mlp": 0.01039282, "balance_loss_clip": 1.02361798, "balance_loss_mlp": 1.05239749, "epoch": 0.31029610701938976, "flos": 22265405568000.0, "grad_norm": 1.8349560830857459, "language_loss": 0.75610965, "learning_rate": 3.230906887766584e-06, "loss": 0.77786827, "num_input_tokens_seen": 110820265, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.84375, "step": 5161, "time_per_iteration": 2.4550111293792725 }, { "auxiliary_loss_clip": 0.01138907, "auxiliary_loss_mlp": 0.01035988, "balance_loss_clip": 1.02060962, "balance_loss_mlp": 1.05086613, "epoch": 0.3103562302720577, "flos": 20805349536000.0, "grad_norm": 2.259313988808482, "language_loss": 0.81730604, "learning_rate": 3.2305999011545924e-06, "loss": 0.839055, "num_input_tokens_seen": 110836195, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8828125, "step": 5162, "time_per_iteration": 2.455233573913574 }, { "auxiliary_loss_clip": 0.01133992, "auxiliary_loss_mlp": 0.01034961, "balance_loss_clip": 1.02061999, "balance_loss_mlp": 1.05029941, "epoch": 0.3104163535247257, "flos": 22344158136960.0, "grad_norm": 1.7112993326402652, "language_loss": 0.82790983, "learning_rate": 3.2302928678781295e-06, "loss": 0.8495993, "num_input_tokens_seen": 110856420, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8359375, "step": 5163, "time_per_iteration": 2.4692206382751465 }, { "auxiliary_loss_clip": 0.011405, "auxiliary_loss_mlp": 0.01038775, "balance_loss_clip": 1.02271152, "balance_loss_mlp": 1.05298901, "epoch": 0.31047647677739365, "flos": 21689363157120.0, "grad_norm": 1.6807029375761808, "language_loss": 0.76370996, "learning_rate": 3.2299857879488376e-06, "loss": 0.78550267, "num_input_tokens_seen": 110876650, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.875, "step": 5164, "time_per_iteration": 2.502312183380127 }, { "auxiliary_loss_clip": 0.01143033, "auxiliary_loss_mlp": 0.01039277, "balance_loss_clip": 1.02242064, "balance_loss_mlp": 1.05639148, "epoch": 0.3105366000300616, "flos": 18917275040640.0, "grad_norm": 2.0878040825068718, "language_loss": 0.74491656, "learning_rate": 3.2296786613783626e-06, "loss": 0.76673973, "num_input_tokens_seen": 110894445, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8671875, "step": 5165, "time_per_iteration": 2.491447925567627 }, { "auxiliary_loss_clip": 0.01137994, "auxiliary_loss_mlp": 0.01036956, "balance_loss_clip": 1.02114844, "balance_loss_mlp": 1.05303228, "epoch": 0.3105967232827296, "flos": 18260397072000.0, "grad_norm": 3.090904297942544, "language_loss": 0.75755179, "learning_rate": 3.229371488178348e-06, "loss": 0.77930129, "num_input_tokens_seen": 110912855, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8515625, "step": 5166, "time_per_iteration": 2.4683916568756104 }, { "auxiliary_loss_clip": 0.01140827, "auxiliary_loss_mlp": 0.01039542, "balance_loss_clip": 1.02342486, "balance_loss_mlp": 1.05389452, "epoch": 0.31065684653539755, "flos": 17672144037120.0, "grad_norm": 2.3937576806263903, "language_loss": 0.73481929, "learning_rate": 3.229064268360444e-06, "loss": 0.75662297, "num_input_tokens_seen": 110928025, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.87109375, "step": 5167, "time_per_iteration": 2.4436845779418945 }, { "auxiliary_loss_clip": 0.01065687, "auxiliary_loss_mlp": 0.01007209, "balance_loss_clip": 1.00453901, "balance_loss_mlp": 1.03659201, "epoch": 0.3107169697880655, "flos": 68531996511360.0, "grad_norm": 0.7253204155250941, "language_loss": 0.52976716, "learning_rate": 3.2287570019362997e-06, "loss": 0.5504961, "num_input_tokens_seen": 110992215, "router_z_loss_clip": 0.0267334, "router_z_loss_mlp": 0.29101562, "step": 5168, "time_per_iteration": 3.1993567943573 }, { "auxiliary_loss_clip": 0.01144132, "auxiliary_loss_mlp": 0.01037629, "balance_loss_clip": 1.02071357, "balance_loss_mlp": 1.05546105, "epoch": 0.3107770930407335, "flos": 13188733274880.0, "grad_norm": 1.875202479834562, "language_loss": 0.78284407, "learning_rate": 3.2284496889175668e-06, "loss": 0.80466169, "num_input_tokens_seen": 111010400, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.88671875, "step": 5169, "time_per_iteration": 2.440812349319458 }, { "auxiliary_loss_clip": 0.01143082, "auxiliary_loss_mlp": 0.01040699, "balance_loss_clip": 1.02473712, "balance_loss_mlp": 1.05422688, "epoch": 0.3108372162934015, "flos": 31580849520000.0, "grad_norm": 2.5026269826408383, "language_loss": 0.63791049, "learning_rate": 3.2281423293158986e-06, "loss": 0.65974826, "num_input_tokens_seen": 111033960, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.890625, "step": 5170, "time_per_iteration": 2.5980045795440674 }, { "auxiliary_loss_clip": 0.0114493, "auxiliary_loss_mlp": 0.01040619, "balance_loss_clip": 1.02396536, "balance_loss_mlp": 1.05691826, "epoch": 0.31089733954606946, "flos": 28729829266560.0, "grad_norm": 10.479980321947973, "language_loss": 0.77767336, "learning_rate": 3.22783492314295e-06, "loss": 0.79952884, "num_input_tokens_seen": 111053265, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.87890625, "step": 5171, "time_per_iteration": 2.5388686656951904 }, { "auxiliary_loss_clip": 0.01144639, "auxiliary_loss_mlp": 0.01044974, "balance_loss_clip": 1.02929187, "balance_loss_mlp": 1.05637658, "epoch": 0.3109574627987374, "flos": 19683249592320.0, "grad_norm": 1.843154263370736, "language_loss": 0.83669364, "learning_rate": 3.2275274704103785e-06, "loss": 0.85858977, "num_input_tokens_seen": 111071130, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8828125, "step": 5172, "time_per_iteration": 2.4889748096466064 }, { "auxiliary_loss_clip": 0.01140261, "auxiliary_loss_mlp": 0.01040439, "balance_loss_clip": 1.02481055, "balance_loss_mlp": 1.05275226, "epoch": 0.3110175860514054, "flos": 14683981656960.0, "grad_norm": 2.766943687732997, "language_loss": 0.84270805, "learning_rate": 3.227219971129842e-06, "loss": 0.86451507, "num_input_tokens_seen": 111089560, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.875, "step": 5173, "time_per_iteration": 2.4462485313415527 }, { "auxiliary_loss_clip": 0.01140764, "auxiliary_loss_mlp": 0.01033325, "balance_loss_clip": 1.01879358, "balance_loss_mlp": 1.05742788, "epoch": 0.31107770930407336, "flos": 25739655724800.0, "grad_norm": 1.631549558608023, "language_loss": 0.83347619, "learning_rate": 3.226912425313001e-06, "loss": 0.8552171, "num_input_tokens_seen": 111109960, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8359375, "step": 5174, "time_per_iteration": 2.5717196464538574 }, { "auxiliary_loss_clip": 0.01143135, "auxiliary_loss_mlp": 0.0104011, "balance_loss_clip": 1.0243926, "balance_loss_mlp": 1.05698359, "epoch": 0.3111378325567413, "flos": 19208259118080.0, "grad_norm": 2.490645623575468, "language_loss": 0.85281265, "learning_rate": 3.2266048329715183e-06, "loss": 0.87464511, "num_input_tokens_seen": 111127960, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.859375, "step": 5175, "time_per_iteration": 2.47245192527771 }, { "auxiliary_loss_clip": 0.01142458, "auxiliary_loss_mlp": 0.01038709, "balance_loss_clip": 1.02240086, "balance_loss_mlp": 1.05785704, "epoch": 0.3111979558094093, "flos": 23696374561920.0, "grad_norm": 2.2353768030246703, "language_loss": 0.83223104, "learning_rate": 3.2262971941170575e-06, "loss": 0.85404265, "num_input_tokens_seen": 111146730, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.84765625, "step": 5176, "time_per_iteration": 3.9523251056671143 }, { "auxiliary_loss_clip": 0.01138493, "auxiliary_loss_mlp": 0.01038559, "balance_loss_clip": 1.02213168, "balance_loss_mlp": 1.05207753, "epoch": 0.31125807906207725, "flos": 21033023892480.0, "grad_norm": 2.131530745075992, "language_loss": 0.80355608, "learning_rate": 3.2259895087612837e-06, "loss": 0.82532656, "num_input_tokens_seen": 111166295, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.86328125, "step": 5177, "time_per_iteration": 2.4802136421203613 }, { "auxiliary_loss_clip": 0.01145939, "auxiliary_loss_mlp": 0.01037765, "balance_loss_clip": 1.0219934, "balance_loss_mlp": 1.05893755, "epoch": 0.3113182023147452, "flos": 23076628277760.0, "grad_norm": 1.7452906951613767, "language_loss": 0.80953908, "learning_rate": 3.2256817769158657e-06, "loss": 0.83137608, "num_input_tokens_seen": 111185665, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.87109375, "step": 5178, "time_per_iteration": 3.969151020050049 }, { "auxiliary_loss_clip": 0.01143927, "auxiliary_loss_mlp": 0.01046439, "balance_loss_clip": 1.03093004, "balance_loss_mlp": 1.05517304, "epoch": 0.3113783255674132, "flos": 11838994888320.0, "grad_norm": 1.9324478325376764, "language_loss": 0.81575423, "learning_rate": 3.225373998592471e-06, "loss": 0.83765793, "num_input_tokens_seen": 111201615, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.88671875, "step": 5179, "time_per_iteration": 3.8192129135131836 }, { "auxiliary_loss_clip": 0.01141987, "auxiliary_loss_mlp": 0.01043885, "balance_loss_clip": 1.02778006, "balance_loss_mlp": 1.05629504, "epoch": 0.31143844882008115, "flos": 16289547684480.0, "grad_norm": 1.706082300828718, "language_loss": 0.77987826, "learning_rate": 3.2250661738027715e-06, "loss": 0.80173695, "num_input_tokens_seen": 111220515, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.859375, "step": 5180, "time_per_iteration": 3.863070011138916 }, { "auxiliary_loss_clip": 0.0114048, "auxiliary_loss_mlp": 0.01034197, "balance_loss_clip": 1.01831222, "balance_loss_mlp": 1.05397034, "epoch": 0.3114985720727491, "flos": 23217792727680.0, "grad_norm": 1.8782429534043166, "language_loss": 0.831954, "learning_rate": 3.22475830255844e-06, "loss": 0.85370076, "num_input_tokens_seen": 111240395, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8671875, "step": 5181, "time_per_iteration": 2.5014514923095703 }, { "auxiliary_loss_clip": 0.01135646, "auxiliary_loss_mlp": 0.01038758, "balance_loss_clip": 1.02403593, "balance_loss_mlp": 1.05260038, "epoch": 0.3115586953254171, "flos": 30044626698240.0, "grad_norm": 1.8535075953341713, "language_loss": 0.73768234, "learning_rate": 3.2244503848711516e-06, "loss": 0.75942641, "num_input_tokens_seen": 111261100, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.83203125, "step": 5182, "time_per_iteration": 2.5635478496551514 }, { "auxiliary_loss_clip": 0.01143711, "auxiliary_loss_mlp": 0.01048579, "balance_loss_clip": 1.03266478, "balance_loss_mlp": 1.05420756, "epoch": 0.3116188185780851, "flos": 25666326109440.0, "grad_norm": 2.0476874236219214, "language_loss": 0.70541668, "learning_rate": 3.2241424207525815e-06, "loss": 0.72733957, "num_input_tokens_seen": 111281320, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.89453125, "step": 5183, "time_per_iteration": 2.513044595718384 }, { "auxiliary_loss_clip": 0.01061162, "auxiliary_loss_mlp": 0.01001224, "balance_loss_clip": 0.99876803, "balance_loss_mlp": 1.03207445, "epoch": 0.31167894183075306, "flos": 69510058917120.0, "grad_norm": 0.9366512575011844, "language_loss": 0.59686887, "learning_rate": 3.223834410214408e-06, "loss": 0.61749268, "num_input_tokens_seen": 111341405, "router_z_loss_clip": 0.02453613, "router_z_loss_mlp": 0.29101562, "step": 5184, "time_per_iteration": 3.111398935317993 }, { "auxiliary_loss_clip": 0.01140084, "auxiliary_loss_mlp": 0.01041508, "balance_loss_clip": 1.02623117, "balance_loss_mlp": 1.05352426, "epoch": 0.31173906508342103, "flos": 14939845211520.0, "grad_norm": 2.8006303772917884, "language_loss": 0.70471585, "learning_rate": 3.223526353268311e-06, "loss": 0.72653174, "num_input_tokens_seen": 111358975, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8671875, "step": 5185, "time_per_iteration": 2.459275245666504 }, { "auxiliary_loss_clip": 0.01143466, "auxiliary_loss_mlp": 0.01042314, "balance_loss_clip": 1.02686429, "balance_loss_mlp": 1.05497074, "epoch": 0.311799188336089, "flos": 16176033728640.0, "grad_norm": 2.5356818411517166, "language_loss": 0.63603061, "learning_rate": 3.2232182499259725e-06, "loss": 0.65788841, "num_input_tokens_seen": 111375845, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8828125, "step": 5186, "time_per_iteration": 2.4681968688964844 }, { "auxiliary_loss_clip": 0.01145131, "auxiliary_loss_mlp": 0.01043118, "balance_loss_clip": 1.02548659, "balance_loss_mlp": 1.05403447, "epoch": 0.31185931158875696, "flos": 25009627708800.0, "grad_norm": 2.4089285713973227, "language_loss": 0.86748803, "learning_rate": 3.2229101001990747e-06, "loss": 0.8893705, "num_input_tokens_seen": 111394150, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.91015625, "step": 5187, "time_per_iteration": 2.5142605304718018 }, { "auxiliary_loss_clip": 0.01141657, "auxiliary_loss_mlp": 0.01046571, "balance_loss_clip": 1.03011978, "balance_loss_mlp": 1.05372822, "epoch": 0.3119194348414249, "flos": 37232901273600.0, "grad_norm": 1.6980812614047704, "language_loss": 0.63609785, "learning_rate": 3.2226019040993036e-06, "loss": 0.65798008, "num_input_tokens_seen": 111418355, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.87890625, "step": 5188, "time_per_iteration": 2.6217129230499268 }, { "auxiliary_loss_clip": 0.01144188, "auxiliary_loss_mlp": 0.01043345, "balance_loss_clip": 1.02683401, "balance_loss_mlp": 1.05712557, "epoch": 0.3119795580940929, "flos": 15012779777280.0, "grad_norm": 4.365042989736555, "language_loss": 0.82944435, "learning_rate": 3.222293661638346e-06, "loss": 0.85131967, "num_input_tokens_seen": 111435445, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.87109375, "step": 5189, "time_per_iteration": 2.4756414890289307 }, { "auxiliary_loss_clip": 0.01137825, "auxiliary_loss_mlp": 0.01031654, "balance_loss_clip": 1.01620424, "balance_loss_mlp": 1.05204463, "epoch": 0.31203968134676086, "flos": 15998168557440.0, "grad_norm": 1.7443259535196256, "language_loss": 0.79000509, "learning_rate": 3.22198537282789e-06, "loss": 0.81169987, "num_input_tokens_seen": 111453430, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.859375, "step": 5190, "time_per_iteration": 2.4946374893188477 }, { "auxiliary_loss_clip": 0.01141323, "auxiliary_loss_mlp": 0.0103996, "balance_loss_clip": 1.02416503, "balance_loss_mlp": 1.05444312, "epoch": 0.3120998045994288, "flos": 23837359443840.0, "grad_norm": 1.5005427966861427, "language_loss": 0.7508896, "learning_rate": 3.2216770376796262e-06, "loss": 0.7727024, "num_input_tokens_seen": 111475325, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8671875, "step": 5191, "time_per_iteration": 2.5105905532836914 }, { "auxiliary_loss_clip": 0.01059205, "auxiliary_loss_mlp": 0.01003027, "balance_loss_clip": 1.00052357, "balance_loss_mlp": 1.03060484, "epoch": 0.3121599278520968, "flos": 69184205712000.0, "grad_norm": 0.844701729071676, "language_loss": 0.63933545, "learning_rate": 3.221368656205247e-06, "loss": 0.65995777, "num_input_tokens_seen": 111533960, "router_z_loss_clip": 0.02502441, "router_z_loss_mlp": 0.28515625, "step": 5192, "time_per_iteration": 3.1857826709747314 }, { "auxiliary_loss_clip": 0.01138152, "auxiliary_loss_mlp": 0.01038084, "balance_loss_clip": 1.02108443, "balance_loss_mlp": 1.04971755, "epoch": 0.31222005110476475, "flos": 23806368984960.0, "grad_norm": 2.0987578318153486, "language_loss": 0.79765069, "learning_rate": 3.221060228416446e-06, "loss": 0.81941307, "num_input_tokens_seen": 111554055, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.88671875, "step": 5193, "time_per_iteration": 2.5170047283172607 }, { "auxiliary_loss_clip": 0.01138595, "auxiliary_loss_mlp": 0.01039801, "balance_loss_clip": 1.02294469, "balance_loss_mlp": 1.05015397, "epoch": 0.3122801743574327, "flos": 25226132935680.0, "grad_norm": 2.0147623005229454, "language_loss": 0.72160137, "learning_rate": 3.2207517543249183e-06, "loss": 0.74338531, "num_input_tokens_seen": 111574305, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8828125, "step": 5194, "time_per_iteration": 2.4908854961395264 }, { "auxiliary_loss_clip": 0.01136742, "auxiliary_loss_mlp": 0.01038096, "balance_loss_clip": 1.0230875, "balance_loss_mlp": 1.05135512, "epoch": 0.3123402976101007, "flos": 22966490200320.0, "grad_norm": 1.4614822936430425, "language_loss": 0.76489282, "learning_rate": 3.2204432339423616e-06, "loss": 0.78664124, "num_input_tokens_seen": 111595680, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.85546875, "step": 5195, "time_per_iteration": 2.4996917247772217 }, { "auxiliary_loss_clip": 0.01137781, "auxiliary_loss_mlp": 0.0104158, "balance_loss_clip": 1.02602363, "balance_loss_mlp": 1.04942346, "epoch": 0.3124004208627687, "flos": 25192089820800.0, "grad_norm": 13.117825239545592, "language_loss": 0.7793349, "learning_rate": 3.220134667280476e-06, "loss": 0.80112851, "num_input_tokens_seen": 111618135, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8828125, "step": 5196, "time_per_iteration": 2.5107030868530273 }, { "auxiliary_loss_clip": 0.0105688, "auxiliary_loss_mlp": 0.0100555, "balance_loss_clip": 1.00302267, "balance_loss_mlp": 1.02774072, "epoch": 0.31246054411543667, "flos": 67485165517440.0, "grad_norm": 0.7818402314236688, "language_loss": 0.5477035, "learning_rate": 3.2198260543509613e-06, "loss": 0.56832778, "num_input_tokens_seen": 111682220, "router_z_loss_clip": 0.02526855, "router_z_loss_mlp": 0.29101562, "step": 5197, "time_per_iteration": 3.1253581047058105 }, { "auxiliary_loss_clip": 0.01136363, "auxiliary_loss_mlp": 0.01033879, "balance_loss_clip": 1.01857305, "balance_loss_mlp": 1.05152416, "epoch": 0.31252066736810463, "flos": 17858520731520.0, "grad_norm": 1.9352695900903691, "language_loss": 0.66739219, "learning_rate": 3.21951739516552e-06, "loss": 0.68909466, "num_input_tokens_seen": 111700815, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.84765625, "step": 5198, "time_per_iteration": 2.4402477741241455 }, { "auxiliary_loss_clip": 0.01140154, "auxiliary_loss_mlp": 0.01036775, "balance_loss_clip": 1.02037227, "balance_loss_mlp": 1.05119586, "epoch": 0.3125807906207726, "flos": 18475034791680.0, "grad_norm": 2.1241681387461537, "language_loss": 0.69120288, "learning_rate": 3.219208689735857e-06, "loss": 0.71297222, "num_input_tokens_seen": 111718195, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.890625, "step": 5199, "time_per_iteration": 2.4781272411346436 }, { "auxiliary_loss_clip": 0.01136682, "auxiliary_loss_mlp": 0.01040988, "balance_loss_clip": 1.02478731, "balance_loss_mlp": 1.05088222, "epoch": 0.31264091387344056, "flos": 18946541646720.0, "grad_norm": 2.067107546015995, "language_loss": 0.79071563, "learning_rate": 3.2188999380736785e-06, "loss": 0.81249231, "num_input_tokens_seen": 111734440, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.859375, "step": 5200, "time_per_iteration": 2.4442152976989746 }, { "auxiliary_loss_clip": 0.01135098, "auxiliary_loss_mlp": 0.01033561, "balance_loss_clip": 1.0182066, "balance_loss_mlp": 1.05158162, "epoch": 0.3127010371261085, "flos": 21468512384640.0, "grad_norm": 2.1140340905772654, "language_loss": 0.83922994, "learning_rate": 3.2185911401906917e-06, "loss": 0.8609165, "num_input_tokens_seen": 111751960, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8359375, "step": 5201, "time_per_iteration": 2.4785757064819336 }, { "auxiliary_loss_clip": 0.01137941, "auxiliary_loss_mlp": 0.01041669, "balance_loss_clip": 1.02515852, "balance_loss_mlp": 1.05102372, "epoch": 0.3127611603787765, "flos": 15336047203200.0, "grad_norm": 1.934847938090054, "language_loss": 0.69269836, "learning_rate": 3.2182822960986072e-06, "loss": 0.71449447, "num_input_tokens_seen": 111769585, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8671875, "step": 5202, "time_per_iteration": 2.4564898014068604 }, { "auxiliary_loss_clip": 0.01138084, "auxiliary_loss_mlp": 0.01036936, "balance_loss_clip": 1.02238095, "balance_loss_mlp": 1.05068135, "epoch": 0.31282128363144446, "flos": 17602980399360.0, "grad_norm": 1.9713170905291109, "language_loss": 0.84053516, "learning_rate": 3.2179734058091358e-06, "loss": 0.86228538, "num_input_tokens_seen": 111787880, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.875, "step": 5203, "time_per_iteration": 2.4567699432373047 }, { "auxiliary_loss_clip": 0.01137982, "auxiliary_loss_mlp": 0.01038557, "balance_loss_clip": 1.02214158, "balance_loss_mlp": 1.05062866, "epoch": 0.3128814068841124, "flos": 26756753235840.0, "grad_norm": 2.100965329907856, "language_loss": 0.60864151, "learning_rate": 3.2176644693339913e-06, "loss": 0.63040692, "num_input_tokens_seen": 111805950, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.875, "step": 5204, "time_per_iteration": 2.509404420852661 }, { "auxiliary_loss_clip": 0.01130845, "auxiliary_loss_mlp": 0.01034787, "balance_loss_clip": 1.02043438, "balance_loss_mlp": 1.0473671, "epoch": 0.3129415301367804, "flos": 22272372806400.0, "grad_norm": 2.1594877020369716, "language_loss": 0.66244328, "learning_rate": 3.217355486684887e-06, "loss": 0.68409956, "num_input_tokens_seen": 111826135, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8359375, "step": 5205, "time_per_iteration": 2.4954745769500732 }, { "auxiliary_loss_clip": 0.01136115, "auxiliary_loss_mlp": 0.01038944, "balance_loss_clip": 1.02245736, "balance_loss_mlp": 1.04954028, "epoch": 0.31300165338944835, "flos": 26464907232000.0, "grad_norm": 1.5615989122020077, "language_loss": 0.7665292, "learning_rate": 3.2170464578735414e-06, "loss": 0.78827977, "num_input_tokens_seen": 111844700, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.86328125, "step": 5206, "time_per_iteration": 2.500288724899292 }, { "auxiliary_loss_clip": 0.01133388, "auxiliary_loss_mlp": 0.01033171, "balance_loss_clip": 1.01764977, "balance_loss_mlp": 1.04818916, "epoch": 0.3130617766421163, "flos": 21944652094080.0, "grad_norm": 2.8799285042908007, "language_loss": 0.83041686, "learning_rate": 3.216737382911672e-06, "loss": 0.85208249, "num_input_tokens_seen": 111861585, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8515625, "step": 5207, "time_per_iteration": 2.496680974960327 }, { "auxiliary_loss_clip": 0.01128471, "auxiliary_loss_mlp": 0.01037285, "balance_loss_clip": 1.02292037, "balance_loss_mlp": 1.04567599, "epoch": 0.3131218998947843, "flos": 23292774368640.0, "grad_norm": 1.7203454186373959, "language_loss": 0.71070802, "learning_rate": 3.216428261810999e-06, "loss": 0.73236561, "num_input_tokens_seen": 111882950, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.828125, "step": 5208, "time_per_iteration": 2.4880142211914062 }, { "auxiliary_loss_clip": 0.01136413, "auxiliary_loss_mlp": 0.01040007, "balance_loss_clip": 1.02434325, "balance_loss_mlp": 1.04986119, "epoch": 0.3131820231474523, "flos": 21139642437120.0, "grad_norm": 1.9455887992704524, "language_loss": 0.75012594, "learning_rate": 3.2161190945832445e-06, "loss": 0.77189016, "num_input_tokens_seen": 111901640, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8671875, "step": 5209, "time_per_iteration": 2.500545024871826 }, { "auxiliary_loss_clip": 0.0113221, "auxiliary_loss_mlp": 0.01039356, "balance_loss_clip": 1.02490771, "balance_loss_mlp": 1.04517436, "epoch": 0.31324214640012027, "flos": 23909863046400.0, "grad_norm": 2.0043015127117507, "language_loss": 0.77253336, "learning_rate": 3.2158098812401325e-06, "loss": 0.79424906, "num_input_tokens_seen": 111919615, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.87109375, "step": 5210, "time_per_iteration": 2.4696006774902344 }, { "auxiliary_loss_clip": 0.01127834, "auxiliary_loss_mlp": 0.01040403, "balance_loss_clip": 1.02538276, "balance_loss_mlp": 1.04588103, "epoch": 0.31330226965278823, "flos": 22236929061120.0, "grad_norm": 4.957812759651179, "language_loss": 0.79283988, "learning_rate": 3.2155006217933874e-06, "loss": 0.81452227, "num_input_tokens_seen": 111938485, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8203125, "step": 5211, "time_per_iteration": 2.5148167610168457 }, { "auxiliary_loss_clip": 0.01130045, "auxiliary_loss_mlp": 0.01034802, "balance_loss_clip": 1.02012098, "balance_loss_mlp": 1.04562497, "epoch": 0.3133623929054562, "flos": 19753993428480.0, "grad_norm": 2.284073050841509, "language_loss": 0.79110777, "learning_rate": 3.2151913162547367e-06, "loss": 0.8127563, "num_input_tokens_seen": 111956425, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.84375, "step": 5212, "time_per_iteration": 2.490856409072876 }, { "auxiliary_loss_clip": 0.01137759, "auxiliary_loss_mlp": 0.01052129, "balance_loss_clip": 1.03572619, "balance_loss_mlp": 1.04887199, "epoch": 0.31342251615812416, "flos": 27162256849920.0, "grad_norm": 2.0600245276977622, "language_loss": 0.71273851, "learning_rate": 3.2148819646359097e-06, "loss": 0.73463738, "num_input_tokens_seen": 111975915, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.88671875, "step": 5213, "time_per_iteration": 2.5542473793029785 }, { "auxiliary_loss_clip": 0.01138783, "auxiliary_loss_mlp": 0.01042974, "balance_loss_clip": 1.02734542, "balance_loss_mlp": 1.05133724, "epoch": 0.31348263941079213, "flos": 20229809915520.0, "grad_norm": 1.8433300157125168, "language_loss": 0.77331758, "learning_rate": 3.2145725669486374e-06, "loss": 0.79513514, "num_input_tokens_seen": 111995055, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.875, "step": 5214, "time_per_iteration": 2.471853017807007 }, { "auxiliary_loss_clip": 0.01129445, "auxiliary_loss_mlp": 0.01036606, "balance_loss_clip": 1.02197921, "balance_loss_mlp": 1.04666328, "epoch": 0.3135427626634601, "flos": 24607643627520.0, "grad_norm": 1.7654017504986848, "language_loss": 0.82519209, "learning_rate": 3.2142631232046517e-06, "loss": 0.84685254, "num_input_tokens_seen": 112015830, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.828125, "step": 5215, "time_per_iteration": 2.5166983604431152 }, { "auxiliary_loss_clip": 0.0113619, "auxiliary_loss_mlp": 0.01035882, "balance_loss_clip": 1.01958609, "balance_loss_mlp": 1.04887497, "epoch": 0.31360288591612806, "flos": 20959873845120.0, "grad_norm": 2.3449693616678764, "language_loss": 0.79385877, "learning_rate": 3.213953633415686e-06, "loss": 0.81557953, "num_input_tokens_seen": 112035065, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.875, "step": 5216, "time_per_iteration": 2.462038516998291 }, { "auxiliary_loss_clip": 0.01136059, "auxiliary_loss_mlp": 0.01047725, "balance_loss_clip": 1.03007054, "balance_loss_mlp": 1.04734814, "epoch": 0.313663009168796, "flos": 26980513009920.0, "grad_norm": 4.466656853914214, "language_loss": 0.68384379, "learning_rate": 3.213644097593477e-06, "loss": 0.70568168, "num_input_tokens_seen": 112058405, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.88671875, "step": 5217, "time_per_iteration": 4.028246641159058 }, { "auxiliary_loss_clip": 0.0113779, "auxiliary_loss_mlp": 0.01035377, "balance_loss_clip": 1.02018952, "balance_loss_mlp": 1.04993296, "epoch": 0.313723132421464, "flos": 18040911016320.0, "grad_norm": 1.7190073499366134, "language_loss": 0.8076055, "learning_rate": 3.2133345157497624e-06, "loss": 0.82933712, "num_input_tokens_seen": 112076420, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.87890625, "step": 5218, "time_per_iteration": 2.4539387226104736 }, { "auxiliary_loss_clip": 0.01133749, "auxiliary_loss_mlp": 0.01041877, "balance_loss_clip": 1.02554512, "balance_loss_mlp": 1.04682922, "epoch": 0.31378325567413196, "flos": 22488913946880.0, "grad_norm": 2.921160808895241, "language_loss": 0.68949062, "learning_rate": 3.2130248878962813e-06, "loss": 0.71124685, "num_input_tokens_seen": 112090775, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8671875, "step": 5219, "time_per_iteration": 2.466773271560669 }, { "auxiliary_loss_clip": 0.01136135, "auxiliary_loss_mlp": 0.01045056, "balance_loss_clip": 1.02989316, "balance_loss_mlp": 1.04919124, "epoch": 0.3138433789267999, "flos": 22419247518720.0, "grad_norm": 3.133238449681667, "language_loss": 0.79954159, "learning_rate": 3.2127152140447747e-06, "loss": 0.8213535, "num_input_tokens_seen": 112110980, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.87109375, "step": 5220, "time_per_iteration": 5.211414098739624 }, { "auxiliary_loss_clip": 0.011357, "auxiliary_loss_mlp": 0.0104105, "balance_loss_clip": 1.02598143, "balance_loss_mlp": 1.04873133, "epoch": 0.3139035021794679, "flos": 13005912026880.0, "grad_norm": 2.0955605372422634, "language_loss": 0.73133445, "learning_rate": 3.212405494206986e-06, "loss": 0.75310194, "num_input_tokens_seen": 112129020, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.87109375, "step": 5221, "time_per_iteration": 2.4635698795318604 }, { "auxiliary_loss_clip": 0.0113352, "auxiliary_loss_mlp": 0.01035425, "balance_loss_clip": 1.01992226, "balance_loss_mlp": 1.04917884, "epoch": 0.31396362543213585, "flos": 16945994689920.0, "grad_norm": 2.150801650531795, "language_loss": 0.82095695, "learning_rate": 3.2120957283946588e-06, "loss": 0.84264642, "num_input_tokens_seen": 112147865, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.84375, "step": 5222, "time_per_iteration": 3.876337766647339 }, { "auxiliary_loss_clip": 0.01138236, "auxiliary_loss_mlp": 0.01044654, "balance_loss_clip": 1.02753568, "balance_loss_mlp": 1.04970264, "epoch": 0.31402374868480387, "flos": 20156731695360.0, "grad_norm": 3.3524948965020878, "language_loss": 0.70709687, "learning_rate": 3.2117859166195407e-06, "loss": 0.72892576, "num_input_tokens_seen": 112166745, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.88671875, "step": 5223, "time_per_iteration": 2.4782962799072266 }, { "auxiliary_loss_clip": 0.01133475, "auxiliary_loss_mlp": 0.01035886, "balance_loss_clip": 1.02111614, "balance_loss_mlp": 1.04819691, "epoch": 0.31408387193747184, "flos": 21251073404160.0, "grad_norm": 1.6331732079035055, "language_loss": 0.80571043, "learning_rate": 3.211476058893379e-06, "loss": 0.82740414, "num_input_tokens_seen": 112185895, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8515625, "step": 5224, "time_per_iteration": 2.495849609375 }, { "auxiliary_loss_clip": 0.01149311, "auxiliary_loss_mlp": 0.01047335, "balance_loss_clip": 1.03069377, "balance_loss_mlp": 1.05590415, "epoch": 0.3141439951901398, "flos": 27484267299840.0, "grad_norm": 2.3448113637689447, "language_loss": 0.5761016, "learning_rate": 3.2111661552279243e-06, "loss": 0.598068, "num_input_tokens_seen": 112204465, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.93359375, "step": 5225, "time_per_iteration": 2.5528810024261475 }, { "auxiliary_loss_clip": 0.01131971, "auxiliary_loss_mlp": 0.01035252, "balance_loss_clip": 1.02029109, "balance_loss_mlp": 1.04866624, "epoch": 0.31420411844280777, "flos": 17852235851520.0, "grad_norm": 3.517082632803011, "language_loss": 0.81803805, "learning_rate": 3.2108562056349273e-06, "loss": 0.8397103, "num_input_tokens_seen": 112221635, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.83203125, "step": 5226, "time_per_iteration": 2.516092538833618 }, { "auxiliary_loss_clip": 0.0113882, "auxiliary_loss_mlp": 0.01046019, "balance_loss_clip": 1.02949667, "balance_loss_mlp": 1.05118489, "epoch": 0.31426424169547573, "flos": 21616967295360.0, "grad_norm": 1.7799139675174251, "language_loss": 0.74322546, "learning_rate": 3.210546210126141e-06, "loss": 0.76507384, "num_input_tokens_seen": 112241240, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.875, "step": 5227, "time_per_iteration": 2.4946579933166504 }, { "auxiliary_loss_clip": 0.01142348, "auxiliary_loss_mlp": 0.01040827, "balance_loss_clip": 1.02454305, "balance_loss_mlp": 1.05386949, "epoch": 0.3143243649481437, "flos": 30920631586560.0, "grad_norm": 1.7512427834770208, "language_loss": 0.67176127, "learning_rate": 3.2102361687133213e-06, "loss": 0.69359303, "num_input_tokens_seen": 112262350, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8828125, "step": 5228, "time_per_iteration": 2.5484232902526855 }, { "auxiliary_loss_clip": 0.01138647, "auxiliary_loss_mlp": 0.01045906, "balance_loss_clip": 1.03069544, "balance_loss_mlp": 1.05072784, "epoch": 0.31438448820081166, "flos": 22821411168000.0, "grad_norm": 1.75830705305047, "language_loss": 0.79433209, "learning_rate": 3.2099260814082254e-06, "loss": 0.81617761, "num_input_tokens_seen": 112283710, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.87890625, "step": 5229, "time_per_iteration": 2.5174105167388916 }, { "auxiliary_loss_clip": 0.01138593, "auxiliary_loss_mlp": 0.01038383, "balance_loss_clip": 1.02220654, "balance_loss_mlp": 1.05284035, "epoch": 0.3144446114534796, "flos": 23292127923840.0, "grad_norm": 2.6807753524277884, "language_loss": 0.69607693, "learning_rate": 3.209615948222611e-06, "loss": 0.71784669, "num_input_tokens_seen": 112304285, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.859375, "step": 5230, "time_per_iteration": 2.5484619140625 }, { "auxiliary_loss_clip": 0.01137614, "auxiliary_loss_mlp": 0.01045677, "balance_loss_clip": 1.02873778, "balance_loss_mlp": 1.04951084, "epoch": 0.3145047347061476, "flos": 31355976424320.0, "grad_norm": 1.7952735817880994, "language_loss": 0.79463035, "learning_rate": 3.209305769168239e-06, "loss": 0.81646323, "num_input_tokens_seen": 112325110, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8828125, "step": 5231, "time_per_iteration": 2.580134630203247 }, { "auxiliary_loss_clip": 0.01134818, "auxiliary_loss_mlp": 0.01042638, "balance_loss_clip": 1.02604437, "balance_loss_mlp": 1.05047727, "epoch": 0.31456485795881556, "flos": 10889552643840.0, "grad_norm": 2.233579904140648, "language_loss": 0.84785515, "learning_rate": 3.2089955442568704e-06, "loss": 0.86962974, "num_input_tokens_seen": 112339855, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.84375, "step": 5232, "time_per_iteration": 2.4279868602752686 }, { "auxiliary_loss_clip": 0.01135704, "auxiliary_loss_mlp": 0.01053306, "balance_loss_clip": 1.03661704, "balance_loss_mlp": 1.05055869, "epoch": 0.3146249812114835, "flos": 17092438439040.0, "grad_norm": 1.6952152200994488, "language_loss": 0.80155909, "learning_rate": 3.2086852735002692e-06, "loss": 0.82344919, "num_input_tokens_seen": 112358480, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8515625, "step": 5233, "time_per_iteration": 2.4780852794647217 }, { "auxiliary_loss_clip": 0.01141586, "auxiliary_loss_mlp": 0.0104339, "balance_loss_clip": 1.02739191, "balance_loss_mlp": 1.05320811, "epoch": 0.3146851044641515, "flos": 55291442889600.0, "grad_norm": 1.9397131268715337, "language_loss": 0.70785141, "learning_rate": 3.2083749569102024e-06, "loss": 0.72970122, "num_input_tokens_seen": 112382350, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8828125, "step": 5234, "time_per_iteration": 2.775376319885254 }, { "auxiliary_loss_clip": 0.01140082, "auxiliary_loss_mlp": 0.01035799, "balance_loss_clip": 1.01906192, "balance_loss_mlp": 1.05201054, "epoch": 0.31474522771681945, "flos": 27015884928000.0, "grad_norm": 2.0439533541585377, "language_loss": 0.72286052, "learning_rate": 3.2080645944984356e-06, "loss": 0.74461937, "num_input_tokens_seen": 112400260, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8828125, "step": 5235, "time_per_iteration": 2.513296365737915 }, { "auxiliary_loss_clip": 0.01137963, "auxiliary_loss_mlp": 0.01038217, "balance_loss_clip": 1.02242208, "balance_loss_mlp": 1.05095816, "epoch": 0.3148053509694875, "flos": 21251935330560.0, "grad_norm": 3.1245812542899842, "language_loss": 0.7891798, "learning_rate": 3.2077541862767384e-06, "loss": 0.81094158, "num_input_tokens_seen": 112419400, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.87109375, "step": 5236, "time_per_iteration": 2.4750311374664307 }, { "auxiliary_loss_clip": 0.01142045, "auxiliary_loss_mlp": 0.0104118, "balance_loss_clip": 1.02364445, "balance_loss_mlp": 1.05155373, "epoch": 0.31486547422215544, "flos": 31248675521280.0, "grad_norm": 1.8597950284555813, "language_loss": 0.75929016, "learning_rate": 3.207443732256881e-06, "loss": 0.78112245, "num_input_tokens_seen": 112440825, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.90625, "step": 5237, "time_per_iteration": 2.554612398147583 }, { "auxiliary_loss_clip": 0.01136171, "auxiliary_loss_mlp": 0.01037935, "balance_loss_clip": 1.02290845, "balance_loss_mlp": 1.05231178, "epoch": 0.3149255974748234, "flos": 19828615933440.0, "grad_norm": 4.631528407469005, "language_loss": 0.79456681, "learning_rate": 3.2071332324506372e-06, "loss": 0.8163079, "num_input_tokens_seen": 112459180, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.83984375, "step": 5238, "time_per_iteration": 2.4899351596832275 }, { "auxiliary_loss_clip": 0.01066384, "auxiliary_loss_mlp": 0.01002845, "balance_loss_clip": 1.00007904, "balance_loss_mlp": 1.03841305, "epoch": 0.31498572072749137, "flos": 67683965339520.0, "grad_norm": 1.2907238184361036, "language_loss": 0.67898136, "learning_rate": 3.2068226868697795e-06, "loss": 0.69967359, "num_input_tokens_seen": 112516680, "router_z_loss_clip": 0.02770996, "router_z_loss_mlp": 0.27929688, "step": 5239, "time_per_iteration": 3.0880510807037354 }, { "auxiliary_loss_clip": 0.01145618, "auxiliary_loss_mlp": 0.01052544, "balance_loss_clip": 1.03368545, "balance_loss_mlp": 1.05411756, "epoch": 0.31504584398015933, "flos": 19793136274560.0, "grad_norm": 2.137540688984833, "language_loss": 0.8285836, "learning_rate": 3.2065120955260846e-06, "loss": 0.85056525, "num_input_tokens_seen": 112535895, "router_z_loss_clip": 0.18945312, "router_z_loss_mlp": 0.9140625, "step": 5240, "time_per_iteration": 2.460073947906494 }, { "auxiliary_loss_clip": 0.01139713, "auxiliary_loss_mlp": 0.01040179, "balance_loss_clip": 1.02426422, "balance_loss_mlp": 1.05351532, "epoch": 0.3151059672328273, "flos": 26615409217920.0, "grad_norm": 2.024627015660466, "language_loss": 0.80976653, "learning_rate": 3.2062014584313302e-06, "loss": 0.83156544, "num_input_tokens_seen": 112557490, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.86328125, "step": 5241, "time_per_iteration": 2.5250403881073 }, { "auxiliary_loss_clip": 0.01139967, "auxiliary_loss_mlp": 0.01039505, "balance_loss_clip": 1.02301884, "balance_loss_mlp": 1.05451167, "epoch": 0.31516609048549526, "flos": 24204438483840.0, "grad_norm": 2.068787684744515, "language_loss": 0.74072045, "learning_rate": 3.2058907755972956e-06, "loss": 0.76251519, "num_input_tokens_seen": 112577075, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8515625, "step": 5242, "time_per_iteration": 2.5258290767669678 }, { "auxiliary_loss_clip": 0.01140551, "auxiliary_loss_mlp": 0.01040227, "balance_loss_clip": 1.02253592, "balance_loss_mlp": 1.05401146, "epoch": 0.31522621373816323, "flos": 25958710817280.0, "grad_norm": 2.6842228809560007, "language_loss": 0.7374115, "learning_rate": 3.2055800470357626e-06, "loss": 0.75921929, "num_input_tokens_seen": 112597620, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.86328125, "step": 5243, "time_per_iteration": 2.5450470447540283 }, { "auxiliary_loss_clip": 0.0113891, "auxiliary_loss_mlp": 0.0104281, "balance_loss_clip": 1.02548862, "balance_loss_mlp": 1.05199242, "epoch": 0.3152863369908312, "flos": 21908813299200.0, "grad_norm": 2.291728579989118, "language_loss": 0.64612818, "learning_rate": 3.205269272758513e-06, "loss": 0.66794527, "num_input_tokens_seen": 112617150, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8671875, "step": 5244, "time_per_iteration": 2.508605480194092 }, { "auxiliary_loss_clip": 0.01142937, "auxiliary_loss_mlp": 0.01044238, "balance_loss_clip": 1.02804899, "balance_loss_mlp": 1.05393505, "epoch": 0.31534646024349916, "flos": 16281072074880.0, "grad_norm": 2.6922835817301074, "language_loss": 0.91347909, "learning_rate": 3.2049584527773313e-06, "loss": 0.93535084, "num_input_tokens_seen": 112631090, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.890625, "step": 5245, "time_per_iteration": 2.4343836307525635 }, { "auxiliary_loss_clip": 0.01141214, "auxiliary_loss_mlp": 0.01051958, "balance_loss_clip": 1.03495872, "balance_loss_mlp": 1.05253959, "epoch": 0.3154065834961671, "flos": 24717243000960.0, "grad_norm": 2.099218226137889, "language_loss": 0.74922669, "learning_rate": 3.2046475871040048e-06, "loss": 0.77115846, "num_input_tokens_seen": 112651220, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.88671875, "step": 5246, "time_per_iteration": 2.5104198455810547 }, { "auxiliary_loss_clip": 0.01140135, "auxiliary_loss_mlp": 0.0104552, "balance_loss_clip": 1.02828228, "balance_loss_mlp": 1.05158544, "epoch": 0.3154667067488351, "flos": 35371148469120.0, "grad_norm": 2.375913060851549, "language_loss": 0.61327785, "learning_rate": 3.204336675750321e-06, "loss": 0.63513446, "num_input_tokens_seen": 112671560, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.88671875, "step": 5247, "time_per_iteration": 2.604630708694458 }, { "auxiliary_loss_clip": 0.0114085, "auxiliary_loss_mlp": 0.01046187, "balance_loss_clip": 1.02977157, "balance_loss_mlp": 1.05224276, "epoch": 0.31552683000150306, "flos": 17456464823040.0, "grad_norm": 3.61515025582235, "language_loss": 0.8260076, "learning_rate": 3.2040257187280693e-06, "loss": 0.84787792, "num_input_tokens_seen": 112689790, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.88671875, "step": 5248, "time_per_iteration": 2.436790704727173 }, { "auxiliary_loss_clip": 0.0113905, "auxiliary_loss_mlp": 0.0105248, "balance_loss_clip": 1.03469408, "balance_loss_mlp": 1.05192077, "epoch": 0.3155869532541711, "flos": 18405763413120.0, "grad_norm": 1.9243107038945846, "language_loss": 0.84783351, "learning_rate": 3.2037147160490423e-06, "loss": 0.86974883, "num_input_tokens_seen": 112708265, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.87109375, "step": 5249, "time_per_iteration": 2.4974887371063232 }, { "auxiliary_loss_clip": 0.01140289, "auxiliary_loss_mlp": 0.01038893, "balance_loss_clip": 1.02144098, "balance_loss_mlp": 1.05197144, "epoch": 0.31564707650683904, "flos": 21579763783680.0, "grad_norm": 1.8531193493070157, "language_loss": 0.85267258, "learning_rate": 3.2034036677250322e-06, "loss": 0.87446445, "num_input_tokens_seen": 112727820, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8828125, "step": 5250, "time_per_iteration": 2.4952328205108643 }, { "auxiliary_loss_clip": 0.01137434, "auxiliary_loss_mlp": 0.01042856, "balance_loss_clip": 1.02527285, "balance_loss_mlp": 1.05015707, "epoch": 0.315707199759507, "flos": 21030976817280.0, "grad_norm": 2.026197996758638, "language_loss": 0.68144256, "learning_rate": 3.203092573767835e-06, "loss": 0.70324552, "num_input_tokens_seen": 112743140, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.875, "step": 5251, "time_per_iteration": 2.466670274734497 }, { "auxiliary_loss_clip": 0.01139065, "auxiliary_loss_mlp": 0.01041699, "balance_loss_clip": 1.02489018, "balance_loss_mlp": 1.05275846, "epoch": 0.31576732301217497, "flos": 26828861788800.0, "grad_norm": 1.944997623916263, "language_loss": 0.78792763, "learning_rate": 3.202781434189246e-06, "loss": 0.80973524, "num_input_tokens_seen": 112764705, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.86328125, "step": 5252, "time_per_iteration": 2.503708600997925 }, { "auxiliary_loss_clip": 0.01138487, "auxiliary_loss_mlp": 0.01042596, "balance_loss_clip": 1.02609766, "balance_loss_mlp": 1.05261755, "epoch": 0.31582744626484294, "flos": 22711165349760.0, "grad_norm": 1.7595318421745174, "language_loss": 0.7391997, "learning_rate": 3.202470249001066e-06, "loss": 0.76101053, "num_input_tokens_seen": 112785310, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.859375, "step": 5253, "time_per_iteration": 2.5128588676452637 }, { "auxiliary_loss_clip": 0.0113812, "auxiliary_loss_mlp": 0.01040573, "balance_loss_clip": 1.02343059, "balance_loss_mlp": 1.05051661, "epoch": 0.3158875695175109, "flos": 23951914894080.0, "grad_norm": 2.071972108670709, "language_loss": 0.73395908, "learning_rate": 3.2021590182150924e-06, "loss": 0.75574601, "num_input_tokens_seen": 112802905, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.875, "step": 5254, "time_per_iteration": 2.493957996368408 }, { "auxiliary_loss_clip": 0.01138605, "auxiliary_loss_mlp": 0.01037405, "balance_loss_clip": 1.02091217, "balance_loss_mlp": 1.05060112, "epoch": 0.31594769277017887, "flos": 13261883322240.0, "grad_norm": 1.8599987022966258, "language_loss": 0.77828622, "learning_rate": 3.201847741843128e-06, "loss": 0.80004638, "num_input_tokens_seen": 112820305, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8828125, "step": 5255, "time_per_iteration": 2.476553201675415 }, { "auxiliary_loss_clip": 0.01136774, "auxiliary_loss_mlp": 0.01038672, "balance_loss_clip": 1.02007508, "balance_loss_mlp": 1.05117297, "epoch": 0.31600781602284683, "flos": 23368258800000.0, "grad_norm": 2.4821361368765253, "language_loss": 0.77911603, "learning_rate": 3.2015364198969772e-06, "loss": 0.80087054, "num_input_tokens_seen": 112841185, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.85546875, "step": 5256, "time_per_iteration": 2.4965579509735107 }, { "auxiliary_loss_clip": 0.0113321, "auxiliary_loss_mlp": 0.0103956, "balance_loss_clip": 1.02424192, "balance_loss_mlp": 1.05198753, "epoch": 0.3160679392755148, "flos": 19828580019840.0, "grad_norm": 1.6573135471335259, "language_loss": 0.71334326, "learning_rate": 3.2012250523884453e-06, "loss": 0.73507094, "num_input_tokens_seen": 112860570, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8125, "step": 5257, "time_per_iteration": 2.4837565422058105 }, { "auxiliary_loss_clip": 0.01137618, "auxiliary_loss_mlp": 0.01040176, "balance_loss_clip": 1.02296185, "balance_loss_mlp": 1.05148375, "epoch": 0.31612806252818276, "flos": 20193216935040.0, "grad_norm": 2.7380954367977135, "language_loss": 0.76269019, "learning_rate": 3.2009136393293393e-06, "loss": 0.78446817, "num_input_tokens_seen": 112877975, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.859375, "step": 5258, "time_per_iteration": 2.451819896697998 }, { "auxiliary_loss_clip": 0.01138708, "auxiliary_loss_mlp": 0.01047339, "balance_loss_clip": 1.03004193, "balance_loss_mlp": 1.05196297, "epoch": 0.31618818578085073, "flos": 24235967646720.0, "grad_norm": 2.1478328097052297, "language_loss": 0.72605336, "learning_rate": 3.200602180731467e-06, "loss": 0.74791384, "num_input_tokens_seen": 112896170, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.8671875, "step": 5259, "time_per_iteration": 3.990356922149658 }, { "auxiliary_loss_clip": 0.01145009, "auxiliary_loss_mlp": 0.0105221, "balance_loss_clip": 1.03610444, "balance_loss_mlp": 1.05540156, "epoch": 0.3162483090335187, "flos": 25081844002560.0, "grad_norm": 2.1205822780446573, "language_loss": 0.66268003, "learning_rate": 3.20029067660664e-06, "loss": 0.68465227, "num_input_tokens_seen": 112916180, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.89453125, "step": 5260, "time_per_iteration": 2.5439345836639404 }, { "auxiliary_loss_clip": 0.01137454, "auxiliary_loss_mlp": 0.01032388, "balance_loss_clip": 1.01678395, "balance_loss_mlp": 1.05009353, "epoch": 0.31630843228618666, "flos": 26323383646080.0, "grad_norm": 1.8252313202409027, "language_loss": 0.72255242, "learning_rate": 3.1999791269666706e-06, "loss": 0.74425089, "num_input_tokens_seen": 112936745, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.875, "step": 5261, "time_per_iteration": 2.505249500274658 }, { "auxiliary_loss_clip": 0.01067299, "auxiliary_loss_mlp": 0.01004704, "balance_loss_clip": 1.00232017, "balance_loss_mlp": 1.03839147, "epoch": 0.3163685555388547, "flos": 66758441552640.0, "grad_norm": 0.7495947627842271, "language_loss": 0.50687504, "learning_rate": 3.1996675318233716e-06, "loss": 0.52759504, "num_input_tokens_seen": 112994845, "router_z_loss_clip": 0.02380371, "router_z_loss_mlp": 0.2890625, "step": 5262, "time_per_iteration": 5.839004755020142 }, { "auxiliary_loss_clip": 0.01141911, "auxiliary_loss_mlp": 0.0103919, "balance_loss_clip": 1.02344263, "balance_loss_mlp": 1.05465317, "epoch": 0.31642867879152264, "flos": 25995662933760.0, "grad_norm": 1.5369898001411713, "language_loss": 0.85187781, "learning_rate": 3.19935589118856e-06, "loss": 0.87368882, "num_input_tokens_seen": 113015125, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.87109375, "step": 5263, "time_per_iteration": 2.5375454425811768 }, { "auxiliary_loss_clip": 0.01132282, "auxiliary_loss_mlp": 0.01041244, "balance_loss_clip": 1.02640283, "balance_loss_mlp": 1.05005574, "epoch": 0.3164888020441906, "flos": 25774955815680.0, "grad_norm": 1.5383816343963193, "language_loss": 0.81803811, "learning_rate": 3.1990442050740535e-06, "loss": 0.83977336, "num_input_tokens_seen": 113035535, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 5264, "time_per_iteration": 3.9777259826660156 }, { "auxiliary_loss_clip": 0.01140557, "auxiliary_loss_mlp": 0.01038018, "balance_loss_clip": 1.02075613, "balance_loss_mlp": 1.05280662, "epoch": 0.3165489252968586, "flos": 19756220071680.0, "grad_norm": 2.282244281944786, "language_loss": 0.79647326, "learning_rate": 3.19873247349167e-06, "loss": 0.818259, "num_input_tokens_seen": 113052720, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.87890625, "step": 5265, "time_per_iteration": 2.4722657203674316 }, { "auxiliary_loss_clip": 0.01139871, "auxiliary_loss_mlp": 0.01042936, "balance_loss_clip": 1.0259012, "balance_loss_mlp": 1.05187488, "epoch": 0.31660904854952654, "flos": 23183929180800.0, "grad_norm": 4.68712014787587, "language_loss": 0.74861836, "learning_rate": 3.1984206964532307e-06, "loss": 0.77044642, "num_input_tokens_seen": 113071435, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8828125, "step": 5266, "time_per_iteration": 2.5166828632354736 }, { "auxiliary_loss_clip": 0.01137461, "auxiliary_loss_mlp": 0.01042, "balance_loss_clip": 1.02550173, "balance_loss_mlp": 1.05013955, "epoch": 0.3166691718021945, "flos": 20408501099520.0, "grad_norm": 2.3837277083724437, "language_loss": 0.79526508, "learning_rate": 3.1981088739705585e-06, "loss": 0.8170597, "num_input_tokens_seen": 113088645, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.875, "step": 5267, "time_per_iteration": 2.495356798171997 }, { "auxiliary_loss_clip": 0.01064342, "auxiliary_loss_mlp": 0.01002566, "balance_loss_clip": 1.00021768, "balance_loss_mlp": 1.03601599, "epoch": 0.31672929505486247, "flos": 70144781172480.0, "grad_norm": 1.096799502525564, "language_loss": 0.57831454, "learning_rate": 3.197797006055478e-06, "loss": 0.59898365, "num_input_tokens_seen": 113152775, "router_z_loss_clip": 0.0234375, "router_z_loss_mlp": 0.28320312, "step": 5268, "time_per_iteration": 3.142151117324829 }, { "auxiliary_loss_clip": 0.01138527, "auxiliary_loss_mlp": 0.01041067, "balance_loss_clip": 1.02424693, "balance_loss_mlp": 1.05071282, "epoch": 0.31678941830753043, "flos": 14355758154240.0, "grad_norm": 2.148742773775786, "language_loss": 0.72687787, "learning_rate": 3.197485092719815e-06, "loss": 0.7486738, "num_input_tokens_seen": 113171410, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.87890625, "step": 5269, "time_per_iteration": 2.4549784660339355 }, { "auxiliary_loss_clip": 0.01134955, "auxiliary_loss_mlp": 0.01043686, "balance_loss_clip": 1.02767658, "balance_loss_mlp": 1.04966998, "epoch": 0.3168495415601984, "flos": 22747722416640.0, "grad_norm": 1.8612153962447575, "language_loss": 0.7997191, "learning_rate": 3.1971731339753973e-06, "loss": 0.82150555, "num_input_tokens_seen": 113189965, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8515625, "step": 5270, "time_per_iteration": 2.5045406818389893 }, { "auxiliary_loss_clip": 0.01140732, "auxiliary_loss_mlp": 0.01050967, "balance_loss_clip": 1.03299069, "balance_loss_mlp": 1.05069757, "epoch": 0.31690966481286637, "flos": 20115254465280.0, "grad_norm": 1.9767818005438487, "language_loss": 0.79223382, "learning_rate": 3.1968611298340545e-06, "loss": 0.81415087, "num_input_tokens_seen": 113206355, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.90234375, "step": 5271, "time_per_iteration": 2.4396393299102783 }, { "auxiliary_loss_clip": 0.01138814, "auxiliary_loss_mlp": 0.01040559, "balance_loss_clip": 1.02381039, "balance_loss_mlp": 1.05168188, "epoch": 0.31696978806553433, "flos": 21178928937600.0, "grad_norm": 2.060792550582616, "language_loss": 0.73010361, "learning_rate": 3.1965490803076173e-06, "loss": 0.75189739, "num_input_tokens_seen": 113225440, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.87109375, "step": 5272, "time_per_iteration": 2.4807324409484863 }, { "auxiliary_loss_clip": 0.01138787, "auxiliary_loss_mlp": 0.01040037, "balance_loss_clip": 1.02225137, "balance_loss_mlp": 1.04936981, "epoch": 0.3170299113182023, "flos": 42997030439040.0, "grad_norm": 2.9917331238405223, "language_loss": 0.69136584, "learning_rate": 3.1962369854079194e-06, "loss": 0.71315414, "num_input_tokens_seen": 113248840, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.89453125, "step": 5273, "time_per_iteration": 2.6585853099823 }, { "auxiliary_loss_clip": 0.0113583, "auxiliary_loss_mlp": 0.01039363, "balance_loss_clip": 1.02280486, "balance_loss_mlp": 1.05006838, "epoch": 0.31709003457087026, "flos": 24460158384000.0, "grad_norm": 1.6007111844845185, "language_loss": 0.67770177, "learning_rate": 3.195924845146795e-06, "loss": 0.69945371, "num_input_tokens_seen": 113269630, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.859375, "step": 5274, "time_per_iteration": 2.498748540878296 }, { "auxiliary_loss_clip": 0.01130978, "auxiliary_loss_mlp": 0.01043819, "balance_loss_clip": 1.02852416, "balance_loss_mlp": 1.04874825, "epoch": 0.3171501578235382, "flos": 24135310759680.0, "grad_norm": 1.6058784021395964, "language_loss": 0.80682176, "learning_rate": 3.195612659536081e-06, "loss": 0.82856971, "num_input_tokens_seen": 113291200, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.82421875, "step": 5275, "time_per_iteration": 2.5008206367492676 }, { "auxiliary_loss_clip": 0.01136253, "auxiliary_loss_mlp": 0.01045243, "balance_loss_clip": 1.02882791, "balance_loss_mlp": 1.04832554, "epoch": 0.31721028107620625, "flos": 18879712392960.0, "grad_norm": 1.9018117187581285, "language_loss": 0.72387463, "learning_rate": 3.1953004285876147e-06, "loss": 0.74568963, "num_input_tokens_seen": 113310170, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.87890625, "step": 5276, "time_per_iteration": 2.454965353012085 }, { "auxiliary_loss_clip": 0.01130742, "auxiliary_loss_mlp": 0.01039239, "balance_loss_clip": 1.02408791, "balance_loss_mlp": 1.04883182, "epoch": 0.3172704043288742, "flos": 23147874904320.0, "grad_norm": 1.4758014822589929, "language_loss": 0.77828979, "learning_rate": 3.194988152313236e-06, "loss": 0.79998958, "num_input_tokens_seen": 113331140, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8203125, "step": 5277, "time_per_iteration": 2.502580404281616 }, { "auxiliary_loss_clip": 0.01137058, "auxiliary_loss_mlp": 0.01038939, "balance_loss_clip": 1.02121282, "balance_loss_mlp": 1.04920721, "epoch": 0.3173305275815422, "flos": 17858520731520.0, "grad_norm": 1.8480407275855562, "language_loss": 0.79015517, "learning_rate": 3.1946758307247878e-06, "loss": 0.81191516, "num_input_tokens_seen": 113350030, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.87890625, "step": 5278, "time_per_iteration": 2.459829330444336 }, { "auxiliary_loss_clip": 0.01065893, "auxiliary_loss_mlp": 0.01010848, "balance_loss_clip": 1.00853527, "balance_loss_mlp": 1.03755414, "epoch": 0.31739065083421014, "flos": 59973476883840.0, "grad_norm": 0.8761023443268281, "language_loss": 0.62810922, "learning_rate": 3.1943634638341114e-06, "loss": 0.64887667, "num_input_tokens_seen": 113395820, "router_z_loss_clip": 0.02307129, "router_z_loss_mlp": 0.28320312, "step": 5279, "time_per_iteration": 2.8576056957244873 }, { "auxiliary_loss_clip": 0.01140772, "auxiliary_loss_mlp": 0.01046533, "balance_loss_clip": 1.0286634, "balance_loss_mlp": 1.05032802, "epoch": 0.3174507740868781, "flos": 23800981944960.0, "grad_norm": 1.5128826437867828, "language_loss": 0.81146127, "learning_rate": 3.194051051653053e-06, "loss": 0.83333433, "num_input_tokens_seen": 113416835, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.90625, "step": 5280, "time_per_iteration": 2.499309778213501 }, { "auxiliary_loss_clip": 0.0113761, "auxiliary_loss_mlp": 0.01045768, "balance_loss_clip": 1.0296154, "balance_loss_mlp": 1.05243731, "epoch": 0.31751089733954607, "flos": 27638899349760.0, "grad_norm": 2.1464689049021484, "language_loss": 0.78089488, "learning_rate": 3.19373859419346e-06, "loss": 0.80272865, "num_input_tokens_seen": 113440850, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8515625, "step": 5281, "time_per_iteration": 2.6400198936462402 }, { "auxiliary_loss_clip": 0.01136457, "auxiliary_loss_mlp": 0.01044973, "balance_loss_clip": 1.0275569, "balance_loss_mlp": 1.05098736, "epoch": 0.31757102059221404, "flos": 23769273214080.0, "grad_norm": 1.6175207709484967, "language_loss": 0.78470337, "learning_rate": 3.193426091467179e-06, "loss": 0.80651766, "num_input_tokens_seen": 113461000, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.85546875, "step": 5282, "time_per_iteration": 2.555341958999634 }, { "auxiliary_loss_clip": 0.01141329, "auxiliary_loss_mlp": 0.01047374, "balance_loss_clip": 1.0309943, "balance_loss_mlp": 1.05233693, "epoch": 0.317631143844882, "flos": 25264521596160.0, "grad_norm": 2.015132482141598, "language_loss": 0.67007488, "learning_rate": 3.193113543486061e-06, "loss": 0.69196188, "num_input_tokens_seen": 113480820, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.890625, "step": 5283, "time_per_iteration": 2.5182993412017822 }, { "auxiliary_loss_clip": 0.01063125, "auxiliary_loss_mlp": 0.01004129, "balance_loss_clip": 1.0019114, "balance_loss_mlp": 1.0351274, "epoch": 0.31769126709754997, "flos": 55825939221120.0, "grad_norm": 0.8793490448524518, "language_loss": 0.52824861, "learning_rate": 3.192800950261958e-06, "loss": 0.54892111, "num_input_tokens_seen": 113536910, "router_z_loss_clip": 0.0222168, "router_z_loss_mlp": 0.28125, "step": 5284, "time_per_iteration": 3.0572903156280518 }, { "auxiliary_loss_clip": 0.01142923, "auxiliary_loss_mlp": 0.01037613, "balance_loss_clip": 1.02135265, "balance_loss_mlp": 1.05278587, "epoch": 0.31775139035021793, "flos": 16690562098560.0, "grad_norm": 1.785642215758206, "language_loss": 0.70829576, "learning_rate": 3.1924883118067235e-06, "loss": 0.73010117, "num_input_tokens_seen": 113555480, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.90234375, "step": 5285, "time_per_iteration": 2.4690194129943848 }, { "auxiliary_loss_clip": 0.01059905, "auxiliary_loss_mlp": 0.01000368, "balance_loss_clip": 0.99791259, "balance_loss_mlp": 1.03190231, "epoch": 0.3178115136028859, "flos": 64227241019520.0, "grad_norm": 0.82643869415647, "language_loss": 0.60482705, "learning_rate": 3.1921756281322123e-06, "loss": 0.62542975, "num_input_tokens_seen": 113616790, "router_z_loss_clip": 0.02453613, "router_z_loss_mlp": 0.28125, "step": 5286, "time_per_iteration": 3.1256017684936523 }, { "auxiliary_loss_clip": 0.01137808, "auxiliary_loss_mlp": 0.01045162, "balance_loss_clip": 1.02824664, "balance_loss_mlp": 1.0487895, "epoch": 0.31787163685555386, "flos": 18697465762560.0, "grad_norm": 2.0359057330149057, "language_loss": 0.71943986, "learning_rate": 3.1918628992502826e-06, "loss": 0.74126953, "num_input_tokens_seen": 113635320, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.890625, "step": 5287, "time_per_iteration": 2.5354535579681396 }, { "auxiliary_loss_clip": 0.01136245, "auxiliary_loss_mlp": 0.01044074, "balance_loss_clip": 1.02645445, "balance_loss_mlp": 1.04749703, "epoch": 0.31793176010822183, "flos": 21324762155520.0, "grad_norm": 2.742670056976728, "language_loss": 0.75413471, "learning_rate": 3.191550125172792e-06, "loss": 0.77593791, "num_input_tokens_seen": 113654000, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.88671875, "step": 5288, "time_per_iteration": 2.449958562850952 }, { "auxiliary_loss_clip": 0.01127836, "auxiliary_loss_mlp": 0.01032626, "balance_loss_clip": 1.01816583, "balance_loss_mlp": 1.04464555, "epoch": 0.31799188336088985, "flos": 20958688696320.0, "grad_norm": 1.8262229895098876, "language_loss": 0.87766588, "learning_rate": 3.1912373059116007e-06, "loss": 0.89927047, "num_input_tokens_seen": 113672375, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.83203125, "step": 5289, "time_per_iteration": 2.4883546829223633 }, { "auxiliary_loss_clip": 0.01130342, "auxiliary_loss_mlp": 0.01037564, "balance_loss_clip": 1.02278233, "balance_loss_mlp": 1.04785383, "epoch": 0.3180520066135578, "flos": 22491930689280.0, "grad_norm": 1.663698852347326, "language_loss": 0.67558122, "learning_rate": 3.190924441478572e-06, "loss": 0.69726026, "num_input_tokens_seen": 113692385, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.82421875, "step": 5290, "time_per_iteration": 2.4787254333496094 }, { "auxiliary_loss_clip": 0.01135516, "auxiliary_loss_mlp": 0.01037902, "balance_loss_clip": 1.02185655, "balance_loss_mlp": 1.0468297, "epoch": 0.3181121298662258, "flos": 27235335070080.0, "grad_norm": 1.8619670783926525, "language_loss": 0.79737461, "learning_rate": 3.1906115318855687e-06, "loss": 0.81910878, "num_input_tokens_seen": 113712145, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.88671875, "step": 5291, "time_per_iteration": 2.52301025390625 }, { "auxiliary_loss_clip": 0.01134542, "auxiliary_loss_mlp": 0.01035186, "balance_loss_clip": 1.01817453, "balance_loss_mlp": 1.04635048, "epoch": 0.31817225311889374, "flos": 23180158252800.0, "grad_norm": 2.1164025616629867, "language_loss": 0.7961334, "learning_rate": 3.1902985771444577e-06, "loss": 0.81783068, "num_input_tokens_seen": 113731435, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8828125, "step": 5292, "time_per_iteration": 2.469181776046753 }, { "auxiliary_loss_clip": 0.01127144, "auxiliary_loss_mlp": 0.01035043, "balance_loss_clip": 1.02055907, "balance_loss_mlp": 1.0467397, "epoch": 0.3182323763715617, "flos": 23258803080960.0, "grad_norm": 4.279273488214172, "language_loss": 0.74860811, "learning_rate": 3.1899855772671043e-06, "loss": 0.77022994, "num_input_tokens_seen": 113750825, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8046875, "step": 5293, "time_per_iteration": 2.491039514541626 }, { "auxiliary_loss_clip": 0.01130706, "auxiliary_loss_mlp": 0.01039493, "balance_loss_clip": 1.02516389, "balance_loss_mlp": 1.04828548, "epoch": 0.3182924996242297, "flos": 29016683280000.0, "grad_norm": 2.017835810419932, "language_loss": 0.74122959, "learning_rate": 3.189672532265379e-06, "loss": 0.76293159, "num_input_tokens_seen": 113770010, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.82421875, "step": 5294, "time_per_iteration": 2.558957099914551 }, { "auxiliary_loss_clip": 0.01135931, "auxiliary_loss_mlp": 0.01037747, "balance_loss_clip": 1.02030635, "balance_loss_mlp": 1.04875302, "epoch": 0.31835262287689764, "flos": 20449188230400.0, "grad_norm": 1.9862117133872137, "language_loss": 0.75778413, "learning_rate": 3.189359442151152e-06, "loss": 0.77952087, "num_input_tokens_seen": 113788640, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.87109375, "step": 5295, "time_per_iteration": 2.4881138801574707 }, { "auxiliary_loss_clip": 0.01139424, "auxiliary_loss_mlp": 0.01039, "balance_loss_clip": 1.02352703, "balance_loss_mlp": 1.05096829, "epoch": 0.3184127461295656, "flos": 25119478477440.0, "grad_norm": 1.642727208903362, "language_loss": 0.69374478, "learning_rate": 3.189046306936296e-06, "loss": 0.71552908, "num_input_tokens_seen": 113809515, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8828125, "step": 5296, "time_per_iteration": 2.499696731567383 }, { "auxiliary_loss_clip": 0.01132114, "auxiliary_loss_mlp": 0.01039555, "balance_loss_clip": 1.02416539, "balance_loss_mlp": 1.04647875, "epoch": 0.31847286938223357, "flos": 25551231955200.0, "grad_norm": 1.5870257287132896, "language_loss": 0.77863139, "learning_rate": 3.1887331266326846e-06, "loss": 0.80034804, "num_input_tokens_seen": 113829770, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.859375, "step": 5297, "time_per_iteration": 2.508626937866211 }, { "auxiliary_loss_clip": 0.01129588, "auxiliary_loss_mlp": 0.01030657, "balance_loss_clip": 1.0146234, "balance_loss_mlp": 1.04581261, "epoch": 0.31853299263490154, "flos": 27782470010880.0, "grad_norm": 2.015648249626744, "language_loss": 0.79436684, "learning_rate": 3.1884199012521942e-06, "loss": 0.81596935, "num_input_tokens_seen": 113849320, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8359375, "step": 5298, "time_per_iteration": 2.51086163520813 }, { "auxiliary_loss_clip": 0.0113668, "auxiliary_loss_mlp": 0.01038892, "balance_loss_clip": 1.02358532, "balance_loss_mlp": 1.04822052, "epoch": 0.3185931158875695, "flos": 22706747976960.0, "grad_norm": 1.7190798687080864, "language_loss": 0.73787224, "learning_rate": 3.1881066308067016e-06, "loss": 0.75962794, "num_input_tokens_seen": 113867860, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8828125, "step": 5299, "time_per_iteration": 2.490832567214966 }, { "auxiliary_loss_clip": 0.01134777, "auxiliary_loss_mlp": 0.0104603, "balance_loss_clip": 1.02985311, "balance_loss_mlp": 1.04651284, "epoch": 0.31865323914023747, "flos": 24571517523840.0, "grad_norm": 2.16931257043687, "language_loss": 0.78412497, "learning_rate": 3.1877933153080873e-06, "loss": 0.805933, "num_input_tokens_seen": 113886375, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8828125, "step": 5300, "time_per_iteration": 3.9869143962860107 }, { "auxiliary_loss_clip": 0.01132794, "auxiliary_loss_mlp": 0.01039149, "balance_loss_clip": 1.02197099, "balance_loss_mlp": 1.04662895, "epoch": 0.31871336239290543, "flos": 18186564666240.0, "grad_norm": 3.108210602161327, "language_loss": 0.83711976, "learning_rate": 3.1874799547682304e-06, "loss": 0.85883915, "num_input_tokens_seen": 113904065, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.859375, "step": 5301, "time_per_iteration": 2.4317264556884766 }, { "auxiliary_loss_clip": 0.01134686, "auxiliary_loss_mlp": 0.01049163, "balance_loss_clip": 1.03205681, "balance_loss_mlp": 1.05091929, "epoch": 0.31877348564557345, "flos": 21826756679040.0, "grad_norm": 2.2852790225857107, "language_loss": 0.77186078, "learning_rate": 3.187166549199015e-06, "loss": 0.79369926, "num_input_tokens_seen": 113918415, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.8359375, "step": 5302, "time_per_iteration": 2.428788185119629 }, { "auxiliary_loss_clip": 0.0112757, "auxiliary_loss_mlp": 0.01039712, "balance_loss_clip": 1.02360642, "balance_loss_mlp": 1.04626226, "epoch": 0.3188336088982414, "flos": 22015252275840.0, "grad_norm": 1.6731771700977383, "language_loss": 0.79330528, "learning_rate": 3.1868530986123255e-06, "loss": 0.81497806, "num_input_tokens_seen": 113938135, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8125, "step": 5303, "time_per_iteration": 2.4942352771759033 }, { "auxiliary_loss_clip": 0.01140322, "auxiliary_loss_mlp": 0.01040035, "balance_loss_clip": 1.02280986, "balance_loss_mlp": 1.04916751, "epoch": 0.3188937321509094, "flos": 20047886507520.0, "grad_norm": 2.2336642411998056, "language_loss": 0.72713763, "learning_rate": 3.186539603020047e-06, "loss": 0.74894118, "num_input_tokens_seen": 113957125, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.9140625, "step": 5304, "time_per_iteration": 3.9907352924346924 }, { "auxiliary_loss_clip": 0.01129695, "auxiliary_loss_mlp": 0.01040005, "balance_loss_clip": 1.02521157, "balance_loss_mlp": 1.04791784, "epoch": 0.31895385540357735, "flos": 25848105863040.0, "grad_norm": 1.9030816273879398, "language_loss": 0.71875125, "learning_rate": 3.186226062434068e-06, "loss": 0.74044824, "num_input_tokens_seen": 113974875, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.81640625, "step": 5305, "time_per_iteration": 3.9508371353149414 }, { "auxiliary_loss_clip": 0.01130352, "auxiliary_loss_mlp": 0.01041459, "balance_loss_clip": 1.02665329, "balance_loss_mlp": 1.04654002, "epoch": 0.3190139786562453, "flos": 23477714519040.0, "grad_norm": 1.7383493711272033, "language_loss": 0.63992298, "learning_rate": 3.1859124768662778e-06, "loss": 0.66164112, "num_input_tokens_seen": 113994450, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.83984375, "step": 5306, "time_per_iteration": 2.4902725219726562 }, { "auxiliary_loss_clip": 0.01136011, "auxiliary_loss_mlp": 0.01041507, "balance_loss_clip": 1.02559233, "balance_loss_mlp": 1.05071437, "epoch": 0.3190741019089133, "flos": 29095543589760.0, "grad_norm": 2.7904080279441694, "language_loss": 0.79634517, "learning_rate": 3.1855988463285678e-06, "loss": 0.81812036, "num_input_tokens_seen": 114013945, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.85546875, "step": 5307, "time_per_iteration": 2.5289204120635986 }, { "auxiliary_loss_clip": 0.01127662, "auxiliary_loss_mlp": 0.01044759, "balance_loss_clip": 1.028409, "balance_loss_mlp": 1.04611325, "epoch": 0.31913422516158124, "flos": 17129534209920.0, "grad_norm": 1.7999574706230614, "language_loss": 0.77378136, "learning_rate": 3.1852851708328308e-06, "loss": 0.79550552, "num_input_tokens_seen": 114031375, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.81640625, "step": 5308, "time_per_iteration": 2.4359536170959473 }, { "auxiliary_loss_clip": 0.01145535, "auxiliary_loss_mlp": 0.01048641, "balance_loss_clip": 1.03060436, "balance_loss_mlp": 1.05240798, "epoch": 0.3191943484142492, "flos": 16069846147200.0, "grad_norm": 2.6997688940807567, "language_loss": 0.7440424, "learning_rate": 3.184971450390961e-06, "loss": 0.76598412, "num_input_tokens_seen": 114048465, "router_z_loss_clip": 0.18066406, "router_z_loss_mlp": 0.9296875, "step": 5309, "time_per_iteration": 2.4560303688049316 }, { "auxiliary_loss_clip": 0.01135341, "auxiliary_loss_mlp": 0.01042354, "balance_loss_clip": 1.02774477, "balance_loss_mlp": 1.05048048, "epoch": 0.3192544716669172, "flos": 22966166977920.0, "grad_norm": 1.8313000154875787, "language_loss": 0.82705015, "learning_rate": 3.184657685014856e-06, "loss": 0.84882712, "num_input_tokens_seen": 114068415, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8515625, "step": 5310, "time_per_iteration": 2.4696669578552246 }, { "auxiliary_loss_clip": 0.01132093, "auxiliary_loss_mlp": 0.01040694, "balance_loss_clip": 1.02650833, "balance_loss_mlp": 1.04788709, "epoch": 0.31931459491958514, "flos": 26870339018880.0, "grad_norm": 2.0100934575190763, "language_loss": 0.7836194, "learning_rate": 3.184343874716412e-06, "loss": 0.80534732, "num_input_tokens_seen": 114088565, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.84375, "step": 5311, "time_per_iteration": 2.531705856323242 }, { "auxiliary_loss_clip": 0.01133689, "auxiliary_loss_mlp": 0.01039556, "balance_loss_clip": 1.02362943, "balance_loss_mlp": 1.04957926, "epoch": 0.3193747181722531, "flos": 21836525178240.0, "grad_norm": 2.3275932964678723, "language_loss": 0.84143949, "learning_rate": 3.1840300195075295e-06, "loss": 0.86317194, "num_input_tokens_seen": 114107160, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.83984375, "step": 5312, "time_per_iteration": 2.468160390853882 }, { "auxiliary_loss_clip": 0.01140375, "auxiliary_loss_mlp": 0.01052009, "balance_loss_clip": 1.03542674, "balance_loss_mlp": 1.05082238, "epoch": 0.31943484142492107, "flos": 18324999682560.0, "grad_norm": 2.458337498518607, "language_loss": 0.78033102, "learning_rate": 3.1837161194001102e-06, "loss": 0.8022548, "num_input_tokens_seen": 114123420, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.89453125, "step": 5313, "time_per_iteration": 2.4512805938720703 }, { "auxiliary_loss_clip": 0.01132998, "auxiliary_loss_mlp": 0.01039074, "balance_loss_clip": 1.0236845, "balance_loss_mlp": 1.0488404, "epoch": 0.31949496467758903, "flos": 21615818060160.0, "grad_norm": 2.4605655670903226, "language_loss": 0.85959601, "learning_rate": 3.183402174406057e-06, "loss": 0.88131678, "num_input_tokens_seen": 114139230, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84375, "step": 5314, "time_per_iteration": 2.449310302734375 }, { "auxiliary_loss_clip": 0.01135506, "auxiliary_loss_mlp": 0.01042685, "balance_loss_clip": 1.02674675, "balance_loss_mlp": 1.0508287, "epoch": 0.31955508793025705, "flos": 21760214734080.0, "grad_norm": 2.5035166249774505, "language_loss": 0.7990762, "learning_rate": 3.1830881845372747e-06, "loss": 0.820858, "num_input_tokens_seen": 114159290, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.84765625, "step": 5315, "time_per_iteration": 2.4979565143585205 }, { "auxiliary_loss_clip": 0.01138073, "auxiliary_loss_mlp": 0.01054769, "balance_loss_clip": 1.0378654, "balance_loss_mlp": 1.05200982, "epoch": 0.319615211182925, "flos": 17164331510400.0, "grad_norm": 2.4438131753468912, "language_loss": 0.67722154, "learning_rate": 3.18277414980567e-06, "loss": 0.69914997, "num_input_tokens_seen": 114177655, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.859375, "step": 5316, "time_per_iteration": 2.448503255844116 }, { "auxiliary_loss_clip": 0.01135882, "auxiliary_loss_mlp": 0.01036799, "balance_loss_clip": 1.0224936, "balance_loss_mlp": 1.05064976, "epoch": 0.319675334435593, "flos": 28112812416000.0, "grad_norm": 1.8013078161546352, "language_loss": 0.69388926, "learning_rate": 3.1824600702231515e-06, "loss": 0.71561605, "num_input_tokens_seen": 114200880, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8515625, "step": 5317, "time_per_iteration": 2.5631446838378906 }, { "auxiliary_loss_clip": 0.01064802, "auxiliary_loss_mlp": 0.01016908, "balance_loss_clip": 1.01423728, "balance_loss_mlp": 1.036484, "epoch": 0.31973545768826095, "flos": 69501119408640.0, "grad_norm": 0.7328066066520273, "language_loss": 0.53096706, "learning_rate": 3.182145945801628e-06, "loss": 0.55178416, "num_input_tokens_seen": 114267145, "router_z_loss_clip": 0.0267334, "router_z_loss_mlp": 0.28320312, "step": 5318, "time_per_iteration": 3.2351205348968506 }, { "auxiliary_loss_clip": 0.01131065, "auxiliary_loss_mlp": 0.01039101, "balance_loss_clip": 1.02462864, "balance_loss_mlp": 1.0492481, "epoch": 0.3197955809409289, "flos": 13699203408000.0, "grad_norm": 1.6879566010413758, "language_loss": 0.83767653, "learning_rate": 3.181831776553012e-06, "loss": 0.85937822, "num_input_tokens_seen": 114284630, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.81640625, "step": 5319, "time_per_iteration": 2.464843273162842 }, { "auxiliary_loss_clip": 0.01133013, "auxiliary_loss_mlp": 0.01040063, "balance_loss_clip": 1.02522206, "balance_loss_mlp": 1.0493319, "epoch": 0.3198557041935969, "flos": 33218124278400.0, "grad_norm": 1.7625683145171314, "language_loss": 0.63480961, "learning_rate": 3.1815175624892165e-06, "loss": 0.65654033, "num_input_tokens_seen": 114305830, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.83984375, "step": 5320, "time_per_iteration": 2.5712730884552 }, { "auxiliary_loss_clip": 0.01140222, "auxiliary_loss_mlp": 0.01037234, "balance_loss_clip": 1.02195168, "balance_loss_mlp": 1.05300224, "epoch": 0.31991582744626484, "flos": 23732033788800.0, "grad_norm": 3.075524222182294, "language_loss": 0.70812345, "learning_rate": 3.1812033036221567e-06, "loss": 0.72989798, "num_input_tokens_seen": 114325165, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.87109375, "step": 5321, "time_per_iteration": 2.5140392780303955 }, { "auxiliary_loss_clip": 0.01144597, "auxiliary_loss_mlp": 0.01058494, "balance_loss_clip": 1.04170895, "balance_loss_mlp": 1.05390394, "epoch": 0.3199759506989328, "flos": 18550842445440.0, "grad_norm": 2.7712207786233156, "language_loss": 0.86428404, "learning_rate": 3.180888999963749e-06, "loss": 0.88631493, "num_input_tokens_seen": 114341310, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.90625, "step": 5322, "time_per_iteration": 2.4582700729370117 }, { "auxiliary_loss_clip": 0.01136403, "auxiliary_loss_mlp": 0.01038134, "balance_loss_clip": 1.02288759, "balance_loss_mlp": 1.05182874, "epoch": 0.3200360739516008, "flos": 22418888382720.0, "grad_norm": 2.0985800958675807, "language_loss": 0.8339569, "learning_rate": 3.1805746515259123e-06, "loss": 0.85570228, "num_input_tokens_seen": 114360355, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.84765625, "step": 5323, "time_per_iteration": 2.496999502182007 }, { "auxiliary_loss_clip": 0.01132446, "auxiliary_loss_mlp": 0.01035021, "balance_loss_clip": 1.01848686, "balance_loss_mlp": 1.04933763, "epoch": 0.32009619720426874, "flos": 20595236929920.0, "grad_norm": 1.9561668966688595, "language_loss": 0.77960098, "learning_rate": 3.1802602583205663e-06, "loss": 0.80127567, "num_input_tokens_seen": 114379220, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.828125, "step": 5324, "time_per_iteration": 2.472533941268921 }, { "auxiliary_loss_clip": 0.01134021, "auxiliary_loss_mlp": 0.01035543, "balance_loss_clip": 1.01930702, "balance_loss_mlp": 1.04969442, "epoch": 0.3201563204569367, "flos": 18147637301760.0, "grad_norm": 2.91370703345447, "language_loss": 0.80436736, "learning_rate": 3.1799458203596333e-06, "loss": 0.82606292, "num_input_tokens_seen": 114396365, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.84375, "step": 5325, "time_per_iteration": 2.5246124267578125 }, { "auxiliary_loss_clip": 0.01137909, "auxiliary_loss_mlp": 0.01036339, "balance_loss_clip": 1.02109218, "balance_loss_mlp": 1.05233943, "epoch": 0.32021644370960467, "flos": 31684235840640.0, "grad_norm": 1.7018753718696258, "language_loss": 0.75173759, "learning_rate": 3.179631337655037e-06, "loss": 0.77348012, "num_input_tokens_seen": 114416780, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.85546875, "step": 5326, "time_per_iteration": 2.5517053604125977 }, { "auxiliary_loss_clip": 0.01135332, "auxiliary_loss_mlp": 0.01040109, "balance_loss_clip": 1.02501702, "balance_loss_mlp": 1.05302572, "epoch": 0.32027656696227264, "flos": 26865921646080.0, "grad_norm": 2.1439207015448902, "language_loss": 0.81037623, "learning_rate": 3.179316810218701e-06, "loss": 0.83213067, "num_input_tokens_seen": 114437405, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.82421875, "step": 5327, "time_per_iteration": 2.525404214859009 }, { "auxiliary_loss_clip": 0.01137642, "auxiliary_loss_mlp": 0.01037585, "balance_loss_clip": 1.02077675, "balance_loss_mlp": 1.04991078, "epoch": 0.32033669021494066, "flos": 24169928492160.0, "grad_norm": 1.5274072815336608, "language_loss": 0.7775991, "learning_rate": 3.179002238062554e-06, "loss": 0.79935133, "num_input_tokens_seen": 114458505, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.875, "step": 5328, "time_per_iteration": 2.4853014945983887 }, { "auxiliary_loss_clip": 0.01136735, "auxiliary_loss_mlp": 0.01039004, "balance_loss_clip": 1.02180195, "balance_loss_mlp": 1.05046892, "epoch": 0.3203968134676086, "flos": 24460768915200.0, "grad_norm": 1.6263297275938877, "language_loss": 0.74404967, "learning_rate": 3.178687621198524e-06, "loss": 0.76580709, "num_input_tokens_seen": 114479050, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.86328125, "step": 5329, "time_per_iteration": 2.536207914352417 }, { "auxiliary_loss_clip": 0.0112951, "auxiliary_loss_mlp": 0.01031139, "balance_loss_clip": 1.01708436, "balance_loss_mlp": 1.04895377, "epoch": 0.3204569367202766, "flos": 18004713085440.0, "grad_norm": 1.7024858769314644, "language_loss": 0.71031487, "learning_rate": 3.1783729596385415e-06, "loss": 0.73192132, "num_input_tokens_seen": 114497415, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8046875, "step": 5330, "time_per_iteration": 2.4552502632141113 }, { "auxiliary_loss_clip": 0.01141823, "auxiliary_loss_mlp": 0.01043918, "balance_loss_clip": 1.02644229, "balance_loss_mlp": 1.05228424, "epoch": 0.32051705997294455, "flos": 30589678650240.0, "grad_norm": 1.7913664341497584, "language_loss": 0.79650533, "learning_rate": 3.1780582533945376e-06, "loss": 0.81836271, "num_input_tokens_seen": 114518785, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8984375, "step": 5331, "time_per_iteration": 2.5583300590515137 }, { "auxiliary_loss_clip": 0.01063931, "auxiliary_loss_mlp": 0.01004146, "balance_loss_clip": 1.00132072, "balance_loss_mlp": 1.03546333, "epoch": 0.3205771832256125, "flos": 68417979765120.0, "grad_norm": 0.8617319692120825, "language_loss": 0.57806182, "learning_rate": 3.177743502478447e-06, "loss": 0.5987426, "num_input_tokens_seen": 114577710, "router_z_loss_clip": 0.02819824, "router_z_loss_mlp": 0.28515625, "step": 5332, "time_per_iteration": 3.04228138923645 }, { "auxiliary_loss_clip": 0.01138794, "auxiliary_loss_mlp": 0.01034184, "balance_loss_clip": 1.01859164, "balance_loss_mlp": 1.0510267, "epoch": 0.3206373064782805, "flos": 30443953173120.0, "grad_norm": 1.6476168721018825, "language_loss": 0.72787243, "learning_rate": 3.177428706902205e-06, "loss": 0.7496022, "num_input_tokens_seen": 114598640, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.875, "step": 5333, "time_per_iteration": 2.5481107234954834 }, { "auxiliary_loss_clip": 0.01138529, "auxiliary_loss_mlp": 0.0104015, "balance_loss_clip": 1.02363908, "balance_loss_mlp": 1.05174875, "epoch": 0.32069742973094845, "flos": 22054502862720.0, "grad_norm": 1.6829544646280192, "language_loss": 0.70643806, "learning_rate": 3.1771138666777485e-06, "loss": 0.72822481, "num_input_tokens_seen": 114618780, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8671875, "step": 5334, "time_per_iteration": 2.5115771293640137 }, { "auxiliary_loss_clip": 0.01135219, "auxiliary_loss_mlp": 0.01038869, "balance_loss_clip": 1.02327681, "balance_loss_mlp": 1.04908276, "epoch": 0.3207575529836164, "flos": 22054000072320.0, "grad_norm": 2.240515665221056, "language_loss": 0.77433914, "learning_rate": 3.1767989818170156e-06, "loss": 0.79607999, "num_input_tokens_seen": 114637525, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.859375, "step": 5335, "time_per_iteration": 2.4926769733428955 }, { "auxiliary_loss_clip": 0.01137696, "auxiliary_loss_mlp": 0.01041908, "balance_loss_clip": 1.02537358, "balance_loss_mlp": 1.05233049, "epoch": 0.3208176762362844, "flos": 34057536186240.0, "grad_norm": 1.4367478656287835, "language_loss": 0.68314147, "learning_rate": 3.1764840523319477e-06, "loss": 0.70493758, "num_input_tokens_seen": 114659705, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8515625, "step": 5336, "time_per_iteration": 2.564385414123535 }, { "auxiliary_loss_clip": 0.01134776, "auxiliary_loss_mlp": 0.01044637, "balance_loss_clip": 1.02868724, "balance_loss_mlp": 1.04946685, "epoch": 0.32087779948895234, "flos": 21798711135360.0, "grad_norm": 3.5950388771229034, "language_loss": 0.78798026, "learning_rate": 3.176169078234487e-06, "loss": 0.8097744, "num_input_tokens_seen": 114678340, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8515625, "step": 5337, "time_per_iteration": 2.4948058128356934 }, { "auxiliary_loss_clip": 0.01130099, "auxiliary_loss_mlp": 0.01034157, "balance_loss_clip": 1.01947665, "balance_loss_mlp": 1.04924786, "epoch": 0.3209379227416203, "flos": 21434110133760.0, "grad_norm": 1.601678333669245, "language_loss": 0.7379334, "learning_rate": 3.1758540595365766e-06, "loss": 0.75957596, "num_input_tokens_seen": 114696980, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80859375, "step": 5338, "time_per_iteration": 2.478224039077759 }, { "auxiliary_loss_clip": 0.01136446, "auxiliary_loss_mlp": 0.01037091, "balance_loss_clip": 1.02065241, "balance_loss_mlp": 1.04857802, "epoch": 0.3209980459942883, "flos": 25849075530240.0, "grad_norm": 2.2229207032887137, "language_loss": 0.63228095, "learning_rate": 3.1755389962501626e-06, "loss": 0.65401638, "num_input_tokens_seen": 114717330, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.87890625, "step": 5339, "time_per_iteration": 2.5317628383636475 }, { "auxiliary_loss_clip": 0.01135935, "auxiliary_loss_mlp": 0.01039432, "balance_loss_clip": 1.02381563, "balance_loss_mlp": 1.05006242, "epoch": 0.32105816924695624, "flos": 19099162535040.0, "grad_norm": 1.9851329273302496, "language_loss": 0.81363511, "learning_rate": 3.175223888387192e-06, "loss": 0.83538878, "num_input_tokens_seen": 114736320, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.859375, "step": 5340, "time_per_iteration": 2.4607465267181396 }, { "auxiliary_loss_clip": 0.01134817, "auxiliary_loss_mlp": 0.01044069, "balance_loss_clip": 1.02892959, "balance_loss_mlp": 1.04977298, "epoch": 0.3211182924996242, "flos": 16581860565120.0, "grad_norm": 2.324159950445897, "language_loss": 0.76419002, "learning_rate": 3.1749087359596137e-06, "loss": 0.78597885, "num_input_tokens_seen": 114754575, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8515625, "step": 5341, "time_per_iteration": 2.4856605529785156 }, { "auxiliary_loss_clip": 0.01134153, "auxiliary_loss_mlp": 0.01037991, "balance_loss_clip": 1.02280974, "balance_loss_mlp": 1.05096221, "epoch": 0.3211784157522922, "flos": 22672202071680.0, "grad_norm": 2.076831299298217, "language_loss": 0.79341722, "learning_rate": 3.1745935389793786e-06, "loss": 0.81513864, "num_input_tokens_seen": 114773590, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83203125, "step": 5342, "time_per_iteration": 3.936675786972046 }, { "auxiliary_loss_clip": 0.01139355, "auxiliary_loss_mlp": 0.01039127, "balance_loss_clip": 1.02256894, "balance_loss_mlp": 1.05256844, "epoch": 0.3212385390049602, "flos": 20558787603840.0, "grad_norm": 2.880360784157917, "language_loss": 0.75184965, "learning_rate": 3.174278297458438e-06, "loss": 0.77363443, "num_input_tokens_seen": 114790775, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8671875, "step": 5343, "time_per_iteration": 2.4557383060455322 }, { "auxiliary_loss_clip": 0.01135975, "auxiliary_loss_mlp": 0.01040449, "balance_loss_clip": 1.02459383, "balance_loss_mlp": 1.05111408, "epoch": 0.32129866225762815, "flos": 24791147233920.0, "grad_norm": 1.6230469917154269, "language_loss": 0.82767928, "learning_rate": 3.173963011408748e-06, "loss": 0.8494435, "num_input_tokens_seen": 114809835, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84765625, "step": 5344, "time_per_iteration": 2.541816473007202 }, { "auxiliary_loss_clip": 0.01137469, "auxiliary_loss_mlp": 0.01041494, "balance_loss_clip": 1.02492404, "balance_loss_mlp": 1.04969192, "epoch": 0.3213587855102961, "flos": 18366871962240.0, "grad_norm": 2.6054803192123304, "language_loss": 0.79701185, "learning_rate": 3.173647680842262e-06, "loss": 0.81880152, "num_input_tokens_seen": 114826505, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.87890625, "step": 5345, "time_per_iteration": 3.8268210887908936 }, { "auxiliary_loss_clip": 0.01136676, "auxiliary_loss_mlp": 0.01038417, "balance_loss_clip": 1.02262735, "balance_loss_mlp": 1.04952335, "epoch": 0.3214189087629641, "flos": 27015992668800.0, "grad_norm": 2.605569273240741, "language_loss": 0.83054066, "learning_rate": 3.1733323057709384e-06, "loss": 0.85229152, "num_input_tokens_seen": 114846140, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.875, "step": 5346, "time_per_iteration": 3.8590123653411865 }, { "auxiliary_loss_clip": 0.01138431, "auxiliary_loss_mlp": 0.01044102, "balance_loss_clip": 1.0269835, "balance_loss_mlp": 1.04961538, "epoch": 0.32147903201563205, "flos": 23148269953920.0, "grad_norm": 1.6673814685140433, "language_loss": 0.81648469, "learning_rate": 3.1730168862067366e-06, "loss": 0.83831, "num_input_tokens_seen": 114866660, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.88671875, "step": 5347, "time_per_iteration": 3.9124131202697754 }, { "auxiliary_loss_clip": 0.01135355, "auxiliary_loss_mlp": 0.01044721, "balance_loss_clip": 1.02785301, "balance_loss_mlp": 1.05031502, "epoch": 0.3215391552683, "flos": 16580747243520.0, "grad_norm": 2.283306567835413, "language_loss": 0.79993242, "learning_rate": 3.1727014221616164e-06, "loss": 0.82173318, "num_input_tokens_seen": 114882820, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8515625, "step": 5348, "time_per_iteration": 2.444831132888794 }, { "auxiliary_loss_clip": 0.0113912, "auxiliary_loss_mlp": 0.01045555, "balance_loss_clip": 1.02974796, "balance_loss_mlp": 1.05165195, "epoch": 0.321599278520968, "flos": 17821820010240.0, "grad_norm": 2.0955421718684093, "language_loss": 0.85289067, "learning_rate": 3.172385913647542e-06, "loss": 0.87473744, "num_input_tokens_seen": 114900745, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.875, "step": 5349, "time_per_iteration": 2.454421281814575 }, { "auxiliary_loss_clip": 0.01137312, "auxiliary_loss_mlp": 0.01044123, "balance_loss_clip": 1.0277791, "balance_loss_mlp": 1.05113673, "epoch": 0.32165940177363594, "flos": 16251769555200.0, "grad_norm": 1.736860227951666, "language_loss": 0.80726892, "learning_rate": 3.172070360676475e-06, "loss": 0.82908332, "num_input_tokens_seen": 114917940, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.86328125, "step": 5350, "time_per_iteration": 2.4482173919677734 }, { "auxiliary_loss_clip": 0.01135153, "auxiliary_loss_mlp": 0.01041252, "balance_loss_clip": 1.02590978, "balance_loss_mlp": 1.0502274, "epoch": 0.3217195250263039, "flos": 27599900158080.0, "grad_norm": 1.5793604319713561, "language_loss": 0.79573047, "learning_rate": 3.1717547632603828e-06, "loss": 0.81749451, "num_input_tokens_seen": 114937735, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.84765625, "step": 5351, "time_per_iteration": 2.540372848510742 }, { "auxiliary_loss_clip": 0.01134155, "auxiliary_loss_mlp": 0.01043192, "balance_loss_clip": 1.02640688, "balance_loss_mlp": 1.04965687, "epoch": 0.3217796482789719, "flos": 21470595373440.0, "grad_norm": 1.742944992301844, "language_loss": 0.7538259, "learning_rate": 3.1714391214112326e-06, "loss": 0.77559936, "num_input_tokens_seen": 114956630, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.84375, "step": 5352, "time_per_iteration": 2.4703054428100586 }, { "auxiliary_loss_clip": 0.0113691, "auxiliary_loss_mlp": 0.0103981, "balance_loss_clip": 1.02297223, "balance_loss_mlp": 1.05091858, "epoch": 0.32183977153163984, "flos": 21215593745280.0, "grad_norm": 1.97842264830669, "language_loss": 0.82016063, "learning_rate": 3.1711234351409933e-06, "loss": 0.84192789, "num_input_tokens_seen": 114976470, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.859375, "step": 5353, "time_per_iteration": 2.469971179962158 }, { "auxiliary_loss_clip": 0.01134059, "auxiliary_loss_mlp": 0.01039898, "balance_loss_clip": 1.02312577, "balance_loss_mlp": 1.05137992, "epoch": 0.3218998947843078, "flos": 24608182331520.0, "grad_norm": 1.6675635689513582, "language_loss": 0.73138905, "learning_rate": 3.1708077044616365e-06, "loss": 0.75312865, "num_input_tokens_seen": 114996710, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.828125, "step": 5354, "time_per_iteration": 2.4977152347564697 }, { "auxiliary_loss_clip": 0.0113416, "auxiliary_loss_mlp": 0.01034748, "balance_loss_clip": 1.01982331, "balance_loss_mlp": 1.04803419, "epoch": 0.3219600180369758, "flos": 22270577126400.0, "grad_norm": 1.5324296830222848, "language_loss": 0.83192289, "learning_rate": 3.1704919293851334e-06, "loss": 0.85361201, "num_input_tokens_seen": 115015775, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.859375, "step": 5355, "time_per_iteration": 2.4915292263031006 }, { "auxiliary_loss_clip": 0.01140193, "auxiliary_loss_mlp": 0.01045365, "balance_loss_clip": 1.02934313, "balance_loss_mlp": 1.05313277, "epoch": 0.3220201412896438, "flos": 14939126939520.0, "grad_norm": 1.8622863857911054, "language_loss": 0.71275961, "learning_rate": 3.1701761099234597e-06, "loss": 0.73461521, "num_input_tokens_seen": 115034265, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.87109375, "step": 5356, "time_per_iteration": 2.4391958713531494 }, { "auxiliary_loss_clip": 0.01144929, "auxiliary_loss_mlp": 0.01042866, "balance_loss_clip": 1.02648664, "balance_loss_mlp": 1.05282545, "epoch": 0.32208026454231176, "flos": 22667389649280.0, "grad_norm": 6.756807090941716, "language_loss": 0.68593293, "learning_rate": 3.1698602460885903e-06, "loss": 0.70781088, "num_input_tokens_seen": 115051945, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.921875, "step": 5357, "time_per_iteration": 2.506861925125122 }, { "auxiliary_loss_clip": 0.01066914, "auxiliary_loss_mlp": 0.01004148, "balance_loss_clip": 1.00176418, "balance_loss_mlp": 1.03873897, "epoch": 0.3221403877949797, "flos": 64605130053120.0, "grad_norm": 0.710710928177514, "language_loss": 0.58283925, "learning_rate": 3.1695443378925035e-06, "loss": 0.6035499, "num_input_tokens_seen": 115119090, "router_z_loss_clip": 0.02380371, "router_z_loss_mlp": 0.28125, "step": 5358, "time_per_iteration": 3.2092506885528564 }, { "auxiliary_loss_clip": 0.01135095, "auxiliary_loss_mlp": 0.01040033, "balance_loss_clip": 1.02314091, "balance_loss_mlp": 1.04825997, "epoch": 0.3222005110476477, "flos": 20157019004160.0, "grad_norm": 2.2575406970079936, "language_loss": 0.83602697, "learning_rate": 3.1692283853471777e-06, "loss": 0.85777819, "num_input_tokens_seen": 115137755, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8671875, "step": 5359, "time_per_iteration": 2.4643709659576416 }, { "auxiliary_loss_clip": 0.01137698, "auxiliary_loss_mlp": 0.01035388, "balance_loss_clip": 1.01946127, "balance_loss_mlp": 1.0503999, "epoch": 0.32226063430031565, "flos": 22674177319680.0, "grad_norm": 2.312920279476574, "language_loss": 0.79499125, "learning_rate": 3.168912388464595e-06, "loss": 0.8167221, "num_input_tokens_seen": 115158150, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.87109375, "step": 5360, "time_per_iteration": 2.50201416015625 }, { "auxiliary_loss_clip": 0.01065483, "auxiliary_loss_mlp": 0.01005169, "balance_loss_clip": 1.00284469, "balance_loss_mlp": 1.03720427, "epoch": 0.3223207575529836, "flos": 63828525075840.0, "grad_norm": 0.6642143897576872, "language_loss": 0.57042408, "learning_rate": 3.168596347256737e-06, "loss": 0.59113061, "num_input_tokens_seen": 115212755, "router_z_loss_clip": 0.02319336, "router_z_loss_mlp": 0.28320312, "step": 5361, "time_per_iteration": 2.9678516387939453 }, { "auxiliary_loss_clip": 0.01136036, "auxiliary_loss_mlp": 0.01043753, "balance_loss_clip": 1.02723074, "balance_loss_mlp": 1.05109203, "epoch": 0.3223808808056516, "flos": 26870123537280.0, "grad_norm": 1.9663073318238427, "language_loss": 0.71071392, "learning_rate": 3.168280261735588e-06, "loss": 0.73251188, "num_input_tokens_seen": 115233090, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.84765625, "step": 5362, "time_per_iteration": 2.552405834197998 }, { "auxiliary_loss_clip": 0.01135479, "auxiliary_loss_mlp": 0.0103765, "balance_loss_clip": 1.0225997, "balance_loss_mlp": 1.05031621, "epoch": 0.32244100405831955, "flos": 26761350176640.0, "grad_norm": 2.624636236244407, "language_loss": 0.74136448, "learning_rate": 3.167964131913135e-06, "loss": 0.76309574, "num_input_tokens_seen": 115252645, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8515625, "step": 5363, "time_per_iteration": 2.533932685852051 }, { "auxiliary_loss_clip": 0.01139079, "auxiliary_loss_mlp": 0.01037772, "balance_loss_clip": 1.02128506, "balance_loss_mlp": 1.04944336, "epoch": 0.3225011273109875, "flos": 23803029020160.0, "grad_norm": 2.3365080284296917, "language_loss": 0.76657993, "learning_rate": 3.167647957801365e-06, "loss": 0.78834844, "num_input_tokens_seen": 115269085, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.89453125, "step": 5364, "time_per_iteration": 2.483755111694336 }, { "auxiliary_loss_clip": 0.01134424, "auxiliary_loss_mlp": 0.01045893, "balance_loss_clip": 1.02929902, "balance_loss_mlp": 1.04892671, "epoch": 0.3225612505636555, "flos": 17274505501440.0, "grad_norm": 2.171346743442479, "language_loss": 0.77517205, "learning_rate": 3.1673317394122672e-06, "loss": 0.7969752, "num_input_tokens_seen": 115286470, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.85546875, "step": 5365, "time_per_iteration": 2.420889377593994 }, { "auxiliary_loss_clip": 0.01137453, "auxiliary_loss_mlp": 0.01042894, "balance_loss_clip": 1.02686071, "balance_loss_mlp": 1.05143404, "epoch": 0.32262137381632344, "flos": 23366247638400.0, "grad_norm": 1.5798690458715665, "language_loss": 0.764979, "learning_rate": 3.1670154767578333e-06, "loss": 0.7867825, "num_input_tokens_seen": 115307000, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.859375, "step": 5366, "time_per_iteration": 2.4981541633605957 }, { "auxiliary_loss_clip": 0.01134084, "auxiliary_loss_mlp": 0.01036146, "balance_loss_clip": 1.01983857, "balance_loss_mlp": 1.04900408, "epoch": 0.3226814970689914, "flos": 23258803080960.0, "grad_norm": 2.190940986086408, "language_loss": 0.71971565, "learning_rate": 3.166699169850055e-06, "loss": 0.741418, "num_input_tokens_seen": 115325925, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8515625, "step": 5367, "time_per_iteration": 2.4811270236968994 }, { "auxiliary_loss_clip": 0.011318, "auxiliary_loss_mlp": 0.01036208, "balance_loss_clip": 1.02152169, "balance_loss_mlp": 1.04879057, "epoch": 0.32274162032165943, "flos": 16395196561920.0, "grad_norm": 2.2897577760523933, "language_loss": 0.74552929, "learning_rate": 3.1663828187009274e-06, "loss": 0.76720941, "num_input_tokens_seen": 115343705, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.828125, "step": 5368, "time_per_iteration": 2.449932336807251 }, { "auxiliary_loss_clip": 0.01131488, "auxiliary_loss_mlp": 0.01036723, "balance_loss_clip": 1.02096391, "balance_loss_mlp": 1.04917824, "epoch": 0.3228017435743274, "flos": 27855081354240.0, "grad_norm": 1.7856343636566747, "language_loss": 0.78663623, "learning_rate": 3.1660664233224467e-06, "loss": 0.80831826, "num_input_tokens_seen": 115364170, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.82421875, "step": 5369, "time_per_iteration": 2.509648323059082 }, { "auxiliary_loss_clip": 0.01129525, "auxiliary_loss_mlp": 0.01035297, "balance_loss_clip": 1.02059817, "balance_loss_mlp": 1.04771471, "epoch": 0.32286186682699536, "flos": 19608770741760.0, "grad_norm": 2.0875921382657974, "language_loss": 0.83080786, "learning_rate": 3.16574998372661e-06, "loss": 0.85245609, "num_input_tokens_seen": 115382495, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8203125, "step": 5370, "time_per_iteration": 2.454667568206787 }, { "auxiliary_loss_clip": 0.01136698, "auxiliary_loss_mlp": 0.01041426, "balance_loss_clip": 1.02600014, "balance_loss_mlp": 1.05147719, "epoch": 0.3229219900796633, "flos": 24134017870080.0, "grad_norm": 2.101471800104356, "language_loss": 0.83063638, "learning_rate": 3.1654334999254177e-06, "loss": 0.85241765, "num_input_tokens_seen": 115399450, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8515625, "step": 5371, "time_per_iteration": 2.4700767993927 }, { "auxiliary_loss_clip": 0.0113722, "auxiliary_loss_mlp": 0.01041606, "balance_loss_clip": 1.02451169, "balance_loss_mlp": 1.04959571, "epoch": 0.3229821133323313, "flos": 17748705876480.0, "grad_norm": 2.4617228264626836, "language_loss": 0.88597107, "learning_rate": 3.1651169719308695e-06, "loss": 0.90775931, "num_input_tokens_seen": 115417700, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.875, "step": 5372, "time_per_iteration": 2.45542311668396 }, { "auxiliary_loss_clip": 0.01135078, "auxiliary_loss_mlp": 0.01042884, "balance_loss_clip": 1.02649283, "balance_loss_mlp": 1.05071628, "epoch": 0.32304223658499925, "flos": 22346025644160.0, "grad_norm": 2.686039622921258, "language_loss": 0.72822541, "learning_rate": 3.1648003997549694e-06, "loss": 0.75000501, "num_input_tokens_seen": 115435840, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.84375, "step": 5373, "time_per_iteration": 2.457855463027954 }, { "auxiliary_loss_clip": 0.01132011, "auxiliary_loss_mlp": 0.0103324, "balance_loss_clip": 1.01807094, "balance_loss_mlp": 1.04991293, "epoch": 0.3231023598376672, "flos": 18478302929280.0, "grad_norm": 2.439368260288254, "language_loss": 0.81078142, "learning_rate": 3.1644837834097214e-06, "loss": 0.83243394, "num_input_tokens_seen": 115454210, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 5374, "time_per_iteration": 2.4576189517974854 }, { "auxiliary_loss_clip": 0.01130135, "auxiliary_loss_mlp": 0.0103717, "balance_loss_clip": 1.02176845, "balance_loss_mlp": 1.04814672, "epoch": 0.3231624830903352, "flos": 27636313570560.0, "grad_norm": 1.94991343180469, "language_loss": 0.87985063, "learning_rate": 3.1641671229071317e-06, "loss": 0.90152371, "num_input_tokens_seen": 115471785, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8203125, "step": 5375, "time_per_iteration": 2.521090030670166 }, { "auxiliary_loss_clip": 0.01135727, "auxiliary_loss_mlp": 0.01037216, "balance_loss_clip": 1.02037156, "balance_loss_mlp": 1.04890215, "epoch": 0.32322260634300315, "flos": 21726423014400.0, "grad_norm": 1.9040159809120438, "language_loss": 0.75774997, "learning_rate": 3.1638504182592076e-06, "loss": 0.77947944, "num_input_tokens_seen": 115491405, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8671875, "step": 5376, "time_per_iteration": 2.496772289276123 }, { "auxiliary_loss_clip": 0.01130424, "auxiliary_loss_mlp": 0.01035872, "balance_loss_clip": 1.02143002, "balance_loss_mlp": 1.04816103, "epoch": 0.3232827295956711, "flos": 22637656166400.0, "grad_norm": 1.6759794101416445, "language_loss": 0.66610944, "learning_rate": 3.1635336694779594e-06, "loss": 0.68777239, "num_input_tokens_seen": 115511555, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8203125, "step": 5377, "time_per_iteration": 2.4734365940093994 }, { "auxiliary_loss_clip": 0.01133561, "auxiliary_loss_mlp": 0.01047937, "balance_loss_clip": 1.03020453, "balance_loss_mlp": 1.04927254, "epoch": 0.3233428528483391, "flos": 26322593546880.0, "grad_norm": 1.51647063911387, "language_loss": 0.71991807, "learning_rate": 3.1632168765753982e-06, "loss": 0.74173307, "num_input_tokens_seen": 115532860, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.84375, "step": 5378, "time_per_iteration": 2.533710241317749 }, { "auxiliary_loss_clip": 0.01133999, "auxiliary_loss_mlp": 0.01034237, "balance_loss_clip": 1.01868033, "balance_loss_mlp": 1.04870772, "epoch": 0.32340297610100704, "flos": 28585217111040.0, "grad_norm": 2.089070035515225, "language_loss": 0.82095361, "learning_rate": 3.1629000395635357e-06, "loss": 0.84263599, "num_input_tokens_seen": 115553850, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8515625, "step": 5379, "time_per_iteration": 2.51928973197937 }, { "auxiliary_loss_clip": 0.01136268, "auxiliary_loss_mlp": 0.01039326, "balance_loss_clip": 1.0243057, "balance_loss_mlp": 1.0494777, "epoch": 0.323463099353675, "flos": 30773792787840.0, "grad_norm": 2.0523136542189113, "language_loss": 0.78873086, "learning_rate": 3.162583158454388e-06, "loss": 0.81048679, "num_input_tokens_seen": 115575530, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8671875, "step": 5380, "time_per_iteration": 2.5682358741760254 }, { "auxiliary_loss_clip": 0.01135592, "auxiliary_loss_mlp": 0.01040613, "balance_loss_clip": 1.02564645, "balance_loss_mlp": 1.05092716, "epoch": 0.32352322260634303, "flos": 25228610974080.0, "grad_norm": 1.8142296475505049, "language_loss": 0.77224112, "learning_rate": 3.1622662332599697e-06, "loss": 0.79400313, "num_input_tokens_seen": 115594885, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.84765625, "step": 5381, "time_per_iteration": 2.493964195251465 }, { "auxiliary_loss_clip": 0.01131186, "auxiliary_loss_mlp": 0.01037786, "balance_loss_clip": 1.02374935, "balance_loss_mlp": 1.05020046, "epoch": 0.323583345859011, "flos": 23330480670720.0, "grad_norm": 2.0862036068091587, "language_loss": 0.71726817, "learning_rate": 3.1619492639922998e-06, "loss": 0.73895788, "num_input_tokens_seen": 115614080, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8125, "step": 5382, "time_per_iteration": 2.4851136207580566 }, { "auxiliary_loss_clip": 0.01137106, "auxiliary_loss_mlp": 0.01040491, "balance_loss_clip": 1.02561426, "balance_loss_mlp": 1.04973221, "epoch": 0.32364346911167896, "flos": 26207499392640.0, "grad_norm": 2.541687835421867, "language_loss": 0.70520276, "learning_rate": 3.1616322506633964e-06, "loss": 0.72697878, "num_input_tokens_seen": 115632820, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.875, "step": 5383, "time_per_iteration": 2.5069937705993652 }, { "auxiliary_loss_clip": 0.01131146, "auxiliary_loss_mlp": 0.01039795, "balance_loss_clip": 1.02563334, "balance_loss_mlp": 1.05018592, "epoch": 0.3237035923643469, "flos": 23695764030720.0, "grad_norm": 1.5902050329735165, "language_loss": 0.78328472, "learning_rate": 3.161315193285283e-06, "loss": 0.80499411, "num_input_tokens_seen": 115652860, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.80859375, "step": 5384, "time_per_iteration": 4.033925771713257 }, { "auxiliary_loss_clip": 0.01136928, "auxiliary_loss_mlp": 0.01046004, "balance_loss_clip": 1.02905273, "balance_loss_mlp": 1.05062819, "epoch": 0.3237637156170149, "flos": 14428728633600.0, "grad_norm": 2.7331334123130953, "language_loss": 0.75339216, "learning_rate": 3.16099809186998e-06, "loss": 0.77522147, "num_input_tokens_seen": 115670940, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.86328125, "step": 5385, "time_per_iteration": 2.462052822113037 }, { "auxiliary_loss_clip": 0.01136138, "auxiliary_loss_mlp": 0.01040582, "balance_loss_clip": 1.02532339, "balance_loss_mlp": 1.05227184, "epoch": 0.32382383886968286, "flos": 31062981185280.0, "grad_norm": 1.7286093293457094, "language_loss": 0.71599185, "learning_rate": 3.1606809464295145e-06, "loss": 0.73775905, "num_input_tokens_seen": 115691155, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83984375, "step": 5386, "time_per_iteration": 2.631507158279419 }, { "auxiliary_loss_clip": 0.01137109, "auxiliary_loss_mlp": 0.01041845, "balance_loss_clip": 1.02560902, "balance_loss_mlp": 1.04895425, "epoch": 0.3238839621223508, "flos": 23256935573760.0, "grad_norm": 2.3682675936387376, "language_loss": 0.94313318, "learning_rate": 3.1603637569759095e-06, "loss": 0.96492267, "num_input_tokens_seen": 115710340, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8828125, "step": 5387, "time_per_iteration": 5.236666679382324 }, { "auxiliary_loss_clip": 0.01138412, "auxiliary_loss_mlp": 0.01045653, "balance_loss_clip": 1.02877283, "balance_loss_mlp": 1.05169535, "epoch": 0.3239440853750188, "flos": 22964658606720.0, "grad_norm": 2.769472736670803, "language_loss": 0.77841288, "learning_rate": 3.1600465235211956e-06, "loss": 0.80025351, "num_input_tokens_seen": 115726745, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8671875, "step": 5388, "time_per_iteration": 2.468245029449463 }, { "auxiliary_loss_clip": 0.01136224, "auxiliary_loss_mlp": 0.01036448, "balance_loss_clip": 1.02047443, "balance_loss_mlp": 1.05021834, "epoch": 0.32400420862768675, "flos": 36246614653440.0, "grad_norm": 2.789043768109847, "language_loss": 0.71270192, "learning_rate": 3.1597292460774006e-06, "loss": 0.73442864, "num_input_tokens_seen": 115749385, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.859375, "step": 5389, "time_per_iteration": 4.036770343780518 }, { "auxiliary_loss_clip": 0.01132857, "auxiliary_loss_mlp": 0.01041113, "balance_loss_clip": 1.02534175, "balance_loss_mlp": 1.04986012, "epoch": 0.3240643318803547, "flos": 21616500418560.0, "grad_norm": 1.8035272350651546, "language_loss": 0.81292588, "learning_rate": 3.159411924656557e-06, "loss": 0.83466566, "num_input_tokens_seen": 115768105, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.828125, "step": 5390, "time_per_iteration": 2.475205898284912 }, { "auxiliary_loss_clip": 0.01140386, "auxiliary_loss_mlp": 0.01049948, "balance_loss_clip": 1.03359258, "balance_loss_mlp": 1.05521703, "epoch": 0.3241244551330227, "flos": 23295611543040.0, "grad_norm": 1.9219988711590767, "language_loss": 0.72772229, "learning_rate": 3.1590945592706967e-06, "loss": 0.74962568, "num_input_tokens_seen": 115787340, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8515625, "step": 5391, "time_per_iteration": 2.4819440841674805 }, { "auxiliary_loss_clip": 0.01131485, "auxiliary_loss_mlp": 0.01043089, "balance_loss_clip": 1.02808046, "balance_loss_mlp": 1.04890609, "epoch": 0.32418457838569065, "flos": 14097236993280.0, "grad_norm": 3.992594945684472, "language_loss": 0.77196109, "learning_rate": 3.158777149931855e-06, "loss": 0.79370677, "num_input_tokens_seen": 115805565, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.828125, "step": 5392, "time_per_iteration": 2.449596643447876 }, { "auxiliary_loss_clip": 0.01139058, "auxiliary_loss_mlp": 0.01042382, "balance_loss_clip": 1.02622938, "balance_loss_mlp": 1.05135524, "epoch": 0.3242447016383586, "flos": 29752672953600.0, "grad_norm": 2.2808634874258673, "language_loss": 0.62413228, "learning_rate": 3.158459696652067e-06, "loss": 0.64594662, "num_input_tokens_seen": 115826725, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.87890625, "step": 5393, "time_per_iteration": 2.5222878456115723 }, { "auxiliary_loss_clip": 0.01133336, "auxiliary_loss_mlp": 0.01038365, "balance_loss_clip": 1.02261734, "balance_loss_mlp": 1.04912138, "epoch": 0.3243048248910266, "flos": 24351205455360.0, "grad_norm": 1.8860350482582189, "language_loss": 0.82736504, "learning_rate": 3.158142199443371e-06, "loss": 0.84908211, "num_input_tokens_seen": 115846955, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.83984375, "step": 5394, "time_per_iteration": 2.5140371322631836 }, { "auxiliary_loss_clip": 0.01129925, "auxiliary_loss_mlp": 0.01047767, "balance_loss_clip": 1.03293705, "balance_loss_mlp": 1.04903328, "epoch": 0.3243649481436946, "flos": 24353037048960.0, "grad_norm": 1.8994278811090934, "language_loss": 0.81673521, "learning_rate": 3.1578246583178076e-06, "loss": 0.83851206, "num_input_tokens_seen": 115865975, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80859375, "step": 5395, "time_per_iteration": 2.4923126697540283 }, { "auxiliary_loss_clip": 0.01132449, "auxiliary_loss_mlp": 0.01042258, "balance_loss_clip": 1.02724373, "balance_loss_mlp": 1.05249143, "epoch": 0.32442507139636256, "flos": 22925228451840.0, "grad_norm": 2.126123068365582, "language_loss": 0.83374763, "learning_rate": 3.157507073287417e-06, "loss": 0.85549474, "num_input_tokens_seen": 115884950, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80078125, "step": 5396, "time_per_iteration": 2.4784417152404785 }, { "auxiliary_loss_clip": 0.01139953, "auxiliary_loss_mlp": 0.01041022, "balance_loss_clip": 1.02469015, "balance_loss_mlp": 1.05196154, "epoch": 0.32448519464903053, "flos": 22200192426240.0, "grad_norm": 2.132934149772523, "language_loss": 0.7633031, "learning_rate": 3.1571894443642414e-06, "loss": 0.78511286, "num_input_tokens_seen": 115904170, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.87890625, "step": 5397, "time_per_iteration": 2.463301181793213 }, { "auxiliary_loss_clip": 0.01133368, "auxiliary_loss_mlp": 0.01036563, "balance_loss_clip": 1.02132189, "balance_loss_mlp": 1.05073786, "epoch": 0.3245453179016985, "flos": 18838450644480.0, "grad_norm": 2.6848340576581338, "language_loss": 0.66904265, "learning_rate": 3.1568717715603263e-06, "loss": 0.69074196, "num_input_tokens_seen": 115919255, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.828125, "step": 5398, "time_per_iteration": 2.461284637451172 }, { "auxiliary_loss_clip": 0.01131516, "auxiliary_loss_mlp": 0.01035101, "balance_loss_clip": 1.01963913, "balance_loss_mlp": 1.04773307, "epoch": 0.32460544115436646, "flos": 21178390233600.0, "grad_norm": 1.9700005361800477, "language_loss": 0.73142052, "learning_rate": 3.156554054887718e-06, "loss": 0.75308669, "num_input_tokens_seen": 115938535, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8359375, "step": 5399, "time_per_iteration": 2.459853410720825 }, { "auxiliary_loss_clip": 0.01134043, "auxiliary_loss_mlp": 0.01038328, "balance_loss_clip": 1.02261662, "balance_loss_mlp": 1.05028248, "epoch": 0.3246655644070344, "flos": 21981137333760.0, "grad_norm": 2.376963503420597, "language_loss": 0.71293616, "learning_rate": 3.1562362943584645e-06, "loss": 0.73465985, "num_input_tokens_seen": 115955005, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8359375, "step": 5400, "time_per_iteration": 2.4904496669769287 }, { "auxiliary_loss_clip": 0.01135186, "auxiliary_loss_mlp": 0.01036519, "balance_loss_clip": 1.0204618, "balance_loss_mlp": 1.04901314, "epoch": 0.3247256876597024, "flos": 32159729105280.0, "grad_norm": 15.747171539087612, "language_loss": 0.80201232, "learning_rate": 3.155918489984614e-06, "loss": 0.8237294, "num_input_tokens_seen": 115975305, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.859375, "step": 5401, "time_per_iteration": 2.57041597366333 }, { "auxiliary_loss_clip": 0.01135537, "auxiliary_loss_mlp": 0.01039639, "balance_loss_clip": 1.02278316, "balance_loss_mlp": 1.049649, "epoch": 0.32478581091237035, "flos": 20997544233600.0, "grad_norm": 2.2398611357520584, "language_loss": 0.8736183, "learning_rate": 3.1556006417782196e-06, "loss": 0.89537007, "num_input_tokens_seen": 115994810, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.859375, "step": 5402, "time_per_iteration": 2.481177806854248 }, { "auxiliary_loss_clip": 0.01129179, "auxiliary_loss_mlp": 0.01037001, "balance_loss_clip": 1.02161741, "balance_loss_mlp": 1.04851949, "epoch": 0.3248459341650383, "flos": 17924990849280.0, "grad_norm": 2.133155764700821, "language_loss": 0.84642029, "learning_rate": 3.155282749751332e-06, "loss": 0.86808217, "num_input_tokens_seen": 116011095, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8046875, "step": 5403, "time_per_iteration": 2.4187893867492676 }, { "auxiliary_loss_clip": 0.01130676, "auxiliary_loss_mlp": 0.01036184, "balance_loss_clip": 1.0221467, "balance_loss_mlp": 1.05097723, "epoch": 0.3249060574177063, "flos": 24535606901760.0, "grad_norm": 2.667739380715423, "language_loss": 0.87291867, "learning_rate": 3.154964813916007e-06, "loss": 0.89458728, "num_input_tokens_seen": 116028805, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.796875, "step": 5404, "time_per_iteration": 2.507148504257202 }, { "auxiliary_loss_clip": 0.01132571, "auxiliary_loss_mlp": 0.01035499, "balance_loss_clip": 1.01993072, "balance_loss_mlp": 1.05077076, "epoch": 0.32496618067037425, "flos": 25994765093760.0, "grad_norm": 2.6470404128490848, "language_loss": 0.72994912, "learning_rate": 3.1546468342843008e-06, "loss": 0.75162983, "num_input_tokens_seen": 116047765, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8203125, "step": 5405, "time_per_iteration": 2.500452995300293 }, { "auxiliary_loss_clip": 0.01132056, "auxiliary_loss_mlp": 0.01036196, "balance_loss_clip": 1.02119339, "balance_loss_mlp": 1.04949248, "epoch": 0.3250263039230422, "flos": 19573757959680.0, "grad_norm": 1.7903287553966516, "language_loss": 0.83184612, "learning_rate": 3.1543288108682707e-06, "loss": 0.85352862, "num_input_tokens_seen": 116068385, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.82421875, "step": 5406, "time_per_iteration": 2.5015573501586914 }, { "auxiliary_loss_clip": 0.01132289, "auxiliary_loss_mlp": 0.01035972, "balance_loss_clip": 1.02099991, "balance_loss_mlp": 1.05063868, "epoch": 0.3250864271757102, "flos": 16763640318720.0, "grad_norm": 1.8287263175231248, "language_loss": 0.87653619, "learning_rate": 3.1540107436799764e-06, "loss": 0.89821881, "num_input_tokens_seen": 116085350, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.81640625, "step": 5407, "time_per_iteration": 2.4305288791656494 }, { "auxiliary_loss_clip": 0.0113327, "auxiliary_loss_mlp": 0.01039467, "balance_loss_clip": 1.02370799, "balance_loss_mlp": 1.04975867, "epoch": 0.3251465504283782, "flos": 27819458040960.0, "grad_norm": 1.4940181578181548, "language_loss": 0.69483262, "learning_rate": 3.153692632731479e-06, "loss": 0.71656001, "num_input_tokens_seen": 116107560, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8359375, "step": 5408, "time_per_iteration": 2.5392889976501465 }, { "auxiliary_loss_clip": 0.01138416, "auxiliary_loss_mlp": 0.01031596, "balance_loss_clip": 1.01551425, "balance_loss_mlp": 1.04911888, "epoch": 0.32520667368104617, "flos": 19063144172160.0, "grad_norm": 1.8771675377740622, "language_loss": 0.77539694, "learning_rate": 3.153374478034841e-06, "loss": 0.79709709, "num_input_tokens_seen": 116125980, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.89453125, "step": 5409, "time_per_iteration": 2.4554479122161865 }, { "auxiliary_loss_clip": 0.01134082, "auxiliary_loss_mlp": 0.0104405, "balance_loss_clip": 1.02845776, "balance_loss_mlp": 1.04727602, "epoch": 0.32526679693371413, "flos": 29382146208000.0, "grad_norm": 1.8057012132455301, "language_loss": 0.83192396, "learning_rate": 3.1530562796021285e-06, "loss": 0.85370535, "num_input_tokens_seen": 116146530, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8671875, "step": 5410, "time_per_iteration": 2.541524887084961 }, { "auxiliary_loss_clip": 0.01128733, "auxiliary_loss_mlp": 0.01031819, "balance_loss_clip": 1.01708508, "balance_loss_mlp": 1.0474987, "epoch": 0.3253269201863821, "flos": 20704513080960.0, "grad_norm": 1.5645335317262128, "language_loss": 0.70873016, "learning_rate": 3.152738037445405e-06, "loss": 0.73033565, "num_input_tokens_seen": 116165695, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8125, "step": 5411, "time_per_iteration": 2.4497246742248535 }, { "auxiliary_loss_clip": 0.01132085, "auxiliary_loss_mlp": 0.01035971, "balance_loss_clip": 1.02139771, "balance_loss_mlp": 1.04918575, "epoch": 0.32538704343905006, "flos": 29094142959360.0, "grad_norm": 1.520446243630936, "language_loss": 0.82925326, "learning_rate": 3.1524197515767403e-06, "loss": 0.85093379, "num_input_tokens_seen": 116185375, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.828125, "step": 5412, "time_per_iteration": 2.5293660163879395 }, { "auxiliary_loss_clip": 0.01134605, "auxiliary_loss_mlp": 0.01034551, "balance_loss_clip": 1.01808822, "balance_loss_mlp": 1.04849076, "epoch": 0.325447166691718, "flos": 24676124906880.0, "grad_norm": 2.536180975407027, "language_loss": 0.80422115, "learning_rate": 3.152101422008203e-06, "loss": 0.82591265, "num_input_tokens_seen": 116204335, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.859375, "step": 5413, "time_per_iteration": 2.4944334030151367 }, { "auxiliary_loss_clip": 0.0113533, "auxiliary_loss_mlp": 0.01039324, "balance_loss_clip": 1.02255154, "balance_loss_mlp": 1.05086517, "epoch": 0.325507289944386, "flos": 21543134889600.0, "grad_norm": 2.0576283936934274, "language_loss": 0.76747823, "learning_rate": 3.151783048751864e-06, "loss": 0.78922474, "num_input_tokens_seen": 116222840, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.84375, "step": 5414, "time_per_iteration": 2.4772279262542725 }, { "auxiliary_loss_clip": 0.01065934, "auxiliary_loss_mlp": 0.01006832, "balance_loss_clip": 1.00434017, "balance_loss_mlp": 1.03764057, "epoch": 0.32556741319705396, "flos": 71518722347520.0, "grad_norm": 0.9152323597820615, "language_loss": 0.63928032, "learning_rate": 3.1514646318197965e-06, "loss": 0.66000795, "num_input_tokens_seen": 116274940, "router_z_loss_clip": 0.02490234, "router_z_loss_mlp": 0.28320312, "step": 5415, "time_per_iteration": 3.0100815296173096 }, { "auxiliary_loss_clip": 0.01130258, "auxiliary_loss_mlp": 0.0103428, "balance_loss_clip": 1.01896167, "balance_loss_mlp": 1.04718637, "epoch": 0.3256275364497219, "flos": 23732428838400.0, "grad_norm": 1.4347488752548379, "language_loss": 0.73856676, "learning_rate": 3.151146171224075e-06, "loss": 0.76021218, "num_input_tokens_seen": 116297300, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.83203125, "step": 5416, "time_per_iteration": 2.5136258602142334 }, { "auxiliary_loss_clip": 0.01066098, "auxiliary_loss_mlp": 0.01006792, "balance_loss_clip": 1.00446701, "balance_loss_mlp": 1.03757143, "epoch": 0.3256876597023899, "flos": 67289199891840.0, "grad_norm": 0.7992224005870542, "language_loss": 0.57976574, "learning_rate": 3.1508276669767757e-06, "loss": 0.60049468, "num_input_tokens_seen": 116362370, "router_z_loss_clip": 0.02319336, "router_z_loss_mlp": 0.28515625, "step": 5417, "time_per_iteration": 3.164811611175537 }, { "auxiliary_loss_clip": 0.01063633, "auxiliary_loss_mlp": 0.01004901, "balance_loss_clip": 1.0024693, "balance_loss_mlp": 1.03527927, "epoch": 0.32574778295505785, "flos": 71282323964160.0, "grad_norm": 0.8192386752633862, "language_loss": 0.63421297, "learning_rate": 3.150509119089975e-06, "loss": 0.65489829, "num_input_tokens_seen": 116430365, "router_z_loss_clip": 0.02429199, "router_z_loss_mlp": 0.28320312, "step": 5418, "time_per_iteration": 3.2258646488189697 }, { "auxiliary_loss_clip": 0.0113141, "auxiliary_loss_mlp": 0.01040862, "balance_loss_clip": 1.02580619, "balance_loss_mlp": 1.04790831, "epoch": 0.3258079062077258, "flos": 20776370238720.0, "grad_norm": 2.042993772334519, "language_loss": 0.69395441, "learning_rate": 3.1501905275757537e-06, "loss": 0.71567714, "num_input_tokens_seen": 116447525, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8359375, "step": 5419, "time_per_iteration": 2.489511013031006 }, { "auxiliary_loss_clip": 0.01132793, "auxiliary_loss_mlp": 0.01037597, "balance_loss_clip": 1.02131319, "balance_loss_mlp": 1.04828811, "epoch": 0.3258680294603938, "flos": 22235456603520.0, "grad_norm": 1.7100531814989608, "language_loss": 0.77300304, "learning_rate": 3.1498718924461926e-06, "loss": 0.794707, "num_input_tokens_seen": 116466310, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.84375, "step": 5420, "time_per_iteration": 2.484499931335449 }, { "auxiliary_loss_clip": 0.01132442, "auxiliary_loss_mlp": 0.01035206, "balance_loss_clip": 1.01960135, "balance_loss_mlp": 1.04704809, "epoch": 0.3259281527130618, "flos": 26979974305920.0, "grad_norm": 2.0113401556846786, "language_loss": 0.80140537, "learning_rate": 3.1495532137133736e-06, "loss": 0.82308185, "num_input_tokens_seen": 116487825, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8515625, "step": 5421, "time_per_iteration": 2.5410609245300293 }, { "auxiliary_loss_clip": 0.01127267, "auxiliary_loss_mlp": 0.01035595, "balance_loss_clip": 1.02086103, "balance_loss_mlp": 1.04649973, "epoch": 0.32598827596572977, "flos": 26214251149440.0, "grad_norm": 1.590847226670311, "language_loss": 0.75295925, "learning_rate": 3.149234491389381e-06, "loss": 0.77458787, "num_input_tokens_seen": 116509950, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.80859375, "step": 5422, "time_per_iteration": 2.5278334617614746 }, { "auxiliary_loss_clip": 0.01134542, "auxiliary_loss_mlp": 0.01039627, "balance_loss_clip": 1.0237726, "balance_loss_mlp": 1.04904008, "epoch": 0.32604839921839773, "flos": 17639752947840.0, "grad_norm": 2.1977370435211463, "language_loss": 0.62639034, "learning_rate": 3.1489157254863026e-06, "loss": 0.64813209, "num_input_tokens_seen": 116527695, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.85546875, "step": 5423, "time_per_iteration": 2.455848217010498 }, { "auxiliary_loss_clip": 0.01121854, "auxiliary_loss_mlp": 0.0103095, "balance_loss_clip": 1.01737237, "balance_loss_mlp": 1.04424059, "epoch": 0.3261085224710657, "flos": 23622721724160.0, "grad_norm": 1.6291407305836427, "language_loss": 0.7443077, "learning_rate": 3.148596916016224e-06, "loss": 0.76583576, "num_input_tokens_seen": 116547800, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7734375, "step": 5424, "time_per_iteration": 2.4953386783599854 }, { "auxiliary_loss_clip": 0.01128659, "auxiliary_loss_mlp": 0.01037215, "balance_loss_clip": 1.0231843, "balance_loss_mlp": 1.04773808, "epoch": 0.32616864572373366, "flos": 23260455106560.0, "grad_norm": 1.6127417615424822, "language_loss": 0.77138716, "learning_rate": 3.1482780629912355e-06, "loss": 0.79304588, "num_input_tokens_seen": 116568460, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8125, "step": 5425, "time_per_iteration": 4.056157112121582 }, { "auxiliary_loss_clip": 0.01134018, "auxiliary_loss_mlp": 0.0104241, "balance_loss_clip": 1.02518415, "balance_loss_mlp": 1.04658878, "epoch": 0.32622876897640163, "flos": 25593427457280.0, "grad_norm": 6.00765081859729, "language_loss": 0.78081024, "learning_rate": 3.147959166423428e-06, "loss": 0.80257457, "num_input_tokens_seen": 116588705, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.875, "step": 5426, "time_per_iteration": 2.507378578186035 }, { "auxiliary_loss_clip": 0.01130801, "auxiliary_loss_mlp": 0.01038853, "balance_loss_clip": 1.02248549, "balance_loss_mlp": 1.04695511, "epoch": 0.3262888922290696, "flos": 22418996123520.0, "grad_norm": 1.7186388684554186, "language_loss": 0.74649155, "learning_rate": 3.147640226324893e-06, "loss": 0.76818812, "num_input_tokens_seen": 116608845, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8359375, "step": 5427, "time_per_iteration": 2.4911742210388184 }, { "auxiliary_loss_clip": 0.01132757, "auxiliary_loss_mlp": 0.01042308, "balance_loss_clip": 1.02550006, "balance_loss_mlp": 1.04660749, "epoch": 0.32634901548173756, "flos": 19718908819200.0, "grad_norm": 1.7530794668678644, "language_loss": 0.78974128, "learning_rate": 3.1473212427077266e-06, "loss": 0.81149197, "num_input_tokens_seen": 116628145, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.859375, "step": 5428, "time_per_iteration": 3.8490822315216064 }, { "auxiliary_loss_clip": 0.01128981, "auxiliary_loss_mlp": 0.01038009, "balance_loss_clip": 1.02283359, "balance_loss_mlp": 1.04589093, "epoch": 0.3264091387344055, "flos": 16142924367360.0, "grad_norm": 2.029743430440299, "language_loss": 0.71140921, "learning_rate": 3.147002215584023e-06, "loss": 0.73307908, "num_input_tokens_seen": 116646920, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83203125, "step": 5429, "time_per_iteration": 3.832866668701172 }, { "auxiliary_loss_clip": 0.011291, "auxiliary_loss_mlp": 0.01037067, "balance_loss_clip": 1.02278614, "balance_loss_mlp": 1.04702306, "epoch": 0.3264692619870735, "flos": 16399075230720.0, "grad_norm": 1.9138105330946846, "language_loss": 0.78793907, "learning_rate": 3.146683144965881e-06, "loss": 0.80960071, "num_input_tokens_seen": 116665100, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8203125, "step": 5430, "time_per_iteration": 3.9260916709899902 }, { "auxiliary_loss_clip": 0.01133825, "auxiliary_loss_mlp": 0.01039127, "balance_loss_clip": 1.02249706, "balance_loss_mlp": 1.04827309, "epoch": 0.32652938523974145, "flos": 22382331315840.0, "grad_norm": 3.865661532129935, "language_loss": 0.84355295, "learning_rate": 3.146364030865399e-06, "loss": 0.86528248, "num_input_tokens_seen": 116682205, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.85546875, "step": 5431, "time_per_iteration": 2.512371778488159 }, { "auxiliary_loss_clip": 0.01126963, "auxiliary_loss_mlp": 0.010351, "balance_loss_clip": 1.02021098, "balance_loss_mlp": 1.04517758, "epoch": 0.3265895084924094, "flos": 21908059113600.0, "grad_norm": 1.76366383916072, "language_loss": 0.70625377, "learning_rate": 3.146044873294678e-06, "loss": 0.72787446, "num_input_tokens_seen": 116702575, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.81640625, "step": 5432, "time_per_iteration": 2.469794273376465 }, { "auxiliary_loss_clip": 0.01127639, "auxiliary_loss_mlp": 0.01034576, "balance_loss_clip": 1.01993692, "balance_loss_mlp": 1.04390287, "epoch": 0.3266496317450774, "flos": 16067152627200.0, "grad_norm": 1.8281120740000312, "language_loss": 0.84271234, "learning_rate": 3.1457256722658203e-06, "loss": 0.86433446, "num_input_tokens_seen": 116720885, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8359375, "step": 5433, "time_per_iteration": 2.4668924808502197 }, { "auxiliary_loss_clip": 0.01127312, "auxiliary_loss_mlp": 0.01035932, "balance_loss_clip": 1.0209713, "balance_loss_mlp": 1.04709101, "epoch": 0.3267097549977454, "flos": 22528236360960.0, "grad_norm": 1.4049188129748005, "language_loss": 0.85720873, "learning_rate": 3.145406427790931e-06, "loss": 0.87884116, "num_input_tokens_seen": 116740395, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8046875, "step": 5434, "time_per_iteration": 2.5067520141601562 }, { "auxiliary_loss_clip": 0.01132137, "auxiliary_loss_mlp": 0.01037913, "balance_loss_clip": 1.02203441, "balance_loss_mlp": 1.04721713, "epoch": 0.32676987825041337, "flos": 27270419679360.0, "grad_norm": 3.4147558862146346, "language_loss": 0.87740362, "learning_rate": 3.1450871398821147e-06, "loss": 0.89910418, "num_input_tokens_seen": 116758870, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8515625, "step": 5435, "time_per_iteration": 2.5309207439422607 }, { "auxiliary_loss_clip": 0.01128609, "auxiliary_loss_mlp": 0.01035759, "balance_loss_clip": 1.02100122, "balance_loss_mlp": 1.04563761, "epoch": 0.32683000150308134, "flos": 11508257433600.0, "grad_norm": 2.479558014576162, "language_loss": 0.76034653, "learning_rate": 3.144767808551479e-06, "loss": 0.78199017, "num_input_tokens_seen": 116773440, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.828125, "step": 5436, "time_per_iteration": 2.4195752143859863 }, { "auxiliary_loss_clip": 0.01129046, "auxiliary_loss_mlp": 0.01035659, "balance_loss_clip": 1.02143097, "balance_loss_mlp": 1.04698908, "epoch": 0.3268901247557493, "flos": 25630200005760.0, "grad_norm": 1.7234316001615133, "language_loss": 0.71820903, "learning_rate": 3.144448433811134e-06, "loss": 0.73985612, "num_input_tokens_seen": 116794375, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8203125, "step": 5437, "time_per_iteration": 2.534250020980835 }, { "auxiliary_loss_clip": 0.01132392, "auxiliary_loss_mlp": 0.01043729, "balance_loss_clip": 1.02664673, "balance_loss_mlp": 1.04572272, "epoch": 0.32695024800841727, "flos": 24860849575680.0, "grad_norm": 1.7471261838820398, "language_loss": 0.64022505, "learning_rate": 3.144129015673189e-06, "loss": 0.66198623, "num_input_tokens_seen": 116815095, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.8671875, "step": 5438, "time_per_iteration": 2.502086877822876 }, { "auxiliary_loss_clip": 0.01131151, "auxiliary_loss_mlp": 0.01035926, "balance_loss_clip": 1.02054822, "balance_loss_mlp": 1.0484314, "epoch": 0.32701037126108523, "flos": 28839249072000.0, "grad_norm": 1.680309212293446, "language_loss": 0.74405217, "learning_rate": 3.1438095541497576e-06, "loss": 0.76572299, "num_input_tokens_seen": 116836630, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.828125, "step": 5439, "time_per_iteration": 2.5622782707214355 }, { "auxiliary_loss_clip": 0.01134363, "auxiliary_loss_mlp": 0.01045575, "balance_loss_clip": 1.0300777, "balance_loss_mlp": 1.05044591, "epoch": 0.3270704945137532, "flos": 27965075777280.0, "grad_norm": 1.8992985533486269, "language_loss": 0.74661827, "learning_rate": 3.1434900492529527e-06, "loss": 0.76841772, "num_input_tokens_seen": 116856880, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.83984375, "step": 5440, "time_per_iteration": 2.5115957260131836 }, { "auxiliary_loss_clip": 0.01128623, "auxiliary_loss_mlp": 0.01042839, "balance_loss_clip": 1.02840233, "balance_loss_mlp": 1.04698324, "epoch": 0.32713061776642116, "flos": 23690700213120.0, "grad_norm": 2.207423490385492, "language_loss": 0.84513652, "learning_rate": 3.1431705009948914e-06, "loss": 0.86685121, "num_input_tokens_seen": 116873770, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.81640625, "step": 5441, "time_per_iteration": 2.5379152297973633 }, { "auxiliary_loss_clip": 0.01132525, "auxiliary_loss_mlp": 0.01044931, "balance_loss_clip": 1.02857542, "balance_loss_mlp": 1.04760003, "epoch": 0.3271907410190891, "flos": 22455625017600.0, "grad_norm": 2.1970862382661, "language_loss": 0.86507201, "learning_rate": 3.1428509093876897e-06, "loss": 0.8868466, "num_input_tokens_seen": 116891225, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8515625, "step": 5442, "time_per_iteration": 2.464324951171875 }, { "auxiliary_loss_clip": 0.01135443, "auxiliary_loss_mlp": 0.01035833, "balance_loss_clip": 1.01964426, "balance_loss_mlp": 1.04950535, "epoch": 0.3272508642717571, "flos": 22820118278400.0, "grad_norm": 1.7575338777976106, "language_loss": 0.77364528, "learning_rate": 3.1425312744434668e-06, "loss": 0.795358, "num_input_tokens_seen": 116912300, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.859375, "step": 5443, "time_per_iteration": 2.526099443435669 }, { "auxiliary_loss_clip": 0.01132411, "auxiliary_loss_mlp": 0.01038918, "balance_loss_clip": 1.02308071, "balance_loss_mlp": 1.04644036, "epoch": 0.32731098752442506, "flos": 11801360413440.0, "grad_norm": 2.2864040900129226, "language_loss": 0.81542552, "learning_rate": 3.142211596174343e-06, "loss": 0.83713877, "num_input_tokens_seen": 116929425, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.859375, "step": 5444, "time_per_iteration": 2.4509990215301514 }, { "auxiliary_loss_clip": 0.0113231, "auxiliary_loss_mlp": 0.0103974, "balance_loss_clip": 1.02431452, "balance_loss_mlp": 1.04869425, "epoch": 0.327371110777093, "flos": 21027780506880.0, "grad_norm": 2.289779274682543, "language_loss": 0.59372801, "learning_rate": 3.1418918745924423e-06, "loss": 0.61544859, "num_input_tokens_seen": 116948255, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8359375, "step": 5445, "time_per_iteration": 2.4972245693206787 }, { "auxiliary_loss_clip": 0.01133688, "auxiliary_loss_mlp": 0.01038144, "balance_loss_clip": 1.02237892, "balance_loss_mlp": 1.04900908, "epoch": 0.327431234029761, "flos": 19062102677760.0, "grad_norm": 2.2481798059286686, "language_loss": 0.88589799, "learning_rate": 3.1415721097098865e-06, "loss": 0.90761626, "num_input_tokens_seen": 116964905, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84765625, "step": 5446, "time_per_iteration": 2.4710981845855713 }, { "auxiliary_loss_clip": 0.01141158, "auxiliary_loss_mlp": 0.01043377, "balance_loss_clip": 1.02581716, "balance_loss_mlp": 1.05250239, "epoch": 0.32749135728242895, "flos": 25849219184640.0, "grad_norm": 1.5651541875621335, "language_loss": 0.79154348, "learning_rate": 3.141252301538802e-06, "loss": 0.81338882, "num_input_tokens_seen": 116983650, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.88671875, "step": 5447, "time_per_iteration": 2.5523462295532227 }, { "auxiliary_loss_clip": 0.01132074, "auxiliary_loss_mlp": 0.01034219, "balance_loss_clip": 1.0200634, "balance_loss_mlp": 1.04826212, "epoch": 0.327551480535097, "flos": 20120533764480.0, "grad_norm": 1.7349623013173125, "language_loss": 0.73602867, "learning_rate": 3.1409324500913157e-06, "loss": 0.75769162, "num_input_tokens_seen": 117003265, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8359375, "step": 5448, "time_per_iteration": 2.4569568634033203 }, { "auxiliary_loss_clip": 0.01130879, "auxiliary_loss_mlp": 0.01038703, "balance_loss_clip": 1.02332497, "balance_loss_mlp": 1.0478133, "epoch": 0.32761160378776494, "flos": 28803553931520.0, "grad_norm": 1.5709318834260153, "language_loss": 0.66963357, "learning_rate": 3.1406125553795567e-06, "loss": 0.69132936, "num_input_tokens_seen": 117025370, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.83203125, "step": 5449, "time_per_iteration": 2.569791793823242 }, { "auxiliary_loss_clip": 0.01131947, "auxiliary_loss_mlp": 0.01038975, "balance_loss_clip": 1.02388358, "balance_loss_mlp": 1.04951501, "epoch": 0.3276717270404329, "flos": 26937778803840.0, "grad_norm": 1.7777810437104868, "language_loss": 0.65654993, "learning_rate": 3.1402926174156556e-06, "loss": 0.67825913, "num_input_tokens_seen": 117044350, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.82421875, "step": 5450, "time_per_iteration": 2.4968101978302 }, { "auxiliary_loss_clip": 0.01136309, "auxiliary_loss_mlp": 0.01038216, "balance_loss_clip": 1.02296948, "balance_loss_mlp": 1.05159116, "epoch": 0.32773185029310087, "flos": 25338425829120.0, "grad_norm": 1.8762120424047373, "language_loss": 0.77768123, "learning_rate": 3.1399726362117437e-06, "loss": 0.79942656, "num_input_tokens_seen": 117064450, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.84375, "step": 5451, "time_per_iteration": 2.553997278213501 }, { "auxiliary_loss_clip": 0.01136897, "auxiliary_loss_mlp": 0.01041059, "balance_loss_clip": 1.02475154, "balance_loss_mlp": 1.05169821, "epoch": 0.32779197354576883, "flos": 26391721271040.0, "grad_norm": 2.260392970404607, "language_loss": 0.70398414, "learning_rate": 3.1396526117799555e-06, "loss": 0.72576368, "num_input_tokens_seen": 117083060, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8515625, "step": 5452, "time_per_iteration": 2.501802921295166 }, { "auxiliary_loss_clip": 0.01131518, "auxiliary_loss_mlp": 0.01033692, "balance_loss_clip": 1.01845682, "balance_loss_mlp": 1.0501616, "epoch": 0.3278520967984368, "flos": 24899381890560.0, "grad_norm": 1.707159035838644, "language_loss": 0.79062927, "learning_rate": 3.1393325441324256e-06, "loss": 0.81228131, "num_input_tokens_seen": 117101860, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 5453, "time_per_iteration": 2.5392556190490723 }, { "auxiliary_loss_clip": 0.0113712, "auxiliary_loss_mlp": 0.01032353, "balance_loss_clip": 1.01742828, "balance_loss_mlp": 1.05296588, "epoch": 0.32791222005110476, "flos": 29752996176000.0, "grad_norm": 2.238099879480404, "language_loss": 0.75501406, "learning_rate": 3.1390124332812916e-06, "loss": 0.77670878, "num_input_tokens_seen": 117123100, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.83984375, "step": 5454, "time_per_iteration": 2.542097568511963 }, { "auxiliary_loss_clip": 0.01128305, "auxiliary_loss_mlp": 0.01034724, "balance_loss_clip": 1.0216347, "balance_loss_mlp": 1.04891109, "epoch": 0.32797234330377273, "flos": 16508064072960.0, "grad_norm": 1.7414524753043257, "language_loss": 0.76777482, "learning_rate": 3.1386922792386924e-06, "loss": 0.78940511, "num_input_tokens_seen": 117140515, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.79296875, "step": 5455, "time_per_iteration": 2.4584670066833496 }, { "auxiliary_loss_clip": 0.01138217, "auxiliary_loss_mlp": 0.01039581, "balance_loss_clip": 1.02319014, "balance_loss_mlp": 1.05170715, "epoch": 0.3280324665564407, "flos": 26577918397440.0, "grad_norm": 1.8711148810678337, "language_loss": 0.73741251, "learning_rate": 3.138372082016768e-06, "loss": 0.7591905, "num_input_tokens_seen": 117161485, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.86328125, "step": 5456, "time_per_iteration": 2.5353667736053467 }, { "auxiliary_loss_clip": 0.0113591, "auxiliary_loss_mlp": 0.01040292, "balance_loss_clip": 1.02488995, "balance_loss_mlp": 1.05124617, "epoch": 0.32809258980910866, "flos": 22929969047040.0, "grad_norm": 1.469415753121963, "language_loss": 0.78299654, "learning_rate": 3.1380518416276596e-06, "loss": 0.80475855, "num_input_tokens_seen": 117181870, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84375, "step": 5457, "time_per_iteration": 2.498751640319824 }, { "auxiliary_loss_clip": 0.01137256, "auxiliary_loss_mlp": 0.01035924, "balance_loss_clip": 1.02095115, "balance_loss_mlp": 1.04968357, "epoch": 0.3281527130617766, "flos": 22783848520320.0, "grad_norm": 3.2158261039239466, "language_loss": 0.79098791, "learning_rate": 3.1377315580835115e-06, "loss": 0.8127197, "num_input_tokens_seen": 117201380, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.875, "step": 5458, "time_per_iteration": 2.4966979026794434 }, { "auxiliary_loss_clip": 0.01130834, "auxiliary_loss_mlp": 0.01036949, "balance_loss_clip": 1.02122557, "balance_loss_mlp": 1.04840517, "epoch": 0.3282128363144446, "flos": 21250678354560.0, "grad_norm": 2.094084311218433, "language_loss": 0.73104787, "learning_rate": 3.1374112313964686e-06, "loss": 0.75272572, "num_input_tokens_seen": 117221040, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.82421875, "step": 5459, "time_per_iteration": 2.5225987434387207 }, { "auxiliary_loss_clip": 0.01137373, "auxiliary_loss_mlp": 0.01039482, "balance_loss_clip": 1.02420497, "balance_loss_mlp": 1.05225182, "epoch": 0.32827295956711255, "flos": 30843064166400.0, "grad_norm": 2.1600248249759746, "language_loss": 0.83903801, "learning_rate": 3.1370908615786783e-06, "loss": 0.86080652, "num_input_tokens_seen": 117241395, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8515625, "step": 5460, "time_per_iteration": 2.5421535968780518 }, { "auxiliary_loss_clip": 0.01132397, "auxiliary_loss_mlp": 0.01035455, "balance_loss_clip": 1.02031553, "balance_loss_mlp": 1.04880095, "epoch": 0.3283330828197806, "flos": 25915006944000.0, "grad_norm": 3.4531173333574503, "language_loss": 0.76771283, "learning_rate": 3.136770448642288e-06, "loss": 0.7893914, "num_input_tokens_seen": 117259340, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8359375, "step": 5461, "time_per_iteration": 2.5199546813964844 }, { "auxiliary_loss_clip": 0.01136387, "auxiliary_loss_mlp": 0.01032123, "balance_loss_clip": 1.01519573, "balance_loss_mlp": 1.05173647, "epoch": 0.32839320607244854, "flos": 38582065042560.0, "grad_norm": 1.9234097363923657, "language_loss": 0.62748706, "learning_rate": 3.1364499925994484e-06, "loss": 0.64917219, "num_input_tokens_seen": 117282375, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.84375, "step": 5462, "time_per_iteration": 2.620256185531616 }, { "auxiliary_loss_clip": 0.0113145, "auxiliary_loss_mlp": 0.0103414, "balance_loss_clip": 1.02026439, "balance_loss_mlp": 1.04953969, "epoch": 0.3284533293251165, "flos": 26650888876800.0, "grad_norm": 1.4149899440163873, "language_loss": 0.78369999, "learning_rate": 3.1361294934623115e-06, "loss": 0.80535591, "num_input_tokens_seen": 117303830, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.8203125, "step": 5463, "time_per_iteration": 2.5412347316741943 }, { "auxiliary_loss_clip": 0.01135452, "auxiliary_loss_mlp": 0.01038039, "balance_loss_clip": 1.02192235, "balance_loss_mlp": 1.05104971, "epoch": 0.32851345257778447, "flos": 15304158904320.0, "grad_norm": 1.9145354122687568, "language_loss": 0.69525135, "learning_rate": 3.1358089512430303e-06, "loss": 0.7169863, "num_input_tokens_seen": 117320665, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.84375, "step": 5464, "time_per_iteration": 2.452070951461792 }, { "auxiliary_loss_clip": 0.01132365, "auxiliary_loss_mlp": 0.0103846, "balance_loss_clip": 1.02297437, "balance_loss_mlp": 1.0510633, "epoch": 0.32857357583045244, "flos": 23513732881920.0, "grad_norm": 6.050971883948604, "language_loss": 0.72577435, "learning_rate": 3.1354883659537594e-06, "loss": 0.7474826, "num_input_tokens_seen": 117339795, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8125, "step": 5465, "time_per_iteration": 2.502480983734131 }, { "auxiliary_loss_clip": 0.01135056, "auxiliary_loss_mlp": 0.01036244, "balance_loss_clip": 1.02155113, "balance_loss_mlp": 1.05223966, "epoch": 0.3286336990831204, "flos": 20995209849600.0, "grad_norm": 1.990325900505742, "language_loss": 0.82705814, "learning_rate": 3.1351677376066567e-06, "loss": 0.8487711, "num_input_tokens_seen": 117359525, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.828125, "step": 5466, "time_per_iteration": 2.462747097015381 }, { "auxiliary_loss_clip": 0.01132268, "auxiliary_loss_mlp": 0.01037872, "balance_loss_clip": 1.02286315, "balance_loss_mlp": 1.04842877, "epoch": 0.32869382233578837, "flos": 23658811914240.0, "grad_norm": 2.0569356087200106, "language_loss": 0.79376626, "learning_rate": 3.134847066213879e-06, "loss": 0.81546766, "num_input_tokens_seen": 117380320, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.83984375, "step": 5467, "time_per_iteration": 3.970270872116089 }, { "auxiliary_loss_clip": 0.01133878, "auxiliary_loss_mlp": 0.01033522, "balance_loss_clip": 1.01806045, "balance_loss_mlp": 1.04907835, "epoch": 0.32875394558845633, "flos": 25336522408320.0, "grad_norm": 1.622179670742462, "language_loss": 0.74812949, "learning_rate": 3.134526351787587e-06, "loss": 0.7698034, "num_input_tokens_seen": 117400695, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84765625, "step": 5468, "time_per_iteration": 2.5181005001068115 }, { "auxiliary_loss_clip": 0.01141776, "auxiliary_loss_mlp": 0.0103695, "balance_loss_clip": 1.01984382, "balance_loss_mlp": 1.05318522, "epoch": 0.3288140688411243, "flos": 14903108576640.0, "grad_norm": 1.904431905500346, "language_loss": 0.78386915, "learning_rate": 3.134205594339942e-06, "loss": 0.80565643, "num_input_tokens_seen": 117418800, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.88671875, "step": 5469, "time_per_iteration": 2.4406538009643555 }, { "auxiliary_loss_clip": 0.01132702, "auxiliary_loss_mlp": 0.01037526, "balance_loss_clip": 1.02295864, "balance_loss_mlp": 1.04813015, "epoch": 0.32887419209379226, "flos": 18551345235840.0, "grad_norm": 3.136597687056511, "language_loss": 0.82129014, "learning_rate": 3.133884793883107e-06, "loss": 0.84299237, "num_input_tokens_seen": 117438220, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.84375, "step": 5470, "time_per_iteration": 5.225810766220093 }, { "auxiliary_loss_clip": 0.0113383, "auxiliary_loss_mlp": 0.01041376, "balance_loss_clip": 1.02573597, "balance_loss_mlp": 1.04761291, "epoch": 0.3289343153464602, "flos": 48105610439040.0, "grad_norm": 1.8799122341914072, "language_loss": 0.67366242, "learning_rate": 3.1335639504292478e-06, "loss": 0.69541454, "num_input_tokens_seen": 117462560, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.859375, "step": 5471, "time_per_iteration": 2.735111951828003 }, { "auxiliary_loss_clip": 0.01137153, "auxiliary_loss_mlp": 0.01045468, "balance_loss_clip": 1.02767015, "balance_loss_mlp": 1.04862869, "epoch": 0.3289944385991282, "flos": 27600295207680.0, "grad_norm": 1.8455207947914276, "language_loss": 0.64954233, "learning_rate": 3.1332430639905288e-06, "loss": 0.67136848, "num_input_tokens_seen": 117483665, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.88671875, "step": 5472, "time_per_iteration": 3.973836898803711 }, { "auxiliary_loss_clip": 0.01136865, "auxiliary_loss_mlp": 0.0104255, "balance_loss_clip": 1.02565789, "balance_loss_mlp": 1.05088425, "epoch": 0.32905456185179616, "flos": 20120318282880.0, "grad_norm": 1.7255407583836577, "language_loss": 0.88050997, "learning_rate": 3.13292213457912e-06, "loss": 0.90230417, "num_input_tokens_seen": 117503565, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.859375, "step": 5473, "time_per_iteration": 2.467001438140869 }, { "auxiliary_loss_clip": 0.01138043, "auxiliary_loss_mlp": 0.01042459, "balance_loss_clip": 1.02479172, "balance_loss_mlp": 1.05119085, "epoch": 0.3291146851044642, "flos": 23180230080000.0, "grad_norm": 8.40222217331198, "language_loss": 0.78145862, "learning_rate": 3.1326011622071903e-06, "loss": 0.80326366, "num_input_tokens_seen": 117521460, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.8671875, "step": 5474, "time_per_iteration": 2.47285532951355 }, { "auxiliary_loss_clip": 0.01067461, "auxiliary_loss_mlp": 0.01014451, "balance_loss_clip": 1.01220989, "balance_loss_mlp": 1.03912306, "epoch": 0.32917480835713214, "flos": 67621912594560.0, "grad_norm": 0.8484464579696617, "language_loss": 0.60284531, "learning_rate": 3.132280146886911e-06, "loss": 0.62366444, "num_input_tokens_seen": 117580550, "router_z_loss_clip": 0.02246094, "router_z_loss_mlp": 0.28320312, "step": 5475, "time_per_iteration": 3.057631492614746 }, { "auxiliary_loss_clip": 0.01136789, "auxiliary_loss_mlp": 0.01043911, "balance_loss_clip": 1.02678061, "balance_loss_mlp": 1.04728174, "epoch": 0.3292349316098001, "flos": 27964537073280.0, "grad_norm": 2.6769831685883068, "language_loss": 0.7704398, "learning_rate": 3.131959088630455e-06, "loss": 0.79224682, "num_input_tokens_seen": 117600645, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.89453125, "step": 5476, "time_per_iteration": 2.5333926677703857 }, { "auxiliary_loss_clip": 0.01135067, "auxiliary_loss_mlp": 0.01047823, "balance_loss_clip": 1.03223085, "balance_loss_mlp": 1.05025804, "epoch": 0.3292950548624681, "flos": 20263673462400.0, "grad_norm": 2.1808468768057465, "language_loss": 0.74493492, "learning_rate": 3.131637987449997e-06, "loss": 0.76676381, "num_input_tokens_seen": 117618880, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.84765625, "step": 5477, "time_per_iteration": 2.4561235904693604 }, { "auxiliary_loss_clip": 0.0112827, "auxiliary_loss_mlp": 0.01036249, "balance_loss_clip": 1.02155066, "balance_loss_mlp": 1.04701865, "epoch": 0.32935517811513604, "flos": 20812999132800.0, "grad_norm": 2.3616785937481524, "language_loss": 0.75575382, "learning_rate": 3.131316843357713e-06, "loss": 0.77739894, "num_input_tokens_seen": 117636445, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8125, "step": 5478, "time_per_iteration": 2.477618932723999 }, { "auxiliary_loss_clip": 0.01131025, "auxiliary_loss_mlp": 0.01041462, "balance_loss_clip": 1.02703822, "balance_loss_mlp": 1.0483427, "epoch": 0.329415301367804, "flos": 18441853603200.0, "grad_norm": 1.8227689459860095, "language_loss": 0.80333805, "learning_rate": 3.1309956563657807e-06, "loss": 0.82506287, "num_input_tokens_seen": 117653105, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.828125, "step": 5479, "time_per_iteration": 2.423996925354004 }, { "auxiliary_loss_clip": 0.01064914, "auxiliary_loss_mlp": 0.01003729, "balance_loss_clip": 1.00163138, "balance_loss_mlp": 1.03672624, "epoch": 0.32947542462047197, "flos": 66323024887680.0, "grad_norm": 0.7487402349817062, "language_loss": 0.56574845, "learning_rate": 3.1306744264863804e-06, "loss": 0.58643496, "num_input_tokens_seen": 117719225, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.28125, "step": 5480, "time_per_iteration": 3.1483287811279297 }, { "auxiliary_loss_clip": 0.01131545, "auxiliary_loss_mlp": 0.01045693, "balance_loss_clip": 1.0305655, "balance_loss_mlp": 1.04705167, "epoch": 0.32953554787313993, "flos": 23221599569280.0, "grad_norm": 3.8732568055141954, "language_loss": 0.7759726, "learning_rate": 3.1303531537316915e-06, "loss": 0.79774499, "num_input_tokens_seen": 117738725, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.84375, "step": 5481, "time_per_iteration": 2.4764604568481445 }, { "auxiliary_loss_clip": 0.01136006, "auxiliary_loss_mlp": 0.01046071, "balance_loss_clip": 1.03039467, "balance_loss_mlp": 1.05009866, "epoch": 0.3295956711258079, "flos": 27009492307200.0, "grad_norm": 1.7000324566083687, "language_loss": 0.78571421, "learning_rate": 3.130031838113899e-06, "loss": 0.80753499, "num_input_tokens_seen": 117757765, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.859375, "step": 5482, "time_per_iteration": 2.524866819381714 }, { "auxiliary_loss_clip": 0.01134309, "auxiliary_loss_mlp": 0.01046709, "balance_loss_clip": 1.03012085, "balance_loss_mlp": 1.04812276, "epoch": 0.32965579437847586, "flos": 19171702051200.0, "grad_norm": 1.7042306488857963, "language_loss": 0.73851717, "learning_rate": 3.129710479645185e-06, "loss": 0.76032734, "num_input_tokens_seen": 117776810, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.86328125, "step": 5483, "time_per_iteration": 2.450268268585205 }, { "auxiliary_loss_clip": 0.01134094, "auxiliary_loss_mlp": 0.01038852, "balance_loss_clip": 1.02499986, "balance_loss_mlp": 1.0499388, "epoch": 0.32971591763114383, "flos": 30482521401600.0, "grad_norm": 1.9772295456980566, "language_loss": 0.75153589, "learning_rate": 3.1293890783377366e-06, "loss": 0.77326536, "num_input_tokens_seen": 117797730, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.84375, "step": 5484, "time_per_iteration": 2.5695486068725586 }, { "auxiliary_loss_clip": 0.0113326, "auxiliary_loss_mlp": 0.01046677, "balance_loss_clip": 1.03029728, "balance_loss_mlp": 1.04970956, "epoch": 0.3297760408838118, "flos": 16289583598080.0, "grad_norm": 1.825223413525223, "language_loss": 0.71808928, "learning_rate": 3.129067634203742e-06, "loss": 0.73988867, "num_input_tokens_seen": 117815365, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8359375, "step": 5485, "time_per_iteration": 2.431777238845825 }, { "auxiliary_loss_clip": 0.01130828, "auxiliary_loss_mlp": 0.0104881, "balance_loss_clip": 1.03399205, "balance_loss_mlp": 1.04807639, "epoch": 0.32983616413647976, "flos": 29530924341120.0, "grad_norm": 1.687395915681828, "language_loss": 0.80246043, "learning_rate": 3.128746147255388e-06, "loss": 0.82425678, "num_input_tokens_seen": 117836095, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.828125, "step": 5486, "time_per_iteration": 2.539843797683716 }, { "auxiliary_loss_clip": 0.01128746, "auxiliary_loss_mlp": 0.01041114, "balance_loss_clip": 1.02646363, "balance_loss_mlp": 1.04688334, "epoch": 0.3298962873891478, "flos": 20631398947200.0, "grad_norm": 2.0474279171294048, "language_loss": 0.84609914, "learning_rate": 3.1284246175048683e-06, "loss": 0.86779773, "num_input_tokens_seen": 117854655, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.81640625, "step": 5487, "time_per_iteration": 2.445830821990967 }, { "auxiliary_loss_clip": 0.01139804, "auxiliary_loss_mlp": 0.01046596, "balance_loss_clip": 1.03001404, "balance_loss_mlp": 1.05174184, "epoch": 0.32995641064181574, "flos": 14976007228800.0, "grad_norm": 5.66886694787302, "language_loss": 0.74978703, "learning_rate": 3.1281030449643735e-06, "loss": 0.77165103, "num_input_tokens_seen": 117873300, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.87890625, "step": 5488, "time_per_iteration": 2.4436771869659424 }, { "auxiliary_loss_clip": 0.01133397, "auxiliary_loss_mlp": 0.01046828, "balance_loss_clip": 1.03172398, "balance_loss_mlp": 1.04949093, "epoch": 0.3300165338944837, "flos": 18661447399680.0, "grad_norm": 2.372186272896866, "language_loss": 0.72626269, "learning_rate": 3.127781429646098e-06, "loss": 0.74806499, "num_input_tokens_seen": 117891540, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.83984375, "step": 5489, "time_per_iteration": 2.4313814640045166 }, { "auxiliary_loss_clip": 0.01128548, "auxiliary_loss_mlp": 0.01043579, "balance_loss_clip": 1.02785528, "balance_loss_mlp": 1.044168, "epoch": 0.3300766571471517, "flos": 25583730785280.0, "grad_norm": 2.370838249747597, "language_loss": 0.88637042, "learning_rate": 3.127459771562238e-06, "loss": 0.90809178, "num_input_tokens_seen": 117907690, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.84375, "step": 5490, "time_per_iteration": 2.5099761486053467 }, { "auxiliary_loss_clip": 0.01127437, "auxiliary_loss_mlp": 0.01039162, "balance_loss_clip": 1.02409387, "balance_loss_mlp": 1.04409349, "epoch": 0.33013678039981964, "flos": 11363501623680.0, "grad_norm": 2.1012313409702723, "language_loss": 0.83039349, "learning_rate": 3.1271380707249907e-06, "loss": 0.85205948, "num_input_tokens_seen": 117925640, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.83203125, "step": 5491, "time_per_iteration": 2.4247591495513916 }, { "auxiliary_loss_clip": 0.01129978, "auxiliary_loss_mlp": 0.01042009, "balance_loss_clip": 1.02648807, "balance_loss_mlp": 1.0463444, "epoch": 0.3301969036524876, "flos": 24821203939200.0, "grad_norm": 1.8231161419199606, "language_loss": 0.77322006, "learning_rate": 3.126816327146554e-06, "loss": 0.79493988, "num_input_tokens_seen": 117944525, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8359375, "step": 5492, "time_per_iteration": 2.510770082473755 }, { "auxiliary_loss_clip": 0.01138835, "auxiliary_loss_mlp": 0.01047088, "balance_loss_clip": 1.03036261, "balance_loss_mlp": 1.05047011, "epoch": 0.33025702690515557, "flos": 15961144613760.0, "grad_norm": 2.748496305852494, "language_loss": 0.74319673, "learning_rate": 3.12649454083913e-06, "loss": 0.76505595, "num_input_tokens_seen": 117962515, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8828125, "step": 5493, "time_per_iteration": 2.439077138900757 }, { "auxiliary_loss_clip": 0.01060448, "auxiliary_loss_mlp": 0.01016432, "balance_loss_clip": 1.0143218, "balance_loss_mlp": 1.03208745, "epoch": 0.33031715015782354, "flos": 59416755989760.0, "grad_norm": 0.787112277542701, "language_loss": 0.53994238, "learning_rate": 3.12617271181492e-06, "loss": 0.56071121, "num_input_tokens_seen": 118018780, "router_z_loss_clip": 0.02111816, "router_z_loss_mlp": 0.28515625, "step": 5494, "time_per_iteration": 3.0616397857666016 }, { "auxiliary_loss_clip": 0.01130337, "auxiliary_loss_mlp": 0.01040342, "balance_loss_clip": 1.02490425, "balance_loss_mlp": 1.04761004, "epoch": 0.3303772734104915, "flos": 23184360144000.0, "grad_norm": 1.4747987237254272, "language_loss": 0.86722821, "learning_rate": 3.1258508400861276e-06, "loss": 0.88893497, "num_input_tokens_seen": 118038610, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.828125, "step": 5495, "time_per_iteration": 2.474031448364258 }, { "auxiliary_loss_clip": 0.0113368, "auxiliary_loss_mlp": 0.0104607, "balance_loss_clip": 1.02951169, "balance_loss_mlp": 1.04764557, "epoch": 0.33043739666315947, "flos": 33071896010880.0, "grad_norm": 2.620911036790278, "language_loss": 0.73133373, "learning_rate": 3.1255289256649587e-06, "loss": 0.75313115, "num_input_tokens_seen": 118055905, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.859375, "step": 5496, "time_per_iteration": 2.6035401821136475 }, { "auxiliary_loss_clip": 0.01128291, "auxiliary_loss_mlp": 0.01034043, "balance_loss_clip": 1.01969624, "balance_loss_mlp": 1.04577029, "epoch": 0.33049751991582743, "flos": 24895431394560.0, "grad_norm": 2.9108769228356643, "language_loss": 0.72637856, "learning_rate": 3.1252069685636196e-06, "loss": 0.74800193, "num_input_tokens_seen": 118073695, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.82421875, "step": 5497, "time_per_iteration": 2.4824442863464355 }, { "auxiliary_loss_clip": 0.01128975, "auxiliary_loss_mlp": 0.01033012, "balance_loss_clip": 1.01786005, "balance_loss_mlp": 1.04673767, "epoch": 0.3305576431684954, "flos": 29460575554560.0, "grad_norm": 1.9332173827283203, "language_loss": 0.79914129, "learning_rate": 3.124884968794321e-06, "loss": 0.82076108, "num_input_tokens_seen": 118094030, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8203125, "step": 5498, "time_per_iteration": 2.539001941680908 }, { "auxiliary_loss_clip": 0.01128905, "auxiliary_loss_mlp": 0.01038312, "balance_loss_clip": 1.02215958, "balance_loss_mlp": 1.04359436, "epoch": 0.33061776642116336, "flos": 22632305040000.0, "grad_norm": 2.3053096651170892, "language_loss": 0.75967097, "learning_rate": 3.12456292636927e-06, "loss": 0.7813431, "num_input_tokens_seen": 118111665, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.85546875, "step": 5499, "time_per_iteration": 2.4518444538116455 }, { "auxiliary_loss_clip": 0.01127826, "auxiliary_loss_mlp": 0.0103489, "balance_loss_clip": 1.01936913, "balance_loss_mlp": 1.04384387, "epoch": 0.3306778896738313, "flos": 25776320532480.0, "grad_norm": 1.6489646629431305, "language_loss": 0.79129195, "learning_rate": 3.124240841300681e-06, "loss": 0.81291914, "num_input_tokens_seen": 118132435, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.83984375, "step": 5500, "time_per_iteration": 2.507126808166504 }, { "auxiliary_loss_clip": 0.01134459, "auxiliary_loss_mlp": 0.01030537, "balance_loss_clip": 1.01414573, "balance_loss_mlp": 1.04852641, "epoch": 0.33073801292649935, "flos": 36940552479360.0, "grad_norm": 1.956608142629321, "language_loss": 0.66137487, "learning_rate": 3.1239187136007665e-06, "loss": 0.68302482, "num_input_tokens_seen": 118155255, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.859375, "step": 5501, "time_per_iteration": 2.594869375228882 }, { "auxiliary_loss_clip": 0.01133792, "auxiliary_loss_mlp": 0.01036894, "balance_loss_clip": 1.0206939, "balance_loss_mlp": 1.04708576, "epoch": 0.3307981361791673, "flos": 12967738848000.0, "grad_norm": 2.1834707912243503, "language_loss": 0.77022719, "learning_rate": 3.1235965432817417e-06, "loss": 0.79193407, "num_input_tokens_seen": 118169865, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8671875, "step": 5502, "time_per_iteration": 2.4503333568573 }, { "auxiliary_loss_clip": 0.01135771, "auxiliary_loss_mlp": 0.01035136, "balance_loss_clip": 1.01968634, "balance_loss_mlp": 1.04934311, "epoch": 0.3308582594318353, "flos": 25374372364800.0, "grad_norm": 1.704174575543469, "language_loss": 0.7216264, "learning_rate": 3.123274330355824e-06, "loss": 0.74333549, "num_input_tokens_seen": 118190760, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.86328125, "step": 5503, "time_per_iteration": 2.4868268966674805 }, { "auxiliary_loss_clip": 0.01130575, "auxiliary_loss_mlp": 0.01034935, "balance_loss_clip": 1.0189606, "balance_loss_mlp": 1.04524899, "epoch": 0.33091838268450324, "flos": 26468570419200.0, "grad_norm": 2.2315153790412476, "language_loss": 0.75192475, "learning_rate": 3.12295207483523e-06, "loss": 0.77357984, "num_input_tokens_seen": 118213620, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8515625, "step": 5504, "time_per_iteration": 2.529724359512329 }, { "auxiliary_loss_clip": 0.01132006, "auxiliary_loss_mlp": 0.01037084, "balance_loss_clip": 1.0214262, "balance_loss_mlp": 1.04714036, "epoch": 0.3309785059371712, "flos": 24971167221120.0, "grad_norm": 1.6512839532066175, "language_loss": 0.6965372, "learning_rate": 3.1226297767321816e-06, "loss": 0.7182281, "num_input_tokens_seen": 118235010, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.84765625, "step": 5505, "time_per_iteration": 2.505293130874634 }, { "auxiliary_loss_clip": 0.01131898, "auxiliary_loss_mlp": 0.01040114, "balance_loss_clip": 1.02505827, "balance_loss_mlp": 1.04917073, "epoch": 0.3310386291898392, "flos": 20446710192000.0, "grad_norm": 1.9230367679737403, "language_loss": 0.82015765, "learning_rate": 3.122307436058899e-06, "loss": 0.8418777, "num_input_tokens_seen": 118255820, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.828125, "step": 5506, "time_per_iteration": 2.510498523712158 }, { "auxiliary_loss_clip": 0.0113308, "auxiliary_loss_mlp": 0.01036745, "balance_loss_clip": 1.02081919, "balance_loss_mlp": 1.04848671, "epoch": 0.33109875244250714, "flos": 23182672204800.0, "grad_norm": 1.8224881495304568, "language_loss": 0.79638094, "learning_rate": 3.121985052827606e-06, "loss": 0.81807923, "num_input_tokens_seen": 118274160, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.84375, "step": 5507, "time_per_iteration": 2.4608192443847656 }, { "auxiliary_loss_clip": 0.01129261, "auxiliary_loss_mlp": 0.01044337, "balance_loss_clip": 1.02850604, "balance_loss_mlp": 1.04407859, "epoch": 0.3311588756951751, "flos": 24168384207360.0, "grad_norm": 1.554534098081972, "language_loss": 0.72212982, "learning_rate": 3.1216626270505274e-06, "loss": 0.74386585, "num_input_tokens_seen": 118294385, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8515625, "step": 5508, "time_per_iteration": 4.045572280883789 }, { "auxiliary_loss_clip": 0.01129707, "auxiliary_loss_mlp": 0.01034423, "balance_loss_clip": 1.01958156, "balance_loss_mlp": 1.04793382, "epoch": 0.33121899894784307, "flos": 28145742209280.0, "grad_norm": 2.3543554876128145, "language_loss": 0.71805906, "learning_rate": 3.12134015873989e-06, "loss": 0.73970038, "num_input_tokens_seen": 118313105, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 5509, "time_per_iteration": 2.5674917697906494 }, { "auxiliary_loss_clip": 0.01131306, "auxiliary_loss_mlp": 0.01031925, "balance_loss_clip": 1.01674998, "balance_loss_mlp": 1.04750061, "epoch": 0.33127912220051103, "flos": 29567660976000.0, "grad_norm": 1.5628424715469087, "language_loss": 0.73079771, "learning_rate": 3.121017647907921e-06, "loss": 0.75243008, "num_input_tokens_seen": 118335250, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8359375, "step": 5510, "time_per_iteration": 2.522237539291382 }, { "auxiliary_loss_clip": 0.01129328, "auxiliary_loss_mlp": 0.01036721, "balance_loss_clip": 1.02223742, "balance_loss_mlp": 1.04632592, "epoch": 0.331339245453179, "flos": 14428836374400.0, "grad_norm": 2.3215031632424115, "language_loss": 0.87685585, "learning_rate": 3.1206950945668508e-06, "loss": 0.8985163, "num_input_tokens_seen": 118351470, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.828125, "step": 5511, "time_per_iteration": 2.448133945465088 }, { "auxiliary_loss_clip": 0.01126493, "auxiliary_loss_mlp": 0.01034018, "balance_loss_clip": 1.02051747, "balance_loss_mlp": 1.04707694, "epoch": 0.33139936870584696, "flos": 20887118847360.0, "grad_norm": 4.285092808849524, "language_loss": 0.72921515, "learning_rate": 3.12037249872891e-06, "loss": 0.75082028, "num_input_tokens_seen": 118370970, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.796875, "step": 5512, "time_per_iteration": 5.387911558151245 }, { "auxiliary_loss_clip": 0.01129534, "auxiliary_loss_mlp": 0.01040088, "balance_loss_clip": 1.02502048, "balance_loss_mlp": 1.04732656, "epoch": 0.33145949195851493, "flos": 36284356869120.0, "grad_norm": 1.6821693506007278, "language_loss": 0.72220987, "learning_rate": 3.1200498604063317e-06, "loss": 0.74390614, "num_input_tokens_seen": 118393125, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8203125, "step": 5513, "time_per_iteration": 2.594388008117676 }, { "auxiliary_loss_clip": 0.01134693, "auxiliary_loss_mlp": 0.01039586, "balance_loss_clip": 1.02371907, "balance_loss_mlp": 1.04759455, "epoch": 0.33151961521118295, "flos": 14279735018880.0, "grad_norm": 2.3545471415630375, "language_loss": 0.68093127, "learning_rate": 3.1197271796113507e-06, "loss": 0.70267403, "num_input_tokens_seen": 118410860, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.87109375, "step": 5514, "time_per_iteration": 3.8849079608917236 }, { "auxiliary_loss_clip": 0.01134754, "auxiliary_loss_mlp": 0.01044905, "balance_loss_clip": 1.02697551, "balance_loss_mlp": 1.04843903, "epoch": 0.3315797384638509, "flos": 20774323163520.0, "grad_norm": 2.0044621170443566, "language_loss": 0.66013277, "learning_rate": 3.1194044563562026e-06, "loss": 0.68192935, "num_input_tokens_seen": 118429570, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.86328125, "step": 5515, "time_per_iteration": 2.47088885307312 }, { "auxiliary_loss_clip": 0.01131024, "auxiliary_loss_mlp": 0.01034609, "balance_loss_clip": 1.01868308, "balance_loss_mlp": 1.04569232, "epoch": 0.3316398617165189, "flos": 24679464871680.0, "grad_norm": 1.5266386282721154, "language_loss": 0.69304734, "learning_rate": 3.1190816906531257e-06, "loss": 0.71470368, "num_input_tokens_seen": 118450285, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8515625, "step": 5516, "time_per_iteration": 2.4864284992218018 }, { "auxiliary_loss_clip": 0.01133955, "auxiliary_loss_mlp": 0.0104118, "balance_loss_clip": 1.02538466, "balance_loss_mlp": 1.04620743, "epoch": 0.33169998496918685, "flos": 18587974129920.0, "grad_norm": 2.5024951101693693, "language_loss": 0.80779767, "learning_rate": 3.118758882514359e-06, "loss": 0.82954907, "num_input_tokens_seen": 118468270, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.87890625, "step": 5517, "time_per_iteration": 2.4261505603790283 }, { "auxiliary_loss_clip": 0.01127091, "auxiliary_loss_mlp": 0.01035243, "balance_loss_clip": 1.02073514, "balance_loss_mlp": 1.04517245, "epoch": 0.3317601082218548, "flos": 20193647898240.0, "grad_norm": 1.6896379198466982, "language_loss": 0.74617213, "learning_rate": 3.118436031952143e-06, "loss": 0.76779544, "num_input_tokens_seen": 118486615, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8203125, "step": 5518, "time_per_iteration": 2.469893455505371 }, { "auxiliary_loss_clip": 0.01058852, "auxiliary_loss_mlp": 0.01005593, "balance_loss_clip": 1.00339925, "balance_loss_mlp": 1.0314362, "epoch": 0.3318202314745228, "flos": 68974703637120.0, "grad_norm": 0.6247915674466403, "language_loss": 0.54390454, "learning_rate": 3.1181131389787206e-06, "loss": 0.56454903, "num_input_tokens_seen": 118553580, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.2734375, "step": 5519, "time_per_iteration": 3.1959967613220215 }, { "auxiliary_loss_clip": 0.01131263, "auxiliary_loss_mlp": 0.01037699, "balance_loss_clip": 1.02087855, "balance_loss_mlp": 1.04689288, "epoch": 0.33188035472719074, "flos": 21500113374720.0, "grad_norm": 2.4213026965056748, "language_loss": 0.79033625, "learning_rate": 3.117790203606336e-06, "loss": 0.8120259, "num_input_tokens_seen": 118570280, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.84375, "step": 5520, "time_per_iteration": 2.453308582305908 }, { "auxiliary_loss_clip": 0.01129486, "auxiliary_loss_mlp": 0.0103461, "balance_loss_clip": 1.01986337, "balance_loss_mlp": 1.0470345, "epoch": 0.3319404779798587, "flos": 28870490926080.0, "grad_norm": 2.122730402019589, "language_loss": 0.76495302, "learning_rate": 3.1174672258472344e-06, "loss": 0.78659403, "num_input_tokens_seen": 118590455, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.82421875, "step": 5521, "time_per_iteration": 2.571866273880005 }, { "auxiliary_loss_clip": 0.01132173, "auxiliary_loss_mlp": 0.01041884, "balance_loss_clip": 1.02542138, "balance_loss_mlp": 1.04515135, "epoch": 0.33200060123252667, "flos": 23076915586560.0, "grad_norm": 1.8479852306354334, "language_loss": 0.70179069, "learning_rate": 3.117144205713664e-06, "loss": 0.72353125, "num_input_tokens_seen": 118609495, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.87109375, "step": 5522, "time_per_iteration": 2.474299907684326 }, { "auxiliary_loss_clip": 0.01129093, "auxiliary_loss_mlp": 0.01036409, "balance_loss_clip": 1.02124548, "balance_loss_mlp": 1.0461812, "epoch": 0.33206072448519464, "flos": 21142479611520.0, "grad_norm": 1.771890486276015, "language_loss": 0.73904729, "learning_rate": 3.1168211432178735e-06, "loss": 0.76070231, "num_input_tokens_seen": 118628720, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.828125, "step": 5523, "time_per_iteration": 2.480776309967041 }, { "auxiliary_loss_clip": 0.01128579, "auxiliary_loss_mlp": 0.0103613, "balance_loss_clip": 1.02071083, "balance_loss_mlp": 1.04618406, "epoch": 0.3321208477378626, "flos": 13079097987840.0, "grad_norm": 1.7880846619284907, "language_loss": 0.81898451, "learning_rate": 3.116498038372114e-06, "loss": 0.8406316, "num_input_tokens_seen": 118645955, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.82421875, "step": 5524, "time_per_iteration": 2.435821533203125 }, { "auxiliary_loss_clip": 0.0112913, "auxiliary_loss_mlp": 0.01038034, "balance_loss_clip": 1.02370501, "balance_loss_mlp": 1.04619157, "epoch": 0.33218097099053057, "flos": 21215414177280.0, "grad_norm": 1.729814478772269, "language_loss": 0.82598269, "learning_rate": 3.116174891188636e-06, "loss": 0.84765434, "num_input_tokens_seen": 118665605, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.828125, "step": 5525, "time_per_iteration": 2.4801316261291504 }, { "auxiliary_loss_clip": 0.01057953, "auxiliary_loss_mlp": 0.00999127, "balance_loss_clip": 0.99704093, "balance_loss_mlp": 1.03064704, "epoch": 0.33224109424319853, "flos": 64348979189760.0, "grad_norm": 0.7771297619805941, "language_loss": 0.52686226, "learning_rate": 3.1158517016796945e-06, "loss": 0.54743314, "num_input_tokens_seen": 118728155, "router_z_loss_clip": 0.02087402, "router_z_loss_mlp": 0.2734375, "step": 5526, "time_per_iteration": 3.084139585494995 }, { "auxiliary_loss_clip": 0.01133676, "auxiliary_loss_mlp": 0.01039079, "balance_loss_clip": 1.0228312, "balance_loss_mlp": 1.04765701, "epoch": 0.33230121749586655, "flos": 17346003523200.0, "grad_norm": 2.3174366259328583, "language_loss": 0.78062004, "learning_rate": 3.1155284698575445e-06, "loss": 0.80234754, "num_input_tokens_seen": 118743955, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.859375, "step": 5527, "time_per_iteration": 2.450310468673706 }, { "auxiliary_loss_clip": 0.01132439, "auxiliary_loss_mlp": 0.01046188, "balance_loss_clip": 1.03098905, "balance_loss_mlp": 1.04838419, "epoch": 0.3323613407485345, "flos": 20997041443200.0, "grad_norm": 1.9094192759153374, "language_loss": 0.72098112, "learning_rate": 3.1152051957344434e-06, "loss": 0.74276745, "num_input_tokens_seen": 118763275, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83984375, "step": 5528, "time_per_iteration": 2.473377227783203 }, { "auxiliary_loss_clip": 0.01130665, "auxiliary_loss_mlp": 0.01032814, "balance_loss_clip": 1.01797819, "balance_loss_mlp": 1.04611027, "epoch": 0.3324214640012025, "flos": 13152535344000.0, "grad_norm": 1.6231416203828173, "language_loss": 0.8238939, "learning_rate": 3.1148818793226497e-06, "loss": 0.84552872, "num_input_tokens_seen": 118781110, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.84375, "step": 5529, "time_per_iteration": 2.4520506858825684 }, { "auxiliary_loss_clip": 0.01135846, "auxiliary_loss_mlp": 0.01035728, "balance_loss_clip": 1.0203619, "balance_loss_mlp": 1.0483129, "epoch": 0.33248158725387045, "flos": 22273522041600.0, "grad_norm": 1.9719108647093877, "language_loss": 0.69893563, "learning_rate": 3.114558520634423e-06, "loss": 0.72065133, "num_input_tokens_seen": 118800620, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.875, "step": 5530, "time_per_iteration": 2.4897267818450928 }, { "auxiliary_loss_clip": 0.01132506, "auxiliary_loss_mlp": 0.01041196, "balance_loss_clip": 1.02500772, "balance_loss_mlp": 1.0476532, "epoch": 0.3325417105065384, "flos": 20740998320640.0, "grad_norm": 2.4785270505695967, "language_loss": 0.7585113, "learning_rate": 3.1142351196820256e-06, "loss": 0.78024828, "num_input_tokens_seen": 118818725, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8515625, "step": 5531, "time_per_iteration": 2.4739274978637695 }, { "auxiliary_loss_clip": 0.01133547, "auxiliary_loss_mlp": 0.01040507, "balance_loss_clip": 1.02417529, "balance_loss_mlp": 1.04716647, "epoch": 0.3326018337592064, "flos": 24790536702720.0, "grad_norm": 1.7444973070341065, "language_loss": 0.73400539, "learning_rate": 3.1139116764777206e-06, "loss": 0.75574595, "num_input_tokens_seen": 118839390, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.86328125, "step": 5532, "time_per_iteration": 2.513410806655884 }, { "auxiliary_loss_clip": 0.01133074, "auxiliary_loss_mlp": 0.01032901, "balance_loss_clip": 1.01733792, "balance_loss_mlp": 1.04789853, "epoch": 0.33266195701187434, "flos": 14501699112960.0, "grad_norm": 3.3688706655474174, "language_loss": 0.65764517, "learning_rate": 3.1135881910337735e-06, "loss": 0.6793049, "num_input_tokens_seen": 118856275, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8515625, "step": 5533, "time_per_iteration": 2.4485905170440674 }, { "auxiliary_loss_clip": 0.01130475, "auxiliary_loss_mlp": 0.01039721, "balance_loss_clip": 1.02414012, "balance_loss_mlp": 1.04579079, "epoch": 0.3327220802645423, "flos": 15304410299520.0, "grad_norm": 2.3713470235795864, "language_loss": 0.71345246, "learning_rate": 3.113264663362451e-06, "loss": 0.73515445, "num_input_tokens_seen": 118873830, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.84375, "step": 5534, "time_per_iteration": 2.4534623622894287 }, { "auxiliary_loss_clip": 0.01128409, "auxiliary_loss_mlp": 0.01032924, "balance_loss_clip": 1.01758206, "balance_loss_mlp": 1.0463841, "epoch": 0.3327822035172103, "flos": 23477534951040.0, "grad_norm": 1.8846478166614111, "language_loss": 0.66999584, "learning_rate": 3.1129410934760204e-06, "loss": 0.6916092, "num_input_tokens_seen": 118891560, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8203125, "step": 5535, "time_per_iteration": 2.492504358291626 }, { "auxiliary_loss_clip": 0.01130095, "auxiliary_loss_mlp": 0.01039437, "balance_loss_clip": 1.02372503, "balance_loss_mlp": 1.04579949, "epoch": 0.33284232676987824, "flos": 25374516019200.0, "grad_norm": 1.9253070312256961, "language_loss": 0.73024315, "learning_rate": 3.1126174813867517e-06, "loss": 0.75193852, "num_input_tokens_seen": 118910260, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.84375, "step": 5536, "time_per_iteration": 2.5172855854034424 }, { "auxiliary_loss_clip": 0.01129426, "auxiliary_loss_mlp": 0.01039832, "balance_loss_clip": 1.02532387, "balance_loss_mlp": 1.04593194, "epoch": 0.3329024500225462, "flos": 23694363400320.0, "grad_norm": 1.599450616424259, "language_loss": 0.81572068, "learning_rate": 3.112293827106917e-06, "loss": 0.83741319, "num_input_tokens_seen": 118929985, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8359375, "step": 5537, "time_per_iteration": 2.489861488342285 }, { "auxiliary_loss_clip": 0.0113762, "auxiliary_loss_mlp": 0.01041326, "balance_loss_clip": 1.0256381, "balance_loss_mlp": 1.04985106, "epoch": 0.33296257327521417, "flos": 31723163205120.0, "grad_norm": 2.075901060063919, "language_loss": 0.71408653, "learning_rate": 3.111970130648789e-06, "loss": 0.73587602, "num_input_tokens_seen": 118951355, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.87890625, "step": 5538, "time_per_iteration": 2.550894021987915 }, { "auxiliary_loss_clip": 0.01129171, "auxiliary_loss_mlp": 0.01040676, "balance_loss_clip": 1.02526248, "balance_loss_mlp": 1.04623497, "epoch": 0.33302269652788213, "flos": 22744705674240.0, "grad_norm": 1.9957309050610856, "language_loss": 0.7469213, "learning_rate": 3.1116463920246424e-06, "loss": 0.76861978, "num_input_tokens_seen": 118970910, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.828125, "step": 5539, "time_per_iteration": 2.502763271331787 }, { "auxiliary_loss_clip": 0.01135483, "auxiliary_loss_mlp": 0.01046448, "balance_loss_clip": 1.0301764, "balance_loss_mlp": 1.04582191, "epoch": 0.33308281978055015, "flos": 11473747441920.0, "grad_norm": 2.2575552246241024, "language_loss": 0.71346718, "learning_rate": 3.1113226112467527e-06, "loss": 0.73528647, "num_input_tokens_seen": 118989200, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.89453125, "step": 5540, "time_per_iteration": 2.4371235370635986 }, { "auxiliary_loss_clip": 0.01126856, "auxiliary_loss_mlp": 0.01040429, "balance_loss_clip": 1.02520597, "balance_loss_mlp": 1.0429368, "epoch": 0.3331429430332181, "flos": 38213693112960.0, "grad_norm": 1.8586864356266637, "language_loss": 0.60576367, "learning_rate": 3.1109987883273983e-06, "loss": 0.62743652, "num_input_tokens_seen": 119011030, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83984375, "step": 5541, "time_per_iteration": 2.636749744415283 }, { "auxiliary_loss_clip": 0.01131918, "auxiliary_loss_mlp": 0.01046787, "balance_loss_clip": 1.03011012, "balance_loss_mlp": 1.04491949, "epoch": 0.3332030662858861, "flos": 22528667324160.0, "grad_norm": 1.9698509226813798, "language_loss": 0.6829046, "learning_rate": 3.1106749232788584e-06, "loss": 0.70469165, "num_input_tokens_seen": 119030620, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8671875, "step": 5542, "time_per_iteration": 2.4641995429992676 }, { "auxiliary_loss_clip": 0.01130108, "auxiliary_loss_mlp": 0.01036025, "balance_loss_clip": 1.02099288, "balance_loss_mlp": 1.0455941, "epoch": 0.33326318953855405, "flos": 15997773507840.0, "grad_norm": 1.841461071493568, "language_loss": 0.74967003, "learning_rate": 3.110351016113414e-06, "loss": 0.77133137, "num_input_tokens_seen": 119048015, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.84375, "step": 5543, "time_per_iteration": 2.448214530944824 }, { "auxiliary_loss_clip": 0.01135901, "auxiliary_loss_mlp": 0.01039527, "balance_loss_clip": 1.02401745, "balance_loss_mlp": 1.04929757, "epoch": 0.333323312791222, "flos": 25593535198080.0, "grad_norm": 1.8171519572428487, "language_loss": 0.75354594, "learning_rate": 3.110027066843348e-06, "loss": 0.7753002, "num_input_tokens_seen": 119066280, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8671875, "step": 5544, "time_per_iteration": 2.4971556663513184 }, { "auxiliary_loss_clip": 0.01126954, "auxiliary_loss_mlp": 0.01037751, "balance_loss_clip": 1.02244449, "balance_loss_mlp": 1.04344213, "epoch": 0.33338343604389, "flos": 25119550304640.0, "grad_norm": 1.5747543995652715, "language_loss": 0.70775902, "learning_rate": 3.1097030754809456e-06, "loss": 0.72940606, "num_input_tokens_seen": 119087680, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8359375, "step": 5545, "time_per_iteration": 2.5133697986602783 }, { "auxiliary_loss_clip": 0.0112833, "auxiliary_loss_mlp": 0.01039798, "balance_loss_clip": 1.02425361, "balance_loss_mlp": 1.04616678, "epoch": 0.33344355929655795, "flos": 16947287579520.0, "grad_norm": 1.7373793643824917, "language_loss": 0.69337177, "learning_rate": 3.1093790420384894e-06, "loss": 0.71505308, "num_input_tokens_seen": 119105820, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8203125, "step": 5546, "time_per_iteration": 2.4332058429718018 }, { "auxiliary_loss_clip": 0.01132562, "auxiliary_loss_mlp": 0.01035299, "balance_loss_clip": 1.01986122, "balance_loss_mlp": 1.04583991, "epoch": 0.3335036825492259, "flos": 27889591345920.0, "grad_norm": 2.024261889794473, "language_loss": 0.64973891, "learning_rate": 3.1090549665282702e-06, "loss": 0.67141753, "num_input_tokens_seen": 119126630, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8671875, "step": 5547, "time_per_iteration": 2.54767107963562 }, { "auxiliary_loss_clip": 0.01129999, "auxiliary_loss_mlp": 0.01037576, "balance_loss_clip": 1.02347994, "balance_loss_mlp": 1.04719329, "epoch": 0.3335638058018939, "flos": 16179553261440.0, "grad_norm": 4.734536165664258, "language_loss": 0.85995263, "learning_rate": 3.1087308489625742e-06, "loss": 0.88162839, "num_input_tokens_seen": 119143375, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.828125, "step": 5548, "time_per_iteration": 2.4564712047576904 }, { "auxiliary_loss_clip": 0.01132307, "auxiliary_loss_mlp": 0.01039283, "balance_loss_clip": 1.02302241, "balance_loss_mlp": 1.04578817, "epoch": 0.33362392905456184, "flos": 39896108288640.0, "grad_norm": 2.1257408349316846, "language_loss": 0.74503684, "learning_rate": 3.1084066893536945e-06, "loss": 0.76675278, "num_input_tokens_seen": 119166450, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8671875, "step": 5549, "time_per_iteration": 2.6552436351776123 }, { "auxiliary_loss_clip": 0.01133659, "auxiliary_loss_mlp": 0.01040683, "balance_loss_clip": 1.02369523, "balance_loss_mlp": 1.04737127, "epoch": 0.3336840523072298, "flos": 44271212567040.0, "grad_norm": 1.6387099478523954, "language_loss": 0.69136906, "learning_rate": 3.108082487713921e-06, "loss": 0.71311241, "num_input_tokens_seen": 119189645, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.86328125, "step": 5550, "time_per_iteration": 4.151496887207031 }, { "auxiliary_loss_clip": 0.01130052, "auxiliary_loss_mlp": 0.01039219, "balance_loss_clip": 1.02460361, "balance_loss_mlp": 1.04602861, "epoch": 0.33374417555989777, "flos": 15085678429440.0, "grad_norm": 1.769866750167474, "language_loss": 0.60407799, "learning_rate": 3.1077582440555495e-06, "loss": 0.62577069, "num_input_tokens_seen": 119208045, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.83984375, "step": 5551, "time_per_iteration": 2.5087318420410156 }, { "auxiliary_loss_clip": 0.01128637, "auxiliary_loss_mlp": 0.0104011, "balance_loss_clip": 1.02425504, "balance_loss_mlp": 1.04555893, "epoch": 0.33380429881256574, "flos": 15849174942720.0, "grad_norm": 2.047923477594718, "language_loss": 0.70886695, "learning_rate": 3.1074339583908746e-06, "loss": 0.73055434, "num_input_tokens_seen": 119224910, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83203125, "step": 5552, "time_per_iteration": 2.4535458087921143 }, { "auxiliary_loss_clip": 0.01131438, "auxiliary_loss_mlp": 0.01041836, "balance_loss_clip": 1.02700686, "balance_loss_mlp": 1.04637265, "epoch": 0.33386442206523376, "flos": 13480327883520.0, "grad_norm": 2.8134571656679697, "language_loss": 0.82397509, "learning_rate": 3.107109630732192e-06, "loss": 0.84570777, "num_input_tokens_seen": 119243290, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8515625, "step": 5553, "time_per_iteration": 3.8467907905578613 }, { "auxiliary_loss_clip": 0.01131395, "auxiliary_loss_mlp": 0.01043638, "balance_loss_clip": 1.02666283, "balance_loss_mlp": 1.0464108, "epoch": 0.3339245453179017, "flos": 16690669839360.0, "grad_norm": 2.0348179906213115, "language_loss": 0.80904078, "learning_rate": 3.1067852610918017e-06, "loss": 0.83079112, "num_input_tokens_seen": 119261195, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8515625, "step": 5554, "time_per_iteration": 3.798009157180786 }, { "auxiliary_loss_clip": 0.01130383, "auxiliary_loss_mlp": 0.01039253, "balance_loss_clip": 1.02424479, "balance_loss_mlp": 1.04568911, "epoch": 0.3339846685705697, "flos": 24610624456320.0, "grad_norm": 1.8640071320234675, "language_loss": 0.81707442, "learning_rate": 3.1064608494820032e-06, "loss": 0.83877075, "num_input_tokens_seen": 119282845, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.84765625, "step": 5555, "time_per_iteration": 3.962599754333496 }, { "auxiliary_loss_clip": 0.01126784, "auxiliary_loss_mlp": 0.01036572, "balance_loss_clip": 1.02182603, "balance_loss_mlp": 1.04352093, "epoch": 0.33404479182323765, "flos": 30953812775040.0, "grad_norm": 1.6290749686519563, "language_loss": 0.73889291, "learning_rate": 3.106136395915099e-06, "loss": 0.76052648, "num_input_tokens_seen": 119304430, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.83203125, "step": 5556, "time_per_iteration": 2.526236057281494 }, { "auxiliary_loss_clip": 0.01129079, "auxiliary_loss_mlp": 0.0103519, "balance_loss_clip": 1.0203898, "balance_loss_mlp": 1.04612327, "epoch": 0.3341049150759056, "flos": 23513301918720.0, "grad_norm": 1.629268819575655, "language_loss": 0.82315379, "learning_rate": 3.105811900403391e-06, "loss": 0.84479642, "num_input_tokens_seen": 119323830, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.828125, "step": 5557, "time_per_iteration": 2.5116662979125977 }, { "auxiliary_loss_clip": 0.01130676, "auxiliary_loss_mlp": 0.01037551, "balance_loss_clip": 1.02187502, "balance_loss_mlp": 1.04578912, "epoch": 0.3341650383285736, "flos": 24026824707840.0, "grad_norm": 5.043652227031437, "language_loss": 0.80430752, "learning_rate": 3.1054873629591855e-06, "loss": 0.82598978, "num_input_tokens_seen": 119346340, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.84765625, "step": 5558, "time_per_iteration": 2.5455615520477295 }, { "auxiliary_loss_clip": 0.01131037, "auxiliary_loss_mlp": 0.01035101, "balance_loss_clip": 1.0205102, "balance_loss_mlp": 1.04478478, "epoch": 0.33422516158124155, "flos": 24901967669760.0, "grad_norm": 1.611156285890901, "language_loss": 0.81366265, "learning_rate": 3.105162783594788e-06, "loss": 0.83532405, "num_input_tokens_seen": 119367285, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.859375, "step": 5559, "time_per_iteration": 2.536630630493164 }, { "auxiliary_loss_clip": 0.01127358, "auxiliary_loss_mlp": 0.01040576, "balance_loss_clip": 1.02560306, "balance_loss_mlp": 1.04414952, "epoch": 0.3342852848339095, "flos": 18333403464960.0, "grad_norm": 1.8110630537827692, "language_loss": 0.71971011, "learning_rate": 3.1048381623225074e-06, "loss": 0.74138939, "num_input_tokens_seen": 119385370, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.83203125, "step": 5560, "time_per_iteration": 2.4402523040771484 }, { "auxiliary_loss_clip": 0.01134691, "auxiliary_loss_mlp": 0.01042879, "balance_loss_clip": 1.02649999, "balance_loss_mlp": 1.04691219, "epoch": 0.3343454080865775, "flos": 30046530119040.0, "grad_norm": 1.4871643754045074, "language_loss": 0.75006229, "learning_rate": 3.1045134991546526e-06, "loss": 0.77183801, "num_input_tokens_seen": 119409150, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.87890625, "step": 5561, "time_per_iteration": 2.545541763305664 }, { "auxiliary_loss_clip": 0.01131416, "auxiliary_loss_mlp": 0.01038204, "balance_loss_clip": 1.02221799, "balance_loss_mlp": 1.0468154, "epoch": 0.33440553133924544, "flos": 16398823835520.0, "grad_norm": 1.7669946741542162, "language_loss": 0.69389021, "learning_rate": 3.1041887941035355e-06, "loss": 0.71558642, "num_input_tokens_seen": 119426475, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84765625, "step": 5562, "time_per_iteration": 2.439720630645752 }, { "auxiliary_loss_clip": 0.01127939, "auxiliary_loss_mlp": 0.0103939, "balance_loss_clip": 1.02534068, "balance_loss_mlp": 1.04454565, "epoch": 0.3344656545919134, "flos": 24242072958720.0, "grad_norm": 1.8908436379747906, "language_loss": 0.65157133, "learning_rate": 3.1038640471814685e-06, "loss": 0.6732446, "num_input_tokens_seen": 119446900, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8359375, "step": 5563, "time_per_iteration": 2.5140111446380615 }, { "auxiliary_loss_clip": 0.01131805, "auxiliary_loss_mlp": 0.01044863, "balance_loss_clip": 1.02862692, "balance_loss_mlp": 1.04542458, "epoch": 0.3345257778445814, "flos": 52118843149440.0, "grad_norm": 5.47589339537294, "language_loss": 0.74123549, "learning_rate": 3.103539258400766e-06, "loss": 0.76300216, "num_input_tokens_seen": 119470945, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.86328125, "step": 5564, "time_per_iteration": 2.726158857345581 }, { "auxiliary_loss_clip": 0.01048329, "auxiliary_loss_mlp": 0.0100664, "balance_loss_clip": 1.00453019, "balance_loss_mlp": 1.02074099, "epoch": 0.33458590109724934, "flos": 68048602254720.0, "grad_norm": 0.7778936415698177, "language_loss": 0.55467463, "learning_rate": 3.103214427773745e-06, "loss": 0.57522428, "num_input_tokens_seen": 119529925, "router_z_loss_clip": 0.02111816, "router_z_loss_mlp": 0.27539062, "step": 5565, "time_per_iteration": 3.057525634765625 }, { "auxiliary_loss_clip": 0.01128933, "auxiliary_loss_mlp": 0.01035048, "balance_loss_clip": 1.02076101, "balance_loss_mlp": 1.04672074, "epoch": 0.3346460243499173, "flos": 37414788768000.0, "grad_norm": 1.9336741508469708, "language_loss": 0.64634973, "learning_rate": 3.102889555312721e-06, "loss": 0.66798955, "num_input_tokens_seen": 119550700, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.82421875, "step": 5566, "time_per_iteration": 2.616384744644165 }, { "auxiliary_loss_clip": 0.01127356, "auxiliary_loss_mlp": 0.01039249, "balance_loss_clip": 1.02365077, "balance_loss_mlp": 1.04494917, "epoch": 0.3347061476025853, "flos": 18697358021760.0, "grad_norm": 2.1124868227723064, "language_loss": 0.77496123, "learning_rate": 3.102564641030016e-06, "loss": 0.79662728, "num_input_tokens_seen": 119569295, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.82421875, "step": 5567, "time_per_iteration": 2.451932668685913 }, { "auxiliary_loss_clip": 0.01131118, "auxiliary_loss_mlp": 0.01032107, "balance_loss_clip": 1.01633537, "balance_loss_mlp": 1.04656804, "epoch": 0.3347662708552533, "flos": 13917827537280.0, "grad_norm": 1.6249291627709945, "language_loss": 0.76819652, "learning_rate": 3.102239684937949e-06, "loss": 0.78982878, "num_input_tokens_seen": 119587375, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84375, "step": 5568, "time_per_iteration": 2.459087371826172 }, { "auxiliary_loss_clip": 0.01128988, "auxiliary_loss_mlp": 0.01041168, "balance_loss_clip": 1.02534282, "balance_loss_mlp": 1.04340982, "epoch": 0.33482639410792125, "flos": 19750402068480.0, "grad_norm": 2.6445522074146455, "language_loss": 0.70760977, "learning_rate": 3.101914687048842e-06, "loss": 0.72931135, "num_input_tokens_seen": 119604530, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.85546875, "step": 5569, "time_per_iteration": 2.46659255027771 }, { "auxiliary_loss_clip": 0.01127796, "auxiliary_loss_mlp": 0.01035466, "balance_loss_clip": 1.01925349, "balance_loss_mlp": 1.04199457, "epoch": 0.3348865173605892, "flos": 16102991422080.0, "grad_norm": 1.8915577642597017, "language_loss": 0.90050739, "learning_rate": 3.10158964737502e-06, "loss": 0.92214, "num_input_tokens_seen": 119621025, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.859375, "step": 5570, "time_per_iteration": 2.4788401126861572 }, { "auxiliary_loss_clip": 0.01128378, "auxiliary_loss_mlp": 0.0103386, "balance_loss_clip": 1.01852953, "balance_loss_mlp": 1.04497421, "epoch": 0.3349466406132572, "flos": 25008945350400.0, "grad_norm": 2.0358856275512283, "language_loss": 0.79791057, "learning_rate": 3.101264565928808e-06, "loss": 0.81953299, "num_input_tokens_seen": 119641725, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8359375, "step": 5571, "time_per_iteration": 2.5427045822143555 }, { "auxiliary_loss_clip": 0.01047903, "auxiliary_loss_mlp": 0.01000332, "balance_loss_clip": 0.99813873, "balance_loss_mlp": 1.02022076, "epoch": 0.33500676386592515, "flos": 54319991564160.0, "grad_norm": 0.9171654741107618, "language_loss": 0.56076133, "learning_rate": 3.1009394427225335e-06, "loss": 0.58124369, "num_input_tokens_seen": 119693560, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.27734375, "step": 5572, "time_per_iteration": 3.0599465370178223 }, { "auxiliary_loss_clip": 0.01132178, "auxiliary_loss_mlp": 0.01043553, "balance_loss_clip": 1.02850294, "balance_loss_mlp": 1.04878008, "epoch": 0.3350668871185931, "flos": 26797332625920.0, "grad_norm": 2.1615376641895168, "language_loss": 0.78224605, "learning_rate": 3.1006142777685257e-06, "loss": 0.80400336, "num_input_tokens_seen": 119712935, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.83203125, "step": 5573, "time_per_iteration": 2.523899793624878 }, { "auxiliary_loss_clip": 0.01131637, "auxiliary_loss_mlp": 0.0104447, "balance_loss_clip": 1.02782893, "balance_loss_mlp": 1.04678094, "epoch": 0.3351270103712611, "flos": 33510508986240.0, "grad_norm": 3.4614481379127935, "language_loss": 0.72774279, "learning_rate": 3.1002890710791133e-06, "loss": 0.74950385, "num_input_tokens_seen": 119731680, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.84765625, "step": 5574, "time_per_iteration": 2.572056531906128 }, { "auxiliary_loss_clip": 0.0112328, "auxiliary_loss_mlp": 0.01034062, "balance_loss_clip": 1.01888669, "balance_loss_mlp": 1.04217172, "epoch": 0.33518713362392905, "flos": 26506240807680.0, "grad_norm": 1.790321085123563, "language_loss": 0.87767673, "learning_rate": 3.0999638226666287e-06, "loss": 0.89925015, "num_input_tokens_seen": 119752155, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 5575, "time_per_iteration": 2.5042550563812256 }, { "auxiliary_loss_clip": 0.01134381, "auxiliary_loss_mlp": 0.01038824, "balance_loss_clip": 1.02186036, "balance_loss_mlp": 1.04599178, "epoch": 0.335247256876597, "flos": 17232345912960.0, "grad_norm": 2.6134981078859383, "language_loss": 0.82721716, "learning_rate": 3.0996385325434063e-06, "loss": 0.84894919, "num_input_tokens_seen": 119769195, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8828125, "step": 5576, "time_per_iteration": 2.4522955417633057 }, { "auxiliary_loss_clip": 0.01132847, "auxiliary_loss_mlp": 0.01040068, "balance_loss_clip": 1.02416587, "balance_loss_mlp": 1.04600656, "epoch": 0.335307380129265, "flos": 25629373992960.0, "grad_norm": 2.463027770750191, "language_loss": 0.72526902, "learning_rate": 3.0993132007217806e-06, "loss": 0.74699813, "num_input_tokens_seen": 119786810, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8671875, "step": 5577, "time_per_iteration": 2.5107109546661377 }, { "auxiliary_loss_clip": 0.01135415, "auxiliary_loss_mlp": 0.01038535, "balance_loss_clip": 1.02243018, "balance_loss_mlp": 1.05069959, "epoch": 0.33536750338193294, "flos": 19680089195520.0, "grad_norm": 1.714839539117415, "language_loss": 0.81895149, "learning_rate": 3.0989878272140883e-06, "loss": 0.84069091, "num_input_tokens_seen": 119805395, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.84375, "step": 5578, "time_per_iteration": 2.4622645378112793 }, { "auxiliary_loss_clip": 0.01124348, "auxiliary_loss_mlp": 0.010411, "balance_loss_clip": 1.02621102, "balance_loss_mlp": 1.04498196, "epoch": 0.3354276266346009, "flos": 18332613365760.0, "grad_norm": 1.868985947565243, "language_loss": 0.71741724, "learning_rate": 3.0986624120326676e-06, "loss": 0.73907167, "num_input_tokens_seen": 119823135, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.79296875, "step": 5579, "time_per_iteration": 2.4555299282073975 }, { "auxiliary_loss_clip": 0.01135079, "auxiliary_loss_mlp": 0.0103684, "balance_loss_clip": 1.02100956, "balance_loss_mlp": 1.04864311, "epoch": 0.3354877498872689, "flos": 17858556645120.0, "grad_norm": 2.365568283549464, "language_loss": 0.81493032, "learning_rate": 3.0983369551898573e-06, "loss": 0.83664942, "num_input_tokens_seen": 119842265, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8671875, "step": 5580, "time_per_iteration": 2.459831476211548 }, { "auxiliary_loss_clip": 0.01130366, "auxiliary_loss_mlp": 0.01032474, "balance_loss_clip": 1.01679802, "balance_loss_mlp": 1.04491329, "epoch": 0.3355478731399369, "flos": 24717745791360.0, "grad_norm": 2.3272486305295796, "language_loss": 0.78119302, "learning_rate": 3.0980114566980003e-06, "loss": 0.8028214, "num_input_tokens_seen": 119862500, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.85546875, "step": 5581, "time_per_iteration": 2.4985337257385254 }, { "auxiliary_loss_clip": 0.01133318, "auxiliary_loss_mlp": 0.01043529, "balance_loss_clip": 1.02633905, "balance_loss_mlp": 1.04472637, "epoch": 0.33560799639260486, "flos": 16873886136960.0, "grad_norm": 2.315198260824745, "language_loss": 0.74789554, "learning_rate": 3.0976859165694384e-06, "loss": 0.76966399, "num_input_tokens_seen": 119880160, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.88671875, "step": 5582, "time_per_iteration": 2.458491086959839 }, { "auxiliary_loss_clip": 0.01130097, "auxiliary_loss_mlp": 0.01047536, "balance_loss_clip": 1.03119254, "balance_loss_mlp": 1.04286718, "epoch": 0.3356681196452728, "flos": 18333511205760.0, "grad_norm": 4.415424483883326, "language_loss": 0.82458675, "learning_rate": 3.0973603348165166e-06, "loss": 0.84636307, "num_input_tokens_seen": 119899040, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.87109375, "step": 5583, "time_per_iteration": 2.4535388946533203 }, { "auxiliary_loss_clip": 0.0112834, "auxiliary_loss_mlp": 0.01042917, "balance_loss_clip": 1.02818251, "balance_loss_mlp": 1.0448029, "epoch": 0.3357282428979408, "flos": 34750612085760.0, "grad_norm": 1.7400878393657178, "language_loss": 0.77285928, "learning_rate": 3.097034711451581e-06, "loss": 0.79457188, "num_input_tokens_seen": 119921120, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8359375, "step": 5584, "time_per_iteration": 2.5912652015686035 }, { "auxiliary_loss_clip": 0.0113131, "auxiliary_loss_mlp": 0.01044649, "balance_loss_clip": 1.0290091, "balance_loss_mlp": 1.04434299, "epoch": 0.33578836615060875, "flos": 21580087006080.0, "grad_norm": 2.609571545663827, "language_loss": 0.76480293, "learning_rate": 3.0967090464869795e-06, "loss": 0.78656256, "num_input_tokens_seen": 119940165, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.87109375, "step": 5585, "time_per_iteration": 2.483898878097534 }, { "auxiliary_loss_clip": 0.01123071, "auxiliary_loss_mlp": 0.01038275, "balance_loss_clip": 1.02191925, "balance_loss_mlp": 1.04045248, "epoch": 0.3358484894032767, "flos": 24530291688960.0, "grad_norm": 1.5451477659449706, "language_loss": 0.77644652, "learning_rate": 3.0963833399350608e-06, "loss": 0.79805994, "num_input_tokens_seen": 119959730, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.82421875, "step": 5586, "time_per_iteration": 2.5241076946258545 }, { "auxiliary_loss_clip": 0.01137509, "auxiliary_loss_mlp": 0.01047622, "balance_loss_clip": 1.02934742, "balance_loss_mlp": 1.04868662, "epoch": 0.3359086126559447, "flos": 22455589104000.0, "grad_norm": 5.578845511078941, "language_loss": 0.80562192, "learning_rate": 3.0960575918081756e-06, "loss": 0.82747316, "num_input_tokens_seen": 119979315, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.88671875, "step": 5587, "time_per_iteration": 2.4896249771118164 }, { "auxiliary_loss_clip": 0.01126581, "auxiliary_loss_mlp": 0.01037368, "balance_loss_clip": 1.02343273, "balance_loss_mlp": 1.04595828, "epoch": 0.33596873590861265, "flos": 16543687386240.0, "grad_norm": 1.8872234145053965, "language_loss": 0.67218542, "learning_rate": 3.095731802118677e-06, "loss": 0.69382489, "num_input_tokens_seen": 119996140, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.8046875, "step": 5588, "time_per_iteration": 2.4517645835876465 }, { "auxiliary_loss_clip": 0.01131061, "auxiliary_loss_mlp": 0.01038831, "balance_loss_clip": 1.02178383, "balance_loss_mlp": 1.04585171, "epoch": 0.3360288591612806, "flos": 31175812782720.0, "grad_norm": 2.036020050608687, "language_loss": 0.70288241, "learning_rate": 3.095405970878919e-06, "loss": 0.72458136, "num_input_tokens_seen": 120017720, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8515625, "step": 5589, "time_per_iteration": 2.560655117034912 }, { "auxiliary_loss_clip": 0.0112999, "auxiliary_loss_mlp": 0.01037844, "balance_loss_clip": 1.02103603, "balance_loss_mlp": 1.04449797, "epoch": 0.3360889824139486, "flos": 23696913265920.0, "grad_norm": 2.8594081719713937, "language_loss": 0.66957378, "learning_rate": 3.0950800981012567e-06, "loss": 0.69125211, "num_input_tokens_seen": 120036335, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.85546875, "step": 5590, "time_per_iteration": 2.501987934112549 }, { "auxiliary_loss_clip": 0.01126134, "auxiliary_loss_mlp": 0.01037011, "balance_loss_clip": 1.02121568, "balance_loss_mlp": 1.04455173, "epoch": 0.33614910566661654, "flos": 19318109886720.0, "grad_norm": 2.2344547256463985, "language_loss": 0.73170996, "learning_rate": 3.094754183798047e-06, "loss": 0.75334138, "num_input_tokens_seen": 120056120, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.81640625, "step": 5591, "time_per_iteration": 2.491964101791382 }, { "auxiliary_loss_clip": 0.01127036, "auxiliary_loss_mlp": 0.01037723, "balance_loss_clip": 1.02283359, "balance_loss_mlp": 1.04402542, "epoch": 0.3362092289192845, "flos": 16472261191680.0, "grad_norm": 1.9070688660772044, "language_loss": 0.7021476, "learning_rate": 3.0944282279816493e-06, "loss": 0.72379524, "num_input_tokens_seen": 120073650, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.828125, "step": 5592, "time_per_iteration": 3.9167630672454834 }, { "auxiliary_loss_clip": 0.01128373, "auxiliary_loss_mlp": 0.01036807, "balance_loss_clip": 1.02163172, "balance_loss_mlp": 1.04576945, "epoch": 0.33626935217195253, "flos": 24243581329920.0, "grad_norm": 1.888662076071076, "language_loss": 0.76549876, "learning_rate": 3.094102230664423e-06, "loss": 0.78715062, "num_input_tokens_seen": 120093260, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.828125, "step": 5593, "time_per_iteration": 2.498857021331787 }, { "auxiliary_loss_clip": 0.01131798, "auxiliary_loss_mlp": 0.01037035, "balance_loss_clip": 1.01933205, "balance_loss_mlp": 1.04543579, "epoch": 0.3363294754246205, "flos": 19718765164800.0, "grad_norm": 2.3487162662417016, "language_loss": 0.72530514, "learning_rate": 3.093776191858731e-06, "loss": 0.74699342, "num_input_tokens_seen": 120111830, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.86328125, "step": 5594, "time_per_iteration": 2.469362735748291 }, { "auxiliary_loss_clip": 0.01136045, "auxiliary_loss_mlp": 0.01041774, "balance_loss_clip": 1.02531111, "balance_loss_mlp": 1.04915452, "epoch": 0.33638959867728846, "flos": 22596286677120.0, "grad_norm": 1.9560873097081024, "language_loss": 0.80045903, "learning_rate": 3.0934501115769363e-06, "loss": 0.82223719, "num_input_tokens_seen": 120130470, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8671875, "step": 5595, "time_per_iteration": 5.289881944656372 }, { "auxiliary_loss_clip": 0.01129826, "auxiliary_loss_mlp": 0.01033398, "balance_loss_clip": 1.01897407, "balance_loss_mlp": 1.04571486, "epoch": 0.3364497219299564, "flos": 20994742972800.0, "grad_norm": 1.8584615875777823, "language_loss": 0.81208551, "learning_rate": 3.0931239898314037e-06, "loss": 0.8337177, "num_input_tokens_seen": 120150735, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.84375, "step": 5596, "time_per_iteration": 2.483398199081421 }, { "auxiliary_loss_clip": 0.01130349, "auxiliary_loss_mlp": 0.01036575, "balance_loss_clip": 1.02234721, "balance_loss_mlp": 1.0460391, "epoch": 0.3365098451826244, "flos": 25228610974080.0, "grad_norm": 1.6547268516527305, "language_loss": 0.75834191, "learning_rate": 3.0927978266344995e-06, "loss": 0.78001118, "num_input_tokens_seen": 120173230, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.84375, "step": 5597, "time_per_iteration": 3.990950345993042 }, { "auxiliary_loss_clip": 0.01128494, "auxiliary_loss_mlp": 0.01036693, "balance_loss_clip": 1.02172029, "balance_loss_mlp": 1.04653895, "epoch": 0.33656996843529235, "flos": 24571697091840.0, "grad_norm": 1.8849258770324362, "language_loss": 0.78551722, "learning_rate": 3.0924716219985916e-06, "loss": 0.80716908, "num_input_tokens_seen": 120191860, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8203125, "step": 5598, "time_per_iteration": 2.506399393081665 }, { "auxiliary_loss_clip": 0.0113571, "auxiliary_loss_mlp": 0.0103863, "balance_loss_clip": 1.02180946, "balance_loss_mlp": 1.04646921, "epoch": 0.3366300916879603, "flos": 44091120752640.0, "grad_norm": 1.9339104887455882, "language_loss": 0.6452719, "learning_rate": 3.0921453759360514e-06, "loss": 0.66701531, "num_input_tokens_seen": 120219195, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.890625, "step": 5599, "time_per_iteration": 2.6692705154418945 }, { "auxiliary_loss_clip": 0.01139667, "auxiliary_loss_mlp": 0.01044048, "balance_loss_clip": 1.02654827, "balance_loss_mlp": 1.04950261, "epoch": 0.3366902149406283, "flos": 13879869840000.0, "grad_norm": 2.5687423389960786, "language_loss": 0.81877172, "learning_rate": 3.091819088459249e-06, "loss": 0.84060884, "num_input_tokens_seen": 120232950, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.90234375, "step": 5600, "time_per_iteration": 2.421933650970459 }, { "auxiliary_loss_clip": 0.01133313, "auxiliary_loss_mlp": 0.01043948, "balance_loss_clip": 1.02719939, "balance_loss_mlp": 1.04657269, "epoch": 0.33675033819329625, "flos": 16253098358400.0, "grad_norm": 2.2361589966360307, "language_loss": 0.83072251, "learning_rate": 3.0914927595805573e-06, "loss": 0.85249519, "num_input_tokens_seen": 120248865, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8671875, "step": 5601, "time_per_iteration": 2.4489753246307373 }, { "auxiliary_loss_clip": 0.01131115, "auxiliary_loss_mlp": 0.01036539, "balance_loss_clip": 1.02219796, "balance_loss_mlp": 1.05073845, "epoch": 0.3368104614459642, "flos": 17055809544960.0, "grad_norm": 1.9892248212158867, "language_loss": 0.83535373, "learning_rate": 3.0911663893123507e-06, "loss": 0.85703027, "num_input_tokens_seen": 120267820, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8046875, "step": 5602, "time_per_iteration": 2.463282823562622 }, { "auxiliary_loss_clip": 0.01135354, "auxiliary_loss_mlp": 0.01049825, "balance_loss_clip": 1.03393412, "balance_loss_mlp": 1.04952002, "epoch": 0.3368705846986322, "flos": 17858628472320.0, "grad_norm": 1.8455813248094084, "language_loss": 0.69174194, "learning_rate": 3.0908399776670048e-06, "loss": 0.71359372, "num_input_tokens_seen": 120286540, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.859375, "step": 5603, "time_per_iteration": 2.4657480716705322 }, { "auxiliary_loss_clip": 0.01138814, "auxiliary_loss_mlp": 0.01048003, "balance_loss_clip": 1.03195167, "balance_loss_mlp": 1.05124581, "epoch": 0.33693070795130015, "flos": 22929502170240.0, "grad_norm": 2.1337078381358645, "language_loss": 0.83013481, "learning_rate": 3.090513524656898e-06, "loss": 0.85200298, "num_input_tokens_seen": 120307305, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.875, "step": 5604, "time_per_iteration": 2.505023241043091 }, { "auxiliary_loss_clip": 0.01137147, "auxiliary_loss_mlp": 0.01045349, "balance_loss_clip": 1.02957809, "balance_loss_mlp": 1.05006289, "epoch": 0.3369908312039681, "flos": 22017443005440.0, "grad_norm": 1.9076283313310165, "language_loss": 0.73288786, "learning_rate": 3.090187030294409e-06, "loss": 0.75471282, "num_input_tokens_seen": 120327845, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.87109375, "step": 5605, "time_per_iteration": 2.5221338272094727 }, { "auxiliary_loss_clip": 0.01138271, "auxiliary_loss_mlp": 0.01043481, "balance_loss_clip": 1.02766168, "balance_loss_mlp": 1.04972386, "epoch": 0.33705095445663613, "flos": 11801970944640.0, "grad_norm": 3.195075217053191, "language_loss": 0.83607519, "learning_rate": 3.089860494591919e-06, "loss": 0.85789269, "num_input_tokens_seen": 120343255, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8828125, "step": 5606, "time_per_iteration": 2.458536148071289 }, { "auxiliary_loss_clip": 0.01131212, "auxiliary_loss_mlp": 0.01047627, "balance_loss_clip": 1.03255284, "balance_loss_mlp": 1.04583526, "epoch": 0.3371110777093041, "flos": 25046400257280.0, "grad_norm": 2.1653423571803474, "language_loss": 0.6790514, "learning_rate": 3.089533917561809e-06, "loss": 0.70083976, "num_input_tokens_seen": 120361745, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.85546875, "step": 5607, "time_per_iteration": 2.504286050796509 }, { "auxiliary_loss_clip": 0.01138032, "auxiliary_loss_mlp": 0.01044396, "balance_loss_clip": 1.0278976, "balance_loss_mlp": 1.05048215, "epoch": 0.33717120096197206, "flos": 26579031719040.0, "grad_norm": 2.6276459585127534, "language_loss": 0.70585716, "learning_rate": 3.089207299216464e-06, "loss": 0.72768152, "num_input_tokens_seen": 120380565, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.875, "step": 5608, "time_per_iteration": 2.548398017883301 }, { "auxiliary_loss_clip": 0.01133005, "auxiliary_loss_mlp": 0.01039628, "balance_loss_clip": 1.02420211, "balance_loss_mlp": 1.04740179, "epoch": 0.33723132421464, "flos": 15158541168000.0, "grad_norm": 16.93076714591991, "language_loss": 0.79073739, "learning_rate": 3.088880639568269e-06, "loss": 0.81246376, "num_input_tokens_seen": 120399235, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.85546875, "step": 5609, "time_per_iteration": 2.4708855152130127 }, { "auxiliary_loss_clip": 0.01137411, "auxiliary_loss_mlp": 0.01043156, "balance_loss_clip": 1.02591848, "balance_loss_mlp": 1.05088735, "epoch": 0.337291447467308, "flos": 23436093634560.0, "grad_norm": 1.825266027100034, "language_loss": 0.82597476, "learning_rate": 3.0885539386296114e-06, "loss": 0.84778047, "num_input_tokens_seen": 120420095, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8671875, "step": 5610, "time_per_iteration": 2.542353868484497 }, { "auxiliary_loss_clip": 0.01132181, "auxiliary_loss_mlp": 0.01043739, "balance_loss_clip": 1.02633476, "balance_loss_mlp": 1.04895663, "epoch": 0.33735157071997596, "flos": 17238163916160.0, "grad_norm": 1.9370231276568883, "language_loss": 0.82303727, "learning_rate": 3.088227196412879e-06, "loss": 0.84479642, "num_input_tokens_seen": 120437690, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.83203125, "step": 5611, "time_per_iteration": 2.4488940238952637 }, { "auxiliary_loss_clip": 0.0113584, "auxiliary_loss_mlp": 0.01047256, "balance_loss_clip": 1.02991164, "balance_loss_mlp": 1.05033469, "epoch": 0.3374116939726439, "flos": 28257388657920.0, "grad_norm": 2.2762803156327487, "language_loss": 0.79710811, "learning_rate": 3.0879004129304626e-06, "loss": 0.81893903, "num_input_tokens_seen": 120459240, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.85546875, "step": 5612, "time_per_iteration": 2.537111759185791 }, { "auxiliary_loss_clip": 0.01134136, "auxiliary_loss_mlp": 0.01042656, "balance_loss_clip": 1.02616906, "balance_loss_mlp": 1.04657936, "epoch": 0.3374718172253119, "flos": 35919396731520.0, "grad_norm": 2.772173899533971, "language_loss": 0.70530987, "learning_rate": 3.087573588194753e-06, "loss": 0.72707772, "num_input_tokens_seen": 120481090, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.875, "step": 5613, "time_per_iteration": 2.5950491428375244 }, { "auxiliary_loss_clip": 0.01137672, "auxiliary_loss_mlp": 0.01038998, "balance_loss_clip": 1.02205849, "balance_loss_mlp": 1.0499332, "epoch": 0.33753194047797985, "flos": 18186672407040.0, "grad_norm": 1.9409868306832458, "language_loss": 0.79445016, "learning_rate": 3.087246722218144e-06, "loss": 0.81621683, "num_input_tokens_seen": 120500045, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.875, "step": 5614, "time_per_iteration": 2.450982093811035 }, { "auxiliary_loss_clip": 0.01133797, "auxiliary_loss_mlp": 0.01042742, "balance_loss_clip": 1.025051, "balance_loss_mlp": 1.04743981, "epoch": 0.3375920637306478, "flos": 23148916398720.0, "grad_norm": 1.8930424241771135, "language_loss": 0.91424572, "learning_rate": 3.086919815013031e-06, "loss": 0.93601108, "num_input_tokens_seen": 120521125, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.86328125, "step": 5615, "time_per_iteration": 2.483729124069214 }, { "auxiliary_loss_clip": 0.011304, "auxiliary_loss_mlp": 0.01039515, "balance_loss_clip": 1.02442884, "balance_loss_mlp": 1.0465734, "epoch": 0.3376521869833158, "flos": 23112215677440.0, "grad_norm": 1.901070800703298, "language_loss": 0.81338894, "learning_rate": 3.086592866591809e-06, "loss": 0.83508813, "num_input_tokens_seen": 120539180, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8359375, "step": 5616, "time_per_iteration": 2.4889204502105713 }, { "auxiliary_loss_clip": 0.01138687, "auxiliary_loss_mlp": 0.01045697, "balance_loss_clip": 1.02705264, "balance_loss_mlp": 1.04855013, "epoch": 0.33771231023598375, "flos": 19274585581440.0, "grad_norm": 1.9581848494741088, "language_loss": 0.83962965, "learning_rate": 3.0862658769668774e-06, "loss": 0.86147344, "num_input_tokens_seen": 120556280, "router_z_loss_clip": 0.18652344, "router_z_loss_mlp": 0.8984375, "step": 5617, "time_per_iteration": 2.464773416519165 }, { "auxiliary_loss_clip": 0.0113397, "auxiliary_loss_mlp": 0.01042309, "balance_loss_clip": 1.02638233, "balance_loss_mlp": 1.04742908, "epoch": 0.3377724334886517, "flos": 18150187167360.0, "grad_norm": 1.4766000170146887, "language_loss": 0.80241251, "learning_rate": 3.0859388461506343e-06, "loss": 0.82417524, "num_input_tokens_seen": 120575395, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8671875, "step": 5618, "time_per_iteration": 2.4587159156799316 }, { "auxiliary_loss_clip": 0.01136745, "auxiliary_loss_mlp": 0.0103928, "balance_loss_clip": 1.02347255, "balance_loss_mlp": 1.04849696, "epoch": 0.3378325567413197, "flos": 25775997310080.0, "grad_norm": 1.9950023441348137, "language_loss": 0.70663381, "learning_rate": 3.085611774155481e-06, "loss": 0.72839403, "num_input_tokens_seen": 120596075, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8828125, "step": 5619, "time_per_iteration": 2.5056869983673096 }, { "auxiliary_loss_clip": 0.0113379, "auxiliary_loss_mlp": 0.01051611, "balance_loss_clip": 1.03569698, "balance_loss_mlp": 1.04832029, "epoch": 0.3378926799939877, "flos": 21317112558720.0, "grad_norm": 3.194101952641767, "language_loss": 0.70327842, "learning_rate": 3.085284660993821e-06, "loss": 0.72513241, "num_input_tokens_seen": 120614195, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.85546875, "step": 5620, "time_per_iteration": 2.495445489883423 }, { "auxiliary_loss_clip": 0.01134774, "auxiliary_loss_mlp": 0.01040573, "balance_loss_clip": 1.0253979, "balance_loss_mlp": 1.05007529, "epoch": 0.33795280324665566, "flos": 24900028335360.0, "grad_norm": 2.6908946020007782, "language_loss": 0.6825155, "learning_rate": 3.084957506678058e-06, "loss": 0.70426899, "num_input_tokens_seen": 120634475, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.84765625, "step": 5621, "time_per_iteration": 2.5391697883605957 }, { "auxiliary_loss_clip": 0.01131939, "auxiliary_loss_mlp": 0.01040781, "balance_loss_clip": 1.02576029, "balance_loss_mlp": 1.04790032, "epoch": 0.33801292649932363, "flos": 24753943722240.0, "grad_norm": 1.9071439373701304, "language_loss": 0.82658583, "learning_rate": 3.0846303112205975e-06, "loss": 0.84831303, "num_input_tokens_seen": 120654980, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.83984375, "step": 5622, "time_per_iteration": 2.524831533432007 }, { "auxiliary_loss_clip": 0.01132302, "auxiliary_loss_mlp": 0.01037919, "balance_loss_clip": 1.0227083, "balance_loss_mlp": 1.04807687, "epoch": 0.3380730497519916, "flos": 26723967096960.0, "grad_norm": 1.4481100021974889, "language_loss": 0.73573232, "learning_rate": 3.0843030746338464e-06, "loss": 0.75743449, "num_input_tokens_seen": 120676245, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.84375, "step": 5623, "time_per_iteration": 2.532804250717163 }, { "auxiliary_loss_clip": 0.01062768, "auxiliary_loss_mlp": 0.01009752, "balance_loss_clip": 1.00757062, "balance_loss_mlp": 1.0348748, "epoch": 0.33813317300465956, "flos": 70035756416640.0, "grad_norm": 0.7500659125373377, "language_loss": 0.54990256, "learning_rate": 3.083975796930215e-06, "loss": 0.57062781, "num_input_tokens_seen": 120741965, "router_z_loss_clip": 0.02185059, "router_z_loss_mlp": 0.27734375, "step": 5624, "time_per_iteration": 3.241513729095459 }, { "auxiliary_loss_clip": 0.01136723, "auxiliary_loss_mlp": 0.01039281, "balance_loss_clip": 1.02300942, "balance_loss_mlp": 1.05001116, "epoch": 0.3381932962573275, "flos": 24097317148800.0, "grad_norm": 3.690691832014119, "language_loss": 0.73091519, "learning_rate": 3.083648478122111e-06, "loss": 0.75267524, "num_input_tokens_seen": 120760410, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8671875, "step": 5625, "time_per_iteration": 2.5062880516052246 }, { "auxiliary_loss_clip": 0.01138722, "auxiliary_loss_mlp": 0.01038937, "balance_loss_clip": 1.02198565, "balance_loss_mlp": 1.04910195, "epoch": 0.3382534195099955, "flos": 19278248768640.0, "grad_norm": 2.0080654661170154, "language_loss": 0.70753658, "learning_rate": 3.0833211182219497e-06, "loss": 0.72931314, "num_input_tokens_seen": 120777705, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.89453125, "step": 5626, "time_per_iteration": 2.4540627002716064 }, { "auxiliary_loss_clip": 0.01133004, "auxiliary_loss_mlp": 0.01037074, "balance_loss_clip": 1.02073073, "balance_loss_mlp": 1.04925179, "epoch": 0.33831354276266346, "flos": 25226240676480.0, "grad_norm": 1.6192150415616837, "language_loss": 0.80733085, "learning_rate": 3.0829937172421425e-06, "loss": 0.82903159, "num_input_tokens_seen": 120798660, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8359375, "step": 5627, "time_per_iteration": 2.531536340713501 }, { "auxiliary_loss_clip": 0.01142678, "auxiliary_loss_mlp": 0.01048613, "balance_loss_clip": 1.03157783, "balance_loss_mlp": 1.0527885, "epoch": 0.3383736660153314, "flos": 23112000195840.0, "grad_norm": 1.7614505647954866, "language_loss": 0.8032732, "learning_rate": 3.0826662751951055e-06, "loss": 0.82518619, "num_input_tokens_seen": 120816705, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8984375, "step": 5628, "time_per_iteration": 2.4779529571533203 }, { "auxiliary_loss_clip": 0.01133806, "auxiliary_loss_mlp": 0.01034207, "balance_loss_clip": 1.01803005, "balance_loss_mlp": 1.04794168, "epoch": 0.3384337892679994, "flos": 23477139901440.0, "grad_norm": 2.1183917937209866, "language_loss": 0.77839476, "learning_rate": 3.082338792093254e-06, "loss": 0.80007488, "num_input_tokens_seen": 120835375, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.859375, "step": 5629, "time_per_iteration": 2.573059320449829 }, { "auxiliary_loss_clip": 0.01137261, "auxiliary_loss_mlp": 0.01046414, "balance_loss_clip": 1.02853251, "balance_loss_mlp": 1.04935884, "epoch": 0.33849391252066735, "flos": 19425805839360.0, "grad_norm": 1.865006203569391, "language_loss": 0.84516233, "learning_rate": 3.0820112679490074e-06, "loss": 0.86699915, "num_input_tokens_seen": 120854260, "router_z_loss_clip": 0.17871094, "router_z_loss_mlp": 0.87890625, "step": 5630, "time_per_iteration": 2.4720375537872314 }, { "auxiliary_loss_clip": 0.01137301, "auxiliary_loss_mlp": 0.01042208, "balance_loss_clip": 1.02691317, "balance_loss_mlp": 1.05041194, "epoch": 0.3385540357733353, "flos": 21064840364160.0, "grad_norm": 1.9108805788082102, "language_loss": 0.72034395, "learning_rate": 3.0816837027747857e-06, "loss": 0.74213898, "num_input_tokens_seen": 120871590, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8671875, "step": 5631, "time_per_iteration": 2.501457452774048 }, { "auxiliary_loss_clip": 0.0106212, "auxiliary_loss_mlp": 0.01009306, "balance_loss_clip": 1.00716043, "balance_loss_mlp": 1.03381586, "epoch": 0.3386141590260033, "flos": 69208013450880.0, "grad_norm": 0.8459186044153898, "language_loss": 0.56134105, "learning_rate": 3.0813560965830084e-06, "loss": 0.58205521, "num_input_tokens_seen": 120925550, "router_z_loss_clip": 0.02148438, "router_z_loss_mlp": 0.28125, "step": 5632, "time_per_iteration": 3.1689770221710205 }, { "auxiliary_loss_clip": 0.0113502, "auxiliary_loss_mlp": 0.01037432, "balance_loss_clip": 1.02076626, "balance_loss_mlp": 1.04871559, "epoch": 0.3386742822786713, "flos": 25519487310720.0, "grad_norm": 1.5825516308228054, "language_loss": 0.80255437, "learning_rate": 3.0810284493861005e-06, "loss": 0.82427889, "num_input_tokens_seen": 120947620, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.86328125, "step": 5633, "time_per_iteration": 4.055371522903442 }, { "auxiliary_loss_clip": 0.01137508, "auxiliary_loss_mlp": 0.01036222, "balance_loss_clip": 1.02002192, "balance_loss_mlp": 1.05043244, "epoch": 0.33873440553133927, "flos": 23623116773760.0, "grad_norm": 1.9288111873940732, "language_loss": 0.59205329, "learning_rate": 3.0807007611964855e-06, "loss": 0.61379063, "num_input_tokens_seen": 120965205, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.87109375, "step": 5634, "time_per_iteration": 2.483365297317505 }, { "auxiliary_loss_clip": 0.01132539, "auxiliary_loss_mlp": 0.01033475, "balance_loss_clip": 1.0177995, "balance_loss_mlp": 1.04764104, "epoch": 0.33879452878400723, "flos": 17088882992640.0, "grad_norm": 1.9059539092318367, "language_loss": 0.9258728, "learning_rate": 3.080373032026589e-06, "loss": 0.94753295, "num_input_tokens_seen": 120983560, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.84765625, "step": 5635, "time_per_iteration": 2.4636058807373047 }, { "auxiliary_loss_clip": 0.01132376, "auxiliary_loss_mlp": 0.01033482, "balance_loss_clip": 1.01767445, "balance_loss_mlp": 1.05058098, "epoch": 0.3388546520366752, "flos": 15742053607680.0, "grad_norm": 1.798175401883372, "language_loss": 0.75349081, "learning_rate": 3.0800452618888386e-06, "loss": 0.77514935, "num_input_tokens_seen": 121001400, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.81640625, "step": 5636, "time_per_iteration": 2.440357208251953 }, { "auxiliary_loss_clip": 0.01132883, "auxiliary_loss_mlp": 0.01040375, "balance_loss_clip": 1.02442515, "balance_loss_mlp": 1.04883528, "epoch": 0.33891477528934316, "flos": 22418744728320.0, "grad_norm": 1.7857284578825343, "language_loss": 0.83424097, "learning_rate": 3.0797174507956637e-06, "loss": 0.85597354, "num_input_tokens_seen": 121021760, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.84375, "step": 5637, "time_per_iteration": 5.289116859436035 }, { "auxiliary_loss_clip": 0.01137421, "auxiliary_loss_mlp": 0.01045998, "balance_loss_clip": 1.02859306, "balance_loss_mlp": 1.0509572, "epoch": 0.3389748985420111, "flos": 17274828723840.0, "grad_norm": 1.921023203238012, "language_loss": 0.69593167, "learning_rate": 3.079389598759495e-06, "loss": 0.71776593, "num_input_tokens_seen": 121041070, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.8671875, "step": 5638, "time_per_iteration": 2.451805830001831 }, { "auxiliary_loss_clip": 0.01140014, "auxiliary_loss_mlp": 0.0105029, "balance_loss_clip": 1.03368449, "balance_loss_mlp": 1.0536859, "epoch": 0.3390350217946791, "flos": 27744979190400.0, "grad_norm": 1.59639176053839, "language_loss": 0.80971438, "learning_rate": 3.079061705792765e-06, "loss": 0.83161741, "num_input_tokens_seen": 121060890, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.86328125, "step": 5639, "time_per_iteration": 4.032024145126343 }, { "auxiliary_loss_clip": 0.01138608, "auxiliary_loss_mlp": 0.01046873, "balance_loss_clip": 1.0300889, "balance_loss_mlp": 1.04994345, "epoch": 0.33909514504734706, "flos": 20339804338560.0, "grad_norm": 1.9729595268911195, "language_loss": 0.67966366, "learning_rate": 3.078733771907907e-06, "loss": 0.70151848, "num_input_tokens_seen": 121079135, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.88671875, "step": 5640, "time_per_iteration": 2.4823596477508545 }, { "auxiliary_loss_clip": 0.0113619, "auxiliary_loss_mlp": 0.01037976, "balance_loss_clip": 1.02133393, "balance_loss_mlp": 1.05114985, "epoch": 0.339155268300015, "flos": 14830030356480.0, "grad_norm": 1.5368491930483177, "language_loss": 0.6965704, "learning_rate": 3.0784057971173554e-06, "loss": 0.71831203, "num_input_tokens_seen": 121097685, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8515625, "step": 5641, "time_per_iteration": 2.5034122467041016 }, { "auxiliary_loss_clip": 0.01137403, "auxiliary_loss_mlp": 0.0104556, "balance_loss_clip": 1.02969968, "balance_loss_mlp": 1.05063987, "epoch": 0.339215391552683, "flos": 26067951054720.0, "grad_norm": 1.7101167031238562, "language_loss": 0.87292266, "learning_rate": 3.0780777814335483e-06, "loss": 0.89475226, "num_input_tokens_seen": 121115640, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8671875, "step": 5642, "time_per_iteration": 2.5372424125671387 }, { "auxiliary_loss_clip": 0.01127047, "auxiliary_loss_mlp": 0.01032091, "balance_loss_clip": 1.01802444, "balance_loss_mlp": 1.04770494, "epoch": 0.33927551480535095, "flos": 14574705505920.0, "grad_norm": 1.9597004975493666, "language_loss": 0.84025741, "learning_rate": 3.077749724868924e-06, "loss": 0.86184877, "num_input_tokens_seen": 121132485, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.79296875, "step": 5643, "time_per_iteration": 2.480969190597534 }, { "auxiliary_loss_clip": 0.01132818, "auxiliary_loss_mlp": 0.01046002, "balance_loss_clip": 1.03095746, "balance_loss_mlp": 1.04964972, "epoch": 0.3393356380580189, "flos": 23805578885760.0, "grad_norm": 1.5816624779425432, "language_loss": 0.77042329, "learning_rate": 3.077421627435922e-06, "loss": 0.79221141, "num_input_tokens_seen": 121152935, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.83203125, "step": 5644, "time_per_iteration": 2.493894100189209 }, { "auxiliary_loss_clip": 0.0113354, "auxiliary_loss_mlp": 0.01044848, "balance_loss_clip": 1.0284332, "balance_loss_mlp": 1.04841447, "epoch": 0.3393957613106869, "flos": 17347871030400.0, "grad_norm": 2.909563750636601, "language_loss": 0.63383913, "learning_rate": 3.0770934891469832e-06, "loss": 0.65562296, "num_input_tokens_seen": 121169835, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8515625, "step": 5645, "time_per_iteration": 2.4623773097991943 }, { "auxiliary_loss_clip": 0.01129564, "auxiliary_loss_mlp": 0.01038183, "balance_loss_clip": 1.02293658, "balance_loss_mlp": 1.0468924, "epoch": 0.3394558845633549, "flos": 28433960939520.0, "grad_norm": 2.129208318427981, "language_loss": 0.76371133, "learning_rate": 3.076765310014552e-06, "loss": 0.78538877, "num_input_tokens_seen": 121190290, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.828125, "step": 5646, "time_per_iteration": 2.5151453018188477 }, { "auxiliary_loss_clip": 0.01140223, "auxiliary_loss_mlp": 0.01042918, "balance_loss_clip": 1.0259788, "balance_loss_mlp": 1.05175471, "epoch": 0.33951600781602287, "flos": 22086929865600.0, "grad_norm": 2.7247489891291616, "language_loss": 0.79108143, "learning_rate": 3.0764370900510727e-06, "loss": 0.81291276, "num_input_tokens_seen": 121209060, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8828125, "step": 5647, "time_per_iteration": 2.517009735107422 }, { "auxiliary_loss_clip": 0.01136899, "auxiliary_loss_mlp": 0.01041757, "balance_loss_clip": 1.02664173, "balance_loss_mlp": 1.05164611, "epoch": 0.33957613106869083, "flos": 23878262056320.0, "grad_norm": 1.9562700968204236, "language_loss": 0.77527416, "learning_rate": 3.0761088292689904e-06, "loss": 0.79706073, "num_input_tokens_seen": 121227480, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8515625, "step": 5648, "time_per_iteration": 2.4887235164642334 }, { "auxiliary_loss_clip": 0.01060955, "auxiliary_loss_mlp": 0.01026265, "balance_loss_clip": 1.02430964, "balance_loss_mlp": 1.03364003, "epoch": 0.3396362543213588, "flos": 71242642414080.0, "grad_norm": 0.7906053714363371, "language_loss": 0.56450248, "learning_rate": 3.075780527680754e-06, "loss": 0.58537471, "num_input_tokens_seen": 121291305, "router_z_loss_clip": 0.01953125, "router_z_loss_mlp": 0.2734375, "step": 5649, "time_per_iteration": 3.1611762046813965 }, { "auxiliary_loss_clip": 0.01134315, "auxiliary_loss_mlp": 0.01045689, "balance_loss_clip": 1.02926207, "balance_loss_mlp": 1.04988575, "epoch": 0.33969637757402676, "flos": 25921615046400.0, "grad_norm": 1.6635876820517885, "language_loss": 0.8551693, "learning_rate": 3.0754521852988117e-06, "loss": 0.87696934, "num_input_tokens_seen": 121312740, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.84375, "step": 5650, "time_per_iteration": 2.506294012069702 }, { "auxiliary_loss_clip": 0.01132031, "auxiliary_loss_mlp": 0.01030157, "balance_loss_clip": 1.01527333, "balance_loss_mlp": 1.04928732, "epoch": 0.33975650082669473, "flos": 35261728663680.0, "grad_norm": 1.6767132798602742, "language_loss": 0.70695305, "learning_rate": 3.0751238021356152e-06, "loss": 0.72857493, "num_input_tokens_seen": 121334220, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.828125, "step": 5651, "time_per_iteration": 2.616335868835449 }, { "auxiliary_loss_clip": 0.01133847, "auxiliary_loss_mlp": 0.01044077, "balance_loss_clip": 1.02856779, "balance_loss_mlp": 1.04958868, "epoch": 0.3398166240793627, "flos": 16647001879680.0, "grad_norm": 1.9334663603134288, "language_loss": 0.80798578, "learning_rate": 3.074795378203616e-06, "loss": 0.82976508, "num_input_tokens_seen": 121351870, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.84375, "step": 5652, "time_per_iteration": 2.4334287643432617 }, { "auxiliary_loss_clip": 0.01140705, "auxiliary_loss_mlp": 0.01037636, "balance_loss_clip": 1.0217452, "balance_loss_mlp": 1.05348253, "epoch": 0.33987674733203066, "flos": 24062196625920.0, "grad_norm": 1.8048497766934921, "language_loss": 0.76849097, "learning_rate": 3.0744669135152685e-06, "loss": 0.79027438, "num_input_tokens_seen": 121373400, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.875, "step": 5653, "time_per_iteration": 2.530841827392578 }, { "auxiliary_loss_clip": 0.01132938, "auxiliary_loss_mlp": 0.0103769, "balance_loss_clip": 1.02344429, "balance_loss_mlp": 1.04875779, "epoch": 0.3399368705846986, "flos": 13250678279040.0, "grad_norm": 2.7993249992708304, "language_loss": 0.8583703, "learning_rate": 3.0741384080830278e-06, "loss": 0.88007659, "num_input_tokens_seen": 121385225, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.84375, "step": 5654, "time_per_iteration": 2.4147465229034424 }, { "auxiliary_loss_clip": 0.01131986, "auxiliary_loss_mlp": 0.01038774, "balance_loss_clip": 1.02370596, "balance_loss_mlp": 1.04739964, "epoch": 0.3399969938373666, "flos": 27012832272000.0, "grad_norm": 2.201610934987578, "language_loss": 0.65256834, "learning_rate": 3.073809861919351e-06, "loss": 0.67427599, "num_input_tokens_seen": 121404735, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.84375, "step": 5655, "time_per_iteration": 2.533442258834839 }, { "auxiliary_loss_clip": 0.01133427, "auxiliary_loss_mlp": 0.01037528, "balance_loss_clip": 1.02304411, "balance_loss_mlp": 1.04984927, "epoch": 0.34005711709003456, "flos": 28550096588160.0, "grad_norm": 1.7383647536497848, "language_loss": 0.76429474, "learning_rate": 3.073481275036697e-06, "loss": 0.7860043, "num_input_tokens_seen": 121426780, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8359375, "step": 5656, "time_per_iteration": 2.5468831062316895 }, { "auxiliary_loss_clip": 0.0113822, "auxiliary_loss_mlp": 0.0104147, "balance_loss_clip": 1.02554345, "balance_loss_mlp": 1.04893231, "epoch": 0.3401172403427025, "flos": 21617003208960.0, "grad_norm": 2.202599893480885, "language_loss": 0.83050561, "learning_rate": 3.073152647447525e-06, "loss": 0.85230255, "num_input_tokens_seen": 121447245, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.89453125, "step": 5657, "time_per_iteration": 2.512683391571045 }, { "auxiliary_loss_clip": 0.01132942, "auxiliary_loss_mlp": 0.01040274, "balance_loss_clip": 1.02575445, "balance_loss_mlp": 1.04891515, "epoch": 0.3401773635953705, "flos": 25885776251520.0, "grad_norm": 2.2733536096609934, "language_loss": 0.8534106, "learning_rate": 3.0728239791642976e-06, "loss": 0.87514275, "num_input_tokens_seen": 121468165, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.83984375, "step": 5658, "time_per_iteration": 2.5069515705108643 }, { "auxiliary_loss_clip": 0.01055417, "auxiliary_loss_mlp": 0.01006857, "balance_loss_clip": 1.00466371, "balance_loss_mlp": 1.02810025, "epoch": 0.3402374868480385, "flos": 65507995336320.0, "grad_norm": 0.8201124042446191, "language_loss": 0.60065067, "learning_rate": 3.072495270199477e-06, "loss": 0.62127352, "num_input_tokens_seen": 121523795, "router_z_loss_clip": 0.02197266, "router_z_loss_mlp": 0.2734375, "step": 5659, "time_per_iteration": 3.0890848636627197 }, { "auxiliary_loss_clip": 0.01130185, "auxiliary_loss_mlp": 0.01035381, "balance_loss_clip": 1.02063525, "balance_loss_mlp": 1.04909873, "epoch": 0.34029761010070647, "flos": 24060580513920.0, "grad_norm": 2.034400429514871, "language_loss": 0.67948496, "learning_rate": 3.0721665205655284e-06, "loss": 0.70114058, "num_input_tokens_seen": 121542950, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.80859375, "step": 5660, "time_per_iteration": 2.505758047103882 }, { "auxiliary_loss_clip": 0.01133876, "auxiliary_loss_mlp": 0.01041302, "balance_loss_clip": 1.02607918, "balance_loss_mlp": 1.05053747, "epoch": 0.34035773335337444, "flos": 27599720590080.0, "grad_norm": 3.2581319223178244, "language_loss": 0.67592895, "learning_rate": 3.071837730274918e-06, "loss": 0.69768077, "num_input_tokens_seen": 121562765, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83203125, "step": 5661, "time_per_iteration": 2.5640413761138916 }, { "auxiliary_loss_clip": 0.01131357, "auxiliary_loss_mlp": 0.01035189, "balance_loss_clip": 1.02069271, "balance_loss_mlp": 1.04935455, "epoch": 0.3404178566060424, "flos": 20812783651200.0, "grad_norm": 1.9210061021518818, "language_loss": 0.79029179, "learning_rate": 3.071508899340113e-06, "loss": 0.81195724, "num_input_tokens_seen": 121581610, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8203125, "step": 5662, "time_per_iteration": 2.488847255706787 }, { "auxiliary_loss_clip": 0.01133422, "auxiliary_loss_mlp": 0.01042777, "balance_loss_clip": 1.0268867, "balance_loss_mlp": 1.04953003, "epoch": 0.34047797985871037, "flos": 26833566470400.0, "grad_norm": 2.975607383790366, "language_loss": 0.73371786, "learning_rate": 3.0711800277735833e-06, "loss": 0.75547981, "num_input_tokens_seen": 121601885, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.83984375, "step": 5663, "time_per_iteration": 2.5350286960601807 }, { "auxiliary_loss_clip": 0.01127587, "auxiliary_loss_mlp": 0.01034656, "balance_loss_clip": 1.0209713, "balance_loss_mlp": 1.04813743, "epoch": 0.34053810311137833, "flos": 19682639061120.0, "grad_norm": 1.7828869601794932, "language_loss": 0.86458659, "learning_rate": 3.0708511155877997e-06, "loss": 0.88620913, "num_input_tokens_seen": 121621335, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.796875, "step": 5664, "time_per_iteration": 2.4715514183044434 }, { "auxiliary_loss_clip": 0.01133406, "auxiliary_loss_mlp": 0.01037317, "balance_loss_clip": 1.02265477, "balance_loss_mlp": 1.04866457, "epoch": 0.3405982263640463, "flos": 21725740656000.0, "grad_norm": 2.514468553783073, "language_loss": 0.68635798, "learning_rate": 3.070522162795235e-06, "loss": 0.70806527, "num_input_tokens_seen": 121641310, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.84765625, "step": 5665, "time_per_iteration": 2.470667600631714 }, { "auxiliary_loss_clip": 0.01131721, "auxiliary_loss_mlp": 0.01032648, "balance_loss_clip": 1.0166024, "balance_loss_mlp": 1.04778004, "epoch": 0.34065834961671426, "flos": 18041629288320.0, "grad_norm": 2.597598119485795, "language_loss": 0.73297024, "learning_rate": 3.0701931694083626e-06, "loss": 0.75461394, "num_input_tokens_seen": 121659625, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8359375, "step": 5666, "time_per_iteration": 2.4544143676757812 }, { "auxiliary_loss_clip": 0.01132614, "auxiliary_loss_mlp": 0.01040844, "balance_loss_clip": 1.02560854, "balance_loss_mlp": 1.04762661, "epoch": 0.3407184728693822, "flos": 21397337585280.0, "grad_norm": 1.5655424269159806, "language_loss": 0.72999191, "learning_rate": 3.0698641354396576e-06, "loss": 0.75172651, "num_input_tokens_seen": 121679205, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8515625, "step": 5667, "time_per_iteration": 2.4982340335845947 }, { "auxiliary_loss_clip": 0.0105036, "auxiliary_loss_mlp": 0.00999692, "balance_loss_clip": 0.99753439, "balance_loss_mlp": 1.02275419, "epoch": 0.3407785961220502, "flos": 68688101018880.0, "grad_norm": 0.8418390272734328, "language_loss": 0.63313627, "learning_rate": 3.069535060901597e-06, "loss": 0.65363681, "num_input_tokens_seen": 121751085, "router_z_loss_clip": 0.02160645, "router_z_loss_mlp": 0.27539062, "step": 5668, "time_per_iteration": 3.259575605392456 }, { "auxiliary_loss_clip": 0.01129097, "auxiliary_loss_mlp": 0.01038683, "balance_loss_clip": 1.02300715, "balance_loss_mlp": 1.04713464, "epoch": 0.34083871937471816, "flos": 14064379027200.0, "grad_norm": 2.2468195484563394, "language_loss": 0.72552156, "learning_rate": 3.0692059458066596e-06, "loss": 0.7471993, "num_input_tokens_seen": 121768565, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8203125, "step": 5669, "time_per_iteration": 2.458843946456909 }, { "auxiliary_loss_clip": 0.01131492, "auxiliary_loss_mlp": 0.01034784, "balance_loss_clip": 1.01977587, "balance_loss_mlp": 1.04610682, "epoch": 0.3408988426273861, "flos": 17085435287040.0, "grad_norm": 1.7496944475008807, "language_loss": 0.80367148, "learning_rate": 3.0688767901673265e-06, "loss": 0.82533419, "num_input_tokens_seen": 121784925, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8515625, "step": 5670, "time_per_iteration": 2.4693589210510254 }, { "auxiliary_loss_clip": 0.01131325, "auxiliary_loss_mlp": 0.01038087, "balance_loss_clip": 1.02242255, "balance_loss_mlp": 1.04588223, "epoch": 0.3409589658800541, "flos": 24024562151040.0, "grad_norm": 1.7541793465301425, "language_loss": 0.77085721, "learning_rate": 3.068547593996078e-06, "loss": 0.79255128, "num_input_tokens_seen": 121804425, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.85546875, "step": 5671, "time_per_iteration": 2.48453426361084 }, { "auxiliary_loss_clip": 0.01131091, "auxiliary_loss_mlp": 0.01042186, "balance_loss_clip": 1.02659369, "balance_loss_mlp": 1.04725218, "epoch": 0.34101908913272205, "flos": 21142012734720.0, "grad_norm": 2.2650798147565916, "language_loss": 0.73576576, "learning_rate": 3.0682183573053974e-06, "loss": 0.75749856, "num_input_tokens_seen": 121825145, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 5672, "time_per_iteration": 2.524841785430908 }, { "auxiliary_loss_clip": 0.01131815, "auxiliary_loss_mlp": 0.01036071, "balance_loss_clip": 1.0211817, "balance_loss_mlp": 1.04708743, "epoch": 0.3410792123853901, "flos": 15702012921600.0, "grad_norm": 1.814564244257005, "language_loss": 0.73211116, "learning_rate": 3.06788908010777e-06, "loss": 0.75379002, "num_input_tokens_seen": 121842185, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.84375, "step": 5673, "time_per_iteration": 2.4303805828094482 }, { "auxiliary_loss_clip": 0.01128768, "auxiliary_loss_mlp": 0.01037754, "balance_loss_clip": 1.02292466, "balance_loss_mlp": 1.04717088, "epoch": 0.34113933563805804, "flos": 23036012974080.0, "grad_norm": 1.9596830656588209, "language_loss": 0.79581881, "learning_rate": 3.067559762415682e-06, "loss": 0.81748402, "num_input_tokens_seen": 121862260, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.81640625, "step": 5674, "time_per_iteration": 2.5195484161376953 }, { "auxiliary_loss_clip": 0.01048539, "auxiliary_loss_mlp": 0.01002959, "balance_loss_clip": 1.00080168, "balance_loss_mlp": 1.02136803, "epoch": 0.341199458890726, "flos": 69614235336960.0, "grad_norm": 0.7850229895486683, "language_loss": 0.56063575, "learning_rate": 3.0672304042416198e-06, "loss": 0.58115077, "num_input_tokens_seen": 121923560, "router_z_loss_clip": 0.02160645, "router_z_loss_mlp": 0.2734375, "step": 5675, "time_per_iteration": 4.639748811721802 }, { "auxiliary_loss_clip": 0.01129527, "auxiliary_loss_mlp": 0.01041005, "balance_loss_clip": 1.02561545, "balance_loss_mlp": 1.04794288, "epoch": 0.34125958214339397, "flos": 22346348866560.0, "grad_norm": 1.9665215805745448, "language_loss": 0.79097295, "learning_rate": 3.0669010055980734e-06, "loss": 0.81267828, "num_input_tokens_seen": 121943515, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.81640625, "step": 5676, "time_per_iteration": 2.493166923522949 }, { "auxiliary_loss_clip": 0.01129429, "auxiliary_loss_mlp": 0.01037339, "balance_loss_clip": 1.02179456, "balance_loss_mlp": 1.04602325, "epoch": 0.34131970539606193, "flos": 21871933009920.0, "grad_norm": 1.7505659400759717, "language_loss": 0.85704339, "learning_rate": 3.0665715664975357e-06, "loss": 0.8787111, "num_input_tokens_seen": 121962540, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8359375, "step": 5677, "time_per_iteration": 2.511507987976074 }, { "auxiliary_loss_clip": 0.01129621, "auxiliary_loss_mlp": 0.01041587, "balance_loss_clip": 1.02568436, "balance_loss_mlp": 1.04705405, "epoch": 0.3413798286487299, "flos": 24935723475840.0, "grad_norm": 2.3475606057747433, "language_loss": 0.79448199, "learning_rate": 3.0662420869524966e-06, "loss": 0.81619406, "num_input_tokens_seen": 121979830, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.82421875, "step": 5678, "time_per_iteration": 3.9445362091064453 }, { "auxiliary_loss_clip": 0.01129632, "auxiliary_loss_mlp": 0.01032536, "balance_loss_clip": 1.01779628, "balance_loss_mlp": 1.0459398, "epoch": 0.34143995190139786, "flos": 25374372364800.0, "grad_norm": 1.7602252662894937, "language_loss": 0.74982238, "learning_rate": 3.0659125669754506e-06, "loss": 0.77144408, "num_input_tokens_seen": 121999055, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8359375, "step": 5679, "time_per_iteration": 3.861802339553833 }, { "auxiliary_loss_clip": 0.0104844, "auxiliary_loss_mlp": 0.01009426, "balance_loss_clip": 1.00732756, "balance_loss_mlp": 1.02080631, "epoch": 0.34150007515406583, "flos": 67782578129280.0, "grad_norm": 0.7181957382093463, "language_loss": 0.59550738, "learning_rate": 3.0655830065788923e-06, "loss": 0.61608613, "num_input_tokens_seen": 122067015, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.27734375, "step": 5680, "time_per_iteration": 4.573252439498901 }, { "auxiliary_loss_clip": 0.011269, "auxiliary_loss_mlp": 0.01032416, "balance_loss_clip": 1.01762247, "balance_loss_mlp": 1.04651046, "epoch": 0.3415601984067338, "flos": 20302421258880.0, "grad_norm": 1.7648668133819267, "language_loss": 0.7239573, "learning_rate": 3.0652534057753206e-06, "loss": 0.74555051, "num_input_tokens_seen": 122085295, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8046875, "step": 5681, "time_per_iteration": 2.4882078170776367 }, { "auxiliary_loss_clip": 0.01126389, "auxiliary_loss_mlp": 0.01039027, "balance_loss_clip": 1.02459073, "balance_loss_mlp": 1.04509902, "epoch": 0.34162032165940176, "flos": 26031178506240.0, "grad_norm": 2.2359430987472426, "language_loss": 0.71411699, "learning_rate": 3.064923764577233e-06, "loss": 0.73577112, "num_input_tokens_seen": 122104020, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8125, "step": 5682, "time_per_iteration": 2.48905086517334 }, { "auxiliary_loss_clip": 0.01129504, "auxiliary_loss_mlp": 0.01040026, "balance_loss_clip": 1.02407539, "balance_loss_mlp": 1.04622352, "epoch": 0.3416804449120697, "flos": 28803338449920.0, "grad_norm": 1.7000850744258702, "language_loss": 0.83781695, "learning_rate": 3.0645940829971295e-06, "loss": 0.85951221, "num_input_tokens_seen": 122125080, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.83203125, "step": 5683, "time_per_iteration": 2.581622362136841 }, { "auxiliary_loss_clip": 0.01133234, "auxiliary_loss_mlp": 0.01049992, "balance_loss_clip": 1.03402424, "balance_loss_mlp": 1.04810512, "epoch": 0.3417405681647377, "flos": 22601601889920.0, "grad_norm": 1.6777624097154917, "language_loss": 0.7079801, "learning_rate": 3.0642643610475116e-06, "loss": 0.72981238, "num_input_tokens_seen": 122146350, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8515625, "step": 5684, "time_per_iteration": 2.515896797180176 }, { "auxiliary_loss_clip": 0.01130677, "auxiliary_loss_mlp": 0.0103601, "balance_loss_clip": 1.02178788, "balance_loss_mlp": 1.04936624, "epoch": 0.34180069141740566, "flos": 24716237420160.0, "grad_norm": 1.3510961640788817, "language_loss": 0.75028002, "learning_rate": 3.0639345987408823e-06, "loss": 0.77194691, "num_input_tokens_seen": 122168085, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.8125, "step": 5685, "time_per_iteration": 2.5485222339630127 }, { "auxiliary_loss_clip": 0.01128318, "auxiliary_loss_mlp": 0.0103622, "balance_loss_clip": 1.02185524, "balance_loss_mlp": 1.04678178, "epoch": 0.3418608146700737, "flos": 30518755246080.0, "grad_norm": 1.953784490972982, "language_loss": 0.71015525, "learning_rate": 3.0636047960897468e-06, "loss": 0.73180068, "num_input_tokens_seen": 122191040, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.81640625, "step": 5686, "time_per_iteration": 2.5391199588775635 }, { "auxiliary_loss_clip": 0.01132021, "auxiliary_loss_mlp": 0.01040557, "balance_loss_clip": 1.02473831, "balance_loss_mlp": 1.04746795, "epoch": 0.34192093792274164, "flos": 15122343237120.0, "grad_norm": 1.8150751662667657, "language_loss": 0.77585971, "learning_rate": 3.06327495310661e-06, "loss": 0.79758549, "num_input_tokens_seen": 122209225, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84765625, "step": 5687, "time_per_iteration": 2.4884822368621826 }, { "auxiliary_loss_clip": 0.01129898, "auxiliary_loss_mlp": 0.01037288, "balance_loss_clip": 1.0215106, "balance_loss_mlp": 1.04819858, "epoch": 0.3419810611754096, "flos": 13187799521280.0, "grad_norm": 2.050850433440333, "language_loss": 0.86825174, "learning_rate": 3.062945069803981e-06, "loss": 0.88992357, "num_input_tokens_seen": 122226160, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.81640625, "step": 5688, "time_per_iteration": 2.4289143085479736 }, { "auxiliary_loss_clip": 0.01139186, "auxiliary_loss_mlp": 0.01036262, "balance_loss_clip": 1.0196327, "balance_loss_mlp": 1.050174, "epoch": 0.34204118442807757, "flos": 19536267139200.0, "grad_norm": 1.9128338121460084, "language_loss": 0.79698259, "learning_rate": 3.0626151461943684e-06, "loss": 0.81873703, "num_input_tokens_seen": 122243115, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.890625, "step": 5689, "time_per_iteration": 2.491595506668091 }, { "auxiliary_loss_clip": 0.01136834, "auxiliary_loss_mlp": 0.01040696, "balance_loss_clip": 1.02406621, "balance_loss_mlp": 1.05128145, "epoch": 0.34210130768074554, "flos": 15194846839680.0, "grad_norm": 2.342943571181623, "language_loss": 0.73801494, "learning_rate": 3.0622851822902834e-06, "loss": 0.7597903, "num_input_tokens_seen": 122261105, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.85546875, "step": 5690, "time_per_iteration": 2.4408280849456787 }, { "auxiliary_loss_clip": 0.01132027, "auxiliary_loss_mlp": 0.01037453, "balance_loss_clip": 1.0226239, "balance_loss_mlp": 1.04825699, "epoch": 0.3421614309334135, "flos": 24936226266240.0, "grad_norm": 2.5478998224489966, "language_loss": 0.7546494, "learning_rate": 3.061955178104237e-06, "loss": 0.77634424, "num_input_tokens_seen": 122279995, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8359375, "step": 5691, "time_per_iteration": 2.5346951484680176 }, { "auxiliary_loss_clip": 0.01130419, "auxiliary_loss_mlp": 0.01038849, "balance_loss_clip": 1.0242697, "balance_loss_mlp": 1.04928756, "epoch": 0.34222155418608147, "flos": 21908633731200.0, "grad_norm": 1.7471593519566053, "language_loss": 0.6821847, "learning_rate": 3.0616251336487447e-06, "loss": 0.70387733, "num_input_tokens_seen": 122299070, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8125, "step": 5692, "time_per_iteration": 2.463336229324341 }, { "auxiliary_loss_clip": 0.01135224, "auxiliary_loss_mlp": 0.01041995, "balance_loss_clip": 1.02506757, "balance_loss_mlp": 1.05087447, "epoch": 0.34228167743874943, "flos": 18114061063680.0, "grad_norm": 2.4468714631422905, "language_loss": 0.72873271, "learning_rate": 3.06129504893632e-06, "loss": 0.75050485, "num_input_tokens_seen": 122316800, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.84375, "step": 5693, "time_per_iteration": 2.4677979946136475 }, { "auxiliary_loss_clip": 0.01131579, "auxiliary_loss_mlp": 0.01037529, "balance_loss_clip": 1.02310419, "balance_loss_mlp": 1.04805982, "epoch": 0.3423418006914174, "flos": 21288600138240.0, "grad_norm": 1.9822842779189123, "language_loss": 0.75808442, "learning_rate": 3.0609649239794813e-06, "loss": 0.7797755, "num_input_tokens_seen": 122335275, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8359375, "step": 5694, "time_per_iteration": 2.4704251289367676 }, { "auxiliary_loss_clip": 0.01132746, "auxiliary_loss_mlp": 0.01040653, "balance_loss_clip": 1.02568579, "balance_loss_mlp": 1.0519594, "epoch": 0.34240192394408536, "flos": 19823480288640.0, "grad_norm": 2.2677749346241405, "language_loss": 0.79317844, "learning_rate": 3.060634758790747e-06, "loss": 0.81491238, "num_input_tokens_seen": 122353215, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8046875, "step": 5695, "time_per_iteration": 2.497758388519287 }, { "auxiliary_loss_clip": 0.01131031, "auxiliary_loss_mlp": 0.0104003, "balance_loss_clip": 1.02498603, "balance_loss_mlp": 1.04769778, "epoch": 0.3424620471967533, "flos": 24535535074560.0, "grad_norm": 2.4964023228372665, "language_loss": 0.74030739, "learning_rate": 3.060304553382635e-06, "loss": 0.76201802, "num_input_tokens_seen": 122372495, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.83203125, "step": 5696, "time_per_iteration": 2.498582124710083 }, { "auxiliary_loss_clip": 0.01133115, "auxiliary_loss_mlp": 0.01049203, "balance_loss_clip": 1.03419471, "balance_loss_mlp": 1.04979992, "epoch": 0.3425221704494213, "flos": 25848895962240.0, "grad_norm": 1.9118770611084657, "language_loss": 0.70822275, "learning_rate": 3.0599743077676685e-06, "loss": 0.73004597, "num_input_tokens_seen": 122394600, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8359375, "step": 5697, "time_per_iteration": 2.526374101638794 }, { "auxiliary_loss_clip": 0.01130119, "auxiliary_loss_mlp": 0.01033244, "balance_loss_clip": 1.01791382, "balance_loss_mlp": 1.04947734, "epoch": 0.34258229370208926, "flos": 21540513196800.0, "grad_norm": 1.9170544574095756, "language_loss": 0.81645453, "learning_rate": 3.05964402195837e-06, "loss": 0.83808815, "num_input_tokens_seen": 122414700, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8046875, "step": 5698, "time_per_iteration": 2.5107216835021973 }, { "auxiliary_loss_clip": 0.01132868, "auxiliary_loss_mlp": 0.01050631, "balance_loss_clip": 1.03373933, "balance_loss_mlp": 1.04779601, "epoch": 0.3426424169547573, "flos": 23652778429440.0, "grad_norm": 2.2955058657902234, "language_loss": 0.69461, "learning_rate": 3.0593136959672645e-06, "loss": 0.71644497, "num_input_tokens_seen": 122432760, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8515625, "step": 5699, "time_per_iteration": 2.5408401489257812 }, { "auxiliary_loss_clip": 0.0113288, "auxiliary_loss_mlp": 0.01037222, "balance_loss_clip": 1.02181983, "balance_loss_mlp": 1.04911971, "epoch": 0.34270254020742524, "flos": 24644883052800.0, "grad_norm": 2.0154211431561966, "language_loss": 0.72336566, "learning_rate": 3.058983329806877e-06, "loss": 0.74506664, "num_input_tokens_seen": 122449105, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8359375, "step": 5700, "time_per_iteration": 2.495457410812378 }, { "auxiliary_loss_clip": 0.01132955, "auxiliary_loss_mlp": 0.01037861, "balance_loss_clip": 1.02331734, "balance_loss_mlp": 1.05056214, "epoch": 0.3427626634600932, "flos": 20996754134400.0, "grad_norm": 2.1291072571345184, "language_loss": 0.81569517, "learning_rate": 3.0586529234897354e-06, "loss": 0.8374033, "num_input_tokens_seen": 122468700, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.82421875, "step": 5701, "time_per_iteration": 2.507122039794922 }, { "auxiliary_loss_clip": 0.0113201, "auxiliary_loss_mlp": 0.01039367, "balance_loss_clip": 1.02443576, "balance_loss_mlp": 1.04728198, "epoch": 0.3428227867127612, "flos": 21433786911360.0, "grad_norm": 1.6706051690297437, "language_loss": 0.71492982, "learning_rate": 3.0583224770283694e-06, "loss": 0.73664355, "num_input_tokens_seen": 122488160, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.84765625, "step": 5702, "time_per_iteration": 2.4608659744262695 }, { "auxiliary_loss_clip": 0.01055035, "auxiliary_loss_mlp": 0.01007703, "balance_loss_clip": 1.0055809, "balance_loss_mlp": 1.02759933, "epoch": 0.34288290996542914, "flos": 55731782695680.0, "grad_norm": 0.7762025739346432, "language_loss": 0.57493818, "learning_rate": 3.057991990435309e-06, "loss": 0.59556556, "num_input_tokens_seen": 122542890, "router_z_loss_clip": 0.02124023, "router_z_loss_mlp": 0.2734375, "step": 5703, "time_per_iteration": 3.0188097953796387 }, { "auxiliary_loss_clip": 0.01132934, "auxiliary_loss_mlp": 0.01044713, "balance_loss_clip": 1.02796412, "balance_loss_mlp": 1.04857922, "epoch": 0.3429430332180971, "flos": 20156803522560.0, "grad_norm": 1.9134846035514388, "language_loss": 0.74906498, "learning_rate": 3.057661463723086e-06, "loss": 0.77084148, "num_input_tokens_seen": 122561770, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.84375, "step": 5704, "time_per_iteration": 2.461329221725464 }, { "auxiliary_loss_clip": 0.01129852, "auxiliary_loss_mlp": 0.01036969, "balance_loss_clip": 1.02258658, "balance_loss_mlp": 1.04819369, "epoch": 0.34300315647076507, "flos": 17965857548160.0, "grad_norm": 3.2965197419603416, "language_loss": 0.72640228, "learning_rate": 3.0573308969042346e-06, "loss": 0.74807054, "num_input_tokens_seen": 122580580, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.81640625, "step": 5705, "time_per_iteration": 2.482865333557129 }, { "auxiliary_loss_clip": 0.01132138, "auxiliary_loss_mlp": 0.01037661, "balance_loss_clip": 1.02243781, "balance_loss_mlp": 1.0486908, "epoch": 0.34306327972343303, "flos": 22086822124800.0, "grad_norm": 5.855480528683966, "language_loss": 0.79884857, "learning_rate": 3.057000289991289e-06, "loss": 0.82054657, "num_input_tokens_seen": 122599810, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8359375, "step": 5706, "time_per_iteration": 2.4800615310668945 }, { "auxiliary_loss_clip": 0.01134686, "auxiliary_loss_mlp": 0.01034344, "balance_loss_clip": 1.01886439, "balance_loss_mlp": 1.04975343, "epoch": 0.343123402976101, "flos": 18442679616000.0, "grad_norm": 2.200156058124213, "language_loss": 0.83308446, "learning_rate": 3.056669642996787e-06, "loss": 0.85477477, "num_input_tokens_seen": 122616035, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8515625, "step": 5707, "time_per_iteration": 2.4820735454559326 }, { "auxiliary_loss_clip": 0.01134117, "auxiliary_loss_mlp": 0.01030772, "balance_loss_clip": 1.01483989, "balance_loss_mlp": 1.05073905, "epoch": 0.34318352622876896, "flos": 17163685065600.0, "grad_norm": 1.562789160030617, "language_loss": 0.7498306, "learning_rate": 3.056338955933266e-06, "loss": 0.77147949, "num_input_tokens_seen": 122633785, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.83203125, "step": 5708, "time_per_iteration": 2.453096866607666 }, { "auxiliary_loss_clip": 0.01126524, "auxiliary_loss_mlp": 0.01037392, "balance_loss_clip": 1.02208602, "balance_loss_mlp": 1.04663086, "epoch": 0.34324364948143693, "flos": 26688164215680.0, "grad_norm": 1.696660781653651, "language_loss": 0.81041634, "learning_rate": 3.0560082288132662e-06, "loss": 0.83205551, "num_input_tokens_seen": 122652100, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.796875, "step": 5709, "time_per_iteration": 2.531841516494751 }, { "auxiliary_loss_clip": 0.01131809, "auxiliary_loss_mlp": 0.01040388, "balance_loss_clip": 1.02368712, "balance_loss_mlp": 1.04837227, "epoch": 0.3433037727341049, "flos": 21251576194560.0, "grad_norm": 2.211696913495418, "language_loss": 0.79226112, "learning_rate": 3.055677461649329e-06, "loss": 0.81398308, "num_input_tokens_seen": 122669720, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8359375, "step": 5710, "time_per_iteration": 2.449578285217285 }, { "auxiliary_loss_clip": 0.01133633, "auxiliary_loss_mlp": 0.01040836, "balance_loss_clip": 1.02431393, "balance_loss_mlp": 1.04804683, "epoch": 0.34336389598677286, "flos": 20629423699200.0, "grad_norm": 1.7196690774268975, "language_loss": 0.7025373, "learning_rate": 3.055346654453996e-06, "loss": 0.72428203, "num_input_tokens_seen": 122688715, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.85546875, "step": 5711, "time_per_iteration": 2.5203771591186523 }, { "auxiliary_loss_clip": 0.01131022, "auxiliary_loss_mlp": 0.01039768, "balance_loss_clip": 1.02434266, "balance_loss_mlp": 1.04817939, "epoch": 0.3434240192394409, "flos": 14538579402240.0, "grad_norm": 2.4311660867127287, "language_loss": 0.67439556, "learning_rate": 3.055015807239812e-06, "loss": 0.69610345, "num_input_tokens_seen": 122706970, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.828125, "step": 5712, "time_per_iteration": 2.436317205429077 }, { "auxiliary_loss_clip": 0.01054898, "auxiliary_loss_mlp": 0.01001666, "balance_loss_clip": 0.99952012, "balance_loss_mlp": 1.02702665, "epoch": 0.34348414249210885, "flos": 58051538841600.0, "grad_norm": 0.8427138586096649, "language_loss": 0.5809865, "learning_rate": 3.0546849200193226e-06, "loss": 0.60155213, "num_input_tokens_seen": 122758095, "router_z_loss_clip": 0.02148438, "router_z_loss_mlp": 0.27734375, "step": 5713, "time_per_iteration": 3.0854053497314453 }, { "auxiliary_loss_clip": 0.01131115, "auxiliary_loss_mlp": 0.01039472, "balance_loss_clip": 1.02392697, "balance_loss_mlp": 1.0483048, "epoch": 0.3435442657447768, "flos": 20704441253760.0, "grad_norm": 1.5855506721412824, "language_loss": 0.80470067, "learning_rate": 3.054353992805076e-06, "loss": 0.82640654, "num_input_tokens_seen": 122777815, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.828125, "step": 5714, "time_per_iteration": 2.472395181655884 }, { "auxiliary_loss_clip": 0.01132314, "auxiliary_loss_mlp": 0.01040188, "balance_loss_clip": 1.02413023, "balance_loss_mlp": 1.04941821, "epoch": 0.3436043889974448, "flos": 22930256355840.0, "grad_norm": 2.288982003306013, "language_loss": 0.71612722, "learning_rate": 3.05402302560962e-06, "loss": 0.73785222, "num_input_tokens_seen": 122797555, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.828125, "step": 5715, "time_per_iteration": 2.524061441421509 }, { "auxiliary_loss_clip": 0.01054947, "auxiliary_loss_mlp": 0.01002123, "balance_loss_clip": 1.00004852, "balance_loss_mlp": 1.02735281, "epoch": 0.34366451225011274, "flos": 58403285752320.0, "grad_norm": 1.009838353574389, "language_loss": 0.65937793, "learning_rate": 3.053692018445505e-06, "loss": 0.67994869, "num_input_tokens_seen": 122863955, "router_z_loss_clip": 0.02075195, "router_z_loss_mlp": 0.27539062, "step": 5716, "time_per_iteration": 4.591624021530151 }, { "auxiliary_loss_clip": 0.01129795, "auxiliary_loss_mlp": 0.01038048, "balance_loss_clip": 1.02303934, "balance_loss_mlp": 1.04912233, "epoch": 0.3437246355027807, "flos": 15596292216960.0, "grad_norm": 2.015760603287809, "language_loss": 0.74104714, "learning_rate": 3.0533609713252838e-06, "loss": 0.76272559, "num_input_tokens_seen": 122883000, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8046875, "step": 5717, "time_per_iteration": 2.4853994846343994 }, { "auxiliary_loss_clip": 0.01128671, "auxiliary_loss_mlp": 0.01038997, "balance_loss_clip": 1.02468014, "balance_loss_mlp": 1.04550719, "epoch": 0.34378475875544867, "flos": 27672260106240.0, "grad_norm": 2.2957420290815644, "language_loss": 0.75364935, "learning_rate": 3.0530298842615077e-06, "loss": 0.77532601, "num_input_tokens_seen": 122903265, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.83203125, "step": 5718, "time_per_iteration": 2.573814868927002 }, { "auxiliary_loss_clip": 0.01131104, "auxiliary_loss_mlp": 0.01042817, "balance_loss_clip": 1.02686703, "balance_loss_mlp": 1.04700422, "epoch": 0.34384488200811664, "flos": 31431496769280.0, "grad_norm": 2.4021323790632727, "language_loss": 0.63926256, "learning_rate": 3.052698757266734e-06, "loss": 0.66100174, "num_input_tokens_seen": 122923860, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.83984375, "step": 5719, "time_per_iteration": 2.5665292739868164 }, { "auxiliary_loss_clip": 0.01134378, "auxiliary_loss_mlp": 0.01041219, "balance_loss_clip": 1.0243628, "balance_loss_mlp": 1.04890323, "epoch": 0.3439050052607846, "flos": 24899920594560.0, "grad_norm": 1.9514850933795658, "language_loss": 0.73423707, "learning_rate": 3.0523675903535183e-06, "loss": 0.75599301, "num_input_tokens_seen": 122945305, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.85546875, "step": 5720, "time_per_iteration": 5.35900092124939 }, { "auxiliary_loss_clip": 0.01132651, "auxiliary_loss_mlp": 0.01039147, "balance_loss_clip": 1.02339995, "balance_loss_mlp": 1.0486666, "epoch": 0.34396512851345257, "flos": 18150079426560.0, "grad_norm": 1.5997838994970932, "language_loss": 0.74345553, "learning_rate": 3.0520363835344173e-06, "loss": 0.76517349, "num_input_tokens_seen": 122962535, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.83984375, "step": 5721, "time_per_iteration": 2.4864766597747803 }, { "auxiliary_loss_clip": 0.01135303, "auxiliary_loss_mlp": 0.01045469, "balance_loss_clip": 1.03001904, "balance_loss_mlp": 1.05100775, "epoch": 0.34402525176612053, "flos": 16034438315520.0, "grad_norm": 3.674740692037708, "language_loss": 0.80129415, "learning_rate": 3.051705136821992e-06, "loss": 0.82310188, "num_input_tokens_seen": 122979750, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84375, "step": 5722, "time_per_iteration": 3.9259133338928223 }, { "auxiliary_loss_clip": 0.01130037, "auxiliary_loss_mlp": 0.01035305, "balance_loss_clip": 1.02040386, "balance_loss_mlp": 1.04812407, "epoch": 0.3440853750187885, "flos": 21178641628800.0, "grad_norm": 1.6522677397171586, "language_loss": 0.8147198, "learning_rate": 3.051373850228801e-06, "loss": 0.83637321, "num_input_tokens_seen": 122998955, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 5723, "time_per_iteration": 2.523252010345459 }, { "auxiliary_loss_clip": 0.01133323, "auxiliary_loss_mlp": 0.01045433, "balance_loss_clip": 1.02991211, "balance_loss_mlp": 1.04892015, "epoch": 0.34414549827145646, "flos": 12677868092160.0, "grad_norm": 1.8625997764282085, "language_loss": 0.81135297, "learning_rate": 3.0510425237674096e-06, "loss": 0.83314049, "num_input_tokens_seen": 123016165, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.84375, "step": 5724, "time_per_iteration": 2.4694979190826416 }, { "auxiliary_loss_clip": 0.01133152, "auxiliary_loss_mlp": 0.01038651, "balance_loss_clip": 1.02377355, "balance_loss_mlp": 1.04875779, "epoch": 0.3442056215241244, "flos": 31284514316160.0, "grad_norm": 3.353977181133419, "language_loss": 0.69017339, "learning_rate": 3.05071115745038e-06, "loss": 0.71189141, "num_input_tokens_seen": 123036900, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.84375, "step": 5725, "time_per_iteration": 2.55513072013855 }, { "auxiliary_loss_clip": 0.01138897, "auxiliary_loss_mlp": 0.01050963, "balance_loss_clip": 1.03352261, "balance_loss_mlp": 1.04988456, "epoch": 0.34426574477679245, "flos": 23367289132800.0, "grad_norm": 1.6231076160030324, "language_loss": 0.69218898, "learning_rate": 3.0503797512902773e-06, "loss": 0.71408761, "num_input_tokens_seen": 123057480, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.890625, "step": 5726, "time_per_iteration": 2.5011398792266846 }, { "auxiliary_loss_clip": 0.01132055, "auxiliary_loss_mlp": 0.01044436, "balance_loss_clip": 1.03017282, "balance_loss_mlp": 1.04907084, "epoch": 0.3443258680294604, "flos": 24535427333760.0, "grad_norm": 2.321136674262087, "language_loss": 0.73082751, "learning_rate": 3.0500483052996703e-06, "loss": 0.75259244, "num_input_tokens_seen": 123076890, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.83203125, "step": 5727, "time_per_iteration": 2.5106606483459473 }, { "auxiliary_loss_clip": 0.01133525, "auxiliary_loss_mlp": 0.0104688, "balance_loss_clip": 1.03095388, "balance_loss_mlp": 1.05021214, "epoch": 0.3443859912821284, "flos": 20230133137920.0, "grad_norm": 2.0033914078022126, "language_loss": 0.88481474, "learning_rate": 3.0497168194911257e-06, "loss": 0.90661871, "num_input_tokens_seen": 123092530, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.83203125, "step": 5728, "time_per_iteration": 2.49629282951355 }, { "auxiliary_loss_clip": 0.01133095, "auxiliary_loss_mlp": 0.01051306, "balance_loss_clip": 1.03633296, "balance_loss_mlp": 1.04986143, "epoch": 0.34444611453479634, "flos": 24316515895680.0, "grad_norm": 2.0129755063036434, "language_loss": 0.70710379, "learning_rate": 3.0493852938772143e-06, "loss": 0.72894776, "num_input_tokens_seen": 123110560, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.83203125, "step": 5729, "time_per_iteration": 2.4917964935302734 }, { "auxiliary_loss_clip": 0.01130947, "auxiliary_loss_mlp": 0.01035214, "balance_loss_clip": 1.01970494, "balance_loss_mlp": 1.04761899, "epoch": 0.3445062377874643, "flos": 16983413683200.0, "grad_norm": 1.718304645570533, "language_loss": 0.7408129, "learning_rate": 3.0490537284705078e-06, "loss": 0.76247454, "num_input_tokens_seen": 123128655, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.83203125, "step": 5730, "time_per_iteration": 2.4803802967071533 }, { "auxiliary_loss_clip": 0.01133714, "auxiliary_loss_mlp": 0.01047105, "balance_loss_clip": 1.03140473, "balance_loss_mlp": 1.04960728, "epoch": 0.3445663610401323, "flos": 20302708567680.0, "grad_norm": 2.0379775595447427, "language_loss": 0.79840171, "learning_rate": 3.048722123283578e-06, "loss": 0.82020986, "num_input_tokens_seen": 123145130, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.83984375, "step": 5731, "time_per_iteration": 2.475041151046753 }, { "auxiliary_loss_clip": 0.01134662, "auxiliary_loss_mlp": 0.01043746, "balance_loss_clip": 1.027951, "balance_loss_mlp": 1.05072451, "epoch": 0.34462648429280024, "flos": 15888102307200.0, "grad_norm": 2.3479170432468597, "language_loss": 0.77973735, "learning_rate": 3.0483904783290006e-06, "loss": 0.80152142, "num_input_tokens_seen": 123162265, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8359375, "step": 5732, "time_per_iteration": 2.480681896209717 }, { "auxiliary_loss_clip": 0.01058927, "auxiliary_loss_mlp": 0.01023327, "balance_loss_clip": 1.02125323, "balance_loss_mlp": 1.03094268, "epoch": 0.3446866075454682, "flos": 59311035285120.0, "grad_norm": 0.7479389726824821, "language_loss": 0.53572571, "learning_rate": 3.0480587936193505e-06, "loss": 0.55654824, "num_input_tokens_seen": 123218620, "router_z_loss_clip": 0.02075195, "router_z_loss_mlp": 0.27929688, "step": 5733, "time_per_iteration": 3.133751392364502 }, { "auxiliary_loss_clip": 0.01136684, "auxiliary_loss_mlp": 0.01043143, "balance_loss_clip": 1.02719307, "balance_loss_mlp": 1.05203402, "epoch": 0.34474673079813617, "flos": 22343799000960.0, "grad_norm": 2.1192493773755627, "language_loss": 0.83507311, "learning_rate": 3.047727069167207e-06, "loss": 0.85687137, "num_input_tokens_seen": 123237325, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84765625, "step": 5734, "time_per_iteration": 2.490622043609619 }, { "auxiliary_loss_clip": 0.0113548, "auxiliary_loss_mlp": 0.01038164, "balance_loss_clip": 1.02238083, "balance_loss_mlp": 1.0499748, "epoch": 0.34480685405080413, "flos": 27670141203840.0, "grad_norm": 1.9822421988478682, "language_loss": 0.9265002, "learning_rate": 3.0473953049851478e-06, "loss": 0.94823664, "num_input_tokens_seen": 123258650, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.85546875, "step": 5735, "time_per_iteration": 2.535928964614868 }, { "auxiliary_loss_clip": 0.01139191, "auxiliary_loss_mlp": 0.01040723, "balance_loss_clip": 1.02412879, "balance_loss_mlp": 1.0532726, "epoch": 0.3448669773034721, "flos": 22456020067200.0, "grad_norm": 1.8062595848770835, "language_loss": 0.76724207, "learning_rate": 3.0470635010857533e-06, "loss": 0.78904116, "num_input_tokens_seen": 123277155, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.859375, "step": 5736, "time_per_iteration": 2.5204126834869385 }, { "auxiliary_loss_clip": 0.01139558, "auxiliary_loss_mlp": 0.01040252, "balance_loss_clip": 1.02450442, "balance_loss_mlp": 1.05314469, "epoch": 0.34492710055614006, "flos": 24936190352640.0, "grad_norm": 1.6379556676517362, "language_loss": 0.78728259, "learning_rate": 3.0467316574816064e-06, "loss": 0.80908066, "num_input_tokens_seen": 123297640, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8671875, "step": 5737, "time_per_iteration": 2.501596450805664 }, { "auxiliary_loss_clip": 0.01142605, "auxiliary_loss_mlp": 0.0103907, "balance_loss_clip": 1.02180815, "balance_loss_mlp": 1.05263972, "epoch": 0.34498722380880803, "flos": 20120821073280.0, "grad_norm": 1.952311271143405, "language_loss": 0.71917605, "learning_rate": 3.0463997741852893e-06, "loss": 0.74099278, "num_input_tokens_seen": 123314370, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.8984375, "step": 5738, "time_per_iteration": 2.479227304458618 }, { "auxiliary_loss_clip": 0.01141705, "auxiliary_loss_mlp": 0.01039869, "balance_loss_clip": 1.02300072, "balance_loss_mlp": 1.05270267, "epoch": 0.34504734706147605, "flos": 28438126917120.0, "grad_norm": 1.998226420316854, "language_loss": 0.81550789, "learning_rate": 3.046067851209389e-06, "loss": 0.83732355, "num_input_tokens_seen": 123336085, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.890625, "step": 5739, "time_per_iteration": 2.531392812728882 }, { "auxiliary_loss_clip": 0.01140328, "auxiliary_loss_mlp": 0.01042273, "balance_loss_clip": 1.02598906, "balance_loss_mlp": 1.05424666, "epoch": 0.345107470314144, "flos": 22674464628480.0, "grad_norm": 1.935505425422837, "language_loss": 0.82947016, "learning_rate": 3.0457358885664898e-06, "loss": 0.85129613, "num_input_tokens_seen": 123354460, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.859375, "step": 5740, "time_per_iteration": 2.512145519256592 }, { "auxiliary_loss_clip": 0.01138311, "auxiliary_loss_mlp": 0.01034131, "balance_loss_clip": 1.01702428, "balance_loss_mlp": 1.05258298, "epoch": 0.345167593566812, "flos": 20630716588800.0, "grad_norm": 2.2258648116255126, "language_loss": 0.77389431, "learning_rate": 3.045403886269181e-06, "loss": 0.79561865, "num_input_tokens_seen": 123373420, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.859375, "step": 5741, "time_per_iteration": 2.468822717666626 }, { "auxiliary_loss_clip": 0.01138355, "auxiliary_loss_mlp": 0.01035869, "balance_loss_clip": 1.02031231, "balance_loss_mlp": 1.04972827, "epoch": 0.34522771681947995, "flos": 26214358890240.0, "grad_norm": 1.7191677282413176, "language_loss": 0.77362502, "learning_rate": 3.045071844330053e-06, "loss": 0.79536724, "num_input_tokens_seen": 123394730, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.88671875, "step": 5742, "time_per_iteration": 2.5565266609191895 }, { "auxiliary_loss_clip": 0.01135499, "auxiliary_loss_mlp": 0.01036785, "balance_loss_clip": 1.02040577, "balance_loss_mlp": 1.05069995, "epoch": 0.3452878400721479, "flos": 19062354072960.0, "grad_norm": 2.28606459296683, "language_loss": 0.76086402, "learning_rate": 3.0447397627616955e-06, "loss": 0.78258681, "num_input_tokens_seen": 123412895, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.84765625, "step": 5743, "time_per_iteration": 2.4952306747436523 }, { "auxiliary_loss_clip": 0.01134502, "auxiliary_loss_mlp": 0.01041877, "balance_loss_clip": 1.02653503, "balance_loss_mlp": 1.0503062, "epoch": 0.3453479633248159, "flos": 27929739772800.0, "grad_norm": 1.7069369546996804, "language_loss": 0.70376158, "learning_rate": 3.0444076415767016e-06, "loss": 0.72552538, "num_input_tokens_seen": 123432320, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.84375, "step": 5744, "time_per_iteration": 2.53963565826416 }, { "auxiliary_loss_clip": 0.01134253, "auxiliary_loss_mlp": 0.01040517, "balance_loss_clip": 1.02406621, "balance_loss_mlp": 1.05035949, "epoch": 0.34540808657748384, "flos": 19606113135360.0, "grad_norm": 1.6433822383196752, "language_loss": 0.79479969, "learning_rate": 3.044075480787665e-06, "loss": 0.81654739, "num_input_tokens_seen": 123450980, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8359375, "step": 5745, "time_per_iteration": 2.4483702182769775 }, { "auxiliary_loss_clip": 0.01139909, "auxiliary_loss_mlp": 0.01042799, "balance_loss_clip": 1.02510834, "balance_loss_mlp": 1.05192339, "epoch": 0.3454682098301518, "flos": 20411661496320.0, "grad_norm": 1.7648733333568902, "language_loss": 0.88776582, "learning_rate": 3.043743280407182e-06, "loss": 0.90959287, "num_input_tokens_seen": 123469365, "router_z_loss_clip": 0.17675781, "router_z_loss_mlp": 0.87890625, "step": 5746, "time_per_iteration": 2.4775304794311523 }, { "auxiliary_loss_clip": 0.01139888, "auxiliary_loss_mlp": 0.01037869, "balance_loss_clip": 1.02117956, "balance_loss_mlp": 1.05135071, "epoch": 0.34552833308281977, "flos": 21325121291520.0, "grad_norm": 1.7535196795080072, "language_loss": 0.64157748, "learning_rate": 3.043411040447849e-06, "loss": 0.66335499, "num_input_tokens_seen": 123489425, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.88671875, "step": 5747, "time_per_iteration": 2.4788413047790527 }, { "auxiliary_loss_clip": 0.01136615, "auxiliary_loss_mlp": 0.01035836, "balance_loss_clip": 1.02062511, "balance_loss_mlp": 1.05135238, "epoch": 0.34558845633548774, "flos": 36243633824640.0, "grad_norm": 1.8042167693557831, "language_loss": 0.73024416, "learning_rate": 3.043078760922264e-06, "loss": 0.75196868, "num_input_tokens_seen": 123509970, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8515625, "step": 5748, "time_per_iteration": 2.60089111328125 }, { "auxiliary_loss_clip": 0.01133147, "auxiliary_loss_mlp": 0.01037538, "balance_loss_clip": 1.02272081, "balance_loss_mlp": 1.05138564, "epoch": 0.3456485795881557, "flos": 22450561200000.0, "grad_norm": 1.6601506414242182, "language_loss": 0.75729668, "learning_rate": 3.042746441843029e-06, "loss": 0.7790035, "num_input_tokens_seen": 123531055, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 5749, "time_per_iteration": 2.4940898418426514 }, { "auxiliary_loss_clip": 0.01053628, "auxiliary_loss_mlp": 0.01005461, "balance_loss_clip": 1.0033865, "balance_loss_mlp": 1.02623081, "epoch": 0.34570870284082367, "flos": 62004299005440.0, "grad_norm": 0.9693941797467783, "language_loss": 0.62742829, "learning_rate": 3.0424140832227437e-06, "loss": 0.64801919, "num_input_tokens_seen": 123584720, "router_z_loss_clip": 0.02075195, "router_z_loss_mlp": 0.2734375, "step": 5750, "time_per_iteration": 2.99426007270813 }, { "auxiliary_loss_clip": 0.01130015, "auxiliary_loss_mlp": 0.01034421, "balance_loss_clip": 1.0191803, "balance_loss_mlp": 1.0492661, "epoch": 0.34576882609349163, "flos": 22782196494720.0, "grad_norm": 1.8714666218648277, "language_loss": 0.80749166, "learning_rate": 3.042081685074012e-06, "loss": 0.82913601, "num_input_tokens_seen": 123604465, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8046875, "step": 5751, "time_per_iteration": 2.4790000915527344 }, { "auxiliary_loss_clip": 0.01131847, "auxiliary_loss_mlp": 0.01045642, "balance_loss_clip": 1.02981102, "balance_loss_mlp": 1.04933333, "epoch": 0.34582894934615965, "flos": 12348818576640.0, "grad_norm": 2.510433832894663, "language_loss": 0.8434996, "learning_rate": 3.041749247409439e-06, "loss": 0.86527449, "num_input_tokens_seen": 123622320, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.82421875, "step": 5752, "time_per_iteration": 2.5101921558380127 }, { "auxiliary_loss_clip": 0.01051924, "auxiliary_loss_mlp": 0.0100303, "balance_loss_clip": 1.00112295, "balance_loss_mlp": 1.02465177, "epoch": 0.3458890725988276, "flos": 70167691071360.0, "grad_norm": 0.7777244639950482, "language_loss": 0.63159752, "learning_rate": 3.0414167702416296e-06, "loss": 0.65214705, "num_input_tokens_seen": 123678010, "router_z_loss_clip": 0.01904297, "router_z_loss_mlp": 0.2734375, "step": 5753, "time_per_iteration": 2.979177474975586 }, { "auxiliary_loss_clip": 0.01132646, "auxiliary_loss_mlp": 0.01041054, "balance_loss_clip": 1.02444768, "balance_loss_mlp": 1.04927802, "epoch": 0.3459491958514956, "flos": 17092582093440.0, "grad_norm": 1.8142730718610147, "language_loss": 0.70979965, "learning_rate": 3.0410842535831914e-06, "loss": 0.73153663, "num_input_tokens_seen": 123696830, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8359375, "step": 5754, "time_per_iteration": 2.486807107925415 }, { "auxiliary_loss_clip": 0.01135625, "auxiliary_loss_mlp": 0.01039279, "balance_loss_clip": 1.02337623, "balance_loss_mlp": 1.04777491, "epoch": 0.34600931910416355, "flos": 16650952375680.0, "grad_norm": 1.8402268104418975, "language_loss": 0.73078823, "learning_rate": 3.0407516974467343e-06, "loss": 0.75253731, "num_input_tokens_seen": 123714360, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.87890625, "step": 5755, "time_per_iteration": 2.4628372192382812 }, { "auxiliary_loss_clip": 0.01130591, "auxiliary_loss_mlp": 0.01035563, "balance_loss_clip": 1.02053022, "balance_loss_mlp": 1.04845715, "epoch": 0.3460694423568315, "flos": 38546190334080.0, "grad_norm": 1.9213229357833048, "language_loss": 0.72159982, "learning_rate": 3.040419101844869e-06, "loss": 0.7432614, "num_input_tokens_seen": 123739250, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8203125, "step": 5756, "time_per_iteration": 2.6500132083892822 }, { "auxiliary_loss_clip": 0.01048948, "auxiliary_loss_mlp": 0.0100907, "balance_loss_clip": 1.0072583, "balance_loss_mlp": 1.02212763, "epoch": 0.3461295656094995, "flos": 72081479704320.0, "grad_norm": 1.2547471023298435, "language_loss": 0.62582755, "learning_rate": 3.040086466790207e-06, "loss": 0.64640772, "num_input_tokens_seen": 123802845, "router_z_loss_clip": 0.01806641, "router_z_loss_mlp": 0.26757812, "step": 5757, "time_per_iteration": 3.1242079734802246 }, { "auxiliary_loss_clip": 0.01047263, "auxiliary_loss_mlp": 0.0100717, "balance_loss_clip": 1.00529861, "balance_loss_mlp": 1.02041209, "epoch": 0.34618968886216744, "flos": 65460089571840.0, "grad_norm": 0.858662930658982, "language_loss": 0.59281051, "learning_rate": 3.039753792295362e-06, "loss": 0.6133548, "num_input_tokens_seen": 123861805, "router_z_loss_clip": 0.01867676, "router_z_loss_mlp": 0.26953125, "step": 5758, "time_per_iteration": 4.563826084136963 }, { "auxiliary_loss_clip": 0.01136058, "auxiliary_loss_mlp": 0.01044934, "balance_loss_clip": 1.03018212, "balance_loss_mlp": 1.05212498, "epoch": 0.3462498121148354, "flos": 23472542960640.0, "grad_norm": 3.5182526818716493, "language_loss": 0.7152428, "learning_rate": 3.0394210783729487e-06, "loss": 0.73705274, "num_input_tokens_seen": 123881820, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8359375, "step": 5759, "time_per_iteration": 2.503175735473633 }, { "auxiliary_loss_clip": 0.01130446, "auxiliary_loss_mlp": 0.01045738, "balance_loss_clip": 1.02982306, "balance_loss_mlp": 1.04681373, "epoch": 0.3463099353675034, "flos": 24170790418560.0, "grad_norm": 1.6673378592083752, "language_loss": 0.83222282, "learning_rate": 3.0390883250355836e-06, "loss": 0.85398459, "num_input_tokens_seen": 123903700, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8359375, "step": 5760, "time_per_iteration": 2.523806095123291 }, { "auxiliary_loss_clip": 0.01046859, "auxiliary_loss_mlp": 0.01005417, "balance_loss_clip": 1.00361729, "balance_loss_mlp": 1.01965487, "epoch": 0.34637005862017134, "flos": 63700609766400.0, "grad_norm": 0.8425252995745853, "language_loss": 0.56591475, "learning_rate": 3.0387555322958865e-06, "loss": 0.58643758, "num_input_tokens_seen": 123960075, "router_z_loss_clip": 0.01794434, "router_z_loss_mlp": 0.2734375, "step": 5761, "time_per_iteration": 3.178837537765503 }, { "auxiliary_loss_clip": 0.01129571, "auxiliary_loss_mlp": 0.01046252, "balance_loss_clip": 1.03088605, "balance_loss_mlp": 1.04664278, "epoch": 0.3464301818728393, "flos": 13145532192000.0, "grad_norm": 2.219342443896354, "language_loss": 0.94588685, "learning_rate": 3.038422700166474e-06, "loss": 0.96764505, "num_input_tokens_seen": 123975805, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.828125, "step": 5762, "time_per_iteration": 5.304632663726807 }, { "auxiliary_loss_clip": 0.01133055, "auxiliary_loss_mlp": 0.01040069, "balance_loss_clip": 1.02420187, "balance_loss_mlp": 1.04587078, "epoch": 0.34649030512550727, "flos": 29315173299840.0, "grad_norm": 1.7041372971520623, "language_loss": 0.69775546, "learning_rate": 3.0380898286599692e-06, "loss": 0.71948671, "num_input_tokens_seen": 123997530, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.87109375, "step": 5763, "time_per_iteration": 2.546225070953369 }, { "auxiliary_loss_clip": 0.01138551, "auxiliary_loss_mlp": 0.01045659, "balance_loss_clip": 1.02849257, "balance_loss_mlp": 1.0503732, "epoch": 0.34655042837817523, "flos": 23730884553600.0, "grad_norm": 1.8501739487613358, "language_loss": 0.83843935, "learning_rate": 3.0377569177889945e-06, "loss": 0.86028147, "num_input_tokens_seen": 124016375, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.87890625, "step": 5764, "time_per_iteration": 3.9756100177764893 }, { "auxiliary_loss_clip": 0.01132223, "auxiliary_loss_mlp": 0.01037949, "balance_loss_clip": 1.02264833, "balance_loss_mlp": 1.04776037, "epoch": 0.34661055163084326, "flos": 22054215553920.0, "grad_norm": 2.2780089557015386, "language_loss": 0.67425829, "learning_rate": 3.0374239675661722e-06, "loss": 0.69596004, "num_input_tokens_seen": 124033975, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.84375, "step": 5765, "time_per_iteration": 2.4711413383483887 }, { "auxiliary_loss_clip": 0.01136075, "auxiliary_loss_mlp": 0.01043098, "balance_loss_clip": 1.02788734, "balance_loss_mlp": 1.05305517, "epoch": 0.3466706748835112, "flos": 21799213925760.0, "grad_norm": 1.7661747348088266, "language_loss": 0.77005363, "learning_rate": 3.03709097800413e-06, "loss": 0.79184532, "num_input_tokens_seen": 124051930, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.828125, "step": 5766, "time_per_iteration": 2.464200019836426 }, { "auxiliary_loss_clip": 0.01130683, "auxiliary_loss_mlp": 0.01035407, "balance_loss_clip": 1.02060723, "balance_loss_mlp": 1.04784036, "epoch": 0.3467307981361792, "flos": 19461680547840.0, "grad_norm": 1.528021107917656, "language_loss": 0.73359317, "learning_rate": 3.0367579491154943e-06, "loss": 0.75525403, "num_input_tokens_seen": 124071220, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.828125, "step": 5767, "time_per_iteration": 2.461188793182373 }, { "auxiliary_loss_clip": 0.01135588, "auxiliary_loss_mlp": 0.01043907, "balance_loss_clip": 1.02734852, "balance_loss_mlp": 1.05073714, "epoch": 0.34679092138884715, "flos": 24827452905600.0, "grad_norm": 1.7668187980910754, "language_loss": 0.77861559, "learning_rate": 3.036424880912893e-06, "loss": 0.80041051, "num_input_tokens_seen": 124090140, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.84765625, "step": 5768, "time_per_iteration": 2.4920365810394287 }, { "auxiliary_loss_clip": 0.01045236, "auxiliary_loss_mlp": 0.0100252, "balance_loss_clip": 1.0007081, "balance_loss_mlp": 1.01858115, "epoch": 0.3468510446415151, "flos": 63236070149760.0, "grad_norm": 0.7677830940363665, "language_loss": 0.57525611, "learning_rate": 3.036091773408956e-06, "loss": 0.59573364, "num_input_tokens_seen": 124152025, "router_z_loss_clip": 0.01806641, "router_z_loss_mlp": 0.265625, "step": 5769, "time_per_iteration": 3.123196840286255 }, { "auxiliary_loss_clip": 0.01146657, "auxiliary_loss_mlp": 0.01043128, "balance_loss_clip": 1.02478147, "balance_loss_mlp": 1.05319488, "epoch": 0.3469111678941831, "flos": 12120713256960.0, "grad_norm": 2.4600278676595186, "language_loss": 0.86076516, "learning_rate": 3.0357586266163154e-06, "loss": 0.88266301, "num_input_tokens_seen": 124165795, "router_z_loss_clip": 0.18359375, "router_z_loss_mlp": 0.93359375, "step": 5770, "time_per_iteration": 2.424983024597168 }, { "auxiliary_loss_clip": 0.01042565, "auxiliary_loss_mlp": 0.01001507, "balance_loss_clip": 0.99948007, "balance_loss_mlp": 1.0159719, "epoch": 0.34697129114685105, "flos": 65934110378880.0, "grad_norm": 0.862998838715562, "language_loss": 0.5986402, "learning_rate": 3.0354254405476036e-06, "loss": 0.6190809, "num_input_tokens_seen": 124222925, "router_z_loss_clip": 0.02026367, "router_z_loss_mlp": 0.265625, "step": 5771, "time_per_iteration": 2.8743207454681396 }, { "auxiliary_loss_clip": 0.01132099, "auxiliary_loss_mlp": 0.01049043, "balance_loss_clip": 1.03315234, "balance_loss_mlp": 1.04824197, "epoch": 0.347031414399519, "flos": 34454205054720.0, "grad_norm": 2.021042259281646, "language_loss": 0.71632856, "learning_rate": 3.0350922152154557e-06, "loss": 0.73813999, "num_input_tokens_seen": 124240915, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.83984375, "step": 5772, "time_per_iteration": 2.614776849746704 }, { "auxiliary_loss_clip": 0.01132103, "auxiliary_loss_mlp": 0.01041158, "balance_loss_clip": 1.02492166, "balance_loss_mlp": 1.04711473, "epoch": 0.347091537652187, "flos": 26944135511040.0, "grad_norm": 1.638850346502792, "language_loss": 0.76515245, "learning_rate": 3.034758950632507e-06, "loss": 0.78688508, "num_input_tokens_seen": 124262770, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8515625, "step": 5773, "time_per_iteration": 2.528792381286621 }, { "auxiliary_loss_clip": 0.01134837, "auxiliary_loss_mlp": 0.01041513, "balance_loss_clip": 1.02514577, "balance_loss_mlp": 1.04919136, "epoch": 0.34715166090485494, "flos": 21142228216320.0, "grad_norm": 3.6586419162429222, "language_loss": 0.70372236, "learning_rate": 3.034425646811396e-06, "loss": 0.72548592, "num_input_tokens_seen": 124280950, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.859375, "step": 5774, "time_per_iteration": 2.4792683124542236 }, { "auxiliary_loss_clip": 0.01131669, "auxiliary_loss_mlp": 0.01038045, "balance_loss_clip": 1.02272058, "balance_loss_mlp": 1.04886723, "epoch": 0.3472117841575229, "flos": 23478001827840.0, "grad_norm": 1.5619976896125631, "language_loss": 0.75433266, "learning_rate": 3.0340923037647602e-06, "loss": 0.77602983, "num_input_tokens_seen": 124299540, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.828125, "step": 5775, "time_per_iteration": 2.5112719535827637 }, { "auxiliary_loss_clip": 0.01135438, "auxiliary_loss_mlp": 0.01041743, "balance_loss_clip": 1.0253402, "balance_loss_mlp": 1.04702878, "epoch": 0.34727190741019087, "flos": 17492806408320.0, "grad_norm": 2.2669717845240704, "language_loss": 0.77730787, "learning_rate": 3.0337589215052404e-06, "loss": 0.79907966, "num_input_tokens_seen": 124316285, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8828125, "step": 5776, "time_per_iteration": 2.4892961978912354 }, { "auxiliary_loss_clip": 0.01040528, "auxiliary_loss_mlp": 0.01006857, "balance_loss_clip": 1.00487828, "balance_loss_mlp": 1.01409423, "epoch": 0.34733203066285884, "flos": 65265491640960.0, "grad_norm": 0.8389365787725632, "language_loss": 0.63329208, "learning_rate": 3.033425500045478e-06, "loss": 0.65376604, "num_input_tokens_seen": 124376650, "router_z_loss_clip": 0.01977539, "router_z_loss_mlp": 0.265625, "step": 5777, "time_per_iteration": 3.1099839210510254 }, { "auxiliary_loss_clip": 0.01132932, "auxiliary_loss_mlp": 0.01044713, "balance_loss_clip": 1.02830982, "balance_loss_mlp": 1.04683423, "epoch": 0.3473921539155268, "flos": 28658726294400.0, "grad_norm": 2.1458514730763083, "language_loss": 0.64428544, "learning_rate": 3.033092039398119e-06, "loss": 0.66606194, "num_input_tokens_seen": 124396475, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.859375, "step": 5778, "time_per_iteration": 2.572571277618408 }, { "auxiliary_loss_clip": 0.01133944, "auxiliary_loss_mlp": 0.01048597, "balance_loss_clip": 1.03264666, "balance_loss_mlp": 1.04606283, "epoch": 0.3474522771681948, "flos": 40836895355520.0, "grad_norm": 2.391706926888959, "language_loss": 0.7075178, "learning_rate": 3.0327585395758046e-06, "loss": 0.72934324, "num_input_tokens_seen": 124416480, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.87890625, "step": 5779, "time_per_iteration": 2.6195132732391357 }, { "auxiliary_loss_clip": 0.01138658, "auxiliary_loss_mlp": 0.01049084, "balance_loss_clip": 1.03293133, "balance_loss_mlp": 1.04915166, "epoch": 0.3475124004208628, "flos": 24608577381120.0, "grad_norm": 2.503286414069963, "language_loss": 0.62661833, "learning_rate": 3.0324250005911837e-06, "loss": 0.64849573, "num_input_tokens_seen": 124435950, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.890625, "step": 5780, "time_per_iteration": 2.5394973754882812 }, { "auxiliary_loss_clip": 0.01132974, "auxiliary_loss_mlp": 0.01040716, "balance_loss_clip": 1.02542126, "balance_loss_mlp": 1.04781461, "epoch": 0.34757252367353075, "flos": 22711309004160.0, "grad_norm": 1.5960228546317234, "language_loss": 0.7226218, "learning_rate": 3.0320914224569033e-06, "loss": 0.74435866, "num_input_tokens_seen": 124455410, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8515625, "step": 5781, "time_per_iteration": 2.5047659873962402 }, { "auxiliary_loss_clip": 0.01136691, "auxiliary_loss_mlp": 0.01047801, "balance_loss_clip": 1.03095114, "balance_loss_mlp": 1.04939699, "epoch": 0.3476326469261987, "flos": 19828184970240.0, "grad_norm": 2.230203508517906, "language_loss": 0.76768702, "learning_rate": 3.031757805185612e-06, "loss": 0.78953195, "num_input_tokens_seen": 124474870, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.875, "step": 5782, "time_per_iteration": 2.478447914123535 }, { "auxiliary_loss_clip": 0.01131493, "auxiliary_loss_mlp": 0.0103673, "balance_loss_clip": 1.0209409, "balance_loss_mlp": 1.04651725, "epoch": 0.3476927701788667, "flos": 19938107566080.0, "grad_norm": 1.848715396076189, "language_loss": 0.62327409, "learning_rate": 3.0314241487899622e-06, "loss": 0.64495635, "num_input_tokens_seen": 124494105, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8515625, "step": 5783, "time_per_iteration": 2.4671151638031006 }, { "auxiliary_loss_clip": 0.01127395, "auxiliary_loss_mlp": 0.01028402, "balance_loss_clip": 1.01423979, "balance_loss_mlp": 1.04510713, "epoch": 0.34775289343153465, "flos": 20735108490240.0, "grad_norm": 1.6548775454251232, "language_loss": 0.88619113, "learning_rate": 3.031090453282605e-06, "loss": 0.90774912, "num_input_tokens_seen": 124512030, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.82421875, "step": 5784, "time_per_iteration": 2.505087375640869 }, { "auxiliary_loss_clip": 0.01129692, "auxiliary_loss_mlp": 0.01032836, "balance_loss_clip": 1.017887, "balance_loss_mlp": 1.04634559, "epoch": 0.3478130166842026, "flos": 19354846521600.0, "grad_norm": 1.721086853759888, "language_loss": 0.81230283, "learning_rate": 3.0307567186761946e-06, "loss": 0.83392811, "num_input_tokens_seen": 124530980, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8359375, "step": 5785, "time_per_iteration": 2.4466960430145264 }, { "auxiliary_loss_clip": 0.0113737, "auxiliary_loss_mlp": 0.01037439, "balance_loss_clip": 1.02263355, "balance_loss_mlp": 1.05235672, "epoch": 0.3478731399368706, "flos": 22051198811520.0, "grad_norm": 1.910777545006738, "language_loss": 0.80497181, "learning_rate": 3.0304229449833862e-06, "loss": 0.82671988, "num_input_tokens_seen": 124549330, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.84765625, "step": 5786, "time_per_iteration": 2.5351860523223877 }, { "auxiliary_loss_clip": 0.01131803, "auxiliary_loss_mlp": 0.01036776, "balance_loss_clip": 1.02088571, "balance_loss_mlp": 1.04856944, "epoch": 0.34793326318953854, "flos": 18041449720320.0, "grad_norm": 1.8316552089113873, "language_loss": 0.74663019, "learning_rate": 3.030089132216836e-06, "loss": 0.76831591, "num_input_tokens_seen": 124567200, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83203125, "step": 5787, "time_per_iteration": 2.4825432300567627 }, { "auxiliary_loss_clip": 0.01130701, "auxiliary_loss_mlp": 0.01034948, "balance_loss_clip": 1.01951659, "balance_loss_mlp": 1.04554152, "epoch": 0.3479933864422065, "flos": 29314670509440.0, "grad_norm": 1.8355715739588312, "language_loss": 0.80974853, "learning_rate": 3.029755280389203e-06, "loss": 0.8314051, "num_input_tokens_seen": 124587025, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8515625, "step": 5788, "time_per_iteration": 2.650613307952881 }, { "auxiliary_loss_clip": 0.01137995, "auxiliary_loss_mlp": 0.01034786, "balance_loss_clip": 1.01907432, "balance_loss_mlp": 1.04928255, "epoch": 0.3480535096948745, "flos": 20120713332480.0, "grad_norm": 1.8811878723314985, "language_loss": 0.85562515, "learning_rate": 3.029421389513147e-06, "loss": 0.87735295, "num_input_tokens_seen": 124605860, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.890625, "step": 5789, "time_per_iteration": 2.4596152305603027 }, { "auxiliary_loss_clip": 0.01136, "auxiliary_loss_mlp": 0.01052325, "balance_loss_clip": 1.03635108, "balance_loss_mlp": 1.04838657, "epoch": 0.34811363294754244, "flos": 18548974938240.0, "grad_norm": 1.879309599133083, "language_loss": 0.85186374, "learning_rate": 3.029087459601328e-06, "loss": 0.87374699, "num_input_tokens_seen": 124624270, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.875, "step": 5790, "time_per_iteration": 2.4551985263824463 }, { "auxiliary_loss_clip": 0.01133869, "auxiliary_loss_mlp": 0.01044002, "balance_loss_clip": 1.02841544, "balance_loss_mlp": 1.04886198, "epoch": 0.3481737562002104, "flos": 26870303105280.0, "grad_norm": 2.0481768258507076, "language_loss": 0.81226408, "learning_rate": 3.0287534906664097e-06, "loss": 0.83404279, "num_input_tokens_seen": 124644005, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8515625, "step": 5791, "time_per_iteration": 2.5076096057891846 }, { "auxiliary_loss_clip": 0.0113238, "auxiliary_loss_mlp": 0.01038392, "balance_loss_clip": 1.02218008, "balance_loss_mlp": 1.04554534, "epoch": 0.3482338794528784, "flos": 28908664104960.0, "grad_norm": 1.782736543206293, "language_loss": 0.77929568, "learning_rate": 3.028419482721056e-06, "loss": 0.8010034, "num_input_tokens_seen": 124663020, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8671875, "step": 5792, "time_per_iteration": 2.5843281745910645 }, { "auxiliary_loss_clip": 0.01128948, "auxiliary_loss_mlp": 0.01031472, "balance_loss_clip": 1.01586199, "balance_loss_mlp": 1.04461563, "epoch": 0.3482940027055464, "flos": 22200767043840.0, "grad_norm": 1.59399532280634, "language_loss": 0.81342953, "learning_rate": 3.0280854357779325e-06, "loss": 0.83503377, "num_input_tokens_seen": 124682975, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.84375, "step": 5793, "time_per_iteration": 2.4694485664367676 }, { "auxiliary_loss_clip": 0.01133328, "auxiliary_loss_mlp": 0.01050369, "balance_loss_clip": 1.03457355, "balance_loss_mlp": 1.04755151, "epoch": 0.34835412595821436, "flos": 20302708567680.0, "grad_norm": 1.9713366843715416, "language_loss": 0.75881112, "learning_rate": 3.027751349849706e-06, "loss": 0.78064811, "num_input_tokens_seen": 124701340, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.85546875, "step": 5794, "time_per_iteration": 2.4877965450286865 }, { "auxiliary_loss_clip": 0.01130999, "auxiliary_loss_mlp": 0.01036534, "balance_loss_clip": 1.02110863, "balance_loss_mlp": 1.04726255, "epoch": 0.3484142492108823, "flos": 20449691020800.0, "grad_norm": 3.2384148999232805, "language_loss": 0.5768432, "learning_rate": 3.0274172249490456e-06, "loss": 0.59851849, "num_input_tokens_seen": 124719165, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8359375, "step": 5795, "time_per_iteration": 2.4816131591796875 }, { "auxiliary_loss_clip": 0.01129639, "auxiliary_loss_mlp": 0.01038372, "balance_loss_clip": 1.02336371, "balance_loss_mlp": 1.04665256, "epoch": 0.3484743724635503, "flos": 24352929308160.0, "grad_norm": 3.6396053828016877, "language_loss": 0.82605398, "learning_rate": 3.0270830610886213e-06, "loss": 0.84773415, "num_input_tokens_seen": 124738670, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.828125, "step": 5796, "time_per_iteration": 2.527686595916748 }, { "auxiliary_loss_clip": 0.01128567, "auxiliary_loss_mlp": 0.01030655, "balance_loss_clip": 1.01593268, "balance_loss_mlp": 1.04702115, "epoch": 0.34853449571621825, "flos": 24353001135360.0, "grad_norm": 2.5282476755814147, "language_loss": 0.83648622, "learning_rate": 3.0267488582811033e-06, "loss": 0.85807848, "num_input_tokens_seen": 124758760, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.81640625, "step": 5797, "time_per_iteration": 2.4919211864471436 }, { "auxiliary_loss_clip": 0.01126435, "auxiliary_loss_mlp": 0.01035151, "balance_loss_clip": 1.01951098, "balance_loss_mlp": 1.04488814, "epoch": 0.3485946189688862, "flos": 27267690245760.0, "grad_norm": 1.7827671507946263, "language_loss": 0.7314353, "learning_rate": 3.026414616539167e-06, "loss": 0.75305116, "num_input_tokens_seen": 124777765, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8125, "step": 5798, "time_per_iteration": 2.5244545936584473 }, { "auxiliary_loss_clip": 0.01131175, "auxiliary_loss_mlp": 0.01041579, "balance_loss_clip": 1.02548575, "balance_loss_mlp": 1.04580069, "epoch": 0.3486547422215542, "flos": 20156695781760.0, "grad_norm": 1.9162479731714994, "language_loss": 0.7592591, "learning_rate": 3.026080335875485e-06, "loss": 0.78098667, "num_input_tokens_seen": 124796775, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.85546875, "step": 5799, "time_per_iteration": 2.496189594268799 }, { "auxiliary_loss_clip": 0.01128843, "auxiliary_loss_mlp": 0.01032109, "balance_loss_clip": 1.01735055, "balance_loss_mlp": 1.04565084, "epoch": 0.34871486547422215, "flos": 20230348619520.0, "grad_norm": 1.682747069158681, "language_loss": 0.75607026, "learning_rate": 3.025746016302734e-06, "loss": 0.77767974, "num_input_tokens_seen": 124815825, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.83203125, "step": 5800, "time_per_iteration": 3.948640823364258 }, { "auxiliary_loss_clip": 0.01136064, "auxiliary_loss_mlp": 0.01044578, "balance_loss_clip": 1.02773416, "balance_loss_mlp": 1.04781091, "epoch": 0.3487749887268901, "flos": 44053234882560.0, "grad_norm": 1.7817022948226255, "language_loss": 0.67046505, "learning_rate": 3.025411657833591e-06, "loss": 0.69227147, "num_input_tokens_seen": 124838420, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8828125, "step": 5801, "time_per_iteration": 2.6687612533569336 }, { "auxiliary_loss_clip": 0.01129491, "auxiliary_loss_mlp": 0.01043702, "balance_loss_clip": 1.02868176, "balance_loss_mlp": 1.04698503, "epoch": 0.3488351119795581, "flos": 23295144666240.0, "grad_norm": 1.795008286787812, "language_loss": 0.76438236, "learning_rate": 3.025077260480735e-06, "loss": 0.78611422, "num_input_tokens_seen": 124857320, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.82421875, "step": 5802, "time_per_iteration": 2.4659385681152344 }, { "auxiliary_loss_clip": 0.01123882, "auxiliary_loss_mlp": 0.01038204, "balance_loss_clip": 1.02361298, "balance_loss_mlp": 1.04405665, "epoch": 0.34889523523222604, "flos": 19934839428480.0, "grad_norm": 1.7359543355790736, "language_loss": 0.7912004, "learning_rate": 3.0247428242568474e-06, "loss": 0.81282127, "num_input_tokens_seen": 124875685, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.796875, "step": 5803, "time_per_iteration": 3.859504461288452 }, { "auxiliary_loss_clip": 0.01131688, "auxiliary_loss_mlp": 0.01034846, "balance_loss_clip": 1.02051723, "balance_loss_mlp": 1.04518807, "epoch": 0.348955358484894, "flos": 30446179816320.0, "grad_norm": 2.3484548052091703, "language_loss": 0.6760726, "learning_rate": 3.0244083491746085e-06, "loss": 0.69773793, "num_input_tokens_seen": 124895960, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.86328125, "step": 5804, "time_per_iteration": 3.921285390853882 }, { "auxiliary_loss_clip": 0.0112781, "auxiliary_loss_mlp": 0.01043389, "balance_loss_clip": 1.02816033, "balance_loss_mlp": 1.04857373, "epoch": 0.349015481737562, "flos": 17999972490240.0, "grad_norm": 1.9230874840047643, "language_loss": 0.75988114, "learning_rate": 3.024073835246702e-06, "loss": 0.78159314, "num_input_tokens_seen": 124914140, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.79296875, "step": 5805, "time_per_iteration": 3.8519740104675293 }, { "auxiliary_loss_clip": 0.01130435, "auxiliary_loss_mlp": 0.0103975, "balance_loss_clip": 1.02464032, "balance_loss_mlp": 1.04735208, "epoch": 0.34907560499023, "flos": 27198490694400.0, "grad_norm": 2.860751776900879, "language_loss": 0.67106533, "learning_rate": 3.023739282485814e-06, "loss": 0.69276714, "num_input_tokens_seen": 124934180, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.83203125, "step": 5806, "time_per_iteration": 2.5207149982452393 }, { "auxiliary_loss_clip": 0.01133852, "auxiliary_loss_mlp": 0.01037487, "balance_loss_clip": 1.02221656, "balance_loss_mlp": 1.05027139, "epoch": 0.34913572824289796, "flos": 30226873328640.0, "grad_norm": 1.531613328964626, "language_loss": 0.72082108, "learning_rate": 3.023404690904629e-06, "loss": 0.74253446, "num_input_tokens_seen": 124956060, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8359375, "step": 5807, "time_per_iteration": 2.5419211387634277 }, { "auxiliary_loss_clip": 0.01129742, "auxiliary_loss_mlp": 0.01038703, "balance_loss_clip": 1.02281213, "balance_loss_mlp": 1.04458523, "epoch": 0.3491958514955659, "flos": 29971907614080.0, "grad_norm": 2.115604615197572, "language_loss": 0.73812962, "learning_rate": 3.0230700605158364e-06, "loss": 0.75981402, "num_input_tokens_seen": 124976070, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8515625, "step": 5808, "time_per_iteration": 2.5376429557800293 }, { "auxiliary_loss_clip": 0.01126073, "auxiliary_loss_mlp": 0.01045243, "balance_loss_clip": 1.03081846, "balance_loss_mlp": 1.04670775, "epoch": 0.3492559747482339, "flos": 22783273902720.0, "grad_norm": 1.4980503613480947, "language_loss": 0.84625483, "learning_rate": 3.0227353913321238e-06, "loss": 0.86796802, "num_input_tokens_seen": 124996995, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.79296875, "step": 5809, "time_per_iteration": 2.504009962081909 }, { "auxiliary_loss_clip": 0.01125206, "auxiliary_loss_mlp": 0.01036741, "balance_loss_clip": 1.02314544, "balance_loss_mlp": 1.04685462, "epoch": 0.34931609800090185, "flos": 26068022881920.0, "grad_norm": 2.074802115444058, "language_loss": 0.80388212, "learning_rate": 3.0224006833661835e-06, "loss": 0.82550156, "num_input_tokens_seen": 125015600, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78125, "step": 5810, "time_per_iteration": 2.508547782897949 }, { "auxiliary_loss_clip": 0.01127883, "auxiliary_loss_mlp": 0.01044968, "balance_loss_clip": 1.03083563, "balance_loss_mlp": 1.04563618, "epoch": 0.3493762212535698, "flos": 29242023252480.0, "grad_norm": 1.766641490680919, "language_loss": 0.75391167, "learning_rate": 3.0220659366307057e-06, "loss": 0.77564013, "num_input_tokens_seen": 125035290, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.82421875, "step": 5811, "time_per_iteration": 2.5800247192382812 }, { "auxiliary_loss_clip": 0.01131614, "auxiliary_loss_mlp": 0.01038799, "balance_loss_clip": 1.02466679, "balance_loss_mlp": 1.04664326, "epoch": 0.3494363445062378, "flos": 27126058919040.0, "grad_norm": 2.243302403551748, "language_loss": 0.80229449, "learning_rate": 3.021731151138386e-06, "loss": 0.82399857, "num_input_tokens_seen": 125057130, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.84765625, "step": 5812, "time_per_iteration": 2.5089831352233887 }, { "auxiliary_loss_clip": 0.01126158, "auxiliary_loss_mlp": 0.01038094, "balance_loss_clip": 1.0226326, "balance_loss_mlp": 1.04277849, "epoch": 0.34949646775890575, "flos": 12276207233280.0, "grad_norm": 1.8935281879167716, "language_loss": 0.69720298, "learning_rate": 3.021396326901918e-06, "loss": 0.71884549, "num_input_tokens_seen": 125073720, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8359375, "step": 5813, "time_per_iteration": 2.465606689453125 }, { "auxiliary_loss_clip": 0.01123358, "auxiliary_loss_mlp": 0.01035526, "balance_loss_clip": 1.02129269, "balance_loss_mlp": 1.0438112, "epoch": 0.3495565910115737, "flos": 17165516659200.0, "grad_norm": 1.9508996008729418, "language_loss": 0.76266462, "learning_rate": 3.0210614639339998e-06, "loss": 0.78425342, "num_input_tokens_seen": 125090635, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.796875, "step": 5814, "time_per_iteration": 2.483783483505249 }, { "auxiliary_loss_clip": 0.01129452, "auxiliary_loss_mlp": 0.01045297, "balance_loss_clip": 1.02940643, "balance_loss_mlp": 1.04466856, "epoch": 0.3496167142642417, "flos": 26465661417600.0, "grad_norm": 1.66993917619739, "language_loss": 0.84668076, "learning_rate": 3.020726562247328e-06, "loss": 0.86842823, "num_input_tokens_seen": 125110070, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.84765625, "step": 5815, "time_per_iteration": 2.5243732929229736 }, { "auxiliary_loss_clip": 0.01126721, "auxiliary_loss_mlp": 0.01034966, "balance_loss_clip": 1.02123272, "balance_loss_mlp": 1.0439626, "epoch": 0.34967683751690964, "flos": 17414843938560.0, "grad_norm": 2.480791726049618, "language_loss": 0.77393222, "learning_rate": 3.0203916218546024e-06, "loss": 0.79554909, "num_input_tokens_seen": 125125730, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.828125, "step": 5816, "time_per_iteration": 2.4339394569396973 }, { "auxiliary_loss_clip": 0.01135075, "auxiliary_loss_mlp": 0.01044297, "balance_loss_clip": 1.02928281, "balance_loss_mlp": 1.05043399, "epoch": 0.3497369607695776, "flos": 22600021691520.0, "grad_norm": 1.8305633057111452, "language_loss": 0.58904278, "learning_rate": 3.0200566427685246e-06, "loss": 0.61083651, "num_input_tokens_seen": 125146195, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.84765625, "step": 5817, "time_per_iteration": 2.534543514251709 }, { "auxiliary_loss_clip": 0.01042946, "auxiliary_loss_mlp": 0.01026468, "balance_loss_clip": 1.02421522, "balance_loss_mlp": 1.0165298, "epoch": 0.34979708402224563, "flos": 68529374818560.0, "grad_norm": 0.8845056733009554, "language_loss": 0.59891945, "learning_rate": 3.0197216250017975e-06, "loss": 0.61961359, "num_input_tokens_seen": 125207790, "router_z_loss_clip": 0.02258301, "router_z_loss_mlp": 0.26367188, "step": 5818, "time_per_iteration": 3.165062427520752 }, { "auxiliary_loss_clip": 0.01124932, "auxiliary_loss_mlp": 0.0103805, "balance_loss_clip": 1.02335167, "balance_loss_mlp": 1.0458653, "epoch": 0.3498572072749136, "flos": 18989634988800.0, "grad_norm": 1.700935006172828, "language_loss": 0.83045024, "learning_rate": 3.019386568567123e-06, "loss": 0.85208011, "num_input_tokens_seen": 125226220, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 5819, "time_per_iteration": 2.497955560684204 }, { "auxiliary_loss_clip": 0.01125851, "auxiliary_loss_mlp": 0.01030293, "balance_loss_clip": 1.01586866, "balance_loss_mlp": 1.04360723, "epoch": 0.34991733052758156, "flos": 27818883423360.0, "grad_norm": 2.2330902186191794, "language_loss": 0.71137083, "learning_rate": 3.0190514734772083e-06, "loss": 0.73293221, "num_input_tokens_seen": 125247485, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8203125, "step": 5820, "time_per_iteration": 2.5109739303588867 }, { "auxiliary_loss_clip": 0.01127632, "auxiliary_loss_mlp": 0.01031865, "balance_loss_clip": 1.01826882, "balance_loss_mlp": 1.04480875, "epoch": 0.3499774537802495, "flos": 33584197737600.0, "grad_norm": 1.6762186582179466, "language_loss": 0.70375252, "learning_rate": 3.018716339744759e-06, "loss": 0.7253474, "num_input_tokens_seen": 125268625, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.828125, "step": 5821, "time_per_iteration": 2.597611904144287 }, { "auxiliary_loss_clip": 0.01135078, "auxiliary_loss_mlp": 0.01042125, "balance_loss_clip": 1.02511334, "balance_loss_mlp": 1.04787934, "epoch": 0.3500375770329175, "flos": 23476744851840.0, "grad_norm": 1.8643803036736086, "language_loss": 0.73964024, "learning_rate": 3.0183811673824842e-06, "loss": 0.76141232, "num_input_tokens_seen": 125287530, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.87109375, "step": 5822, "time_per_iteration": 2.4929416179656982 }, { "auxiliary_loss_clip": 0.0112969, "auxiliary_loss_mlp": 0.0103481, "balance_loss_clip": 1.01910377, "balance_loss_mlp": 1.04520965, "epoch": 0.35009770028558546, "flos": 19026048401280.0, "grad_norm": 1.6671466836757192, "language_loss": 0.78747976, "learning_rate": 3.018045956403094e-06, "loss": 0.80912471, "num_input_tokens_seen": 125307020, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.84765625, "step": 5823, "time_per_iteration": 2.4806406497955322 }, { "auxiliary_loss_clip": 0.01041647, "auxiliary_loss_mlp": 0.01010793, "balance_loss_clip": 1.00833726, "balance_loss_mlp": 1.01502752, "epoch": 0.3501578235382534, "flos": 68351868783360.0, "grad_norm": 0.7279336342271966, "language_loss": 0.59251058, "learning_rate": 3.017710706819298e-06, "loss": 0.61303496, "num_input_tokens_seen": 125370445, "router_z_loss_clip": 0.02453613, "router_z_loss_mlp": 0.265625, "step": 5824, "time_per_iteration": 3.122225046157837 }, { "auxiliary_loss_clip": 0.01126957, "auxiliary_loss_mlp": 0.01034622, "balance_loss_clip": 1.01866031, "balance_loss_mlp": 1.04388142, "epoch": 0.3502179467909214, "flos": 21250893836160.0, "grad_norm": 1.8883193526367081, "language_loss": 0.8467738, "learning_rate": 3.017375418643811e-06, "loss": 0.86838961, "num_input_tokens_seen": 125388900, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.828125, "step": 5825, "time_per_iteration": 2.4806392192840576 }, { "auxiliary_loss_clip": 0.01129626, "auxiliary_loss_mlp": 0.01033271, "balance_loss_clip": 1.01803637, "balance_loss_mlp": 1.04663718, "epoch": 0.35027807004358935, "flos": 11942955826560.0, "grad_norm": 3.4954862768440647, "language_loss": 0.82859921, "learning_rate": 3.0170400918893464e-06, "loss": 0.85022819, "num_input_tokens_seen": 125402675, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.828125, "step": 5826, "time_per_iteration": 2.4076876640319824 }, { "auxiliary_loss_clip": 0.01130031, "auxiliary_loss_mlp": 0.01041137, "balance_loss_clip": 1.02575946, "balance_loss_mlp": 1.04495692, "epoch": 0.3503381932962573, "flos": 21470918595840.0, "grad_norm": 1.518091840331078, "language_loss": 0.8090384, "learning_rate": 3.0167047265686186e-06, "loss": 0.83075011, "num_input_tokens_seen": 125421360, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8515625, "step": 5827, "time_per_iteration": 2.479426383972168 }, { "auxiliary_loss_clip": 0.01129161, "auxiliary_loss_mlp": 0.0103422, "balance_loss_clip": 1.01949143, "balance_loss_mlp": 1.04629993, "epoch": 0.3503983165489253, "flos": 21251109317760.0, "grad_norm": 2.3756806985074643, "language_loss": 0.70455813, "learning_rate": 3.0163693226943467e-06, "loss": 0.726192, "num_input_tokens_seen": 125440000, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.828125, "step": 5828, "time_per_iteration": 2.456068515777588 }, { "auxiliary_loss_clip": 0.01134328, "auxiliary_loss_mlp": 0.01045048, "balance_loss_clip": 1.027632, "balance_loss_mlp": 1.04810548, "epoch": 0.35045843980159325, "flos": 27815723026560.0, "grad_norm": 3.48752631465084, "language_loss": 0.7952702, "learning_rate": 3.016033880279248e-06, "loss": 0.81706399, "num_input_tokens_seen": 125460390, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.86328125, "step": 5829, "time_per_iteration": 2.5388758182525635 }, { "auxiliary_loss_clip": 0.01135547, "auxiliary_loss_mlp": 0.01044004, "balance_loss_clip": 1.02761245, "balance_loss_mlp": 1.04749238, "epoch": 0.3505185630542612, "flos": 25921148169600.0, "grad_norm": 1.7668956531732494, "language_loss": 0.72008348, "learning_rate": 3.0156983993360417e-06, "loss": 0.74187899, "num_input_tokens_seen": 125478410, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8828125, "step": 5830, "time_per_iteration": 2.501549482345581 }, { "auxiliary_loss_clip": 0.01125366, "auxiliary_loss_mlp": 0.01034714, "balance_loss_clip": 1.01944375, "balance_loss_mlp": 1.04365361, "epoch": 0.35057868630692923, "flos": 20521763660160.0, "grad_norm": 2.1501504036037757, "language_loss": 0.88918686, "learning_rate": 3.0153628798774513e-06, "loss": 0.91078764, "num_input_tokens_seen": 125495975, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.81640625, "step": 5831, "time_per_iteration": 2.486159563064575 }, { "auxiliary_loss_clip": 0.01129896, "auxiliary_loss_mlp": 0.01044475, "balance_loss_clip": 1.02906108, "balance_loss_mlp": 1.04494333, "epoch": 0.3506388095595972, "flos": 20448649526400.0, "grad_norm": 7.033398029875035, "language_loss": 0.78503811, "learning_rate": 3.0150273219161985e-06, "loss": 0.80678177, "num_input_tokens_seen": 125515035, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84765625, "step": 5832, "time_per_iteration": 2.4540767669677734 }, { "auxiliary_loss_clip": 0.01130528, "auxiliary_loss_mlp": 0.01042907, "balance_loss_clip": 1.02646816, "balance_loss_mlp": 1.04571557, "epoch": 0.35069893281226516, "flos": 23109665811840.0, "grad_norm": 2.021923119959501, "language_loss": 0.70871425, "learning_rate": 3.014691725465008e-06, "loss": 0.7304486, "num_input_tokens_seen": 125535555, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.84765625, "step": 5833, "time_per_iteration": 2.5135927200317383 }, { "auxiliary_loss_clip": 0.01125673, "auxiliary_loss_mlp": 0.01035722, "balance_loss_clip": 1.02077913, "balance_loss_mlp": 1.04438519, "epoch": 0.35075905606493313, "flos": 27271999877760.0, "grad_norm": 1.3946982537947616, "language_loss": 0.81147331, "learning_rate": 3.014356090536606e-06, "loss": 0.83308727, "num_input_tokens_seen": 125558195, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8125, "step": 5834, "time_per_iteration": 2.509753942489624 }, { "auxiliary_loss_clip": 0.01131431, "auxiliary_loss_mlp": 0.01039912, "balance_loss_clip": 1.02395606, "balance_loss_mlp": 1.04771066, "epoch": 0.3508191793176011, "flos": 19128608709120.0, "grad_norm": 2.1069110048157556, "language_loss": 0.83409786, "learning_rate": 3.0140204171437183e-06, "loss": 0.85581124, "num_input_tokens_seen": 125575375, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8359375, "step": 5835, "time_per_iteration": 2.4636411666870117 }, { "auxiliary_loss_clip": 0.01126418, "auxiliary_loss_mlp": 0.01044415, "balance_loss_clip": 1.02994251, "balance_loss_mlp": 1.04417062, "epoch": 0.35087930257026906, "flos": 25557588662400.0, "grad_norm": 1.4904971293091578, "language_loss": 0.76374841, "learning_rate": 3.0136847052990754e-06, "loss": 0.78545672, "num_input_tokens_seen": 125596745, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8203125, "step": 5836, "time_per_iteration": 2.5114917755126953 }, { "auxiliary_loss_clip": 0.0113057, "auxiliary_loss_mlp": 0.01042041, "balance_loss_clip": 1.02669299, "balance_loss_mlp": 1.04763317, "epoch": 0.350939425822937, "flos": 18004246208640.0, "grad_norm": 1.9069739083285373, "language_loss": 0.77527326, "learning_rate": 3.0133489550154074e-06, "loss": 0.79699939, "num_input_tokens_seen": 125613980, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.828125, "step": 5837, "time_per_iteration": 2.5073258876800537 }, { "auxiliary_loss_clip": 0.01128115, "auxiliary_loss_mlp": 0.01040491, "balance_loss_clip": 1.02542305, "balance_loss_mlp": 1.04470122, "epoch": 0.350999549075605, "flos": 22273198819200.0, "grad_norm": 1.7471461401076376, "language_loss": 0.68113565, "learning_rate": 3.0130131663054442e-06, "loss": 0.70282173, "num_input_tokens_seen": 125632100, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8359375, "step": 5838, "time_per_iteration": 2.456247091293335 }, { "auxiliary_loss_clip": 0.01127114, "auxiliary_loss_mlp": 0.01038264, "balance_loss_clip": 1.02217042, "balance_loss_mlp": 1.04340589, "epoch": 0.35105967232827295, "flos": 14392279307520.0, "grad_norm": 2.087043731943096, "language_loss": 0.83683598, "learning_rate": 3.0126773391819215e-06, "loss": 0.85848975, "num_input_tokens_seen": 125649190, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8359375, "step": 5839, "time_per_iteration": 2.448052167892456 }, { "auxiliary_loss_clip": 0.01132238, "auxiliary_loss_mlp": 0.01037964, "balance_loss_clip": 1.02196622, "balance_loss_mlp": 1.04529572, "epoch": 0.3511197955809409, "flos": 25082346792960.0, "grad_norm": 1.811249136738239, "language_loss": 0.58572054, "learning_rate": 3.012341473657572e-06, "loss": 0.60742259, "num_input_tokens_seen": 125668680, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8671875, "step": 5840, "time_per_iteration": 2.4940006732940674 }, { "auxiliary_loss_clip": 0.01131009, "auxiliary_loss_mlp": 0.01039857, "balance_loss_clip": 1.02437186, "balance_loss_mlp": 1.04661727, "epoch": 0.3511799188336089, "flos": 25884160139520.0, "grad_norm": 2.9359659664550177, "language_loss": 0.87674212, "learning_rate": 3.0120055697451322e-06, "loss": 0.89845085, "num_input_tokens_seen": 125686935, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.84375, "step": 5841, "time_per_iteration": 3.968937873840332 }, { "auxiliary_loss_clip": 0.01136064, "auxiliary_loss_mlp": 0.01040329, "balance_loss_clip": 1.02249551, "balance_loss_mlp": 1.04882395, "epoch": 0.35124004208627685, "flos": 20083725302400.0, "grad_norm": 2.4213411529823747, "language_loss": 0.75014472, "learning_rate": 3.0116696274573406e-06, "loss": 0.77190864, "num_input_tokens_seen": 125707180, "router_z_loss_clip": 0.17773438, "router_z_loss_mlp": 0.87109375, "step": 5842, "time_per_iteration": 2.5012900829315186 }, { "auxiliary_loss_clip": 0.01131262, "auxiliary_loss_mlp": 0.01041502, "balance_loss_clip": 1.02587366, "balance_loss_mlp": 1.04543018, "epoch": 0.3513001653389448, "flos": 17783431349760.0, "grad_norm": 1.9262375713117224, "language_loss": 0.68305087, "learning_rate": 3.0113336468069346e-06, "loss": 0.70477849, "num_input_tokens_seen": 125722780, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.859375, "step": 5843, "time_per_iteration": 2.4120607376098633 }, { "auxiliary_loss_clip": 0.01129685, "auxiliary_loss_mlp": 0.01043533, "balance_loss_clip": 1.0274812, "balance_loss_mlp": 1.04635322, "epoch": 0.3513602885916128, "flos": 29387138198400.0, "grad_norm": 2.2873676665108884, "language_loss": 0.65380901, "learning_rate": 3.010997627806655e-06, "loss": 0.67554122, "num_input_tokens_seen": 125742110, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8359375, "step": 5844, "time_per_iteration": 2.522756576538086 }, { "auxiliary_loss_clip": 0.01132562, "auxiliary_loss_mlp": 0.01043525, "balance_loss_clip": 1.02724075, "balance_loss_mlp": 1.04786813, "epoch": 0.3514204118442808, "flos": 16179876483840.0, "grad_norm": 2.867420100229134, "language_loss": 0.75119358, "learning_rate": 3.010661570469245e-06, "loss": 0.7729544, "num_input_tokens_seen": 125759980, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.84765625, "step": 5845, "time_per_iteration": 3.9334700107574463 }, { "auxiliary_loss_clip": 0.01130722, "auxiliary_loss_mlp": 0.01041889, "balance_loss_clip": 1.02591479, "balance_loss_mlp": 1.04795527, "epoch": 0.35148053509694877, "flos": 23834665923840.0, "grad_norm": 2.732240899169347, "language_loss": 0.73462689, "learning_rate": 3.0103254748074465e-06, "loss": 0.75635302, "num_input_tokens_seen": 125772660, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.828125, "step": 5846, "time_per_iteration": 3.8235862255096436 }, { "auxiliary_loss_clip": 0.0113351, "auxiliary_loss_mlp": 0.01039543, "balance_loss_clip": 1.02392626, "balance_loss_mlp": 1.04862046, "epoch": 0.35154065834961673, "flos": 20991295267200.0, "grad_norm": 1.8144672465444724, "language_loss": 0.7612654, "learning_rate": 3.0099893408340046e-06, "loss": 0.78299594, "num_input_tokens_seen": 125791935, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.84765625, "step": 5847, "time_per_iteration": 3.9295620918273926 }, { "auxiliary_loss_clip": 0.01131659, "auxiliary_loss_mlp": 0.01034171, "balance_loss_clip": 1.01915061, "balance_loss_mlp": 1.04735339, "epoch": 0.3516007816022847, "flos": 33255471444480.0, "grad_norm": 1.9058833694607726, "language_loss": 0.72264743, "learning_rate": 3.009653168561666e-06, "loss": 0.74430573, "num_input_tokens_seen": 125813455, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.84375, "step": 5848, "time_per_iteration": 2.6011455059051514 }, { "auxiliary_loss_clip": 0.01135848, "auxiliary_loss_mlp": 0.01049117, "balance_loss_clip": 1.03319037, "balance_loss_mlp": 1.05029607, "epoch": 0.35166090485495266, "flos": 11726953390080.0, "grad_norm": 9.474522812354389, "language_loss": 0.89007771, "learning_rate": 3.009316958003178e-06, "loss": 0.91192734, "num_input_tokens_seen": 125827660, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.85546875, "step": 5849, "time_per_iteration": 2.454706907272339 }, { "auxiliary_loss_clip": 0.01132262, "auxiliary_loss_mlp": 0.01035814, "balance_loss_clip": 1.02032912, "balance_loss_mlp": 1.0476259, "epoch": 0.3517210281076206, "flos": 22638446265600.0, "grad_norm": 2.141914497412386, "language_loss": 0.74911052, "learning_rate": 3.0089807091712897e-06, "loss": 0.77079129, "num_input_tokens_seen": 125846655, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84765625, "step": 5850, "time_per_iteration": 2.4747300148010254 }, { "auxiliary_loss_clip": 0.01129295, "auxiliary_loss_mlp": 0.01036219, "balance_loss_clip": 1.02066255, "balance_loss_mlp": 1.04733014, "epoch": 0.3517811513602886, "flos": 21322750993920.0, "grad_norm": 2.542058761990139, "language_loss": 0.75689834, "learning_rate": 3.0086444220787515e-06, "loss": 0.77855349, "num_input_tokens_seen": 125866290, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8203125, "step": 5851, "time_per_iteration": 2.479281187057495 }, { "auxiliary_loss_clip": 0.01134198, "auxiliary_loss_mlp": 0.01038082, "balance_loss_clip": 1.02196491, "balance_loss_mlp": 1.04873514, "epoch": 0.35184127461295656, "flos": 21032880238080.0, "grad_norm": 1.8978205563694357, "language_loss": 0.87424451, "learning_rate": 3.0083080967383165e-06, "loss": 0.89596736, "num_input_tokens_seen": 125884620, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.85546875, "step": 5852, "time_per_iteration": 2.4951088428497314 }, { "auxiliary_loss_clip": 0.01127781, "auxiliary_loss_mlp": 0.01034181, "balance_loss_clip": 1.01903534, "balance_loss_mlp": 1.04601932, "epoch": 0.3519013978656245, "flos": 22455265881600.0, "grad_norm": 2.1043504160549804, "language_loss": 0.67345178, "learning_rate": 3.007971733162737e-06, "loss": 0.69507146, "num_input_tokens_seen": 125902430, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.81640625, "step": 5853, "time_per_iteration": 2.4871296882629395 }, { "auxiliary_loss_clip": 0.0113182, "auxiliary_loss_mlp": 0.01034162, "balance_loss_clip": 1.01860547, "balance_loss_mlp": 1.04770184, "epoch": 0.3519615211182925, "flos": 13115295918720.0, "grad_norm": 2.015927583444519, "language_loss": 0.80997133, "learning_rate": 3.0076353313647686e-06, "loss": 0.83163118, "num_input_tokens_seen": 125920570, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.83984375, "step": 5854, "time_per_iteration": 2.4528417587280273 }, { "auxiliary_loss_clip": 0.01126931, "auxiliary_loss_mlp": 0.01036024, "balance_loss_clip": 1.02130771, "balance_loss_mlp": 1.04600716, "epoch": 0.35202164437096045, "flos": 19135144984320.0, "grad_norm": 1.99611556310401, "language_loss": 0.73242331, "learning_rate": 3.0072988913571666e-06, "loss": 0.75405276, "num_input_tokens_seen": 125939800, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8125, "step": 5855, "time_per_iteration": 2.5126075744628906 }, { "auxiliary_loss_clip": 0.01125657, "auxiliary_loss_mlp": 0.01037553, "balance_loss_clip": 1.02294374, "balance_loss_mlp": 1.04449892, "epoch": 0.3520817676236284, "flos": 26542187343360.0, "grad_norm": 2.603178193946073, "language_loss": 0.71371621, "learning_rate": 3.006962413152691e-06, "loss": 0.73534834, "num_input_tokens_seen": 125958720, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8125, "step": 5856, "time_per_iteration": 2.510755777359009 }, { "auxiliary_loss_clip": 0.01133441, "auxiliary_loss_mlp": 0.01041484, "balance_loss_clip": 1.02543831, "balance_loss_mlp": 1.04809093, "epoch": 0.3521418908762964, "flos": 44893472803200.0, "grad_norm": 2.030084042238158, "language_loss": 0.61388779, "learning_rate": 3.0066258967640987e-06, "loss": 0.63563704, "num_input_tokens_seen": 125984310, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8515625, "step": 5857, "time_per_iteration": 2.6716716289520264 }, { "auxiliary_loss_clip": 0.01128152, "auxiliary_loss_mlp": 0.01039385, "balance_loss_clip": 1.02381682, "balance_loss_mlp": 1.04552817, "epoch": 0.3522020141289644, "flos": 20187398931840.0, "grad_norm": 2.0377274691719975, "language_loss": 0.73524779, "learning_rate": 3.006289342204152e-06, "loss": 0.7569232, "num_input_tokens_seen": 126002410, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.828125, "step": 5858, "time_per_iteration": 2.453726291656494 }, { "auxiliary_loss_clip": 0.01130099, "auxiliary_loss_mlp": 0.01038551, "balance_loss_clip": 1.02334607, "balance_loss_mlp": 1.04546928, "epoch": 0.35226213738163237, "flos": 27563917708800.0, "grad_norm": 1.5864567972359418, "language_loss": 0.76039428, "learning_rate": 3.0059527494856126e-06, "loss": 0.78208083, "num_input_tokens_seen": 126022490, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.84375, "step": 5859, "time_per_iteration": 2.512127637863159 }, { "auxiliary_loss_clip": 0.01138449, "auxiliary_loss_mlp": 0.01043482, "balance_loss_clip": 1.02637589, "balance_loss_mlp": 1.04982793, "epoch": 0.35232226063430033, "flos": 22966310632320.0, "grad_norm": 1.7234056328014011, "language_loss": 0.71768463, "learning_rate": 3.0056161186212435e-06, "loss": 0.73950392, "num_input_tokens_seen": 126042895, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.88671875, "step": 5860, "time_per_iteration": 2.460073709487915 }, { "auxiliary_loss_clip": 0.01133102, "auxiliary_loss_mlp": 0.01042577, "balance_loss_clip": 1.02582824, "balance_loss_mlp": 1.0451448, "epoch": 0.3523823838869683, "flos": 19168290259200.0, "grad_norm": 2.261550794200152, "language_loss": 0.65943718, "learning_rate": 3.005279449623811e-06, "loss": 0.68119395, "num_input_tokens_seen": 126060130, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.87890625, "step": 5861, "time_per_iteration": 2.443087100982666 }, { "auxiliary_loss_clip": 0.01127132, "auxiliary_loss_mlp": 0.01033933, "balance_loss_clip": 1.01867473, "balance_loss_mlp": 1.04499364, "epoch": 0.35244250713963626, "flos": 17930988420480.0, "grad_norm": 1.885748062735199, "language_loss": 0.66562295, "learning_rate": 3.0049427425060815e-06, "loss": 0.68723363, "num_input_tokens_seen": 126077850, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 5862, "time_per_iteration": 2.4391703605651855 }, { "auxiliary_loss_clip": 0.0113161, "auxiliary_loss_mlp": 0.01043758, "balance_loss_clip": 1.02643657, "balance_loss_mlp": 1.04669869, "epoch": 0.35250263039230423, "flos": 21432529935360.0, "grad_norm": 2.112761127380694, "language_loss": 0.76940846, "learning_rate": 3.0046059972808215e-06, "loss": 0.79116219, "num_input_tokens_seen": 126095985, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.84765625, "step": 5863, "time_per_iteration": 2.4763457775115967 }, { "auxiliary_loss_clip": 0.0112983, "auxiliary_loss_mlp": 0.0103882, "balance_loss_clip": 1.02368093, "balance_loss_mlp": 1.04543281, "epoch": 0.3525627536449722, "flos": 27416863428480.0, "grad_norm": 1.8570435922178345, "language_loss": 0.75140083, "learning_rate": 3.0042692139608024e-06, "loss": 0.77308738, "num_input_tokens_seen": 126116070, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.84375, "step": 5864, "time_per_iteration": 2.512108325958252 }, { "auxiliary_loss_clip": 0.01127278, "auxiliary_loss_mlp": 0.01050747, "balance_loss_clip": 1.03559506, "balance_loss_mlp": 1.04347253, "epoch": 0.35262287689764016, "flos": 24789818430720.0, "grad_norm": 2.0963984958832413, "language_loss": 0.79408801, "learning_rate": 3.003932392558793e-06, "loss": 0.81586832, "num_input_tokens_seen": 126135205, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.83984375, "step": 5865, "time_per_iteration": 2.5007951259613037 }, { "auxiliary_loss_clip": 0.01135029, "auxiliary_loss_mlp": 0.01042649, "balance_loss_clip": 1.02634096, "balance_loss_mlp": 1.04850757, "epoch": 0.3526830001503081, "flos": 17821604528640.0, "grad_norm": 2.0500751349023165, "language_loss": 0.81160331, "learning_rate": 3.0035955330875677e-06, "loss": 0.8333801, "num_input_tokens_seen": 126151895, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.86328125, "step": 5866, "time_per_iteration": 2.435336112976074 }, { "auxiliary_loss_clip": 0.01136994, "auxiliary_loss_mlp": 0.01038388, "balance_loss_clip": 1.02103138, "balance_loss_mlp": 1.04611707, "epoch": 0.3527431234029761, "flos": 18078114528000.0, "grad_norm": 2.288673586890261, "language_loss": 0.83971596, "learning_rate": 3.0032586355598986e-06, "loss": 0.86146975, "num_input_tokens_seen": 126168515, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.91015625, "step": 5867, "time_per_iteration": 2.4393253326416016 }, { "auxiliary_loss_clip": 0.01130726, "auxiliary_loss_mlp": 0.01048114, "balance_loss_clip": 1.0316397, "balance_loss_mlp": 1.0451237, "epoch": 0.35280324665564405, "flos": 19427350124160.0, "grad_norm": 2.2536362402222205, "language_loss": 0.73969787, "learning_rate": 3.0029216999885613e-06, "loss": 0.76148629, "num_input_tokens_seen": 126186460, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.85546875, "step": 5868, "time_per_iteration": 2.462965965270996 }, { "auxiliary_loss_clip": 0.01133355, "auxiliary_loss_mlp": 0.01041864, "balance_loss_clip": 1.02531767, "balance_loss_mlp": 1.04678512, "epoch": 0.352863369908312, "flos": 21504027957120.0, "grad_norm": 1.817489229980544, "language_loss": 0.61682928, "learning_rate": 3.0025847263863327e-06, "loss": 0.63858151, "num_input_tokens_seen": 126206170, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8671875, "step": 5869, "time_per_iteration": 2.4716832637786865 }, { "auxiliary_loss_clip": 0.01129664, "auxiliary_loss_mlp": 0.01042342, "balance_loss_clip": 1.02624893, "balance_loss_mlp": 1.04457021, "epoch": 0.35292349316098, "flos": 22309504490880.0, "grad_norm": 2.0602691301185634, "language_loss": 0.74114037, "learning_rate": 3.0022477147659917e-06, "loss": 0.76286042, "num_input_tokens_seen": 126225605, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8515625, "step": 5870, "time_per_iteration": 2.457200288772583 }, { "auxiliary_loss_clip": 0.01127249, "auxiliary_loss_mlp": 0.01036598, "balance_loss_clip": 1.02046847, "balance_loss_mlp": 1.04309893, "epoch": 0.352983616413648, "flos": 33109745967360.0, "grad_norm": 1.7440905483314293, "language_loss": 0.71977592, "learning_rate": 3.001910665140316e-06, "loss": 0.74141431, "num_input_tokens_seen": 126250230, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.84375, "step": 5871, "time_per_iteration": 2.628674030303955 }, { "auxiliary_loss_clip": 0.01124007, "auxiliary_loss_mlp": 0.01037248, "balance_loss_clip": 1.0229069, "balance_loss_mlp": 1.04264092, "epoch": 0.35304373966631597, "flos": 18696603836160.0, "grad_norm": 1.9088622285968186, "language_loss": 0.73595411, "learning_rate": 3.0015735775220873e-06, "loss": 0.75756669, "num_input_tokens_seen": 126268315, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8125, "step": 5872, "time_per_iteration": 2.448789596557617 }, { "auxiliary_loss_clip": 0.01128537, "auxiliary_loss_mlp": 0.01043318, "balance_loss_clip": 1.02856016, "balance_loss_mlp": 1.04548407, "epoch": 0.35310386291898394, "flos": 23364954748800.0, "grad_norm": 2.0813297505367774, "language_loss": 0.82627022, "learning_rate": 3.001236451924089e-06, "loss": 0.84798884, "num_input_tokens_seen": 126288390, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.83203125, "step": 5873, "time_per_iteration": 2.4902405738830566 }, { "auxiliary_loss_clip": 0.01133745, "auxiliary_loss_mlp": 0.01045959, "balance_loss_clip": 1.02901959, "balance_loss_mlp": 1.04557467, "epoch": 0.3531639861716519, "flos": 24461954064000.0, "grad_norm": 1.9077680141213926, "language_loss": 0.66342366, "learning_rate": 3.000899288359104e-06, "loss": 0.68522072, "num_input_tokens_seen": 126305750, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8828125, "step": 5874, "time_per_iteration": 2.4957969188690186 }, { "auxiliary_loss_clip": 0.01047152, "auxiliary_loss_mlp": 0.01005919, "balance_loss_clip": 1.00383317, "balance_loss_mlp": 1.02088046, "epoch": 0.35322410942431987, "flos": 70312446881280.0, "grad_norm": 0.7781857621913962, "language_loss": 0.6159054, "learning_rate": 3.000562086839917e-06, "loss": 0.6364361, "num_input_tokens_seen": 126362495, "router_z_loss_clip": 0.02087402, "router_z_loss_mlp": 0.26171875, "step": 5875, "time_per_iteration": 3.023057222366333 }, { "auxiliary_loss_clip": 0.01129881, "auxiliary_loss_mlp": 0.01038949, "balance_loss_clip": 1.02403021, "balance_loss_mlp": 1.04625511, "epoch": 0.35328423267698783, "flos": 19820894509440.0, "grad_norm": 2.36129175053276, "language_loss": 0.80098265, "learning_rate": 3.0002248473793163e-06, "loss": 0.82267094, "num_input_tokens_seen": 126378320, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8359375, "step": 5876, "time_per_iteration": 2.453796148300171 }, { "auxiliary_loss_clip": 0.01047098, "auxiliary_loss_mlp": 0.01003282, "balance_loss_clip": 1.00118434, "balance_loss_mlp": 1.02077258, "epoch": 0.3533443559296558, "flos": 60826356391680.0, "grad_norm": 0.6882017458738773, "language_loss": 0.56770277, "learning_rate": 2.999887569990088e-06, "loss": 0.58820653, "num_input_tokens_seen": 126442735, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.26171875, "step": 5877, "time_per_iteration": 3.191944122314453 }, { "auxiliary_loss_clip": 0.01132005, "auxiliary_loss_mlp": 0.0103347, "balance_loss_clip": 1.01780581, "balance_loss_mlp": 1.04645264, "epoch": 0.35340447918232376, "flos": 24755775315840.0, "grad_norm": 1.593128537864418, "language_loss": 0.71756232, "learning_rate": 2.999550254685024e-06, "loss": 0.73921716, "num_input_tokens_seen": 126463090, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.85546875, "step": 5878, "time_per_iteration": 2.52748703956604 }, { "auxiliary_loss_clip": 0.01127104, "auxiliary_loss_mlp": 0.01040523, "balance_loss_clip": 1.02454853, "balance_loss_mlp": 1.04281974, "epoch": 0.3534646024349917, "flos": 21796304924160.0, "grad_norm": 5.695699178112265, "language_loss": 0.78700656, "learning_rate": 2.9992129014769136e-06, "loss": 0.8086828, "num_input_tokens_seen": 126482105, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84375, "step": 5879, "time_per_iteration": 2.4607975482940674 }, { "auxiliary_loss_clip": 0.01133952, "auxiliary_loss_mlp": 0.01045609, "balance_loss_clip": 1.02734637, "balance_loss_mlp": 1.04584146, "epoch": 0.3535247256876597, "flos": 20012119539840.0, "grad_norm": 2.4684851324600747, "language_loss": 0.63131964, "learning_rate": 2.9988755103785493e-06, "loss": 0.65311527, "num_input_tokens_seen": 126502125, "router_z_loss_clip": 0.18261719, "router_z_loss_mlp": 0.8828125, "step": 5880, "time_per_iteration": 2.4967310428619385 }, { "auxiliary_loss_clip": 0.01130717, "auxiliary_loss_mlp": 0.0103685, "balance_loss_clip": 1.02033961, "balance_loss_mlp": 1.0448513, "epoch": 0.35358484894032766, "flos": 18187929383040.0, "grad_norm": 2.224591122939583, "language_loss": 0.66045725, "learning_rate": 2.998538081402727e-06, "loss": 0.68213296, "num_input_tokens_seen": 126521950, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.859375, "step": 5881, "time_per_iteration": 2.4623780250549316 }, { "auxiliary_loss_clip": 0.01123537, "auxiliary_loss_mlp": 0.01032091, "balance_loss_clip": 1.01729107, "balance_loss_mlp": 1.04264069, "epoch": 0.3536449721929956, "flos": 22820369673600.0, "grad_norm": 1.3962436141005508, "language_loss": 0.75467014, "learning_rate": 2.998200614562239e-06, "loss": 0.7762264, "num_input_tokens_seen": 126542445, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80859375, "step": 5882, "time_per_iteration": 2.4969325065612793 }, { "auxiliary_loss_clip": 0.01131639, "auxiliary_loss_mlp": 0.01038465, "balance_loss_clip": 1.02234781, "balance_loss_mlp": 1.04591084, "epoch": 0.3537050954456636, "flos": 26432336574720.0, "grad_norm": 2.025073019886281, "language_loss": 0.71007872, "learning_rate": 2.9978631098698847e-06, "loss": 0.73177975, "num_input_tokens_seen": 126560690, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.85546875, "step": 5883, "time_per_iteration": 3.938652515411377 }, { "auxiliary_loss_clip": 0.01134298, "auxiliary_loss_mlp": 0.0104135, "balance_loss_clip": 1.02512586, "balance_loss_mlp": 1.04624557, "epoch": 0.3537652186983316, "flos": 17197153562880.0, "grad_norm": 5.679369788962536, "language_loss": 0.78136861, "learning_rate": 2.9975255673384614e-06, "loss": 0.80312514, "num_input_tokens_seen": 126577620, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.87890625, "step": 5884, "time_per_iteration": 2.4492909908294678 }, { "auxiliary_loss_clip": 0.0112712, "auxiliary_loss_mlp": 0.01037295, "balance_loss_clip": 1.02264404, "balance_loss_mlp": 1.04404998, "epoch": 0.3538253419509996, "flos": 19536769929600.0, "grad_norm": 2.5922737865573375, "language_loss": 0.75341409, "learning_rate": 2.9971879869807673e-06, "loss": 0.77505827, "num_input_tokens_seen": 126596235, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.828125, "step": 5885, "time_per_iteration": 2.441558599472046 }, { "auxiliary_loss_clip": 0.01130907, "auxiliary_loss_mlp": 0.01049232, "balance_loss_clip": 1.03284705, "balance_loss_mlp": 1.04413116, "epoch": 0.35388546520366754, "flos": 12128578335360.0, "grad_norm": 2.126347112352382, "language_loss": 0.834463, "learning_rate": 2.996850368809606e-06, "loss": 0.85626447, "num_input_tokens_seen": 126612830, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8671875, "step": 5886, "time_per_iteration": 2.4390013217926025 }, { "auxiliary_loss_clip": 0.0112827, "auxiliary_loss_mlp": 0.01037202, "balance_loss_clip": 1.02040505, "balance_loss_mlp": 1.04496968, "epoch": 0.3539455884563355, "flos": 19678149861120.0, "grad_norm": 2.242404930885164, "language_loss": 0.78356659, "learning_rate": 2.9965127128377787e-06, "loss": 0.80522132, "num_input_tokens_seen": 126630910, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.83203125, "step": 5887, "time_per_iteration": 3.8236961364746094 }, { "auxiliary_loss_clip": 0.01125844, "auxiliary_loss_mlp": 0.01042447, "balance_loss_clip": 1.02699733, "balance_loss_mlp": 1.04213881, "epoch": 0.35400571170900347, "flos": 18072045129600.0, "grad_norm": 2.051815161746513, "language_loss": 0.65784001, "learning_rate": 2.996175019078089e-06, "loss": 0.67952293, "num_input_tokens_seen": 126648365, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8359375, "step": 5888, "time_per_iteration": 3.839416980743408 }, { "auxiliary_loss_clip": 0.01130128, "auxiliary_loss_mlp": 0.01039015, "balance_loss_clip": 1.0242095, "balance_loss_mlp": 1.04678965, "epoch": 0.35406583496167143, "flos": 26068058795520.0, "grad_norm": 1.7253994962267358, "language_loss": 0.77016246, "learning_rate": 2.9958372875433437e-06, "loss": 0.7918539, "num_input_tokens_seen": 126667500, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.83203125, "step": 5889, "time_per_iteration": 3.9376184940338135 }, { "auxiliary_loss_clip": 0.01128811, "auxiliary_loss_mlp": 0.01040813, "balance_loss_clip": 1.02578092, "balance_loss_mlp": 1.04638124, "epoch": 0.3541259582143394, "flos": 19792453916160.0, "grad_norm": 1.9185875837046582, "language_loss": 0.81115174, "learning_rate": 2.9954995182463478e-06, "loss": 0.83284807, "num_input_tokens_seen": 126686820, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.82421875, "step": 5890, "time_per_iteration": 2.49838924407959 }, { "auxiliary_loss_clip": 0.0112614, "auxiliary_loss_mlp": 0.01036777, "balance_loss_clip": 1.02310383, "balance_loss_mlp": 1.04394317, "epoch": 0.35418608146700736, "flos": 24022084112640.0, "grad_norm": 1.7407824338302427, "language_loss": 0.79525834, "learning_rate": 2.99516171119991e-06, "loss": 0.8168875, "num_input_tokens_seen": 126706965, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.8203125, "step": 5891, "time_per_iteration": 2.4783966541290283 }, { "auxiliary_loss_clip": 0.0112745, "auxiliary_loss_mlp": 0.01045416, "balance_loss_clip": 1.02934611, "balance_loss_mlp": 1.04451394, "epoch": 0.35424620471967533, "flos": 12385770693120.0, "grad_norm": 2.3044365042818136, "language_loss": 0.7363838, "learning_rate": 2.9948238664168415e-06, "loss": 0.75811243, "num_input_tokens_seen": 126724015, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.828125, "step": 5892, "time_per_iteration": 2.473065137863159 }, { "auxiliary_loss_clip": 0.01130961, "auxiliary_loss_mlp": 0.01044682, "balance_loss_clip": 1.02875543, "balance_loss_mlp": 1.0465436, "epoch": 0.3543063279723433, "flos": 19673624747520.0, "grad_norm": 3.2062769300587255, "language_loss": 0.67155063, "learning_rate": 2.9944859839099518e-06, "loss": 0.69330716, "num_input_tokens_seen": 126737565, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.84375, "step": 5893, "time_per_iteration": 2.4465837478637695 }, { "auxiliary_loss_clip": 0.01127808, "auxiliary_loss_mlp": 0.01041115, "balance_loss_clip": 1.02545094, "balance_loss_mlp": 1.04519725, "epoch": 0.35436645122501126, "flos": 21909208348800.0, "grad_norm": 2.0415962299763697, "language_loss": 0.69885945, "learning_rate": 2.9941480636920533e-06, "loss": 0.72054869, "num_input_tokens_seen": 126756095, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.828125, "step": 5894, "time_per_iteration": 2.5000340938568115 }, { "auxiliary_loss_clip": 0.0112929, "auxiliary_loss_mlp": 0.01033301, "balance_loss_clip": 1.01841807, "balance_loss_mlp": 1.04679585, "epoch": 0.3544265744776792, "flos": 21719527603200.0, "grad_norm": 1.7628704243200264, "language_loss": 0.74965018, "learning_rate": 2.9938101057759615e-06, "loss": 0.77127612, "num_input_tokens_seen": 126775455, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.82421875, "step": 5895, "time_per_iteration": 2.4484119415283203 }, { "auxiliary_loss_clip": 0.01126903, "auxiliary_loss_mlp": 0.01039, "balance_loss_clip": 1.02374125, "balance_loss_mlp": 1.04441619, "epoch": 0.3544866977303472, "flos": 21213223447680.0, "grad_norm": 2.092549216770644, "language_loss": 0.8411634, "learning_rate": 2.993472110174491e-06, "loss": 0.86282241, "num_input_tokens_seen": 126792320, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.82421875, "step": 5896, "time_per_iteration": 2.4615044593811035 }, { "auxiliary_loss_clip": 0.01129608, "auxiliary_loss_mlp": 0.01050238, "balance_loss_clip": 1.03463912, "balance_loss_mlp": 1.04749095, "epoch": 0.35454682098301515, "flos": 29311402371840.0, "grad_norm": 1.7609373760730123, "language_loss": 0.70211965, "learning_rate": 2.9931340769004576e-06, "loss": 0.72391808, "num_input_tokens_seen": 126813680, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8203125, "step": 5897, "time_per_iteration": 2.5223147869110107 }, { "auxiliary_loss_clip": 0.01126549, "auxiliary_loss_mlp": 0.01041567, "balance_loss_clip": 1.02595019, "balance_loss_mlp": 1.04408407, "epoch": 0.3546069442356832, "flos": 24316587722880.0, "grad_norm": 2.052838281420549, "language_loss": 0.81575656, "learning_rate": 2.9927960059666816e-06, "loss": 0.83743769, "num_input_tokens_seen": 126834395, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.82421875, "step": 5898, "time_per_iteration": 2.5341947078704834 }, { "auxiliary_loss_clip": 0.01125289, "auxiliary_loss_mlp": 0.01041133, "balance_loss_clip": 1.02672648, "balance_loss_mlp": 1.04439485, "epoch": 0.35466706748835114, "flos": 22857285876480.0, "grad_norm": 1.7332739103956123, "language_loss": 0.74482435, "learning_rate": 2.9924578973859804e-06, "loss": 0.76648855, "num_input_tokens_seen": 126855145, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80859375, "step": 5899, "time_per_iteration": 2.4775075912475586 }, { "auxiliary_loss_clip": 0.01127856, "auxiliary_loss_mlp": 0.01041853, "balance_loss_clip": 1.02636814, "balance_loss_mlp": 1.04398143, "epoch": 0.3547271907410191, "flos": 28330107742080.0, "grad_norm": 1.7268261148731041, "language_loss": 0.79493093, "learning_rate": 2.9921197511711763e-06, "loss": 0.8166281, "num_input_tokens_seen": 126873790, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8359375, "step": 5900, "time_per_iteration": 2.547647714614868 }, { "auxiliary_loss_clip": 0.01128811, "auxiliary_loss_mlp": 0.01046765, "balance_loss_clip": 1.03065956, "balance_loss_mlp": 1.04588401, "epoch": 0.35478731399368707, "flos": 23514092017920.0, "grad_norm": 2.246848650157151, "language_loss": 0.81564027, "learning_rate": 2.991781567335093e-06, "loss": 0.83739609, "num_input_tokens_seen": 126892865, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.83203125, "step": 5901, "time_per_iteration": 2.4862616062164307 }, { "auxiliary_loss_clip": 0.01132827, "auxiliary_loss_mlp": 0.01043717, "balance_loss_clip": 1.02756429, "balance_loss_mlp": 1.04577017, "epoch": 0.35484743724635504, "flos": 18624315715200.0, "grad_norm": 2.0318832767219916, "language_loss": 0.75896239, "learning_rate": 2.9914433458905525e-06, "loss": 0.78072774, "num_input_tokens_seen": 126911935, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.87109375, "step": 5902, "time_per_iteration": 2.524082899093628 }, { "auxiliary_loss_clip": 0.01129128, "auxiliary_loss_mlp": 0.01039311, "balance_loss_clip": 1.02417135, "balance_loss_mlp": 1.04531264, "epoch": 0.354907560499023, "flos": 17384499924480.0, "grad_norm": 2.2437222979983527, "language_loss": 0.70749509, "learning_rate": 2.991105086850381e-06, "loss": 0.7291795, "num_input_tokens_seen": 126930040, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.83984375, "step": 5903, "time_per_iteration": 2.4364216327667236 }, { "auxiliary_loss_clip": 0.01132069, "auxiliary_loss_mlp": 0.01039343, "balance_loss_clip": 1.02335691, "balance_loss_mlp": 1.04500151, "epoch": 0.35496768375169097, "flos": 19208546426880.0, "grad_norm": 2.37237301642559, "language_loss": 0.74729824, "learning_rate": 2.9907667902274053e-06, "loss": 0.76901233, "num_input_tokens_seen": 126948390, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.87109375, "step": 5904, "time_per_iteration": 2.459299325942993 }, { "auxiliary_loss_clip": 0.01132443, "auxiliary_loss_mlp": 0.01045667, "balance_loss_clip": 1.03028321, "balance_loss_mlp": 1.04681337, "epoch": 0.35502780700435893, "flos": 18332792933760.0, "grad_norm": 2.3079854516641367, "language_loss": 0.78836793, "learning_rate": 2.9904284560344536e-06, "loss": 0.81014901, "num_input_tokens_seen": 126964905, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.859375, "step": 5905, "time_per_iteration": 2.4302942752838135 }, { "auxiliary_loss_clip": 0.01118216, "auxiliary_loss_mlp": 0.01036763, "balance_loss_clip": 1.02342355, "balance_loss_mlp": 1.04138494, "epoch": 0.3550879302570269, "flos": 15448555578240.0, "grad_norm": 1.798111351447343, "language_loss": 0.7235074, "learning_rate": 2.990090084284356e-06, "loss": 0.74505723, "num_input_tokens_seen": 126982000, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.765625, "step": 5906, "time_per_iteration": 2.4410595893859863 }, { "auxiliary_loss_clip": 0.01133234, "auxiliary_loss_mlp": 0.0103447, "balance_loss_clip": 1.01792347, "balance_loss_mlp": 1.04605341, "epoch": 0.35514805350969486, "flos": 21979197999360.0, "grad_norm": 2.4201501689269205, "language_loss": 0.74437934, "learning_rate": 2.9897516749899426e-06, "loss": 0.76605642, "num_input_tokens_seen": 126998390, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.87109375, "step": 5907, "time_per_iteration": 2.4546031951904297 }, { "auxiliary_loss_clip": 0.01125778, "auxiliary_loss_mlp": 0.01038766, "balance_loss_clip": 1.02276802, "balance_loss_mlp": 1.04308367, "epoch": 0.3552081767623628, "flos": 29861949104640.0, "grad_norm": 1.8074006441152144, "language_loss": 0.7546823, "learning_rate": 2.989413228164047e-06, "loss": 0.77632773, "num_input_tokens_seen": 127020220, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.828125, "step": 5908, "time_per_iteration": 2.5440499782562256 }, { "auxiliary_loss_clip": 0.01125938, "auxiliary_loss_mlp": 0.01042185, "balance_loss_clip": 1.02689052, "balance_loss_mlp": 1.04335606, "epoch": 0.3552683000150308, "flos": 26432264747520.0, "grad_norm": 1.8756150909924156, "language_loss": 0.67975277, "learning_rate": 2.989074743819502e-06, "loss": 0.70143402, "num_input_tokens_seen": 127038585, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.82421875, "step": 5909, "time_per_iteration": 2.5033421516418457 }, { "auxiliary_loss_clip": 0.01124537, "auxiliary_loss_mlp": 0.01038084, "balance_loss_clip": 1.02373743, "balance_loss_mlp": 1.04534972, "epoch": 0.35532842326769876, "flos": 19785989468160.0, "grad_norm": 2.279018573744385, "language_loss": 0.78594339, "learning_rate": 2.988736221969144e-06, "loss": 0.80756962, "num_input_tokens_seen": 127056215, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.79296875, "step": 5910, "time_per_iteration": 2.4684154987335205 }, { "auxiliary_loss_clip": 0.01130138, "auxiliary_loss_mlp": 0.01039872, "balance_loss_clip": 1.02322459, "balance_loss_mlp": 1.04463077, "epoch": 0.3553885465203668, "flos": 17239277237760.0, "grad_norm": 2.1681760361577003, "language_loss": 0.70814502, "learning_rate": 2.98839766262581e-06, "loss": 0.72984517, "num_input_tokens_seen": 127075825, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.85546875, "step": 5911, "time_per_iteration": 2.436216115951538 }, { "auxiliary_loss_clip": 0.01123714, "auxiliary_loss_mlp": 0.01033851, "balance_loss_clip": 1.01906323, "balance_loss_mlp": 1.0437125, "epoch": 0.35544866977303474, "flos": 14934350430720.0, "grad_norm": 2.3476006763590656, "language_loss": 0.86586148, "learning_rate": 2.9880590658023366e-06, "loss": 0.88743711, "num_input_tokens_seen": 127091205, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.796875, "step": 5912, "time_per_iteration": 2.464069128036499 }, { "auxiliary_loss_clip": 0.0112623, "auxiliary_loss_mlp": 0.01039974, "balance_loss_clip": 1.02484632, "balance_loss_mlp": 1.04465771, "epoch": 0.3555087930257027, "flos": 19756040503680.0, "grad_norm": 1.802831813783044, "language_loss": 0.76417232, "learning_rate": 2.9877204315115646e-06, "loss": 0.78583437, "num_input_tokens_seen": 127109210, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.81640625, "step": 5913, "time_per_iteration": 2.5038599967956543 }, { "auxiliary_loss_clip": 0.01128359, "auxiliary_loss_mlp": 0.01037718, "balance_loss_clip": 1.02232802, "balance_loss_mlp": 1.04820275, "epoch": 0.3555689162783707, "flos": 21068252156160.0, "grad_norm": 1.3794762019750735, "language_loss": 0.8253485, "learning_rate": 2.9873817597663353e-06, "loss": 0.8470093, "num_input_tokens_seen": 127128400, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.80078125, "step": 5914, "time_per_iteration": 2.491291046142578 }, { "auxiliary_loss_clip": 0.0112659, "auxiliary_loss_mlp": 0.01037232, "balance_loss_clip": 1.02167511, "balance_loss_mlp": 1.04429746, "epoch": 0.35562903953103864, "flos": 33069633454080.0, "grad_norm": 2.4902739142599297, "language_loss": 0.70007694, "learning_rate": 2.98704305057949e-06, "loss": 0.72171521, "num_input_tokens_seen": 127149965, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.82421875, "step": 5915, "time_per_iteration": 2.578418731689453 }, { "auxiliary_loss_clip": 0.01124329, "auxiliary_loss_mlp": 0.01039086, "balance_loss_clip": 1.02462006, "balance_loss_mlp": 1.04121685, "epoch": 0.3556891627837066, "flos": 20557853850240.0, "grad_norm": 1.8097827166501121, "language_loss": 0.76000273, "learning_rate": 2.9867043039638737e-06, "loss": 0.78163689, "num_input_tokens_seen": 127169865, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.828125, "step": 5916, "time_per_iteration": 2.4651288986206055 }, { "auxiliary_loss_clip": 0.01129106, "auxiliary_loss_mlp": 0.0103471, "balance_loss_clip": 1.02060175, "balance_loss_mlp": 1.0466944, "epoch": 0.35574928603637457, "flos": 20703327932160.0, "grad_norm": 2.475623313298482, "language_loss": 0.8857044, "learning_rate": 2.986365519932332e-06, "loss": 0.90734255, "num_input_tokens_seen": 127188075, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.82421875, "step": 5917, "time_per_iteration": 2.4590988159179688 }, { "auxiliary_loss_clip": 0.01128217, "auxiliary_loss_mlp": 0.010318, "balance_loss_clip": 1.01694632, "balance_loss_mlp": 1.04626775, "epoch": 0.35580940928904253, "flos": 15194595444480.0, "grad_norm": 4.54782764568952, "language_loss": 0.74717385, "learning_rate": 2.98602669849771e-06, "loss": 0.76877403, "num_input_tokens_seen": 127206065, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 5918, "time_per_iteration": 2.449838399887085 }, { "auxiliary_loss_clip": 0.01048707, "auxiliary_loss_mlp": 0.01003704, "balance_loss_clip": 1.00168955, "balance_loss_mlp": 1.02161074, "epoch": 0.3558695325417105, "flos": 58639145431680.0, "grad_norm": 0.922436930597252, "language_loss": 0.63797426, "learning_rate": 2.985687839672857e-06, "loss": 0.65849841, "num_input_tokens_seen": 127257885, "router_z_loss_clip": 0.0201416, "router_z_loss_mlp": 0.26953125, "step": 5919, "time_per_iteration": 2.849086046218872 }, { "auxiliary_loss_clip": 0.01129311, "auxiliary_loss_mlp": 0.01033389, "balance_loss_clip": 1.01798749, "balance_loss_mlp": 1.04394889, "epoch": 0.35592965579437846, "flos": 22018233104640.0, "grad_norm": 2.6266720035447766, "language_loss": 0.73845553, "learning_rate": 2.9853489434706223e-06, "loss": 0.76008254, "num_input_tokens_seen": 127275550, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.85546875, "step": 5920, "time_per_iteration": 2.47930645942688 }, { "auxiliary_loss_clip": 0.01124842, "auxiliary_loss_mlp": 0.01033603, "balance_loss_clip": 1.01872563, "balance_loss_mlp": 1.04368043, "epoch": 0.35598977904704643, "flos": 23367684182400.0, "grad_norm": 1.9520207678017634, "language_loss": 0.77008718, "learning_rate": 2.985010009903857e-06, "loss": 0.79167163, "num_input_tokens_seen": 127295110, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8125, "step": 5921, "time_per_iteration": 2.4692134857177734 }, { "auxiliary_loss_clip": 0.0112654, "auxiliary_loss_mlp": 0.01036451, "balance_loss_clip": 1.02244401, "balance_loss_mlp": 1.04455912, "epoch": 0.3560499022997144, "flos": 17785334770560.0, "grad_norm": 1.9190301962024368, "language_loss": 0.67805564, "learning_rate": 2.9846710389854133e-06, "loss": 0.69968557, "num_input_tokens_seen": 127312865, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.8203125, "step": 5922, "time_per_iteration": 2.4743173122406006 }, { "auxiliary_loss_clip": 0.01128679, "auxiliary_loss_mlp": 0.01035854, "balance_loss_clip": 1.02058327, "balance_loss_mlp": 1.04616976, "epoch": 0.35611002555238236, "flos": 20740459616640.0, "grad_norm": 2.427986408603291, "language_loss": 0.78885525, "learning_rate": 2.9843320307281454e-06, "loss": 0.81050062, "num_input_tokens_seen": 127331710, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.82421875, "step": 5923, "time_per_iteration": 2.458427906036377 }, { "auxiliary_loss_clip": 0.01128488, "auxiliary_loss_mlp": 0.01039686, "balance_loss_clip": 1.02562499, "balance_loss_mlp": 1.04615414, "epoch": 0.3561701488050504, "flos": 19462219251840.0, "grad_norm": 2.4627499896099025, "language_loss": 0.84943265, "learning_rate": 2.983992985144908e-06, "loss": 0.87111437, "num_input_tokens_seen": 127350950, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.82421875, "step": 5924, "time_per_iteration": 2.4709560871124268 }, { "auxiliary_loss_clip": 0.01126092, "auxiliary_loss_mlp": 0.01037733, "balance_loss_clip": 1.02278399, "balance_loss_mlp": 1.04505312, "epoch": 0.35623027205771834, "flos": 30774942023040.0, "grad_norm": 2.347972779757536, "language_loss": 0.77365756, "learning_rate": 2.9836539022485578e-06, "loss": 0.79529583, "num_input_tokens_seen": 127369385, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8125, "step": 5925, "time_per_iteration": 4.004887342453003 }, { "auxiliary_loss_clip": 0.01124446, "auxiliary_loss_mlp": 0.01044339, "balance_loss_clip": 1.0299685, "balance_loss_mlp": 1.04210615, "epoch": 0.3562903953103863, "flos": 16981079299200.0, "grad_norm": 2.0604599581871144, "language_loss": 0.75640357, "learning_rate": 2.9833147820519535e-06, "loss": 0.77809143, "num_input_tokens_seen": 127386965, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.82421875, "step": 5926, "time_per_iteration": 2.4769232273101807 }, { "auxiliary_loss_clip": 0.01130958, "auxiliary_loss_mlp": 0.01034564, "balance_loss_clip": 1.01888239, "balance_loss_mlp": 1.04531491, "epoch": 0.3563505185630543, "flos": 23839837482240.0, "grad_norm": 2.2364201530921695, "language_loss": 0.69679832, "learning_rate": 2.9829756245679544e-06, "loss": 0.71845353, "num_input_tokens_seen": 127406075, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.859375, "step": 5927, "time_per_iteration": 2.4931094646453857 }, { "auxiliary_loss_clip": 0.01124329, "auxiliary_loss_mlp": 0.01032885, "balance_loss_clip": 1.01895583, "balance_loss_mlp": 1.0443058, "epoch": 0.35641064181572224, "flos": 22273450214400.0, "grad_norm": 2.1168688123208224, "language_loss": 0.79622245, "learning_rate": 2.9826364298094212e-06, "loss": 0.81779456, "num_input_tokens_seen": 127425350, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.80078125, "step": 5928, "time_per_iteration": 2.469921827316284 }, { "auxiliary_loss_clip": 0.01127416, "auxiliary_loss_mlp": 0.01039172, "balance_loss_clip": 1.02453268, "balance_loss_mlp": 1.04554462, "epoch": 0.3564707650683902, "flos": 23001251587200.0, "grad_norm": 1.4593644104154995, "language_loss": 0.81877553, "learning_rate": 2.982297197789215e-06, "loss": 0.84044135, "num_input_tokens_seen": 127446335, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8203125, "step": 5929, "time_per_iteration": 3.9119374752044678 }, { "auxiliary_loss_clip": 0.01121434, "auxiliary_loss_mlp": 0.01031127, "balance_loss_clip": 1.01729274, "balance_loss_mlp": 1.04198503, "epoch": 0.35653088832105817, "flos": 14684268965760.0, "grad_norm": 1.8781906193109583, "language_loss": 0.70478535, "learning_rate": 2.981957928520201e-06, "loss": 0.72631097, "num_input_tokens_seen": 127462795, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.79296875, "step": 5930, "time_per_iteration": 3.8679661750793457 }, { "auxiliary_loss_clip": 0.01131513, "auxiliary_loss_mlp": 0.01042132, "balance_loss_clip": 1.02686131, "balance_loss_mlp": 1.04750085, "epoch": 0.35659101157372614, "flos": 23477068074240.0, "grad_norm": 2.7911199604266033, "language_loss": 0.67976975, "learning_rate": 2.981618622015244e-06, "loss": 0.70150626, "num_input_tokens_seen": 127482675, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.83984375, "step": 5931, "time_per_iteration": 2.5032267570495605 }, { "auxiliary_loss_clip": 0.0112646, "auxiliary_loss_mlp": 0.0103607, "balance_loss_clip": 1.02145481, "balance_loss_mlp": 1.04485464, "epoch": 0.3566511348263941, "flos": 26578672583040.0, "grad_norm": 1.6148098112730516, "language_loss": 0.6788677, "learning_rate": 2.981279278287211e-06, "loss": 0.70049304, "num_input_tokens_seen": 127502275, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.81640625, "step": 5932, "time_per_iteration": 2.502558708190918 }, { "auxiliary_loss_clip": 0.01124273, "auxiliary_loss_mlp": 0.01032882, "balance_loss_clip": 1.01894689, "balance_loss_mlp": 1.04457521, "epoch": 0.35671125807906207, "flos": 13115008609920.0, "grad_norm": 2.0748396308680803, "language_loss": 0.7911545, "learning_rate": 2.980939897348969e-06, "loss": 0.81272608, "num_input_tokens_seen": 127520195, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.796875, "step": 5933, "time_per_iteration": 2.452016830444336 }, { "auxiliary_loss_clip": 0.01125656, "auxiliary_loss_mlp": 0.01044745, "balance_loss_clip": 1.02996337, "balance_loss_mlp": 1.04299068, "epoch": 0.35677138133173003, "flos": 33000577557120.0, "grad_norm": 1.6792903182658083, "language_loss": 0.69719356, "learning_rate": 2.980600479213388e-06, "loss": 0.71889758, "num_input_tokens_seen": 127544495, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.828125, "step": 5934, "time_per_iteration": 2.5656564235687256 }, { "auxiliary_loss_clip": 0.01135116, "auxiliary_loss_mlp": 0.01042951, "balance_loss_clip": 1.0256238, "balance_loss_mlp": 1.04736388, "epoch": 0.356831504584398, "flos": 20777842696320.0, "grad_norm": 2.4738616736193015, "language_loss": 0.71013558, "learning_rate": 2.9802610238933384e-06, "loss": 0.73191619, "num_input_tokens_seen": 127563810, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.87890625, "step": 5935, "time_per_iteration": 2.465909004211426 }, { "auxiliary_loss_clip": 0.01129013, "auxiliary_loss_mlp": 0.0103916, "balance_loss_clip": 1.0240922, "balance_loss_mlp": 1.04679358, "epoch": 0.35689162783706596, "flos": 12165566365440.0, "grad_norm": 2.1440574895871385, "language_loss": 0.78666776, "learning_rate": 2.979921531401692e-06, "loss": 0.80834955, "num_input_tokens_seen": 127579065, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.82421875, "step": 5936, "time_per_iteration": 2.431424617767334 }, { "auxiliary_loss_clip": 0.01126529, "auxiliary_loss_mlp": 0.01040439, "balance_loss_clip": 1.0256691, "balance_loss_mlp": 1.04534578, "epoch": 0.356951751089734, "flos": 23841489507840.0, "grad_norm": 1.477451787606615, "language_loss": 0.64429623, "learning_rate": 2.9795820017513242e-06, "loss": 0.66596591, "num_input_tokens_seen": 127599105, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8125, "step": 5937, "time_per_iteration": 2.4934473037719727 }, { "auxiliary_loss_clip": 0.0112992, "auxiliary_loss_mlp": 0.01036615, "balance_loss_clip": 1.02103496, "balance_loss_mlp": 1.04582822, "epoch": 0.35701187434240195, "flos": 11722176881280.0, "grad_norm": 2.5510354327498606, "language_loss": 0.77733964, "learning_rate": 2.9792424349551073e-06, "loss": 0.79900503, "num_input_tokens_seen": 127614940, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.83984375, "step": 5938, "time_per_iteration": 2.416025161743164 }, { "auxiliary_loss_clip": 0.01129563, "auxiliary_loss_mlp": 0.01047845, "balance_loss_clip": 1.03370726, "balance_loss_mlp": 1.04712033, "epoch": 0.3570719975950699, "flos": 24898879100160.0, "grad_norm": 1.7868361915030844, "language_loss": 0.80298901, "learning_rate": 2.9789028310259202e-06, "loss": 0.82476306, "num_input_tokens_seen": 127634960, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.828125, "step": 5939, "time_per_iteration": 2.5020101070404053 }, { "auxiliary_loss_clip": 0.01135196, "auxiliary_loss_mlp": 0.01040941, "balance_loss_clip": 1.02543235, "balance_loss_mlp": 1.04696703, "epoch": 0.3571321208477379, "flos": 25994836920960.0, "grad_norm": 2.278124732248039, "language_loss": 0.79299295, "learning_rate": 2.9785631899766395e-06, "loss": 0.81475437, "num_input_tokens_seen": 127654545, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8828125, "step": 5940, "time_per_iteration": 2.504007577896118 }, { "auxiliary_loss_clip": 0.01133114, "auxiliary_loss_mlp": 0.01035764, "balance_loss_clip": 1.01996899, "balance_loss_mlp": 1.04831576, "epoch": 0.35719224410040584, "flos": 14501663199360.0, "grad_norm": 4.636646741639683, "language_loss": 0.72255993, "learning_rate": 2.9782235118201443e-06, "loss": 0.74424875, "num_input_tokens_seen": 127672320, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84765625, "step": 5941, "time_per_iteration": 2.459184169769287 }, { "auxiliary_loss_clip": 0.0113241, "auxiliary_loss_mlp": 0.01037971, "balance_loss_clip": 1.02202129, "balance_loss_mlp": 1.04837847, "epoch": 0.3572523673530738, "flos": 31175453646720.0, "grad_norm": 2.300979815419475, "language_loss": 0.64961404, "learning_rate": 2.9778837965693154e-06, "loss": 0.67131788, "num_input_tokens_seen": 127693315, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.83984375, "step": 5942, "time_per_iteration": 2.550037145614624 }, { "auxiliary_loss_clip": 0.01129095, "auxiliary_loss_mlp": 0.0103939, "balance_loss_clip": 1.02376807, "balance_loss_mlp": 1.04649723, "epoch": 0.3573124906057418, "flos": 15851976203520.0, "grad_norm": 2.4206628802001853, "language_loss": 0.74084854, "learning_rate": 2.9775440442370354e-06, "loss": 0.76253343, "num_input_tokens_seen": 127711570, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.828125, "step": 5943, "time_per_iteration": 2.461919069290161 }, { "auxiliary_loss_clip": 0.01053049, "auxiliary_loss_mlp": 0.01008842, "balance_loss_clip": 1.00704241, "balance_loss_mlp": 1.02545285, "epoch": 0.35737261385840974, "flos": 60822729118080.0, "grad_norm": 0.9252105691967194, "language_loss": 0.60792851, "learning_rate": 2.9772042548361867e-06, "loss": 0.62854743, "num_input_tokens_seen": 127772475, "router_z_loss_clip": 0.01794434, "router_z_loss_mlp": 0.27734375, "step": 5944, "time_per_iteration": 3.2100565433502197 }, { "auxiliary_loss_clip": 0.01126352, "auxiliary_loss_mlp": 0.01039008, "balance_loss_clip": 1.02423811, "balance_loss_mlp": 1.04470372, "epoch": 0.3574327371110777, "flos": 18843765857280.0, "grad_norm": 2.044449214636738, "language_loss": 0.72434539, "learning_rate": 2.976864428379655e-06, "loss": 0.74599904, "num_input_tokens_seen": 127790940, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.81640625, "step": 5945, "time_per_iteration": 2.4509780406951904 }, { "auxiliary_loss_clip": 0.01127037, "auxiliary_loss_mlp": 0.0103783, "balance_loss_clip": 1.02289295, "balance_loss_mlp": 1.04486203, "epoch": 0.35749286036374567, "flos": 23549679417600.0, "grad_norm": 2.466497944000598, "language_loss": 0.81383514, "learning_rate": 2.976524564880326e-06, "loss": 0.83548379, "num_input_tokens_seen": 127808275, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8203125, "step": 5946, "time_per_iteration": 2.4929451942443848 }, { "auxiliary_loss_clip": 0.01132152, "auxiliary_loss_mlp": 0.01047351, "balance_loss_clip": 1.03199077, "balance_loss_mlp": 1.0494566, "epoch": 0.35755298361641363, "flos": 21105491581440.0, "grad_norm": 2.1631451729957636, "language_loss": 0.68914878, "learning_rate": 2.9761846643510882e-06, "loss": 0.71094382, "num_input_tokens_seen": 127828840, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.828125, "step": 5947, "time_per_iteration": 2.4651920795440674 }, { "auxiliary_loss_clip": 0.01125593, "auxiliary_loss_mlp": 0.01039045, "balance_loss_clip": 1.02431118, "balance_loss_mlp": 1.04618061, "epoch": 0.3576131068690816, "flos": 19245031666560.0, "grad_norm": 2.318722662901737, "language_loss": 0.75891018, "learning_rate": 2.9758447268048297e-06, "loss": 0.78055656, "num_input_tokens_seen": 127846240, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.79296875, "step": 5948, "time_per_iteration": 2.539079427719116 }, { "auxiliary_loss_clip": 0.01126868, "auxiliary_loss_mlp": 0.01040575, "balance_loss_clip": 1.02576876, "balance_loss_mlp": 1.04464531, "epoch": 0.35767323012174956, "flos": 28654703971200.0, "grad_norm": 1.7029950861051382, "language_loss": 0.70848608, "learning_rate": 2.9755047522544415e-06, "loss": 0.73016047, "num_input_tokens_seen": 127866880, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 5949, "time_per_iteration": 2.564012289047241 }, { "auxiliary_loss_clip": 0.01128427, "auxiliary_loss_mlp": 0.01039783, "balance_loss_clip": 1.02518535, "balance_loss_mlp": 1.04726219, "epoch": 0.35773335337441753, "flos": 17085363459840.0, "grad_norm": 15.767984523626753, "language_loss": 0.76923752, "learning_rate": 2.9751647407128154e-06, "loss": 0.79091954, "num_input_tokens_seen": 127883560, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8125, "step": 5950, "time_per_iteration": 2.4478862285614014 }, { "auxiliary_loss_clip": 0.01129997, "auxiliary_loss_mlp": 0.01035864, "balance_loss_clip": 1.02042079, "balance_loss_mlp": 1.04640651, "epoch": 0.35779347662708555, "flos": 15888605097600.0, "grad_norm": 1.906328515350603, "language_loss": 0.72904944, "learning_rate": 2.9748246921928445e-06, "loss": 0.7507081, "num_input_tokens_seen": 127902330, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8359375, "step": 5951, "time_per_iteration": 2.433854579925537 }, { "auxiliary_loss_clip": 0.01133534, "auxiliary_loss_mlp": 0.01040784, "balance_loss_clip": 1.02565598, "balance_loss_mlp": 1.04799747, "epoch": 0.3578535998797535, "flos": 28658834035200.0, "grad_norm": 2.589543847302719, "language_loss": 0.70495331, "learning_rate": 2.9744846067074236e-06, "loss": 0.72669649, "num_input_tokens_seen": 127922325, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.85546875, "step": 5952, "time_per_iteration": 2.5317542552948 }, { "auxiliary_loss_clip": 0.01125971, "auxiliary_loss_mlp": 0.01032996, "balance_loss_clip": 1.01892972, "balance_loss_mlp": 1.04526639, "epoch": 0.3579137231324215, "flos": 37852432076160.0, "grad_norm": 2.216745775913524, "language_loss": 0.69544536, "learning_rate": 2.974144484269449e-06, "loss": 0.71703506, "num_input_tokens_seen": 127942635, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80859375, "step": 5953, "time_per_iteration": 2.600184202194214 }, { "auxiliary_loss_clip": 0.01126659, "auxiliary_loss_mlp": 0.01030584, "balance_loss_clip": 1.01611793, "balance_loss_mlp": 1.04490542, "epoch": 0.35797384638508944, "flos": 22346851656960.0, "grad_norm": 1.6557348483881327, "language_loss": 0.66825813, "learning_rate": 2.9738043248918175e-06, "loss": 0.68983054, "num_input_tokens_seen": 127962520, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.81640625, "step": 5954, "time_per_iteration": 2.491992950439453 }, { "auxiliary_loss_clip": 0.01129131, "auxiliary_loss_mlp": 0.01038902, "balance_loss_clip": 1.02442443, "balance_loss_mlp": 1.04833937, "epoch": 0.3580339696377574, "flos": 13589711775360.0, "grad_norm": 3.068052967766246, "language_loss": 0.74614882, "learning_rate": 2.9734641285874282e-06, "loss": 0.76782918, "num_input_tokens_seen": 127981180, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80859375, "step": 5955, "time_per_iteration": 2.4310319423675537 }, { "auxiliary_loss_clip": 0.01123434, "auxiliary_loss_mlp": 0.01031003, "balance_loss_clip": 1.01725829, "balance_loss_mlp": 1.04531312, "epoch": 0.3580940928904254, "flos": 23768231719680.0, "grad_norm": 2.0164852930568173, "language_loss": 0.76235187, "learning_rate": 2.973123895369182e-06, "loss": 0.78389621, "num_input_tokens_seen": 127999725, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78125, "step": 5956, "time_per_iteration": 2.476607322692871 }, { "auxiliary_loss_clip": 0.01123449, "auxiliary_loss_mlp": 0.01032795, "balance_loss_clip": 1.01854324, "balance_loss_mlp": 1.04489112, "epoch": 0.35815421614309334, "flos": 19463871277440.0, "grad_norm": 1.8553967061167091, "language_loss": 0.73315585, "learning_rate": 2.9727836252499805e-06, "loss": 0.7547183, "num_input_tokens_seen": 128018885, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 5957, "time_per_iteration": 2.4442195892333984 }, { "auxiliary_loss_clip": 0.01131045, "auxiliary_loss_mlp": 0.01033433, "balance_loss_clip": 1.01946139, "balance_loss_mlp": 1.05042577, "epoch": 0.3582143393957613, "flos": 23368186972800.0, "grad_norm": 1.9304427289666048, "language_loss": 0.71508253, "learning_rate": 2.972443318242726e-06, "loss": 0.73672736, "num_input_tokens_seen": 128037875, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.8046875, "step": 5958, "time_per_iteration": 2.489654779434204 }, { "auxiliary_loss_clip": 0.01124791, "auxiliary_loss_mlp": 0.01028576, "balance_loss_clip": 1.01454461, "balance_loss_mlp": 1.04556882, "epoch": 0.35827446264842927, "flos": 26323275905280.0, "grad_norm": 2.6755389712672217, "language_loss": 0.88471806, "learning_rate": 2.972102974360324e-06, "loss": 0.90625179, "num_input_tokens_seen": 128056045, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.79296875, "step": 5959, "time_per_iteration": 2.509742498397827 }, { "auxiliary_loss_clip": 0.0112898, "auxiliary_loss_mlp": 0.01039336, "balance_loss_clip": 1.02469683, "balance_loss_mlp": 1.04845548, "epoch": 0.35833458590109724, "flos": 30446610779520.0, "grad_norm": 1.9110827650435076, "language_loss": 0.57696623, "learning_rate": 2.971762593615679e-06, "loss": 0.59864938, "num_input_tokens_seen": 128077815, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8046875, "step": 5960, "time_per_iteration": 2.5531647205352783 }, { "auxiliary_loss_clip": 0.01128301, "auxiliary_loss_mlp": 0.01035593, "balance_loss_clip": 1.01995885, "balance_loss_mlp": 1.04642105, "epoch": 0.3583947091537652, "flos": 14829886702080.0, "grad_norm": 2.3346800017951965, "language_loss": 0.76262975, "learning_rate": 2.9714221760216993e-06, "loss": 0.78426874, "num_input_tokens_seen": 128095460, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8203125, "step": 5961, "time_per_iteration": 2.4217731952667236 }, { "auxiliary_loss_clip": 0.01128558, "auxiliary_loss_mlp": 0.0103283, "balance_loss_clip": 1.0179348, "balance_loss_mlp": 1.04814696, "epoch": 0.35845483240643317, "flos": 34240644743040.0, "grad_norm": 1.8652914755652914, "language_loss": 0.70413464, "learning_rate": 2.971081721591294e-06, "loss": 0.72574854, "num_input_tokens_seen": 128118605, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8046875, "step": 5962, "time_per_iteration": 2.601607084274292 }, { "auxiliary_loss_clip": 0.01126714, "auxiliary_loss_mlp": 0.01033595, "balance_loss_clip": 1.02110171, "balance_loss_mlp": 1.04778838, "epoch": 0.35851495565910113, "flos": 20960089326720.0, "grad_norm": 1.8451486112257334, "language_loss": 0.74179679, "learning_rate": 2.9707412303373716e-06, "loss": 0.76339984, "num_input_tokens_seen": 128139205, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7890625, "step": 5963, "time_per_iteration": 2.4680047035217285 }, { "auxiliary_loss_clip": 0.01128723, "auxiliary_loss_mlp": 0.01035031, "balance_loss_clip": 1.02051163, "balance_loss_mlp": 1.04860449, "epoch": 0.35857507891176915, "flos": 22309863626880.0, "grad_norm": 1.7362962986601456, "language_loss": 0.78466654, "learning_rate": 2.9704007022728447e-06, "loss": 0.8063041, "num_input_tokens_seen": 128158765, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.80078125, "step": 5964, "time_per_iteration": 2.496908187866211 }, { "auxiliary_loss_clip": 0.01129885, "auxiliary_loss_mlp": 0.01034556, "balance_loss_clip": 1.01991701, "balance_loss_mlp": 1.04673421, "epoch": 0.3586352021644371, "flos": 23367863750400.0, "grad_norm": 1.9522974096567485, "language_loss": 0.66695809, "learning_rate": 2.970060137410626e-06, "loss": 0.68860251, "num_input_tokens_seen": 128177850, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.83203125, "step": 5965, "time_per_iteration": 2.4852840900421143 }, { "auxiliary_loss_clip": 0.01127324, "auxiliary_loss_mlp": 0.01037372, "balance_loss_clip": 1.02215457, "balance_loss_mlp": 1.04656553, "epoch": 0.3586953254171051, "flos": 27849227437440.0, "grad_norm": 2.1334888984558202, "language_loss": 0.79035735, "learning_rate": 2.9697195357636294e-06, "loss": 0.81200427, "num_input_tokens_seen": 128196925, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8046875, "step": 5966, "time_per_iteration": 4.017859697341919 }, { "auxiliary_loss_clip": 0.01127871, "auxiliary_loss_mlp": 0.01039568, "balance_loss_clip": 1.024279, "balance_loss_mlp": 1.04677105, "epoch": 0.35875544866977305, "flos": 19500500171520.0, "grad_norm": 5.199602683117892, "language_loss": 0.91099197, "learning_rate": 2.9693788973447715e-06, "loss": 0.9326663, "num_input_tokens_seen": 128213955, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8125, "step": 5967, "time_per_iteration": 2.442193031311035 }, { "auxiliary_loss_clip": 0.01133239, "auxiliary_loss_mlp": 0.01039396, "balance_loss_clip": 1.02315927, "balance_loss_mlp": 1.04928923, "epoch": 0.358815571922441, "flos": 21471134077440.0, "grad_norm": 2.1352398306315314, "language_loss": 0.80384105, "learning_rate": 2.9690382221669682e-06, "loss": 0.82556736, "num_input_tokens_seen": 128232980, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.83984375, "step": 5968, "time_per_iteration": 2.498882532119751 }, { "auxiliary_loss_clip": 0.0113186, "auxiliary_loss_mlp": 0.01051972, "balance_loss_clip": 1.03604531, "balance_loss_mlp": 1.0464673, "epoch": 0.358875695175109, "flos": 21835411856640.0, "grad_norm": 2.259758807513458, "language_loss": 0.84217352, "learning_rate": 2.9686975102431384e-06, "loss": 0.86401176, "num_input_tokens_seen": 128252795, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.85546875, "step": 5969, "time_per_iteration": 2.4632561206817627 }, { "auxiliary_loss_clip": 0.01128093, "auxiliary_loss_mlp": 0.01029965, "balance_loss_clip": 1.01649988, "balance_loss_mlp": 1.04772675, "epoch": 0.35893581842777694, "flos": 32011633330560.0, "grad_norm": 18.909108597747554, "language_loss": 0.72092533, "learning_rate": 2.968356761586202e-06, "loss": 0.74250579, "num_input_tokens_seen": 128273115, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.8046875, "step": 5970, "time_per_iteration": 4.01327109336853 }, { "auxiliary_loss_clip": 0.01128866, "auxiliary_loss_mlp": 0.0103459, "balance_loss_clip": 1.02072632, "balance_loss_mlp": 1.04870605, "epoch": 0.3589959416804449, "flos": 20485817124480.0, "grad_norm": 2.2772303676819274, "language_loss": 0.79582411, "learning_rate": 2.9680159762090805e-06, "loss": 0.81745869, "num_input_tokens_seen": 128292220, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.80078125, "step": 5971, "time_per_iteration": 3.8338675498962402 }, { "auxiliary_loss_clip": 0.01127815, "auxiliary_loss_mlp": 0.01038182, "balance_loss_clip": 1.02242279, "balance_loss_mlp": 1.0436976, "epoch": 0.3590560649331129, "flos": 16180666583040.0, "grad_norm": 2.268069048066451, "language_loss": 0.78399187, "learning_rate": 2.967675154124696e-06, "loss": 0.80565184, "num_input_tokens_seen": 128310305, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83984375, "step": 5972, "time_per_iteration": 3.877854347229004 }, { "auxiliary_loss_clip": 0.01127986, "auxiliary_loss_mlp": 0.01036019, "balance_loss_clip": 1.02134395, "balance_loss_mlp": 1.04498792, "epoch": 0.35911618818578084, "flos": 20375391738240.0, "grad_norm": 1.8640595073315782, "language_loss": 0.81545973, "learning_rate": 2.9673342953459722e-06, "loss": 0.83709985, "num_input_tokens_seen": 128328305, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.828125, "step": 5973, "time_per_iteration": 2.4545416831970215 }, { "auxiliary_loss_clip": 0.01058963, "auxiliary_loss_mlp": 0.01001989, "balance_loss_clip": 1.00039148, "balance_loss_mlp": 1.03154719, "epoch": 0.3591763114384488, "flos": 41236691685120.0, "grad_norm": 1.4962048993380734, "language_loss": 0.56722748, "learning_rate": 2.9669933998858355e-06, "loss": 0.58783698, "num_input_tokens_seen": 128378380, "router_z_loss_clip": 0.01599121, "router_z_loss_mlp": 0.2734375, "step": 5974, "time_per_iteration": 2.946453809738159 }, { "auxiliary_loss_clip": 0.01131392, "auxiliary_loss_mlp": 0.01036815, "balance_loss_clip": 1.02248621, "balance_loss_mlp": 1.04820418, "epoch": 0.35923643469111677, "flos": 18695454600960.0, "grad_norm": 1.7918731054889419, "language_loss": 0.68776518, "learning_rate": 2.9666524677572114e-06, "loss": 0.70944726, "num_input_tokens_seen": 128394315, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.83203125, "step": 5975, "time_per_iteration": 2.431429147720337 }, { "auxiliary_loss_clip": 0.01127374, "auxiliary_loss_mlp": 0.01037301, "balance_loss_clip": 1.02344298, "balance_loss_mlp": 1.04676366, "epoch": 0.35929655794378473, "flos": 25009950931200.0, "grad_norm": 1.6070277313622388, "language_loss": 0.80009556, "learning_rate": 2.96631149897303e-06, "loss": 0.8217423, "num_input_tokens_seen": 128414515, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.8046875, "step": 5976, "time_per_iteration": 2.517625093460083 }, { "auxiliary_loss_clip": 0.01128192, "auxiliary_loss_mlp": 0.01038306, "balance_loss_clip": 1.02339327, "balance_loss_mlp": 1.0468111, "epoch": 0.35935668119645275, "flos": 14975576265600.0, "grad_norm": 1.883605780606231, "language_loss": 0.78904796, "learning_rate": 2.9659704935462194e-06, "loss": 0.81071293, "num_input_tokens_seen": 128430615, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8125, "step": 5977, "time_per_iteration": 2.4745819568634033 }, { "auxiliary_loss_clip": 0.01123497, "auxiliary_loss_mlp": 0.01036561, "balance_loss_clip": 1.02327549, "balance_loss_mlp": 1.04373217, "epoch": 0.3594168044491207, "flos": 21178138838400.0, "grad_norm": 1.936492018083067, "language_loss": 0.80392194, "learning_rate": 2.9656294514897102e-06, "loss": 0.82552254, "num_input_tokens_seen": 128449480, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.796875, "step": 5978, "time_per_iteration": 2.4829745292663574 }, { "auxiliary_loss_clip": 0.0112789, "auxiliary_loss_mlp": 0.01036054, "balance_loss_clip": 1.02115321, "balance_loss_mlp": 1.04618645, "epoch": 0.3594769277017887, "flos": 27672152365440.0, "grad_norm": 1.7505884037101578, "language_loss": 0.67558861, "learning_rate": 2.965288372816436e-06, "loss": 0.69722807, "num_input_tokens_seen": 128471465, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.81640625, "step": 5979, "time_per_iteration": 2.526801347732544 }, { "auxiliary_loss_clip": 0.01127555, "auxiliary_loss_mlp": 0.01038632, "balance_loss_clip": 1.02424955, "balance_loss_mlp": 1.0464201, "epoch": 0.35953705095445665, "flos": 23002328995200.0, "grad_norm": 2.4577667490890023, "language_loss": 0.66792989, "learning_rate": 2.9649472575393296e-06, "loss": 0.68959177, "num_input_tokens_seen": 128490645, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8125, "step": 5980, "time_per_iteration": 2.4818925857543945 }, { "auxiliary_loss_clip": 0.01129391, "auxiliary_loss_mlp": 0.01041274, "balance_loss_clip": 1.02547848, "balance_loss_mlp": 1.04466391, "epoch": 0.3595971742071246, "flos": 25513992529920.0, "grad_norm": 2.0838682901886796, "language_loss": 0.71153295, "learning_rate": 2.964606105671327e-06, "loss": 0.73323959, "num_input_tokens_seen": 128510225, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84765625, "step": 5981, "time_per_iteration": 2.4958064556121826 }, { "auxiliary_loss_clip": 0.01127969, "auxiliary_loss_mlp": 0.01042093, "balance_loss_clip": 1.02636957, "balance_loss_mlp": 1.04576778, "epoch": 0.3596572974597926, "flos": 29862559635840.0, "grad_norm": 2.180065490570222, "language_loss": 0.71339297, "learning_rate": 2.9642649172253635e-06, "loss": 0.73509347, "num_input_tokens_seen": 128530195, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8203125, "step": 5982, "time_per_iteration": 2.5586800575256348 }, { "auxiliary_loss_clip": 0.01123952, "auxiliary_loss_mlp": 0.01042049, "balance_loss_clip": 1.0285728, "balance_loss_mlp": 1.0454514, "epoch": 0.35971742071246054, "flos": 23112538899840.0, "grad_norm": 1.8262317415835, "language_loss": 0.75427246, "learning_rate": 2.9639236922143786e-06, "loss": 0.77593243, "num_input_tokens_seen": 128549990, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.78515625, "step": 5983, "time_per_iteration": 2.479027509689331 }, { "auxiliary_loss_clip": 0.01132962, "auxiliary_loss_mlp": 0.01045898, "balance_loss_clip": 1.03010285, "balance_loss_mlp": 1.04765701, "epoch": 0.3597775439651285, "flos": 16725359399040.0, "grad_norm": 2.586787010021673, "language_loss": 0.76091784, "learning_rate": 2.96358243065131e-06, "loss": 0.7827065, "num_input_tokens_seen": 128567925, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8515625, "step": 5984, "time_per_iteration": 2.48459792137146 }, { "auxiliary_loss_clip": 0.01124149, "auxiliary_loss_mlp": 0.0103872, "balance_loss_clip": 1.02456367, "balance_loss_mlp": 1.0438478, "epoch": 0.3598376672177965, "flos": 19719483436800.0, "grad_norm": 2.554872696574726, "language_loss": 0.86175162, "learning_rate": 2.9632411325490993e-06, "loss": 0.88338035, "num_input_tokens_seen": 128585655, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.80078125, "step": 5985, "time_per_iteration": 2.4540047645568848 }, { "auxiliary_loss_clip": 0.01125131, "auxiliary_loss_mlp": 0.01037182, "balance_loss_clip": 1.02229238, "balance_loss_mlp": 1.04427242, "epoch": 0.35989779047046444, "flos": 17311529445120.0, "grad_norm": 1.4884608023679173, "language_loss": 0.72395325, "learning_rate": 2.9628997979206884e-06, "loss": 0.74557638, "num_input_tokens_seen": 128604820, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80859375, "step": 5986, "time_per_iteration": 2.5212321281433105 }, { "auxiliary_loss_clip": 0.01130722, "auxiliary_loss_mlp": 0.01037089, "balance_loss_clip": 1.02216399, "balance_loss_mlp": 1.0453074, "epoch": 0.3599579137231324, "flos": 22711237176960.0, "grad_norm": 2.462087017536295, "language_loss": 0.74071729, "learning_rate": 2.9625584267790204e-06, "loss": 0.76239538, "num_input_tokens_seen": 128623070, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8515625, "step": 5987, "time_per_iteration": 2.4664385318756104 }, { "auxiliary_loss_clip": 0.01131976, "auxiliary_loss_mlp": 0.01038037, "balance_loss_clip": 1.02261114, "balance_loss_mlp": 1.04767346, "epoch": 0.36001803697580037, "flos": 20959873845120.0, "grad_norm": 1.8418208095763162, "language_loss": 0.69711041, "learning_rate": 2.9622170191370404e-06, "loss": 0.71881056, "num_input_tokens_seen": 128642430, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84375, "step": 5988, "time_per_iteration": 2.5065486431121826 }, { "auxiliary_loss_clip": 0.01131802, "auxiliary_loss_mlp": 0.01041858, "balance_loss_clip": 1.02647448, "balance_loss_mlp": 1.04588342, "epoch": 0.36007816022846834, "flos": 20485565729280.0, "grad_norm": 1.878440052398067, "language_loss": 0.73144996, "learning_rate": 2.9618755750076953e-06, "loss": 0.75318652, "num_input_tokens_seen": 128661285, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.859375, "step": 5989, "time_per_iteration": 2.450465679168701 }, { "auxiliary_loss_clip": 0.01128379, "auxiliary_loss_mlp": 0.01035845, "balance_loss_clip": 1.02173114, "balance_loss_mlp": 1.04686451, "epoch": 0.36013828348113636, "flos": 28001237794560.0, "grad_norm": 1.5053494241792789, "language_loss": 0.79892862, "learning_rate": 2.961534094403931e-06, "loss": 0.82057089, "num_input_tokens_seen": 128682210, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8125, "step": 5990, "time_per_iteration": 2.55413818359375 }, { "auxiliary_loss_clip": 0.0112917, "auxiliary_loss_mlp": 0.01033642, "balance_loss_clip": 1.01862717, "balance_loss_mlp": 1.04710746, "epoch": 0.3601984067338043, "flos": 20082181017600.0, "grad_norm": 1.8025116534373562, "language_loss": 0.83634573, "learning_rate": 2.961192577338698e-06, "loss": 0.85797381, "num_input_tokens_seen": 128700445, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8203125, "step": 5991, "time_per_iteration": 2.487382650375366 }, { "auxiliary_loss_clip": 0.01130017, "auxiliary_loss_mlp": 0.0104276, "balance_loss_clip": 1.0278585, "balance_loss_mlp": 1.04558349, "epoch": 0.3602585299864723, "flos": 18617599872000.0, "grad_norm": 1.939252976514166, "language_loss": 0.7554189, "learning_rate": 2.9608510238249463e-06, "loss": 0.7771467, "num_input_tokens_seen": 128716855, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.84375, "step": 5992, "time_per_iteration": 2.45570707321167 }, { "auxiliary_loss_clip": 0.0112843, "auxiliary_loss_mlp": 0.01040484, "balance_loss_clip": 1.02528453, "balance_loss_mlp": 1.04729927, "epoch": 0.36031865323914025, "flos": 19573003774080.0, "grad_norm": 8.917847093531314, "language_loss": 0.77287132, "learning_rate": 2.960509433875627e-06, "loss": 0.79456043, "num_input_tokens_seen": 128735835, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 5993, "time_per_iteration": 2.454348087310791 }, { "auxiliary_loss_clip": 0.01131382, "auxiliary_loss_mlp": 0.01043071, "balance_loss_clip": 1.02781868, "balance_loss_mlp": 1.04739189, "epoch": 0.3603787764918082, "flos": 17490615678720.0, "grad_norm": 1.9928009613995876, "language_loss": 0.74547565, "learning_rate": 2.9601678075036943e-06, "loss": 0.7672202, "num_input_tokens_seen": 128752465, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83984375, "step": 5994, "time_per_iteration": 2.450971841812134 }, { "auxiliary_loss_clip": 0.01133366, "auxiliary_loss_mlp": 0.01035246, "balance_loss_clip": 1.02089322, "balance_loss_mlp": 1.04906297, "epoch": 0.3604388997444762, "flos": 15523393564800.0, "grad_norm": 2.679784801171904, "language_loss": 0.68901539, "learning_rate": 2.9598261447221024e-06, "loss": 0.71070158, "num_input_tokens_seen": 128770865, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.84375, "step": 5995, "time_per_iteration": 2.457888126373291 }, { "auxiliary_loss_clip": 0.01132906, "auxiliary_loss_mlp": 0.01043789, "balance_loss_clip": 1.02798176, "balance_loss_mlp": 1.0478549, "epoch": 0.36049902299714415, "flos": 17310883000320.0, "grad_norm": 10.864956273564985, "language_loss": 0.82267237, "learning_rate": 2.9594844455438057e-06, "loss": 0.84443927, "num_input_tokens_seen": 128789730, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8515625, "step": 5996, "time_per_iteration": 2.5065152645111084 }, { "auxiliary_loss_clip": 0.01128084, "auxiliary_loss_mlp": 0.01039646, "balance_loss_clip": 1.02516806, "balance_loss_mlp": 1.04702353, "epoch": 0.3605591462498121, "flos": 17056025026560.0, "grad_norm": 1.6486314095901557, "language_loss": 0.73580551, "learning_rate": 2.959142709981763e-06, "loss": 0.75748277, "num_input_tokens_seen": 128806610, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8125, "step": 5997, "time_per_iteration": 2.429171562194824 }, { "auxiliary_loss_clip": 0.01126589, "auxiliary_loss_mlp": 0.01035852, "balance_loss_clip": 1.0220536, "balance_loss_mlp": 1.04687619, "epoch": 0.3606192695024801, "flos": 16836862193280.0, "grad_norm": 2.229235726444975, "language_loss": 0.68786609, "learning_rate": 2.9588009380489337e-06, "loss": 0.70949054, "num_input_tokens_seen": 128824830, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.796875, "step": 5998, "time_per_iteration": 2.457024574279785 }, { "auxiliary_loss_clip": 0.01129014, "auxiliary_loss_mlp": 0.010338, "balance_loss_clip": 1.01998377, "balance_loss_mlp": 1.04796267, "epoch": 0.36067939275514804, "flos": 12129655743360.0, "grad_norm": 3.8954620708880268, "language_loss": 0.77021217, "learning_rate": 2.9584591297582758e-06, "loss": 0.79184031, "num_input_tokens_seen": 128838170, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.8125, "step": 5999, "time_per_iteration": 2.418710470199585 }, { "auxiliary_loss_clip": 0.01130868, "auxiliary_loss_mlp": 0.01041534, "balance_loss_clip": 1.02717531, "balance_loss_mlp": 1.04863, "epoch": 0.360739516007816, "flos": 18041449720320.0, "grad_norm": 5.968814362483806, "language_loss": 0.78432918, "learning_rate": 2.9581172851227516e-06, "loss": 0.80605316, "num_input_tokens_seen": 128855625, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8203125, "step": 6000, "time_per_iteration": 2.439303398132324 }, { "auxiliary_loss_clip": 0.01129663, "auxiliary_loss_mlp": 0.01036935, "balance_loss_clip": 1.02289224, "balance_loss_mlp": 1.04714, "epoch": 0.360799639260484, "flos": 18549800951040.0, "grad_norm": 1.8564722505060567, "language_loss": 0.78303456, "learning_rate": 2.9577754041553243e-06, "loss": 0.80470049, "num_input_tokens_seen": 128873540, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.82421875, "step": 6001, "time_per_iteration": 2.4486281871795654 }, { "auxiliary_loss_clip": 0.01125565, "auxiliary_loss_mlp": 0.01032222, "balance_loss_clip": 1.01877534, "balance_loss_mlp": 1.04696894, "epoch": 0.36085976251315194, "flos": 19682028529920.0, "grad_norm": 2.0140228407968057, "language_loss": 0.83581591, "learning_rate": 2.9574334868689575e-06, "loss": 0.8573938, "num_input_tokens_seen": 128889925, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.78515625, "step": 6002, "time_per_iteration": 2.4713029861450195 }, { "auxiliary_loss_clip": 0.01124429, "auxiliary_loss_mlp": 0.01034784, "balance_loss_clip": 1.02159941, "balance_loss_mlp": 1.04741037, "epoch": 0.3609198857658199, "flos": 24198943703040.0, "grad_norm": 2.6606730662112716, "language_loss": 0.90905213, "learning_rate": 2.9570915332766165e-06, "loss": 0.93064427, "num_input_tokens_seen": 128906890, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7734375, "step": 6003, "time_per_iteration": 2.476947784423828 }, { "auxiliary_loss_clip": 0.01054799, "auxiliary_loss_mlp": 0.01013363, "balance_loss_clip": 1.01192057, "balance_loss_mlp": 1.02770424, "epoch": 0.3609800090184879, "flos": 57115995160320.0, "grad_norm": 0.8578847670699251, "language_loss": 0.53438836, "learning_rate": 2.9567495433912693e-06, "loss": 0.55506998, "num_input_tokens_seen": 128965940, "router_z_loss_clip": 0.0144043, "router_z_loss_mlp": 0.27148438, "step": 6004, "time_per_iteration": 3.04313063621521 }, { "auxiliary_loss_clip": 0.01129071, "auxiliary_loss_mlp": 0.01036401, "balance_loss_clip": 1.0205462, "balance_loss_mlp": 1.04625165, "epoch": 0.3610401322711559, "flos": 20811239366400.0, "grad_norm": 2.348341397708473, "language_loss": 0.77885169, "learning_rate": 2.956407517225883e-06, "loss": 0.80050635, "num_input_tokens_seen": 128985835, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.828125, "step": 6005, "time_per_iteration": 2.4982054233551025 }, { "auxiliary_loss_clip": 0.01127168, "auxiliary_loss_mlp": 0.01038486, "balance_loss_clip": 1.02509904, "balance_loss_mlp": 1.04734409, "epoch": 0.36110025552382385, "flos": 13699167494400.0, "grad_norm": 2.115433068044949, "language_loss": 0.79078329, "learning_rate": 2.956065454793429e-06, "loss": 0.8124398, "num_input_tokens_seen": 129003120, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.796875, "step": 6006, "time_per_iteration": 2.497161865234375 }, { "auxiliary_loss_clip": 0.01129977, "auxiliary_loss_mlp": 0.01036249, "balance_loss_clip": 1.02073979, "balance_loss_mlp": 1.04681516, "epoch": 0.3611603787764918, "flos": 22455014486400.0, "grad_norm": 1.846339154257721, "language_loss": 0.84870726, "learning_rate": 2.955723356106876e-06, "loss": 0.87036949, "num_input_tokens_seen": 129021645, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.83203125, "step": 6007, "time_per_iteration": 2.4722206592559814 }, { "auxiliary_loss_clip": 0.01135126, "auxiliary_loss_mlp": 0.01037217, "balance_loss_clip": 1.02136183, "balance_loss_mlp": 1.0480473, "epoch": 0.3612205020291598, "flos": 20886651970560.0, "grad_norm": 2.1622922615774427, "language_loss": 0.72583354, "learning_rate": 2.955381221179198e-06, "loss": 0.74755704, "num_input_tokens_seen": 129038375, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.87109375, "step": 6008, "time_per_iteration": 3.955660820007324 }, { "auxiliary_loss_clip": 0.01128219, "auxiliary_loss_mlp": 0.0103511, "balance_loss_clip": 1.02109146, "balance_loss_mlp": 1.04618144, "epoch": 0.36128062528182775, "flos": 15741981780480.0, "grad_norm": 2.7525668808767625, "language_loss": 0.83049953, "learning_rate": 2.955039050023368e-06, "loss": 0.85213292, "num_input_tokens_seen": 129056235, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8203125, "step": 6009, "time_per_iteration": 2.477189064025879 }, { "auxiliary_loss_clip": 0.01128003, "auxiliary_loss_mlp": 0.0104072, "balance_loss_clip": 1.02653408, "balance_loss_mlp": 1.04604995, "epoch": 0.3613407485344957, "flos": 16764502245120.0, "grad_norm": 1.7239154076505832, "language_loss": 0.76056343, "learning_rate": 2.954696842652362e-06, "loss": 0.78225064, "num_input_tokens_seen": 129072405, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8203125, "step": 6010, "time_per_iteration": 2.4432754516601562 }, { "auxiliary_loss_clip": 0.01128237, "auxiliary_loss_mlp": 0.0103562, "balance_loss_clip": 1.02187538, "balance_loss_mlp": 1.04766548, "epoch": 0.3614008717871637, "flos": 20371189847040.0, "grad_norm": 1.9777670593959802, "language_loss": 0.82832575, "learning_rate": 2.9543545990791554e-06, "loss": 0.84996438, "num_input_tokens_seen": 129090225, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.8046875, "step": 6011, "time_per_iteration": 2.450502634048462 }, { "auxiliary_loss_clip": 0.01136113, "auxiliary_loss_mlp": 0.01039559, "balance_loss_clip": 1.02416968, "balance_loss_mlp": 1.04956305, "epoch": 0.36146099503983165, "flos": 22776665800320.0, "grad_norm": 2.3857007491431124, "language_loss": 0.62777752, "learning_rate": 2.954012319316727e-06, "loss": 0.64953423, "num_input_tokens_seen": 129107685, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8671875, "step": 6012, "time_per_iteration": 3.966587781906128 }, { "auxiliary_loss_clip": 0.01121588, "auxiliary_loss_mlp": 0.01035971, "balance_loss_clip": 1.02211857, "balance_loss_mlp": 1.04273009, "epoch": 0.3615211182924996, "flos": 22996654646400.0, "grad_norm": 2.5946301139582424, "language_loss": 0.83785522, "learning_rate": 2.9536700033780565e-06, "loss": 0.85943079, "num_input_tokens_seen": 129125315, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7890625, "step": 6013, "time_per_iteration": 2.4814884662628174 }, { "auxiliary_loss_clip": 0.01127882, "auxiliary_loss_mlp": 0.01035618, "balance_loss_clip": 1.01988864, "balance_loss_mlp": 1.04564118, "epoch": 0.3615812415451676, "flos": 16648079287680.0, "grad_norm": 2.227758869409079, "language_loss": 0.91519916, "learning_rate": 2.9533276512761228e-06, "loss": 0.93683422, "num_input_tokens_seen": 129141600, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8203125, "step": 6014, "time_per_iteration": 3.9476943016052246 }, { "auxiliary_loss_clip": 0.01124032, "auxiliary_loss_mlp": 0.01040174, "balance_loss_clip": 1.02594042, "balance_loss_mlp": 1.04352212, "epoch": 0.36164136479783554, "flos": 21320093387520.0, "grad_norm": 2.0351677497773895, "language_loss": 0.73669732, "learning_rate": 2.95298526302391e-06, "loss": 0.75833941, "num_input_tokens_seen": 129160665, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8046875, "step": 6015, "time_per_iteration": 2.4672579765319824 }, { "auxiliary_loss_clip": 0.01129714, "auxiliary_loss_mlp": 0.01038192, "balance_loss_clip": 1.02319598, "balance_loss_mlp": 1.04699087, "epoch": 0.3617014880505035, "flos": 24169569356160.0, "grad_norm": 2.714981319491949, "language_loss": 0.65335488, "learning_rate": 2.9526428386344e-06, "loss": 0.67503399, "num_input_tokens_seen": 129179220, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.828125, "step": 6016, "time_per_iteration": 2.5261688232421875 }, { "auxiliary_loss_clip": 0.01130852, "auxiliary_loss_mlp": 0.01038201, "balance_loss_clip": 1.02205968, "balance_loss_mlp": 1.04782426, "epoch": 0.3617616113031715, "flos": 39014824101120.0, "grad_norm": 1.9993643800451055, "language_loss": 0.71664655, "learning_rate": 2.9523003781205785e-06, "loss": 0.73833704, "num_input_tokens_seen": 129200385, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.828125, "step": 6017, "time_per_iteration": 2.615147590637207 }, { "auxiliary_loss_clip": 0.01129829, "auxiliary_loss_mlp": 0.01036937, "balance_loss_clip": 1.02176142, "balance_loss_mlp": 1.04529858, "epoch": 0.3618217345558395, "flos": 12130840892160.0, "grad_norm": 2.361931440889962, "language_loss": 0.73200512, "learning_rate": 2.9519578814954307e-06, "loss": 0.75367272, "num_input_tokens_seen": 129217395, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.84375, "step": 6018, "time_per_iteration": 2.470737934112549 }, { "auxiliary_loss_clip": 0.01125073, "auxiliary_loss_mlp": 0.01033988, "balance_loss_clip": 1.018646, "balance_loss_mlp": 1.04557943, "epoch": 0.36188185780850746, "flos": 24935005203840.0, "grad_norm": 2.1838250166540623, "language_loss": 0.68996227, "learning_rate": 2.9516153487719448e-06, "loss": 0.71155292, "num_input_tokens_seen": 129238940, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.79296875, "step": 6019, "time_per_iteration": 2.5193428993225098 }, { "auxiliary_loss_clip": 0.01130797, "auxiliary_loss_mlp": 0.0103325, "balance_loss_clip": 1.0175029, "balance_loss_mlp": 1.04618788, "epoch": 0.3619419810611754, "flos": 20958832350720.0, "grad_norm": 1.7560203838045594, "language_loss": 0.76389909, "learning_rate": 2.95127277996311e-06, "loss": 0.78553951, "num_input_tokens_seen": 129258240, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84375, "step": 6020, "time_per_iteration": 2.503295660018921 }, { "auxiliary_loss_clip": 0.01132401, "auxiliary_loss_mlp": 0.01040774, "balance_loss_clip": 1.02460957, "balance_loss_mlp": 1.04943347, "epoch": 0.3620021043138434, "flos": 22528882805760.0, "grad_norm": 2.774882685782743, "language_loss": 0.73531866, "learning_rate": 2.9509301750819156e-06, "loss": 0.7570504, "num_input_tokens_seen": 129279040, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.828125, "step": 6021, "time_per_iteration": 2.4763882160186768 }, { "auxiliary_loss_clip": 0.01128241, "auxiliary_loss_mlp": 0.01035886, "balance_loss_clip": 1.02188492, "balance_loss_mlp": 1.04701376, "epoch": 0.36206222756651135, "flos": 15596687266560.0, "grad_norm": 3.3152634114957937, "language_loss": 0.80865026, "learning_rate": 2.9505875341413533e-06, "loss": 0.83029151, "num_input_tokens_seen": 129295415, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8125, "step": 6022, "time_per_iteration": 2.489795684814453 }, { "auxiliary_loss_clip": 0.01128001, "auxiliary_loss_mlp": 0.01036781, "balance_loss_clip": 1.02319157, "balance_loss_mlp": 1.04900312, "epoch": 0.3621223508191793, "flos": 23587170238080.0, "grad_norm": 1.810779285423745, "language_loss": 0.8164978, "learning_rate": 2.950244857154417e-06, "loss": 0.83814561, "num_input_tokens_seen": 129312620, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7890625, "step": 6023, "time_per_iteration": 2.4955763816833496 }, { "auxiliary_loss_clip": 0.0113205, "auxiliary_loss_mlp": 0.01036238, "balance_loss_clip": 1.02090716, "balance_loss_mlp": 1.0475266, "epoch": 0.3621824740718473, "flos": 22309899540480.0, "grad_norm": 2.230472924024473, "language_loss": 0.79946434, "learning_rate": 2.9499021441341e-06, "loss": 0.8211472, "num_input_tokens_seen": 129331825, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.84375, "step": 6024, "time_per_iteration": 2.4984045028686523 }, { "auxiliary_loss_clip": 0.01126796, "auxiliary_loss_mlp": 0.01031427, "balance_loss_clip": 1.01744938, "balance_loss_mlp": 1.0467931, "epoch": 0.36224259732451525, "flos": 16763640318720.0, "grad_norm": 1.971841743952657, "language_loss": 0.74561512, "learning_rate": 2.9495593950933997e-06, "loss": 0.76719731, "num_input_tokens_seen": 129350400, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.80078125, "step": 6025, "time_per_iteration": 2.4689531326293945 }, { "auxiliary_loss_clip": 0.01127082, "auxiliary_loss_mlp": 0.01033688, "balance_loss_clip": 1.01933491, "balance_loss_mlp": 1.04644299, "epoch": 0.3623027205771832, "flos": 23149742411520.0, "grad_norm": 1.7302542909828005, "language_loss": 0.72401041, "learning_rate": 2.9492166100453107e-06, "loss": 0.74561805, "num_input_tokens_seen": 129371155, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8046875, "step": 6026, "time_per_iteration": 2.5401692390441895 }, { "auxiliary_loss_clip": 0.01137483, "auxiliary_loss_mlp": 0.01041809, "balance_loss_clip": 1.02571535, "balance_loss_mlp": 1.05002177, "epoch": 0.3623628438298512, "flos": 28549162834560.0, "grad_norm": 2.440551131263938, "language_loss": 0.79476124, "learning_rate": 2.948873789002833e-06, "loss": 0.81655413, "num_input_tokens_seen": 129391230, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.875, "step": 6027, "time_per_iteration": 2.555651903152466 }, { "auxiliary_loss_clip": 0.01132415, "auxiliary_loss_mlp": 0.01043893, "balance_loss_clip": 1.02738237, "balance_loss_mlp": 1.04869854, "epoch": 0.36242296708251914, "flos": 25484941405440.0, "grad_norm": 2.6516036817513573, "language_loss": 0.67406422, "learning_rate": 2.9485309319789667e-06, "loss": 0.69582731, "num_input_tokens_seen": 129410065, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.8359375, "step": 6028, "time_per_iteration": 2.5171456336975098 }, { "auxiliary_loss_clip": 0.01130137, "auxiliary_loss_mlp": 0.0103293, "balance_loss_clip": 1.01858342, "balance_loss_mlp": 1.04858601, "epoch": 0.3624830903351871, "flos": 16290373697280.0, "grad_norm": 2.4625201816618376, "language_loss": 0.8561042, "learning_rate": 2.9481880389867117e-06, "loss": 0.87773484, "num_input_tokens_seen": 129428655, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8125, "step": 6029, "time_per_iteration": 2.445570468902588 }, { "auxiliary_loss_clip": 0.01127453, "auxiliary_loss_mlp": 0.01035772, "balance_loss_clip": 1.02124, "balance_loss_mlp": 1.04695308, "epoch": 0.36254321358785513, "flos": 18296307694080.0, "grad_norm": 1.9759584525786906, "language_loss": 0.72714525, "learning_rate": 2.9478451100390714e-06, "loss": 0.74877751, "num_input_tokens_seen": 129447845, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8046875, "step": 6030, "time_per_iteration": 2.494051456451416 }, { "auxiliary_loss_clip": 0.01135299, "auxiliary_loss_mlp": 0.01039938, "balance_loss_clip": 1.02310562, "balance_loss_mlp": 1.04839778, "epoch": 0.3626033368405231, "flos": 14865294533760.0, "grad_norm": 2.156244054945047, "language_loss": 0.73631662, "learning_rate": 2.94750214514905e-06, "loss": 0.75806904, "num_input_tokens_seen": 129463275, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8671875, "step": 6031, "time_per_iteration": 2.456214427947998 }, { "auxiliary_loss_clip": 0.01128112, "auxiliary_loss_mlp": 0.01033137, "balance_loss_clip": 1.01874208, "balance_loss_mlp": 1.04707432, "epoch": 0.36266346009319106, "flos": 22306595489280.0, "grad_norm": 2.236930133066583, "language_loss": 0.73215699, "learning_rate": 2.9471591443296516e-06, "loss": 0.75376946, "num_input_tokens_seen": 129483205, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8125, "step": 6032, "time_per_iteration": 2.5008599758148193 }, { "auxiliary_loss_clip": 0.01132299, "auxiliary_loss_mlp": 0.01046542, "balance_loss_clip": 1.03175974, "balance_loss_mlp": 1.0490098, "epoch": 0.362723583345859, "flos": 18222331633920.0, "grad_norm": 2.056073268052135, "language_loss": 0.77849156, "learning_rate": 2.946816107593884e-06, "loss": 0.80027997, "num_input_tokens_seen": 129499885, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.83203125, "step": 6033, "time_per_iteration": 2.4240195751190186 }, { "auxiliary_loss_clip": 0.01054678, "auxiliary_loss_mlp": 0.0100589, "balance_loss_clip": 1.00422096, "balance_loss_mlp": 1.02759469, "epoch": 0.362783706598527, "flos": 68499174458880.0, "grad_norm": 0.7820744199469062, "language_loss": 0.64764583, "learning_rate": 2.9464730349547547e-06, "loss": 0.66825151, "num_input_tokens_seen": 129561885, "router_z_loss_clip": 0.01672363, "router_z_loss_mlp": 0.26953125, "step": 6034, "time_per_iteration": 3.161646842956543 }, { "auxiliary_loss_clip": 0.01129331, "auxiliary_loss_mlp": 0.01038027, "balance_loss_clip": 1.02291703, "balance_loss_mlp": 1.04836714, "epoch": 0.36284382985119495, "flos": 26576589594240.0, "grad_norm": 1.5757719603246103, "language_loss": 0.89505833, "learning_rate": 2.946129926425273e-06, "loss": 0.91673189, "num_input_tokens_seen": 129582325, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.80859375, "step": 6035, "time_per_iteration": 2.4923276901245117 }, { "auxiliary_loss_clip": 0.01132656, "auxiliary_loss_mlp": 0.01043369, "balance_loss_clip": 1.02749014, "balance_loss_mlp": 1.04661012, "epoch": 0.3629039531038629, "flos": 20156767608960.0, "grad_norm": 2.1673335686662103, "language_loss": 0.74056286, "learning_rate": 2.9457867820184496e-06, "loss": 0.76232308, "num_input_tokens_seen": 129600350, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.859375, "step": 6036, "time_per_iteration": 2.474379062652588 }, { "auxiliary_loss_clip": 0.01134032, "auxiliary_loss_mlp": 0.01032557, "balance_loss_clip": 1.0167619, "balance_loss_mlp": 1.04647684, "epoch": 0.3629640763565309, "flos": 18625716345600.0, "grad_norm": 2.5372206120963403, "language_loss": 0.75879264, "learning_rate": 2.945443601747297e-06, "loss": 0.78045857, "num_input_tokens_seen": 129618425, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.875, "step": 6037, "time_per_iteration": 2.447810173034668 }, { "auxiliary_loss_clip": 0.01127378, "auxiliary_loss_mlp": 0.01044273, "balance_loss_clip": 1.02903163, "balance_loss_mlp": 1.04736292, "epoch": 0.36302419960919885, "flos": 19571459489280.0, "grad_norm": 1.5808838803936502, "language_loss": 0.78412104, "learning_rate": 2.945100385624828e-06, "loss": 0.80583751, "num_input_tokens_seen": 129636750, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80078125, "step": 6038, "time_per_iteration": 2.4738829135894775 }, { "auxiliary_loss_clip": 0.01054324, "auxiliary_loss_mlp": 0.01006782, "balance_loss_clip": 1.00510144, "balance_loss_mlp": 1.02682066, "epoch": 0.3630843228618668, "flos": 63797606444160.0, "grad_norm": 1.129254622290157, "language_loss": 0.63443369, "learning_rate": 2.9447571336640573e-06, "loss": 0.65504473, "num_input_tokens_seen": 129699030, "router_z_loss_clip": 0.0168457, "router_z_loss_mlp": 0.27539062, "step": 6039, "time_per_iteration": 3.153303623199463 }, { "auxiliary_loss_clip": 0.01130133, "auxiliary_loss_mlp": 0.01039002, "balance_loss_clip": 1.02458322, "balance_loss_mlp": 1.04758668, "epoch": 0.3631444461145348, "flos": 21835160461440.0, "grad_norm": 19.38201585653562, "language_loss": 0.7102989, "learning_rate": 2.944413845878002e-06, "loss": 0.73199022, "num_input_tokens_seen": 129717135, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.82421875, "step": 6040, "time_per_iteration": 2.496760845184326 }, { "auxiliary_loss_clip": 0.01134448, "auxiliary_loss_mlp": 0.01040588, "balance_loss_clip": 1.02541232, "balance_loss_mlp": 1.04793274, "epoch": 0.36320456936720275, "flos": 21722041555200.0, "grad_norm": 1.935463473660703, "language_loss": 0.81732732, "learning_rate": 2.9440705222796783e-06, "loss": 0.83907765, "num_input_tokens_seen": 129735940, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.86328125, "step": 6041, "time_per_iteration": 2.4753167629241943 }, { "auxiliary_loss_clip": 0.01131148, "auxiliary_loss_mlp": 0.01035705, "balance_loss_clip": 1.01901543, "balance_loss_mlp": 1.04656065, "epoch": 0.3632646926198707, "flos": 17019072910080.0, "grad_norm": 2.2382046818078405, "language_loss": 0.83723342, "learning_rate": 2.943727162882107e-06, "loss": 0.85890198, "num_input_tokens_seen": 129752790, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.84375, "step": 6042, "time_per_iteration": 2.5084033012390137 }, { "auxiliary_loss_clip": 0.01129458, "auxiliary_loss_mlp": 0.01043183, "balance_loss_clip": 1.02826977, "balance_loss_mlp": 1.04752398, "epoch": 0.36332481587253873, "flos": 23331163029120.0, "grad_norm": 2.1045053855737943, "language_loss": 0.78408074, "learning_rate": 2.9433837676983064e-06, "loss": 0.80580717, "num_input_tokens_seen": 129773655, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8203125, "step": 6043, "time_per_iteration": 2.5657567977905273 }, { "auxiliary_loss_clip": 0.01127166, "auxiliary_loss_mlp": 0.01039262, "balance_loss_clip": 1.02335989, "balance_loss_mlp": 1.04701543, "epoch": 0.3633849391252067, "flos": 10743539857920.0, "grad_norm": 1.9507523610428992, "language_loss": 0.65613002, "learning_rate": 2.943040336741298e-06, "loss": 0.67779434, "num_input_tokens_seen": 129791605, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.80078125, "step": 6044, "time_per_iteration": 2.4698328971862793 }, { "auxiliary_loss_clip": 0.01129957, "auxiliary_loss_mlp": 0.01033002, "balance_loss_clip": 1.01838708, "balance_loss_mlp": 1.04827571, "epoch": 0.36344506237787466, "flos": 25849147357440.0, "grad_norm": 2.947353282104242, "language_loss": 0.80916888, "learning_rate": 2.9426968700241066e-06, "loss": 0.83079851, "num_input_tokens_seen": 129811075, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.81640625, "step": 6045, "time_per_iteration": 2.504270076751709 }, { "auxiliary_loss_clip": 0.01130086, "auxiliary_loss_mlp": 0.01038151, "balance_loss_clip": 1.02262998, "balance_loss_mlp": 1.04652452, "epoch": 0.3635051856305426, "flos": 30154046503680.0, "grad_norm": 1.802289330613788, "language_loss": 0.64807183, "learning_rate": 2.942353367559755e-06, "loss": 0.66975427, "num_input_tokens_seen": 129833755, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8359375, "step": 6046, "time_per_iteration": 2.579902410507202 }, { "auxiliary_loss_clip": 0.01129635, "auxiliary_loss_mlp": 0.01035501, "balance_loss_clip": 1.02044487, "balance_loss_mlp": 1.04667783, "epoch": 0.3635653088832106, "flos": 22198396746240.0, "grad_norm": 1.6305036299128968, "language_loss": 0.7746675, "learning_rate": 2.9420098293612692e-06, "loss": 0.79631883, "num_input_tokens_seen": 129854475, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.828125, "step": 6047, "time_per_iteration": 2.5093443393707275 }, { "auxiliary_loss_clip": 0.01135199, "auxiliary_loss_mlp": 0.01044894, "balance_loss_clip": 1.02757275, "balance_loss_mlp": 1.04560685, "epoch": 0.36362543213587856, "flos": 24787053083520.0, "grad_norm": 2.9828707874597247, "language_loss": 0.79352546, "learning_rate": 2.9416662554416767e-06, "loss": 0.81532639, "num_input_tokens_seen": 129873530, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.89453125, "step": 6048, "time_per_iteration": 2.509507179260254 }, { "auxiliary_loss_clip": 0.01049982, "auxiliary_loss_mlp": 0.01000709, "balance_loss_clip": 0.99907631, "balance_loss_mlp": 1.02275598, "epoch": 0.3636855553885465, "flos": 62526369231360.0, "grad_norm": 0.7770083040043108, "language_loss": 0.52624851, "learning_rate": 2.9413226458140054e-06, "loss": 0.54675543, "num_input_tokens_seen": 129940400, "router_z_loss_clip": 0.01635742, "router_z_loss_mlp": 0.27148438, "step": 6049, "time_per_iteration": 3.1640138626098633 }, { "auxiliary_loss_clip": 0.01130709, "auxiliary_loss_mlp": 0.0103756, "balance_loss_clip": 1.02172923, "balance_loss_mlp": 1.04689169, "epoch": 0.3637456786412145, "flos": 24060652341120.0, "grad_norm": 7.131648676412746, "language_loss": 0.86507428, "learning_rate": 2.9409790004912845e-06, "loss": 0.88675696, "num_input_tokens_seen": 129958635, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8359375, "step": 6050, "time_per_iteration": 3.9322681427001953 }, { "auxiliary_loss_clip": 0.01128174, "auxiliary_loss_mlp": 0.01035821, "balance_loss_clip": 1.02127719, "balance_loss_mlp": 1.04664564, "epoch": 0.36380580189388245, "flos": 16691495852160.0, "grad_norm": 2.344274606760866, "language_loss": 0.78559679, "learning_rate": 2.940635319486546e-06, "loss": 0.80723673, "num_input_tokens_seen": 129977685, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.81640625, "step": 6051, "time_per_iteration": 2.4654314517974854 }, { "auxiliary_loss_clip": 0.01126481, "auxiliary_loss_mlp": 0.01037141, "balance_loss_clip": 1.02248979, "balance_loss_mlp": 1.04332018, "epoch": 0.3638659251465504, "flos": 25114091437440.0, "grad_norm": 1.8833918374872358, "language_loss": 0.82139611, "learning_rate": 2.940291602812822e-06, "loss": 0.84303236, "num_input_tokens_seen": 129997530, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.83203125, "step": 6052, "time_per_iteration": 2.484407663345337 }, { "auxiliary_loss_clip": 0.01125075, "auxiliary_loss_mlp": 0.01035919, "balance_loss_clip": 1.02192426, "balance_loss_mlp": 1.04425585, "epoch": 0.3639260483992184, "flos": 23003011353600.0, "grad_norm": 6.3271780399681985, "language_loss": 0.728881, "learning_rate": 2.939947850483145e-06, "loss": 0.75049102, "num_input_tokens_seen": 130017955, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.80859375, "step": 6053, "time_per_iteration": 2.5261940956115723 }, { "auxiliary_loss_clip": 0.01051821, "auxiliary_loss_mlp": 0.01000975, "balance_loss_clip": 0.99927008, "balance_loss_mlp": 1.02492821, "epoch": 0.36398617165188635, "flos": 70716011160960.0, "grad_norm": 0.7811101705025103, "language_loss": 0.6120016, "learning_rate": 2.9396040625105532e-06, "loss": 0.63252956, "num_input_tokens_seen": 130074275, "router_z_loss_clip": 0.01708984, "router_z_loss_mlp": 0.26953125, "step": 6054, "time_per_iteration": 4.476816654205322 }, { "auxiliary_loss_clip": 0.01129603, "auxiliary_loss_mlp": 0.01039787, "balance_loss_clip": 1.02395618, "balance_loss_mlp": 1.04582715, "epoch": 0.3640462949045543, "flos": 22235456603520.0, "grad_norm": 2.9650340215699775, "language_loss": 0.7584393, "learning_rate": 2.9392602389080802e-06, "loss": 0.78013325, "num_input_tokens_seen": 130091375, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8359375, "step": 6055, "time_per_iteration": 3.89220929145813 }, { "auxiliary_loss_clip": 0.01128656, "auxiliary_loss_mlp": 0.01042758, "balance_loss_clip": 1.02717769, "balance_loss_mlp": 1.04525805, "epoch": 0.3641064181572223, "flos": 21543529939200.0, "grad_norm": 1.8001643264014817, "language_loss": 0.75400966, "learning_rate": 2.938916379688765e-06, "loss": 0.77572381, "num_input_tokens_seen": 130111595, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 6056, "time_per_iteration": 2.482121467590332 }, { "auxiliary_loss_clip": 0.01128811, "auxiliary_loss_mlp": 0.01040684, "balance_loss_clip": 1.0252707, "balance_loss_mlp": 1.0466063, "epoch": 0.3641665414098903, "flos": 22273306560000.0, "grad_norm": 3.006791422838204, "language_loss": 0.80318266, "learning_rate": 2.9385724848656468e-06, "loss": 0.82487762, "num_input_tokens_seen": 130131440, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8203125, "step": 6057, "time_per_iteration": 2.4574971199035645 }, { "auxiliary_loss_clip": 0.01127433, "auxiliary_loss_mlp": 0.01034315, "balance_loss_clip": 1.01958108, "balance_loss_mlp": 1.04690945, "epoch": 0.36422666466255826, "flos": 28329676778880.0, "grad_norm": 1.8192379421597848, "language_loss": 0.79908937, "learning_rate": 2.9382285544517647e-06, "loss": 0.82070684, "num_input_tokens_seen": 130151375, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8046875, "step": 6058, "time_per_iteration": 2.5421247482299805 }, { "auxiliary_loss_clip": 0.01125202, "auxiliary_loss_mlp": 0.01038557, "balance_loss_clip": 1.02372766, "balance_loss_mlp": 1.04351735, "epoch": 0.36428678791522623, "flos": 24170503109760.0, "grad_norm": 1.9525956951878185, "language_loss": 0.84990436, "learning_rate": 2.9378845884601636e-06, "loss": 0.87154192, "num_input_tokens_seen": 130169960, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.81640625, "step": 6059, "time_per_iteration": 2.505518674850464 }, { "auxiliary_loss_clip": 0.01131273, "auxiliary_loss_mlp": 0.01041435, "balance_loss_clip": 1.02572942, "balance_loss_mlp": 1.0468514, "epoch": 0.3643469111678942, "flos": 22528451842560.0, "grad_norm": 2.7664553468999245, "language_loss": 0.87508035, "learning_rate": 2.937540586903884e-06, "loss": 0.89680743, "num_input_tokens_seen": 130189800, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.84375, "step": 6060, "time_per_iteration": 2.489638090133667 }, { "auxiliary_loss_clip": 0.01132331, "auxiliary_loss_mlp": 0.01043748, "balance_loss_clip": 1.02876389, "balance_loss_mlp": 1.04804826, "epoch": 0.36440703442056216, "flos": 19426595938560.0, "grad_norm": 2.323924576044238, "language_loss": 0.66987067, "learning_rate": 2.937196549795971e-06, "loss": 0.69163144, "num_input_tokens_seen": 130206370, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.84375, "step": 6061, "time_per_iteration": 2.4442996978759766 }, { "auxiliary_loss_clip": 0.01133203, "auxiliary_loss_mlp": 0.01036799, "balance_loss_clip": 1.02087283, "balance_loss_mlp": 1.04906642, "epoch": 0.3644671576732301, "flos": 18040515966720.0, "grad_norm": 6.755353637303202, "language_loss": 0.74943882, "learning_rate": 2.9368524771494718e-06, "loss": 0.77113879, "num_input_tokens_seen": 130224445, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.83984375, "step": 6062, "time_per_iteration": 2.452174425125122 }, { "auxiliary_loss_clip": 0.01128297, "auxiliary_loss_mlp": 0.01039725, "balance_loss_clip": 1.02277935, "balance_loss_mlp": 1.04576111, "epoch": 0.3645272809258981, "flos": 21542811667200.0, "grad_norm": 2.5147561020699962, "language_loss": 0.72492862, "learning_rate": 2.936508368977432e-06, "loss": 0.74660885, "num_input_tokens_seen": 130245380, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.82421875, "step": 6063, "time_per_iteration": 2.5143017768859863 }, { "auxiliary_loss_clip": 0.0112635, "auxiliary_loss_mlp": 0.01041814, "balance_loss_clip": 1.02701414, "balance_loss_mlp": 1.04495311, "epoch": 0.36458740417856605, "flos": 22746860490240.0, "grad_norm": 7.436825882920543, "language_loss": 0.6779083, "learning_rate": 2.936164225292901e-06, "loss": 0.69958997, "num_input_tokens_seen": 130265575, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8125, "step": 6064, "time_per_iteration": 2.51078462600708 }, { "auxiliary_loss_clip": 0.01131632, "auxiliary_loss_mlp": 0.01049528, "balance_loss_clip": 1.03392398, "balance_loss_mlp": 1.04675484, "epoch": 0.364647527431234, "flos": 26140670138880.0, "grad_norm": 1.7282536998315805, "language_loss": 0.7424103, "learning_rate": 2.9358200461089297e-06, "loss": 0.76422191, "num_input_tokens_seen": 130286195, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8515625, "step": 6065, "time_per_iteration": 2.5283989906311035 }, { "auxiliary_loss_clip": 0.01132431, "auxiliary_loss_mlp": 0.01044369, "balance_loss_clip": 1.02782321, "balance_loss_mlp": 1.04693055, "epoch": 0.364707650683902, "flos": 31029907737600.0, "grad_norm": 2.59749929537264, "language_loss": 0.75096601, "learning_rate": 2.9354758314385676e-06, "loss": 0.77273399, "num_input_tokens_seen": 130306095, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.85546875, "step": 6066, "time_per_iteration": 2.5910348892211914 }, { "auxiliary_loss_clip": 0.01126735, "auxiliary_loss_mlp": 0.01036899, "balance_loss_clip": 1.02242672, "balance_loss_mlp": 1.04522491, "epoch": 0.36476777393656995, "flos": 19572896033280.0, "grad_norm": 2.69713544761379, "language_loss": 0.76723003, "learning_rate": 2.9351315812948684e-06, "loss": 0.7888664, "num_input_tokens_seen": 130324685, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.81640625, "step": 6067, "time_per_iteration": 2.4913668632507324 }, { "auxiliary_loss_clip": 0.01127209, "auxiliary_loss_mlp": 0.01044797, "balance_loss_clip": 1.0311296, "balance_loss_mlp": 1.04696941, "epoch": 0.3648278971892379, "flos": 17748849530880.0, "grad_norm": 1.9988184711590389, "language_loss": 0.70923841, "learning_rate": 2.934787295690886e-06, "loss": 0.73095846, "num_input_tokens_seen": 130343855, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.8046875, "step": 6068, "time_per_iteration": 2.475543260574341 }, { "auxiliary_loss_clip": 0.01129031, "auxiliary_loss_mlp": 0.01042439, "balance_loss_clip": 1.02725148, "balance_loss_mlp": 1.04443216, "epoch": 0.3648880204419059, "flos": 17931167988480.0, "grad_norm": 2.1888207194675333, "language_loss": 0.73564047, "learning_rate": 2.9344429746396755e-06, "loss": 0.75735515, "num_input_tokens_seen": 130362320, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.84375, "step": 6069, "time_per_iteration": 2.4416873455047607 }, { "auxiliary_loss_clip": 0.01132871, "auxiliary_loss_mlp": 0.01040965, "balance_loss_clip": 1.02561069, "balance_loss_mlp": 1.04745841, "epoch": 0.3649481436945739, "flos": 22638266697600.0, "grad_norm": 2.0813708748025426, "language_loss": 0.6652894, "learning_rate": 2.9340986181542945e-06, "loss": 0.68702775, "num_input_tokens_seen": 130383165, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.85546875, "step": 6070, "time_per_iteration": 2.4927818775177 }, { "auxiliary_loss_clip": 0.01126595, "auxiliary_loss_mlp": 0.01038617, "balance_loss_clip": 1.02424669, "balance_loss_mlp": 1.04539776, "epoch": 0.36500826694724187, "flos": 21579656042880.0, "grad_norm": 1.798078304411475, "language_loss": 0.73484415, "learning_rate": 2.9337542262477994e-06, "loss": 0.75649631, "num_input_tokens_seen": 130402425, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8125, "step": 6071, "time_per_iteration": 2.474163293838501 }, { "auxiliary_loss_clip": 0.01126988, "auxiliary_loss_mlp": 0.01036099, "balance_loss_clip": 1.02099502, "balance_loss_mlp": 1.04523206, "epoch": 0.36506839019990983, "flos": 13772533023360.0, "grad_norm": 2.07222802465178, "language_loss": 0.88389134, "learning_rate": 2.9334097989332506e-06, "loss": 0.90552223, "num_input_tokens_seen": 130419440, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.81640625, "step": 6072, "time_per_iteration": 2.4499406814575195 }, { "auxiliary_loss_clip": 0.01128838, "auxiliary_loss_mlp": 0.01036786, "balance_loss_clip": 1.02196217, "balance_loss_mlp": 1.0466392, "epoch": 0.3651285134525778, "flos": 17274972378240.0, "grad_norm": 2.014824439202344, "language_loss": 0.72419739, "learning_rate": 2.9330653362237094e-06, "loss": 0.74585366, "num_input_tokens_seen": 130438495, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.82421875, "step": 6073, "time_per_iteration": 2.4570553302764893 }, { "auxiliary_loss_clip": 0.01131901, "auxiliary_loss_mlp": 0.01037031, "balance_loss_clip": 1.02156925, "balance_loss_mlp": 1.04823291, "epoch": 0.36518863670524576, "flos": 21907987286400.0, "grad_norm": 2.4140845606845858, "language_loss": 0.66960263, "learning_rate": 2.932720838132236e-06, "loss": 0.69129199, "num_input_tokens_seen": 130455575, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8359375, "step": 6074, "time_per_iteration": 2.4700005054473877 }, { "auxiliary_loss_clip": 0.01128241, "auxiliary_loss_mlp": 0.01033497, "balance_loss_clip": 1.01924598, "balance_loss_mlp": 1.04598331, "epoch": 0.3652487599579137, "flos": 27122180250240.0, "grad_norm": 1.630534209433784, "language_loss": 0.73174465, "learning_rate": 2.9323763046718954e-06, "loss": 0.75336206, "num_input_tokens_seen": 130476385, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8203125, "step": 6075, "time_per_iteration": 2.5081992149353027 }, { "auxiliary_loss_clip": 0.0113397, "auxiliary_loss_mlp": 0.01041229, "balance_loss_clip": 1.02519536, "balance_loss_mlp": 1.04847443, "epoch": 0.3653088832105817, "flos": 19755573626880.0, "grad_norm": 2.693998158102023, "language_loss": 0.8953501, "learning_rate": 2.9320317358557524e-06, "loss": 0.91710198, "num_input_tokens_seen": 130493630, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.859375, "step": 6076, "time_per_iteration": 2.4923877716064453 }, { "auxiliary_loss_clip": 0.0112906, "auxiliary_loss_mlp": 0.0103547, "balance_loss_clip": 1.02021122, "balance_loss_mlp": 1.04790258, "epoch": 0.36536900646324966, "flos": 13115008609920.0, "grad_norm": 4.401309757055116, "language_loss": 0.69447309, "learning_rate": 2.931687131696872e-06, "loss": 0.71611834, "num_input_tokens_seen": 130510735, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 6077, "time_per_iteration": 2.432875394821167 }, { "auxiliary_loss_clip": 0.01059557, "auxiliary_loss_mlp": 0.01001092, "balance_loss_clip": 0.99939901, "balance_loss_mlp": 1.03252125, "epoch": 0.3654291297159176, "flos": 71100472383360.0, "grad_norm": 0.7479859528970156, "language_loss": 0.61755544, "learning_rate": 2.9313424922083224e-06, "loss": 0.63816196, "num_input_tokens_seen": 130577050, "router_z_loss_clip": 0.01696777, "router_z_loss_mlp": 0.26953125, "step": 6078, "time_per_iteration": 3.2094526290893555 }, { "auxiliary_loss_clip": 0.01127911, "auxiliary_loss_mlp": 0.01036469, "balance_loss_clip": 1.02169847, "balance_loss_mlp": 1.0457778, "epoch": 0.3654892529685856, "flos": 23617478338560.0, "grad_norm": 2.233675951747401, "language_loss": 0.78392768, "learning_rate": 2.930997817403173e-06, "loss": 0.80557156, "num_input_tokens_seen": 130593780, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8203125, "step": 6079, "time_per_iteration": 2.5236287117004395 }, { "auxiliary_loss_clip": 0.01132079, "auxiliary_loss_mlp": 0.01039147, "balance_loss_clip": 1.02356684, "balance_loss_mlp": 1.04820299, "epoch": 0.36554937622125355, "flos": 43470799850880.0, "grad_norm": 1.694315024746473, "language_loss": 0.6243642, "learning_rate": 2.9306531072944913e-06, "loss": 0.64607644, "num_input_tokens_seen": 130615510, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 6080, "time_per_iteration": 2.673046112060547 }, { "auxiliary_loss_clip": 0.01131559, "auxiliary_loss_mlp": 0.01037937, "balance_loss_clip": 1.02160573, "balance_loss_mlp": 1.04682469, "epoch": 0.3656094994739215, "flos": 23294641875840.0, "grad_norm": 3.6467329077119404, "language_loss": 0.68101573, "learning_rate": 2.930308361895352e-06, "loss": 0.70271075, "num_input_tokens_seen": 130635410, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.84765625, "step": 6081, "time_per_iteration": 2.515591859817505 }, { "auxiliary_loss_clip": 0.0113423, "auxiliary_loss_mlp": 0.01039538, "balance_loss_clip": 1.02317107, "balance_loss_mlp": 1.04680133, "epoch": 0.3656696227265895, "flos": 24571984400640.0, "grad_norm": 2.0198290520105804, "language_loss": 0.75059879, "learning_rate": 2.9299635812188257e-06, "loss": 0.77233642, "num_input_tokens_seen": 130657725, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.875, "step": 6082, "time_per_iteration": 2.5558714866638184 }, { "auxiliary_loss_clip": 0.01131543, "auxiliary_loss_mlp": 0.01030676, "balance_loss_clip": 1.01657367, "balance_loss_mlp": 1.04795313, "epoch": 0.3657297459792575, "flos": 27928375056000.0, "grad_norm": 1.9857134992808054, "language_loss": 0.82777768, "learning_rate": 2.929618765277987e-06, "loss": 0.84939992, "num_input_tokens_seen": 130678360, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8359375, "step": 6083, "time_per_iteration": 2.578007936477661 }, { "auxiliary_loss_clip": 0.01061392, "auxiliary_loss_mlp": 0.01002488, "balance_loss_clip": 1.00077152, "balance_loss_mlp": 1.03401005, "epoch": 0.36578986923192547, "flos": 67392622126080.0, "grad_norm": 0.8816305330863035, "language_loss": 0.59377718, "learning_rate": 2.9292739140859125e-06, "loss": 0.614416, "num_input_tokens_seen": 130742110, "router_z_loss_clip": 0.01721191, "router_z_loss_mlp": 0.2734375, "step": 6084, "time_per_iteration": 3.202549695968628 }, { "auxiliary_loss_clip": 0.01128913, "auxiliary_loss_mlp": 0.01036616, "balance_loss_clip": 1.02126169, "balance_loss_mlp": 1.0471952, "epoch": 0.36584999248459343, "flos": 20227511445120.0, "grad_norm": 2.1331764942693283, "language_loss": 0.73330247, "learning_rate": 2.9289290276556767e-06, "loss": 0.7549578, "num_input_tokens_seen": 130759870, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.81640625, "step": 6085, "time_per_iteration": 2.4817075729370117 }, { "auxiliary_loss_clip": 0.01131283, "auxiliary_loss_mlp": 0.01032986, "balance_loss_clip": 1.01874661, "balance_loss_mlp": 1.04909849, "epoch": 0.3659101157372614, "flos": 19062461813760.0, "grad_norm": 2.1373458052479877, "language_loss": 0.78322387, "learning_rate": 2.9285841060003604e-06, "loss": 0.80486655, "num_input_tokens_seen": 130778510, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.82421875, "step": 6086, "time_per_iteration": 2.455939769744873 }, { "auxiliary_loss_clip": 0.01125355, "auxiliary_loss_mlp": 0.0102884, "balance_loss_clip": 1.01427269, "balance_loss_mlp": 1.0464648, "epoch": 0.36597023898992936, "flos": 30810708990720.0, "grad_norm": 1.7889708641321636, "language_loss": 0.76995009, "learning_rate": 2.9282391491330416e-06, "loss": 0.79149199, "num_input_tokens_seen": 130798535, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7890625, "step": 6087, "time_per_iteration": 2.5488033294677734 }, { "auxiliary_loss_clip": 0.01132867, "auxiliary_loss_mlp": 0.01036723, "balance_loss_clip": 1.02170825, "balance_loss_mlp": 1.04867256, "epoch": 0.36603036224259733, "flos": 20521799573760.0, "grad_norm": 2.587083862764601, "language_loss": 0.7058965, "learning_rate": 2.9278941570668002e-06, "loss": 0.72759235, "num_input_tokens_seen": 130816655, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.84375, "step": 6088, "time_per_iteration": 2.445981502532959 }, { "auxiliary_loss_clip": 0.01138129, "auxiliary_loss_mlp": 0.01034362, "balance_loss_clip": 1.0185312, "balance_loss_mlp": 1.04963827, "epoch": 0.3660904854952653, "flos": 38329397798400.0, "grad_norm": 1.857213999540725, "language_loss": 0.79906899, "learning_rate": 2.92754912981472e-06, "loss": 0.82079393, "num_input_tokens_seen": 130841225, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8828125, "step": 6089, "time_per_iteration": 2.6636555194854736 }, { "auxiliary_loss_clip": 0.01129053, "auxiliary_loss_mlp": 0.01030969, "balance_loss_clip": 1.01712298, "balance_loss_mlp": 1.04752421, "epoch": 0.36615060874793326, "flos": 21835555511040.0, "grad_norm": 1.808274372374454, "language_loss": 0.71420527, "learning_rate": 2.927204067389884e-06, "loss": 0.73580551, "num_input_tokens_seen": 130861050, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.81640625, "step": 6090, "time_per_iteration": 2.4817280769348145 }, { "auxiliary_loss_clip": 0.01127098, "auxiliary_loss_mlp": 0.01042585, "balance_loss_clip": 1.02840471, "balance_loss_mlp": 1.04914463, "epoch": 0.3662107320006012, "flos": 16581537342720.0, "grad_norm": 2.0271875351237965, "language_loss": 0.74554992, "learning_rate": 2.9268589698053763e-06, "loss": 0.76724672, "num_input_tokens_seen": 130879775, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 6091, "time_per_iteration": 3.915464401245117 }, { "auxiliary_loss_clip": 0.01133244, "auxiliary_loss_mlp": 0.0103428, "balance_loss_clip": 1.0193553, "balance_loss_mlp": 1.05140615, "epoch": 0.3662708552532692, "flos": 20958365473920.0, "grad_norm": 2.103946578772047, "language_loss": 0.72625583, "learning_rate": 2.926513837074284e-06, "loss": 0.74793106, "num_input_tokens_seen": 130898070, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8203125, "step": 6092, "time_per_iteration": 2.5004425048828125 }, { "auxiliary_loss_clip": 0.01131456, "auxiliary_loss_mlp": 0.01041369, "balance_loss_clip": 1.02564526, "balance_loss_mlp": 1.04835713, "epoch": 0.36633097850593715, "flos": 21902707987200.0, "grad_norm": 3.248941716842944, "language_loss": 0.78502452, "learning_rate": 2.9261686692096942e-06, "loss": 0.8067528, "num_input_tokens_seen": 130915250, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.83203125, "step": 6093, "time_per_iteration": 2.456447124481201 }, { "auxiliary_loss_clip": 0.01127174, "auxiliary_loss_mlp": 0.01034907, "balance_loss_clip": 1.02049494, "balance_loss_mlp": 1.04490149, "epoch": 0.3663911017586051, "flos": 32854133808000.0, "grad_norm": 1.7872554278485622, "language_loss": 0.74132979, "learning_rate": 2.925823466224696e-06, "loss": 0.76295066, "num_input_tokens_seen": 130936995, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8203125, "step": 6094, "time_per_iteration": 2.611391067504883 }, { "auxiliary_loss_clip": 0.01133318, "auxiliary_loss_mlp": 0.01046809, "balance_loss_clip": 1.03149056, "balance_loss_mlp": 1.04932141, "epoch": 0.3664512250112731, "flos": 27271748482560.0, "grad_norm": 1.6861165258722945, "language_loss": 0.79081136, "learning_rate": 2.9254782281323785e-06, "loss": 0.81261265, "num_input_tokens_seen": 130957970, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.83984375, "step": 6095, "time_per_iteration": 3.974332094192505 }, { "auxiliary_loss_clip": 0.01134129, "auxiliary_loss_mlp": 0.01034634, "balance_loss_clip": 1.01854038, "balance_loss_mlp": 1.04935288, "epoch": 0.3665113482639411, "flos": 17784436930560.0, "grad_norm": 2.4127215631194963, "language_loss": 0.73717797, "learning_rate": 2.925132954945834e-06, "loss": 0.75886559, "num_input_tokens_seen": 130974915, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.84765625, "step": 6096, "time_per_iteration": 3.8662309646606445 }, { "auxiliary_loss_clip": 0.01130851, "auxiliary_loss_mlp": 0.01036778, "balance_loss_clip": 1.0215553, "balance_loss_mlp": 1.04667258, "epoch": 0.36657147151660907, "flos": 27854614477440.0, "grad_norm": 2.235949270984476, "language_loss": 0.67240268, "learning_rate": 2.924787646678155e-06, "loss": 0.69407892, "num_input_tokens_seen": 130995745, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.84375, "step": 6097, "time_per_iteration": 3.941039800643921 }, { "auxiliary_loss_clip": 0.01133314, "auxiliary_loss_mlp": 0.01040037, "balance_loss_clip": 1.02564836, "balance_loss_mlp": 1.04941702, "epoch": 0.36663159476927704, "flos": 25374013228800.0, "grad_norm": 1.481218125214846, "language_loss": 0.77900589, "learning_rate": 2.9244423033424365e-06, "loss": 0.80073941, "num_input_tokens_seen": 131015545, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.83984375, "step": 6098, "time_per_iteration": 2.502331018447876 }, { "auxiliary_loss_clip": 0.01126869, "auxiliary_loss_mlp": 0.01040685, "balance_loss_clip": 1.02602792, "balance_loss_mlp": 1.04741025, "epoch": 0.366691718021945, "flos": 21357225072000.0, "grad_norm": 1.9865818467365617, "language_loss": 0.73752117, "learning_rate": 2.9240969249517723e-06, "loss": 0.75919676, "num_input_tokens_seen": 131033990, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.79296875, "step": 6099, "time_per_iteration": 2.4996540546417236 }, { "auxiliary_loss_clip": 0.01124229, "auxiliary_loss_mlp": 0.01045507, "balance_loss_clip": 1.03136921, "balance_loss_mlp": 1.0452565, "epoch": 0.36675184127461297, "flos": 16800376953600.0, "grad_norm": 1.9713595634553187, "language_loss": 0.84634256, "learning_rate": 2.9237515115192602e-06, "loss": 0.86803991, "num_input_tokens_seen": 131050710, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7890625, "step": 6100, "time_per_iteration": 2.4404149055480957 }, { "auxiliary_loss_clip": 0.01133582, "auxiliary_loss_mlp": 0.01037583, "balance_loss_clip": 1.02194262, "balance_loss_mlp": 1.04750156, "epoch": 0.36681196452728093, "flos": 21906514828800.0, "grad_norm": 2.4309973570455505, "language_loss": 0.70522571, "learning_rate": 2.9234060630579992e-06, "loss": 0.72693729, "num_input_tokens_seen": 131071435, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.859375, "step": 6101, "time_per_iteration": 2.5173656940460205 }, { "auxiliary_loss_clip": 0.01131689, "auxiliary_loss_mlp": 0.01047334, "balance_loss_clip": 1.03070426, "balance_loss_mlp": 1.04802108, "epoch": 0.3668720877799489, "flos": 17712436118400.0, "grad_norm": 3.210516724050721, "language_loss": 0.75875223, "learning_rate": 2.9230605795810865e-06, "loss": 0.78054237, "num_input_tokens_seen": 131088775, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8359375, "step": 6102, "time_per_iteration": 2.4912164211273193 }, { "auxiliary_loss_clip": 0.0113511, "auxiliary_loss_mlp": 0.01037377, "balance_loss_clip": 1.02025831, "balance_loss_mlp": 1.04809678, "epoch": 0.36693221103261686, "flos": 47045455499520.0, "grad_norm": 2.5653480399869455, "language_loss": 0.69941223, "learning_rate": 2.922715061101625e-06, "loss": 0.72113717, "num_input_tokens_seen": 131112800, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.87109375, "step": 6103, "time_per_iteration": 2.7031655311584473 }, { "auxiliary_loss_clip": 0.0112962, "auxiliary_loss_mlp": 0.01036126, "balance_loss_clip": 1.02117729, "balance_loss_mlp": 1.04659033, "epoch": 0.3669923342852848, "flos": 15960929132160.0, "grad_norm": 2.2319973702926554, "language_loss": 0.71522307, "learning_rate": 2.922369507632716e-06, "loss": 0.73688054, "num_input_tokens_seen": 131131150, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.828125, "step": 6104, "time_per_iteration": 2.4640278816223145 }, { "auxiliary_loss_clip": 0.01131053, "auxiliary_loss_mlp": 0.01033838, "balance_loss_clip": 1.01899672, "balance_loss_mlp": 1.04862607, "epoch": 0.3670524575379528, "flos": 19974485064960.0, "grad_norm": 2.2440872586256018, "language_loss": 0.8164534, "learning_rate": 2.9220239191874617e-06, "loss": 0.83810234, "num_input_tokens_seen": 131150365, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.82421875, "step": 6105, "time_per_iteration": 2.511201858520508 }, { "auxiliary_loss_clip": 0.0113491, "auxiliary_loss_mlp": 0.01038491, "balance_loss_clip": 1.02260005, "balance_loss_mlp": 1.04791999, "epoch": 0.36711258079062076, "flos": 25702955003520.0, "grad_norm": 2.141182054621855, "language_loss": 0.80961025, "learning_rate": 2.9216782957789692e-06, "loss": 0.83134425, "num_input_tokens_seen": 131169310, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8671875, "step": 6106, "time_per_iteration": 2.5456290245056152 }, { "auxiliary_loss_clip": 0.01058714, "auxiliary_loss_mlp": 0.01018646, "balance_loss_clip": 1.01695275, "balance_loss_mlp": 1.03082895, "epoch": 0.3671727040432887, "flos": 60772743342720.0, "grad_norm": 0.6991164367939047, "language_loss": 0.59235883, "learning_rate": 2.9213326374203426e-06, "loss": 0.61313242, "num_input_tokens_seen": 131232900, "router_z_loss_clip": 0.01696777, "router_z_loss_mlp": 0.27929688, "step": 6107, "time_per_iteration": 3.172438859939575 }, { "auxiliary_loss_clip": 0.01129825, "auxiliary_loss_mlp": 0.01032729, "balance_loss_clip": 1.01847768, "balance_loss_mlp": 1.04800034, "epoch": 0.3672328272959567, "flos": 18661303745280.0, "grad_norm": 1.7937074474828951, "language_loss": 0.74449354, "learning_rate": 2.92098694412469e-06, "loss": 0.76611912, "num_input_tokens_seen": 131250920, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.81640625, "step": 6108, "time_per_iteration": 2.541726589202881 }, { "auxiliary_loss_clip": 0.01129014, "auxiliary_loss_mlp": 0.01037336, "balance_loss_clip": 1.02253056, "balance_loss_mlp": 1.04494369, "epoch": 0.3672929505486247, "flos": 15049049535360.0, "grad_norm": 2.0986902516194017, "language_loss": 0.72857344, "learning_rate": 2.9206412159051213e-06, "loss": 0.75023699, "num_input_tokens_seen": 131267910, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.84375, "step": 6109, "time_per_iteration": 2.4374125003814697 }, { "auxiliary_loss_clip": 0.01128464, "auxiliary_loss_mlp": 0.01036794, "balance_loss_clip": 1.02219701, "balance_loss_mlp": 1.04628456, "epoch": 0.3673530738012927, "flos": 20589347099520.0, "grad_norm": 2.3386145510088414, "language_loss": 0.53116775, "learning_rate": 2.920295452774744e-06, "loss": 0.55282032, "num_input_tokens_seen": 131287150, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8203125, "step": 6110, "time_per_iteration": 2.4854896068573 }, { "auxiliary_loss_clip": 0.01129551, "auxiliary_loss_mlp": 0.01033338, "balance_loss_clip": 1.01816869, "balance_loss_mlp": 1.04890501, "epoch": 0.36741319705396064, "flos": 21689830033920.0, "grad_norm": 1.4413256229265272, "language_loss": 0.80755591, "learning_rate": 2.919949654746672e-06, "loss": 0.82918483, "num_input_tokens_seen": 131308225, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8046875, "step": 6111, "time_per_iteration": 2.4890689849853516 }, { "auxiliary_loss_clip": 0.01130342, "auxiliary_loss_mlp": 0.01042468, "balance_loss_clip": 1.02827024, "balance_loss_mlp": 1.04940784, "epoch": 0.3674733203066286, "flos": 29862200499840.0, "grad_norm": 1.4841725316170218, "language_loss": 0.72590864, "learning_rate": 2.9196038218340163e-06, "loss": 0.74763668, "num_input_tokens_seen": 131332115, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.80859375, "step": 6112, "time_per_iteration": 2.5869431495666504 }, { "auxiliary_loss_clip": 0.01129349, "auxiliary_loss_mlp": 0.01042462, "balance_loss_clip": 1.02777576, "balance_loss_mlp": 1.04782724, "epoch": 0.36753344355929657, "flos": 18257021193600.0, "grad_norm": 1.6372951593983487, "language_loss": 0.84976172, "learning_rate": 2.919257954049892e-06, "loss": 0.87147987, "num_input_tokens_seen": 131351885, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.81640625, "step": 6113, "time_per_iteration": 2.458258628845215 }, { "auxiliary_loss_clip": 0.01131593, "auxiliary_loss_mlp": 0.01038679, "balance_loss_clip": 1.02293193, "balance_loss_mlp": 1.04724157, "epoch": 0.36759356681196453, "flos": 25301150490240.0, "grad_norm": 1.8402226365131975, "language_loss": 0.79249871, "learning_rate": 2.918912051407413e-06, "loss": 0.81420141, "num_input_tokens_seen": 131370245, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84375, "step": 6114, "time_per_iteration": 2.5207178592681885 }, { "auxiliary_loss_clip": 0.0113492, "auxiliary_loss_mlp": 0.01043213, "balance_loss_clip": 1.02531946, "balance_loss_mlp": 1.04874158, "epoch": 0.3676536900646325, "flos": 21032952065280.0, "grad_norm": 1.807823678220781, "language_loss": 0.67233324, "learning_rate": 2.918566113919698e-06, "loss": 0.69411457, "num_input_tokens_seen": 131388115, "router_z_loss_clip": 0.1796875, "router_z_loss_mlp": 0.86328125, "step": 6115, "time_per_iteration": 2.4617457389831543 }, { "auxiliary_loss_clip": 0.01125757, "auxiliary_loss_mlp": 0.01031524, "balance_loss_clip": 1.01713538, "balance_loss_mlp": 1.04638243, "epoch": 0.36771381331730046, "flos": 16288506190080.0, "grad_norm": 2.385325615641442, "language_loss": 0.76704109, "learning_rate": 2.9182201415998636e-06, "loss": 0.78861392, "num_input_tokens_seen": 131404595, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.79296875, "step": 6116, "time_per_iteration": 2.4850854873657227 }, { "auxiliary_loss_clip": 0.0112855, "auxiliary_loss_mlp": 0.01039248, "balance_loss_clip": 1.02481151, "balance_loss_mlp": 1.04653955, "epoch": 0.36777393656996843, "flos": 22309971367680.0, "grad_norm": 1.8432940801628712, "language_loss": 0.62802076, "learning_rate": 2.9178741344610286e-06, "loss": 0.64969873, "num_input_tokens_seen": 131423760, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8203125, "step": 6117, "time_per_iteration": 2.4645092487335205 }, { "auxiliary_loss_clip": 0.01130295, "auxiliary_loss_mlp": 0.01034891, "balance_loss_clip": 1.01951277, "balance_loss_mlp": 1.04860437, "epoch": 0.3678340598226364, "flos": 26834069260800.0, "grad_norm": 1.9912342290562268, "language_loss": 0.72812164, "learning_rate": 2.9175280925163156e-06, "loss": 0.7497735, "num_input_tokens_seen": 131444955, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8203125, "step": 6118, "time_per_iteration": 2.569913148880005 }, { "auxiliary_loss_clip": 0.01133916, "auxiliary_loss_mlp": 0.01043247, "balance_loss_clip": 1.02647424, "balance_loss_mlp": 1.04714251, "epoch": 0.36789418307530436, "flos": 21761723105280.0, "grad_norm": 2.0054309619777486, "language_loss": 0.72602081, "learning_rate": 2.9171820157788445e-06, "loss": 0.74779248, "num_input_tokens_seen": 131465720, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8671875, "step": 6119, "time_per_iteration": 2.4654839038848877 }, { "auxiliary_loss_clip": 0.01130288, "auxiliary_loss_mlp": 0.01035142, "balance_loss_clip": 1.02004373, "balance_loss_mlp": 1.04791653, "epoch": 0.3679543063279723, "flos": 15924192497280.0, "grad_norm": 2.1041263254047635, "language_loss": 0.80467141, "learning_rate": 2.9168359042617404e-06, "loss": 0.82632565, "num_input_tokens_seen": 131483080, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.82421875, "step": 6120, "time_per_iteration": 2.503364324569702 }, { "auxiliary_loss_clip": 0.01128715, "auxiliary_loss_mlp": 0.01039157, "balance_loss_clip": 1.02410054, "balance_loss_mlp": 1.04646897, "epoch": 0.3680144295806403, "flos": 24275541456000.0, "grad_norm": 1.6916964887866965, "language_loss": 0.63869643, "learning_rate": 2.916489757978126e-06, "loss": 0.66037518, "num_input_tokens_seen": 131502545, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.82421875, "step": 6121, "time_per_iteration": 2.4815833568573 }, { "auxiliary_loss_clip": 0.01134018, "auxiliary_loss_mlp": 0.01040169, "balance_loss_clip": 1.02479136, "balance_loss_mlp": 1.04930413, "epoch": 0.36807455283330826, "flos": 26104148985600.0, "grad_norm": 2.154232741614555, "language_loss": 0.71549243, "learning_rate": 2.9161435769411286e-06, "loss": 0.73723429, "num_input_tokens_seen": 131522155, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84765625, "step": 6122, "time_per_iteration": 2.58831524848938 }, { "auxiliary_loss_clip": 0.01128145, "auxiliary_loss_mlp": 0.01034131, "balance_loss_clip": 1.01946771, "balance_loss_mlp": 1.04993105, "epoch": 0.3681346760859763, "flos": 24644990793600.0, "grad_norm": 1.9761946818691152, "language_loss": 0.69362652, "learning_rate": 2.915797361163875e-06, "loss": 0.7152493, "num_input_tokens_seen": 131543865, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 6123, "time_per_iteration": 2.5097501277923584 }, { "auxiliary_loss_clip": 0.01136106, "auxiliary_loss_mlp": 0.01037471, "balance_loss_clip": 1.02036488, "balance_loss_mlp": 1.04927146, "epoch": 0.36819479933864424, "flos": 23878369797120.0, "grad_norm": 2.9355262849738524, "language_loss": 0.73477209, "learning_rate": 2.9154511106594933e-06, "loss": 0.75650787, "num_input_tokens_seen": 131562155, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.8671875, "step": 6124, "time_per_iteration": 2.5444912910461426 }, { "auxiliary_loss_clip": 0.01133414, "auxiliary_loss_mlp": 0.01040086, "balance_loss_clip": 1.02400506, "balance_loss_mlp": 1.04928279, "epoch": 0.3682549225913122, "flos": 25553997302400.0, "grad_norm": 2.5501944970432544, "language_loss": 0.74331141, "learning_rate": 2.915104825441114e-06, "loss": 0.76504642, "num_input_tokens_seen": 131581695, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83984375, "step": 6125, "time_per_iteration": 2.5078446865081787 }, { "auxiliary_loss_clip": 0.01133679, "auxiliary_loss_mlp": 0.01047088, "balance_loss_clip": 1.03068519, "balance_loss_mlp": 1.04840767, "epoch": 0.36831504584398017, "flos": 16946605221120.0, "grad_norm": 1.8792776476969466, "language_loss": 0.78298968, "learning_rate": 2.9147585055218686e-06, "loss": 0.80479735, "num_input_tokens_seen": 131599465, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8515625, "step": 6126, "time_per_iteration": 2.465498685836792 }, { "auxiliary_loss_clip": 0.01135357, "auxiliary_loss_mlp": 0.0104096, "balance_loss_clip": 1.02388895, "balance_loss_mlp": 1.04882693, "epoch": 0.36837516909664814, "flos": 19865065259520.0, "grad_norm": 2.4665254392813396, "language_loss": 0.6541332, "learning_rate": 2.914412150914888e-06, "loss": 0.67589635, "num_input_tokens_seen": 131618330, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.8671875, "step": 6127, "time_per_iteration": 2.4564342498779297 }, { "auxiliary_loss_clip": 0.0113657, "auxiliary_loss_mlp": 0.0104473, "balance_loss_clip": 1.02852988, "balance_loss_mlp": 1.0517875, "epoch": 0.3684352923493161, "flos": 37626984362880.0, "grad_norm": 1.9437616878924364, "language_loss": 0.70418388, "learning_rate": 2.9140657616333074e-06, "loss": 0.72599685, "num_input_tokens_seen": 131638960, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.84765625, "step": 6128, "time_per_iteration": 2.639866590499878 }, { "auxiliary_loss_clip": 0.01132167, "auxiliary_loss_mlp": 0.01036393, "balance_loss_clip": 1.02076507, "balance_loss_mlp": 1.04936373, "epoch": 0.36849541560198407, "flos": 14465501182080.0, "grad_norm": 1.9889005731780147, "language_loss": 0.75297117, "learning_rate": 2.9137193376902614e-06, "loss": 0.77465677, "num_input_tokens_seen": 131657440, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.828125, "step": 6129, "time_per_iteration": 2.431481122970581 }, { "auxiliary_loss_clip": 0.01130586, "auxiliary_loss_mlp": 0.0103912, "balance_loss_clip": 1.02327752, "balance_loss_mlp": 1.04790103, "epoch": 0.36855553885465203, "flos": 25770753924480.0, "grad_norm": 1.8127383610422398, "language_loss": 0.84716332, "learning_rate": 2.9133728790988868e-06, "loss": 0.86886036, "num_input_tokens_seen": 131678035, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.828125, "step": 6130, "time_per_iteration": 2.565204381942749 }, { "auxiliary_loss_clip": 0.01057965, "auxiliary_loss_mlp": 0.01002447, "balance_loss_clip": 1.00053942, "balance_loss_mlp": 1.03004813, "epoch": 0.36861566210732, "flos": 65049417377280.0, "grad_norm": 0.8187994075001246, "language_loss": 0.60368037, "learning_rate": 2.913026385872321e-06, "loss": 0.62428451, "num_input_tokens_seen": 131742470, "router_z_loss_clip": 0.01904297, "router_z_loss_mlp": 0.27929688, "step": 6131, "time_per_iteration": 3.197535276412964 }, { "auxiliary_loss_clip": 0.01127784, "auxiliary_loss_mlp": 0.01030834, "balance_loss_clip": 1.0156467, "balance_loss_mlp": 1.04634643, "epoch": 0.36867578535998796, "flos": 30954495133440.0, "grad_norm": 1.6991346152795646, "language_loss": 0.73304784, "learning_rate": 2.9126798580237034e-06, "loss": 0.75463402, "num_input_tokens_seen": 131764570, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 6132, "time_per_iteration": 2.6253583431243896 }, { "auxiliary_loss_clip": 0.01136208, "auxiliary_loss_mlp": 0.01036581, "balance_loss_clip": 1.02077413, "balance_loss_mlp": 1.04939592, "epoch": 0.3687359086126559, "flos": 28837956182400.0, "grad_norm": 1.8538593630656164, "language_loss": 0.74239999, "learning_rate": 2.9123332955661736e-06, "loss": 0.76412785, "num_input_tokens_seen": 131785720, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8671875, "step": 6133, "time_per_iteration": 3.9852168560028076 }, { "auxiliary_loss_clip": 0.01127345, "auxiliary_loss_mlp": 0.01040988, "balance_loss_clip": 1.02630126, "balance_loss_mlp": 1.04812729, "epoch": 0.3687960318653239, "flos": 21396798881280.0, "grad_norm": 1.7205020787698457, "language_loss": 0.71521616, "learning_rate": 2.911986698512874e-06, "loss": 0.7368995, "num_input_tokens_seen": 131804430, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.79296875, "step": 6134, "time_per_iteration": 2.485553026199341 }, { "auxiliary_loss_clip": 0.01128457, "auxiliary_loss_mlp": 0.01034176, "balance_loss_clip": 1.01902413, "balance_loss_mlp": 1.04645169, "epoch": 0.36885615511799186, "flos": 20266043760000.0, "grad_norm": 1.5720774952651537, "language_loss": 0.75385439, "learning_rate": 2.9116400668769477e-06, "loss": 0.77548075, "num_input_tokens_seen": 131822060, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8203125, "step": 6135, "time_per_iteration": 2.578613042831421 }, { "auxiliary_loss_clip": 0.01055768, "auxiliary_loss_mlp": 0.01005402, "balance_loss_clip": 1.00335181, "balance_loss_mlp": 1.02832782, "epoch": 0.3689162783706599, "flos": 63088836301440.0, "grad_norm": 0.8217000664789496, "language_loss": 0.58890676, "learning_rate": 2.9112934006715376e-06, "loss": 0.60951841, "num_input_tokens_seen": 131880715, "router_z_loss_clip": 0.02050781, "router_z_loss_mlp": 0.2734375, "step": 6136, "time_per_iteration": 3.035555839538574 }, { "auxiliary_loss_clip": 0.01127835, "auxiliary_loss_mlp": 0.01037884, "balance_loss_clip": 1.02282798, "balance_loss_mlp": 1.0468502, "epoch": 0.36897640162332784, "flos": 10961984419200.0, "grad_norm": 1.921163399964412, "language_loss": 0.79184586, "learning_rate": 2.9109466999097918e-06, "loss": 0.81350303, "num_input_tokens_seen": 131895850, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80859375, "step": 6137, "time_per_iteration": 3.8350157737731934 }, { "auxiliary_loss_clip": 0.01130049, "auxiliary_loss_mlp": 0.01038397, "balance_loss_clip": 1.0230546, "balance_loss_mlp": 1.04713774, "epoch": 0.3690365248759958, "flos": 20704297599360.0, "grad_norm": 2.4376440690675176, "language_loss": 0.73955864, "learning_rate": 2.9105999646048552e-06, "loss": 0.7612431, "num_input_tokens_seen": 131915775, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.828125, "step": 6138, "time_per_iteration": 4.004369497299194 }, { "auxiliary_loss_clip": 0.01133078, "auxiliary_loss_mlp": 0.01040695, "balance_loss_clip": 1.02513814, "balance_loss_mlp": 1.04778147, "epoch": 0.3690966481286638, "flos": 31826369957760.0, "grad_norm": 1.9363240277714813, "language_loss": 0.64682811, "learning_rate": 2.9102531947698764e-06, "loss": 0.66856587, "num_input_tokens_seen": 131935715, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8515625, "step": 6139, "time_per_iteration": 3.9992423057556152 }, { "auxiliary_loss_clip": 0.01124169, "auxiliary_loss_mlp": 0.01045087, "balance_loss_clip": 1.02964973, "balance_loss_mlp": 1.04393768, "epoch": 0.36915677138133174, "flos": 13114936782720.0, "grad_norm": 2.2836657844523907, "language_loss": 0.71209216, "learning_rate": 2.909906390418006e-06, "loss": 0.73378474, "num_input_tokens_seen": 131954120, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8046875, "step": 6140, "time_per_iteration": 2.4631054401397705 }, { "auxiliary_loss_clip": 0.01054459, "auxiliary_loss_mlp": 0.01002526, "balance_loss_clip": 1.00064218, "balance_loss_mlp": 1.02670372, "epoch": 0.3692168946339997, "flos": 68686879956480.0, "grad_norm": 0.7507393780794023, "language_loss": 0.59348083, "learning_rate": 2.9095595515623934e-06, "loss": 0.61405063, "num_input_tokens_seen": 132017485, "router_z_loss_clip": 0.01879883, "router_z_loss_mlp": 0.27734375, "step": 6141, "time_per_iteration": 3.1604654788970947 }, { "auxiliary_loss_clip": 0.01126954, "auxiliary_loss_mlp": 0.01034805, "balance_loss_clip": 1.01993918, "balance_loss_mlp": 1.04432631, "epoch": 0.36927701788666767, "flos": 22017873968640.0, "grad_norm": 3.2093759946408893, "language_loss": 0.75202268, "learning_rate": 2.909212678216192e-06, "loss": 0.77364028, "num_input_tokens_seen": 132036760, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.828125, "step": 6142, "time_per_iteration": 2.4810543060302734 }, { "auxiliary_loss_clip": 0.01124668, "auxiliary_loss_mlp": 0.01034177, "balance_loss_clip": 1.02023554, "balance_loss_mlp": 1.04432321, "epoch": 0.36933714113933563, "flos": 21835591424640.0, "grad_norm": 1.7676984505834865, "language_loss": 0.77080595, "learning_rate": 2.908865770392555e-06, "loss": 0.7923944, "num_input_tokens_seen": 132056935, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.8046875, "step": 6143, "time_per_iteration": 2.5019497871398926 }, { "auxiliary_loss_clip": 0.01123904, "auxiliary_loss_mlp": 0.01032133, "balance_loss_clip": 1.01873982, "balance_loss_mlp": 1.04427958, "epoch": 0.3693972643920036, "flos": 23691705793920.0, "grad_norm": 11.254382235394734, "language_loss": 0.81925416, "learning_rate": 2.9085188281046364e-06, "loss": 0.84081453, "num_input_tokens_seen": 132077285, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.796875, "step": 6144, "time_per_iteration": 2.477658987045288 }, { "auxiliary_loss_clip": 0.01126333, "auxiliary_loss_mlp": 0.01036064, "balance_loss_clip": 1.02193213, "balance_loss_mlp": 1.04407179, "epoch": 0.36945738764467156, "flos": 22856747172480.0, "grad_norm": 2.7673791188801946, "language_loss": 0.77093923, "learning_rate": 2.908171851365593e-06, "loss": 0.7925632, "num_input_tokens_seen": 132095520, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8203125, "step": 6145, "time_per_iteration": 2.5144870281219482 }, { "auxiliary_loss_clip": 0.0112767, "auxiliary_loss_mlp": 0.01030004, "balance_loss_clip": 1.01542521, "balance_loss_mlp": 1.04569268, "epoch": 0.36951751089733953, "flos": 16615939593600.0, "grad_norm": 1.8798506810983289, "language_loss": 0.76954579, "learning_rate": 2.9078248401885815e-06, "loss": 0.7911225, "num_input_tokens_seen": 132112810, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8203125, "step": 6146, "time_per_iteration": 2.4711878299713135 }, { "auxiliary_loss_clip": 0.01128096, "auxiliary_loss_mlp": 0.01041746, "balance_loss_clip": 1.025545, "balance_loss_mlp": 1.04491234, "epoch": 0.3695776341500075, "flos": 18914545607040.0, "grad_norm": 5.114037541138191, "language_loss": 0.80958325, "learning_rate": 2.907477794586761e-06, "loss": 0.83128166, "num_input_tokens_seen": 132131615, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.83203125, "step": 6147, "time_per_iteration": 2.4769980907440186 }, { "auxiliary_loss_clip": 0.01125381, "auxiliary_loss_mlp": 0.01032684, "balance_loss_clip": 1.0191952, "balance_loss_mlp": 1.04280293, "epoch": 0.36963775740267546, "flos": 20808474019200.0, "grad_norm": 1.8988134568410955, "language_loss": 0.83527005, "learning_rate": 2.9071307145732926e-06, "loss": 0.85685068, "num_input_tokens_seen": 132149585, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.82421875, "step": 6148, "time_per_iteration": 2.4567980766296387 }, { "auxiliary_loss_clip": 0.0112555, "auxiliary_loss_mlp": 0.0103527, "balance_loss_clip": 1.02001143, "balance_loss_mlp": 1.04528975, "epoch": 0.3696978806553435, "flos": 26061881656320.0, "grad_norm": 2.890364464403632, "language_loss": 0.74276733, "learning_rate": 2.9067836001613357e-06, "loss": 0.76437557, "num_input_tokens_seen": 132165555, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8046875, "step": 6149, "time_per_iteration": 2.496152400970459 }, { "auxiliary_loss_clip": 0.01131366, "auxiliary_loss_mlp": 0.0103945, "balance_loss_clip": 1.02329707, "balance_loss_mlp": 1.04762197, "epoch": 0.36975800390801145, "flos": 26833925606400.0, "grad_norm": 1.9216233511546823, "language_loss": 0.71140981, "learning_rate": 2.906436451364054e-06, "loss": 0.73311806, "num_input_tokens_seen": 132185100, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8359375, "step": 6150, "time_per_iteration": 2.5141243934631348 }, { "auxiliary_loss_clip": 0.01127012, "auxiliary_loss_mlp": 0.01041975, "balance_loss_clip": 1.02722907, "balance_loss_mlp": 1.04597187, "epoch": 0.3698181271606794, "flos": 21142623265920.0, "grad_norm": 1.7618156966698018, "language_loss": 0.81668949, "learning_rate": 2.906089268194611e-06, "loss": 0.83837938, "num_input_tokens_seen": 132203930, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.80859375, "step": 6151, "time_per_iteration": 2.488727569580078 }, { "auxiliary_loss_clip": 0.01053401, "auxiliary_loss_mlp": 0.0100651, "balance_loss_clip": 1.00472224, "balance_loss_mlp": 1.02577829, "epoch": 0.3698782504133474, "flos": 66742639568640.0, "grad_norm": 0.7735625436342598, "language_loss": 0.63139379, "learning_rate": 2.9057420506661726e-06, "loss": 0.65199286, "num_input_tokens_seen": 132263845, "router_z_loss_clip": 0.01782227, "router_z_loss_mlp": 0.27734375, "step": 6152, "time_per_iteration": 3.191697359085083 }, { "auxiliary_loss_clip": 0.01124738, "auxiliary_loss_mlp": 0.01035621, "balance_loss_clip": 1.02131557, "balance_loss_mlp": 1.04590333, "epoch": 0.36993837366601534, "flos": 24311523905280.0, "grad_norm": 2.006409483272262, "language_loss": 0.70448458, "learning_rate": 2.9053947987919044e-06, "loss": 0.72608823, "num_input_tokens_seen": 132282350, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 6153, "time_per_iteration": 2.532374858856201 }, { "auxiliary_loss_clip": 0.011294, "auxiliary_loss_mlp": 0.0103596, "balance_loss_clip": 1.02068329, "balance_loss_mlp": 1.04607475, "epoch": 0.3699984969186833, "flos": 24349194293760.0, "grad_norm": 2.165208772831934, "language_loss": 0.72503018, "learning_rate": 2.9050475125849755e-06, "loss": 0.74668384, "num_input_tokens_seen": 132301930, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.83203125, "step": 6154, "time_per_iteration": 2.490159749984741 }, { "auxiliary_loss_clip": 0.01125803, "auxiliary_loss_mlp": 0.0103183, "balance_loss_clip": 1.01773906, "balance_loss_mlp": 1.04430556, "epoch": 0.37005862017135127, "flos": 19829154637440.0, "grad_norm": 2.4832337030521416, "language_loss": 0.68179291, "learning_rate": 2.9047001920585534e-06, "loss": 0.70336926, "num_input_tokens_seen": 132320915, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8125, "step": 6155, "time_per_iteration": 2.4738962650299072 }, { "auxiliary_loss_clip": 0.01125347, "auxiliary_loss_mlp": 0.01030627, "balance_loss_clip": 1.0158453, "balance_loss_mlp": 1.04419947, "epoch": 0.37011874342401924, "flos": 19573793873280.0, "grad_norm": 2.010434317801859, "language_loss": 0.68121308, "learning_rate": 2.9043528372258097e-06, "loss": 0.70277286, "num_input_tokens_seen": 132340415, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8125, "step": 6156, "time_per_iteration": 2.470453977584839 }, { "auxiliary_loss_clip": 0.01126361, "auxiliary_loss_mlp": 0.01038284, "balance_loss_clip": 1.02496803, "balance_loss_mlp": 1.04601812, "epoch": 0.3701788666766872, "flos": 20374350243840.0, "grad_norm": 1.8491580049702372, "language_loss": 0.82335687, "learning_rate": 2.904005448099916e-06, "loss": 0.84500337, "num_input_tokens_seen": 132358600, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.8046875, "step": 6157, "time_per_iteration": 2.55133056640625 }, { "auxiliary_loss_clip": 0.01130726, "auxiliary_loss_mlp": 0.01035736, "balance_loss_clip": 1.0200007, "balance_loss_mlp": 1.04636955, "epoch": 0.37023898992935517, "flos": 15340931452800.0, "grad_norm": 6.243288276933893, "language_loss": 0.76033336, "learning_rate": 2.9036580246940444e-06, "loss": 0.78199798, "num_input_tokens_seen": 132373160, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.84375, "step": 6158, "time_per_iteration": 2.4287500381469727 }, { "auxiliary_loss_clip": 0.01128737, "auxiliary_loss_mlp": 0.01035314, "balance_loss_clip": 1.01936412, "balance_loss_mlp": 1.044469, "epoch": 0.37029911318202313, "flos": 19573937527680.0, "grad_norm": 3.9326129346624974, "language_loss": 0.68886662, "learning_rate": 2.9033105670213708e-06, "loss": 0.71050715, "num_input_tokens_seen": 132392345, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.84375, "step": 6159, "time_per_iteration": 2.487182140350342 }, { "auxiliary_loss_clip": 0.01123449, "auxiliary_loss_mlp": 0.01034489, "balance_loss_clip": 1.0209465, "balance_loss_mlp": 1.04307008, "epoch": 0.3703592364346911, "flos": 26213353309440.0, "grad_norm": 1.808750447491789, "language_loss": 0.70794225, "learning_rate": 2.9029630750950697e-06, "loss": 0.72952163, "num_input_tokens_seen": 132412620, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.8046875, "step": 6160, "time_per_iteration": 2.5125339031219482 }, { "auxiliary_loss_clip": 0.01119708, "auxiliary_loss_mlp": 0.0102846, "balance_loss_clip": 1.01530552, "balance_loss_mlp": 1.04143405, "epoch": 0.37041935968735906, "flos": 20048317470720.0, "grad_norm": 1.596864436181365, "language_loss": 0.79234374, "learning_rate": 2.9026155489283176e-06, "loss": 0.81382543, "num_input_tokens_seen": 132431570, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.78125, "step": 6161, "time_per_iteration": 2.488337278366089 }, { "auxiliary_loss_clip": 0.01127196, "auxiliary_loss_mlp": 0.0103599, "balance_loss_clip": 1.02022493, "balance_loss_mlp": 1.04546666, "epoch": 0.3704794829400271, "flos": 24133802388480.0, "grad_norm": 1.8866165371044608, "language_loss": 0.79520726, "learning_rate": 2.902267988534295e-06, "loss": 0.81683922, "num_input_tokens_seen": 132451525, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.81640625, "step": 6162, "time_per_iteration": 2.4735894203186035 }, { "auxiliary_loss_clip": 0.0112814, "auxiliary_loss_mlp": 0.01034751, "balance_loss_clip": 1.02021956, "balance_loss_mlp": 1.04610467, "epoch": 0.37053960619269505, "flos": 14866874732160.0, "grad_norm": 1.9354770723004957, "language_loss": 0.79547852, "learning_rate": 2.9019203939261783e-06, "loss": 0.81710744, "num_input_tokens_seen": 132469875, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8203125, "step": 6163, "time_per_iteration": 2.4528372287750244 }, { "auxiliary_loss_clip": 0.01127805, "auxiliary_loss_mlp": 0.0103469, "balance_loss_clip": 1.01938343, "balance_loss_mlp": 1.04486859, "epoch": 0.370599729445363, "flos": 21361498790400.0, "grad_norm": 1.6798478795529377, "language_loss": 0.68235719, "learning_rate": 2.9015727651171507e-06, "loss": 0.70398211, "num_input_tokens_seen": 132488360, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.828125, "step": 6164, "time_per_iteration": 2.459594488143921 }, { "auxiliary_loss_clip": 0.01132507, "auxiliary_loss_mlp": 0.01034306, "balance_loss_clip": 1.01868999, "balance_loss_mlp": 1.0487051, "epoch": 0.370659852698031, "flos": 26829041356800.0, "grad_norm": 2.3117645235605013, "language_loss": 0.8278594, "learning_rate": 2.9012251021203935e-06, "loss": 0.84952748, "num_input_tokens_seen": 132508630, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 6165, "time_per_iteration": 2.553370475769043 }, { "auxiliary_loss_clip": 0.01133169, "auxiliary_loss_mlp": 0.01035787, "balance_loss_clip": 1.01915681, "balance_loss_mlp": 1.04768896, "epoch": 0.37071997595069894, "flos": 19099018880640.0, "grad_norm": 1.7928675063175898, "language_loss": 0.69083744, "learning_rate": 2.9008774049490896e-06, "loss": 0.71252704, "num_input_tokens_seen": 132527465, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.85546875, "step": 6166, "time_per_iteration": 2.4748036861419678 }, { "auxiliary_loss_clip": 0.01053472, "auxiliary_loss_mlp": 0.01002333, "balance_loss_clip": 1.0005089, "balance_loss_mlp": 1.02602029, "epoch": 0.3707800992033669, "flos": 52178384920320.0, "grad_norm": 0.7955243025149357, "language_loss": 0.56923282, "learning_rate": 2.9005296736164244e-06, "loss": 0.58979094, "num_input_tokens_seen": 132579940, "router_z_loss_clip": 0.01818848, "router_z_loss_mlp": 0.2734375, "step": 6167, "time_per_iteration": 2.958284378051758 }, { "auxiliary_loss_clip": 0.01125084, "auxiliary_loss_mlp": 0.01033913, "balance_loss_clip": 1.02001905, "balance_loss_mlp": 1.04543054, "epoch": 0.3708402224560349, "flos": 19901837808000.0, "grad_norm": 7.119168419242966, "language_loss": 0.75559992, "learning_rate": 2.900181908135584e-06, "loss": 0.77718991, "num_input_tokens_seen": 132598390, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.796875, "step": 6168, "time_per_iteration": 2.466111183166504 }, { "auxiliary_loss_clip": 0.01125375, "auxiliary_loss_mlp": 0.01033426, "balance_loss_clip": 1.01928782, "balance_loss_mlp": 1.04364896, "epoch": 0.37090034570870284, "flos": 20007630339840.0, "grad_norm": 1.6106382328920685, "language_loss": 0.73849732, "learning_rate": 2.899834108519755e-06, "loss": 0.76008528, "num_input_tokens_seen": 132616920, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.81640625, "step": 6169, "time_per_iteration": 2.491177797317505 }, { "auxiliary_loss_clip": 0.01125673, "auxiliary_loss_mlp": 0.0103075, "balance_loss_clip": 1.01665926, "balance_loss_mlp": 1.04615152, "epoch": 0.3709604689613708, "flos": 24134700228480.0, "grad_norm": 1.9069137053307765, "language_loss": 0.79420882, "learning_rate": 2.899486274782127e-06, "loss": 0.81577301, "num_input_tokens_seen": 132637660, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.796875, "step": 6170, "time_per_iteration": 2.4881582260131836 }, { "auxiliary_loss_clip": 0.01127331, "auxiliary_loss_mlp": 0.01038686, "balance_loss_clip": 1.02331984, "balance_loss_mlp": 1.04528761, "epoch": 0.37102059221403877, "flos": 23876071326720.0, "grad_norm": 1.5616091760172468, "language_loss": 0.76548529, "learning_rate": 2.8991384069358885e-06, "loss": 0.78714544, "num_input_tokens_seen": 132657635, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8203125, "step": 6171, "time_per_iteration": 2.5135767459869385 }, { "auxiliary_loss_clip": 0.01129316, "auxiliary_loss_mlp": 0.01032507, "balance_loss_clip": 1.01777244, "balance_loss_mlp": 1.04799008, "epoch": 0.37108071546670673, "flos": 14501268149760.0, "grad_norm": 1.9433675921851632, "language_loss": 0.80531681, "learning_rate": 2.898790504994232e-06, "loss": 0.82693505, "num_input_tokens_seen": 132674455, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8125, "step": 6172, "time_per_iteration": 2.4306061267852783 }, { "auxiliary_loss_clip": 0.01129065, "auxiliary_loss_mlp": 0.01037111, "balance_loss_clip": 1.0214529, "balance_loss_mlp": 1.04528236, "epoch": 0.3711408387193747, "flos": 34562619279360.0, "grad_norm": 4.60703768767096, "language_loss": 0.59536219, "learning_rate": 2.89844256897035e-06, "loss": 0.61702389, "num_input_tokens_seen": 132695140, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 6173, "time_per_iteration": 2.57867693901062 }, { "auxiliary_loss_clip": 0.01126409, "auxiliary_loss_mlp": 0.01036014, "balance_loss_clip": 1.02117205, "balance_loss_mlp": 1.04439878, "epoch": 0.37120096197204266, "flos": 17310703432320.0, "grad_norm": 1.914653672895532, "language_loss": 0.80244768, "learning_rate": 2.898094598877435e-06, "loss": 0.82407188, "num_input_tokens_seen": 132712470, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 6174, "time_per_iteration": 2.428260087966919 }, { "auxiliary_loss_clip": 0.01124143, "auxiliary_loss_mlp": 0.01034032, "balance_loss_clip": 1.01987004, "balance_loss_mlp": 1.0443325, "epoch": 0.37126108522471063, "flos": 30664049760000.0, "grad_norm": 1.7821901898193457, "language_loss": 0.80065417, "learning_rate": 2.8977465947286826e-06, "loss": 0.822236, "num_input_tokens_seen": 132732945, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.796875, "step": 6175, "time_per_iteration": 3.9814772605895996 }, { "auxiliary_loss_clip": 0.01129827, "auxiliary_loss_mlp": 0.0104193, "balance_loss_clip": 1.02739835, "balance_loss_mlp": 1.04882598, "epoch": 0.37132120847737865, "flos": 25155640494720.0, "grad_norm": 2.5792327637646757, "language_loss": 0.88859522, "learning_rate": 2.89739855653729e-06, "loss": 0.91031283, "num_input_tokens_seen": 132752470, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.80859375, "step": 6176, "time_per_iteration": 2.512037992477417 }, { "auxiliary_loss_clip": 0.01129573, "auxiliary_loss_mlp": 0.01041784, "balance_loss_clip": 1.02731216, "balance_loss_mlp": 1.04615903, "epoch": 0.3713813317300466, "flos": 21213474842880.0, "grad_norm": 3.9756752157030677, "language_loss": 0.73432708, "learning_rate": 2.8970504843164546e-06, "loss": 0.75604069, "num_input_tokens_seen": 132771485, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8359375, "step": 6177, "time_per_iteration": 2.4666011333465576 }, { "auxiliary_loss_clip": 0.01128305, "auxiliary_loss_mlp": 0.01044399, "balance_loss_clip": 1.02968836, "balance_loss_mlp": 1.04704881, "epoch": 0.3714414549827146, "flos": 21616644072960.0, "grad_norm": 2.041583893947456, "language_loss": 0.75593603, "learning_rate": 2.896702378079374e-06, "loss": 0.77766305, "num_input_tokens_seen": 132791465, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8125, "step": 6178, "time_per_iteration": 2.4633164405822754 }, { "auxiliary_loss_clip": 0.01130257, "auxiliary_loss_mlp": 0.01036699, "balance_loss_clip": 1.02169704, "balance_loss_mlp": 1.0490706, "epoch": 0.37150157823538255, "flos": 19972294335360.0, "grad_norm": 1.9989258487403767, "language_loss": 0.7201696, "learning_rate": 2.8963542378392502e-06, "loss": 0.74183917, "num_input_tokens_seen": 132810160, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8125, "step": 6179, "time_per_iteration": 3.9133970737457275 }, { "auxiliary_loss_clip": 0.01130282, "auxiliary_loss_mlp": 0.01038385, "balance_loss_clip": 1.02297139, "balance_loss_mlp": 1.04681087, "epoch": 0.3715617014880505, "flos": 24860562266880.0, "grad_norm": 1.9081977059626847, "language_loss": 0.69863605, "learning_rate": 2.896006063609283e-06, "loss": 0.72032279, "num_input_tokens_seen": 132831265, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.83203125, "step": 6180, "time_per_iteration": 3.9373483657836914 }, { "auxiliary_loss_clip": 0.01125327, "auxiliary_loss_mlp": 0.01037034, "balance_loss_clip": 1.02215695, "balance_loss_mlp": 1.04462862, "epoch": 0.3716218247407185, "flos": 20449080489600.0, "grad_norm": 1.6884385651252156, "language_loss": 0.7791459, "learning_rate": 2.8956578554026767e-06, "loss": 0.80076951, "num_input_tokens_seen": 132850005, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8046875, "step": 6181, "time_per_iteration": 2.4876790046691895 }, { "auxiliary_loss_clip": 0.01127103, "auxiliary_loss_mlp": 0.01034289, "balance_loss_clip": 1.01897049, "balance_loss_mlp": 1.04624701, "epoch": 0.37168194799338644, "flos": 24133479166080.0, "grad_norm": 3.299821707513135, "language_loss": 0.7847507, "learning_rate": 2.8953096132326343e-06, "loss": 0.80636466, "num_input_tokens_seen": 132865790, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80859375, "step": 6182, "time_per_iteration": 2.4773244857788086 }, { "auxiliary_loss_clip": 0.01055515, "auxiliary_loss_mlp": 0.01005617, "balance_loss_clip": 1.00353134, "balance_loss_mlp": 1.02809632, "epoch": 0.3717420712460544, "flos": 67408926900480.0, "grad_norm": 0.7911278067547063, "language_loss": 0.57477129, "learning_rate": 2.894961337112362e-06, "loss": 0.59538263, "num_input_tokens_seen": 132921775, "router_z_loss_clip": 0.02087402, "router_z_loss_mlp": 0.2734375, "step": 6183, "time_per_iteration": 3.124450206756592 }, { "auxiliary_loss_clip": 0.01132024, "auxiliary_loss_mlp": 0.01042268, "balance_loss_clip": 1.02622283, "balance_loss_mlp": 1.04598093, "epoch": 0.37180219449872237, "flos": 22376908362240.0, "grad_norm": 1.7067589232164326, "language_loss": 0.77236545, "learning_rate": 2.894613027055066e-06, "loss": 0.79410839, "num_input_tokens_seen": 132941060, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.859375, "step": 6184, "time_per_iteration": 2.460731267929077 }, { "auxiliary_loss_clip": 0.01126515, "auxiliary_loss_mlp": 0.01044667, "balance_loss_clip": 1.0302422, "balance_loss_mlp": 1.04651189, "epoch": 0.37186231775139034, "flos": 21869885934720.0, "grad_norm": 1.8905735937507948, "language_loss": 0.71793985, "learning_rate": 2.894264683073954e-06, "loss": 0.73965168, "num_input_tokens_seen": 132961850, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.796875, "step": 6185, "time_per_iteration": 2.4810256958007812 }, { "auxiliary_loss_clip": 0.01126149, "auxiliary_loss_mlp": 0.010305, "balance_loss_clip": 1.01557517, "balance_loss_mlp": 1.04617286, "epoch": 0.3719224410040583, "flos": 22415225195520.0, "grad_norm": 1.7354977646391563, "language_loss": 0.77263963, "learning_rate": 2.8939163051822363e-06, "loss": 0.79420614, "num_input_tokens_seen": 132981625, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.796875, "step": 6186, "time_per_iteration": 2.4556453227996826 }, { "auxiliary_loss_clip": 0.01133112, "auxiliary_loss_mlp": 0.01038297, "balance_loss_clip": 1.02242994, "balance_loss_mlp": 1.0472635, "epoch": 0.37198256425672627, "flos": 25151223121920.0, "grad_norm": 1.933494715140955, "language_loss": 0.83316565, "learning_rate": 2.8935678933931224e-06, "loss": 0.85487974, "num_input_tokens_seen": 133001225, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.859375, "step": 6187, "time_per_iteration": 2.521761178970337 }, { "auxiliary_loss_clip": 0.0112588, "auxiliary_loss_mlp": 0.01035382, "balance_loss_clip": 1.02033186, "balance_loss_mlp": 1.04426312, "epoch": 0.37204268750939423, "flos": 21138313633920.0, "grad_norm": 1.8252065104198953, "language_loss": 0.85015875, "learning_rate": 2.893219447719824e-06, "loss": 0.87177134, "num_input_tokens_seen": 133018820, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.81640625, "step": 6188, "time_per_iteration": 2.459275484085083 }, { "auxiliary_loss_clip": 0.01128548, "auxiliary_loss_mlp": 0.0103291, "balance_loss_clip": 1.01681662, "balance_loss_mlp": 1.04584503, "epoch": 0.37210281076206225, "flos": 21506829217920.0, "grad_norm": 1.8980052678604942, "language_loss": 0.65758252, "learning_rate": 2.8928709681755548e-06, "loss": 0.67919713, "num_input_tokens_seen": 133040205, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.828125, "step": 6189, "time_per_iteration": 2.4793150424957275 }, { "auxiliary_loss_clip": 0.01129091, "auxiliary_loss_mlp": 0.01038518, "balance_loss_clip": 1.02271128, "balance_loss_mlp": 1.04604018, "epoch": 0.3721629340147302, "flos": 17347835116800.0, "grad_norm": 2.3103740316063086, "language_loss": 0.83936721, "learning_rate": 2.8925224547735293e-06, "loss": 0.86104327, "num_input_tokens_seen": 133058095, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83203125, "step": 6190, "time_per_iteration": 2.4294588565826416 }, { "auxiliary_loss_clip": 0.01130127, "auxiliary_loss_mlp": 0.01033604, "balance_loss_clip": 1.0187211, "balance_loss_mlp": 1.04505634, "epoch": 0.3722230572673982, "flos": 16432400073600.0, "grad_norm": 2.3789532592433216, "language_loss": 0.87668669, "learning_rate": 2.8921739075269633e-06, "loss": 0.89832407, "num_input_tokens_seen": 133071530, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8515625, "step": 6191, "time_per_iteration": 2.4543025493621826 }, { "auxiliary_loss_clip": 0.01130385, "auxiliary_loss_mlp": 0.01035616, "balance_loss_clip": 1.01827073, "balance_loss_mlp": 1.04412651, "epoch": 0.37228318052006615, "flos": 22674716023680.0, "grad_norm": 2.3043168680675565, "language_loss": 0.73657894, "learning_rate": 2.891825326449073e-06, "loss": 0.75823897, "num_input_tokens_seen": 133091410, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.86328125, "step": 6192, "time_per_iteration": 2.501558542251587 }, { "auxiliary_loss_clip": 0.0112586, "auxiliary_loss_mlp": 0.01038327, "balance_loss_clip": 1.02383149, "balance_loss_mlp": 1.04351032, "epoch": 0.3723433037727341, "flos": 25265491263360.0, "grad_norm": 2.7419202329438055, "language_loss": 0.7976324, "learning_rate": 2.8914767115530766e-06, "loss": 0.81927425, "num_input_tokens_seen": 133110365, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.82421875, "step": 6193, "time_per_iteration": 2.5122294425964355 }, { "auxiliary_loss_clip": 0.01127082, "auxiliary_loss_mlp": 0.0103653, "balance_loss_clip": 1.0216825, "balance_loss_mlp": 1.04307938, "epoch": 0.3724034270254021, "flos": 10524664333440.0, "grad_norm": 1.962096978666934, "language_loss": 0.84227777, "learning_rate": 2.891128062852194e-06, "loss": 0.86391389, "num_input_tokens_seen": 133128255, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.83984375, "step": 6194, "time_per_iteration": 2.429987668991089 }, { "auxiliary_loss_clip": 0.01124709, "auxiliary_loss_mlp": 0.01031944, "balance_loss_clip": 1.01755571, "balance_loss_mlp": 1.04263496, "epoch": 0.37246355027807004, "flos": 20266223328000.0, "grad_norm": 3.284788764869438, "language_loss": 0.76985109, "learning_rate": 2.890779380359646e-06, "loss": 0.79141766, "num_input_tokens_seen": 133143975, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8203125, "step": 6195, "time_per_iteration": 2.5181920528411865 }, { "auxiliary_loss_clip": 0.01126108, "auxiliary_loss_mlp": 0.01031257, "balance_loss_clip": 1.01611781, "balance_loss_mlp": 1.04504347, "epoch": 0.372523673530738, "flos": 19500571998720.0, "grad_norm": 1.7868855679500482, "language_loss": 0.78941661, "learning_rate": 2.890430664088655e-06, "loss": 0.81099027, "num_input_tokens_seen": 133162935, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8125, "step": 6196, "time_per_iteration": 2.450085401535034 }, { "auxiliary_loss_clip": 0.0112717, "auxiliary_loss_mlp": 0.01038592, "balance_loss_clip": 1.02451348, "balance_loss_mlp": 1.04607785, "epoch": 0.372583796783406, "flos": 16764250849920.0, "grad_norm": 1.9317978777204314, "language_loss": 0.83269024, "learning_rate": 2.890081914052443e-06, "loss": 0.85434783, "num_input_tokens_seen": 133181180, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80859375, "step": 6197, "time_per_iteration": 2.449824333190918 }, { "auxiliary_loss_clip": 0.0112255, "auxiliary_loss_mlp": 0.01033812, "balance_loss_clip": 1.01807594, "balance_loss_mlp": 1.04255104, "epoch": 0.37264392003607394, "flos": 22637979388800.0, "grad_norm": 1.573898491755316, "language_loss": 0.64279312, "learning_rate": 2.889733130264237e-06, "loss": 0.66435671, "num_input_tokens_seen": 133199615, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.80078125, "step": 6198, "time_per_iteration": 2.476919174194336 }, { "auxiliary_loss_clip": 0.01122479, "auxiliary_loss_mlp": 0.01047592, "balance_loss_clip": 1.03339398, "balance_loss_mlp": 1.04236674, "epoch": 0.3727040432887419, "flos": 19973120348160.0, "grad_norm": 1.5618702144675631, "language_loss": 0.73890996, "learning_rate": 2.889384312737261e-06, "loss": 0.7606107, "num_input_tokens_seen": 133219650, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.80078125, "step": 6199, "time_per_iteration": 2.509202003479004 }, { "auxiliary_loss_clip": 0.01122671, "auxiliary_loss_mlp": 0.01035039, "balance_loss_clip": 1.02096689, "balance_loss_mlp": 1.04177523, "epoch": 0.37276416654140987, "flos": 63899122279680.0, "grad_norm": 2.3530341451107377, "language_loss": 0.80413324, "learning_rate": 2.889035461484742e-06, "loss": 0.8257103, "num_input_tokens_seen": 133245675, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80859375, "step": 6200, "time_per_iteration": 2.8413219451904297 }, { "auxiliary_loss_clip": 0.0112492, "auxiliary_loss_mlp": 0.01037619, "balance_loss_clip": 1.02330184, "balance_loss_mlp": 1.04377794, "epoch": 0.37282428979407783, "flos": 39785970211200.0, "grad_norm": 1.7707603458189578, "language_loss": 0.60319972, "learning_rate": 2.88868657651991e-06, "loss": 0.62482506, "num_input_tokens_seen": 133266905, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8125, "step": 6201, "time_per_iteration": 2.6170313358306885 }, { "auxiliary_loss_clip": 0.01128663, "auxiliary_loss_mlp": 0.01036215, "balance_loss_clip": 1.02111709, "balance_loss_mlp": 1.04598033, "epoch": 0.37288441304674586, "flos": 22709046447360.0, "grad_norm": 1.7415815506243282, "language_loss": 0.72892201, "learning_rate": 2.8883376578559934e-06, "loss": 0.75057077, "num_input_tokens_seen": 133286865, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.828125, "step": 6202, "time_per_iteration": 2.4702038764953613 }, { "auxiliary_loss_clip": 0.01124071, "auxiliary_loss_mlp": 0.01033463, "balance_loss_clip": 1.01918781, "balance_loss_mlp": 1.04350758, "epoch": 0.3729445362994138, "flos": 18770292587520.0, "grad_norm": 2.285921306664973, "language_loss": 0.73920441, "learning_rate": 2.8879887055062243e-06, "loss": 0.7607798, "num_input_tokens_seen": 133305295, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8046875, "step": 6203, "time_per_iteration": 2.4703621864318848 }, { "auxiliary_loss_clip": 0.0112073, "auxiliary_loss_mlp": 0.0103461, "balance_loss_clip": 1.02173543, "balance_loss_mlp": 1.04166377, "epoch": 0.3730046595520818, "flos": 22456199635200.0, "grad_norm": 2.281499863615206, "language_loss": 0.81506097, "learning_rate": 2.8876397194838353e-06, "loss": 0.83661437, "num_input_tokens_seen": 133324625, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.79296875, "step": 6204, "time_per_iteration": 2.4742143154144287 }, { "auxiliary_loss_clip": 0.01128949, "auxiliary_loss_mlp": 0.01038375, "balance_loss_clip": 1.02235878, "balance_loss_mlp": 1.04509771, "epoch": 0.37306478280474975, "flos": 24316372241280.0, "grad_norm": 1.552027486130456, "language_loss": 0.75435323, "learning_rate": 2.8872906998020577e-06, "loss": 0.77602643, "num_input_tokens_seen": 133344625, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83984375, "step": 6205, "time_per_iteration": 2.4991273880004883 }, { "auxiliary_loss_clip": 0.0112503, "auxiliary_loss_mlp": 0.01039946, "balance_loss_clip": 1.02477038, "balance_loss_mlp": 1.04411483, "epoch": 0.3731249060574177, "flos": 15815167741440.0, "grad_norm": 1.9857512881340262, "language_loss": 0.78377825, "learning_rate": 2.886941646474128e-06, "loss": 0.80542803, "num_input_tokens_seen": 133363605, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80859375, "step": 6206, "time_per_iteration": 2.433513879776001 }, { "auxiliary_loss_clip": 0.0112641, "auxiliary_loss_mlp": 0.01034453, "balance_loss_clip": 1.01892614, "balance_loss_mlp": 1.04416311, "epoch": 0.3731850293100857, "flos": 19828077229440.0, "grad_norm": 1.9889259898882778, "language_loss": 0.93589377, "learning_rate": 2.886592559513283e-06, "loss": 0.95750242, "num_input_tokens_seen": 133379405, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.82421875, "step": 6207, "time_per_iteration": 2.483152151107788 }, { "auxiliary_loss_clip": 0.0112587, "auxiliary_loss_mlp": 0.010317, "balance_loss_clip": 1.01788354, "balance_loss_mlp": 1.04266965, "epoch": 0.37324515256275365, "flos": 19062354072960.0, "grad_norm": 2.690416940707077, "language_loss": 0.82685071, "learning_rate": 2.886243438932759e-06, "loss": 0.84842646, "num_input_tokens_seen": 133397585, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.8359375, "step": 6208, "time_per_iteration": 2.4373912811279297 }, { "auxiliary_loss_clip": 0.01127985, "auxiliary_loss_mlp": 0.01037441, "balance_loss_clip": 1.02128267, "balance_loss_mlp": 1.0449723, "epoch": 0.3733052758154216, "flos": 20704333512960.0, "grad_norm": 2.029529443034295, "language_loss": 0.73218125, "learning_rate": 2.8858942847457953e-06, "loss": 0.75383556, "num_input_tokens_seen": 133415365, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.828125, "step": 6209, "time_per_iteration": 2.468517541885376 }, { "auxiliary_loss_clip": 0.01125842, "auxiliary_loss_mlp": 0.01037013, "balance_loss_clip": 1.02105129, "balance_loss_mlp": 1.04511976, "epoch": 0.3733653990680896, "flos": 20193504243840.0, "grad_norm": 1.640632943622613, "language_loss": 0.70472544, "learning_rate": 2.8855450969656305e-06, "loss": 0.726354, "num_input_tokens_seen": 133435700, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.80859375, "step": 6210, "time_per_iteration": 2.461893081665039 }, { "auxiliary_loss_clip": 0.01128229, "auxiliary_loss_mlp": 0.01033154, "balance_loss_clip": 1.01765656, "balance_loss_mlp": 1.04431057, "epoch": 0.37342552232075754, "flos": 20339660684160.0, "grad_norm": 3.276044550565471, "language_loss": 0.78032458, "learning_rate": 2.8851958756055073e-06, "loss": 0.80193841, "num_input_tokens_seen": 133455180, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8359375, "step": 6211, "time_per_iteration": 2.5045583248138428 }, { "auxiliary_loss_clip": 0.01128667, "auxiliary_loss_mlp": 0.01043868, "balance_loss_clip": 1.02855015, "balance_loss_mlp": 1.04481721, "epoch": 0.3734856455734255, "flos": 35517879527040.0, "grad_norm": 1.8623381809054635, "language_loss": 0.73254859, "learning_rate": 2.884846620678668e-06, "loss": 0.75427401, "num_input_tokens_seen": 133476715, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.83984375, "step": 6212, "time_per_iteration": 2.5895936489105225 }, { "auxiliary_loss_clip": 0.0113662, "auxiliary_loss_mlp": 0.0104844, "balance_loss_clip": 1.03285909, "balance_loss_mlp": 1.04770303, "epoch": 0.37354576882609347, "flos": 21142300043520.0, "grad_norm": 2.849457061603428, "language_loss": 0.82261217, "learning_rate": 2.884497332198356e-06, "loss": 0.84446275, "num_input_tokens_seen": 133494550, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.890625, "step": 6213, "time_per_iteration": 2.4860453605651855 }, { "auxiliary_loss_clip": 0.01128162, "auxiliary_loss_mlp": 0.01044966, "balance_loss_clip": 1.0295167, "balance_loss_mlp": 1.04500985, "epoch": 0.37360589207876144, "flos": 21506793304320.0, "grad_norm": 2.6874372848358425, "language_loss": 0.78306574, "learning_rate": 2.8841480101778167e-06, "loss": 0.80479693, "num_input_tokens_seen": 133512640, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.83203125, "step": 6214, "time_per_iteration": 2.4704394340515137 }, { "auxiliary_loss_clip": 0.01125077, "auxiliary_loss_mlp": 0.01041757, "balance_loss_clip": 1.0275712, "balance_loss_mlp": 1.04360795, "epoch": 0.37366601533142946, "flos": 38435800861440.0, "grad_norm": 1.8203300923732508, "language_loss": 0.84864557, "learning_rate": 2.883798654630296e-06, "loss": 0.87031388, "num_input_tokens_seen": 133535540, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.8125, "step": 6215, "time_per_iteration": 2.613600730895996 }, { "auxiliary_loss_clip": 0.01128258, "auxiliary_loss_mlp": 0.01044177, "balance_loss_clip": 1.02837014, "balance_loss_mlp": 1.04243684, "epoch": 0.3737261385840974, "flos": 18441171244800.0, "grad_norm": 1.6459145322968916, "language_loss": 0.67880243, "learning_rate": 2.8834492655690423e-06, "loss": 0.70052683, "num_input_tokens_seen": 133555795, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.859375, "step": 6216, "time_per_iteration": 2.4764575958251953 }, { "auxiliary_loss_clip": 0.01128943, "auxiliary_loss_mlp": 0.0103879, "balance_loss_clip": 1.02269661, "balance_loss_mlp": 1.04427958, "epoch": 0.3737862618367654, "flos": 22929861306240.0, "grad_norm": 3.1652149611710065, "language_loss": 0.65983099, "learning_rate": 2.883099843007303e-06, "loss": 0.6815083, "num_input_tokens_seen": 133575905, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.84765625, "step": 6217, "time_per_iteration": 3.948503017425537 }, { "auxiliary_loss_clip": 0.01129978, "auxiliary_loss_mlp": 0.01044289, "balance_loss_clip": 1.02916157, "balance_loss_mlp": 1.04473484, "epoch": 0.37384638508943335, "flos": 15409664127360.0, "grad_norm": 1.8302488925555922, "language_loss": 0.8053329, "learning_rate": 2.88275038695833e-06, "loss": 0.8270756, "num_input_tokens_seen": 133592585, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8515625, "step": 6218, "time_per_iteration": 2.4570565223693848 }, { "auxiliary_loss_clip": 0.01123768, "auxiliary_loss_mlp": 0.01036717, "balance_loss_clip": 1.02204204, "balance_loss_mlp": 1.0446111, "epoch": 0.3739065083421013, "flos": 24280820755200.0, "grad_norm": 1.4544183995120188, "language_loss": 0.7875737, "learning_rate": 2.8824008974353736e-06, "loss": 0.80917859, "num_input_tokens_seen": 133615070, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 6219, "time_per_iteration": 2.5073819160461426 }, { "auxiliary_loss_clip": 0.0112626, "auxiliary_loss_mlp": 0.01042566, "balance_loss_clip": 1.02734876, "balance_loss_mlp": 1.04550302, "epoch": 0.3739666315947693, "flos": 23002831785600.0, "grad_norm": 1.7532068213214373, "language_loss": 0.76798737, "learning_rate": 2.8820513744516866e-06, "loss": 0.78967565, "num_input_tokens_seen": 133633490, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80859375, "step": 6220, "time_per_iteration": 5.258239030838013 }, { "auxiliary_loss_clip": 0.01128717, "auxiliary_loss_mlp": 0.01043753, "balance_loss_clip": 1.02858973, "balance_loss_mlp": 1.04541397, "epoch": 0.37402675484743725, "flos": 19391116279680.0, "grad_norm": 1.7559773249633281, "language_loss": 0.82990181, "learning_rate": 2.8817018180205235e-06, "loss": 0.85162652, "num_input_tokens_seen": 133653425, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.83203125, "step": 6221, "time_per_iteration": 2.485548734664917 }, { "auxiliary_loss_clip": 0.01127965, "auxiliary_loss_mlp": 0.01042465, "balance_loss_clip": 1.02730203, "balance_loss_mlp": 1.04465461, "epoch": 0.3740868781001052, "flos": 17126158331520.0, "grad_norm": 1.8026935306457512, "language_loss": 0.76746857, "learning_rate": 2.8813522281551387e-06, "loss": 0.78917283, "num_input_tokens_seen": 133670220, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.83203125, "step": 6222, "time_per_iteration": 3.8420097827911377 }, { "auxiliary_loss_clip": 0.01128831, "auxiliary_loss_mlp": 0.01036872, "balance_loss_clip": 1.02204251, "balance_loss_mlp": 1.0468812, "epoch": 0.3741470013527732, "flos": 20043505048320.0, "grad_norm": 2.3202138717025287, "language_loss": 0.70460743, "learning_rate": 2.881002604868789e-06, "loss": 0.72626448, "num_input_tokens_seen": 133688910, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 6223, "time_per_iteration": 2.4717977046966553 }, { "auxiliary_loss_clip": 0.01126746, "auxiliary_loss_mlp": 0.01038404, "balance_loss_clip": 1.02349687, "balance_loss_mlp": 1.04563653, "epoch": 0.37420712460544114, "flos": 36897279569280.0, "grad_norm": 2.193534109834695, "language_loss": 0.68901372, "learning_rate": 2.8806529481747325e-06, "loss": 0.71066523, "num_input_tokens_seen": 133708690, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8125, "step": 6224, "time_per_iteration": 2.6057162284851074 }, { "auxiliary_loss_clip": 0.0112415, "auxiliary_loss_mlp": 0.01036815, "balance_loss_clip": 1.02246213, "balance_loss_mlp": 1.04424489, "epoch": 0.3742672478581091, "flos": 22201198007040.0, "grad_norm": 1.8305072659294723, "language_loss": 0.70129162, "learning_rate": 2.880303258086228e-06, "loss": 0.72290134, "num_input_tokens_seen": 133728095, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.796875, "step": 6225, "time_per_iteration": 2.4704833030700684 }, { "auxiliary_loss_clip": 0.01125014, "auxiliary_loss_mlp": 0.01044373, "balance_loss_clip": 1.02864981, "balance_loss_mlp": 1.04524505, "epoch": 0.3743273711107771, "flos": 24681547860480.0, "grad_norm": 2.141024400230861, "language_loss": 0.79570687, "learning_rate": 2.879953534616536e-06, "loss": 0.81740081, "num_input_tokens_seen": 133745590, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.796875, "step": 6226, "time_per_iteration": 2.5043275356292725 }, { "auxiliary_loss_clip": 0.01127011, "auxiliary_loss_mlp": 0.01039716, "balance_loss_clip": 1.02381945, "balance_loss_mlp": 1.04443634, "epoch": 0.37438749436344504, "flos": 24459619680000.0, "grad_norm": 2.525018003195242, "language_loss": 0.67795205, "learning_rate": 2.879603777778917e-06, "loss": 0.69961929, "num_input_tokens_seen": 133766155, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.828125, "step": 6227, "time_per_iteration": 2.5104076862335205 }, { "auxiliary_loss_clip": 0.01125048, "auxiliary_loss_mlp": 0.01034102, "balance_loss_clip": 1.01930833, "balance_loss_mlp": 1.0449729, "epoch": 0.374447617616113, "flos": 21798747048960.0, "grad_norm": 2.150883425491824, "language_loss": 0.82674921, "learning_rate": 2.879253987586635e-06, "loss": 0.84834069, "num_input_tokens_seen": 133783185, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80078125, "step": 6228, "time_per_iteration": 2.462435007095337 }, { "auxiliary_loss_clip": 0.01124627, "auxiliary_loss_mlp": 0.01038793, "balance_loss_clip": 1.02467239, "balance_loss_mlp": 1.04473698, "epoch": 0.374507740868781, "flos": 17968191932160.0, "grad_norm": 1.5800057848533289, "language_loss": 0.74597025, "learning_rate": 2.8789041640529535e-06, "loss": 0.76760447, "num_input_tokens_seen": 133800975, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.80078125, "step": 6229, "time_per_iteration": 2.4495790004730225 }, { "auxiliary_loss_clip": 0.01125973, "auxiliary_loss_mlp": 0.01038842, "balance_loss_clip": 1.02352357, "balance_loss_mlp": 1.04267502, "epoch": 0.374567864121449, "flos": 16105828596480.0, "grad_norm": 6.238443488301736, "language_loss": 0.83485848, "learning_rate": 2.8785543071911383e-06, "loss": 0.85650659, "num_input_tokens_seen": 133818020, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.83203125, "step": 6230, "time_per_iteration": 2.430790424346924 }, { "auxiliary_loss_clip": 0.01129979, "auxiliary_loss_mlp": 0.01044661, "balance_loss_clip": 1.02983117, "balance_loss_mlp": 1.04644883, "epoch": 0.37462798737411696, "flos": 25773160135680.0, "grad_norm": 2.060654284849793, "language_loss": 0.7373178, "learning_rate": 2.878204417014456e-06, "loss": 0.7590642, "num_input_tokens_seen": 133840690, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8359375, "step": 6231, "time_per_iteration": 2.5398354530334473 }, { "auxiliary_loss_clip": 0.01130878, "auxiliary_loss_mlp": 0.01044942, "balance_loss_clip": 1.0290159, "balance_loss_mlp": 1.04650581, "epoch": 0.3746881106267849, "flos": 16654507822080.0, "grad_norm": 2.1901489320331993, "language_loss": 0.73499811, "learning_rate": 2.8778544935361735e-06, "loss": 0.75675631, "num_input_tokens_seen": 133858350, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.84375, "step": 6232, "time_per_iteration": 2.4349608421325684 }, { "auxiliary_loss_clip": 0.01125475, "auxiliary_loss_mlp": 0.01035217, "balance_loss_clip": 1.01962471, "balance_loss_mlp": 1.04302287, "epoch": 0.3747482338794529, "flos": 26177981391360.0, "grad_norm": 1.701760035803004, "language_loss": 0.76777077, "learning_rate": 2.877504536769561e-06, "loss": 0.78937769, "num_input_tokens_seen": 133879775, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.82421875, "step": 6233, "time_per_iteration": 2.5222580432891846 }, { "auxiliary_loss_clip": 0.01129621, "auxiliary_loss_mlp": 0.01038196, "balance_loss_clip": 1.02337813, "balance_loss_mlp": 1.04582191, "epoch": 0.37480835713212085, "flos": 12021061950720.0, "grad_norm": 2.134623635365829, "language_loss": 0.69530177, "learning_rate": 2.8771545467278883e-06, "loss": 0.71697992, "num_input_tokens_seen": 133898295, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8359375, "step": 6234, "time_per_iteration": 2.4644505977630615 }, { "auxiliary_loss_clip": 0.01126855, "auxiliary_loss_mlp": 0.01042533, "balance_loss_clip": 1.02856159, "balance_loss_mlp": 1.04578781, "epoch": 0.3748684803847888, "flos": 19679263182720.0, "grad_norm": 1.8117592574844963, "language_loss": 0.82469302, "learning_rate": 2.8768045234244276e-06, "loss": 0.84638691, "num_input_tokens_seen": 133915230, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.8125, "step": 6235, "time_per_iteration": 2.4606494903564453 }, { "auxiliary_loss_clip": 0.01130627, "auxiliary_loss_mlp": 0.01031472, "balance_loss_clip": 1.01758456, "balance_loss_mlp": 1.04625905, "epoch": 0.3749286036374568, "flos": 20521189042560.0, "grad_norm": 1.7919937228021774, "language_loss": 0.78225178, "learning_rate": 2.8764544668724517e-06, "loss": 0.80387282, "num_input_tokens_seen": 133934110, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.84375, "step": 6236, "time_per_iteration": 2.470735549926758 }, { "auxiliary_loss_clip": 0.01130714, "auxiliary_loss_mlp": 0.01045121, "balance_loss_clip": 1.02819383, "balance_loss_mlp": 1.04483843, "epoch": 0.37498872689012475, "flos": 20704620821760.0, "grad_norm": 1.9583506455734423, "language_loss": 0.73101765, "learning_rate": 2.876104377085234e-06, "loss": 0.75277603, "num_input_tokens_seen": 133952395, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.859375, "step": 6237, "time_per_iteration": 2.4634392261505127 }, { "auxiliary_loss_clip": 0.01126601, "auxiliary_loss_mlp": 0.01038373, "balance_loss_clip": 1.02265537, "balance_loss_mlp": 1.04160571, "epoch": 0.3750488501427927, "flos": 21574843620480.0, "grad_norm": 1.8668843296243463, "language_loss": 0.92828, "learning_rate": 2.8757542540760508e-06, "loss": 0.94992971, "num_input_tokens_seen": 133969635, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8515625, "step": 6238, "time_per_iteration": 2.4438161849975586 }, { "auxiliary_loss_clip": 0.01126594, "auxiliary_loss_mlp": 0.01034191, "balance_loss_clip": 1.01872981, "balance_loss_mlp": 1.04289293, "epoch": 0.3751089733954607, "flos": 15923869274880.0, "grad_norm": 3.875299796308932, "language_loss": 0.70551741, "learning_rate": 2.8754040978581777e-06, "loss": 0.72712529, "num_input_tokens_seen": 133987215, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8359375, "step": 6239, "time_per_iteration": 2.4795210361480713 }, { "auxiliary_loss_clip": 0.01128886, "auxiliary_loss_mlp": 0.01035547, "balance_loss_clip": 1.0201571, "balance_loss_mlp": 1.04580843, "epoch": 0.37516909664812864, "flos": 36284644177920.0, "grad_norm": 1.7333500565595574, "language_loss": 0.65056127, "learning_rate": 2.875053908444895e-06, "loss": 0.67220557, "num_input_tokens_seen": 134009250, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.83203125, "step": 6240, "time_per_iteration": 2.5862197875976562 }, { "auxiliary_loss_clip": 0.01127097, "auxiliary_loss_mlp": 0.01030324, "balance_loss_clip": 1.01610196, "balance_loss_mlp": 1.04295826, "epoch": 0.3752292199007966, "flos": 13515915283200.0, "grad_norm": 2.8524205021315416, "language_loss": 0.76383293, "learning_rate": 2.8747036858494795e-06, "loss": 0.78540713, "num_input_tokens_seen": 134026875, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.83984375, "step": 6241, "time_per_iteration": 2.4384682178497314 }, { "auxiliary_loss_clip": 0.01127802, "auxiliary_loss_mlp": 0.01042345, "balance_loss_clip": 1.02639484, "balance_loss_mlp": 1.04374146, "epoch": 0.3752893431534646, "flos": 27198095644800.0, "grad_norm": 2.6463597298100456, "language_loss": 0.83902121, "learning_rate": 2.874353430085213e-06, "loss": 0.86072266, "num_input_tokens_seen": 134047185, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83984375, "step": 6242, "time_per_iteration": 2.4970285892486572 }, { "auxiliary_loss_clip": 0.01127491, "auxiliary_loss_mlp": 0.01039053, "balance_loss_clip": 1.02446175, "balance_loss_mlp": 1.04320216, "epoch": 0.3753494664061326, "flos": 30007674581760.0, "grad_norm": 3.5840190476428817, "language_loss": 0.6819284, "learning_rate": 2.8740031411653766e-06, "loss": 0.70359385, "num_input_tokens_seen": 134067330, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.84375, "step": 6243, "time_per_iteration": 2.536849021911621 }, { "auxiliary_loss_clip": 0.01126288, "auxiliary_loss_mlp": 0.01039779, "balance_loss_clip": 1.02366233, "balance_loss_mlp": 1.04362786, "epoch": 0.37540958965880056, "flos": 24461954064000.0, "grad_norm": 2.0216875187303334, "language_loss": 0.83679301, "learning_rate": 2.8736528191032535e-06, "loss": 0.85845375, "num_input_tokens_seen": 134085525, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.828125, "step": 6244, "time_per_iteration": 2.485142707824707 }, { "auxiliary_loss_clip": 0.01122529, "auxiliary_loss_mlp": 0.01034408, "balance_loss_clip": 1.01964986, "balance_loss_mlp": 1.04191315, "epoch": 0.3754697129114685, "flos": 16508387295360.0, "grad_norm": 3.622455919240244, "language_loss": 0.82714021, "learning_rate": 2.8733024639121277e-06, "loss": 0.84870964, "num_input_tokens_seen": 134101855, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8046875, "step": 6245, "time_per_iteration": 2.473747730255127 }, { "auxiliary_loss_clip": 0.01127278, "auxiliary_loss_mlp": 0.01039668, "balance_loss_clip": 1.02394426, "balance_loss_mlp": 1.04426658, "epoch": 0.3755298361641365, "flos": 19390900798080.0, "grad_norm": 2.0018328959355247, "language_loss": 0.64187539, "learning_rate": 2.8729520756052853e-06, "loss": 0.66354477, "num_input_tokens_seen": 134119360, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.83203125, "step": 6246, "time_per_iteration": 2.447115421295166 }, { "auxiliary_loss_clip": 0.0112784, "auxiliary_loss_mlp": 0.01043741, "balance_loss_clip": 1.02783298, "balance_loss_mlp": 1.04324746, "epoch": 0.37558995941680445, "flos": 14720395069440.0, "grad_norm": 1.8073837822678434, "language_loss": 0.74408358, "learning_rate": 2.8726016541960124e-06, "loss": 0.7657994, "num_input_tokens_seen": 134137475, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.84375, "step": 6247, "time_per_iteration": 2.4374914169311523 }, { "auxiliary_loss_clip": 0.01127329, "auxiliary_loss_mlp": 0.01041728, "balance_loss_clip": 1.02649319, "balance_loss_mlp": 1.0431633, "epoch": 0.3756500826694724, "flos": 21689901861120.0, "grad_norm": 3.034187952915738, "language_loss": 0.55442542, "learning_rate": 2.872251199697598e-06, "loss": 0.57611597, "num_input_tokens_seen": 134154580, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83984375, "step": 6248, "time_per_iteration": 2.4599947929382324 }, { "auxiliary_loss_clip": 0.0112664, "auxiliary_loss_mlp": 0.01040439, "balance_loss_clip": 1.02548993, "balance_loss_mlp": 1.04469657, "epoch": 0.3757102059221404, "flos": 26505666190080.0, "grad_norm": 2.6210899159115204, "language_loss": 0.84310925, "learning_rate": 2.8719007121233297e-06, "loss": 0.86478001, "num_input_tokens_seen": 134174285, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8203125, "step": 6249, "time_per_iteration": 2.5150763988494873 }, { "auxiliary_loss_clip": 0.01125192, "auxiliary_loss_mlp": 0.0103939, "balance_loss_clip": 1.02490044, "balance_loss_mlp": 1.04321122, "epoch": 0.37577032917480835, "flos": 37338083274240.0, "grad_norm": 1.7555606855777317, "language_loss": 0.67729115, "learning_rate": 2.8715501914864993e-06, "loss": 0.69893694, "num_input_tokens_seen": 134195940, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8203125, "step": 6250, "time_per_iteration": 2.590726375579834 }, { "auxiliary_loss_clip": 0.01126537, "auxiliary_loss_mlp": 0.0104182, "balance_loss_clip": 1.02769327, "balance_loss_mlp": 1.04326606, "epoch": 0.3758304524274763, "flos": 21908597817600.0, "grad_norm": 2.405828036980816, "language_loss": 0.78372532, "learning_rate": 2.8711996378003987e-06, "loss": 0.8054089, "num_input_tokens_seen": 134212235, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.83203125, "step": 6251, "time_per_iteration": 2.483821392059326 }, { "auxiliary_loss_clip": 0.01125328, "auxiliary_loss_mlp": 0.01036199, "balance_loss_clip": 1.02163196, "balance_loss_mlp": 1.04291201, "epoch": 0.3758905756801443, "flos": 36569343375360.0, "grad_norm": 2.0823338791359927, "language_loss": 0.58433771, "learning_rate": 2.8708490510783203e-06, "loss": 0.60595298, "num_input_tokens_seen": 134233810, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.82421875, "step": 6252, "time_per_iteration": 2.5859999656677246 }, { "auxiliary_loss_clip": 0.01128368, "auxiliary_loss_mlp": 0.01037612, "balance_loss_clip": 1.02205491, "balance_loss_mlp": 1.04363298, "epoch": 0.37595069893281224, "flos": 24528783317760.0, "grad_norm": 2.109764004038231, "language_loss": 0.89690745, "learning_rate": 2.8704984313335584e-06, "loss": 0.91856724, "num_input_tokens_seen": 134252020, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.84765625, "step": 6253, "time_per_iteration": 2.497711181640625 }, { "auxiliary_loss_clip": 0.01126507, "auxiliary_loss_mlp": 0.01037546, "balance_loss_clip": 1.02350307, "balance_loss_mlp": 1.04568791, "epoch": 0.3760108221854802, "flos": 16435021766400.0, "grad_norm": 5.64965854589621, "language_loss": 0.76611942, "learning_rate": 2.8701477785794097e-06, "loss": 0.78775996, "num_input_tokens_seen": 134269495, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8046875, "step": 6254, "time_per_iteration": 2.43949818611145 }, { "auxiliary_loss_clip": 0.01129693, "auxiliary_loss_mlp": 0.01044589, "balance_loss_clip": 1.02891946, "balance_loss_mlp": 1.04591179, "epoch": 0.37607094543814823, "flos": 13771742924160.0, "grad_norm": 2.2633122973578534, "language_loss": 0.61728305, "learning_rate": 2.869797092829169e-06, "loss": 0.63902587, "num_input_tokens_seen": 134287035, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 6255, "time_per_iteration": 2.4806525707244873 }, { "auxiliary_loss_clip": 0.01129906, "auxiliary_loss_mlp": 0.01034171, "balance_loss_clip": 1.01864958, "balance_loss_mlp": 1.04385161, "epoch": 0.3761310686908162, "flos": 19857918453120.0, "grad_norm": 3.1615536213098094, "language_loss": 0.74360585, "learning_rate": 2.869446374096135e-06, "loss": 0.76524663, "num_input_tokens_seen": 134304840, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.86328125, "step": 6256, "time_per_iteration": 2.4484050273895264 }, { "auxiliary_loss_clip": 0.01129705, "auxiliary_loss_mlp": 0.01039216, "balance_loss_clip": 1.02322996, "balance_loss_mlp": 1.04556644, "epoch": 0.37619119194348416, "flos": 12750802657920.0, "grad_norm": 1.985429282975109, "language_loss": 0.70212495, "learning_rate": 2.8690956223936088e-06, "loss": 0.72381419, "num_input_tokens_seen": 134323180, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83984375, "step": 6257, "time_per_iteration": 2.4714441299438477 }, { "auxiliary_loss_clip": 0.01124681, "auxiliary_loss_mlp": 0.01027773, "balance_loss_clip": 1.01368213, "balance_loss_mlp": 1.04300034, "epoch": 0.3762513151961521, "flos": 17530548624000.0, "grad_norm": 1.6630293948501165, "language_loss": 0.84404933, "learning_rate": 2.868744837734889e-06, "loss": 0.86557388, "num_input_tokens_seen": 134341390, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.81640625, "step": 6258, "time_per_iteration": 2.4244120121002197 }, { "auxiliary_loss_clip": 0.01125733, "auxiliary_loss_mlp": 0.01037617, "balance_loss_clip": 1.02387166, "balance_loss_mlp": 1.04371345, "epoch": 0.3763114384488201, "flos": 23617406511360.0, "grad_norm": 1.554219687383706, "language_loss": 0.80658549, "learning_rate": 2.868394020133277e-06, "loss": 0.828219, "num_input_tokens_seen": 134360425, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.8203125, "step": 6259, "time_per_iteration": 3.9739832878112793 }, { "auxiliary_loss_clip": 0.0113052, "auxiliary_loss_mlp": 0.01043326, "balance_loss_clip": 1.0269829, "balance_loss_mlp": 1.04499495, "epoch": 0.37637156170148806, "flos": 25406978935680.0, "grad_norm": 1.895578275823638, "language_loss": 0.71273011, "learning_rate": 2.8680431696020783e-06, "loss": 0.73446858, "num_input_tokens_seen": 134379775, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.85546875, "step": 6260, "time_per_iteration": 2.529064416885376 }, { "auxiliary_loss_clip": 0.01126263, "auxiliary_loss_mlp": 0.01034104, "balance_loss_clip": 1.01891649, "balance_loss_mlp": 1.04223311, "epoch": 0.376431684954156, "flos": 23440906056960.0, "grad_norm": 1.8105263253426704, "language_loss": 0.78357196, "learning_rate": 2.867692286154594e-06, "loss": 0.80517566, "num_input_tokens_seen": 134400315, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83984375, "step": 6261, "time_per_iteration": 2.4618992805480957 }, { "auxiliary_loss_clip": 0.01131979, "auxiliary_loss_mlp": 0.01033458, "balance_loss_clip": 1.01856291, "balance_loss_mlp": 1.04642558, "epoch": 0.376491808206824, "flos": 34204482725760.0, "grad_norm": 1.838538742026725, "language_loss": 0.80442166, "learning_rate": 2.867341369804132e-06, "loss": 0.82607603, "num_input_tokens_seen": 134422875, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.85546875, "step": 6262, "time_per_iteration": 4.018115520477295 }, { "auxiliary_loss_clip": 0.01122876, "auxiliary_loss_mlp": 0.01032466, "balance_loss_clip": 1.01762414, "balance_loss_mlp": 1.04183722, "epoch": 0.37655193145949195, "flos": 35185669614720.0, "grad_norm": 2.3446336189867685, "language_loss": 0.80760038, "learning_rate": 2.866990420563998e-06, "loss": 0.8291539, "num_input_tokens_seen": 134443025, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80859375, "step": 6263, "time_per_iteration": 2.5737545490264893 }, { "auxiliary_loss_clip": 0.01128525, "auxiliary_loss_mlp": 0.01039902, "balance_loss_clip": 1.02503681, "balance_loss_mlp": 1.04547501, "epoch": 0.3766120547121599, "flos": 16761844638720.0, "grad_norm": 5.330038228973422, "language_loss": 0.79770732, "learning_rate": 2.866639438447501e-06, "loss": 0.81939155, "num_input_tokens_seen": 134460945, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.828125, "step": 6264, "time_per_iteration": 3.8506405353546143 }, { "auxiliary_loss_clip": 0.01123898, "auxiliary_loss_mlp": 0.01041841, "balance_loss_clip": 1.02715445, "balance_loss_mlp": 1.04158974, "epoch": 0.3766721779648279, "flos": 23550361776000.0, "grad_norm": 2.015799423179545, "language_loss": 0.74138564, "learning_rate": 2.8662884234679497e-06, "loss": 0.76304305, "num_input_tokens_seen": 134480440, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.82421875, "step": 6265, "time_per_iteration": 2.485415458679199 }, { "auxiliary_loss_clip": 0.01125366, "auxiliary_loss_mlp": 0.01035391, "balance_loss_clip": 1.02233171, "balance_loss_mlp": 1.04515791, "epoch": 0.37673230121749585, "flos": 29129191655040.0, "grad_norm": 1.6626349318813944, "language_loss": 0.68762636, "learning_rate": 2.865937375638654e-06, "loss": 0.70923394, "num_input_tokens_seen": 134501110, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.8046875, "step": 6266, "time_per_iteration": 2.532031774520874 }, { "auxiliary_loss_clip": 0.01130362, "auxiliary_loss_mlp": 0.01038834, "balance_loss_clip": 1.02306306, "balance_loss_mlp": 1.04363596, "epoch": 0.3767924244701638, "flos": 28146783703680.0, "grad_norm": 2.527772290390409, "language_loss": 0.63325286, "learning_rate": 2.8655862949729264e-06, "loss": 0.6549449, "num_input_tokens_seen": 134522460, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8671875, "step": 6267, "time_per_iteration": 2.5409440994262695 }, { "auxiliary_loss_clip": 0.01049898, "auxiliary_loss_mlp": 0.01028291, "balance_loss_clip": 1.02658641, "balance_loss_mlp": 1.02173448, "epoch": 0.37685254772283183, "flos": 60797197526400.0, "grad_norm": 0.7351509014783425, "language_loss": 0.58884466, "learning_rate": 2.8652351814840795e-06, "loss": 0.60962665, "num_input_tokens_seen": 134589545, "router_z_loss_clip": 0.01708984, "router_z_loss_mlp": 0.28125, "step": 6268, "time_per_iteration": 3.189319133758545 }, { "auxiliary_loss_clip": 0.01125678, "auxiliary_loss_mlp": 0.01037962, "balance_loss_clip": 1.02229774, "balance_loss_mlp": 1.04274344, "epoch": 0.3769126709754998, "flos": 26032543223040.0, "grad_norm": 1.481089196310416, "language_loss": 0.65151989, "learning_rate": 2.8648840351854283e-06, "loss": 0.67315626, "num_input_tokens_seen": 134610550, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.828125, "step": 6269, "time_per_iteration": 2.511334180831909 }, { "auxiliary_loss_clip": 0.01126703, "auxiliary_loss_mlp": 0.01035179, "balance_loss_clip": 1.01971149, "balance_loss_mlp": 1.04610753, "epoch": 0.37697279422816776, "flos": 23579879777280.0, "grad_norm": 1.6935669136014813, "language_loss": 0.70836031, "learning_rate": 2.8645328560902874e-06, "loss": 0.72997916, "num_input_tokens_seen": 134630485, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8046875, "step": 6270, "time_per_iteration": 2.5022237300872803 }, { "auxiliary_loss_clip": 0.01049441, "auxiliary_loss_mlp": 0.01001289, "balance_loss_clip": 0.99936932, "balance_loss_mlp": 1.02172112, "epoch": 0.3770329174808357, "flos": 64745935367040.0, "grad_norm": 0.6956785996160386, "language_loss": 0.56107628, "learning_rate": 2.8641816442119746e-06, "loss": 0.5815835, "num_input_tokens_seen": 134693510, "router_z_loss_clip": 0.01916504, "router_z_loss_mlp": 0.27734375, "step": 6271, "time_per_iteration": 3.093209981918335 }, { "auxiliary_loss_clip": 0.01124138, "auxiliary_loss_mlp": 0.01040802, "balance_loss_clip": 1.02507854, "balance_loss_mlp": 1.04317486, "epoch": 0.3770930407335037, "flos": 21835304115840.0, "grad_norm": 1.7476940796720504, "language_loss": 0.79875445, "learning_rate": 2.8638303995638066e-06, "loss": 0.82040381, "num_input_tokens_seen": 134713115, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.80859375, "step": 6272, "time_per_iteration": 2.4625375270843506 }, { "auxiliary_loss_clip": 0.01122741, "auxiliary_loss_mlp": 0.01032677, "balance_loss_clip": 1.01920092, "balance_loss_mlp": 1.04364026, "epoch": 0.37715316398617166, "flos": 22747901984640.0, "grad_norm": 1.4534949036259102, "language_loss": 0.73843509, "learning_rate": 2.863479122159103e-06, "loss": 0.75998926, "num_input_tokens_seen": 134732635, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7890625, "step": 6273, "time_per_iteration": 2.5167853832244873 }, { "auxiliary_loss_clip": 0.01126132, "auxiliary_loss_mlp": 0.01041884, "balance_loss_clip": 1.02787697, "balance_loss_mlp": 1.04484296, "epoch": 0.3772132872388396, "flos": 18914581520640.0, "grad_norm": 4.077907797423643, "language_loss": 0.71791822, "learning_rate": 2.8631278120111858e-06, "loss": 0.73959839, "num_input_tokens_seen": 134750695, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8125, "step": 6274, "time_per_iteration": 2.45607328414917 }, { "auxiliary_loss_clip": 0.01129853, "auxiliary_loss_mlp": 0.01039881, "balance_loss_clip": 1.02630877, "balance_loss_mlp": 1.04706705, "epoch": 0.3772734104915076, "flos": 17346219004800.0, "grad_norm": 1.711497797005744, "language_loss": 0.83758974, "learning_rate": 2.8627764691333742e-06, "loss": 0.85928714, "num_input_tokens_seen": 134768935, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.828125, "step": 6275, "time_per_iteration": 2.455019474029541 }, { "auxiliary_loss_clip": 0.01122825, "auxiliary_loss_mlp": 0.01038705, "balance_loss_clip": 1.02602148, "balance_loss_mlp": 1.04473019, "epoch": 0.37733353374417555, "flos": 32342370785280.0, "grad_norm": 1.409637385608337, "language_loss": 0.75155973, "learning_rate": 2.8624250935389935e-06, "loss": 0.773175, "num_input_tokens_seen": 134791260, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.78125, "step": 6276, "time_per_iteration": 2.5603079795837402 }, { "auxiliary_loss_clip": 0.01129078, "auxiliary_loss_mlp": 0.01040456, "balance_loss_clip": 1.02526259, "balance_loss_mlp": 1.0455575, "epoch": 0.3773936569968435, "flos": 23360681030400.0, "grad_norm": 2.190310080439147, "language_loss": 0.85713339, "learning_rate": 2.862073685241366e-06, "loss": 0.8788287, "num_input_tokens_seen": 134808350, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8359375, "step": 6277, "time_per_iteration": 2.46342396736145 }, { "auxiliary_loss_clip": 0.01123684, "auxiliary_loss_mlp": 0.01032651, "balance_loss_clip": 1.0193001, "balance_loss_mlp": 1.0456742, "epoch": 0.3774537802495115, "flos": 21466788531840.0, "grad_norm": 2.105864744912111, "language_loss": 0.77882123, "learning_rate": 2.861722244253818e-06, "loss": 0.80038458, "num_input_tokens_seen": 134826005, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.78125, "step": 6278, "time_per_iteration": 2.4597055912017822 }, { "auxiliary_loss_clip": 0.01130778, "auxiliary_loss_mlp": 0.01040833, "balance_loss_clip": 1.02533519, "balance_loss_mlp": 1.04652739, "epoch": 0.37751390350217945, "flos": 24973717086720.0, "grad_norm": 1.901676172245662, "language_loss": 0.83428526, "learning_rate": 2.8613707705896767e-06, "loss": 0.85600132, "num_input_tokens_seen": 134844995, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84375, "step": 6279, "time_per_iteration": 2.515594482421875 }, { "auxiliary_loss_clip": 0.01129362, "auxiliary_loss_mlp": 0.0103522, "balance_loss_clip": 1.02191639, "balance_loss_mlp": 1.04700589, "epoch": 0.3775740267548474, "flos": 27819098904960.0, "grad_norm": 1.7825272924216196, "language_loss": 0.74932432, "learning_rate": 2.861019264262269e-06, "loss": 0.77097011, "num_input_tokens_seen": 134865285, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.82421875, "step": 6280, "time_per_iteration": 2.5246918201446533 }, { "auxiliary_loss_clip": 0.01122799, "auxiliary_loss_mlp": 0.01037215, "balance_loss_clip": 1.02433395, "balance_loss_mlp": 1.04417038, "epoch": 0.3776341500075154, "flos": 22565224391040.0, "grad_norm": 1.5650318020453, "language_loss": 0.76207399, "learning_rate": 2.8606677252849242e-06, "loss": 0.78367412, "num_input_tokens_seen": 134886535, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.78515625, "step": 6281, "time_per_iteration": 2.476132869720459 }, { "auxiliary_loss_clip": 0.01124339, "auxiliary_loss_mlp": 0.01039272, "balance_loss_clip": 1.02490139, "balance_loss_mlp": 1.04345214, "epoch": 0.3776942732601834, "flos": 23077238808960.0, "grad_norm": 2.0291367412016, "language_loss": 0.8443917, "learning_rate": 2.860316153670974e-06, "loss": 0.86602783, "num_input_tokens_seen": 134907435, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8125, "step": 6282, "time_per_iteration": 2.4807474613189697 }, { "auxiliary_loss_clip": 0.01121327, "auxiliary_loss_mlp": 0.0103515, "balance_loss_clip": 1.02120268, "balance_loss_mlp": 1.04268074, "epoch": 0.37775439651285136, "flos": 21724411852800.0, "grad_norm": 1.7351909924910052, "language_loss": 0.69817251, "learning_rate": 2.8599645494337484e-06, "loss": 0.71973729, "num_input_tokens_seen": 134925360, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7890625, "step": 6283, "time_per_iteration": 2.480393648147583 }, { "auxiliary_loss_clip": 0.01126172, "auxiliary_loss_mlp": 0.0103881, "balance_loss_clip": 1.02426052, "balance_loss_mlp": 1.04652178, "epoch": 0.37781451976551933, "flos": 23987753688960.0, "grad_norm": 1.89277282254049, "language_loss": 0.75807166, "learning_rate": 2.859612912586581e-06, "loss": 0.7797215, "num_input_tokens_seen": 134944205, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.796875, "step": 6284, "time_per_iteration": 2.4900286197662354 }, { "auxiliary_loss_clip": 0.01132048, "auxiliary_loss_mlp": 0.01032071, "balance_loss_clip": 1.01655042, "balance_loss_mlp": 1.04659784, "epoch": 0.3778746430181873, "flos": 13727967223680.0, "grad_norm": 7.876953872890599, "language_loss": 0.85012066, "learning_rate": 2.8592612431428055e-06, "loss": 0.87176192, "num_input_tokens_seen": 134960255, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.85546875, "step": 6285, "time_per_iteration": 2.43933367729187 }, { "auxiliary_loss_clip": 0.01127755, "auxiliary_loss_mlp": 0.01034523, "balance_loss_clip": 1.01896667, "balance_loss_mlp": 1.04494691, "epoch": 0.37793476627085526, "flos": 19460495399040.0, "grad_norm": 1.9132091901178503, "language_loss": 0.8506639, "learning_rate": 2.858909541115758e-06, "loss": 0.87228668, "num_input_tokens_seen": 134978605, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.828125, "step": 6286, "time_per_iteration": 2.5040056705474854 }, { "auxiliary_loss_clip": 0.01123836, "auxiliary_loss_mlp": 0.01040119, "balance_loss_clip": 1.02596891, "balance_loss_mlp": 1.04289627, "epoch": 0.3779948895235232, "flos": 10707018704640.0, "grad_norm": 2.4676430140831096, "language_loss": 0.82233608, "learning_rate": 2.858557806518775e-06, "loss": 0.8439756, "num_input_tokens_seen": 134995020, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.80859375, "step": 6287, "time_per_iteration": 2.4732470512390137 }, { "auxiliary_loss_clip": 0.0112592, "auxiliary_loss_mlp": 0.01039795, "balance_loss_clip": 1.02550745, "balance_loss_mlp": 1.04486275, "epoch": 0.3780550127761912, "flos": 22310007281280.0, "grad_norm": 2.8115946652812167, "language_loss": 0.73291528, "learning_rate": 2.8582060393651927e-06, "loss": 0.75457245, "num_input_tokens_seen": 135012620, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8125, "step": 6288, "time_per_iteration": 2.4715983867645264 }, { "auxiliary_loss_clip": 0.01127698, "auxiliary_loss_mlp": 0.01035853, "balance_loss_clip": 1.02111864, "balance_loss_mlp": 1.04646695, "epoch": 0.37811513602885916, "flos": 28950644125440.0, "grad_norm": 1.6297056919338386, "language_loss": 0.75243849, "learning_rate": 2.857854239668352e-06, "loss": 0.77407402, "num_input_tokens_seen": 135033365, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8125, "step": 6289, "time_per_iteration": 2.556570529937744 }, { "auxiliary_loss_clip": 0.01125212, "auxiliary_loss_mlp": 0.0103353, "balance_loss_clip": 1.01910591, "balance_loss_mlp": 1.04481816, "epoch": 0.3781752592815271, "flos": 23112933949440.0, "grad_norm": 1.85287970916404, "language_loss": 0.73895514, "learning_rate": 2.857502407441593e-06, "loss": 0.76054251, "num_input_tokens_seen": 135052185, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8046875, "step": 6290, "time_per_iteration": 2.5073745250701904 }, { "auxiliary_loss_clip": 0.01130536, "auxiliary_loss_mlp": 0.01036207, "balance_loss_clip": 1.01999474, "balance_loss_mlp": 1.04533648, "epoch": 0.3782353825341951, "flos": 19755932762880.0, "grad_norm": 2.2601965432838704, "language_loss": 0.7934289, "learning_rate": 2.8571505426982566e-06, "loss": 0.81509638, "num_input_tokens_seen": 135070425, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8515625, "step": 6291, "time_per_iteration": 2.486415147781372 }, { "auxiliary_loss_clip": 0.01128751, "auxiliary_loss_mlp": 0.0103103, "balance_loss_clip": 1.01533031, "balance_loss_mlp": 1.04504132, "epoch": 0.37829550578686305, "flos": 22050839675520.0, "grad_norm": 13.212147620263838, "language_loss": 0.76332808, "learning_rate": 2.8567986454516854e-06, "loss": 0.78492594, "num_input_tokens_seen": 135090525, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8359375, "step": 6292, "time_per_iteration": 2.46947979927063 }, { "auxiliary_loss_clip": 0.0112849, "auxiliary_loss_mlp": 0.01047055, "balance_loss_clip": 1.03162897, "balance_loss_mlp": 1.04682088, "epoch": 0.378355629039531, "flos": 16470357770880.0, "grad_norm": 2.2725165506027465, "language_loss": 0.69550818, "learning_rate": 2.856446715715224e-06, "loss": 0.71726358, "num_input_tokens_seen": 135109575, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.81640625, "step": 6293, "time_per_iteration": 2.462110757827759 }, { "auxiliary_loss_clip": 0.01124269, "auxiliary_loss_mlp": 0.01040441, "balance_loss_clip": 1.02524757, "balance_loss_mlp": 1.04342675, "epoch": 0.378415752292199, "flos": 19974844200960.0, "grad_norm": 2.2170400865068194, "language_loss": 0.71655118, "learning_rate": 2.8560947535022173e-06, "loss": 0.73819828, "num_input_tokens_seen": 135127000, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80859375, "step": 6294, "time_per_iteration": 2.4457836151123047 }, { "auxiliary_loss_clip": 0.01132661, "auxiliary_loss_mlp": 0.01041736, "balance_loss_clip": 1.02565432, "balance_loss_mlp": 1.04698968, "epoch": 0.378475875544867, "flos": 14647388676480.0, "grad_norm": 2.280754791185452, "language_loss": 0.82822633, "learning_rate": 2.855742758826011e-06, "loss": 0.84997028, "num_input_tokens_seen": 135145285, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.859375, "step": 6295, "time_per_iteration": 2.4539971351623535 }, { "auxiliary_loss_clip": 0.01127157, "auxiliary_loss_mlp": 0.01031727, "balance_loss_clip": 1.01726675, "balance_loss_mlp": 1.04470861, "epoch": 0.37853599879753497, "flos": 26650996617600.0, "grad_norm": 1.6955970302906453, "language_loss": 0.7167443, "learning_rate": 2.8553907316999547e-06, "loss": 0.73833311, "num_input_tokens_seen": 135165240, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.82421875, "step": 6296, "time_per_iteration": 2.510664224624634 }, { "auxiliary_loss_clip": 0.01126549, "auxiliary_loss_mlp": 0.0104333, "balance_loss_clip": 1.02889991, "balance_loss_mlp": 1.04710388, "epoch": 0.37859612205020293, "flos": 17311960408320.0, "grad_norm": 1.7898067864292742, "language_loss": 0.77623254, "learning_rate": 2.855038672137396e-06, "loss": 0.79793137, "num_input_tokens_seen": 135184045, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.796875, "step": 6297, "time_per_iteration": 2.467590093612671 }, { "auxiliary_loss_clip": 0.01127582, "auxiliary_loss_mlp": 0.01038275, "balance_loss_clip": 1.02366006, "balance_loss_mlp": 1.04422402, "epoch": 0.3786562453028709, "flos": 18220392299520.0, "grad_norm": 2.149711813755662, "language_loss": 0.78951305, "learning_rate": 2.854686580151684e-06, "loss": 0.81117165, "num_input_tokens_seen": 135202365, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8359375, "step": 6298, "time_per_iteration": 2.4666988849639893 }, { "auxiliary_loss_clip": 0.01125814, "auxiliary_loss_mlp": 0.01037298, "balance_loss_clip": 1.02318943, "balance_loss_mlp": 1.04578745, "epoch": 0.37871636855553886, "flos": 21214875473280.0, "grad_norm": 1.6001612380646102, "language_loss": 0.84116811, "learning_rate": 2.8543344557561722e-06, "loss": 0.86279923, "num_input_tokens_seen": 135220955, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.796875, "step": 6299, "time_per_iteration": 2.470984935760498 }, { "auxiliary_loss_clip": 0.01130477, "auxiliary_loss_mlp": 0.01033405, "balance_loss_clip": 1.01870084, "balance_loss_mlp": 1.04680562, "epoch": 0.3787764918082068, "flos": 20952727038720.0, "grad_norm": 2.955675222518733, "language_loss": 0.76269805, "learning_rate": 2.8539822989642116e-06, "loss": 0.78433692, "num_input_tokens_seen": 135239715, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8359375, "step": 6300, "time_per_iteration": 3.9361419677734375 }, { "auxiliary_loss_clip": 0.0113214, "auxiliary_loss_mlp": 0.01037118, "balance_loss_clip": 1.02004719, "balance_loss_mlp": 1.04629874, "epoch": 0.3788366150608748, "flos": 17308009912320.0, "grad_norm": 2.246701866144619, "language_loss": 0.81995451, "learning_rate": 2.8536301097891577e-06, "loss": 0.84164715, "num_input_tokens_seen": 135257035, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.859375, "step": 6301, "time_per_iteration": 2.432018995285034 }, { "auxiliary_loss_clip": 0.01126303, "auxiliary_loss_mlp": 0.01035471, "balance_loss_clip": 1.02086246, "balance_loss_mlp": 1.04334247, "epoch": 0.37889673831354276, "flos": 24311092942080.0, "grad_norm": 2.121124818186508, "language_loss": 0.67610216, "learning_rate": 2.8532778882443636e-06, "loss": 0.69771993, "num_input_tokens_seen": 135275720, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.828125, "step": 6302, "time_per_iteration": 2.4880809783935547 }, { "auxiliary_loss_clip": 0.01127192, "auxiliary_loss_mlp": 0.01041192, "balance_loss_clip": 1.02696991, "balance_loss_mlp": 1.04681098, "epoch": 0.3789568615662107, "flos": 26683603188480.0, "grad_norm": 1.7867727837724823, "language_loss": 0.68353504, "learning_rate": 2.8529256343431867e-06, "loss": 0.70521885, "num_input_tokens_seen": 135294140, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8046875, "step": 6303, "time_per_iteration": 2.5088438987731934 }, { "auxiliary_loss_clip": 0.0112535, "auxiliary_loss_mlp": 0.01035555, "balance_loss_clip": 1.02100599, "balance_loss_mlp": 1.04370117, "epoch": 0.3790169848188787, "flos": 23585194990080.0, "grad_norm": 15.86669365690093, "language_loss": 0.77882332, "learning_rate": 2.8525733480989846e-06, "loss": 0.80043232, "num_input_tokens_seen": 135314845, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.81640625, "step": 6304, "time_per_iteration": 3.9271926879882812 }, { "auxiliary_loss_clip": 0.0113459, "auxiliary_loss_mlp": 0.01035845, "balance_loss_clip": 1.01939416, "balance_loss_mlp": 1.04915106, "epoch": 0.37907710807154665, "flos": 18437436230400.0, "grad_norm": 2.163364930845067, "language_loss": 0.8013351, "learning_rate": 2.8522210295251146e-06, "loss": 0.82303941, "num_input_tokens_seen": 135333055, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.85546875, "step": 6305, "time_per_iteration": 3.8688178062438965 }, { "auxiliary_loss_clip": 0.01055087, "auxiliary_loss_mlp": 0.01022148, "balance_loss_clip": 1.02004993, "balance_loss_mlp": 1.02731228, "epoch": 0.3791372313242146, "flos": 50107165954560.0, "grad_norm": 0.9882609373470048, "language_loss": 0.64535832, "learning_rate": 2.8518686786349387e-06, "loss": 0.66613066, "num_input_tokens_seen": 135387865, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.27734375, "step": 6306, "time_per_iteration": 2.9890365600585938 }, { "auxiliary_loss_clip": 0.01131172, "auxiliary_loss_mlp": 0.0104248, "balance_loss_clip": 1.02709031, "balance_loss_mlp": 1.04769206, "epoch": 0.3791973545768826, "flos": 24316551809280.0, "grad_norm": 2.531832529755983, "language_loss": 0.73560476, "learning_rate": 2.851516295441817e-06, "loss": 0.75734127, "num_input_tokens_seen": 135409095, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8359375, "step": 6307, "time_per_iteration": 2.506091356277466 }, { "auxiliary_loss_clip": 0.01130844, "auxiliary_loss_mlp": 0.01038752, "balance_loss_clip": 1.02355909, "balance_loss_mlp": 1.04617929, "epoch": 0.3792574778295506, "flos": 21579907438080.0, "grad_norm": 1.6572496869142392, "language_loss": 0.7840575, "learning_rate": 2.851163879959112e-06, "loss": 0.80575347, "num_input_tokens_seen": 135429585, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.84765625, "step": 6308, "time_per_iteration": 2.5086114406585693 }, { "auxiliary_loss_clip": 0.01126586, "auxiliary_loss_mlp": 0.01040287, "balance_loss_clip": 1.02483106, "balance_loss_mlp": 1.04481506, "epoch": 0.37931760108221857, "flos": 22272731942400.0, "grad_norm": 2.6250129510100155, "language_loss": 0.73304498, "learning_rate": 2.8508114322001876e-06, "loss": 0.75471371, "num_input_tokens_seen": 135446320, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8203125, "step": 6309, "time_per_iteration": 2.5221080780029297 }, { "auxiliary_loss_clip": 0.01125915, "auxiliary_loss_mlp": 0.01037399, "balance_loss_clip": 1.02253342, "balance_loss_mlp": 1.04527748, "epoch": 0.37937772433488653, "flos": 19682998197120.0, "grad_norm": 3.441875206029615, "language_loss": 0.78678823, "learning_rate": 2.8504589521784083e-06, "loss": 0.80842137, "num_input_tokens_seen": 135465720, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80859375, "step": 6310, "time_per_iteration": 2.482381820678711 }, { "auxiliary_loss_clip": 0.01125916, "auxiliary_loss_mlp": 0.01036157, "balance_loss_clip": 1.02166128, "balance_loss_mlp": 1.04396081, "epoch": 0.3794378475875545, "flos": 19099378016640.0, "grad_norm": 2.2464921220066554, "language_loss": 0.76322567, "learning_rate": 2.8501064399071403e-06, "loss": 0.78484648, "num_input_tokens_seen": 135485155, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8203125, "step": 6311, "time_per_iteration": 2.4913747310638428 }, { "auxiliary_loss_clip": 0.01126158, "auxiliary_loss_mlp": 0.01030773, "balance_loss_clip": 1.01550889, "balance_loss_mlp": 1.04523981, "epoch": 0.37949797084022246, "flos": 20339660684160.0, "grad_norm": 1.64395026652869, "language_loss": 0.70664203, "learning_rate": 2.8497538953997504e-06, "loss": 0.72821128, "num_input_tokens_seen": 135502675, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80859375, "step": 6312, "time_per_iteration": 2.4536328315734863 }, { "auxiliary_loss_clip": 0.01050157, "auxiliary_loss_mlp": 0.01004745, "balance_loss_clip": 1.00292099, "balance_loss_mlp": 1.02227283, "epoch": 0.37955809409289043, "flos": 63972203477760.0, "grad_norm": 0.7683393239150254, "language_loss": 0.56048924, "learning_rate": 2.849401318669608e-06, "loss": 0.58103824, "num_input_tokens_seen": 135562005, "router_z_loss_clip": 0.01818848, "router_z_loss_mlp": 0.27734375, "step": 6313, "time_per_iteration": 3.0417284965515137 }, { "auxiliary_loss_clip": 0.011265, "auxiliary_loss_mlp": 0.0103494, "balance_loss_clip": 1.02090323, "balance_loss_mlp": 1.04539013, "epoch": 0.3796182173455584, "flos": 31540665179520.0, "grad_norm": 1.970653129520354, "language_loss": 0.71209192, "learning_rate": 2.849048709730083e-06, "loss": 0.7337063, "num_input_tokens_seen": 135582600, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8125, "step": 6314, "time_per_iteration": 2.5455880165100098 }, { "auxiliary_loss_clip": 0.01131784, "auxiliary_loss_mlp": 0.01038125, "balance_loss_clip": 1.02244866, "balance_loss_mlp": 1.0460217, "epoch": 0.37967834059822636, "flos": 12130804978560.0, "grad_norm": 2.021616104705937, "language_loss": 0.72941804, "learning_rate": 2.848696068594545e-06, "loss": 0.75111711, "num_input_tokens_seen": 135600280, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.859375, "step": 6315, "time_per_iteration": 2.4577903747558594 }, { "auxiliary_loss_clip": 0.01124557, "auxiliary_loss_mlp": 0.01035648, "balance_loss_clip": 1.02094316, "balance_loss_mlp": 1.04413998, "epoch": 0.3797384638508943, "flos": 39348578298240.0, "grad_norm": 2.0793250322436454, "language_loss": 0.70745176, "learning_rate": 2.8483433952763677e-06, "loss": 0.72905385, "num_input_tokens_seen": 135621560, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8046875, "step": 6316, "time_per_iteration": 2.6058030128479004 }, { "auxiliary_loss_clip": 0.01125737, "auxiliary_loss_mlp": 0.01036127, "balance_loss_clip": 1.0223707, "balance_loss_mlp": 1.04515791, "epoch": 0.3797985871035623, "flos": 34054016653440.0, "grad_norm": 2.099457558495544, "language_loss": 0.64960235, "learning_rate": 2.847990689788923e-06, "loss": 0.67122102, "num_input_tokens_seen": 135641745, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.8046875, "step": 6317, "time_per_iteration": 2.5778188705444336 }, { "auxiliary_loss_clip": 0.01123024, "auxiliary_loss_mlp": 0.01031461, "balance_loss_clip": 1.01786542, "balance_loss_mlp": 1.04285836, "epoch": 0.37985871035623026, "flos": 23222174186880.0, "grad_norm": 2.20877893086722, "language_loss": 0.8559072, "learning_rate": 2.8476379521455877e-06, "loss": 0.87745202, "num_input_tokens_seen": 135660650, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.80078125, "step": 6318, "time_per_iteration": 2.4762816429138184 }, { "auxiliary_loss_clip": 0.01128685, "auxiliary_loss_mlp": 0.01043623, "balance_loss_clip": 1.02795911, "balance_loss_mlp": 1.04541457, "epoch": 0.3799188336088982, "flos": 18114958903680.0, "grad_norm": 2.2910106528720653, "language_loss": 0.76549876, "learning_rate": 2.8472851823597354e-06, "loss": 0.78722191, "num_input_tokens_seen": 135679980, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.83203125, "step": 6319, "time_per_iteration": 2.462430953979492 }, { "auxiliary_loss_clip": 0.01127602, "auxiliary_loss_mlp": 0.01037379, "balance_loss_clip": 1.0232048, "balance_loss_mlp": 1.04621696, "epoch": 0.3799789568615662, "flos": 21871897096320.0, "grad_norm": 1.5498234112033242, "language_loss": 0.63650715, "learning_rate": 2.846932380444744e-06, "loss": 0.65815693, "num_input_tokens_seen": 135699400, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.8125, "step": 6320, "time_per_iteration": 2.4902920722961426 }, { "auxiliary_loss_clip": 0.01125682, "auxiliary_loss_mlp": 0.01036153, "balance_loss_clip": 1.02187204, "balance_loss_mlp": 1.04463995, "epoch": 0.3800390801142342, "flos": 32962943082240.0, "grad_norm": 3.201432937437584, "language_loss": 0.71230662, "learning_rate": 2.846579546413992e-06, "loss": 0.73392498, "num_input_tokens_seen": 135723455, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8125, "step": 6321, "time_per_iteration": 2.5937085151672363 }, { "auxiliary_loss_clip": 0.01127781, "auxiliary_loss_mlp": 0.01039155, "balance_loss_clip": 1.02495754, "balance_loss_mlp": 1.04478204, "epoch": 0.38009920336690217, "flos": 26907075653760.0, "grad_norm": 1.8244105284878738, "language_loss": 0.749277, "learning_rate": 2.846226680280859e-06, "loss": 0.77094632, "num_input_tokens_seen": 135744335, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.828125, "step": 6322, "time_per_iteration": 2.52187180519104 }, { "auxiliary_loss_clip": 0.01125053, "auxiliary_loss_mlp": 0.01034641, "balance_loss_clip": 1.02026403, "balance_loss_mlp": 1.04458785, "epoch": 0.38015932661957014, "flos": 22488913946880.0, "grad_norm": 1.9696407230871102, "language_loss": 0.84625697, "learning_rate": 2.845873782058725e-06, "loss": 0.86785394, "num_input_tokens_seen": 135761440, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8046875, "step": 6323, "time_per_iteration": 2.465008497238159 }, { "auxiliary_loss_clip": 0.01128675, "auxiliary_loss_mlp": 0.01034026, "balance_loss_clip": 1.01818335, "balance_loss_mlp": 1.04619002, "epoch": 0.3802194498722381, "flos": 21980993679360.0, "grad_norm": 7.897940889325664, "language_loss": 0.73518264, "learning_rate": 2.845520851760973e-06, "loss": 0.75680971, "num_input_tokens_seen": 135779955, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.82421875, "step": 6324, "time_per_iteration": 2.462891101837158 }, { "auxiliary_loss_clip": 0.01127608, "auxiliary_loss_mlp": 0.01036724, "balance_loss_clip": 1.02147698, "balance_loss_mlp": 1.04418945, "epoch": 0.38027957312490607, "flos": 21324869896320.0, "grad_norm": 1.8702661726467646, "language_loss": 0.83854234, "learning_rate": 2.8451678894009847e-06, "loss": 0.86018562, "num_input_tokens_seen": 135799840, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83203125, "step": 6325, "time_per_iteration": 2.486036539077759 }, { "auxiliary_loss_clip": 0.01125191, "auxiliary_loss_mlp": 0.01031392, "balance_loss_clip": 1.01715827, "balance_loss_mlp": 1.0441128, "epoch": 0.38033969637757403, "flos": 16691244456960.0, "grad_norm": 2.0739348354657343, "language_loss": 0.79316747, "learning_rate": 2.8448148949921465e-06, "loss": 0.81473339, "num_input_tokens_seen": 135817880, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.80859375, "step": 6326, "time_per_iteration": 2.4386370182037354 }, { "auxiliary_loss_clip": 0.01123117, "auxiliary_loss_mlp": 0.01038508, "balance_loss_clip": 1.02461398, "balance_loss_mlp": 1.04275286, "epoch": 0.380399819630242, "flos": 36210847685760.0, "grad_norm": 3.267750846577505, "language_loss": 0.73262227, "learning_rate": 2.844461868547842e-06, "loss": 0.75423849, "num_input_tokens_seen": 135838940, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.8046875, "step": 6327, "time_per_iteration": 2.597797393798828 }, { "auxiliary_loss_clip": 0.01126425, "auxiliary_loss_mlp": 0.0103741, "balance_loss_clip": 1.02262187, "balance_loss_mlp": 1.04611588, "epoch": 0.38045994288290996, "flos": 21288851533440.0, "grad_norm": 1.6461344180041746, "language_loss": 0.8302955, "learning_rate": 2.844108810081459e-06, "loss": 0.85193384, "num_input_tokens_seen": 135858325, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8046875, "step": 6328, "time_per_iteration": 2.4891555309295654 }, { "auxiliary_loss_clip": 0.01123882, "auxiliary_loss_mlp": 0.01029823, "balance_loss_clip": 1.01581609, "balance_loss_mlp": 1.04379177, "epoch": 0.38052006613557793, "flos": 20922885815040.0, "grad_norm": 1.4616486669803592, "language_loss": 0.61555016, "learning_rate": 2.843755719606385e-06, "loss": 0.63708717, "num_input_tokens_seen": 135878430, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80078125, "step": 6329, "time_per_iteration": 2.488793134689331 }, { "auxiliary_loss_clip": 0.0112721, "auxiliary_loss_mlp": 0.01032898, "balance_loss_clip": 1.01880133, "balance_loss_mlp": 1.04624605, "epoch": 0.3805801893882459, "flos": 20990720649600.0, "grad_norm": 2.884815281120906, "language_loss": 0.56247807, "learning_rate": 2.8434025971360104e-06, "loss": 0.58407915, "num_input_tokens_seen": 135894755, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80859375, "step": 6330, "time_per_iteration": 2.523925542831421 }, { "auxiliary_loss_clip": 0.01123081, "auxiliary_loss_mlp": 0.01029455, "balance_loss_clip": 1.01655078, "balance_loss_mlp": 1.04564548, "epoch": 0.38064031264091386, "flos": 25558594243200.0, "grad_norm": 1.7575843901169923, "language_loss": 0.66152561, "learning_rate": 2.8430494426837243e-06, "loss": 0.68305099, "num_input_tokens_seen": 135918275, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7734375, "step": 6331, "time_per_iteration": 2.5923244953155518 }, { "auxiliary_loss_clip": 0.01127828, "auxiliary_loss_mlp": 0.01042048, "balance_loss_clip": 1.02690291, "balance_loss_mlp": 1.04686511, "epoch": 0.3807004358935818, "flos": 15085857997440.0, "grad_norm": 1.911662984546156, "language_loss": 0.76104414, "learning_rate": 2.842696256262919e-06, "loss": 0.78274286, "num_input_tokens_seen": 135937430, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 6332, "time_per_iteration": 2.483520746231079 }, { "auxiliary_loss_clip": 0.01127413, "auxiliary_loss_mlp": 0.01037784, "balance_loss_clip": 1.02257288, "balance_loss_mlp": 1.04423976, "epoch": 0.3807605591462498, "flos": 16399398453120.0, "grad_norm": 2.0732065849430272, "language_loss": 0.81363481, "learning_rate": 2.842343037886987e-06, "loss": 0.83528674, "num_input_tokens_seen": 135954210, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83203125, "step": 6333, "time_per_iteration": 2.451140880584717 }, { "auxiliary_loss_clip": 0.01124972, "auxiliary_loss_mlp": 0.01033487, "balance_loss_clip": 1.01948583, "balance_loss_mlp": 1.04499996, "epoch": 0.3808206823989178, "flos": 29057083102080.0, "grad_norm": 1.8771511504172549, "language_loss": 0.86122024, "learning_rate": 2.8419897875693226e-06, "loss": 0.88280475, "num_input_tokens_seen": 135974425, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80078125, "step": 6334, "time_per_iteration": 2.5423057079315186 }, { "auxiliary_loss_clip": 0.01123898, "auxiliary_loss_mlp": 0.01037395, "balance_loss_clip": 1.02319157, "balance_loss_mlp": 1.04297113, "epoch": 0.3808808056515858, "flos": 15705855676800.0, "grad_norm": 2.1956411062754064, "language_loss": 0.78619003, "learning_rate": 2.841636505323321e-06, "loss": 0.80780292, "num_input_tokens_seen": 135991985, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.80859375, "step": 6335, "time_per_iteration": 2.4323830604553223 }, { "auxiliary_loss_clip": 0.01128358, "auxiliary_loss_mlp": 0.01035581, "balance_loss_clip": 1.02073336, "balance_loss_mlp": 1.047135, "epoch": 0.38094092890425374, "flos": 20704584908160.0, "grad_norm": 1.7473137465460247, "language_loss": 0.72485876, "learning_rate": 2.8412831911623795e-06, "loss": 0.74649823, "num_input_tokens_seen": 136010015, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8125, "step": 6336, "time_per_iteration": 2.4690842628479004 }, { "auxiliary_loss_clip": 0.01123144, "auxiliary_loss_mlp": 0.01033082, "balance_loss_clip": 1.01913428, "balance_loss_mlp": 1.04380882, "epoch": 0.3810010521569217, "flos": 20667956014080.0, "grad_norm": 2.645315730784338, "language_loss": 0.6940769, "learning_rate": 2.840929845099894e-06, "loss": 0.71563923, "num_input_tokens_seen": 136028440, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.79296875, "step": 6337, "time_per_iteration": 2.47546648979187 }, { "auxiliary_loss_clip": 0.01127186, "auxiliary_loss_mlp": 0.01032417, "balance_loss_clip": 1.01768243, "balance_loss_mlp": 1.04579973, "epoch": 0.38106117540958967, "flos": 31827626933760.0, "grad_norm": 1.9799057571173326, "language_loss": 0.63613772, "learning_rate": 2.8405764671492652e-06, "loss": 0.65773368, "num_input_tokens_seen": 136048360, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8125, "step": 6338, "time_per_iteration": 2.5557501316070557 }, { "auxiliary_loss_clip": 0.01127371, "auxiliary_loss_mlp": 0.01045147, "balance_loss_clip": 1.02973294, "balance_loss_mlp": 1.04519463, "epoch": 0.38112129866225763, "flos": 16902757693440.0, "grad_norm": 2.720777449398238, "language_loss": 0.69273412, "learning_rate": 2.8402230573238923e-06, "loss": 0.7144593, "num_input_tokens_seen": 136065500, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8203125, "step": 6339, "time_per_iteration": 2.4345531463623047 }, { "auxiliary_loss_clip": 0.01131425, "auxiliary_loss_mlp": 0.01043342, "balance_loss_clip": 1.02920389, "balance_loss_mlp": 1.04815078, "epoch": 0.3811814219149256, "flos": 20887226588160.0, "grad_norm": 2.2831750060652634, "language_loss": 0.67983282, "learning_rate": 2.839869615637177e-06, "loss": 0.70158052, "num_input_tokens_seen": 136084060, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.83203125, "step": 6340, "time_per_iteration": 2.4695606231689453 }, { "auxiliary_loss_clip": 0.0112779, "auxiliary_loss_mlp": 0.01038876, "balance_loss_clip": 1.02327204, "balance_loss_mlp": 1.04418099, "epoch": 0.38124154516759357, "flos": 16690813493760.0, "grad_norm": 2.1693184372377727, "language_loss": 0.8983947, "learning_rate": 2.839516142102522e-06, "loss": 0.92006135, "num_input_tokens_seen": 136102310, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 6341, "time_per_iteration": 3.8996503353118896 }, { "auxiliary_loss_clip": 0.01128827, "auxiliary_loss_mlp": 0.01040026, "balance_loss_clip": 1.02396846, "balance_loss_mlp": 1.04498434, "epoch": 0.38130166842026153, "flos": 19681956702720.0, "grad_norm": 2.028004149646598, "language_loss": 0.74794888, "learning_rate": 2.83916263673333e-06, "loss": 0.76963741, "num_input_tokens_seen": 136120725, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83984375, "step": 6342, "time_per_iteration": 2.4827165603637695 }, { "auxiliary_loss_clip": 0.01125601, "auxiliary_loss_mlp": 0.01030921, "balance_loss_clip": 1.01628816, "balance_loss_mlp": 1.04359627, "epoch": 0.3813617916729295, "flos": 22198432659840.0, "grad_norm": 1.799229735210503, "language_loss": 0.83591181, "learning_rate": 2.838809099543007e-06, "loss": 0.85747707, "num_input_tokens_seen": 136139105, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8203125, "step": 6343, "time_per_iteration": 2.4860799312591553 }, { "auxiliary_loss_clip": 0.01125368, "auxiliary_loss_mlp": 0.01038442, "balance_loss_clip": 1.02423835, "balance_loss_mlp": 1.04249024, "epoch": 0.38142191492559746, "flos": 19096899978240.0, "grad_norm": 1.7931781432150324, "language_loss": 0.7732563, "learning_rate": 2.838455530544959e-06, "loss": 0.79489446, "num_input_tokens_seen": 136158265, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.828125, "step": 6344, "time_per_iteration": 2.56679368019104 }, { "auxiliary_loss_clip": 0.01128506, "auxiliary_loss_mlp": 0.01039644, "balance_loss_clip": 1.02426636, "balance_loss_mlp": 1.04566789, "epoch": 0.3814820381782654, "flos": 24097748112000.0, "grad_norm": 2.1798661594460778, "language_loss": 0.7345649, "learning_rate": 2.838101929752593e-06, "loss": 0.75624633, "num_input_tokens_seen": 136176100, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.828125, "step": 6345, "time_per_iteration": 5.41127347946167 }, { "auxiliary_loss_clip": 0.01123035, "auxiliary_loss_mlp": 0.01033325, "balance_loss_clip": 1.01943123, "balance_loss_mlp": 1.042454, "epoch": 0.3815421614309334, "flos": 15778502933760.0, "grad_norm": 1.8320716344217212, "language_loss": 0.69712907, "learning_rate": 2.8377482971793187e-06, "loss": 0.71869272, "num_input_tokens_seen": 136195125, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.8046875, "step": 6346, "time_per_iteration": 2.4387831687927246 }, { "auxiliary_loss_clip": 0.01128028, "auxiliary_loss_mlp": 0.01033326, "balance_loss_clip": 1.01829958, "balance_loss_mlp": 1.04497576, "epoch": 0.38160228468360136, "flos": 19899754819200.0, "grad_norm": 2.0523397819134095, "language_loss": 0.75509095, "learning_rate": 2.8373946328385437e-06, "loss": 0.77670449, "num_input_tokens_seen": 136213885, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.83203125, "step": 6347, "time_per_iteration": 3.955366611480713 }, { "auxiliary_loss_clip": 0.01126081, "auxiliary_loss_mlp": 0.01039994, "balance_loss_clip": 1.02592778, "balance_loss_mlp": 1.0437634, "epoch": 0.3816624079362694, "flos": 19281050029440.0, "grad_norm": 1.6582904476565357, "language_loss": 0.74372542, "learning_rate": 2.8370409367436813e-06, "loss": 0.76538616, "num_input_tokens_seen": 136232700, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.82421875, "step": 6348, "time_per_iteration": 2.5138237476348877 }, { "auxiliary_loss_clip": 0.01124616, "auxiliary_loss_mlp": 0.0103405, "balance_loss_clip": 1.01966166, "balance_loss_mlp": 1.04315996, "epoch": 0.38172253118893734, "flos": 21177564220800.0, "grad_norm": 2.375090054203688, "language_loss": 0.87904394, "learning_rate": 2.836687208908142e-06, "loss": 0.90063065, "num_input_tokens_seen": 136248975, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8125, "step": 6349, "time_per_iteration": 2.467808485031128 }, { "auxiliary_loss_clip": 0.01126473, "auxiliary_loss_mlp": 0.01036719, "balance_loss_clip": 1.0217824, "balance_loss_mlp": 1.04400623, "epoch": 0.3817826544416053, "flos": 17529219820800.0, "grad_norm": 2.1085459730460436, "language_loss": 0.77030408, "learning_rate": 2.836333449345341e-06, "loss": 0.79193604, "num_input_tokens_seen": 136266710, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.82421875, "step": 6350, "time_per_iteration": 2.481201410293579 }, { "auxiliary_loss_clip": 0.01125735, "auxiliary_loss_mlp": 0.01032409, "balance_loss_clip": 1.01632762, "balance_loss_mlp": 1.04412806, "epoch": 0.38184277769427327, "flos": 16326535714560.0, "grad_norm": 1.9885919104848397, "language_loss": 0.76112831, "learning_rate": 2.8359796580686907e-06, "loss": 0.78270972, "num_input_tokens_seen": 136284445, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8125, "step": 6351, "time_per_iteration": 2.4298436641693115 }, { "auxiliary_loss_clip": 0.01128568, "auxiliary_loss_mlp": 0.01034666, "balance_loss_clip": 1.01891875, "balance_loss_mlp": 1.04527092, "epoch": 0.38190290094694124, "flos": 30443450382720.0, "grad_norm": 2.3200582471776086, "language_loss": 0.74468374, "learning_rate": 2.8356258350916085e-06, "loss": 0.76631606, "num_input_tokens_seen": 136305730, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.83203125, "step": 6352, "time_per_iteration": 2.576634645462036 }, { "auxiliary_loss_clip": 0.01123072, "auxiliary_loss_mlp": 0.010329, "balance_loss_clip": 1.01969147, "balance_loss_mlp": 1.04269719, "epoch": 0.3819630241996092, "flos": 14209924936320.0, "grad_norm": 2.036438158296297, "language_loss": 0.64572465, "learning_rate": 2.8352719804275104e-06, "loss": 0.66728437, "num_input_tokens_seen": 136323850, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.8046875, "step": 6353, "time_per_iteration": 2.432013750076294 }, { "auxiliary_loss_clip": 0.01124627, "auxiliary_loss_mlp": 0.01035473, "balance_loss_clip": 1.02137113, "balance_loss_mlp": 1.04379773, "epoch": 0.38202314745227717, "flos": 25009699536000.0, "grad_norm": 1.5755674560982407, "language_loss": 0.83399308, "learning_rate": 2.834918094089816e-06, "loss": 0.8555941, "num_input_tokens_seen": 136344880, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80859375, "step": 6354, "time_per_iteration": 2.5485551357269287 }, { "auxiliary_loss_clip": 0.01121881, "auxiliary_loss_mlp": 0.01035356, "balance_loss_clip": 1.02221894, "balance_loss_mlp": 1.04296041, "epoch": 0.38208327070494513, "flos": 20814507504000.0, "grad_norm": 2.0468744215240764, "language_loss": 0.80463952, "learning_rate": 2.834564176091943e-06, "loss": 0.82621193, "num_input_tokens_seen": 136366060, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7890625, "step": 6355, "time_per_iteration": 2.5252952575683594 }, { "auxiliary_loss_clip": 0.01126452, "auxiliary_loss_mlp": 0.01036424, "balance_loss_clip": 1.02214885, "balance_loss_mlp": 1.04477406, "epoch": 0.3821433939576131, "flos": 22637727993600.0, "grad_norm": 2.339234235632514, "language_loss": 0.75410926, "learning_rate": 2.8342102264473125e-06, "loss": 0.775738, "num_input_tokens_seen": 136385625, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.81640625, "step": 6356, "time_per_iteration": 2.504850387573242 }, { "auxiliary_loss_clip": 0.01125591, "auxiliary_loss_mlp": 0.01038842, "balance_loss_clip": 1.02457809, "balance_loss_mlp": 1.04429066, "epoch": 0.38220351721028106, "flos": 26869872142080.0, "grad_norm": 40.41372413701902, "language_loss": 0.81512731, "learning_rate": 2.833856245169348e-06, "loss": 0.83677161, "num_input_tokens_seen": 136405750, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8125, "step": 6357, "time_per_iteration": 2.49967098236084 }, { "auxiliary_loss_clip": 0.01132272, "auxiliary_loss_mlp": 0.01045353, "balance_loss_clip": 1.02973628, "balance_loss_mlp": 1.04891384, "epoch": 0.38226364046294903, "flos": 23367468700800.0, "grad_norm": 1.9937415292958436, "language_loss": 0.77864456, "learning_rate": 2.8335022322714695e-06, "loss": 0.80042082, "num_input_tokens_seen": 136426085, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 6358, "time_per_iteration": 2.518838405609131 }, { "auxiliary_loss_clip": 0.01127858, "auxiliary_loss_mlp": 0.01043493, "balance_loss_clip": 1.02857971, "balance_loss_mlp": 1.0445416, "epoch": 0.382323763715617, "flos": 19646225648640.0, "grad_norm": 2.198181335101743, "language_loss": 0.78579879, "learning_rate": 2.8331481877671036e-06, "loss": 0.80751228, "num_input_tokens_seen": 136442670, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.83203125, "step": 6359, "time_per_iteration": 2.449389696121216 }, { "auxiliary_loss_clip": 0.0112613, "auxiliary_loss_mlp": 0.01039334, "balance_loss_clip": 1.02445054, "balance_loss_mlp": 1.0450896, "epoch": 0.38238388696828496, "flos": 54124741232640.0, "grad_norm": 5.484784269413752, "language_loss": 0.69530296, "learning_rate": 2.8327941116696754e-06, "loss": 0.71695763, "num_input_tokens_seen": 136465730, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80859375, "step": 6360, "time_per_iteration": 2.7907087802886963 }, { "auxiliary_loss_clip": 0.01124655, "auxiliary_loss_mlp": 0.01032, "balance_loss_clip": 1.01743245, "balance_loss_mlp": 1.04427409, "epoch": 0.382444010220953, "flos": 24936190352640.0, "grad_norm": 2.018066264210857, "language_loss": 0.79012167, "learning_rate": 2.83244000399261e-06, "loss": 0.81168818, "num_input_tokens_seen": 136487215, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8046875, "step": 6361, "time_per_iteration": 2.537825345993042 }, { "auxiliary_loss_clip": 0.01122268, "auxiliary_loss_mlp": 0.01043141, "balance_loss_clip": 1.02930665, "balance_loss_mlp": 1.04294252, "epoch": 0.38250413347362094, "flos": 42337351209600.0, "grad_norm": 1.7107295984293642, "language_loss": 0.65625423, "learning_rate": 2.832085864749337e-06, "loss": 0.67790836, "num_input_tokens_seen": 136510365, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.79296875, "step": 6362, "time_per_iteration": 2.729534149169922 }, { "auxiliary_loss_clip": 0.01123788, "auxiliary_loss_mlp": 0.01038049, "balance_loss_clip": 1.0226233, "balance_loss_mlp": 1.04247546, "epoch": 0.3825642567262889, "flos": 16289224462080.0, "grad_norm": 1.7171911001613218, "language_loss": 0.81680226, "learning_rate": 2.8317316939532848e-06, "loss": 0.83842063, "num_input_tokens_seen": 136527100, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8125, "step": 6363, "time_per_iteration": 2.458895206451416 }, { "auxiliary_loss_clip": 0.01124513, "auxiliary_loss_mlp": 0.01046087, "balance_loss_clip": 1.03158522, "balance_loss_mlp": 1.04500747, "epoch": 0.3826243799789569, "flos": 45654778586880.0, "grad_norm": 2.332848450271436, "language_loss": 0.58819646, "learning_rate": 2.8313774916178825e-06, "loss": 0.6099025, "num_input_tokens_seen": 136550870, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.796875, "step": 6364, "time_per_iteration": 2.7200348377227783 }, { "auxiliary_loss_clip": 0.01128377, "auxiliary_loss_mlp": 0.01039481, "balance_loss_clip": 1.02407908, "balance_loss_mlp": 1.04452157, "epoch": 0.38268450323162484, "flos": 25301581453440.0, "grad_norm": 2.091437123582131, "language_loss": 0.69103044, "learning_rate": 2.8310232577565635e-06, "loss": 0.71270901, "num_input_tokens_seen": 136569895, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.83984375, "step": 6365, "time_per_iteration": 2.5098586082458496 }, { "auxiliary_loss_clip": 0.0112812, "auxiliary_loss_mlp": 0.01038378, "balance_loss_clip": 1.0230124, "balance_loss_mlp": 1.04301775, "epoch": 0.3827446264842928, "flos": 21836022387840.0, "grad_norm": 2.0544183574062016, "language_loss": 0.73228979, "learning_rate": 2.830668992382758e-06, "loss": 0.75395477, "num_input_tokens_seen": 136588585, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8515625, "step": 6366, "time_per_iteration": 2.5627474784851074 }, { "auxiliary_loss_clip": 0.01128545, "auxiliary_loss_mlp": 0.01036274, "balance_loss_clip": 1.02140832, "balance_loss_mlp": 1.04593241, "epoch": 0.38280474973696077, "flos": 25734591907200.0, "grad_norm": 2.044124184909405, "language_loss": 0.6782347, "learning_rate": 2.830314695509902e-06, "loss": 0.69988286, "num_input_tokens_seen": 136606640, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.82421875, "step": 6367, "time_per_iteration": 2.5508525371551514 }, { "auxiliary_loss_clip": 0.01122273, "auxiliary_loss_mlp": 0.01035389, "balance_loss_clip": 1.02061903, "balance_loss_mlp": 1.04396105, "epoch": 0.38286487298962874, "flos": 24895934184960.0, "grad_norm": 3.8224031261087053, "language_loss": 0.64266801, "learning_rate": 2.82996036715143e-06, "loss": 0.66424465, "num_input_tokens_seen": 136624940, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78125, "step": 6368, "time_per_iteration": 2.5214667320251465 }, { "auxiliary_loss_clip": 0.01127479, "auxiliary_loss_mlp": 0.01035037, "balance_loss_clip": 1.02024317, "balance_loss_mlp": 1.04561114, "epoch": 0.3829249962422967, "flos": 28543703967360.0, "grad_norm": 1.5081143119041722, "language_loss": 0.68465781, "learning_rate": 2.8296060073207763e-06, "loss": 0.70628297, "num_input_tokens_seen": 136645540, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8203125, "step": 6369, "time_per_iteration": 2.5320632457733154 }, { "auxiliary_loss_clip": 0.01125379, "auxiliary_loss_mlp": 0.01044367, "balance_loss_clip": 1.02921593, "balance_loss_mlp": 1.04480481, "epoch": 0.38298511949496467, "flos": 21471205904640.0, "grad_norm": 1.6728703798064255, "language_loss": 0.78552854, "learning_rate": 2.8292516160313804e-06, "loss": 0.80722594, "num_input_tokens_seen": 136664530, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8046875, "step": 6370, "time_per_iteration": 2.5183634757995605 }, { "auxiliary_loss_clip": 0.01127771, "auxiliary_loss_mlp": 0.01035372, "balance_loss_clip": 1.02085781, "balance_loss_mlp": 1.0465095, "epoch": 0.38304524274763263, "flos": 31679998035840.0, "grad_norm": 2.6318088689154533, "language_loss": 0.64577579, "learning_rate": 2.8288971932966805e-06, "loss": 0.66740727, "num_input_tokens_seen": 136682315, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8125, "step": 6371, "time_per_iteration": 2.534759521484375 }, { "auxiliary_loss_clip": 0.01132727, "auxiliary_loss_mlp": 0.01039944, "balance_loss_clip": 1.02364826, "balance_loss_mlp": 1.0478617, "epoch": 0.3831053660003006, "flos": 25076816098560.0, "grad_norm": 1.8445936658789894, "language_loss": 0.73169285, "learning_rate": 2.8285427391301155e-06, "loss": 0.75341952, "num_input_tokens_seen": 136701185, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.84765625, "step": 6372, "time_per_iteration": 2.5095410346984863 }, { "auxiliary_loss_clip": 0.01129087, "auxiliary_loss_mlp": 0.0103179, "balance_loss_clip": 1.01686502, "balance_loss_mlp": 1.0460459, "epoch": 0.38316548925296856, "flos": 23259018562560.0, "grad_norm": 1.6263493801094877, "language_loss": 0.84989953, "learning_rate": 2.8281882535451266e-06, "loss": 0.8715083, "num_input_tokens_seen": 136721265, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.828125, "step": 6373, "time_per_iteration": 2.5053627490997314 }, { "auxiliary_loss_clip": 0.01130517, "auxiliary_loss_mlp": 0.01042907, "balance_loss_clip": 1.02763629, "balance_loss_mlp": 1.04649448, "epoch": 0.3832256125056366, "flos": 34423465991040.0, "grad_norm": 2.197582829527085, "language_loss": 0.74998927, "learning_rate": 2.8278337365551567e-06, "loss": 0.77172351, "num_input_tokens_seen": 136741885, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83984375, "step": 6374, "time_per_iteration": 2.6166296005249023 }, { "auxiliary_loss_clip": 0.01131131, "auxiliary_loss_mlp": 0.01034755, "balance_loss_clip": 1.01996732, "balance_loss_mlp": 1.04718852, "epoch": 0.38328573575830455, "flos": 21762764599680.0, "grad_norm": 2.310941471288262, "language_loss": 0.75612545, "learning_rate": 2.8274791881736485e-06, "loss": 0.77778429, "num_input_tokens_seen": 136760905, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.83984375, "step": 6375, "time_per_iteration": 2.471513032913208 }, { "auxiliary_loss_clip": 0.01129536, "auxiliary_loss_mlp": 0.01038257, "balance_loss_clip": 1.02329004, "balance_loss_mlp": 1.04696727, "epoch": 0.3833458590109725, "flos": 17380010724480.0, "grad_norm": 2.4070411476833673, "language_loss": 0.72942686, "learning_rate": 2.8271246084140457e-06, "loss": 0.75110483, "num_input_tokens_seen": 136777240, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.82421875, "step": 6376, "time_per_iteration": 2.4583466053009033 }, { "auxiliary_loss_clip": 0.01127521, "auxiliary_loss_mlp": 0.01035653, "balance_loss_clip": 1.02031112, "balance_loss_mlp": 1.04589152, "epoch": 0.3834059822636405, "flos": 29424557191680.0, "grad_norm": 2.4850214939241697, "language_loss": 0.67690456, "learning_rate": 2.826769997289796e-06, "loss": 0.69853628, "num_input_tokens_seen": 136801040, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.81640625, "step": 6377, "time_per_iteration": 2.5361204147338867 }, { "auxiliary_loss_clip": 0.01130872, "auxiliary_loss_mlp": 0.01037015, "balance_loss_clip": 1.02147055, "balance_loss_mlp": 1.0473547, "epoch": 0.38346610551630844, "flos": 21470739027840.0, "grad_norm": 2.0127330800664445, "language_loss": 0.72999102, "learning_rate": 2.826415354814344e-06, "loss": 0.75166994, "num_input_tokens_seen": 136819495, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8359375, "step": 6378, "time_per_iteration": 2.525132656097412 }, { "auxiliary_loss_clip": 0.01128453, "auxiliary_loss_mlp": 0.01035017, "balance_loss_clip": 1.02033007, "balance_loss_mlp": 1.04531384, "epoch": 0.3835262287689764, "flos": 27561224188800.0, "grad_norm": 3.7910051943772594, "language_loss": 0.69701159, "learning_rate": 2.8260606810011396e-06, "loss": 0.71864629, "num_input_tokens_seen": 136838840, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.83203125, "step": 6379, "time_per_iteration": 2.585693359375 }, { "auxiliary_loss_clip": 0.01131316, "auxiliary_loss_mlp": 0.01033834, "balance_loss_clip": 1.01900482, "balance_loss_mlp": 1.05015457, "epoch": 0.3835863520216444, "flos": 15523716787200.0, "grad_norm": 1.7582449932945163, "language_loss": 0.83159018, "learning_rate": 2.8257059758636315e-06, "loss": 0.85324168, "num_input_tokens_seen": 136854425, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8125, "step": 6380, "time_per_iteration": 2.5021026134490967 }, { "auxiliary_loss_clip": 0.01126078, "auxiliary_loss_mlp": 0.01032241, "balance_loss_clip": 1.01827526, "balance_loss_mlp": 1.04647362, "epoch": 0.38364647527431234, "flos": 21904934630400.0, "grad_norm": 1.472334848946637, "language_loss": 0.81057823, "learning_rate": 2.8253512394152697e-06, "loss": 0.83216143, "num_input_tokens_seen": 136874355, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.796875, "step": 6381, "time_per_iteration": 2.497352361679077 }, { "auxiliary_loss_clip": 0.01054807, "auxiliary_loss_mlp": 0.01002179, "balance_loss_clip": 1.00031888, "balance_loss_mlp": 1.02747297, "epoch": 0.3837065985269803, "flos": 65534927558400.0, "grad_norm": 0.8366500809968126, "language_loss": 0.60482943, "learning_rate": 2.8249964716695068e-06, "loss": 0.62539929, "num_input_tokens_seen": 136937475, "router_z_loss_clip": 0.01855469, "router_z_loss_mlp": 0.2734375, "step": 6382, "time_per_iteration": 3.0948915481567383 }, { "auxiliary_loss_clip": 0.01132273, "auxiliary_loss_mlp": 0.01035623, "balance_loss_clip": 1.02019751, "balance_loss_mlp": 1.04762924, "epoch": 0.38376672177964827, "flos": 28256598558720.0, "grad_norm": 2.473275851468148, "language_loss": 0.66466415, "learning_rate": 2.824641672639794e-06, "loss": 0.68634307, "num_input_tokens_seen": 136955805, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84375, "step": 6383, "time_per_iteration": 4.023232936859131 }, { "auxiliary_loss_clip": 0.01133794, "auxiliary_loss_mlp": 0.01031944, "balance_loss_clip": 1.01719761, "balance_loss_mlp": 1.05102754, "epoch": 0.38382684503231623, "flos": 20631363033600.0, "grad_norm": 1.880661046654944, "language_loss": 0.75286001, "learning_rate": 2.824286842339587e-06, "loss": 0.77451736, "num_input_tokens_seen": 136975240, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.828125, "step": 6384, "time_per_iteration": 2.504333972930908 }, { "auxiliary_loss_clip": 0.01126918, "auxiliary_loss_mlp": 0.01036674, "balance_loss_clip": 1.02233911, "balance_loss_mlp": 1.04652286, "epoch": 0.3838869682849842, "flos": 19605825826560.0, "grad_norm": 1.358261401148306, "language_loss": 0.76193416, "learning_rate": 2.823931980782341e-06, "loss": 0.78357005, "num_input_tokens_seen": 136994985, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8046875, "step": 6385, "time_per_iteration": 2.5515918731689453 }, { "auxiliary_loss_clip": 0.01054508, "auxiliary_loss_mlp": 0.01002174, "balance_loss_clip": 1.00033844, "balance_loss_mlp": 1.02708519, "epoch": 0.38394709153765216, "flos": 56556110891520.0, "grad_norm": 0.9204130106327668, "language_loss": 0.67045021, "learning_rate": 2.82357708798151e-06, "loss": 0.69101703, "num_input_tokens_seen": 137046290, "router_z_loss_clip": 0.01831055, "router_z_loss_mlp": 0.2734375, "step": 6386, "time_per_iteration": 2.975796699523926 }, { "auxiliary_loss_clip": 0.01129632, "auxiliary_loss_mlp": 0.01034196, "balance_loss_clip": 1.02044582, "balance_loss_mlp": 1.04978108, "epoch": 0.3840072147903202, "flos": 15888748752000.0, "grad_norm": 2.0213177939687337, "language_loss": 0.72304797, "learning_rate": 2.8232221639505547e-06, "loss": 0.74468625, "num_input_tokens_seen": 137064725, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.796875, "step": 6387, "time_per_iteration": 3.9229772090911865 }, { "auxiliary_loss_clip": 0.01128689, "auxiliary_loss_mlp": 0.01034776, "balance_loss_clip": 1.02060246, "balance_loss_mlp": 1.04963064, "epoch": 0.38406733804298815, "flos": 28218030330240.0, "grad_norm": 1.8214044432291834, "language_loss": 0.80855763, "learning_rate": 2.822867208702932e-06, "loss": 0.83019233, "num_input_tokens_seen": 137086030, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 6388, "time_per_iteration": 3.986915111541748 }, { "auxiliary_loss_clip": 0.01124564, "auxiliary_loss_mlp": 0.0103389, "balance_loss_clip": 1.02047288, "balance_loss_mlp": 1.04530263, "epoch": 0.3841274612956561, "flos": 18223588609920.0, "grad_norm": 1.8565392147535587, "language_loss": 0.76311237, "learning_rate": 2.8225122222521026e-06, "loss": 0.78469688, "num_input_tokens_seen": 137105400, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.79296875, "step": 6389, "time_per_iteration": 2.474937915802002 }, { "auxiliary_loss_clip": 0.01131396, "auxiliary_loss_mlp": 0.01043515, "balance_loss_clip": 1.0275054, "balance_loss_mlp": 1.04741538, "epoch": 0.3841875845483241, "flos": 19792884879360.0, "grad_norm": 1.7896179229147793, "language_loss": 0.76257432, "learning_rate": 2.8221572046115273e-06, "loss": 0.78432345, "num_input_tokens_seen": 137124985, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83984375, "step": 6390, "time_per_iteration": 2.501887321472168 }, { "auxiliary_loss_clip": 0.01130112, "auxiliary_loss_mlp": 0.01048819, "balance_loss_clip": 1.03272629, "balance_loss_mlp": 1.0457418, "epoch": 0.38424770780099204, "flos": 29898829393920.0, "grad_norm": 2.348222620866825, "language_loss": 0.69705033, "learning_rate": 2.821802155794668e-06, "loss": 0.7188397, "num_input_tokens_seen": 137146745, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.84375, "step": 6391, "time_per_iteration": 2.6083824634552 }, { "auxiliary_loss_clip": 0.0112575, "auxiliary_loss_mlp": 0.01036201, "balance_loss_clip": 1.02159762, "balance_loss_mlp": 1.04378688, "epoch": 0.38430783105366, "flos": 20813717404800.0, "grad_norm": 1.9131431897948319, "language_loss": 0.84488875, "learning_rate": 2.8214470758149884e-06, "loss": 0.86650825, "num_input_tokens_seen": 137163195, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8203125, "step": 6392, "time_per_iteration": 2.488947868347168 }, { "auxiliary_loss_clip": 0.01126126, "auxiliary_loss_mlp": 0.01032344, "balance_loss_clip": 1.01845646, "balance_loss_mlp": 1.04492199, "epoch": 0.384367954306328, "flos": 10998577399680.0, "grad_norm": 3.160204362089651, "language_loss": 0.60935998, "learning_rate": 2.8210919646859536e-06, "loss": 0.63094473, "num_input_tokens_seen": 137179330, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.8125, "step": 6393, "time_per_iteration": 2.4876904487609863 }, { "auxiliary_loss_clip": 0.01135136, "auxiliary_loss_mlp": 0.01031607, "balance_loss_clip": 1.01595497, "balance_loss_mlp": 1.04870248, "epoch": 0.38442807755899594, "flos": 25338030779520.0, "grad_norm": 1.9789661459923813, "language_loss": 0.71242875, "learning_rate": 2.820736822421029e-06, "loss": 0.73409623, "num_input_tokens_seen": 137198655, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.86328125, "step": 6394, "time_per_iteration": 2.536069393157959 }, { "auxiliary_loss_clip": 0.01132859, "auxiliary_loss_mlp": 0.01034154, "balance_loss_clip": 1.01837051, "balance_loss_mlp": 1.04676819, "epoch": 0.3844882008116639, "flos": 21069760527360.0, "grad_norm": 2.337406030321143, "language_loss": 0.81519365, "learning_rate": 2.8203816490336822e-06, "loss": 0.83686376, "num_input_tokens_seen": 137217120, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.859375, "step": 6395, "time_per_iteration": 2.5240020751953125 }, { "auxiliary_loss_clip": 0.01132085, "auxiliary_loss_mlp": 0.0104265, "balance_loss_clip": 1.02829742, "balance_loss_mlp": 1.04970253, "epoch": 0.38454832406433187, "flos": 17963235855360.0, "grad_norm": 1.765764959093002, "language_loss": 0.70895898, "learning_rate": 2.8200264445373813e-06, "loss": 0.73070633, "num_input_tokens_seen": 137234410, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8203125, "step": 6396, "time_per_iteration": 2.453664541244507 }, { "auxiliary_loss_clip": 0.01047615, "auxiliary_loss_mlp": 0.01005444, "balance_loss_clip": 1.00365627, "balance_loss_mlp": 1.01996374, "epoch": 0.38460844731699984, "flos": 67924999555200.0, "grad_norm": 0.8983879556929493, "language_loss": 0.59730637, "learning_rate": 2.8196712089455954e-06, "loss": 0.61783695, "num_input_tokens_seen": 137294940, "router_z_loss_clip": 0.01782227, "router_z_loss_mlp": 0.27734375, "step": 6397, "time_per_iteration": 3.1703271865844727 }, { "auxiliary_loss_clip": 0.01129407, "auxiliary_loss_mlp": 0.01030995, "balance_loss_clip": 1.01576006, "balance_loss_mlp": 1.04835856, "epoch": 0.3846685705696678, "flos": 25849075530240.0, "grad_norm": 2.456578997713189, "language_loss": 0.84574306, "learning_rate": 2.819315942271794e-06, "loss": 0.86734712, "num_input_tokens_seen": 137315035, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80859375, "step": 6398, "time_per_iteration": 2.557116985321045 }, { "auxiliary_loss_clip": 0.01127948, "auxiliary_loss_mlp": 0.01031359, "balance_loss_clip": 1.01700628, "balance_loss_mlp": 1.04687929, "epoch": 0.38472869382233577, "flos": 16290194129280.0, "grad_norm": 2.237376240606132, "language_loss": 0.80250448, "learning_rate": 2.8189606445294515e-06, "loss": 0.82409751, "num_input_tokens_seen": 137333155, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8125, "step": 6399, "time_per_iteration": 2.497706413269043 }, { "auxiliary_loss_clip": 0.0112813, "auxiliary_loss_mlp": 0.01033848, "balance_loss_clip": 1.01831484, "balance_loss_mlp": 1.04537439, "epoch": 0.38478881707500373, "flos": 19353122668800.0, "grad_norm": 1.9069254955200463, "language_loss": 0.67523682, "learning_rate": 2.818605315732038e-06, "loss": 0.69685656, "num_input_tokens_seen": 137351515, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.828125, "step": 6400, "time_per_iteration": 2.484046220779419 }, { "auxiliary_loss_clip": 0.01133258, "auxiliary_loss_mlp": 0.01043416, "balance_loss_clip": 1.02864552, "balance_loss_mlp": 1.0485642, "epoch": 0.38484894032767175, "flos": 24860849575680.0, "grad_norm": 2.726723727230919, "language_loss": 0.73178369, "learning_rate": 2.81824995589303e-06, "loss": 0.75355041, "num_input_tokens_seen": 137371255, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.84375, "step": 6401, "time_per_iteration": 2.5427372455596924 }, { "auxiliary_loss_clip": 0.01127562, "auxiliary_loss_mlp": 0.01035802, "balance_loss_clip": 1.02105546, "balance_loss_mlp": 1.04510248, "epoch": 0.3849090635803397, "flos": 14501806853760.0, "grad_norm": 2.3626259436290575, "language_loss": 0.71801496, "learning_rate": 2.8178945650259012e-06, "loss": 0.73964858, "num_input_tokens_seen": 137388980, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.82421875, "step": 6402, "time_per_iteration": 2.464515209197998 }, { "auxiliary_loss_clip": 0.01122071, "auxiliary_loss_mlp": 0.01031923, "balance_loss_clip": 1.0178206, "balance_loss_mlp": 1.04304385, "epoch": 0.3849691868330077, "flos": 18515865576960.0, "grad_norm": 2.149790004831815, "language_loss": 0.8332957, "learning_rate": 2.817539143144128e-06, "loss": 0.85483563, "num_input_tokens_seen": 137406885, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7890625, "step": 6403, "time_per_iteration": 2.5160129070281982 }, { "auxiliary_loss_clip": 0.01126809, "auxiliary_loss_mlp": 0.01036629, "balance_loss_clip": 1.02153158, "balance_loss_mlp": 1.04664922, "epoch": 0.38502931008567565, "flos": 21616392677760.0, "grad_norm": 2.061860571426755, "language_loss": 0.82639015, "learning_rate": 2.817183690261189e-06, "loss": 0.84802449, "num_input_tokens_seen": 137425535, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80078125, "step": 6404, "time_per_iteration": 2.491173267364502 }, { "auxiliary_loss_clip": 0.01128483, "auxiliary_loss_mlp": 0.01035056, "balance_loss_clip": 1.02029753, "balance_loss_mlp": 1.04465032, "epoch": 0.3850894333383436, "flos": 25415346804480.0, "grad_norm": 1.8825500033797051, "language_loss": 0.69682109, "learning_rate": 2.816828206390563e-06, "loss": 0.71845645, "num_input_tokens_seen": 137447700, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8359375, "step": 6405, "time_per_iteration": 2.574737787246704 }, { "auxiliary_loss_clip": 0.01123552, "auxiliary_loss_mlp": 0.010371, "balance_loss_clip": 1.02391505, "balance_loss_mlp": 1.0451417, "epoch": 0.3851495565910116, "flos": 20227870581120.0, "grad_norm": 2.2754177200756165, "language_loss": 0.79086006, "learning_rate": 2.816472691545729e-06, "loss": 0.8124665, "num_input_tokens_seen": 137462245, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.78515625, "step": 6406, "time_per_iteration": 2.4662609100341797 }, { "auxiliary_loss_clip": 0.01129434, "auxiliary_loss_mlp": 0.01036978, "balance_loss_clip": 1.02132583, "balance_loss_mlp": 1.04744244, "epoch": 0.38520967984367954, "flos": 16508459122560.0, "grad_norm": 6.093518161433502, "language_loss": 0.84248066, "learning_rate": 2.8161171457401694e-06, "loss": 0.8641448, "num_input_tokens_seen": 137476455, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.81640625, "step": 6407, "time_per_iteration": 2.4606926441192627 }, { "auxiliary_loss_clip": 0.01048015, "auxiliary_loss_mlp": 0.01003317, "balance_loss_clip": 1.00162458, "balance_loss_mlp": 1.02094007, "epoch": 0.3852698030963475, "flos": 61313772971520.0, "grad_norm": 0.8512619061984317, "language_loss": 0.64980084, "learning_rate": 2.815761568987365e-06, "loss": 0.67031419, "num_input_tokens_seen": 137539845, "router_z_loss_clip": 0.01696777, "router_z_loss_mlp": 0.26953125, "step": 6408, "time_per_iteration": 3.153155565261841 }, { "auxiliary_loss_clip": 0.01127649, "auxiliary_loss_mlp": 0.01040944, "balance_loss_clip": 1.0255425, "balance_loss_mlp": 1.04576445, "epoch": 0.3853299263490155, "flos": 22893016930560.0, "grad_norm": 1.566691416423868, "language_loss": 0.73906791, "learning_rate": 2.8154059613008e-06, "loss": 0.76075381, "num_input_tokens_seen": 137559880, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8203125, "step": 6409, "time_per_iteration": 2.5182290077209473 }, { "auxiliary_loss_clip": 0.01133636, "auxiliary_loss_mlp": 0.01045089, "balance_loss_clip": 1.02884078, "balance_loss_mlp": 1.04626346, "epoch": 0.38539004960168344, "flos": 20047491457920.0, "grad_norm": 1.9588842742134362, "language_loss": 0.69864285, "learning_rate": 2.81505032269396e-06, "loss": 0.72043014, "num_input_tokens_seen": 137578225, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.875, "step": 6410, "time_per_iteration": 2.4692745208740234 }, { "auxiliary_loss_clip": 0.01046584, "auxiliary_loss_mlp": 0.01003296, "balance_loss_clip": 1.0014838, "balance_loss_mlp": 1.01948881, "epoch": 0.3854501728543514, "flos": 68730691570560.0, "grad_norm": 0.6758502136408998, "language_loss": 0.60339743, "learning_rate": 2.81469465318033e-06, "loss": 0.62389624, "num_input_tokens_seen": 137645770, "router_z_loss_clip": 0.01806641, "router_z_loss_mlp": 0.26953125, "step": 6411, "time_per_iteration": 3.193270444869995 }, { "auxiliary_loss_clip": 0.01124697, "auxiliary_loss_mlp": 0.01027141, "balance_loss_clip": 1.01328301, "balance_loss_mlp": 1.04324079, "epoch": 0.38551029610701937, "flos": 20485027025280.0, "grad_norm": 3.2348106282426334, "language_loss": 0.77489692, "learning_rate": 2.814338952773397e-06, "loss": 0.79641533, "num_input_tokens_seen": 137664090, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.81640625, "step": 6412, "time_per_iteration": 2.4999284744262695 }, { "auxiliary_loss_clip": 0.01128422, "auxiliary_loss_mlp": 0.01034812, "balance_loss_clip": 1.01846826, "balance_loss_mlp": 1.0442518, "epoch": 0.38557041935968733, "flos": 23471788775040.0, "grad_norm": 1.782915039966288, "language_loss": 0.78069812, "learning_rate": 2.8139832214866493e-06, "loss": 0.80233049, "num_input_tokens_seen": 137683190, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.83984375, "step": 6413, "time_per_iteration": 2.5514466762542725 }, { "auxiliary_loss_clip": 0.01046936, "auxiliary_loss_mlp": 0.01003801, "balance_loss_clip": 1.00207257, "balance_loss_mlp": 1.01992679, "epoch": 0.38563054261235535, "flos": 63966636869760.0, "grad_norm": 1.0739771342375952, "language_loss": 0.61314046, "learning_rate": 2.813627459333576e-06, "loss": 0.63364786, "num_input_tokens_seen": 137737315, "router_z_loss_clip": 0.01733398, "router_z_loss_mlp": 0.26953125, "step": 6414, "time_per_iteration": 2.9736363887786865 }, { "auxiliary_loss_clip": 0.01131273, "auxiliary_loss_mlp": 0.01039852, "balance_loss_clip": 1.02472425, "balance_loss_mlp": 1.04683375, "epoch": 0.3856906658650233, "flos": 23987789602560.0, "grad_norm": 2.0658356276728917, "language_loss": 0.77460182, "learning_rate": 2.8132716663276685e-06, "loss": 0.79631305, "num_input_tokens_seen": 137753535, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.84375, "step": 6415, "time_per_iteration": 2.4985384941101074 }, { "auxiliary_loss_clip": 0.01121597, "auxiliary_loss_mlp": 0.01030009, "balance_loss_clip": 1.01726556, "balance_loss_mlp": 1.04491043, "epoch": 0.3857507891176913, "flos": 25007436979200.0, "grad_norm": 1.634690435811482, "language_loss": 0.79776013, "learning_rate": 2.8129158424824173e-06, "loss": 0.81927615, "num_input_tokens_seen": 137773405, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.765625, "step": 6416, "time_per_iteration": 2.551456928253174 }, { "auxiliary_loss_clip": 0.01124575, "auxiliary_loss_mlp": 0.01035412, "balance_loss_clip": 1.02188134, "balance_loss_mlp": 1.04351783, "epoch": 0.38581091237035925, "flos": 21536778182400.0, "grad_norm": 1.8542654443389683, "language_loss": 0.78955269, "learning_rate": 2.8125599878113155e-06, "loss": 0.81115258, "num_input_tokens_seen": 137790810, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.8125, "step": 6417, "time_per_iteration": 2.484811544418335 }, { "auxiliary_loss_clip": 0.01123147, "auxiliary_loss_mlp": 0.01035636, "balance_loss_clip": 1.0224812, "balance_loss_mlp": 1.04230678, "epoch": 0.3858710356230272, "flos": 17383889393280.0, "grad_norm": 1.8436206957274859, "language_loss": 0.79881954, "learning_rate": 2.8122041023278583e-06, "loss": 0.82040733, "num_input_tokens_seen": 137810265, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.80859375, "step": 6418, "time_per_iteration": 2.521031379699707 }, { "auxiliary_loss_clip": 0.01121981, "auxiliary_loss_mlp": 0.0103101, "balance_loss_clip": 1.01768208, "balance_loss_mlp": 1.04268026, "epoch": 0.3859311588756952, "flos": 20339588856960.0, "grad_norm": 2.3253595010958406, "language_loss": 0.79684889, "learning_rate": 2.8118481860455407e-06, "loss": 0.81837881, "num_input_tokens_seen": 137828580, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.79296875, "step": 6419, "time_per_iteration": 2.4825439453125 }, { "auxiliary_loss_clip": 0.01122611, "auxiliary_loss_mlp": 0.01034698, "balance_loss_clip": 1.01923704, "balance_loss_mlp": 1.04435921, "epoch": 0.38599128212836314, "flos": 26321157002880.0, "grad_norm": 2.1475508038969764, "language_loss": 0.67564458, "learning_rate": 2.8114922389778573e-06, "loss": 0.69721764, "num_input_tokens_seen": 137846145, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.78125, "step": 6420, "time_per_iteration": 2.5390024185180664 }, { "auxiliary_loss_clip": 0.0112379, "auxiliary_loss_mlp": 0.01034127, "balance_loss_clip": 1.0210259, "balance_loss_mlp": 1.0456599, "epoch": 0.3860514053810311, "flos": 13553837066880.0, "grad_norm": 2.168606429839363, "language_loss": 0.81553429, "learning_rate": 2.8111362611383076e-06, "loss": 0.8371135, "num_input_tokens_seen": 137863705, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.78125, "step": 6421, "time_per_iteration": 2.458143949508667 }, { "auxiliary_loss_clip": 0.01125506, "auxiliary_loss_mlp": 0.01036485, "balance_loss_clip": 1.02177501, "balance_loss_mlp": 1.04297709, "epoch": 0.3861115286336991, "flos": 20954271323520.0, "grad_norm": 2.1232056020462817, "language_loss": 0.71519816, "learning_rate": 2.8107802525403886e-06, "loss": 0.73681808, "num_input_tokens_seen": 137880285, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.82421875, "step": 6422, "time_per_iteration": 2.503551959991455 }, { "auxiliary_loss_clip": 0.01122371, "auxiliary_loss_mlp": 0.01037143, "balance_loss_clip": 1.02409005, "balance_loss_mlp": 1.04465699, "epoch": 0.38617165188636704, "flos": 16362697731840.0, "grad_norm": 1.842082319876111, "language_loss": 0.66209048, "learning_rate": 2.8104242131976025e-06, "loss": 0.6836856, "num_input_tokens_seen": 137898335, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.77734375, "step": 6423, "time_per_iteration": 2.4561219215393066 }, { "auxiliary_loss_clip": 0.01128188, "auxiliary_loss_mlp": 0.01036254, "balance_loss_clip": 1.02300978, "balance_loss_mlp": 1.04658115, "epoch": 0.386231775139035, "flos": 34787276893440.0, "grad_norm": 2.0136264383151286, "language_loss": 0.69422811, "learning_rate": 2.810068143123449e-06, "loss": 0.71587253, "num_input_tokens_seen": 137918605, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.8125, "step": 6424, "time_per_iteration": 2.6857199668884277 }, { "auxiliary_loss_clip": 0.01122347, "auxiliary_loss_mlp": 0.01034043, "balance_loss_clip": 1.01974964, "balance_loss_mlp": 1.04338896, "epoch": 0.38629189839170297, "flos": 21726171619200.0, "grad_norm": 1.557484346125851, "language_loss": 0.72559518, "learning_rate": 2.809712042331429e-06, "loss": 0.74715912, "num_input_tokens_seen": 137938245, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7890625, "step": 6425, "time_per_iteration": 3.9554872512817383 }, { "auxiliary_loss_clip": 0.01128785, "auxiliary_loss_mlp": 0.01036983, "balance_loss_clip": 1.02223718, "balance_loss_mlp": 1.04455745, "epoch": 0.38635202164437094, "flos": 27923634460800.0, "grad_norm": 2.1388132796805075, "language_loss": 0.80531347, "learning_rate": 2.8093559108350484e-06, "loss": 0.82697105, "num_input_tokens_seen": 137956770, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.84375, "step": 6426, "time_per_iteration": 2.5373620986938477 }, { "auxiliary_loss_clip": 0.01129024, "auxiliary_loss_mlp": 0.01033194, "balance_loss_clip": 1.01858449, "balance_loss_mlp": 1.04751277, "epoch": 0.38641214489703896, "flos": 23586631534080.0, "grad_norm": 2.4605335748451314, "language_loss": 0.74796808, "learning_rate": 2.80899974864781e-06, "loss": 0.76959026, "num_input_tokens_seen": 137977040, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8125, "step": 6427, "time_per_iteration": 2.511234998703003 }, { "auxiliary_loss_clip": 0.01124081, "auxiliary_loss_mlp": 0.01035129, "balance_loss_clip": 1.02095509, "balance_loss_mlp": 1.0437578, "epoch": 0.3864722681497069, "flos": 12641239198080.0, "grad_norm": 1.9094771369803276, "language_loss": 0.70186675, "learning_rate": 2.8086435557832203e-06, "loss": 0.72345889, "num_input_tokens_seen": 137993545, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.8046875, "step": 6428, "time_per_iteration": 2.4866039752960205 }, { "auxiliary_loss_clip": 0.01127377, "auxiliary_loss_mlp": 0.01041261, "balance_loss_clip": 1.02697372, "balance_loss_mlp": 1.04524755, "epoch": 0.3865323914023749, "flos": 17598922162560.0, "grad_norm": 4.471142030420939, "language_loss": 0.84051633, "learning_rate": 2.8082873322547863e-06, "loss": 0.8622027, "num_input_tokens_seen": 138010140, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8203125, "step": 6429, "time_per_iteration": 3.8821628093719482 }, { "auxiliary_loss_clip": 0.01127758, "auxiliary_loss_mlp": 0.01037087, "balance_loss_clip": 1.02285314, "balance_loss_mlp": 1.04634678, "epoch": 0.38659251465504285, "flos": 18478949374080.0, "grad_norm": 2.5870671969522285, "language_loss": 0.80728757, "learning_rate": 2.807931078076015e-06, "loss": 0.82893598, "num_input_tokens_seen": 138028880, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.81640625, "step": 6430, "time_per_iteration": 3.9021713733673096 }, { "auxiliary_loss_clip": 0.01047071, "auxiliary_loss_mlp": 0.01004513, "balance_loss_clip": 1.00283194, "balance_loss_mlp": 1.02024353, "epoch": 0.3866526379077108, "flos": 64165726978560.0, "grad_norm": 0.7237378447961825, "language_loss": 0.58820957, "learning_rate": 2.807574793260416e-06, "loss": 0.60872537, "num_input_tokens_seen": 138098090, "router_z_loss_clip": 0.0168457, "router_z_loss_mlp": 0.26757812, "step": 6431, "time_per_iteration": 3.153500556945801 }, { "auxiliary_loss_clip": 0.01128887, "auxiliary_loss_mlp": 0.010329, "balance_loss_clip": 1.01742041, "balance_loss_mlp": 1.04518056, "epoch": 0.3867127611603788, "flos": 14388292897920.0, "grad_norm": 1.9673304185233318, "language_loss": 0.79428947, "learning_rate": 2.8072184778215004e-06, "loss": 0.81590736, "num_input_tokens_seen": 138114735, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8359375, "step": 6432, "time_per_iteration": 2.4943606853485107 }, { "auxiliary_loss_clip": 0.01129087, "auxiliary_loss_mlp": 0.01044106, "balance_loss_clip": 1.02875233, "balance_loss_mlp": 1.04388309, "epoch": 0.38677288441304675, "flos": 20010754823040.0, "grad_norm": 2.1066925794054496, "language_loss": 0.80727577, "learning_rate": 2.806862131772779e-06, "loss": 0.82900774, "num_input_tokens_seen": 138130480, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8515625, "step": 6433, "time_per_iteration": 2.479074478149414 }, { "auxiliary_loss_clip": 0.01127793, "auxiliary_loss_mlp": 0.01034224, "balance_loss_clip": 1.01861966, "balance_loss_mlp": 1.04514551, "epoch": 0.3868330076657147, "flos": 22236893147520.0, "grad_norm": 2.112482529994168, "language_loss": 0.71086264, "learning_rate": 2.806505755127765e-06, "loss": 0.73248279, "num_input_tokens_seen": 138150640, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.828125, "step": 6434, "time_per_iteration": 2.511789083480835 }, { "auxiliary_loss_clip": 0.01129406, "auxiliary_loss_mlp": 0.01039787, "balance_loss_clip": 1.02489758, "balance_loss_mlp": 1.04391491, "epoch": 0.3868931309183827, "flos": 16727442387840.0, "grad_norm": 1.917772397548338, "language_loss": 0.77464509, "learning_rate": 2.806149347899972e-06, "loss": 0.79633701, "num_input_tokens_seen": 138169700, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.85546875, "step": 6435, "time_per_iteration": 2.472687244415283 }, { "auxiliary_loss_clip": 0.01122936, "auxiliary_loss_mlp": 0.01034118, "balance_loss_clip": 1.02005124, "balance_loss_mlp": 1.04314983, "epoch": 0.38695325417105064, "flos": 22674716023680.0, "grad_norm": 1.849987088398373, "language_loss": 0.79362607, "learning_rate": 2.805792910102915e-06, "loss": 0.81519663, "num_input_tokens_seen": 138185835, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.796875, "step": 6436, "time_per_iteration": 2.539429187774658 }, { "auxiliary_loss_clip": 0.01120504, "auxiliary_loss_mlp": 0.01033976, "balance_loss_clip": 1.01986158, "balance_loss_mlp": 1.04147565, "epoch": 0.3870133774237186, "flos": 23112036109440.0, "grad_norm": 1.6872682071274694, "language_loss": 0.76531118, "learning_rate": 2.8054364417501093e-06, "loss": 0.78685594, "num_input_tokens_seen": 138204080, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7890625, "step": 6437, "time_per_iteration": 2.4864323139190674 }, { "auxiliary_loss_clip": 0.01124275, "auxiliary_loss_mlp": 0.01034901, "balance_loss_clip": 1.02127576, "balance_loss_mlp": 1.04383063, "epoch": 0.3870735006763866, "flos": 17675699483520.0, "grad_norm": 2.2238737095374566, "language_loss": 0.8207854, "learning_rate": 2.805079942855074e-06, "loss": 0.84237713, "num_input_tokens_seen": 138220710, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.8046875, "step": 6438, "time_per_iteration": 2.467916965484619 }, { "auxiliary_loss_clip": 0.01126438, "auxiliary_loss_mlp": 0.01034628, "balance_loss_clip": 1.01929736, "balance_loss_mlp": 1.04432106, "epoch": 0.38713362392905454, "flos": 23295791111040.0, "grad_norm": 1.8306825932165676, "language_loss": 0.75562149, "learning_rate": 2.804723413431326e-06, "loss": 0.77723217, "num_input_tokens_seen": 138241720, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8203125, "step": 6439, "time_per_iteration": 2.493025064468384 }, { "auxiliary_loss_clip": 0.01119624, "auxiliary_loss_mlp": 0.01032015, "balance_loss_clip": 1.01803732, "balance_loss_mlp": 1.04193413, "epoch": 0.38719374718172256, "flos": 21031192298880.0, "grad_norm": 1.9316653519937461, "language_loss": 0.73800862, "learning_rate": 2.8043668534923855e-06, "loss": 0.759525, "num_input_tokens_seen": 138261885, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.77734375, "step": 6440, "time_per_iteration": 2.525907039642334 }, { "auxiliary_loss_clip": 0.01127012, "auxiliary_loss_mlp": 0.01036854, "balance_loss_clip": 1.02162516, "balance_loss_mlp": 1.04250574, "epoch": 0.3872538704343905, "flos": 19609776322560.0, "grad_norm": 2.1422564133212836, "language_loss": 0.82159257, "learning_rate": 2.804010263051774e-06, "loss": 0.8432312, "num_input_tokens_seen": 138280255, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.84375, "step": 6441, "time_per_iteration": 2.4672608375549316 }, { "auxiliary_loss_clip": 0.0112458, "auxiliary_loss_mlp": 0.01042945, "balance_loss_clip": 1.02911031, "balance_loss_mlp": 1.04342699, "epoch": 0.3873139936870585, "flos": 17530045833600.0, "grad_norm": 2.3347492953044737, "language_loss": 0.81587791, "learning_rate": 2.8036536421230118e-06, "loss": 0.83755314, "num_input_tokens_seen": 138296675, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.8125, "step": 6442, "time_per_iteration": 2.485781192779541 }, { "auxiliary_loss_clip": 0.01123354, "auxiliary_loss_mlp": 0.01035378, "balance_loss_clip": 1.02045918, "balance_loss_mlp": 1.04159021, "epoch": 0.38737411693972645, "flos": 17786555832960.0, "grad_norm": 7.5693029097923, "language_loss": 0.83929509, "learning_rate": 2.803296990719624e-06, "loss": 0.8608824, "num_input_tokens_seen": 138314985, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8203125, "step": 6443, "time_per_iteration": 2.476175308227539 }, { "auxiliary_loss_clip": 0.01042171, "auxiliary_loss_mlp": 0.01003186, "balance_loss_clip": 1.0013262, "balance_loss_mlp": 1.01546764, "epoch": 0.3874342401923944, "flos": 58304637048960.0, "grad_norm": 0.7665199156981225, "language_loss": 0.50206423, "learning_rate": 2.8029403088551327e-06, "loss": 0.5225178, "num_input_tokens_seen": 138373275, "router_z_loss_clip": 0.01855469, "router_z_loss_mlp": 0.265625, "step": 6444, "time_per_iteration": 3.082359790802002 }, { "auxiliary_loss_clip": 0.01119774, "auxiliary_loss_mlp": 0.01037474, "balance_loss_clip": 1.02397943, "balance_loss_mlp": 1.04229808, "epoch": 0.3874943634450624, "flos": 17711933328000.0, "grad_norm": 1.5188496810827932, "language_loss": 0.78860724, "learning_rate": 2.802583596543065e-06, "loss": 0.81017977, "num_input_tokens_seen": 138391145, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7734375, "step": 6445, "time_per_iteration": 2.4788713455200195 }, { "auxiliary_loss_clip": 0.01119905, "auxiliary_loss_mlp": 0.01035454, "balance_loss_clip": 1.02157843, "balance_loss_mlp": 1.04184651, "epoch": 0.38755448669773035, "flos": 19244852098560.0, "grad_norm": 4.61581082144033, "language_loss": 0.81208265, "learning_rate": 2.8022268537969474e-06, "loss": 0.83363628, "num_input_tokens_seen": 138409875, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 6446, "time_per_iteration": 2.4736618995666504 }, { "auxiliary_loss_clip": 0.01123627, "auxiliary_loss_mlp": 0.01037804, "balance_loss_clip": 1.02421999, "balance_loss_mlp": 1.04267097, "epoch": 0.3876146099503983, "flos": 20594267262720.0, "grad_norm": 1.8237294509493698, "language_loss": 0.77647001, "learning_rate": 2.801870080630306e-06, "loss": 0.79808426, "num_input_tokens_seen": 138428965, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.80859375, "step": 6447, "time_per_iteration": 2.4683291912078857 }, { "auxiliary_loss_clip": 0.01121805, "auxiliary_loss_mlp": 0.01032012, "balance_loss_clip": 1.01833844, "balance_loss_mlp": 1.04260015, "epoch": 0.3876747332030663, "flos": 19281121856640.0, "grad_norm": 1.5461479029921734, "language_loss": 0.7625221, "learning_rate": 2.801513277056671e-06, "loss": 0.78406024, "num_input_tokens_seen": 138448090, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.79296875, "step": 6448, "time_per_iteration": 2.4855194091796875 }, { "auxiliary_loss_clip": 0.01122022, "auxiliary_loss_mlp": 0.01039079, "balance_loss_clip": 1.02491736, "balance_loss_mlp": 1.04257834, "epoch": 0.38773485645573424, "flos": 18945895201920.0, "grad_norm": 3.3376162810646695, "language_loss": 0.76121426, "learning_rate": 2.8011564430895725e-06, "loss": 0.78282523, "num_input_tokens_seen": 138466105, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.796875, "step": 6449, "time_per_iteration": 2.457763671875 }, { "auxiliary_loss_clip": 0.01123712, "auxiliary_loss_mlp": 0.01039322, "balance_loss_clip": 1.02356219, "balance_loss_mlp": 1.04055822, "epoch": 0.3877949797084022, "flos": 23071348978560.0, "grad_norm": 2.0344668733784608, "language_loss": 0.78484875, "learning_rate": 2.800799578742542e-06, "loss": 0.8064791, "num_input_tokens_seen": 138485160, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.83203125, "step": 6450, "time_per_iteration": 2.494642734527588 }, { "auxiliary_loss_clip": 0.0112793, "auxiliary_loss_mlp": 0.01037545, "balance_loss_clip": 1.02264357, "balance_loss_mlp": 1.04185057, "epoch": 0.3878551029610702, "flos": 29095543589760.0, "grad_norm": 2.299930549776212, "language_loss": 0.77501601, "learning_rate": 2.8004426840291106e-06, "loss": 0.79667073, "num_input_tokens_seen": 138504135, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.86328125, "step": 6451, "time_per_iteration": 2.5270774364471436 }, { "auxiliary_loss_clip": 0.01117076, "auxiliary_loss_mlp": 0.01029745, "balance_loss_clip": 1.01684678, "balance_loss_mlp": 1.04017472, "epoch": 0.38791522621373814, "flos": 20996394998400.0, "grad_norm": 1.7013398462711784, "language_loss": 0.76001012, "learning_rate": 2.800085758962812e-06, "loss": 0.7814784, "num_input_tokens_seen": 138523955, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.76953125, "step": 6452, "time_per_iteration": 2.5122616291046143 }, { "auxiliary_loss_clip": 0.01123218, "auxiliary_loss_mlp": 0.01045789, "balance_loss_clip": 1.03204989, "balance_loss_mlp": 1.04291964, "epoch": 0.3879753494664061, "flos": 15486836497920.0, "grad_norm": 1.6070434073851805, "language_loss": 0.8001821, "learning_rate": 2.799728803557182e-06, "loss": 0.82187217, "num_input_tokens_seen": 138541655, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.8046875, "step": 6453, "time_per_iteration": 2.446120262145996 }, { "auxiliary_loss_clip": 0.01131359, "auxiliary_loss_mlp": 0.01039181, "balance_loss_clip": 1.02407098, "balance_loss_mlp": 1.04605794, "epoch": 0.3880354727190741, "flos": 22053964158720.0, "grad_norm": 1.712629507412282, "language_loss": 0.71281511, "learning_rate": 2.7993718178257555e-06, "loss": 0.73452049, "num_input_tokens_seen": 138560860, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8515625, "step": 6454, "time_per_iteration": 2.503392457962036 }, { "auxiliary_loss_clip": 0.01128601, "auxiliary_loss_mlp": 0.01039916, "balance_loss_clip": 1.02466869, "balance_loss_mlp": 1.04372656, "epoch": 0.3880955959717421, "flos": 20340307128960.0, "grad_norm": 1.9123758114128933, "language_loss": 0.77254099, "learning_rate": 2.7990148017820694e-06, "loss": 0.79422617, "num_input_tokens_seen": 138580200, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.84765625, "step": 6455, "time_per_iteration": 2.4698779582977295 }, { "auxiliary_loss_clip": 0.01119518, "auxiliary_loss_mlp": 0.01035051, "balance_loss_clip": 1.02079356, "balance_loss_mlp": 1.0406971, "epoch": 0.38815571922441006, "flos": 23075407215360.0, "grad_norm": 1.5873106685110883, "language_loss": 0.75736308, "learning_rate": 2.798657755439662e-06, "loss": 0.77890879, "num_input_tokens_seen": 138598315, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 6456, "time_per_iteration": 2.515320301055908 }, { "auxiliary_loss_clip": 0.01126207, "auxiliary_loss_mlp": 0.01034773, "balance_loss_clip": 1.02041388, "balance_loss_mlp": 1.04323673, "epoch": 0.388215842477078, "flos": 20776944856320.0, "grad_norm": 2.679761396573038, "language_loss": 0.60821247, "learning_rate": 2.7983006788120726e-06, "loss": 0.62982219, "num_input_tokens_seen": 138615695, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.828125, "step": 6457, "time_per_iteration": 2.472684383392334 }, { "auxiliary_loss_clip": 0.01125441, "auxiliary_loss_mlp": 0.01034809, "balance_loss_clip": 1.01890683, "balance_loss_mlp": 1.04214251, "epoch": 0.388275965729746, "flos": 20448182649600.0, "grad_norm": 2.155066445325578, "language_loss": 0.80252153, "learning_rate": 2.797943571912841e-06, "loss": 0.8241241, "num_input_tokens_seen": 138633180, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8359375, "step": 6458, "time_per_iteration": 2.4994945526123047 }, { "auxiliary_loss_clip": 0.01125775, "auxiliary_loss_mlp": 0.01037686, "balance_loss_clip": 1.02326202, "balance_loss_mlp": 1.0428009, "epoch": 0.38833608898241395, "flos": 27892392606720.0, "grad_norm": 1.766198823253757, "language_loss": 0.81605446, "learning_rate": 2.797586434755509e-06, "loss": 0.8376891, "num_input_tokens_seen": 138654785, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.828125, "step": 6459, "time_per_iteration": 2.538036823272705 }, { "auxiliary_loss_clip": 0.01120469, "auxiliary_loss_mlp": 0.01035107, "balance_loss_clip": 1.02137983, "balance_loss_mlp": 1.04201269, "epoch": 0.3883962122350819, "flos": 18076390675200.0, "grad_norm": 1.9853452439765136, "language_loss": 0.61782265, "learning_rate": 2.7972292673536202e-06, "loss": 0.63937843, "num_input_tokens_seen": 138673330, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78515625, "step": 6460, "time_per_iteration": 2.5247116088867188 }, { "auxiliary_loss_clip": 0.01123656, "auxiliary_loss_mlp": 0.01033411, "balance_loss_clip": 1.02031016, "balance_loss_mlp": 1.04374111, "epoch": 0.3884563354877499, "flos": 23622254847360.0, "grad_norm": 1.5806706725783104, "language_loss": 0.86028099, "learning_rate": 2.796872069720717e-06, "loss": 0.88185167, "num_input_tokens_seen": 138694185, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.796875, "step": 6461, "time_per_iteration": 2.5180108547210693 }, { "auxiliary_loss_clip": 0.01125441, "auxiliary_loss_mlp": 0.01036993, "balance_loss_clip": 1.02277708, "balance_loss_mlp": 1.04293573, "epoch": 0.38851645874041785, "flos": 27453528236160.0, "grad_norm": 2.3272527867217736, "language_loss": 0.7134195, "learning_rate": 2.7965148418703456e-06, "loss": 0.73504376, "num_input_tokens_seen": 138714625, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.82421875, "step": 6462, "time_per_iteration": 2.58038067817688 }, { "auxiliary_loss_clip": 0.01124029, "auxiliary_loss_mlp": 0.01037362, "balance_loss_clip": 1.02271128, "balance_loss_mlp": 1.04231, "epoch": 0.3885765819930858, "flos": 25228072270080.0, "grad_norm": 3.0515021354177003, "language_loss": 0.75502586, "learning_rate": 2.796157583816052e-06, "loss": 0.77663976, "num_input_tokens_seen": 138733585, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.81640625, "step": 6463, "time_per_iteration": 2.505662441253662 }, { "auxiliary_loss_clip": 0.01128504, "auxiliary_loss_mlp": 0.01044454, "balance_loss_clip": 1.02802682, "balance_loss_mlp": 1.04546762, "epoch": 0.3886367052457538, "flos": 16946605221120.0, "grad_norm": 2.6619904254813225, "language_loss": 0.69985592, "learning_rate": 2.795800295571382e-06, "loss": 0.72158551, "num_input_tokens_seen": 138752335, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.828125, "step": 6464, "time_per_iteration": 2.4705002307891846 }, { "auxiliary_loss_clip": 0.01123818, "auxiliary_loss_mlp": 0.01032127, "balance_loss_clip": 1.01757717, "balance_loss_mlp": 1.04405832, "epoch": 0.38869682849842174, "flos": 27154140376320.0, "grad_norm": 1.9492455949961458, "language_loss": 0.69362748, "learning_rate": 2.7954429771498858e-06, "loss": 0.71518695, "num_input_tokens_seen": 138768450, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.796875, "step": 6465, "time_per_iteration": 2.50535249710083 }, { "auxiliary_loss_clip": 0.01125672, "auxiliary_loss_mlp": 0.01040311, "balance_loss_clip": 1.02418196, "balance_loss_mlp": 1.04409027, "epoch": 0.3887569517510897, "flos": 21063619301760.0, "grad_norm": 2.0247532395348014, "language_loss": 0.78739923, "learning_rate": 2.7950856285651117e-06, "loss": 0.80905908, "num_input_tokens_seen": 138786775, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8125, "step": 6466, "time_per_iteration": 3.964080572128296 }, { "auxiliary_loss_clip": 0.01126932, "auxiliary_loss_mlp": 0.01038663, "balance_loss_clip": 1.02425027, "balance_loss_mlp": 1.04512644, "epoch": 0.38881707500375773, "flos": 29497384016640.0, "grad_norm": 1.5728771730383089, "language_loss": 0.69331402, "learning_rate": 2.794728249830611e-06, "loss": 0.71496999, "num_input_tokens_seen": 138810100, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.81640625, "step": 6467, "time_per_iteration": 2.5666158199310303 }, { "auxiliary_loss_clip": 0.01126816, "auxiliary_loss_mlp": 0.01042147, "balance_loss_clip": 1.02700758, "balance_loss_mlp": 1.04432535, "epoch": 0.3888771982564257, "flos": 17488281294720.0, "grad_norm": 23.01075374809122, "language_loss": 0.83440548, "learning_rate": 2.794370840959936e-06, "loss": 0.85609508, "num_input_tokens_seen": 138825140, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.82421875, "step": 6468, "time_per_iteration": 2.456754446029663 }, { "auxiliary_loss_clip": 0.01123455, "auxiliary_loss_mlp": 0.0103497, "balance_loss_clip": 1.0217495, "balance_loss_mlp": 1.04305017, "epoch": 0.38893732150909366, "flos": 21942425450880.0, "grad_norm": 1.8719535582416549, "language_loss": 0.84521866, "learning_rate": 2.7940134019666383e-06, "loss": 0.86680287, "num_input_tokens_seen": 138844115, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.8046875, "step": 6469, "time_per_iteration": 2.479538917541504 }, { "auxiliary_loss_clip": 0.0112608, "auxiliary_loss_mlp": 0.01036573, "balance_loss_clip": 1.02173162, "balance_loss_mlp": 1.04577529, "epoch": 0.3889974447617616, "flos": 24276367468800.0, "grad_norm": 1.8426117463670568, "language_loss": 0.74469745, "learning_rate": 2.793655932864273e-06, "loss": 0.76632392, "num_input_tokens_seen": 138860860, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8046875, "step": 6470, "time_per_iteration": 5.265816926956177 }, { "auxiliary_loss_clip": 0.0112326, "auxiliary_loss_mlp": 0.01036349, "balance_loss_clip": 1.02145433, "balance_loss_mlp": 1.04232442, "epoch": 0.3890575680144296, "flos": 25667116208640.0, "grad_norm": 1.5656459563910914, "language_loss": 0.74732339, "learning_rate": 2.7932984336663953e-06, "loss": 0.76891947, "num_input_tokens_seen": 138881910, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.80859375, "step": 6471, "time_per_iteration": 2.5487711429595947 }, { "auxiliary_loss_clip": 0.01127456, "auxiliary_loss_mlp": 0.01040754, "balance_loss_clip": 1.02588224, "balance_loss_mlp": 1.04695368, "epoch": 0.38911769126709755, "flos": 22855274714880.0, "grad_norm": 1.7229638975085624, "language_loss": 0.67831159, "learning_rate": 2.792940904386562e-06, "loss": 0.69999373, "num_input_tokens_seen": 138900975, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8046875, "step": 6472, "time_per_iteration": 4.0011467933654785 }, { "auxiliary_loss_clip": 0.01125137, "auxiliary_loss_mlp": 0.01041022, "balance_loss_clip": 1.02754557, "balance_loss_mlp": 1.04493117, "epoch": 0.3891778145197655, "flos": 25447522412160.0, "grad_norm": 1.7554293286290257, "language_loss": 0.76094961, "learning_rate": 2.7925833450383293e-06, "loss": 0.78261119, "num_input_tokens_seen": 138920795, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.80078125, "step": 6473, "time_per_iteration": 2.520883083343506 }, { "auxiliary_loss_clip": 0.01128288, "auxiliary_loss_mlp": 0.01045222, "balance_loss_clip": 1.02967691, "balance_loss_mlp": 1.04666615, "epoch": 0.3892379377724335, "flos": 14027965614720.0, "grad_norm": 2.155693891724146, "language_loss": 0.71099025, "learning_rate": 2.792225755635257e-06, "loss": 0.73272538, "num_input_tokens_seen": 138938770, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.81640625, "step": 6474, "time_per_iteration": 2.492488145828247 }, { "auxiliary_loss_clip": 0.01125493, "auxiliary_loss_mlp": 0.01041317, "balance_loss_clip": 1.02733397, "balance_loss_mlp": 1.04474699, "epoch": 0.38929806102510145, "flos": 20157449967360.0, "grad_norm": 1.4695982449505405, "language_loss": 0.68503249, "learning_rate": 2.7918681361909046e-06, "loss": 0.70670062, "num_input_tokens_seen": 138958880, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.80859375, "step": 6475, "time_per_iteration": 2.4894630908966064 }, { "auxiliary_loss_clip": 0.0113213, "auxiliary_loss_mlp": 0.01045291, "balance_loss_clip": 1.03035426, "balance_loss_mlp": 1.04676044, "epoch": 0.3893581842777694, "flos": 22163958581760.0, "grad_norm": 1.7527976325785326, "language_loss": 0.75728452, "learning_rate": 2.7915104867188332e-06, "loss": 0.77905869, "num_input_tokens_seen": 138977240, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.85546875, "step": 6476, "time_per_iteration": 2.5090510845184326 }, { "auxiliary_loss_clip": 0.01049577, "auxiliary_loss_mlp": 0.01016178, "balance_loss_clip": 1.01412797, "balance_loss_mlp": 1.02211738, "epoch": 0.3894183075304374, "flos": 67301877392640.0, "grad_norm": 0.7811219149774855, "language_loss": 0.58144146, "learning_rate": 2.7911528072326055e-06, "loss": 0.602099, "num_input_tokens_seen": 139039035, "router_z_loss_clip": 0.02050781, "router_z_loss_mlp": 0.2734375, "step": 6477, "time_per_iteration": 3.0907554626464844 }, { "auxiliary_loss_clip": 0.01127989, "auxiliary_loss_mlp": 0.01040292, "balance_loss_clip": 1.02530742, "balance_loss_mlp": 1.04601407, "epoch": 0.38947843078310534, "flos": 18547502480640.0, "grad_norm": 1.8520149374939545, "language_loss": 0.78067064, "learning_rate": 2.7907950977457832e-06, "loss": 0.80235344, "num_input_tokens_seen": 139055560, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8203125, "step": 6478, "time_per_iteration": 2.4683656692504883 }, { "auxiliary_loss_clip": 0.01124311, "auxiliary_loss_mlp": 0.01037855, "balance_loss_clip": 1.02435398, "balance_loss_mlp": 1.04498243, "epoch": 0.3895385540357733, "flos": 14605875532800.0, "grad_norm": 2.3130110450598353, "language_loss": 0.82770759, "learning_rate": 2.7904373582719317e-06, "loss": 0.84932923, "num_input_tokens_seen": 139071865, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.79296875, "step": 6479, "time_per_iteration": 2.4482803344726562 }, { "auxiliary_loss_clip": 0.01125139, "auxiliary_loss_mlp": 0.01034415, "balance_loss_clip": 1.02021134, "balance_loss_mlp": 1.04555142, "epoch": 0.38959867728844133, "flos": 19975203336960.0, "grad_norm": 1.6784284121730495, "language_loss": 0.80215085, "learning_rate": 2.790079588824617e-06, "loss": 0.82374644, "num_input_tokens_seen": 139089640, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.796875, "step": 6480, "time_per_iteration": 2.47666072845459 }, { "auxiliary_loss_clip": 0.01121302, "auxiliary_loss_mlp": 0.01030691, "balance_loss_clip": 1.01727402, "balance_loss_mlp": 1.04338622, "epoch": 0.3896588005411093, "flos": 22672130244480.0, "grad_norm": 3.2029073034654933, "language_loss": 0.82963544, "learning_rate": 2.7897217894174038e-06, "loss": 0.85115534, "num_input_tokens_seen": 139109365, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.78125, "step": 6481, "time_per_iteration": 2.4957385063171387 }, { "auxiliary_loss_clip": 0.01123902, "auxiliary_loss_mlp": 0.01036325, "balance_loss_clip": 1.0237422, "balance_loss_mlp": 1.04718864, "epoch": 0.38971892379377726, "flos": 20996035862400.0, "grad_norm": 1.5650999728383714, "language_loss": 0.75360912, "learning_rate": 2.789363960063863e-06, "loss": 0.77521139, "num_input_tokens_seen": 139128260, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.765625, "step": 6482, "time_per_iteration": 2.4824769496917725 }, { "auxiliary_loss_clip": 0.01126805, "auxiliary_loss_mlp": 0.01033212, "balance_loss_clip": 1.01969337, "balance_loss_mlp": 1.0459168, "epoch": 0.3897790470464452, "flos": 22528487756160.0, "grad_norm": 2.0605455927246368, "language_loss": 0.79110885, "learning_rate": 2.78900610077756e-06, "loss": 0.81270903, "num_input_tokens_seen": 139147315, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.80859375, "step": 6483, "time_per_iteration": 2.481717348098755 }, { "auxiliary_loss_clip": 0.01121737, "auxiliary_loss_mlp": 0.01030424, "balance_loss_clip": 1.01543963, "balance_loss_mlp": 1.04173636, "epoch": 0.3898391702991132, "flos": 26209905603840.0, "grad_norm": 4.460932818893857, "language_loss": 0.8009358, "learning_rate": 2.788648211572067e-06, "loss": 0.82245743, "num_input_tokens_seen": 139167270, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.796875, "step": 6484, "time_per_iteration": 2.519822120666504 }, { "auxiliary_loss_clip": 0.0112751, "auxiliary_loss_mlp": 0.01047377, "balance_loss_clip": 1.03233302, "balance_loss_mlp": 1.04836297, "epoch": 0.38989929355178116, "flos": 21065558636160.0, "grad_norm": 5.811183288049799, "language_loss": 0.77786696, "learning_rate": 2.7882902924609557e-06, "loss": 0.7996158, "num_input_tokens_seen": 139185970, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7890625, "step": 6485, "time_per_iteration": 2.4770493507385254 }, { "auxiliary_loss_clip": 0.01126216, "auxiliary_loss_mlp": 0.01039101, "balance_loss_clip": 1.02455139, "balance_loss_mlp": 1.04477668, "epoch": 0.3899594168044491, "flos": 25484115392640.0, "grad_norm": 2.7892044741642956, "language_loss": 0.85211837, "learning_rate": 2.7879323434577965e-06, "loss": 0.87377155, "num_input_tokens_seen": 139203730, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8125, "step": 6486, "time_per_iteration": 2.5022904872894287 }, { "auxiliary_loss_clip": 0.01127866, "auxiliary_loss_mlp": 0.01036113, "balance_loss_clip": 1.02207637, "balance_loss_mlp": 1.04425013, "epoch": 0.3900195400571171, "flos": 31139363456640.0, "grad_norm": 1.8813120361137587, "language_loss": 0.8546508, "learning_rate": 2.7875743645761645e-06, "loss": 0.87629056, "num_input_tokens_seen": 139222560, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8359375, "step": 6487, "time_per_iteration": 2.557192325592041 }, { "auxiliary_loss_clip": 0.01125358, "auxiliary_loss_mlp": 0.01036621, "balance_loss_clip": 1.02199435, "balance_loss_mlp": 1.0457449, "epoch": 0.39007966330978505, "flos": 20229917656320.0, "grad_norm": 2.714114918627855, "language_loss": 0.73283064, "learning_rate": 2.787216355829633e-06, "loss": 0.75445044, "num_input_tokens_seen": 139242165, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.796875, "step": 6488, "time_per_iteration": 2.4888389110565186 }, { "auxiliary_loss_clip": 0.01129014, "auxiliary_loss_mlp": 0.01039048, "balance_loss_clip": 1.02456379, "balance_loss_mlp": 1.04749441, "epoch": 0.390139786562453, "flos": 22528739151360.0, "grad_norm": 1.8425165537777788, "language_loss": 0.68775451, "learning_rate": 2.786858317231779e-06, "loss": 0.70943511, "num_input_tokens_seen": 139262525, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.81640625, "step": 6489, "time_per_iteration": 2.4983060359954834 }, { "auxiliary_loss_clip": 0.01121676, "auxiliary_loss_mlp": 0.01042032, "balance_loss_clip": 1.02860868, "balance_loss_mlp": 1.04464412, "epoch": 0.390199909815121, "flos": 26432911192320.0, "grad_norm": 1.8307532111038365, "language_loss": 0.80897009, "learning_rate": 2.7865002487961788e-06, "loss": 0.83060706, "num_input_tokens_seen": 139282835, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76953125, "step": 6490, "time_per_iteration": 2.543729543685913 }, { "auxiliary_loss_clip": 0.01128931, "auxiliary_loss_mlp": 0.0103531, "balance_loss_clip": 1.02123129, "balance_loss_mlp": 1.04726183, "epoch": 0.39026003306778895, "flos": 17274577328640.0, "grad_norm": 1.988897865134972, "language_loss": 0.89640462, "learning_rate": 2.7861421505364104e-06, "loss": 0.91804701, "num_input_tokens_seen": 139299490, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.81640625, "step": 6491, "time_per_iteration": 2.4813857078552246 }, { "auxiliary_loss_clip": 0.01125973, "auxiliary_loss_mlp": 0.01037215, "balance_loss_clip": 1.02379823, "balance_loss_mlp": 1.0444417, "epoch": 0.3903201563204569, "flos": 24532841554560.0, "grad_norm": 1.8008101443430335, "language_loss": 0.78943181, "learning_rate": 2.7857840224660523e-06, "loss": 0.81106371, "num_input_tokens_seen": 139317865, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.81640625, "step": 6492, "time_per_iteration": 2.525477647781372 }, { "auxiliary_loss_clip": 0.01126385, "auxiliary_loss_mlp": 0.01043255, "balance_loss_clip": 1.02959991, "balance_loss_mlp": 1.04537749, "epoch": 0.39038027957312493, "flos": 23767944410880.0, "grad_norm": 2.1876165951960767, "language_loss": 0.74312162, "learning_rate": 2.7854258645986857e-06, "loss": 0.76481801, "num_input_tokens_seen": 139339840, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.80859375, "step": 6493, "time_per_iteration": 2.5124385356903076 }, { "auxiliary_loss_clip": 0.01130961, "auxiliary_loss_mlp": 0.01040255, "balance_loss_clip": 1.02552044, "balance_loss_mlp": 1.0462544, "epoch": 0.3904404028257929, "flos": 14100612871680.0, "grad_norm": 2.5524456305753995, "language_loss": 0.7610054, "learning_rate": 2.7850676769478916e-06, "loss": 0.78271753, "num_input_tokens_seen": 139357555, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.84375, "step": 6494, "time_per_iteration": 2.4858591556549072 }, { "auxiliary_loss_clip": 0.01135348, "auxiliary_loss_mlp": 0.01044237, "balance_loss_clip": 1.02826333, "balance_loss_mlp": 1.04791832, "epoch": 0.39050052607846086, "flos": 16910048154240.0, "grad_norm": 7.01964890315926, "language_loss": 0.74292278, "learning_rate": 2.7847094595272525e-06, "loss": 0.76471859, "num_input_tokens_seen": 139374455, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.875, "step": 6495, "time_per_iteration": 2.4572932720184326 }, { "auxiliary_loss_clip": 0.01128154, "auxiliary_loss_mlp": 0.01041305, "balance_loss_clip": 1.0250926, "balance_loss_mlp": 1.04706728, "epoch": 0.39056064933112883, "flos": 25915761129600.0, "grad_norm": 1.8700867358251199, "language_loss": 0.68028998, "learning_rate": 2.784351212350352e-06, "loss": 0.70198452, "num_input_tokens_seen": 139394770, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8125, "step": 6496, "time_per_iteration": 2.5364115238189697 }, { "auxiliary_loss_clip": 0.01049359, "auxiliary_loss_mlp": 0.0100301, "balance_loss_clip": 1.0010314, "balance_loss_mlp": 1.02256024, "epoch": 0.3906207725837968, "flos": 60028421713920.0, "grad_norm": 0.6977743055979115, "language_loss": 0.5398007, "learning_rate": 2.783992935430775e-06, "loss": 0.56032437, "num_input_tokens_seen": 139454760, "router_z_loss_clip": 0.01977539, "router_z_loss_mlp": 0.26757812, "step": 6497, "time_per_iteration": 3.207118034362793 }, { "auxiliary_loss_clip": 0.01127778, "auxiliary_loss_mlp": 0.01034261, "balance_loss_clip": 1.02005744, "balance_loss_mlp": 1.04726207, "epoch": 0.39068089583646476, "flos": 21068683119360.0, "grad_norm": 2.1233567330123018, "language_loss": 0.69189775, "learning_rate": 2.7836346287821068e-06, "loss": 0.71351814, "num_input_tokens_seen": 139472645, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8046875, "step": 6498, "time_per_iteration": 2.486318349838257 }, { "auxiliary_loss_clip": 0.0104888, "auxiliary_loss_mlp": 0.01003916, "balance_loss_clip": 1.00200868, "balance_loss_mlp": 1.02169371, "epoch": 0.3907410190891327, "flos": 70445677403520.0, "grad_norm": 0.7277597510074346, "language_loss": 0.51818168, "learning_rate": 2.783276292417936e-06, "loss": 0.53870964, "num_input_tokens_seen": 139536730, "router_z_loss_clip": 0.01904297, "router_z_loss_mlp": 0.27148438, "step": 6499, "time_per_iteration": 3.1551930904388428 }, { "auxiliary_loss_clip": 0.01128516, "auxiliary_loss_mlp": 0.01040109, "balance_loss_clip": 1.0234797, "balance_loss_mlp": 1.04517567, "epoch": 0.3908011423418007, "flos": 27962454084480.0, "grad_norm": 2.3121500709292606, "language_loss": 0.73765486, "learning_rate": 2.7829179263518487e-06, "loss": 0.75934112, "num_input_tokens_seen": 139557540, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8359375, "step": 6500, "time_per_iteration": 2.5626721382141113 }, { "auxiliary_loss_clip": 0.01131988, "auxiliary_loss_mlp": 0.01031532, "balance_loss_clip": 1.01726842, "balance_loss_mlp": 1.04937267, "epoch": 0.39086126559446865, "flos": 24462097718400.0, "grad_norm": 1.9285850456273017, "language_loss": 0.68811786, "learning_rate": 2.7825595305974354e-06, "loss": 0.70975304, "num_input_tokens_seen": 139576875, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.82421875, "step": 6501, "time_per_iteration": 2.5376999378204346 }, { "auxiliary_loss_clip": 0.01128143, "auxiliary_loss_mlp": 0.01037105, "balance_loss_clip": 1.02341413, "balance_loss_mlp": 1.04669702, "epoch": 0.3909213888471366, "flos": 16941541403520.0, "grad_norm": 1.7043428962860248, "language_loss": 0.79016531, "learning_rate": 2.782201105168287e-06, "loss": 0.81181777, "num_input_tokens_seen": 139594295, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.8125, "step": 6502, "time_per_iteration": 2.452092409133911 }, { "auxiliary_loss_clip": 0.01125722, "auxiliary_loss_mlp": 0.01037352, "balance_loss_clip": 1.02382731, "balance_loss_mlp": 1.04779291, "epoch": 0.3909815120998046, "flos": 29278400751360.0, "grad_norm": 2.2384726624379825, "language_loss": 0.79816329, "learning_rate": 2.7818426500779932e-06, "loss": 0.81979406, "num_input_tokens_seen": 139614080, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.78125, "step": 6503, "time_per_iteration": 2.5595104694366455 }, { "auxiliary_loss_clip": 0.01122942, "auxiliary_loss_mlp": 0.01027854, "balance_loss_clip": 1.01468146, "balance_loss_mlp": 1.04504693, "epoch": 0.39104163535247255, "flos": 18951246328320.0, "grad_norm": 1.793842841886014, "language_loss": 0.71199512, "learning_rate": 2.7814841653401485e-06, "loss": 0.7335031, "num_input_tokens_seen": 139632755, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.77734375, "step": 6504, "time_per_iteration": 2.4641735553741455 }, { "auxiliary_loss_clip": 0.01123641, "auxiliary_loss_mlp": 0.01030578, "balance_loss_clip": 1.0161413, "balance_loss_mlp": 1.04352474, "epoch": 0.3911017586051405, "flos": 26323347732480.0, "grad_norm": 2.929065287281086, "language_loss": 0.83384252, "learning_rate": 2.7811256509683454e-06, "loss": 0.85538471, "num_input_tokens_seen": 139654205, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80078125, "step": 6505, "time_per_iteration": 2.5729103088378906 }, { "auxiliary_loss_clip": 0.01125137, "auxiliary_loss_mlp": 0.01035617, "balance_loss_clip": 1.02040577, "balance_loss_mlp": 1.04598904, "epoch": 0.3911618818578085, "flos": 21835770992640.0, "grad_norm": 1.9794784516378872, "language_loss": 0.70884466, "learning_rate": 2.7807671069761797e-06, "loss": 0.73045224, "num_input_tokens_seen": 139673595, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7890625, "step": 6506, "time_per_iteration": 2.479029655456543 }, { "auxiliary_loss_clip": 0.01121806, "auxiliary_loss_mlp": 0.01036777, "balance_loss_clip": 1.02304375, "balance_loss_mlp": 1.04464459, "epoch": 0.3912220051104765, "flos": 16359680989440.0, "grad_norm": 1.955573604331958, "language_loss": 0.75176179, "learning_rate": 2.7804085333772477e-06, "loss": 0.77334762, "num_input_tokens_seen": 139690565, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7734375, "step": 6507, "time_per_iteration": 2.471146583557129 }, { "auxiliary_loss_clip": 0.01046418, "auxiliary_loss_mlp": 0.01001709, "balance_loss_clip": 0.99989718, "balance_loss_mlp": 1.01938868, "epoch": 0.39128212836314447, "flos": 71050986420480.0, "grad_norm": 0.7638712216998674, "language_loss": 0.56576002, "learning_rate": 2.7800499301851446e-06, "loss": 0.5862413, "num_input_tokens_seen": 139749420, "router_z_loss_clip": 0.01806641, "router_z_loss_mlp": 0.26953125, "step": 6508, "time_per_iteration": 4.735147476196289 }, { "auxiliary_loss_clip": 0.01127051, "auxiliary_loss_mlp": 0.01035889, "balance_loss_clip": 1.02186465, "balance_loss_mlp": 1.04622114, "epoch": 0.39134225161581243, "flos": 20331975173760.0, "grad_norm": 2.02654486218331, "language_loss": 0.76567173, "learning_rate": 2.779691297413471e-06, "loss": 0.78730112, "num_input_tokens_seen": 139766265, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80859375, "step": 6509, "time_per_iteration": 2.4799180030822754 }, { "auxiliary_loss_clip": 0.01126254, "auxiliary_loss_mlp": 0.0104122, "balance_loss_clip": 1.02505493, "balance_loss_mlp": 1.04412055, "epoch": 0.3914023748684804, "flos": 17018390551680.0, "grad_norm": 5.311813164988053, "language_loss": 0.82990122, "learning_rate": 2.779332635075825e-06, "loss": 0.85157591, "num_input_tokens_seen": 139782400, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8203125, "step": 6510, "time_per_iteration": 2.480125665664673 }, { "auxiliary_loss_clip": 0.01128027, "auxiliary_loss_mlp": 0.01036437, "balance_loss_clip": 1.0216074, "balance_loss_mlp": 1.04613924, "epoch": 0.39146249812114836, "flos": 18405224709120.0, "grad_norm": 2.1226635488481636, "language_loss": 0.76628089, "learning_rate": 2.7789739431858073e-06, "loss": 0.7879256, "num_input_tokens_seen": 139801435, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 6511, "time_per_iteration": 2.4520509243011475 }, { "auxiliary_loss_clip": 0.01045983, "auxiliary_loss_mlp": 0.01003378, "balance_loss_clip": 1.00153017, "balance_loss_mlp": 1.01899624, "epoch": 0.3915226213738163, "flos": 67637355442560.0, "grad_norm": 0.7394805953008409, "language_loss": 0.5777126, "learning_rate": 2.7786152217570196e-06, "loss": 0.59820622, "num_input_tokens_seen": 139869700, "router_z_loss_clip": 0.01843262, "router_z_loss_mlp": 0.26953125, "step": 6512, "time_per_iteration": 4.573636770248413 }, { "auxiliary_loss_clip": 0.01127178, "auxiliary_loss_mlp": 0.01035455, "balance_loss_clip": 1.02006483, "balance_loss_mlp": 1.04555357, "epoch": 0.3915827446264843, "flos": 26359330181760.0, "grad_norm": 1.7544219330049657, "language_loss": 0.6955843, "learning_rate": 2.7782564708030647e-06, "loss": 0.71721059, "num_input_tokens_seen": 139890140, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.81640625, "step": 6513, "time_per_iteration": 4.014959096908569 }, { "auxiliary_loss_clip": 0.01132123, "auxiliary_loss_mlp": 0.01040643, "balance_loss_clip": 1.02539062, "balance_loss_mlp": 1.04698062, "epoch": 0.39164286787915226, "flos": 21943897908480.0, "grad_norm": 3.269771776598822, "language_loss": 0.75233638, "learning_rate": 2.7778976903375464e-06, "loss": 0.77406406, "num_input_tokens_seen": 139908020, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8515625, "step": 6514, "time_per_iteration": 2.490743637084961 }, { "auxiliary_loss_clip": 0.01124921, "auxiliary_loss_mlp": 0.01038044, "balance_loss_clip": 1.02446651, "balance_loss_mlp": 1.04322457, "epoch": 0.3917029911318202, "flos": 16399829416320.0, "grad_norm": 2.07991322417561, "language_loss": 0.77366942, "learning_rate": 2.7775388803740693e-06, "loss": 0.79529911, "num_input_tokens_seen": 139926180, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.81640625, "step": 6515, "time_per_iteration": 2.5012083053588867 }, { "auxiliary_loss_clip": 0.01120958, "auxiliary_loss_mlp": 0.01043177, "balance_loss_clip": 1.03018928, "balance_loss_mlp": 1.04320121, "epoch": 0.3917631143844882, "flos": 26211701283840.0, "grad_norm": 1.490702875807861, "language_loss": 0.80109292, "learning_rate": 2.7771800409262406e-06, "loss": 0.82273424, "num_input_tokens_seen": 139947420, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.77734375, "step": 6516, "time_per_iteration": 2.5251429080963135 }, { "auxiliary_loss_clip": 0.01125809, "auxiliary_loss_mlp": 0.01039836, "balance_loss_clip": 1.02512515, "balance_loss_mlp": 1.04408169, "epoch": 0.39182323763715615, "flos": 18548364407040.0, "grad_norm": 2.1055385278945287, "language_loss": 0.70542866, "learning_rate": 2.7768211720076665e-06, "loss": 0.72708511, "num_input_tokens_seen": 139965800, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.81640625, "step": 6517, "time_per_iteration": 2.4659838676452637 }, { "auxiliary_loss_clip": 0.01125039, "auxiliary_loss_mlp": 0.01044379, "balance_loss_clip": 1.02982926, "balance_loss_mlp": 1.04369783, "epoch": 0.3918833608898241, "flos": 34313543395200.0, "grad_norm": 1.6079142965692617, "language_loss": 0.72018433, "learning_rate": 2.776462273631956e-06, "loss": 0.74187851, "num_input_tokens_seen": 139988140, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8125, "step": 6518, "time_per_iteration": 2.597147226333618 }, { "auxiliary_loss_clip": 0.01127673, "auxiliary_loss_mlp": 0.01036388, "balance_loss_clip": 1.02137375, "balance_loss_mlp": 1.04547381, "epoch": 0.3919434841424921, "flos": 36939582812160.0, "grad_norm": 1.8685925501966048, "language_loss": 0.61881363, "learning_rate": 2.7761033458127177e-06, "loss": 0.64045429, "num_input_tokens_seen": 140010060, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8203125, "step": 6519, "time_per_iteration": 2.612200975418091 }, { "auxiliary_loss_clip": 0.01132758, "auxiliary_loss_mlp": 0.01041387, "balance_loss_clip": 1.02629495, "balance_loss_mlp": 1.04736018, "epoch": 0.3920036073951601, "flos": 23508956373120.0, "grad_norm": 2.4300461785546097, "language_loss": 0.67170268, "learning_rate": 2.775744388563563e-06, "loss": 0.69344413, "num_input_tokens_seen": 140029400, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8515625, "step": 6520, "time_per_iteration": 2.5083041191101074 }, { "auxiliary_loss_clip": 0.01123663, "auxiliary_loss_mlp": 0.01037815, "balance_loss_clip": 1.02353978, "balance_loss_mlp": 1.04338145, "epoch": 0.39206373064782807, "flos": 18406086635520.0, "grad_norm": 1.8760557935364253, "language_loss": 0.78481627, "learning_rate": 2.775385401898104e-06, "loss": 0.806431, "num_input_tokens_seen": 140048940, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8046875, "step": 6521, "time_per_iteration": 2.4750614166259766 }, { "auxiliary_loss_clip": 0.01132221, "auxiliary_loss_mlp": 0.01041556, "balance_loss_clip": 1.02445567, "balance_loss_mlp": 1.04747045, "epoch": 0.39212385390049603, "flos": 12313051608960.0, "grad_norm": 2.3139631988988167, "language_loss": 0.70398009, "learning_rate": 2.775026385829952e-06, "loss": 0.7257179, "num_input_tokens_seen": 140066380, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.84765625, "step": 6522, "time_per_iteration": 2.452316999435425 }, { "auxiliary_loss_clip": 0.01128084, "auxiliary_loss_mlp": 0.01032432, "balance_loss_clip": 1.01848483, "balance_loss_mlp": 1.04474342, "epoch": 0.392183977153164, "flos": 19719160214400.0, "grad_norm": 1.820566125461562, "language_loss": 0.76867902, "learning_rate": 2.774667340372722e-06, "loss": 0.79028422, "num_input_tokens_seen": 140085275, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.83203125, "step": 6523, "time_per_iteration": 2.4720191955566406 }, { "auxiliary_loss_clip": 0.01127824, "auxiliary_loss_mlp": 0.01042588, "balance_loss_clip": 1.02825856, "balance_loss_mlp": 1.04593074, "epoch": 0.39224410040583196, "flos": 33144902403840.0, "grad_norm": 2.7327903867593686, "language_loss": 0.61643863, "learning_rate": 2.7743082655400293e-06, "loss": 0.6381427, "num_input_tokens_seen": 140105105, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.81640625, "step": 6524, "time_per_iteration": 2.5907390117645264 }, { "auxiliary_loss_clip": 0.01124294, "auxiliary_loss_mlp": 0.01040324, "balance_loss_clip": 1.02518451, "balance_loss_mlp": 1.04215562, "epoch": 0.39230422365849993, "flos": 27782434097280.0, "grad_norm": 1.9873257583158614, "language_loss": 0.74015731, "learning_rate": 2.773949161345489e-06, "loss": 0.76180351, "num_input_tokens_seen": 140125645, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8203125, "step": 6525, "time_per_iteration": 2.546677827835083 }, { "auxiliary_loss_clip": 0.01124741, "auxiliary_loss_mlp": 0.01041634, "balance_loss_clip": 1.02790749, "balance_loss_mlp": 1.04245424, "epoch": 0.3923643469111679, "flos": 17931634865280.0, "grad_norm": 1.9757895999650124, "language_loss": 0.80991602, "learning_rate": 2.773590027802719e-06, "loss": 0.83157974, "num_input_tokens_seen": 140141925, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.82421875, "step": 6526, "time_per_iteration": 2.477238893508911 }, { "auxiliary_loss_clip": 0.01124768, "auxiliary_loss_mlp": 0.01045064, "balance_loss_clip": 1.03093803, "balance_loss_mlp": 1.04346812, "epoch": 0.39242447016383586, "flos": 24059539019520.0, "grad_norm": 1.6835647183929843, "language_loss": 0.70234698, "learning_rate": 2.7732308649253383e-06, "loss": 0.72404528, "num_input_tokens_seen": 140160965, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.8125, "step": 6527, "time_per_iteration": 2.499135971069336 }, { "auxiliary_loss_clip": 0.01122698, "auxiliary_loss_mlp": 0.01035181, "balance_loss_clip": 1.02134657, "balance_loss_mlp": 1.04299474, "epoch": 0.3924845934165038, "flos": 10664069016960.0, "grad_norm": 5.221986583948033, "language_loss": 0.82506102, "learning_rate": 2.772871672726965e-06, "loss": 0.84663981, "num_input_tokens_seen": 140177780, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.796875, "step": 6528, "time_per_iteration": 2.446300983428955 }, { "auxiliary_loss_clip": 0.01122372, "auxiliary_loss_mlp": 0.01038559, "balance_loss_clip": 1.02433157, "balance_loss_mlp": 1.04463696, "epoch": 0.3925447166691718, "flos": 31245910174080.0, "grad_norm": 2.5428015979346843, "language_loss": 0.68739283, "learning_rate": 2.7725124512212205e-06, "loss": 0.70900214, "num_input_tokens_seen": 140201660, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.77734375, "step": 6529, "time_per_iteration": 2.5890519618988037 }, { "auxiliary_loss_clip": 0.01125814, "auxiliary_loss_mlp": 0.01040378, "balance_loss_clip": 1.02568543, "balance_loss_mlp": 1.04395449, "epoch": 0.39260483992183975, "flos": 29415040087680.0, "grad_norm": 2.2263252875408472, "language_loss": 0.79940283, "learning_rate": 2.7721532004217267e-06, "loss": 0.82106483, "num_input_tokens_seen": 140218585, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8203125, "step": 6530, "time_per_iteration": 2.5311055183410645 }, { "auxiliary_loss_clip": 0.01121701, "auxiliary_loss_mlp": 0.01036455, "balance_loss_clip": 1.02247143, "balance_loss_mlp": 1.04240596, "epoch": 0.3926649631745077, "flos": 22857788666880.0, "grad_norm": 1.5978022780129448, "language_loss": 0.75562918, "learning_rate": 2.7717939203421063e-06, "loss": 0.77721077, "num_input_tokens_seen": 140239905, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.79296875, "step": 6531, "time_per_iteration": 2.5040528774261475 }, { "auxiliary_loss_clip": 0.01044015, "auxiliary_loss_mlp": 0.01005515, "balance_loss_clip": 1.00352371, "balance_loss_mlp": 1.01705098, "epoch": 0.3927250864271757, "flos": 63893881872000.0, "grad_norm": 0.829474557835343, "language_loss": 0.60380864, "learning_rate": 2.7714346109959822e-06, "loss": 0.62430394, "num_input_tokens_seen": 140293820, "router_z_loss_clip": 0.01989746, "router_z_loss_mlp": 0.26953125, "step": 6532, "time_per_iteration": 2.979581594467163 }, { "auxiliary_loss_clip": 0.01044808, "auxiliary_loss_mlp": 0.01001306, "balance_loss_clip": 0.9995541, "balance_loss_mlp": 1.01796651, "epoch": 0.3927852096798437, "flos": 68909741890560.0, "grad_norm": 0.8079806825721428, "language_loss": 0.55548096, "learning_rate": 2.771075272396981e-06, "loss": 0.57594216, "num_input_tokens_seen": 140360420, "router_z_loss_clip": 0.01757812, "router_z_loss_mlp": 0.26953125, "step": 6533, "time_per_iteration": 3.2034387588500977 }, { "auxiliary_loss_clip": 0.0112837, "auxiliary_loss_mlp": 0.0104039, "balance_loss_clip": 1.02604878, "balance_loss_mlp": 1.04603207, "epoch": 0.39284533293251167, "flos": 29715972232320.0, "grad_norm": 1.7316607536245758, "language_loss": 0.76190633, "learning_rate": 2.7707159045587284e-06, "loss": 0.78359389, "num_input_tokens_seen": 140381950, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.82421875, "step": 6534, "time_per_iteration": 2.5761067867279053 }, { "auxiliary_loss_clip": 0.011298, "auxiliary_loss_mlp": 0.01035696, "balance_loss_clip": 1.02154601, "balance_loss_mlp": 1.04579473, "epoch": 0.39290545618517964, "flos": 18552027594240.0, "grad_norm": 5.698468497417189, "language_loss": 0.78865826, "learning_rate": 2.770356507494851e-06, "loss": 0.81031322, "num_input_tokens_seen": 140399410, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.83984375, "step": 6535, "time_per_iteration": 2.4639878273010254 }, { "auxiliary_loss_clip": 0.01121098, "auxiliary_loss_mlp": 0.01030125, "balance_loss_clip": 1.01744664, "balance_loss_mlp": 1.04345536, "epoch": 0.3929655794378476, "flos": 26249479413120.0, "grad_norm": 1.7677986075917482, "language_loss": 0.68191385, "learning_rate": 2.769997081218978e-06, "loss": 0.70342612, "num_input_tokens_seen": 140419055, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.77734375, "step": 6536, "time_per_iteration": 2.5283069610595703 }, { "auxiliary_loss_clip": 0.01120323, "auxiliary_loss_mlp": 0.01033305, "balance_loss_clip": 1.02066267, "balance_loss_mlp": 1.04380989, "epoch": 0.39302570269051557, "flos": 29277933874560.0, "grad_norm": 2.376252887473527, "language_loss": 0.68927395, "learning_rate": 2.769637625744738e-06, "loss": 0.71081018, "num_input_tokens_seen": 140438800, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.765625, "step": 6537, "time_per_iteration": 2.5575661659240723 }, { "auxiliary_loss_clip": 0.01126014, "auxiliary_loss_mlp": 0.01036199, "balance_loss_clip": 1.02262104, "balance_loss_mlp": 1.04603958, "epoch": 0.39308582594318353, "flos": 17347440067200.0, "grad_norm": 2.26043555456762, "language_loss": 0.79024464, "learning_rate": 2.769278141085763e-06, "loss": 0.81186676, "num_input_tokens_seen": 140456880, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.80078125, "step": 6538, "time_per_iteration": 2.466350555419922 }, { "auxiliary_loss_clip": 0.01046423, "auxiliary_loss_mlp": 0.01004268, "balance_loss_clip": 1.00259876, "balance_loss_mlp": 1.01975071, "epoch": 0.3931459491958515, "flos": 61007094650880.0, "grad_norm": 0.8092916593763873, "language_loss": 0.61938286, "learning_rate": 2.768918627255683e-06, "loss": 0.63988978, "num_input_tokens_seen": 140507510, "router_z_loss_clip": 0.01672363, "router_z_loss_mlp": 0.265625, "step": 6539, "time_per_iteration": 2.9192912578582764 }, { "auxiliary_loss_clip": 0.01123912, "auxiliary_loss_mlp": 0.01031152, "balance_loss_clip": 1.01697779, "balance_loss_mlp": 1.04447544, "epoch": 0.39320607244851946, "flos": 39016009249920.0, "grad_norm": 2.0893914537381972, "language_loss": 0.67983127, "learning_rate": 2.7685590842681315e-06, "loss": 0.70138186, "num_input_tokens_seen": 140528740, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.79296875, "step": 6540, "time_per_iteration": 2.6363463401794434 }, { "auxiliary_loss_clip": 0.01123631, "auxiliary_loss_mlp": 0.01034373, "balance_loss_clip": 1.02019906, "balance_loss_mlp": 1.04432213, "epoch": 0.3932661957011874, "flos": 24679752180480.0, "grad_norm": 3.0514237214020765, "language_loss": 0.72379482, "learning_rate": 2.7681995121367433e-06, "loss": 0.74537492, "num_input_tokens_seen": 140547560, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.79296875, "step": 6541, "time_per_iteration": 2.5337958335876465 }, { "auxiliary_loss_clip": 0.01045711, "auxiliary_loss_mlp": 0.01009654, "balance_loss_clip": 1.00763917, "balance_loss_mlp": 1.01911724, "epoch": 0.3933263189538554, "flos": 70096552185600.0, "grad_norm": 0.8278151883288275, "language_loss": 0.60340542, "learning_rate": 2.7678399108751516e-06, "loss": 0.62395906, "num_input_tokens_seen": 140601175, "router_z_loss_clip": 0.0201416, "router_z_loss_mlp": 0.265625, "step": 6542, "time_per_iteration": 2.9468977451324463 }, { "auxiliary_loss_clip": 0.01124767, "auxiliary_loss_mlp": 0.01032176, "balance_loss_clip": 1.01869345, "balance_loss_mlp": 1.04462767, "epoch": 0.39338644220652336, "flos": 22929071207040.0, "grad_norm": 1.5998597155858547, "language_loss": 0.8245371, "learning_rate": 2.7674802804969947e-06, "loss": 0.84610653, "num_input_tokens_seen": 140622200, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.80078125, "step": 6543, "time_per_iteration": 2.4982240200042725 }, { "auxiliary_loss_clip": 0.01120682, "auxiliary_loss_mlp": 0.01033617, "balance_loss_clip": 1.01941371, "balance_loss_mlp": 1.04135287, "epoch": 0.3934465654591913, "flos": 30848163897600.0, "grad_norm": 1.6684897359687014, "language_loss": 0.69023979, "learning_rate": 2.767120621015908e-06, "loss": 0.71178281, "num_input_tokens_seen": 140643125, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.79296875, "step": 6544, "time_per_iteration": 2.564201831817627 }, { "auxiliary_loss_clip": 0.01125143, "auxiliary_loss_mlp": 0.0104283, "balance_loss_clip": 1.02782202, "balance_loss_mlp": 1.04325104, "epoch": 0.3935066887118593, "flos": 29236528471680.0, "grad_norm": 2.1172373466917764, "language_loss": 0.74517918, "learning_rate": 2.76676093244553e-06, "loss": 0.76685894, "num_input_tokens_seen": 140662500, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8203125, "step": 6545, "time_per_iteration": 2.5607473850250244 }, { "auxiliary_loss_clip": 0.01120101, "auxiliary_loss_mlp": 0.01031116, "balance_loss_clip": 1.01900995, "balance_loss_mlp": 1.04508388, "epoch": 0.3935668119645273, "flos": 19135288638720.0, "grad_norm": 1.6142012327689919, "language_loss": 0.74745929, "learning_rate": 2.7664012147995015e-06, "loss": 0.76897156, "num_input_tokens_seen": 140681960, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.75, "step": 6546, "time_per_iteration": 2.5049896240234375 }, { "auxiliary_loss_clip": 0.01129055, "auxiliary_loss_mlp": 0.01035506, "balance_loss_clip": 1.02139175, "balance_loss_mlp": 1.04462373, "epoch": 0.3936269352171953, "flos": 18516116972160.0, "grad_norm": 1.7001295651032338, "language_loss": 0.81380069, "learning_rate": 2.7660414680914617e-06, "loss": 0.83544624, "num_input_tokens_seen": 140699170, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.84375, "step": 6547, "time_per_iteration": 2.4549386501312256 }, { "auxiliary_loss_clip": 0.01122519, "auxiliary_loss_mlp": 0.01029572, "balance_loss_clip": 1.01607203, "balance_loss_mlp": 1.04280424, "epoch": 0.39368705846986324, "flos": 15632813370240.0, "grad_norm": 2.8160058550032407, "language_loss": 0.84428132, "learning_rate": 2.7656816923350525e-06, "loss": 0.86580229, "num_input_tokens_seen": 140714920, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.796875, "step": 6548, "time_per_iteration": 2.46636700630188 }, { "auxiliary_loss_clip": 0.01119206, "auxiliary_loss_mlp": 0.01028953, "balance_loss_clip": 1.01631689, "balance_loss_mlp": 1.04212093, "epoch": 0.3937471817225312, "flos": 21325839563520.0, "grad_norm": 1.5479253023310766, "language_loss": 0.72425288, "learning_rate": 2.7653218875439174e-06, "loss": 0.74573451, "num_input_tokens_seen": 140734595, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7734375, "step": 6549, "time_per_iteration": 2.500875234603882 }, { "auxiliary_loss_clip": 0.01127398, "auxiliary_loss_mlp": 0.01032165, "balance_loss_clip": 1.01849782, "balance_loss_mlp": 1.04688275, "epoch": 0.39380730497519917, "flos": 20776693461120.0, "grad_norm": 2.341349901360461, "language_loss": 0.77822006, "learning_rate": 2.764962053731699e-06, "loss": 0.79981565, "num_input_tokens_seen": 140754050, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.8046875, "step": 6550, "time_per_iteration": 4.0095460414886475 }, { "auxiliary_loss_clip": 0.01120549, "auxiliary_loss_mlp": 0.01029804, "balance_loss_clip": 1.01635766, "balance_loss_mlp": 1.0419271, "epoch": 0.39386742822786713, "flos": 21609784575360.0, "grad_norm": 1.726815935574901, "language_loss": 0.81244773, "learning_rate": 2.7646021909120434e-06, "loss": 0.83395123, "num_input_tokens_seen": 140771440, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7890625, "step": 6551, "time_per_iteration": 2.5135645866394043 }, { "auxiliary_loss_clip": 0.0112326, "auxiliary_loss_mlp": 0.01035831, "balance_loss_clip": 1.021842, "balance_loss_mlp": 1.04235077, "epoch": 0.3939275514805351, "flos": 12414642249600.0, "grad_norm": 2.1629636228211555, "language_loss": 0.8008064, "learning_rate": 2.764242299098596e-06, "loss": 0.82239729, "num_input_tokens_seen": 140786715, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8046875, "step": 6552, "time_per_iteration": 2.4486396312713623 }, { "auxiliary_loss_clip": 0.01125817, "auxiliary_loss_mlp": 0.01039919, "balance_loss_clip": 1.02598393, "balance_loss_mlp": 1.0442884, "epoch": 0.39398767473320306, "flos": 18552027594240.0, "grad_norm": 1.9425169452499156, "language_loss": 0.71534312, "learning_rate": 2.763882378305003e-06, "loss": 0.73700047, "num_input_tokens_seen": 140804950, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.81640625, "step": 6553, "time_per_iteration": 2.4774069786071777 }, { "auxiliary_loss_clip": 0.01122118, "auxiliary_loss_mlp": 0.01042412, "balance_loss_clip": 1.02824962, "balance_loss_mlp": 1.04296362, "epoch": 0.39404779798587103, "flos": 29308888419840.0, "grad_norm": 1.6030706142681548, "language_loss": 0.63778782, "learning_rate": 2.7635224285449144e-06, "loss": 0.65943313, "num_input_tokens_seen": 140822800, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7890625, "step": 6554, "time_per_iteration": 4.055180072784424 }, { "auxiliary_loss_clip": 0.01123137, "auxiliary_loss_mlp": 0.01033819, "balance_loss_clip": 1.02123022, "balance_loss_mlp": 1.04377556, "epoch": 0.394107921238539, "flos": 34897055834880.0, "grad_norm": 2.513711823837186, "language_loss": 0.79516351, "learning_rate": 2.7631624498319796e-06, "loss": 0.81673306, "num_input_tokens_seen": 140842940, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.79296875, "step": 6555, "time_per_iteration": 4.014192342758179 }, { "auxiliary_loss_clip": 0.01127458, "auxiliary_loss_mlp": 0.01035581, "balance_loss_clip": 1.02101326, "balance_loss_mlp": 1.04567981, "epoch": 0.39416804449120696, "flos": 25081413039360.0, "grad_norm": 1.7417778257863776, "language_loss": 0.71818221, "learning_rate": 2.7628024421798473e-06, "loss": 0.73981255, "num_input_tokens_seen": 140863060, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.81640625, "step": 6556, "time_per_iteration": 2.5281338691711426 }, { "auxiliary_loss_clip": 0.01122279, "auxiliary_loss_mlp": 0.01031476, "balance_loss_clip": 1.01754022, "balance_loss_mlp": 1.04156089, "epoch": 0.3942281677438749, "flos": 32306639731200.0, "grad_norm": 1.9872462169792504, "language_loss": 0.83401918, "learning_rate": 2.7624424056021705e-06, "loss": 0.85555679, "num_input_tokens_seen": 140883795, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.8046875, "step": 6557, "time_per_iteration": 2.5843825340270996 }, { "auxiliary_loss_clip": 0.01123802, "auxiliary_loss_mlp": 0.01032581, "balance_loss_clip": 1.01878262, "balance_loss_mlp": 1.0446403, "epoch": 0.3942882909965429, "flos": 24936621315840.0, "grad_norm": 2.6423752693734155, "language_loss": 0.80261689, "learning_rate": 2.7620823401126004e-06, "loss": 0.82418072, "num_input_tokens_seen": 140903055, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7890625, "step": 6558, "time_per_iteration": 2.508512496948242 }, { "auxiliary_loss_clip": 0.01123827, "auxiliary_loss_mlp": 0.01035764, "balance_loss_clip": 1.02298474, "balance_loss_mlp": 1.04595864, "epoch": 0.39434841424921085, "flos": 11874797769600.0, "grad_norm": 1.7588843167660464, "language_loss": 0.70987362, "learning_rate": 2.761722245724792e-06, "loss": 0.73146951, "num_input_tokens_seen": 140920685, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.78125, "step": 6559, "time_per_iteration": 2.455496072769165 }, { "auxiliary_loss_clip": 0.01129087, "auxiliary_loss_mlp": 0.01039371, "balance_loss_clip": 1.02349818, "balance_loss_mlp": 1.04486072, "epoch": 0.3944085375018789, "flos": 16361620323840.0, "grad_norm": 2.255154247510037, "language_loss": 0.80180371, "learning_rate": 2.7613621224524003e-06, "loss": 0.82348835, "num_input_tokens_seen": 140937320, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84375, "step": 6560, "time_per_iteration": 2.498959541320801 }, { "auxiliary_loss_clip": 0.0112934, "auxiliary_loss_mlp": 0.01042956, "balance_loss_clip": 1.0283587, "balance_loss_mlp": 1.04742241, "epoch": 0.39446866075454684, "flos": 10633365866880.0, "grad_norm": 2.2466083450612, "language_loss": 0.82832718, "learning_rate": 2.7610019703090803e-06, "loss": 0.85005021, "num_input_tokens_seen": 140954855, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8203125, "step": 6561, "time_per_iteration": 2.4569478034973145 }, { "auxiliary_loss_clip": 0.01123586, "auxiliary_loss_mlp": 0.01047588, "balance_loss_clip": 1.03396821, "balance_loss_mlp": 1.04351532, "epoch": 0.3945287840072148, "flos": 18187498419840.0, "grad_norm": 2.1579623292179453, "language_loss": 0.79846883, "learning_rate": 2.7606417893084887e-06, "loss": 0.82018054, "num_input_tokens_seen": 140973250, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.80078125, "step": 6562, "time_per_iteration": 2.487056255340576 }, { "auxiliary_loss_clip": 0.01121619, "auxiliary_loss_mlp": 0.0103675, "balance_loss_clip": 1.02289248, "balance_loss_mlp": 1.04502952, "epoch": 0.39458890725988277, "flos": 23039891642880.0, "grad_norm": 1.5404318879042427, "language_loss": 0.81385589, "learning_rate": 2.7602815794642853e-06, "loss": 0.83543956, "num_input_tokens_seen": 140993050, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 6563, "time_per_iteration": 2.491201877593994 }, { "auxiliary_loss_clip": 0.01124193, "auxiliary_loss_mlp": 0.01035567, "balance_loss_clip": 1.02098203, "balance_loss_mlp": 1.04410315, "epoch": 0.39464903051255074, "flos": 17159052211200.0, "grad_norm": 1.9277895995606065, "language_loss": 0.69568503, "learning_rate": 2.759921340790127e-06, "loss": 0.71728265, "num_input_tokens_seen": 141010815, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8046875, "step": 6564, "time_per_iteration": 2.4686691761016846 }, { "auxiliary_loss_clip": 0.01126826, "auxiliary_loss_mlp": 0.01036918, "balance_loss_clip": 1.02290463, "balance_loss_mlp": 1.0453217, "epoch": 0.3947091537652187, "flos": 15889000147200.0, "grad_norm": 2.14724720281655, "language_loss": 0.83093864, "learning_rate": 2.759561073299676e-06, "loss": 0.85257608, "num_input_tokens_seen": 141028720, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8125, "step": 6565, "time_per_iteration": 2.444641590118408 }, { "auxiliary_loss_clip": 0.0112668, "auxiliary_loss_mlp": 0.01038492, "balance_loss_clip": 1.02500904, "balance_loss_mlp": 1.04626179, "epoch": 0.39476927701788667, "flos": 18545491319040.0, "grad_norm": 2.0533363698728833, "language_loss": 0.83823216, "learning_rate": 2.7592007770065937e-06, "loss": 0.8598839, "num_input_tokens_seen": 141046025, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.8046875, "step": 6566, "time_per_iteration": 2.467940092086792 }, { "auxiliary_loss_clip": 0.01131855, "auxiliary_loss_mlp": 0.01042373, "balance_loss_clip": 1.02775764, "balance_loss_mlp": 1.04664886, "epoch": 0.39482940027055463, "flos": 22275712771200.0, "grad_norm": 1.8851094331926046, "language_loss": 0.77155858, "learning_rate": 2.7588404519245403e-06, "loss": 0.79330081, "num_input_tokens_seen": 141066865, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8515625, "step": 6567, "time_per_iteration": 2.514042854309082 }, { "auxiliary_loss_clip": 0.01121257, "auxiliary_loss_mlp": 0.01045186, "balance_loss_clip": 1.03164411, "balance_loss_mlp": 1.04442573, "epoch": 0.3948895235232226, "flos": 14757634494720.0, "grad_norm": 2.011954801772072, "language_loss": 0.80489254, "learning_rate": 2.758480098067182e-06, "loss": 0.82655704, "num_input_tokens_seen": 141084210, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.76953125, "step": 6568, "time_per_iteration": 2.467595100402832 }, { "auxiliary_loss_clip": 0.01125233, "auxiliary_loss_mlp": 0.01037558, "balance_loss_clip": 1.02345514, "balance_loss_mlp": 1.04585052, "epoch": 0.39494964677589056, "flos": 22565763095040.0, "grad_norm": 1.8598776179456573, "language_loss": 0.84872174, "learning_rate": 2.7581197154481816e-06, "loss": 0.87034959, "num_input_tokens_seen": 141103895, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.79296875, "step": 6569, "time_per_iteration": 2.487292766571045 }, { "auxiliary_loss_clip": 0.01128589, "auxiliary_loss_mlp": 0.01038933, "balance_loss_clip": 1.0253911, "balance_loss_mlp": 1.04895413, "epoch": 0.3950097700285585, "flos": 22963186149120.0, "grad_norm": 1.9557990056824486, "language_loss": 0.74455017, "learning_rate": 2.7577593040812066e-06, "loss": 0.7662254, "num_input_tokens_seen": 141124000, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.796875, "step": 6570, "time_per_iteration": 2.5386433601379395 }, { "auxiliary_loss_clip": 0.01127124, "auxiliary_loss_mlp": 0.0103886, "balance_loss_clip": 1.02453661, "balance_loss_mlp": 1.04613042, "epoch": 0.3950698932812265, "flos": 20595236929920.0, "grad_norm": 2.629671941716914, "language_loss": 0.79926717, "learning_rate": 2.757398863979922e-06, "loss": 0.82092702, "num_input_tokens_seen": 141142535, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8125, "step": 6571, "time_per_iteration": 2.4818074703216553 }, { "auxiliary_loss_clip": 0.01127623, "auxiliary_loss_mlp": 0.01044168, "balance_loss_clip": 1.02967811, "balance_loss_mlp": 1.04756737, "epoch": 0.39513001653389446, "flos": 20375786787840.0, "grad_norm": 2.4130246540413696, "language_loss": 0.78193998, "learning_rate": 2.757038395157997e-06, "loss": 0.80365801, "num_input_tokens_seen": 141161575, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80078125, "step": 6572, "time_per_iteration": 2.52336049079895 }, { "auxiliary_loss_clip": 0.01129518, "auxiliary_loss_mlp": 0.01042485, "balance_loss_clip": 1.02721477, "balance_loss_mlp": 1.04684794, "epoch": 0.3951901397865625, "flos": 26463650256000.0, "grad_norm": 2.0576749378618873, "language_loss": 0.75297165, "learning_rate": 2.7566778976291002e-06, "loss": 0.7746917, "num_input_tokens_seen": 141181150, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.828125, "step": 6573, "time_per_iteration": 2.538884162902832 }, { "auxiliary_loss_clip": 0.011261, "auxiliary_loss_mlp": 0.0103241, "balance_loss_clip": 1.01959479, "balance_loss_mlp": 1.04677045, "epoch": 0.39525026303923044, "flos": 43838345767680.0, "grad_norm": 1.6293030480149313, "language_loss": 0.67955637, "learning_rate": 2.7563173714069017e-06, "loss": 0.70114148, "num_input_tokens_seen": 141206310, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.79296875, "step": 6574, "time_per_iteration": 2.784320116043091 }, { "auxiliary_loss_clip": 0.01131008, "auxiliary_loss_mlp": 0.01040441, "balance_loss_clip": 1.02520013, "balance_loss_mlp": 1.04849601, "epoch": 0.3953103862918984, "flos": 18040803275520.0, "grad_norm": 3.977533460359529, "language_loss": 0.71389496, "learning_rate": 2.755956816505072e-06, "loss": 0.73560941, "num_input_tokens_seen": 141223925, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.82421875, "step": 6575, "time_per_iteration": 2.4924120903015137 }, { "auxiliary_loss_clip": 0.01131158, "auxiliary_loss_mlp": 0.01046187, "balance_loss_clip": 1.03080916, "balance_loss_mlp": 1.04732919, "epoch": 0.3953705095445664, "flos": 16976015481600.0, "grad_norm": 2.2951313252439607, "language_loss": 0.73823476, "learning_rate": 2.7555962329372845e-06, "loss": 0.76000822, "num_input_tokens_seen": 141239010, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.83984375, "step": 6576, "time_per_iteration": 2.4823296070098877 }, { "auxiliary_loss_clip": 0.01127424, "auxiliary_loss_mlp": 0.01037518, "balance_loss_clip": 1.02440512, "balance_loss_mlp": 1.04602313, "epoch": 0.39543063279723434, "flos": 17411144837760.0, "grad_norm": 2.764296326958509, "language_loss": 0.83974659, "learning_rate": 2.7552356207172124e-06, "loss": 0.86139601, "num_input_tokens_seen": 141252255, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.81640625, "step": 6577, "time_per_iteration": 2.424548387527466 }, { "auxiliary_loss_clip": 0.01128982, "auxiliary_loss_mlp": 0.01033067, "balance_loss_clip": 1.0195967, "balance_loss_mlp": 1.04987168, "epoch": 0.3954907560499023, "flos": 22784207656320.0, "grad_norm": 3.3964277122894475, "language_loss": 0.90901726, "learning_rate": 2.75487497985853e-06, "loss": 0.93063772, "num_input_tokens_seen": 141269325, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.79296875, "step": 6578, "time_per_iteration": 2.5363142490386963 }, { "auxiliary_loss_clip": 0.01131635, "auxiliary_loss_mlp": 0.01036161, "balance_loss_clip": 1.02011538, "balance_loss_mlp": 1.04764009, "epoch": 0.39555087930257027, "flos": 21944400698880.0, "grad_norm": 1.8480777154332406, "language_loss": 0.78343213, "learning_rate": 2.7545143103749117e-06, "loss": 0.8051101, "num_input_tokens_seen": 141288505, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83984375, "step": 6579, "time_per_iteration": 2.47912335395813 }, { "auxiliary_loss_clip": 0.01131742, "auxiliary_loss_mlp": 0.0103252, "balance_loss_clip": 1.01727962, "balance_loss_mlp": 1.04751205, "epoch": 0.39561100255523823, "flos": 20404622430720.0, "grad_norm": 1.9947992501142133, "language_loss": 0.68347007, "learning_rate": 2.754153612280037e-06, "loss": 0.7051127, "num_input_tokens_seen": 141303680, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.84375, "step": 6580, "time_per_iteration": 2.518695592880249 }, { "auxiliary_loss_clip": 0.01126316, "auxiliary_loss_mlp": 0.01032718, "balance_loss_clip": 1.01896763, "balance_loss_mlp": 1.04694009, "epoch": 0.3956711258079062, "flos": 27964572986880.0, "grad_norm": 2.4930451613460876, "language_loss": 0.58527106, "learning_rate": 2.7537928855875797e-06, "loss": 0.60686141, "num_input_tokens_seen": 141324090, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.79296875, "step": 6581, "time_per_iteration": 2.5391685962677 }, { "auxiliary_loss_clip": 0.01131342, "auxiliary_loss_mlp": 0.01040957, "balance_loss_clip": 1.02607942, "balance_loss_mlp": 1.04955745, "epoch": 0.39573124906057416, "flos": 14428297670400.0, "grad_norm": 1.9934190369571734, "language_loss": 0.69020927, "learning_rate": 2.7534321303112224e-06, "loss": 0.71193224, "num_input_tokens_seen": 141342235, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 6582, "time_per_iteration": 2.5049614906311035 }, { "auxiliary_loss_clip": 0.01129689, "auxiliary_loss_mlp": 0.01032547, "balance_loss_clip": 1.01831341, "balance_loss_mlp": 1.04837883, "epoch": 0.39579137231324213, "flos": 18733699607040.0, "grad_norm": 3.3899454760932564, "language_loss": 0.76199329, "learning_rate": 2.753071346464642e-06, "loss": 0.78361565, "num_input_tokens_seen": 141361195, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8125, "step": 6583, "time_per_iteration": 2.458355188369751 }, { "auxiliary_loss_clip": 0.01127589, "auxiliary_loss_mlp": 0.01033872, "balance_loss_clip": 1.02019882, "balance_loss_mlp": 1.0469867, "epoch": 0.3958514955659101, "flos": 17676417755520.0, "grad_norm": 1.6035646197016735, "language_loss": 0.65822357, "learning_rate": 2.7527105340615207e-06, "loss": 0.67983818, "num_input_tokens_seen": 141378275, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.8046875, "step": 6584, "time_per_iteration": 2.503201961517334 }, { "auxiliary_loss_clip": 0.01131074, "auxiliary_loss_mlp": 0.01040481, "balance_loss_clip": 1.02563357, "balance_loss_mlp": 1.04747546, "epoch": 0.39591161881857806, "flos": 29309103901440.0, "grad_norm": 2.7635102304472605, "language_loss": 0.72800505, "learning_rate": 2.7523496931155413e-06, "loss": 0.74972057, "num_input_tokens_seen": 141396960, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8359375, "step": 6585, "time_per_iteration": 2.547034740447998 }, { "auxiliary_loss_clip": 0.0113013, "auxiliary_loss_mlp": 0.01036731, "balance_loss_clip": 1.02236605, "balance_loss_mlp": 1.04819381, "epoch": 0.3959717420712461, "flos": 25771831332480.0, "grad_norm": 1.9009432302206781, "language_loss": 0.73360717, "learning_rate": 2.7519888236403856e-06, "loss": 0.75527585, "num_input_tokens_seen": 141417320, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8203125, "step": 6586, "time_per_iteration": 2.5716707706451416 }, { "auxiliary_loss_clip": 0.01130316, "auxiliary_loss_mlp": 0.01030247, "balance_loss_clip": 1.01622224, "balance_loss_mlp": 1.04982519, "epoch": 0.39603186532391405, "flos": 20923783655040.0, "grad_norm": 1.7481958576214527, "language_loss": 0.71393752, "learning_rate": 2.7516279256497382e-06, "loss": 0.73554319, "num_input_tokens_seen": 141435985, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.8046875, "step": 6587, "time_per_iteration": 2.495525360107422 }, { "auxiliary_loss_clip": 0.01050388, "auxiliary_loss_mlp": 0.01004407, "balance_loss_clip": 1.00218928, "balance_loss_mlp": 1.02330518, "epoch": 0.396091988576582, "flos": 54880986176640.0, "grad_norm": 0.9024461766826652, "language_loss": 0.6117065, "learning_rate": 2.751266999157285e-06, "loss": 0.63225436, "num_input_tokens_seen": 141486075, "router_z_loss_clip": 0.0222168, "router_z_loss_mlp": 0.27148438, "step": 6588, "time_per_iteration": 2.9405517578125 }, { "auxiliary_loss_clip": 0.01130639, "auxiliary_loss_mlp": 0.01035827, "balance_loss_clip": 1.02109265, "balance_loss_mlp": 1.04833555, "epoch": 0.39615211182925, "flos": 20702896968960.0, "grad_norm": 1.7694100150309664, "language_loss": 0.81328213, "learning_rate": 2.7509060441767115e-06, "loss": 0.83494675, "num_input_tokens_seen": 141505280, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8203125, "step": 6589, "time_per_iteration": 2.5053322315216064 }, { "auxiliary_loss_clip": 0.01131184, "auxiliary_loss_mlp": 0.01033087, "balance_loss_clip": 1.01789975, "balance_loss_mlp": 1.04905093, "epoch": 0.39621223508191794, "flos": 20994312009600.0, "grad_norm": 2.0695056396821823, "language_loss": 0.70218819, "learning_rate": 2.7505450607217057e-06, "loss": 0.72383082, "num_input_tokens_seen": 141523930, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 6590, "time_per_iteration": 2.50701904296875 }, { "auxiliary_loss_clip": 0.0113125, "auxiliary_loss_mlp": 0.0103971, "balance_loss_clip": 1.02567863, "balance_loss_mlp": 1.0500493, "epoch": 0.3962723583345859, "flos": 23368833417600.0, "grad_norm": 2.069386838493547, "language_loss": 0.75465667, "learning_rate": 2.750184048805956e-06, "loss": 0.77636623, "num_input_tokens_seen": 141541320, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8125, "step": 6591, "time_per_iteration": 2.5408804416656494 }, { "auxiliary_loss_clip": 0.01132386, "auxiliary_loss_mlp": 0.01038343, "balance_loss_clip": 1.02396667, "balance_loss_mlp": 1.05044484, "epoch": 0.39633248158725387, "flos": 25115599808640.0, "grad_norm": 1.8779981770482268, "language_loss": 0.78484321, "learning_rate": 2.749823008443152e-06, "loss": 0.8065505, "num_input_tokens_seen": 141561880, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8203125, "step": 6592, "time_per_iteration": 4.004610538482666 }, { "auxiliary_loss_clip": 0.01126248, "auxiliary_loss_mlp": 0.01030689, "balance_loss_clip": 1.0162226, "balance_loss_mlp": 1.04825354, "epoch": 0.39639260483992184, "flos": 39787622236800.0, "grad_norm": 2.1242557110294276, "language_loss": 0.69376719, "learning_rate": 2.7494619396469843e-06, "loss": 0.71533656, "num_input_tokens_seen": 141586460, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 6593, "time_per_iteration": 2.68911075592041 }, { "auxiliary_loss_clip": 0.01132371, "auxiliary_loss_mlp": 0.01039331, "balance_loss_clip": 1.02438211, "balance_loss_mlp": 1.04884624, "epoch": 0.3964527280925898, "flos": 17347045017600.0, "grad_norm": 1.7748577092973552, "language_loss": 0.77829534, "learning_rate": 2.7491008424311452e-06, "loss": 0.80001235, "num_input_tokens_seen": 141605955, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8359375, "step": 6594, "time_per_iteration": 2.4870007038116455 }, { "auxiliary_loss_clip": 0.01053279, "auxiliary_loss_mlp": 0.01005175, "balance_loss_clip": 1.00324345, "balance_loss_mlp": 1.02594519, "epoch": 0.39651285134525777, "flos": 71717848369920.0, "grad_norm": 0.9640126533770508, "language_loss": 0.63019282, "learning_rate": 2.7487397168093265e-06, "loss": 0.65077734, "num_input_tokens_seen": 141673140, "router_z_loss_clip": 0.01928711, "router_z_loss_mlp": 0.2734375, "step": 6595, "time_per_iteration": 5.979623079299927 }, { "auxiliary_loss_clip": 0.01137973, "auxiliary_loss_mlp": 0.01040417, "balance_loss_clip": 1.02500284, "balance_loss_mlp": 1.05293429, "epoch": 0.39657297459792573, "flos": 25775710001280.0, "grad_norm": 2.681378659797045, "language_loss": 0.63534057, "learning_rate": 2.748378562795223e-06, "loss": 0.65712452, "num_input_tokens_seen": 141692955, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8515625, "step": 6596, "time_per_iteration": 4.055939435958862 }, { "auxiliary_loss_clip": 0.01128222, "auxiliary_loss_mlp": 0.01035871, "balance_loss_clip": 1.02152991, "balance_loss_mlp": 1.04910159, "epoch": 0.3966330978505937, "flos": 20266115587200.0, "grad_norm": 3.684061085746304, "language_loss": 0.78676307, "learning_rate": 2.7480173804025293e-06, "loss": 0.80840403, "num_input_tokens_seen": 141710680, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7890625, "step": 6597, "time_per_iteration": 2.4984242916107178 }, { "auxiliary_loss_clip": 0.01136887, "auxiliary_loss_mlp": 0.01037722, "balance_loss_clip": 1.02277935, "balance_loss_mlp": 1.05248904, "epoch": 0.39669322110326166, "flos": 20631183465600.0, "grad_norm": 2.1689101467554366, "language_loss": 0.68188155, "learning_rate": 2.747656169644941e-06, "loss": 0.70362759, "num_input_tokens_seen": 141729860, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.84375, "step": 6598, "time_per_iteration": 2.485591411590576 }, { "auxiliary_loss_clip": 0.01132598, "auxiliary_loss_mlp": 0.01037522, "balance_loss_clip": 1.02345586, "balance_loss_mlp": 1.05071402, "epoch": 0.3967533443559297, "flos": 21726063878400.0, "grad_norm": 1.8184598352206793, "language_loss": 0.78928435, "learning_rate": 2.747294930536157e-06, "loss": 0.81098557, "num_input_tokens_seen": 141749060, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8203125, "step": 6599, "time_per_iteration": 2.5451197624206543 }, { "auxiliary_loss_clip": 0.01131512, "auxiliary_loss_mlp": 0.01030641, "balance_loss_clip": 1.0147382, "balance_loss_mlp": 1.04977107, "epoch": 0.39681346760859765, "flos": 25484151306240.0, "grad_norm": 1.8860554085408494, "language_loss": 0.72552431, "learning_rate": 2.7469336630898737e-06, "loss": 0.74714577, "num_input_tokens_seen": 141769860, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8203125, "step": 6600, "time_per_iteration": 2.5409131050109863 }, { "auxiliary_loss_clip": 0.0112931, "auxiliary_loss_mlp": 0.0103365, "balance_loss_clip": 1.01890349, "balance_loss_mlp": 1.04780781, "epoch": 0.3968735908612656, "flos": 20959586536320.0, "grad_norm": 2.0132585195510546, "language_loss": 0.85621786, "learning_rate": 2.746572367319791e-06, "loss": 0.87784743, "num_input_tokens_seen": 141788465, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8125, "step": 6601, "time_per_iteration": 2.508402109146118 }, { "auxiliary_loss_clip": 0.01137792, "auxiliary_loss_mlp": 0.01038194, "balance_loss_clip": 1.02187479, "balance_loss_mlp": 1.05083203, "epoch": 0.3969337141139336, "flos": 10707090531840.0, "grad_norm": 3.422651833699313, "language_loss": 0.70594448, "learning_rate": 2.7462110432396095e-06, "loss": 0.72770435, "num_input_tokens_seen": 141804955, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.87109375, "step": 6602, "time_per_iteration": 2.458824634552002 }, { "auxiliary_loss_clip": 0.01133537, "auxiliary_loss_mlp": 0.01038784, "balance_loss_clip": 1.02438915, "balance_loss_mlp": 1.05115581, "epoch": 0.39699383736660154, "flos": 17593714690560.0, "grad_norm": 2.3288252818764548, "language_loss": 0.83146775, "learning_rate": 2.7458496908630305e-06, "loss": 0.85319102, "num_input_tokens_seen": 141820025, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.82421875, "step": 6603, "time_per_iteration": 2.4559993743896484 }, { "auxiliary_loss_clip": 0.01129487, "auxiliary_loss_mlp": 0.01028776, "balance_loss_clip": 1.01504898, "balance_loss_mlp": 1.04981983, "epoch": 0.3970539606192695, "flos": 17785945301760.0, "grad_norm": 1.4896554488805855, "language_loss": 0.72927988, "learning_rate": 2.7454883102037563e-06, "loss": 0.75086248, "num_input_tokens_seen": 141838735, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.796875, "step": 6604, "time_per_iteration": 2.466224431991577 }, { "auxiliary_loss_clip": 0.01128548, "auxiliary_loss_mlp": 0.01034616, "balance_loss_clip": 1.02051353, "balance_loss_mlp": 1.05107236, "epoch": 0.3971140838719375, "flos": 24789495208320.0, "grad_norm": 1.5998240096168086, "language_loss": 0.82400364, "learning_rate": 2.745126901275491e-06, "loss": 0.84563529, "num_input_tokens_seen": 141858090, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7734375, "step": 6605, "time_per_iteration": 2.5485687255859375 }, { "auxiliary_loss_clip": 0.01126505, "auxiliary_loss_mlp": 0.01030238, "balance_loss_clip": 1.01754832, "balance_loss_mlp": 1.04770446, "epoch": 0.39717420712460544, "flos": 24243581329920.0, "grad_norm": 1.8787578977407466, "language_loss": 0.73581135, "learning_rate": 2.7447654640919383e-06, "loss": 0.75737876, "num_input_tokens_seen": 141877540, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7890625, "step": 6606, "time_per_iteration": 2.5212764739990234 }, { "auxiliary_loss_clip": 0.01131884, "auxiliary_loss_mlp": 0.01035579, "balance_loss_clip": 1.02132154, "balance_loss_mlp": 1.04984355, "epoch": 0.3972343303772734, "flos": 25884698843520.0, "grad_norm": 1.898794598570879, "language_loss": 0.74086809, "learning_rate": 2.744403998666805e-06, "loss": 0.76254272, "num_input_tokens_seen": 141897315, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8203125, "step": 6607, "time_per_iteration": 2.532776117324829 }, { "auxiliary_loss_clip": 0.01133639, "auxiliary_loss_mlp": 0.01034288, "balance_loss_clip": 1.02063859, "balance_loss_mlp": 1.05124688, "epoch": 0.39729445362994137, "flos": 45623716300800.0, "grad_norm": 2.073033455095663, "language_loss": 0.67741746, "learning_rate": 2.744042505013797e-06, "loss": 0.69909674, "num_input_tokens_seen": 141919580, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.82421875, "step": 6608, "time_per_iteration": 2.725727081298828 }, { "auxiliary_loss_clip": 0.01132703, "auxiliary_loss_mlp": 0.01039932, "balance_loss_clip": 1.02394652, "balance_loss_mlp": 1.0486784, "epoch": 0.39735457688260933, "flos": 20193971120640.0, "grad_norm": 2.0252280770375735, "language_loss": 0.74671638, "learning_rate": 2.7436809831466233e-06, "loss": 0.76844275, "num_input_tokens_seen": 141937045, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83984375, "step": 6609, "time_per_iteration": 2.5035526752471924 }, { "auxiliary_loss_clip": 0.01132234, "auxiliary_loss_mlp": 0.01033907, "balance_loss_clip": 1.0197866, "balance_loss_mlp": 1.04991019, "epoch": 0.3974147001352773, "flos": 23331163029120.0, "grad_norm": 1.662770024536802, "language_loss": 0.71268129, "learning_rate": 2.7433194330789927e-06, "loss": 0.73434269, "num_input_tokens_seen": 141956695, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.82421875, "step": 6610, "time_per_iteration": 2.5378267765045166 }, { "auxiliary_loss_clip": 0.01123441, "auxiliary_loss_mlp": 0.01029237, "balance_loss_clip": 1.01519394, "balance_loss_mlp": 1.0459106, "epoch": 0.39747482338794526, "flos": 21688644885120.0, "grad_norm": 1.828096661746446, "language_loss": 0.78939068, "learning_rate": 2.7429578548246133e-06, "loss": 0.8109175, "num_input_tokens_seen": 141975935, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7734375, "step": 6611, "time_per_iteration": 2.4872453212738037 }, { "auxiliary_loss_clip": 0.01130486, "auxiliary_loss_mlp": 0.01036672, "balance_loss_clip": 1.02305233, "balance_loss_mlp": 1.05012381, "epoch": 0.3975349466406133, "flos": 30988717816320.0, "grad_norm": 2.3177530501338617, "language_loss": 0.79220593, "learning_rate": 2.7425962483971985e-06, "loss": 0.81387746, "num_input_tokens_seen": 141995750, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.8046875, "step": 6612, "time_per_iteration": 2.5631041526794434 }, { "auxiliary_loss_clip": 0.01048231, "auxiliary_loss_mlp": 0.01005626, "balance_loss_clip": 1.00383782, "balance_loss_mlp": 1.02117109, "epoch": 0.39759506989328125, "flos": 63683948833920.0, "grad_norm": 0.870043812681733, "language_loss": 0.64925194, "learning_rate": 2.742234613810459e-06, "loss": 0.66979051, "num_input_tokens_seen": 142057655, "router_z_loss_clip": 0.01782227, "router_z_loss_mlp": 0.26953125, "step": 6613, "time_per_iteration": 3.0072009563446045 }, { "auxiliary_loss_clip": 0.01128172, "auxiliary_loss_mlp": 0.01032509, "balance_loss_clip": 1.01806641, "balance_loss_mlp": 1.04819477, "epoch": 0.3976551931459492, "flos": 23695835857920.0, "grad_norm": 2.779648598799406, "language_loss": 0.71596652, "learning_rate": 2.741872951078109e-06, "loss": 0.73757327, "num_input_tokens_seen": 142076020, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.796875, "step": 6614, "time_per_iteration": 2.5279648303985596 }, { "auxiliary_loss_clip": 0.01129601, "auxiliary_loss_mlp": 0.01031525, "balance_loss_clip": 1.01752949, "balance_loss_mlp": 1.04900193, "epoch": 0.3977153163986172, "flos": 15669657745920.0, "grad_norm": 1.7969204025345338, "language_loss": 0.8175298, "learning_rate": 2.741511260213862e-06, "loss": 0.83914101, "num_input_tokens_seen": 142093790, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.8046875, "step": 6615, "time_per_iteration": 2.4619483947753906 }, { "auxiliary_loss_clip": 0.01127752, "auxiliary_loss_mlp": 0.01032244, "balance_loss_clip": 1.01894617, "balance_loss_mlp": 1.04779494, "epoch": 0.39777543965128515, "flos": 14064702249600.0, "grad_norm": 1.9074860916000533, "language_loss": 0.66982508, "learning_rate": 2.741149541231434e-06, "loss": 0.69142497, "num_input_tokens_seen": 142110545, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.80078125, "step": 6616, "time_per_iteration": 2.459320306777954 }, { "auxiliary_loss_clip": 0.01132538, "auxiliary_loss_mlp": 0.01040809, "balance_loss_clip": 1.026474, "balance_loss_mlp": 1.04890227, "epoch": 0.3978355629039531, "flos": 23367468700800.0, "grad_norm": 2.244920230233955, "language_loss": 0.83712828, "learning_rate": 2.740787794144541e-06, "loss": 0.85886168, "num_input_tokens_seen": 142128695, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8359375, "step": 6617, "time_per_iteration": 2.477994203567505 }, { "auxiliary_loss_clip": 0.01125058, "auxiliary_loss_mlp": 0.01037631, "balance_loss_clip": 1.02448189, "balance_loss_mlp": 1.04896474, "epoch": 0.3978956861566211, "flos": 19062785036160.0, "grad_norm": 1.5436496868041933, "language_loss": 0.72583818, "learning_rate": 2.7404260189669e-06, "loss": 0.74746501, "num_input_tokens_seen": 142148375, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.76171875, "step": 6618, "time_per_iteration": 2.508610486984253 }, { "auxiliary_loss_clip": 0.01129835, "auxiliary_loss_mlp": 0.01035464, "balance_loss_clip": 1.01910305, "balance_loss_mlp": 1.04905629, "epoch": 0.39795580940928904, "flos": 30227699341440.0, "grad_norm": 1.6724324281735783, "language_loss": 0.65568101, "learning_rate": 2.740064215712231e-06, "loss": 0.67733395, "num_input_tokens_seen": 142169735, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.80859375, "step": 6619, "time_per_iteration": 2.5845561027526855 }, { "auxiliary_loss_clip": 0.01046368, "auxiliary_loss_mlp": 0.01007437, "balance_loss_clip": 1.00561285, "balance_loss_mlp": 1.01917744, "epoch": 0.398015932661957, "flos": 69847224906240.0, "grad_norm": 0.7744612462756284, "language_loss": 0.58316576, "learning_rate": 2.7397023843942527e-06, "loss": 0.60370374, "num_input_tokens_seen": 142229520, "router_z_loss_clip": 0.01818848, "router_z_loss_mlp": 0.2734375, "step": 6620, "time_per_iteration": 3.0532848834991455 }, { "auxiliary_loss_clip": 0.01128089, "auxiliary_loss_mlp": 0.0103479, "balance_loss_clip": 1.02195144, "balance_loss_mlp": 1.04780531, "epoch": 0.39807605591462497, "flos": 20157773189760.0, "grad_norm": 2.5701712889526362, "language_loss": 0.79178256, "learning_rate": 2.739340525026686e-06, "loss": 0.81341136, "num_input_tokens_seen": 142247660, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.80078125, "step": 6621, "time_per_iteration": 2.4799065589904785 }, { "auxiliary_loss_clip": 0.01125399, "auxiliary_loss_mlp": 0.01033647, "balance_loss_clip": 1.02015841, "balance_loss_mlp": 1.04651833, "epoch": 0.39813617916729294, "flos": 21141761339520.0, "grad_norm": 2.02013594801065, "language_loss": 0.78285563, "learning_rate": 2.738978637623252e-06, "loss": 0.8044461, "num_input_tokens_seen": 142266990, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7890625, "step": 6622, "time_per_iteration": 2.4970057010650635 }, { "auxiliary_loss_clip": 0.01127088, "auxiliary_loss_mlp": 0.01034787, "balance_loss_clip": 1.02077413, "balance_loss_mlp": 1.04632378, "epoch": 0.3981963024199609, "flos": 18988485753600.0, "grad_norm": 1.6708794548500603, "language_loss": 0.75005651, "learning_rate": 2.738616722197674e-06, "loss": 0.77167529, "num_input_tokens_seen": 142287170, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80859375, "step": 6623, "time_per_iteration": 2.5034074783325195 }, { "auxiliary_loss_clip": 0.0112706, "auxiliary_loss_mlp": 0.01036775, "balance_loss_clip": 1.02244556, "balance_loss_mlp": 1.04773319, "epoch": 0.39825642567262887, "flos": 16575108808320.0, "grad_norm": 9.711742695407521, "language_loss": 0.79647577, "learning_rate": 2.7382547787636766e-06, "loss": 0.8181141, "num_input_tokens_seen": 142305405, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.79296875, "step": 6624, "time_per_iteration": 2.5048985481262207 }, { "auxiliary_loss_clip": 0.01134562, "auxiliary_loss_mlp": 0.01046452, "balance_loss_clip": 1.02959609, "balance_loss_mlp": 1.049739, "epoch": 0.39831654892529683, "flos": 22199833290240.0, "grad_norm": 2.92523502057736, "language_loss": 0.83786476, "learning_rate": 2.7378928073349832e-06, "loss": 0.85967493, "num_input_tokens_seen": 142322710, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.84765625, "step": 6625, "time_per_iteration": 2.5012636184692383 }, { "auxiliary_loss_clip": 0.01124743, "auxiliary_loss_mlp": 0.01040111, "balance_loss_clip": 1.02547836, "balance_loss_mlp": 1.04533792, "epoch": 0.39837667217796485, "flos": 10487963612160.0, "grad_norm": 2.105189721904114, "language_loss": 0.87067044, "learning_rate": 2.737530807925321e-06, "loss": 0.89231896, "num_input_tokens_seen": 142338535, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.79296875, "step": 6626, "time_per_iteration": 2.4794998168945312 }, { "auxiliary_loss_clip": 0.01127453, "auxiliary_loss_mlp": 0.01035458, "balance_loss_clip": 1.02058053, "balance_loss_mlp": 1.04724693, "epoch": 0.3984367954306328, "flos": 17965282930560.0, "grad_norm": 2.3656937217264846, "language_loss": 0.83903074, "learning_rate": 2.737168780548417e-06, "loss": 0.86065984, "num_input_tokens_seen": 142354570, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80078125, "step": 6627, "time_per_iteration": 2.460364580154419 }, { "auxiliary_loss_clip": 0.01126307, "auxiliary_loss_mlp": 0.01038151, "balance_loss_clip": 1.02475178, "balance_loss_mlp": 1.04746842, "epoch": 0.3984969186833008, "flos": 22711057608960.0, "grad_norm": 2.005887858372334, "language_loss": 0.82812309, "learning_rate": 2.736806725217998e-06, "loss": 0.84976768, "num_input_tokens_seen": 142374395, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7890625, "step": 6628, "time_per_iteration": 2.5322139263153076 }, { "auxiliary_loss_clip": 0.01130157, "auxiliary_loss_mlp": 0.01045588, "balance_loss_clip": 1.03131247, "balance_loss_mlp": 1.04891348, "epoch": 0.39855704193596875, "flos": 23405785534080.0, "grad_norm": 1.7416412351589183, "language_loss": 0.71008003, "learning_rate": 2.7364446419477945e-06, "loss": 0.73183751, "num_input_tokens_seen": 142396040, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8125, "step": 6629, "time_per_iteration": 2.533010959625244 }, { "auxiliary_loss_clip": 0.01128865, "auxiliary_loss_mlp": 0.01035402, "balance_loss_clip": 1.02131164, "balance_loss_mlp": 1.05195069, "epoch": 0.3986171651886367, "flos": 21251935330560.0, "grad_norm": 2.0114654625437502, "language_loss": 0.80636674, "learning_rate": 2.7360825307515366e-06, "loss": 0.82800943, "num_input_tokens_seen": 142415495, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76953125, "step": 6630, "time_per_iteration": 2.5035696029663086 }, { "auxiliary_loss_clip": 0.0113071, "auxiliary_loss_mlp": 0.01030225, "balance_loss_clip": 1.01603341, "balance_loss_mlp": 1.04857242, "epoch": 0.3986772884413047, "flos": 12458705258880.0, "grad_norm": 2.3929390946407385, "language_loss": 0.75056559, "learning_rate": 2.7357203916429555e-06, "loss": 0.77217495, "num_input_tokens_seen": 142431865, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.8203125, "step": 6631, "time_per_iteration": 2.4578397274017334 }, { "auxiliary_loss_clip": 0.01131947, "auxiliary_loss_mlp": 0.01034003, "balance_loss_clip": 1.01902437, "balance_loss_mlp": 1.05034471, "epoch": 0.39873741169397264, "flos": 19646117907840.0, "grad_norm": 1.914190513341054, "language_loss": 0.71284747, "learning_rate": 2.735358224635783e-06, "loss": 0.73450696, "num_input_tokens_seen": 142450595, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8125, "step": 6632, "time_per_iteration": 2.5128164291381836 }, { "auxiliary_loss_clip": 0.01125997, "auxiliary_loss_mlp": 0.01037407, "balance_loss_clip": 1.02405012, "balance_loss_mlp": 1.04698348, "epoch": 0.3987975349466406, "flos": 21684766216320.0, "grad_norm": 1.9522665874762053, "language_loss": 0.74977726, "learning_rate": 2.7349960297437533e-06, "loss": 0.7714113, "num_input_tokens_seen": 142466650, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7890625, "step": 6633, "time_per_iteration": 2.477025270462036 }, { "auxiliary_loss_clip": 0.01130288, "auxiliary_loss_mlp": 0.01026583, "balance_loss_clip": 1.01342821, "balance_loss_mlp": 1.04914641, "epoch": 0.3988576581993086, "flos": 23914064937600.0, "grad_norm": 1.6887590848636402, "language_loss": 0.81243861, "learning_rate": 2.7346338069806e-06, "loss": 0.83400732, "num_input_tokens_seen": 142486165, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.8125, "step": 6634, "time_per_iteration": 3.9975669384002686 }, { "auxiliary_loss_clip": 0.01131078, "auxiliary_loss_mlp": 0.01029355, "balance_loss_clip": 1.01489532, "balance_loss_mlp": 1.05051148, "epoch": 0.39891778145197654, "flos": 18149899858560.0, "grad_norm": 2.2730147999639465, "language_loss": 0.75558275, "learning_rate": 2.7342715563600597e-06, "loss": 0.77718705, "num_input_tokens_seen": 142505035, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8046875, "step": 6635, "time_per_iteration": 2.4838755130767822 }, { "auxiliary_loss_clip": 0.01137195, "auxiliary_loss_mlp": 0.01036341, "balance_loss_clip": 1.0202719, "balance_loss_mlp": 1.05126274, "epoch": 0.3989779047046445, "flos": 22595281096320.0, "grad_norm": 2.4045852680101, "language_loss": 0.66179907, "learning_rate": 2.733909277895868e-06, "loss": 0.6835345, "num_input_tokens_seen": 142521870, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.859375, "step": 6636, "time_per_iteration": 2.491922616958618 }, { "auxiliary_loss_clip": 0.01129147, "auxiliary_loss_mlp": 0.01032861, "balance_loss_clip": 1.01910985, "balance_loss_mlp": 1.0493089, "epoch": 0.39903802795731247, "flos": 18077216688000.0, "grad_norm": 11.470250662730308, "language_loss": 0.81819892, "learning_rate": 2.733546971601763e-06, "loss": 0.83981901, "num_input_tokens_seen": 142540455, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.796875, "step": 6637, "time_per_iteration": 3.9597437381744385 }, { "auxiliary_loss_clip": 0.01050637, "auxiliary_loss_mlp": 0.01002353, "balance_loss_clip": 1.00018334, "balance_loss_mlp": 1.0230329, "epoch": 0.39909815120998043, "flos": 70441367771520.0, "grad_norm": 0.7178678013445059, "language_loss": 0.53223705, "learning_rate": 2.733184637491484e-06, "loss": 0.55276704, "num_input_tokens_seen": 142599665, "router_z_loss_clip": 0.02172852, "router_z_loss_mlp": 0.27734375, "step": 6638, "time_per_iteration": 4.6114325523376465 }, { "auxiliary_loss_clip": 0.01130798, "auxiliary_loss_mlp": 0.01036166, "balance_loss_clip": 1.02235603, "balance_loss_mlp": 1.04968095, "epoch": 0.39915827446264845, "flos": 18549262247040.0, "grad_norm": 1.4630727855158594, "language_loss": 0.75506139, "learning_rate": 2.732822275578769e-06, "loss": 0.77673107, "num_input_tokens_seen": 142618845, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.8125, "step": 6639, "time_per_iteration": 2.483856678009033 }, { "auxiliary_loss_clip": 0.01127127, "auxiliary_loss_mlp": 0.01032066, "balance_loss_clip": 1.01843441, "balance_loss_mlp": 1.0489707, "epoch": 0.3992183977153164, "flos": 29897249195520.0, "grad_norm": 2.579945363236524, "language_loss": 0.75976044, "learning_rate": 2.7324598858773603e-06, "loss": 0.78135234, "num_input_tokens_seen": 142640885, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78125, "step": 6640, "time_per_iteration": 2.5755209922790527 }, { "auxiliary_loss_clip": 0.01133445, "auxiliary_loss_mlp": 0.01037342, "balance_loss_clip": 1.02340603, "balance_loss_mlp": 1.05078351, "epoch": 0.3992785209679844, "flos": 22565080736640.0, "grad_norm": 2.142166891970981, "language_loss": 0.8198657, "learning_rate": 2.7320974684009996e-06, "loss": 0.8415736, "num_input_tokens_seen": 142659340, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.828125, "step": 6641, "time_per_iteration": 2.501847267150879 }, { "auxiliary_loss_clip": 0.01132291, "auxiliary_loss_mlp": 0.01033293, "balance_loss_clip": 1.0187676, "balance_loss_mlp": 1.05061054, "epoch": 0.39933864422065235, "flos": 19682674974720.0, "grad_norm": 1.999023246188291, "language_loss": 0.76990294, "learning_rate": 2.7317350231634288e-06, "loss": 0.79155874, "num_input_tokens_seen": 142677085, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.81640625, "step": 6642, "time_per_iteration": 2.534705638885498 }, { "auxiliary_loss_clip": 0.01129781, "auxiliary_loss_mlp": 0.01035091, "balance_loss_clip": 1.02057695, "balance_loss_mlp": 1.04868126, "epoch": 0.3993987674733203, "flos": 23038491012480.0, "grad_norm": 2.11612331347892, "language_loss": 0.7260704, "learning_rate": 2.731372550178393e-06, "loss": 0.74771917, "num_input_tokens_seen": 142694595, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8125, "step": 6643, "time_per_iteration": 2.4898619651794434 }, { "auxiliary_loss_clip": 0.01132667, "auxiliary_loss_mlp": 0.01035369, "balance_loss_clip": 1.02111197, "balance_loss_mlp": 1.05081093, "epoch": 0.3994588907259883, "flos": 19390828970880.0, "grad_norm": 1.529704182075678, "language_loss": 0.66362441, "learning_rate": 2.7310100494596375e-06, "loss": 0.68530476, "num_input_tokens_seen": 142714175, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.81640625, "step": 6644, "time_per_iteration": 2.5251429080963135 }, { "auxiliary_loss_clip": 0.01128026, "auxiliary_loss_mlp": 0.01035209, "balance_loss_clip": 1.02060556, "balance_loss_mlp": 1.04662657, "epoch": 0.39951901397865625, "flos": 13734395758080.0, "grad_norm": 1.9421197510496997, "language_loss": 0.78174245, "learning_rate": 2.730647521020907e-06, "loss": 0.80337483, "num_input_tokens_seen": 142730955, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8125, "step": 6645, "time_per_iteration": 2.465117931365967 }, { "auxiliary_loss_clip": 0.01132775, "auxiliary_loss_mlp": 0.01031314, "balance_loss_clip": 1.01719999, "balance_loss_mlp": 1.05019689, "epoch": 0.3995791372313242, "flos": 23586451966080.0, "grad_norm": 4.553852429006499, "language_loss": 0.69887763, "learning_rate": 2.73028496487595e-06, "loss": 0.72051853, "num_input_tokens_seen": 142751200, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.82421875, "step": 6646, "time_per_iteration": 2.543076276779175 }, { "auxiliary_loss_clip": 0.01130815, "auxiliary_loss_mlp": 0.0103682, "balance_loss_clip": 1.02262211, "balance_loss_mlp": 1.04834735, "epoch": 0.3996392604839922, "flos": 21355896268800.0, "grad_norm": 2.286023369779209, "language_loss": 0.71592885, "learning_rate": 2.729922381038513e-06, "loss": 0.73760521, "num_input_tokens_seen": 142770170, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.828125, "step": 6647, "time_per_iteration": 2.5052473545074463 }, { "auxiliary_loss_clip": 0.01123105, "auxiliary_loss_mlp": 0.01038986, "balance_loss_clip": 1.02620697, "balance_loss_mlp": 1.04623234, "epoch": 0.39969938373666014, "flos": 26032255914240.0, "grad_norm": 1.652037457557452, "language_loss": 0.74534678, "learning_rate": 2.7295597695223463e-06, "loss": 0.76696771, "num_input_tokens_seen": 142792680, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.765625, "step": 6648, "time_per_iteration": 2.5534956455230713 }, { "auxiliary_loss_clip": 0.01131234, "auxiliary_loss_mlp": 0.01035524, "balance_loss_clip": 1.02026546, "balance_loss_mlp": 1.04922795, "epoch": 0.3997595069893281, "flos": 20116367786880.0, "grad_norm": 2.0621033618673743, "language_loss": 0.65799475, "learning_rate": 2.7291971303412006e-06, "loss": 0.67966235, "num_input_tokens_seen": 142810510, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 6649, "time_per_iteration": 2.480271339416504 }, { "auxiliary_loss_clip": 0.01132255, "auxiliary_loss_mlp": 0.01034428, "balance_loss_clip": 1.02055788, "balance_loss_mlp": 1.05003548, "epoch": 0.39981963024199607, "flos": 27783403764480.0, "grad_norm": 2.7055118426670384, "language_loss": 0.75535196, "learning_rate": 2.728834463508826e-06, "loss": 0.77701879, "num_input_tokens_seen": 142832455, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.8203125, "step": 6650, "time_per_iteration": 2.5719690322875977 }, { "auxiliary_loss_clip": 0.01129316, "auxiliary_loss_mlp": 0.01038174, "balance_loss_clip": 1.02388096, "balance_loss_mlp": 1.04834044, "epoch": 0.39987975349466404, "flos": 21944436612480.0, "grad_norm": 1.5853150614600813, "language_loss": 0.71844065, "learning_rate": 2.728471769038975e-06, "loss": 0.74011552, "num_input_tokens_seen": 142852590, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8125, "step": 6651, "time_per_iteration": 2.501339912414551 }, { "auxiliary_loss_clip": 0.01130734, "auxiliary_loss_mlp": 0.01039357, "balance_loss_clip": 1.0253973, "balance_loss_mlp": 1.04819691, "epoch": 0.39993987674733206, "flos": 20704405340160.0, "grad_norm": 1.7646325084962067, "language_loss": 0.73374569, "learning_rate": 2.728109046945403e-06, "loss": 0.75544667, "num_input_tokens_seen": 142870595, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.82421875, "step": 6652, "time_per_iteration": 2.5021145343780518 }, { "auxiliary_loss_clip": 0.01049636, "auxiliary_loss_mlp": 0.01002342, "balance_loss_clip": 1.00043476, "balance_loss_mlp": 1.0219419, "epoch": 0.4, "flos": 61525429862400.0, "grad_norm": 0.9661555436313137, "language_loss": 0.60678077, "learning_rate": 2.727746297241862e-06, "loss": 0.62730056, "num_input_tokens_seen": 142925805, "router_z_loss_clip": 0.01904297, "router_z_loss_mlp": 0.27734375, "step": 6653, "time_per_iteration": 3.01009202003479 }, { "auxiliary_loss_clip": 0.01128427, "auxiliary_loss_mlp": 0.01036972, "balance_loss_clip": 1.02306008, "balance_loss_mlp": 1.05064297, "epoch": 0.400060123252668, "flos": 14502309644160.0, "grad_norm": 2.278693063735514, "language_loss": 0.66822249, "learning_rate": 2.7273835199421085e-06, "loss": 0.68987644, "num_input_tokens_seen": 142943145, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.77734375, "step": 6654, "time_per_iteration": 2.5210258960723877 }, { "auxiliary_loss_clip": 0.0112916, "auxiliary_loss_mlp": 0.01043153, "balance_loss_clip": 1.03079128, "balance_loss_mlp": 1.04886746, "epoch": 0.40012024650533595, "flos": 19093308618240.0, "grad_norm": 2.053311929539133, "language_loss": 0.90024024, "learning_rate": 2.7270207150599e-06, "loss": 0.92196339, "num_input_tokens_seen": 142956925, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.8046875, "step": 6655, "time_per_iteration": 2.47299861907959 }, { "auxiliary_loss_clip": 0.01127016, "auxiliary_loss_mlp": 0.01034014, "balance_loss_clip": 1.02172995, "balance_loss_mlp": 1.05021727, "epoch": 0.4001803697580039, "flos": 29351012094720.0, "grad_norm": 1.55272279769937, "language_loss": 0.73341858, "learning_rate": 2.7266578826089917e-06, "loss": 0.75502896, "num_input_tokens_seen": 142978040, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.765625, "step": 6656, "time_per_iteration": 2.5545547008514404 }, { "auxiliary_loss_clip": 0.01127939, "auxiliary_loss_mlp": 0.01041821, "balance_loss_clip": 1.02704549, "balance_loss_mlp": 1.04668379, "epoch": 0.4002404930106719, "flos": 20920048640640.0, "grad_norm": 1.7709934165487478, "language_loss": 0.73936296, "learning_rate": 2.726295022603144e-06, "loss": 0.76106054, "num_input_tokens_seen": 142998390, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8125, "step": 6657, "time_per_iteration": 2.5988388061523438 }, { "auxiliary_loss_clip": 0.01132094, "auxiliary_loss_mlp": 0.01040205, "balance_loss_clip": 1.02485096, "balance_loss_mlp": 1.0496943, "epoch": 0.40030061626333985, "flos": 28405735827840.0, "grad_norm": 1.5938806572297615, "language_loss": 0.79657441, "learning_rate": 2.725932135056117e-06, "loss": 0.81829739, "num_input_tokens_seen": 143021505, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.82421875, "step": 6658, "time_per_iteration": 2.5946884155273438 }, { "auxiliary_loss_clip": 0.01128904, "auxiliary_loss_mlp": 0.01039963, "balance_loss_clip": 1.02578294, "balance_loss_mlp": 1.04756558, "epoch": 0.4003607395160078, "flos": 25921615046400.0, "grad_norm": 1.9949539484354775, "language_loss": 0.7707963, "learning_rate": 2.72556921998167e-06, "loss": 0.792485, "num_input_tokens_seen": 143041375, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.8125, "step": 6659, "time_per_iteration": 2.568394422531128 }, { "auxiliary_loss_clip": 0.01119295, "auxiliary_loss_mlp": 0.01030707, "balance_loss_clip": 1.01884556, "balance_loss_mlp": 1.04507303, "epoch": 0.4004208627686758, "flos": 20768648814720.0, "grad_norm": 2.7572616152109797, "language_loss": 0.72642004, "learning_rate": 2.7252062773935662e-06, "loss": 0.74792004, "num_input_tokens_seen": 143058725, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.7421875, "step": 6660, "time_per_iteration": 2.484734058380127 }, { "auxiliary_loss_clip": 0.01128421, "auxiliary_loss_mlp": 0.01041265, "balance_loss_clip": 1.02821779, "balance_loss_mlp": 1.04731464, "epoch": 0.40048098602134374, "flos": 24681224638080.0, "grad_norm": 2.371139439372665, "language_loss": 0.7130031, "learning_rate": 2.7248433073055674e-06, "loss": 0.73469996, "num_input_tokens_seen": 143076995, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.8125, "step": 6661, "time_per_iteration": 2.5357768535614014 }, { "auxiliary_loss_clip": 0.01132287, "auxiliary_loss_mlp": 0.0104284, "balance_loss_clip": 1.02858233, "balance_loss_mlp": 1.04999495, "epoch": 0.4005411092740117, "flos": 23185688947200.0, "grad_norm": 2.069508733515309, "language_loss": 0.75598323, "learning_rate": 2.724480309731437e-06, "loss": 0.77773452, "num_input_tokens_seen": 143096780, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.82421875, "step": 6662, "time_per_iteration": 2.51125168800354 }, { "auxiliary_loss_clip": 0.01130772, "auxiliary_loss_mlp": 0.01033477, "balance_loss_clip": 1.01831341, "balance_loss_mlp": 1.04746938, "epoch": 0.4006012325266797, "flos": 17522324409600.0, "grad_norm": 2.490953524553618, "language_loss": 0.66215098, "learning_rate": 2.7241172846849417e-06, "loss": 0.68379343, "num_input_tokens_seen": 143112590, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.83203125, "step": 6663, "time_per_iteration": 2.4712696075439453 }, { "auxiliary_loss_clip": 0.01128668, "auxiliary_loss_mlp": 0.01036501, "balance_loss_clip": 1.02297711, "balance_loss_mlp": 1.04755521, "epoch": 0.40066135577934764, "flos": 19857200181120.0, "grad_norm": 3.3179020686636793, "language_loss": 0.85889494, "learning_rate": 2.7237542321798455e-06, "loss": 0.88054669, "num_input_tokens_seen": 143130220, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.8125, "step": 6664, "time_per_iteration": 2.466843843460083 }, { "auxiliary_loss_clip": 0.01130964, "auxiliary_loss_mlp": 0.01033718, "balance_loss_clip": 1.01915073, "balance_loss_mlp": 1.04973161, "epoch": 0.40072147903201566, "flos": 18150007599360.0, "grad_norm": 3.758400829960532, "language_loss": 0.84509218, "learning_rate": 2.723391152229917e-06, "loss": 0.86673898, "num_input_tokens_seen": 143147160, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8125, "step": 6665, "time_per_iteration": 2.478952169418335 }, { "auxiliary_loss_clip": 0.01131373, "auxiliary_loss_mlp": 0.01035876, "balance_loss_clip": 1.02072418, "balance_loss_mlp": 1.04908848, "epoch": 0.4007816022846836, "flos": 18661267831680.0, "grad_norm": 2.1955234549433778, "language_loss": 0.78628671, "learning_rate": 2.7230280448489236e-06, "loss": 0.8079592, "num_input_tokens_seen": 143164605, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.82421875, "step": 6666, "time_per_iteration": 2.4812674522399902 }, { "auxiliary_loss_clip": 0.01133362, "auxiliary_loss_mlp": 0.01035273, "balance_loss_clip": 1.01956165, "balance_loss_mlp": 1.05130732, "epoch": 0.4008417255373516, "flos": 25703170485120.0, "grad_norm": 1.8445706342110433, "language_loss": 0.73745883, "learning_rate": 2.7226649100506333e-06, "loss": 0.75914514, "num_input_tokens_seen": 143183965, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8203125, "step": 6667, "time_per_iteration": 2.5456128120422363 }, { "auxiliary_loss_clip": 0.0113494, "auxiliary_loss_mlp": 0.01048417, "balance_loss_clip": 1.03187108, "balance_loss_mlp": 1.05112374, "epoch": 0.40090184879001955, "flos": 22858614679680.0, "grad_norm": 1.4844307435088897, "language_loss": 0.75620735, "learning_rate": 2.7223017478488183e-06, "loss": 0.77804089, "num_input_tokens_seen": 143204965, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8359375, "step": 6668, "time_per_iteration": 2.516061544418335 }, { "auxiliary_loss_clip": 0.01133571, "auxiliary_loss_mlp": 0.01040353, "balance_loss_clip": 1.0264349, "balance_loss_mlp": 1.05420899, "epoch": 0.4009619720426875, "flos": 29059848449280.0, "grad_norm": 1.836546945849653, "language_loss": 0.81966639, "learning_rate": 2.721938558257248e-06, "loss": 0.84140563, "num_input_tokens_seen": 143225015, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.79296875, "step": 6669, "time_per_iteration": 2.6248574256896973 }, { "auxiliary_loss_clip": 0.01047719, "auxiliary_loss_mlp": 0.01011598, "balance_loss_clip": 1.00976181, "balance_loss_mlp": 1.02009177, "epoch": 0.4010220952953555, "flos": 66059763131520.0, "grad_norm": 0.7574620422497369, "language_loss": 0.53334248, "learning_rate": 2.721575341289695e-06, "loss": 0.55393565, "num_input_tokens_seen": 143294925, "router_z_loss_clip": 0.01831055, "router_z_loss_mlp": 0.27734375, "step": 6670, "time_per_iteration": 3.287757396697998 }, { "auxiliary_loss_clip": 0.01130677, "auxiliary_loss_mlp": 0.01038276, "balance_loss_clip": 1.02355325, "balance_loss_mlp": 1.05062914, "epoch": 0.40108221854802345, "flos": 29642822184960.0, "grad_norm": 1.8053772402033958, "language_loss": 0.88313103, "learning_rate": 2.7212120969599333e-06, "loss": 0.90482056, "num_input_tokens_seen": 143314170, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80078125, "step": 6671, "time_per_iteration": 2.5764424800872803 }, { "auxiliary_loss_clip": 0.01131536, "auxiliary_loss_mlp": 0.01035021, "balance_loss_clip": 1.01946998, "balance_loss_mlp": 1.05040264, "epoch": 0.4011423418006914, "flos": 19929560129280.0, "grad_norm": 1.9233970584636273, "language_loss": 0.78970599, "learning_rate": 2.720848825281736e-06, "loss": 0.81137156, "num_input_tokens_seen": 143330050, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8125, "step": 6672, "time_per_iteration": 2.495666027069092 }, { "auxiliary_loss_clip": 0.01128657, "auxiliary_loss_mlp": 0.01031929, "balance_loss_clip": 1.0177964, "balance_loss_mlp": 1.0498116, "epoch": 0.4012024650533594, "flos": 20084299920000.0, "grad_norm": 2.403880745072225, "language_loss": 0.63820124, "learning_rate": 2.72048552626888e-06, "loss": 0.65980721, "num_input_tokens_seen": 143348650, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7890625, "step": 6673, "time_per_iteration": 2.5107245445251465 }, { "auxiliary_loss_clip": 0.01131177, "auxiliary_loss_mlp": 0.0103609, "balance_loss_clip": 1.02164209, "balance_loss_mlp": 1.05048335, "epoch": 0.40126258830602735, "flos": 21695719864320.0, "grad_norm": 1.5772683596391517, "language_loss": 0.80355346, "learning_rate": 2.7201221999351402e-06, "loss": 0.82522619, "num_input_tokens_seen": 143370275, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80859375, "step": 6674, "time_per_iteration": 3.9998202323913574 }, { "auxiliary_loss_clip": 0.01134976, "auxiliary_loss_mlp": 0.01037811, "balance_loss_clip": 1.0233624, "balance_loss_mlp": 1.05126739, "epoch": 0.4013227115586953, "flos": 12020379592320.0, "grad_norm": 2.552618087067809, "language_loss": 0.82484645, "learning_rate": 2.719758846294294e-06, "loss": 0.84657431, "num_input_tokens_seen": 143385390, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8359375, "step": 6675, "time_per_iteration": 2.45489239692688 }, { "auxiliary_loss_clip": 0.01130933, "auxiliary_loss_mlp": 0.01038347, "balance_loss_clip": 1.02221811, "balance_loss_mlp": 1.05056953, "epoch": 0.4013828348113633, "flos": 25447522412160.0, "grad_norm": 1.923190557691957, "language_loss": 0.93451118, "learning_rate": 2.71939546536012e-06, "loss": 0.95620394, "num_input_tokens_seen": 143404215, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8046875, "step": 6676, "time_per_iteration": 2.54923677444458 }, { "auxiliary_loss_clip": 0.01136081, "auxiliary_loss_mlp": 0.01037508, "balance_loss_clip": 1.0206635, "balance_loss_mlp": 1.05000639, "epoch": 0.40144295806403124, "flos": 18582946225920.0, "grad_norm": 4.317514243610054, "language_loss": 0.79250598, "learning_rate": 2.719032057146399e-06, "loss": 0.81424183, "num_input_tokens_seen": 143422245, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.859375, "step": 6677, "time_per_iteration": 2.458526372909546 }, { "auxiliary_loss_clip": 0.01134376, "auxiliary_loss_mlp": 0.01039634, "balance_loss_clip": 1.02554333, "balance_loss_mlp": 1.05388892, "epoch": 0.4015030813166992, "flos": 22930220442240.0, "grad_norm": 1.96673397610214, "language_loss": 0.838018, "learning_rate": 2.71866862166691e-06, "loss": 0.85975808, "num_input_tokens_seen": 143443130, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8046875, "step": 6678, "time_per_iteration": 3.9738175868988037 }, { "auxiliary_loss_clip": 0.01129992, "auxiliary_loss_mlp": 0.01045273, "balance_loss_clip": 1.03058624, "balance_loss_mlp": 1.05056095, "epoch": 0.4015632045693672, "flos": 20595057361920.0, "grad_norm": 2.2771484199221503, "language_loss": 0.63888299, "learning_rate": 2.718305158935434e-06, "loss": 0.66063559, "num_input_tokens_seen": 143461385, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.79296875, "step": 6679, "time_per_iteration": 3.937904119491577 }, { "auxiliary_loss_clip": 0.01126662, "auxiliary_loss_mlp": 0.01033444, "balance_loss_clip": 1.01935911, "balance_loss_mlp": 1.0481447, "epoch": 0.4016233278220352, "flos": 23438930808960.0, "grad_norm": 1.5819459976408523, "language_loss": 0.78520328, "learning_rate": 2.7179416689657554e-06, "loss": 0.80680436, "num_input_tokens_seen": 143481750, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78515625, "step": 6680, "time_per_iteration": 2.518918752670288 }, { "auxiliary_loss_clip": 0.01137587, "auxiliary_loss_mlp": 0.01047484, "balance_loss_clip": 1.03161728, "balance_loss_mlp": 1.0522089, "epoch": 0.40168345107470316, "flos": 21431057477760.0, "grad_norm": 1.803051616432426, "language_loss": 0.75636947, "learning_rate": 2.7175781517716556e-06, "loss": 0.77822012, "num_input_tokens_seen": 143501540, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.85546875, "step": 6681, "time_per_iteration": 2.5059261322021484 }, { "auxiliary_loss_clip": 0.01131704, "auxiliary_loss_mlp": 0.01033704, "balance_loss_clip": 1.01946998, "balance_loss_mlp": 1.04987741, "epoch": 0.4017435743273711, "flos": 22857214049280.0, "grad_norm": 2.2052281169905164, "language_loss": 0.64243352, "learning_rate": 2.7172146073669213e-06, "loss": 0.66408753, "num_input_tokens_seen": 143520530, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.81640625, "step": 6682, "time_per_iteration": 2.5458784103393555 }, { "auxiliary_loss_clip": 0.01135128, "auxiliary_loss_mlp": 0.01037798, "balance_loss_clip": 1.02320647, "balance_loss_mlp": 1.05230868, "epoch": 0.4018036975800391, "flos": 28622312881920.0, "grad_norm": 1.9523244416402472, "language_loss": 0.7263779, "learning_rate": 2.716851035765337e-06, "loss": 0.74810719, "num_input_tokens_seen": 143540210, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.828125, "step": 6683, "time_per_iteration": 2.561082601547241 }, { "auxiliary_loss_clip": 0.01131266, "auxiliary_loss_mlp": 0.01045046, "balance_loss_clip": 1.0300374, "balance_loss_mlp": 1.05018926, "epoch": 0.40186382083270705, "flos": 26651212099200.0, "grad_norm": 2.3242139038617458, "language_loss": 0.73028266, "learning_rate": 2.7164874369806896e-06, "loss": 0.75204575, "num_input_tokens_seen": 143560940, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8125, "step": 6684, "time_per_iteration": 2.5371577739715576 }, { "auxiliary_loss_clip": 0.01051143, "auxiliary_loss_mlp": 0.01004228, "balance_loss_clip": 1.00246358, "balance_loss_mlp": 1.0232507, "epoch": 0.401923944085375, "flos": 59259969123840.0, "grad_norm": 0.9997654251824482, "language_loss": 0.6039046, "learning_rate": 2.716123811026767e-06, "loss": 0.62445831, "num_input_tokens_seen": 143624015, "router_z_loss_clip": 0.0177002, "router_z_loss_mlp": 0.27929688, "step": 6685, "time_per_iteration": 3.234727144241333 }, { "auxiliary_loss_clip": 0.01135101, "auxiliary_loss_mlp": 0.01034536, "balance_loss_clip": 1.01967025, "balance_loss_mlp": 1.05117011, "epoch": 0.401984067338043, "flos": 16982803152000.0, "grad_norm": 1.9585365509634185, "language_loss": 0.69904977, "learning_rate": 2.715760157917357e-06, "loss": 0.72074616, "num_input_tokens_seen": 143642750, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8359375, "step": 6686, "time_per_iteration": 2.479275941848755 }, { "auxiliary_loss_clip": 0.01130152, "auxiliary_loss_mlp": 0.01035499, "balance_loss_clip": 1.02139628, "balance_loss_mlp": 1.04994512, "epoch": 0.40204419059071095, "flos": 24972496024320.0, "grad_norm": 1.5940156021761547, "language_loss": 0.74577785, "learning_rate": 2.7153964776662504e-06, "loss": 0.76743436, "num_input_tokens_seen": 143664515, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80078125, "step": 6687, "time_per_iteration": 2.5504419803619385 }, { "auxiliary_loss_clip": 0.01133605, "auxiliary_loss_mlp": 0.01038935, "balance_loss_clip": 1.02412939, "balance_loss_mlp": 1.05155265, "epoch": 0.4021043138433789, "flos": 23477463123840.0, "grad_norm": 1.802641353196968, "language_loss": 0.71083635, "learning_rate": 2.7150327702872385e-06, "loss": 0.73256177, "num_input_tokens_seen": 143683135, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 6688, "time_per_iteration": 2.5612878799438477 }, { "auxiliary_loss_clip": 0.0113471, "auxiliary_loss_mlp": 0.01038839, "balance_loss_clip": 1.02344942, "balance_loss_mlp": 1.05006003, "epoch": 0.4021644370960469, "flos": 25995806588160.0, "grad_norm": 1.9140962091419669, "language_loss": 0.64375854, "learning_rate": 2.7146690357941112e-06, "loss": 0.66549408, "num_input_tokens_seen": 143703985, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84765625, "step": 6689, "time_per_iteration": 2.5656702518463135 }, { "auxiliary_loss_clip": 0.01133683, "auxiliary_loss_mlp": 0.0102923, "balance_loss_clip": 1.01438224, "balance_loss_mlp": 1.05017853, "epoch": 0.40222456034871484, "flos": 13587987922560.0, "grad_norm": 2.217247379983179, "language_loss": 0.73366976, "learning_rate": 2.7143052742006632e-06, "loss": 0.75529885, "num_input_tokens_seen": 143719245, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8359375, "step": 6690, "time_per_iteration": 2.4705538749694824 }, { "auxiliary_loss_clip": 0.01127985, "auxiliary_loss_mlp": 0.01028314, "balance_loss_clip": 1.01386571, "balance_loss_mlp": 1.04696834, "epoch": 0.4022846836013828, "flos": 24278019494400.0, "grad_norm": 1.5575129834627244, "language_loss": 0.74799436, "learning_rate": 2.7139414855206872e-06, "loss": 0.76955742, "num_input_tokens_seen": 143739575, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80859375, "step": 6691, "time_per_iteration": 2.5773661136627197 }, { "auxiliary_loss_clip": 0.01132709, "auxiliary_loss_mlp": 0.01038004, "balance_loss_clip": 1.02325225, "balance_loss_mlp": 1.05059457, "epoch": 0.40234480685405083, "flos": 20151596050560.0, "grad_norm": 1.6002484746635244, "language_loss": 0.72385883, "learning_rate": 2.7135776697679785e-06, "loss": 0.74556601, "num_input_tokens_seen": 143758515, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8203125, "step": 6692, "time_per_iteration": 2.5135231018066406 }, { "auxiliary_loss_clip": 0.01130429, "auxiliary_loss_mlp": 0.01036607, "balance_loss_clip": 1.02177787, "balance_loss_mlp": 1.04818654, "epoch": 0.4024049301067188, "flos": 22930220442240.0, "grad_norm": 2.9648841516025635, "language_loss": 0.84323597, "learning_rate": 2.7132138269563333e-06, "loss": 0.86490631, "num_input_tokens_seen": 143776770, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8203125, "step": 6693, "time_per_iteration": 2.5202653408050537 }, { "auxiliary_loss_clip": 0.01132934, "auxiliary_loss_mlp": 0.01040586, "balance_loss_clip": 1.02612627, "balance_loss_mlp": 1.05076981, "epoch": 0.40246505335938676, "flos": 36028421487360.0, "grad_norm": 1.8936456594170321, "language_loss": 0.70791507, "learning_rate": 2.7128499570995483e-06, "loss": 0.72965032, "num_input_tokens_seen": 143798450, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.82421875, "step": 6694, "time_per_iteration": 2.6498448848724365 }, { "auxiliary_loss_clip": 0.01130674, "auxiliary_loss_mlp": 0.01037086, "balance_loss_clip": 1.02203, "balance_loss_mlp": 1.04952741, "epoch": 0.4025251766120547, "flos": 20594303176320.0, "grad_norm": 2.2632210903588126, "language_loss": 0.67900729, "learning_rate": 2.7124860602114212e-06, "loss": 0.70068491, "num_input_tokens_seen": 143816995, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8125, "step": 6695, "time_per_iteration": 2.5044920444488525 }, { "auxiliary_loss_clip": 0.01129067, "auxiliary_loss_mlp": 0.01036472, "balance_loss_clip": 1.02195239, "balance_loss_mlp": 1.04721379, "epoch": 0.4025852998647227, "flos": 64523932381440.0, "grad_norm": 2.314872672776118, "language_loss": 0.79673809, "learning_rate": 2.7121221363057515e-06, "loss": 0.81839359, "num_input_tokens_seen": 143842090, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8203125, "step": 6696, "time_per_iteration": 2.92484712600708 }, { "auxiliary_loss_clip": 0.01133051, "auxiliary_loss_mlp": 0.01039896, "balance_loss_clip": 1.02485216, "balance_loss_mlp": 1.05050445, "epoch": 0.40264542311739066, "flos": 20886292834560.0, "grad_norm": 2.4689896480804934, "language_loss": 0.70786941, "learning_rate": 2.7117581853963393e-06, "loss": 0.72959888, "num_input_tokens_seen": 143860800, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.828125, "step": 6697, "time_per_iteration": 2.5429282188415527 }, { "auxiliary_loss_clip": 0.01127725, "auxiliary_loss_mlp": 0.01036191, "balance_loss_clip": 1.02244043, "balance_loss_mlp": 1.04889631, "epoch": 0.4027055463700586, "flos": 26250197685120.0, "grad_norm": 2.718234424412734, "language_loss": 0.62221813, "learning_rate": 2.711394207496984e-06, "loss": 0.64385736, "num_input_tokens_seen": 143878950, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7890625, "step": 6698, "time_per_iteration": 2.5527303218841553 }, { "auxiliary_loss_clip": 0.01131912, "auxiliary_loss_mlp": 0.01033728, "balance_loss_clip": 1.01879096, "balance_loss_mlp": 1.04975462, "epoch": 0.4027656696227266, "flos": 20631398947200.0, "grad_norm": 1.9058673647649296, "language_loss": 0.76796263, "learning_rate": 2.711030202621491e-06, "loss": 0.78961903, "num_input_tokens_seen": 143898385, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8203125, "step": 6699, "time_per_iteration": 2.511817455291748 }, { "auxiliary_loss_clip": 0.01125835, "auxiliary_loss_mlp": 0.01033611, "balance_loss_clip": 1.01925266, "balance_loss_mlp": 1.0479672, "epoch": 0.40282579287539455, "flos": 22346277039360.0, "grad_norm": 2.1223810755041406, "language_loss": 0.80050516, "learning_rate": 2.7106661707836605e-06, "loss": 0.82209969, "num_input_tokens_seen": 143918795, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.78125, "step": 6700, "time_per_iteration": 2.5318615436553955 }, { "auxiliary_loss_clip": 0.01131864, "auxiliary_loss_mlp": 0.01039137, "balance_loss_clip": 1.02323437, "balance_loss_mlp": 1.04684472, "epoch": 0.4028859161280625, "flos": 29274988959360.0, "grad_norm": 2.1565806238609304, "language_loss": 0.74944061, "learning_rate": 2.7103021119972977e-06, "loss": 0.77115059, "num_input_tokens_seen": 143938245, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8515625, "step": 6701, "time_per_iteration": 2.5688540935516357 }, { "auxiliary_loss_clip": 0.01127541, "auxiliary_loss_mlp": 0.01036271, "balance_loss_clip": 1.02273476, "balance_loss_mlp": 1.04751968, "epoch": 0.4029460393807305, "flos": 28622312881920.0, "grad_norm": 2.42465791693503, "language_loss": 0.65600073, "learning_rate": 2.709938026276208e-06, "loss": 0.67763883, "num_input_tokens_seen": 143960995, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.80078125, "step": 6702, "time_per_iteration": 2.587242603302002 }, { "auxiliary_loss_clip": 0.01130887, "auxiliary_loss_mlp": 0.01041103, "balance_loss_clip": 1.02585661, "balance_loss_mlp": 1.04861212, "epoch": 0.40300616263339845, "flos": 22601925112320.0, "grad_norm": 2.745791211450669, "language_loss": 0.65842879, "learning_rate": 2.7095739136341964e-06, "loss": 0.68014866, "num_input_tokens_seen": 143979910, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 6703, "time_per_iteration": 2.546567678451538 }, { "auxiliary_loss_clip": 0.01134181, "auxiliary_loss_mlp": 0.0103673, "balance_loss_clip": 1.02187657, "balance_loss_mlp": 1.05138111, "epoch": 0.4030662858860664, "flos": 25520313323520.0, "grad_norm": 1.888780205709071, "language_loss": 0.82365066, "learning_rate": 2.709209774085071e-06, "loss": 0.84535974, "num_input_tokens_seen": 144000095, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.828125, "step": 6704, "time_per_iteration": 2.56598162651062 }, { "auxiliary_loss_clip": 0.0112953, "auxiliary_loss_mlp": 0.01035966, "balance_loss_clip": 1.02165461, "balance_loss_mlp": 1.04629004, "epoch": 0.40312640913873443, "flos": 23586703361280.0, "grad_norm": 1.6683418044612917, "language_loss": 0.73283339, "learning_rate": 2.7088456076426407e-06, "loss": 0.75448835, "num_input_tokens_seen": 144019695, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.83203125, "step": 6705, "time_per_iteration": 2.5212793350219727 }, { "auxiliary_loss_clip": 0.01124611, "auxiliary_loss_mlp": 0.01033278, "balance_loss_clip": 1.01952076, "balance_loss_mlp": 1.04625738, "epoch": 0.4031865323914024, "flos": 20011042131840.0, "grad_norm": 1.66084744660736, "language_loss": 0.66177589, "learning_rate": 2.708481414320713e-06, "loss": 0.68335479, "num_input_tokens_seen": 144038525, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.78515625, "step": 6706, "time_per_iteration": 2.5133423805236816 }, { "auxiliary_loss_clip": 0.01128094, "auxiliary_loss_mlp": 0.01043046, "balance_loss_clip": 1.02826357, "balance_loss_mlp": 1.04743516, "epoch": 0.40324665564407036, "flos": 21871430219520.0, "grad_norm": 1.3536370817198606, "language_loss": 0.71121287, "learning_rate": 2.7081171941330992e-06, "loss": 0.73292428, "num_input_tokens_seen": 144059485, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8046875, "step": 6707, "time_per_iteration": 2.5068132877349854 }, { "auxiliary_loss_clip": 0.01121719, "auxiliary_loss_mlp": 0.01034851, "balance_loss_clip": 1.02008128, "balance_loss_mlp": 1.04567575, "epoch": 0.4033067788967383, "flos": 23878728933120.0, "grad_norm": 1.6378500403600904, "language_loss": 0.7998414, "learning_rate": 2.707752947093611e-06, "loss": 0.82140708, "num_input_tokens_seen": 144080265, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.76171875, "step": 6708, "time_per_iteration": 2.5660102367401123 }, { "auxiliary_loss_clip": 0.01129069, "auxiliary_loss_mlp": 0.01038928, "balance_loss_clip": 1.0243727, "balance_loss_mlp": 1.04464293, "epoch": 0.4033669021494063, "flos": 17419907756160.0, "grad_norm": 3.7018379435046596, "language_loss": 0.82537782, "learning_rate": 2.70738867321606e-06, "loss": 0.84705776, "num_input_tokens_seen": 144098040, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.84375, "step": 6709, "time_per_iteration": 2.463224172592163 }, { "auxiliary_loss_clip": 0.01132984, "auxiliary_loss_mlp": 0.01040458, "balance_loss_clip": 1.02552152, "balance_loss_mlp": 1.05113983, "epoch": 0.40342702540207426, "flos": 29600554855680.0, "grad_norm": 1.4984215979809177, "language_loss": 0.71445894, "learning_rate": 2.70702437251426e-06, "loss": 0.7361933, "num_input_tokens_seen": 144118265, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8203125, "step": 6710, "time_per_iteration": 2.5870018005371094 }, { "auxiliary_loss_clip": 0.01125947, "auxiliary_loss_mlp": 0.01036994, "balance_loss_clip": 1.02234864, "balance_loss_mlp": 1.04610109, "epoch": 0.4034871486547422, "flos": 11284605400320.0, "grad_norm": 2.021782958849316, "language_loss": 0.84470046, "learning_rate": 2.7066600450020236e-06, "loss": 0.86632985, "num_input_tokens_seen": 144133865, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.796875, "step": 6711, "time_per_iteration": 2.448277235031128 }, { "auxiliary_loss_clip": 0.01129138, "auxiliary_loss_mlp": 0.01038193, "balance_loss_clip": 1.02337539, "balance_loss_mlp": 1.04781437, "epoch": 0.4035472719074102, "flos": 15552839738880.0, "grad_norm": 2.4124634605168906, "language_loss": 0.76086962, "learning_rate": 2.706295690693168e-06, "loss": 0.78254294, "num_input_tokens_seen": 144150125, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8125, "step": 6712, "time_per_iteration": 2.456913948059082 }, { "auxiliary_loss_clip": 0.01130804, "auxiliary_loss_mlp": 0.01040339, "balance_loss_clip": 1.02671373, "balance_loss_mlp": 1.05027318, "epoch": 0.40360739516007815, "flos": 24674365140480.0, "grad_norm": 1.9747819101586197, "language_loss": 0.79466987, "learning_rate": 2.7059313096015096e-06, "loss": 0.81638134, "num_input_tokens_seen": 144169295, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.8046875, "step": 6713, "time_per_iteration": 2.499420642852783 }, { "auxiliary_loss_clip": 0.01129179, "auxiliary_loss_mlp": 0.01034317, "balance_loss_clip": 1.01995265, "balance_loss_mlp": 1.04703045, "epoch": 0.4036675184127461, "flos": 17304095329920.0, "grad_norm": 2.1466355481442685, "language_loss": 0.87841415, "learning_rate": 2.705566901740865e-06, "loss": 0.90004915, "num_input_tokens_seen": 144185790, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8203125, "step": 6714, "time_per_iteration": 2.5240392684936523 }, { "auxiliary_loss_clip": 0.01128978, "auxiliary_loss_mlp": 0.01044036, "balance_loss_clip": 1.02986765, "balance_loss_mlp": 1.04893935, "epoch": 0.4037276416654141, "flos": 19864023765120.0, "grad_norm": 1.706562408442288, "language_loss": 0.69374758, "learning_rate": 2.7052024671250527e-06, "loss": 0.71547771, "num_input_tokens_seen": 144205190, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.80078125, "step": 6715, "time_per_iteration": 2.481785535812378 }, { "auxiliary_loss_clip": 0.01131408, "auxiliary_loss_mlp": 0.01037812, "balance_loss_clip": 1.02294636, "balance_loss_mlp": 1.04733169, "epoch": 0.40378776491808205, "flos": 18296271780480.0, "grad_norm": 2.694662189892962, "language_loss": 0.77592582, "learning_rate": 2.704838005767892e-06, "loss": 0.79761803, "num_input_tokens_seen": 144222705, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.83984375, "step": 6716, "time_per_iteration": 3.9273054599761963 }, { "auxiliary_loss_clip": 0.01126934, "auxiliary_loss_mlp": 0.01038049, "balance_loss_clip": 1.02453065, "balance_loss_mlp": 1.04802644, "epoch": 0.40384788817075, "flos": 15049372757760.0, "grad_norm": 1.9606662078807457, "language_loss": 0.75735939, "learning_rate": 2.7044735176832037e-06, "loss": 0.77900922, "num_input_tokens_seen": 144239545, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7890625, "step": 6717, "time_per_iteration": 2.4709935188293457 }, { "auxiliary_loss_clip": 0.0105179, "auxiliary_loss_mlp": 0.0100935, "balance_loss_clip": 1.00772858, "balance_loss_mlp": 1.02449799, "epoch": 0.40390801142341803, "flos": 61929927895680.0, "grad_norm": 0.9404960960025921, "language_loss": 0.60745955, "learning_rate": 2.7041090028848084e-06, "loss": 0.62807095, "num_input_tokens_seen": 144288145, "router_z_loss_clip": 0.01623535, "router_z_loss_mlp": 0.2734375, "step": 6718, "time_per_iteration": 2.9730796813964844 }, { "auxiliary_loss_clip": 0.01132262, "auxiliary_loss_mlp": 0.01040594, "balance_loss_clip": 1.02448845, "balance_loss_mlp": 1.0474627, "epoch": 0.403968134676086, "flos": 22738779930240.0, "grad_norm": 2.5017433456688023, "language_loss": 0.74826229, "learning_rate": 2.7037444613865306e-06, "loss": 0.7699908, "num_input_tokens_seen": 144302315, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.84765625, "step": 6719, "time_per_iteration": 2.465923547744751 }, { "auxiliary_loss_clip": 0.01128808, "auxiliary_loss_mlp": 0.01038503, "balance_loss_clip": 1.02246976, "balance_loss_mlp": 1.04733527, "epoch": 0.40402825792875396, "flos": 19784409269760.0, "grad_norm": 2.1432413066722305, "language_loss": 0.81307077, "learning_rate": 2.7033798932021906e-06, "loss": 0.83474386, "num_input_tokens_seen": 144318990, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.81640625, "step": 6720, "time_per_iteration": 5.205447673797607 }, { "auxiliary_loss_clip": 0.01127675, "auxiliary_loss_mlp": 0.01031525, "balance_loss_clip": 1.01706481, "balance_loss_mlp": 1.04495502, "epoch": 0.40408838118142193, "flos": 19609273532160.0, "grad_norm": 1.8973446186352192, "language_loss": 0.7698462, "learning_rate": 2.7030152983456153e-06, "loss": 0.79143822, "num_input_tokens_seen": 144335765, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.828125, "step": 6721, "time_per_iteration": 3.9370827674865723 }, { "auxiliary_loss_clip": 0.01124091, "auxiliary_loss_mlp": 0.01031375, "balance_loss_clip": 1.01890552, "balance_loss_mlp": 1.0475415, "epoch": 0.4041485044340899, "flos": 24426043441920.0, "grad_norm": 1.837385656188429, "language_loss": 0.72690439, "learning_rate": 2.7026506768306304e-06, "loss": 0.74845898, "num_input_tokens_seen": 144355825, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.765625, "step": 6722, "time_per_iteration": 2.51737904548645 }, { "auxiliary_loss_clip": 0.01127844, "auxiliary_loss_mlp": 0.01031551, "balance_loss_clip": 1.01771104, "balance_loss_mlp": 1.04743981, "epoch": 0.40420862768675786, "flos": 16760192613120.0, "grad_norm": 2.003685562322026, "language_loss": 0.65513682, "learning_rate": 2.7022860286710602e-06, "loss": 0.67673075, "num_input_tokens_seen": 144374320, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.8046875, "step": 6723, "time_per_iteration": 2.453702926635742 }, { "auxiliary_loss_clip": 0.01130785, "auxiliary_loss_mlp": 0.01038745, "balance_loss_clip": 1.02343893, "balance_loss_mlp": 1.04803443, "epoch": 0.4042687509394258, "flos": 22491571553280.0, "grad_norm": 1.5146180019615532, "language_loss": 0.73432362, "learning_rate": 2.701921353880734e-06, "loss": 0.75601888, "num_input_tokens_seen": 144394325, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.828125, "step": 6724, "time_per_iteration": 2.5288069248199463 }, { "auxiliary_loss_clip": 0.01122214, "auxiliary_loss_mlp": 0.01030581, "balance_loss_clip": 1.0167408, "balance_loss_mlp": 1.04524362, "epoch": 0.4043288741920938, "flos": 30336149479680.0, "grad_norm": 1.8613754997924308, "language_loss": 0.75394082, "learning_rate": 2.7015566524734787e-06, "loss": 0.77546871, "num_input_tokens_seen": 144412765, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76953125, "step": 6725, "time_per_iteration": 2.5629236698150635 }, { "auxiliary_loss_clip": 0.01126697, "auxiliary_loss_mlp": 0.01033731, "balance_loss_clip": 1.01869261, "balance_loss_mlp": 1.04665256, "epoch": 0.40438899744476176, "flos": 46348321363200.0, "grad_norm": 1.623749758588319, "language_loss": 0.76549244, "learning_rate": 2.701191924463126e-06, "loss": 0.78709674, "num_input_tokens_seen": 144435400, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80078125, "step": 6726, "time_per_iteration": 2.7206273078918457 }, { "auxiliary_loss_clip": 0.01128747, "auxiliary_loss_mlp": 0.01034354, "balance_loss_clip": 1.01931024, "balance_loss_mlp": 1.04579282, "epoch": 0.4044491206974297, "flos": 13333524998400.0, "grad_norm": 2.4057085688960322, "language_loss": 0.81938541, "learning_rate": 2.7008271698635054e-06, "loss": 0.84101641, "num_input_tokens_seen": 144452925, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.828125, "step": 6727, "time_per_iteration": 2.4980416297912598 }, { "auxiliary_loss_clip": 0.01125714, "auxiliary_loss_mlp": 0.01034074, "balance_loss_clip": 1.0198288, "balance_loss_mlp": 1.04501176, "epoch": 0.4045092439500977, "flos": 12093745121280.0, "grad_norm": 4.764050141275163, "language_loss": 0.85491693, "learning_rate": 2.700462388688447e-06, "loss": 0.87651479, "num_input_tokens_seen": 144470195, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.80859375, "step": 6728, "time_per_iteration": 2.4386420249938965 }, { "auxiliary_loss_clip": 0.01129576, "auxiliary_loss_mlp": 0.01031408, "balance_loss_clip": 1.01713884, "balance_loss_mlp": 1.04863799, "epoch": 0.40456936720276565, "flos": 21179683123200.0, "grad_norm": 1.8499932710061386, "language_loss": 0.81865132, "learning_rate": 2.700097580951786e-06, "loss": 0.8402611, "num_input_tokens_seen": 144490320, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8125, "step": 6729, "time_per_iteration": 2.5086443424224854 }, { "auxiliary_loss_clip": 0.01127118, "auxiliary_loss_mlp": 0.0103697, "balance_loss_clip": 1.02227116, "balance_loss_mlp": 1.04696441, "epoch": 0.4046294904554336, "flos": 23915286000000.0, "grad_norm": 2.0753502862731965, "language_loss": 0.73697376, "learning_rate": 2.6997327466673533e-06, "loss": 0.75861466, "num_input_tokens_seen": 144508990, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8046875, "step": 6730, "time_per_iteration": 2.514350414276123 }, { "auxiliary_loss_clip": 0.01125866, "auxiliary_loss_mlp": 0.01035002, "balance_loss_clip": 1.01992202, "balance_loss_mlp": 1.04587364, "epoch": 0.4046896137081016, "flos": 38071235773440.0, "grad_norm": 2.1255547369665213, "language_loss": 0.67542243, "learning_rate": 2.699367885848985e-06, "loss": 0.69703102, "num_input_tokens_seen": 144529550, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80078125, "step": 6731, "time_per_iteration": 2.7154335975646973 }, { "auxiliary_loss_clip": 0.01125573, "auxiliary_loss_mlp": 0.0103207, "balance_loss_clip": 1.01871204, "balance_loss_mlp": 1.04572999, "epoch": 0.4047497369607696, "flos": 23617262856960.0, "grad_norm": 1.7688614448807853, "language_loss": 0.74135733, "learning_rate": 2.699002998510517e-06, "loss": 0.76293379, "num_input_tokens_seen": 144549310, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.796875, "step": 6732, "time_per_iteration": 2.5513877868652344 }, { "auxiliary_loss_clip": 0.0112623, "auxiliary_loss_mlp": 0.01031061, "balance_loss_clip": 1.01793623, "balance_loss_mlp": 1.04726934, "epoch": 0.40480986021343757, "flos": 12823593569280.0, "grad_norm": 1.7993608379793513, "language_loss": 0.77395022, "learning_rate": 2.6986380846657852e-06, "loss": 0.79552311, "num_input_tokens_seen": 144567430, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7890625, "step": 6733, "time_per_iteration": 2.4990711212158203 }, { "auxiliary_loss_clip": 0.01130084, "auxiliary_loss_mlp": 0.01039067, "balance_loss_clip": 1.02296209, "balance_loss_mlp": 1.04610157, "epoch": 0.40486998346610553, "flos": 23768770423680.0, "grad_norm": 2.0007539290798024, "language_loss": 0.77011061, "learning_rate": 2.698273144328627e-06, "loss": 0.79180205, "num_input_tokens_seen": 144585975, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.83984375, "step": 6734, "time_per_iteration": 2.4865474700927734 }, { "auxiliary_loss_clip": 0.01133601, "auxiliary_loss_mlp": 0.01033562, "balance_loss_clip": 1.01926303, "balance_loss_mlp": 1.04928637, "epoch": 0.4049301067187735, "flos": 22856818999680.0, "grad_norm": 3.6270403759766157, "language_loss": 0.65503359, "learning_rate": 2.6979081775128805e-06, "loss": 0.67670524, "num_input_tokens_seen": 144605225, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.84375, "step": 6735, "time_per_iteration": 2.50106143951416 }, { "auxiliary_loss_clip": 0.01123646, "auxiliary_loss_mlp": 0.01034338, "balance_loss_clip": 1.02114153, "balance_loss_mlp": 1.0451057, "epoch": 0.40499022997144146, "flos": 22783992174720.0, "grad_norm": 2.101240090922692, "language_loss": 0.82884824, "learning_rate": 2.697543184232387e-06, "loss": 0.8504281, "num_input_tokens_seen": 144624145, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.78515625, "step": 6736, "time_per_iteration": 2.4904160499572754 }, { "auxiliary_loss_clip": 0.0113144, "auxiliary_loss_mlp": 0.01040462, "balance_loss_clip": 1.02494657, "balance_loss_mlp": 1.04868293, "epoch": 0.4050503532241094, "flos": 23039352938880.0, "grad_norm": 1.5970210959941578, "language_loss": 0.74917579, "learning_rate": 2.6971781645009863e-06, "loss": 0.77089477, "num_input_tokens_seen": 144644470, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.828125, "step": 6737, "time_per_iteration": 2.5229125022888184 }, { "auxiliary_loss_clip": 0.01127011, "auxiliary_loss_mlp": 0.01043374, "balance_loss_clip": 1.02896786, "balance_loss_mlp": 1.04723477, "epoch": 0.4051104764767774, "flos": 16647756065280.0, "grad_norm": 2.1359842494176564, "language_loss": 0.72186023, "learning_rate": 2.696813118332519e-06, "loss": 0.74356413, "num_input_tokens_seen": 144661055, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.796875, "step": 6738, "time_per_iteration": 2.460209369659424 }, { "auxiliary_loss_clip": 0.01124303, "auxiliary_loss_mlp": 0.01034272, "balance_loss_clip": 1.02075422, "balance_loss_mlp": 1.04503632, "epoch": 0.40517059972944536, "flos": 16358962717440.0, "grad_norm": 2.4810533898659948, "language_loss": 0.74659067, "learning_rate": 2.696448045740828e-06, "loss": 0.76817644, "num_input_tokens_seen": 144677935, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.796875, "step": 6739, "time_per_iteration": 2.456984043121338 }, { "auxiliary_loss_clip": 0.01130259, "auxiliary_loss_mlp": 0.01035571, "balance_loss_clip": 1.02126598, "balance_loss_mlp": 1.04892612, "epoch": 0.4052307229821133, "flos": 28803374363520.0, "grad_norm": 1.8988162058057967, "language_loss": 0.7436856, "learning_rate": 2.6960829467397576e-06, "loss": 0.7653439, "num_input_tokens_seen": 144697725, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8125, "step": 6740, "time_per_iteration": 2.546104907989502 }, { "auxiliary_loss_clip": 0.01123674, "auxiliary_loss_mlp": 0.01034946, "balance_loss_clip": 1.02130842, "balance_loss_mlp": 1.04529691, "epoch": 0.4052908462347813, "flos": 21397876289280.0, "grad_norm": 1.6391426451955828, "language_loss": 0.77055669, "learning_rate": 2.695717821343153e-06, "loss": 0.79214293, "num_input_tokens_seen": 144718805, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78515625, "step": 6741, "time_per_iteration": 2.513343095779419 }, { "auxiliary_loss_clip": 0.01129046, "auxiliary_loss_mlp": 0.01042727, "balance_loss_clip": 1.0271703, "balance_loss_mlp": 1.04772079, "epoch": 0.40535096948744925, "flos": 22419067950720.0, "grad_norm": 1.7898982703393997, "language_loss": 0.71246517, "learning_rate": 2.6953526695648577e-06, "loss": 0.73418295, "num_input_tokens_seen": 144737105, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8125, "step": 6742, "time_per_iteration": 2.49592924118042 }, { "auxiliary_loss_clip": 0.01130673, "auxiliary_loss_mlp": 0.01032767, "balance_loss_clip": 1.01849163, "balance_loss_mlp": 1.0486542, "epoch": 0.4054110927401172, "flos": 17010776868480.0, "grad_norm": 2.3155786058531276, "language_loss": 0.72468907, "learning_rate": 2.6949874914187202e-06, "loss": 0.74632347, "num_input_tokens_seen": 144751350, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8203125, "step": 6743, "time_per_iteration": 2.4771440029144287 }, { "auxiliary_loss_clip": 0.01133968, "auxiliary_loss_mlp": 0.01035394, "balance_loss_clip": 1.02025437, "balance_loss_mlp": 1.05037451, "epoch": 0.4054712159927852, "flos": 21614848392960.0, "grad_norm": 2.9825662326277618, "language_loss": 0.70487022, "learning_rate": 2.694622286918588e-06, "loss": 0.72656381, "num_input_tokens_seen": 144770030, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8359375, "step": 6744, "time_per_iteration": 2.4832820892333984 }, { "auxiliary_loss_clip": 0.01127207, "auxiliary_loss_mlp": 0.01043977, "balance_loss_clip": 1.03008354, "balance_loss_mlp": 1.04805446, "epoch": 0.4055313392454532, "flos": 25812554376960.0, "grad_norm": 1.6653818963268152, "language_loss": 0.79877615, "learning_rate": 2.6942570560783076e-06, "loss": 0.82048798, "num_input_tokens_seen": 144790965, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.79296875, "step": 6745, "time_per_iteration": 2.5879111289978027 }, { "auxiliary_loss_clip": 0.0112759, "auxiliary_loss_mlp": 0.01036778, "balance_loss_clip": 1.02158439, "balance_loss_mlp": 1.04891229, "epoch": 0.40559146249812117, "flos": 14137098111360.0, "grad_norm": 1.942328597313641, "language_loss": 0.67305809, "learning_rate": 2.693891798911731e-06, "loss": 0.69470179, "num_input_tokens_seen": 144807755, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7890625, "step": 6746, "time_per_iteration": 2.4446821212768555 }, { "auxiliary_loss_clip": 0.01125579, "auxiliary_loss_mlp": 0.0102848, "balance_loss_clip": 1.01519465, "balance_loss_mlp": 1.04691994, "epoch": 0.40565158575078913, "flos": 41355481962240.0, "grad_norm": 1.6224874537020137, "language_loss": 0.57656032, "learning_rate": 2.6935265154327075e-06, "loss": 0.5981009, "num_input_tokens_seen": 144832405, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.78515625, "step": 6747, "time_per_iteration": 2.714744806289673 }, { "auxiliary_loss_clip": 0.01128021, "auxiliary_loss_mlp": 0.01042804, "balance_loss_clip": 1.02892256, "balance_loss_mlp": 1.04723501, "epoch": 0.4057117090034571, "flos": 28544529980160.0, "grad_norm": 1.7626383961672356, "language_loss": 0.84708822, "learning_rate": 2.693161205655089e-06, "loss": 0.86879653, "num_input_tokens_seen": 144853890, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.80859375, "step": 6748, "time_per_iteration": 2.567575216293335 }, { "auxiliary_loss_clip": 0.01131262, "auxiliary_loss_mlp": 0.01040725, "balance_loss_clip": 1.02596104, "balance_loss_mlp": 1.04911017, "epoch": 0.40577183225612506, "flos": 18004066640640.0, "grad_norm": 2.40880051885706, "language_loss": 0.81581581, "learning_rate": 2.6927958695927287e-06, "loss": 0.83753574, "num_input_tokens_seen": 144871395, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8203125, "step": 6749, "time_per_iteration": 2.4834861755371094 }, { "auxiliary_loss_clip": 0.01131288, "auxiliary_loss_mlp": 0.01042696, "balance_loss_clip": 1.02899313, "balance_loss_mlp": 1.05081725, "epoch": 0.40583195550879303, "flos": 19536734016000.0, "grad_norm": 1.8826014272391405, "language_loss": 0.75612545, "learning_rate": 2.6924305072594784e-06, "loss": 0.77786529, "num_input_tokens_seen": 144890975, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.8046875, "step": 6750, "time_per_iteration": 2.510969877243042 }, { "auxiliary_loss_clip": 0.01131782, "auxiliary_loss_mlp": 0.01037103, "balance_loss_clip": 1.02167749, "balance_loss_mlp": 1.04682446, "epoch": 0.405892078761461, "flos": 22309468577280.0, "grad_norm": 2.696835448227446, "language_loss": 0.73512554, "learning_rate": 2.692065118669195e-06, "loss": 0.75681442, "num_input_tokens_seen": 144908170, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.84765625, "step": 6751, "time_per_iteration": 2.5079476833343506 }, { "auxiliary_loss_clip": 0.01130778, "auxiliary_loss_mlp": 0.01039043, "balance_loss_clip": 1.02395082, "balance_loss_mlp": 1.04924345, "epoch": 0.40595220201412896, "flos": 25484402701440.0, "grad_norm": 1.843938636466987, "language_loss": 0.66580021, "learning_rate": 2.6916997038357326e-06, "loss": 0.68749845, "num_input_tokens_seen": 144928020, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.81640625, "step": 6752, "time_per_iteration": 2.5411264896392822 }, { "auxiliary_loss_clip": 0.01135771, "auxiliary_loss_mlp": 0.01039672, "balance_loss_clip": 1.02348304, "balance_loss_mlp": 1.05069506, "epoch": 0.4060123252667969, "flos": 49856004103680.0, "grad_norm": 2.0014988067172554, "language_loss": 0.70599544, "learning_rate": 2.691334262772948e-06, "loss": 0.72774988, "num_input_tokens_seen": 144951240, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.84765625, "step": 6753, "time_per_iteration": 2.750056028366089 }, { "auxiliary_loss_clip": 0.01129284, "auxiliary_loss_mlp": 0.01040713, "balance_loss_clip": 1.02472734, "balance_loss_mlp": 1.04612672, "epoch": 0.4060724485194649, "flos": 21135476459520.0, "grad_norm": 2.412362214237516, "language_loss": 0.72190732, "learning_rate": 2.690968795494699e-06, "loss": 0.74360728, "num_input_tokens_seen": 144969100, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83203125, "step": 6754, "time_per_iteration": 2.4759325981140137 }, { "auxiliary_loss_clip": 0.01131571, "auxiliary_loss_mlp": 0.01043274, "balance_loss_clip": 1.02833688, "balance_loss_mlp": 1.04850876, "epoch": 0.40613257177213286, "flos": 21758059918080.0, "grad_norm": 1.8754836718432393, "language_loss": 0.83119488, "learning_rate": 2.690603302014844e-06, "loss": 0.8529433, "num_input_tokens_seen": 144987065, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.83203125, "step": 6755, "time_per_iteration": 2.496760606765747 }, { "auxiliary_loss_clip": 0.01133537, "auxiliary_loss_mlp": 0.01039714, "balance_loss_clip": 1.02395451, "balance_loss_mlp": 1.04932857, "epoch": 0.4061926950248008, "flos": 25555074710400.0, "grad_norm": 2.597552098652993, "language_loss": 0.71307075, "learning_rate": 2.6902377823472426e-06, "loss": 0.73480326, "num_input_tokens_seen": 145007310, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.83984375, "step": 6756, "time_per_iteration": 2.525750160217285 }, { "auxiliary_loss_clip": 0.01132386, "auxiliary_loss_mlp": 0.01045714, "balance_loss_clip": 1.02988327, "balance_loss_mlp": 1.04757547, "epoch": 0.4062528182774688, "flos": 23695799944320.0, "grad_norm": 2.452894892918417, "language_loss": 0.7868728, "learning_rate": 2.689872236505755e-06, "loss": 0.80865383, "num_input_tokens_seen": 145026210, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.84765625, "step": 6757, "time_per_iteration": 2.5156238079071045 }, { "auxiliary_loss_clip": 0.01131439, "auxiliary_loss_mlp": 0.01031366, "balance_loss_clip": 1.01625061, "balance_loss_mlp": 1.05094385, "epoch": 0.4063129415301368, "flos": 21726027964800.0, "grad_norm": 2.0881540282027546, "language_loss": 0.78645658, "learning_rate": 2.6895066645042437e-06, "loss": 0.80808461, "num_input_tokens_seen": 145045475, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8046875, "step": 6758, "time_per_iteration": 3.967188596725464 }, { "auxiliary_loss_clip": 0.01128586, "auxiliary_loss_mlp": 0.01035019, "balance_loss_clip": 1.01962936, "balance_loss_mlp": 1.04838729, "epoch": 0.40637306478280477, "flos": 12787575206400.0, "grad_norm": 1.8771251231519273, "language_loss": 0.88704211, "learning_rate": 2.6891410663565703e-06, "loss": 0.90867817, "num_input_tokens_seen": 145062260, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.80078125, "step": 6759, "time_per_iteration": 2.4752395153045654 }, { "auxiliary_loss_clip": 0.01131904, "auxiliary_loss_mlp": 0.01038226, "balance_loss_clip": 1.02328265, "balance_loss_mlp": 1.04972577, "epoch": 0.40643318803547274, "flos": 24024490323840.0, "grad_norm": 1.913946496027899, "language_loss": 0.64577609, "learning_rate": 2.688775442076598e-06, "loss": 0.66747743, "num_input_tokens_seen": 145082470, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8203125, "step": 6760, "time_per_iteration": 2.505429744720459 }, { "auxiliary_loss_clip": 0.01130053, "auxiliary_loss_mlp": 0.01034029, "balance_loss_clip": 1.01789951, "balance_loss_mlp": 1.04768801, "epoch": 0.4064933112881407, "flos": 25592421876480.0, "grad_norm": 2.6794298795390437, "language_loss": 0.75333261, "learning_rate": 2.688409791678193e-06, "loss": 0.77497339, "num_input_tokens_seen": 145105685, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.82421875, "step": 6761, "time_per_iteration": 2.5233232975006104 }, { "auxiliary_loss_clip": 0.01123894, "auxiliary_loss_mlp": 0.01040285, "balance_loss_clip": 1.02560997, "balance_loss_mlp": 1.0475378, "epoch": 0.40655343454080867, "flos": 22054323294720.0, "grad_norm": 1.425439205641409, "language_loss": 0.70052123, "learning_rate": 2.6880441151752185e-06, "loss": 0.72216308, "num_input_tokens_seen": 145125590, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.765625, "step": 6762, "time_per_iteration": 6.802795886993408 }, { "auxiliary_loss_clip": 0.0112935, "auxiliary_loss_mlp": 0.01039033, "balance_loss_clip": 1.02478182, "balance_loss_mlp": 1.04938972, "epoch": 0.40661355779347663, "flos": 26468893641600.0, "grad_norm": 1.7154913799006557, "language_loss": 0.73404419, "learning_rate": 2.6876784125815433e-06, "loss": 0.75572801, "num_input_tokens_seen": 145146810, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.80078125, "step": 6763, "time_per_iteration": 2.5474138259887695 }, { "auxiliary_loss_clip": 0.01132764, "auxiliary_loss_mlp": 0.01035392, "balance_loss_clip": 1.02005005, "balance_loss_mlp": 1.04936087, "epoch": 0.4066736810461446, "flos": 13261129136640.0, "grad_norm": 1.8878006922520891, "language_loss": 0.68932891, "learning_rate": 2.687312683911033e-06, "loss": 0.71101046, "num_input_tokens_seen": 145163130, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8359375, "step": 6764, "time_per_iteration": 2.4441847801208496 }, { "auxiliary_loss_clip": 0.01135542, "auxiliary_loss_mlp": 0.01044101, "balance_loss_clip": 1.02726829, "balance_loss_mlp": 1.05168724, "epoch": 0.40673380429881256, "flos": 28803625758720.0, "grad_norm": 2.036479805531594, "language_loss": 0.9079777, "learning_rate": 2.686946929177557e-06, "loss": 0.92977405, "num_input_tokens_seen": 145181420, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.8359375, "step": 6765, "time_per_iteration": 2.5334503650665283 }, { "auxiliary_loss_clip": 0.01134979, "auxiliary_loss_mlp": 0.0104239, "balance_loss_clip": 1.02655935, "balance_loss_mlp": 1.04968441, "epoch": 0.4067939275514805, "flos": 12495334152960.0, "grad_norm": 3.575056265365178, "language_loss": 0.78405595, "learning_rate": 2.6865811483949855e-06, "loss": 0.80582964, "num_input_tokens_seen": 145198545, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8515625, "step": 6766, "time_per_iteration": 2.438400983810425 }, { "auxiliary_loss_clip": 0.01131901, "auxiliary_loss_mlp": 0.01040938, "balance_loss_clip": 1.02461815, "balance_loss_mlp": 1.04854226, "epoch": 0.4068540508041485, "flos": 18770508069120.0, "grad_norm": 2.3918327188308424, "language_loss": 0.76780236, "learning_rate": 2.6862153415771867e-06, "loss": 0.78953075, "num_input_tokens_seen": 145215835, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8359375, "step": 6767, "time_per_iteration": 2.4720828533172607 }, { "auxiliary_loss_clip": 0.01132127, "auxiliary_loss_mlp": 0.01043645, "balance_loss_clip": 1.02842188, "balance_loss_mlp": 1.0527451, "epoch": 0.40691417405681646, "flos": 28512821249280.0, "grad_norm": 1.965398579670024, "language_loss": 0.77291977, "learning_rate": 2.685849508738034e-06, "loss": 0.7946775, "num_input_tokens_seen": 145236555, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.79296875, "step": 6768, "time_per_iteration": 2.5446555614471436 }, { "auxiliary_loss_clip": 0.0113083, "auxiliary_loss_mlp": 0.01037204, "balance_loss_clip": 1.02237439, "balance_loss_mlp": 1.0499444, "epoch": 0.4069742973094844, "flos": 20814040627200.0, "grad_norm": 1.8827690222259519, "language_loss": 0.87325191, "learning_rate": 2.6854836498913995e-06, "loss": 0.89493227, "num_input_tokens_seen": 145254595, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80859375, "step": 6769, "time_per_iteration": 2.518490791320801 }, { "auxiliary_loss_clip": 0.01128903, "auxiliary_loss_mlp": 0.01035597, "balance_loss_clip": 1.02143478, "balance_loss_mlp": 1.05095863, "epoch": 0.4070344205621524, "flos": 21470272151040.0, "grad_norm": 13.583411466004826, "language_loss": 0.80820048, "learning_rate": 2.685117765051156e-06, "loss": 0.82984555, "num_input_tokens_seen": 145274005, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 6770, "time_per_iteration": 2.4826624393463135 }, { "auxiliary_loss_clip": 0.01132068, "auxiliary_loss_mlp": 0.01035165, "balance_loss_clip": 1.01943552, "balance_loss_mlp": 1.04849541, "epoch": 0.4070945438148204, "flos": 26830046937600.0, "grad_norm": 1.7488927050973457, "language_loss": 0.80217195, "learning_rate": 2.6847518542311783e-06, "loss": 0.82384425, "num_input_tokens_seen": 145294850, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8359375, "step": 6771, "time_per_iteration": 2.524167537689209 }, { "auxiliary_loss_clip": 0.01130704, "auxiliary_loss_mlp": 0.01043459, "balance_loss_clip": 1.02814031, "balance_loss_mlp": 1.05021596, "epoch": 0.4071546670674884, "flos": 26354158623360.0, "grad_norm": 1.4447998472637233, "language_loss": 0.76225466, "learning_rate": 2.6843859174453417e-06, "loss": 0.78399622, "num_input_tokens_seen": 145317050, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8046875, "step": 6772, "time_per_iteration": 2.5329041481018066 }, { "auxiliary_loss_clip": 0.01128811, "auxiliary_loss_mlp": 0.01049476, "balance_loss_clip": 1.03346586, "balance_loss_mlp": 1.04726636, "epoch": 0.40721479032015634, "flos": 17895401020800.0, "grad_norm": 2.1986933062320806, "language_loss": 0.81367922, "learning_rate": 2.6840199547075218e-06, "loss": 0.83546209, "num_input_tokens_seen": 145334480, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.81640625, "step": 6773, "time_per_iteration": 2.4742000102996826 }, { "auxiliary_loss_clip": 0.01053741, "auxiliary_loss_mlp": 0.01011266, "balance_loss_clip": 1.00892949, "balance_loss_mlp": 1.0261302, "epoch": 0.4072749135728243, "flos": 49854570537600.0, "grad_norm": 0.8306288854167212, "language_loss": 0.64338195, "learning_rate": 2.683653966031597e-06, "loss": 0.66403198, "num_input_tokens_seen": 145388695, "router_z_loss_clip": 0.02331543, "router_z_loss_mlp": 0.27734375, "step": 6774, "time_per_iteration": 2.978151559829712 }, { "auxiliary_loss_clip": 0.01131181, "auxiliary_loss_mlp": 0.01040571, "balance_loss_clip": 1.02572894, "balance_loss_mlp": 1.04798877, "epoch": 0.40733503682549227, "flos": 27563630400000.0, "grad_norm": 3.0792130773759516, "language_loss": 0.72642857, "learning_rate": 2.683287951431446e-06, "loss": 0.74814606, "num_input_tokens_seen": 145408240, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.83203125, "step": 6775, "time_per_iteration": 2.539188861846924 }, { "auxiliary_loss_clip": 0.01130045, "auxiliary_loss_mlp": 0.0105138, "balance_loss_clip": 1.03560853, "balance_loss_mlp": 1.04785228, "epoch": 0.40739516007816023, "flos": 22126970551680.0, "grad_norm": 4.505541337577781, "language_loss": 0.77876586, "learning_rate": 2.6829219109209474e-06, "loss": 0.80058008, "num_input_tokens_seen": 145428395, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8203125, "step": 6776, "time_per_iteration": 2.497720241546631 }, { "auxiliary_loss_clip": 0.01132583, "auxiliary_loss_mlp": 0.01044882, "balance_loss_clip": 1.02951574, "balance_loss_mlp": 1.04885066, "epoch": 0.4074552833308282, "flos": 23842243693440.0, "grad_norm": 2.3407925271974355, "language_loss": 0.78817213, "learning_rate": 2.682555844513981e-06, "loss": 0.80994672, "num_input_tokens_seen": 145448290, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8359375, "step": 6777, "time_per_iteration": 2.5180232524871826 }, { "auxiliary_loss_clip": 0.01053204, "auxiliary_loss_mlp": 0.01003655, "balance_loss_clip": 1.00131881, "balance_loss_mlp": 1.02545774, "epoch": 0.40751540658349616, "flos": 58000008781440.0, "grad_norm": 0.6854912054486969, "language_loss": 0.53218299, "learning_rate": 2.6821897522244286e-06, "loss": 0.55275154, "num_input_tokens_seen": 145509785, "router_z_loss_clip": 0.02331543, "router_z_loss_mlp": 0.27734375, "step": 6778, "time_per_iteration": 3.120537757873535 }, { "auxiliary_loss_clip": 0.01131006, "auxiliary_loss_mlp": 0.01044987, "balance_loss_clip": 1.02959716, "balance_loss_mlp": 1.05078661, "epoch": 0.40757552983616413, "flos": 21214659991680.0, "grad_norm": 2.5074475957349804, "language_loss": 0.82192421, "learning_rate": 2.6818236340661718e-06, "loss": 0.84368408, "num_input_tokens_seen": 145528620, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8046875, "step": 6779, "time_per_iteration": 2.459794282913208 }, { "auxiliary_loss_clip": 0.01130019, "auxiliary_loss_mlp": 0.01046071, "balance_loss_clip": 1.03026438, "balance_loss_mlp": 1.04884362, "epoch": 0.4076356530888321, "flos": 26833530556800.0, "grad_norm": 1.8184395895489174, "language_loss": 0.76360267, "learning_rate": 2.6814574900530957e-06, "loss": 0.78536355, "num_input_tokens_seen": 145547775, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8125, "step": 6780, "time_per_iteration": 2.559555768966675 }, { "auxiliary_loss_clip": 0.0112513, "auxiliary_loss_mlp": 0.01035645, "balance_loss_clip": 1.02211475, "balance_loss_mlp": 1.04723227, "epoch": 0.40769577634150006, "flos": 12203021272320.0, "grad_norm": 3.120325918298554, "language_loss": 0.6635747, "learning_rate": 2.6810913201990827e-06, "loss": 0.68518245, "num_input_tokens_seen": 145564465, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.77734375, "step": 6781, "time_per_iteration": 2.4946727752685547 }, { "auxiliary_loss_clip": 0.01128731, "auxiliary_loss_mlp": 0.01040603, "balance_loss_clip": 1.02520072, "balance_loss_mlp": 1.04842818, "epoch": 0.407755899594168, "flos": 33655264796160.0, "grad_norm": 1.8572601171574061, "language_loss": 0.71474659, "learning_rate": 2.6807251245180183e-06, "loss": 0.73643994, "num_input_tokens_seen": 145585965, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.80078125, "step": 6782, "time_per_iteration": 2.5991897583007812 }, { "auxiliary_loss_clip": 0.01128546, "auxiliary_loss_mlp": 0.01041899, "balance_loss_clip": 1.02731943, "balance_loss_mlp": 1.0476644, "epoch": 0.407816022846836, "flos": 20157342226560.0, "grad_norm": 1.661341926067685, "language_loss": 0.8207894, "learning_rate": 2.6803589030237897e-06, "loss": 0.84249383, "num_input_tokens_seen": 145605000, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.80859375, "step": 6783, "time_per_iteration": 2.498257875442505 }, { "auxiliary_loss_clip": 0.01126974, "auxiliary_loss_mlp": 0.01042414, "balance_loss_clip": 1.02777541, "balance_loss_mlp": 1.04607272, "epoch": 0.40787614609950396, "flos": 21178821196800.0, "grad_norm": 1.5478983632399907, "language_loss": 0.80746722, "learning_rate": 2.679992655730283e-06, "loss": 0.82916105, "num_input_tokens_seen": 145623740, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80859375, "step": 6784, "time_per_iteration": 2.547797679901123 }, { "auxiliary_loss_clip": 0.01132574, "auxiliary_loss_mlp": 0.01044178, "balance_loss_clip": 1.02753043, "balance_loss_mlp": 1.04729676, "epoch": 0.407936269352172, "flos": 20520650338560.0, "grad_norm": 1.9590215019062196, "language_loss": 0.66054189, "learning_rate": 2.679626382651386e-06, "loss": 0.68230951, "num_input_tokens_seen": 145643515, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8515625, "step": 6785, "time_per_iteration": 2.4805192947387695 }, { "auxiliary_loss_clip": 0.01126929, "auxiliary_loss_mlp": 0.01042067, "balance_loss_clip": 1.02734506, "balance_loss_mlp": 1.04661942, "epoch": 0.40799639260483994, "flos": 20118809911680.0, "grad_norm": 3.55420751655311, "language_loss": 0.796767, "learning_rate": 2.679260083800989e-06, "loss": 0.81845695, "num_input_tokens_seen": 145660890, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80078125, "step": 6786, "time_per_iteration": 2.482116222381592 }, { "auxiliary_loss_clip": 0.01127127, "auxiliary_loss_mlp": 0.01041573, "balance_loss_clip": 1.02796555, "balance_loss_mlp": 1.04725826, "epoch": 0.4080565158575079, "flos": 20997328752000.0, "grad_norm": 1.881544031220056, "language_loss": 0.81810522, "learning_rate": 2.678893759192982e-06, "loss": 0.83979225, "num_input_tokens_seen": 145680070, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.796875, "step": 6787, "time_per_iteration": 2.487053155899048 }, { "auxiliary_loss_clip": 0.01125084, "auxiliary_loss_mlp": 0.01038338, "balance_loss_clip": 1.02332973, "balance_loss_mlp": 1.0455718, "epoch": 0.40811663911017587, "flos": 19317714837120.0, "grad_norm": 2.6043365953859214, "language_loss": 0.67835116, "learning_rate": 2.678527408841255e-06, "loss": 0.69998538, "num_input_tokens_seen": 145698010, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.796875, "step": 6788, "time_per_iteration": 2.482980728149414 }, { "auxiliary_loss_clip": 0.01123842, "auxiliary_loss_mlp": 0.01042626, "balance_loss_clip": 1.02776098, "balance_loss_mlp": 1.04371536, "epoch": 0.40817676236284384, "flos": 40625382119040.0, "grad_norm": 1.878789047568896, "language_loss": 0.66124362, "learning_rate": 2.678161032759701e-06, "loss": 0.6829083, "num_input_tokens_seen": 145722215, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80078125, "step": 6789, "time_per_iteration": 2.6505401134490967 }, { "auxiliary_loss_clip": 0.01126294, "auxiliary_loss_mlp": 0.01033118, "balance_loss_clip": 1.01796627, "balance_loss_mlp": 1.04636121, "epoch": 0.4082368856155118, "flos": 20522086882560.0, "grad_norm": 1.7232214655081222, "language_loss": 0.60691994, "learning_rate": 2.6777946309622123e-06, "loss": 0.62851405, "num_input_tokens_seen": 145741090, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.80078125, "step": 6790, "time_per_iteration": 2.478130340576172 }, { "auxiliary_loss_clip": 0.01128275, "auxiliary_loss_mlp": 0.01038825, "balance_loss_clip": 1.02404284, "balance_loss_mlp": 1.04934561, "epoch": 0.40829700886817977, "flos": 11427745098240.0, "grad_norm": 3.3683865540553115, "language_loss": 0.70049822, "learning_rate": 2.677428203462683e-06, "loss": 0.72216922, "num_input_tokens_seen": 145754985, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7890625, "step": 6791, "time_per_iteration": 2.4074277877807617 }, { "auxiliary_loss_clip": 0.01051316, "auxiliary_loss_mlp": 0.01010621, "balance_loss_clip": 1.00814152, "balance_loss_mlp": 1.0235765, "epoch": 0.40835713212084773, "flos": 67330677121920.0, "grad_norm": 0.7529297025445288, "language_loss": 0.59631717, "learning_rate": 2.6770617502750093e-06, "loss": 0.6169365, "num_input_tokens_seen": 145815260, "router_z_loss_clip": 0.02478027, "router_z_loss_mlp": 0.27734375, "step": 6792, "time_per_iteration": 3.0730881690979004 }, { "auxiliary_loss_clip": 0.01131729, "auxiliary_loss_mlp": 0.01041515, "balance_loss_clip": 1.02546954, "balance_loss_mlp": 1.04958177, "epoch": 0.4084172553735157, "flos": 21762010414080.0, "grad_norm": 1.8655921965126334, "language_loss": 0.79780531, "learning_rate": 2.6766952714130857e-06, "loss": 0.81953776, "num_input_tokens_seen": 145832665, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8203125, "step": 6793, "time_per_iteration": 2.476436138153076 }, { "auxiliary_loss_clip": 0.01127994, "auxiliary_loss_mlp": 0.01034668, "balance_loss_clip": 1.01911092, "balance_loss_mlp": 1.04667592, "epoch": 0.40847737862618366, "flos": 27417258478080.0, "grad_norm": 1.8765803869905777, "language_loss": 0.85032344, "learning_rate": 2.6763287668908094e-06, "loss": 0.87195015, "num_input_tokens_seen": 145850240, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8125, "step": 6794, "time_per_iteration": 2.5153298377990723 }, { "auxiliary_loss_clip": 0.01127893, "auxiliary_loss_mlp": 0.01038835, "balance_loss_clip": 1.0235759, "balance_loss_mlp": 1.04788482, "epoch": 0.4085375018788516, "flos": 18587255857920.0, "grad_norm": 1.542603987492018, "language_loss": 0.80199134, "learning_rate": 2.6759622367220788e-06, "loss": 0.82365865, "num_input_tokens_seen": 145869545, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80078125, "step": 6795, "time_per_iteration": 2.450549840927124 }, { "auxiliary_loss_clip": 0.01130896, "auxiliary_loss_mlp": 0.01037084, "balance_loss_clip": 1.02098513, "balance_loss_mlp": 1.04660845, "epoch": 0.4085976251315196, "flos": 15411783029760.0, "grad_norm": 2.9240213970994913, "language_loss": 0.69683003, "learning_rate": 2.675595680920792e-06, "loss": 0.71850985, "num_input_tokens_seen": 145884025, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.84375, "step": 6796, "time_per_iteration": 2.58335018157959 }, { "auxiliary_loss_clip": 0.01126873, "auxiliary_loss_mlp": 0.01035662, "balance_loss_clip": 1.02167928, "balance_loss_mlp": 1.04623401, "epoch": 0.40865774838418756, "flos": 21252222639360.0, "grad_norm": 2.484967417124297, "language_loss": 0.7781893, "learning_rate": 2.6752290995008498e-06, "loss": 0.79981464, "num_input_tokens_seen": 145903210, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.8046875, "step": 6797, "time_per_iteration": 2.475435733795166 }, { "auxiliary_loss_clip": 0.01124788, "auxiliary_loss_mlp": 0.01043481, "balance_loss_clip": 1.02780533, "balance_loss_mlp": 1.04419971, "epoch": 0.4087178716368556, "flos": 13772245714560.0, "grad_norm": 1.9873676266932232, "language_loss": 0.85911131, "learning_rate": 2.6748624924761523e-06, "loss": 0.88079405, "num_input_tokens_seen": 145920985, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8046875, "step": 6798, "time_per_iteration": 2.469074010848999 }, { "auxiliary_loss_clip": 0.01122754, "auxiliary_loss_mlp": 0.01036248, "balance_loss_clip": 1.02259851, "balance_loss_mlp": 1.04458606, "epoch": 0.40877799488952354, "flos": 23621752056960.0, "grad_norm": 1.5391447179744218, "language_loss": 0.84445953, "learning_rate": 2.674495859860601e-06, "loss": 0.86604953, "num_input_tokens_seen": 145940350, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78125, "step": 6799, "time_per_iteration": 3.9693689346313477 }, { "auxiliary_loss_clip": 0.01127913, "auxiliary_loss_mlp": 0.0103551, "balance_loss_clip": 1.01928568, "balance_loss_mlp": 1.04748976, "epoch": 0.4088381181421915, "flos": 20918791664640.0, "grad_norm": 2.298927200338699, "language_loss": 0.83603275, "learning_rate": 2.6741292016681e-06, "loss": 0.85766697, "num_input_tokens_seen": 145957460, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8046875, "step": 6800, "time_per_iteration": 2.4620003700256348 }, { "auxiliary_loss_clip": 0.01128759, "auxiliary_loss_mlp": 0.01043487, "balance_loss_clip": 1.02746522, "balance_loss_mlp": 1.04665852, "epoch": 0.4088982413948595, "flos": 13297578462720.0, "grad_norm": 2.1888110858087724, "language_loss": 0.74709964, "learning_rate": 2.6737625179125514e-06, "loss": 0.76882207, "num_input_tokens_seen": 145975285, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8203125, "step": 6801, "time_per_iteration": 2.4726645946502686 }, { "auxiliary_loss_clip": 0.01127757, "auxiliary_loss_mlp": 0.01040776, "balance_loss_clip": 1.02482593, "balance_loss_mlp": 1.04636574, "epoch": 0.40895836464752744, "flos": 15267673664640.0, "grad_norm": 2.179512078227495, "language_loss": 0.8024044, "learning_rate": 2.673395808607861e-06, "loss": 0.82408965, "num_input_tokens_seen": 145989150, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8125, "step": 6802, "time_per_iteration": 2.433960199356079 }, { "auxiliary_loss_clip": 0.01130951, "auxiliary_loss_mlp": 0.01042467, "balance_loss_clip": 1.02490711, "balance_loss_mlp": 1.04801154, "epoch": 0.4090184879001954, "flos": 14501411804160.0, "grad_norm": 3.0715406822505735, "language_loss": 0.76388466, "learning_rate": 2.673029073767934e-06, "loss": 0.78561884, "num_input_tokens_seen": 146006980, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.828125, "step": 6803, "time_per_iteration": 5.2051308155059814 }, { "auxiliary_loss_clip": 0.01126183, "auxiliary_loss_mlp": 0.01036491, "balance_loss_clip": 1.02074337, "balance_loss_mlp": 1.0450542, "epoch": 0.40907861115286337, "flos": 13881593692800.0, "grad_norm": 4.323212257045652, "language_loss": 0.78835499, "learning_rate": 2.6726623134066764e-06, "loss": 0.80998164, "num_input_tokens_seen": 146025125, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8125, "step": 6804, "time_per_iteration": 3.8723011016845703 }, { "auxiliary_loss_clip": 0.01132515, "auxiliary_loss_mlp": 0.01037845, "balance_loss_clip": 1.02262211, "balance_loss_mlp": 1.04765236, "epoch": 0.40913873440553133, "flos": 28037615293440.0, "grad_norm": 1.9247636312268646, "language_loss": 0.74966872, "learning_rate": 2.672295527537998e-06, "loss": 0.77137232, "num_input_tokens_seen": 146044990, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8515625, "step": 6805, "time_per_iteration": 2.511186122894287 }, { "auxiliary_loss_clip": 0.01133021, "auxiliary_loss_mlp": 0.01037528, "balance_loss_clip": 1.02224541, "balance_loss_mlp": 1.04995477, "epoch": 0.4091988576581993, "flos": 21618188357760.0, "grad_norm": 1.8748967499680826, "language_loss": 0.79381824, "learning_rate": 2.671928716175804e-06, "loss": 0.81552374, "num_input_tokens_seen": 146066045, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83203125, "step": 6806, "time_per_iteration": 2.5462441444396973 }, { "auxiliary_loss_clip": 0.01131029, "auxiliary_loss_mlp": 0.01028776, "balance_loss_clip": 1.01331496, "balance_loss_mlp": 1.04756999, "epoch": 0.40925898091086726, "flos": 25224085860480.0, "grad_norm": 2.443598351632157, "language_loss": 0.7181524, "learning_rate": 2.671561879334007e-06, "loss": 0.73975044, "num_input_tokens_seen": 146086280, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.83203125, "step": 6807, "time_per_iteration": 2.505863904953003 }, { "auxiliary_loss_clip": 0.01053996, "auxiliary_loss_mlp": 0.0100798, "balance_loss_clip": 1.00588167, "balance_loss_mlp": 1.02663589, "epoch": 0.40931910416353523, "flos": 68930568800640.0, "grad_norm": 0.8270007276323084, "language_loss": 0.58801138, "learning_rate": 2.6711950170265155e-06, "loss": 0.60863113, "num_input_tokens_seen": 146148840, "router_z_loss_clip": 0.02099609, "router_z_loss_mlp": 0.2734375, "step": 6808, "time_per_iteration": 3.1915640830993652 }, { "auxiliary_loss_clip": 0.01127379, "auxiliary_loss_mlp": 0.0104196, "balance_loss_clip": 1.02727342, "balance_loss_mlp": 1.04650116, "epoch": 0.4093792274162032, "flos": 20189553747840.0, "grad_norm": 1.8394307774077434, "language_loss": 0.54850233, "learning_rate": 2.670828129267242e-06, "loss": 0.57019573, "num_input_tokens_seen": 146166195, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80859375, "step": 6809, "time_per_iteration": 2.450416088104248 }, { "auxiliary_loss_clip": 0.01128322, "auxiliary_loss_mlp": 0.01027702, "balance_loss_clip": 1.01359391, "balance_loss_mlp": 1.04822159, "epoch": 0.40943935066887116, "flos": 25228754628480.0, "grad_norm": 1.7729971152437574, "language_loss": 0.83320868, "learning_rate": 2.6704612160700983e-06, "loss": 0.85476893, "num_input_tokens_seen": 146185045, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80078125, "step": 6810, "time_per_iteration": 2.5104503631591797 }, { "auxiliary_loss_clip": 0.01133575, "auxiliary_loss_mlp": 0.01037072, "balance_loss_clip": 1.02100253, "balance_loss_mlp": 1.05033386, "epoch": 0.4094994739215392, "flos": 23255319461760.0, "grad_norm": 2.2583391053986146, "language_loss": 0.77527875, "learning_rate": 2.670094277448999e-06, "loss": 0.79698527, "num_input_tokens_seen": 146204655, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.83203125, "step": 6811, "time_per_iteration": 2.4792098999023438 }, { "auxiliary_loss_clip": 0.01130176, "auxiliary_loss_mlp": 0.01034895, "balance_loss_clip": 1.01846778, "balance_loss_mlp": 1.04772687, "epoch": 0.40955959717420715, "flos": 17382165540480.0, "grad_norm": 2.2709567717422194, "language_loss": 0.7020157, "learning_rate": 2.669727313417857e-06, "loss": 0.72366643, "num_input_tokens_seen": 146222000, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.82421875, "step": 6812, "time_per_iteration": 2.4576668739318848 }, { "auxiliary_loss_clip": 0.01126894, "auxiliary_loss_mlp": 0.01044444, "balance_loss_clip": 1.02888703, "balance_loss_mlp": 1.04648685, "epoch": 0.4096197204268751, "flos": 25082418620160.0, "grad_norm": 3.6801040084676866, "language_loss": 0.66275728, "learning_rate": 2.6693603239905872e-06, "loss": 0.68447071, "num_input_tokens_seen": 146242630, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8046875, "step": 6813, "time_per_iteration": 2.500016689300537 }, { "auxiliary_loss_clip": 0.01128828, "auxiliary_loss_mlp": 0.01036274, "balance_loss_clip": 1.02101517, "balance_loss_mlp": 1.04851937, "epoch": 0.4096798436795431, "flos": 30586769648640.0, "grad_norm": 2.0730041666537686, "language_loss": 0.74105179, "learning_rate": 2.6689933091811087e-06, "loss": 0.76270282, "num_input_tokens_seen": 146263070, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8046875, "step": 6814, "time_per_iteration": 2.5580854415893555 }, { "auxiliary_loss_clip": 0.01132007, "auxiliary_loss_mlp": 0.01035479, "balance_loss_clip": 1.01988614, "balance_loss_mlp": 1.04873669, "epoch": 0.40973996693221104, "flos": 24133622820480.0, "grad_norm": 2.045685724395862, "language_loss": 0.65800607, "learning_rate": 2.6686262690033357e-06, "loss": 0.67968088, "num_input_tokens_seen": 146282890, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 6815, "time_per_iteration": 2.491955280303955 }, { "auxiliary_loss_clip": 0.01126652, "auxiliary_loss_mlp": 0.01042825, "balance_loss_clip": 1.02780426, "balance_loss_mlp": 1.04904628, "epoch": 0.409800090184879, "flos": 23988974751360.0, "grad_norm": 1.6679925540160425, "language_loss": 0.76749223, "learning_rate": 2.668259203471188e-06, "loss": 0.78918695, "num_input_tokens_seen": 146301755, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.77734375, "step": 6816, "time_per_iteration": 2.507139205932617 }, { "auxiliary_loss_clip": 0.01129265, "auxiliary_loss_mlp": 0.01042667, "balance_loss_clip": 1.02729487, "balance_loss_mlp": 1.04870617, "epoch": 0.40986021343754697, "flos": 16143678552960.0, "grad_norm": 2.108944313301918, "language_loss": 0.81633329, "learning_rate": 2.6678921125985843e-06, "loss": 0.83805257, "num_input_tokens_seen": 146316835, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8046875, "step": 6817, "time_per_iteration": 2.4321787357330322 }, { "auxiliary_loss_clip": 0.01132286, "auxiliary_loss_mlp": 0.01043921, "balance_loss_clip": 1.0268383, "balance_loss_mlp": 1.04749608, "epoch": 0.40992033669021494, "flos": 24790824011520.0, "grad_norm": 7.258720278501491, "language_loss": 0.79853845, "learning_rate": 2.667524996399444e-06, "loss": 0.82030046, "num_input_tokens_seen": 146336650, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.8515625, "step": 6818, "time_per_iteration": 2.5314221382141113 }, { "auxiliary_loss_clip": 0.01125902, "auxiliary_loss_mlp": 0.01040382, "balance_loss_clip": 1.02548718, "balance_loss_mlp": 1.04633963, "epoch": 0.4099804599428829, "flos": 29641888431360.0, "grad_norm": 1.6507111859871235, "language_loss": 0.65950286, "learning_rate": 2.66715785488769e-06, "loss": 0.6811657, "num_input_tokens_seen": 146357640, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.796875, "step": 6819, "time_per_iteration": 2.539677381515503 }, { "auxiliary_loss_clip": 0.0113341, "auxiliary_loss_mlp": 0.01042139, "balance_loss_clip": 1.02581894, "balance_loss_mlp": 1.04743862, "epoch": 0.41004058319555087, "flos": 24826590979200.0, "grad_norm": 1.672730082401129, "language_loss": 0.85136938, "learning_rate": 2.6667906880772428e-06, "loss": 0.87312484, "num_input_tokens_seen": 146379325, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.859375, "step": 6820, "time_per_iteration": 2.5204076766967773 }, { "auxiliary_loss_clip": 0.01127605, "auxiliary_loss_mlp": 0.01034735, "balance_loss_clip": 1.0195719, "balance_loss_mlp": 1.04815316, "epoch": 0.41010070644821883, "flos": 25737464995200.0, "grad_norm": 2.2117866253403604, "language_loss": 0.70854306, "learning_rate": 2.6664234959820256e-06, "loss": 0.73016655, "num_input_tokens_seen": 146398635, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.796875, "step": 6821, "time_per_iteration": 2.5657825469970703 }, { "auxiliary_loss_clip": 0.01128399, "auxiliary_loss_mlp": 0.01034863, "balance_loss_clip": 1.02017593, "balance_loss_mlp": 1.04757142, "epoch": 0.4101608297008868, "flos": 22346061557760.0, "grad_norm": 1.8498034183097591, "language_loss": 0.74896675, "learning_rate": 2.6660562786159634e-06, "loss": 0.77059937, "num_input_tokens_seen": 146417585, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.80859375, "step": 6822, "time_per_iteration": 2.5107805728912354 }, { "auxiliary_loss_clip": 0.01128869, "auxiliary_loss_mlp": 0.01034373, "balance_loss_clip": 1.01923323, "balance_loss_mlp": 1.04757023, "epoch": 0.41022095295355476, "flos": 21945083057280.0, "grad_norm": 2.189676473470215, "language_loss": 0.76300716, "learning_rate": 2.6656890359929796e-06, "loss": 0.78463954, "num_input_tokens_seen": 146437035, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8125, "step": 6823, "time_per_iteration": 2.5242958068847656 }, { "auxiliary_loss_clip": 0.01133639, "auxiliary_loss_mlp": 0.01040536, "balance_loss_clip": 1.0240612, "balance_loss_mlp": 1.04802954, "epoch": 0.4102810762062228, "flos": 27450511493760.0, "grad_norm": 1.8890243791356638, "language_loss": 0.73096514, "learning_rate": 2.665321768127001e-06, "loss": 0.75270683, "num_input_tokens_seen": 146457370, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.85546875, "step": 6824, "time_per_iteration": 2.5489070415496826 }, { "auxiliary_loss_clip": 0.01131622, "auxiliary_loss_mlp": 0.01034709, "balance_loss_clip": 1.01866329, "balance_loss_mlp": 1.04691482, "epoch": 0.41034119945889075, "flos": 24499265316480.0, "grad_norm": 2.1026675304454203, "language_loss": 0.72052562, "learning_rate": 2.6649544750319548e-06, "loss": 0.74218893, "num_input_tokens_seen": 146478105, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84765625, "step": 6825, "time_per_iteration": 2.5335371494293213 }, { "auxiliary_loss_clip": 0.01128852, "auxiliary_loss_mlp": 0.01041108, "balance_loss_clip": 1.02638543, "balance_loss_mlp": 1.04833198, "epoch": 0.4104013227115587, "flos": 24352641999360.0, "grad_norm": 2.3952580449466803, "language_loss": 0.85269362, "learning_rate": 2.664587156721768e-06, "loss": 0.87439322, "num_input_tokens_seen": 146497835, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8046875, "step": 6826, "time_per_iteration": 2.531923294067383 }, { "auxiliary_loss_clip": 0.01126538, "auxiliary_loss_mlp": 0.01035746, "balance_loss_clip": 1.02008152, "balance_loss_mlp": 1.04827881, "epoch": 0.4104614459642267, "flos": 23729340268800.0, "grad_norm": 3.998909308318903, "language_loss": 0.66866064, "learning_rate": 2.6642198132103696e-06, "loss": 0.69028342, "num_input_tokens_seen": 146517735, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.78515625, "step": 6827, "time_per_iteration": 2.527482271194458 }, { "auxiliary_loss_clip": 0.01125094, "auxiliary_loss_mlp": 0.01033601, "balance_loss_clip": 1.01841354, "balance_loss_mlp": 1.04571116, "epoch": 0.41052156921689464, "flos": 22127976132480.0, "grad_norm": 1.3611061301207377, "language_loss": 0.72195327, "learning_rate": 2.663852444511689e-06, "loss": 0.74354029, "num_input_tokens_seen": 146537640, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.79296875, "step": 6828, "time_per_iteration": 2.504570722579956 }, { "auxiliary_loss_clip": 0.01133081, "auxiliary_loss_mlp": 0.01038482, "balance_loss_clip": 1.02240086, "balance_loss_mlp": 1.04865003, "epoch": 0.4105816924695626, "flos": 20084371747200.0, "grad_norm": 1.9191387958909376, "language_loss": 0.83707273, "learning_rate": 2.6634850506396574e-06, "loss": 0.85878837, "num_input_tokens_seen": 146554695, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.84375, "step": 6829, "time_per_iteration": 2.4881794452667236 }, { "auxiliary_loss_clip": 0.01127752, "auxiliary_loss_mlp": 0.01035952, "balance_loss_clip": 1.02097297, "balance_loss_mlp": 1.04768538, "epoch": 0.4106418157222306, "flos": 18076785724800.0, "grad_norm": 1.654218875991246, "language_loss": 0.89728379, "learning_rate": 2.663117631608206e-06, "loss": 0.91892087, "num_input_tokens_seen": 146573740, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.80078125, "step": 6830, "time_per_iteration": 2.5404536724090576 }, { "auxiliary_loss_clip": 0.01129391, "auxiliary_loss_mlp": 0.01029422, "balance_loss_clip": 1.01402009, "balance_loss_mlp": 1.0487926, "epoch": 0.41070193897489854, "flos": 21647850013440.0, "grad_norm": 1.8511584054123382, "language_loss": 0.65504491, "learning_rate": 2.662750187431268e-06, "loss": 0.67663312, "num_input_tokens_seen": 146592885, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8046875, "step": 6831, "time_per_iteration": 2.502255439758301 }, { "auxiliary_loss_clip": 0.01127101, "auxiliary_loss_mlp": 0.01032932, "balance_loss_clip": 1.01792347, "balance_loss_mlp": 1.04748678, "epoch": 0.4107620622275665, "flos": 26648195356800.0, "grad_norm": 2.164445956125737, "language_loss": 0.691993, "learning_rate": 2.662382718122776e-06, "loss": 0.7135933, "num_input_tokens_seen": 146611995, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.796875, "step": 6832, "time_per_iteration": 2.5111231803894043 }, { "auxiliary_loss_clip": 0.01124643, "auxiliary_loss_mlp": 0.01038403, "balance_loss_clip": 1.02388966, "balance_loss_mlp": 1.04592133, "epoch": 0.41082218548023447, "flos": 18734310138240.0, "grad_norm": 2.2725309908177804, "language_loss": 0.73275781, "learning_rate": 2.662015223696666e-06, "loss": 0.75438821, "num_input_tokens_seen": 146628045, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7890625, "step": 6833, "time_per_iteration": 2.4834859371185303 }, { "auxiliary_loss_clip": 0.01132128, "auxiliary_loss_mlp": 0.01036697, "balance_loss_clip": 1.01974559, "balance_loss_mlp": 1.04843497, "epoch": 0.41088230873290243, "flos": 22893771116160.0, "grad_norm": 2.013680950596463, "language_loss": 0.72428215, "learning_rate": 2.6616477041668713e-06, "loss": 0.74597037, "num_input_tokens_seen": 146648355, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8359375, "step": 6834, "time_per_iteration": 2.502331256866455 }, { "auxiliary_loss_clip": 0.01131248, "auxiliary_loss_mlp": 0.01046562, "balance_loss_clip": 1.03080249, "balance_loss_mlp": 1.04776978, "epoch": 0.4109424319855704, "flos": 24276978000000.0, "grad_norm": 1.9433279552871645, "language_loss": 0.71238339, "learning_rate": 2.661280159547329e-06, "loss": 0.7341615, "num_input_tokens_seen": 146668370, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8359375, "step": 6835, "time_per_iteration": 2.5518791675567627 }, { "auxiliary_loss_clip": 0.01129844, "auxiliary_loss_mlp": 0.0103707, "balance_loss_clip": 1.02065516, "balance_loss_mlp": 1.04845607, "epoch": 0.41100255523823837, "flos": 12969139478400.0, "grad_norm": 2.9329426733757247, "language_loss": 0.87581819, "learning_rate": 2.660912589851978e-06, "loss": 0.89748728, "num_input_tokens_seen": 146686665, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8125, "step": 6836, "time_per_iteration": 2.468038320541382 }, { "auxiliary_loss_clip": 0.01127487, "auxiliary_loss_mlp": 0.01039746, "balance_loss_clip": 1.02404594, "balance_loss_mlp": 1.04847848, "epoch": 0.4110626784909064, "flos": 23145648261120.0, "grad_norm": 6.850456135929455, "language_loss": 0.69553465, "learning_rate": 2.6605449950947547e-06, "loss": 0.71720695, "num_input_tokens_seen": 146706570, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.7890625, "step": 6837, "time_per_iteration": 2.5229291915893555 }, { "auxiliary_loss_clip": 0.01132218, "auxiliary_loss_mlp": 0.0103813, "balance_loss_clip": 1.0218823, "balance_loss_mlp": 1.0499295, "epoch": 0.41112280174357435, "flos": 22747399194240.0, "grad_norm": 1.8503858898644598, "language_loss": 0.75510955, "learning_rate": 2.660177375289599e-06, "loss": 0.77681297, "num_input_tokens_seen": 146723425, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8203125, "step": 6838, "time_per_iteration": 2.4912710189819336 }, { "auxiliary_loss_clip": 0.01128438, "auxiliary_loss_mlp": 0.01033759, "balance_loss_clip": 1.01766562, "balance_loss_mlp": 1.04884088, "epoch": 0.4111829249962423, "flos": 21102403011840.0, "grad_norm": 2.36420006901876, "language_loss": 0.82204783, "learning_rate": 2.659809730450451e-06, "loss": 0.84366989, "num_input_tokens_seen": 146741640, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.796875, "step": 6839, "time_per_iteration": 2.5048062801361084 }, { "auxiliary_loss_clip": 0.01124994, "auxiliary_loss_mlp": 0.01031363, "balance_loss_clip": 1.01636648, "balance_loss_mlp": 1.04585147, "epoch": 0.4112430482489103, "flos": 21505787723520.0, "grad_norm": 2.0285059719079643, "language_loss": 0.80397105, "learning_rate": 2.6594420605912523e-06, "loss": 0.82553458, "num_input_tokens_seen": 146759195, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7890625, "step": 6840, "time_per_iteration": 2.4893198013305664 }, { "auxiliary_loss_clip": 0.01124265, "auxiliary_loss_mlp": 0.01035023, "balance_loss_clip": 1.02034211, "balance_loss_mlp": 1.04642844, "epoch": 0.41130317150157825, "flos": 19570022945280.0, "grad_norm": 3.14474629133896, "language_loss": 0.67724329, "learning_rate": 2.6590743657259442e-06, "loss": 0.69883621, "num_input_tokens_seen": 146774990, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.77734375, "step": 6841, "time_per_iteration": 3.945662498474121 }, { "auxiliary_loss_clip": 0.01057283, "auxiliary_loss_mlp": 0.01009524, "balance_loss_clip": 1.00731909, "balance_loss_mlp": 1.02973318, "epoch": 0.4113632947542462, "flos": 62383157706240.0, "grad_norm": 0.983548500684981, "language_loss": 0.59759617, "learning_rate": 2.65870664586847e-06, "loss": 0.6182642, "num_input_tokens_seen": 146839610, "router_z_loss_clip": 0.02209473, "router_z_loss_mlp": 0.27539062, "step": 6842, "time_per_iteration": 3.1683785915374756 }, { "auxiliary_loss_clip": 0.01125988, "auxiliary_loss_mlp": 0.01033307, "balance_loss_clip": 1.01891875, "balance_loss_mlp": 1.04901409, "epoch": 0.4114234180069142, "flos": 13918617636480.0, "grad_norm": 2.2505738619056452, "language_loss": 0.69820011, "learning_rate": 2.6583389010327742e-06, "loss": 0.71979302, "num_input_tokens_seen": 146857360, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76953125, "step": 6843, "time_per_iteration": 2.4623398780822754 }, { "auxiliary_loss_clip": 0.01057222, "auxiliary_loss_mlp": 0.0100669, "balance_loss_clip": 1.00460362, "balance_loss_mlp": 1.03001332, "epoch": 0.41148354125958214, "flos": 64928505219840.0, "grad_norm": 0.7369373293942993, "language_loss": 0.53650844, "learning_rate": 2.6579711312328013e-06, "loss": 0.55714756, "num_input_tokens_seen": 146917055, "router_z_loss_clip": 0.02087402, "router_z_loss_mlp": 0.27148438, "step": 6844, "time_per_iteration": 4.4798264503479 }, { "auxiliary_loss_clip": 0.01126597, "auxiliary_loss_mlp": 0.01035267, "balance_loss_clip": 1.0206759, "balance_loss_mlp": 1.04794633, "epoch": 0.4115436645122501, "flos": 18728779443840.0, "grad_norm": 1.8300635575851145, "language_loss": 0.65716124, "learning_rate": 2.6576033364824967e-06, "loss": 0.6787799, "num_input_tokens_seen": 146935215, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78515625, "step": 6845, "time_per_iteration": 5.408437013626099 }, { "auxiliary_loss_clip": 0.01127013, "auxiliary_loss_mlp": 0.01033506, "balance_loss_clip": 1.01870048, "balance_loss_mlp": 1.04987192, "epoch": 0.41160378776491807, "flos": 16252918790400.0, "grad_norm": 2.0330714372960252, "language_loss": 0.70139748, "learning_rate": 2.657235516795808e-06, "loss": 0.72300267, "num_input_tokens_seen": 146951970, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.76953125, "step": 6846, "time_per_iteration": 2.4740700721740723 }, { "auxiliary_loss_clip": 0.01125295, "auxiliary_loss_mlp": 0.01038796, "balance_loss_clip": 1.02304804, "balance_loss_mlp": 1.04612279, "epoch": 0.41166391101758604, "flos": 27970031854080.0, "grad_norm": 1.5876331714585903, "language_loss": 0.65042633, "learning_rate": 2.6568676721866826e-06, "loss": 0.67206722, "num_input_tokens_seen": 146975615, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.7890625, "step": 6847, "time_per_iteration": 2.585414171218872 }, { "auxiliary_loss_clip": 0.01125121, "auxiliary_loss_mlp": 0.01036003, "balance_loss_clip": 1.02095914, "balance_loss_mlp": 1.04610062, "epoch": 0.411724034270254, "flos": 34131296764800.0, "grad_norm": 1.6318357189629964, "language_loss": 0.70606595, "learning_rate": 2.656499802669069e-06, "loss": 0.72767723, "num_input_tokens_seen": 146998855, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7890625, "step": 6848, "time_per_iteration": 2.589982509613037 }, { "auxiliary_loss_clip": 0.01054545, "auxiliary_loss_mlp": 0.01003768, "balance_loss_clip": 1.00181329, "balance_loss_mlp": 1.02735591, "epoch": 0.41178415752292197, "flos": 67923670752000.0, "grad_norm": 0.8943949843588317, "language_loss": 0.56261432, "learning_rate": 2.6561319082569174e-06, "loss": 0.58319747, "num_input_tokens_seen": 147062710, "router_z_loss_clip": 0.01953125, "router_z_loss_mlp": 0.27148438, "step": 6849, "time_per_iteration": 3.184873342514038 }, { "auxiliary_loss_clip": 0.01126512, "auxiliary_loss_mlp": 0.01040216, "balance_loss_clip": 1.02525544, "balance_loss_mlp": 1.04914427, "epoch": 0.41184428077558993, "flos": 34313938444800.0, "grad_norm": 1.691595112258273, "language_loss": 0.75935829, "learning_rate": 2.6557639889641783e-06, "loss": 0.78102553, "num_input_tokens_seen": 147086075, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7734375, "step": 6850, "time_per_iteration": 2.6151552200317383 }, { "auxiliary_loss_clip": 0.01124746, "auxiliary_loss_mlp": 0.01034386, "balance_loss_clip": 1.01994991, "balance_loss_mlp": 1.04690111, "epoch": 0.41190440402825795, "flos": 35444118948480.0, "grad_norm": 1.6067530425404524, "language_loss": 0.68222392, "learning_rate": 2.6553960448048025e-06, "loss": 0.70381516, "num_input_tokens_seen": 147107590, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.77734375, "step": 6851, "time_per_iteration": 2.611186981201172 }, { "auxiliary_loss_clip": 0.0113136, "auxiliary_loss_mlp": 0.0104406, "balance_loss_clip": 1.02713227, "balance_loss_mlp": 1.04784703, "epoch": 0.4119645272809259, "flos": 20849879422080.0, "grad_norm": 2.1594892822017067, "language_loss": 0.79211825, "learning_rate": 2.655028075792743e-06, "loss": 0.81387252, "num_input_tokens_seen": 147123715, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8359375, "step": 6852, "time_per_iteration": 2.471613645553589 }, { "auxiliary_loss_clip": 0.01130189, "auxiliary_loss_mlp": 0.01032679, "balance_loss_clip": 1.01655006, "balance_loss_mlp": 1.04733849, "epoch": 0.4120246505335939, "flos": 27562050201600.0, "grad_norm": 2.538599176167766, "language_loss": 0.77579176, "learning_rate": 2.6546600819419537e-06, "loss": 0.79742038, "num_input_tokens_seen": 147144290, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.828125, "step": 6853, "time_per_iteration": 2.519622325897217 }, { "auxiliary_loss_clip": 0.01131686, "auxiliary_loss_mlp": 0.01039704, "balance_loss_clip": 1.02349198, "balance_loss_mlp": 1.04732466, "epoch": 0.41208477378626185, "flos": 37815444046080.0, "grad_norm": 7.111496244281271, "language_loss": 0.65650749, "learning_rate": 2.6542920632663883e-06, "loss": 0.6782214, "num_input_tokens_seen": 147166340, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.84375, "step": 6854, "time_per_iteration": 2.6273417472839355 }, { "auxiliary_loss_clip": 0.01123592, "auxiliary_loss_mlp": 0.01033405, "balance_loss_clip": 1.01922524, "balance_loss_mlp": 1.0455271, "epoch": 0.4121448970389298, "flos": 23440762402560.0, "grad_norm": 1.9939675292005565, "language_loss": 0.83915675, "learning_rate": 2.6539240197800023e-06, "loss": 0.86072671, "num_input_tokens_seen": 147184025, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.78125, "step": 6855, "time_per_iteration": 2.4818103313446045 }, { "auxiliary_loss_clip": 0.01127227, "auxiliary_loss_mlp": 0.01037562, "balance_loss_clip": 1.02361417, "balance_loss_mlp": 1.04877543, "epoch": 0.4122050202915978, "flos": 21325300859520.0, "grad_norm": 2.0109172212620217, "language_loss": 0.78841233, "learning_rate": 2.6535559514967517e-06, "loss": 0.81006026, "num_input_tokens_seen": 147202730, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78515625, "step": 6856, "time_per_iteration": 2.4925482273101807 }, { "auxiliary_loss_clip": 0.01127126, "auxiliary_loss_mlp": 0.01035217, "balance_loss_clip": 1.02090561, "balance_loss_mlp": 1.04758477, "epoch": 0.41226514354426574, "flos": 17306286059520.0, "grad_norm": 2.9108273589180382, "language_loss": 0.80238354, "learning_rate": 2.6531878584305935e-06, "loss": 0.82400703, "num_input_tokens_seen": 147215315, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.796875, "step": 6857, "time_per_iteration": 2.4404044151306152 }, { "auxiliary_loss_clip": 0.0112611, "auxiliary_loss_mlp": 0.01036625, "balance_loss_clip": 1.02145004, "balance_loss_mlp": 1.04488802, "epoch": 0.4123252667969337, "flos": 17638855107840.0, "grad_norm": 1.7996154534764377, "language_loss": 0.70600116, "learning_rate": 2.6528197405954873e-06, "loss": 0.72762847, "num_input_tokens_seen": 147233330, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8125, "step": 6858, "time_per_iteration": 2.484941005706787 }, { "auxiliary_loss_clip": 0.01125478, "auxiliary_loss_mlp": 0.01039866, "balance_loss_clip": 1.02494097, "balance_loss_mlp": 1.04673934, "epoch": 0.4123853900496017, "flos": 46424811375360.0, "grad_norm": 2.213414852663847, "language_loss": 0.59134507, "learning_rate": 2.652451598005391e-06, "loss": 0.6129986, "num_input_tokens_seen": 147257780, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7890625, "step": 6859, "time_per_iteration": 2.692765474319458 }, { "auxiliary_loss_clip": 0.01127197, "auxiliary_loss_mlp": 0.01036525, "balance_loss_clip": 1.02225602, "balance_loss_mlp": 1.04576516, "epoch": 0.41244551330226964, "flos": 17675160779520.0, "grad_norm": 2.3996301307203924, "language_loss": 0.7352128, "learning_rate": 2.652083430674264e-06, "loss": 0.75685, "num_input_tokens_seen": 147276055, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.81640625, "step": 6860, "time_per_iteration": 2.461491823196411 }, { "auxiliary_loss_clip": 0.01122916, "auxiliary_loss_mlp": 0.01030583, "balance_loss_clip": 1.01679111, "balance_loss_mlp": 1.04484546, "epoch": 0.4125056365549376, "flos": 18693730748160.0, "grad_norm": 1.6155680089363504, "language_loss": 0.74181867, "learning_rate": 2.651715238616068e-06, "loss": 0.76335371, "num_input_tokens_seen": 147293200, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.78125, "step": 6861, "time_per_iteration": 2.439451217651367 }, { "auxiliary_loss_clip": 0.01123663, "auxiliary_loss_mlp": 0.01033111, "balance_loss_clip": 1.01959872, "balance_loss_mlp": 1.04630959, "epoch": 0.41256575980760557, "flos": 17895293280000.0, "grad_norm": 2.757037102548061, "language_loss": 0.80035228, "learning_rate": 2.651347021844765e-06, "loss": 0.82192004, "num_input_tokens_seen": 147310640, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7734375, "step": 6862, "time_per_iteration": 2.4537646770477295 }, { "auxiliary_loss_clip": 0.01126281, "auxiliary_loss_mlp": 0.01034543, "balance_loss_clip": 1.02054846, "balance_loss_mlp": 1.04689908, "epoch": 0.41262588306027354, "flos": 21981316901760.0, "grad_norm": 1.9860163420847559, "language_loss": 0.75854528, "learning_rate": 2.650978780374318e-06, "loss": 0.78015351, "num_input_tokens_seen": 147329435, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.79296875, "step": 6863, "time_per_iteration": 2.4752116203308105 }, { "auxiliary_loss_clip": 0.0105395, "auxiliary_loss_mlp": 0.01010481, "balance_loss_clip": 1.00868082, "balance_loss_mlp": 1.02722871, "epoch": 0.41268600631294156, "flos": 53350006740480.0, "grad_norm": 0.7069419993066436, "language_loss": 0.52747333, "learning_rate": 2.650610514218691e-06, "loss": 0.54811776, "num_input_tokens_seen": 147385805, "router_z_loss_clip": 0.01794434, "router_z_loss_mlp": 0.26757812, "step": 6864, "time_per_iteration": 3.102839708328247 }, { "auxiliary_loss_clip": 0.01129869, "auxiliary_loss_mlp": 0.01033147, "balance_loss_clip": 1.01819777, "balance_loss_mlp": 1.04650831, "epoch": 0.4127461295656095, "flos": 24385356311040.0, "grad_norm": 1.8291766303313242, "language_loss": 0.7277869, "learning_rate": 2.6502422233918468e-06, "loss": 0.74941707, "num_input_tokens_seen": 147405160, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8359375, "step": 6865, "time_per_iteration": 2.500577926635742 }, { "auxiliary_loss_clip": 0.010518, "auxiliary_loss_mlp": 0.01006299, "balance_loss_clip": 1.00453508, "balance_loss_mlp": 1.02541614, "epoch": 0.4128062528182775, "flos": 71705242696320.0, "grad_norm": 0.9295767873586442, "language_loss": 0.66573673, "learning_rate": 2.649873907907753e-06, "loss": 0.68631774, "num_input_tokens_seen": 147460245, "router_z_loss_clip": 0.0177002, "router_z_loss_mlp": 0.26367188, "step": 6866, "time_per_iteration": 2.9780635833740234 }, { "auxiliary_loss_clip": 0.01122843, "auxiliary_loss_mlp": 0.01034075, "balance_loss_clip": 1.01984167, "balance_loss_mlp": 1.04421139, "epoch": 0.41286637607094545, "flos": 17849111368320.0, "grad_norm": 2.4240197684271974, "language_loss": 0.81318086, "learning_rate": 2.649505567780375e-06, "loss": 0.83475006, "num_input_tokens_seen": 147476200, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 6867, "time_per_iteration": 2.431774139404297 }, { "auxiliary_loss_clip": 0.01128323, "auxiliary_loss_mlp": 0.01034697, "balance_loss_clip": 1.02032638, "balance_loss_mlp": 1.04672837, "epoch": 0.4129264993236134, "flos": 25549544016000.0, "grad_norm": 3.3522900427973097, "language_loss": 0.77903473, "learning_rate": 2.6491372030236815e-06, "loss": 0.8006649, "num_input_tokens_seen": 147494315, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.81640625, "step": 6868, "time_per_iteration": 2.5061304569244385 }, { "auxiliary_loss_clip": 0.01051068, "auxiliary_loss_mlp": 0.01012646, "balance_loss_clip": 1.01107204, "balance_loss_mlp": 1.02436411, "epoch": 0.4129866225762814, "flos": 65414446364160.0, "grad_norm": 0.8624430411693321, "language_loss": 0.57822037, "learning_rate": 2.64876881365164e-06, "loss": 0.59885746, "num_input_tokens_seen": 147543665, "router_z_loss_clip": 0.01574707, "router_z_loss_mlp": 0.265625, "step": 6869, "time_per_iteration": 2.845895767211914 }, { "auxiliary_loss_clip": 0.01122609, "auxiliary_loss_mlp": 0.0103342, "balance_loss_clip": 1.01943684, "balance_loss_mlp": 1.04534841, "epoch": 0.41304674582894935, "flos": 28876991287680.0, "grad_norm": 1.8579481388323307, "language_loss": 0.75508201, "learning_rate": 2.64840039967822e-06, "loss": 0.77664232, "num_input_tokens_seen": 147564870, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7734375, "step": 6870, "time_per_iteration": 2.546006679534912 }, { "auxiliary_loss_clip": 0.01126652, "auxiliary_loss_mlp": 0.01041744, "balance_loss_clip": 1.02695036, "balance_loss_mlp": 1.04663205, "epoch": 0.4131068690816173, "flos": 22891975436160.0, "grad_norm": 1.5834974998911566, "language_loss": 0.83282232, "learning_rate": 2.6480319611173912e-06, "loss": 0.85450625, "num_input_tokens_seen": 147584840, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80078125, "step": 6871, "time_per_iteration": 2.4803152084350586 }, { "auxiliary_loss_clip": 0.01131304, "auxiliary_loss_mlp": 0.01043325, "balance_loss_clip": 1.02868569, "balance_loss_mlp": 1.0497539, "epoch": 0.4131669923342853, "flos": 26065185707520.0, "grad_norm": 2.037640971397104, "language_loss": 0.67905915, "learning_rate": 2.6476634979831263e-06, "loss": 0.70080549, "num_input_tokens_seen": 147604635, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.81640625, "step": 6872, "time_per_iteration": 2.5517873764038086 }, { "auxiliary_loss_clip": 0.01127908, "auxiliary_loss_mlp": 0.01038168, "balance_loss_clip": 1.02435184, "balance_loss_mlp": 1.04750967, "epoch": 0.41322711558695324, "flos": 19244564789760.0, "grad_norm": 2.127872599886323, "language_loss": 0.75765264, "learning_rate": 2.6472950102893964e-06, "loss": 0.77931345, "num_input_tokens_seen": 147620700, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.8046875, "step": 6873, "time_per_iteration": 2.4417526721954346 }, { "auxiliary_loss_clip": 0.01128867, "auxiliary_loss_mlp": 0.01038795, "balance_loss_clip": 1.02420342, "balance_loss_mlp": 1.04780626, "epoch": 0.4132872388396212, "flos": 22674464628480.0, "grad_norm": 2.263668075553518, "language_loss": 0.8377201, "learning_rate": 2.6469264980501746e-06, "loss": 0.8593967, "num_input_tokens_seen": 147639490, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8125, "step": 6874, "time_per_iteration": 2.4746642112731934 }, { "auxiliary_loss_clip": 0.01126974, "auxiliary_loss_mlp": 0.0103573, "balance_loss_clip": 1.02098393, "balance_loss_mlp": 1.04631829, "epoch": 0.4133473620922892, "flos": 20150195420160.0, "grad_norm": 1.9438511420704787, "language_loss": 0.7155093, "learning_rate": 2.646557961279436e-06, "loss": 0.73713636, "num_input_tokens_seen": 147657205, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8046875, "step": 6875, "time_per_iteration": 2.4712798595428467 }, { "auxiliary_loss_clip": 0.01123352, "auxiliary_loss_mlp": 0.01037763, "balance_loss_clip": 1.02483475, "balance_loss_mlp": 1.04829252, "epoch": 0.41340748534495714, "flos": 24242755317120.0, "grad_norm": 1.8829892425390475, "language_loss": 0.82736218, "learning_rate": 2.646189399991154e-06, "loss": 0.84897333, "num_input_tokens_seen": 147677005, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.75, "step": 6876, "time_per_iteration": 2.5218286514282227 }, { "auxiliary_loss_clip": 0.01128893, "auxiliary_loss_mlp": 0.01040191, "balance_loss_clip": 1.02456319, "balance_loss_mlp": 1.04518378, "epoch": 0.41346760859762516, "flos": 14392171566720.0, "grad_norm": 2.623445530830812, "language_loss": 0.65566671, "learning_rate": 2.6458208141993048e-06, "loss": 0.67735755, "num_input_tokens_seen": 147693435, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 6877, "time_per_iteration": 2.4492528438568115 }, { "auxiliary_loss_clip": 0.01124968, "auxiliary_loss_mlp": 0.01032403, "balance_loss_clip": 1.0183363, "balance_loss_mlp": 1.04633689, "epoch": 0.4135277318502931, "flos": 22492002516480.0, "grad_norm": 1.8543861882628518, "language_loss": 0.76727581, "learning_rate": 2.6454522039178668e-06, "loss": 0.78884953, "num_input_tokens_seen": 147714000, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78515625, "step": 6878, "time_per_iteration": 2.4952611923217773 }, { "auxiliary_loss_clip": 0.01125368, "auxiliary_loss_mlp": 0.01036798, "balance_loss_clip": 1.02277875, "balance_loss_mlp": 1.04682302, "epoch": 0.4135878551029611, "flos": 22418744728320.0, "grad_norm": 10.706198646628899, "language_loss": 0.802127, "learning_rate": 2.6450835691608154e-06, "loss": 0.82374859, "num_input_tokens_seen": 147731010, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78515625, "step": 6879, "time_per_iteration": 2.5604429244995117 }, { "auxiliary_loss_clip": 0.01125813, "auxiliary_loss_mlp": 0.01037202, "balance_loss_clip": 1.02250397, "balance_loss_mlp": 1.04702544, "epoch": 0.41364797835562905, "flos": 27053232094080.0, "grad_norm": 1.8320031118481261, "language_loss": 0.84607667, "learning_rate": 2.6447149099421315e-06, "loss": 0.86770689, "num_input_tokens_seen": 147750880, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 6880, "time_per_iteration": 2.5340075492858887 }, { "auxiliary_loss_clip": 0.01128067, "auxiliary_loss_mlp": 0.01032869, "balance_loss_clip": 1.01823032, "balance_loss_mlp": 1.04736459, "epoch": 0.413708101608297, "flos": 22967603521920.0, "grad_norm": 4.618036155924346, "language_loss": 0.70435441, "learning_rate": 2.6443462262757927e-06, "loss": 0.72596383, "num_input_tokens_seen": 147771360, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80859375, "step": 6881, "time_per_iteration": 2.4781196117401123 }, { "auxiliary_loss_clip": 0.01123643, "auxiliary_loss_mlp": 0.01037206, "balance_loss_clip": 1.02404571, "balance_loss_mlp": 1.04762638, "epoch": 0.413768224860965, "flos": 13333991875200.0, "grad_norm": 1.8775069889119154, "language_loss": 0.81317824, "learning_rate": 2.6439775181757805e-06, "loss": 0.83478671, "num_input_tokens_seen": 147787440, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.76171875, "step": 6882, "time_per_iteration": 2.470297336578369 }, { "auxiliary_loss_clip": 0.01131064, "auxiliary_loss_mlp": 0.01043758, "balance_loss_clip": 1.02630544, "balance_loss_mlp": 1.04933119, "epoch": 0.41382834811363295, "flos": 20813968800000.0, "grad_norm": 2.701250280372556, "language_loss": 0.7002399, "learning_rate": 2.643608785656077e-06, "loss": 0.72198808, "num_input_tokens_seen": 147805720, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.81640625, "step": 6883, "time_per_iteration": 3.9307851791381836 }, { "auxiliary_loss_clip": 0.011261, "auxiliary_loss_mlp": 0.01034953, "balance_loss_clip": 1.02095807, "balance_loss_mlp": 1.0472821, "epoch": 0.4138884713663009, "flos": 20667130001280.0, "grad_norm": 1.911189714412798, "language_loss": 0.75616562, "learning_rate": 2.643240028730663e-06, "loss": 0.77777618, "num_input_tokens_seen": 147824605, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7890625, "step": 6884, "time_per_iteration": 2.514932155609131 }, { "auxiliary_loss_clip": 0.0112672, "auxiliary_loss_mlp": 0.01036871, "balance_loss_clip": 1.02229202, "balance_loss_mlp": 1.0456748, "epoch": 0.4139485946189689, "flos": 29056616225280.0, "grad_norm": 1.6454067675740682, "language_loss": 0.75750458, "learning_rate": 2.642871247413523e-06, "loss": 0.77914047, "num_input_tokens_seen": 147845445, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8125, "step": 6885, "time_per_iteration": 2.5540380477905273 }, { "auxiliary_loss_clip": 0.01128245, "auxiliary_loss_mlp": 0.01033289, "balance_loss_clip": 1.01866221, "balance_loss_mlp": 1.04733801, "epoch": 0.41400871787163684, "flos": 24425720219520.0, "grad_norm": 2.42523353566601, "language_loss": 0.70020151, "learning_rate": 2.6425024417186414e-06, "loss": 0.72181678, "num_input_tokens_seen": 147865580, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80859375, "step": 6886, "time_per_iteration": 5.260590553283691 }, { "auxiliary_loss_clip": 0.01130258, "auxiliary_loss_mlp": 0.01034818, "balance_loss_clip": 1.02090049, "balance_loss_mlp": 1.04952741, "epoch": 0.4140688411243048, "flos": 19464050845440.0, "grad_norm": 1.5639855495220873, "language_loss": 0.7550056, "learning_rate": 2.642133611660002e-06, "loss": 0.77665639, "num_input_tokens_seen": 147885230, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.80859375, "step": 6887, "time_per_iteration": 3.901463747024536 }, { "auxiliary_loss_clip": 0.01126679, "auxiliary_loss_mlp": 0.01029284, "balance_loss_clip": 1.01413226, "balance_loss_mlp": 1.04655612, "epoch": 0.4141289643769728, "flos": 19313656600320.0, "grad_norm": 2.067582346189412, "language_loss": 0.70552158, "learning_rate": 2.641764757251592e-06, "loss": 0.72708118, "num_input_tokens_seen": 147903035, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.80078125, "step": 6888, "time_per_iteration": 2.4392948150634766 }, { "auxiliary_loss_clip": 0.01126445, "auxiliary_loss_mlp": 0.01036257, "balance_loss_clip": 1.02169561, "balance_loss_mlp": 1.04822254, "epoch": 0.41418908762964074, "flos": 16726903683840.0, "grad_norm": 1.8925899907352888, "language_loss": 0.76214498, "learning_rate": 2.6413958785073976e-06, "loss": 0.78377199, "num_input_tokens_seen": 147918745, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 6889, "time_per_iteration": 2.4669296741485596 }, { "auxiliary_loss_clip": 0.01128869, "auxiliary_loss_mlp": 0.01036267, "balance_loss_clip": 1.02226019, "balance_loss_mlp": 1.05039525, "epoch": 0.41424921088230876, "flos": 25296840858240.0, "grad_norm": 1.9316471573883762, "language_loss": 0.80030203, "learning_rate": 2.6410269754414074e-06, "loss": 0.82195342, "num_input_tokens_seen": 147938265, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 6890, "time_per_iteration": 2.4966378211975098 }, { "auxiliary_loss_clip": 0.0112515, "auxiliary_loss_mlp": 0.01036475, "balance_loss_clip": 1.02163386, "balance_loss_mlp": 1.04785728, "epoch": 0.4143093341349767, "flos": 20960520289920.0, "grad_norm": 1.4691849302539448, "language_loss": 0.74117279, "learning_rate": 2.6406580480676113e-06, "loss": 0.76278907, "num_input_tokens_seen": 147957320, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7734375, "step": 6891, "time_per_iteration": 2.476266384124756 }, { "auxiliary_loss_clip": 0.01129991, "auxiliary_loss_mlp": 0.01035373, "balance_loss_clip": 1.01952434, "balance_loss_mlp": 1.04859698, "epoch": 0.4143694573876447, "flos": 22017694400640.0, "grad_norm": 2.453785446511791, "language_loss": 0.84164047, "learning_rate": 2.6402890963999963e-06, "loss": 0.86329412, "num_input_tokens_seen": 147977045, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8125, "step": 6892, "time_per_iteration": 2.6031453609466553 }, { "auxiliary_loss_clip": 0.01126857, "auxiliary_loss_mlp": 0.01030857, "balance_loss_clip": 1.01688576, "balance_loss_mlp": 1.04886341, "epoch": 0.41442958064031266, "flos": 35697396723840.0, "grad_norm": 1.752257558994062, "language_loss": 0.70824724, "learning_rate": 2.6399201204525554e-06, "loss": 0.72982442, "num_input_tokens_seen": 147996905, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78125, "step": 6893, "time_per_iteration": 2.6035234928131104 }, { "auxiliary_loss_clip": 0.01124939, "auxiliary_loss_mlp": 0.0103161, "balance_loss_clip": 1.01747739, "balance_loss_mlp": 1.04715812, "epoch": 0.4144897038929806, "flos": 28293766156800.0, "grad_norm": 1.878520285724688, "language_loss": 0.7281726, "learning_rate": 2.639551120239279e-06, "loss": 0.7497381, "num_input_tokens_seen": 148017875, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.77734375, "step": 6894, "time_per_iteration": 2.5224242210388184 }, { "auxiliary_loss_clip": 0.01128439, "auxiliary_loss_mlp": 0.01030465, "balance_loss_clip": 1.01611245, "balance_loss_mlp": 1.04834867, "epoch": 0.4145498271456486, "flos": 11648093080320.0, "grad_norm": 2.984004671741265, "language_loss": 0.62771469, "learning_rate": 2.63918209577416e-06, "loss": 0.64930379, "num_input_tokens_seen": 148032300, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.80078125, "step": 6895, "time_per_iteration": 2.436945676803589 }, { "auxiliary_loss_clip": 0.01124374, "auxiliary_loss_mlp": 0.01031807, "balance_loss_clip": 1.0179311, "balance_loss_mlp": 1.04663742, "epoch": 0.41460995039831655, "flos": 27235622378880.0, "grad_norm": 1.5501037379738414, "language_loss": 0.70174414, "learning_rate": 2.638813047071192e-06, "loss": 0.72330594, "num_input_tokens_seen": 148053260, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.77734375, "step": 6896, "time_per_iteration": 2.5430729389190674 }, { "auxiliary_loss_clip": 0.01126509, "auxiliary_loss_mlp": 0.01037034, "balance_loss_clip": 1.02145338, "balance_loss_mlp": 1.04554987, "epoch": 0.4146700736509845, "flos": 25922369232000.0, "grad_norm": 1.877283848517641, "language_loss": 0.73327887, "learning_rate": 2.6384439741443696e-06, "loss": 0.75491428, "num_input_tokens_seen": 148072965, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.80859375, "step": 6897, "time_per_iteration": 2.555211067199707 }, { "auxiliary_loss_clip": 0.01126508, "auxiliary_loss_mlp": 0.01039634, "balance_loss_clip": 1.02537048, "balance_loss_mlp": 1.04860485, "epoch": 0.4147301969036525, "flos": 26833243248000.0, "grad_norm": 1.6633246430650808, "language_loss": 0.84840697, "learning_rate": 2.6380748770076873e-06, "loss": 0.87006837, "num_input_tokens_seen": 148093240, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.77734375, "step": 6898, "time_per_iteration": 2.5229833126068115 }, { "auxiliary_loss_clip": 0.01126501, "auxiliary_loss_mlp": 0.01033302, "balance_loss_clip": 1.01871097, "balance_loss_mlp": 1.04577696, "epoch": 0.41479032015632045, "flos": 20298291194880.0, "grad_norm": 1.9366029700477068, "language_loss": 0.75045204, "learning_rate": 2.6377057556751416e-06, "loss": 0.77205002, "num_input_tokens_seen": 148110925, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8046875, "step": 6899, "time_per_iteration": 2.4848105907440186 }, { "auxiliary_loss_clip": 0.01129064, "auxiliary_loss_mlp": 0.01036475, "balance_loss_clip": 1.02052522, "balance_loss_mlp": 1.04488635, "epoch": 0.4148504434089884, "flos": 25264988472960.0, "grad_norm": 2.0182108069771556, "language_loss": 0.7541095, "learning_rate": 2.6373366101607306e-06, "loss": 0.77576494, "num_input_tokens_seen": 148130670, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.84375, "step": 6900, "time_per_iteration": 2.5120370388031006 }, { "auxiliary_loss_clip": 0.01128145, "auxiliary_loss_mlp": 0.01037512, "balance_loss_clip": 1.02245021, "balance_loss_mlp": 1.04794025, "epoch": 0.4149105666616564, "flos": 12822300679680.0, "grad_norm": 2.24080451935149, "language_loss": 0.80062962, "learning_rate": 2.6369674404784503e-06, "loss": 0.82228625, "num_input_tokens_seen": 148148350, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80078125, "step": 6901, "time_per_iteration": 2.4605460166931152 }, { "auxiliary_loss_clip": 0.0112596, "auxiliary_loss_mlp": 0.01033034, "balance_loss_clip": 1.01891363, "balance_loss_mlp": 1.04732144, "epoch": 0.41497068991432434, "flos": 16763891713920.0, "grad_norm": 1.839725137604137, "language_loss": 0.6970529, "learning_rate": 2.6365982466423014e-06, "loss": 0.71864283, "num_input_tokens_seen": 148167550, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7890625, "step": 6902, "time_per_iteration": 2.467075824737549 }, { "auxiliary_loss_clip": 0.01124853, "auxiliary_loss_mlp": 0.01040071, "balance_loss_clip": 1.0256412, "balance_loss_mlp": 1.04730439, "epoch": 0.4150308131669923, "flos": 18000906243840.0, "grad_norm": 1.8785252335714848, "language_loss": 0.83848691, "learning_rate": 2.6362290286662834e-06, "loss": 0.86013615, "num_input_tokens_seen": 148184740, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 6903, "time_per_iteration": 2.4659347534179688 }, { "auxiliary_loss_clip": 0.01131276, "auxiliary_loss_mlp": 0.01041316, "balance_loss_clip": 1.02485299, "balance_loss_mlp": 1.04776394, "epoch": 0.41509093641966033, "flos": 30044770352640.0, "grad_norm": 4.718786298581729, "language_loss": 0.68439633, "learning_rate": 2.6358597865643968e-06, "loss": 0.70612228, "num_input_tokens_seen": 148204605, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8359375, "step": 6904, "time_per_iteration": 2.557526111602783 }, { "auxiliary_loss_clip": 0.01128664, "auxiliary_loss_mlp": 0.0103693, "balance_loss_clip": 1.02210617, "balance_loss_mlp": 1.04639912, "epoch": 0.4151510596723283, "flos": 24279994742400.0, "grad_norm": 2.555848354793458, "language_loss": 0.7758714, "learning_rate": 2.635490520350643e-06, "loss": 0.79752737, "num_input_tokens_seen": 148224675, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.82421875, "step": 6905, "time_per_iteration": 2.515580415725708 }, { "auxiliary_loss_clip": 0.01129162, "auxiliary_loss_mlp": 0.01031718, "balance_loss_clip": 1.01684046, "balance_loss_mlp": 1.04739571, "epoch": 0.41521118292499626, "flos": 23476206147840.0, "grad_norm": 1.853435656741376, "language_loss": 0.68490887, "learning_rate": 2.635121230039025e-06, "loss": 0.7065177, "num_input_tokens_seen": 148243375, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.81640625, "step": 6906, "time_per_iteration": 2.4831864833831787 }, { "auxiliary_loss_clip": 0.01123115, "auxiliary_loss_mlp": 0.01034423, "balance_loss_clip": 1.02013016, "balance_loss_mlp": 1.04502881, "epoch": 0.4152713061776642, "flos": 22125498094080.0, "grad_norm": 2.3463966896563746, "language_loss": 0.67129409, "learning_rate": 2.6347519156435467e-06, "loss": 0.69286954, "num_input_tokens_seen": 148261140, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 6907, "time_per_iteration": 2.492903470993042 }, { "auxiliary_loss_clip": 0.01128237, "auxiliary_loss_mlp": 0.01032966, "balance_loss_clip": 1.0188036, "balance_loss_mlp": 1.04772782, "epoch": 0.4153314294303322, "flos": 21251396626560.0, "grad_norm": 1.8353347366079908, "language_loss": 0.77260989, "learning_rate": 2.6343825771782123e-06, "loss": 0.79422188, "num_input_tokens_seen": 148279655, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.8046875, "step": 6908, "time_per_iteration": 2.4677281379699707 }, { "auxiliary_loss_clip": 0.0105197, "auxiliary_loss_mlp": 0.01017688, "balance_loss_clip": 1.01570892, "balance_loss_mlp": 1.02451921, "epoch": 0.41539155268300015, "flos": 57920681594880.0, "grad_norm": 0.8060022984268861, "language_loss": 0.64851725, "learning_rate": 2.634013214657026e-06, "loss": 0.66921389, "num_input_tokens_seen": 148339005, "router_z_loss_clip": 0.01977539, "router_z_loss_mlp": 0.2734375, "step": 6909, "time_per_iteration": 3.0537166595458984 }, { "auxiliary_loss_clip": 0.01125039, "auxiliary_loss_mlp": 0.01035593, "balance_loss_clip": 1.02143073, "balance_loss_mlp": 1.04589772, "epoch": 0.4154516759356681, "flos": 21903677654400.0, "grad_norm": 1.5900734213312386, "language_loss": 0.87020576, "learning_rate": 2.633643828093996e-06, "loss": 0.89181209, "num_input_tokens_seen": 148358715, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7890625, "step": 6910, "time_per_iteration": 2.556624412536621 }, { "auxiliary_loss_clip": 0.01050291, "auxiliary_loss_mlp": 0.01002982, "balance_loss_clip": 1.00095594, "balance_loss_mlp": 1.02305269, "epoch": 0.4155117991883361, "flos": 67833677226240.0, "grad_norm": 0.8179317253356614, "language_loss": 0.62177998, "learning_rate": 2.633274417503128e-06, "loss": 0.64231271, "num_input_tokens_seen": 148417280, "router_z_loss_clip": 0.02026367, "router_z_loss_mlp": 0.2734375, "step": 6911, "time_per_iteration": 3.0654895305633545 }, { "auxiliary_loss_clip": 0.01132991, "auxiliary_loss_mlp": 0.01038983, "balance_loss_clip": 1.0235455, "balance_loss_mlp": 1.04800642, "epoch": 0.41557192244100405, "flos": 14282679934080.0, "grad_norm": 2.619172751897302, "language_loss": 0.88193488, "learning_rate": 2.6329049828984312e-06, "loss": 0.90365463, "num_input_tokens_seen": 148432610, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8515625, "step": 6912, "time_per_iteration": 2.4393792152404785 }, { "auxiliary_loss_clip": 0.01125777, "auxiliary_loss_mlp": 0.01037127, "balance_loss_clip": 1.02331662, "balance_loss_mlp": 1.0465039, "epoch": 0.415632045693672, "flos": 24461954064000.0, "grad_norm": 2.216116545059145, "language_loss": 0.63604975, "learning_rate": 2.632535524293914e-06, "loss": 0.65767884, "num_input_tokens_seen": 148451510, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.79296875, "step": 6913, "time_per_iteration": 2.5052409172058105 }, { "auxiliary_loss_clip": 0.01122843, "auxiliary_loss_mlp": 0.01033346, "balance_loss_clip": 1.01932645, "balance_loss_mlp": 1.04466367, "epoch": 0.41569216894634, "flos": 20115290378880.0, "grad_norm": 2.80319454044787, "language_loss": 0.75281191, "learning_rate": 2.632166041703586e-06, "loss": 0.77437377, "num_input_tokens_seen": 148469945, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 6914, "time_per_iteration": 2.4722535610198975 }, { "auxiliary_loss_clip": 0.01125996, "auxiliary_loss_mlp": 0.01036781, "balance_loss_clip": 1.0214268, "balance_loss_mlp": 1.04388392, "epoch": 0.41575229219900794, "flos": 23798827128960.0, "grad_norm": 1.9743703099528955, "language_loss": 0.87759382, "learning_rate": 2.631796535141458e-06, "loss": 0.8992216, "num_input_tokens_seen": 148486655, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8203125, "step": 6915, "time_per_iteration": 2.516486167907715 }, { "auxiliary_loss_clip": 0.01127027, "auxiliary_loss_mlp": 0.01038877, "balance_loss_clip": 1.02450061, "balance_loss_mlp": 1.04656243, "epoch": 0.4158124154516759, "flos": 23108229267840.0, "grad_norm": 2.5914908961746024, "language_loss": 0.71491086, "learning_rate": 2.6314270046215426e-06, "loss": 0.73656994, "num_input_tokens_seen": 148505035, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8046875, "step": 6916, "time_per_iteration": 2.5233359336853027 }, { "auxiliary_loss_clip": 0.01130466, "auxiliary_loss_mlp": 0.01033922, "balance_loss_clip": 1.01824629, "balance_loss_mlp": 1.04732096, "epoch": 0.41587253870434393, "flos": 24242970798720.0, "grad_norm": 1.545506429032295, "language_loss": 0.72115588, "learning_rate": 2.631057450157852e-06, "loss": 0.74279976, "num_input_tokens_seen": 148525575, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.83203125, "step": 6917, "time_per_iteration": 2.515082359313965 }, { "auxiliary_loss_clip": 0.01123087, "auxiliary_loss_mlp": 0.01026847, "balance_loss_clip": 1.01225555, "balance_loss_mlp": 1.04338646, "epoch": 0.4159326619570119, "flos": 23881602021120.0, "grad_norm": 1.6514724796298974, "language_loss": 0.81015372, "learning_rate": 2.6306878717643988e-06, "loss": 0.83165312, "num_input_tokens_seen": 148547270, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.796875, "step": 6918, "time_per_iteration": 2.519059419631958 }, { "auxiliary_loss_clip": 0.01129183, "auxiliary_loss_mlp": 0.01034638, "balance_loss_clip": 1.01939094, "balance_loss_mlp": 1.0471046, "epoch": 0.41599278520967986, "flos": 40626531354240.0, "grad_norm": 1.407528578194315, "language_loss": 0.70247698, "learning_rate": 2.6303182694551995e-06, "loss": 0.72411519, "num_input_tokens_seen": 148572100, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 6919, "time_per_iteration": 2.6642420291900635 }, { "auxiliary_loss_clip": 0.0112786, "auxiliary_loss_mlp": 0.01033259, "balance_loss_clip": 1.01757729, "balance_loss_mlp": 1.04682708, "epoch": 0.4160529084623478, "flos": 18222942165120.0, "grad_norm": 2.0193672670025697, "language_loss": 0.81067175, "learning_rate": 2.6299486432442677e-06, "loss": 0.8322829, "num_input_tokens_seen": 148591245, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.80859375, "step": 6920, "time_per_iteration": 2.4582109451293945 }, { "auxiliary_loss_clip": 0.01130935, "auxiliary_loss_mlp": 0.01039376, "balance_loss_clip": 1.02354515, "balance_loss_mlp": 1.04796147, "epoch": 0.4161130317150158, "flos": 13661963982720.0, "grad_norm": 2.52763130942423, "language_loss": 0.66098112, "learning_rate": 2.6295789931456195e-06, "loss": 0.68268424, "num_input_tokens_seen": 148607980, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83203125, "step": 6921, "time_per_iteration": 2.481076717376709 }, { "auxiliary_loss_clip": 0.01127162, "auxiliary_loss_mlp": 0.01035505, "balance_loss_clip": 1.02028227, "balance_loss_mlp": 1.0465709, "epoch": 0.41617315496768376, "flos": 16178511767040.0, "grad_norm": 2.3387213903629647, "language_loss": 0.80607057, "learning_rate": 2.629209319173274e-06, "loss": 0.82769722, "num_input_tokens_seen": 148624490, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8046875, "step": 6922, "time_per_iteration": 2.493274211883545 }, { "auxiliary_loss_clip": 0.01129398, "auxiliary_loss_mlp": 0.0103421, "balance_loss_clip": 1.01896882, "balance_loss_mlp": 1.04618073, "epoch": 0.4162332782203517, "flos": 26213317395840.0, "grad_norm": 1.616534731975177, "language_loss": 0.67698336, "learning_rate": 2.628839621341247e-06, "loss": 0.69861948, "num_input_tokens_seen": 148646490, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.828125, "step": 6923, "time_per_iteration": 2.5491995811462402 }, { "auxiliary_loss_clip": 0.01129843, "auxiliary_loss_mlp": 0.01042879, "balance_loss_clip": 1.02670264, "balance_loss_mlp": 1.04802048, "epoch": 0.4162934014730197, "flos": 28183987215360.0, "grad_norm": 2.2096950287822166, "language_loss": 0.75410211, "learning_rate": 2.6284698996635593e-06, "loss": 0.77582932, "num_input_tokens_seen": 148668580, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8203125, "step": 6924, "time_per_iteration": 2.5490822792053223 }, { "auxiliary_loss_clip": 0.01128016, "auxiliary_loss_mlp": 0.01034468, "balance_loss_clip": 1.02002001, "balance_loss_mlp": 1.04580688, "epoch": 0.41635352472568765, "flos": 19865316654720.0, "grad_norm": 1.8070562096690757, "language_loss": 0.72682619, "learning_rate": 2.62810015415423e-06, "loss": 0.74845099, "num_input_tokens_seen": 148688410, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8203125, "step": 6925, "time_per_iteration": 3.9528541564941406 }, { "auxiliary_loss_clip": 0.01125056, "auxiliary_loss_mlp": 0.01034031, "balance_loss_clip": 1.01943994, "balance_loss_mlp": 1.044186, "epoch": 0.4164136479783556, "flos": 14935356011520.0, "grad_norm": 2.4521874633130203, "language_loss": 0.84277356, "learning_rate": 2.6277303848272792e-06, "loss": 0.8643645, "num_input_tokens_seen": 148704855, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80859375, "step": 6926, "time_per_iteration": 2.4565200805664062 }, { "auxiliary_loss_clip": 0.01123006, "auxiliary_loss_mlp": 0.01035849, "balance_loss_clip": 1.02239645, "balance_loss_mlp": 1.04583526, "epoch": 0.4164737712310236, "flos": 21757593041280.0, "grad_norm": 1.6391131177740634, "language_loss": 0.86369228, "learning_rate": 2.6273605916967302e-06, "loss": 0.88528079, "num_input_tokens_seen": 148723065, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7734375, "step": 6927, "time_per_iteration": 2.4831109046936035 }, { "auxiliary_loss_clip": 0.01126561, "auxiliary_loss_mlp": 0.01040503, "balance_loss_clip": 1.02463651, "balance_loss_mlp": 1.04596817, "epoch": 0.41653389448369155, "flos": 20740136394240.0, "grad_norm": 2.2091426220892254, "language_loss": 0.7241317, "learning_rate": 2.626990774776604e-06, "loss": 0.7458024, "num_input_tokens_seen": 148741780, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8046875, "step": 6928, "time_per_iteration": 5.352672100067139 }, { "auxiliary_loss_clip": 0.01122788, "auxiliary_loss_mlp": 0.01035118, "balance_loss_clip": 1.01999068, "balance_loss_mlp": 1.04344702, "epoch": 0.4165940177363595, "flos": 24972891073920.0, "grad_norm": 2.6247693487899695, "language_loss": 0.77766484, "learning_rate": 2.6266209340809254e-06, "loss": 0.79924393, "num_input_tokens_seen": 148759795, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.79296875, "step": 6929, "time_per_iteration": 2.5190811157226562 }, { "auxiliary_loss_clip": 0.01125211, "auxiliary_loss_mlp": 0.01030647, "balance_loss_clip": 1.01615143, "balance_loss_mlp": 1.04511905, "epoch": 0.41665414098902753, "flos": 20521727746560.0, "grad_norm": 2.111668745100508, "language_loss": 0.70976853, "learning_rate": 2.6262510696237182e-06, "loss": 0.73132712, "num_input_tokens_seen": 148778680, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80078125, "step": 6930, "time_per_iteration": 2.458878517150879 }, { "auxiliary_loss_clip": 0.01125822, "auxiliary_loss_mlp": 0.01036492, "balance_loss_clip": 1.02164471, "balance_loss_mlp": 1.04476881, "epoch": 0.4167142642416955, "flos": 19682926369920.0, "grad_norm": 1.7163802046916798, "language_loss": 0.81259924, "learning_rate": 2.625881181419007e-06, "loss": 0.83422238, "num_input_tokens_seen": 148796470, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8125, "step": 6931, "time_per_iteration": 2.4725992679595947 }, { "auxiliary_loss_clip": 0.0112285, "auxiliary_loss_mlp": 0.01037319, "balance_loss_clip": 1.02207851, "balance_loss_mlp": 1.0442493, "epoch": 0.41677438749436346, "flos": 23763742519680.0, "grad_norm": 2.751435913736887, "language_loss": 0.79126829, "learning_rate": 2.6255112694808193e-06, "loss": 0.81286997, "num_input_tokens_seen": 148815300, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.78515625, "step": 6932, "time_per_iteration": 2.51798677444458 }, { "auxiliary_loss_clip": 0.01124805, "auxiliary_loss_mlp": 0.01039999, "balance_loss_clip": 1.02421546, "balance_loss_mlp": 1.04425001, "epoch": 0.41683451074703143, "flos": 30410053712640.0, "grad_norm": 2.0593350715630807, "language_loss": 0.81424856, "learning_rate": 2.6251413338231813e-06, "loss": 0.83589655, "num_input_tokens_seen": 148834315, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8046875, "step": 6933, "time_per_iteration": 2.610649347305298 }, { "auxiliary_loss_clip": 0.01127458, "auxiliary_loss_mlp": 0.01032364, "balance_loss_clip": 1.01662838, "balance_loss_mlp": 1.04406309, "epoch": 0.4168946339996994, "flos": 21506757390720.0, "grad_norm": 2.8713556900833237, "language_loss": 0.76738024, "learning_rate": 2.624771374460121e-06, "loss": 0.7889784, "num_input_tokens_seen": 148852420, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8359375, "step": 6934, "time_per_iteration": 2.4771265983581543 }, { "auxiliary_loss_clip": 0.01128466, "auxiliary_loss_mlp": 0.01033073, "balance_loss_clip": 1.01793361, "balance_loss_mlp": 1.04687238, "epoch": 0.41695475725236736, "flos": 17638675539840.0, "grad_norm": 2.153872744337356, "language_loss": 0.67187107, "learning_rate": 2.624401391405668e-06, "loss": 0.69348645, "num_input_tokens_seen": 148869305, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.81640625, "step": 6935, "time_per_iteration": 2.452733278274536 }, { "auxiliary_loss_clip": 0.01125873, "auxiliary_loss_mlp": 0.01041538, "balance_loss_clip": 1.02652907, "balance_loss_mlp": 1.04599607, "epoch": 0.4170148805050353, "flos": 15668903560320.0, "grad_norm": 2.6812464428929026, "language_loss": 0.728562, "learning_rate": 2.6240313846738513e-06, "loss": 0.75023615, "num_input_tokens_seen": 148886395, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80078125, "step": 6936, "time_per_iteration": 2.4515364170074463 }, { "auxiliary_loss_clip": 0.01123823, "auxiliary_loss_mlp": 0.01034224, "balance_loss_clip": 1.0201695, "balance_loss_mlp": 1.04437745, "epoch": 0.4170750037577033, "flos": 15159151699200.0, "grad_norm": 2.462748803197623, "language_loss": 0.73543513, "learning_rate": 2.6236613542787024e-06, "loss": 0.75701559, "num_input_tokens_seen": 148905235, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.79296875, "step": 6937, "time_per_iteration": 2.476846933364868 }, { "auxiliary_loss_clip": 0.01123511, "auxiliary_loss_mlp": 0.01038254, "balance_loss_clip": 1.02386546, "balance_loss_mlp": 1.04467785, "epoch": 0.41713512701037125, "flos": 28768289754240.0, "grad_norm": 1.7178005027597227, "language_loss": 0.8438645, "learning_rate": 2.6232913002342518e-06, "loss": 0.86548215, "num_input_tokens_seen": 148928130, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7890625, "step": 6938, "time_per_iteration": 2.5726118087768555 }, { "auxiliary_loss_clip": 0.01129748, "auxiliary_loss_mlp": 0.01037648, "balance_loss_clip": 1.0216496, "balance_loss_mlp": 1.04737222, "epoch": 0.4171952502630392, "flos": 28256993608320.0, "grad_norm": 2.125690650388171, "language_loss": 0.74034661, "learning_rate": 2.6229212225545334e-06, "loss": 0.76202053, "num_input_tokens_seen": 148948790, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.82421875, "step": 6939, "time_per_iteration": 2.5511035919189453 }, { "auxiliary_loss_clip": 0.01126829, "auxiliary_loss_mlp": 0.01033518, "balance_loss_clip": 1.01830041, "balance_loss_mlp": 1.04664505, "epoch": 0.4172553735157072, "flos": 24571697091840.0, "grad_norm": 1.6298483780665154, "language_loss": 0.74948442, "learning_rate": 2.622551121253579e-06, "loss": 0.77108788, "num_input_tokens_seen": 148967690, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80078125, "step": 6940, "time_per_iteration": 2.516249895095825 }, { "auxiliary_loss_clip": 0.01127393, "auxiliary_loss_mlp": 0.01036333, "balance_loss_clip": 1.02196836, "balance_loss_mlp": 1.04685104, "epoch": 0.41731549676837515, "flos": 27045797978880.0, "grad_norm": 1.8211433410331608, "language_loss": 0.71776545, "learning_rate": 2.622180996345424e-06, "loss": 0.73940265, "num_input_tokens_seen": 148987150, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8046875, "step": 6941, "time_per_iteration": 2.5478949546813965 }, { "auxiliary_loss_clip": 0.0112815, "auxiliary_loss_mlp": 0.01039668, "balance_loss_clip": 1.02457607, "balance_loss_mlp": 1.04573727, "epoch": 0.4173756200210431, "flos": 28394063907840.0, "grad_norm": 3.6222797653839987, "language_loss": 0.74103618, "learning_rate": 2.621810847844104e-06, "loss": 0.76271439, "num_input_tokens_seen": 149004895, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.82421875, "step": 6942, "time_per_iteration": 2.5446958541870117 }, { "auxiliary_loss_clip": 0.01132278, "auxiliary_loss_mlp": 0.01037454, "balance_loss_clip": 1.02155173, "balance_loss_mlp": 1.04887199, "epoch": 0.41743574327371114, "flos": 22521556431360.0, "grad_norm": 5.781430658939198, "language_loss": 0.72646165, "learning_rate": 2.6214406757636534e-06, "loss": 0.74815899, "num_input_tokens_seen": 149020970, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83203125, "step": 6943, "time_per_iteration": 2.480379343032837 }, { "auxiliary_loss_clip": 0.01129296, "auxiliary_loss_mlp": 0.01029827, "balance_loss_clip": 1.01417518, "balance_loss_mlp": 1.04682565, "epoch": 0.4174958665263791, "flos": 30113431200000.0, "grad_norm": 2.211455810819741, "language_loss": 0.63734978, "learning_rate": 2.621070480118111e-06, "loss": 0.65894103, "num_input_tokens_seen": 149041795, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.82421875, "step": 6944, "time_per_iteration": 2.542851209640503 }, { "auxiliary_loss_clip": 0.01126469, "auxiliary_loss_mlp": 0.0103337, "balance_loss_clip": 1.01866591, "balance_loss_mlp": 1.04598212, "epoch": 0.41755598977904707, "flos": 25263444188160.0, "grad_norm": 1.5632569140015582, "language_loss": 0.70400083, "learning_rate": 2.620700260921513e-06, "loss": 0.72559923, "num_input_tokens_seen": 149063700, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8046875, "step": 6945, "time_per_iteration": 2.5239691734313965 }, { "auxiliary_loss_clip": 0.01125532, "auxiliary_loss_mlp": 0.01038798, "balance_loss_clip": 1.02216828, "balance_loss_mlp": 1.0448525, "epoch": 0.41761611303171503, "flos": 19828580019840.0, "grad_norm": 1.693128826677835, "language_loss": 0.80565095, "learning_rate": 2.620330018187899e-06, "loss": 0.82729423, "num_input_tokens_seen": 149082410, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8046875, "step": 6946, "time_per_iteration": 2.4660394191741943 }, { "auxiliary_loss_clip": 0.01127067, "auxiliary_loss_mlp": 0.01032637, "balance_loss_clip": 1.01801038, "balance_loss_mlp": 1.04719543, "epoch": 0.417676236284383, "flos": 15523249910400.0, "grad_norm": 2.201342462742078, "language_loss": 0.77295113, "learning_rate": 2.6199597519313086e-06, "loss": 0.79454821, "num_input_tokens_seen": 149098745, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.796875, "step": 6947, "time_per_iteration": 2.46234393119812 }, { "auxiliary_loss_clip": 0.01127895, "auxiliary_loss_mlp": 0.01035104, "balance_loss_clip": 1.01882017, "balance_loss_mlp": 1.04583788, "epoch": 0.41773635953705096, "flos": 32524473761280.0, "grad_norm": 1.8814672016230811, "language_loss": 0.71832669, "learning_rate": 2.6195894621657825e-06, "loss": 0.73995674, "num_input_tokens_seen": 149122255, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8203125, "step": 6948, "time_per_iteration": 2.5802574157714844 }, { "auxiliary_loss_clip": 0.01121643, "auxiliary_loss_mlp": 0.01029772, "balance_loss_clip": 1.01477587, "balance_loss_mlp": 1.0432117, "epoch": 0.4177964827897189, "flos": 23440941970560.0, "grad_norm": 1.5819102485798378, "language_loss": 0.76839197, "learning_rate": 2.619219148905362e-06, "loss": 0.78990608, "num_input_tokens_seen": 149142845, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.78515625, "step": 6949, "time_per_iteration": 2.529111862182617 }, { "auxiliary_loss_clip": 0.01132051, "auxiliary_loss_mlp": 0.0104057, "balance_loss_clip": 1.02516842, "balance_loss_mlp": 1.04929376, "epoch": 0.4178566060423869, "flos": 22748907565440.0, "grad_norm": 1.9965956147903818, "language_loss": 0.81980324, "learning_rate": 2.6188488121640888e-06, "loss": 0.84152949, "num_input_tokens_seen": 149163375, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.828125, "step": 6950, "time_per_iteration": 2.496037721633911 }, { "auxiliary_loss_clip": 0.01122408, "auxiliary_loss_mlp": 0.01032189, "balance_loss_clip": 1.01821208, "balance_loss_mlp": 1.0456022, "epoch": 0.41791672929505486, "flos": 26032794618240.0, "grad_norm": 1.4123242653950727, "language_loss": 0.76135099, "learning_rate": 2.618478451956007e-06, "loss": 0.782897, "num_input_tokens_seen": 149185610, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.765625, "step": 6951, "time_per_iteration": 2.552400588989258 }, { "auxiliary_loss_clip": 0.01130744, "auxiliary_loss_mlp": 0.01035691, "balance_loss_clip": 1.01962137, "balance_loss_mlp": 1.04620814, "epoch": 0.4179768525477228, "flos": 19568694142080.0, "grad_norm": 1.7618761432090448, "language_loss": 0.72972453, "learning_rate": 2.61810806829516e-06, "loss": 0.75138891, "num_input_tokens_seen": 149203990, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84375, "step": 6952, "time_per_iteration": 2.454134225845337 }, { "auxiliary_loss_clip": 0.0112836, "auxiliary_loss_mlp": 0.01030391, "balance_loss_clip": 1.01594317, "balance_loss_mlp": 1.04714596, "epoch": 0.4180369758003908, "flos": 17783826399360.0, "grad_norm": 2.5124221422071322, "language_loss": 0.72232944, "learning_rate": 2.617737661195593e-06, "loss": 0.74391699, "num_input_tokens_seen": 149221385, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8125, "step": 6953, "time_per_iteration": 2.464822292327881 }, { "auxiliary_loss_clip": 0.01125613, "auxiliary_loss_mlp": 0.0103491, "balance_loss_clip": 1.01959157, "balance_loss_mlp": 1.04685056, "epoch": 0.41809709905305875, "flos": 20960663944320.0, "grad_norm": 1.8515856672010405, "language_loss": 0.76142359, "learning_rate": 2.617367230671353e-06, "loss": 0.78302884, "num_input_tokens_seen": 149241175, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.7890625, "step": 6954, "time_per_iteration": 2.4839255809783936 }, { "auxiliary_loss_clip": 0.01126957, "auxiliary_loss_mlp": 0.01040879, "balance_loss_clip": 1.02525115, "balance_loss_mlp": 1.04525352, "epoch": 0.4181572223057267, "flos": 22017622573440.0, "grad_norm": 2.2320735822977253, "language_loss": 0.84246939, "learning_rate": 2.616996776736485e-06, "loss": 0.86414778, "num_input_tokens_seen": 149259115, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.81640625, "step": 6955, "time_per_iteration": 2.4902241230010986 }, { "auxiliary_loss_clip": 0.01123848, "auxiliary_loss_mlp": 0.0103742, "balance_loss_clip": 1.02314496, "balance_loss_mlp": 1.04610157, "epoch": 0.4182173455583947, "flos": 26245528917120.0, "grad_norm": 1.5029200911435567, "language_loss": 0.83049917, "learning_rate": 2.616626299405037e-06, "loss": 0.85211182, "num_input_tokens_seen": 149278705, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.77734375, "step": 6956, "time_per_iteration": 2.516263723373413 }, { "auxiliary_loss_clip": 0.01129407, "auxiliary_loss_mlp": 0.01038298, "balance_loss_clip": 1.02265763, "balance_loss_mlp": 1.04652095, "epoch": 0.4182774688110627, "flos": 14791605782400.0, "grad_norm": 2.2892594441562615, "language_loss": 0.71419358, "learning_rate": 2.616255798691059e-06, "loss": 0.7358706, "num_input_tokens_seen": 149294040, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.828125, "step": 6957, "time_per_iteration": 2.4820592403411865 }, { "auxiliary_loss_clip": 0.01125095, "auxiliary_loss_mlp": 0.01041, "balance_loss_clip": 1.02723694, "balance_loss_mlp": 1.04524851, "epoch": 0.41833759206373067, "flos": 20412020632320.0, "grad_norm": 1.863446579787708, "language_loss": 0.74893856, "learning_rate": 2.6158852746085982e-06, "loss": 0.77059948, "num_input_tokens_seen": 149310385, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.796875, "step": 6958, "time_per_iteration": 2.465280294418335 }, { "auxiliary_loss_clip": 0.01127332, "auxiliary_loss_mlp": 0.01034023, "balance_loss_clip": 1.01872885, "balance_loss_mlp": 1.04667044, "epoch": 0.41839771531639863, "flos": 23656333875840.0, "grad_norm": 1.6282400609596763, "language_loss": 0.76899028, "learning_rate": 2.6155147271717066e-06, "loss": 0.79060382, "num_input_tokens_seen": 149328235, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8046875, "step": 6959, "time_per_iteration": 2.497777223587036 }, { "auxiliary_loss_clip": 0.01126708, "auxiliary_loss_mlp": 0.01038784, "balance_loss_clip": 1.0228101, "balance_loss_mlp": 1.04547918, "epoch": 0.4184578385690666, "flos": 19754137082880.0, "grad_norm": 2.0795780076223855, "language_loss": 0.76745939, "learning_rate": 2.6151441563944347e-06, "loss": 0.78911436, "num_input_tokens_seen": 149347465, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8125, "step": 6960, "time_per_iteration": 2.4833502769470215 }, { "auxiliary_loss_clip": 0.01121894, "auxiliary_loss_mlp": 0.01035548, "balance_loss_clip": 1.02129078, "balance_loss_mlp": 1.04561901, "epoch": 0.41851796182173456, "flos": 20193396503040.0, "grad_norm": 1.9735831654123093, "language_loss": 0.75642669, "learning_rate": 2.614773562290835e-06, "loss": 0.77800113, "num_input_tokens_seen": 149366685, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76171875, "step": 6961, "time_per_iteration": 2.5535991191864014 }, { "auxiliary_loss_clip": 0.01047508, "auxiliary_loss_mlp": 0.01008561, "balance_loss_clip": 1.00658226, "balance_loss_mlp": 1.02095532, "epoch": 0.41857808507440253, "flos": 59018794231680.0, "grad_norm": 0.8602561323103577, "language_loss": 0.54723012, "learning_rate": 2.61440294487496e-06, "loss": 0.56779075, "num_input_tokens_seen": 149422925, "router_z_loss_clip": 0.01977539, "router_z_loss_mlp": 0.265625, "step": 6962, "time_per_iteration": 3.020573616027832 }, { "auxiliary_loss_clip": 0.01129006, "auxiliary_loss_mlp": 0.01040789, "balance_loss_clip": 1.02623999, "balance_loss_mlp": 1.04651272, "epoch": 0.4186382083270705, "flos": 18478805719680.0, "grad_norm": 1.8567192117302531, "language_loss": 0.85293329, "learning_rate": 2.614032304160864e-06, "loss": 0.87463117, "num_input_tokens_seen": 149440820, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.82421875, "step": 6963, "time_per_iteration": 2.45428466796875 }, { "auxiliary_loss_clip": 0.01128222, "auxiliary_loss_mlp": 0.01036913, "balance_loss_clip": 1.02160668, "balance_loss_mlp": 1.04758322, "epoch": 0.41869833157973846, "flos": 21578758202880.0, "grad_norm": 1.736169945969329, "language_loss": 0.7073977, "learning_rate": 2.6136616401626014e-06, "loss": 0.72904909, "num_input_tokens_seen": 149461060, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8046875, "step": 6964, "time_per_iteration": 2.485764741897583 }, { "auxiliary_loss_clip": 0.01126791, "auxiliary_loss_mlp": 0.01038148, "balance_loss_clip": 1.02397394, "balance_loss_mlp": 1.04821277, "epoch": 0.4187584548324064, "flos": 35517412650240.0, "grad_norm": 2.407440800076957, "language_loss": 0.71419901, "learning_rate": 2.6132909528942273e-06, "loss": 0.73584843, "num_input_tokens_seen": 149483115, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.78515625, "step": 6965, "time_per_iteration": 2.607191801071167 }, { "auxiliary_loss_clip": 0.01126417, "auxiliary_loss_mlp": 0.01034621, "balance_loss_clip": 1.02104914, "balance_loss_mlp": 1.04766047, "epoch": 0.4188185780850744, "flos": 18655880791680.0, "grad_norm": 1.5825100359140611, "language_loss": 0.7262578, "learning_rate": 2.6129202423697997e-06, "loss": 0.74786818, "num_input_tokens_seen": 149501495, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7890625, "step": 6966, "time_per_iteration": 3.9760398864746094 }, { "auxiliary_loss_clip": 0.01130555, "auxiliary_loss_mlp": 0.01036903, "balance_loss_clip": 1.02041674, "balance_loss_mlp": 1.04634202, "epoch": 0.41887870133774235, "flos": 40333428374400.0, "grad_norm": 42.34269592585516, "language_loss": 0.71275461, "learning_rate": 2.612549508603375e-06, "loss": 0.73442918, "num_input_tokens_seen": 149523170, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.84375, "step": 6967, "time_per_iteration": 2.6434435844421387 }, { "auxiliary_loss_clip": 0.01047295, "auxiliary_loss_mlp": 0.01004252, "balance_loss_clip": 1.00227308, "balance_loss_mlp": 1.02016735, "epoch": 0.4189388245904103, "flos": 61371336516480.0, "grad_norm": 0.6730311528632175, "language_loss": 0.46224838, "learning_rate": 2.612178751609011e-06, "loss": 0.48276386, "num_input_tokens_seen": 149583955, "router_z_loss_clip": 0.01977539, "router_z_loss_mlp": 0.27148438, "step": 6968, "time_per_iteration": 3.094648599624634 }, { "auxiliary_loss_clip": 0.01130879, "auxiliary_loss_mlp": 0.01039889, "balance_loss_clip": 1.02330673, "balance_loss_mlp": 1.04626894, "epoch": 0.4189989478430783, "flos": 28215624119040.0, "grad_norm": 1.9743976047105023, "language_loss": 0.74597573, "learning_rate": 2.6118079714007685e-06, "loss": 0.76768345, "num_input_tokens_seen": 149604440, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.84765625, "step": 6969, "time_per_iteration": 5.284193515777588 }, { "auxiliary_loss_clip": 0.0112596, "auxiliary_loss_mlp": 0.01041186, "balance_loss_clip": 1.02729249, "balance_loss_mlp": 1.04562855, "epoch": 0.4190590710957463, "flos": 24565879088640.0, "grad_norm": 1.9496513527165933, "language_loss": 0.8074156, "learning_rate": 2.611437167992705e-06, "loss": 0.82908708, "num_input_tokens_seen": 149623745, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.80078125, "step": 6970, "time_per_iteration": 3.94474720954895 }, { "auxiliary_loss_clip": 0.01126137, "auxiliary_loss_mlp": 0.0103806, "balance_loss_clip": 1.02342141, "balance_loss_mlp": 1.04597664, "epoch": 0.41911919434841427, "flos": 21726027964800.0, "grad_norm": 2.0082723143779146, "language_loss": 0.82863748, "learning_rate": 2.6110663413988835e-06, "loss": 0.85027945, "num_input_tokens_seen": 149643025, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80078125, "step": 6971, "time_per_iteration": 2.491881847381592 }, { "auxiliary_loss_clip": 0.01125663, "auxiliary_loss_mlp": 0.01039747, "balance_loss_clip": 1.02396369, "balance_loss_mlp": 1.04672492, "epoch": 0.41917931760108224, "flos": 17601543855360.0, "grad_norm": 2.0439979297234565, "language_loss": 0.74700737, "learning_rate": 2.6106954916333648e-06, "loss": 0.7686615, "num_input_tokens_seen": 149660695, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.7890625, "step": 6972, "time_per_iteration": 2.481940507888794 }, { "auxiliary_loss_clip": 0.0112391, "auxiliary_loss_mlp": 0.01039314, "balance_loss_clip": 1.02503228, "balance_loss_mlp": 1.04394031, "epoch": 0.4192394408537502, "flos": 37816701022080.0, "grad_norm": 2.674604218327967, "language_loss": 0.72999775, "learning_rate": 2.610324618710212e-06, "loss": 0.75163001, "num_input_tokens_seen": 149682040, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.80078125, "step": 6973, "time_per_iteration": 2.631288528442383 }, { "auxiliary_loss_clip": 0.01135652, "auxiliary_loss_mlp": 0.01043427, "balance_loss_clip": 1.02850175, "balance_loss_mlp": 1.04966581, "epoch": 0.41929956410641817, "flos": 23107726477440.0, "grad_norm": 2.49711530982834, "language_loss": 0.745579, "learning_rate": 2.609953722643489e-06, "loss": 0.76736975, "num_input_tokens_seen": 149700855, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.859375, "step": 6974, "time_per_iteration": 2.509768009185791 }, { "auxiliary_loss_clip": 0.01122849, "auxiliary_loss_mlp": 0.01033804, "balance_loss_clip": 1.01896214, "balance_loss_mlp": 1.04351115, "epoch": 0.41935968735908613, "flos": 22524537260160.0, "grad_norm": 1.8406698590219222, "language_loss": 0.72391725, "learning_rate": 2.609582803447259e-06, "loss": 0.74548376, "num_input_tokens_seen": 149717360, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.79296875, "step": 6975, "time_per_iteration": 2.4922056198120117 }, { "auxiliary_loss_clip": 0.01126442, "auxiliary_loss_mlp": 0.01036885, "balance_loss_clip": 1.0213995, "balance_loss_mlp": 1.04695404, "epoch": 0.4194198106117541, "flos": 26870446759680.0, "grad_norm": 1.7152081394273802, "language_loss": 0.80919337, "learning_rate": 2.6092118611355885e-06, "loss": 0.83082664, "num_input_tokens_seen": 149738975, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.79296875, "step": 6976, "time_per_iteration": 2.6159961223602295 }, { "auxiliary_loss_clip": 0.01126527, "auxiliary_loss_mlp": 0.01033689, "balance_loss_clip": 1.01955688, "balance_loss_mlp": 1.04524326, "epoch": 0.41947993386442206, "flos": 19902412425600.0, "grad_norm": 2.3372351755675926, "language_loss": 0.67460102, "learning_rate": 2.6088408957225425e-06, "loss": 0.69620317, "num_input_tokens_seen": 149757055, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.8125, "step": 6977, "time_per_iteration": 2.489176034927368 }, { "auxiliary_loss_clip": 0.0112733, "auxiliary_loss_mlp": 0.01039302, "balance_loss_clip": 1.02468729, "balance_loss_mlp": 1.04579329, "epoch": 0.41954005711709, "flos": 17383889393280.0, "grad_norm": 3.174653175041983, "language_loss": 0.80890667, "learning_rate": 2.6084699072221898e-06, "loss": 0.83057296, "num_input_tokens_seen": 149772885, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.81640625, "step": 6978, "time_per_iteration": 2.5120034217834473 }, { "auxiliary_loss_clip": 0.01127421, "auxiliary_loss_mlp": 0.01034924, "balance_loss_clip": 1.01935542, "balance_loss_mlp": 1.0443567, "epoch": 0.419600180369758, "flos": 25003306915200.0, "grad_norm": 2.223783513754659, "language_loss": 0.82606614, "learning_rate": 2.6080988956485964e-06, "loss": 0.84768951, "num_input_tokens_seen": 149791515, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.83203125, "step": 6979, "time_per_iteration": 2.5123605728149414 }, { "auxiliary_loss_clip": 0.01122764, "auxiliary_loss_mlp": 0.0103352, "balance_loss_clip": 1.0188036, "balance_loss_mlp": 1.04307592, "epoch": 0.41966030362242596, "flos": 17383781652480.0, "grad_norm": 2.2993555289670886, "language_loss": 0.83296078, "learning_rate": 2.6077278610158325e-06, "loss": 0.85452366, "num_input_tokens_seen": 149807250, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.796875, "step": 6980, "time_per_iteration": 2.4743289947509766 }, { "auxiliary_loss_clip": 0.01128651, "auxiliary_loss_mlp": 0.01035028, "balance_loss_clip": 1.02001333, "balance_loss_mlp": 1.04578662, "epoch": 0.4197204268750939, "flos": 22156165330560.0, "grad_norm": 2.435102611220097, "language_loss": 0.78926206, "learning_rate": 2.6073568033379665e-06, "loss": 0.81089884, "num_input_tokens_seen": 149821640, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.828125, "step": 6981, "time_per_iteration": 2.4686062335968018 }, { "auxiliary_loss_clip": 0.01122154, "auxiliary_loss_mlp": 0.01032946, "balance_loss_clip": 1.01837254, "balance_loss_mlp": 1.04411221, "epoch": 0.4197805501277619, "flos": 22084128604800.0, "grad_norm": 1.7299750919864587, "language_loss": 0.84554791, "learning_rate": 2.6069857226290696e-06, "loss": 0.86709893, "num_input_tokens_seen": 149840545, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78125, "step": 6982, "time_per_iteration": 2.509981393814087 }, { "auxiliary_loss_clip": 0.01127001, "auxiliary_loss_mlp": 0.01037315, "balance_loss_clip": 1.02143669, "balance_loss_mlp": 1.0439291, "epoch": 0.4198406733804299, "flos": 26432192920320.0, "grad_norm": 3.780596559950973, "language_loss": 0.56625903, "learning_rate": 2.606614618903214e-06, "loss": 0.58790219, "num_input_tokens_seen": 149860375, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.828125, "step": 6983, "time_per_iteration": 2.5363609790802 }, { "auxiliary_loss_clip": 0.01123863, "auxiliary_loss_mlp": 0.01034745, "balance_loss_clip": 1.02046371, "balance_loss_mlp": 1.04466701, "epoch": 0.4199007966330979, "flos": 12531029293440.0, "grad_norm": 1.9628889786248611, "language_loss": 0.82341552, "learning_rate": 2.606243492174471e-06, "loss": 0.84500164, "num_input_tokens_seen": 149877850, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.79296875, "step": 6984, "time_per_iteration": 2.473661422729492 }, { "auxiliary_loss_clip": 0.01122313, "auxiliary_loss_mlp": 0.01028868, "balance_loss_clip": 1.01441431, "balance_loss_mlp": 1.04257226, "epoch": 0.41996091988576584, "flos": 21762944167680.0, "grad_norm": 4.646031116609103, "language_loss": 0.79004776, "learning_rate": 2.605872342456914e-06, "loss": 0.81155956, "num_input_tokens_seen": 149896110, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.796875, "step": 6985, "time_per_iteration": 2.492772340774536 }, { "auxiliary_loss_clip": 0.0112991, "auxiliary_loss_mlp": 0.01034118, "balance_loss_clip": 1.01863241, "balance_loss_mlp": 1.0440495, "epoch": 0.4200210431384338, "flos": 26541935948160.0, "grad_norm": 2.0051503318520543, "language_loss": 0.7812258, "learning_rate": 2.6055011697646173e-06, "loss": 0.8028661, "num_input_tokens_seen": 149916495, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.859375, "step": 6986, "time_per_iteration": 2.5527098178863525 }, { "auxiliary_loss_clip": 0.01119647, "auxiliary_loss_mlp": 0.01031593, "balance_loss_clip": 1.01778865, "balance_loss_mlp": 1.04246271, "epoch": 0.42008116639110177, "flos": 26795824254720.0, "grad_norm": 2.12365569570895, "language_loss": 0.72234869, "learning_rate": 2.605129974111655e-06, "loss": 0.74386108, "num_input_tokens_seen": 149936445, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7734375, "step": 6987, "time_per_iteration": 2.531765937805176 }, { "auxiliary_loss_clip": 0.01127646, "auxiliary_loss_mlp": 0.01038794, "balance_loss_clip": 1.02351093, "balance_loss_mlp": 1.04610646, "epoch": 0.42014128964376973, "flos": 32087333243520.0, "grad_norm": 1.5018969024037963, "language_loss": 0.74922812, "learning_rate": 2.604758755512104e-06, "loss": 0.7708925, "num_input_tokens_seen": 149959430, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 6988, "time_per_iteration": 2.5946755409240723 }, { "auxiliary_loss_clip": 0.01127932, "auxiliary_loss_mlp": 0.01040642, "balance_loss_clip": 1.02484655, "balance_loss_mlp": 1.04409671, "epoch": 0.4202014128964377, "flos": 26467133875200.0, "grad_norm": 2.3007568871771755, "language_loss": 0.74044335, "learning_rate": 2.60438751398004e-06, "loss": 0.76212907, "num_input_tokens_seen": 149980365, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83984375, "step": 6989, "time_per_iteration": 2.5221376419067383 }, { "auxiliary_loss_clip": 0.01126631, "auxiliary_loss_mlp": 0.01032571, "balance_loss_clip": 1.01710975, "balance_loss_mlp": 1.04425359, "epoch": 0.42026153614910566, "flos": 13401216178560.0, "grad_norm": 2.344635473131938, "language_loss": 0.70875978, "learning_rate": 2.6040162495295404e-06, "loss": 0.73035187, "num_input_tokens_seen": 149997375, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.82421875, "step": 6990, "time_per_iteration": 2.5158615112304688 }, { "auxiliary_loss_clip": 0.01042814, "auxiliary_loss_mlp": 0.01006513, "balance_loss_clip": 1.00465286, "balance_loss_mlp": 1.01599813, "epoch": 0.42032165940177363, "flos": 60250457635200.0, "grad_norm": 0.8286392329848363, "language_loss": 0.60452288, "learning_rate": 2.603644962174685e-06, "loss": 0.62501615, "num_input_tokens_seen": 150051230, "router_z_loss_clip": 0.01855469, "router_z_loss_mlp": 0.26953125, "step": 6991, "time_per_iteration": 3.0161423683166504 }, { "auxiliary_loss_clip": 0.01127879, "auxiliary_loss_mlp": 0.01041018, "balance_loss_clip": 1.02565813, "balance_loss_mlp": 1.0457983, "epoch": 0.4203817826544416, "flos": 24535211852160.0, "grad_norm": 1.7996308752561274, "language_loss": 0.83114183, "learning_rate": 2.6032736519295517e-06, "loss": 0.85283077, "num_input_tokens_seen": 150071135, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8203125, "step": 6992, "time_per_iteration": 2.5301215648651123 }, { "auxiliary_loss_clip": 0.01043544, "auxiliary_loss_mlp": 0.01001707, "balance_loss_clip": 0.99977571, "balance_loss_mlp": 1.01685297, "epoch": 0.42044190590710956, "flos": 58820781530880.0, "grad_norm": 0.8913823125189905, "language_loss": 0.65478361, "learning_rate": 2.6029023188082217e-06, "loss": 0.67523611, "num_input_tokens_seen": 150125220, "router_z_loss_clip": 0.01928711, "router_z_loss_mlp": 0.265625, "step": 6993, "time_per_iteration": 3.0698788166046143 }, { "auxiliary_loss_clip": 0.01131343, "auxiliary_loss_mlp": 0.01035995, "balance_loss_clip": 1.01859093, "balance_loss_mlp": 1.04520237, "epoch": 0.4205020291597775, "flos": 16436063260800.0, "grad_norm": 2.0591740869111654, "language_loss": 0.82694483, "learning_rate": 2.6025309628247746e-06, "loss": 0.84861821, "num_input_tokens_seen": 150142300, "router_z_loss_clip": 0.17382812, "router_z_loss_mlp": 0.859375, "step": 6994, "time_per_iteration": 2.4657392501831055 }, { "auxiliary_loss_clip": 0.01123519, "auxiliary_loss_mlp": 0.01035584, "balance_loss_clip": 1.02152944, "balance_loss_mlp": 1.0447458, "epoch": 0.4205621524124455, "flos": 18405655672320.0, "grad_norm": 1.6016801388027, "language_loss": 0.78340018, "learning_rate": 2.6021595839932934e-06, "loss": 0.80499125, "num_input_tokens_seen": 150161345, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7890625, "step": 6995, "time_per_iteration": 2.496492624282837 }, { "auxiliary_loss_clip": 0.01120617, "auxiliary_loss_mlp": 0.01032559, "balance_loss_clip": 1.01830721, "balance_loss_mlp": 1.04271889, "epoch": 0.4206222756651135, "flos": 25520097841920.0, "grad_norm": 1.5918997303291764, "language_loss": 0.79992831, "learning_rate": 2.60178818232786e-06, "loss": 0.82146007, "num_input_tokens_seen": 150182420, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.77734375, "step": 6996, "time_per_iteration": 2.538706064224243 }, { "auxiliary_loss_clip": 0.01125896, "auxiliary_loss_mlp": 0.0103625, "balance_loss_clip": 1.02164102, "balance_loss_mlp": 1.04420066, "epoch": 0.4206823989177815, "flos": 15304338472320.0, "grad_norm": 4.216042611123766, "language_loss": 0.75382066, "learning_rate": 2.601416757842559e-06, "loss": 0.77544212, "num_input_tokens_seen": 150200175, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.81640625, "step": 6997, "time_per_iteration": 2.482717514038086 }, { "auxiliary_loss_clip": 0.0112272, "auxiliary_loss_mlp": 0.01039277, "balance_loss_clip": 1.02398217, "balance_loss_mlp": 1.04053819, "epoch": 0.42074252217044944, "flos": 15554096714880.0, "grad_norm": 1.867660278318959, "language_loss": 0.75511575, "learning_rate": 2.6010453105514743e-06, "loss": 0.77673578, "num_input_tokens_seen": 150217100, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8203125, "step": 6998, "time_per_iteration": 2.4546446800231934 }, { "auxiliary_loss_clip": 0.01129431, "auxiliary_loss_mlp": 0.01042769, "balance_loss_clip": 1.02704549, "balance_loss_mlp": 1.04581618, "epoch": 0.4208026454231174, "flos": 26145877610880.0, "grad_norm": 1.6802959991509103, "language_loss": 0.76117128, "learning_rate": 2.60067384046869e-06, "loss": 0.7828933, "num_input_tokens_seen": 150239830, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8359375, "step": 6999, "time_per_iteration": 2.5776925086975098 }, { "auxiliary_loss_clip": 0.0112494, "auxiliary_loss_mlp": 0.01042297, "balance_loss_clip": 1.02654982, "balance_loss_mlp": 1.04516268, "epoch": 0.42086276867578537, "flos": 23550110380800.0, "grad_norm": 1.9123282800010901, "language_loss": 0.64377558, "learning_rate": 2.600302347608295e-06, "loss": 0.66544801, "num_input_tokens_seen": 150260690, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.796875, "step": 7000, "time_per_iteration": 2.5235116481781006 }, { "auxiliary_loss_clip": 0.01127485, "auxiliary_loss_mlp": 0.01045663, "balance_loss_clip": 1.03045177, "balance_loss_mlp": 1.0452683, "epoch": 0.42092289192845334, "flos": 18113414618880.0, "grad_norm": 1.6025917613629137, "language_loss": 0.76362443, "learning_rate": 2.5999308319843743e-06, "loss": 0.78535581, "num_input_tokens_seen": 150279885, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 7001, "time_per_iteration": 2.491309404373169 }, { "auxiliary_loss_clip": 0.0112598, "auxiliary_loss_mlp": 0.01038615, "balance_loss_clip": 1.02408981, "balance_loss_mlp": 1.04637766, "epoch": 0.4209830151811213, "flos": 20006588845440.0, "grad_norm": 1.6475935187619548, "language_loss": 0.86330926, "learning_rate": 2.5995592936110154e-06, "loss": 0.88495523, "num_input_tokens_seen": 150297390, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.796875, "step": 7002, "time_per_iteration": 2.4828712940216064 }, { "auxiliary_loss_clip": 0.01124848, "auxiliary_loss_mlp": 0.01041486, "balance_loss_clip": 1.0266149, "balance_loss_mlp": 1.0447464, "epoch": 0.42104313843378927, "flos": 21978946604160.0, "grad_norm": 1.9792797933001554, "language_loss": 0.67533624, "learning_rate": 2.5991877325023096e-06, "loss": 0.69699955, "num_input_tokens_seen": 150317390, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80078125, "step": 7003, "time_per_iteration": 2.499307632446289 }, { "auxiliary_loss_clip": 0.01126459, "auxiliary_loss_mlp": 0.0103869, "balance_loss_clip": 1.02337193, "balance_loss_mlp": 1.04355204, "epoch": 0.42110326168645723, "flos": 25443966965760.0, "grad_norm": 2.1708525380783623, "language_loss": 0.77497011, "learning_rate": 2.598816148672344e-06, "loss": 0.79662156, "num_input_tokens_seen": 150337455, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.828125, "step": 7004, "time_per_iteration": 2.534639596939087 }, { "auxiliary_loss_clip": 0.01123551, "auxiliary_loss_mlp": 0.01041538, "balance_loss_clip": 1.02633929, "balance_loss_mlp": 1.04480624, "epoch": 0.4211633849391252, "flos": 17822574195840.0, "grad_norm": 1.7784635279569978, "language_loss": 0.68371254, "learning_rate": 2.59844454213521e-06, "loss": 0.70536339, "num_input_tokens_seen": 150355385, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.78515625, "step": 7005, "time_per_iteration": 2.481079339981079 }, { "auxiliary_loss_clip": 0.01126625, "auxiliary_loss_mlp": 0.0103573, "balance_loss_clip": 1.02050686, "balance_loss_mlp": 1.04471076, "epoch": 0.42122350819179316, "flos": 16282436791680.0, "grad_norm": 1.9562090621756476, "language_loss": 0.72338045, "learning_rate": 2.5980729129049994e-06, "loss": 0.745004, "num_input_tokens_seen": 150371750, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 7006, "time_per_iteration": 2.4453420639038086 }, { "auxiliary_loss_clip": 0.01126601, "auxiliary_loss_mlp": 0.0103516, "balance_loss_clip": 1.0202291, "balance_loss_mlp": 1.04391253, "epoch": 0.4212836314444611, "flos": 19645866512640.0, "grad_norm": 1.7733142550926317, "language_loss": 0.70584738, "learning_rate": 2.5977012609958033e-06, "loss": 0.72746497, "num_input_tokens_seen": 150389955, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.828125, "step": 7007, "time_per_iteration": 2.475449800491333 }, { "auxiliary_loss_clip": 0.01124715, "auxiliary_loss_mlp": 0.01040006, "balance_loss_clip": 1.02477086, "balance_loss_mlp": 1.0434761, "epoch": 0.4213437546971291, "flos": 18369026778240.0, "grad_norm": 2.1725067056466805, "language_loss": 0.82469368, "learning_rate": 2.5973295864217166e-06, "loss": 0.84634089, "num_input_tokens_seen": 150405780, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 7008, "time_per_iteration": 3.9022116661071777 }, { "auxiliary_loss_clip": 0.01123652, "auxiliary_loss_mlp": 0.01040031, "balance_loss_clip": 1.02468932, "balance_loss_mlp": 1.04219484, "epoch": 0.42140387794979706, "flos": 27704507541120.0, "grad_norm": 1.947044612739869, "language_loss": 0.71830392, "learning_rate": 2.596957889196831e-06, "loss": 0.73994076, "num_input_tokens_seen": 150425615, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8125, "step": 7009, "time_per_iteration": 2.5543999671936035 }, { "auxiliary_loss_clip": 0.01126209, "auxiliary_loss_mlp": 0.01034882, "balance_loss_clip": 1.01896191, "balance_loss_mlp": 1.04348779, "epoch": 0.4214640012024651, "flos": 28147071012480.0, "grad_norm": 2.7627268217547494, "language_loss": 0.66233337, "learning_rate": 2.596586169335243e-06, "loss": 0.68394423, "num_input_tokens_seen": 150445765, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.828125, "step": 7010, "time_per_iteration": 2.54081392288208 }, { "auxiliary_loss_clip": 0.01122289, "auxiliary_loss_mlp": 0.01033183, "balance_loss_clip": 1.01813912, "balance_loss_mlp": 1.04246747, "epoch": 0.42152412445513304, "flos": 22997265177600.0, "grad_norm": 1.7210645388853494, "language_loss": 0.72233564, "learning_rate": 2.5962144268510477e-06, "loss": 0.7438904, "num_input_tokens_seen": 150464405, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.796875, "step": 7011, "time_per_iteration": 6.729538202285767 }, { "auxiliary_loss_clip": 0.01044611, "auxiliary_loss_mlp": 0.01005157, "balance_loss_clip": 1.00314188, "balance_loss_mlp": 1.01746178, "epoch": 0.421584247707801, "flos": 63749592938880.0, "grad_norm": 0.8003874195120433, "language_loss": 0.54369485, "learning_rate": 2.5958426617583417e-06, "loss": 0.56419253, "num_input_tokens_seen": 150520430, "router_z_loss_clip": 0.0201416, "router_z_loss_mlp": 0.27148438, "step": 7012, "time_per_iteration": 2.9960885047912598 }, { "auxiliary_loss_clip": 0.01126904, "auxiliary_loss_mlp": 0.01035654, "balance_loss_clip": 1.01997805, "balance_loss_mlp": 1.0448848, "epoch": 0.421644370960469, "flos": 24314612474880.0, "grad_norm": 2.4300113070985487, "language_loss": 0.78885031, "learning_rate": 2.5954708740712215e-06, "loss": 0.81047589, "num_input_tokens_seen": 150542610, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8203125, "step": 7013, "time_per_iteration": 2.5399625301361084 }, { "auxiliary_loss_clip": 0.01124666, "auxiliary_loss_mlp": 0.01038471, "balance_loss_clip": 1.02262843, "balance_loss_mlp": 1.04206872, "epoch": 0.42170449421313694, "flos": 23440690575360.0, "grad_norm": 2.175511808075272, "language_loss": 0.81128532, "learning_rate": 2.595099063803787e-06, "loss": 0.83291668, "num_input_tokens_seen": 150560970, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.828125, "step": 7014, "time_per_iteration": 2.502634048461914 }, { "auxiliary_loss_clip": 0.01127221, "auxiliary_loss_mlp": 0.01036026, "balance_loss_clip": 1.02082646, "balance_loss_mlp": 1.04430246, "epoch": 0.4217646174658049, "flos": 23695476721920.0, "grad_norm": 1.6420701443101702, "language_loss": 0.77557278, "learning_rate": 2.5947272309701354e-06, "loss": 0.79720521, "num_input_tokens_seen": 150582615, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.828125, "step": 7015, "time_per_iteration": 2.575258255004883 }, { "auxiliary_loss_clip": 0.01127549, "auxiliary_loss_mlp": 0.01039766, "balance_loss_clip": 1.02339888, "balance_loss_mlp": 1.04503644, "epoch": 0.42182474071847287, "flos": 24971562270720.0, "grad_norm": 1.6497609747429927, "language_loss": 0.82207108, "learning_rate": 2.594355375584368e-06, "loss": 0.84374428, "num_input_tokens_seen": 150603640, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.82421875, "step": 7016, "time_per_iteration": 2.5291435718536377 }, { "auxiliary_loss_clip": 0.0112787, "auxiliary_loss_mlp": 0.01033995, "balance_loss_clip": 1.0185101, "balance_loss_mlp": 1.04451787, "epoch": 0.42188486397114083, "flos": 22856639431680.0, "grad_norm": 2.1563859878884872, "language_loss": 0.68436682, "learning_rate": 2.593983497660586e-06, "loss": 0.70598555, "num_input_tokens_seen": 150622490, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8359375, "step": 7017, "time_per_iteration": 2.511596441268921 }, { "auxiliary_loss_clip": 0.01043239, "auxiliary_loss_mlp": 0.0100171, "balance_loss_clip": 0.99987453, "balance_loss_mlp": 1.01609635, "epoch": 0.4219449872238088, "flos": 66975700965120.0, "grad_norm": 0.6813194299047504, "language_loss": 0.59425437, "learning_rate": 2.5936115972128895e-06, "loss": 0.61470389, "num_input_tokens_seen": 150689545, "router_z_loss_clip": 0.01831055, "router_z_loss_mlp": 0.27148438, "step": 7018, "time_per_iteration": 3.159440517425537 }, { "auxiliary_loss_clip": 0.01128161, "auxiliary_loss_mlp": 0.01038966, "balance_loss_clip": 1.02339685, "balance_loss_mlp": 1.0441606, "epoch": 0.42200511047647676, "flos": 13115367745920.0, "grad_norm": 2.581961981174285, "language_loss": 0.76104951, "learning_rate": 2.593239674255382e-06, "loss": 0.7827208, "num_input_tokens_seen": 150707610, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.83984375, "step": 7019, "time_per_iteration": 2.467245101928711 }, { "auxiliary_loss_clip": 0.01124915, "auxiliary_loss_mlp": 0.01035651, "balance_loss_clip": 1.01964092, "balance_loss_mlp": 1.04336286, "epoch": 0.42206523372914473, "flos": 13991193066240.0, "grad_norm": 2.3730810338468116, "language_loss": 0.69074988, "learning_rate": 2.592867728802166e-06, "loss": 0.71235549, "num_input_tokens_seen": 150724530, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.81640625, "step": 7020, "time_per_iteration": 2.4708187580108643 }, { "auxiliary_loss_clip": 0.01121274, "auxiliary_loss_mlp": 0.01031337, "balance_loss_clip": 1.01756787, "balance_loss_mlp": 1.04406643, "epoch": 0.4221253569818127, "flos": 21942317710080.0, "grad_norm": 1.6764478845677027, "language_loss": 0.80622137, "learning_rate": 2.592495760867347e-06, "loss": 0.82774752, "num_input_tokens_seen": 150742870, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7734375, "step": 7021, "time_per_iteration": 2.504866600036621 }, { "auxiliary_loss_clip": 0.01128905, "auxiliary_loss_mlp": 0.01032876, "balance_loss_clip": 1.01740217, "balance_loss_mlp": 1.04628181, "epoch": 0.42218548023448066, "flos": 32192587071360.0, "grad_norm": 1.71560762122898, "language_loss": 0.70047164, "learning_rate": 2.5921237704650293e-06, "loss": 0.72208947, "num_input_tokens_seen": 150765500, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.82421875, "step": 7022, "time_per_iteration": 2.574688673019409 }, { "auxiliary_loss_clip": 0.0112121, "auxiliary_loss_mlp": 0.01028123, "balance_loss_clip": 1.01502228, "balance_loss_mlp": 1.04535675, "epoch": 0.4222456034871487, "flos": 30118961894400.0, "grad_norm": 2.0068620865616276, "language_loss": 0.67503542, "learning_rate": 2.5917517576093188e-06, "loss": 0.69652879, "num_input_tokens_seen": 150784945, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7578125, "step": 7023, "time_per_iteration": 2.562000036239624 }, { "auxiliary_loss_clip": 0.01124288, "auxiliary_loss_mlp": 0.01038238, "balance_loss_clip": 1.02286005, "balance_loss_mlp": 1.04579711, "epoch": 0.42230572673981664, "flos": 22127904305280.0, "grad_norm": 1.59753797344058, "language_loss": 0.6938256, "learning_rate": 2.591379722314322e-06, "loss": 0.71545088, "num_input_tokens_seen": 150803120, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.78515625, "step": 7024, "time_per_iteration": 2.5065579414367676 }, { "auxiliary_loss_clip": 0.01127192, "auxiliary_loss_mlp": 0.01034801, "balance_loss_clip": 1.01973271, "balance_loss_mlp": 1.04579759, "epoch": 0.4223658499924846, "flos": 22055077480320.0, "grad_norm": 1.967875466945393, "language_loss": 0.76740038, "learning_rate": 2.591007664594147e-06, "loss": 0.7890203, "num_input_tokens_seen": 150823135, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8125, "step": 7025, "time_per_iteration": 2.529466390609741 }, { "auxiliary_loss_clip": 0.01121036, "auxiliary_loss_mlp": 0.01035567, "balance_loss_clip": 1.02073133, "balance_loss_mlp": 1.04262471, "epoch": 0.4224259732451526, "flos": 20410727742720.0, "grad_norm": 2.1817823769035454, "language_loss": 0.7979008, "learning_rate": 2.5906355844629024e-06, "loss": 0.81946683, "num_input_tokens_seen": 150842070, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78125, "step": 7026, "time_per_iteration": 2.4925944805145264 }, { "auxiliary_loss_clip": 0.01041733, "auxiliary_loss_mlp": 0.01003635, "balance_loss_clip": 1.00171566, "balance_loss_mlp": 1.01467967, "epoch": 0.42248609649782054, "flos": 62846655828480.0, "grad_norm": 0.7522486479728429, "language_loss": 0.61997849, "learning_rate": 2.5902634819346966e-06, "loss": 0.64043218, "num_input_tokens_seen": 150907450, "router_z_loss_clip": 0.01916504, "router_z_loss_mlp": 0.26953125, "step": 7027, "time_per_iteration": 3.2018301486968994 }, { "auxiliary_loss_clip": 0.0112446, "auxiliary_loss_mlp": 0.01036503, "balance_loss_clip": 1.02180481, "balance_loss_mlp": 1.04418623, "epoch": 0.4225462197504885, "flos": 26249946289920.0, "grad_norm": 2.005606651262289, "language_loss": 0.71146339, "learning_rate": 2.5898913570236414e-06, "loss": 0.733073, "num_input_tokens_seen": 150928040, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8046875, "step": 7028, "time_per_iteration": 2.553652763366699 }, { "auxiliary_loss_clip": 0.01126726, "auxiliary_loss_mlp": 0.010413, "balance_loss_clip": 1.02595782, "balance_loss_mlp": 1.04386163, "epoch": 0.42260634300315647, "flos": 20521943228160.0, "grad_norm": 2.4209383447439277, "language_loss": 0.82479101, "learning_rate": 2.589519209743846e-06, "loss": 0.84647131, "num_input_tokens_seen": 150945760, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.828125, "step": 7029, "time_per_iteration": 2.482553720474243 }, { "auxiliary_loss_clip": 0.01129834, "auxiliary_loss_mlp": 0.01040135, "balance_loss_clip": 1.02430439, "balance_loss_mlp": 1.04599023, "epoch": 0.42266646625582444, "flos": 24316731377280.0, "grad_norm": 1.8640981153370992, "language_loss": 0.75126243, "learning_rate": 2.589147040109424e-06, "loss": 0.77296209, "num_input_tokens_seen": 150965665, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83984375, "step": 7030, "time_per_iteration": 2.5284311771392822 }, { "auxiliary_loss_clip": 0.01125281, "auxiliary_loss_mlp": 0.01035601, "balance_loss_clip": 1.01869702, "balance_loss_mlp": 1.04387772, "epoch": 0.4227265895084924, "flos": 24204151175040.0, "grad_norm": 2.023422404287624, "language_loss": 0.86489713, "learning_rate": 2.588774848134486e-06, "loss": 0.88650596, "num_input_tokens_seen": 150982260, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8125, "step": 7031, "time_per_iteration": 2.5190091133117676 }, { "auxiliary_loss_clip": 0.01127007, "auxiliary_loss_mlp": 0.01038091, "balance_loss_clip": 1.02136612, "balance_loss_mlp": 1.04526496, "epoch": 0.42278671276116037, "flos": 16909760845440.0, "grad_norm": 1.8620612434016504, "language_loss": 0.73332536, "learning_rate": 2.5884026338331473e-06, "loss": 0.75497639, "num_input_tokens_seen": 150999990, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.81640625, "step": 7032, "time_per_iteration": 2.4923949241638184 }, { "auxiliary_loss_clip": 0.01125929, "auxiliary_loss_mlp": 0.01043758, "balance_loss_clip": 1.02818918, "balance_loss_mlp": 1.04250717, "epoch": 0.42284683601382833, "flos": 25411073086080.0, "grad_norm": 1.6956089646486696, "language_loss": 0.70592284, "learning_rate": 2.5880303972195222e-06, "loss": 0.72761977, "num_input_tokens_seen": 151021105, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8359375, "step": 7033, "time_per_iteration": 2.5324959754943848 }, { "auxiliary_loss_clip": 0.01125258, "auxiliary_loss_mlp": 0.01032383, "balance_loss_clip": 1.01683831, "balance_loss_mlp": 1.04284811, "epoch": 0.4229069592664963, "flos": 23040322606080.0, "grad_norm": 2.0789028335135598, "language_loss": 0.90637404, "learning_rate": 2.5876581383077256e-06, "loss": 0.92795038, "num_input_tokens_seen": 151040665, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.82421875, "step": 7034, "time_per_iteration": 2.517038345336914 }, { "auxiliary_loss_clip": 0.01122554, "auxiliary_loss_mlp": 0.01038215, "balance_loss_clip": 1.02368903, "balance_loss_mlp": 1.04207361, "epoch": 0.42296708251916426, "flos": 26067448264320.0, "grad_norm": 1.6178378488317235, "language_loss": 0.77084684, "learning_rate": 2.5872858571118723e-06, "loss": 0.79245448, "num_input_tokens_seen": 151061240, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8046875, "step": 7035, "time_per_iteration": 2.5355734825134277 }, { "auxiliary_loss_clip": 0.01127746, "auxiliary_loss_mlp": 0.01043449, "balance_loss_clip": 1.02768981, "balance_loss_mlp": 1.0449059, "epoch": 0.4230272057718323, "flos": 19458376496640.0, "grad_norm": 1.818959175067572, "language_loss": 0.82325774, "learning_rate": 2.5869135536460817e-06, "loss": 0.84496969, "num_input_tokens_seen": 151076870, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.828125, "step": 7036, "time_per_iteration": 2.4859883785247803 }, { "auxiliary_loss_clip": 0.0112248, "auxiliary_loss_mlp": 0.01033551, "balance_loss_clip": 1.01832163, "balance_loss_mlp": 1.04432201, "epoch": 0.42308732902450025, "flos": 22383300983040.0, "grad_norm": 1.5844618948975913, "language_loss": 0.70538163, "learning_rate": 2.58654122792447e-06, "loss": 0.72694194, "num_input_tokens_seen": 151095110, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.78125, "step": 7037, "time_per_iteration": 2.5093488693237305 }, { "auxiliary_loss_clip": 0.01124092, "auxiliary_loss_mlp": 0.01035644, "balance_loss_clip": 1.02024794, "balance_loss_mlp": 1.04293919, "epoch": 0.4231474522771682, "flos": 20995425331200.0, "grad_norm": 1.635621568024174, "language_loss": 0.77818203, "learning_rate": 2.586168879961155e-06, "loss": 0.79977942, "num_input_tokens_seen": 151114355, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8125, "step": 7038, "time_per_iteration": 2.512110710144043 }, { "auxiliary_loss_clip": 0.0112971, "auxiliary_loss_mlp": 0.01047451, "balance_loss_clip": 1.03113151, "balance_loss_mlp": 1.04346573, "epoch": 0.4232075755298362, "flos": 14975863574400.0, "grad_norm": 2.5452942757654164, "language_loss": 0.67445987, "learning_rate": 2.585796509770259e-06, "loss": 0.69623148, "num_input_tokens_seen": 151131505, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.86328125, "step": 7039, "time_per_iteration": 2.451965093612671 }, { "auxiliary_loss_clip": 0.01129976, "auxiliary_loss_mlp": 0.01038401, "balance_loss_clip": 1.02226019, "balance_loss_mlp": 1.04470515, "epoch": 0.42326769878250414, "flos": 24532661986560.0, "grad_norm": 1.8668758433952575, "language_loss": 0.75752258, "learning_rate": 2.5854241173658996e-06, "loss": 0.7792064, "num_input_tokens_seen": 151151555, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.8515625, "step": 7040, "time_per_iteration": 2.5662384033203125 }, { "auxiliary_loss_clip": 0.01124673, "auxiliary_loss_mlp": 0.0103055, "balance_loss_clip": 1.01561272, "balance_loss_mlp": 1.04296637, "epoch": 0.4233278220351721, "flos": 26870303105280.0, "grad_norm": 1.733600938109935, "language_loss": 0.65794903, "learning_rate": 2.5850517027621996e-06, "loss": 0.6795013, "num_input_tokens_seen": 151172385, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.81640625, "step": 7041, "time_per_iteration": 2.5338973999023438 }, { "auxiliary_loss_clip": 0.01127404, "auxiliary_loss_mlp": 0.01033375, "balance_loss_clip": 1.01773477, "balance_loss_mlp": 1.04337549, "epoch": 0.4233879452878401, "flos": 42814927463040.0, "grad_norm": 1.8174546004357703, "language_loss": 0.74084753, "learning_rate": 2.5846792659732803e-06, "loss": 0.76245534, "num_input_tokens_seen": 151194930, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.83984375, "step": 7042, "time_per_iteration": 2.6891489028930664 }, { "auxiliary_loss_clip": 0.01121968, "auxiliary_loss_mlp": 0.01031505, "balance_loss_clip": 1.01693773, "balance_loss_mlp": 1.04330957, "epoch": 0.42344806854050804, "flos": 25229006023680.0, "grad_norm": 1.2946851277844893, "language_loss": 0.82353121, "learning_rate": 2.5843068070132643e-06, "loss": 0.84506595, "num_input_tokens_seen": 151217905, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78515625, "step": 7043, "time_per_iteration": 2.54567813873291 }, { "auxiliary_loss_clip": 0.01126563, "auxiliary_loss_mlp": 0.01039782, "balance_loss_clip": 1.02362275, "balance_loss_mlp": 1.04632902, "epoch": 0.423508191793176, "flos": 22778820616320.0, "grad_norm": 2.48187575576318, "language_loss": 0.64896202, "learning_rate": 2.5839343258962763e-06, "loss": 0.67062545, "num_input_tokens_seen": 151234580, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.80078125, "step": 7044, "time_per_iteration": 2.503478527069092 }, { "auxiliary_loss_clip": 0.01130936, "auxiliary_loss_mlp": 0.01045873, "balance_loss_clip": 1.0287782, "balance_loss_mlp": 1.04685092, "epoch": 0.42356831504584397, "flos": 34637493179520.0, "grad_norm": 1.8518028095951413, "language_loss": 0.75421643, "learning_rate": 2.5835618226364393e-06, "loss": 0.77598453, "num_input_tokens_seen": 151254765, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.83984375, "step": 7045, "time_per_iteration": 2.6097419261932373 }, { "auxiliary_loss_clip": 0.01123517, "auxiliary_loss_mlp": 0.01036559, "balance_loss_clip": 1.02108002, "balance_loss_mlp": 1.04439545, "epoch": 0.42362843829851193, "flos": 17596767346560.0, "grad_norm": 2.1312048906171843, "language_loss": 0.81002104, "learning_rate": 2.5831892972478797e-06, "loss": 0.83162177, "num_input_tokens_seen": 151269045, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.79296875, "step": 7046, "time_per_iteration": 2.4850122928619385 }, { "auxiliary_loss_clip": 0.01128257, "auxiliary_loss_mlp": 0.01033749, "balance_loss_clip": 1.01789415, "balance_loss_mlp": 1.04486144, "epoch": 0.4236885615511799, "flos": 22565691267840.0, "grad_norm": 1.9160601581767165, "language_loss": 0.76568663, "learning_rate": 2.5828167497447242e-06, "loss": 0.78730667, "num_input_tokens_seen": 151287530, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83203125, "step": 7047, "time_per_iteration": 2.512028932571411 }, { "auxiliary_loss_clip": 0.01124775, "auxiliary_loss_mlp": 0.01032338, "balance_loss_clip": 1.01771104, "balance_loss_mlp": 1.04563081, "epoch": 0.42374868480384786, "flos": 26469216864000.0, "grad_norm": 1.8330750594050227, "language_loss": 0.67793727, "learning_rate": 2.582444180141098e-06, "loss": 0.69950831, "num_input_tokens_seen": 151308905, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 7048, "time_per_iteration": 2.5684947967529297 }, { "auxiliary_loss_clip": 0.01127904, "auxiliary_loss_mlp": 0.01036836, "balance_loss_clip": 1.02021873, "balance_loss_mlp": 1.04597878, "epoch": 0.4238088080565159, "flos": 20370220179840.0, "grad_norm": 1.876392553681037, "language_loss": 0.77967083, "learning_rate": 2.5820715884511307e-06, "loss": 0.80131829, "num_input_tokens_seen": 151326525, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.8203125, "step": 7049, "time_per_iteration": 3.9667632579803467 }, { "auxiliary_loss_clip": 0.01131404, "auxiliary_loss_mlp": 0.01040213, "balance_loss_clip": 1.0248704, "balance_loss_mlp": 1.0479393, "epoch": 0.42386893130918385, "flos": 21172105353600.0, "grad_norm": 1.8632779205904493, "language_loss": 0.82566631, "learning_rate": 2.5816989746889504e-06, "loss": 0.84738243, "num_input_tokens_seen": 151344675, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8359375, "step": 7050, "time_per_iteration": 2.5348708629608154 }, { "auxiliary_loss_clip": 0.01125798, "auxiliary_loss_mlp": 0.01034579, "balance_loss_clip": 1.01914179, "balance_loss_mlp": 1.0430851, "epoch": 0.4239290545618518, "flos": 17675627656320.0, "grad_norm": 3.9488548310681555, "language_loss": 0.73170108, "learning_rate": 2.581326338868687e-06, "loss": 0.75330484, "num_input_tokens_seen": 151360730, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.828125, "step": 7051, "time_per_iteration": 2.491358995437622 }, { "auxiliary_loss_clip": 0.01126765, "auxiliary_loss_mlp": 0.01038541, "balance_loss_clip": 1.02348483, "balance_loss_mlp": 1.04559982, "epoch": 0.4239891778145198, "flos": 24314504734080.0, "grad_norm": 1.5034758383938291, "language_loss": 0.86624336, "learning_rate": 2.5809536810044706e-06, "loss": 0.88789642, "num_input_tokens_seen": 151380445, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8125, "step": 7052, "time_per_iteration": 3.90470552444458 }, { "auxiliary_loss_clip": 0.01125794, "auxiliary_loss_mlp": 0.01040103, "balance_loss_clip": 1.02442694, "balance_loss_mlp": 1.04348218, "epoch": 0.42404930106718774, "flos": 20558428467840.0, "grad_norm": 1.4704277136809474, "language_loss": 0.7252357, "learning_rate": 2.5805810011104323e-06, "loss": 0.74689466, "num_input_tokens_seen": 151399325, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8203125, "step": 7053, "time_per_iteration": 3.943162202835083 }, { "auxiliary_loss_clip": 0.01125866, "auxiliary_loss_mlp": 0.01034252, "balance_loss_clip": 1.01927924, "balance_loss_mlp": 1.04594612, "epoch": 0.4241094243198557, "flos": 22308067946880.0, "grad_norm": 1.6267864528332674, "language_loss": 0.82401067, "learning_rate": 2.580208299200704e-06, "loss": 0.84561187, "num_input_tokens_seen": 151417240, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.80078125, "step": 7054, "time_per_iteration": 2.5410587787628174 }, { "auxiliary_loss_clip": 0.01044817, "auxiliary_loss_mlp": 0.01002271, "balance_loss_clip": 1.00051916, "balance_loss_mlp": 1.01801741, "epoch": 0.4241695475725237, "flos": 70612445272320.0, "grad_norm": 0.791957937895534, "language_loss": 0.6038419, "learning_rate": 2.5798355752894183e-06, "loss": 0.62431276, "num_input_tokens_seen": 151476015, "router_z_loss_clip": 0.01757812, "router_z_loss_mlp": 0.26757812, "step": 7055, "time_per_iteration": 3.055528402328491 }, { "auxiliary_loss_clip": 0.01129295, "auxiliary_loss_mlp": 0.01043115, "balance_loss_clip": 1.02673531, "balance_loss_mlp": 1.04602039, "epoch": 0.42422967082519164, "flos": 14027462824320.0, "grad_norm": 3.397880639940753, "language_loss": 0.77372688, "learning_rate": 2.5794628293907107e-06, "loss": 0.79545099, "num_input_tokens_seen": 151492035, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.83203125, "step": 7056, "time_per_iteration": 2.521480083465576 }, { "auxiliary_loss_clip": 0.0112942, "auxiliary_loss_mlp": 0.01039654, "balance_loss_clip": 1.02270877, "balance_loss_mlp": 1.04417121, "epoch": 0.4242897940778596, "flos": 22345522853760.0, "grad_norm": 2.191911452249002, "language_loss": 0.84386337, "learning_rate": 2.579090061518714e-06, "loss": 0.86555415, "num_input_tokens_seen": 151508970, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.85546875, "step": 7057, "time_per_iteration": 2.4857661724090576 }, { "auxiliary_loss_clip": 0.01128547, "auxiliary_loss_mlp": 0.01039854, "balance_loss_clip": 1.02333152, "balance_loss_mlp": 1.04304457, "epoch": 0.42434991733052757, "flos": 22595855713920.0, "grad_norm": 2.384127867997011, "language_loss": 0.8295936, "learning_rate": 2.5787172716875642e-06, "loss": 0.85127765, "num_input_tokens_seen": 151525295, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.85546875, "step": 7058, "time_per_iteration": 2.5480406284332275 }, { "auxiliary_loss_clip": 0.01127175, "auxiliary_loss_mlp": 0.01028387, "balance_loss_clip": 1.0139389, "balance_loss_mlp": 1.04780638, "epoch": 0.42441004058319554, "flos": 20011437181440.0, "grad_norm": 2.233514430133853, "language_loss": 0.80714726, "learning_rate": 2.5783444599113973e-06, "loss": 0.82870287, "num_input_tokens_seen": 151544435, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.79296875, "step": 7059, "time_per_iteration": 2.492980718612671 }, { "auxiliary_loss_clip": 0.01128442, "auxiliary_loss_mlp": 0.01038268, "balance_loss_clip": 1.02146542, "balance_loss_mlp": 1.04544616, "epoch": 0.4244701638358635, "flos": 11144985235200.0, "grad_norm": 1.9066404842001734, "language_loss": 0.6964581, "learning_rate": 2.57797162620435e-06, "loss": 0.71812522, "num_input_tokens_seen": 151559520, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.828125, "step": 7060, "time_per_iteration": 2.49704909324646 }, { "auxiliary_loss_clip": 0.01128918, "auxiliary_loss_mlp": 0.01036301, "balance_loss_clip": 1.02071476, "balance_loss_mlp": 1.04615211, "epoch": 0.42453028708853147, "flos": 23987753688960.0, "grad_norm": 1.6914048150647552, "language_loss": 0.76379532, "learning_rate": 2.577598770580562e-06, "loss": 0.78544748, "num_input_tokens_seen": 151579790, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.828125, "step": 7061, "time_per_iteration": 2.5123980045318604 }, { "auxiliary_loss_clip": 0.01130709, "auxiliary_loss_mlp": 0.01041472, "balance_loss_clip": 1.02454424, "balance_loss_mlp": 1.04721975, "epoch": 0.42459041034119943, "flos": 18406338030720.0, "grad_norm": 1.911597028323078, "language_loss": 0.72652459, "learning_rate": 2.5772258930541693e-06, "loss": 0.74824643, "num_input_tokens_seen": 151598285, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.8359375, "step": 7062, "time_per_iteration": 2.517310857772827 }, { "auxiliary_loss_clip": 0.01126747, "auxiliary_loss_mlp": 0.01042382, "balance_loss_clip": 1.0269444, "balance_loss_mlp": 1.04420495, "epoch": 0.42465053359386745, "flos": 20958006337920.0, "grad_norm": 1.9093820528448076, "language_loss": 0.66381997, "learning_rate": 2.5768529936393137e-06, "loss": 0.68551123, "num_input_tokens_seen": 151615430, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.82421875, "step": 7063, "time_per_iteration": 2.480576276779175 }, { "auxiliary_loss_clip": 0.01121874, "auxiliary_loss_mlp": 0.01033658, "balance_loss_clip": 1.01924586, "balance_loss_mlp": 1.04348207, "epoch": 0.4247106568465354, "flos": 33106190520960.0, "grad_norm": 1.6406201925855362, "language_loss": 0.78746819, "learning_rate": 2.5764800723501354e-06, "loss": 0.8090235, "num_input_tokens_seen": 151637030, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78515625, "step": 7064, "time_per_iteration": 2.6383719444274902 }, { "auxiliary_loss_clip": 0.01128942, "auxiliary_loss_mlp": 0.01040368, "balance_loss_clip": 1.02483487, "balance_loss_mlp": 1.0454452, "epoch": 0.4247707800992034, "flos": 20046916840320.0, "grad_norm": 2.212008062505103, "language_loss": 0.75315154, "learning_rate": 2.5761071292007736e-06, "loss": 0.77484465, "num_input_tokens_seen": 151655745, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 7065, "time_per_iteration": 2.4756293296813965 }, { "auxiliary_loss_clip": 0.01127607, "auxiliary_loss_mlp": 0.01035873, "balance_loss_clip": 1.01996493, "balance_loss_mlp": 1.0466516, "epoch": 0.42483090335187135, "flos": 22385132576640.0, "grad_norm": 1.425190885177774, "language_loss": 0.72348589, "learning_rate": 2.5757341642053725e-06, "loss": 0.7451207, "num_input_tokens_seen": 151678040, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.80859375, "step": 7066, "time_per_iteration": 2.5776288509368896 }, { "auxiliary_loss_clip": 0.01127534, "auxiliary_loss_mlp": 0.01042934, "balance_loss_clip": 1.02593505, "balance_loss_mlp": 1.04361737, "epoch": 0.4248910266045393, "flos": 21356830022400.0, "grad_norm": 2.0286475779536612, "language_loss": 0.79866982, "learning_rate": 2.5753611773780745e-06, "loss": 0.82037449, "num_input_tokens_seen": 151696410, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.83984375, "step": 7067, "time_per_iteration": 2.4889328479766846 }, { "auxiliary_loss_clip": 0.01043603, "auxiliary_loss_mlp": 0.01002967, "balance_loss_clip": 1.00120282, "balance_loss_mlp": 1.01711857, "epoch": 0.4249511498572073, "flos": 64008114099840.0, "grad_norm": 0.9124186321152341, "language_loss": 0.63477612, "learning_rate": 2.574988168733022e-06, "loss": 0.65524185, "num_input_tokens_seen": 151756365, "router_z_loss_clip": 0.0177002, "router_z_loss_mlp": 0.265625, "step": 7068, "time_per_iteration": 3.0665102005004883 }, { "auxiliary_loss_clip": 0.01127486, "auxiliary_loss_mlp": 0.01035268, "balance_loss_clip": 1.01818526, "balance_loss_mlp": 1.04477406, "epoch": 0.42501127310987524, "flos": 19607046888960.0, "grad_norm": 1.7156597218191536, "language_loss": 0.72495711, "learning_rate": 2.574615138284361e-06, "loss": 0.74658465, "num_input_tokens_seen": 151775165, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.828125, "step": 7069, "time_per_iteration": 2.487016201019287 }, { "auxiliary_loss_clip": 0.01130164, "auxiliary_loss_mlp": 0.01036235, "balance_loss_clip": 1.01911616, "balance_loss_mlp": 1.04613614, "epoch": 0.4250713963625432, "flos": 19462326992640.0, "grad_norm": 2.146522876340513, "language_loss": 0.78937113, "learning_rate": 2.5742420860462364e-06, "loss": 0.8110351, "num_input_tokens_seen": 151792620, "router_z_loss_clip": 0.171875, "router_z_loss_mlp": 0.83984375, "step": 7070, "time_per_iteration": 2.5449371337890625 }, { "auxiliary_loss_clip": 0.01127114, "auxiliary_loss_mlp": 0.01033119, "balance_loss_clip": 1.017169, "balance_loss_mlp": 1.0435679, "epoch": 0.4251315196152112, "flos": 25337707557120.0, "grad_norm": 1.7893313670955477, "language_loss": 0.7026999, "learning_rate": 2.573869012032795e-06, "loss": 0.72430223, "num_input_tokens_seen": 151812850, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8359375, "step": 7071, "time_per_iteration": 2.5323052406311035 }, { "auxiliary_loss_clip": 0.01125855, "auxiliary_loss_mlp": 0.01034254, "balance_loss_clip": 1.01887631, "balance_loss_mlp": 1.0436008, "epoch": 0.42519164286787914, "flos": 26359186527360.0, "grad_norm": 3.041435671203106, "language_loss": 0.71275729, "learning_rate": 2.5734959162581824e-06, "loss": 0.73435831, "num_input_tokens_seen": 151831785, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8203125, "step": 7072, "time_per_iteration": 2.570202112197876 }, { "auxiliary_loss_clip": 0.01129901, "auxiliary_loss_mlp": 0.01041993, "balance_loss_clip": 1.02674651, "balance_loss_mlp": 1.04469383, "epoch": 0.4252517661205471, "flos": 26031070765440.0, "grad_norm": 1.8053276746026534, "language_loss": 0.81717372, "learning_rate": 2.5731227987365475e-06, "loss": 0.8388927, "num_input_tokens_seen": 151853885, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8515625, "step": 7073, "time_per_iteration": 2.5635159015655518 }, { "auxiliary_loss_clip": 0.01125082, "auxiliary_loss_mlp": 0.01032261, "balance_loss_clip": 1.01729417, "balance_loss_mlp": 1.04469907, "epoch": 0.42531188937321507, "flos": 12713635059840.0, "grad_norm": 2.5152314324679077, "language_loss": 0.90419847, "learning_rate": 2.5727496594820386e-06, "loss": 0.92577189, "num_input_tokens_seen": 151871780, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.80078125, "step": 7074, "time_per_iteration": 2.5127577781677246 }, { "auxiliary_loss_clip": 0.01128506, "auxiliary_loss_mlp": 0.01038657, "balance_loss_clip": 1.02171683, "balance_loss_mlp": 1.04393196, "epoch": 0.42537201262588303, "flos": 22091670460800.0, "grad_norm": 2.1720138943095915, "language_loss": 0.63888305, "learning_rate": 2.572376498508805e-06, "loss": 0.66055471, "num_input_tokens_seen": 151891600, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.84375, "step": 7075, "time_per_iteration": 2.533430337905884 }, { "auxiliary_loss_clip": 0.01121781, "auxiliary_loss_mlp": 0.01032771, "balance_loss_clip": 1.01819181, "balance_loss_mlp": 1.04354763, "epoch": 0.42543213587855105, "flos": 23003119094400.0, "grad_norm": 1.86839097270832, "language_loss": 0.73604, "learning_rate": 2.5720033158309973e-06, "loss": 0.75758553, "num_input_tokens_seen": 151911330, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78125, "step": 7076, "time_per_iteration": 2.5202383995056152 }, { "auxiliary_loss_clip": 0.01129273, "auxiliary_loss_mlp": 0.01040376, "balance_loss_clip": 1.02486038, "balance_loss_mlp": 1.04513001, "epoch": 0.425492259131219, "flos": 25082454533760.0, "grad_norm": 2.351782117804341, "language_loss": 0.79230136, "learning_rate": 2.571630111462766e-06, "loss": 0.81399786, "num_input_tokens_seen": 151930355, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.83984375, "step": 7077, "time_per_iteration": 2.5117557048797607 }, { "auxiliary_loss_clip": 0.01122405, "auxiliary_loss_mlp": 0.01036515, "balance_loss_clip": 1.02265739, "balance_loss_mlp": 1.04435611, "epoch": 0.425552382383887, "flos": 22816850140800.0, "grad_norm": 1.957812865617238, "language_loss": 0.73096728, "learning_rate": 2.571256885418265e-06, "loss": 0.75255644, "num_input_tokens_seen": 151949695, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 7078, "time_per_iteration": 2.519542932510376 }, { "auxiliary_loss_clip": 0.01128918, "auxiliary_loss_mlp": 0.01037277, "balance_loss_clip": 1.02326369, "balance_loss_mlp": 1.04895818, "epoch": 0.42561250563655495, "flos": 13553585671680.0, "grad_norm": 2.543027266000465, "language_loss": 0.8006866, "learning_rate": 2.5708836377116445e-06, "loss": 0.82234854, "num_input_tokens_seen": 151967640, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80078125, "step": 7079, "time_per_iteration": 2.462048053741455 }, { "auxiliary_loss_clip": 0.01126977, "auxiliary_loss_mlp": 0.01033665, "balance_loss_clip": 1.01899076, "balance_loss_mlp": 1.0478518, "epoch": 0.4256726288892229, "flos": 46978303023360.0, "grad_norm": 1.4382021542982772, "language_loss": 0.72086555, "learning_rate": 2.5705103683570592e-06, "loss": 0.74247205, "num_input_tokens_seen": 151994020, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 7080, "time_per_iteration": 2.7605109214782715 }, { "auxiliary_loss_clip": 0.01126192, "auxiliary_loss_mlp": 0.01037517, "balance_loss_clip": 1.02239537, "balance_loss_mlp": 1.04484177, "epoch": 0.4257327521418909, "flos": 23586451966080.0, "grad_norm": 2.0119806586669005, "language_loss": 0.8084197, "learning_rate": 2.5701370773686646e-06, "loss": 0.83005679, "num_input_tokens_seen": 152013415, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8125, "step": 7081, "time_per_iteration": 2.52001953125 }, { "auxiliary_loss_clip": 0.01121917, "auxiliary_loss_mlp": 0.01033481, "balance_loss_clip": 1.01877093, "balance_loss_mlp": 1.04474723, "epoch": 0.42579287539455885, "flos": 18989994124800.0, "grad_norm": 1.6416038166054738, "language_loss": 0.81564659, "learning_rate": 2.5697637647606138e-06, "loss": 0.83720058, "num_input_tokens_seen": 152030860, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7734375, "step": 7082, "time_per_iteration": 2.5220532417297363 }, { "auxiliary_loss_clip": 0.01127123, "auxiliary_loss_mlp": 0.01037139, "balance_loss_clip": 1.02220166, "balance_loss_mlp": 1.04620123, "epoch": 0.4258529986472268, "flos": 25191910252800.0, "grad_norm": 1.899113058827048, "language_loss": 0.69772601, "learning_rate": 2.569390430547065e-06, "loss": 0.71936864, "num_input_tokens_seen": 152050395, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8125, "step": 7083, "time_per_iteration": 2.528204917907715 }, { "auxiliary_loss_clip": 0.0104265, "auxiliary_loss_mlp": 0.01009552, "balance_loss_clip": 1.00776339, "balance_loss_mlp": 1.01621151, "epoch": 0.4259131218998948, "flos": 69968280718080.0, "grad_norm": 0.8749004705558933, "language_loss": 0.67180669, "learning_rate": 2.569017074742173e-06, "loss": 0.69232869, "num_input_tokens_seen": 152113555, "router_z_loss_clip": 0.01782227, "router_z_loss_mlp": 0.265625, "step": 7084, "time_per_iteration": 3.222227096557617 }, { "auxiliary_loss_clip": 0.01126795, "auxiliary_loss_mlp": 0.01042058, "balance_loss_clip": 1.02595854, "balance_loss_mlp": 1.04673743, "epoch": 0.42597324515256274, "flos": 18004964480640.0, "grad_norm": 1.8622866703742067, "language_loss": 0.78370702, "learning_rate": 2.5686436973600964e-06, "loss": 0.80539548, "num_input_tokens_seen": 152131575, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.80078125, "step": 7085, "time_per_iteration": 2.498143434524536 }, { "auxiliary_loss_clip": 0.01131888, "auxiliary_loss_mlp": 0.01041114, "balance_loss_clip": 1.02410305, "balance_loss_mlp": 1.04765999, "epoch": 0.4260333684052307, "flos": 15158792563200.0, "grad_norm": 2.1375453951919297, "language_loss": 0.75925356, "learning_rate": 2.568270298414995e-06, "loss": 0.78098357, "num_input_tokens_seen": 152149435, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.84375, "step": 7086, "time_per_iteration": 2.4899542331695557 }, { "auxiliary_loss_clip": 0.01125938, "auxiliary_loss_mlp": 0.01039607, "balance_loss_clip": 1.02397227, "balance_loss_mlp": 1.04463172, "epoch": 0.42609349165789867, "flos": 14939342421120.0, "grad_norm": 1.857387420979885, "language_loss": 0.8005206, "learning_rate": 2.5678968779210255e-06, "loss": 0.82217604, "num_input_tokens_seen": 152166860, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8125, "step": 7087, "time_per_iteration": 2.4904961585998535 }, { "auxiliary_loss_clip": 0.01127963, "auxiliary_loss_mlp": 0.01032636, "balance_loss_clip": 1.01663816, "balance_loss_mlp": 1.0459981, "epoch": 0.42615361491056664, "flos": 23731961961600.0, "grad_norm": 1.8751613425433606, "language_loss": 0.66236418, "learning_rate": 2.5675234358923505e-06, "loss": 0.68397021, "num_input_tokens_seen": 152187475, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8203125, "step": 7088, "time_per_iteration": 2.53686261177063 }, { "auxiliary_loss_clip": 0.01128525, "auxiliary_loss_mlp": 0.01038244, "balance_loss_clip": 1.02285457, "balance_loss_mlp": 1.04672503, "epoch": 0.42621373816323466, "flos": 24936441747840.0, "grad_norm": 4.001592872804541, "language_loss": 0.6911509, "learning_rate": 2.56714997234313e-06, "loss": 0.71281862, "num_input_tokens_seen": 152207235, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.81640625, "step": 7089, "time_per_iteration": 2.565988302230835 }, { "auxiliary_loss_clip": 0.01126223, "auxiliary_loss_mlp": 0.01034289, "balance_loss_clip": 1.01922107, "balance_loss_mlp": 1.04281986, "epoch": 0.4262738614159026, "flos": 13552975140480.0, "grad_norm": 12.320013311476416, "language_loss": 0.72982681, "learning_rate": 2.566776487287525e-06, "loss": 0.75143194, "num_input_tokens_seen": 152224240, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.83203125, "step": 7090, "time_per_iteration": 2.4723896980285645 }, { "auxiliary_loss_clip": 0.01130835, "auxiliary_loss_mlp": 0.01039466, "balance_loss_clip": 1.02481544, "balance_loss_mlp": 1.04530787, "epoch": 0.4263339846685706, "flos": 29748794284800.0, "grad_norm": 1.8604774940896593, "language_loss": 0.74905521, "learning_rate": 2.5664029807396994e-06, "loss": 0.77075821, "num_input_tokens_seen": 152242595, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.85546875, "step": 7091, "time_per_iteration": 4.021637916564941 }, { "auxiliary_loss_clip": 0.01119443, "auxiliary_loss_mlp": 0.0102911, "balance_loss_clip": 1.01595545, "balance_loss_mlp": 1.04309702, "epoch": 0.42639410792123855, "flos": 16834204586880.0, "grad_norm": 1.8624967042538003, "language_loss": 0.82354861, "learning_rate": 2.5660294527138156e-06, "loss": 0.84503412, "num_input_tokens_seen": 152260840, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.765625, "step": 7092, "time_per_iteration": 2.5040078163146973 }, { "auxiliary_loss_clip": 0.01131205, "auxiliary_loss_mlp": 0.01038936, "balance_loss_clip": 1.02393937, "balance_loss_mlp": 1.04639339, "epoch": 0.4264542311739065, "flos": 28763118195840.0, "grad_norm": 1.625776272329245, "language_loss": 0.73666328, "learning_rate": 2.565655903224038e-06, "loss": 0.75836474, "num_input_tokens_seen": 152280580, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8515625, "step": 7093, "time_per_iteration": 2.566497564315796 }, { "auxiliary_loss_clip": 0.01123186, "auxiliary_loss_mlp": 0.01036261, "balance_loss_clip": 1.02079988, "balance_loss_mlp": 1.04315639, "epoch": 0.4265143544265745, "flos": 24713615727360.0, "grad_norm": 2.664707167330887, "language_loss": 0.70342159, "learning_rate": 2.565282332284532e-06, "loss": 0.72501612, "num_input_tokens_seen": 152298455, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.80078125, "step": 7094, "time_per_iteration": 6.80787205696106 }, { "auxiliary_loss_clip": 0.01126262, "auxiliary_loss_mlp": 0.01033138, "balance_loss_clip": 1.01770008, "balance_loss_mlp": 1.04446113, "epoch": 0.42657447767924245, "flos": 21865971352320.0, "grad_norm": 1.5627399216394593, "language_loss": 0.81631136, "learning_rate": 2.564908739909464e-06, "loss": 0.83790535, "num_input_tokens_seen": 152316995, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.81640625, "step": 7095, "time_per_iteration": 2.487938642501831 }, { "auxiliary_loss_clip": 0.01126946, "auxiliary_loss_mlp": 0.01040621, "balance_loss_clip": 1.02488494, "balance_loss_mlp": 1.0455904, "epoch": 0.4266346009319104, "flos": 21470236237440.0, "grad_norm": 2.2420838234044833, "language_loss": 0.80609763, "learning_rate": 2.5645351261129996e-06, "loss": 0.82777327, "num_input_tokens_seen": 152334800, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8125, "step": 7096, "time_per_iteration": 2.5072529315948486 }, { "auxiliary_loss_clip": 0.01129635, "auxiliary_loss_mlp": 0.0103549, "balance_loss_clip": 1.0208509, "balance_loss_mlp": 1.04588008, "epoch": 0.4266947241845784, "flos": 25519379569920.0, "grad_norm": 2.188611407475293, "language_loss": 0.66177523, "learning_rate": 2.5641614909093066e-06, "loss": 0.68342644, "num_input_tokens_seen": 152355175, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8359375, "step": 7097, "time_per_iteration": 2.5192227363586426 }, { "auxiliary_loss_clip": 0.01122629, "auxiliary_loss_mlp": 0.01030853, "balance_loss_clip": 1.016101, "balance_loss_mlp": 1.04287028, "epoch": 0.42675484743724634, "flos": 26541217676160.0, "grad_norm": 2.1907590471286107, "language_loss": 0.74399912, "learning_rate": 2.5637878343125535e-06, "loss": 0.76553392, "num_input_tokens_seen": 152377245, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.796875, "step": 7098, "time_per_iteration": 2.5654122829437256 }, { "auxiliary_loss_clip": 0.01122041, "auxiliary_loss_mlp": 0.01031843, "balance_loss_clip": 1.018062, "balance_loss_mlp": 1.0427804, "epoch": 0.4268149706899143, "flos": 23112718467840.0, "grad_norm": 1.7311673362846316, "language_loss": 0.74878311, "learning_rate": 2.5634141563369086e-06, "loss": 0.77032197, "num_input_tokens_seen": 152396985, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.79296875, "step": 7099, "time_per_iteration": 2.515752077102661 }, { "auxiliary_loss_clip": 0.01126764, "auxiliary_loss_mlp": 0.01042354, "balance_loss_clip": 1.02707195, "balance_loss_mlp": 1.04483318, "epoch": 0.4268750939425823, "flos": 22706532495360.0, "grad_norm": 3.368700086493404, "language_loss": 0.83217382, "learning_rate": 2.5630404569965432e-06, "loss": 0.85386503, "num_input_tokens_seen": 152415590, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 7100, "time_per_iteration": 2.5181567668914795 }, { "auxiliary_loss_clip": 0.01125084, "auxiliary_loss_mlp": 0.01033873, "balance_loss_clip": 1.01920462, "balance_loss_mlp": 1.04300547, "epoch": 0.42693521719525024, "flos": 25374875155200.0, "grad_norm": 1.3275827837191896, "language_loss": 0.82316631, "learning_rate": 2.562666736305627e-06, "loss": 0.84475589, "num_input_tokens_seen": 152436735, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8203125, "step": 7101, "time_per_iteration": 2.519101619720459 }, { "auxiliary_loss_clip": 0.01128474, "auxiliary_loss_mlp": 0.01029064, "balance_loss_clip": 1.01406729, "balance_loss_mlp": 1.04486871, "epoch": 0.42699534044791826, "flos": 18150689957760.0, "grad_norm": 1.906361611897779, "language_loss": 0.72774529, "learning_rate": 2.5622929942783314e-06, "loss": 0.74932069, "num_input_tokens_seen": 152455685, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8359375, "step": 7102, "time_per_iteration": 2.4909698963165283 }, { "auxiliary_loss_clip": 0.01121505, "auxiliary_loss_mlp": 0.01028961, "balance_loss_clip": 1.01516831, "balance_loss_mlp": 1.04367793, "epoch": 0.4270554637005862, "flos": 13698413308800.0, "grad_norm": 2.1283450890404696, "language_loss": 0.8338359, "learning_rate": 2.5619192309288297e-06, "loss": 0.8553406, "num_input_tokens_seen": 152473500, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.77734375, "step": 7103, "time_per_iteration": 2.4615378379821777 }, { "auxiliary_loss_clip": 0.011284, "auxiliary_loss_mlp": 0.01037588, "balance_loss_clip": 1.02231693, "balance_loss_mlp": 1.04523301, "epoch": 0.4271155869532542, "flos": 17493596507520.0, "grad_norm": 2.4773037516055627, "language_loss": 0.7366851, "learning_rate": 2.561545446271294e-06, "loss": 0.75834501, "num_input_tokens_seen": 152491320, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.83203125, "step": 7104, "time_per_iteration": 2.503873109817505 }, { "auxiliary_loss_clip": 0.01126211, "auxiliary_loss_mlp": 0.01031301, "balance_loss_clip": 1.01712084, "balance_loss_mlp": 1.04571605, "epoch": 0.42717571020592215, "flos": 32452293381120.0, "grad_norm": 2.019327136392632, "language_loss": 0.74964559, "learning_rate": 2.5611716403198987e-06, "loss": 0.77122068, "num_input_tokens_seen": 152511970, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.8046875, "step": 7105, "time_per_iteration": 2.567335605621338 }, { "auxiliary_loss_clip": 0.01128594, "auxiliary_loss_mlp": 0.01039419, "balance_loss_clip": 1.02563882, "balance_loss_mlp": 1.04642677, "epoch": 0.4272358334585901, "flos": 16253062444800.0, "grad_norm": 2.2778010918267246, "language_loss": 0.77120376, "learning_rate": 2.560797813088819e-06, "loss": 0.79288399, "num_input_tokens_seen": 152530515, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.8203125, "step": 7106, "time_per_iteration": 2.4691619873046875 }, { "auxiliary_loss_clip": 0.01121548, "auxiliary_loss_mlp": 0.01032226, "balance_loss_clip": 1.0182364, "balance_loss_mlp": 1.04188919, "epoch": 0.4272959567112581, "flos": 24200092938240.0, "grad_norm": 1.9745599984796034, "language_loss": 0.80180866, "learning_rate": 2.560423964592229e-06, "loss": 0.82334638, "num_input_tokens_seen": 152549295, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.796875, "step": 7107, "time_per_iteration": 2.502246618270874 }, { "auxiliary_loss_clip": 0.01122439, "auxiliary_loss_mlp": 0.01034372, "balance_loss_clip": 1.02009094, "balance_loss_mlp": 1.04447985, "epoch": 0.42735607996392605, "flos": 27963495578880.0, "grad_norm": 1.4383122453810868, "language_loss": 0.68079698, "learning_rate": 2.5600500948443075e-06, "loss": 0.70236504, "num_input_tokens_seen": 152570725, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.77734375, "step": 7108, "time_per_iteration": 2.5405569076538086 }, { "auxiliary_loss_clip": 0.01124783, "auxiliary_loss_mlp": 0.01036711, "balance_loss_clip": 1.02280509, "balance_loss_mlp": 1.04484415, "epoch": 0.427416203216594, "flos": 20295597674880.0, "grad_norm": 1.6629986621550243, "language_loss": 0.71763474, "learning_rate": 2.5596762038592294e-06, "loss": 0.73924971, "num_input_tokens_seen": 152588950, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.80078125, "step": 7109, "time_per_iteration": 2.466817855834961 }, { "auxiliary_loss_clip": 0.0112572, "auxiliary_loss_mlp": 0.01034083, "balance_loss_clip": 1.01752472, "balance_loss_mlp": 1.04480743, "epoch": 0.427476326469262, "flos": 26943955943040.0, "grad_norm": 1.8715883721458788, "language_loss": 0.64836156, "learning_rate": 2.559302291651174e-06, "loss": 0.66995955, "num_input_tokens_seen": 152608965, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.80859375, "step": 7110, "time_per_iteration": 2.5374529361724854 }, { "auxiliary_loss_clip": 0.01125168, "auxiliary_loss_mlp": 0.01039049, "balance_loss_clip": 1.02377796, "balance_loss_mlp": 1.04377103, "epoch": 0.42753644972192995, "flos": 25702847262720.0, "grad_norm": 2.115348194302699, "language_loss": 0.76506919, "learning_rate": 2.5589283582343197e-06, "loss": 0.78671134, "num_input_tokens_seen": 152630220, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 7111, "time_per_iteration": 2.5293920040130615 }, { "auxiliary_loss_clip": 0.01126417, "auxiliary_loss_mlp": 0.01033963, "balance_loss_clip": 1.0190258, "balance_loss_mlp": 1.04466009, "epoch": 0.4275965729745979, "flos": 18767419499520.0, "grad_norm": 2.319159015964419, "language_loss": 0.72967428, "learning_rate": 2.558554403622845e-06, "loss": 0.7512781, "num_input_tokens_seen": 152648835, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.81640625, "step": 7112, "time_per_iteration": 2.485537052154541 }, { "auxiliary_loss_clip": 0.01122281, "auxiliary_loss_mlp": 0.01037427, "balance_loss_clip": 1.02413476, "balance_loss_mlp": 1.0441339, "epoch": 0.4276566962272659, "flos": 23764424878080.0, "grad_norm": 1.6493452133695676, "language_loss": 0.71317703, "learning_rate": 2.5581804278309323e-06, "loss": 0.73477411, "num_input_tokens_seen": 152668375, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.78125, "step": 7113, "time_per_iteration": 2.4978108406066895 }, { "auxiliary_loss_clip": 0.01128177, "auxiliary_loss_mlp": 0.01039828, "balance_loss_clip": 1.02549922, "balance_loss_mlp": 1.04581761, "epoch": 0.42771681947993384, "flos": 22492505306880.0, "grad_norm": 1.5994642288829777, "language_loss": 0.6147756, "learning_rate": 2.5578064308727617e-06, "loss": 0.63645566, "num_input_tokens_seen": 152689725, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.82421875, "step": 7114, "time_per_iteration": 2.5573081970214844 }, { "auxiliary_loss_clip": 0.01133173, "auxiliary_loss_mlp": 0.01045315, "balance_loss_clip": 1.02849436, "balance_loss_mlp": 1.04679966, "epoch": 0.42777694273260186, "flos": 25044712318080.0, "grad_norm": 1.9112764229982169, "language_loss": 0.65015602, "learning_rate": 2.5574324127625153e-06, "loss": 0.67194092, "num_input_tokens_seen": 152709375, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.86328125, "step": 7115, "time_per_iteration": 2.513267993927002 }, { "auxiliary_loss_clip": 0.01123286, "auxiliary_loss_mlp": 0.01034492, "balance_loss_clip": 1.02062798, "balance_loss_mlp": 1.04300153, "epoch": 0.4278370659852698, "flos": 18661519226880.0, "grad_norm": 1.6889364810222893, "language_loss": 0.73713267, "learning_rate": 2.5570583735143753e-06, "loss": 0.75871044, "num_input_tokens_seen": 152727510, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.80078125, "step": 7116, "time_per_iteration": 2.4826982021331787 }, { "auxiliary_loss_clip": 0.01119091, "auxiliary_loss_mlp": 0.01040644, "balance_loss_clip": 1.02745986, "balance_loss_mlp": 1.04168665, "epoch": 0.4278971892379378, "flos": 27308269635840.0, "grad_norm": 1.738674754914623, "language_loss": 0.6886276, "learning_rate": 2.5566843131425275e-06, "loss": 0.71022499, "num_input_tokens_seen": 152746670, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7734375, "step": 7117, "time_per_iteration": 2.5419511795043945 }, { "auxiliary_loss_clip": 0.01126054, "auxiliary_loss_mlp": 0.01044026, "balance_loss_clip": 1.0297389, "balance_loss_mlp": 1.04624462, "epoch": 0.42795731249060576, "flos": 12888698970240.0, "grad_norm": 2.647912560523678, "language_loss": 0.69679058, "learning_rate": 2.5563102316611536e-06, "loss": 0.71849132, "num_input_tokens_seen": 152760545, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.796875, "step": 7118, "time_per_iteration": 2.455521821975708 }, { "auxiliary_loss_clip": 0.01124103, "auxiliary_loss_mlp": 0.01043138, "balance_loss_clip": 1.02813005, "balance_loss_mlp": 1.04535556, "epoch": 0.4280174357432737, "flos": 33401448316800.0, "grad_norm": 2.09302322920507, "language_loss": 0.74541008, "learning_rate": 2.55593612908444e-06, "loss": 0.76708251, "num_input_tokens_seen": 152780970, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7890625, "step": 7119, "time_per_iteration": 2.576812267303467 }, { "auxiliary_loss_clip": 0.01123339, "auxiliary_loss_mlp": 0.01034049, "balance_loss_clip": 1.02023864, "balance_loss_mlp": 1.04412079, "epoch": 0.4280775589959417, "flos": 18259104182400.0, "grad_norm": 1.7367599251760608, "language_loss": 0.7454828, "learning_rate": 2.555562005426573e-06, "loss": 0.7670567, "num_input_tokens_seen": 152798475, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.79296875, "step": 7120, "time_per_iteration": 2.467653751373291 }, { "auxiliary_loss_clip": 0.01125411, "auxiliary_loss_mlp": 0.01037043, "balance_loss_clip": 1.0229342, "balance_loss_mlp": 1.04579949, "epoch": 0.42813768224860965, "flos": 21471277731840.0, "grad_norm": 2.768423512821432, "language_loss": 0.77273178, "learning_rate": 2.5551878607017385e-06, "loss": 0.79435635, "num_input_tokens_seen": 152817555, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.796875, "step": 7121, "time_per_iteration": 2.4874820709228516 }, { "auxiliary_loss_clip": 0.01122707, "auxiliary_loss_mlp": 0.01034044, "balance_loss_clip": 1.02097845, "balance_loss_mlp": 1.04501605, "epoch": 0.4281978055012776, "flos": 15669262696320.0, "grad_norm": 2.0912717159711334, "language_loss": 0.85685623, "learning_rate": 2.554813694924126e-06, "loss": 0.87842369, "num_input_tokens_seen": 152836295, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.77734375, "step": 7122, "time_per_iteration": 2.4667701721191406 }, { "auxiliary_loss_clip": 0.01121384, "auxiliary_loss_mlp": 0.01033113, "balance_loss_clip": 1.01895666, "balance_loss_mlp": 1.04407406, "epoch": 0.4282579287539456, "flos": 17712005155200.0, "grad_norm": 3.898009020703201, "language_loss": 0.81199884, "learning_rate": 2.554439508107921e-06, "loss": 0.83354384, "num_input_tokens_seen": 152854950, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7734375, "step": 7123, "time_per_iteration": 2.463578224182129 }, { "auxiliary_loss_clip": 0.01124273, "auxiliary_loss_mlp": 0.01033746, "balance_loss_clip": 1.02007866, "balance_loss_mlp": 1.04648519, "epoch": 0.42831805200661355, "flos": 19281157770240.0, "grad_norm": 5.092061760858676, "language_loss": 0.81065226, "learning_rate": 2.5540653002673153e-06, "loss": 0.83223248, "num_input_tokens_seen": 152873995, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.77734375, "step": 7124, "time_per_iteration": 2.5157203674316406 }, { "auxiliary_loss_clip": 0.01123974, "auxiliary_loss_mlp": 0.01039144, "balance_loss_clip": 1.02398109, "balance_loss_mlp": 1.044631, "epoch": 0.4283781752592815, "flos": 19792633484160.0, "grad_norm": 1.9252386782384183, "language_loss": 0.80638766, "learning_rate": 2.553691071416498e-06, "loss": 0.8280189, "num_input_tokens_seen": 152892925, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.79296875, "step": 7125, "time_per_iteration": 2.4749646186828613 }, { "auxiliary_loss_clip": 0.01123235, "auxiliary_loss_mlp": 0.01032031, "balance_loss_clip": 1.01859641, "balance_loss_mlp": 1.04560423, "epoch": 0.4284382985119495, "flos": 16508064072960.0, "grad_norm": 1.9061888783658345, "language_loss": 0.75373644, "learning_rate": 2.553316821569659e-06, "loss": 0.77528912, "num_input_tokens_seen": 152910935, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.77734375, "step": 7126, "time_per_iteration": 2.488528251647949 }, { "auxiliary_loss_clip": 0.01124511, "auxiliary_loss_mlp": 0.01037794, "balance_loss_clip": 1.02321434, "balance_loss_mlp": 1.04537272, "epoch": 0.42849842176461744, "flos": 23330767979520.0, "grad_norm": 2.0156049437462156, "language_loss": 0.81036162, "learning_rate": 2.5529425507409913e-06, "loss": 0.83198464, "num_input_tokens_seen": 152931030, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7890625, "step": 7127, "time_per_iteration": 2.544492244720459 }, { "auxiliary_loss_clip": 0.01123283, "auxiliary_loss_mlp": 0.01040459, "balance_loss_clip": 1.02626157, "balance_loss_mlp": 1.04463971, "epoch": 0.4285585450172854, "flos": 17274433674240.0, "grad_norm": 1.76205940259258, "language_loss": 0.76116323, "learning_rate": 2.5525682589446867e-06, "loss": 0.78280073, "num_input_tokens_seen": 152948085, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.78515625, "step": 7128, "time_per_iteration": 2.478971004486084 }, { "auxiliary_loss_clip": 0.01126183, "auxiliary_loss_mlp": 0.01036996, "balance_loss_clip": 1.02255988, "balance_loss_mlp": 1.04479527, "epoch": 0.42861866826995343, "flos": 24279599692800.0, "grad_norm": 1.9470848355339945, "language_loss": 0.7399919, "learning_rate": 2.552193946194937e-06, "loss": 0.76162374, "num_input_tokens_seen": 152966265, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8125, "step": 7129, "time_per_iteration": 2.5398292541503906 }, { "auxiliary_loss_clip": 0.01128231, "auxiliary_loss_mlp": 0.01034758, "balance_loss_clip": 1.02063227, "balance_loss_mlp": 1.04810059, "epoch": 0.4286787915226214, "flos": 24353108876160.0, "grad_norm": 1.7734679554202604, "language_loss": 0.78144407, "learning_rate": 2.5518196125059394e-06, "loss": 0.803074, "num_input_tokens_seen": 152986775, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8046875, "step": 7130, "time_per_iteration": 2.547217845916748 }, { "auxiliary_loss_clip": 0.01131932, "auxiliary_loss_mlp": 0.01038419, "balance_loss_clip": 1.02338672, "balance_loss_mlp": 1.05027616, "epoch": 0.42873891477528936, "flos": 15449992122240.0, "grad_norm": 1.9892954190486678, "language_loss": 0.73298109, "learning_rate": 2.551445257891886e-06, "loss": 0.75468457, "num_input_tokens_seen": 153003595, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.81640625, "step": 7131, "time_per_iteration": 2.4789693355560303 }, { "auxiliary_loss_clip": 0.01131127, "auxiliary_loss_mlp": 0.0103592, "balance_loss_clip": 1.02125108, "balance_loss_mlp": 1.04953456, "epoch": 0.4287990380279573, "flos": 17639573379840.0, "grad_norm": 2.56078915537437, "language_loss": 0.77380908, "learning_rate": 2.551070882366973e-06, "loss": 0.79547954, "num_input_tokens_seen": 153021960, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.81640625, "step": 7132, "time_per_iteration": 2.4570109844207764 }, { "auxiliary_loss_clip": 0.01127391, "auxiliary_loss_mlp": 0.01039834, "balance_loss_clip": 1.02526653, "balance_loss_mlp": 1.04645884, "epoch": 0.4288591612806253, "flos": 27162328677120.0, "grad_norm": 1.7146299894636108, "language_loss": 0.78570402, "learning_rate": 2.550696485945397e-06, "loss": 0.80737621, "num_input_tokens_seen": 153042110, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.80859375, "step": 7133, "time_per_iteration": 3.9958138465881348 }, { "auxiliary_loss_clip": 0.01127776, "auxiliary_loss_mlp": 0.01036347, "balance_loss_clip": 1.02157736, "balance_loss_mlp": 1.04587412, "epoch": 0.42891928453329325, "flos": 17163182275200.0, "grad_norm": 1.859300290412139, "language_loss": 0.75128323, "learning_rate": 2.550322068641355e-06, "loss": 0.77292448, "num_input_tokens_seen": 153058925, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8203125, "step": 7134, "time_per_iteration": 2.461010694503784 }, { "auxiliary_loss_clip": 0.01122237, "auxiliary_loss_mlp": 0.01029309, "balance_loss_clip": 1.01607704, "balance_loss_mlp": 1.0443145, "epoch": 0.4289794077859612, "flos": 18187031543040.0, "grad_norm": 2.0938405985413895, "language_loss": 0.84274524, "learning_rate": 2.5499476304690455e-06, "loss": 0.86426067, "num_input_tokens_seen": 153078070, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.78125, "step": 7135, "time_per_iteration": 3.8315446376800537 }, { "auxiliary_loss_clip": 0.01120957, "auxiliary_loss_mlp": 0.01032059, "balance_loss_clip": 1.01806951, "balance_loss_mlp": 1.04382718, "epoch": 0.4290395310386292, "flos": 28256885867520.0, "grad_norm": 2.8819793023051155, "language_loss": 0.75081825, "learning_rate": 2.549573171442666e-06, "loss": 0.7723484, "num_input_tokens_seen": 153096680, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7734375, "step": 7136, "time_per_iteration": 5.417269229888916 }, { "auxiliary_loss_clip": 0.01127946, "auxiliary_loss_mlp": 0.01041386, "balance_loss_clip": 1.02706861, "balance_loss_mlp": 1.04638696, "epoch": 0.42909965429129715, "flos": 16216074414720.0, "grad_norm": 1.9768138709000203, "language_loss": 0.7886045, "learning_rate": 2.5491986915764175e-06, "loss": 0.81029779, "num_input_tokens_seen": 153113305, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.81640625, "step": 7137, "time_per_iteration": 2.4519095420837402 }, { "auxiliary_loss_clip": 0.01130806, "auxiliary_loss_mlp": 0.01034263, "balance_loss_clip": 1.01927292, "balance_loss_mlp": 1.0487597, "epoch": 0.4291597775439651, "flos": 23112862122240.0, "grad_norm": 5.208527022422745, "language_loss": 0.76197278, "learning_rate": 2.548824190884499e-06, "loss": 0.78362346, "num_input_tokens_seen": 153132735, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8203125, "step": 7138, "time_per_iteration": 2.5481669902801514 }, { "auxiliary_loss_clip": 0.01050237, "auxiliary_loss_mlp": 0.01002103, "balance_loss_clip": 1.00036228, "balance_loss_mlp": 1.0238843, "epoch": 0.4292199007966331, "flos": 67546212681600.0, "grad_norm": 0.7709816838327047, "language_loss": 0.56222546, "learning_rate": 2.548449669381113e-06, "loss": 0.58274889, "num_input_tokens_seen": 153187925, "router_z_loss_clip": 0.01745605, "router_z_loss_mlp": 0.26367188, "step": 7139, "time_per_iteration": 2.973269462585449 }, { "auxiliary_loss_clip": 0.01121638, "auxiliary_loss_mlp": 0.0104195, "balance_loss_clip": 1.0290339, "balance_loss_mlp": 1.04531765, "epoch": 0.42928002404930105, "flos": 22999850956800.0, "grad_norm": 1.6154972375678915, "language_loss": 0.80443633, "learning_rate": 2.5480751270804595e-06, "loss": 0.82607222, "num_input_tokens_seen": 153206990, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.76171875, "step": 7140, "time_per_iteration": 2.52653431892395 }, { "auxiliary_loss_clip": 0.01126746, "auxiliary_loss_mlp": 0.01033045, "balance_loss_clip": 1.01835203, "balance_loss_mlp": 1.04677427, "epoch": 0.429340147301969, "flos": 11544922241280.0, "grad_norm": 1.877237268247182, "language_loss": 0.82176751, "learning_rate": 2.5477005639967424e-06, "loss": 0.84336543, "num_input_tokens_seen": 153222345, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.796875, "step": 7141, "time_per_iteration": 2.4441959857940674 }, { "auxiliary_loss_clip": 0.0113064, "auxiliary_loss_mlp": 0.01040028, "balance_loss_clip": 1.02475095, "balance_loss_mlp": 1.04782963, "epoch": 0.42940027055463703, "flos": 25264988472960.0, "grad_norm": 2.21350368975284, "language_loss": 0.86665106, "learning_rate": 2.547325980144166e-06, "loss": 0.88835776, "num_input_tokens_seen": 153240570, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.828125, "step": 7142, "time_per_iteration": 2.5573747158050537 }, { "auxiliary_loss_clip": 0.01126898, "auxiliary_loss_mlp": 0.01031293, "balance_loss_clip": 1.01741743, "balance_loss_mlp": 1.04954493, "epoch": 0.429460393807305, "flos": 23805004268160.0, "grad_norm": 2.206890013246102, "language_loss": 0.78206134, "learning_rate": 2.5469513755369323e-06, "loss": 0.80364329, "num_input_tokens_seen": 153259575, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7734375, "step": 7143, "time_per_iteration": 2.4930572509765625 }, { "auxiliary_loss_clip": 0.01129807, "auxiliary_loss_mlp": 0.01042036, "balance_loss_clip": 1.02731395, "balance_loss_mlp": 1.05010033, "epoch": 0.42952051705997296, "flos": 13918294414080.0, "grad_norm": 3.1894433988356528, "language_loss": 0.7718097, "learning_rate": 2.5465767501892484e-06, "loss": 0.7935282, "num_input_tokens_seen": 153276650, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.796875, "step": 7144, "time_per_iteration": 2.5165915489196777 }, { "auxiliary_loss_clip": 0.01127691, "auxiliary_loss_mlp": 0.0103411, "balance_loss_clip": 1.01957846, "balance_loss_mlp": 1.0474745, "epoch": 0.4295806403126409, "flos": 26760380509440.0, "grad_norm": 1.7595488559970196, "language_loss": 0.73828077, "learning_rate": 2.54620210411532e-06, "loss": 0.75989884, "num_input_tokens_seen": 153298025, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8046875, "step": 7145, "time_per_iteration": 2.5260655879974365 }, { "auxiliary_loss_clip": 0.01129741, "auxiliary_loss_mlp": 0.0103558, "balance_loss_clip": 1.02043402, "balance_loss_mlp": 1.04783368, "epoch": 0.4296407635653089, "flos": 20952619297920.0, "grad_norm": 7.443751903977238, "language_loss": 0.79416144, "learning_rate": 2.545827437329352e-06, "loss": 0.81581473, "num_input_tokens_seen": 153315775, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8203125, "step": 7146, "time_per_iteration": 2.5014419555664062 }, { "auxiliary_loss_clip": 0.01122713, "auxiliary_loss_mlp": 0.0103171, "balance_loss_clip": 1.0185132, "balance_loss_mlp": 1.04500067, "epoch": 0.42970088681797686, "flos": 15852335339520.0, "grad_norm": 2.2687684492739173, "language_loss": 0.82827264, "learning_rate": 2.5454527498455532e-06, "loss": 0.84981686, "num_input_tokens_seen": 153332765, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.77734375, "step": 7147, "time_per_iteration": 2.461164712905884 }, { "auxiliary_loss_clip": 0.01131067, "auxiliary_loss_mlp": 0.0103872, "balance_loss_clip": 1.02287769, "balance_loss_mlp": 1.05015397, "epoch": 0.4297610100706448, "flos": 22382618624640.0, "grad_norm": 2.2764939182914943, "language_loss": 0.87110364, "learning_rate": 2.545078041678131e-06, "loss": 0.89280152, "num_input_tokens_seen": 153350760, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.80859375, "step": 7148, "time_per_iteration": 2.5416259765625 }, { "auxiliary_loss_clip": 0.01127307, "auxiliary_loss_mlp": 0.01033381, "balance_loss_clip": 1.01922488, "balance_loss_mlp": 1.04708076, "epoch": 0.4298211333233128, "flos": 27925681536000.0, "grad_norm": 1.5860355503837118, "language_loss": 0.77679688, "learning_rate": 2.5447033128412957e-06, "loss": 0.7984038, "num_input_tokens_seen": 153370765, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.80078125, "step": 7149, "time_per_iteration": 2.5415289402008057 }, { "auxiliary_loss_clip": 0.01124889, "auxiliary_loss_mlp": 0.01037226, "balance_loss_clip": 1.02282572, "balance_loss_mlp": 1.04680884, "epoch": 0.42988125657598075, "flos": 24425612478720.0, "grad_norm": 1.8617645915461805, "language_loss": 0.79862964, "learning_rate": 2.544328563349256e-06, "loss": 0.82025075, "num_input_tokens_seen": 153390725, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 7150, "time_per_iteration": 2.5538742542266846 }, { "auxiliary_loss_clip": 0.0113331, "auxiliary_loss_mlp": 0.01041947, "balance_loss_clip": 1.02442348, "balance_loss_mlp": 1.04891253, "epoch": 0.4299413798286487, "flos": 15850180523520.0, "grad_norm": 1.8589906296554977, "language_loss": 0.75035155, "learning_rate": 2.5439537932162222e-06, "loss": 0.77210408, "num_input_tokens_seen": 153408010, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.84375, "step": 7151, "time_per_iteration": 2.454739570617676 }, { "auxiliary_loss_clip": 0.01130496, "auxiliary_loss_mlp": 0.01035457, "balance_loss_clip": 1.02050829, "balance_loss_mlp": 1.0471648, "epoch": 0.4300015030813167, "flos": 22309504490880.0, "grad_norm": 2.16022356411317, "language_loss": 0.70591629, "learning_rate": 2.543579002456406e-06, "loss": 0.72757578, "num_input_tokens_seen": 153426865, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8359375, "step": 7152, "time_per_iteration": 2.530883550643921 }, { "auxiliary_loss_clip": 0.01125484, "auxiliary_loss_mlp": 0.01035275, "balance_loss_clip": 1.02123213, "balance_loss_mlp": 1.04456258, "epoch": 0.43006162633398465, "flos": 34897666366080.0, "grad_norm": 1.5432866154476514, "language_loss": 0.71130097, "learning_rate": 2.54320419108402e-06, "loss": 0.73290861, "num_input_tokens_seen": 153449410, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8125, "step": 7153, "time_per_iteration": 2.5928022861480713 }, { "auxiliary_loss_clip": 0.01129336, "auxiliary_loss_mlp": 0.01036333, "balance_loss_clip": 1.02195084, "balance_loss_mlp": 1.04731393, "epoch": 0.4301217495866526, "flos": 15961575576960.0, "grad_norm": 2.0944977314245072, "language_loss": 0.78425926, "learning_rate": 2.542829359113276e-06, "loss": 0.80591601, "num_input_tokens_seen": 153467910, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8203125, "step": 7154, "time_per_iteration": 2.4993674755096436 }, { "auxiliary_loss_clip": 0.01122237, "auxiliary_loss_mlp": 0.01032344, "balance_loss_clip": 1.01839685, "balance_loss_mlp": 1.0432179, "epoch": 0.43018187283932063, "flos": 18770364414720.0, "grad_norm": 1.757928041473841, "language_loss": 0.79034913, "learning_rate": 2.542454506558389e-06, "loss": 0.81189489, "num_input_tokens_seen": 153487100, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7890625, "step": 7155, "time_per_iteration": 2.44915771484375 }, { "auxiliary_loss_clip": 0.01123328, "auxiliary_loss_mlp": 0.01035407, "balance_loss_clip": 1.02115583, "balance_loss_mlp": 1.04450989, "epoch": 0.4302419960919886, "flos": 20151703791360.0, "grad_norm": 1.7146501539431795, "language_loss": 0.88362306, "learning_rate": 2.5420796334335723e-06, "loss": 0.90521044, "num_input_tokens_seen": 153505565, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 7156, "time_per_iteration": 2.517547369003296 }, { "auxiliary_loss_clip": 0.01128638, "auxiliary_loss_mlp": 0.01035906, "balance_loss_clip": 1.02040839, "balance_loss_mlp": 1.04628968, "epoch": 0.43030211934465656, "flos": 26432731624320.0, "grad_norm": 1.8741276182019242, "language_loss": 0.82791483, "learning_rate": 2.541704739753042e-06, "loss": 0.8495602, "num_input_tokens_seen": 153526130, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.82421875, "step": 7157, "time_per_iteration": 2.5139081478118896 }, { "auxiliary_loss_clip": 0.01128494, "auxiliary_loss_mlp": 0.01034553, "balance_loss_clip": 1.01898456, "balance_loss_mlp": 1.04475892, "epoch": 0.43036224259732453, "flos": 24389234979840.0, "grad_norm": 2.1361508266557334, "language_loss": 0.71411455, "learning_rate": 2.5413298255310132e-06, "loss": 0.73574507, "num_input_tokens_seen": 153546370, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 7158, "time_per_iteration": 2.5227582454681396 }, { "auxiliary_loss_clip": 0.01126463, "auxiliary_loss_mlp": 0.0103488, "balance_loss_clip": 1.020509, "balance_loss_mlp": 1.0454843, "epoch": 0.4304223658499925, "flos": 17201714590080.0, "grad_norm": 2.352997557534947, "language_loss": 0.82119697, "learning_rate": 2.5409548907817034e-06, "loss": 0.84281039, "num_input_tokens_seen": 153562800, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.80859375, "step": 7159, "time_per_iteration": 2.443470001220703 }, { "auxiliary_loss_clip": 0.01125339, "auxiliary_loss_mlp": 0.01034647, "balance_loss_clip": 1.0196867, "balance_loss_mlp": 1.04430151, "epoch": 0.43048248910266046, "flos": 14903000835840.0, "grad_norm": 2.2898921223666124, "language_loss": 0.82761145, "learning_rate": 2.54057993551933e-06, "loss": 0.84921128, "num_input_tokens_seen": 153578395, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8125, "step": 7160, "time_per_iteration": 2.473827838897705 }, { "auxiliary_loss_clip": 0.01130493, "auxiliary_loss_mlp": 0.01040495, "balance_loss_clip": 1.02343583, "balance_loss_mlp": 1.04571939, "epoch": 0.4305426123553284, "flos": 21579835610880.0, "grad_norm": 2.608675127338327, "language_loss": 0.77387381, "learning_rate": 2.5402049597581116e-06, "loss": 0.79558372, "num_input_tokens_seen": 153596880, "router_z_loss_clip": 0.17089844, "router_z_loss_mlp": 0.84765625, "step": 7161, "time_per_iteration": 2.5107953548431396 }, { "auxiliary_loss_clip": 0.01127093, "auxiliary_loss_mlp": 0.01038422, "balance_loss_clip": 1.02415252, "balance_loss_mlp": 1.04547346, "epoch": 0.4306027356079964, "flos": 22601278667520.0, "grad_norm": 2.1121112872426364, "language_loss": 0.73049831, "learning_rate": 2.5398299635122662e-06, "loss": 0.75215352, "num_input_tokens_seen": 153616570, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.81640625, "step": 7162, "time_per_iteration": 2.5759963989257812 }, { "auxiliary_loss_clip": 0.01046652, "auxiliary_loss_mlp": 0.01004207, "balance_loss_clip": 1.00257385, "balance_loss_mlp": 1.02056098, "epoch": 0.43066285886066435, "flos": 70672091806080.0, "grad_norm": 0.7912049530881433, "language_loss": 0.59092283, "learning_rate": 2.5394549467960147e-06, "loss": 0.61143148, "num_input_tokens_seen": 153671450, "router_z_loss_clip": 0.01635742, "router_z_loss_mlp": 0.26171875, "step": 7163, "time_per_iteration": 2.97749662399292 }, { "auxiliary_loss_clip": 0.0112175, "auxiliary_loss_mlp": 0.0103307, "balance_loss_clip": 1.01957583, "balance_loss_mlp": 1.04228783, "epoch": 0.4307229821133323, "flos": 26720591218560.0, "grad_norm": 1.9009384019452054, "language_loss": 0.79074496, "learning_rate": 2.5390799096235783e-06, "loss": 0.81229317, "num_input_tokens_seen": 153691405, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.796875, "step": 7164, "time_per_iteration": 2.5666439533233643 }, { "auxiliary_loss_clip": 0.01127003, "auxiliary_loss_mlp": 0.01039925, "balance_loss_clip": 1.02520263, "balance_loss_mlp": 1.04370546, "epoch": 0.4307831053660003, "flos": 26177119464960.0, "grad_norm": 1.9093622103266819, "language_loss": 0.68296814, "learning_rate": 2.538704852009177e-06, "loss": 0.70463741, "num_input_tokens_seen": 153711555, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.83203125, "step": 7165, "time_per_iteration": 2.5129141807556152 }, { "auxiliary_loss_clip": 0.01126905, "auxiliary_loss_mlp": 0.01051084, "balance_loss_clip": 1.03687406, "balance_loss_mlp": 1.04666567, "epoch": 0.43084322861866825, "flos": 18910343715840.0, "grad_norm": 2.125930541008073, "language_loss": 0.75012147, "learning_rate": 2.538329773967034e-06, "loss": 0.77190137, "num_input_tokens_seen": 153730095, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.80078125, "step": 7166, "time_per_iteration": 2.4679336547851562 }, { "auxiliary_loss_clip": 0.01124607, "auxiliary_loss_mlp": 0.01045957, "balance_loss_clip": 1.0320456, "balance_loss_mlp": 1.04602981, "epoch": 0.4309033518713362, "flos": 26432911192320.0, "grad_norm": 1.7528882433424242, "language_loss": 0.72098416, "learning_rate": 2.537954675511372e-06, "loss": 0.74268973, "num_input_tokens_seen": 153749320, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78515625, "step": 7167, "time_per_iteration": 2.5329716205596924 }, { "auxiliary_loss_clip": 0.01118895, "auxiliary_loss_mlp": 0.0103884, "balance_loss_clip": 1.02545857, "balance_loss_mlp": 1.04305243, "epoch": 0.43096347512400424, "flos": 21213295274880.0, "grad_norm": 1.5971181491185724, "language_loss": 0.78316236, "learning_rate": 2.537579556656414e-06, "loss": 0.80473971, "num_input_tokens_seen": 153767825, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7578125, "step": 7168, "time_per_iteration": 2.5170395374298096 }, { "auxiliary_loss_clip": 0.01125425, "auxiliary_loss_mlp": 0.01038815, "balance_loss_clip": 1.02430153, "balance_loss_mlp": 1.04459453, "epoch": 0.4310235983766722, "flos": 16540131939840.0, "grad_norm": 2.8158391488738324, "language_loss": 0.82587075, "learning_rate": 2.537204417416387e-06, "loss": 0.84751308, "num_input_tokens_seen": 153785350, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.80859375, "step": 7169, "time_per_iteration": 2.524228096008301 }, { "auxiliary_loss_clip": 0.01044055, "auxiliary_loss_mlp": 0.0100467, "balance_loss_clip": 1.00310826, "balance_loss_mlp": 1.01816797, "epoch": 0.43108372162934017, "flos": 64775704763520.0, "grad_norm": 0.6879084812072174, "language_loss": 0.60819685, "learning_rate": 2.5368292578055132e-06, "loss": 0.6286841, "num_input_tokens_seen": 153856400, "router_z_loss_clip": 0.015625, "router_z_loss_mlp": 0.2578125, "step": 7170, "time_per_iteration": 3.2103540897369385 }, { "auxiliary_loss_clip": 0.01123632, "auxiliary_loss_mlp": 0.01035994, "balance_loss_clip": 1.02223778, "balance_loss_mlp": 1.04353821, "epoch": 0.43114384488200813, "flos": 13444094039040.0, "grad_norm": 1.7435570381740229, "language_loss": 0.75966173, "learning_rate": 2.536454077838021e-06, "loss": 0.78125799, "num_input_tokens_seen": 153875230, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.80078125, "step": 7171, "time_per_iteration": 2.4476430416107178 }, { "auxiliary_loss_clip": 0.01121997, "auxiliary_loss_mlp": 0.01035528, "balance_loss_clip": 1.02181339, "balance_loss_mlp": 1.04325819, "epoch": 0.4312039681346761, "flos": 26286682924800.0, "grad_norm": 1.6001891342977648, "language_loss": 0.77251673, "learning_rate": 2.5360788775281357e-06, "loss": 0.79409206, "num_input_tokens_seen": 153894740, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7890625, "step": 7172, "time_per_iteration": 2.5519208908081055 }, { "auxiliary_loss_clip": 0.01128072, "auxiliary_loss_mlp": 0.01036778, "balance_loss_clip": 1.02086318, "balance_loss_mlp": 1.0456984, "epoch": 0.43126409138734406, "flos": 20376684627840.0, "grad_norm": 3.0428740493749524, "language_loss": 0.76721132, "learning_rate": 2.535703656890086e-06, "loss": 0.78885978, "num_input_tokens_seen": 153913230, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.82421875, "step": 7173, "time_per_iteration": 2.4711153507232666 }, { "auxiliary_loss_clip": 0.01122714, "auxiliary_loss_mlp": 0.01035874, "balance_loss_clip": 1.02144337, "balance_loss_mlp": 1.0442524, "epoch": 0.431324214640012, "flos": 22123091882880.0, "grad_norm": 1.4508814422073664, "language_loss": 0.76941943, "learning_rate": 2.5353284159381e-06, "loss": 0.79100537, "num_input_tokens_seen": 153933250, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78515625, "step": 7174, "time_per_iteration": 2.500398874282837 }, { "auxiliary_loss_clip": 0.01125244, "auxiliary_loss_mlp": 0.01033182, "balance_loss_clip": 1.0176487, "balance_loss_mlp": 1.04351437, "epoch": 0.43138433789268, "flos": 15231008856960.0, "grad_norm": 1.7179234225034838, "language_loss": 0.82514656, "learning_rate": 2.534953154686407e-06, "loss": 0.84673083, "num_input_tokens_seen": 153951325, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8203125, "step": 7175, "time_per_iteration": 3.9239721298217773 }, { "auxiliary_loss_clip": 0.01126205, "auxiliary_loss_mlp": 0.01041489, "balance_loss_clip": 1.02557456, "balance_loss_mlp": 1.04361081, "epoch": 0.43144446114534796, "flos": 18150294908160.0, "grad_norm": 2.6060450941857396, "language_loss": 0.74327254, "learning_rate": 2.5345778731492366e-06, "loss": 0.76494956, "num_input_tokens_seen": 153966975, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.828125, "step": 7176, "time_per_iteration": 2.446712017059326 }, { "auxiliary_loss_clip": 0.01124982, "auxiliary_loss_mlp": 0.01034566, "balance_loss_clip": 1.01939082, "balance_loss_mlp": 1.04349041, "epoch": 0.4315045843980159, "flos": 22929861306240.0, "grad_norm": 1.5842273302759584, "language_loss": 0.73633665, "learning_rate": 2.534202571340819e-06, "loss": 0.75793219, "num_input_tokens_seen": 153986695, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 7177, "time_per_iteration": 5.329214334487915 }, { "auxiliary_loss_clip": 0.01132051, "auxiliary_loss_mlp": 0.01041078, "balance_loss_clip": 1.02342284, "balance_loss_mlp": 1.04392815, "epoch": 0.4315647076506839, "flos": 22126862810880.0, "grad_norm": 2.0729831625258335, "language_loss": 0.81562018, "learning_rate": 2.533827249275387e-06, "loss": 0.83735144, "num_input_tokens_seen": 154004710, "router_z_loss_clip": 0.17578125, "router_z_loss_mlp": 0.8828125, "step": 7178, "time_per_iteration": 3.869537830352783 }, { "auxiliary_loss_clip": 0.01120485, "auxiliary_loss_mlp": 0.01034871, "balance_loss_clip": 1.02088213, "balance_loss_mlp": 1.04421985, "epoch": 0.43162483090335185, "flos": 26871129118080.0, "grad_norm": 1.4936268806597468, "language_loss": 0.84094518, "learning_rate": 2.5334519069671725e-06, "loss": 0.86249876, "num_input_tokens_seen": 154024320, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76171875, "step": 7179, "time_per_iteration": 2.5434536933898926 }, { "auxiliary_loss_clip": 0.01121349, "auxiliary_loss_mlp": 0.0103394, "balance_loss_clip": 1.01900315, "balance_loss_mlp": 1.04224885, "epoch": 0.4316849541560198, "flos": 13913122855680.0, "grad_norm": 1.9418063192182733, "language_loss": 0.75597966, "learning_rate": 2.5330765444304075e-06, "loss": 0.77753246, "num_input_tokens_seen": 154041755, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7890625, "step": 7180, "time_per_iteration": 2.43994140625 }, { "auxiliary_loss_clip": 0.01123158, "auxiliary_loss_mlp": 0.01038102, "balance_loss_clip": 1.02315271, "balance_loss_mlp": 1.04087567, "epoch": 0.4317450774086878, "flos": 16435165420800.0, "grad_norm": 1.7841129204110853, "language_loss": 0.81897128, "learning_rate": 2.5327011616793274e-06, "loss": 0.84058386, "num_input_tokens_seen": 154056775, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.82421875, "step": 7181, "time_per_iteration": 2.435230255126953 }, { "auxiliary_loss_clip": 0.01125593, "auxiliary_loss_mlp": 0.01040723, "balance_loss_clip": 1.02397394, "balance_loss_mlp": 1.04319203, "epoch": 0.4318052006613558, "flos": 20554980762240.0, "grad_norm": 17.074500159834386, "language_loss": 0.88876998, "learning_rate": 2.532325758728165e-06, "loss": 0.91043317, "num_input_tokens_seen": 154075015, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.82421875, "step": 7182, "time_per_iteration": 2.4709835052490234 }, { "auxiliary_loss_clip": 0.01121634, "auxiliary_loss_mlp": 0.01034942, "balance_loss_clip": 1.02078629, "balance_loss_mlp": 1.04371238, "epoch": 0.43186532391402377, "flos": 22820046451200.0, "grad_norm": 1.636223542579515, "language_loss": 0.75832033, "learning_rate": 2.5319503355911566e-06, "loss": 0.77988613, "num_input_tokens_seen": 154095170, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.77734375, "step": 7183, "time_per_iteration": 2.5012102127075195 }, { "auxiliary_loss_clip": 0.01122217, "auxiliary_loss_mlp": 0.01033835, "balance_loss_clip": 1.01851642, "balance_loss_mlp": 1.04080617, "epoch": 0.43192544716669173, "flos": 25556583081600.0, "grad_norm": 1.5504690771214302, "language_loss": 0.77624094, "learning_rate": 2.5315748922825393e-06, "loss": 0.79780143, "num_input_tokens_seen": 154116895, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8125, "step": 7184, "time_per_iteration": 2.5078494548797607 }, { "auxiliary_loss_clip": 0.01116975, "auxiliary_loss_mlp": 0.01036066, "balance_loss_clip": 1.02190995, "balance_loss_mlp": 1.04188275, "epoch": 0.4319855704193597, "flos": 30954674701440.0, "grad_norm": 2.0533846503581556, "language_loss": 0.73465961, "learning_rate": 2.5311994288165474e-06, "loss": 0.75619006, "num_input_tokens_seen": 154138395, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75, "step": 7185, "time_per_iteration": 2.5981593132019043 }, { "auxiliary_loss_clip": 0.01125131, "auxiliary_loss_mlp": 0.01036642, "balance_loss_clip": 1.02083516, "balance_loss_mlp": 1.04174984, "epoch": 0.43204569367202766, "flos": 24238732993920.0, "grad_norm": 2.5814179275518767, "language_loss": 0.7571255, "learning_rate": 2.530823945207421e-06, "loss": 0.77874321, "num_input_tokens_seen": 154156775, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.83203125, "step": 7186, "time_per_iteration": 2.5021474361419678 }, { "auxiliary_loss_clip": 0.01120738, "auxiliary_loss_mlp": 0.01032266, "balance_loss_clip": 1.01782954, "balance_loss_mlp": 1.04082012, "epoch": 0.43210581692469563, "flos": 18406948561920.0, "grad_norm": 2.4867715762013605, "language_loss": 0.75828105, "learning_rate": 2.5304484414693962e-06, "loss": 0.77981114, "num_input_tokens_seen": 154177500, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.796875, "step": 7187, "time_per_iteration": 2.4985129833221436 }, { "auxiliary_loss_clip": 0.01037728, "auxiliary_loss_mlp": 0.01002109, "balance_loss_clip": 1.00035691, "balance_loss_mlp": 1.01130176, "epoch": 0.4321659401773636, "flos": 49832378910720.0, "grad_norm": 0.8819363553223997, "language_loss": 0.68176806, "learning_rate": 2.530072917616714e-06, "loss": 0.70216644, "num_input_tokens_seen": 154237110, "router_z_loss_clip": 0.01757812, "router_z_loss_mlp": 0.265625, "step": 7188, "time_per_iteration": 3.1138558387756348 }, { "auxiliary_loss_clip": 0.01118533, "auxiliary_loss_mlp": 0.01031069, "balance_loss_clip": 1.01730633, "balance_loss_mlp": 1.04246402, "epoch": 0.43222606343003156, "flos": 17128564542720.0, "grad_norm": 1.903859792525845, "language_loss": 0.78081381, "learning_rate": 2.529697373663614e-06, "loss": 0.80230981, "num_input_tokens_seen": 154253910, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.76171875, "step": 7189, "time_per_iteration": 2.4617414474487305 }, { "auxiliary_loss_clip": 0.01126843, "auxiliary_loss_mlp": 0.010424, "balance_loss_clip": 1.02621126, "balance_loss_mlp": 1.04208076, "epoch": 0.4322861866826995, "flos": 22749949059840.0, "grad_norm": 2.008146980943029, "language_loss": 0.71695238, "learning_rate": 2.5293218096243364e-06, "loss": 0.73864478, "num_input_tokens_seen": 154274770, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.84765625, "step": 7190, "time_per_iteration": 2.4844183921813965 }, { "auxiliary_loss_clip": 0.01117434, "auxiliary_loss_mlp": 0.01033409, "balance_loss_clip": 1.01931882, "balance_loss_mlp": 1.03890848, "epoch": 0.4323463099353675, "flos": 27891925729920.0, "grad_norm": 1.4268764286093312, "language_loss": 0.79731375, "learning_rate": 2.5289462255131223e-06, "loss": 0.81882215, "num_input_tokens_seen": 154295035, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78515625, "step": 7191, "time_per_iteration": 2.5259838104248047 }, { "auxiliary_loss_clip": 0.01119246, "auxiliary_loss_mlp": 0.01031467, "balance_loss_clip": 1.01751375, "balance_loss_mlp": 1.04126227, "epoch": 0.43240643318803546, "flos": 21614740652160.0, "grad_norm": 1.8986457235157428, "language_loss": 0.74497974, "learning_rate": 2.5285706213442146e-06, "loss": 0.76648688, "num_input_tokens_seen": 154314905, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78125, "step": 7192, "time_per_iteration": 2.471426010131836 }, { "auxiliary_loss_clip": 0.01123044, "auxiliary_loss_mlp": 0.01037256, "balance_loss_clip": 1.02235532, "balance_loss_mlp": 1.0438869, "epoch": 0.4324665564407034, "flos": 17558378686080.0, "grad_norm": 2.0585277234592674, "language_loss": 0.79202211, "learning_rate": 2.5281949971318557e-06, "loss": 0.8136251, "num_input_tokens_seen": 154331740, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.79296875, "step": 7193, "time_per_iteration": 2.4553062915802 }, { "auxiliary_loss_clip": 0.01120778, "auxiliary_loss_mlp": 0.01041502, "balance_loss_clip": 1.0263145, "balance_loss_mlp": 1.04067445, "epoch": 0.4325266796933714, "flos": 18402423448320.0, "grad_norm": 2.778428057735287, "language_loss": 0.75892067, "learning_rate": 2.5278193528902897e-06, "loss": 0.78054345, "num_input_tokens_seen": 154348740, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80078125, "step": 7194, "time_per_iteration": 2.432621717453003 }, { "auxiliary_loss_clip": 0.01120832, "auxiliary_loss_mlp": 0.01039497, "balance_loss_clip": 1.02527523, "balance_loss_mlp": 1.04147577, "epoch": 0.4325868029460394, "flos": 22564793427840.0, "grad_norm": 2.104774782781364, "language_loss": 0.6008333, "learning_rate": 2.5274436886337613e-06, "loss": 0.62243664, "num_input_tokens_seen": 154368835, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.79296875, "step": 7195, "time_per_iteration": 2.4939897060394287 }, { "auxiliary_loss_clip": 0.0112418, "auxiliary_loss_mlp": 0.01035982, "balance_loss_clip": 1.02048492, "balance_loss_mlp": 1.04197311, "epoch": 0.43264692619870737, "flos": 14605516396800.0, "grad_norm": 2.4815441863732937, "language_loss": 0.65192485, "learning_rate": 2.527068004376515e-06, "loss": 0.67352641, "num_input_tokens_seen": 154384620, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.82421875, "step": 7196, "time_per_iteration": 2.428328514099121 }, { "auxiliary_loss_clip": 0.01126808, "auxiliary_loss_mlp": 0.01040133, "balance_loss_clip": 1.02419424, "balance_loss_mlp": 1.04311144, "epoch": 0.43270704945137534, "flos": 21501657659520.0, "grad_norm": 2.530557259940558, "language_loss": 0.72561026, "learning_rate": 2.526692300132797e-06, "loss": 0.74727964, "num_input_tokens_seen": 154402865, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8359375, "step": 7197, "time_per_iteration": 2.4933791160583496 }, { "auxiliary_loss_clip": 0.01117586, "auxiliary_loss_mlp": 0.0104219, "balance_loss_clip": 1.02740836, "balance_loss_mlp": 1.04166162, "epoch": 0.4327671727040433, "flos": 25155891889920.0, "grad_norm": 2.0482890184423805, "language_loss": 0.72797567, "learning_rate": 2.5263165759168547e-06, "loss": 0.74957347, "num_input_tokens_seen": 154423625, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7578125, "step": 7198, "time_per_iteration": 2.4917588233947754 }, { "auxiliary_loss_clip": 0.01115556, "auxiliary_loss_mlp": 0.01032198, "balance_loss_clip": 1.01814294, "balance_loss_mlp": 1.0378685, "epoch": 0.43282729595671127, "flos": 25447163276160.0, "grad_norm": 1.5880894254024929, "language_loss": 0.81234908, "learning_rate": 2.525940831742934e-06, "loss": 0.8338266, "num_input_tokens_seen": 154444775, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.77734375, "step": 7199, "time_per_iteration": 2.5244688987731934 }, { "auxiliary_loss_clip": 0.01121803, "auxiliary_loss_mlp": 0.01034316, "balance_loss_clip": 1.02005291, "balance_loss_mlp": 1.04292083, "epoch": 0.43288741920937923, "flos": 24126116878080.0, "grad_norm": 2.4578663581067905, "language_loss": 0.68517143, "learning_rate": 2.525565067625286e-06, "loss": 0.70673263, "num_input_tokens_seen": 154460815, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 7200, "time_per_iteration": 2.4748013019561768 }, { "auxiliary_loss_clip": 0.01120975, "auxiliary_loss_mlp": 0.01040297, "balance_loss_clip": 1.02564585, "balance_loss_mlp": 1.04164016, "epoch": 0.4329475424620472, "flos": 19204955066880.0, "grad_norm": 1.9401005611576936, "language_loss": 0.87088501, "learning_rate": 2.525189283578157e-06, "loss": 0.89249766, "num_input_tokens_seen": 154479145, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.796875, "step": 7201, "time_per_iteration": 2.481163263320923 }, { "auxiliary_loss_clip": 0.01127624, "auxiliary_loss_mlp": 0.01040651, "balance_loss_clip": 1.02422369, "balance_loss_mlp": 1.04394317, "epoch": 0.43300766571471516, "flos": 22638374438400.0, "grad_norm": 2.3403628701837027, "language_loss": 0.64863276, "learning_rate": 2.5248134796157974e-06, "loss": 0.6703155, "num_input_tokens_seen": 154498905, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.8359375, "step": 7202, "time_per_iteration": 2.475768804550171 }, { "auxiliary_loss_clip": 0.01119468, "auxiliary_loss_mlp": 0.01029099, "balance_loss_clip": 1.01600361, "balance_loss_mlp": 1.0406431, "epoch": 0.4330677889673831, "flos": 22121080721280.0, "grad_norm": 1.7657348942209485, "language_loss": 0.8180021, "learning_rate": 2.5244376557524586e-06, "loss": 0.83948779, "num_input_tokens_seen": 154517270, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7890625, "step": 7203, "time_per_iteration": 2.4965646266937256 }, { "auxiliary_loss_clip": 0.01125256, "auxiliary_loss_mlp": 0.01046394, "balance_loss_clip": 1.0313139, "balance_loss_mlp": 1.04193139, "epoch": 0.4331279122200511, "flos": 23221527742080.0, "grad_norm": 2.03484373676812, "language_loss": 0.81400621, "learning_rate": 2.5240618120023912e-06, "loss": 0.83572268, "num_input_tokens_seen": 154535945, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8359375, "step": 7204, "time_per_iteration": 2.4653849601745605 }, { "auxiliary_loss_clip": 0.01119621, "auxiliary_loss_mlp": 0.01035683, "balance_loss_clip": 1.02136624, "balance_loss_mlp": 1.04026866, "epoch": 0.43318803547271906, "flos": 18259750627200.0, "grad_norm": 2.181805163842796, "language_loss": 0.73451692, "learning_rate": 2.5236859483798468e-06, "loss": 0.7560699, "num_input_tokens_seen": 154554935, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.79296875, "step": 7205, "time_per_iteration": 2.4537060260772705 }, { "auxiliary_loss_clip": 0.0112128, "auxiliary_loss_mlp": 0.01038373, "balance_loss_clip": 1.02406216, "balance_loss_mlp": 1.04394019, "epoch": 0.433248158725387, "flos": 27418407713280.0, "grad_norm": 1.7146579984602734, "language_loss": 0.75104511, "learning_rate": 2.5233100648990803e-06, "loss": 0.7726416, "num_input_tokens_seen": 154576065, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 7206, "time_per_iteration": 2.5974910259246826 }, { "auxiliary_loss_clip": 0.01118832, "auxiliary_loss_mlp": 0.01033919, "balance_loss_clip": 1.01973367, "balance_loss_mlp": 1.04040873, "epoch": 0.433308281978055, "flos": 23218008209280.0, "grad_norm": 1.7574330285292779, "language_loss": 0.78973389, "learning_rate": 2.522934161574342e-06, "loss": 0.81126136, "num_input_tokens_seen": 154595110, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78515625, "step": 7207, "time_per_iteration": 2.5025906562805176 }, { "auxiliary_loss_clip": 0.01122947, "auxiliary_loss_mlp": 0.01034182, "balance_loss_clip": 1.01835084, "balance_loss_mlp": 1.04104793, "epoch": 0.433368405230723, "flos": 15852407166720.0, "grad_norm": 1.8967564188982944, "language_loss": 0.80945599, "learning_rate": 2.5225582384198888e-06, "loss": 0.83102727, "num_input_tokens_seen": 154612255, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8203125, "step": 7208, "time_per_iteration": 2.4237258434295654 }, { "auxiliary_loss_clip": 0.01121533, "auxiliary_loss_mlp": 0.01033313, "balance_loss_clip": 1.01960957, "balance_loss_mlp": 1.04291701, "epoch": 0.433428528483391, "flos": 19026084314880.0, "grad_norm": 2.824269768827871, "language_loss": 0.70062786, "learning_rate": 2.5221822954499744e-06, "loss": 0.72217631, "num_input_tokens_seen": 154630440, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78515625, "step": 7209, "time_per_iteration": 2.4810173511505127 }, { "auxiliary_loss_clip": 0.01118008, "auxiliary_loss_mlp": 0.01035153, "balance_loss_clip": 1.02056193, "balance_loss_mlp": 1.04073358, "epoch": 0.43348865173605894, "flos": 24718248581760.0, "grad_norm": 1.4261398972476436, "language_loss": 0.81294119, "learning_rate": 2.5218063326788557e-06, "loss": 0.83447284, "num_input_tokens_seen": 154652515, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7734375, "step": 7210, "time_per_iteration": 2.5113027095794678 }, { "auxiliary_loss_clip": 0.01118305, "auxiliary_loss_mlp": 0.01037557, "balance_loss_clip": 1.0244385, "balance_loss_mlp": 1.04053855, "epoch": 0.4335487749887269, "flos": 22090664880000.0, "grad_norm": 1.702257683521338, "language_loss": 0.81947219, "learning_rate": 2.5214303501207885e-06, "loss": 0.84103078, "num_input_tokens_seen": 154670965, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.77734375, "step": 7211, "time_per_iteration": 2.512416362762451 }, { "auxiliary_loss_clip": 0.01119922, "auxiliary_loss_mlp": 0.01034827, "balance_loss_clip": 1.02206516, "balance_loss_mlp": 1.04035342, "epoch": 0.43360889824139487, "flos": 22382941847040.0, "grad_norm": 1.6988760469026434, "language_loss": 0.75007367, "learning_rate": 2.521054347790029e-06, "loss": 0.77162117, "num_input_tokens_seen": 154689980, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.796875, "step": 7212, "time_per_iteration": 2.456468343734741 }, { "auxiliary_loss_clip": 0.01121475, "auxiliary_loss_mlp": 0.01033631, "balance_loss_clip": 1.02026165, "balance_loss_mlp": 1.04300177, "epoch": 0.43366902149406283, "flos": 17528286067200.0, "grad_norm": 2.0331782597054335, "language_loss": 0.76620507, "learning_rate": 2.5206783257008375e-06, "loss": 0.78775615, "num_input_tokens_seen": 154706570, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.78515625, "step": 7213, "time_per_iteration": 2.4507524967193604 }, { "auxiliary_loss_clip": 0.01120778, "auxiliary_loss_mlp": 0.0103575, "balance_loss_clip": 1.02246439, "balance_loss_mlp": 1.04164195, "epoch": 0.4337291447467308, "flos": 19022672522880.0, "grad_norm": 1.729717030288343, "language_loss": 0.65090984, "learning_rate": 2.520302283867471e-06, "loss": 0.67247516, "num_input_tokens_seen": 154725210, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7890625, "step": 7214, "time_per_iteration": 2.4402053356170654 }, { "auxiliary_loss_clip": 0.01114515, "auxiliary_loss_mlp": 0.0103788, "balance_loss_clip": 1.02461231, "balance_loss_mlp": 1.03968275, "epoch": 0.43378926799939876, "flos": 27234042180480.0, "grad_norm": 1.6606313062237885, "language_loss": 0.71764505, "learning_rate": 2.519926222304191e-06, "loss": 0.73916906, "num_input_tokens_seen": 154745945, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.74609375, "step": 7215, "time_per_iteration": 2.5323257446289062 }, { "auxiliary_loss_clip": 0.01118074, "auxiliary_loss_mlp": 0.01032968, "balance_loss_clip": 1.01927686, "balance_loss_mlp": 1.04194951, "epoch": 0.43384939125206673, "flos": 15961108700160.0, "grad_norm": 2.2494042982691624, "language_loss": 0.75100148, "learning_rate": 2.519550141025255e-06, "loss": 0.77251184, "num_input_tokens_seen": 154763580, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76171875, "step": 7216, "time_per_iteration": 2.4247422218322754 }, { "auxiliary_loss_clip": 0.0112812, "auxiliary_loss_mlp": 0.01042511, "balance_loss_clip": 1.02644777, "balance_loss_mlp": 1.04311538, "epoch": 0.4339095145047347, "flos": 21793216354560.0, "grad_norm": 2.8572376342361383, "language_loss": 0.75249535, "learning_rate": 2.519174040044927e-06, "loss": 0.77420163, "num_input_tokens_seen": 154776825, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.84765625, "step": 7217, "time_per_iteration": 3.913990020751953 }, { "auxiliary_loss_clip": 0.01118916, "auxiliary_loss_mlp": 0.01033564, "balance_loss_clip": 1.01947951, "balance_loss_mlp": 1.0407064, "epoch": 0.43396963775740266, "flos": 14209853109120.0, "grad_norm": 2.206682882193531, "language_loss": 0.7383588, "learning_rate": 2.5187979193774664e-06, "loss": 0.75988364, "num_input_tokens_seen": 154794025, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 7218, "time_per_iteration": 3.909661293029785 }, { "auxiliary_loss_clip": 0.01122772, "auxiliary_loss_mlp": 0.01028679, "balance_loss_clip": 1.01511335, "balance_loss_mlp": 1.04286945, "epoch": 0.4340297610100706, "flos": 19719052473600.0, "grad_norm": 1.7626786400785635, "language_loss": 0.68414974, "learning_rate": 2.5184217790371367e-06, "loss": 0.70566428, "num_input_tokens_seen": 154813105, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.80078125, "step": 7219, "time_per_iteration": 5.302636623382568 }, { "auxiliary_loss_clip": 0.01119619, "auxiliary_loss_mlp": 0.01034453, "balance_loss_clip": 1.02054763, "balance_loss_mlp": 1.04231095, "epoch": 0.4340898842627386, "flos": 18953508885120.0, "grad_norm": 1.5676406081630396, "language_loss": 0.77044165, "learning_rate": 2.518045619038202e-06, "loss": 0.79198235, "num_input_tokens_seen": 154833525, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7734375, "step": 7220, "time_per_iteration": 2.4953701496124268 }, { "auxiliary_loss_clip": 0.01119197, "auxiliary_loss_mlp": 0.01033695, "balance_loss_clip": 1.01968825, "balance_loss_mlp": 1.04081535, "epoch": 0.4341500075154066, "flos": 22018304931840.0, "grad_norm": 1.9018371495207649, "language_loss": 0.69191849, "learning_rate": 2.5176694393949243e-06, "loss": 0.71344745, "num_input_tokens_seen": 154853090, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78515625, "step": 7221, "time_per_iteration": 2.5179238319396973 }, { "auxiliary_loss_clip": 0.01120746, "auxiliary_loss_mlp": 0.01032692, "balance_loss_clip": 1.01912582, "balance_loss_mlp": 1.04036403, "epoch": 0.4342101307680746, "flos": 23582465556480.0, "grad_norm": 1.7402617103002065, "language_loss": 0.65471631, "learning_rate": 2.51729324012157e-06, "loss": 0.6762507, "num_input_tokens_seen": 154872055, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.8046875, "step": 7222, "time_per_iteration": 2.478044033050537 }, { "auxiliary_loss_clip": 0.01118487, "auxiliary_loss_mlp": 0.01031568, "balance_loss_clip": 1.01710761, "balance_loss_mlp": 1.04057121, "epoch": 0.43427025402074254, "flos": 17967976450560.0, "grad_norm": 6.694035700273423, "language_loss": 0.72885007, "learning_rate": 2.5169170212324053e-06, "loss": 0.75035059, "num_input_tokens_seen": 154886645, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 7223, "time_per_iteration": 2.4606974124908447 }, { "auxiliary_loss_clip": 0.01121321, "auxiliary_loss_mlp": 0.01031494, "balance_loss_clip": 1.01660442, "balance_loss_mlp": 1.04070401, "epoch": 0.4343303772734105, "flos": 26286395616000.0, "grad_norm": 3.583809848284316, "language_loss": 0.94034743, "learning_rate": 2.516540782741694e-06, "loss": 0.96187568, "num_input_tokens_seen": 154906775, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8046875, "step": 7224, "time_per_iteration": 2.5139248371124268 }, { "auxiliary_loss_clip": 0.01118128, "auxiliary_loss_mlp": 0.01035847, "balance_loss_clip": 1.02181578, "balance_loss_mlp": 1.04224229, "epoch": 0.43439050052607847, "flos": 26833961520000.0, "grad_norm": 1.6517298293247777, "language_loss": 0.61209142, "learning_rate": 2.5161645246637056e-06, "loss": 0.63363117, "num_input_tokens_seen": 154926990, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 7225, "time_per_iteration": 2.528775453567505 }, { "auxiliary_loss_clip": 0.01120171, "auxiliary_loss_mlp": 0.01032849, "balance_loss_clip": 1.01818085, "balance_loss_mlp": 1.04256904, "epoch": 0.43445062377874644, "flos": 21397660807680.0, "grad_norm": 2.0099836483656164, "language_loss": 0.77645612, "learning_rate": 2.5157882470127054e-06, "loss": 0.79798633, "num_input_tokens_seen": 154946210, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.77734375, "step": 7226, "time_per_iteration": 2.477363109588623 }, { "auxiliary_loss_clip": 0.01117657, "auxiliary_loss_mlp": 0.01028431, "balance_loss_clip": 1.01499605, "balance_loss_mlp": 1.04217303, "epoch": 0.4345107470314144, "flos": 19901945548800.0, "grad_norm": 1.616615399314539, "language_loss": 0.84927201, "learning_rate": 2.515411949802964e-06, "loss": 0.8707329, "num_input_tokens_seen": 154964995, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75390625, "step": 7227, "time_per_iteration": 2.475821018218994 }, { "auxiliary_loss_clip": 0.01116746, "auxiliary_loss_mlp": 0.01035186, "balance_loss_clip": 1.02047539, "balance_loss_mlp": 1.04096794, "epoch": 0.43457087028408237, "flos": 26432623883520.0, "grad_norm": 2.630979089603753, "language_loss": 0.76249921, "learning_rate": 2.5150356330487498e-06, "loss": 0.78401864, "num_input_tokens_seen": 154984775, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7578125, "step": 7228, "time_per_iteration": 2.5014524459838867 }, { "auxiliary_loss_clip": 0.01120363, "auxiliary_loss_mlp": 0.01039167, "balance_loss_clip": 1.02509987, "balance_loss_mlp": 1.04326677, "epoch": 0.43463099353675033, "flos": 31868816855040.0, "grad_norm": 1.9176160348535098, "language_loss": 0.80318213, "learning_rate": 2.5146592967643324e-06, "loss": 0.82477742, "num_input_tokens_seen": 155008125, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7734375, "step": 7229, "time_per_iteration": 2.5964269638061523 }, { "auxiliary_loss_clip": 0.01120806, "auxiliary_loss_mlp": 0.01041409, "balance_loss_clip": 1.0268774, "balance_loss_mlp": 1.04166305, "epoch": 0.4346911167894183, "flos": 24571266128640.0, "grad_norm": 2.1235344471948308, "language_loss": 0.81893611, "learning_rate": 2.5142829409639834e-06, "loss": 0.84055829, "num_input_tokens_seen": 155027885, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.79296875, "step": 7230, "time_per_iteration": 2.491891384124756 }, { "auxiliary_loss_clip": 0.01125753, "auxiliary_loss_mlp": 0.01036806, "balance_loss_clip": 1.02243567, "balance_loss_mlp": 1.04476833, "epoch": 0.43475124004208626, "flos": 17090678672640.0, "grad_norm": 2.4416913814190564, "language_loss": 0.77141786, "learning_rate": 2.513906565661973e-06, "loss": 0.79304349, "num_input_tokens_seen": 155043375, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.80859375, "step": 7231, "time_per_iteration": 2.452198028564453 }, { "auxiliary_loss_clip": 0.0112031, "auxiliary_loss_mlp": 0.01033839, "balance_loss_clip": 1.02082729, "balance_loss_mlp": 1.0438807, "epoch": 0.4348113632947542, "flos": 26104615862400.0, "grad_norm": 1.5734592757557508, "language_loss": 0.68652463, "learning_rate": 2.513530170872575e-06, "loss": 0.70806611, "num_input_tokens_seen": 155062930, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.765625, "step": 7232, "time_per_iteration": 2.512845277786255 }, { "auxiliary_loss_clip": 0.01123838, "auxiliary_loss_mlp": 0.01031706, "balance_loss_clip": 1.01682258, "balance_loss_mlp": 1.04305303, "epoch": 0.4348714865474222, "flos": 34200496316160.0, "grad_norm": 1.6849459067181092, "language_loss": 0.71474409, "learning_rate": 2.5131537566100605e-06, "loss": 0.73629951, "num_input_tokens_seen": 155084980, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80859375, "step": 7233, "time_per_iteration": 2.607177495956421 }, { "auxiliary_loss_clip": 0.01125225, "auxiliary_loss_mlp": 0.01040588, "balance_loss_clip": 1.02553153, "balance_loss_mlp": 1.04426861, "epoch": 0.43493160980009016, "flos": 31537468869120.0, "grad_norm": 1.8070052100698049, "language_loss": 0.74261427, "learning_rate": 2.5127773228887053e-06, "loss": 0.76427239, "num_input_tokens_seen": 155107260, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80859375, "step": 7234, "time_per_iteration": 2.5507652759552 }, { "auxiliary_loss_clip": 0.01127472, "auxiliary_loss_mlp": 0.01042485, "balance_loss_clip": 1.02778625, "balance_loss_mlp": 1.04510701, "epoch": 0.4349917330527582, "flos": 24061334699520.0, "grad_norm": 3.001224185149905, "language_loss": 0.59422278, "learning_rate": 2.512400869722782e-06, "loss": 0.61592233, "num_input_tokens_seen": 155126720, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.82421875, "step": 7235, "time_per_iteration": 2.5001583099365234 }, { "auxiliary_loss_clip": 0.01120584, "auxiliary_loss_mlp": 0.01037766, "balance_loss_clip": 1.02350879, "balance_loss_mlp": 1.04220426, "epoch": 0.43505185630542614, "flos": 30519329863680.0, "grad_norm": 1.5723069080626695, "language_loss": 0.77551872, "learning_rate": 2.512024397126566e-06, "loss": 0.79710221, "num_input_tokens_seen": 155148640, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78515625, "step": 7236, "time_per_iteration": 2.5401241779327393 }, { "auxiliary_loss_clip": 0.01118335, "auxiliary_loss_mlp": 0.01032959, "balance_loss_clip": 1.01857626, "balance_loss_mlp": 1.0429467, "epoch": 0.4351119795580941, "flos": 15735158196480.0, "grad_norm": 1.7788935733767282, "language_loss": 0.81685185, "learning_rate": 2.5116479051143345e-06, "loss": 0.83836478, "num_input_tokens_seen": 155165870, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75390625, "step": 7237, "time_per_iteration": 2.4535434246063232 }, { "auxiliary_loss_clip": 0.01120397, "auxiliary_loss_mlp": 0.01037718, "balance_loss_clip": 1.02354431, "balance_loss_mlp": 1.04314852, "epoch": 0.4351721028107621, "flos": 18731760272640.0, "grad_norm": 1.6562358368184023, "language_loss": 0.63352495, "learning_rate": 2.5112713937003623e-06, "loss": 0.65510607, "num_input_tokens_seen": 155185315, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 7238, "time_per_iteration": 2.470802068710327 }, { "auxiliary_loss_clip": 0.01117523, "auxiliary_loss_mlp": 0.01040944, "balance_loss_clip": 1.02659154, "balance_loss_mlp": 1.04164982, "epoch": 0.43523222606343004, "flos": 25226887121280.0, "grad_norm": 1.6719491255702073, "language_loss": 0.85905707, "learning_rate": 2.510894862898928e-06, "loss": 0.88064182, "num_input_tokens_seen": 155205790, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7578125, "step": 7239, "time_per_iteration": 2.556602716445923 }, { "auxiliary_loss_clip": 0.01123645, "auxiliary_loss_mlp": 0.01035207, "balance_loss_clip": 1.02131283, "balance_loss_mlp": 1.04535556, "epoch": 0.435292349316098, "flos": 22709190101760.0, "grad_norm": 1.5205633487983994, "language_loss": 0.72162789, "learning_rate": 2.510518312724309e-06, "loss": 0.7432164, "num_input_tokens_seen": 155226475, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78515625, "step": 7240, "time_per_iteration": 2.493101119995117 }, { "auxiliary_loss_clip": 0.0112578, "auxiliary_loss_mlp": 0.01035823, "balance_loss_clip": 1.02121949, "balance_loss_mlp": 1.04557514, "epoch": 0.43535247256876597, "flos": 25775889569280.0, "grad_norm": 2.4014439640154035, "language_loss": 0.82331753, "learning_rate": 2.5101417431907842e-06, "loss": 0.84493351, "num_input_tokens_seen": 155247110, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8046875, "step": 7241, "time_per_iteration": 2.5523033142089844 }, { "auxiliary_loss_clip": 0.01127366, "auxiliary_loss_mlp": 0.01039733, "balance_loss_clip": 1.02426004, "balance_loss_mlp": 1.04456639, "epoch": 0.43541259582143393, "flos": 17528142412800.0, "grad_norm": 3.1861886365330956, "language_loss": 0.79673386, "learning_rate": 2.5097651543126345e-06, "loss": 0.81840485, "num_input_tokens_seen": 155261335, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.828125, "step": 7242, "time_per_iteration": 2.4337966442108154 }, { "auxiliary_loss_clip": 0.01124171, "auxiliary_loss_mlp": 0.01036098, "balance_loss_clip": 1.02134025, "balance_loss_mlp": 1.04317105, "epoch": 0.4354727190741019, "flos": 15195205975680.0, "grad_norm": 2.1279621960255524, "language_loss": 0.67618024, "learning_rate": 2.509388546104138e-06, "loss": 0.69778287, "num_input_tokens_seen": 155278510, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.80859375, "step": 7243, "time_per_iteration": 2.4920034408569336 }, { "auxiliary_loss_clip": 0.01119384, "auxiliary_loss_mlp": 0.01035318, "balance_loss_clip": 1.02158475, "balance_loss_mlp": 1.04437256, "epoch": 0.43553284232676986, "flos": 16649264436480.0, "grad_norm": 2.207427787550501, "language_loss": 0.81364691, "learning_rate": 2.5090119185795766e-06, "loss": 0.83519387, "num_input_tokens_seen": 155296450, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.74609375, "step": 7244, "time_per_iteration": 2.4559805393218994 }, { "auxiliary_loss_clip": 0.01121178, "auxiliary_loss_mlp": 0.01029922, "balance_loss_clip": 1.01636815, "balance_loss_mlp": 1.04340374, "epoch": 0.43559296557943783, "flos": 23400865370880.0, "grad_norm": 1.7614308274878792, "language_loss": 0.73259699, "learning_rate": 2.508635271753234e-06, "loss": 0.75410795, "num_input_tokens_seen": 155316080, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.77734375, "step": 7245, "time_per_iteration": 2.5132229328155518 }, { "auxiliary_loss_clip": 0.01121488, "auxiliary_loss_mlp": 0.01041631, "balance_loss_clip": 1.0277667, "balance_loss_mlp": 1.04356563, "epoch": 0.4356530888321058, "flos": 22419067950720.0, "grad_norm": 3.3139075767411863, "language_loss": 0.76769853, "learning_rate": 2.508258605639389e-06, "loss": 0.78932971, "num_input_tokens_seen": 155336765, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 7246, "time_per_iteration": 2.475771427154541 }, { "auxiliary_loss_clip": 0.0112313, "auxiliary_loss_mlp": 0.01041, "balance_loss_clip": 1.02643239, "balance_loss_mlp": 1.04447436, "epoch": 0.43571321208477376, "flos": 21616141282560.0, "grad_norm": 1.8626234467888045, "language_loss": 0.85561901, "learning_rate": 2.5078819202523275e-06, "loss": 0.87726033, "num_input_tokens_seen": 155356440, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78515625, "step": 7247, "time_per_iteration": 2.4848368167877197 }, { "auxiliary_loss_clip": 0.01123895, "auxiliary_loss_mlp": 0.01037927, "balance_loss_clip": 1.0242002, "balance_loss_mlp": 1.04541099, "epoch": 0.4357733353374418, "flos": 23987358639360.0, "grad_norm": 2.088876337994879, "language_loss": 0.72706723, "learning_rate": 2.507505215606333e-06, "loss": 0.74868548, "num_input_tokens_seen": 155377070, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.78515625, "step": 7248, "time_per_iteration": 2.4895834922790527 }, { "auxiliary_loss_clip": 0.01122227, "auxiliary_loss_mlp": 0.01033213, "balance_loss_clip": 1.01928353, "balance_loss_mlp": 1.04494774, "epoch": 0.43583345859010975, "flos": 25264737077760.0, "grad_norm": 1.6477318737806212, "language_loss": 0.87290502, "learning_rate": 2.5071284917156893e-06, "loss": 0.89445943, "num_input_tokens_seen": 155398415, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7734375, "step": 7249, "time_per_iteration": 2.522064208984375 }, { "auxiliary_loss_clip": 0.01124973, "auxiliary_loss_mlp": 0.0104041, "balance_loss_clip": 1.02636743, "balance_loss_mlp": 1.04462194, "epoch": 0.4358935818427777, "flos": 23696302734720.0, "grad_norm": 2.0303927043295547, "language_loss": 0.81916785, "learning_rate": 2.506751748594683e-06, "loss": 0.84082162, "num_input_tokens_seen": 155415625, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8046875, "step": 7250, "time_per_iteration": 2.4779603481292725 }, { "auxiliary_loss_clip": 0.01127655, "auxiliary_loss_mlp": 0.01034104, "balance_loss_clip": 1.01945364, "balance_loss_mlp": 1.04823041, "epoch": 0.4359537050954457, "flos": 29532827761920.0, "grad_norm": 1.979234163060599, "language_loss": 0.84920824, "learning_rate": 2.5063749862575988e-06, "loss": 0.87082577, "num_input_tokens_seen": 155435505, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.79296875, "step": 7251, "time_per_iteration": 2.573484420776367 }, { "auxiliary_loss_clip": 0.01119341, "auxiliary_loss_mlp": 0.01036366, "balance_loss_clip": 1.02194166, "balance_loss_mlp": 1.0422895, "epoch": 0.43601382834811364, "flos": 22711273090560.0, "grad_norm": 2.231235948705985, "language_loss": 0.69232297, "learning_rate": 2.5059982047187245e-06, "loss": 0.71388006, "num_input_tokens_seen": 155455425, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76953125, "step": 7252, "time_per_iteration": 2.4826669692993164 }, { "auxiliary_loss_clip": 0.01120015, "auxiliary_loss_mlp": 0.01034726, "balance_loss_clip": 1.02017665, "balance_loss_mlp": 1.04428566, "epoch": 0.4360739516007816, "flos": 19098731571840.0, "grad_norm": 2.360084230445277, "language_loss": 0.83535868, "learning_rate": 2.505621403992348e-06, "loss": 0.85690606, "num_input_tokens_seen": 155474250, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7578125, "step": 7253, "time_per_iteration": 2.5128872394561768 }, { "auxiliary_loss_clip": 0.01122862, "auxiliary_loss_mlp": 0.01037019, "balance_loss_clip": 1.02235031, "balance_loss_mlp": 1.04544806, "epoch": 0.43613407485344957, "flos": 23404420817280.0, "grad_norm": 1.7633033490200511, "language_loss": 0.70199257, "learning_rate": 2.505244584092757e-06, "loss": 0.72359139, "num_input_tokens_seen": 155494685, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7734375, "step": 7254, "time_per_iteration": 2.482362747192383 }, { "auxiliary_loss_clip": 0.01120418, "auxiliary_loss_mlp": 0.01034932, "balance_loss_clip": 1.02106822, "balance_loss_mlp": 1.04504526, "epoch": 0.43619419810611754, "flos": 22637799820800.0, "grad_norm": 1.8996989093373022, "language_loss": 0.81842661, "learning_rate": 2.5048677450342406e-06, "loss": 0.83998007, "num_input_tokens_seen": 155513040, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75390625, "step": 7255, "time_per_iteration": 2.4982738494873047 }, { "auxiliary_loss_clip": 0.01121616, "auxiliary_loss_mlp": 0.01033645, "balance_loss_clip": 1.02012706, "balance_loss_mlp": 1.04364467, "epoch": 0.4362543213587855, "flos": 20047958334720.0, "grad_norm": 1.7741361785084209, "language_loss": 0.7734046, "learning_rate": 2.504490886831089e-06, "loss": 0.79495722, "num_input_tokens_seen": 155530100, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.78125, "step": 7256, "time_per_iteration": 2.459867000579834 }, { "auxiliary_loss_clip": 0.01122732, "auxiliary_loss_mlp": 0.01034385, "balance_loss_clip": 1.0206821, "balance_loss_mlp": 1.04636359, "epoch": 0.43631444461145347, "flos": 21361319222400.0, "grad_norm": 1.6856831266821506, "language_loss": 0.76020736, "learning_rate": 2.5041140094975922e-06, "loss": 0.78177851, "num_input_tokens_seen": 155549375, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 7257, "time_per_iteration": 2.516145706176758 }, { "auxiliary_loss_clip": 0.01121462, "auxiliary_loss_mlp": 0.01033644, "balance_loss_clip": 1.01889753, "balance_loss_mlp": 1.0431639, "epoch": 0.43637456786412143, "flos": 22418529246720.0, "grad_norm": 1.7568829298013016, "language_loss": 0.72982931, "learning_rate": 2.5037371130480417e-06, "loss": 0.75138038, "num_input_tokens_seen": 155569395, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78125, "step": 7258, "time_per_iteration": 3.9875810146331787 }, { "auxiliary_loss_clip": 0.0112244, "auxiliary_loss_mlp": 0.01037165, "balance_loss_clip": 1.02272904, "balance_loss_mlp": 1.04346824, "epoch": 0.4364346911167894, "flos": 28548839612160.0, "grad_norm": 1.7642229913872074, "language_loss": 0.7687977, "learning_rate": 2.5033601974967297e-06, "loss": 0.79039371, "num_input_tokens_seen": 155589090, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7890625, "step": 7259, "time_per_iteration": 2.5544562339782715 }, { "auxiliary_loss_clip": 0.01049562, "auxiliary_loss_mlp": 0.01005283, "balance_loss_clip": 1.00345945, "balance_loss_mlp": 1.02367604, "epoch": 0.43649481436945736, "flos": 62659345380480.0, "grad_norm": 0.74725374050953, "language_loss": 0.56992781, "learning_rate": 2.5029832628579483e-06, "loss": 0.59047627, "num_input_tokens_seen": 155648660, "router_z_loss_clip": 0.01818848, "router_z_loss_mlp": 0.2578125, "step": 7260, "time_per_iteration": 5.795280694961548 }, { "auxiliary_loss_clip": 0.01123381, "auxiliary_loss_mlp": 0.01046152, "balance_loss_clip": 1.03154337, "balance_loss_mlp": 1.04372525, "epoch": 0.4365549376221254, "flos": 30592120775040.0, "grad_norm": 2.2166120900896114, "language_loss": 0.71241575, "learning_rate": 2.5026063091459907e-06, "loss": 0.73411113, "num_input_tokens_seen": 155669945, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.796875, "step": 7261, "time_per_iteration": 3.96502685546875 }, { "auxiliary_loss_clip": 0.01121249, "auxiliary_loss_mlp": 0.01040031, "balance_loss_clip": 1.02536201, "balance_loss_mlp": 1.04270685, "epoch": 0.43661506087479335, "flos": 17165875795200.0, "grad_norm": 5.5782690911052955, "language_loss": 0.6931361, "learning_rate": 2.5022293363751522e-06, "loss": 0.71474892, "num_input_tokens_seen": 155688555, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78515625, "step": 7262, "time_per_iteration": 2.5324058532714844 }, { "auxiliary_loss_clip": 0.01114343, "auxiliary_loss_mlp": 0.0103718, "balance_loss_clip": 1.02477074, "balance_loss_mlp": 1.04166329, "epoch": 0.4366751841274613, "flos": 22047499710720.0, "grad_norm": 2.3196838622386076, "language_loss": 0.7968061, "learning_rate": 2.501852344559726e-06, "loss": 0.81832135, "num_input_tokens_seen": 155705370, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7265625, "step": 7263, "time_per_iteration": 2.507046699523926 }, { "auxiliary_loss_clip": 0.01121791, "auxiliary_loss_mlp": 0.01047723, "balance_loss_clip": 1.03288722, "balance_loss_mlp": 1.04481339, "epoch": 0.4367353073801293, "flos": 15997306631040.0, "grad_norm": 1.9137509890232172, "language_loss": 0.7542187, "learning_rate": 2.50147533371401e-06, "loss": 0.77591383, "num_input_tokens_seen": 155721890, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.76953125, "step": 7264, "time_per_iteration": 2.50128173828125 }, { "auxiliary_loss_clip": 0.01119405, "auxiliary_loss_mlp": 0.01038445, "balance_loss_clip": 1.02393675, "balance_loss_mlp": 1.04306173, "epoch": 0.43679543063279724, "flos": 38217535868160.0, "grad_norm": 1.8660922637555282, "language_loss": 0.62332278, "learning_rate": 2.501098303852298e-06, "loss": 0.64490128, "num_input_tokens_seen": 155743970, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76171875, "step": 7265, "time_per_iteration": 2.63101863861084 }, { "auxiliary_loss_clip": 0.01119074, "auxiliary_loss_mlp": 0.01032299, "balance_loss_clip": 1.01842928, "balance_loss_mlp": 1.04314923, "epoch": 0.4368555538854652, "flos": 15193230727680.0, "grad_norm": 2.8961537449858885, "language_loss": 0.72715944, "learning_rate": 2.5007212549888884e-06, "loss": 0.74867314, "num_input_tokens_seen": 155761830, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 7266, "time_per_iteration": 2.468965530395508 }, { "auxiliary_loss_clip": 0.01122214, "auxiliary_loss_mlp": 0.01034348, "balance_loss_clip": 1.0197804, "balance_loss_mlp": 1.04376996, "epoch": 0.4369156771381332, "flos": 23069086421760.0, "grad_norm": 2.2065613620436024, "language_loss": 0.81921762, "learning_rate": 2.5003441871380794e-06, "loss": 0.84078324, "num_input_tokens_seen": 155779610, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78515625, "step": 7267, "time_per_iteration": 2.4811580181121826 }, { "auxiliary_loss_clip": 0.01118018, "auxiliary_loss_mlp": 0.01030285, "balance_loss_clip": 1.01707697, "balance_loss_mlp": 1.04234481, "epoch": 0.43697580039080114, "flos": 23441085624960.0, "grad_norm": 2.064925053105522, "language_loss": 0.74709684, "learning_rate": 2.4999671003141674e-06, "loss": 0.7685799, "num_input_tokens_seen": 155798765, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7578125, "step": 7268, "time_per_iteration": 2.53670334815979 }, { "auxiliary_loss_clip": 0.01124564, "auxiliary_loss_mlp": 0.01035745, "balance_loss_clip": 1.02098644, "balance_loss_mlp": 1.04381323, "epoch": 0.4370359236434691, "flos": 18514680428160.0, "grad_norm": 9.5228041549276, "language_loss": 0.79711604, "learning_rate": 2.499589994531454e-06, "loss": 0.81871915, "num_input_tokens_seen": 155817750, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.80859375, "step": 7269, "time_per_iteration": 2.465294122695923 }, { "auxiliary_loss_clip": 0.01120245, "auxiliary_loss_mlp": 0.01038261, "balance_loss_clip": 1.02415276, "balance_loss_mlp": 1.04324961, "epoch": 0.43709604689613707, "flos": 23222497409280.0, "grad_norm": 1.997502345347419, "language_loss": 0.74838734, "learning_rate": 2.499212869804237e-06, "loss": 0.76997244, "num_input_tokens_seen": 155836490, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76953125, "step": 7270, "time_per_iteration": 2.5559582710266113 }, { "auxiliary_loss_clip": 0.01119102, "auxiliary_loss_mlp": 0.01032738, "balance_loss_clip": 1.01853967, "balance_loss_mlp": 1.0426023, "epoch": 0.43715617014880503, "flos": 23803711378560.0, "grad_norm": 2.183820967917051, "language_loss": 0.80082375, "learning_rate": 2.4988357261468182e-06, "loss": 0.82234216, "num_input_tokens_seen": 155856225, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.765625, "step": 7271, "time_per_iteration": 2.4787304401397705 }, { "auxiliary_loss_clip": 0.01045622, "auxiliary_loss_mlp": 0.01006646, "balance_loss_clip": 1.00478673, "balance_loss_mlp": 1.01969266, "epoch": 0.437216293401473, "flos": 61941204766080.0, "grad_norm": 0.6995866946863661, "language_loss": 0.54925197, "learning_rate": 2.4984585635734993e-06, "loss": 0.56977463, "num_input_tokens_seen": 155916770, "router_z_loss_clip": 0.01855469, "router_z_loss_mlp": 0.2578125, "step": 7272, "time_per_iteration": 3.1769063472747803 }, { "auxiliary_loss_clip": 0.01121699, "auxiliary_loss_mlp": 0.01038211, "balance_loss_clip": 1.02302957, "balance_loss_mlp": 1.04260349, "epoch": 0.43727641665414096, "flos": 21982250655360.0, "grad_norm": 1.7509816481427127, "language_loss": 0.70158827, "learning_rate": 2.498081382098581e-06, "loss": 0.72318733, "num_input_tokens_seen": 155936490, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.7890625, "step": 7273, "time_per_iteration": 2.463583469390869 }, { "auxiliary_loss_clip": 0.01120732, "auxiliary_loss_mlp": 0.01037522, "balance_loss_clip": 1.02265692, "balance_loss_mlp": 1.04158652, "epoch": 0.437336539906809, "flos": 39530860842240.0, "grad_norm": 1.873988556420746, "language_loss": 0.75480902, "learning_rate": 2.497704181736367e-06, "loss": 0.77639163, "num_input_tokens_seen": 155957595, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.79296875, "step": 7274, "time_per_iteration": 2.6703603267669678 }, { "auxiliary_loss_clip": 0.01116136, "auxiliary_loss_mlp": 0.01025754, "balance_loss_clip": 1.01327848, "balance_loss_mlp": 1.04050827, "epoch": 0.43739666315947695, "flos": 17457147181440.0, "grad_norm": 2.063367526555797, "language_loss": 0.80079597, "learning_rate": 2.49732696250116e-06, "loss": 0.82221484, "num_input_tokens_seen": 155975710, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7578125, "step": 7275, "time_per_iteration": 2.4482083320617676 }, { "auxiliary_loss_clip": 0.01121094, "auxiliary_loss_mlp": 0.01034867, "balance_loss_clip": 1.02108693, "balance_loss_mlp": 1.04449415, "epoch": 0.4374567864121449, "flos": 16358747235840.0, "grad_norm": 2.2275053678694263, "language_loss": 0.81005239, "learning_rate": 2.496949724407266e-06, "loss": 0.83161205, "num_input_tokens_seen": 155993090, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.765625, "step": 7276, "time_per_iteration": 2.4861485958099365 }, { "auxiliary_loss_clip": 0.01124428, "auxiliary_loss_mlp": 0.01030145, "balance_loss_clip": 1.01559591, "balance_loss_mlp": 1.04251063, "epoch": 0.4375169096648129, "flos": 30587523834240.0, "grad_norm": 2.762837629277479, "language_loss": 0.7273463, "learning_rate": 2.496572467468988e-06, "loss": 0.74889201, "num_input_tokens_seen": 156013685, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8203125, "step": 7277, "time_per_iteration": 2.551912546157837 }, { "auxiliary_loss_clip": 0.01119791, "auxiliary_loss_mlp": 0.01032145, "balance_loss_clip": 1.01733923, "balance_loss_mlp": 1.0426929, "epoch": 0.43757703291748085, "flos": 30555599621760.0, "grad_norm": 2.1443771207504874, "language_loss": 0.72966021, "learning_rate": 2.4961951917006317e-06, "loss": 0.75117958, "num_input_tokens_seen": 156034300, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.76953125, "step": 7278, "time_per_iteration": 2.59450364112854 }, { "auxiliary_loss_clip": 0.01117584, "auxiliary_loss_mlp": 0.01037546, "balance_loss_clip": 1.02446818, "balance_loss_mlp": 1.04243898, "epoch": 0.4376371561701488, "flos": 21397373498880.0, "grad_norm": 1.571835297400935, "language_loss": 0.6616447, "learning_rate": 2.4958178971165046e-06, "loss": 0.68319601, "num_input_tokens_seen": 156053805, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75, "step": 7279, "time_per_iteration": 2.4757444858551025 }, { "auxiliary_loss_clip": 0.01123613, "auxiliary_loss_mlp": 0.01037713, "balance_loss_clip": 1.02339637, "balance_loss_mlp": 1.04278064, "epoch": 0.4376972794228168, "flos": 23404384903680.0, "grad_norm": 1.9712847397078546, "language_loss": 0.82254755, "learning_rate": 2.4954405837309126e-06, "loss": 0.8441608, "num_input_tokens_seen": 156073295, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.80859375, "step": 7280, "time_per_iteration": 2.5252785682678223 }, { "auxiliary_loss_clip": 0.01114861, "auxiliary_loss_mlp": 0.01033294, "balance_loss_clip": 1.01977551, "balance_loss_mlp": 1.03974915, "epoch": 0.43775740267548474, "flos": 22892945103360.0, "grad_norm": 2.403132998216051, "language_loss": 0.76860887, "learning_rate": 2.4950632515581653e-06, "loss": 0.79009044, "num_input_tokens_seen": 156094540, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75, "step": 7281, "time_per_iteration": 2.4900007247924805 }, { "auxiliary_loss_clip": 0.0111802, "auxiliary_loss_mlp": 0.01040732, "balance_loss_clip": 1.02696335, "balance_loss_mlp": 1.04045057, "epoch": 0.4378175259281527, "flos": 23294390480640.0, "grad_norm": 2.042380795009528, "language_loss": 0.75846076, "learning_rate": 2.494685900612569e-06, "loss": 0.78004831, "num_input_tokens_seen": 156114070, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7734375, "step": 7282, "time_per_iteration": 2.5468599796295166 }, { "auxiliary_loss_clip": 0.01120364, "auxiliary_loss_mlp": 0.01037624, "balance_loss_clip": 1.02330101, "balance_loss_mlp": 1.04216743, "epoch": 0.43787764918082067, "flos": 23876897339520.0, "grad_norm": 1.83172650979542, "language_loss": 0.8471148, "learning_rate": 2.4943085309084333e-06, "loss": 0.86869466, "num_input_tokens_seen": 156132130, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.78125, "step": 7283, "time_per_iteration": 2.472607135772705 }, { "auxiliary_loss_clip": 0.01123459, "auxiliary_loss_mlp": 0.01034806, "balance_loss_clip": 1.01983881, "balance_loss_mlp": 1.04257655, "epoch": 0.43793777243348864, "flos": 23988148738560.0, "grad_norm": 2.590635924519567, "language_loss": 0.80499524, "learning_rate": 2.49393114246007e-06, "loss": 0.8265779, "num_input_tokens_seen": 156150820, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.80859375, "step": 7284, "time_per_iteration": 2.530747413635254 }, { "auxiliary_loss_clip": 0.01120497, "auxiliary_loss_mlp": 0.01041897, "balance_loss_clip": 1.02841496, "balance_loss_mlp": 1.04306531, "epoch": 0.4379978956861566, "flos": 18624064320000.0, "grad_norm": 1.7371295165548957, "language_loss": 0.80219448, "learning_rate": 2.493553735281787e-06, "loss": 0.82381845, "num_input_tokens_seen": 156170125, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7734375, "step": 7285, "time_per_iteration": 2.456022262573242 }, { "auxiliary_loss_clip": 0.01117311, "auxiliary_loss_mlp": 0.01028527, "balance_loss_clip": 1.01497269, "balance_loss_mlp": 1.04107404, "epoch": 0.43805801893882457, "flos": 21981388728960.0, "grad_norm": 2.039057104663033, "language_loss": 0.74920082, "learning_rate": 2.493176309387897e-06, "loss": 0.77065921, "num_input_tokens_seen": 156187320, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.76171875, "step": 7286, "time_per_iteration": 2.5049705505371094 }, { "auxiliary_loss_clip": 0.01120318, "auxiliary_loss_mlp": 0.01031001, "balance_loss_clip": 1.01670206, "balance_loss_mlp": 1.04123306, "epoch": 0.43811814219149253, "flos": 26393337383040.0, "grad_norm": 1.6526708131459207, "language_loss": 0.73425925, "learning_rate": 2.492798864792712e-06, "loss": 0.75577247, "num_input_tokens_seen": 156207455, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.79296875, "step": 7287, "time_per_iteration": 2.5206918716430664 }, { "auxiliary_loss_clip": 0.0112209, "auxiliary_loss_mlp": 0.01041584, "balance_loss_clip": 1.02684975, "balance_loss_mlp": 1.04296827, "epoch": 0.43817826544416055, "flos": 17493309198720.0, "grad_norm": 1.7733184952370111, "language_loss": 0.8245216, "learning_rate": 2.492421401510545e-06, "loss": 0.84615839, "num_input_tokens_seen": 156226560, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.79296875, "step": 7288, "time_per_iteration": 2.489328622817993 }, { "auxiliary_loss_clip": 0.01120911, "auxiliary_loss_mlp": 0.01033329, "balance_loss_clip": 1.0187974, "balance_loss_mlp": 1.03937531, "epoch": 0.4382383886968285, "flos": 21581020759680.0, "grad_norm": 1.8543170805547495, "language_loss": 0.84180403, "learning_rate": 2.4920439195557093e-06, "loss": 0.86334646, "num_input_tokens_seen": 156246740, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.81640625, "step": 7289, "time_per_iteration": 2.4697701930999756 }, { "auxiliary_loss_clip": 0.01124491, "auxiliary_loss_mlp": 0.01033769, "balance_loss_clip": 1.02016711, "balance_loss_mlp": 1.04196703, "epoch": 0.4382985119494965, "flos": 27923742201600.0, "grad_norm": 1.6029399775788018, "language_loss": 0.78163552, "learning_rate": 2.4916664189425183e-06, "loss": 0.80321813, "num_input_tokens_seen": 156266440, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.828125, "step": 7290, "time_per_iteration": 2.5991413593292236 }, { "auxiliary_loss_clip": 0.01120018, "auxiliary_loss_mlp": 0.01037404, "balance_loss_clip": 1.02420187, "balance_loss_mlp": 1.04325986, "epoch": 0.43835863520216445, "flos": 24936836797440.0, "grad_norm": 1.8224081230110232, "language_loss": 0.77735651, "learning_rate": 2.491288899685288e-06, "loss": 0.79893076, "num_input_tokens_seen": 156286900, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.765625, "step": 7291, "time_per_iteration": 2.5110671520233154 }, { "auxiliary_loss_clip": 0.01118753, "auxiliary_loss_mlp": 0.0103227, "balance_loss_clip": 1.0184834, "balance_loss_mlp": 1.04052746, "epoch": 0.4384187584548324, "flos": 33510293504640.0, "grad_norm": 1.8861374018302959, "language_loss": 0.65122616, "learning_rate": 2.4909113617983325e-06, "loss": 0.67273641, "num_input_tokens_seen": 156307690, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.78125, "step": 7292, "time_per_iteration": 2.636956214904785 }, { "auxiliary_loss_clip": 0.01119592, "auxiliary_loss_mlp": 0.01031523, "balance_loss_clip": 1.01813626, "balance_loss_mlp": 1.04047775, "epoch": 0.4384788817075004, "flos": 23951052967680.0, "grad_norm": 1.7621181969562179, "language_loss": 0.74370551, "learning_rate": 2.49053380529597e-06, "loss": 0.76521659, "num_input_tokens_seen": 156326620, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7890625, "step": 7293, "time_per_iteration": 2.5009026527404785 }, { "auxiliary_loss_clip": 0.01121782, "auxiliary_loss_mlp": 0.01038928, "balance_loss_clip": 1.02397919, "balance_loss_mlp": 1.04332113, "epoch": 0.43853900496016834, "flos": 19098516090240.0, "grad_norm": 2.188108386768947, "language_loss": 0.7833156, "learning_rate": 2.490156230192516e-06, "loss": 0.8049227, "num_input_tokens_seen": 156345495, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.78125, "step": 7294, "time_per_iteration": 2.522757053375244 }, { "auxiliary_loss_clip": 0.01121153, "auxiliary_loss_mlp": 0.01040725, "balance_loss_clip": 1.02667642, "balance_loss_mlp": 1.0421977, "epoch": 0.4385991282128363, "flos": 13225362168960.0, "grad_norm": 1.7805510334647539, "language_loss": 0.7344535, "learning_rate": 2.4897786365022883e-06, "loss": 0.75607228, "num_input_tokens_seen": 156363155, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7890625, "step": 7295, "time_per_iteration": 2.482450485229492 }, { "auxiliary_loss_clip": 0.01123574, "auxiliary_loss_mlp": 0.01040295, "balance_loss_clip": 1.02510726, "balance_loss_mlp": 1.04326057, "epoch": 0.4386592514655043, "flos": 14319883445760.0, "grad_norm": 2.196578690112048, "language_loss": 0.75313717, "learning_rate": 2.4894010242396063e-06, "loss": 0.77477586, "num_input_tokens_seen": 156380940, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80078125, "step": 7296, "time_per_iteration": 2.4891278743743896 }, { "auxiliary_loss_clip": 0.01122179, "auxiliary_loss_mlp": 0.01032992, "balance_loss_clip": 1.01876438, "balance_loss_mlp": 1.04375219, "epoch": 0.43871937471817224, "flos": 22784423137920.0, "grad_norm": 1.8399022149006106, "language_loss": 0.69245243, "learning_rate": 2.4890233934187873e-06, "loss": 0.71400416, "num_input_tokens_seen": 156400415, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78515625, "step": 7297, "time_per_iteration": 2.511587619781494 }, { "auxiliary_loss_clip": 0.01118764, "auxiliary_loss_mlp": 0.01031342, "balance_loss_clip": 1.01789498, "balance_loss_mlp": 1.04197884, "epoch": 0.4387794979708402, "flos": 28072304853120.0, "grad_norm": 1.62756967674236, "language_loss": 0.70236403, "learning_rate": 2.4886457440541535e-06, "loss": 0.72386509, "num_input_tokens_seen": 156421120, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.765625, "step": 7298, "time_per_iteration": 2.5639283657073975 }, { "auxiliary_loss_clip": 0.01120113, "auxiliary_loss_mlp": 0.01027142, "balance_loss_clip": 1.0135169, "balance_loss_mlp": 1.0437206, "epoch": 0.43883962122350817, "flos": 26249551240320.0, "grad_norm": 1.4613365723086567, "language_loss": 0.72355127, "learning_rate": 2.4882680761600238e-06, "loss": 0.74502385, "num_input_tokens_seen": 156441535, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.765625, "step": 7299, "time_per_iteration": 4.005906105041504 }, { "auxiliary_loss_clip": 0.01123573, "auxiliary_loss_mlp": 0.01043217, "balance_loss_clip": 1.02843451, "balance_loss_mlp": 1.04449844, "epoch": 0.43889974447617613, "flos": 25883765089920.0, "grad_norm": 2.5159791967158176, "language_loss": 0.76753438, "learning_rate": 2.487890389750719e-06, "loss": 0.78920221, "num_input_tokens_seen": 156462015, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7890625, "step": 7300, "time_per_iteration": 2.54876446723938 }, { "auxiliary_loss_clip": 0.01120811, "auxiliary_loss_mlp": 0.0103101, "balance_loss_clip": 1.016204, "balance_loss_mlp": 1.04266524, "epoch": 0.43895986772884416, "flos": 25046615738880.0, "grad_norm": 1.980803739605453, "language_loss": 0.70855141, "learning_rate": 2.4875126848405626e-06, "loss": 0.73006964, "num_input_tokens_seen": 156482165, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78125, "step": 7301, "time_per_iteration": 3.9022023677825928 }, { "auxiliary_loss_clip": 0.01125674, "auxiliary_loss_mlp": 0.0103359, "balance_loss_clip": 1.01824796, "balance_loss_mlp": 1.04525173, "epoch": 0.4390199909815121, "flos": 25994585525760.0, "grad_norm": 1.96494158519671, "language_loss": 0.70299125, "learning_rate": 2.4871349614438757e-06, "loss": 0.72458386, "num_input_tokens_seen": 156503170, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8046875, "step": 7302, "time_per_iteration": 3.9819047451019287 }, { "auxiliary_loss_clip": 0.01121316, "auxiliary_loss_mlp": 0.01035483, "balance_loss_clip": 1.02198815, "balance_loss_mlp": 1.04376364, "epoch": 0.4390801142341801, "flos": 29022249888000.0, "grad_norm": 2.284773226073573, "language_loss": 0.82394391, "learning_rate": 2.486757219574983e-06, "loss": 0.84551191, "num_input_tokens_seen": 156523005, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7734375, "step": 7303, "time_per_iteration": 3.9103457927703857 }, { "auxiliary_loss_clip": 0.01129052, "auxiliary_loss_mlp": 0.01040656, "balance_loss_clip": 1.02446771, "balance_loss_mlp": 1.04604626, "epoch": 0.43914023748684805, "flos": 33438544087680.0, "grad_norm": 2.421814584614666, "language_loss": 0.68363774, "learning_rate": 2.4863794592482067e-06, "loss": 0.70533478, "num_input_tokens_seen": 156544440, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.828125, "step": 7304, "time_per_iteration": 2.6221814155578613 }, { "auxiliary_loss_clip": 0.01119567, "auxiliary_loss_mlp": 0.01037683, "balance_loss_clip": 1.02392006, "balance_loss_mlp": 1.04381669, "epoch": 0.439200360739516, "flos": 34531844302080.0, "grad_norm": 2.018296830944988, "language_loss": 0.78154236, "learning_rate": 2.486001680477873e-06, "loss": 0.80311483, "num_input_tokens_seen": 156565410, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7578125, "step": 7305, "time_per_iteration": 2.6055634021759033 }, { "auxiliary_loss_clip": 0.01120396, "auxiliary_loss_mlp": 0.01032691, "balance_loss_clip": 1.01835585, "balance_loss_mlp": 1.04266596, "epoch": 0.439260483992184, "flos": 21907843632000.0, "grad_norm": 2.133134504207139, "language_loss": 0.69089687, "learning_rate": 2.485623883278308e-06, "loss": 0.71242774, "num_input_tokens_seen": 156584210, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.77734375, "step": 7306, "time_per_iteration": 2.4964170455932617 }, { "auxiliary_loss_clip": 0.01120414, "auxiliary_loss_mlp": 0.01028961, "balance_loss_clip": 1.01421428, "balance_loss_mlp": 1.04172444, "epoch": 0.43932060724485195, "flos": 20996430912000.0, "grad_norm": 1.545664373886244, "language_loss": 0.62437969, "learning_rate": 2.4852460676638344e-06, "loss": 0.64587343, "num_input_tokens_seen": 156602730, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7890625, "step": 7307, "time_per_iteration": 2.496649980545044 }, { "auxiliary_loss_clip": 0.0112573, "auxiliary_loss_mlp": 0.01030372, "balance_loss_clip": 1.0162518, "balance_loss_mlp": 1.04416502, "epoch": 0.4393807304975199, "flos": 17747053850880.0, "grad_norm": 6.164402264490347, "language_loss": 0.72181857, "learning_rate": 2.4848682336487828e-06, "loss": 0.74337959, "num_input_tokens_seen": 156619405, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8125, "step": 7308, "time_per_iteration": 2.4752368927001953 }, { "auxiliary_loss_clip": 0.01126188, "auxiliary_loss_mlp": 0.01034708, "balance_loss_clip": 1.02042091, "balance_loss_mlp": 1.04345989, "epoch": 0.4394408537501879, "flos": 22528523669760.0, "grad_norm": 1.91159903202055, "language_loss": 0.76677299, "learning_rate": 2.4844903812474787e-06, "loss": 0.78838199, "num_input_tokens_seen": 156638165, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.828125, "step": 7309, "time_per_iteration": 2.4960930347442627 }, { "auxiliary_loss_clip": 0.01118246, "auxiliary_loss_mlp": 0.01026938, "balance_loss_clip": 1.01383662, "balance_loss_mlp": 1.04342043, "epoch": 0.43950097700285584, "flos": 23440654661760.0, "grad_norm": 1.880678165411086, "language_loss": 0.71152353, "learning_rate": 2.484112510474251e-06, "loss": 0.73297542, "num_input_tokens_seen": 156658845, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.74609375, "step": 7310, "time_per_iteration": 2.5044775009155273 }, { "auxiliary_loss_clip": 0.01124073, "auxiliary_loss_mlp": 0.010321, "balance_loss_clip": 1.01759195, "balance_loss_mlp": 1.0439465, "epoch": 0.4395611002555238, "flos": 23180696956800.0, "grad_norm": 2.3087069554684088, "language_loss": 0.75747216, "learning_rate": 2.483734621343429e-06, "loss": 0.7790339, "num_input_tokens_seen": 156677275, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80078125, "step": 7311, "time_per_iteration": 2.4841229915618896 }, { "auxiliary_loss_clip": 0.01124801, "auxiliary_loss_mlp": 0.01037352, "balance_loss_clip": 1.02373242, "balance_loss_mlp": 1.04408836, "epoch": 0.43962122350819177, "flos": 22127365601280.0, "grad_norm": 2.291727776186316, "language_loss": 0.81379128, "learning_rate": 2.483356713869341e-06, "loss": 0.83541286, "num_input_tokens_seen": 156695815, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.8046875, "step": 7312, "time_per_iteration": 2.5240983963012695 }, { "auxiliary_loss_clip": 0.01120292, "auxiliary_loss_mlp": 0.01033115, "balance_loss_clip": 1.01891756, "balance_loss_mlp": 1.04252338, "epoch": 0.43968134676085974, "flos": 17420554200960.0, "grad_norm": 1.8635352458722565, "language_loss": 0.84689891, "learning_rate": 2.482978788066318e-06, "loss": 0.868433, "num_input_tokens_seen": 156714385, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.77734375, "step": 7313, "time_per_iteration": 2.473809003829956 }, { "auxiliary_loss_clip": 0.01124857, "auxiliary_loss_mlp": 0.01032714, "balance_loss_clip": 1.01873696, "balance_loss_mlp": 1.04412484, "epoch": 0.43974147001352776, "flos": 18952646958720.0, "grad_norm": 2.015932307766107, "language_loss": 0.67570454, "learning_rate": 2.4826008439486904e-06, "loss": 0.69728029, "num_input_tokens_seen": 156732615, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.8046875, "step": 7314, "time_per_iteration": 2.475888252258301 }, { "auxiliary_loss_clip": 0.01126657, "auxiliary_loss_mlp": 0.01031357, "balance_loss_clip": 1.01663423, "balance_loss_mlp": 1.04601717, "epoch": 0.4398015932661957, "flos": 18953508885120.0, "grad_norm": 1.9577637914067028, "language_loss": 0.76468396, "learning_rate": 2.4822228815307915e-06, "loss": 0.78626412, "num_input_tokens_seen": 156750920, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8046875, "step": 7315, "time_per_iteration": 2.4898409843444824 }, { "auxiliary_loss_clip": 0.01122661, "auxiliary_loss_mlp": 0.01031084, "balance_loss_clip": 1.01637363, "balance_loss_mlp": 1.04489923, "epoch": 0.4398617165188637, "flos": 24199913370240.0, "grad_norm": 2.06258056930065, "language_loss": 0.7398349, "learning_rate": 2.4818449008269523e-06, "loss": 0.76137239, "num_input_tokens_seen": 156768520, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.77734375, "step": 7316, "time_per_iteration": 2.5389840602874756 }, { "auxiliary_loss_clip": 0.01123186, "auxiliary_loss_mlp": 0.01036589, "balance_loss_clip": 1.02338672, "balance_loss_mlp": 1.04487884, "epoch": 0.43992183977153165, "flos": 22236677665920.0, "grad_norm": 3.652731490104672, "language_loss": 0.65381157, "learning_rate": 2.481466901851506e-06, "loss": 0.67540932, "num_input_tokens_seen": 156788700, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.78125, "step": 7317, "time_per_iteration": 2.500330924987793 }, { "auxiliary_loss_clip": 0.01125424, "auxiliary_loss_mlp": 0.01035137, "balance_loss_clip": 1.02120137, "balance_loss_mlp": 1.0454762, "epoch": 0.4399819630241996, "flos": 18697465762560.0, "grad_norm": 5.446485105286052, "language_loss": 0.79870498, "learning_rate": 2.4810888846187865e-06, "loss": 0.82031059, "num_input_tokens_seen": 156806470, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.796875, "step": 7318, "time_per_iteration": 2.4603183269500732 }, { "auxiliary_loss_clip": 0.01127787, "auxiliary_loss_mlp": 0.01040646, "balance_loss_clip": 1.02620935, "balance_loss_mlp": 1.04671252, "epoch": 0.4400420862768676, "flos": 23879375377920.0, "grad_norm": 2.4032395210382065, "language_loss": 0.7951737, "learning_rate": 2.4807108491431283e-06, "loss": 0.81685799, "num_input_tokens_seen": 156825895, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8125, "step": 7319, "time_per_iteration": 2.5603575706481934 }, { "auxiliary_loss_clip": 0.01124243, "auxiliary_loss_mlp": 0.01037912, "balance_loss_clip": 1.02246308, "balance_loss_mlp": 1.04480803, "epoch": 0.44010220952953555, "flos": 28037615293440.0, "grad_norm": 2.3132961725634713, "language_loss": 0.79353881, "learning_rate": 2.4803327954388667e-06, "loss": 0.81516039, "num_input_tokens_seen": 156845990, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.79296875, "step": 7320, "time_per_iteration": 2.5547163486480713 }, { "auxiliary_loss_clip": 0.01122132, "auxiliary_loss_mlp": 0.01037788, "balance_loss_clip": 1.02385235, "balance_loss_mlp": 1.04412067, "epoch": 0.4401623327822035, "flos": 23768985905280.0, "grad_norm": 1.826452076663503, "language_loss": 0.69845122, "learning_rate": 2.4799547235203376e-06, "loss": 0.72005045, "num_input_tokens_seen": 156866685, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 7321, "time_per_iteration": 2.5671873092651367 }, { "auxiliary_loss_clip": 0.01049857, "auxiliary_loss_mlp": 0.01007384, "balance_loss_clip": 1.00565517, "balance_loss_mlp": 1.02390325, "epoch": 0.4402224560348715, "flos": 70774583264640.0, "grad_norm": 0.889154450414549, "language_loss": 0.56909752, "learning_rate": 2.4795766334018763e-06, "loss": 0.58966994, "num_input_tokens_seen": 156923450, "router_z_loss_clip": 0.01733398, "router_z_loss_mlp": 0.2578125, "step": 7322, "time_per_iteration": 3.1826930046081543 }, { "auxiliary_loss_clip": 0.01121788, "auxiliary_loss_mlp": 0.01033409, "balance_loss_clip": 1.02079654, "balance_loss_mlp": 1.04444706, "epoch": 0.44028257928753944, "flos": 22891795868160.0, "grad_norm": 1.47265548339171, "language_loss": 0.76417089, "learning_rate": 2.479198525097822e-06, "loss": 0.78572285, "num_input_tokens_seen": 156944795, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7734375, "step": 7323, "time_per_iteration": 2.5395309925079346 }, { "auxiliary_loss_clip": 0.01124126, "auxiliary_loss_mlp": 0.01036499, "balance_loss_clip": 1.02246761, "balance_loss_mlp": 1.04391348, "epoch": 0.4403427025402074, "flos": 17895760156800.0, "grad_norm": 2.4240620581225194, "language_loss": 0.80424649, "learning_rate": 2.478820398622511e-06, "loss": 0.82585275, "num_input_tokens_seen": 156962755, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80078125, "step": 7324, "time_per_iteration": 2.488752841949463 }, { "auxiliary_loss_clip": 0.01050586, "auxiliary_loss_mlp": 0.01004393, "balance_loss_clip": 1.00259256, "balance_loss_mlp": 1.0247097, "epoch": 0.4404028257928754, "flos": 69562525708800.0, "grad_norm": 0.6733587173895695, "language_loss": 0.54635179, "learning_rate": 2.478442253990283e-06, "loss": 0.56690156, "num_input_tokens_seen": 157028095, "router_z_loss_clip": 0.01794434, "router_z_loss_mlp": 0.2578125, "step": 7325, "time_per_iteration": 3.1528875827789307 }, { "auxiliary_loss_clip": 0.01121386, "auxiliary_loss_mlp": 0.01030258, "balance_loss_clip": 1.01740766, "balance_loss_mlp": 1.04577267, "epoch": 0.44046294904554334, "flos": 20923675914240.0, "grad_norm": 1.6957856255321009, "language_loss": 0.69715589, "learning_rate": 2.4780640912154766e-06, "loss": 0.71867234, "num_input_tokens_seen": 157048365, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7578125, "step": 7326, "time_per_iteration": 2.501652240753174 }, { "auxiliary_loss_clip": 0.01122039, "auxiliary_loss_mlp": 0.01031582, "balance_loss_clip": 1.01796865, "balance_loss_mlp": 1.04532075, "epoch": 0.44052307229821136, "flos": 23623475909760.0, "grad_norm": 1.5666954168021197, "language_loss": 0.76550347, "learning_rate": 2.477685910312432e-06, "loss": 0.7870397, "num_input_tokens_seen": 157069130, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 7327, "time_per_iteration": 2.5429446697235107 }, { "auxiliary_loss_clip": 0.01122324, "auxiliary_loss_mlp": 0.01034104, "balance_loss_clip": 1.02012098, "balance_loss_mlp": 1.04506016, "epoch": 0.4405831955508793, "flos": 17597665186560.0, "grad_norm": 2.2180038966773057, "language_loss": 0.83941323, "learning_rate": 2.4773077112954897e-06, "loss": 0.86097753, "num_input_tokens_seen": 157084940, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7734375, "step": 7328, "time_per_iteration": 2.4245667457580566 }, { "auxiliary_loss_clip": 0.0112077, "auxiliary_loss_mlp": 0.01030635, "balance_loss_clip": 1.01639545, "balance_loss_mlp": 1.04402339, "epoch": 0.4406433188035473, "flos": 21463376739840.0, "grad_norm": 2.2753467051546266, "language_loss": 0.77538878, "learning_rate": 2.4769294941789908e-06, "loss": 0.79690284, "num_input_tokens_seen": 157102770, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.765625, "step": 7329, "time_per_iteration": 2.4853858947753906 }, { "auxiliary_loss_clip": 0.01123611, "auxiliary_loss_mlp": 0.01034434, "balance_loss_clip": 1.02024806, "balance_loss_mlp": 1.04301679, "epoch": 0.44070344205621526, "flos": 22673566788480.0, "grad_norm": 1.7114299958507677, "language_loss": 0.73017699, "learning_rate": 2.476551258977278e-06, "loss": 0.75175744, "num_input_tokens_seen": 157122035, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8046875, "step": 7330, "time_per_iteration": 2.4667391777038574 }, { "auxiliary_loss_clip": 0.01123875, "auxiliary_loss_mlp": 0.01033232, "balance_loss_clip": 1.01975536, "balance_loss_mlp": 1.04573417, "epoch": 0.4407635653088832, "flos": 23441193365760.0, "grad_norm": 1.933525874305024, "language_loss": 0.74828243, "learning_rate": 2.4761730057046936e-06, "loss": 0.76985347, "num_input_tokens_seen": 157142800, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.78125, "step": 7331, "time_per_iteration": 2.522200107574463 }, { "auxiliary_loss_clip": 0.01117856, "auxiliary_loss_mlp": 0.01031887, "balance_loss_clip": 1.0185653, "balance_loss_mlp": 1.04200959, "epoch": 0.4408236885615512, "flos": 24021294013440.0, "grad_norm": 1.6144542451172808, "language_loss": 0.76618373, "learning_rate": 2.475794734375581e-06, "loss": 0.78768116, "num_input_tokens_seen": 157163295, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7578125, "step": 7332, "time_per_iteration": 2.497633934020996 }, { "auxiliary_loss_clip": 0.01121827, "auxiliary_loss_mlp": 0.01035626, "balance_loss_clip": 1.02262592, "balance_loss_mlp": 1.04342508, "epoch": 0.44088381181421915, "flos": 12676826597760.0, "grad_norm": 1.8482350935840812, "language_loss": 0.73655879, "learning_rate": 2.475416445004285e-06, "loss": 0.75813329, "num_input_tokens_seen": 157180890, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.78515625, "step": 7333, "time_per_iteration": 2.4439685344696045 }, { "auxiliary_loss_clip": 0.01119711, "auxiliary_loss_mlp": 0.01033571, "balance_loss_clip": 1.02045774, "balance_loss_mlp": 1.04493523, "epoch": 0.4409439350668871, "flos": 24569865498240.0, "grad_norm": 2.1421375902985806, "language_loss": 0.79381073, "learning_rate": 2.4750381376051493e-06, "loss": 0.81534356, "num_input_tokens_seen": 157200580, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75, "step": 7334, "time_per_iteration": 2.5062241554260254 }, { "auxiliary_loss_clip": 0.01130125, "auxiliary_loss_mlp": 0.01040026, "balance_loss_clip": 1.02324116, "balance_loss_mlp": 1.04457951, "epoch": 0.4410040583195551, "flos": 22668574798080.0, "grad_norm": 2.1383324541144515, "language_loss": 0.75235784, "learning_rate": 2.47465981219252e-06, "loss": 0.7740593, "num_input_tokens_seen": 157218345, "router_z_loss_clip": 0.16796875, "router_z_loss_mlp": 0.859375, "step": 7335, "time_per_iteration": 2.476393938064575 }, { "auxiliary_loss_clip": 0.01123533, "auxiliary_loss_mlp": 0.01033909, "balance_loss_clip": 1.01956272, "balance_loss_mlp": 1.0443424, "epoch": 0.44106418157222305, "flos": 10852528700160.0, "grad_norm": 1.9673790992449969, "language_loss": 0.72272992, "learning_rate": 2.4742814687807423e-06, "loss": 0.7443043, "num_input_tokens_seen": 157234395, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.79296875, "step": 7336, "time_per_iteration": 2.445134162902832 }, { "auxiliary_loss_clip": 0.0112572, "auxiliary_loss_mlp": 0.0104051, "balance_loss_clip": 1.02552569, "balance_loss_mlp": 1.04422176, "epoch": 0.441124304824891, "flos": 21726710323200.0, "grad_norm": 2.624464499133957, "language_loss": 0.63764435, "learning_rate": 2.473903107384165e-06, "loss": 0.65930665, "num_input_tokens_seen": 157254805, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8125, "step": 7337, "time_per_iteration": 2.4990482330322266 }, { "auxiliary_loss_clip": 0.01048094, "auxiliary_loss_mlp": 0.01000539, "balance_loss_clip": 0.99910277, "balance_loss_mlp": 1.02233362, "epoch": 0.441184428077559, "flos": 63220486625280.0, "grad_norm": 0.7494904658969461, "language_loss": 0.5263195, "learning_rate": 2.473524728017134e-06, "loss": 0.54680586, "num_input_tokens_seen": 157317870, "router_z_loss_clip": 0.01434326, "router_z_loss_mlp": 0.2578125, "step": 7338, "time_per_iteration": 3.1155996322631836 }, { "auxiliary_loss_clip": 0.01127862, "auxiliary_loss_mlp": 0.01043931, "balance_loss_clip": 1.02863026, "balance_loss_mlp": 1.04429603, "epoch": 0.44124455133022694, "flos": 21177959270400.0, "grad_norm": 3.5021722225214664, "language_loss": 0.7065919, "learning_rate": 2.473146330693997e-06, "loss": 0.72830981, "num_input_tokens_seen": 157336505, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8359375, "step": 7339, "time_per_iteration": 2.492475748062134 }, { "auxiliary_loss_clip": 0.01120615, "auxiliary_loss_mlp": 0.01035185, "balance_loss_clip": 1.02245355, "balance_loss_mlp": 1.04582727, "epoch": 0.4413046745828949, "flos": 17457865453440.0, "grad_norm": 1.8411660294141952, "language_loss": 0.69318587, "learning_rate": 2.472767915429105e-06, "loss": 0.71474385, "num_input_tokens_seen": 157354995, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.74609375, "step": 7340, "time_per_iteration": 2.4472148418426514 }, { "auxiliary_loss_clip": 0.01047191, "auxiliary_loss_mlp": 0.01005011, "balance_loss_clip": 1.00354445, "balance_loss_mlp": 1.02142572, "epoch": 0.4413647978355629, "flos": 61586153804160.0, "grad_norm": 0.8878908608217223, "language_loss": 0.63995039, "learning_rate": 2.4723894822368054e-06, "loss": 0.66047239, "num_input_tokens_seen": 157404260, "router_z_loss_clip": 0.01464844, "router_z_loss_mlp": 0.2578125, "step": 7341, "time_per_iteration": 4.325274705886841 }, { "auxiliary_loss_clip": 0.01119229, "auxiliary_loss_mlp": 0.01034315, "balance_loss_clip": 1.02017117, "balance_loss_mlp": 1.04240847, "epoch": 0.4414249210882309, "flos": 27527001505920.0, "grad_norm": 2.306618024253176, "language_loss": 0.73781472, "learning_rate": 2.47201103113145e-06, "loss": 0.75935018, "num_input_tokens_seen": 157423045, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.76953125, "step": 7342, "time_per_iteration": 2.5526068210601807 }, { "auxiliary_loss_clip": 0.01119787, "auxiliary_loss_mlp": 0.01032826, "balance_loss_clip": 1.01867557, "balance_loss_mlp": 1.04059911, "epoch": 0.44148504434089886, "flos": 23513984277120.0, "grad_norm": 1.8937335435076688, "language_loss": 0.80094111, "learning_rate": 2.4716325621273886e-06, "loss": 0.82246721, "num_input_tokens_seen": 157441815, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7890625, "step": 7343, "time_per_iteration": 5.3687744140625 }, { "auxiliary_loss_clip": 0.01118659, "auxiliary_loss_mlp": 0.01034439, "balance_loss_clip": 1.02050304, "balance_loss_mlp": 1.04126954, "epoch": 0.4415451675935668, "flos": 21580589796480.0, "grad_norm": 1.6960665273877742, "language_loss": 0.76303279, "learning_rate": 2.4712540752389725e-06, "loss": 0.78456378, "num_input_tokens_seen": 157460470, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7734375, "step": 7344, "time_per_iteration": 3.8973171710968018 }, { "auxiliary_loss_clip": 0.01047703, "auxiliary_loss_mlp": 0.01001236, "balance_loss_clip": 0.99976331, "balance_loss_mlp": 1.02170205, "epoch": 0.4416052908462348, "flos": 59006368126080.0, "grad_norm": 0.8109260919502875, "language_loss": 0.63806915, "learning_rate": 2.470875570480556e-06, "loss": 0.65855849, "num_input_tokens_seen": 157512655, "router_z_loss_clip": 0.01470947, "router_z_loss_mlp": 0.25976562, "step": 7345, "time_per_iteration": 2.888814687728882 }, { "auxiliary_loss_clip": 0.01124089, "auxiliary_loss_mlp": 0.01034941, "balance_loss_clip": 1.02088046, "balance_loss_mlp": 1.0439992, "epoch": 0.44166541409890275, "flos": 26357642242560.0, "grad_norm": 1.9211112050386052, "language_loss": 0.8605361, "learning_rate": 2.470497047866489e-06, "loss": 0.88212639, "num_input_tokens_seen": 157533700, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80078125, "step": 7346, "time_per_iteration": 2.5333240032196045 }, { "auxiliary_loss_clip": 0.01122226, "auxiliary_loss_mlp": 0.01038125, "balance_loss_clip": 1.02286601, "balance_loss_mlp": 1.04329944, "epoch": 0.4417255373515707, "flos": 20192678231040.0, "grad_norm": 2.138215841169899, "language_loss": 0.80145049, "learning_rate": 2.470118507411128e-06, "loss": 0.82305402, "num_input_tokens_seen": 157551105, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7890625, "step": 7347, "time_per_iteration": 2.484896183013916 }, { "auxiliary_loss_clip": 0.01121841, "auxiliary_loss_mlp": 0.0103491, "balance_loss_clip": 1.0204376, "balance_loss_mlp": 1.04349077, "epoch": 0.4417856606042387, "flos": 17887895078400.0, "grad_norm": 2.3692772399074937, "language_loss": 0.83482313, "learning_rate": 2.4697399491288263e-06, "loss": 0.85639066, "num_input_tokens_seen": 157568285, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 7348, "time_per_iteration": 2.4706239700317383 }, { "auxiliary_loss_clip": 0.01125549, "auxiliary_loss_mlp": 0.01031992, "balance_loss_clip": 1.01759124, "balance_loss_mlp": 1.0446012, "epoch": 0.44184578385690665, "flos": 27964034282880.0, "grad_norm": 1.7807198851290214, "language_loss": 0.70356631, "learning_rate": 2.469361373033938e-06, "loss": 0.72514176, "num_input_tokens_seen": 157590405, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.80859375, "step": 7349, "time_per_iteration": 2.5553202629089355 }, { "auxiliary_loss_clip": 0.01122206, "auxiliary_loss_mlp": 0.01035226, "balance_loss_clip": 1.02011585, "balance_loss_mlp": 1.04228687, "epoch": 0.4419059071095746, "flos": 23367899664000.0, "grad_norm": 1.9858420417514462, "language_loss": 0.74546719, "learning_rate": 2.468982779140819e-06, "loss": 0.7670415, "num_input_tokens_seen": 157607420, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80078125, "step": 7350, "time_per_iteration": 2.509242534637451 }, { "auxiliary_loss_clip": 0.01123057, "auxiliary_loss_mlp": 0.01034899, "balance_loss_clip": 1.02045119, "balance_loss_mlp": 1.04312539, "epoch": 0.4419660303622426, "flos": 15012169246080.0, "grad_norm": 2.1982550985235645, "language_loss": 0.80536646, "learning_rate": 2.468604167463827e-06, "loss": 0.82694602, "num_input_tokens_seen": 157624990, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80078125, "step": 7351, "time_per_iteration": 2.485257625579834 }, { "auxiliary_loss_clip": 0.0111809, "auxiliary_loss_mlp": 0.01037628, "balance_loss_clip": 1.02486706, "balance_loss_mlp": 1.04277349, "epoch": 0.44202615361491054, "flos": 25371750672000.0, "grad_norm": 4.335791688591933, "language_loss": 0.73251581, "learning_rate": 2.4682255380173176e-06, "loss": 0.75407302, "num_input_tokens_seen": 157645300, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.75390625, "step": 7352, "time_per_iteration": 2.567613363265991 }, { "auxiliary_loss_clip": 0.01121582, "auxiliary_loss_mlp": 0.01028377, "balance_loss_clip": 1.01379824, "balance_loss_mlp": 1.04308701, "epoch": 0.4420862768675785, "flos": 24681116897280.0, "grad_norm": 2.258957065555403, "language_loss": 0.87229568, "learning_rate": 2.467846890815649e-06, "loss": 0.89379525, "num_input_tokens_seen": 157664060, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78515625, "step": 7353, "time_per_iteration": 2.5377962589263916 }, { "auxiliary_loss_clip": 0.01124037, "auxiliary_loss_mlp": 0.01034895, "balance_loss_clip": 1.02159071, "balance_loss_mlp": 1.04405165, "epoch": 0.44214640012024653, "flos": 19528437974400.0, "grad_norm": 2.1818876467142183, "language_loss": 0.7587074, "learning_rate": 2.4674682258731795e-06, "loss": 0.7802968, "num_input_tokens_seen": 157680905, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.80078125, "step": 7354, "time_per_iteration": 2.455301523208618 }, { "auxiliary_loss_clip": 0.01117793, "auxiliary_loss_mlp": 0.01032209, "balance_loss_clip": 1.01898277, "balance_loss_mlp": 1.04171073, "epoch": 0.4422065233729145, "flos": 47557434003840.0, "grad_norm": 2.1523687478107467, "language_loss": 0.64658439, "learning_rate": 2.467089543204268e-06, "loss": 0.66808444, "num_input_tokens_seen": 157701980, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.76171875, "step": 7355, "time_per_iteration": 2.7445268630981445 }, { "auxiliary_loss_clip": 0.0112587, "auxiliary_loss_mlp": 0.01035277, "balance_loss_clip": 1.02023888, "balance_loss_mlp": 1.04255748, "epoch": 0.44226664662558246, "flos": 19281050029440.0, "grad_norm": 2.4291083191322924, "language_loss": 0.7788946, "learning_rate": 2.466710842823274e-06, "loss": 0.80050606, "num_input_tokens_seen": 157720555, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.83203125, "step": 7356, "time_per_iteration": 2.505031108856201 }, { "auxiliary_loss_clip": 0.0112439, "auxiliary_loss_mlp": 0.01039049, "balance_loss_clip": 1.02442169, "balance_loss_mlp": 1.0434742, "epoch": 0.4423267698782504, "flos": 17821820010240.0, "grad_norm": 1.7820791857541411, "language_loss": 0.7705723, "learning_rate": 2.4663321247445577e-06, "loss": 0.7922067, "num_input_tokens_seen": 157739160, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80859375, "step": 7357, "time_per_iteration": 2.5275323390960693 }, { "auxiliary_loss_clip": 0.01122585, "auxiliary_loss_mlp": 0.01036902, "balance_loss_clip": 1.02262092, "balance_loss_mlp": 1.04466629, "epoch": 0.4423868931309184, "flos": 29204424691200.0, "grad_norm": 1.746098030443523, "language_loss": 0.73186541, "learning_rate": 2.465953388982481e-06, "loss": 0.75346029, "num_input_tokens_seen": 157760020, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.77734375, "step": 7358, "time_per_iteration": 2.554380416870117 }, { "auxiliary_loss_clip": 0.01125856, "auxiliary_loss_mlp": 0.01035339, "balance_loss_clip": 1.02169573, "balance_loss_mlp": 1.04559195, "epoch": 0.44244701638358636, "flos": 29713135057920.0, "grad_norm": 1.950568042866298, "language_loss": 0.75883669, "learning_rate": 2.465574635551405e-06, "loss": 0.78044868, "num_input_tokens_seen": 157780435, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.8046875, "step": 7359, "time_per_iteration": 2.5387637615203857 }, { "auxiliary_loss_clip": 0.01125655, "auxiliary_loss_mlp": 0.01035404, "balance_loss_clip": 1.02029967, "balance_loss_mlp": 1.04607308, "epoch": 0.4425071396362543, "flos": 22930040874240.0, "grad_norm": 2.3159222839068434, "language_loss": 0.70319623, "learning_rate": 2.4651958644656923e-06, "loss": 0.72480679, "num_input_tokens_seen": 157799420, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.796875, "step": 7360, "time_per_iteration": 2.5075297355651855 }, { "auxiliary_loss_clip": 0.01123082, "auxiliary_loss_mlp": 0.01033692, "balance_loss_clip": 1.01911819, "balance_loss_mlp": 1.04314804, "epoch": 0.4425672628889223, "flos": 19792346175360.0, "grad_norm": 2.674430232432806, "language_loss": 0.69546127, "learning_rate": 2.4648170757397053e-06, "loss": 0.71702898, "num_input_tokens_seen": 157817025, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.80078125, "step": 7361, "time_per_iteration": 2.462925910949707 }, { "auxiliary_loss_clip": 0.01123926, "auxiliary_loss_mlp": 0.01037669, "balance_loss_clip": 1.02240467, "balance_loss_mlp": 1.04368877, "epoch": 0.44262738614159025, "flos": 13662215377920.0, "grad_norm": 2.1855939385213703, "language_loss": 0.82275999, "learning_rate": 2.464438269387809e-06, "loss": 0.84437597, "num_input_tokens_seen": 157834345, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80078125, "step": 7362, "time_per_iteration": 2.4465181827545166 }, { "auxiliary_loss_clip": 0.01130046, "auxiliary_loss_mlp": 0.0104073, "balance_loss_clip": 1.02518487, "balance_loss_mlp": 1.04589844, "epoch": 0.4426875093942582, "flos": 14210212245120.0, "grad_norm": 1.9348033636831123, "language_loss": 0.74676514, "learning_rate": 2.464059445424366e-06, "loss": 0.76847285, "num_input_tokens_seen": 157852290, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.84375, "step": 7363, "time_per_iteration": 2.455980062484741 }, { "auxiliary_loss_clip": 0.01047915, "auxiliary_loss_mlp": 0.01003519, "balance_loss_clip": 1.00210631, "balance_loss_mlp": 1.02223611, "epoch": 0.4427476326469262, "flos": 70117525728000.0, "grad_norm": 0.6877225640782763, "language_loss": 0.55664986, "learning_rate": 2.463680603863743e-06, "loss": 0.57716417, "num_input_tokens_seen": 157923060, "router_z_loss_clip": 0.01409912, "router_z_loss_mlp": 0.2578125, "step": 7364, "time_per_iteration": 3.1855695247650146 }, { "auxiliary_loss_clip": 0.01119651, "auxiliary_loss_mlp": 0.01038363, "balance_loss_clip": 1.02489805, "balance_loss_mlp": 1.0417521, "epoch": 0.44280775589959415, "flos": 25445080287360.0, "grad_norm": 2.7541126141096526, "language_loss": 0.74669313, "learning_rate": 2.463301744720305e-06, "loss": 0.76827323, "num_input_tokens_seen": 157944110, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.78125, "step": 7365, "time_per_iteration": 2.5608468055725098 }, { "auxiliary_loss_clip": 0.01121501, "auxiliary_loss_mlp": 0.01038222, "balance_loss_clip": 1.02369642, "balance_loss_mlp": 1.04273975, "epoch": 0.4428678791522621, "flos": 22857214049280.0, "grad_norm": 1.7992453493483358, "language_loss": 0.74398911, "learning_rate": 2.4629228680084184e-06, "loss": 0.76558638, "num_input_tokens_seen": 157964295, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7890625, "step": 7366, "time_per_iteration": 2.516447067260742 }, { "auxiliary_loss_clip": 0.01123244, "auxiliary_loss_mlp": 0.01037628, "balance_loss_clip": 1.02292943, "balance_loss_mlp": 1.04462981, "epoch": 0.44292800240493013, "flos": 25812446636160.0, "grad_norm": 2.853536947919461, "language_loss": 0.73763061, "learning_rate": 2.46254397374245e-06, "loss": 0.75923938, "num_input_tokens_seen": 157983970, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78515625, "step": 7367, "time_per_iteration": 2.4973528385162354 }, { "auxiliary_loss_clip": 0.01125693, "auxiliary_loss_mlp": 0.01039615, "balance_loss_clip": 1.02525616, "balance_loss_mlp": 1.04597783, "epoch": 0.4429881256575981, "flos": 32416885549440.0, "grad_norm": 1.721305144656918, "language_loss": 0.73757875, "learning_rate": 2.4621650619367677e-06, "loss": 0.75923181, "num_input_tokens_seen": 158006515, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.796875, "step": 7368, "time_per_iteration": 2.601733684539795 }, { "auxiliary_loss_clip": 0.01122966, "auxiliary_loss_mlp": 0.01034172, "balance_loss_clip": 1.02070725, "balance_loss_mlp": 1.04510903, "epoch": 0.44304824891026606, "flos": 22163707186560.0, "grad_norm": 1.9095309370403386, "language_loss": 0.79806292, "learning_rate": 2.4617861326057403e-06, "loss": 0.81963432, "num_input_tokens_seen": 158025565, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.77734375, "step": 7369, "time_per_iteration": 2.576841354370117 }, { "auxiliary_loss_clip": 0.01119499, "auxiliary_loss_mlp": 0.01033094, "balance_loss_clip": 1.02013016, "balance_loss_mlp": 1.0432843, "epoch": 0.443108372162934, "flos": 25338569483520.0, "grad_norm": 2.09887155870718, "language_loss": 0.72128105, "learning_rate": 2.461407185763737e-06, "loss": 0.74280703, "num_input_tokens_seen": 158045620, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.76171875, "step": 7370, "time_per_iteration": 2.512697696685791 }, { "auxiliary_loss_clip": 0.01123332, "auxiliary_loss_mlp": 0.01034618, "balance_loss_clip": 1.02033687, "balance_loss_mlp": 1.04449844, "epoch": 0.443168495415602, "flos": 23330947547520.0, "grad_norm": 1.884921888094137, "language_loss": 0.70640337, "learning_rate": 2.461028221425126e-06, "loss": 0.72798288, "num_input_tokens_seen": 158063505, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 7371, "time_per_iteration": 2.4744176864624023 }, { "auxiliary_loss_clip": 0.01120197, "auxiliary_loss_mlp": 0.01031108, "balance_loss_clip": 1.01816738, "balance_loss_mlp": 1.04339528, "epoch": 0.44322861866826996, "flos": 21871502046720.0, "grad_norm": 6.068776425460837, "language_loss": 0.68456268, "learning_rate": 2.4606492396042786e-06, "loss": 0.70607573, "num_input_tokens_seen": 158080335, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.765625, "step": 7372, "time_per_iteration": 2.5057356357574463 }, { "auxiliary_loss_clip": 0.01124256, "auxiliary_loss_mlp": 0.01033443, "balance_loss_clip": 1.01810098, "balance_loss_mlp": 1.0436151, "epoch": 0.4432887419209379, "flos": 20084407660800.0, "grad_norm": 1.997574699786807, "language_loss": 0.8340953, "learning_rate": 2.4602702403155664e-06, "loss": 0.8556723, "num_input_tokens_seen": 158098955, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8046875, "step": 7373, "time_per_iteration": 2.4729840755462646 }, { "auxiliary_loss_clip": 0.0104943, "auxiliary_loss_mlp": 0.01001141, "balance_loss_clip": 0.9997102, "balance_loss_mlp": 1.02383757, "epoch": 0.4433488651736059, "flos": 70035540935040.0, "grad_norm": 0.76322180884018, "language_loss": 0.55262744, "learning_rate": 2.4598912235733604e-06, "loss": 0.57313317, "num_input_tokens_seen": 158164110, "router_z_loss_clip": 0.01428223, "router_z_loss_mlp": 0.25585938, "step": 7374, "time_per_iteration": 3.173175573348999 }, { "auxiliary_loss_clip": 0.0112359, "auxiliary_loss_mlp": 0.01044628, "balance_loss_clip": 1.02899992, "balance_loss_mlp": 1.04605508, "epoch": 0.44340898842627385, "flos": 16282472705280.0, "grad_norm": 3.0155917150586706, "language_loss": 0.825791, "learning_rate": 2.4595121893920327e-06, "loss": 0.84747314, "num_input_tokens_seen": 158179850, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.7734375, "step": 7375, "time_per_iteration": 2.4737744331359863 }, { "auxiliary_loss_clip": 0.01125383, "auxiliary_loss_mlp": 0.01032423, "balance_loss_clip": 1.01802301, "balance_loss_mlp": 1.04479074, "epoch": 0.4434691116789418, "flos": 16611989097600.0, "grad_norm": 2.045740582371851, "language_loss": 0.83664232, "learning_rate": 2.4591331377859578e-06, "loss": 0.85822046, "num_input_tokens_seen": 158196590, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8046875, "step": 7376, "time_per_iteration": 2.429809331893921 }, { "auxiliary_loss_clip": 0.01123522, "auxiliary_loss_mlp": 0.01036402, "balance_loss_clip": 1.02249026, "balance_loss_mlp": 1.04535425, "epoch": 0.4435292349316098, "flos": 19063251912960.0, "grad_norm": 1.8107088962665274, "language_loss": 0.77345228, "learning_rate": 2.4587540687695077e-06, "loss": 0.79505146, "num_input_tokens_seen": 158216355, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 7377, "time_per_iteration": 2.4826385974884033 }, { "auxiliary_loss_clip": 0.01121071, "auxiliary_loss_mlp": 0.01030985, "balance_loss_clip": 1.01718044, "balance_loss_mlp": 1.04494286, "epoch": 0.44358935818427775, "flos": 21251324799360.0, "grad_norm": 2.21413899276646, "language_loss": 0.75745028, "learning_rate": 2.458374982357057e-06, "loss": 0.77897084, "num_input_tokens_seen": 158235825, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.76171875, "step": 7378, "time_per_iteration": 2.4825868606567383 }, { "auxiliary_loss_clip": 0.0112268, "auxiliary_loss_mlp": 0.01036501, "balance_loss_clip": 1.02255952, "balance_loss_mlp": 1.04411602, "epoch": 0.4436494814369457, "flos": 12495298239360.0, "grad_norm": 1.9392760905783253, "language_loss": 0.68902254, "learning_rate": 2.457995878562982e-06, "loss": 0.71061438, "num_input_tokens_seen": 158254230, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78515625, "step": 7379, "time_per_iteration": 2.470487356185913 }, { "auxiliary_loss_clip": 0.01124326, "auxiliary_loss_mlp": 0.01031074, "balance_loss_clip": 1.01743078, "balance_loss_mlp": 1.04516459, "epoch": 0.44370960468961373, "flos": 23659853408640.0, "grad_norm": 1.7249727651642575, "language_loss": 0.7297532, "learning_rate": 2.457616757401656e-06, "loss": 0.75130713, "num_input_tokens_seen": 158273400, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.79296875, "step": 7380, "time_per_iteration": 2.484259605407715 }, { "auxiliary_loss_clip": 0.01125157, "auxiliary_loss_mlp": 0.01032851, "balance_loss_clip": 1.0185225, "balance_loss_mlp": 1.04566646, "epoch": 0.4437697279422817, "flos": 32416849635840.0, "grad_norm": 1.6567668319623585, "language_loss": 0.64936113, "learning_rate": 2.457237618887458e-06, "loss": 0.67094123, "num_input_tokens_seen": 158296840, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.79296875, "step": 7381, "time_per_iteration": 2.584914445877075 }, { "auxiliary_loss_clip": 0.01125604, "auxiliary_loss_mlp": 0.01032691, "balance_loss_clip": 1.01892781, "balance_loss_mlp": 1.04622853, "epoch": 0.44382985119494966, "flos": 18112875914880.0, "grad_norm": 2.567960271703265, "language_loss": 0.80268055, "learning_rate": 2.456858463034763e-06, "loss": 0.82426351, "num_input_tokens_seen": 158314935, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.79296875, "step": 7382, "time_per_iteration": 2.436582088470459 }, { "auxiliary_loss_clip": 0.01125795, "auxiliary_loss_mlp": 0.0103913, "balance_loss_clip": 1.02478933, "balance_loss_mlp": 1.04652023, "epoch": 0.44388997444761763, "flos": 30774151923840.0, "grad_norm": 1.7412986318050936, "language_loss": 0.64922106, "learning_rate": 2.456479289857949e-06, "loss": 0.6708703, "num_input_tokens_seen": 158334620, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.79296875, "step": 7383, "time_per_iteration": 3.9714348316192627 }, { "auxiliary_loss_clip": 0.01128721, "auxiliary_loss_mlp": 0.01031305, "balance_loss_clip": 1.01658273, "balance_loss_mlp": 1.04672503, "epoch": 0.4439500977002856, "flos": 20339157893760.0, "grad_norm": 2.3865470656527528, "language_loss": 0.75559849, "learning_rate": 2.4561000993713953e-06, "loss": 0.77719873, "num_input_tokens_seen": 158350550, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8203125, "step": 7384, "time_per_iteration": 2.463754892349243 }, { "auxiliary_loss_clip": 0.01126778, "auxiliary_loss_mlp": 0.01033455, "balance_loss_clip": 1.01875079, "balance_loss_mlp": 1.04571772, "epoch": 0.44401022095295356, "flos": 20371225760640.0, "grad_norm": 3.249291305397578, "language_loss": 0.81245697, "learning_rate": 2.4557208915894796e-06, "loss": 0.83405924, "num_input_tokens_seen": 158369555, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80859375, "step": 7385, "time_per_iteration": 3.891893148422241 }, { "auxiliary_loss_clip": 0.01124163, "auxiliary_loss_mlp": 0.01036281, "balance_loss_clip": 1.02122462, "balance_loss_mlp": 1.04368341, "epoch": 0.4440703442056215, "flos": 20230635928320.0, "grad_norm": 2.0371040820026614, "language_loss": 0.81678599, "learning_rate": 2.455341666526582e-06, "loss": 0.83839047, "num_input_tokens_seen": 158388045, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8046875, "step": 7386, "time_per_iteration": 3.8453783988952637 }, { "auxiliary_loss_clip": 0.01128213, "auxiliary_loss_mlp": 0.01038209, "balance_loss_clip": 1.02244997, "balance_loss_mlp": 1.04472184, "epoch": 0.4441304674582895, "flos": 39494698824960.0, "grad_norm": 2.046158370994137, "language_loss": 0.69969618, "learning_rate": 2.4549624241970832e-06, "loss": 0.72136039, "num_input_tokens_seen": 158410115, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8359375, "step": 7387, "time_per_iteration": 2.6433424949645996 }, { "auxiliary_loss_clip": 0.01121622, "auxiliary_loss_mlp": 0.0104011, "balance_loss_clip": 1.02541137, "balance_loss_mlp": 1.04235673, "epoch": 0.44419059071095746, "flos": 14829671220480.0, "grad_norm": 2.435335951124897, "language_loss": 0.72044039, "learning_rate": 2.4545831646153628e-06, "loss": 0.74205768, "num_input_tokens_seen": 158427765, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.79296875, "step": 7388, "time_per_iteration": 2.448136568069458 }, { "auxiliary_loss_clip": 0.01124057, "auxiliary_loss_mlp": 0.01034561, "balance_loss_clip": 1.01985097, "balance_loss_mlp": 1.04302073, "epoch": 0.4442507139636254, "flos": 22637835734400.0, "grad_norm": 1.582497599114321, "language_loss": 0.69187802, "learning_rate": 2.4542038877958044e-06, "loss": 0.7134642, "num_input_tokens_seen": 158446375, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8125, "step": 7389, "time_per_iteration": 2.4827492237091064 }, { "auxiliary_loss_clip": 0.01121317, "auxiliary_loss_mlp": 0.01030177, "balance_loss_clip": 1.01596773, "balance_loss_mlp": 1.04291773, "epoch": 0.4443108372162934, "flos": 38290721829120.0, "grad_norm": 3.459843210770599, "language_loss": 0.74684453, "learning_rate": 2.453824593752788e-06, "loss": 0.76835954, "num_input_tokens_seen": 158467260, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 7390, "time_per_iteration": 2.6374809741973877 }, { "auxiliary_loss_clip": 0.01120104, "auxiliary_loss_mlp": 0.01031881, "balance_loss_clip": 1.0183984, "balance_loss_mlp": 1.04269695, "epoch": 0.44437096046896135, "flos": 17748993185280.0, "grad_norm": 2.1718267010306356, "language_loss": 0.81434679, "learning_rate": 2.4534452825006988e-06, "loss": 0.83586657, "num_input_tokens_seen": 158486720, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7734375, "step": 7391, "time_per_iteration": 2.4643003940582275 }, { "auxiliary_loss_clip": 0.01124227, "auxiliary_loss_mlp": 0.01037596, "balance_loss_clip": 1.02312398, "balance_loss_mlp": 1.04625285, "epoch": 0.4444310837216293, "flos": 13732348682880.0, "grad_norm": 1.8780035745684442, "language_loss": 0.73869908, "learning_rate": 2.4530659540539185e-06, "loss": 0.76031733, "num_input_tokens_seen": 158502530, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 7392, "time_per_iteration": 2.4613869190216064 }, { "auxiliary_loss_clip": 0.01118222, "auxiliary_loss_mlp": 0.01032472, "balance_loss_clip": 1.01883459, "balance_loss_mlp": 1.04070878, "epoch": 0.44449120697429734, "flos": 25010238240000.0, "grad_norm": 1.6595028355624133, "language_loss": 0.79391432, "learning_rate": 2.4526866084268313e-06, "loss": 0.81542122, "num_input_tokens_seen": 158522715, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7734375, "step": 7393, "time_per_iteration": 2.5181405544281006 }, { "auxiliary_loss_clip": 0.01125251, "auxiliary_loss_mlp": 0.01034062, "balance_loss_clip": 1.0196619, "balance_loss_mlp": 1.04314947, "epoch": 0.4445513302269653, "flos": 32671707609600.0, "grad_norm": 2.567559561021493, "language_loss": 0.80879426, "learning_rate": 2.4523072456338226e-06, "loss": 0.83038735, "num_input_tokens_seen": 158543615, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8203125, "step": 7394, "time_per_iteration": 2.5715084075927734 }, { "auxiliary_loss_clip": 0.01119769, "auxiliary_loss_mlp": 0.01038401, "balance_loss_clip": 1.02542484, "balance_loss_mlp": 1.0434159, "epoch": 0.44461145347963327, "flos": 11655814504320.0, "grad_norm": 2.977622826045145, "language_loss": 0.8001318, "learning_rate": 2.4519278656892785e-06, "loss": 0.82171357, "num_input_tokens_seen": 158560330, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.765625, "step": 7395, "time_per_iteration": 2.448127031326294 }, { "auxiliary_loss_clip": 0.01121691, "auxiliary_loss_mlp": 0.01037337, "balance_loss_clip": 1.02344882, "balance_loss_mlp": 1.04322207, "epoch": 0.44467157673230123, "flos": 20886759711360.0, "grad_norm": 2.390987172482371, "language_loss": 0.68565977, "learning_rate": 2.451548468607584e-06, "loss": 0.70725012, "num_input_tokens_seen": 158579735, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78515625, "step": 7396, "time_per_iteration": 2.4883620738983154 }, { "auxiliary_loss_clip": 0.01123249, "auxiliary_loss_mlp": 0.01030322, "balance_loss_clip": 1.01645792, "balance_loss_mlp": 1.04324293, "epoch": 0.4447316999849692, "flos": 18546137763840.0, "grad_norm": 2.86393932960537, "language_loss": 0.8028065, "learning_rate": 2.451169054403126e-06, "loss": 0.82434213, "num_input_tokens_seen": 158597075, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.80078125, "step": 7397, "time_per_iteration": 2.435844659805298 }, { "auxiliary_loss_clip": 0.01121441, "auxiliary_loss_mlp": 0.01035053, "balance_loss_clip": 1.02164841, "balance_loss_mlp": 1.04457009, "epoch": 0.44479182323763716, "flos": 23769057732480.0, "grad_norm": 1.8026652139669006, "language_loss": 0.67303145, "learning_rate": 2.450789623090293e-06, "loss": 0.69459641, "num_input_tokens_seen": 158616650, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.76953125, "step": 7398, "time_per_iteration": 2.519841432571411 }, { "auxiliary_loss_clip": 0.01119832, "auxiliary_loss_mlp": 0.01035963, "balance_loss_clip": 1.02253366, "balance_loss_mlp": 1.04265594, "epoch": 0.44485194649030513, "flos": 16543831040640.0, "grad_norm": 2.0234154603237022, "language_loss": 0.69726032, "learning_rate": 2.450410174683472e-06, "loss": 0.71881825, "num_input_tokens_seen": 158634515, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76953125, "step": 7399, "time_per_iteration": 2.437526226043701 }, { "auxiliary_loss_clip": 0.01119024, "auxiliary_loss_mlp": 0.0103481, "balance_loss_clip": 1.02124357, "balance_loss_mlp": 1.04272807, "epoch": 0.4449120697429731, "flos": 22600955445120.0, "grad_norm": 1.8096523443297197, "language_loss": 0.72185564, "learning_rate": 2.4500307091970514e-06, "loss": 0.74339402, "num_input_tokens_seen": 158653760, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.76171875, "step": 7400, "time_per_iteration": 2.490062952041626 }, { "auxiliary_loss_clip": 0.01120692, "auxiliary_loss_mlp": 0.01032773, "balance_loss_clip": 1.0189327, "balance_loss_mlp": 1.04425228, "epoch": 0.44497219299564106, "flos": 20004864992640.0, "grad_norm": 1.983732185503139, "language_loss": 0.85112828, "learning_rate": 2.449651226645422e-06, "loss": 0.87266296, "num_input_tokens_seen": 158672190, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 7401, "time_per_iteration": 2.4685420989990234 }, { "auxiliary_loss_clip": 0.01118575, "auxiliary_loss_mlp": 0.01033116, "balance_loss_clip": 1.02044976, "balance_loss_mlp": 1.04388559, "epoch": 0.445032316248309, "flos": 25594253470080.0, "grad_norm": 1.5642542388737406, "language_loss": 0.83193439, "learning_rate": 2.449271727042973e-06, "loss": 0.85345131, "num_input_tokens_seen": 158694115, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.74609375, "step": 7402, "time_per_iteration": 2.526506185531616 }, { "auxiliary_loss_clip": 0.01121972, "auxiliary_loss_mlp": 0.01030095, "balance_loss_clip": 1.01628447, "balance_loss_mlp": 1.04437518, "epoch": 0.445092439500977, "flos": 21250426959360.0, "grad_norm": 2.366868754765384, "language_loss": 0.77004588, "learning_rate": 2.4488922104040947e-06, "loss": 0.79156655, "num_input_tokens_seen": 158711000, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7734375, "step": 7403, "time_per_iteration": 2.462433338165283 }, { "auxiliary_loss_clip": 0.01050176, "auxiliary_loss_mlp": 0.01002513, "balance_loss_clip": 1.00101125, "balance_loss_mlp": 1.02423406, "epoch": 0.44515256275364495, "flos": 57764900309760.0, "grad_norm": 0.7853495452906958, "language_loss": 0.60059935, "learning_rate": 2.4485126767431793e-06, "loss": 0.62112623, "num_input_tokens_seen": 158769675, "router_z_loss_clip": 0.01501465, "router_z_loss_mlp": 0.25976562, "step": 7404, "time_per_iteration": 3.1031131744384766 }, { "auxiliary_loss_clip": 0.0112597, "auxiliary_loss_mlp": 0.01037613, "balance_loss_clip": 1.02361226, "balance_loss_mlp": 1.04614234, "epoch": 0.4452126860063129, "flos": 15596004908160.0, "grad_norm": 2.3640350273788, "language_loss": 0.82339841, "learning_rate": 2.4481331260746177e-06, "loss": 0.84503424, "num_input_tokens_seen": 158788215, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.796875, "step": 7405, "time_per_iteration": 2.491022825241089 }, { "auxiliary_loss_clip": 0.01120503, "auxiliary_loss_mlp": 0.01028127, "balance_loss_clip": 1.01444769, "balance_loss_mlp": 1.04350173, "epoch": 0.4452728092589809, "flos": 21617398258560.0, "grad_norm": 1.6006704137548544, "language_loss": 0.75304091, "learning_rate": 2.4477535584128036e-06, "loss": 0.77452719, "num_input_tokens_seen": 158809090, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76953125, "step": 7406, "time_per_iteration": 2.496934175491333 }, { "auxiliary_loss_clip": 0.01116934, "auxiliary_loss_mlp": 0.01027698, "balance_loss_clip": 1.01486468, "balance_loss_mlp": 1.04198968, "epoch": 0.4453329325116489, "flos": 29497491757440.0, "grad_norm": 2.0960380869325332, "language_loss": 0.65745258, "learning_rate": 2.447373973772129e-06, "loss": 0.67889887, "num_input_tokens_seen": 158828320, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75, "step": 7407, "time_per_iteration": 2.649238109588623 }, { "auxiliary_loss_clip": 0.01126092, "auxiliary_loss_mlp": 0.01030769, "balance_loss_clip": 1.01781666, "balance_loss_mlp": 1.04721189, "epoch": 0.44539305576431687, "flos": 21361139654400.0, "grad_norm": 1.903399975537425, "language_loss": 0.68622094, "learning_rate": 2.4469943721669887e-06, "loss": 0.70778954, "num_input_tokens_seen": 158847040, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7890625, "step": 7408, "time_per_iteration": 2.482551336288452 }, { "auxiliary_loss_clip": 0.0112131, "auxiliary_loss_mlp": 0.01033256, "balance_loss_clip": 1.01920128, "balance_loss_mlp": 1.04338443, "epoch": 0.44545317901698483, "flos": 41427626428800.0, "grad_norm": 1.765821041380703, "language_loss": 0.71816474, "learning_rate": 2.4466147536117776e-06, "loss": 0.73971045, "num_input_tokens_seen": 158870490, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 7409, "time_per_iteration": 2.6985208988189697 }, { "auxiliary_loss_clip": 0.01122938, "auxiliary_loss_mlp": 0.01033806, "balance_loss_clip": 1.01907194, "balance_loss_mlp": 1.04431891, "epoch": 0.4455133022696528, "flos": 22055005653120.0, "grad_norm": 2.0324923876890235, "language_loss": 0.65047324, "learning_rate": 2.4462351181208895e-06, "loss": 0.6720407, "num_input_tokens_seen": 158889920, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78515625, "step": 7410, "time_per_iteration": 2.4922068119049072 }, { "auxiliary_loss_clip": 0.01127995, "auxiliary_loss_mlp": 0.01030266, "balance_loss_clip": 1.0160265, "balance_loss_mlp": 1.04588747, "epoch": 0.44557342552232077, "flos": 23476960333440.0, "grad_norm": 2.3391046347310955, "language_loss": 0.73953205, "learning_rate": 2.4458554657087217e-06, "loss": 0.76111472, "num_input_tokens_seen": 158909580, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8203125, "step": 7411, "time_per_iteration": 2.5188732147216797 }, { "auxiliary_loss_clip": 0.01121249, "auxiliary_loss_mlp": 0.01028292, "balance_loss_clip": 1.01575077, "balance_loss_mlp": 1.04673767, "epoch": 0.44563354877498873, "flos": 19134678107520.0, "grad_norm": 1.9274576660539435, "language_loss": 0.79453933, "learning_rate": 2.4454757963896695e-06, "loss": 0.81603473, "num_input_tokens_seen": 158924600, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.74609375, "step": 7412, "time_per_iteration": 2.4533450603485107 }, { "auxiliary_loss_clip": 0.01121178, "auxiliary_loss_mlp": 0.01034978, "balance_loss_clip": 1.0219183, "balance_loss_mlp": 1.04241574, "epoch": 0.4456936720276567, "flos": 13621420506240.0, "grad_norm": 2.2322985312816495, "language_loss": 0.80029106, "learning_rate": 2.4450961101781304e-06, "loss": 0.82185256, "num_input_tokens_seen": 158939345, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7890625, "step": 7413, "time_per_iteration": 2.4557015895843506 }, { "auxiliary_loss_clip": 0.01118833, "auxiliary_loss_mlp": 0.01027102, "balance_loss_clip": 1.01457262, "balance_loss_mlp": 1.04387331, "epoch": 0.44575379528032466, "flos": 14713715139840.0, "grad_norm": 1.9649487731965596, "language_loss": 0.7600944, "learning_rate": 2.4447164070885026e-06, "loss": 0.78155375, "num_input_tokens_seen": 158955855, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.75, "step": 7414, "time_per_iteration": 2.4312243461608887 }, { "auxiliary_loss_clip": 0.01121376, "auxiliary_loss_mlp": 0.01033699, "balance_loss_clip": 1.02036548, "balance_loss_mlp": 1.04503703, "epoch": 0.4458139185329926, "flos": 24170682677760.0, "grad_norm": 1.63311409695539, "language_loss": 0.83099723, "learning_rate": 2.4443366871351837e-06, "loss": 0.852548, "num_input_tokens_seen": 158976315, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.765625, "step": 7415, "time_per_iteration": 2.520489454269409 }, { "auxiliary_loss_clip": 0.01118569, "auxiliary_loss_mlp": 0.01036935, "balance_loss_clip": 1.02421522, "balance_loss_mlp": 1.04288912, "epoch": 0.4458740417856606, "flos": 21762225895680.0, "grad_norm": 1.7299478142290676, "language_loss": 0.83997804, "learning_rate": 2.4439569503325732e-06, "loss": 0.86153305, "num_input_tokens_seen": 158996725, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7578125, "step": 7416, "time_per_iteration": 2.482665777206421 }, { "auxiliary_loss_clip": 0.01122483, "auxiliary_loss_mlp": 0.01033411, "balance_loss_clip": 1.01989257, "balance_loss_mlp": 1.04399812, "epoch": 0.44593416503832856, "flos": 21068790860160.0, "grad_norm": 1.6740325024137923, "language_loss": 0.81168258, "learning_rate": 2.4435771966950706e-06, "loss": 0.83324152, "num_input_tokens_seen": 159017255, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.78515625, "step": 7417, "time_per_iteration": 2.510025978088379 }, { "auxiliary_loss_clip": 0.0112231, "auxiliary_loss_mlp": 0.01040849, "balance_loss_clip": 1.02734256, "balance_loss_mlp": 1.04350388, "epoch": 0.4459942882909965, "flos": 22600488568320.0, "grad_norm": 2.570408281692529, "language_loss": 0.80766487, "learning_rate": 2.443197426237077e-06, "loss": 0.82929647, "num_input_tokens_seen": 159035010, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7890625, "step": 7418, "time_per_iteration": 2.471102237701416 }, { "auxiliary_loss_clip": 0.01120916, "auxiliary_loss_mlp": 0.01031414, "balance_loss_clip": 1.01808095, "balance_loss_mlp": 1.04338264, "epoch": 0.4460544115436645, "flos": 26505486622080.0, "grad_norm": 2.5389598006714973, "language_loss": 0.77366698, "learning_rate": 2.442817638972991e-06, "loss": 0.79519027, "num_input_tokens_seen": 159055345, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.77734375, "step": 7419, "time_per_iteration": 2.5609023571014404 }, { "auxiliary_loss_clip": 0.01119839, "auxiliary_loss_mlp": 0.01036287, "balance_loss_clip": 1.02345383, "balance_loss_mlp": 1.04278111, "epoch": 0.4461145347963325, "flos": 17604021893760.0, "grad_norm": 1.885083859089236, "language_loss": 0.72562468, "learning_rate": 2.4424378349172176e-06, "loss": 0.74718595, "num_input_tokens_seen": 159074225, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7734375, "step": 7420, "time_per_iteration": 2.475778818130493 }, { "auxiliary_loss_clip": 0.01118379, "auxiliary_loss_mlp": 0.01031285, "balance_loss_clip": 1.01756418, "balance_loss_mlp": 1.04389942, "epoch": 0.44617465804900047, "flos": 27268193036160.0, "grad_norm": 1.651516408353796, "language_loss": 0.75057518, "learning_rate": 2.442058014084156e-06, "loss": 0.77207184, "num_input_tokens_seen": 159095415, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.74609375, "step": 7421, "time_per_iteration": 2.543463945388794 }, { "auxiliary_loss_clip": 0.01116301, "auxiliary_loss_mlp": 0.01038277, "balance_loss_clip": 1.02538466, "balance_loss_mlp": 1.04335237, "epoch": 0.44623478130166844, "flos": 17786412178560.0, "grad_norm": 1.951251069682369, "language_loss": 0.75764489, "learning_rate": 2.44167817648821e-06, "loss": 0.77919066, "num_input_tokens_seen": 159114615, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 7422, "time_per_iteration": 2.465965509414673 }, { "auxiliary_loss_clip": 0.01120654, "auxiliary_loss_mlp": 0.0103298, "balance_loss_clip": 1.01991439, "balance_loss_mlp": 1.04395723, "epoch": 0.4462949045543364, "flos": 23003011353600.0, "grad_norm": 1.4991211013698764, "language_loss": 0.65070677, "learning_rate": 2.441298322143784e-06, "loss": 0.67224312, "num_input_tokens_seen": 159134370, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.76953125, "step": 7423, "time_per_iteration": 2.515177011489868 }, { "auxiliary_loss_clip": 0.01116512, "auxiliary_loss_mlp": 0.01030914, "balance_loss_clip": 1.0193392, "balance_loss_mlp": 1.04275846, "epoch": 0.44635502780700437, "flos": 17820096157440.0, "grad_norm": 1.6151709012360893, "language_loss": 0.78937835, "learning_rate": 2.4409184510652807e-06, "loss": 0.81085259, "num_input_tokens_seen": 159152540, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.73828125, "step": 7424, "time_per_iteration": 3.963397741317749 }, { "auxiliary_loss_clip": 0.01117302, "auxiliary_loss_mlp": 0.01033086, "balance_loss_clip": 1.02107549, "balance_loss_mlp": 1.04390454, "epoch": 0.44641515105967233, "flos": 26688020561280.0, "grad_norm": 1.647779545363476, "language_loss": 0.8024562, "learning_rate": 2.4405385632671063e-06, "loss": 0.82396007, "num_input_tokens_seen": 159173425, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.734375, "step": 7425, "time_per_iteration": 2.5366785526275635 }, { "auxiliary_loss_clip": 0.01119233, "auxiliary_loss_mlp": 0.01033092, "balance_loss_clip": 1.02081382, "balance_loss_mlp": 1.0446856, "epoch": 0.4464752743123403, "flos": 18913324544640.0, "grad_norm": 1.94166398265698, "language_loss": 0.7703585, "learning_rate": 2.4401586587636655e-06, "loss": 0.7918818, "num_input_tokens_seen": 159191210, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.74609375, "step": 7426, "time_per_iteration": 5.3094494342803955 }, { "auxiliary_loss_clip": 0.01118068, "auxiliary_loss_mlp": 0.01030545, "balance_loss_clip": 1.01779544, "balance_loss_mlp": 1.04121435, "epoch": 0.44653539756500826, "flos": 29570318582400.0, "grad_norm": 1.631378383066642, "language_loss": 0.6446048, "learning_rate": 2.4397787375693634e-06, "loss": 0.66609091, "num_input_tokens_seen": 159211755, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.765625, "step": 7427, "time_per_iteration": 2.582341194152832 }, { "auxiliary_loss_clip": 0.0112089, "auxiliary_loss_mlp": 0.01030912, "balance_loss_clip": 1.01795948, "balance_loss_mlp": 1.04614341, "epoch": 0.44659552081767623, "flos": 21468979261440.0, "grad_norm": 6.872175113124772, "language_loss": 0.75362396, "learning_rate": 2.439398799698608e-06, "loss": 0.77514195, "num_input_tokens_seen": 159230315, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.74609375, "step": 7428, "time_per_iteration": 3.8678488731384277 }, { "auxiliary_loss_clip": 0.0111838, "auxiliary_loss_mlp": 0.01033414, "balance_loss_clip": 1.02033639, "balance_loss_mlp": 1.04307199, "epoch": 0.4466556440703442, "flos": 17931886260480.0, "grad_norm": 1.838969873082846, "language_loss": 0.77624232, "learning_rate": 2.439018845165806e-06, "loss": 0.79776013, "num_input_tokens_seen": 159249810, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75390625, "step": 7429, "time_per_iteration": 2.491300106048584 }, { "auxiliary_loss_clip": 0.01120383, "auxiliary_loss_mlp": 0.01030529, "balance_loss_clip": 1.01680183, "balance_loss_mlp": 1.04320598, "epoch": 0.44671576732301216, "flos": 21107430915840.0, "grad_norm": 1.711205351351842, "language_loss": 0.90952986, "learning_rate": 2.438638873985366e-06, "loss": 0.93103898, "num_input_tokens_seen": 159271715, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7734375, "step": 7430, "time_per_iteration": 2.5096681118011475 }, { "auxiliary_loss_clip": 0.01124715, "auxiliary_loss_mlp": 0.01036145, "balance_loss_clip": 1.02132773, "balance_loss_mlp": 1.04498947, "epoch": 0.4467758905756801, "flos": 23508920459520.0, "grad_norm": 1.7997131764692893, "language_loss": 0.7971347, "learning_rate": 2.4382588861716954e-06, "loss": 0.81874329, "num_input_tokens_seen": 159290690, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.796875, "step": 7431, "time_per_iteration": 2.525655508041382 }, { "auxiliary_loss_clip": 0.01121589, "auxiliary_loss_mlp": 0.01032793, "balance_loss_clip": 1.01934659, "balance_loss_mlp": 1.04374373, "epoch": 0.4468360138283481, "flos": 18734022829440.0, "grad_norm": 1.9398258700245814, "language_loss": 0.80021614, "learning_rate": 2.437878881739204e-06, "loss": 0.82175994, "num_input_tokens_seen": 159309400, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.77734375, "step": 7432, "time_per_iteration": 2.4423394203186035 }, { "auxiliary_loss_clip": 0.01121942, "auxiliary_loss_mlp": 0.01030286, "balance_loss_clip": 1.01738119, "balance_loss_mlp": 1.04346538, "epoch": 0.4468961370810161, "flos": 23477139901440.0, "grad_norm": 1.9120857030776652, "language_loss": 0.76948178, "learning_rate": 2.437498860702301e-06, "loss": 0.79100406, "num_input_tokens_seen": 159327425, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.78515625, "step": 7433, "time_per_iteration": 2.498197317123413 }, { "auxiliary_loss_clip": 0.0111522, "auxiliary_loss_mlp": 0.01029416, "balance_loss_clip": 1.0182755, "balance_loss_mlp": 1.04254198, "epoch": 0.4469562603336841, "flos": 30075042539520.0, "grad_norm": 1.7274933449290686, "language_loss": 0.7751739, "learning_rate": 2.437118823075398e-06, "loss": 0.79662031, "num_input_tokens_seen": 159345805, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.7265625, "step": 7434, "time_per_iteration": 2.5368242263793945 }, { "auxiliary_loss_clip": 0.01122482, "auxiliary_loss_mlp": 0.0102832, "balance_loss_clip": 1.01587403, "balance_loss_mlp": 1.04534006, "epoch": 0.44701638358635204, "flos": 22456415116800.0, "grad_norm": 4.359966789556432, "language_loss": 0.64467537, "learning_rate": 2.436738768872905e-06, "loss": 0.66618341, "num_input_tokens_seen": 159364595, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7734375, "step": 7435, "time_per_iteration": 2.4913392066955566 }, { "auxiliary_loss_clip": 0.01120454, "auxiliary_loss_mlp": 0.0102885, "balance_loss_clip": 1.01552868, "balance_loss_mlp": 1.0433743, "epoch": 0.44707650683902, "flos": 24057851080320.0, "grad_norm": 1.7232222614754489, "language_loss": 0.83528471, "learning_rate": 2.4363586981092346e-06, "loss": 0.85677779, "num_input_tokens_seen": 159385265, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7734375, "step": 7436, "time_per_iteration": 2.4955763816833496 }, { "auxiliary_loss_clip": 0.01123219, "auxiliary_loss_mlp": 0.01032787, "balance_loss_clip": 1.01885736, "balance_loss_mlp": 1.04527211, "epoch": 0.44713663009168797, "flos": 23766938830080.0, "grad_norm": 2.6532939595402194, "language_loss": 0.79696977, "learning_rate": 2.435978610798798e-06, "loss": 0.81852984, "num_input_tokens_seen": 159405080, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 7437, "time_per_iteration": 2.5122201442718506 }, { "auxiliary_loss_clip": 0.01122405, "auxiliary_loss_mlp": 0.01031495, "balance_loss_clip": 1.01827502, "balance_loss_mlp": 1.04435372, "epoch": 0.44719675334435594, "flos": 24499265316480.0, "grad_norm": 1.726636269960532, "language_loss": 0.71585089, "learning_rate": 2.435598506956009e-06, "loss": 0.73738992, "num_input_tokens_seen": 159424595, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.78125, "step": 7438, "time_per_iteration": 2.527409791946411 }, { "auxiliary_loss_clip": 0.01120418, "auxiliary_loss_mlp": 0.01035461, "balance_loss_clip": 1.02192473, "balance_loss_mlp": 1.04228616, "epoch": 0.4472568765970239, "flos": 29781759991680.0, "grad_norm": 2.077019218025876, "language_loss": 0.67076969, "learning_rate": 2.4352183865952808e-06, "loss": 0.69232845, "num_input_tokens_seen": 159443865, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.78125, "step": 7439, "time_per_iteration": 2.526508092880249 }, { "auxiliary_loss_clip": 0.0112261, "auxiliary_loss_mlp": 0.01035026, "balance_loss_clip": 1.02083361, "balance_loss_mlp": 1.04408383, "epoch": 0.44731699984969187, "flos": 24643123286400.0, "grad_norm": 2.4519591108208605, "language_loss": 0.73606163, "learning_rate": 2.4348382497310285e-06, "loss": 0.75763798, "num_input_tokens_seen": 159464525, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.78515625, "step": 7440, "time_per_iteration": 2.5071136951446533 }, { "auxiliary_loss_clip": 0.01118662, "auxiliary_loss_mlp": 0.01028544, "balance_loss_clip": 1.01607466, "balance_loss_mlp": 1.04219794, "epoch": 0.44737712310235983, "flos": 29455691304960.0, "grad_norm": 2.6009672169688196, "language_loss": 0.73996115, "learning_rate": 2.4344580963776655e-06, "loss": 0.76143324, "num_input_tokens_seen": 159486385, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.765625, "step": 7441, "time_per_iteration": 2.5623435974121094 }, { "auxiliary_loss_clip": 0.01121708, "auxiliary_loss_mlp": 0.01034038, "balance_loss_clip": 1.02031112, "balance_loss_mlp": 1.0435617, "epoch": 0.4474372463550278, "flos": 24896832024960.0, "grad_norm": 2.203572066507703, "language_loss": 0.75301522, "learning_rate": 2.4340779265496082e-06, "loss": 0.77457261, "num_input_tokens_seen": 159503880, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78125, "step": 7442, "time_per_iteration": 2.4918079376220703 }, { "auxiliary_loss_clip": 0.011227, "auxiliary_loss_mlp": 0.01031771, "balance_loss_clip": 1.01786506, "balance_loss_mlp": 1.04199457, "epoch": 0.44749736960769576, "flos": 33181603125120.0, "grad_norm": 1.9899712891970507, "language_loss": 0.74199677, "learning_rate": 2.433697740261273e-06, "loss": 0.76354146, "num_input_tokens_seen": 159522980, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.8046875, "step": 7443, "time_per_iteration": 2.5951216220855713 }, { "auxiliary_loss_clip": 0.01116998, "auxiliary_loss_mlp": 0.01028236, "balance_loss_clip": 1.01402068, "balance_loss_mlp": 1.0400027, "epoch": 0.4475574928603637, "flos": 21071807602560.0, "grad_norm": 1.975643224928389, "language_loss": 0.77858472, "learning_rate": 2.4333175375270748e-06, "loss": 0.80003703, "num_input_tokens_seen": 159543340, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76953125, "step": 7444, "time_per_iteration": 2.4785985946655273 }, { "auxiliary_loss_clip": 0.01115357, "auxiliary_loss_mlp": 0.01031719, "balance_loss_clip": 1.01889181, "balance_loss_mlp": 1.04045999, "epoch": 0.4476176161130317, "flos": 21862523646720.0, "grad_norm": 2.4190049099144604, "language_loss": 0.84643495, "learning_rate": 2.4329373183614333e-06, "loss": 0.86790574, "num_input_tokens_seen": 159558210, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.75, "step": 7445, "time_per_iteration": 2.458491802215576 }, { "auxiliary_loss_clip": 0.01121271, "auxiliary_loss_mlp": 0.01029055, "balance_loss_clip": 1.01507735, "balance_loss_mlp": 1.04337549, "epoch": 0.4476777393656997, "flos": 22528667324160.0, "grad_norm": 2.6236314733803514, "language_loss": 0.64006066, "learning_rate": 2.432557082778765e-06, "loss": 0.66156387, "num_input_tokens_seen": 159577920, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.77734375, "step": 7446, "time_per_iteration": 2.4754323959350586 }, { "auxiliary_loss_clip": 0.01048019, "auxiliary_loss_mlp": 0.01001731, "balance_loss_clip": 1.00021672, "balance_loss_mlp": 1.02187812, "epoch": 0.4477378626183677, "flos": 49017133877760.0, "grad_norm": 0.7474963924548089, "language_loss": 0.50334728, "learning_rate": 2.4321768307934884e-06, "loss": 0.52384478, "num_input_tokens_seen": 159632295, "router_z_loss_clip": 0.01513672, "router_z_loss_mlp": 0.26171875, "step": 7447, "time_per_iteration": 2.958324670791626 }, { "auxiliary_loss_clip": 0.01047072, "auxiliary_loss_mlp": 0.01002831, "balance_loss_clip": 1.00134134, "balance_loss_mlp": 1.02103627, "epoch": 0.44779798587103564, "flos": 56542179392640.0, "grad_norm": 0.7751134003920676, "language_loss": 0.59389549, "learning_rate": 2.4317965624200235e-06, "loss": 0.61439455, "num_input_tokens_seen": 159698435, "router_z_loss_clip": 0.01489258, "router_z_loss_mlp": 0.26171875, "step": 7448, "time_per_iteration": 3.1491804122924805 }, { "auxiliary_loss_clip": 0.01117204, "auxiliary_loss_mlp": 0.01034937, "balance_loss_clip": 1.02205634, "balance_loss_mlp": 1.04033017, "epoch": 0.4478581091237036, "flos": 46498536040320.0, "grad_norm": 2.351309365552347, "language_loss": 0.59193856, "learning_rate": 2.431416277672789e-06, "loss": 0.61345994, "num_input_tokens_seen": 159722150, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.765625, "step": 7449, "time_per_iteration": 2.6945571899414062 }, { "auxiliary_loss_clip": 0.01119639, "auxiliary_loss_mlp": 0.01025397, "balance_loss_clip": 1.01306438, "balance_loss_mlp": 1.04222834, "epoch": 0.4479182323763716, "flos": 20814363849600.0, "grad_norm": 1.9217073726771268, "language_loss": 0.80193758, "learning_rate": 2.4310359765662065e-06, "loss": 0.82338798, "num_input_tokens_seen": 159740550, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7734375, "step": 7450, "time_per_iteration": 2.4800708293914795 }, { "auxiliary_loss_clip": 0.01119603, "auxiliary_loss_mlp": 0.01038985, "balance_loss_clip": 1.02541351, "balance_loss_mlp": 1.04291308, "epoch": 0.44797835562903954, "flos": 14245979212800.0, "grad_norm": 3.7965678303683816, "language_loss": 0.79465425, "learning_rate": 2.430655659114697e-06, "loss": 0.81624019, "num_input_tokens_seen": 159758245, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.76953125, "step": 7451, "time_per_iteration": 2.4255940914154053 }, { "auxiliary_loss_clip": 0.01045383, "auxiliary_loss_mlp": 0.01002205, "balance_loss_clip": 1.00061917, "balance_loss_mlp": 1.01927471, "epoch": 0.4480384788817075, "flos": 63534560169600.0, "grad_norm": 0.8345063393452776, "language_loss": 0.62861663, "learning_rate": 2.430275325332681e-06, "loss": 0.6490925, "num_input_tokens_seen": 159826790, "router_z_loss_clip": 0.01586914, "router_z_loss_mlp": 0.26171875, "step": 7452, "time_per_iteration": 3.1843342781066895 }, { "auxiliary_loss_clip": 0.0112154, "auxiliary_loss_mlp": 0.0102995, "balance_loss_clip": 1.01615179, "balance_loss_mlp": 1.04380047, "epoch": 0.44809860213437547, "flos": 21652626522240.0, "grad_norm": 2.3214406123203872, "language_loss": 0.62706053, "learning_rate": 2.429894975234582e-06, "loss": 0.64857543, "num_input_tokens_seen": 159845805, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.77734375, "step": 7453, "time_per_iteration": 2.4715991020202637 }, { "auxiliary_loss_clip": 0.01044779, "auxiliary_loss_mlp": 0.01002639, "balance_loss_clip": 1.00100577, "balance_loss_mlp": 1.01851606, "epoch": 0.44815872538704343, "flos": 69190634246400.0, "grad_norm": 0.7778857218116512, "language_loss": 0.57051837, "learning_rate": 2.4295146088348224e-06, "loss": 0.59099257, "num_input_tokens_seen": 159898860, "router_z_loss_clip": 0.01635742, "router_z_loss_mlp": 0.26171875, "step": 7454, "time_per_iteration": 3.0046889781951904 }, { "auxiliary_loss_clip": 0.01119838, "auxiliary_loss_mlp": 0.01031374, "balance_loss_clip": 1.01846385, "balance_loss_mlp": 1.04224515, "epoch": 0.4482188486397114, "flos": 12598289510400.0, "grad_norm": 3.5547143405014006, "language_loss": 0.75128031, "learning_rate": 2.4291342261478255e-06, "loss": 0.77279246, "num_input_tokens_seen": 159911555, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7734375, "step": 7455, "time_per_iteration": 2.426297903060913 }, { "auxiliary_loss_clip": 0.01118932, "auxiliary_loss_mlp": 0.01031208, "balance_loss_clip": 1.0182147, "balance_loss_mlp": 1.0414151, "epoch": 0.44827897189237936, "flos": 34058182631040.0, "grad_norm": 1.7038702693966632, "language_loss": 0.75983119, "learning_rate": 2.428753827188016e-06, "loss": 0.78133255, "num_input_tokens_seen": 159931470, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7734375, "step": 7456, "time_per_iteration": 2.5985445976257324 }, { "auxiliary_loss_clip": 0.01120907, "auxiliary_loss_mlp": 0.01034461, "balance_loss_clip": 1.02152109, "balance_loss_mlp": 1.04448307, "epoch": 0.44833909514504733, "flos": 25147416280320.0, "grad_norm": 2.021207953475958, "language_loss": 0.76498115, "learning_rate": 2.428373411969818e-06, "loss": 0.78653491, "num_input_tokens_seen": 159946115, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.765625, "step": 7457, "time_per_iteration": 2.474804639816284 }, { "auxiliary_loss_clip": 0.01120706, "auxiliary_loss_mlp": 0.01034869, "balance_loss_clip": 1.02074826, "balance_loss_mlp": 1.04212785, "epoch": 0.4483992183977153, "flos": 16179984224640.0, "grad_norm": 2.1898322933704377, "language_loss": 0.67732739, "learning_rate": 2.4279929805076576e-06, "loss": 0.69888312, "num_input_tokens_seen": 159963915, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78515625, "step": 7458, "time_per_iteration": 2.45833158493042 }, { "auxiliary_loss_clip": 0.0112518, "auxiliary_loss_mlp": 0.01033739, "balance_loss_clip": 1.01920784, "balance_loss_mlp": 1.04453778, "epoch": 0.44845934165038326, "flos": 17746048270080.0, "grad_norm": 3.8542360500585753, "language_loss": 0.71578902, "learning_rate": 2.427612532815961e-06, "loss": 0.73737824, "num_input_tokens_seen": 159982140, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8046875, "step": 7459, "time_per_iteration": 2.4589288234710693 }, { "auxiliary_loss_clip": 0.0111785, "auxiliary_loss_mlp": 0.01034685, "balance_loss_clip": 1.02079153, "balance_loss_mlp": 1.04033685, "epoch": 0.4485194649030513, "flos": 21835914647040.0, "grad_norm": 1.8410126806564875, "language_loss": 0.69380569, "learning_rate": 2.427232068909154e-06, "loss": 0.71533096, "num_input_tokens_seen": 160002280, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7734375, "step": 7460, "time_per_iteration": 2.4892213344573975 }, { "auxiliary_loss_clip": 0.01119488, "auxiliary_loss_mlp": 0.01036176, "balance_loss_clip": 1.02264619, "balance_loss_mlp": 1.04206085, "epoch": 0.44857958815571924, "flos": 20084515401600.0, "grad_norm": 2.223821179371035, "language_loss": 0.77006221, "learning_rate": 2.4268515888016635e-06, "loss": 0.79161882, "num_input_tokens_seen": 160020260, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7734375, "step": 7461, "time_per_iteration": 2.4842751026153564 }, { "auxiliary_loss_clip": 0.01121347, "auxiliary_loss_mlp": 0.01032172, "balance_loss_clip": 1.01929164, "balance_loss_mlp": 1.04224777, "epoch": 0.4486397114083872, "flos": 27053519402880.0, "grad_norm": 1.6847401046585233, "language_loss": 0.67621434, "learning_rate": 2.4264710925079184e-06, "loss": 0.69774956, "num_input_tokens_seen": 160040240, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7890625, "step": 7462, "time_per_iteration": 2.572732448577881 }, { "auxiliary_loss_clip": 0.01043031, "auxiliary_loss_mlp": 0.01005343, "balance_loss_clip": 1.00384128, "balance_loss_mlp": 1.01709151, "epoch": 0.4486998346610552, "flos": 67321195931520.0, "grad_norm": 0.857979187164642, "language_loss": 0.54464948, "learning_rate": 2.4260905800423462e-06, "loss": 0.56513327, "num_input_tokens_seen": 160093865, "router_z_loss_clip": 0.01501465, "router_z_loss_mlp": 0.2578125, "step": 7463, "time_per_iteration": 3.107977867126465 }, { "auxiliary_loss_clip": 0.01118935, "auxiliary_loss_mlp": 0.01029197, "balance_loss_clip": 1.01626253, "balance_loss_mlp": 1.04222667, "epoch": 0.44875995791372314, "flos": 27636816360960.0, "grad_norm": 2.6210742979236183, "language_loss": 0.76045442, "learning_rate": 2.4257100514193775e-06, "loss": 0.78193569, "num_input_tokens_seen": 160113590, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.765625, "step": 7464, "time_per_iteration": 2.5415022373199463 }, { "auxiliary_loss_clip": 0.01119542, "auxiliary_loss_mlp": 0.01033965, "balance_loss_clip": 1.02108431, "balance_loss_mlp": 1.04388249, "epoch": 0.4488200811663911, "flos": 13005947940480.0, "grad_norm": 2.167792598486473, "language_loss": 0.73603332, "learning_rate": 2.425329506653441e-06, "loss": 0.75756836, "num_input_tokens_seen": 160131795, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7578125, "step": 7465, "time_per_iteration": 2.4506003856658936 }, { "auxiliary_loss_clip": 0.01128099, "auxiliary_loss_mlp": 0.01040447, "balance_loss_clip": 1.02530742, "balance_loss_mlp": 1.04539418, "epoch": 0.44888020441905907, "flos": 27489977562240.0, "grad_norm": 3.1370486277730554, "language_loss": 0.7973727, "learning_rate": 2.424948945758966e-06, "loss": 0.81905818, "num_input_tokens_seen": 160150635, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.828125, "step": 7466, "time_per_iteration": 3.9912984371185303 }, { "auxiliary_loss_clip": 0.01124366, "auxiliary_loss_mlp": 0.0103654, "balance_loss_clip": 1.02315843, "balance_loss_mlp": 1.04553211, "epoch": 0.44894032767172704, "flos": 18259678800000.0, "grad_norm": 5.120207417485547, "language_loss": 0.80318117, "learning_rate": 2.4245683687503844e-06, "loss": 0.82479024, "num_input_tokens_seen": 160168615, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7890625, "step": 7467, "time_per_iteration": 2.4803812503814697 }, { "auxiliary_loss_clip": 0.01117538, "auxiliary_loss_mlp": 0.0103622, "balance_loss_clip": 1.02360773, "balance_loss_mlp": 1.04377949, "epoch": 0.449000450924395, "flos": 21579835610880.0, "grad_norm": 2.0181208246366658, "language_loss": 0.74897873, "learning_rate": 2.424187775642129e-06, "loss": 0.7705164, "num_input_tokens_seen": 160187295, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.73828125, "step": 7468, "time_per_iteration": 5.243594408035278 }, { "auxiliary_loss_clip": 0.01119383, "auxiliary_loss_mlp": 0.01029771, "balance_loss_clip": 1.0175221, "balance_loss_mlp": 1.04328322, "epoch": 0.44906057417706297, "flos": 17967904623360.0, "grad_norm": 1.9317051260143399, "language_loss": 0.70629221, "learning_rate": 2.4238071664486297e-06, "loss": 0.7277838, "num_input_tokens_seen": 160205115, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.76171875, "step": 7469, "time_per_iteration": 3.9357974529266357 }, { "auxiliary_loss_clip": 0.01123197, "auxiliary_loss_mlp": 0.01037709, "balance_loss_clip": 1.02374375, "balance_loss_mlp": 1.04461861, "epoch": 0.44912069742973093, "flos": 20047347803520.0, "grad_norm": 2.1472906686673023, "language_loss": 0.71643984, "learning_rate": 2.4234265411843203e-06, "loss": 0.73804891, "num_input_tokens_seen": 160222580, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78515625, "step": 7470, "time_per_iteration": 2.463139772415161 }, { "auxiliary_loss_clip": 0.01122073, "auxiliary_loss_mlp": 0.01033868, "balance_loss_clip": 1.01956928, "balance_loss_mlp": 1.04332435, "epoch": 0.4491808206823989, "flos": 21033526682880.0, "grad_norm": 1.956287644594139, "language_loss": 0.76961446, "learning_rate": 2.423045899863634e-06, "loss": 0.79117382, "num_input_tokens_seen": 160241520, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 7471, "time_per_iteration": 2.4779915809631348 }, { "auxiliary_loss_clip": 0.01122761, "auxiliary_loss_mlp": 0.01035599, "balance_loss_clip": 1.02258158, "balance_loss_mlp": 1.0454818, "epoch": 0.44924094393506686, "flos": 22967136645120.0, "grad_norm": 2.3120593850694724, "language_loss": 0.70025218, "learning_rate": 2.4226652425010048e-06, "loss": 0.72183579, "num_input_tokens_seen": 160261815, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7734375, "step": 7472, "time_per_iteration": 2.494899272918701 }, { "auxiliary_loss_clip": 0.01044686, "auxiliary_loss_mlp": 0.01001802, "balance_loss_clip": 1.00019288, "balance_loss_mlp": 1.01852989, "epoch": 0.4493010671877349, "flos": 59233467864960.0, "grad_norm": 0.733499518340093, "language_loss": 0.6168704, "learning_rate": 2.4222845691108676e-06, "loss": 0.6373353, "num_input_tokens_seen": 160317070, "router_z_loss_clip": 0.01611328, "router_z_loss_mlp": 0.26171875, "step": 7473, "time_per_iteration": 3.0539090633392334 }, { "auxiliary_loss_clip": 0.01122783, "auxiliary_loss_mlp": 0.01036987, "balance_loss_clip": 1.02268171, "balance_loss_mlp": 1.04505038, "epoch": 0.44936119044040285, "flos": 18004892653440.0, "grad_norm": 2.4715651088889556, "language_loss": 0.77390873, "learning_rate": 2.421903879707657e-06, "loss": 0.79550648, "num_input_tokens_seen": 160334980, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.77734375, "step": 7474, "time_per_iteration": 2.4454874992370605 }, { "auxiliary_loss_clip": 0.01119467, "auxiliary_loss_mlp": 0.01037225, "balance_loss_clip": 1.02372456, "balance_loss_mlp": 1.04374647, "epoch": 0.4494213136930708, "flos": 21251827589760.0, "grad_norm": 1.761070353567327, "language_loss": 0.72275472, "learning_rate": 2.4215231743058086e-06, "loss": 0.74432158, "num_input_tokens_seen": 160354500, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 7475, "time_per_iteration": 2.485539674758911 }, { "auxiliary_loss_clip": 0.01119625, "auxiliary_loss_mlp": 0.01028801, "balance_loss_clip": 1.01591432, "balance_loss_mlp": 1.04201448, "epoch": 0.4494814369457388, "flos": 27418695022080.0, "grad_norm": 2.1673728195362663, "language_loss": 0.76562345, "learning_rate": 2.4211424529197594e-06, "loss": 0.78710771, "num_input_tokens_seen": 160373650, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.77734375, "step": 7476, "time_per_iteration": 2.5346686840057373 }, { "auxiliary_loss_clip": 0.01123734, "auxiliary_loss_mlp": 0.01041344, "balance_loss_clip": 1.02599013, "balance_loss_mlp": 1.04300463, "epoch": 0.44954156019840674, "flos": 22854053652480.0, "grad_norm": 2.0510399556581107, "language_loss": 0.71382487, "learning_rate": 2.4207617155639464e-06, "loss": 0.73547566, "num_input_tokens_seen": 160393430, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8046875, "step": 7477, "time_per_iteration": 2.513643741607666 }, { "auxiliary_loss_clip": 0.01127443, "auxiliary_loss_mlp": 0.01035689, "balance_loss_clip": 1.0218668, "balance_loss_mlp": 1.04601824, "epoch": 0.4496016834510747, "flos": 17201570935680.0, "grad_norm": 2.4794725766052452, "language_loss": 0.6837101, "learning_rate": 2.4203809622528062e-06, "loss": 0.70534146, "num_input_tokens_seen": 160410545, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.8125, "step": 7478, "time_per_iteration": 2.4519083499908447 }, { "auxiliary_loss_clip": 0.01119251, "auxiliary_loss_mlp": 0.01034043, "balance_loss_clip": 1.02119839, "balance_loss_mlp": 1.04380798, "epoch": 0.4496618067037427, "flos": 18916628595840.0, "grad_norm": 2.3960980205421274, "language_loss": 0.89621449, "learning_rate": 2.420000193000779e-06, "loss": 0.91774744, "num_input_tokens_seen": 160428105, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7578125, "step": 7479, "time_per_iteration": 2.5023701190948486 }, { "auxiliary_loss_clip": 0.01123939, "auxiliary_loss_mlp": 0.0103961, "balance_loss_clip": 1.02518594, "balance_loss_mlp": 1.0458746, "epoch": 0.44972192995641064, "flos": 21031659175680.0, "grad_norm": 2.2581789107084016, "language_loss": 0.75676763, "learning_rate": 2.419619407822302e-06, "loss": 0.7784031, "num_input_tokens_seen": 160448815, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 7480, "time_per_iteration": 2.4925570487976074 }, { "auxiliary_loss_clip": 0.01123841, "auxiliary_loss_mlp": 0.01032142, "balance_loss_clip": 1.01776564, "balance_loss_mlp": 1.04381049, "epoch": 0.4497820532090786, "flos": 20777088510720.0, "grad_norm": 2.2462085039955335, "language_loss": 0.79818237, "learning_rate": 2.419238606731815e-06, "loss": 0.8197422, "num_input_tokens_seen": 160465940, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.80078125, "step": 7481, "time_per_iteration": 2.4819529056549072 }, { "auxiliary_loss_clip": 0.0111827, "auxiliary_loss_mlp": 0.01031298, "balance_loss_clip": 1.01728511, "balance_loss_mlp": 1.04352808, "epoch": 0.44984217646174657, "flos": 33802606385280.0, "grad_norm": 1.8015369939448738, "language_loss": 0.68431473, "learning_rate": 2.418857789743758e-06, "loss": 0.70581043, "num_input_tokens_seen": 160486710, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.74609375, "step": 7482, "time_per_iteration": 2.5775609016418457 }, { "auxiliary_loss_clip": 0.01124508, "auxiliary_loss_mlp": 0.01039719, "balance_loss_clip": 1.02518713, "balance_loss_mlp": 1.04538822, "epoch": 0.44990229971441453, "flos": 15518365660800.0, "grad_norm": 2.150809332523228, "language_loss": 0.85128963, "learning_rate": 2.418476956872571e-06, "loss": 0.8729319, "num_input_tokens_seen": 160503405, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7890625, "step": 7483, "time_per_iteration": 2.450756072998047 }, { "auxiliary_loss_clip": 0.01128691, "auxiliary_loss_mlp": 0.01038423, "balance_loss_clip": 1.02451122, "balance_loss_mlp": 1.04780126, "epoch": 0.4499624229670825, "flos": 29861913191040.0, "grad_norm": 1.9080419716933772, "language_loss": 0.80732954, "learning_rate": 2.4180961081326967e-06, "loss": 0.82900071, "num_input_tokens_seen": 160525080, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.80859375, "step": 7484, "time_per_iteration": 2.5450022220611572 }, { "auxiliary_loss_clip": 0.01124532, "auxiliary_loss_mlp": 0.01030773, "balance_loss_clip": 1.01557374, "balance_loss_mlp": 1.0419023, "epoch": 0.45002254621975046, "flos": 18513674847360.0, "grad_norm": 2.5775527896603934, "language_loss": 0.74833858, "learning_rate": 2.4177152435385754e-06, "loss": 0.76989162, "num_input_tokens_seen": 160540895, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.828125, "step": 7485, "time_per_iteration": 2.46090030670166 }, { "auxiliary_loss_clip": 0.01048994, "auxiliary_loss_mlp": 0.01006971, "balance_loss_clip": 1.00523078, "balance_loss_mlp": 1.0224824, "epoch": 0.4500826694724185, "flos": 70420394229120.0, "grad_norm": 0.7967614914121383, "language_loss": 0.58733982, "learning_rate": 2.4173343631046504e-06, "loss": 0.60789949, "num_input_tokens_seen": 160598270, "router_z_loss_clip": 0.01745605, "router_z_loss_mlp": 0.265625, "step": 7486, "time_per_iteration": 3.132580518722534 }, { "auxiliary_loss_clip": 0.01122805, "auxiliary_loss_mlp": 0.01031303, "balance_loss_clip": 1.01727772, "balance_loss_mlp": 1.04445958, "epoch": 0.45014279272508645, "flos": 15778897983360.0, "grad_norm": 2.207573916554682, "language_loss": 0.83156395, "learning_rate": 2.4169534668453654e-06, "loss": 0.85310495, "num_input_tokens_seen": 160614720, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78515625, "step": 7487, "time_per_iteration": 2.464205741882324 }, { "auxiliary_loss_clip": 0.01119486, "auxiliary_loss_mlp": 0.01033843, "balance_loss_clip": 1.02021766, "balance_loss_mlp": 1.0428865, "epoch": 0.4502029159777544, "flos": 21799573061760.0, "grad_norm": 1.5037174398836863, "language_loss": 0.76871961, "learning_rate": 2.4165725547751622e-06, "loss": 0.79025292, "num_input_tokens_seen": 160635170, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 7488, "time_per_iteration": 2.5150668621063232 }, { "auxiliary_loss_clip": 0.01129733, "auxiliary_loss_mlp": 0.0103471, "balance_loss_clip": 1.01961792, "balance_loss_mlp": 1.04641724, "epoch": 0.4502630392304224, "flos": 28767966531840.0, "grad_norm": 2.3896647040507957, "language_loss": 0.72116238, "learning_rate": 2.4161916269084858e-06, "loss": 0.74280685, "num_input_tokens_seen": 160654490, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.83203125, "step": 7489, "time_per_iteration": 2.552579879760742 }, { "auxiliary_loss_clip": 0.01126752, "auxiliary_loss_mlp": 0.01035586, "balance_loss_clip": 1.01977921, "balance_loss_mlp": 1.04571629, "epoch": 0.45032316248309034, "flos": 15844182952320.0, "grad_norm": 2.257770996744858, "language_loss": 0.6963861, "learning_rate": 2.4158106832597817e-06, "loss": 0.71800953, "num_input_tokens_seen": 160669400, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8125, "step": 7490, "time_per_iteration": 2.425758123397827 }, { "auxiliary_loss_clip": 0.01047221, "auxiliary_loss_mlp": 0.01002032, "balance_loss_clip": 1.00043416, "balance_loss_mlp": 1.02088904, "epoch": 0.4503832857357583, "flos": 57853600945920.0, "grad_norm": 0.7391986025178271, "language_loss": 0.56671077, "learning_rate": 2.415429723843495e-06, "loss": 0.58720326, "num_input_tokens_seen": 160733820, "router_z_loss_clip": 0.01599121, "router_z_loss_mlp": 0.26367188, "step": 7491, "time_per_iteration": 3.0702972412109375 }, { "auxiliary_loss_clip": 0.01118985, "auxiliary_loss_mlp": 0.01032804, "balance_loss_clip": 1.0192796, "balance_loss_mlp": 1.04306769, "epoch": 0.4504434089884263, "flos": 23878082488320.0, "grad_norm": 1.7098659980146362, "language_loss": 0.79622072, "learning_rate": 2.4150487486740713e-06, "loss": 0.81773865, "num_input_tokens_seen": 160753175, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 7492, "time_per_iteration": 2.497629404067993 }, { "auxiliary_loss_clip": 0.01125224, "auxiliary_loss_mlp": 0.01038955, "balance_loss_clip": 1.02473354, "balance_loss_mlp": 1.04344285, "epoch": 0.45050353224109424, "flos": 17785083375360.0, "grad_norm": 2.5470902775373445, "language_loss": 0.91888851, "learning_rate": 2.4146677577659573e-06, "loss": 0.9405303, "num_input_tokens_seen": 160768310, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.81640625, "step": 7493, "time_per_iteration": 2.4519917964935303 }, { "auxiliary_loss_clip": 0.01046587, "auxiliary_loss_mlp": 0.0100184, "balance_loss_clip": 1.00019479, "balance_loss_mlp": 1.02035666, "epoch": 0.4505636554937622, "flos": 65063420703360.0, "grad_norm": 0.8031233661919122, "language_loss": 0.62918097, "learning_rate": 2.4142867511336e-06, "loss": 0.64966518, "num_input_tokens_seen": 160827370, "router_z_loss_clip": 0.01647949, "router_z_loss_mlp": 0.26171875, "step": 7494, "time_per_iteration": 3.100754976272583 }, { "auxiliary_loss_clip": 0.0112176, "auxiliary_loss_mlp": 0.01031925, "balance_loss_clip": 1.01868641, "balance_loss_mlp": 1.04406857, "epoch": 0.45062377874643017, "flos": 22200084685440.0, "grad_norm": 1.6089749095539594, "language_loss": 0.82269561, "learning_rate": 2.4139057287914484e-06, "loss": 0.84423244, "num_input_tokens_seen": 160849140, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.77734375, "step": 7495, "time_per_iteration": 2.508631706237793 }, { "auxiliary_loss_clip": 0.01122252, "auxiliary_loss_mlp": 0.01032687, "balance_loss_clip": 1.01811361, "balance_loss_mlp": 1.04318082, "epoch": 0.45068390199909814, "flos": 37670293186560.0, "grad_norm": 2.2045565590502645, "language_loss": 0.85963619, "learning_rate": 2.41352469075395e-06, "loss": 0.88118565, "num_input_tokens_seen": 160871280, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 7496, "time_per_iteration": 2.6158456802368164 }, { "auxiliary_loss_clip": 0.01121726, "auxiliary_loss_mlp": 0.01034048, "balance_loss_clip": 1.01955807, "balance_loss_mlp": 1.04284406, "epoch": 0.4507440252517661, "flos": 22302501338880.0, "grad_norm": 2.446584001452951, "language_loss": 0.76288331, "learning_rate": 2.4131436370355534e-06, "loss": 0.78444099, "num_input_tokens_seen": 160888625, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7890625, "step": 7497, "time_per_iteration": 2.4815211296081543 }, { "auxiliary_loss_clip": 0.01122407, "auxiliary_loss_mlp": 0.01037759, "balance_loss_clip": 1.02368069, "balance_loss_mlp": 1.04216957, "epoch": 0.45080414850443407, "flos": 13188374138880.0, "grad_norm": 1.957877228123757, "language_loss": 0.75438672, "learning_rate": 2.4127625676507088e-06, "loss": 0.77598834, "num_input_tokens_seen": 160907040, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8046875, "step": 7498, "time_per_iteration": 2.4418997764587402 }, { "auxiliary_loss_clip": 0.01121576, "auxiliary_loss_mlp": 0.0103964, "balance_loss_clip": 1.02486444, "balance_loss_mlp": 1.04217172, "epoch": 0.4508642717571021, "flos": 21944939402880.0, "grad_norm": 3.0648701212781493, "language_loss": 0.70246124, "learning_rate": 2.4123814826138663e-06, "loss": 0.72407341, "num_input_tokens_seen": 160927115, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.79296875, "step": 7499, "time_per_iteration": 2.4950368404388428 }, { "auxiliary_loss_clip": 0.01123385, "auxiliary_loss_mlp": 0.01036248, "balance_loss_clip": 1.02172828, "balance_loss_mlp": 1.04211783, "epoch": 0.45092439500977005, "flos": 23367468700800.0, "grad_norm": 2.1065631491238808, "language_loss": 0.76769072, "learning_rate": 2.412000381939477e-06, "loss": 0.78928703, "num_input_tokens_seen": 160944405, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.8125, "step": 7500, "time_per_iteration": 2.4911210536956787 }, { "auxiliary_loss_clip": 0.01122805, "auxiliary_loss_mlp": 0.01033616, "balance_loss_clip": 1.01942968, "balance_loss_mlp": 1.04357159, "epoch": 0.450984518262438, "flos": 20772958446720.0, "grad_norm": 2.0506597612322257, "language_loss": 0.6270439, "learning_rate": 2.411619265641992e-06, "loss": 0.64860809, "num_input_tokens_seen": 160961345, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.79296875, "step": 7501, "time_per_iteration": 2.48116397857666 }, { "auxiliary_loss_clip": 0.01123965, "auxiliary_loss_mlp": 0.01037179, "balance_loss_clip": 1.02200925, "balance_loss_mlp": 1.04291701, "epoch": 0.451044641515106, "flos": 17707372300800.0, "grad_norm": 14.268579303023898, "language_loss": 0.84156048, "learning_rate": 2.411238133735863e-06, "loss": 0.86317188, "num_input_tokens_seen": 160977330, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80859375, "step": 7502, "time_per_iteration": 2.430582046508789 }, { "auxiliary_loss_clip": 0.01118514, "auxiliary_loss_mlp": 0.01033276, "balance_loss_clip": 1.01991272, "balance_loss_mlp": 1.04240966, "epoch": 0.45110476476777395, "flos": 20594698225920.0, "grad_norm": 3.2963562363526724, "language_loss": 0.79711175, "learning_rate": 2.4108569862355418e-06, "loss": 0.81862962, "num_input_tokens_seen": 160997280, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.76171875, "step": 7503, "time_per_iteration": 2.487684965133667 }, { "auxiliary_loss_clip": 0.01118784, "auxiliary_loss_mlp": 0.01039082, "balance_loss_clip": 1.02530706, "balance_loss_mlp": 1.04276013, "epoch": 0.4511648880204419, "flos": 16034043265920.0, "grad_norm": 2.098061605377893, "language_loss": 0.808025, "learning_rate": 2.410475823155484e-06, "loss": 0.82960367, "num_input_tokens_seen": 161014235, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.76171875, "step": 7504, "time_per_iteration": 2.438831329345703 }, { "auxiliary_loss_clip": 0.01118453, "auxiliary_loss_mlp": 0.010342, "balance_loss_clip": 1.02134347, "balance_loss_mlp": 1.04176235, "epoch": 0.4512250112731099, "flos": 23978811202560.0, "grad_norm": 2.4447592505101965, "language_loss": 0.63744849, "learning_rate": 2.4100946445101405e-06, "loss": 0.65897506, "num_input_tokens_seen": 161032360, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.765625, "step": 7505, "time_per_iteration": 2.489164113998413 }, { "auxiliary_loss_clip": 0.01044371, "auxiliary_loss_mlp": 0.0100193, "balance_loss_clip": 1.00032067, "balance_loss_mlp": 1.01834023, "epoch": 0.45128513452577784, "flos": 71462308037760.0, "grad_norm": 0.834041570435327, "language_loss": 0.58940333, "learning_rate": 2.409713450313968e-06, "loss": 0.60986632, "num_input_tokens_seen": 161091360, "router_z_loss_clip": 0.01611328, "router_z_loss_mlp": 0.25976562, "step": 7506, "time_per_iteration": 3.1560394763946533 }, { "auxiliary_loss_clip": 0.01119776, "auxiliary_loss_mlp": 0.01038152, "balance_loss_clip": 1.02397752, "balance_loss_mlp": 1.04354882, "epoch": 0.4513452577784458, "flos": 22090844448000.0, "grad_norm": 2.72005383289159, "language_loss": 0.79542804, "learning_rate": 2.40933224058142e-06, "loss": 0.8170073, "num_input_tokens_seen": 161110825, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.76171875, "step": 7507, "time_per_iteration": 2.471421003341675 }, { "auxiliary_loss_clip": 0.01121487, "auxiliary_loss_mlp": 0.01031567, "balance_loss_clip": 1.01692832, "balance_loss_mlp": 1.0427115, "epoch": 0.4514053810311138, "flos": 24276403382400.0, "grad_norm": 2.078220609160875, "language_loss": 0.7428472, "learning_rate": 2.4089510153269526e-06, "loss": 0.76437771, "num_input_tokens_seen": 161130685, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 7508, "time_per_iteration": 3.944302558898926 }, { "auxiliary_loss_clip": 0.01119212, "auxiliary_loss_mlp": 0.01034938, "balance_loss_clip": 1.02152705, "balance_loss_mlp": 1.0437243, "epoch": 0.45146550428378174, "flos": 17886781756800.0, "grad_norm": 1.9787814958928858, "language_loss": 0.79010415, "learning_rate": 2.4085697745650217e-06, "loss": 0.81164569, "num_input_tokens_seen": 161147555, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 7509, "time_per_iteration": 3.9688472747802734 }, { "auxiliary_loss_clip": 0.01119575, "auxiliary_loss_mlp": 0.01035449, "balance_loss_clip": 1.02215672, "balance_loss_mlp": 1.04288411, "epoch": 0.4515256275364497, "flos": 24243437675520.0, "grad_norm": 1.8315551740239473, "language_loss": 0.72868407, "learning_rate": 2.4081885183100837e-06, "loss": 0.75023431, "num_input_tokens_seen": 161166255, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.765625, "step": 7510, "time_per_iteration": 3.9351067543029785 }, { "auxiliary_loss_clip": 0.0112057, "auxiliary_loss_mlp": 0.01034367, "balance_loss_clip": 1.01957309, "balance_loss_mlp": 1.04160607, "epoch": 0.45158575078911767, "flos": 20631039811200.0, "grad_norm": 2.080792405427828, "language_loss": 0.76994693, "learning_rate": 2.4078072465765964e-06, "loss": 0.79149634, "num_input_tokens_seen": 161184720, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7890625, "step": 7511, "time_per_iteration": 3.862877607345581 }, { "auxiliary_loss_clip": 0.01120016, "auxiliary_loss_mlp": 0.01036879, "balance_loss_clip": 1.0224961, "balance_loss_mlp": 1.04157996, "epoch": 0.45164587404178563, "flos": 23327751237120.0, "grad_norm": 1.6464096916070912, "language_loss": 0.78835911, "learning_rate": 2.4074259593790174e-06, "loss": 0.80992806, "num_input_tokens_seen": 161204360, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.78515625, "step": 7512, "time_per_iteration": 2.5058727264404297 }, { "auxiliary_loss_clip": 0.01125652, "auxiliary_loss_mlp": 0.01034851, "balance_loss_clip": 1.01989055, "balance_loss_mlp": 1.0433712, "epoch": 0.45170599729445365, "flos": 23805973935360.0, "grad_norm": 2.4183270397600203, "language_loss": 0.87516868, "learning_rate": 2.4070446567318053e-06, "loss": 0.8967737, "num_input_tokens_seen": 161223575, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.8203125, "step": 7513, "time_per_iteration": 2.4865643978118896 }, { "auxiliary_loss_clip": 0.01113954, "auxiliary_loss_mlp": 0.01030198, "balance_loss_clip": 1.01787782, "balance_loss_mlp": 1.04227304, "epoch": 0.4517661205471216, "flos": 23512942782720.0, "grad_norm": 1.760476477610097, "language_loss": 0.66724646, "learning_rate": 2.406663338649419e-06, "loss": 0.68868798, "num_input_tokens_seen": 161243805, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71875, "step": 7514, "time_per_iteration": 2.5247766971588135 }, { "auxiliary_loss_clip": 0.01123804, "auxiliary_loss_mlp": 0.01031695, "balance_loss_clip": 1.01608515, "balance_loss_mlp": 1.04545879, "epoch": 0.4518262437997896, "flos": 23513948363520.0, "grad_norm": 3.5264530734393915, "language_loss": 0.69217181, "learning_rate": 2.406282005146318e-06, "loss": 0.71372682, "num_input_tokens_seen": 161261450, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.78125, "step": 7515, "time_per_iteration": 2.5227739810943604 }, { "auxiliary_loss_clip": 0.01124061, "auxiliary_loss_mlp": 0.01035269, "balance_loss_clip": 1.02084434, "balance_loss_mlp": 1.04259253, "epoch": 0.45188636705245755, "flos": 14568061489920.0, "grad_norm": 4.016133686562595, "language_loss": 0.81701696, "learning_rate": 2.405900656236963e-06, "loss": 0.83861029, "num_input_tokens_seen": 161276965, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8125, "step": 7516, "time_per_iteration": 2.4738783836364746 }, { "auxiliary_loss_clip": 0.01120599, "auxiliary_loss_mlp": 0.01031131, "balance_loss_clip": 1.01771438, "balance_loss_mlp": 1.04528928, "epoch": 0.4519464903051255, "flos": 19901550499200.0, "grad_norm": 4.2492906511021, "language_loss": 0.65345788, "learning_rate": 2.4055192919358137e-06, "loss": 0.67497516, "num_input_tokens_seen": 161295375, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75390625, "step": 7517, "time_per_iteration": 2.4962944984436035 }, { "auxiliary_loss_clip": 0.01117861, "auxiliary_loss_mlp": 0.01026825, "balance_loss_clip": 1.01429653, "balance_loss_mlp": 1.04384518, "epoch": 0.4520066135577935, "flos": 18844376388480.0, "grad_norm": 2.00864073303027, "language_loss": 0.62697053, "learning_rate": 2.405137912257333e-06, "loss": 0.64841741, "num_input_tokens_seen": 161313010, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.73828125, "step": 7518, "time_per_iteration": 2.480177164077759 }, { "auxiliary_loss_clip": 0.01120607, "auxiliary_loss_mlp": 0.0103345, "balance_loss_clip": 1.01993179, "balance_loss_mlp": 1.04350388, "epoch": 0.45206673681046144, "flos": 48214419713280.0, "grad_norm": 1.4677122800013391, "language_loss": 0.59424651, "learning_rate": 2.404756517215982e-06, "loss": 0.61578703, "num_input_tokens_seen": 161336690, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7734375, "step": 7519, "time_per_iteration": 2.717203378677368 }, { "auxiliary_loss_clip": 0.01123752, "auxiliary_loss_mlp": 0.0103579, "balance_loss_clip": 1.02217054, "balance_loss_mlp": 1.04543352, "epoch": 0.4521268600631294, "flos": 23842171866240.0, "grad_norm": 1.512739731947706, "language_loss": 0.72671467, "learning_rate": 2.404375106826223e-06, "loss": 0.74831015, "num_input_tokens_seen": 161357845, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78125, "step": 7520, "time_per_iteration": 2.544304132461548 }, { "auxiliary_loss_clip": 0.01121828, "auxiliary_loss_mlp": 0.0103479, "balance_loss_clip": 1.02159405, "balance_loss_mlp": 1.04418993, "epoch": 0.4521869833157974, "flos": 18843622202880.0, "grad_norm": 1.9536661751828388, "language_loss": 0.75606465, "learning_rate": 2.4039936811025194e-06, "loss": 0.77763081, "num_input_tokens_seen": 161375160, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.77734375, "step": 7521, "time_per_iteration": 2.4597315788269043 }, { "auxiliary_loss_clip": 0.01127144, "auxiliary_loss_mlp": 0.01040478, "balance_loss_clip": 1.0256958, "balance_loss_mlp": 1.04624557, "epoch": 0.45224710656846534, "flos": 19788072456960.0, "grad_norm": 2.018590596005813, "language_loss": 0.67981255, "learning_rate": 2.4036122400593343e-06, "loss": 0.70148879, "num_input_tokens_seen": 161393690, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80859375, "step": 7522, "time_per_iteration": 2.4880666732788086 }, { "auxiliary_loss_clip": 0.01119867, "auxiliary_loss_mlp": 0.01034188, "balance_loss_clip": 1.02108693, "balance_loss_mlp": 1.0435133, "epoch": 0.4523072298211333, "flos": 28256131681920.0, "grad_norm": 1.826764675258877, "language_loss": 0.61505061, "learning_rate": 2.403230783711134e-06, "loss": 0.6365912, "num_input_tokens_seen": 161415015, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.765625, "step": 7523, "time_per_iteration": 2.5317349433898926 }, { "auxiliary_loss_clip": 0.01125866, "auxiliary_loss_mlp": 0.01034271, "balance_loss_clip": 1.01978123, "balance_loss_mlp": 1.04513872, "epoch": 0.45236735307380127, "flos": 11181039511680.0, "grad_norm": 3.1948572043404537, "language_loss": 0.78113592, "learning_rate": 2.4028493120723813e-06, "loss": 0.8027373, "num_input_tokens_seen": 161432940, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.80859375, "step": 7524, "time_per_iteration": 2.4678587913513184 }, { "auxiliary_loss_clip": 0.01122737, "auxiliary_loss_mlp": 0.01036397, "balance_loss_clip": 1.02304542, "balance_loss_mlp": 1.046206, "epoch": 0.45242747632646924, "flos": 22601386408320.0, "grad_norm": 1.7093905921174621, "language_loss": 0.63770318, "learning_rate": 2.4024678251575417e-06, "loss": 0.65929449, "num_input_tokens_seen": 161452215, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.765625, "step": 7525, "time_per_iteration": 2.4745733737945557 }, { "auxiliary_loss_clip": 0.0112164, "auxiliary_loss_mlp": 0.01039499, "balance_loss_clip": 1.02593887, "balance_loss_mlp": 1.04576039, "epoch": 0.45248759957913726, "flos": 18256267008000.0, "grad_norm": 1.526582185119365, "language_loss": 0.78825474, "learning_rate": 2.402086322981083e-06, "loss": 0.80986607, "num_input_tokens_seen": 161469520, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7578125, "step": 7526, "time_per_iteration": 2.4725730419158936 }, { "auxiliary_loss_clip": 0.01122718, "auxiliary_loss_mlp": 0.01032817, "balance_loss_clip": 1.01877999, "balance_loss_mlp": 1.04604602, "epoch": 0.4525477228318052, "flos": 22450094323200.0, "grad_norm": 1.7703319363813543, "language_loss": 0.80608344, "learning_rate": 2.40170480555747e-06, "loss": 0.82763875, "num_input_tokens_seen": 161487335, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 7527, "time_per_iteration": 2.4952821731567383 }, { "auxiliary_loss_clip": 0.01125691, "auxiliary_loss_mlp": 0.01033229, "balance_loss_clip": 1.01930571, "balance_loss_mlp": 1.04863155, "epoch": 0.4526078460844732, "flos": 29644869260160.0, "grad_norm": 2.47200696518537, "language_loss": 0.65305215, "learning_rate": 2.4013232729011706e-06, "loss": 0.67464137, "num_input_tokens_seen": 161510095, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.76953125, "step": 7528, "time_per_iteration": 2.5726702213287354 }, { "auxiliary_loss_clip": 0.01122673, "auxiliary_loss_mlp": 0.01031923, "balance_loss_clip": 1.0186547, "balance_loss_mlp": 1.04824257, "epoch": 0.45266796933714115, "flos": 23039747988480.0, "grad_norm": 1.6099758829077204, "language_loss": 0.75291038, "learning_rate": 2.4009417250266525e-06, "loss": 0.77445632, "num_input_tokens_seen": 161528725, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 7529, "time_per_iteration": 2.4961750507354736 }, { "auxiliary_loss_clip": 0.01127024, "auxiliary_loss_mlp": 0.01030852, "balance_loss_clip": 1.01743484, "balance_loss_mlp": 1.04982996, "epoch": 0.4527280925898091, "flos": 14428405411200.0, "grad_norm": 2.5940764488083587, "language_loss": 0.72846568, "learning_rate": 2.400560161948384e-06, "loss": 0.75004435, "num_input_tokens_seen": 161547195, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7734375, "step": 7530, "time_per_iteration": 2.483781099319458 }, { "auxiliary_loss_clip": 0.01127911, "auxiliary_loss_mlp": 0.0103142, "balance_loss_clip": 1.01864684, "balance_loss_mlp": 1.05011702, "epoch": 0.4527882158424771, "flos": 22925515760640.0, "grad_norm": 1.8350893478539554, "language_loss": 0.7632829, "learning_rate": 2.400178583680834e-06, "loss": 0.78487623, "num_input_tokens_seen": 161565565, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.77734375, "step": 7531, "time_per_iteration": 2.4849584102630615 }, { "auxiliary_loss_clip": 0.01126085, "auxiliary_loss_mlp": 0.0103389, "balance_loss_clip": 1.01978135, "balance_loss_mlp": 1.05161119, "epoch": 0.45284833909514505, "flos": 25555326105600.0, "grad_norm": 1.643792651268986, "language_loss": 0.67016208, "learning_rate": 2.3997969902384717e-06, "loss": 0.69176179, "num_input_tokens_seen": 161586630, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.74609375, "step": 7532, "time_per_iteration": 2.534796953201294 }, { "auxiliary_loss_clip": 0.01125668, "auxiliary_loss_mlp": 0.01034799, "balance_loss_clip": 1.02163219, "balance_loss_mlp": 1.04920363, "epoch": 0.452908462347813, "flos": 18150007599360.0, "grad_norm": 2.5895423773330952, "language_loss": 0.78238523, "learning_rate": 2.399415381635768e-06, "loss": 0.80398989, "num_input_tokens_seen": 161603815, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.765625, "step": 7533, "time_per_iteration": 2.4556190967559814 }, { "auxiliary_loss_clip": 0.01134223, "auxiliary_loss_mlp": 0.0103697, "balance_loss_clip": 1.02165186, "balance_loss_mlp": 1.05034649, "epoch": 0.452968585600481, "flos": 19062749122560.0, "grad_norm": 2.0090611741407107, "language_loss": 0.82867634, "learning_rate": 2.3990337578871927e-06, "loss": 0.85038835, "num_input_tokens_seen": 161622900, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.83984375, "step": 7534, "time_per_iteration": 2.4907495975494385 }, { "auxiliary_loss_clip": 0.01132681, "auxiliary_loss_mlp": 0.01035243, "balance_loss_clip": 1.02078319, "balance_loss_mlp": 1.05240321, "epoch": 0.45302870885314894, "flos": 22051737515520.0, "grad_norm": 1.6223421523507255, "language_loss": 0.76709616, "learning_rate": 2.3986521190072176e-06, "loss": 0.78877532, "num_input_tokens_seen": 161641700, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8046875, "step": 7535, "time_per_iteration": 2.492259979248047 }, { "auxiliary_loss_clip": 0.01130358, "auxiliary_loss_mlp": 0.01033413, "balance_loss_clip": 1.02035928, "balance_loss_mlp": 1.0537132, "epoch": 0.4530888321058169, "flos": 20376217751040.0, "grad_norm": 1.6592237576334143, "language_loss": 0.80744618, "learning_rate": 2.3982704650103138e-06, "loss": 0.82908386, "num_input_tokens_seen": 161661955, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.765625, "step": 7536, "time_per_iteration": 2.529301881790161 }, { "auxiliary_loss_clip": 0.01130373, "auxiliary_loss_mlp": 0.01033689, "balance_loss_clip": 1.02057552, "balance_loss_mlp": 1.05110598, "epoch": 0.4531489553584849, "flos": 14830425406080.0, "grad_norm": 1.7820152299303595, "language_loss": 0.76107872, "learning_rate": 2.3978887959109544e-06, "loss": 0.78271937, "num_input_tokens_seen": 161679245, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.79296875, "step": 7537, "time_per_iteration": 2.449213743209839 }, { "auxiliary_loss_clip": 0.01133811, "auxiliary_loss_mlp": 0.01030672, "balance_loss_clip": 1.01783919, "balance_loss_mlp": 1.05536342, "epoch": 0.45320907861115284, "flos": 21944975316480.0, "grad_norm": 2.1253729819971565, "language_loss": 0.76050633, "learning_rate": 2.3975071117236118e-06, "loss": 0.78215122, "num_input_tokens_seen": 161698795, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.78515625, "step": 7538, "time_per_iteration": 2.5313074588775635 }, { "auxiliary_loss_clip": 0.01066292, "auxiliary_loss_mlp": 0.01009787, "balance_loss_clip": 1.00795078, "balance_loss_mlp": 1.03975964, "epoch": 0.45326920186382086, "flos": 66251455038720.0, "grad_norm": 0.8066980347002739, "language_loss": 0.62427145, "learning_rate": 2.3971254124627593e-06, "loss": 0.64503223, "num_input_tokens_seen": 161761980, "router_z_loss_clip": 0.01831055, "router_z_loss_mlp": 0.265625, "step": 7539, "time_per_iteration": 3.1494762897491455 }, { "auxiliary_loss_clip": 0.01130681, "auxiliary_loss_mlp": 0.01038641, "balance_loss_clip": 1.0250212, "balance_loss_mlp": 1.05365431, "epoch": 0.4533293251164888, "flos": 14684233052160.0, "grad_norm": 2.083666696107654, "language_loss": 0.65972793, "learning_rate": 2.396743698142872e-06, "loss": 0.68142116, "num_input_tokens_seen": 161779455, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.765625, "step": 7540, "time_per_iteration": 2.4901061058044434 }, { "auxiliary_loss_clip": 0.01138276, "auxiliary_loss_mlp": 0.01040924, "balance_loss_clip": 1.02640986, "balance_loss_mlp": 1.05565727, "epoch": 0.4533894483691568, "flos": 22601206840320.0, "grad_norm": 2.46677234009655, "language_loss": 0.85021132, "learning_rate": 2.396361968778424e-06, "loss": 0.87200332, "num_input_tokens_seen": 161798980, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.82421875, "step": 7541, "time_per_iteration": 2.5067145824432373 }, { "auxiliary_loss_clip": 0.01131168, "auxiliary_loss_mlp": 0.01033948, "balance_loss_clip": 1.0205487, "balance_loss_mlp": 1.05212533, "epoch": 0.45344957162182475, "flos": 34751617666560.0, "grad_norm": 2.0532537327225118, "language_loss": 0.76678699, "learning_rate": 2.395980224383889e-06, "loss": 0.78843814, "num_input_tokens_seen": 161819745, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7890625, "step": 7542, "time_per_iteration": 2.6353113651275635 }, { "auxiliary_loss_clip": 0.01132808, "auxiliary_loss_mlp": 0.01029163, "balance_loss_clip": 1.01504302, "balance_loss_mlp": 1.05414844, "epoch": 0.4535096948744927, "flos": 23550218121600.0, "grad_norm": 2.590011273291193, "language_loss": 0.80515051, "learning_rate": 2.395598464973746e-06, "loss": 0.82677019, "num_input_tokens_seen": 161838575, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.78515625, "step": 7543, "time_per_iteration": 2.5280253887176514 }, { "auxiliary_loss_clip": 0.0113398, "auxiliary_loss_mlp": 0.01034924, "balance_loss_clip": 1.0214057, "balance_loss_mlp": 1.05390048, "epoch": 0.4535698181271607, "flos": 25557552748800.0, "grad_norm": 1.6740423002330544, "language_loss": 0.76206291, "learning_rate": 2.395216690562469e-06, "loss": 0.78375196, "num_input_tokens_seen": 161858590, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.796875, "step": 7544, "time_per_iteration": 2.5420589447021484 }, { "auxiliary_loss_clip": 0.0113496, "auxiliary_loss_mlp": 0.01036798, "balance_loss_clip": 1.023453, "balance_loss_mlp": 1.0553329, "epoch": 0.45362994137982865, "flos": 24864117713280.0, "grad_norm": 1.9023040162572227, "language_loss": 0.75350827, "learning_rate": 2.3948349011645355e-06, "loss": 0.77522588, "num_input_tokens_seen": 161878390, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.796875, "step": 7545, "time_per_iteration": 2.5451581478118896 }, { "auxiliary_loss_clip": 0.01134139, "auxiliary_loss_mlp": 0.01028957, "balance_loss_clip": 1.01519394, "balance_loss_mlp": 1.05551577, "epoch": 0.4536900646324966, "flos": 30806794408320.0, "grad_norm": 1.8746816630016052, "language_loss": 0.72383171, "learning_rate": 2.394453096794423e-06, "loss": 0.74546266, "num_input_tokens_seen": 161898610, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.78515625, "step": 7546, "time_per_iteration": 2.5648558139801025 }, { "auxiliary_loss_clip": 0.01134721, "auxiliary_loss_mlp": 0.01031694, "balance_loss_clip": 1.0174067, "balance_loss_mlp": 1.05341721, "epoch": 0.4537501878851646, "flos": 23404313076480.0, "grad_norm": 1.599822323739814, "language_loss": 0.75434947, "learning_rate": 2.394071277466609e-06, "loss": 0.77601361, "num_input_tokens_seen": 161918210, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8125, "step": 7547, "time_per_iteration": 2.497183084487915 }, { "auxiliary_loss_clip": 0.01133855, "auxiliary_loss_mlp": 0.01031294, "balance_loss_clip": 1.01739478, "balance_loss_mlp": 1.05415297, "epoch": 0.45381031113783254, "flos": 18149289327360.0, "grad_norm": 4.150028139612107, "language_loss": 0.69695926, "learning_rate": 2.393689443195573e-06, "loss": 0.71861076, "num_input_tokens_seen": 161936950, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.796875, "step": 7548, "time_per_iteration": 2.5191845893859863 }, { "auxiliary_loss_clip": 0.01128183, "auxiliary_loss_mlp": 0.01038658, "balance_loss_clip": 1.02580094, "balance_loss_mlp": 1.04959464, "epoch": 0.4538704343905005, "flos": 25336666062720.0, "grad_norm": 2.1963634759049455, "language_loss": 0.72256219, "learning_rate": 2.393307593995794e-06, "loss": 0.74423051, "num_input_tokens_seen": 161955550, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.78515625, "step": 7549, "time_per_iteration": 3.9948201179504395 }, { "auxiliary_loss_clip": 0.01127981, "auxiliary_loss_mlp": 0.01027933, "balance_loss_clip": 1.01496339, "balance_loss_mlp": 1.05074501, "epoch": 0.4539305576431685, "flos": 28731445378560.0, "grad_norm": 2.9987203628341095, "language_loss": 0.64894748, "learning_rate": 2.392925729881751e-06, "loss": 0.67050666, "num_input_tokens_seen": 161976760, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7734375, "step": 7550, "time_per_iteration": 2.5556013584136963 }, { "auxiliary_loss_clip": 0.01129359, "auxiliary_loss_mlp": 0.01035712, "balance_loss_clip": 1.02246225, "balance_loss_mlp": 1.05217481, "epoch": 0.45399068089583644, "flos": 22492397566080.0, "grad_norm": 1.9995251111872703, "language_loss": 0.69228685, "learning_rate": 2.3925438508679263e-06, "loss": 0.71393758, "num_input_tokens_seen": 161996120, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7734375, "step": 7551, "time_per_iteration": 5.340280532836914 }, { "auxiliary_loss_clip": 0.0112929, "auxiliary_loss_mlp": 0.01033976, "balance_loss_clip": 1.01968241, "balance_loss_mlp": 1.04994535, "epoch": 0.45405080414850446, "flos": 12893403651840.0, "grad_norm": 2.3463455600641336, "language_loss": 0.79384255, "learning_rate": 2.392161956968798e-06, "loss": 0.81547523, "num_input_tokens_seen": 162011125, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.79296875, "step": 7552, "time_per_iteration": 2.4802932739257812 }, { "auxiliary_loss_clip": 0.01059793, "auxiliary_loss_mlp": 0.01003506, "balance_loss_clip": 1.00174141, "balance_loss_mlp": 1.0335834, "epoch": 0.4541109274011724, "flos": 59766919724160.0, "grad_norm": 0.8107509395183549, "language_loss": 0.57828927, "learning_rate": 2.39178004819885e-06, "loss": 0.59892231, "num_input_tokens_seen": 162068705, "router_z_loss_clip": 0.0177002, "router_z_loss_mlp": 0.26171875, "step": 7553, "time_per_iteration": 4.380093812942505 }, { "auxiliary_loss_clip": 0.01126756, "auxiliary_loss_mlp": 0.01033105, "balance_loss_clip": 1.02030826, "balance_loss_mlp": 1.04969311, "epoch": 0.4541710506538404, "flos": 28511743841280.0, "grad_norm": 1.492572689542602, "language_loss": 0.76855069, "learning_rate": 2.3913981245725626e-06, "loss": 0.79014933, "num_input_tokens_seen": 162089655, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7734375, "step": 7554, "time_per_iteration": 2.543915033340454 }, { "auxiliary_loss_clip": 0.01131967, "auxiliary_loss_mlp": 0.01032756, "balance_loss_clip": 1.01790237, "balance_loss_mlp": 1.0503943, "epoch": 0.45423117390650836, "flos": 17675591742720.0, "grad_norm": 3.020133788840276, "language_loss": 0.76514375, "learning_rate": 2.3910161861044194e-06, "loss": 0.78679091, "num_input_tokens_seen": 162108465, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.81640625, "step": 7555, "time_per_iteration": 2.4839680194854736 }, { "auxiliary_loss_clip": 0.01127032, "auxiliary_loss_mlp": 0.01035879, "balance_loss_clip": 1.02242029, "balance_loss_mlp": 1.05033863, "epoch": 0.4542912971591763, "flos": 28072556248320.0, "grad_norm": 1.3108607813952606, "language_loss": 0.72520471, "learning_rate": 2.390634232808903e-06, "loss": 0.7468338, "num_input_tokens_seen": 162129910, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.765625, "step": 7556, "time_per_iteration": 2.560303211212158 }, { "auxiliary_loss_clip": 0.01132519, "auxiliary_loss_mlp": 0.01037322, "balance_loss_clip": 1.02285016, "balance_loss_mlp": 1.0506537, "epoch": 0.4543514204118443, "flos": 22671771108480.0, "grad_norm": 2.295145609943709, "language_loss": 0.63214767, "learning_rate": 2.3902522647004982e-06, "loss": 0.65384614, "num_input_tokens_seen": 162148840, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8203125, "step": 7557, "time_per_iteration": 2.51096510887146 }, { "auxiliary_loss_clip": 0.01057243, "auxiliary_loss_mlp": 0.01001996, "balance_loss_clip": 1.00023198, "balance_loss_mlp": 1.03085279, "epoch": 0.45441154366451225, "flos": 58216549921920.0, "grad_norm": 0.6840442715742788, "language_loss": 0.5763424, "learning_rate": 2.3898702817936875e-06, "loss": 0.5969348, "num_input_tokens_seen": 162208500, "router_z_loss_clip": 0.0177002, "router_z_loss_mlp": 0.26367188, "step": 7558, "time_per_iteration": 3.0161356925964355 }, { "auxiliary_loss_clip": 0.0113191, "auxiliary_loss_mlp": 0.01032269, "balance_loss_clip": 1.01725435, "balance_loss_mlp": 1.051283, "epoch": 0.4544716669171802, "flos": 16764286763520.0, "grad_norm": 3.2698488673969885, "language_loss": 0.56292534, "learning_rate": 2.3894882841029573e-06, "loss": 0.58456713, "num_input_tokens_seen": 162224650, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8046875, "step": 7559, "time_per_iteration": 2.4889285564422607 }, { "auxiliary_loss_clip": 0.01128869, "auxiliary_loss_mlp": 0.01034637, "balance_loss_clip": 1.02017677, "balance_loss_mlp": 1.05045474, "epoch": 0.4545317901698482, "flos": 15925233991680.0, "grad_norm": 2.701890394946194, "language_loss": 0.71657211, "learning_rate": 2.389106271642792e-06, "loss": 0.73820722, "num_input_tokens_seen": 162242930, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 7560, "time_per_iteration": 2.454434871673584 }, { "auxiliary_loss_clip": 0.01133547, "auxiliary_loss_mlp": 0.01034905, "balance_loss_clip": 1.02052212, "balance_loss_mlp": 1.05117381, "epoch": 0.45459191342251615, "flos": 17639752947840.0, "grad_norm": 2.069002163803806, "language_loss": 0.68921292, "learning_rate": 2.3887242444276775e-06, "loss": 0.71089745, "num_input_tokens_seen": 162261455, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.82421875, "step": 7561, "time_per_iteration": 2.481351613998413 }, { "auxiliary_loss_clip": 0.01123111, "auxiliary_loss_mlp": 0.01033937, "balance_loss_clip": 1.02138448, "balance_loss_mlp": 1.04748571, "epoch": 0.4546520366751841, "flos": 16176608346240.0, "grad_norm": 1.9412158426865271, "language_loss": 0.85037094, "learning_rate": 2.3883422024721015e-06, "loss": 0.87194145, "num_input_tokens_seen": 162279725, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.75390625, "step": 7562, "time_per_iteration": 2.456087350845337 }, { "auxiliary_loss_clip": 0.01124598, "auxiliary_loss_mlp": 0.01031086, "balance_loss_clip": 1.01784217, "balance_loss_mlp": 1.04901314, "epoch": 0.4547121599278521, "flos": 19751443562880.0, "grad_norm": 2.027352143792716, "language_loss": 0.89150524, "learning_rate": 2.38796014579055e-06, "loss": 0.91306204, "num_input_tokens_seen": 162297865, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7578125, "step": 7563, "time_per_iteration": 2.499505043029785 }, { "auxiliary_loss_clip": 0.01125588, "auxiliary_loss_mlp": 0.01036438, "balance_loss_clip": 1.02219892, "balance_loss_mlp": 1.04665852, "epoch": 0.45477228318052004, "flos": 19937461121280.0, "grad_norm": 1.9337944323344065, "language_loss": 0.7134192, "learning_rate": 2.3875780743975097e-06, "loss": 0.73503947, "num_input_tokens_seen": 162316010, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 7564, "time_per_iteration": 2.50276255607605 }, { "auxiliary_loss_clip": 0.01126499, "auxiliary_loss_mlp": 0.01034461, "balance_loss_clip": 1.02052522, "balance_loss_mlp": 1.04660821, "epoch": 0.454832406433188, "flos": 21288312829440.0, "grad_norm": 2.0338043649674074, "language_loss": 0.67980355, "learning_rate": 2.3871959883074713e-06, "loss": 0.70141315, "num_input_tokens_seen": 162336115, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.796875, "step": 7565, "time_per_iteration": 2.5008862018585205 }, { "auxiliary_loss_clip": 0.01123501, "auxiliary_loss_mlp": 0.01032402, "balance_loss_clip": 1.01865101, "balance_loss_mlp": 1.04667938, "epoch": 0.45489252968585603, "flos": 24498726612480.0, "grad_norm": 1.9004675354978078, "language_loss": 0.80412167, "learning_rate": 2.386813887534922e-06, "loss": 0.82568067, "num_input_tokens_seen": 162355705, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.76953125, "step": 7566, "time_per_iteration": 2.512718439102173 }, { "auxiliary_loss_clip": 0.01125854, "auxiliary_loss_mlp": 0.01031796, "balance_loss_clip": 1.01661432, "balance_loss_mlp": 1.04606843, "epoch": 0.454952652938524, "flos": 17092474352640.0, "grad_norm": 1.7856265142521695, "language_loss": 0.73980105, "learning_rate": 2.3864317720943508e-06, "loss": 0.76137757, "num_input_tokens_seen": 162374055, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.796875, "step": 7567, "time_per_iteration": 2.485386610031128 }, { "auxiliary_loss_clip": 0.01129018, "auxiliary_loss_mlp": 0.01034014, "balance_loss_clip": 1.01958978, "balance_loss_mlp": 1.04860878, "epoch": 0.45501277619119196, "flos": 27630387826560.0, "grad_norm": 3.0121725884204182, "language_loss": 0.81012696, "learning_rate": 2.386049642000249e-06, "loss": 0.83175725, "num_input_tokens_seen": 162393560, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8046875, "step": 7568, "time_per_iteration": 2.5487115383148193 }, { "auxiliary_loss_clip": 0.01129037, "auxiliary_loss_mlp": 0.01042993, "balance_loss_clip": 1.02713811, "balance_loss_mlp": 1.04682148, "epoch": 0.4550728994438599, "flos": 19974664632960.0, "grad_norm": 1.976924506355935, "language_loss": 0.79445422, "learning_rate": 2.3856674972671055e-06, "loss": 0.81617451, "num_input_tokens_seen": 162413170, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8203125, "step": 7569, "time_per_iteration": 2.4965624809265137 }, { "auxiliary_loss_clip": 0.01129111, "auxiliary_loss_mlp": 0.01032832, "balance_loss_clip": 1.01746631, "balance_loss_mlp": 1.04789138, "epoch": 0.4551330226965279, "flos": 26066873646720.0, "grad_norm": 1.48506495846087, "language_loss": 0.7501775, "learning_rate": 2.385285337909412e-06, "loss": 0.77179694, "num_input_tokens_seen": 162434080, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.8125, "step": 7570, "time_per_iteration": 2.526153564453125 }, { "auxiliary_loss_clip": 0.01124401, "auxiliary_loss_mlp": 0.01036579, "balance_loss_clip": 1.02235746, "balance_loss_mlp": 1.04785168, "epoch": 0.45519314594919585, "flos": 32781091501440.0, "grad_norm": 1.959250554023603, "language_loss": 0.74597049, "learning_rate": 2.3849031639416596e-06, "loss": 0.76758027, "num_input_tokens_seen": 162455445, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.765625, "step": 7571, "time_per_iteration": 2.6013991832733154 }, { "auxiliary_loss_clip": 0.01120448, "auxiliary_loss_mlp": 0.01031427, "balance_loss_clip": 1.01784909, "balance_loss_mlp": 1.04667687, "epoch": 0.4552532692018638, "flos": 19172671718400.0, "grad_norm": 2.1712622921579166, "language_loss": 0.80975926, "learning_rate": 2.3845209753783414e-06, "loss": 0.83127809, "num_input_tokens_seen": 162474940, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73828125, "step": 7572, "time_per_iteration": 2.479978561401367 }, { "auxiliary_loss_clip": 0.01130889, "auxiliary_loss_mlp": 0.01039661, "balance_loss_clip": 1.02404523, "balance_loss_mlp": 1.04940724, "epoch": 0.4553133924545318, "flos": 26027156183040.0, "grad_norm": 1.8679462075733624, "language_loss": 0.73072636, "learning_rate": 2.3841387722339486e-06, "loss": 0.75243187, "num_input_tokens_seen": 162493340, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8125, "step": 7573, "time_per_iteration": 2.5356640815734863 }, { "auxiliary_loss_clip": 0.01130437, "auxiliary_loss_mlp": 0.01035171, "balance_loss_clip": 1.01893497, "balance_loss_mlp": 1.04852366, "epoch": 0.45537351570719975, "flos": 30661535808000.0, "grad_norm": 3.7215492238681853, "language_loss": 0.74331546, "learning_rate": 2.3837565545229748e-06, "loss": 0.76497155, "num_input_tokens_seen": 162514360, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.81640625, "step": 7574, "time_per_iteration": 2.5597336292266846 }, { "auxiliary_loss_clip": 0.01125958, "auxiliary_loss_mlp": 0.01030827, "balance_loss_clip": 1.01637256, "balance_loss_mlp": 1.04601431, "epoch": 0.4554336389598677, "flos": 24353396184960.0, "grad_norm": 1.97735366214388, "language_loss": 0.71452695, "learning_rate": 2.383374322259915e-06, "loss": 0.73609483, "num_input_tokens_seen": 162535240, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.796875, "step": 7575, "time_per_iteration": 2.5406007766723633 }, { "auxiliary_loss_clip": 0.01124261, "auxiliary_loss_mlp": 0.01030376, "balance_loss_clip": 1.01627922, "balance_loss_mlp": 1.04576862, "epoch": 0.4554937622125357, "flos": 20557925677440.0, "grad_norm": 2.002139596178647, "language_loss": 0.73289269, "learning_rate": 2.3829920754592617e-06, "loss": 0.754439, "num_input_tokens_seen": 162553880, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78515625, "step": 7576, "time_per_iteration": 2.5314979553222656 }, { "auxiliary_loss_clip": 0.01123144, "auxiliary_loss_mlp": 0.01033756, "balance_loss_clip": 1.01931977, "balance_loss_mlp": 1.04702103, "epoch": 0.45555388546520365, "flos": 22820764723200.0, "grad_norm": 1.9088931829498794, "language_loss": 0.66269845, "learning_rate": 2.382609814135511e-06, "loss": 0.68426746, "num_input_tokens_seen": 162574485, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76171875, "step": 7577, "time_per_iteration": 2.4943363666534424 }, { "auxiliary_loss_clip": 0.01126066, "auxiliary_loss_mlp": 0.01043467, "balance_loss_clip": 1.02700996, "balance_loss_mlp": 1.04774201, "epoch": 0.4556140087178716, "flos": 21725992051200.0, "grad_norm": 3.461320982412079, "language_loss": 0.74064833, "learning_rate": 2.382227538303157e-06, "loss": 0.76234365, "num_input_tokens_seen": 162595130, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.78125, "step": 7578, "time_per_iteration": 2.4931130409240723 }, { "auxiliary_loss_clip": 0.01125513, "auxiliary_loss_mlp": 0.01028195, "balance_loss_clip": 1.0142653, "balance_loss_mlp": 1.04824173, "epoch": 0.45567413197053963, "flos": 25994513698560.0, "grad_norm": 1.9227798026136282, "language_loss": 0.70333433, "learning_rate": 2.381845247976697e-06, "loss": 0.72487134, "num_input_tokens_seen": 162615720, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7734375, "step": 7579, "time_per_iteration": 2.549664258956909 }, { "auxiliary_loss_clip": 0.01121667, "auxiliary_loss_mlp": 0.01034536, "balance_loss_clip": 1.02074981, "balance_loss_mlp": 1.04542089, "epoch": 0.4557342552232076, "flos": 21537604195200.0, "grad_norm": 1.7738874132135607, "language_loss": 0.78205329, "learning_rate": 2.381462943170627e-06, "loss": 0.80361533, "num_input_tokens_seen": 162635825, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76171875, "step": 7580, "time_per_iteration": 2.484677791595459 }, { "auxiliary_loss_clip": 0.01125857, "auxiliary_loss_mlp": 0.01029292, "balance_loss_clip": 1.01479626, "balance_loss_mlp": 1.04869962, "epoch": 0.45579437847587556, "flos": 40001972647680.0, "grad_norm": 2.2112164873446583, "language_loss": 0.69005877, "learning_rate": 2.381080623899444e-06, "loss": 0.7116102, "num_input_tokens_seen": 162659130, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 7581, "time_per_iteration": 2.646998405456543 }, { "auxiliary_loss_clip": 0.01122239, "auxiliary_loss_mlp": 0.01029779, "balance_loss_clip": 1.01641583, "balance_loss_mlp": 1.04567361, "epoch": 0.4558545017285435, "flos": 31138501530240.0, "grad_norm": 1.7130419969863235, "language_loss": 0.73106164, "learning_rate": 2.3806982901776455e-06, "loss": 0.75258183, "num_input_tokens_seen": 162681665, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.765625, "step": 7582, "time_per_iteration": 2.5561537742614746 }, { "auxiliary_loss_clip": 0.0112895, "auxiliary_loss_mlp": 0.01047284, "balance_loss_clip": 1.03140521, "balance_loss_mlp": 1.04828191, "epoch": 0.4559146249812115, "flos": 21725776569600.0, "grad_norm": 1.9100950812149187, "language_loss": 0.72536403, "learning_rate": 2.380315942019729e-06, "loss": 0.74712634, "num_input_tokens_seen": 162702040, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8046875, "step": 7583, "time_per_iteration": 2.508772850036621 }, { "auxiliary_loss_clip": 0.01130778, "auxiliary_loss_mlp": 0.01035709, "balance_loss_clip": 1.02140939, "balance_loss_mlp": 1.04864192, "epoch": 0.45597474823387946, "flos": 23805973935360.0, "grad_norm": 3.110388162156993, "language_loss": 0.72539854, "learning_rate": 2.379933579440195e-06, "loss": 0.7470634, "num_input_tokens_seen": 162722375, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.8203125, "step": 7584, "time_per_iteration": 2.530773401260376 }, { "auxiliary_loss_clip": 0.01124552, "auxiliary_loss_mlp": 0.01032892, "balance_loss_clip": 1.01855707, "balance_loss_mlp": 1.0477823, "epoch": 0.4560348714865474, "flos": 31905661230720.0, "grad_norm": 1.5550842031142624, "language_loss": 0.67744195, "learning_rate": 2.379551202453541e-06, "loss": 0.69901633, "num_input_tokens_seen": 162746095, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.765625, "step": 7585, "time_per_iteration": 2.5679824352264404 }, { "auxiliary_loss_clip": 0.01123597, "auxiliary_loss_mlp": 0.01030227, "balance_loss_clip": 1.01675618, "balance_loss_mlp": 1.04587996, "epoch": 0.4560949947392154, "flos": 22048828513920.0, "grad_norm": 1.577818618551849, "language_loss": 0.76066512, "learning_rate": 2.379168811074267e-06, "loss": 0.78220338, "num_input_tokens_seen": 162766330, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.77734375, "step": 7586, "time_per_iteration": 2.4803874492645264 }, { "auxiliary_loss_clip": 0.01124099, "auxiliary_loss_mlp": 0.01031469, "balance_loss_clip": 1.01901758, "balance_loss_mlp": 1.04750633, "epoch": 0.45615511799188335, "flos": 24571804832640.0, "grad_norm": 1.9830854563147384, "language_loss": 0.78168607, "learning_rate": 2.3787864053168747e-06, "loss": 0.80324173, "num_input_tokens_seen": 162784755, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.765625, "step": 7587, "time_per_iteration": 2.519685745239258 }, { "auxiliary_loss_clip": 0.0112836, "auxiliary_loss_mlp": 0.01042386, "balance_loss_clip": 1.02756882, "balance_loss_mlp": 1.04471612, "epoch": 0.4562152412445513, "flos": 18330709944960.0, "grad_norm": 2.1212913609547566, "language_loss": 0.68938243, "learning_rate": 2.378403985195863e-06, "loss": 0.71108985, "num_input_tokens_seen": 162803850, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8359375, "step": 7588, "time_per_iteration": 2.462214708328247 }, { "auxiliary_loss_clip": 0.01123883, "auxiliary_loss_mlp": 0.01031088, "balance_loss_clip": 1.01771867, "balance_loss_mlp": 1.04740167, "epoch": 0.4562753644972193, "flos": 13516525814400.0, "grad_norm": 1.8437593912827437, "language_loss": 0.7948541, "learning_rate": 2.378021550725735e-06, "loss": 0.81640381, "num_input_tokens_seen": 162820775, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.765625, "step": 7589, "time_per_iteration": 2.463594436645508 }, { "auxiliary_loss_clip": 0.01126669, "auxiliary_loss_mlp": 0.01037271, "balance_loss_clip": 1.02273965, "balance_loss_mlp": 1.04804349, "epoch": 0.45633548774988725, "flos": 29639697701760.0, "grad_norm": 3.610760092013607, "language_loss": 0.62409616, "learning_rate": 2.377639101920992e-06, "loss": 0.64573556, "num_input_tokens_seen": 162839695, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78515625, "step": 7590, "time_per_iteration": 2.53135085105896 }, { "auxiliary_loss_clip": 0.01123101, "auxiliary_loss_mlp": 0.01037276, "balance_loss_clip": 1.02390671, "balance_loss_mlp": 1.04481006, "epoch": 0.4563956110025552, "flos": 22233409528320.0, "grad_norm": 2.2271215925833783, "language_loss": 0.73133296, "learning_rate": 2.377256638796135e-06, "loss": 0.75293672, "num_input_tokens_seen": 162856095, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.78515625, "step": 7591, "time_per_iteration": 3.941211462020874 }, { "auxiliary_loss_clip": 0.0112945, "auxiliary_loss_mlp": 0.01043813, "balance_loss_clip": 1.02898908, "balance_loss_mlp": 1.04965496, "epoch": 0.45645573425522323, "flos": 17092043389440.0, "grad_norm": 2.439169699184272, "language_loss": 0.77195638, "learning_rate": 2.3768741613656695e-06, "loss": 0.79368901, "num_input_tokens_seen": 162874070, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.796875, "step": 7592, "time_per_iteration": 2.485959053039551 }, { "auxiliary_loss_clip": 0.01127038, "auxiliary_loss_mlp": 0.0103218, "balance_loss_clip": 1.01824999, "balance_loss_mlp": 1.04876232, "epoch": 0.4565158575078912, "flos": 20332334309760.0, "grad_norm": 2.89018169999484, "language_loss": 0.69473475, "learning_rate": 2.376491669644098e-06, "loss": 0.71632689, "num_input_tokens_seen": 162891000, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 7593, "time_per_iteration": 3.91548490524292 }, { "auxiliary_loss_clip": 0.01118674, "auxiliary_loss_mlp": 0.010286, "balance_loss_clip": 1.01620781, "balance_loss_mlp": 1.04412496, "epoch": 0.45657598076055916, "flos": 23983013093760.0, "grad_norm": 1.8912262747253936, "language_loss": 0.84174824, "learning_rate": 2.3761091636459248e-06, "loss": 0.86322099, "num_input_tokens_seen": 162910120, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.74609375, "step": 7594, "time_per_iteration": 3.9710750579833984 }, { "auxiliary_loss_clip": 0.01051792, "auxiliary_loss_mlp": 0.01002163, "balance_loss_clip": 1.00029182, "balance_loss_mlp": 1.02567339, "epoch": 0.45663610401322713, "flos": 69364297526400.0, "grad_norm": 0.791611055844838, "language_loss": 0.52703565, "learning_rate": 2.375726643385654e-06, "loss": 0.54757518, "num_input_tokens_seen": 162963720, "router_z_loss_clip": 0.01867676, "router_z_loss_mlp": 0.26171875, "step": 7595, "time_per_iteration": 3.124805450439453 }, { "auxiliary_loss_clip": 0.0112837, "auxiliary_loss_mlp": 0.01034865, "balance_loss_clip": 1.02022648, "balance_loss_mlp": 1.04720354, "epoch": 0.4566962272658951, "flos": 15149095891200.0, "grad_norm": 2.4400377910415836, "language_loss": 0.87083459, "learning_rate": 2.3753441088777915e-06, "loss": 0.8924669, "num_input_tokens_seen": 162975760, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8125, "step": 7596, "time_per_iteration": 2.446427583694458 }, { "auxiliary_loss_clip": 0.01127001, "auxiliary_loss_mlp": 0.01047876, "balance_loss_clip": 1.03371394, "balance_loss_mlp": 1.04731584, "epoch": 0.45675635051856306, "flos": 18697465762560.0, "grad_norm": 1.835684863874188, "language_loss": 0.77047712, "learning_rate": 2.374961560136843e-06, "loss": 0.79222596, "num_input_tokens_seen": 162994865, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.796875, "step": 7597, "time_per_iteration": 2.4934990406036377 }, { "auxiliary_loss_clip": 0.01125616, "auxiliary_loss_mlp": 0.01036213, "balance_loss_clip": 1.02140141, "balance_loss_mlp": 1.04617572, "epoch": 0.456816473771231, "flos": 19098300608640.0, "grad_norm": 2.4692147802312068, "language_loss": 0.78396088, "learning_rate": 2.374578997177314e-06, "loss": 0.80557919, "num_input_tokens_seen": 163014730, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.79296875, "step": 7598, "time_per_iteration": 2.478209972381592 }, { "auxiliary_loss_clip": 0.01125582, "auxiliary_loss_mlp": 0.01034565, "balance_loss_clip": 1.02073097, "balance_loss_mlp": 1.04724121, "epoch": 0.456876597023899, "flos": 28950069507840.0, "grad_norm": 2.3506542362699867, "language_loss": 0.71377021, "learning_rate": 2.374196420013712e-06, "loss": 0.73537171, "num_input_tokens_seen": 163033405, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 7599, "time_per_iteration": 2.5363807678222656 }, { "auxiliary_loss_clip": 0.01123402, "auxiliary_loss_mlp": 0.0104184, "balance_loss_clip": 1.02743983, "balance_loss_mlp": 1.04572856, "epoch": 0.45693672027656695, "flos": 23289470317440.0, "grad_norm": 2.3864958836160097, "language_loss": 0.69280303, "learning_rate": 2.373813828660544e-06, "loss": 0.71445549, "num_input_tokens_seen": 163051400, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.77734375, "step": 7600, "time_per_iteration": 2.482363700866699 }, { "auxiliary_loss_clip": 0.01125564, "auxiliary_loss_mlp": 0.01039801, "balance_loss_clip": 1.02632999, "balance_loss_mlp": 1.04696965, "epoch": 0.4569968435292349, "flos": 20558212986240.0, "grad_norm": 2.6411083567416207, "language_loss": 0.78607094, "learning_rate": 2.373431223132319e-06, "loss": 0.80772465, "num_input_tokens_seen": 163069250, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.78515625, "step": 7601, "time_per_iteration": 2.483501672744751 }, { "auxiliary_loss_clip": 0.01126943, "auxiliary_loss_mlp": 0.01041795, "balance_loss_clip": 1.02838993, "balance_loss_mlp": 1.04661083, "epoch": 0.4570569667819029, "flos": 41282619223680.0, "grad_norm": 1.9961525696950595, "language_loss": 0.71288329, "learning_rate": 2.3730486034435448e-06, "loss": 0.73457074, "num_input_tokens_seen": 163091755, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.8046875, "step": 7602, "time_per_iteration": 2.646338701248169 }, { "auxiliary_loss_clip": 0.01127344, "auxiliary_loss_mlp": 0.01037152, "balance_loss_clip": 1.02150035, "balance_loss_mlp": 1.04817116, "epoch": 0.45711709003457085, "flos": 26031573555840.0, "grad_norm": 1.7937130500134941, "language_loss": 0.73255885, "learning_rate": 2.372665969608729e-06, "loss": 0.75420386, "num_input_tokens_seen": 163111600, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.7890625, "step": 7603, "time_per_iteration": 2.53609299659729 }, { "auxiliary_loss_clip": 0.01127013, "auxiliary_loss_mlp": 0.01042181, "balance_loss_clip": 1.0263741, "balance_loss_mlp": 1.0474236, "epoch": 0.4571772132872388, "flos": 22158068751360.0, "grad_norm": 2.080560995896065, "language_loss": 0.82794321, "learning_rate": 2.372283321642383e-06, "loss": 0.84963518, "num_input_tokens_seen": 163127350, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.796875, "step": 7604, "time_per_iteration": 2.468527317047119 }, { "auxiliary_loss_clip": 0.01134148, "auxiliary_loss_mlp": 0.01047646, "balance_loss_clip": 1.03195786, "balance_loss_mlp": 1.05068779, "epoch": 0.45723733653990684, "flos": 23878872587520.0, "grad_norm": 2.5825272151798115, "language_loss": 0.85765237, "learning_rate": 2.371900659559016e-06, "loss": 0.87947035, "num_input_tokens_seen": 163145855, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.8359375, "step": 7605, "time_per_iteration": 2.5271737575531006 }, { "auxiliary_loss_clip": 0.01127773, "auxiliary_loss_mlp": 0.0103863, "balance_loss_clip": 1.02359748, "balance_loss_mlp": 1.0465312, "epoch": 0.4572974597925748, "flos": 16871803148160.0, "grad_norm": 3.0219575814429307, "language_loss": 0.73803234, "learning_rate": 2.371517983373138e-06, "loss": 0.75969636, "num_input_tokens_seen": 163163830, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8125, "step": 7606, "time_per_iteration": -0.19060182571411133 }, { "auxiliary_loss_clip": 0.01128456, "auxiliary_loss_mlp": 0.01040092, "balance_loss_clip": 1.02494061, "balance_loss_mlp": 1.04793894, "epoch": 0.45735758304524277, "flos": 13771491528960.0, "grad_norm": 3.0613882420117142, "language_loss": 0.80411857, "learning_rate": 2.371135293099262e-06, "loss": 0.82580411, "num_input_tokens_seen": 163180700, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8046875, "step": 7607, "time_per_iteration": 2.494107246398926 }, { "auxiliary_loss_clip": 0.01130653, "auxiliary_loss_mlp": 0.01039293, "balance_loss_clip": 1.02471399, "balance_loss_mlp": 1.05157351, "epoch": 0.45741770629791073, "flos": 21100750986240.0, "grad_norm": 1.9218304111443167, "language_loss": 0.80942047, "learning_rate": 2.3707525887518982e-06, "loss": 0.83111989, "num_input_tokens_seen": 163199450, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7890625, "step": 7608, "time_per_iteration": 2.4771885871887207 }, { "auxiliary_loss_clip": 0.01126307, "auxiliary_loss_mlp": 0.01040155, "balance_loss_clip": 1.0246098, "balance_loss_mlp": 1.04622602, "epoch": 0.4574778295505787, "flos": 23112898035840.0, "grad_norm": 1.731655049468128, "language_loss": 0.68066633, "learning_rate": 2.370369870345559e-06, "loss": 0.70233095, "num_input_tokens_seen": 163217875, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.80078125, "step": 7609, "time_per_iteration": 2.499425172805786 }, { "auxiliary_loss_clip": 0.01123998, "auxiliary_loss_mlp": 0.0104029, "balance_loss_clip": 1.0256865, "balance_loss_mlp": 1.04552984, "epoch": 0.45753795280324666, "flos": 24352929308160.0, "grad_norm": 2.3055256067600274, "language_loss": 0.80965608, "learning_rate": 2.369987137894757e-06, "loss": 0.83129895, "num_input_tokens_seen": 163237430, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78515625, "step": 7610, "time_per_iteration": 2.527113914489746 }, { "auxiliary_loss_clip": 0.01127757, "auxiliary_loss_mlp": 0.01038829, "balance_loss_clip": 1.02396333, "balance_loss_mlp": 1.04695523, "epoch": 0.4575980760559146, "flos": 16653789550080.0, "grad_norm": 4.614879443811611, "language_loss": 0.82613617, "learning_rate": 2.3696043914140057e-06, "loss": 0.84780204, "num_input_tokens_seen": 163253905, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80859375, "step": 7611, "time_per_iteration": 2.50434947013855 }, { "auxiliary_loss_clip": 0.01128605, "auxiliary_loss_mlp": 0.01035836, "balance_loss_clip": 1.02036214, "balance_loss_mlp": 1.04909897, "epoch": 0.4576581993085826, "flos": 35911423912320.0, "grad_norm": 1.9469943688908264, "language_loss": 0.73601449, "learning_rate": 2.369221630917819e-06, "loss": 0.7576589, "num_input_tokens_seen": 163274285, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.79296875, "step": 7612, "time_per_iteration": 2.6119751930236816 }, { "auxiliary_loss_clip": 0.01122617, "auxiliary_loss_mlp": 0.0103465, "balance_loss_clip": 1.01997566, "balance_loss_mlp": 1.04409218, "epoch": 0.45771832256125056, "flos": 20080421251200.0, "grad_norm": 1.5399369068946862, "language_loss": 0.84885037, "learning_rate": 2.368838856420711e-06, "loss": 0.87042296, "num_input_tokens_seen": 163293150, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78515625, "step": 7613, "time_per_iteration": 2.505596160888672 }, { "auxiliary_loss_clip": 0.0112645, "auxiliary_loss_mlp": 0.01032905, "balance_loss_clip": 1.01819479, "balance_loss_mlp": 1.04686654, "epoch": 0.4577784458139185, "flos": 10744329957120.0, "grad_norm": 3.3516553765309727, "language_loss": 0.7595222, "learning_rate": 2.3684560679371965e-06, "loss": 0.78111577, "num_input_tokens_seen": 163310065, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.796875, "step": 7614, "time_per_iteration": 2.4806461334228516 }, { "auxiliary_loss_clip": 0.01122982, "auxiliary_loss_mlp": 0.01039424, "balance_loss_clip": 1.02504134, "balance_loss_mlp": 1.04641938, "epoch": 0.4578385690665865, "flos": 21907269014400.0, "grad_norm": 1.6978341894419824, "language_loss": 0.74561286, "learning_rate": 2.368073265481791e-06, "loss": 0.76723689, "num_input_tokens_seen": 163329415, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.765625, "step": 7615, "time_per_iteration": 2.511536121368408 }, { "auxiliary_loss_clip": 0.01050786, "auxiliary_loss_mlp": 0.01010927, "balance_loss_clip": 1.00919867, "balance_loss_mlp": 1.02440941, "epoch": 0.45789869231925445, "flos": 64758286667520.0, "grad_norm": 0.7786301566865531, "language_loss": 0.57657659, "learning_rate": 2.3676904490690105e-06, "loss": 0.59719372, "num_input_tokens_seen": 163385875, "router_z_loss_clip": 0.01733398, "router_z_loss_mlp": 0.26367188, "step": 7616, "time_per_iteration": 3.061728000640869 }, { "auxiliary_loss_clip": 0.01122136, "auxiliary_loss_mlp": 0.01036781, "balance_loss_clip": 1.02226138, "balance_loss_mlp": 1.04474235, "epoch": 0.4579588155719224, "flos": 16144001775360.0, "grad_norm": 1.7764163363639038, "language_loss": 0.71162081, "learning_rate": 2.3673076187133704e-06, "loss": 0.73321003, "num_input_tokens_seen": 163405170, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7734375, "step": 7617, "time_per_iteration": 2.4655697345733643 }, { "auxiliary_loss_clip": 0.01126242, "auxiliary_loss_mlp": 0.01034982, "balance_loss_clip": 1.02022433, "balance_loss_mlp": 1.04729021, "epoch": 0.45801893882459044, "flos": 21395541905280.0, "grad_norm": 2.194127941492481, "language_loss": 0.76400006, "learning_rate": 2.36692477442939e-06, "loss": 0.78561229, "num_input_tokens_seen": 163423155, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7890625, "step": 7618, "time_per_iteration": 2.5709147453308105 }, { "auxiliary_loss_clip": 0.01126847, "auxiliary_loss_mlp": 0.01038694, "balance_loss_clip": 1.02466297, "balance_loss_mlp": 1.04721057, "epoch": 0.4580790620772584, "flos": 19536554448000.0, "grad_norm": 2.3080139152556844, "language_loss": 0.76966828, "learning_rate": 2.366541916231585e-06, "loss": 0.79132372, "num_input_tokens_seen": 163442450, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.796875, "step": 7619, "time_per_iteration": 2.473158597946167 }, { "auxiliary_loss_clip": 0.01123253, "auxiliary_loss_mlp": 0.01035406, "balance_loss_clip": 1.02220392, "balance_loss_mlp": 1.04762077, "epoch": 0.45813918532992637, "flos": 16581070465920.0, "grad_norm": 1.8186353908906256, "language_loss": 0.72036952, "learning_rate": 2.366159044134473e-06, "loss": 0.74195611, "num_input_tokens_seen": 163459810, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7578125, "step": 7620, "time_per_iteration": 2.468137741088867 }, { "auxiliary_loss_clip": 0.0112256, "auxiliary_loss_mlp": 0.01031378, "balance_loss_clip": 1.01800251, "balance_loss_mlp": 1.04645288, "epoch": 0.45819930858259433, "flos": 42230301701760.0, "grad_norm": 1.6523028278961958, "language_loss": 0.78060174, "learning_rate": 2.3657761581525748e-06, "loss": 0.80214107, "num_input_tokens_seen": 163482970, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.76171875, "step": 7621, "time_per_iteration": 2.6623880863189697 }, { "auxiliary_loss_clip": 0.01050156, "auxiliary_loss_mlp": 0.01004544, "balance_loss_clip": 1.00291109, "balance_loss_mlp": 1.02424347, "epoch": 0.4582594318352623, "flos": 63714795638400.0, "grad_norm": 0.7802812459651609, "language_loss": 0.65110826, "learning_rate": 2.3653932583004063e-06, "loss": 0.6716553, "num_input_tokens_seen": 163545330, "router_z_loss_clip": 0.01635742, "router_z_loss_mlp": 0.2578125, "step": 7622, "time_per_iteration": 3.107142686843872 }, { "auxiliary_loss_clip": 0.01125745, "auxiliary_loss_mlp": 0.01030752, "balance_loss_clip": 1.01569641, "balance_loss_mlp": 1.04763365, "epoch": 0.45831955508793026, "flos": 26869979882880.0, "grad_norm": 1.7874844305314803, "language_loss": 0.79674506, "learning_rate": 2.3650103445924903e-06, "loss": 0.81831002, "num_input_tokens_seen": 163564620, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.78125, "step": 7623, "time_per_iteration": 2.5220861434936523 }, { "auxiliary_loss_clip": 0.01125299, "auxiliary_loss_mlp": 0.01036987, "balance_loss_clip": 1.02297997, "balance_loss_mlp": 1.04502773, "epoch": 0.45837967834059823, "flos": 18733951002240.0, "grad_norm": 3.5279443124150527, "language_loss": 0.70558888, "learning_rate": 2.3646274170433452e-06, "loss": 0.72721183, "num_input_tokens_seen": 163581010, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80078125, "step": 7624, "time_per_iteration": 2.466062307357788 }, { "auxiliary_loss_clip": 0.01122983, "auxiliary_loss_mlp": 0.01032807, "balance_loss_clip": 1.01874053, "balance_loss_mlp": 1.04460716, "epoch": 0.4584398015932662, "flos": 21178102924800.0, "grad_norm": 2.495515574473084, "language_loss": 0.73546004, "learning_rate": 2.364244475667491e-06, "loss": 0.75701797, "num_input_tokens_seen": 163599955, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 7625, "time_per_iteration": 2.5126194953918457 }, { "auxiliary_loss_clip": 0.01123158, "auxiliary_loss_mlp": 0.01032597, "balance_loss_clip": 1.01909614, "balance_loss_mlp": 1.04600978, "epoch": 0.45849992484593416, "flos": 19790047704960.0, "grad_norm": 2.3049008376695386, "language_loss": 0.78109109, "learning_rate": 2.363861520479451e-06, "loss": 0.80264866, "num_input_tokens_seen": 163618545, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7734375, "step": 7626, "time_per_iteration": 2.5335097312927246 }, { "auxiliary_loss_clip": 0.01128022, "auxiliary_loss_mlp": 0.01036275, "balance_loss_clip": 1.02213109, "balance_loss_mlp": 1.04811835, "epoch": 0.4585600480986021, "flos": 18223265387520.0, "grad_norm": 1.661331481052036, "language_loss": 0.84852076, "learning_rate": 2.3634785514937445e-06, "loss": 0.87016368, "num_input_tokens_seen": 163636055, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.80078125, "step": 7627, "time_per_iteration": 2.468327522277832 }, { "auxiliary_loss_clip": 0.01126508, "auxiliary_loss_mlp": 0.01031306, "balance_loss_clip": 1.0165714, "balance_loss_mlp": 1.04529095, "epoch": 0.4586201713512701, "flos": 29022213974400.0, "grad_norm": 1.6358007382984476, "language_loss": 0.69223648, "learning_rate": 2.3630955687248953e-06, "loss": 0.71381462, "num_input_tokens_seen": 163657485, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8125, "step": 7628, "time_per_iteration": 2.563244104385376 }, { "auxiliary_loss_clip": 0.01119733, "auxiliary_loss_mlp": 0.01028763, "balance_loss_clip": 1.01449382, "balance_loss_mlp": 1.04351127, "epoch": 0.45868029460393805, "flos": 23404600385280.0, "grad_norm": 1.6730665848375486, "language_loss": 0.7830267, "learning_rate": 2.3627125721874265e-06, "loss": 0.80451161, "num_input_tokens_seen": 163676030, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76171875, "step": 7629, "time_per_iteration": 2.5200130939483643 }, { "auxiliary_loss_clip": 0.01129243, "auxiliary_loss_mlp": 0.01045541, "balance_loss_clip": 1.03024006, "balance_loss_mlp": 1.04679489, "epoch": 0.458740417856606, "flos": 18221972497920.0, "grad_norm": 2.3398109377573006, "language_loss": 0.79823828, "learning_rate": 2.3623295618958595e-06, "loss": 0.8199861, "num_input_tokens_seen": 163694490, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.828125, "step": 7630, "time_per_iteration": 2.482180595397949 }, { "auxiliary_loss_clip": 0.01127666, "auxiliary_loss_mlp": 0.01032448, "balance_loss_clip": 1.01789212, "balance_loss_mlp": 1.04602575, "epoch": 0.458800541109274, "flos": 34568760504960.0, "grad_norm": 2.3658034256917135, "language_loss": 0.72114658, "learning_rate": 2.3619465378647198e-06, "loss": 0.74274772, "num_input_tokens_seen": 163717035, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.81640625, "step": 7631, "time_per_iteration": 2.6159725189208984 }, { "auxiliary_loss_clip": 0.01128579, "auxiliary_loss_mlp": 0.01037051, "balance_loss_clip": 1.02198267, "balance_loss_mlp": 1.04970837, "epoch": 0.458860664361942, "flos": 17712112896000.0, "grad_norm": 3.113670404054872, "language_loss": 0.71590692, "learning_rate": 2.361563500108531e-06, "loss": 0.73756325, "num_input_tokens_seen": 163734525, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7890625, "step": 7632, "time_per_iteration": 4.064201354980469 }, { "auxiliary_loss_clip": 0.01128317, "auxiliary_loss_mlp": 0.01034809, "balance_loss_clip": 1.01962161, "balance_loss_mlp": 1.0465312, "epoch": 0.45892078761460997, "flos": 18441889516800.0, "grad_norm": 2.810090525266893, "language_loss": 0.69496852, "learning_rate": 2.3611804486418178e-06, "loss": 0.71659976, "num_input_tokens_seen": 163752860, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8203125, "step": 7633, "time_per_iteration": 2.480565309524536 }, { "auxiliary_loss_clip": 0.01124602, "auxiliary_loss_mlp": 0.01040019, "balance_loss_clip": 1.02572012, "balance_loss_mlp": 1.04647231, "epoch": 0.45898091086727794, "flos": 22672956257280.0, "grad_norm": 1.7037348483332164, "language_loss": 0.80686665, "learning_rate": 2.3607973834791062e-06, "loss": 0.82851279, "num_input_tokens_seen": 163772495, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 7634, "time_per_iteration": 5.481215000152588 }, { "auxiliary_loss_clip": 0.01129098, "auxiliary_loss_mlp": 0.01039182, "balance_loss_clip": 1.02370858, "balance_loss_mlp": 1.04623628, "epoch": 0.4590410341199459, "flos": 21652949744640.0, "grad_norm": 1.7940329465962261, "language_loss": 0.81408775, "learning_rate": 2.3604143046349216e-06, "loss": 0.83577061, "num_input_tokens_seen": 163791475, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.828125, "step": 7635, "time_per_iteration": 2.5038414001464844 }, { "auxiliary_loss_clip": 0.01124789, "auxiliary_loss_mlp": 0.01044627, "balance_loss_clip": 1.03052402, "balance_loss_mlp": 1.04837584, "epoch": 0.45910115737261387, "flos": 36535372087680.0, "grad_norm": 1.5877608995916341, "language_loss": 0.64772332, "learning_rate": 2.3600312121237905e-06, "loss": 0.6694175, "num_input_tokens_seen": 163812995, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 7636, "time_per_iteration": 3.9832396507263184 }, { "auxiliary_loss_clip": 0.01125641, "auxiliary_loss_mlp": 0.01034275, "balance_loss_clip": 1.01994622, "balance_loss_mlp": 1.04923785, "epoch": 0.45916128062528183, "flos": 24419866302720.0, "grad_norm": 1.8178110291655392, "language_loss": 0.80997056, "learning_rate": 2.3596481059602395e-06, "loss": 0.83156967, "num_input_tokens_seen": 163833945, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.765625, "step": 7637, "time_per_iteration": 2.5188114643096924 }, { "auxiliary_loss_clip": 0.0113088, "auxiliary_loss_mlp": 0.01040379, "balance_loss_clip": 1.02447629, "balance_loss_mlp": 1.04970217, "epoch": 0.4592214038779498, "flos": 23221958705280.0, "grad_norm": 3.6356428887428422, "language_loss": 0.75268924, "learning_rate": 2.3592649861587965e-06, "loss": 0.77440178, "num_input_tokens_seen": 163853885, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.8125, "step": 7638, "time_per_iteration": 2.52612042427063 }, { "auxiliary_loss_clip": 0.01123533, "auxiliary_loss_mlp": 0.01034643, "balance_loss_clip": 1.02098131, "balance_loss_mlp": 1.04758263, "epoch": 0.45928152713061776, "flos": 19172133014400.0, "grad_norm": 4.388140402316122, "language_loss": 0.73918843, "learning_rate": 2.358881852733989e-06, "loss": 0.7607702, "num_input_tokens_seen": 163871855, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 7639, "time_per_iteration": 2.457425594329834 }, { "auxiliary_loss_clip": 0.01125372, "auxiliary_loss_mlp": 0.01035917, "balance_loss_clip": 1.02220249, "balance_loss_mlp": 1.04704356, "epoch": 0.4593416503832857, "flos": 22414686491520.0, "grad_norm": 1.730976337081604, "language_loss": 0.68325198, "learning_rate": 2.358498705700346e-06, "loss": 0.70486486, "num_input_tokens_seen": 163891450, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78125, "step": 7640, "time_per_iteration": 2.5113985538482666 }, { "auxiliary_loss_clip": 0.01127236, "auxiliary_loss_mlp": 0.01038149, "balance_loss_clip": 1.02423716, "balance_loss_mlp": 1.04579592, "epoch": 0.4594017736359537, "flos": 18880215183360.0, "grad_norm": 1.798581593162097, "language_loss": 0.75487721, "learning_rate": 2.3581155450723958e-06, "loss": 0.7765311, "num_input_tokens_seen": 163909345, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.8125, "step": 7641, "time_per_iteration": 2.466848134994507 }, { "auxiliary_loss_clip": 0.01128597, "auxiliary_loss_mlp": 0.01035563, "balance_loss_clip": 1.0207696, "balance_loss_mlp": 1.04844332, "epoch": 0.45946189688862166, "flos": 20518567349760.0, "grad_norm": 1.9139670302641711, "language_loss": 0.74929535, "learning_rate": 2.357732370864668e-06, "loss": 0.77093697, "num_input_tokens_seen": 163926940, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.80078125, "step": 7642, "time_per_iteration": 2.4989025592803955 }, { "auxiliary_loss_clip": 0.01054836, "auxiliary_loss_mlp": 0.0100865, "balance_loss_clip": 1.00706434, "balance_loss_mlp": 1.02887166, "epoch": 0.4595220201412896, "flos": 61405990162560.0, "grad_norm": 0.8629646838261937, "language_loss": 0.58234942, "learning_rate": 2.357349183091694e-06, "loss": 0.60298431, "num_input_tokens_seen": 163977785, "router_z_loss_clip": 0.01586914, "router_z_loss_mlp": 0.25976562, "step": 7643, "time_per_iteration": 2.8477516174316406 }, { "auxiliary_loss_clip": 0.01128767, "auxiliary_loss_mlp": 0.01036051, "balance_loss_clip": 1.02139401, "balance_loss_mlp": 1.04477215, "epoch": 0.4595821433939576, "flos": 23330947547520.0, "grad_norm": 1.6534099087583962, "language_loss": 0.93132603, "learning_rate": 2.3569659817680016e-06, "loss": 0.9529742, "num_input_tokens_seen": 163996630, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.83984375, "step": 7644, "time_per_iteration": 2.510493278503418 }, { "auxiliary_loss_clip": 0.01127027, "auxiliary_loss_mlp": 0.01037303, "balance_loss_clip": 1.02312863, "balance_loss_mlp": 1.04692674, "epoch": 0.4596422666466256, "flos": 14282356711680.0, "grad_norm": 4.865271585120173, "language_loss": 0.81835163, "learning_rate": 2.3565827669081243e-06, "loss": 0.83999491, "num_input_tokens_seen": 164013190, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.80078125, "step": 7645, "time_per_iteration": 2.4621822834014893 }, { "auxiliary_loss_clip": 0.01051382, "auxiliary_loss_mlp": 0.01000508, "balance_loss_clip": 0.99895877, "balance_loss_mlp": 1.02531624, "epoch": 0.4597023898992936, "flos": 65727337737600.0, "grad_norm": 4.810907867182235, "language_loss": 0.59849191, "learning_rate": 2.356199538526593e-06, "loss": 0.61901081, "num_input_tokens_seen": 164074030, "router_z_loss_clip": 0.01550293, "router_z_loss_mlp": 0.26171875, "step": 7646, "time_per_iteration": 3.0230014324188232 }, { "auxiliary_loss_clip": 0.01124963, "auxiliary_loss_mlp": 0.01036362, "balance_loss_clip": 1.02166367, "balance_loss_mlp": 1.04583108, "epoch": 0.45976251315196154, "flos": 26907075653760.0, "grad_norm": 2.8466245403510744, "language_loss": 0.72915399, "learning_rate": 2.355816296637939e-06, "loss": 0.75076723, "num_input_tokens_seen": 164095515, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.79296875, "step": 7647, "time_per_iteration": 2.5423269271850586 }, { "auxiliary_loss_clip": 0.01126854, "auxiliary_loss_mlp": 0.01040125, "balance_loss_clip": 1.02551627, "balance_loss_mlp": 1.04567528, "epoch": 0.4598226364046295, "flos": 26618066824320.0, "grad_norm": 1.676102503231469, "language_loss": 0.66581976, "learning_rate": 2.3554330412566957e-06, "loss": 0.68748963, "num_input_tokens_seen": 164117270, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8125, "step": 7648, "time_per_iteration": 2.5628249645233154 }, { "auxiliary_loss_clip": 0.01126676, "auxiliary_loss_mlp": 0.01029067, "balance_loss_clip": 1.01523912, "balance_loss_mlp": 1.04743826, "epoch": 0.45988275965729747, "flos": 24387762522240.0, "grad_norm": 1.7925978458853002, "language_loss": 0.78932357, "learning_rate": 2.3550497723973953e-06, "loss": 0.81088096, "num_input_tokens_seen": 164137850, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.79296875, "step": 7649, "time_per_iteration": 2.5350382328033447 }, { "auxiliary_loss_clip": 0.01124025, "auxiliary_loss_mlp": 0.01040367, "balance_loss_clip": 1.02569866, "balance_loss_mlp": 1.04582191, "epoch": 0.45994288290996543, "flos": 24535822383360.0, "grad_norm": 2.0286402714775282, "language_loss": 0.69274443, "learning_rate": 2.3546664900745726e-06, "loss": 0.71438837, "num_input_tokens_seen": 164157960, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 7650, "time_per_iteration": 2.514253854751587 }, { "auxiliary_loss_clip": 0.0112824, "auxiliary_loss_mlp": 0.01039043, "balance_loss_clip": 1.02323532, "balance_loss_mlp": 1.04639912, "epoch": 0.4600030061626334, "flos": 14830245838080.0, "grad_norm": 2.082554963520093, "language_loss": 0.84295136, "learning_rate": 2.354283194302761e-06, "loss": 0.8646242, "num_input_tokens_seen": 164174590, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8203125, "step": 7651, "time_per_iteration": 2.4570538997650146 }, { "auxiliary_loss_clip": 0.01125444, "auxiliary_loss_mlp": 0.01032467, "balance_loss_clip": 1.01766133, "balance_loss_mlp": 1.04801321, "epoch": 0.46006312941530136, "flos": 18113845582080.0, "grad_norm": 2.332247430305552, "language_loss": 0.74944109, "learning_rate": 2.3538998850964948e-06, "loss": 0.77102017, "num_input_tokens_seen": 164192935, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7734375, "step": 7652, "time_per_iteration": 2.469102382659912 }, { "auxiliary_loss_clip": 0.01124948, "auxiliary_loss_mlp": 0.01034629, "balance_loss_clip": 1.01999021, "balance_loss_mlp": 1.04379964, "epoch": 0.46012325266796933, "flos": 21976468565760.0, "grad_norm": 1.630983931851736, "language_loss": 0.75855422, "learning_rate": 2.3535165624703097e-06, "loss": 0.78014994, "num_input_tokens_seen": 164213160, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8125, "step": 7653, "time_per_iteration": 2.549973726272583 }, { "auxiliary_loss_clip": 0.0113459, "auxiliary_loss_mlp": 0.01036938, "balance_loss_clip": 1.01967621, "balance_loss_mlp": 1.0494945, "epoch": 0.4601833759206373, "flos": 15268068714240.0, "grad_norm": 2.1261795428505224, "language_loss": 0.65975153, "learning_rate": 2.353133226438741e-06, "loss": 0.68146682, "num_input_tokens_seen": 164229330, "router_z_loss_clip": 0.17285156, "router_z_loss_mlp": 0.8515625, "step": 7654, "time_per_iteration": 2.4553885459899902 }, { "auxiliary_loss_clip": 0.01124618, "auxiliary_loss_mlp": 0.01036049, "balance_loss_clip": 1.02159524, "balance_loss_mlp": 1.04569244, "epoch": 0.46024349917330526, "flos": 27088999061760.0, "grad_norm": 1.664038017786541, "language_loss": 0.79400724, "learning_rate": 2.3527498770163248e-06, "loss": 0.81561387, "num_input_tokens_seen": 164248240, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7890625, "step": 7655, "time_per_iteration": 2.5461113452911377 }, { "auxiliary_loss_clip": 0.01121644, "auxiliary_loss_mlp": 0.01032877, "balance_loss_clip": 1.01802325, "balance_loss_mlp": 1.04479814, "epoch": 0.4603036224259732, "flos": 24462923731200.0, "grad_norm": 4.404504103463693, "language_loss": 0.67282176, "learning_rate": 2.3523665142175985e-06, "loss": 0.69436705, "num_input_tokens_seen": 164268020, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.76953125, "step": 7656, "time_per_iteration": 2.5397870540618896 }, { "auxiliary_loss_clip": 0.01122797, "auxiliary_loss_mlp": 0.0103163, "balance_loss_clip": 1.0174675, "balance_loss_mlp": 1.04430962, "epoch": 0.4603637456786412, "flos": 28109292883200.0, "grad_norm": 1.8555185481331948, "language_loss": 0.81268358, "learning_rate": 2.351983138057098e-06, "loss": 0.83422792, "num_input_tokens_seen": 164287305, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.78515625, "step": 7657, "time_per_iteration": 2.547900676727295 }, { "auxiliary_loss_clip": 0.01122653, "auxiliary_loss_mlp": 0.01029703, "balance_loss_clip": 1.01455164, "balance_loss_mlp": 1.04379797, "epoch": 0.4604238689313092, "flos": 24348942898560.0, "grad_norm": 2.1448172900372606, "language_loss": 0.70792639, "learning_rate": 2.3515997485493623e-06, "loss": 0.72944987, "num_input_tokens_seen": 164306835, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.7890625, "step": 7658, "time_per_iteration": 2.5114266872406006 }, { "auxiliary_loss_clip": 0.01049136, "auxiliary_loss_mlp": 0.01006668, "balance_loss_clip": 1.00504673, "balance_loss_mlp": 1.02319145, "epoch": 0.4604839921839772, "flos": 53606229431040.0, "grad_norm": 0.9680815359412459, "language_loss": 0.62175333, "learning_rate": 2.351216345708928e-06, "loss": 0.64231145, "num_input_tokens_seen": 164367095, "router_z_loss_clip": 0.01623535, "router_z_loss_mlp": 0.25976562, "step": 7659, "time_per_iteration": 3.2057878971099854 }, { "auxiliary_loss_clip": 0.01124777, "auxiliary_loss_mlp": 0.01035698, "balance_loss_clip": 1.02063572, "balance_loss_mlp": 1.04783535, "epoch": 0.46054411543664514, "flos": 31248424126080.0, "grad_norm": 1.884320416286113, "language_loss": 0.68445557, "learning_rate": 2.350832929550336e-06, "loss": 0.70606035, "num_input_tokens_seen": 164388895, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7734375, "step": 7660, "time_per_iteration": 2.5707125663757324 }, { "auxiliary_loss_clip": 0.01124087, "auxiliary_loss_mlp": 0.01040354, "balance_loss_clip": 1.02538729, "balance_loss_mlp": 1.04460001, "epoch": 0.4606042386893131, "flos": 24092863862400.0, "grad_norm": 2.1169922654202025, "language_loss": 0.76955414, "learning_rate": 2.3504495000881227e-06, "loss": 0.79119849, "num_input_tokens_seen": 164409080, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.796875, "step": 7661, "time_per_iteration": 2.5161330699920654 }, { "auxiliary_loss_clip": 0.01122178, "auxiliary_loss_mlp": 0.01037227, "balance_loss_clip": 1.02283275, "balance_loss_mlp": 1.04602325, "epoch": 0.46066436194198107, "flos": 26578457101440.0, "grad_norm": 3.284981168556384, "language_loss": 0.74853146, "learning_rate": 2.3500660573368305e-06, "loss": 0.77012551, "num_input_tokens_seen": 164427585, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.76171875, "step": 7662, "time_per_iteration": 2.517446994781494 }, { "auxiliary_loss_clip": 0.01130718, "auxiliary_loss_mlp": 0.01038377, "balance_loss_clip": 1.02084136, "balance_loss_mlp": 1.04611421, "epoch": 0.46072448519464904, "flos": 17775602184960.0, "grad_norm": 2.8591085903695674, "language_loss": 0.79413348, "learning_rate": 2.349682601310998e-06, "loss": 0.81582439, "num_input_tokens_seen": 164438455, "router_z_loss_clip": 0.17480469, "router_z_loss_mlp": 0.84375, "step": 7663, "time_per_iteration": 2.462799072265625 }, { "auxiliary_loss_clip": 0.01123696, "auxiliary_loss_mlp": 0.01032496, "balance_loss_clip": 1.01794088, "balance_loss_mlp": 1.04652333, "epoch": 0.460784608447317, "flos": 15086109392640.0, "grad_norm": 2.066548509772043, "language_loss": 0.73763466, "learning_rate": 2.3492991320251653e-06, "loss": 0.75919664, "num_input_tokens_seen": 164456830, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.76953125, "step": 7664, "time_per_iteration": 2.4660370349884033 }, { "auxiliary_loss_clip": 0.01126222, "auxiliary_loss_mlp": 0.01040212, "balance_loss_clip": 1.02495909, "balance_loss_mlp": 1.04685903, "epoch": 0.46084473169998497, "flos": 18588261438720.0, "grad_norm": 1.567881931372631, "language_loss": 0.72177839, "learning_rate": 2.3489156494938753e-06, "loss": 0.74344277, "num_input_tokens_seen": 164475375, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.79296875, "step": 7665, "time_per_iteration": 2.4848058223724365 }, { "auxiliary_loss_clip": 0.01123073, "auxiliary_loss_mlp": 0.01035071, "balance_loss_clip": 1.02018213, "balance_loss_mlp": 1.04394913, "epoch": 0.46090485495265293, "flos": 19494789909120.0, "grad_norm": 2.027646854204998, "language_loss": 0.77735102, "learning_rate": 2.348532153731669e-06, "loss": 0.79893243, "num_input_tokens_seen": 164492040, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7890625, "step": 7666, "time_per_iteration": 2.466240406036377 }, { "auxiliary_loss_clip": 0.0112465, "auxiliary_loss_mlp": 0.01036511, "balance_loss_clip": 1.02039349, "balance_loss_mlp": 1.04576373, "epoch": 0.4609649782053209, "flos": 33364927163520.0, "grad_norm": 1.3793867933295019, "language_loss": 0.73829424, "learning_rate": 2.348148644753088e-06, "loss": 0.75990582, "num_input_tokens_seen": 164513665, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.7890625, "step": 7667, "time_per_iteration": 2.6156370639801025 }, { "auxiliary_loss_clip": 0.01122211, "auxiliary_loss_mlp": 0.01038172, "balance_loss_clip": 1.02364039, "balance_loss_mlp": 1.0432651, "epoch": 0.46102510145798886, "flos": 23769165473280.0, "grad_norm": 3.5626654874098134, "language_loss": 0.76219857, "learning_rate": 2.347765122572676e-06, "loss": 0.78380239, "num_input_tokens_seen": 164533890, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7890625, "step": 7668, "time_per_iteration": 2.5137503147125244 }, { "auxiliary_loss_clip": 0.01124956, "auxiliary_loss_mlp": 0.01035037, "balance_loss_clip": 1.0209409, "balance_loss_mlp": 1.04870987, "epoch": 0.4610852247106568, "flos": 23294821443840.0, "grad_norm": 3.4346974749147057, "language_loss": 0.78503865, "learning_rate": 2.347381587204975e-06, "loss": 0.8066386, "num_input_tokens_seen": 164553815, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76171875, "step": 7669, "time_per_iteration": 2.510272741317749 }, { "auxiliary_loss_clip": 0.01125956, "auxiliary_loss_mlp": 0.01030373, "balance_loss_clip": 1.01613379, "balance_loss_mlp": 1.04607868, "epoch": 0.4611453479633248, "flos": 25447450584960.0, "grad_norm": 1.7810011823394174, "language_loss": 0.82691282, "learning_rate": 2.34699803866453e-06, "loss": 0.84847605, "num_input_tokens_seen": 164573125, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.796875, "step": 7670, "time_per_iteration": 2.523042678833008 }, { "auxiliary_loss_clip": 0.01123639, "auxiliary_loss_mlp": 0.01035371, "balance_loss_clip": 1.02054131, "balance_loss_mlp": 1.04598856, "epoch": 0.4612054712159928, "flos": 21139606523520.0, "grad_norm": 2.3171180309314963, "language_loss": 0.6330601, "learning_rate": 2.3466144769658845e-06, "loss": 0.65465021, "num_input_tokens_seen": 164592575, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7734375, "step": 7671, "time_per_iteration": 2.5119471549987793 }, { "auxiliary_loss_clip": 0.01051009, "auxiliary_loss_mlp": 0.01002179, "balance_loss_clip": 1.00061691, "balance_loss_mlp": 1.02498806, "epoch": 0.4612655944686608, "flos": 69959266404480.0, "grad_norm": 0.7459353170568853, "language_loss": 0.55896491, "learning_rate": 2.346230902123583e-06, "loss": 0.5794968, "num_input_tokens_seen": 164659795, "router_z_loss_clip": 0.015625, "router_z_loss_mlp": 0.26171875, "step": 7672, "time_per_iteration": 3.211043357849121 }, { "auxiliary_loss_clip": 0.01127946, "auxiliary_loss_mlp": 0.01037983, "balance_loss_clip": 1.02352929, "balance_loss_mlp": 1.04719877, "epoch": 0.46132571772132874, "flos": 16837149502080.0, "grad_norm": 2.504954753421964, "language_loss": 0.71316874, "learning_rate": 2.3458473141521715e-06, "loss": 0.734828, "num_input_tokens_seen": 164678735, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.8046875, "step": 7673, "time_per_iteration": 2.4713242053985596 }, { "auxiliary_loss_clip": 0.01123185, "auxiliary_loss_mlp": 0.01030795, "balance_loss_clip": 1.01617992, "balance_loss_mlp": 1.04725814, "epoch": 0.4613858409739967, "flos": 35808935431680.0, "grad_norm": 1.8557423259507961, "language_loss": 0.70518839, "learning_rate": 2.345463713066195e-06, "loss": 0.7267282, "num_input_tokens_seen": 164700885, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7578125, "step": 7674, "time_per_iteration": 4.109313726425171 }, { "auxiliary_loss_clip": 0.01121929, "auxiliary_loss_mlp": 0.01039734, "balance_loss_clip": 1.02442765, "balance_loss_mlp": 1.0426079, "epoch": 0.4614459642266647, "flos": 35266756567680.0, "grad_norm": 1.5001620262539075, "language_loss": 0.6533103, "learning_rate": 2.3450800988801996e-06, "loss": 0.67492694, "num_input_tokens_seen": 164726960, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.79296875, "step": 7675, "time_per_iteration": 2.639151096343994 }, { "auxiliary_loss_clip": 0.01048133, "auxiliary_loss_mlp": 0.01004937, "balance_loss_clip": 1.0032202, "balance_loss_mlp": 1.02185678, "epoch": 0.46150608747933264, "flos": 66704610044160.0, "grad_norm": 0.7417556493029638, "language_loss": 0.58653378, "learning_rate": 2.3446964716087327e-06, "loss": 0.60706449, "num_input_tokens_seen": 164788525, "router_z_loss_clip": 0.01721191, "router_z_loss_mlp": 0.26367188, "step": 7676, "time_per_iteration": 5.9211037158966064 }, { "auxiliary_loss_clip": 0.01048389, "auxiliary_loss_mlp": 0.01003687, "balance_loss_clip": 1.00212502, "balance_loss_mlp": 1.02224469, "epoch": 0.4615662107320006, "flos": 55830177025920.0, "grad_norm": 0.7890928192534847, "language_loss": 0.62756664, "learning_rate": 2.344312831266341e-06, "loss": 0.64808744, "num_input_tokens_seen": 164843525, "router_z_loss_clip": 0.01556396, "router_z_loss_mlp": 0.26171875, "step": 7677, "time_per_iteration": 2.9664762020111084 }, { "auxiliary_loss_clip": 0.0112114, "auxiliary_loss_mlp": 0.01030763, "balance_loss_clip": 1.01649368, "balance_loss_mlp": 1.04392815, "epoch": 0.46162633398466857, "flos": 15483245137920.0, "grad_norm": 2.2231328426968617, "language_loss": 0.76765764, "learning_rate": 2.3439291778675718e-06, "loss": 0.7891767, "num_input_tokens_seen": 164859895, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 7678, "time_per_iteration": 3.832458019256592 }, { "auxiliary_loss_clip": 0.01125673, "auxiliary_loss_mlp": 0.01034989, "balance_loss_clip": 1.01990867, "balance_loss_mlp": 1.04707432, "epoch": 0.46168645723733653, "flos": 20011437181440.0, "grad_norm": 2.028226752646455, "language_loss": 0.66842419, "learning_rate": 2.343545511426974e-06, "loss": 0.69003081, "num_input_tokens_seen": 164878030, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.78515625, "step": 7679, "time_per_iteration": 2.492103099822998 }, { "auxiliary_loss_clip": 0.01121313, "auxiliary_loss_mlp": 0.01036927, "balance_loss_clip": 1.02303958, "balance_loss_mlp": 1.04399133, "epoch": 0.4617465804900045, "flos": 20298542590080.0, "grad_norm": 2.131326371926667, "language_loss": 0.69388306, "learning_rate": 2.3431618319590963e-06, "loss": 0.71546555, "num_input_tokens_seen": 164895710, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7734375, "step": 7680, "time_per_iteration": 2.484332799911499 }, { "auxiliary_loss_clip": 0.01130524, "auxiliary_loss_mlp": 0.01042875, "balance_loss_clip": 1.02764058, "balance_loss_mlp": 1.04923844, "epoch": 0.46180670374267246, "flos": 22346312952960.0, "grad_norm": 1.7189321234019637, "language_loss": 0.63542509, "learning_rate": 2.342778139478487e-06, "loss": 0.65715909, "num_input_tokens_seen": 164913365, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 7681, "time_per_iteration": 2.4976141452789307 }, { "auxiliary_loss_clip": 0.01121119, "auxiliary_loss_mlp": 0.01029512, "balance_loss_clip": 1.01591063, "balance_loss_mlp": 1.04440379, "epoch": 0.46186682699534043, "flos": 19895696582400.0, "grad_norm": 1.5950352737576319, "language_loss": 0.67256486, "learning_rate": 2.342394433999697e-06, "loss": 0.69407117, "num_input_tokens_seen": 164931620, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.765625, "step": 7682, "time_per_iteration": 2.4787583351135254 }, { "auxiliary_loss_clip": 0.01123652, "auxiliary_loss_mlp": 0.01037892, "balance_loss_clip": 1.02286005, "balance_loss_mlp": 1.04469275, "epoch": 0.4619269502480084, "flos": 31503569408640.0, "grad_norm": 2.1712731352481103, "language_loss": 0.74027091, "learning_rate": 2.342010715537275e-06, "loss": 0.76188636, "num_input_tokens_seen": 164950905, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7890625, "step": 7683, "time_per_iteration": 2.5585925579071045 }, { "auxiliary_loss_clip": 0.01121539, "auxiliary_loss_mlp": 0.01031222, "balance_loss_clip": 1.01674366, "balance_loss_mlp": 1.04496646, "epoch": 0.46198707350067636, "flos": 25009484054400.0, "grad_norm": 1.9550553054361823, "language_loss": 0.75944841, "learning_rate": 2.3416269841057726e-06, "loss": 0.78097606, "num_input_tokens_seen": 164970950, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.765625, "step": 7684, "time_per_iteration": 2.5362930297851562 }, { "auxiliary_loss_clip": 0.01128747, "auxiliary_loss_mlp": 0.0103759, "balance_loss_clip": 1.02259386, "balance_loss_mlp": 1.0458318, "epoch": 0.4620471967533444, "flos": 18292357198080.0, "grad_norm": 1.9159435861239633, "language_loss": 0.80050021, "learning_rate": 2.3412432397197412e-06, "loss": 0.82216352, "num_input_tokens_seen": 164989855, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.828125, "step": 7685, "time_per_iteration": 2.4724955558776855 }, { "auxiliary_loss_clip": 0.01122369, "auxiliary_loss_mlp": 0.01039417, "balance_loss_clip": 1.02398515, "balance_loss_mlp": 1.04653347, "epoch": 0.46210732000601235, "flos": 33985104410880.0, "grad_norm": 1.8596674028252589, "language_loss": 0.67329907, "learning_rate": 2.340859482393731e-06, "loss": 0.69491696, "num_input_tokens_seen": 165012290, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.7578125, "step": 7686, "time_per_iteration": 2.607771396636963 }, { "auxiliary_loss_clip": 0.01124726, "auxiliary_loss_mlp": 0.01034441, "balance_loss_clip": 1.01955211, "balance_loss_mlp": 1.04413855, "epoch": 0.4621674432586803, "flos": 25009412227200.0, "grad_norm": 2.351726353249405, "language_loss": 0.74163938, "learning_rate": 2.340475712142296e-06, "loss": 0.76323104, "num_input_tokens_seen": 165030810, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.8046875, "step": 7687, "time_per_iteration": 2.5172276496887207 }, { "auxiliary_loss_clip": 0.01126724, "auxiliary_loss_mlp": 0.01030915, "balance_loss_clip": 1.01641893, "balance_loss_mlp": 1.04884696, "epoch": 0.4622275665113483, "flos": 22014031213440.0, "grad_norm": 1.9585889536110141, "language_loss": 0.74559075, "learning_rate": 2.3400919289799873e-06, "loss": 0.76716715, "num_input_tokens_seen": 165050205, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.77734375, "step": 7688, "time_per_iteration": 2.4999258518218994 }, { "auxiliary_loss_clip": 0.01119955, "auxiliary_loss_mlp": 0.01032681, "balance_loss_clip": 1.01780999, "balance_loss_mlp": 1.04285264, "epoch": 0.46228768976401624, "flos": 24058820747520.0, "grad_norm": 1.8124871711895014, "language_loss": 0.78467512, "learning_rate": 2.3397081329213585e-06, "loss": 0.80620152, "num_input_tokens_seen": 165069370, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.76953125, "step": 7689, "time_per_iteration": 2.558276891708374 }, { "auxiliary_loss_clip": 0.01127049, "auxiliary_loss_mlp": 0.01037332, "balance_loss_clip": 1.02102423, "balance_loss_mlp": 1.04602349, "epoch": 0.4623478130166842, "flos": 26651391667200.0, "grad_norm": 4.214975462755229, "language_loss": 0.57767648, "learning_rate": 2.339324323980964e-06, "loss": 0.59932029, "num_input_tokens_seen": 165089610, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.8125, "step": 7690, "time_per_iteration": 2.5371768474578857 }, { "auxiliary_loss_clip": 0.01123823, "auxiliary_loss_mlp": 0.01034848, "balance_loss_clip": 1.0197978, "balance_loss_mlp": 1.04479468, "epoch": 0.46240793626935217, "flos": 20558428467840.0, "grad_norm": 2.217765839123996, "language_loss": 0.82817256, "learning_rate": 2.3389405021733562e-06, "loss": 0.84975928, "num_input_tokens_seen": 165109050, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7890625, "step": 7691, "time_per_iteration": 2.4915266036987305 }, { "auxiliary_loss_clip": 0.01123569, "auxiliary_loss_mlp": 0.01027204, "balance_loss_clip": 1.01351333, "balance_loss_mlp": 1.04562378, "epoch": 0.46246805952202014, "flos": 22456055980800.0, "grad_norm": 1.4267061797828575, "language_loss": 0.75353724, "learning_rate": 2.338556667513091e-06, "loss": 0.77504492, "num_input_tokens_seen": 165130130, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.78125, "step": 7692, "time_per_iteration": 2.5318949222564697 }, { "auxiliary_loss_clip": 0.01125052, "auxiliary_loss_mlp": 0.01038172, "balance_loss_clip": 1.02292562, "balance_loss_mlp": 1.04509151, "epoch": 0.4625281827746881, "flos": 35041308854400.0, "grad_norm": 1.7811166456900276, "language_loss": 0.73948985, "learning_rate": 2.338172820014723e-06, "loss": 0.76112211, "num_input_tokens_seen": 165152685, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80078125, "step": 7693, "time_per_iteration": 2.5965664386749268 }, { "auxiliary_loss_clip": 0.01124375, "auxiliary_loss_mlp": 0.01038088, "balance_loss_clip": 1.02390194, "balance_loss_mlp": 1.04696226, "epoch": 0.46258830602735607, "flos": 21068647205760.0, "grad_norm": 1.5583842711728169, "language_loss": 0.85843819, "learning_rate": 2.337788959692808e-06, "loss": 0.88006288, "num_input_tokens_seen": 165173315, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 7694, "time_per_iteration": 2.4951364994049072 }, { "auxiliary_loss_clip": 0.01125352, "auxiliary_loss_mlp": 0.01036146, "balance_loss_clip": 1.02163196, "balance_loss_mlp": 1.04552197, "epoch": 0.46264842928002403, "flos": 26177227205760.0, "grad_norm": 2.7320037132687123, "language_loss": 0.78817415, "learning_rate": 2.337405086561902e-06, "loss": 0.80978918, "num_input_tokens_seen": 165192395, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.796875, "step": 7695, "time_per_iteration": 2.517869710922241 }, { "auxiliary_loss_clip": 0.01121902, "auxiliary_loss_mlp": 0.01033559, "balance_loss_clip": 1.0198257, "balance_loss_mlp": 1.04532123, "epoch": 0.462708552532692, "flos": 16764214936320.0, "grad_norm": 1.7165736973666526, "language_loss": 0.71943164, "learning_rate": 2.3370212006365606e-06, "loss": 0.74098623, "num_input_tokens_seen": 165211355, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 7696, "time_per_iteration": 2.4907727241516113 }, { "auxiliary_loss_clip": 0.01124104, "auxiliary_loss_mlp": 0.0103905, "balance_loss_clip": 1.02373135, "balance_loss_mlp": 1.04540634, "epoch": 0.46276867578535996, "flos": 15560453422080.0, "grad_norm": 1.5733337209273763, "language_loss": 0.69378245, "learning_rate": 2.3366373019313423e-06, "loss": 0.71541393, "num_input_tokens_seen": 165229380, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.78515625, "step": 7697, "time_per_iteration": 2.4654335975646973 }, { "auxiliary_loss_clip": 0.01123976, "auxiliary_loss_mlp": 0.01032105, "balance_loss_clip": 1.01764512, "balance_loss_mlp": 1.04614687, "epoch": 0.462828799038028, "flos": 22415404763520.0, "grad_norm": 1.8697169262299522, "language_loss": 0.84539276, "learning_rate": 2.3362533904608025e-06, "loss": 0.86695361, "num_input_tokens_seen": 165247200, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 7698, "time_per_iteration": 2.4957478046417236 }, { "auxiliary_loss_clip": 0.01124047, "auxiliary_loss_mlp": 0.01031849, "balance_loss_clip": 1.0176338, "balance_loss_mlp": 1.0458914, "epoch": 0.46288892229069595, "flos": 21069580959360.0, "grad_norm": 2.3316772452403747, "language_loss": 0.70943272, "learning_rate": 2.335869466239502e-06, "loss": 0.7309916, "num_input_tokens_seen": 165265825, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.78125, "step": 7699, "time_per_iteration": 2.491147041320801 }, { "auxiliary_loss_clip": 0.01125936, "auxiliary_loss_mlp": 0.01036404, "balance_loss_clip": 1.02063251, "balance_loss_mlp": 1.044451, "epoch": 0.4629490455433639, "flos": 23185688947200.0, "grad_norm": 2.056418677442541, "language_loss": 0.71721131, "learning_rate": 2.335485529281996e-06, "loss": 0.73883474, "num_input_tokens_seen": 165284380, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8125, "step": 7700, "time_per_iteration": 2.5140812397003174 }, { "auxiliary_loss_clip": 0.01123387, "auxiliary_loss_mlp": 0.0103464, "balance_loss_clip": 1.02029943, "balance_loss_mlp": 1.04518747, "epoch": 0.4630091687960319, "flos": 18835541642880.0, "grad_norm": 2.06770914258048, "language_loss": 0.72377086, "learning_rate": 2.3351015796028467e-06, "loss": 0.7453512, "num_input_tokens_seen": 165300320, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.78125, "step": 7701, "time_per_iteration": 2.4732091426849365 }, { "auxiliary_loss_clip": 0.01128166, "auxiliary_loss_mlp": 0.01033917, "balance_loss_clip": 1.01887321, "balance_loss_mlp": 1.04615736, "epoch": 0.46306929204869984, "flos": 38907020407680.0, "grad_norm": 2.200248679005562, "language_loss": 0.6502434, "learning_rate": 2.3347176172166114e-06, "loss": 0.67186427, "num_input_tokens_seen": 165318130, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8203125, "step": 7702, "time_per_iteration": 2.6384801864624023 }, { "auxiliary_loss_clip": 0.01121148, "auxiliary_loss_mlp": 0.01032456, "balance_loss_clip": 1.01842499, "balance_loss_mlp": 1.04444349, "epoch": 0.4631294153013678, "flos": 19644178573440.0, "grad_norm": 2.1620947810451567, "language_loss": 0.73643768, "learning_rate": 2.33433364213785e-06, "loss": 0.75797379, "num_input_tokens_seen": 165336225, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 7703, "time_per_iteration": 2.4814131259918213 }, { "auxiliary_loss_clip": 0.01128771, "auxiliary_loss_mlp": 0.0103596, "balance_loss_clip": 1.02070141, "balance_loss_mlp": 1.0472753, "epoch": 0.4631895385540358, "flos": 24608254158720.0, "grad_norm": 1.948368808750919, "language_loss": 0.68836534, "learning_rate": 2.3339496543811243e-06, "loss": 0.71001261, "num_input_tokens_seen": 165355005, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8125, "step": 7704, "time_per_iteration": 2.560492992401123 }, { "auxiliary_loss_clip": 0.01127619, "auxiliary_loss_mlp": 0.01032288, "balance_loss_clip": 1.01677871, "balance_loss_mlp": 1.04623985, "epoch": 0.46324966180670374, "flos": 26320115508480.0, "grad_norm": 2.0056442421196463, "language_loss": 0.80699646, "learning_rate": 2.3335656539609934e-06, "loss": 0.82859558, "num_input_tokens_seen": 165374910, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8125, "step": 7705, "time_per_iteration": 2.5633232593536377 }, { "auxiliary_loss_clip": 0.01124823, "auxiliary_loss_mlp": 0.01036926, "balance_loss_clip": 1.02288282, "balance_loss_mlp": 1.04367781, "epoch": 0.4633097850593717, "flos": 19240506552960.0, "grad_norm": 1.8703711778745196, "language_loss": 0.7761749, "learning_rate": 2.3331816408920196e-06, "loss": 0.79779243, "num_input_tokens_seen": 165392590, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.8125, "step": 7706, "time_per_iteration": 2.4676995277404785 }, { "auxiliary_loss_clip": 0.01118563, "auxiliary_loss_mlp": 0.01036724, "balance_loss_clip": 1.02207303, "balance_loss_mlp": 1.04365563, "epoch": 0.46336990831203967, "flos": 22783166161920.0, "grad_norm": 2.2357229077770078, "language_loss": 0.7008189, "learning_rate": 2.3327976151887654e-06, "loss": 0.7223717, "num_input_tokens_seen": 165411195, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75, "step": 7707, "time_per_iteration": 2.4889748096466064 }, { "auxiliary_loss_clip": 0.01127575, "auxiliary_loss_mlp": 0.01041431, "balance_loss_clip": 1.02453911, "balance_loss_mlp": 1.0442791, "epoch": 0.46343003156470763, "flos": 38210604543360.0, "grad_norm": 1.9665874569745028, "language_loss": 0.61065423, "learning_rate": 2.332413576865791e-06, "loss": 0.63234431, "num_input_tokens_seen": 165430150, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8359375, "step": 7708, "time_per_iteration": 2.617100715637207 }, { "auxiliary_loss_clip": 0.01125069, "auxiliary_loss_mlp": 0.01041412, "balance_loss_clip": 1.02643323, "balance_loss_mlp": 1.044227, "epoch": 0.4634901548173756, "flos": 31938555110400.0, "grad_norm": 2.0286757070387105, "language_loss": 0.77686834, "learning_rate": 2.3320295259376614e-06, "loss": 0.79853308, "num_input_tokens_seen": 165450595, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80859375, "step": 7709, "time_per_iteration": 2.587892770767212 }, { "auxiliary_loss_clip": 0.01126076, "auxiliary_loss_mlp": 0.01043107, "balance_loss_clip": 1.02733517, "balance_loss_mlp": 1.04509497, "epoch": 0.46355027807004356, "flos": 20082540153600.0, "grad_norm": 1.8792095069322394, "language_loss": 0.77035666, "learning_rate": 2.3316454624189385e-06, "loss": 0.79204851, "num_input_tokens_seen": 165469515, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.8125, "step": 7710, "time_per_iteration": 2.46877121925354 }, { "auxiliary_loss_clip": 0.01128434, "auxiliary_loss_mlp": 0.01037594, "balance_loss_clip": 1.02122712, "balance_loss_mlp": 1.04572785, "epoch": 0.4636104013227116, "flos": 24061370613120.0, "grad_norm": 2.1308816887559012, "language_loss": 0.72881955, "learning_rate": 2.3312613863241865e-06, "loss": 0.75047982, "num_input_tokens_seen": 165488125, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.828125, "step": 7711, "time_per_iteration": 2.5246596336364746 }, { "auxiliary_loss_clip": 0.01123286, "auxiliary_loss_mlp": 0.01043392, "balance_loss_clip": 1.02787101, "balance_loss_mlp": 1.04464483, "epoch": 0.46367052457537955, "flos": 23914639555200.0, "grad_norm": 1.285117986884249, "language_loss": 0.7155962, "learning_rate": 2.33087729766797e-06, "loss": 0.73726296, "num_input_tokens_seen": 165509225, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.78515625, "step": 7712, "time_per_iteration": 2.529327392578125 }, { "auxiliary_loss_clip": 0.01132299, "auxiliary_loss_mlp": 0.01047656, "balance_loss_clip": 1.03076458, "balance_loss_mlp": 1.04763901, "epoch": 0.4637306478280475, "flos": 26396533693440.0, "grad_norm": 7.813445841353891, "language_loss": 0.73480952, "learning_rate": 2.3304931964648524e-06, "loss": 0.75660908, "num_input_tokens_seen": 165529945, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.84765625, "step": 7713, "time_per_iteration": 2.5523314476013184 }, { "auxiliary_loss_clip": 0.01129332, "auxiliary_loss_mlp": 0.0104309, "balance_loss_clip": 1.02611494, "balance_loss_mlp": 1.04535329, "epoch": 0.4637907710807155, "flos": 21980706370560.0, "grad_norm": 2.9479063899319744, "language_loss": 0.58713925, "learning_rate": 2.3301090827294e-06, "loss": 0.60886347, "num_input_tokens_seen": 165550690, "router_z_loss_clip": 0.16992188, "router_z_loss_mlp": 0.83984375, "step": 7714, "time_per_iteration": 2.4856061935424805 }, { "auxiliary_loss_clip": 0.01123432, "auxiliary_loss_mlp": 0.01036248, "balance_loss_clip": 1.02108419, "balance_loss_mlp": 1.04462886, "epoch": 0.46385089433338345, "flos": 12422291846400.0, "grad_norm": 1.9624807539438434, "language_loss": 0.70292473, "learning_rate": 2.3297249564761784e-06, "loss": 0.72452152, "num_input_tokens_seen": 165567775, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.7890625, "step": 7715, "time_per_iteration": 2.467100143432617 }, { "auxiliary_loss_clip": 0.01131508, "auxiliary_loss_mlp": 0.01039384, "balance_loss_clip": 1.02352893, "balance_loss_mlp": 1.04565263, "epoch": 0.4639110175860514, "flos": 23915752876800.0, "grad_norm": 2.464478552473045, "language_loss": 0.6882425, "learning_rate": 2.3293408177197527e-06, "loss": 0.7099514, "num_input_tokens_seen": 165587010, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.859375, "step": 7716, "time_per_iteration": 3.9893174171447754 }, { "auxiliary_loss_clip": 0.01127474, "auxiliary_loss_mlp": 0.01036932, "balance_loss_clip": 1.02087462, "balance_loss_mlp": 1.04462922, "epoch": 0.4639711408387194, "flos": 25300396304640.0, "grad_norm": 4.414960078594626, "language_loss": 0.8093366, "learning_rate": 2.328956666474691e-06, "loss": 0.83098066, "num_input_tokens_seen": 165607850, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.828125, "step": 7717, "time_per_iteration": 3.9085216522216797 }, { "auxiliary_loss_clip": 0.01125535, "auxiliary_loss_mlp": 0.01037266, "balance_loss_clip": 1.02201366, "balance_loss_mlp": 1.04454803, "epoch": 0.46403126409138734, "flos": 21211822817280.0, "grad_norm": 1.722394600118948, "language_loss": 0.73298663, "learning_rate": 2.3285725027555593e-06, "loss": 0.75461465, "num_input_tokens_seen": 165627175, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.80859375, "step": 7718, "time_per_iteration": 3.945993661880493 }, { "auxiliary_loss_clip": 0.01122206, "auxiliary_loss_mlp": 0.01040598, "balance_loss_clip": 1.02474332, "balance_loss_mlp": 1.04358304, "epoch": 0.4640913873440553, "flos": 35845564325760.0, "grad_norm": 1.7163048956057176, "language_loss": 0.70709205, "learning_rate": 2.3281883265769254e-06, "loss": 0.72872007, "num_input_tokens_seen": 165648340, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.7890625, "step": 7719, "time_per_iteration": 4.05771803855896 }, { "auxiliary_loss_clip": 0.0113237, "auxiliary_loss_mlp": 0.01039917, "balance_loss_clip": 1.0243125, "balance_loss_mlp": 1.048769, "epoch": 0.46415151059672327, "flos": 19166207270400.0, "grad_norm": 1.693467902793272, "language_loss": 0.86532903, "learning_rate": 2.327804137953357e-06, "loss": 0.88705188, "num_input_tokens_seen": 165667195, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8359375, "step": 7720, "time_per_iteration": 2.4855802059173584 }, { "auxiliary_loss_clip": 0.01053903, "auxiliary_loss_mlp": 0.01003476, "balance_loss_clip": 1.00179529, "balance_loss_mlp": 1.02802885, "epoch": 0.46421163384939124, "flos": 58912750304640.0, "grad_norm": 0.7595215618500407, "language_loss": 0.55091298, "learning_rate": 2.3274199368994226e-06, "loss": 0.57148683, "num_input_tokens_seen": 165726760, "router_z_loss_clip": 0.0168457, "router_z_loss_mlp": 0.2578125, "step": 7721, "time_per_iteration": 3.187293529510498 }, { "auxiliary_loss_clip": 0.0112545, "auxiliary_loss_mlp": 0.01041874, "balance_loss_clip": 1.02642441, "balance_loss_mlp": 1.0461725, "epoch": 0.4642717571020592, "flos": 20157342226560.0, "grad_norm": 2.129052093094972, "language_loss": 0.79796433, "learning_rate": 2.3270357234296918e-06, "loss": 0.81963766, "num_input_tokens_seen": 165745005, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.79296875, "step": 7722, "time_per_iteration": 2.5206336975097656 }, { "auxiliary_loss_clip": 0.01128772, "auxiliary_loss_mlp": 0.0103728, "balance_loss_clip": 1.02162731, "balance_loss_mlp": 1.04639697, "epoch": 0.46433188035472717, "flos": 25046184775680.0, "grad_norm": 1.585045572560044, "language_loss": 0.77578902, "learning_rate": 2.3266514975587332e-06, "loss": 0.79744959, "num_input_tokens_seen": 165765750, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.82421875, "step": 7723, "time_per_iteration": 2.5338785648345947 }, { "auxiliary_loss_clip": 0.01122279, "auxiliary_loss_mlp": 0.01033919, "balance_loss_clip": 1.01943564, "balance_loss_mlp": 1.04481494, "epoch": 0.4643920036073952, "flos": 28075644817920.0, "grad_norm": 1.595135768984151, "language_loss": 0.68517745, "learning_rate": 2.326267259301118e-06, "loss": 0.70673943, "num_input_tokens_seen": 165787515, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7734375, "step": 7724, "time_per_iteration": 2.553894519805908 }, { "auxiliary_loss_clip": 0.01122692, "auxiliary_loss_mlp": 0.01036679, "balance_loss_clip": 1.02147996, "balance_loss_mlp": 1.04339635, "epoch": 0.46445212686006315, "flos": 18369350000640.0, "grad_norm": 2.2202196672528176, "language_loss": 0.66858995, "learning_rate": 2.325883008671415e-06, "loss": 0.69018364, "num_input_tokens_seen": 165806675, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.79296875, "step": 7725, "time_per_iteration": 2.4894912242889404 }, { "auxiliary_loss_clip": 0.01120821, "auxiliary_loss_mlp": 0.01036856, "balance_loss_clip": 1.0237788, "balance_loss_mlp": 1.04500663, "epoch": 0.4645122501127311, "flos": 31721618920320.0, "grad_norm": 1.790557208614952, "language_loss": 0.64539737, "learning_rate": 2.3254987456841955e-06, "loss": 0.66697407, "num_input_tokens_seen": 165829835, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7578125, "step": 7726, "time_per_iteration": 2.5864953994750977 }, { "auxiliary_loss_clip": 0.01128517, "auxiliary_loss_mlp": 0.01034849, "balance_loss_clip": 1.01973867, "balance_loss_mlp": 1.04932618, "epoch": 0.4645723733653991, "flos": 23768806337280.0, "grad_norm": 1.699117424825977, "language_loss": 0.75022525, "learning_rate": 2.3251144703540307e-06, "loss": 0.77185887, "num_input_tokens_seen": 165849380, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7890625, "step": 7727, "time_per_iteration": 2.500650644302368 }, { "auxiliary_loss_clip": 0.01124371, "auxiliary_loss_mlp": 0.01041952, "balance_loss_clip": 1.02657366, "balance_loss_mlp": 1.04525542, "epoch": 0.46463249661806705, "flos": 33145512935040.0, "grad_norm": 1.9740262637313142, "language_loss": 0.78434116, "learning_rate": 2.3247301826954936e-06, "loss": 0.80600441, "num_input_tokens_seen": 165868620, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.7890625, "step": 7728, "time_per_iteration": 2.5786197185516357 }, { "auxiliary_loss_clip": 0.01126267, "auxiliary_loss_mlp": 0.01038239, "balance_loss_clip": 1.02311146, "balance_loss_mlp": 1.04568386, "epoch": 0.464692619870735, "flos": 18296020385280.0, "grad_norm": 2.7092036026436848, "language_loss": 0.76020563, "learning_rate": 2.324345882723155e-06, "loss": 0.78185064, "num_input_tokens_seen": 165885915, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.8046875, "step": 7729, "time_per_iteration": 2.469763994216919 }, { "auxiliary_loss_clip": 0.01126539, "auxiliary_loss_mlp": 0.01038545, "balance_loss_clip": 1.02437735, "balance_loss_mlp": 1.04738867, "epoch": 0.464752743123403, "flos": 22638051216000.0, "grad_norm": 1.972594435850413, "language_loss": 0.80046725, "learning_rate": 2.323961570451588e-06, "loss": 0.82211804, "num_input_tokens_seen": 165905465, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7890625, "step": 7730, "time_per_iteration": 2.494893789291382 }, { "auxiliary_loss_clip": 0.01122272, "auxiliary_loss_mlp": 0.01039299, "balance_loss_clip": 1.02495861, "balance_loss_mlp": 1.04418576, "epoch": 0.46481286637607094, "flos": 20412128373120.0, "grad_norm": 1.550954094185698, "language_loss": 0.76930386, "learning_rate": 2.3235772458953655e-06, "loss": 0.79091954, "num_input_tokens_seen": 165924640, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.78125, "step": 7731, "time_per_iteration": 2.4803619384765625 }, { "auxiliary_loss_clip": 0.0112192, "auxiliary_loss_mlp": 0.01030728, "balance_loss_clip": 1.01650035, "balance_loss_mlp": 1.04410207, "epoch": 0.4648729896287389, "flos": 34275406129920.0, "grad_norm": 2.2506664496520243, "language_loss": 0.66156375, "learning_rate": 2.323192909069061e-06, "loss": 0.68309027, "num_input_tokens_seen": 165945765, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.77734375, "step": 7732, "time_per_iteration": 2.5934042930603027 }, { "auxiliary_loss_clip": 0.01127078, "auxiliary_loss_mlp": 0.01039211, "balance_loss_clip": 1.02339172, "balance_loss_mlp": 1.04433537, "epoch": 0.4649331128814069, "flos": 21321781326720.0, "grad_norm": 2.6179061654617946, "language_loss": 0.73052311, "learning_rate": 2.32280855998725e-06, "loss": 0.752186, "num_input_tokens_seen": 165964025, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.828125, "step": 7733, "time_per_iteration": 2.496378183364868 }, { "auxiliary_loss_clip": 0.01054201, "auxiliary_loss_mlp": 0.01001598, "balance_loss_clip": 0.99990481, "balance_loss_mlp": 1.02847123, "epoch": 0.46499323613407484, "flos": 58308515717760.0, "grad_norm": 1.263004271399074, "language_loss": 0.51951635, "learning_rate": 2.3224241986645057e-06, "loss": 0.54007435, "num_input_tokens_seen": 166021950, "router_z_loss_clip": 0.01696777, "router_z_loss_mlp": 0.2578125, "step": 7734, "time_per_iteration": 3.060396909713745 }, { "auxiliary_loss_clip": 0.01125724, "auxiliary_loss_mlp": 0.01034425, "balance_loss_clip": 1.01915443, "balance_loss_mlp": 1.04738355, "epoch": 0.4650533593867428, "flos": 10889660384640.0, "grad_norm": 2.042735814824574, "language_loss": 0.75525224, "learning_rate": 2.3220398251154035e-06, "loss": 0.77685374, "num_input_tokens_seen": 166039675, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.78125, "step": 7735, "time_per_iteration": 2.468648910522461 }, { "auxiliary_loss_clip": 0.01120746, "auxiliary_loss_mlp": 0.0104138, "balance_loss_clip": 1.02651453, "balance_loss_mlp": 1.04443741, "epoch": 0.46511348263941077, "flos": 19974592805760.0, "grad_norm": 1.9088366685842135, "language_loss": 0.69727796, "learning_rate": 2.321655439354519e-06, "loss": 0.71889919, "num_input_tokens_seen": 166057745, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.76171875, "step": 7736, "time_per_iteration": 2.4976139068603516 }, { "auxiliary_loss_clip": 0.01122108, "auxiliary_loss_mlp": 0.0102889, "balance_loss_clip": 1.01527059, "balance_loss_mlp": 1.04639518, "epoch": 0.46517360589207873, "flos": 19678401256320.0, "grad_norm": 2.08355608995869, "language_loss": 0.7193343, "learning_rate": 2.321271041396427e-06, "loss": 0.74084425, "num_input_tokens_seen": 166076440, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 7737, "time_per_iteration": 2.466963291168213 }, { "auxiliary_loss_clip": 0.01128306, "auxiliary_loss_mlp": 0.01040634, "balance_loss_clip": 1.02505291, "balance_loss_mlp": 1.04819846, "epoch": 0.46523372914474675, "flos": 16872665074560.0, "grad_norm": 2.2590979499682886, "language_loss": 0.8387056, "learning_rate": 2.3208866312557065e-06, "loss": 0.86039501, "num_input_tokens_seen": 166092520, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.80078125, "step": 7738, "time_per_iteration": 2.4723989963531494 }, { "auxiliary_loss_clip": 0.01054328, "auxiliary_loss_mlp": 0.01004863, "balance_loss_clip": 1.00331306, "balance_loss_mlp": 1.02849579, "epoch": 0.4652938523974147, "flos": 53439138339840.0, "grad_norm": 0.774601066985798, "language_loss": 0.57821095, "learning_rate": 2.320502208946932e-06, "loss": 0.59880292, "num_input_tokens_seen": 166156285, "router_z_loss_clip": 0.01544189, "router_z_loss_mlp": 0.2578125, "step": 7739, "time_per_iteration": 3.1593077182769775 }, { "auxiliary_loss_clip": 0.01125679, "auxiliary_loss_mlp": 0.01039857, "balance_loss_clip": 1.02589202, "balance_loss_mlp": 1.0469234, "epoch": 0.4653539756500827, "flos": 15231296165760.0, "grad_norm": 1.942828618289211, "language_loss": 0.84975898, "learning_rate": 2.3201177744846815e-06, "loss": 0.8714143, "num_input_tokens_seen": 166173455, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7890625, "step": 7740, "time_per_iteration": 2.478288412094116 }, { "auxiliary_loss_clip": 0.01124046, "auxiliary_loss_mlp": 0.01034161, "balance_loss_clip": 1.01915252, "balance_loss_mlp": 1.04679942, "epoch": 0.46541409890275065, "flos": 23732249270400.0, "grad_norm": 2.4264349023867435, "language_loss": 0.7568258, "learning_rate": 2.3197333278835327e-06, "loss": 0.77840793, "num_input_tokens_seen": 166194370, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7734375, "step": 7741, "time_per_iteration": 2.510075330734253 }, { "auxiliary_loss_clip": 0.01128242, "auxiliary_loss_mlp": 0.01034883, "balance_loss_clip": 1.02079284, "balance_loss_mlp": 1.0466187, "epoch": 0.4654742221554186, "flos": 20847329556480.0, "grad_norm": 2.3692411102645963, "language_loss": 0.80835891, "learning_rate": 2.319348869158064e-06, "loss": 0.82999015, "num_input_tokens_seen": 166213195, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.81640625, "step": 7742, "time_per_iteration": 2.5175583362579346 }, { "auxiliary_loss_clip": 0.01124994, "auxiliary_loss_mlp": 0.01039905, "balance_loss_clip": 1.02450323, "balance_loss_mlp": 1.04491735, "epoch": 0.4655343454080866, "flos": 20704836303360.0, "grad_norm": 1.7198901815548675, "language_loss": 0.72704327, "learning_rate": 2.3189643983228555e-06, "loss": 0.74869227, "num_input_tokens_seen": 166231350, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.80078125, "step": 7743, "time_per_iteration": 2.4765193462371826 }, { "auxiliary_loss_clip": 0.01126374, "auxiliary_loss_mlp": 0.01031787, "balance_loss_clip": 1.01651597, "balance_loss_mlp": 1.04694378, "epoch": 0.46559446866075455, "flos": 18989850470400.0, "grad_norm": 2.0032822746690524, "language_loss": 0.71264714, "learning_rate": 2.318579915392483e-06, "loss": 0.73422873, "num_input_tokens_seen": 166250530, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.79296875, "step": 7744, "time_per_iteration": 2.4930245876312256 }, { "auxiliary_loss_clip": 0.01120787, "auxiliary_loss_mlp": 0.01029859, "balance_loss_clip": 1.0170505, "balance_loss_mlp": 1.04501987, "epoch": 0.4656545919134225, "flos": 34496364643200.0, "grad_norm": 1.9726067146521764, "language_loss": 0.84869462, "learning_rate": 2.31819542038153e-06, "loss": 0.87020111, "num_input_tokens_seen": 166272545, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7578125, "step": 7745, "time_per_iteration": 2.6600003242492676 }, { "auxiliary_loss_clip": 0.01119935, "auxiliary_loss_mlp": 0.01038784, "balance_loss_clip": 1.02376926, "balance_loss_mlp": 1.04502094, "epoch": 0.4657147151660905, "flos": 24310554238080.0, "grad_norm": 1.4126591949366456, "language_loss": 0.7296446, "learning_rate": 2.317810913304574e-06, "loss": 0.75123179, "num_input_tokens_seen": 166292135, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.75, "step": 7746, "time_per_iteration": 2.540947914123535 }, { "auxiliary_loss_clip": 0.0112301, "auxiliary_loss_mlp": 0.01036103, "balance_loss_clip": 1.02207828, "balance_loss_mlp": 1.04673278, "epoch": 0.46577483841875844, "flos": 58795139220480.0, "grad_norm": 1.5124786712817953, "language_loss": 0.69615203, "learning_rate": 2.3174263941761963e-06, "loss": 0.71774316, "num_input_tokens_seen": 166316710, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 7747, "time_per_iteration": 2.827749252319336 }, { "auxiliary_loss_clip": 0.01120765, "auxiliary_loss_mlp": 0.01032821, "balance_loss_clip": 1.01856935, "balance_loss_mlp": 1.04325223, "epoch": 0.4658349616714264, "flos": 31321969223040.0, "grad_norm": 1.9517977602550354, "language_loss": 0.67253149, "learning_rate": 2.317041863010978e-06, "loss": 0.69406736, "num_input_tokens_seen": 166338535, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.77734375, "step": 7748, "time_per_iteration": 2.5726351737976074 }, { "auxiliary_loss_clip": 0.01125398, "auxiliary_loss_mlp": 0.01037556, "balance_loss_clip": 1.02087283, "balance_loss_mlp": 1.04363704, "epoch": 0.46589508492409437, "flos": 14860338456960.0, "grad_norm": 2.054313392783292, "language_loss": 0.63832724, "learning_rate": 2.3166573198235007e-06, "loss": 0.65995675, "num_input_tokens_seen": 166355540, "router_z_loss_clip": 0.16699219, "router_z_loss_mlp": 0.8203125, "step": 7749, "time_per_iteration": 2.4384114742279053 }, { "auxiliary_loss_clip": 0.01127555, "auxiliary_loss_mlp": 0.01035994, "balance_loss_clip": 1.02085459, "balance_loss_mlp": 1.04789853, "epoch": 0.46595520817676234, "flos": 12895989431040.0, "grad_norm": 2.1202493239674887, "language_loss": 0.74364907, "learning_rate": 2.3162727646283456e-06, "loss": 0.76528454, "num_input_tokens_seen": 166372635, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.796875, "step": 7750, "time_per_iteration": 2.454246759414673 }, { "auxiliary_loss_clip": 0.01124959, "auxiliary_loss_mlp": 0.01029661, "balance_loss_clip": 1.01448536, "balance_loss_mlp": 1.04484379, "epoch": 0.46601533142943036, "flos": 32854169721600.0, "grad_norm": 2.0485178490982476, "language_loss": 0.74122691, "learning_rate": 2.3158881974400963e-06, "loss": 0.76277316, "num_input_tokens_seen": 166393175, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.80078125, "step": 7751, "time_per_iteration": 2.578709363937378 }, { "auxiliary_loss_clip": 0.01127412, "auxiliary_loss_mlp": 0.0103593, "balance_loss_clip": 1.02030122, "balance_loss_mlp": 1.0467366, "epoch": 0.4660754546820983, "flos": 19967517826560.0, "grad_norm": 2.01420620118424, "language_loss": 0.73488533, "learning_rate": 2.3155036182733345e-06, "loss": 0.75651878, "num_input_tokens_seen": 166408630, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8046875, "step": 7752, "time_per_iteration": 2.475719690322876 }, { "auxiliary_loss_clip": 0.01126535, "auxiliary_loss_mlp": 0.01034314, "balance_loss_clip": 1.01952004, "balance_loss_mlp": 1.04517674, "epoch": 0.4661355779347663, "flos": 26688164215680.0, "grad_norm": 2.104012950197601, "language_loss": 0.69803226, "learning_rate": 2.315119027142644e-06, "loss": 0.71964073, "num_input_tokens_seen": 166428170, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.8125, "step": 7753, "time_per_iteration": 2.5216917991638184 }, { "auxiliary_loss_clip": 0.01120155, "auxiliary_loss_mlp": 0.01032568, "balance_loss_clip": 1.01838803, "balance_loss_mlp": 1.04432642, "epoch": 0.46619570118743425, "flos": 20959442881920.0, "grad_norm": 1.8138893588509744, "language_loss": 0.73161912, "learning_rate": 2.3147344240626076e-06, "loss": 0.75314629, "num_input_tokens_seen": 166446705, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7578125, "step": 7754, "time_per_iteration": 2.524531364440918 }, { "auxiliary_loss_clip": 0.01123441, "auxiliary_loss_mlp": 0.01030174, "balance_loss_clip": 1.01512384, "balance_loss_mlp": 1.04376912, "epoch": 0.4662558244401022, "flos": 24426079355520.0, "grad_norm": 1.7208102269294383, "language_loss": 0.79176939, "learning_rate": 2.3143498090478114e-06, "loss": 0.8133055, "num_input_tokens_seen": 166466750, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.796875, "step": 7755, "time_per_iteration": 2.545006513595581 }, { "auxiliary_loss_clip": 0.01119173, "auxiliary_loss_mlp": 0.01031134, "balance_loss_clip": 1.01719201, "balance_loss_mlp": 1.04400611, "epoch": 0.4663159476927702, "flos": 20595452411520.0, "grad_norm": 1.63828547909062, "language_loss": 0.72198451, "learning_rate": 2.3139651821128382e-06, "loss": 0.74348748, "num_input_tokens_seen": 166485400, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75, "step": 7756, "time_per_iteration": 2.4987940788269043 }, { "auxiliary_loss_clip": 0.01119744, "auxiliary_loss_mlp": 0.01033369, "balance_loss_clip": 1.01874816, "balance_loss_mlp": 1.04266679, "epoch": 0.46637607094543815, "flos": 25661872823040.0, "grad_norm": 1.7416545524415656, "language_loss": 0.78476036, "learning_rate": 2.313580543272274e-06, "loss": 0.80629152, "num_input_tokens_seen": 166505730, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7734375, "step": 7757, "time_per_iteration": 4.035107612609863 }, { "auxiliary_loss_clip": 0.01120652, "auxiliary_loss_mlp": 0.01026891, "balance_loss_clip": 1.01275301, "balance_loss_mlp": 1.04374099, "epoch": 0.4664361941981061, "flos": 24273853516800.0, "grad_norm": 2.199268561365759, "language_loss": 0.66156733, "learning_rate": 2.313195892540705e-06, "loss": 0.68304276, "num_input_tokens_seen": 166523770, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.76953125, "step": 7758, "time_per_iteration": 2.5174105167388916 }, { "auxiliary_loss_clip": 0.01122249, "auxiliary_loss_mlp": 0.01040719, "balance_loss_clip": 1.02627087, "balance_loss_mlp": 1.04455614, "epoch": 0.4664963174507741, "flos": 18405871153920.0, "grad_norm": 1.6134693961432656, "language_loss": 0.74789268, "learning_rate": 2.3128112299327147e-06, "loss": 0.76952237, "num_input_tokens_seen": 166542935, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.77734375, "step": 7759, "time_per_iteration": 5.329595565795898 }, { "auxiliary_loss_clip": 0.01121458, "auxiliary_loss_mlp": 0.01036496, "balance_loss_clip": 1.02225077, "balance_loss_mlp": 1.04442072, "epoch": 0.46655644070344204, "flos": 22455122227200.0, "grad_norm": 1.5244233677767423, "language_loss": 0.7810927, "learning_rate": 2.312426555462893e-06, "loss": 0.80267227, "num_input_tokens_seen": 166563935, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 7760, "time_per_iteration": 2.5080034732818604 }, { "auxiliary_loss_clip": 0.01116863, "auxiliary_loss_mlp": 0.01034118, "balance_loss_clip": 1.01949728, "balance_loss_mlp": 1.04241765, "epoch": 0.46661656395611, "flos": 13808407731840.0, "grad_norm": 1.706842274331675, "language_loss": 0.74224329, "learning_rate": 2.3120418691458237e-06, "loss": 0.76375312, "num_input_tokens_seen": 166582175, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7421875, "step": 7761, "time_per_iteration": 3.8183324337005615 }, { "auxiliary_loss_clip": 0.01126284, "auxiliary_loss_mlp": 0.01037526, "balance_loss_clip": 1.02064562, "balance_loss_mlp": 1.04574561, "epoch": 0.466676687208778, "flos": 21652159645440.0, "grad_norm": 2.1601366319211315, "language_loss": 0.78638434, "learning_rate": 2.3116571709960956e-06, "loss": 0.80802244, "num_input_tokens_seen": 166601870, "router_z_loss_clip": 0.16894531, "router_z_loss_mlp": 0.8046875, "step": 7762, "time_per_iteration": 2.488144636154175 }, { "auxiliary_loss_clip": 0.01045921, "auxiliary_loss_mlp": 0.01011055, "balance_loss_clip": 1.00942159, "balance_loss_mlp": 1.0202992, "epoch": 0.46673681046144594, "flos": 68534259068160.0, "grad_norm": 0.8105271155198204, "language_loss": 0.59815896, "learning_rate": 2.311272461028297e-06, "loss": 0.6187287, "num_input_tokens_seen": 166668960, "router_z_loss_clip": 0.01635742, "router_z_loss_mlp": 0.2578125, "step": 7763, "time_per_iteration": 3.1806435585021973 }, { "auxiliary_loss_clip": 0.01124148, "auxiliary_loss_mlp": 0.01036185, "balance_loss_clip": 1.02006841, "balance_loss_mlp": 1.04301858, "epoch": 0.46679693371411396, "flos": 15814449469440.0, "grad_norm": 2.0804249745496706, "language_loss": 0.78996396, "learning_rate": 2.3108877392570146e-06, "loss": 0.81156731, "num_input_tokens_seen": 166686110, "router_z_loss_clip": 0.16113281, "router_z_loss_mlp": 0.8125, "step": 7764, "time_per_iteration": 2.442260980606079 }, { "auxiliary_loss_clip": 0.01120083, "auxiliary_loss_mlp": 0.01037598, "balance_loss_clip": 1.02438354, "balance_loss_mlp": 1.04424882, "epoch": 0.4668570569667819, "flos": 18514572687360.0, "grad_norm": 1.8794866809393946, "language_loss": 0.72321665, "learning_rate": 2.310503005696839e-06, "loss": 0.74479342, "num_input_tokens_seen": 166703930, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7578125, "step": 7765, "time_per_iteration": 2.4648139476776123 }, { "auxiliary_loss_clip": 0.01120392, "auxiliary_loss_mlp": 0.01034464, "balance_loss_clip": 1.01937187, "balance_loss_mlp": 1.04081559, "epoch": 0.4669171802194499, "flos": 19206643006080.0, "grad_norm": 2.0784877575013097, "language_loss": 0.77599931, "learning_rate": 2.3101182603623576e-06, "loss": 0.79754782, "num_input_tokens_seen": 166719940, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.796875, "step": 7766, "time_per_iteration": 2.46516752243042 }, { "auxiliary_loss_clip": 0.01119896, "auxiliary_loss_mlp": 0.01034809, "balance_loss_clip": 1.02049232, "balance_loss_mlp": 1.04261255, "epoch": 0.46697730347211786, "flos": 12276135406080.0, "grad_norm": 2.285248614439096, "language_loss": 0.65175724, "learning_rate": 2.3097335032681607e-06, "loss": 0.67330432, "num_input_tokens_seen": 166738285, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 7767, "time_per_iteration": 2.530743360519409 }, { "auxiliary_loss_clip": 0.01123052, "auxiliary_loss_mlp": 0.01035955, "balance_loss_clip": 1.02167964, "balance_loss_mlp": 1.04534698, "epoch": 0.4670374267247858, "flos": 23586739274880.0, "grad_norm": 1.9587818394142547, "language_loss": 0.74319124, "learning_rate": 2.3093487344288393e-06, "loss": 0.76478136, "num_input_tokens_seen": 166758170, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.77734375, "step": 7768, "time_per_iteration": 2.493586301803589 }, { "auxiliary_loss_clip": 0.01121475, "auxiliary_loss_mlp": 0.0103176, "balance_loss_clip": 1.01765788, "balance_loss_mlp": 1.04387188, "epoch": 0.4670975499774538, "flos": 15991093578240.0, "grad_norm": 1.5531321480596336, "language_loss": 0.70766997, "learning_rate": 2.308963953858982e-06, "loss": 0.72920233, "num_input_tokens_seen": 166775750, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.77734375, "step": 7769, "time_per_iteration": 2.4812893867492676 }, { "auxiliary_loss_clip": 0.01119208, "auxiliary_loss_mlp": 0.01032383, "balance_loss_clip": 1.01835203, "balance_loss_mlp": 1.04170585, "epoch": 0.46715767323012175, "flos": 15377596260480.0, "grad_norm": 1.849289431127112, "language_loss": 0.8160497, "learning_rate": 2.3085791615731803e-06, "loss": 0.8375656, "num_input_tokens_seen": 166791720, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7734375, "step": 7770, "time_per_iteration": 2.468020439147949 }, { "auxiliary_loss_clip": 0.01047261, "auxiliary_loss_mlp": 0.01008312, "balance_loss_clip": 1.00685811, "balance_loss_mlp": 1.02173638, "epoch": 0.4672177964827897, "flos": 60252217401600.0, "grad_norm": 0.8052270502335819, "language_loss": 0.55704671, "learning_rate": 2.3081943575860265e-06, "loss": 0.57760245, "num_input_tokens_seen": 166856360, "router_z_loss_clip": 0.01452637, "router_z_loss_mlp": 0.25390625, "step": 7771, "time_per_iteration": 3.121711015701294 }, { "auxiliary_loss_clip": 0.01118849, "auxiliary_loss_mlp": 0.01034104, "balance_loss_clip": 1.01992369, "balance_loss_mlp": 1.04292989, "epoch": 0.4672779197354577, "flos": 27636134002560.0, "grad_norm": 2.3295301786861495, "language_loss": 0.65864456, "learning_rate": 2.3078095419121117e-06, "loss": 0.68017411, "num_input_tokens_seen": 166875925, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7578125, "step": 7772, "time_per_iteration": 2.526005268096924 }, { "auxiliary_loss_clip": 0.01120049, "auxiliary_loss_mlp": 0.0103194, "balance_loss_clip": 1.01808763, "balance_loss_mlp": 1.04384172, "epoch": 0.46733804298812565, "flos": 31394257344000.0, "grad_norm": 2.2343737959176146, "language_loss": 0.63778353, "learning_rate": 2.3074247145660283e-06, "loss": 0.65930343, "num_input_tokens_seen": 166896520, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76171875, "step": 7773, "time_per_iteration": 2.5660955905914307 }, { "auxiliary_loss_clip": 0.01119059, "auxiliary_loss_mlp": 0.01035443, "balance_loss_clip": 1.0201962, "balance_loss_mlp": 1.04161251, "epoch": 0.4673981662407936, "flos": 19500607912320.0, "grad_norm": 1.722304847337428, "language_loss": 0.80523664, "learning_rate": 2.3070398755623685e-06, "loss": 0.82678163, "num_input_tokens_seen": 166915370, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7734375, "step": 7774, "time_per_iteration": 2.4683680534362793 }, { "auxiliary_loss_clip": 0.01121513, "auxiliary_loss_mlp": 0.0102992, "balance_loss_clip": 1.0158174, "balance_loss_mlp": 1.04268992, "epoch": 0.4674582894934616, "flos": 20521835487360.0, "grad_norm": 1.7302584785365283, "language_loss": 0.77428657, "learning_rate": 2.306655024915726e-06, "loss": 0.79580092, "num_input_tokens_seen": 166934875, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7890625, "step": 7775, "time_per_iteration": 2.4984517097473145 }, { "auxiliary_loss_clip": 0.01116823, "auxiliary_loss_mlp": 0.01031124, "balance_loss_clip": 1.01631844, "balance_loss_mlp": 1.04121923, "epoch": 0.46751841274612954, "flos": 22090952188800.0, "grad_norm": 1.8445013517837714, "language_loss": 0.69964892, "learning_rate": 2.306270162640694e-06, "loss": 0.72112834, "num_input_tokens_seen": 166954285, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7578125, "step": 7776, "time_per_iteration": 2.5116350650787354 }, { "auxiliary_loss_clip": 0.01120629, "auxiliary_loss_mlp": 0.01033066, "balance_loss_clip": 1.02000618, "balance_loss_mlp": 1.04396653, "epoch": 0.46757853599879756, "flos": 26980082046720.0, "grad_norm": 1.3788234027146216, "language_loss": 0.73839349, "learning_rate": 2.3058852887518678e-06, "loss": 0.75993043, "num_input_tokens_seen": 166975975, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.765625, "step": 7777, "time_per_iteration": 2.5513598918914795 }, { "auxiliary_loss_clip": 0.01119424, "auxiliary_loss_mlp": 0.01036417, "balance_loss_clip": 1.02198052, "balance_loss_mlp": 1.04212499, "epoch": 0.4676386592514655, "flos": 24134053783680.0, "grad_norm": 3.3178812558385546, "language_loss": 0.69563133, "learning_rate": 2.3055004032638394e-06, "loss": 0.71718979, "num_input_tokens_seen": 166996140, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 7778, "time_per_iteration": 2.5145061016082764 }, { "auxiliary_loss_clip": 0.01119959, "auxiliary_loss_mlp": 0.01038291, "balance_loss_clip": 1.02332437, "balance_loss_mlp": 1.04207349, "epoch": 0.4676987825041335, "flos": 25483720343040.0, "grad_norm": 4.456792774708776, "language_loss": 0.73703229, "learning_rate": 2.305115506191206e-06, "loss": 0.75861478, "num_input_tokens_seen": 167016105, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.77734375, "step": 7779, "time_per_iteration": 2.617652416229248 }, { "auxiliary_loss_clip": 0.01116026, "auxiliary_loss_mlp": 0.01033741, "balance_loss_clip": 1.0206399, "balance_loss_mlp": 1.04152739, "epoch": 0.46775890575680146, "flos": 21945298538880.0, "grad_norm": 1.3862937188720612, "language_loss": 0.72040606, "learning_rate": 2.304730597548562e-06, "loss": 0.74190378, "num_input_tokens_seen": 167036185, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.74609375, "step": 7780, "time_per_iteration": 2.561243772506714 }, { "auxiliary_loss_clip": 0.0112385, "auxiliary_loss_mlp": 0.0103548, "balance_loss_clip": 1.02087736, "balance_loss_mlp": 1.04345989, "epoch": 0.4678190290094694, "flos": 25228395492480.0, "grad_norm": 1.9133628570429932, "language_loss": 0.7412585, "learning_rate": 2.3043456773505023e-06, "loss": 0.76285183, "num_input_tokens_seen": 167054515, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.8046875, "step": 7781, "time_per_iteration": 2.570611000061035 }, { "auxiliary_loss_clip": 0.01121182, "auxiliary_loss_mlp": 0.01036309, "balance_loss_clip": 1.02180135, "balance_loss_mlp": 1.04114115, "epoch": 0.4678791522621374, "flos": 32268358811520.0, "grad_norm": 1.6697986889673408, "language_loss": 0.63511854, "learning_rate": 2.3039607456116252e-06, "loss": 0.65669346, "num_input_tokens_seen": 167077245, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.80078125, "step": 7782, "time_per_iteration": 2.5735816955566406 }, { "auxiliary_loss_clip": 0.01122561, "auxiliary_loss_mlp": 0.01042872, "balance_loss_clip": 1.02843583, "balance_loss_mlp": 1.04329491, "epoch": 0.46793927551480535, "flos": 27046480337280.0, "grad_norm": 3.1910254276062404, "language_loss": 0.63327122, "learning_rate": 2.3035758023465254e-06, "loss": 0.65492558, "num_input_tokens_seen": 167097235, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.79296875, "step": 7783, "time_per_iteration": 2.5419325828552246 }, { "auxiliary_loss_clip": 0.01127644, "auxiliary_loss_mlp": 0.01039302, "balance_loss_clip": 1.02345872, "balance_loss_mlp": 1.04595947, "epoch": 0.4679993987674733, "flos": 17457398576640.0, "grad_norm": 4.049060121834886, "language_loss": 0.68580914, "learning_rate": 2.303190847569801e-06, "loss": 0.70747864, "num_input_tokens_seen": 167113155, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.81640625, "step": 7784, "time_per_iteration": 2.438863515853882 }, { "auxiliary_loss_clip": 0.01117333, "auxiliary_loss_mlp": 0.01030606, "balance_loss_clip": 1.01750493, "balance_loss_mlp": 1.04125738, "epoch": 0.4680595220201413, "flos": 17165121609600.0, "grad_norm": 1.9321991379196921, "language_loss": 0.85067129, "learning_rate": 2.3028058812960497e-06, "loss": 0.87215066, "num_input_tokens_seen": 167131765, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7578125, "step": 7785, "time_per_iteration": 2.4441494941711426 }, { "auxiliary_loss_clip": 0.01120704, "auxiliary_loss_mlp": 0.01031222, "balance_loss_clip": 1.01691675, "balance_loss_mlp": 1.04256511, "epoch": 0.46811964527280925, "flos": 11327591001600.0, "grad_norm": 4.197318382843688, "language_loss": 0.77478057, "learning_rate": 2.3024209035398678e-06, "loss": 0.79629982, "num_input_tokens_seen": 167149030, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 7786, "time_per_iteration": 2.429405450820923 }, { "auxiliary_loss_clip": 0.01114155, "auxiliary_loss_mlp": 0.01028903, "balance_loss_clip": 1.01584971, "balance_loss_mlp": 1.04051232, "epoch": 0.4681797685254772, "flos": 24278809593600.0, "grad_norm": 1.9480685155163548, "language_loss": 0.74664611, "learning_rate": 2.302035914315856e-06, "loss": 0.76807666, "num_input_tokens_seen": 167167375, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 7787, "time_per_iteration": 2.4936363697052 }, { "auxiliary_loss_clip": 0.01120249, "auxiliary_loss_mlp": 0.0103428, "balance_loss_clip": 1.02057719, "balance_loss_mlp": 1.04360068, "epoch": 0.4682398917781452, "flos": 31650372293760.0, "grad_norm": 1.9232607635701309, "language_loss": 0.65445817, "learning_rate": 2.3016509136386116e-06, "loss": 0.67600346, "num_input_tokens_seen": 167188065, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 7788, "time_per_iteration": 2.5551161766052246 }, { "auxiliary_loss_clip": 0.01117891, "auxiliary_loss_mlp": 0.01030508, "balance_loss_clip": 1.01828921, "balance_loss_mlp": 1.04225779, "epoch": 0.46830001503081314, "flos": 28110765340800.0, "grad_norm": 1.6789638516702434, "language_loss": 0.64555657, "learning_rate": 2.3012659015227343e-06, "loss": 0.66704059, "num_input_tokens_seen": 167209675, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7578125, "step": 7789, "time_per_iteration": 2.5519731044769287 }, { "auxiliary_loss_clip": 0.01045443, "auxiliary_loss_mlp": 0.01008631, "balance_loss_clip": 1.00697422, "balance_loss_mlp": 1.01955724, "epoch": 0.4683601382834811, "flos": 57881718316800.0, "grad_norm": 0.706641367553302, "language_loss": 0.61924273, "learning_rate": 2.300880877982825e-06, "loss": 0.63978344, "num_input_tokens_seen": 167273940, "router_z_loss_clip": 0.01660156, "router_z_loss_mlp": 0.2578125, "step": 7790, "time_per_iteration": 3.178279399871826 }, { "auxiliary_loss_clip": 0.01118704, "auxiliary_loss_mlp": 0.0103465, "balance_loss_clip": 1.02097714, "balance_loss_mlp": 1.04412019, "epoch": 0.46842026153614913, "flos": 21871933009920.0, "grad_norm": 1.6191766336880904, "language_loss": 0.79152083, "learning_rate": 2.3004958430334808e-06, "loss": 0.81305438, "num_input_tokens_seen": 167292730, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.74609375, "step": 7791, "time_per_iteration": 2.496222734451294 }, { "auxiliary_loss_clip": 0.01117914, "auxiliary_loss_mlp": 0.01034182, "balance_loss_clip": 1.0210278, "balance_loss_mlp": 1.04179907, "epoch": 0.4684803847888171, "flos": 24900818434560.0, "grad_norm": 1.5037881795539887, "language_loss": 0.75003672, "learning_rate": 2.3001107966893052e-06, "loss": 0.77155763, "num_input_tokens_seen": 167313460, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.76171875, "step": 7792, "time_per_iteration": 2.5401785373687744 }, { "auxiliary_loss_clip": 0.01115309, "auxiliary_loss_mlp": 0.01030287, "balance_loss_clip": 1.01757312, "balance_loss_mlp": 1.04108942, "epoch": 0.46854050804148506, "flos": 26251670142720.0, "grad_norm": 1.7211518194303947, "language_loss": 0.68011808, "learning_rate": 2.299725738964898e-06, "loss": 0.70157397, "num_input_tokens_seen": 167335385, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7421875, "step": 7793, "time_per_iteration": 2.5647695064544678 }, { "auxiliary_loss_clip": 0.01117026, "auxiliary_loss_mlp": 0.01027425, "balance_loss_clip": 1.01492012, "balance_loss_mlp": 1.04275453, "epoch": 0.468600631294153, "flos": 21579799697280.0, "grad_norm": 1.630263994679267, "language_loss": 0.736408, "learning_rate": 2.2993406698748607e-06, "loss": 0.75785255, "num_input_tokens_seen": 167353625, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7421875, "step": 7794, "time_per_iteration": 2.485427141189575 }, { "auxiliary_loss_clip": 0.01119797, "auxiliary_loss_mlp": 0.01033169, "balance_loss_clip": 1.01932263, "balance_loss_mlp": 1.04476249, "epoch": 0.468660754546821, "flos": 25885632597120.0, "grad_norm": 1.7545019035362153, "language_loss": 0.63279939, "learning_rate": 2.2989555894337953e-06, "loss": 0.65432906, "num_input_tokens_seen": 167374565, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75, "step": 7795, "time_per_iteration": 2.5290310382843018 }, { "auxiliary_loss_clip": 0.01113238, "auxiliary_loss_mlp": 0.01026323, "balance_loss_clip": 1.01362729, "balance_loss_mlp": 1.04062295, "epoch": 0.46872087779948896, "flos": 35475001666560.0, "grad_norm": 1.6824042546444282, "language_loss": 0.67927974, "learning_rate": 2.298570497656304e-06, "loss": 0.70067537, "num_input_tokens_seen": 167395010, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 7796, "time_per_iteration": 2.58699107170105 }, { "auxiliary_loss_clip": 0.01116439, "auxiliary_loss_mlp": 0.01026848, "balance_loss_clip": 1.01390219, "balance_loss_mlp": 1.04103637, "epoch": 0.4687810010521569, "flos": 26396425952640.0, "grad_norm": 2.6501347993629216, "language_loss": 0.70041823, "learning_rate": 2.2981853945569894e-06, "loss": 0.72185111, "num_input_tokens_seen": 167415285, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75390625, "step": 7797, "time_per_iteration": 2.51607084274292 }, { "auxiliary_loss_clip": 0.0111911, "auxiliary_loss_mlp": 0.01033023, "balance_loss_clip": 1.01914144, "balance_loss_mlp": 1.0428834, "epoch": 0.4688411243048249, "flos": 19972761212160.0, "grad_norm": 2.2781446811624906, "language_loss": 0.67085284, "learning_rate": 2.297800280150454e-06, "loss": 0.69237423, "num_input_tokens_seen": 167432405, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76171875, "step": 7798, "time_per_iteration": 2.456291675567627 }, { "auxiliary_loss_clip": 0.01044888, "auxiliary_loss_mlp": 0.01006393, "balance_loss_clip": 1.00492656, "balance_loss_mlp": 1.01944149, "epoch": 0.46890124755749285, "flos": 63977015900160.0, "grad_norm": 0.9418074335928491, "language_loss": 0.64554256, "learning_rate": 2.2974151544513033e-06, "loss": 0.66605538, "num_input_tokens_seen": 167499365, "router_z_loss_clip": 0.01464844, "router_z_loss_mlp": 0.25390625, "step": 7799, "time_per_iteration": 4.747651100158691 }, { "auxiliary_loss_clip": 0.01114906, "auxiliary_loss_mlp": 0.01026021, "balance_loss_clip": 1.01326597, "balance_loss_mlp": 1.0406816, "epoch": 0.4689613708101608, "flos": 23768985905280.0, "grad_norm": 1.3349480447896178, "language_loss": 0.72608674, "learning_rate": 2.2970300174741395e-06, "loss": 0.74749601, "num_input_tokens_seen": 167520390, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7421875, "step": 7800, "time_per_iteration": 3.9074606895446777 }, { "auxiliary_loss_clip": 0.01112913, "auxiliary_loss_mlp": 0.0102998, "balance_loss_clip": 1.01819015, "balance_loss_mlp": 1.04170442, "epoch": 0.4690214940628288, "flos": 24788705109120.0, "grad_norm": 1.8963559104768393, "language_loss": 0.72008622, "learning_rate": 2.296644869233568e-06, "loss": 0.74151516, "num_input_tokens_seen": 167539865, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.7109375, "step": 7801, "time_per_iteration": 3.9335319995880127 }, { "auxiliary_loss_clip": 0.01120608, "auxiliary_loss_mlp": 0.01036778, "balance_loss_clip": 1.02216315, "balance_loss_mlp": 1.04228997, "epoch": 0.46908161731549675, "flos": 18077324428800.0, "grad_norm": 2.2064669937077412, "language_loss": 0.62606031, "learning_rate": 2.2962597097441936e-06, "loss": 0.64763421, "num_input_tokens_seen": 167558190, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 7802, "time_per_iteration": 2.4538421630859375 }, { "auxiliary_loss_clip": 0.01117056, "auxiliary_loss_mlp": 0.01035713, "balance_loss_clip": 1.02247453, "balance_loss_mlp": 1.04031253, "epoch": 0.4691417405681647, "flos": 25703350053120.0, "grad_norm": 2.1986618099965516, "language_loss": 0.73271424, "learning_rate": 2.2958745390206206e-06, "loss": 0.75424194, "num_input_tokens_seen": 167577685, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.765625, "step": 7803, "time_per_iteration": 3.874180793762207 }, { "auxiliary_loss_clip": 0.01115023, "auxiliary_loss_mlp": 0.01036547, "balance_loss_clip": 1.02385116, "balance_loss_mlp": 1.04071617, "epoch": 0.46920186382083273, "flos": 17457039440640.0, "grad_norm": 1.699484074281973, "language_loss": 0.77237976, "learning_rate": 2.2954893570774558e-06, "loss": 0.79389548, "num_input_tokens_seen": 167596390, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7421875, "step": 7804, "time_per_iteration": 2.468505859375 }, { "auxiliary_loss_clip": 0.01117164, "auxiliary_loss_mlp": 0.01028461, "balance_loss_clip": 1.01586032, "balance_loss_mlp": 1.04235291, "epoch": 0.4692619870735007, "flos": 20339445202560.0, "grad_norm": 1.809391417604168, "language_loss": 0.77204305, "learning_rate": 2.295104163929305e-06, "loss": 0.79349929, "num_input_tokens_seen": 167614980, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.74609375, "step": 7805, "time_per_iteration": 2.445439100265503 }, { "auxiliary_loss_clip": 0.01124675, "auxiliary_loss_mlp": 0.01042584, "balance_loss_clip": 1.02836215, "balance_loss_mlp": 1.04435563, "epoch": 0.46932211032616866, "flos": 29496558003840.0, "grad_norm": 1.626013633326495, "language_loss": 0.82857746, "learning_rate": 2.2947189595907742e-06, "loss": 0.85025001, "num_input_tokens_seen": 167635895, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.8046875, "step": 7806, "time_per_iteration": 2.5408878326416016 }, { "auxiliary_loss_clip": 0.0111856, "auxiliary_loss_mlp": 0.01034132, "balance_loss_clip": 1.02104259, "balance_loss_mlp": 1.04202735, "epoch": 0.4693822335788366, "flos": 36211242735360.0, "grad_norm": 2.136639927278331, "language_loss": 0.77208495, "learning_rate": 2.294333744076472e-06, "loss": 0.79361188, "num_input_tokens_seen": 167657440, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.765625, "step": 7807, "time_per_iteration": 2.599431037902832 }, { "auxiliary_loss_clip": 0.01117716, "auxiliary_loss_mlp": 0.0103396, "balance_loss_clip": 1.01993537, "balance_loss_mlp": 1.0421344, "epoch": 0.4694423568315046, "flos": 20338978325760.0, "grad_norm": 1.8043824642973212, "language_loss": 0.51687276, "learning_rate": 2.2939485174010035e-06, "loss": 0.5383895, "num_input_tokens_seen": 167675025, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 7808, "time_per_iteration": 2.4692230224609375 }, { "auxiliary_loss_clip": 0.01046071, "auxiliary_loss_mlp": 0.01004008, "balance_loss_clip": 1.00259542, "balance_loss_mlp": 1.02058887, "epoch": 0.46950248008417256, "flos": 64326353621760.0, "grad_norm": 0.7833025237810419, "language_loss": 0.57723618, "learning_rate": 2.293563279578978e-06, "loss": 0.59773695, "num_input_tokens_seen": 167729635, "router_z_loss_clip": 0.01409912, "router_z_loss_mlp": 0.25390625, "step": 7809, "time_per_iteration": 2.9572951793670654 }, { "auxiliary_loss_clip": 0.01120868, "auxiliary_loss_mlp": 0.01037408, "balance_loss_clip": 1.0242238, "balance_loss_mlp": 1.04407978, "epoch": 0.4695626033368405, "flos": 19200106730880.0, "grad_norm": 2.01599130973476, "language_loss": 0.71687865, "learning_rate": 2.2931780306250045e-06, "loss": 0.73846143, "num_input_tokens_seen": 167745135, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.765625, "step": 7810, "time_per_iteration": 2.4624783992767334 }, { "auxiliary_loss_clip": 0.01118781, "auxiliary_loss_mlp": 0.01032394, "balance_loss_clip": 1.0194416, "balance_loss_mlp": 1.04232621, "epoch": 0.4696227265895085, "flos": 23002436736000.0, "grad_norm": 1.9039848645266662, "language_loss": 0.80566639, "learning_rate": 2.29279277055369e-06, "loss": 0.82717812, "num_input_tokens_seen": 167763875, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.765625, "step": 7811, "time_per_iteration": 2.4981865882873535 }, { "auxiliary_loss_clip": 0.01120623, "auxiliary_loss_mlp": 0.01036909, "balance_loss_clip": 1.02330112, "balance_loss_mlp": 1.04355025, "epoch": 0.46968284984217645, "flos": 21870855601920.0, "grad_norm": 1.662039215808902, "language_loss": 0.80512685, "learning_rate": 2.292407499379644e-06, "loss": 0.82670218, "num_input_tokens_seen": 167784895, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7734375, "step": 7812, "time_per_iteration": 2.509134292602539 }, { "auxiliary_loss_clip": 0.01113951, "auxiliary_loss_mlp": 0.01029459, "balance_loss_clip": 1.01685226, "balance_loss_mlp": 1.04089499, "epoch": 0.4697429730948444, "flos": 19974987855360.0, "grad_norm": 1.757855345166494, "language_loss": 0.74641919, "learning_rate": 2.292022217117477e-06, "loss": 0.76785326, "num_input_tokens_seen": 167803185, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.73046875, "step": 7813, "time_per_iteration": 2.4815120697021484 }, { "auxiliary_loss_clip": 0.01115689, "auxiliary_loss_mlp": 0.01030517, "balance_loss_clip": 1.01647973, "balance_loss_mlp": 1.04042196, "epoch": 0.4698030963475124, "flos": 15156206784000.0, "grad_norm": 2.1428066154987273, "language_loss": 0.84915578, "learning_rate": 2.291636923781798e-06, "loss": 0.87061787, "num_input_tokens_seen": 167816550, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75390625, "step": 7814, "time_per_iteration": 2.5062756538391113 }, { "auxiliary_loss_clip": 0.01114024, "auxiliary_loss_mlp": 0.01034879, "balance_loss_clip": 1.02156293, "balance_loss_mlp": 1.04093027, "epoch": 0.46986321960018035, "flos": 15151178880000.0, "grad_norm": 1.8134884187649492, "language_loss": 0.8189376, "learning_rate": 2.291251619387217e-06, "loss": 0.84042662, "num_input_tokens_seen": 167831845, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 7815, "time_per_iteration": 2.4482812881469727 }, { "auxiliary_loss_clip": 0.01117777, "auxiliary_loss_mlp": 0.0103375, "balance_loss_clip": 1.01900387, "balance_loss_mlp": 1.04187977, "epoch": 0.4699233428528483, "flos": 23108911626240.0, "grad_norm": 2.1449664149445153, "language_loss": 0.77795064, "learning_rate": 2.2908663039483468e-06, "loss": 0.79946595, "num_input_tokens_seen": 167850360, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7578125, "step": 7816, "time_per_iteration": 2.5050930976867676 }, { "auxiliary_loss_clip": 0.01044452, "auxiliary_loss_mlp": 0.01002555, "balance_loss_clip": 1.00093365, "balance_loss_mlp": 1.01908422, "epoch": 0.46998346610551633, "flos": 68105558246400.0, "grad_norm": 0.8465972830945143, "language_loss": 0.59080434, "learning_rate": 2.290480977479796e-06, "loss": 0.61127436, "num_input_tokens_seen": 167908660, "router_z_loss_clip": 0.01623535, "router_z_loss_mlp": 0.25390625, "step": 7817, "time_per_iteration": 3.109132766723633 }, { "auxiliary_loss_clip": 0.01113092, "auxiliary_loss_mlp": 0.01030494, "balance_loss_clip": 1.01731539, "balance_loss_mlp": 1.04091644, "epoch": 0.4700435893581843, "flos": 24129456842880.0, "grad_norm": 1.6688827423042698, "language_loss": 0.79518223, "learning_rate": 2.2900956399961775e-06, "loss": 0.81661808, "num_input_tokens_seen": 167927905, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 7818, "time_per_iteration": 2.507923126220703 }, { "auxiliary_loss_clip": 0.0111504, "auxiliary_loss_mlp": 0.01030719, "balance_loss_clip": 1.01757574, "balance_loss_mlp": 1.03962862, "epoch": 0.47010371261085226, "flos": 20150518642560.0, "grad_norm": 1.8852711689849897, "language_loss": 0.84272075, "learning_rate": 2.289710291512104e-06, "loss": 0.8641783, "num_input_tokens_seen": 167945995, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.75390625, "step": 7819, "time_per_iteration": 2.54278826713562 }, { "auxiliary_loss_clip": 0.01119217, "auxiliary_loss_mlp": 0.01035237, "balance_loss_clip": 1.02111125, "balance_loss_mlp": 1.04133821, "epoch": 0.47016383586352023, "flos": 15122199582720.0, "grad_norm": 1.9457387840475606, "language_loss": 0.75982195, "learning_rate": 2.289324932042186e-06, "loss": 0.78136647, "num_input_tokens_seen": 167963380, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.77734375, "step": 7820, "time_per_iteration": 2.4781086444854736 }, { "auxiliary_loss_clip": 0.0111536, "auxiliary_loss_mlp": 0.01033445, "balance_loss_clip": 1.01984882, "balance_loss_mlp": 1.04252887, "epoch": 0.4702239591161882, "flos": 13552975140480.0, "grad_norm": 1.99753702717165, "language_loss": 0.74463356, "learning_rate": 2.288939561601039e-06, "loss": 0.76612163, "num_input_tokens_seen": 167981740, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 7821, "time_per_iteration": 2.446207284927368 }, { "auxiliary_loss_clip": 0.01114739, "auxiliary_loss_mlp": 0.01035271, "balance_loss_clip": 1.02256334, "balance_loss_mlp": 1.04143739, "epoch": 0.47028408236885616, "flos": 24276511123200.0, "grad_norm": 2.6559994115921137, "language_loss": 0.89082861, "learning_rate": 2.2885541802032746e-06, "loss": 0.91232878, "num_input_tokens_seen": 167999380, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.734375, "step": 7822, "time_per_iteration": 2.497889995574951 }, { "auxiliary_loss_clip": 0.01113657, "auxiliary_loss_mlp": 0.01027544, "balance_loss_clip": 1.01477051, "balance_loss_mlp": 1.04035974, "epoch": 0.4703442056215241, "flos": 22856926740480.0, "grad_norm": 1.4993135874904036, "language_loss": 0.7994765, "learning_rate": 2.2881687878635055e-06, "loss": 0.82088852, "num_input_tokens_seen": 168018395, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.734375, "step": 7823, "time_per_iteration": 2.467378616333008 }, { "auxiliary_loss_clip": 0.01045372, "auxiliary_loss_mlp": 0.00999972, "balance_loss_clip": 0.99855959, "balance_loss_mlp": 1.02002311, "epoch": 0.4704043288741921, "flos": 69240227950080.0, "grad_norm": 0.7048285466941512, "language_loss": 0.56669444, "learning_rate": 2.2877833845963487e-06, "loss": 0.58714789, "num_input_tokens_seen": 168084080, "router_z_loss_clip": 0.01409912, "router_z_loss_mlp": 0.25390625, "step": 7824, "time_per_iteration": 3.174670696258545 }, { "auxiliary_loss_clip": 0.01118149, "auxiliary_loss_mlp": 0.01034849, "balance_loss_clip": 1.02050233, "balance_loss_mlp": 1.04124844, "epoch": 0.47046445212686006, "flos": 18041090584320.0, "grad_norm": 1.88207024568314, "language_loss": 0.81158501, "learning_rate": 2.2873979704164157e-06, "loss": 0.83311498, "num_input_tokens_seen": 168101555, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.76953125, "step": 7825, "time_per_iteration": 2.4630603790283203 }, { "auxiliary_loss_clip": 0.01117266, "auxiliary_loss_mlp": 0.01029481, "balance_loss_clip": 1.01550984, "balance_loss_mlp": 1.04146731, "epoch": 0.470524575379528, "flos": 23951448017280.0, "grad_norm": 1.6607121033185575, "language_loss": 0.66603887, "learning_rate": 2.287012545338324e-06, "loss": 0.68750632, "num_input_tokens_seen": 168121530, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 7826, "time_per_iteration": 2.5491943359375 }, { "auxiliary_loss_clip": 0.01115103, "auxiliary_loss_mlp": 0.01035299, "balance_loss_clip": 1.02173853, "balance_loss_mlp": 1.03892875, "epoch": 0.470584698632196, "flos": 18113558273280.0, "grad_norm": 1.7977316043635734, "language_loss": 0.83722281, "learning_rate": 2.2866271093766877e-06, "loss": 0.8587268, "num_input_tokens_seen": 168140335, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.76171875, "step": 7827, "time_per_iteration": 2.4756102561950684 }, { "auxiliary_loss_clip": 0.01044239, "auxiliary_loss_mlp": 0.01002413, "balance_loss_clip": 1.00104225, "balance_loss_mlp": 1.0190568, "epoch": 0.47064482188486395, "flos": 57251916224640.0, "grad_norm": 1.3874543610224894, "language_loss": 0.55704385, "learning_rate": 2.286241662546122e-06, "loss": 0.57751036, "num_input_tokens_seen": 168200535, "router_z_loss_clip": 0.01373291, "router_z_loss_mlp": 0.25195312, "step": 7828, "time_per_iteration": 3.0828559398651123 }, { "auxiliary_loss_clip": 0.01113413, "auxiliary_loss_mlp": 0.01028987, "balance_loss_clip": 1.01587963, "balance_loss_mlp": 1.03955126, "epoch": 0.4707049451375319, "flos": 17895077798400.0, "grad_norm": 1.912763554857496, "language_loss": 0.80865371, "learning_rate": 2.285856204861245e-06, "loss": 0.83007771, "num_input_tokens_seen": 168219610, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 7829, "time_per_iteration": 2.4822607040405273 }, { "auxiliary_loss_clip": 0.01114505, "auxiliary_loss_mlp": 0.01034194, "balance_loss_clip": 1.02154016, "balance_loss_mlp": 1.04118109, "epoch": 0.47076506839019994, "flos": 25232669210880.0, "grad_norm": 1.3908647707377664, "language_loss": 0.76216221, "learning_rate": 2.2854707363366703e-06, "loss": 0.78364915, "num_input_tokens_seen": 168242505, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.734375, "step": 7830, "time_per_iteration": 2.54188871383667 }, { "auxiliary_loss_clip": 0.01115105, "auxiliary_loss_mlp": 0.01031919, "balance_loss_clip": 1.01795959, "balance_loss_mlp": 1.0413115, "epoch": 0.4708251916428679, "flos": 13479681438720.0, "grad_norm": 1.9183021779321308, "language_loss": 0.78971112, "learning_rate": 2.2850852569870177e-06, "loss": 0.81118131, "num_input_tokens_seen": 168260220, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.734375, "step": 7831, "time_per_iteration": 2.448561668395996 }, { "auxiliary_loss_clip": 0.01119428, "auxiliary_loss_mlp": 0.01034673, "balance_loss_clip": 1.02073717, "balance_loss_mlp": 1.03957832, "epoch": 0.47088531489553587, "flos": 30147833450880.0, "grad_norm": 1.7623124436204969, "language_loss": 0.75600863, "learning_rate": 2.2846997668269033e-06, "loss": 0.77754962, "num_input_tokens_seen": 168277360, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.80078125, "step": 7832, "time_per_iteration": 2.5298280715942383 }, { "auxiliary_loss_clip": 0.01114387, "auxiliary_loss_mlp": 0.01026773, "balance_loss_clip": 1.01429737, "balance_loss_mlp": 1.04114604, "epoch": 0.47094543814820383, "flos": 21798280172160.0, "grad_norm": 1.4191916468713874, "language_loss": 0.74678093, "learning_rate": 2.2843142658709454e-06, "loss": 0.76819253, "num_input_tokens_seen": 168296605, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.734375, "step": 7833, "time_per_iteration": 2.493258476257324 }, { "auxiliary_loss_clip": 0.01113906, "auxiliary_loss_mlp": 0.01038584, "balance_loss_clip": 1.02455258, "balance_loss_mlp": 1.03920436, "epoch": 0.4710055614008718, "flos": 23003011353600.0, "grad_norm": 1.8606585538386702, "language_loss": 0.75394285, "learning_rate": 2.283928754133762e-06, "loss": 0.77546775, "num_input_tokens_seen": 168316205, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.74609375, "step": 7834, "time_per_iteration": 2.498401641845703 }, { "auxiliary_loss_clip": 0.01114543, "auxiliary_loss_mlp": 0.01035441, "balance_loss_clip": 1.02248931, "balance_loss_mlp": 1.04159689, "epoch": 0.47106568465353976, "flos": 42741346452480.0, "grad_norm": 1.3819695211739622, "language_loss": 0.6609782, "learning_rate": 2.283543231629972e-06, "loss": 0.68247807, "num_input_tokens_seen": 168338935, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 7835, "time_per_iteration": 2.6957077980041504 }, { "auxiliary_loss_clip": 0.01041713, "auxiliary_loss_mlp": 0.01005966, "balance_loss_clip": 1.00457716, "balance_loss_mlp": 1.0163132, "epoch": 0.4711258079062077, "flos": 68554008570240.0, "grad_norm": 0.87250835694274, "language_loss": 0.62134731, "learning_rate": 2.283157698374194e-06, "loss": 0.64182413, "num_input_tokens_seen": 168392800, "router_z_loss_clip": 0.01391602, "router_z_loss_mlp": 0.25390625, "step": 7836, "time_per_iteration": 3.0786092281341553 }, { "auxiliary_loss_clip": 0.01119907, "auxiliary_loss_mlp": 0.01034454, "balance_loss_clip": 1.02121639, "balance_loss_mlp": 1.04003453, "epoch": 0.4711859311588757, "flos": 25446588658560.0, "grad_norm": 1.9622169160910259, "language_loss": 0.69629109, "learning_rate": 2.2827721543810475e-06, "loss": 0.71783477, "num_input_tokens_seen": 168412940, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.796875, "step": 7837, "time_per_iteration": 2.526428699493408 }, { "auxiliary_loss_clip": 0.0111634, "auxiliary_loss_mlp": 0.0104013, "balance_loss_clip": 1.02571797, "balance_loss_mlp": 1.04018962, "epoch": 0.47124605441154366, "flos": 21981891519360.0, "grad_norm": 1.8652810503183095, "language_loss": 0.66788173, "learning_rate": 2.282386599665153e-06, "loss": 0.68944645, "num_input_tokens_seen": 168431995, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76171875, "step": 7838, "time_per_iteration": 2.475947856903076 }, { "auxiliary_loss_clip": 0.01119358, "auxiliary_loss_mlp": 0.01029855, "balance_loss_clip": 1.01591969, "balance_loss_mlp": 1.04140651, "epoch": 0.4713061776642116, "flos": 25412689198080.0, "grad_norm": 2.0433252790244496, "language_loss": 0.76898086, "learning_rate": 2.2820010342411304e-06, "loss": 0.79047304, "num_input_tokens_seen": 168454585, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78125, "step": 7839, "time_per_iteration": 2.554931163787842 }, { "auxiliary_loss_clip": 0.01111887, "auxiliary_loss_mlp": 0.01035218, "balance_loss_clip": 1.02230716, "balance_loss_mlp": 1.03921545, "epoch": 0.4713663009168796, "flos": 26542259170560.0, "grad_norm": 1.8343222025771782, "language_loss": 0.72732443, "learning_rate": 2.2816154581235993e-06, "loss": 0.74879545, "num_input_tokens_seen": 168471265, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 7840, "time_per_iteration": 2.5735788345336914 }, { "auxiliary_loss_clip": 0.01113711, "auxiliary_loss_mlp": 0.01029558, "balance_loss_clip": 1.01610518, "balance_loss_mlp": 1.03881741, "epoch": 0.47142642416954755, "flos": 23623583650560.0, "grad_norm": 1.678643170178726, "language_loss": 0.75222474, "learning_rate": 2.2812298713271833e-06, "loss": 0.77365744, "num_input_tokens_seen": 168491360, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75, "step": 7841, "time_per_iteration": 3.9715213775634766 }, { "auxiliary_loss_clip": 0.01115345, "auxiliary_loss_mlp": 0.01031593, "balance_loss_clip": 1.0187242, "balance_loss_mlp": 1.04005456, "epoch": 0.4714865474222155, "flos": 22310150935680.0, "grad_norm": 1.7218432893225735, "language_loss": 0.70832938, "learning_rate": 2.280844273866501e-06, "loss": 0.72979879, "num_input_tokens_seen": 168511335, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75, "step": 7842, "time_per_iteration": 5.40913200378418 }, { "auxiliary_loss_clip": 0.01118513, "auxiliary_loss_mlp": 0.01036397, "balance_loss_clip": 1.02291477, "balance_loss_mlp": 1.04320621, "epoch": 0.4715466706748835, "flos": 17822430541440.0, "grad_norm": 2.448991342503611, "language_loss": 0.78743917, "learning_rate": 2.280458665756177e-06, "loss": 0.80898827, "num_input_tokens_seen": 168529920, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 7843, "time_per_iteration": 2.4492080211639404 }, { "auxiliary_loss_clip": 0.01113769, "auxiliary_loss_mlp": 0.01033435, "balance_loss_clip": 1.02060771, "balance_loss_mlp": 1.03906178, "epoch": 0.4716067939275515, "flos": 23659530186240.0, "grad_norm": 1.7639192214594772, "language_loss": 0.7415604, "learning_rate": 2.280073047010832e-06, "loss": 0.76303244, "num_input_tokens_seen": 168550595, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.75, "step": 7844, "time_per_iteration": 3.8899738788604736 }, { "auxiliary_loss_clip": 0.01113835, "auxiliary_loss_mlp": 0.01043808, "balance_loss_clip": 1.03033757, "balance_loss_mlp": 1.03961778, "epoch": 0.47166691718021947, "flos": 17930162407680.0, "grad_norm": 1.5917362762634528, "language_loss": 0.78695589, "learning_rate": 2.279687417645088e-06, "loss": 0.8085323, "num_input_tokens_seen": 168569765, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 7845, "time_per_iteration": 2.4785468578338623 }, { "auxiliary_loss_clip": 0.01111995, "auxiliary_loss_mlp": 0.01033958, "balance_loss_clip": 1.02135205, "balance_loss_mlp": 1.03925872, "epoch": 0.47172704043288743, "flos": 26614583205120.0, "grad_norm": 1.5027635118546268, "language_loss": 0.73328954, "learning_rate": 2.2793017776735703e-06, "loss": 0.75474912, "num_input_tokens_seen": 168591525, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 7846, "time_per_iteration": 2.507070302963257 }, { "auxiliary_loss_clip": 0.01111184, "auxiliary_loss_mlp": 0.01031624, "balance_loss_clip": 1.01856482, "balance_loss_mlp": 1.03884792, "epoch": 0.4717871636855554, "flos": 27922700707200.0, "grad_norm": 1.339758319038289, "language_loss": 0.74082476, "learning_rate": 2.2789161271109e-06, "loss": 0.76225281, "num_input_tokens_seen": 168611235, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 7847, "time_per_iteration": 2.533784866333008 }, { "auxiliary_loss_clip": 0.01114802, "auxiliary_loss_mlp": 0.01035084, "balance_loss_clip": 1.02191699, "balance_loss_mlp": 1.04030764, "epoch": 0.47184728693822336, "flos": 14502237816960.0, "grad_norm": 1.8567531696044477, "language_loss": 0.80975938, "learning_rate": 2.278530465971703e-06, "loss": 0.8312583, "num_input_tokens_seen": 168628710, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 7848, "time_per_iteration": 2.429558277130127 }, { "auxiliary_loss_clip": 0.01118051, "auxiliary_loss_mlp": 0.01033968, "balance_loss_clip": 1.02054524, "balance_loss_mlp": 1.04269695, "epoch": 0.47190741019089133, "flos": 17856545483520.0, "grad_norm": 1.9450557399661146, "language_loss": 0.70503044, "learning_rate": 2.2781447942706032e-06, "loss": 0.72655058, "num_input_tokens_seen": 168645645, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 7849, "time_per_iteration": 2.4394993782043457 }, { "auxiliary_loss_clip": 0.01117975, "auxiliary_loss_mlp": 0.01037993, "balance_loss_clip": 1.02266836, "balance_loss_mlp": 1.03874254, "epoch": 0.4719675334435593, "flos": 17895472848000.0, "grad_norm": 2.5012890714976046, "language_loss": 0.69629651, "learning_rate": 2.277759112022224e-06, "loss": 0.71785623, "num_input_tokens_seen": 168664165, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.79296875, "step": 7850, "time_per_iteration": 2.455430507659912 }, { "auxiliary_loss_clip": 0.01116671, "auxiliary_loss_mlp": 0.01028122, "balance_loss_clip": 1.01470506, "balance_loss_mlp": 1.04002094, "epoch": 0.47202765669622726, "flos": 20704369426560.0, "grad_norm": 1.9948934418002962, "language_loss": 0.74789906, "learning_rate": 2.2773734192411916e-06, "loss": 0.76934695, "num_input_tokens_seen": 168681940, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.765625, "step": 7851, "time_per_iteration": 2.463667631149292 }, { "auxiliary_loss_clip": 0.01116289, "auxiliary_loss_mlp": 0.010331, "balance_loss_clip": 1.01860988, "balance_loss_mlp": 1.03872705, "epoch": 0.4720877799488952, "flos": 16360255607040.0, "grad_norm": 3.954979916900141, "language_loss": 0.76394832, "learning_rate": 2.276987715942132e-06, "loss": 0.78544223, "num_input_tokens_seen": 168698830, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 7852, "time_per_iteration": 2.4200854301452637 }, { "auxiliary_loss_clip": 0.01114575, "auxiliary_loss_mlp": 0.01026073, "balance_loss_clip": 1.01200056, "balance_loss_mlp": 1.0396651, "epoch": 0.4721479032015632, "flos": 20668171495680.0, "grad_norm": 1.7189656084627165, "language_loss": 0.68978798, "learning_rate": 2.2766020021396696e-06, "loss": 0.7111944, "num_input_tokens_seen": 168718305, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 7853, "time_per_iteration": 2.486426830291748 }, { "auxiliary_loss_clip": 0.01040886, "auxiliary_loss_mlp": 0.01000737, "balance_loss_clip": 0.99923462, "balance_loss_mlp": 1.01576197, "epoch": 0.47220802645423116, "flos": 67750438435200.0, "grad_norm": 0.700952454736364, "language_loss": 0.50127131, "learning_rate": 2.276216277848432e-06, "loss": 0.52168757, "num_input_tokens_seen": 168782365, "router_z_loss_clip": 0.01501465, "router_z_loss_mlp": 0.25, "step": 7854, "time_per_iteration": 3.226067066192627 }, { "auxiliary_loss_clip": 0.01117395, "auxiliary_loss_mlp": 0.01030616, "balance_loss_clip": 1.0161202, "balance_loss_mlp": 1.04085886, "epoch": 0.4722681497068991, "flos": 20921449271040.0, "grad_norm": 1.8836031316402986, "language_loss": 0.64184946, "learning_rate": 2.2758305430830455e-06, "loss": 0.6633296, "num_input_tokens_seen": 168800485, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.765625, "step": 7855, "time_per_iteration": 2.471008777618408 }, { "auxiliary_loss_clip": 0.01117123, "auxiliary_loss_mlp": 0.01034301, "balance_loss_clip": 1.0200969, "balance_loss_mlp": 1.04074526, "epoch": 0.4723282729595671, "flos": 28293083798400.0, "grad_norm": 1.839039856269076, "language_loss": 0.75920439, "learning_rate": 2.2754447978581376e-06, "loss": 0.78071862, "num_input_tokens_seen": 168818965, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.765625, "step": 7856, "time_per_iteration": 2.576183557510376 }, { "auxiliary_loss_clip": 0.01112065, "auxiliary_loss_mlp": 0.01031004, "balance_loss_clip": 1.01836753, "balance_loss_mlp": 1.03794277, "epoch": 0.4723883962122351, "flos": 27125053338240.0, "grad_norm": 2.0448738029158084, "language_loss": 0.7447145, "learning_rate": 2.2750590421883347e-06, "loss": 0.76614517, "num_input_tokens_seen": 168840355, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7421875, "step": 7857, "time_per_iteration": 2.507354736328125 }, { "auxiliary_loss_clip": 0.0111253, "auxiliary_loss_mlp": 0.01033962, "balance_loss_clip": 1.0211767, "balance_loss_mlp": 1.03881788, "epoch": 0.47244851946490307, "flos": 31537253387520.0, "grad_norm": 1.430888594427511, "language_loss": 0.64476079, "learning_rate": 2.2746732760882655e-06, "loss": 0.66622567, "num_input_tokens_seen": 168861765, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73828125, "step": 7858, "time_per_iteration": 2.57076096534729 }, { "auxiliary_loss_clip": 0.01111994, "auxiliary_loss_mlp": 0.01032161, "balance_loss_clip": 1.0184226, "balance_loss_mlp": 1.03774369, "epoch": 0.47250864271757104, "flos": 20886544229760.0, "grad_norm": 1.4373370573042241, "language_loss": 0.7040785, "learning_rate": 2.2742874995725575e-06, "loss": 0.72552001, "num_input_tokens_seen": 168881310, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 7859, "time_per_iteration": 2.466979742050171 }, { "auxiliary_loss_clip": 0.0112014, "auxiliary_loss_mlp": 0.01035727, "balance_loss_clip": 1.02195835, "balance_loss_mlp": 1.0403173, "epoch": 0.472568765970239, "flos": 20522086882560.0, "grad_norm": 1.9114019904932034, "language_loss": 0.62297654, "learning_rate": 2.2739017126558413e-06, "loss": 0.6445353, "num_input_tokens_seen": 168899470, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.80078125, "step": 7860, "time_per_iteration": 2.4788782596588135 }, { "auxiliary_loss_clip": 0.01120207, "auxiliary_loss_mlp": 0.01040491, "balance_loss_clip": 1.02603698, "balance_loss_mlp": 1.04181695, "epoch": 0.47262888922290697, "flos": 35805200417280.0, "grad_norm": 2.4119549517656953, "language_loss": 0.72391939, "learning_rate": 2.2735159153527445e-06, "loss": 0.74552631, "num_input_tokens_seen": 168921495, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78515625, "step": 7861, "time_per_iteration": 2.5851635932922363 }, { "auxiliary_loss_clip": 0.0111734, "auxiliary_loss_mlp": 0.01033817, "balance_loss_clip": 1.02025652, "balance_loss_mlp": 1.04080343, "epoch": 0.47268901247557493, "flos": 20667740532480.0, "grad_norm": 2.035645245487722, "language_loss": 0.84979701, "learning_rate": 2.273130107677896e-06, "loss": 0.87130857, "num_input_tokens_seen": 168940515, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.765625, "step": 7862, "time_per_iteration": 2.533576726913452 }, { "auxiliary_loss_clip": 0.01114944, "auxiliary_loss_mlp": 0.01029294, "balance_loss_clip": 1.01628876, "balance_loss_mlp": 1.03826201, "epoch": 0.4727491357282429, "flos": 19573291082880.0, "grad_norm": 2.212572300425135, "language_loss": 0.84769714, "learning_rate": 2.272744289645927e-06, "loss": 0.86913955, "num_input_tokens_seen": 168958340, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.765625, "step": 7863, "time_per_iteration": 2.4760429859161377 }, { "auxiliary_loss_clip": 0.01114496, "auxiliary_loss_mlp": 0.01035971, "balance_loss_clip": 1.02295375, "balance_loss_mlp": 1.04014564, "epoch": 0.47280925898091086, "flos": 18217231902720.0, "grad_norm": 2.055742288975378, "language_loss": 0.65683329, "learning_rate": 2.272358461271467e-06, "loss": 0.67833793, "num_input_tokens_seen": 168974850, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7421875, "step": 7864, "time_per_iteration": 2.4552557468414307 }, { "auxiliary_loss_clip": 0.01115334, "auxiliary_loss_mlp": 0.01033562, "balance_loss_clip": 1.01949573, "balance_loss_mlp": 1.03934979, "epoch": 0.4728693822335788, "flos": 17821820010240.0, "grad_norm": 1.9023130188644164, "language_loss": 0.65080535, "learning_rate": 2.271972622569147e-06, "loss": 0.67229438, "num_input_tokens_seen": 168992860, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76171875, "step": 7865, "time_per_iteration": 2.463620662689209 }, { "auxiliary_loss_clip": 0.01111297, "auxiliary_loss_mlp": 0.01033241, "balance_loss_clip": 1.02027082, "balance_loss_mlp": 1.03842211, "epoch": 0.4729295054862468, "flos": 20595057361920.0, "grad_norm": 1.8948041050269249, "language_loss": 0.74173748, "learning_rate": 2.2715867735535976e-06, "loss": 0.76318288, "num_input_tokens_seen": 169010325, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.73046875, "step": 7866, "time_per_iteration": 2.4810967445373535 }, { "auxiliary_loss_clip": 0.01116126, "auxiliary_loss_mlp": 0.01030531, "balance_loss_clip": 1.01742339, "balance_loss_mlp": 1.03906298, "epoch": 0.47298962873891476, "flos": 23368079232000.0, "grad_norm": 2.027376822554319, "language_loss": 0.82760447, "learning_rate": 2.271200914239451e-06, "loss": 0.84907103, "num_input_tokens_seen": 169029840, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7734375, "step": 7867, "time_per_iteration": 2.5058701038360596 }, { "auxiliary_loss_clip": 0.01111032, "auxiliary_loss_mlp": 0.0102947, "balance_loss_clip": 1.01622617, "balance_loss_mlp": 1.03731906, "epoch": 0.4730497519915827, "flos": 22052240305920.0, "grad_norm": 1.7103932547651162, "language_loss": 0.79851067, "learning_rate": 2.2708150446413385e-06, "loss": 0.81991565, "num_input_tokens_seen": 169049975, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 7868, "time_per_iteration": 2.5224809646606445 }, { "auxiliary_loss_clip": 0.0111606, "auxiliary_loss_mlp": 0.01033397, "balance_loss_clip": 1.01941419, "balance_loss_mlp": 1.03849709, "epoch": 0.4731098752442507, "flos": 21069724613760.0, "grad_norm": 1.8800035471956347, "language_loss": 0.75046313, "learning_rate": 2.2704291647738915e-06, "loss": 0.77195776, "num_input_tokens_seen": 169069540, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.77734375, "step": 7869, "time_per_iteration": 2.480924367904663 }, { "auxiliary_loss_clip": 0.01118667, "auxiliary_loss_mlp": 0.01040335, "balance_loss_clip": 1.02517796, "balance_loss_mlp": 1.04191256, "epoch": 0.4731699984969187, "flos": 22528775064960.0, "grad_norm": 2.24708170272574, "language_loss": 0.7374171, "learning_rate": 2.2700432746517443e-06, "loss": 0.75900704, "num_input_tokens_seen": 169089940, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.765625, "step": 7870, "time_per_iteration": 2.5027430057525635 }, { "auxiliary_loss_clip": 0.01120734, "auxiliary_loss_mlp": 0.0103224, "balance_loss_clip": 1.01712465, "balance_loss_mlp": 1.04068065, "epoch": 0.4732301217495867, "flos": 24898124914560.0, "grad_norm": 1.9761157599370387, "language_loss": 0.81243819, "learning_rate": 2.2696573742895292e-06, "loss": 0.83396792, "num_input_tokens_seen": 169109650, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.80078125, "step": 7871, "time_per_iteration": 2.4914700984954834 }, { "auxiliary_loss_clip": 0.01116353, "auxiliary_loss_mlp": 0.01032032, "balance_loss_clip": 1.01808429, "balance_loss_mlp": 1.04072559, "epoch": 0.47329024500225464, "flos": 22784423137920.0, "grad_norm": 1.5855660158011189, "language_loss": 0.75974584, "learning_rate": 2.269271463701879e-06, "loss": 0.78122973, "num_input_tokens_seen": 169128990, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 7872, "time_per_iteration": 2.5002150535583496 }, { "auxiliary_loss_clip": 0.01114913, "auxiliary_loss_mlp": 0.01033851, "balance_loss_clip": 1.01998067, "balance_loss_mlp": 1.0386529, "epoch": 0.4733503682549226, "flos": 38695902220800.0, "grad_norm": 1.6389964107719464, "language_loss": 0.68096292, "learning_rate": 2.268885542903428e-06, "loss": 0.70245057, "num_input_tokens_seen": 169154645, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76171875, "step": 7873, "time_per_iteration": 2.661346912384033 }, { "auxiliary_loss_clip": 0.01116001, "auxiliary_loss_mlp": 0.010345, "balance_loss_clip": 1.021083, "balance_loss_mlp": 1.04113448, "epoch": 0.47341049150759057, "flos": 22966849336320.0, "grad_norm": 1.6447732495366358, "language_loss": 0.72724378, "learning_rate": 2.26849961190881e-06, "loss": 0.74874878, "num_input_tokens_seen": 169174995, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 7874, "time_per_iteration": 2.5147392749786377 }, { "auxiliary_loss_clip": 0.01117662, "auxiliary_loss_mlp": 0.01034551, "balance_loss_clip": 1.02115142, "balance_loss_mlp": 1.04113293, "epoch": 0.47347061476025853, "flos": 14538471661440.0, "grad_norm": 2.923169929135502, "language_loss": 0.64882255, "learning_rate": 2.26811367073266e-06, "loss": 0.67034471, "num_input_tokens_seen": 169191815, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.765625, "step": 7875, "time_per_iteration": 2.4429426193237305 }, { "auxiliary_loss_clip": 0.01118594, "auxiliary_loss_mlp": 0.01032163, "balance_loss_clip": 1.01815557, "balance_loss_mlp": 1.0409683, "epoch": 0.4735307380129265, "flos": 30263250827520.0, "grad_norm": 2.359193551621405, "language_loss": 0.81104505, "learning_rate": 2.2677277193896125e-06, "loss": 0.83255267, "num_input_tokens_seen": 169210430, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.77734375, "step": 7876, "time_per_iteration": 2.551828145980835 }, { "auxiliary_loss_clip": 0.01112862, "auxiliary_loss_mlp": 0.01032002, "balance_loss_clip": 1.01846588, "balance_loss_mlp": 1.03708792, "epoch": 0.47359086126559446, "flos": 19391044452480.0, "grad_norm": 1.7671235010492155, "language_loss": 0.79121459, "learning_rate": 2.267341757894304e-06, "loss": 0.8126632, "num_input_tokens_seen": 169229295, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7578125, "step": 7877, "time_per_iteration": 2.4701876640319824 }, { "auxiliary_loss_clip": 0.01114029, "auxiliary_loss_mlp": 0.01032727, "balance_loss_clip": 1.01879764, "balance_loss_mlp": 1.0386436, "epoch": 0.47365098451826243, "flos": 21939408708480.0, "grad_norm": 1.9145942868216819, "language_loss": 0.70403826, "learning_rate": 2.2669557862613685e-06, "loss": 0.72550589, "num_input_tokens_seen": 169247855, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75390625, "step": 7878, "time_per_iteration": 2.5153160095214844 }, { "auxiliary_loss_clip": 0.01112635, "auxiliary_loss_mlp": 0.01031602, "balance_loss_clip": 1.01865554, "balance_loss_mlp": 1.03967428, "epoch": 0.4737111077709304, "flos": 25845053207040.0, "grad_norm": 2.5987915443275393, "language_loss": 0.75030065, "learning_rate": 2.2665698045054425e-06, "loss": 0.77174306, "num_input_tokens_seen": 169268860, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.73046875, "step": 7879, "time_per_iteration": 2.535440444946289 }, { "auxiliary_loss_clip": 0.01040439, "auxiliary_loss_mlp": 0.01006468, "balance_loss_clip": 1.00525784, "balance_loss_mlp": 1.01542282, "epoch": 0.47377123102359836, "flos": 67760886314880.0, "grad_norm": 0.7284284702346118, "language_loss": 0.61245525, "learning_rate": 2.266183812641164e-06, "loss": 0.63292432, "num_input_tokens_seen": 169331855, "router_z_loss_clip": 0.01208496, "router_z_loss_mlp": 0.25, "step": 7880, "time_per_iteration": 3.1201486587524414 }, { "auxiliary_loss_clip": 0.01114138, "auxiliary_loss_mlp": 0.01035463, "balance_loss_clip": 1.02083588, "balance_loss_mlp": 1.03972077, "epoch": 0.4738313542762663, "flos": 24315977191680.0, "grad_norm": 1.5084335201471386, "language_loss": 0.67860037, "learning_rate": 2.2657978106831675e-06, "loss": 0.70009637, "num_input_tokens_seen": 169352175, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7421875, "step": 7881, "time_per_iteration": 2.4824776649475098 }, { "auxiliary_loss_clip": 0.01115934, "auxiliary_loss_mlp": 0.01026331, "balance_loss_clip": 1.01329541, "balance_loss_mlp": 1.04178691, "epoch": 0.4738914775289343, "flos": 20705339093760.0, "grad_norm": 1.848248082477775, "language_loss": 0.77464384, "learning_rate": 2.265411798646092e-06, "loss": 0.79606652, "num_input_tokens_seen": 169371215, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 7882, "time_per_iteration": 4.059326887130737 }, { "auxiliary_loss_clip": 0.011167, "auxiliary_loss_mlp": 0.01029644, "balance_loss_clip": 1.0151422, "balance_loss_mlp": 1.04059052, "epoch": 0.4739516007816023, "flos": 25446337263360.0, "grad_norm": 1.4496101384504623, "language_loss": 0.76093328, "learning_rate": 2.2650257765445747e-06, "loss": 0.78239667, "num_input_tokens_seen": 169391745, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76171875, "step": 7883, "time_per_iteration": 3.8956263065338135 }, { "auxiliary_loss_clip": 0.01114658, "auxiliary_loss_mlp": 0.01028613, "balance_loss_clip": 1.01572096, "balance_loss_mlp": 1.03990948, "epoch": 0.4740117240342703, "flos": 19974341410560.0, "grad_norm": 1.8330274543873712, "language_loss": 0.71985781, "learning_rate": 2.2646397443932525e-06, "loss": 0.74129057, "num_input_tokens_seen": 169409845, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75, "step": 7884, "time_per_iteration": 4.038559198379517 }, { "auxiliary_loss_clip": 0.01121219, "auxiliary_loss_mlp": 0.01035618, "balance_loss_clip": 1.02090788, "balance_loss_mlp": 1.04124928, "epoch": 0.47407184728693824, "flos": 15661146222720.0, "grad_norm": 2.108621556147415, "language_loss": 0.82099199, "learning_rate": 2.2642537022067655e-06, "loss": 0.84256029, "num_input_tokens_seen": 169426085, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.80078125, "step": 7885, "time_per_iteration": 2.410464286804199 }, { "auxiliary_loss_clip": 0.01118364, "auxiliary_loss_mlp": 0.01032858, "balance_loss_clip": 1.01951218, "balance_loss_mlp": 1.04332042, "epoch": 0.4741319705396062, "flos": 18588800142720.0, "grad_norm": 1.8476194893356364, "language_loss": 0.73633635, "learning_rate": 2.263867649999751e-06, "loss": 0.7578485, "num_input_tokens_seen": 169444705, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 7886, "time_per_iteration": 3.827810049057007 }, { "auxiliary_loss_clip": 0.01119843, "auxiliary_loss_mlp": 0.01032804, "balance_loss_clip": 1.01789057, "balance_loss_mlp": 1.03965688, "epoch": 0.47419209379227417, "flos": 13261093223040.0, "grad_norm": 1.9230492399240167, "language_loss": 0.7354551, "learning_rate": 2.263481587786849e-06, "loss": 0.75698149, "num_input_tokens_seen": 169460850, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.80078125, "step": 7887, "time_per_iteration": 2.426119565963745 }, { "auxiliary_loss_clip": 0.01112265, "auxiliary_loss_mlp": 0.01026564, "balance_loss_clip": 1.01450646, "balance_loss_mlp": 1.03919542, "epoch": 0.47425221704494214, "flos": 20044043752320.0, "grad_norm": 2.467343435253, "language_loss": 0.77203941, "learning_rate": 2.2630955155826993e-06, "loss": 0.79342771, "num_input_tokens_seen": 169478890, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.73046875, "step": 7888, "time_per_iteration": 2.4523868560791016 }, { "auxiliary_loss_clip": 0.0111589, "auxiliary_loss_mlp": 0.01030123, "balance_loss_clip": 1.01666427, "balance_loss_mlp": 1.04021478, "epoch": 0.4743123402976101, "flos": 27271892136960.0, "grad_norm": 1.837262356700156, "language_loss": 0.72470421, "learning_rate": 2.2627094334019406e-06, "loss": 0.74616444, "num_input_tokens_seen": 169499690, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 7889, "time_per_iteration": 2.504448175430298 }, { "auxiliary_loss_clip": 0.01038847, "auxiliary_loss_mlp": 0.01008391, "balance_loss_clip": 1.00716877, "balance_loss_mlp": 1.01395845, "epoch": 0.47437246355027807, "flos": 55393970261760.0, "grad_norm": 0.7189602256210588, "language_loss": 0.56115925, "learning_rate": 2.262323341259214e-06, "loss": 0.58163166, "num_input_tokens_seen": 169560475, "router_z_loss_clip": 0.01220703, "router_z_loss_mlp": 0.24902344, "step": 7890, "time_per_iteration": 3.1558449268341064 }, { "auxiliary_loss_clip": 0.01117928, "auxiliary_loss_mlp": 0.01034554, "balance_loss_clip": 1.01971209, "balance_loss_mlp": 1.04102039, "epoch": 0.47443258680294603, "flos": 23878477537920.0, "grad_norm": 2.084499701209977, "language_loss": 0.65722609, "learning_rate": 2.2619372391691605e-06, "loss": 0.67875087, "num_input_tokens_seen": 169580110, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.76953125, "step": 7891, "time_per_iteration": 2.495304584503174 }, { "auxiliary_loss_clip": 0.01120511, "auxiliary_loss_mlp": 0.01036196, "balance_loss_clip": 1.02113962, "balance_loss_mlp": 1.04077029, "epoch": 0.474492710055614, "flos": 21977761455360.0, "grad_norm": 2.262412231975778, "language_loss": 0.70281899, "learning_rate": 2.26155112714642e-06, "loss": 0.7243861, "num_input_tokens_seen": 169597510, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.796875, "step": 7892, "time_per_iteration": 2.465670347213745 }, { "auxiliary_loss_clip": 0.0103928, "auxiliary_loss_mlp": 0.01006719, "balance_loss_clip": 1.00555122, "balance_loss_mlp": 1.01406431, "epoch": 0.47455283330828196, "flos": 62557180122240.0, "grad_norm": 0.8153900995557243, "language_loss": 0.58608842, "learning_rate": 2.2611650052056355e-06, "loss": 0.60654831, "num_input_tokens_seen": 169660010, "router_z_loss_clip": 0.01165771, "router_z_loss_mlp": 0.25195312, "step": 7893, "time_per_iteration": 3.1745071411132812 }, { "auxiliary_loss_clip": 0.01115947, "auxiliary_loss_mlp": 0.01034054, "balance_loss_clip": 1.02130437, "balance_loss_mlp": 1.0401963, "epoch": 0.47461295656094993, "flos": 12093637380480.0, "grad_norm": 1.8153930395630897, "language_loss": 0.77560711, "learning_rate": 2.2607788733614463e-06, "loss": 0.79710704, "num_input_tokens_seen": 169678485, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7578125, "step": 7894, "time_per_iteration": 2.444889545440674 }, { "auxiliary_loss_clip": 0.01114529, "auxiliary_loss_mlp": 0.01033717, "balance_loss_clip": 1.01975167, "balance_loss_mlp": 1.03823543, "epoch": 0.4746730798136179, "flos": 20884568981760.0, "grad_norm": 1.8286144675135516, "language_loss": 0.74667835, "learning_rate": 2.260392731628497e-06, "loss": 0.76816082, "num_input_tokens_seen": 169697335, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.76171875, "step": 7895, "time_per_iteration": 2.5263335704803467 }, { "auxiliary_loss_clip": 0.01114716, "auxiliary_loss_mlp": 0.01029616, "balance_loss_clip": 1.01574636, "balance_loss_mlp": 1.03939962, "epoch": 0.4747332030662859, "flos": 19974808287360.0, "grad_norm": 1.7524602216631329, "language_loss": 0.82829654, "learning_rate": 2.260006580021429e-06, "loss": 0.84973985, "num_input_tokens_seen": 169715395, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75, "step": 7896, "time_per_iteration": 2.4743831157684326 }, { "auxiliary_loss_clip": 0.01115801, "auxiliary_loss_mlp": 0.01031941, "balance_loss_clip": 1.01761818, "balance_loss_mlp": 1.04038644, "epoch": 0.4747933263189539, "flos": 16034186920320.0, "grad_norm": 4.4114425151213625, "language_loss": 0.76321149, "learning_rate": 2.259620418554886e-06, "loss": 0.78468889, "num_input_tokens_seen": 169733755, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75390625, "step": 7897, "time_per_iteration": 2.4488236904144287 }, { "auxiliary_loss_clip": 0.01119672, "auxiliary_loss_mlp": 0.01034166, "balance_loss_clip": 1.01982474, "balance_loss_mlp": 1.039644, "epoch": 0.47485344957162184, "flos": 13955102876160.0, "grad_norm": 2.4537574616578435, "language_loss": 0.64038074, "learning_rate": 2.25923424724351e-06, "loss": 0.66191912, "num_input_tokens_seen": 169751390, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.796875, "step": 7898, "time_per_iteration": 2.4263522624969482 }, { "auxiliary_loss_clip": 0.01116825, "auxiliary_loss_mlp": 0.01038138, "balance_loss_clip": 1.02296221, "balance_loss_mlp": 1.03978992, "epoch": 0.4749135728242898, "flos": 20449080489600.0, "grad_norm": 2.28456648351633, "language_loss": 0.69537693, "learning_rate": 2.258848066101946e-06, "loss": 0.71692657, "num_input_tokens_seen": 169769500, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.76953125, "step": 7899, "time_per_iteration": 2.465603828430176 }, { "auxiliary_loss_clip": 0.01117304, "auxiliary_loss_mlp": 0.01035541, "balance_loss_clip": 1.02109265, "balance_loss_mlp": 1.0392164, "epoch": 0.4749736960769578, "flos": 28949961767040.0, "grad_norm": 2.309777828079464, "language_loss": 0.68290448, "learning_rate": 2.258461875144837e-06, "loss": 0.7044329, "num_input_tokens_seen": 169789215, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 7900, "time_per_iteration": 2.5203027725219727 }, { "auxiliary_loss_clip": 0.01116965, "auxiliary_loss_mlp": 0.01034594, "balance_loss_clip": 1.02041984, "balance_loss_mlp": 1.04040122, "epoch": 0.47503381932962574, "flos": 31938770592000.0, "grad_norm": 2.3832919543759346, "language_loss": 0.70355868, "learning_rate": 2.2580756743868273e-06, "loss": 0.72507429, "num_input_tokens_seen": 169808825, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.765625, "step": 7901, "time_per_iteration": 2.5734167098999023 }, { "auxiliary_loss_clip": 0.01118803, "auxiliary_loss_mlp": 0.01039235, "balance_loss_clip": 1.02548432, "balance_loss_mlp": 1.04268146, "epoch": 0.4750939425822937, "flos": 22127257860480.0, "grad_norm": 2.444607696267863, "language_loss": 0.73578203, "learning_rate": 2.2576894638425636e-06, "loss": 0.75736243, "num_input_tokens_seen": 169827590, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.76171875, "step": 7902, "time_per_iteration": 2.4734506607055664 }, { "auxiliary_loss_clip": 0.01113548, "auxiliary_loss_mlp": 0.01031026, "balance_loss_clip": 1.01799631, "balance_loss_mlp": 1.04026532, "epoch": 0.47515406583496167, "flos": 20850094903680.0, "grad_norm": 1.9097402296062977, "language_loss": 0.6860165, "learning_rate": 2.257303243526688e-06, "loss": 0.70746219, "num_input_tokens_seen": 169844925, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 7903, "time_per_iteration": 2.467235565185547 }, { "auxiliary_loss_clip": 0.01112314, "auxiliary_loss_mlp": 0.01031082, "balance_loss_clip": 1.01820707, "balance_loss_mlp": 1.03898919, "epoch": 0.47521418908762963, "flos": 17524802448000.0, "grad_norm": 1.5013517215023588, "language_loss": 0.72133422, "learning_rate": 2.256917013453848e-06, "loss": 0.74276817, "num_input_tokens_seen": 169862705, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 7904, "time_per_iteration": 2.4414212703704834 }, { "auxiliary_loss_clip": 0.01113217, "auxiliary_loss_mlp": 0.01028331, "balance_loss_clip": 1.01555204, "balance_loss_mlp": 1.0394913, "epoch": 0.4752743123402976, "flos": 20559434048640.0, "grad_norm": 1.5676488080266882, "language_loss": 0.863083, "learning_rate": 2.25653077363869e-06, "loss": 0.88449848, "num_input_tokens_seen": 169880155, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.734375, "step": 7905, "time_per_iteration": 2.5099294185638428 }, { "auxiliary_loss_clip": 0.01109684, "auxiliary_loss_mlp": 0.01030469, "balance_loss_clip": 1.01773715, "balance_loss_mlp": 1.03738832, "epoch": 0.47533443559296557, "flos": 26360623071360.0, "grad_norm": 1.6654071502415184, "language_loss": 0.82209289, "learning_rate": 2.2561445240958583e-06, "loss": 0.84349442, "num_input_tokens_seen": 169901525, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 7906, "time_per_iteration": 2.4997308254241943 }, { "auxiliary_loss_clip": 0.01039636, "auxiliary_loss_mlp": 0.01009354, "balance_loss_clip": 1.00806701, "balance_loss_mlp": 1.01438856, "epoch": 0.47539455884563353, "flos": 65949660967680.0, "grad_norm": 0.6776868405291154, "language_loss": 0.58937252, "learning_rate": 2.255758264840002e-06, "loss": 0.60986245, "num_input_tokens_seen": 169970345, "router_z_loss_clip": 0.01287842, "router_z_loss_mlp": 0.25195312, "step": 7907, "time_per_iteration": 3.2077341079711914 }, { "auxiliary_loss_clip": 0.01115147, "auxiliary_loss_mlp": 0.01033898, "balance_loss_clip": 1.0203079, "balance_loss_mlp": 1.04056263, "epoch": 0.4754546820983015, "flos": 17238128002560.0, "grad_norm": 1.7892201205896676, "language_loss": 0.81864262, "learning_rate": 2.255371995885765e-06, "loss": 0.84013307, "num_input_tokens_seen": 169986440, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.74609375, "step": 7908, "time_per_iteration": 2.438565254211426 }, { "auxiliary_loss_clip": 0.01116185, "auxiliary_loss_mlp": 0.01033986, "balance_loss_clip": 1.01975274, "balance_loss_mlp": 1.04066896, "epoch": 0.47551480535096946, "flos": 19825886499840.0, "grad_norm": 1.823418392231554, "language_loss": 0.74403918, "learning_rate": 2.254985717247797e-06, "loss": 0.7655409, "num_input_tokens_seen": 170005705, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75390625, "step": 7909, "time_per_iteration": 2.492964506149292 }, { "auxiliary_loss_clip": 0.01114004, "auxiliary_loss_mlp": 0.01033608, "balance_loss_clip": 1.02006555, "balance_loss_mlp": 1.03948092, "epoch": 0.4755749286036375, "flos": 22163958581760.0, "grad_norm": 1.863048050133497, "language_loss": 0.75422388, "learning_rate": 2.2545994289407457e-06, "loss": 0.77569997, "num_input_tokens_seen": 170023415, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.74609375, "step": 7910, "time_per_iteration": 2.4638445377349854 }, { "auxiliary_loss_clip": 0.01111137, "auxiliary_loss_mlp": 0.01026549, "balance_loss_clip": 1.01511669, "balance_loss_mlp": 1.03738546, "epoch": 0.47563505185630545, "flos": 21648280976640.0, "grad_norm": 1.6855403383175371, "language_loss": 0.7930485, "learning_rate": 2.2542131309792577e-06, "loss": 0.81442547, "num_input_tokens_seen": 170042395, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.73828125, "step": 7911, "time_per_iteration": 2.4818732738494873 }, { "auxiliary_loss_clip": 0.0111692, "auxiliary_loss_mlp": 0.01031961, "balance_loss_clip": 1.0177927, "balance_loss_mlp": 1.03878188, "epoch": 0.4756951751089734, "flos": 20628777254400.0, "grad_norm": 1.6879859961645065, "language_loss": 0.75694388, "learning_rate": 2.253826823377983e-06, "loss": 0.77843267, "num_input_tokens_seen": 170061610, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.78125, "step": 7912, "time_per_iteration": 2.461317300796509 }, { "auxiliary_loss_clip": 0.01113181, "auxiliary_loss_mlp": 0.01037668, "balance_loss_clip": 1.02470422, "balance_loss_mlp": 1.03844965, "epoch": 0.4757552983616414, "flos": 25848788221440.0, "grad_norm": 1.7653219138893297, "language_loss": 0.74637002, "learning_rate": 2.253440506151569e-06, "loss": 0.76787847, "num_input_tokens_seen": 170083505, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.74609375, "step": 7913, "time_per_iteration": 2.519746780395508 }, { "auxiliary_loss_clip": 0.01114943, "auxiliary_loss_mlp": 0.01030233, "balance_loss_clip": 1.01643479, "balance_loss_mlp": 1.04017711, "epoch": 0.47581542161430934, "flos": 18223013992320.0, "grad_norm": 2.564684958333354, "language_loss": 0.72441095, "learning_rate": 2.253054179314666e-06, "loss": 0.74586272, "num_input_tokens_seen": 170100690, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.74609375, "step": 7914, "time_per_iteration": 2.425809144973755 }, { "auxiliary_loss_clip": 0.01117642, "auxiliary_loss_mlp": 0.01033731, "balance_loss_clip": 1.02047467, "balance_loss_mlp": 1.04123092, "epoch": 0.4758755448669773, "flos": 21579763783680.0, "grad_norm": 2.237789224782209, "language_loss": 0.65069473, "learning_rate": 2.2526678428819227e-06, "loss": 0.67220843, "num_input_tokens_seen": 170119240, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.76171875, "step": 7915, "time_per_iteration": 2.495802164077759 }, { "auxiliary_loss_clip": 0.01111256, "auxiliary_loss_mlp": 0.01034866, "balance_loss_clip": 1.02168179, "balance_loss_mlp": 1.03877211, "epoch": 0.47593566811964527, "flos": 15231152511360.0, "grad_norm": 1.8084106626949552, "language_loss": 0.76768744, "learning_rate": 2.2522814968679896e-06, "loss": 0.78914869, "num_input_tokens_seen": 170136450, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 7916, "time_per_iteration": 2.4357802867889404 }, { "auxiliary_loss_clip": 0.01113677, "auxiliary_loss_mlp": 0.01033626, "balance_loss_clip": 1.02066159, "balance_loss_mlp": 1.03861666, "epoch": 0.47599579137231324, "flos": 21543242630400.0, "grad_norm": 1.8745348320913198, "language_loss": 0.64427376, "learning_rate": 2.2518951412875173e-06, "loss": 0.66574681, "num_input_tokens_seen": 170155295, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.75, "step": 7917, "time_per_iteration": 2.546135663986206 }, { "auxiliary_loss_clip": 0.01036833, "auxiliary_loss_mlp": 0.01003587, "balance_loss_clip": 1.0021739, "balance_loss_mlp": 1.01180017, "epoch": 0.4760559146249812, "flos": 64554602595840.0, "grad_norm": 0.9405367891533387, "language_loss": 0.65720725, "learning_rate": 2.2515087761551557e-06, "loss": 0.67761147, "num_input_tokens_seen": 170222325, "router_z_loss_clip": 0.01409912, "router_z_loss_mlp": 0.25, "step": 7918, "time_per_iteration": 3.1147572994232178 }, { "auxiliary_loss_clip": 0.01117768, "auxiliary_loss_mlp": 0.0103607, "balance_loss_clip": 1.02254558, "balance_loss_mlp": 1.04151177, "epoch": 0.47611603787764917, "flos": 22233876405120.0, "grad_norm": 1.7005656894221552, "language_loss": 0.69081825, "learning_rate": 2.2511224014855563e-06, "loss": 0.71235663, "num_input_tokens_seen": 170241625, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76171875, "step": 7919, "time_per_iteration": 2.4969727993011475 }, { "auxiliary_loss_clip": 0.01116989, "auxiliary_loss_mlp": 0.01033613, "balance_loss_clip": 1.02039266, "balance_loss_mlp": 1.04051232, "epoch": 0.47617616113031713, "flos": 22780005765120.0, "grad_norm": 1.795108839697151, "language_loss": 0.74872184, "learning_rate": 2.2507360172933694e-06, "loss": 0.77022785, "num_input_tokens_seen": 170262470, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.765625, "step": 7920, "time_per_iteration": 2.551994800567627 }, { "auxiliary_loss_clip": 0.01120783, "auxiliary_loss_mlp": 0.01037516, "balance_loss_clip": 1.0234375, "balance_loss_mlp": 1.04195917, "epoch": 0.4762362843829851, "flos": 24133802388480.0, "grad_norm": 1.5451985030852915, "language_loss": 0.77264571, "learning_rate": 2.2503496235932487e-06, "loss": 0.79422879, "num_input_tokens_seen": 170283460, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7890625, "step": 7921, "time_per_iteration": 2.506608724594116 }, { "auxiliary_loss_clip": 0.01117875, "auxiliary_loss_mlp": 0.01038629, "balance_loss_clip": 1.02394247, "balance_loss_mlp": 1.04032803, "epoch": 0.47629640763565306, "flos": 22452069571200.0, "grad_norm": 1.580869684318835, "language_loss": 0.77874053, "learning_rate": 2.249963220399845e-06, "loss": 0.80030555, "num_input_tokens_seen": 170304225, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7734375, "step": 7922, "time_per_iteration": 2.4884870052337646 }, { "auxiliary_loss_clip": 0.01118738, "auxiliary_loss_mlp": 0.01042515, "balance_loss_clip": 1.02796006, "balance_loss_mlp": 1.04009056, "epoch": 0.4763565308883211, "flos": 11181398647680.0, "grad_norm": 2.038505661306209, "language_loss": 0.72963667, "learning_rate": 2.2495768077278104e-06, "loss": 0.75124919, "num_input_tokens_seen": 170322110, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78515625, "step": 7923, "time_per_iteration": 2.451054811477661 }, { "auxiliary_loss_clip": 0.01116394, "auxiliary_loss_mlp": 0.01035333, "balance_loss_clip": 1.02238727, "balance_loss_mlp": 1.04034483, "epoch": 0.47641665414098905, "flos": 22382151747840.0, "grad_norm": 1.8002410347395659, "language_loss": 0.8174063, "learning_rate": 2.2491903855917992e-06, "loss": 0.83892357, "num_input_tokens_seen": 170340700, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.76171875, "step": 7924, "time_per_iteration": 3.957897186279297 }, { "auxiliary_loss_clip": 0.01123264, "auxiliary_loss_mlp": 0.01035403, "balance_loss_clip": 1.02137804, "balance_loss_mlp": 1.04255033, "epoch": 0.476476777393657, "flos": 25046148862080.0, "grad_norm": 1.7397979143461861, "language_loss": 0.80618811, "learning_rate": 2.2488039540064626e-06, "loss": 0.82777482, "num_input_tokens_seen": 170359780, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.80859375, "step": 7925, "time_per_iteration": 3.8656728267669678 }, { "auxiliary_loss_clip": 0.01116491, "auxiliary_loss_mlp": 0.01036439, "balance_loss_clip": 1.02321839, "balance_loss_mlp": 1.03931618, "epoch": 0.476536900646325, "flos": 27269916888960.0, "grad_norm": 2.2859575225984132, "language_loss": 0.72143906, "learning_rate": 2.2484175129864558e-06, "loss": 0.74296832, "num_input_tokens_seen": 170381260, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.76953125, "step": 7926, "time_per_iteration": 3.9663915634155273 }, { "auxiliary_loss_clip": 0.01121168, "auxiliary_loss_mlp": 0.01038766, "balance_loss_clip": 1.02394807, "balance_loss_mlp": 1.04229355, "epoch": 0.47659702389899294, "flos": 25301401885440.0, "grad_norm": 2.2452126348372876, "language_loss": 0.68625784, "learning_rate": 2.248031062546432e-06, "loss": 0.70785713, "num_input_tokens_seen": 170400595, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7890625, "step": 7927, "time_per_iteration": 2.5211172103881836 }, { "auxiliary_loss_clip": 0.01114961, "auxiliary_loss_mlp": 0.01032045, "balance_loss_clip": 1.01886022, "balance_loss_mlp": 1.04090214, "epoch": 0.4766571471516609, "flos": 25992861672960.0, "grad_norm": 1.8500947572783295, "language_loss": 0.67950165, "learning_rate": 2.247644602701045e-06, "loss": 0.70097172, "num_input_tokens_seen": 170421110, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 7928, "time_per_iteration": 3.852243661880493 }, { "auxiliary_loss_clip": 0.01115532, "auxiliary_loss_mlp": 0.01032115, "balance_loss_clip": 1.01806045, "balance_loss_mlp": 1.03967357, "epoch": 0.4767172704043289, "flos": 16032211672320.0, "grad_norm": 2.104190057814865, "language_loss": 0.78931731, "learning_rate": 2.2472581334649496e-06, "loss": 0.81079376, "num_input_tokens_seen": 170436700, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 7929, "time_per_iteration": 2.4477717876434326 }, { "auxiliary_loss_clip": 0.01114403, "auxiliary_loss_mlp": 0.01034163, "balance_loss_clip": 1.02138937, "balance_loss_mlp": 1.04103446, "epoch": 0.47677739365699684, "flos": 39235351651200.0, "grad_norm": 1.7222976621257882, "language_loss": 0.66669744, "learning_rate": 2.2468716548528016e-06, "loss": 0.68818307, "num_input_tokens_seen": 170459555, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.734375, "step": 7930, "time_per_iteration": 2.624979019165039 }, { "auxiliary_loss_clip": 0.01113011, "auxiliary_loss_mlp": 0.01031389, "balance_loss_clip": 1.01884794, "balance_loss_mlp": 1.03837121, "epoch": 0.4768375169096648, "flos": 24717781704960.0, "grad_norm": 1.6779858357677486, "language_loss": 0.80039358, "learning_rate": 2.2464851668792555e-06, "loss": 0.82183754, "num_input_tokens_seen": 170479175, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.74609375, "step": 7931, "time_per_iteration": 2.511812925338745 }, { "auxiliary_loss_clip": 0.01117241, "auxiliary_loss_mlp": 0.01029138, "balance_loss_clip": 1.01561999, "balance_loss_mlp": 1.04044163, "epoch": 0.47689764016233277, "flos": 22528667324160.0, "grad_norm": 2.3038747418553043, "language_loss": 0.7584058, "learning_rate": 2.2460986695589678e-06, "loss": 0.77986968, "num_input_tokens_seen": 170498450, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.76953125, "step": 7932, "time_per_iteration": 2.464695692062378 }, { "auxiliary_loss_clip": 0.01112119, "auxiliary_loss_mlp": 0.01032932, "balance_loss_clip": 1.01891303, "balance_loss_mlp": 1.03924584, "epoch": 0.47695776341500074, "flos": 15120619384320.0, "grad_norm": 1.9605871412239309, "language_loss": 0.79371589, "learning_rate": 2.245712162906593e-06, "loss": 0.81516641, "num_input_tokens_seen": 170516255, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7265625, "step": 7933, "time_per_iteration": 2.455225706100464 }, { "auxiliary_loss_clip": 0.01120545, "auxiliary_loss_mlp": 0.01037707, "balance_loss_clip": 1.02216196, "balance_loss_mlp": 1.04113007, "epoch": 0.4770178866676687, "flos": 14678917839360.0, "grad_norm": 8.094395201516738, "language_loss": 0.74044442, "learning_rate": 2.2453256469367888e-06, "loss": 0.76202697, "num_input_tokens_seen": 170532705, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.796875, "step": 7934, "time_per_iteration": 2.4394261837005615 }, { "auxiliary_loss_clip": 0.01116464, "auxiliary_loss_mlp": 0.01028748, "balance_loss_clip": 1.01538479, "balance_loss_mlp": 1.03848267, "epoch": 0.47707800992033667, "flos": 22565583527040.0, "grad_norm": 2.3462497507971816, "language_loss": 0.8004812, "learning_rate": 2.244939121664211e-06, "loss": 0.82193327, "num_input_tokens_seen": 170551925, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.78125, "step": 7935, "time_per_iteration": 2.5254952907562256 }, { "auxiliary_loss_clip": 0.01124024, "auxiliary_loss_mlp": 0.01038439, "balance_loss_clip": 1.02395511, "balance_loss_mlp": 1.04254675, "epoch": 0.4771381331730047, "flos": 30918225375360.0, "grad_norm": 1.905244439651328, "language_loss": 0.71270096, "learning_rate": 2.2445525871035177e-06, "loss": 0.73432553, "num_input_tokens_seen": 170572320, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.81640625, "step": 7936, "time_per_iteration": 2.5382239818573 }, { "auxiliary_loss_clip": 0.01117669, "auxiliary_loss_mlp": 0.01031441, "balance_loss_clip": 1.0179522, "balance_loss_mlp": 1.03982222, "epoch": 0.47719825642567265, "flos": 25738901539200.0, "grad_norm": 2.351105126283587, "language_loss": 0.68185985, "learning_rate": 2.2441660432693656e-06, "loss": 0.7033509, "num_input_tokens_seen": 170589470, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.77734375, "step": 7937, "time_per_iteration": 2.508796453475952 }, { "auxiliary_loss_clip": 0.01035889, "auxiliary_loss_mlp": 0.01002084, "balance_loss_clip": 1.00061786, "balance_loss_mlp": 1.01076972, "epoch": 0.4772583796783406, "flos": 66355128668160.0, "grad_norm": 0.7219602523290954, "language_loss": 0.56396472, "learning_rate": 2.2437794901764128e-06, "loss": 0.58434451, "num_input_tokens_seen": 170662265, "router_z_loss_clip": 0.01464844, "router_z_loss_mlp": 0.25, "step": 7938, "time_per_iteration": 3.242114305496216 }, { "auxiliary_loss_clip": 0.01117287, "auxiliary_loss_mlp": 0.01031165, "balance_loss_clip": 1.01716959, "balance_loss_mlp": 1.04127884, "epoch": 0.4773185029310086, "flos": 22051091070720.0, "grad_norm": 1.6119669267458372, "language_loss": 0.89041889, "learning_rate": 2.243392927839317e-06, "loss": 0.91190338, "num_input_tokens_seen": 170679680, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 7939, "time_per_iteration": 2.471010446548462 }, { "auxiliary_loss_clip": 0.01115084, "auxiliary_loss_mlp": 0.01034982, "balance_loss_clip": 1.02242291, "balance_loss_mlp": 1.03879571, "epoch": 0.47737862618367655, "flos": 16727801523840.0, "grad_norm": 1.826202313626809, "language_loss": 0.77000415, "learning_rate": 2.2430063562727367e-06, "loss": 0.79150474, "num_input_tokens_seen": 170697340, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.765625, "step": 7940, "time_per_iteration": 2.4596757888793945 }, { "auxiliary_loss_clip": 0.01115887, "auxiliary_loss_mlp": 0.01034181, "balance_loss_clip": 1.02177739, "balance_loss_mlp": 1.04202116, "epoch": 0.4774387494363445, "flos": 19609453100160.0, "grad_norm": 5.332311824721534, "language_loss": 0.85056728, "learning_rate": 2.2426197754913322e-06, "loss": 0.87206799, "num_input_tokens_seen": 170714905, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.73828125, "step": 7941, "time_per_iteration": 2.4743967056274414 }, { "auxiliary_loss_clip": 0.01120719, "auxiliary_loss_mlp": 0.01032856, "balance_loss_clip": 1.01878977, "balance_loss_mlp": 1.04337358, "epoch": 0.4774988726890125, "flos": 16653969118080.0, "grad_norm": 2.31978390297049, "language_loss": 0.76027524, "learning_rate": 2.24223318550976e-06, "loss": 0.78181094, "num_input_tokens_seen": 170731810, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7734375, "step": 7942, "time_per_iteration": 2.470163583755493 }, { "auxiliary_loss_clip": 0.01119344, "auxiliary_loss_mlp": 0.01033844, "balance_loss_clip": 1.02028966, "balance_loss_mlp": 1.04231215, "epoch": 0.47755899594168044, "flos": 20485565729280.0, "grad_norm": 1.7366322873082096, "language_loss": 0.64867246, "learning_rate": 2.241846586342682e-06, "loss": 0.67020428, "num_input_tokens_seen": 170750270, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76953125, "step": 7943, "time_per_iteration": 2.4621763229370117 }, { "auxiliary_loss_clip": 0.01119439, "auxiliary_loss_mlp": 0.01033044, "balance_loss_clip": 1.01870298, "balance_loss_mlp": 1.04081333, "epoch": 0.4776191191943484, "flos": 21652806090240.0, "grad_norm": 2.0038372217179674, "language_loss": 0.73483282, "learning_rate": 2.2414599780047577e-06, "loss": 0.75635761, "num_input_tokens_seen": 170769015, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.78515625, "step": 7944, "time_per_iteration": 2.4771649837493896 }, { "auxiliary_loss_clip": 0.01118733, "auxiliary_loss_mlp": 0.01031701, "balance_loss_clip": 1.01726508, "balance_loss_mlp": 1.04206467, "epoch": 0.4776792424470164, "flos": 18770220760320.0, "grad_norm": 2.5339933921551516, "language_loss": 0.68525523, "learning_rate": 2.2410733605106456e-06, "loss": 0.70675957, "num_input_tokens_seen": 170785725, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.765625, "step": 7945, "time_per_iteration": 2.444768190383911 }, { "auxiliary_loss_clip": 0.01115221, "auxiliary_loss_mlp": 0.01029334, "balance_loss_clip": 1.01633382, "balance_loss_mlp": 1.03918338, "epoch": 0.47773936569968434, "flos": 29715828577920.0, "grad_norm": 1.751755977713602, "language_loss": 0.75385261, "learning_rate": 2.240686733875009e-06, "loss": 0.77529812, "num_input_tokens_seen": 170804600, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.76171875, "step": 7946, "time_per_iteration": 2.541687250137329 }, { "auxiliary_loss_clip": 0.01122185, "auxiliary_loss_mlp": 0.01035895, "balance_loss_clip": 1.02122033, "balance_loss_mlp": 1.04385912, "epoch": 0.4777994889523523, "flos": 24791542283520.0, "grad_norm": 2.1343323420793427, "language_loss": 0.79032689, "learning_rate": 2.240300098112506e-06, "loss": 0.81190765, "num_input_tokens_seen": 170824230, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 7947, "time_per_iteration": 2.4985976219177246 }, { "auxiliary_loss_clip": 0.01113213, "auxiliary_loss_mlp": 0.01030698, "balance_loss_clip": 1.01737022, "balance_loss_mlp": 1.0399797, "epoch": 0.47785961220502027, "flos": 17858161595520.0, "grad_norm": 2.1844663334780883, "language_loss": 0.73809338, "learning_rate": 2.2399134532377998e-06, "loss": 0.75953245, "num_input_tokens_seen": 170843365, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 7948, "time_per_iteration": 2.461193561553955 }, { "auxiliary_loss_clip": 0.01116808, "auxiliary_loss_mlp": 0.01030055, "balance_loss_clip": 1.01616168, "balance_loss_mlp": 1.04035878, "epoch": 0.4779197354576883, "flos": 20266546550400.0, "grad_norm": 1.5264573089357172, "language_loss": 0.77969873, "learning_rate": 2.2395267992655514e-06, "loss": 0.80116737, "num_input_tokens_seen": 170863515, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 7949, "time_per_iteration": 2.468090534210205 }, { "auxiliary_loss_clip": 0.01114554, "auxiliary_loss_mlp": 0.01029163, "balance_loss_clip": 1.01601374, "balance_loss_mlp": 1.0406996, "epoch": 0.47797985871035625, "flos": 17056599644160.0, "grad_norm": 2.605873055330395, "language_loss": 0.74151838, "learning_rate": 2.2391401362104227e-06, "loss": 0.76295543, "num_input_tokens_seen": 170881245, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 7950, "time_per_iteration": 2.478440999984741 }, { "auxiliary_loss_clip": 0.0111601, "auxiliary_loss_mlp": 0.01037026, "balance_loss_clip": 1.02222061, "balance_loss_mlp": 1.04126477, "epoch": 0.4780399819630242, "flos": 31358418549120.0, "grad_norm": 2.017308525977604, "language_loss": 0.74354094, "learning_rate": 2.2387534640870756e-06, "loss": 0.76507127, "num_input_tokens_seen": 170901285, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.74609375, "step": 7951, "time_per_iteration": 2.553626775741577 }, { "auxiliary_loss_clip": 0.01117639, "auxiliary_loss_mlp": 0.01031823, "balance_loss_clip": 1.01748168, "balance_loss_mlp": 1.03935599, "epoch": 0.4781001052156922, "flos": 24899597372160.0, "grad_norm": 2.250471640263233, "language_loss": 0.8012926, "learning_rate": 2.238366782910174e-06, "loss": 0.82278717, "num_input_tokens_seen": 170919740, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.78125, "step": 7952, "time_per_iteration": 2.5185110569000244 }, { "auxiliary_loss_clip": 0.01119563, "auxiliary_loss_mlp": 0.01035487, "balance_loss_clip": 1.02111638, "balance_loss_mlp": 1.04168439, "epoch": 0.47816022846836015, "flos": 18697717157760.0, "grad_norm": 1.7581611842538716, "language_loss": 0.78215539, "learning_rate": 2.23798009269438e-06, "loss": 0.80370593, "num_input_tokens_seen": 170938510, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.77734375, "step": 7953, "time_per_iteration": 2.442957878112793 }, { "auxiliary_loss_clip": 0.01119286, "auxiliary_loss_mlp": 0.01031218, "balance_loss_clip": 1.01769376, "balance_loss_mlp": 1.04083383, "epoch": 0.4782203517210281, "flos": 11977573559040.0, "grad_norm": 2.196359319951418, "language_loss": 0.83834428, "learning_rate": 2.2375933934543566e-06, "loss": 0.85984933, "num_input_tokens_seen": 170951170, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.78515625, "step": 7954, "time_per_iteration": 2.43822979927063 }, { "auxiliary_loss_clip": 0.01115288, "auxiliary_loss_mlp": 0.01035247, "balance_loss_clip": 1.02189565, "balance_loss_mlp": 1.03914285, "epoch": 0.4782804749736961, "flos": 20813501923200.0, "grad_norm": 1.4824667908916145, "language_loss": 0.70118773, "learning_rate": 2.237206685204768e-06, "loss": 0.72269309, "num_input_tokens_seen": 170970990, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.76171875, "step": 7955, "time_per_iteration": 2.577043056488037 }, { "auxiliary_loss_clip": 0.01117289, "auxiliary_loss_mlp": 0.0103527, "balance_loss_clip": 1.02182305, "balance_loss_mlp": 1.04067755, "epoch": 0.47834059822636404, "flos": 23840304359040.0, "grad_norm": 2.5441830106770875, "language_loss": 0.81646597, "learning_rate": 2.2368199679602787e-06, "loss": 0.83799154, "num_input_tokens_seen": 170991215, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.765625, "step": 7956, "time_per_iteration": 2.5092990398406982 }, { "auxiliary_loss_clip": 0.0111823, "auxiliary_loss_mlp": 0.01030139, "balance_loss_clip": 1.01545203, "balance_loss_mlp": 1.04281366, "epoch": 0.478400721479032, "flos": 22633777497600.0, "grad_norm": 1.9366145621193718, "language_loss": 0.84507573, "learning_rate": 2.2364332417355516e-06, "loss": 0.86655945, "num_input_tokens_seen": 171007325, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75390625, "step": 7957, "time_per_iteration": 2.4687836170196533 }, { "auxiliary_loss_clip": 0.01116627, "auxiliary_loss_mlp": 0.01037428, "balance_loss_clip": 1.02365375, "balance_loss_mlp": 1.040869, "epoch": 0.4784608447317, "flos": 19354954262400.0, "grad_norm": 1.6025423479201408, "language_loss": 0.79752028, "learning_rate": 2.2360465065452527e-06, "loss": 0.81906086, "num_input_tokens_seen": 171025650, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7578125, "step": 7958, "time_per_iteration": 2.4839541912078857 }, { "auxiliary_loss_clip": 0.01114982, "auxiliary_loss_mlp": 0.01033913, "balance_loss_clip": 1.01946521, "balance_loss_mlp": 1.03889763, "epoch": 0.47852096798436794, "flos": 24021114445440.0, "grad_norm": 1.780926787898003, "language_loss": 0.82733738, "learning_rate": 2.235659762404047e-06, "loss": 0.84882629, "num_input_tokens_seen": 171045045, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 7959, "time_per_iteration": 2.4850289821624756 }, { "auxiliary_loss_clip": 0.0111285, "auxiliary_loss_mlp": 0.01032512, "balance_loss_clip": 1.02024555, "balance_loss_mlp": 1.04065049, "epoch": 0.4785810912370359, "flos": 25666433850240.0, "grad_norm": 2.532004075920613, "language_loss": 0.73041612, "learning_rate": 2.235273009326599e-06, "loss": 0.75186974, "num_input_tokens_seen": 171062910, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.72265625, "step": 7960, "time_per_iteration": 2.5071632862091064 }, { "auxiliary_loss_clip": 0.01114818, "auxiliary_loss_mlp": 0.01036714, "balance_loss_clip": 1.02360058, "balance_loss_mlp": 1.04051304, "epoch": 0.47864121448970387, "flos": 21432134885760.0, "grad_norm": 1.831479244198384, "language_loss": 0.77366805, "learning_rate": 2.2348862473275745e-06, "loss": 0.79518336, "num_input_tokens_seen": 171080875, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 7961, "time_per_iteration": 2.4643802642822266 }, { "auxiliary_loss_clip": 0.01113715, "auxiliary_loss_mlp": 0.01028829, "balance_loss_clip": 1.0149709, "balance_loss_mlp": 1.0393399, "epoch": 0.47870133774237184, "flos": 16143894034560.0, "grad_norm": 1.6004345319513416, "language_loss": 0.77631032, "learning_rate": 2.2344994764216405e-06, "loss": 0.79773569, "num_input_tokens_seen": 171099190, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 7962, "time_per_iteration": 2.4551072120666504 }, { "auxiliary_loss_clip": 0.01119342, "auxiliary_loss_mlp": 0.01034952, "balance_loss_clip": 1.02135062, "balance_loss_mlp": 1.04216027, "epoch": 0.47876146099503986, "flos": 26906788344960.0, "grad_norm": 1.9547042687773677, "language_loss": 0.65202636, "learning_rate": 2.2341126966234635e-06, "loss": 0.67356932, "num_input_tokens_seen": 171119060, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7734375, "step": 7963, "time_per_iteration": 2.507164239883423 }, { "auxiliary_loss_clip": 0.01116531, "auxiliary_loss_mlp": 0.01035745, "balance_loss_clip": 1.022048, "balance_loss_mlp": 1.04002976, "epoch": 0.4788215842477078, "flos": 45332085778560.0, "grad_norm": 3.207774054490069, "language_loss": 0.77637672, "learning_rate": 2.2337259079477083e-06, "loss": 0.79789954, "num_input_tokens_seen": 171141900, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 7964, "time_per_iteration": 2.690239191055298 }, { "auxiliary_loss_clip": 0.01119656, "auxiliary_loss_mlp": 0.01035559, "balance_loss_clip": 1.02013302, "balance_loss_mlp": 1.04047084, "epoch": 0.4788817075003758, "flos": 22237180456320.0, "grad_norm": 1.7052337594311746, "language_loss": 0.76441497, "learning_rate": 2.233339110409044e-06, "loss": 0.78596717, "num_input_tokens_seen": 171161045, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.79296875, "step": 7965, "time_per_iteration": 2.477280378341675 }, { "auxiliary_loss_clip": 0.0111546, "auxiliary_loss_mlp": 0.01035365, "balance_loss_clip": 1.02241325, "balance_loss_mlp": 1.03991652, "epoch": 0.47894183075304375, "flos": 16471183783680.0, "grad_norm": 1.7497241222325106, "language_loss": 0.74956477, "learning_rate": 2.232952304022137e-06, "loss": 0.77107298, "num_input_tokens_seen": 171179675, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75390625, "step": 7966, "time_per_iteration": 4.012845754623413 }, { "auxiliary_loss_clip": 0.01116664, "auxiliary_loss_mlp": 0.01032955, "balance_loss_clip": 1.0189302, "balance_loss_mlp": 1.04059577, "epoch": 0.4790019540057117, "flos": 24282688262400.0, "grad_norm": 1.6993018409118288, "language_loss": 0.73269284, "learning_rate": 2.232565488801655e-06, "loss": 0.75418907, "num_input_tokens_seen": 171201175, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76171875, "step": 7967, "time_per_iteration": 5.293488025665283 }, { "auxiliary_loss_clip": 0.01109487, "auxiliary_loss_mlp": 0.0102705, "balance_loss_clip": 1.01360941, "balance_loss_mlp": 1.03785992, "epoch": 0.4790620772583797, "flos": 25666469763840.0, "grad_norm": 3.6418064979819933, "language_loss": 0.79123974, "learning_rate": 2.232178664762267e-06, "loss": 0.81260514, "num_input_tokens_seen": 171221750, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 7968, "time_per_iteration": 2.510774850845337 }, { "auxiliary_loss_clip": 0.01037479, "auxiliary_loss_mlp": 0.01001666, "balance_loss_clip": 1.00019932, "balance_loss_mlp": 1.01241267, "epoch": 0.47912220051104765, "flos": 69428077102080.0, "grad_norm": 0.7664893357841348, "language_loss": 0.62237728, "learning_rate": 2.2317918319186408e-06, "loss": 0.64276874, "num_input_tokens_seen": 171292235, "router_z_loss_clip": 0.01464844, "router_z_loss_mlp": 0.25, "step": 7969, "time_per_iteration": 4.632097959518433 }, { "auxiliary_loss_clip": 0.01112818, "auxiliary_loss_mlp": 0.01029218, "balance_loss_clip": 1.01627731, "balance_loss_mlp": 1.04048133, "epoch": 0.4791823237637156, "flos": 24168922911360.0, "grad_norm": 1.5742670836056727, "language_loss": 0.77378923, "learning_rate": 2.2314049902854446e-06, "loss": 0.79520953, "num_input_tokens_seen": 171312215, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.72265625, "step": 7970, "time_per_iteration": 2.5187413692474365 }, { "auxiliary_loss_clip": 0.01113891, "auxiliary_loss_mlp": 0.01032105, "balance_loss_clip": 1.01840782, "balance_loss_mlp": 1.03758621, "epoch": 0.4792424470163836, "flos": 24751465683840.0, "grad_norm": 2.626953685569648, "language_loss": 0.70313245, "learning_rate": 2.231018139877349e-06, "loss": 0.72459239, "num_input_tokens_seen": 171332975, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 7971, "time_per_iteration": 2.4911692142486572 }, { "auxiliary_loss_clip": 0.01113749, "auxiliary_loss_mlp": 0.0102994, "balance_loss_clip": 1.01589727, "balance_loss_mlp": 1.03898513, "epoch": 0.47930257026905154, "flos": 23257905240960.0, "grad_norm": 1.273999706335796, "language_loss": 0.7984308, "learning_rate": 2.230631280709021e-06, "loss": 0.81986773, "num_input_tokens_seen": 171353880, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.74609375, "step": 7972, "time_per_iteration": 2.4994778633117676 }, { "auxiliary_loss_clip": 0.01117836, "auxiliary_loss_mlp": 0.01030666, "balance_loss_clip": 1.01662898, "balance_loss_mlp": 1.04140031, "epoch": 0.4793626935217195, "flos": 14064091718400.0, "grad_norm": 2.005972777292343, "language_loss": 0.69536197, "learning_rate": 2.2302444127951327e-06, "loss": 0.716847, "num_input_tokens_seen": 171370930, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 7973, "time_per_iteration": 2.4293534755706787 }, { "auxiliary_loss_clip": 0.01114712, "auxiliary_loss_mlp": 0.01031177, "balance_loss_clip": 1.01856494, "balance_loss_mlp": 1.04178321, "epoch": 0.4794228167743875, "flos": 21798854789760.0, "grad_norm": 1.693078597155586, "language_loss": 0.78719366, "learning_rate": 2.2298575361503523e-06, "loss": 0.80865252, "num_input_tokens_seen": 171387575, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.73046875, "step": 7974, "time_per_iteration": 2.464841365814209 }, { "auxiliary_loss_clip": 0.01038755, "auxiliary_loss_mlp": 0.01002878, "balance_loss_clip": 1.00154305, "balance_loss_mlp": 1.01362062, "epoch": 0.47948294002705544, "flos": 66968805553920.0, "grad_norm": 0.7582282876928964, "language_loss": 0.5405792, "learning_rate": 2.2294706507893517e-06, "loss": 0.56099546, "num_input_tokens_seen": 171449980, "router_z_loss_clip": 0.0133667, "router_z_loss_mlp": 0.25, "step": 7975, "time_per_iteration": 3.1175243854522705 }, { "auxiliary_loss_clip": 0.01122315, "auxiliary_loss_mlp": 0.01035658, "balance_loss_clip": 1.02057242, "balance_loss_mlp": 1.04226184, "epoch": 0.47954306327972346, "flos": 12422471414400.0, "grad_norm": 2.0273204968927336, "language_loss": 0.9026593, "learning_rate": 2.2290837567268008e-06, "loss": 0.92423904, "num_input_tokens_seen": 171465290, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80078125, "step": 7976, "time_per_iteration": 2.4549126625061035 }, { "auxiliary_loss_clip": 0.01121313, "auxiliary_loss_mlp": 0.01042886, "balance_loss_clip": 1.02761507, "balance_loss_mlp": 1.04238927, "epoch": 0.4796031865323914, "flos": 18361951799040.0, "grad_norm": 2.665057561959219, "language_loss": 0.73160458, "learning_rate": 2.2286968539773713e-06, "loss": 0.75324655, "num_input_tokens_seen": 171481130, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7890625, "step": 7977, "time_per_iteration": 2.4386870861053467 }, { "auxiliary_loss_clip": 0.01113375, "auxiliary_loss_mlp": 0.01035377, "balance_loss_clip": 1.02206135, "balance_loss_mlp": 1.03942776, "epoch": 0.4796633097850594, "flos": 21835088634240.0, "grad_norm": 1.5727294875024072, "language_loss": 0.78641987, "learning_rate": 2.228309942555734e-06, "loss": 0.80790746, "num_input_tokens_seen": 171501140, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 7978, "time_per_iteration": 2.5012166500091553 }, { "auxiliary_loss_clip": 0.01116049, "auxiliary_loss_mlp": 0.01036414, "balance_loss_clip": 1.02249634, "balance_loss_mlp": 1.04034007, "epoch": 0.47972343303772735, "flos": 23437350610560.0, "grad_norm": 1.818502791373185, "language_loss": 0.89454019, "learning_rate": 2.22792302247656e-06, "loss": 0.9160648, "num_input_tokens_seen": 171519835, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 7979, "time_per_iteration": 2.482571601867676 }, { "auxiliary_loss_clip": 0.01120695, "auxiliary_loss_mlp": 0.01036938, "balance_loss_clip": 1.02177429, "balance_loss_mlp": 1.04327869, "epoch": 0.4797835562903953, "flos": 24899776940160.0, "grad_norm": 1.9480788881377873, "language_loss": 0.76858431, "learning_rate": 2.227536093754523e-06, "loss": 0.7901606, "num_input_tokens_seen": 171540980, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7734375, "step": 7980, "time_per_iteration": 2.533953905105591 }, { "auxiliary_loss_clip": 0.01119172, "auxiliary_loss_mlp": 0.01040158, "balance_loss_clip": 1.02457714, "balance_loss_mlp": 1.04071462, "epoch": 0.4798436795430633, "flos": 35042996793600.0, "grad_norm": 1.5603428425547614, "language_loss": 0.71585864, "learning_rate": 2.227149156404295e-06, "loss": 0.73745191, "num_input_tokens_seen": 171563600, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.78515625, "step": 7981, "time_per_iteration": 2.5797715187072754 }, { "auxiliary_loss_clip": 0.01113933, "auxiliary_loss_mlp": 0.01030026, "balance_loss_clip": 1.01722312, "balance_loss_mlp": 1.04105139, "epoch": 0.47990380279573125, "flos": 20590209025920.0, "grad_norm": 1.7939961286565662, "language_loss": 0.70765013, "learning_rate": 2.2267622104405473e-06, "loss": 0.72908968, "num_input_tokens_seen": 171580700, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 7982, "time_per_iteration": 2.477524518966675 }, { "auxiliary_loss_clip": 0.01108803, "auxiliary_loss_mlp": 0.01030793, "balance_loss_clip": 1.01869285, "balance_loss_mlp": 1.03809965, "epoch": 0.4799639260483992, "flos": 26359402008960.0, "grad_norm": 1.6111774964074053, "language_loss": 0.70870364, "learning_rate": 2.2263752558779544e-06, "loss": 0.73009956, "num_input_tokens_seen": 171602035, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.70703125, "step": 7983, "time_per_iteration": 2.512214422225952 }, { "auxiliary_loss_clip": 0.01039475, "auxiliary_loss_mlp": 0.01002758, "balance_loss_clip": 1.00141704, "balance_loss_mlp": 1.01408684, "epoch": 0.4800240493010672, "flos": 70979021521920.0, "grad_norm": 0.8309852227302797, "language_loss": 0.59478498, "learning_rate": 2.2259882927311883e-06, "loss": 0.61520731, "num_input_tokens_seen": 171659215, "router_z_loss_clip": 0.01342773, "router_z_loss_mlp": 0.25390625, "step": 7984, "time_per_iteration": 3.054971933364868 }, { "auxiliary_loss_clip": 0.01114583, "auxiliary_loss_mlp": 0.01037083, "balance_loss_clip": 1.02367187, "balance_loss_mlp": 1.03982556, "epoch": 0.48008417255373514, "flos": 17086656349440.0, "grad_norm": 2.0719212905874356, "language_loss": 0.66551924, "learning_rate": 2.2256013210149247e-06, "loss": 0.68703586, "num_input_tokens_seen": 171675710, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.74609375, "step": 7985, "time_per_iteration": 2.4542617797851562 }, { "auxiliary_loss_clip": 0.01118534, "auxiliary_loss_mlp": 0.01040657, "balance_loss_clip": 1.02628064, "balance_loss_mlp": 1.04042459, "epoch": 0.4801442958064031, "flos": 15413435055360.0, "grad_norm": 1.7686514399283044, "language_loss": 0.69925386, "learning_rate": 2.225214340743835e-06, "loss": 0.72084576, "num_input_tokens_seen": 171692510, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.78125, "step": 7986, "time_per_iteration": 2.430651903152466 }, { "auxiliary_loss_clip": 0.01120941, "auxiliary_loss_mlp": 0.01034634, "balance_loss_clip": 1.01998329, "balance_loss_mlp": 1.0418309, "epoch": 0.4802044190590711, "flos": 11473747441920.0, "grad_norm": 2.3322887330411244, "language_loss": 0.79456538, "learning_rate": 2.2248273519325956e-06, "loss": 0.8161211, "num_input_tokens_seen": 171710235, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.79296875, "step": 7987, "time_per_iteration": 2.4636130332946777 }, { "auxiliary_loss_clip": 0.01117124, "auxiliary_loss_mlp": 0.01036913, "balance_loss_clip": 1.02296567, "balance_loss_mlp": 1.04056156, "epoch": 0.48026454231173904, "flos": 20951003185920.0, "grad_norm": 2.563705393213309, "language_loss": 0.75600588, "learning_rate": 2.2244403545958812e-06, "loss": 0.77754629, "num_input_tokens_seen": 171726715, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.765625, "step": 7988, "time_per_iteration": 2.4722139835357666 }, { "auxiliary_loss_clip": 0.01119297, "auxiliary_loss_mlp": 0.01034581, "balance_loss_clip": 1.02104497, "balance_loss_mlp": 1.04316509, "epoch": 0.48032466556440706, "flos": 20448110822400.0, "grad_norm": 2.0815175077069528, "language_loss": 0.79665476, "learning_rate": 2.224053348748365e-06, "loss": 0.81819355, "num_input_tokens_seen": 171743605, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.76171875, "step": 7989, "time_per_iteration": 2.489600658416748 }, { "auxiliary_loss_clip": 0.0112069, "auxiliary_loss_mlp": 0.0103684, "balance_loss_clip": 1.02215946, "balance_loss_mlp": 1.04102325, "epoch": 0.480384788817075, "flos": 37120823861760.0, "grad_norm": 2.5325969373286514, "language_loss": 0.73409963, "learning_rate": 2.223666334404724e-06, "loss": 0.75567496, "num_input_tokens_seen": 171765445, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.796875, "step": 7990, "time_per_iteration": 2.61814546585083 }, { "auxiliary_loss_clip": 0.01038435, "auxiliary_loss_mlp": 0.01000291, "balance_loss_clip": 0.9989025, "balance_loss_mlp": 1.01324606, "epoch": 0.480444912069743, "flos": 69552577641600.0, "grad_norm": 0.7661076078315247, "language_loss": 0.59052211, "learning_rate": 2.223279311579633e-06, "loss": 0.61090928, "num_input_tokens_seen": 171830115, "router_z_loss_clip": 0.01391602, "router_z_loss_mlp": 0.25195312, "step": 7991, "time_per_iteration": 3.1767008304595947 }, { "auxiliary_loss_clip": 0.01115116, "auxiliary_loss_mlp": 0.01029995, "balance_loss_clip": 1.0160054, "balance_loss_mlp": 1.03942609, "epoch": 0.48050503532241096, "flos": 29822231640960.0, "grad_norm": 1.8118047107718522, "language_loss": 0.67081606, "learning_rate": 2.222892280287768e-06, "loss": 0.69226718, "num_input_tokens_seen": 171849135, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 7992, "time_per_iteration": 2.5304152965545654 }, { "auxiliary_loss_clip": 0.01116746, "auxiliary_loss_mlp": 0.01035098, "balance_loss_clip": 1.02092409, "balance_loss_mlp": 1.03889132, "epoch": 0.4805651585750789, "flos": 23948539015680.0, "grad_norm": 1.6108504505273873, "language_loss": 0.76302648, "learning_rate": 2.2225052405438056e-06, "loss": 0.78454494, "num_input_tokens_seen": 171868880, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.77734375, "step": 7993, "time_per_iteration": 2.5330512523651123 }, { "auxiliary_loss_clip": 0.01114269, "auxiliary_loss_mlp": 0.01037567, "balance_loss_clip": 1.02420354, "balance_loss_mlp": 1.04054415, "epoch": 0.4806252818277469, "flos": 25665428269440.0, "grad_norm": 1.5698334180120401, "language_loss": 0.78579754, "learning_rate": 2.222118192362422e-06, "loss": 0.80731583, "num_input_tokens_seen": 171889455, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73828125, "step": 7994, "time_per_iteration": 2.5058743953704834 }, { "auxiliary_loss_clip": 0.01115499, "auxiliary_loss_mlp": 0.010326, "balance_loss_clip": 1.01933205, "balance_loss_mlp": 1.03936648, "epoch": 0.48068540508041485, "flos": 13151996640000.0, "grad_norm": 3.369960157126464, "language_loss": 0.7998926, "learning_rate": 2.2217311357582946e-06, "loss": 0.82137358, "num_input_tokens_seen": 171906070, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.76171875, "step": 7995, "time_per_iteration": 2.4755146503448486 }, { "auxiliary_loss_clip": 0.01114468, "auxiliary_loss_mlp": 0.01034657, "balance_loss_clip": 1.02014339, "balance_loss_mlp": 1.03918052, "epoch": 0.4807455283330828, "flos": 21176738208000.0, "grad_norm": 1.5265483515889762, "language_loss": 0.82593089, "learning_rate": 2.2213440707461e-06, "loss": 0.84742218, "num_input_tokens_seen": 171926515, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.75390625, "step": 7996, "time_per_iteration": 2.4875874519348145 }, { "auxiliary_loss_clip": 0.01114165, "auxiliary_loss_mlp": 0.01031244, "balance_loss_clip": 1.01802349, "balance_loss_mlp": 1.04048204, "epoch": 0.4808056515857508, "flos": 12275991751680.0, "grad_norm": 1.6266177488824576, "language_loss": 0.80507481, "learning_rate": 2.220956997340516e-06, "loss": 0.82652891, "num_input_tokens_seen": 171943845, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 7997, "time_per_iteration": 2.4599392414093018 }, { "auxiliary_loss_clip": 0.01114292, "auxiliary_loss_mlp": 0.0103465, "balance_loss_clip": 1.0211854, "balance_loss_mlp": 1.0387131, "epoch": 0.48086577483841875, "flos": 24826052275200.0, "grad_norm": 1.8307418401853532, "language_loss": 0.72834241, "learning_rate": 2.220569915556221e-06, "loss": 0.74983186, "num_input_tokens_seen": 171964970, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 7998, "time_per_iteration": 2.5067501068115234 }, { "auxiliary_loss_clip": 0.01114709, "auxiliary_loss_mlp": 0.01033002, "balance_loss_clip": 1.01877463, "balance_loss_mlp": 1.03953218, "epoch": 0.4809258980910867, "flos": 24465365856000.0, "grad_norm": 1.9631327814549613, "language_loss": 0.70916736, "learning_rate": 2.220182825407892e-06, "loss": 0.73064446, "num_input_tokens_seen": 171986340, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75, "step": 7999, "time_per_iteration": 2.5292446613311768 }, { "auxiliary_loss_clip": 0.01118698, "auxiliary_loss_mlp": 0.01041148, "balance_loss_clip": 1.02705789, "balance_loss_mlp": 1.03991508, "epoch": 0.4809860213437547, "flos": 21215952881280.0, "grad_norm": 1.579003668291791, "language_loss": 0.71215349, "learning_rate": 2.2197957269102083e-06, "loss": 0.73375189, "num_input_tokens_seen": 172007300, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7890625, "step": 8000, "time_per_iteration": 2.4904067516326904 }, { "auxiliary_loss_clip": 0.0111813, "auxiliary_loss_mlp": 0.01036727, "balance_loss_clip": 1.02263021, "balance_loss_mlp": 1.04193342, "epoch": 0.48104614459642264, "flos": 37632084094080.0, "grad_norm": 1.4213554303456877, "language_loss": 0.74978197, "learning_rate": 2.2194086200778485e-06, "loss": 0.77133054, "num_input_tokens_seen": 172029585, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76171875, "step": 8001, "time_per_iteration": 2.61968994140625 }, { "auxiliary_loss_clip": 0.01118832, "auxiliary_loss_mlp": 0.01040759, "balance_loss_clip": 1.02634692, "balance_loss_mlp": 1.04120183, "epoch": 0.48110626784909066, "flos": 18406122549120.0, "grad_norm": 1.7300982832660532, "language_loss": 0.81463969, "learning_rate": 2.219021504925493e-06, "loss": 0.83623564, "num_input_tokens_seen": 172047495, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.77734375, "step": 8002, "time_per_iteration": 2.447446346282959 }, { "auxiliary_loss_clip": 0.01119838, "auxiliary_loss_mlp": 0.01036515, "balance_loss_clip": 1.02157235, "balance_loss_mlp": 1.04144955, "epoch": 0.48116639110175863, "flos": 28439814856320.0, "grad_norm": 1.685800895620913, "language_loss": 0.71420753, "learning_rate": 2.218634381467819e-06, "loss": 0.73577112, "num_input_tokens_seen": 172067625, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.78125, "step": 8003, "time_per_iteration": 2.5340442657470703 }, { "auxiliary_loss_clip": 0.01114002, "auxiliary_loss_mlp": 0.0103727, "balance_loss_clip": 1.02422261, "balance_loss_mlp": 1.04084587, "epoch": 0.4812265143544266, "flos": 21725237865600.0, "grad_norm": 1.8647154224974996, "language_loss": 0.82136166, "learning_rate": 2.218247249719507e-06, "loss": 0.84287441, "num_input_tokens_seen": 172087885, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 8004, "time_per_iteration": 2.4865334033966064 }, { "auxiliary_loss_clip": 0.01126035, "auxiliary_loss_mlp": 0.01045229, "balance_loss_clip": 1.02859962, "balance_loss_mlp": 1.04284966, "epoch": 0.48128663760709456, "flos": 13224679810560.0, "grad_norm": 1.9020371160432643, "language_loss": 0.77719688, "learning_rate": 2.217860109695239e-06, "loss": 0.79890954, "num_input_tokens_seen": 172105815, "router_z_loss_clip": 0.16601562, "router_z_loss_mlp": 0.828125, "step": 8005, "time_per_iteration": 2.4551427364349365 }, { "auxiliary_loss_clip": 0.01116982, "auxiliary_loss_mlp": 0.01034432, "balance_loss_clip": 1.02013874, "balance_loss_mlp": 1.03885531, "epoch": 0.4813467608597625, "flos": 24243437675520.0, "grad_norm": 4.014533969046682, "language_loss": 0.71165532, "learning_rate": 2.217472961409692e-06, "loss": 0.73316944, "num_input_tokens_seen": 172126125, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 8006, "time_per_iteration": 2.475019931793213 }, { "auxiliary_loss_clip": 0.01118159, "auxiliary_loss_mlp": 0.01036461, "balance_loss_clip": 1.02223372, "balance_loss_mlp": 1.04045057, "epoch": 0.4814068841124305, "flos": 27480424544640.0, "grad_norm": 1.8870805752868984, "language_loss": 0.70654476, "learning_rate": 2.2170858048775495e-06, "loss": 0.72809094, "num_input_tokens_seen": 172141945, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.77734375, "step": 8007, "time_per_iteration": 3.9888737201690674 }, { "auxiliary_loss_clip": 0.01119557, "auxiliary_loss_mlp": 0.01034382, "balance_loss_clip": 1.02009439, "balance_loss_mlp": 1.04096985, "epoch": 0.48146700736509845, "flos": 19572896033280.0, "grad_norm": 1.862494719738571, "language_loss": 0.71120286, "learning_rate": 2.2166986401134914e-06, "loss": 0.73274225, "num_input_tokens_seen": 172161095, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78515625, "step": 8008, "time_per_iteration": 3.8403069972991943 }, { "auxiliary_loss_clip": 0.0112167, "auxiliary_loss_mlp": 0.01045299, "balance_loss_clip": 1.02980185, "balance_loss_mlp": 1.04136634, "epoch": 0.4815271306177664, "flos": 20627771673600.0, "grad_norm": 1.848414815429013, "language_loss": 0.61156917, "learning_rate": 2.216311467132199e-06, "loss": 0.63323885, "num_input_tokens_seen": 172178750, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.8046875, "step": 8009, "time_per_iteration": 3.9383411407470703 }, { "auxiliary_loss_clip": 0.0103739, "auxiliary_loss_mlp": 0.01005743, "balance_loss_clip": 1.00437796, "balance_loss_mlp": 1.0117389, "epoch": 0.4815872538704344, "flos": 67691076232320.0, "grad_norm": 1.06057259185167, "language_loss": 0.61323082, "learning_rate": 2.2159242859483547e-06, "loss": 0.63366216, "num_input_tokens_seen": 172240235, "router_z_loss_clip": 0.01367188, "router_z_loss_mlp": 0.25585938, "step": 8010, "time_per_iteration": 3.1198067665100098 }, { "auxiliary_loss_clip": 0.0111953, "auxiliary_loss_mlp": 0.01039622, "balance_loss_clip": 1.02485216, "balance_loss_mlp": 1.0419358, "epoch": 0.48164737712310235, "flos": 22820764723200.0, "grad_norm": 1.6243939898162703, "language_loss": 0.7352308, "learning_rate": 2.215537096576639e-06, "loss": 0.75682229, "num_input_tokens_seen": 172259875, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7734375, "step": 8011, "time_per_iteration": 3.860240936279297 }, { "auxiliary_loss_clip": 0.01112472, "auxiliary_loss_mlp": 0.01031359, "balance_loss_clip": 1.01835346, "balance_loss_mlp": 1.03886294, "epoch": 0.4817075003757703, "flos": 23733865382400.0, "grad_norm": 2.966522426346687, "language_loss": 0.79967535, "learning_rate": 2.2151498990317354e-06, "loss": 0.82111359, "num_input_tokens_seen": 172280150, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 8012, "time_per_iteration": 2.5196917057037354 }, { "auxiliary_loss_clip": 0.01118028, "auxiliary_loss_mlp": 0.01043387, "balance_loss_clip": 1.0283066, "balance_loss_mlp": 1.04077721, "epoch": 0.4817676236284383, "flos": 28182909807360.0, "grad_norm": 2.234969052492919, "language_loss": 0.73559409, "learning_rate": 2.214762693328326e-06, "loss": 0.75720823, "num_input_tokens_seen": 172300810, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7734375, "step": 8013, "time_per_iteration": 2.5231876373291016 }, { "auxiliary_loss_clip": 0.01114864, "auxiliary_loss_mlp": 0.01030425, "balance_loss_clip": 1.01693642, "balance_loss_mlp": 1.03993273, "epoch": 0.48182774688110624, "flos": 17091756080640.0, "grad_norm": 1.8795725597771658, "language_loss": 0.9107635, "learning_rate": 2.214375479481094e-06, "loss": 0.93221647, "num_input_tokens_seen": 172317930, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75, "step": 8014, "time_per_iteration": 2.4456350803375244 }, { "auxiliary_loss_clip": 0.01120512, "auxiliary_loss_mlp": 0.01038355, "balance_loss_clip": 1.02339458, "balance_loss_mlp": 1.04064202, "epoch": 0.4818878701337742, "flos": 12567873669120.0, "grad_norm": 2.263485218325395, "language_loss": 0.73948896, "learning_rate": 2.213988257504722e-06, "loss": 0.76107764, "num_input_tokens_seen": 172336340, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.796875, "step": 8015, "time_per_iteration": 2.4641218185424805 }, { "auxiliary_loss_clip": 0.01122123, "auxiliary_loss_mlp": 0.01040118, "balance_loss_clip": 1.02519894, "balance_loss_mlp": 1.04049754, "epoch": 0.48194799338644223, "flos": 24608505553920.0, "grad_norm": 5.397832341366181, "language_loss": 0.80470026, "learning_rate": 2.213601027413894e-06, "loss": 0.82632267, "num_input_tokens_seen": 172354315, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.81640625, "step": 8016, "time_per_iteration": 2.4847347736358643 }, { "auxiliary_loss_clip": 0.0111527, "auxiliary_loss_mlp": 0.01031721, "balance_loss_clip": 1.01825666, "balance_loss_mlp": 1.04262245, "epoch": 0.4820081166391102, "flos": 21105204272640.0, "grad_norm": 1.9028687298977625, "language_loss": 0.77646858, "learning_rate": 2.2132137892232933e-06, "loss": 0.79793847, "num_input_tokens_seen": 172372695, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 8017, "time_per_iteration": 2.469975471496582 }, { "auxiliary_loss_clip": 0.0111456, "auxiliary_loss_mlp": 0.01029727, "balance_loss_clip": 1.01489139, "balance_loss_mlp": 1.04084146, "epoch": 0.48206823989177816, "flos": 25264593423360.0, "grad_norm": 2.9920466531523964, "language_loss": 0.79974484, "learning_rate": 2.2128265429476043e-06, "loss": 0.82118773, "num_input_tokens_seen": 172390905, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.73828125, "step": 8018, "time_per_iteration": 2.4978559017181396 }, { "auxiliary_loss_clip": 0.01119714, "auxiliary_loss_mlp": 0.01031302, "balance_loss_clip": 1.01757479, "balance_loss_mlp": 1.04194653, "epoch": 0.4821283631444461, "flos": 24645062620800.0, "grad_norm": 2.1591320994352916, "language_loss": 0.76050389, "learning_rate": 2.2124392886015124e-06, "loss": 0.78201401, "num_input_tokens_seen": 172412295, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.77734375, "step": 8019, "time_per_iteration": 2.509526252746582 }, { "auxiliary_loss_clip": 0.01116292, "auxiliary_loss_mlp": 0.01036306, "balance_loss_clip": 1.0215956, "balance_loss_mlp": 1.03883433, "epoch": 0.4821884863971141, "flos": 23952094462080.0, "grad_norm": 1.865747434631303, "language_loss": 0.78971612, "learning_rate": 2.212052026199701e-06, "loss": 0.8112421, "num_input_tokens_seen": 172432625, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7734375, "step": 8020, "time_per_iteration": 2.4928853511810303 }, { "auxiliary_loss_clip": 0.0111515, "auxiliary_loss_mlp": 0.01035816, "balance_loss_clip": 1.02142775, "balance_loss_mlp": 1.04101872, "epoch": 0.48224860964978206, "flos": 17160668323200.0, "grad_norm": 2.39583044300151, "language_loss": 0.69907355, "learning_rate": 2.211664755756855e-06, "loss": 0.7205832, "num_input_tokens_seen": 172450010, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7421875, "step": 8021, "time_per_iteration": 2.461442708969116 }, { "auxiliary_loss_clip": 0.01121569, "auxiliary_loss_mlp": 0.01030623, "balance_loss_clip": 1.01531613, "balance_loss_mlp": 1.04118824, "epoch": 0.48230873290245, "flos": 23075838178560.0, "grad_norm": 1.7586727033727334, "language_loss": 0.63044429, "learning_rate": 2.2112774772876603e-06, "loss": 0.65196633, "num_input_tokens_seen": 172469080, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.8046875, "step": 8022, "time_per_iteration": 2.4971249103546143 }, { "auxiliary_loss_clip": 0.01115064, "auxiliary_loss_mlp": 0.01026135, "balance_loss_clip": 1.0129143, "balance_loss_mlp": 1.04043198, "epoch": 0.482368856155118, "flos": 19353517718400.0, "grad_norm": 2.3877434427865523, "language_loss": 0.66635227, "learning_rate": 2.2108901908068028e-06, "loss": 0.68776429, "num_input_tokens_seen": 172484850, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.74609375, "step": 8023, "time_per_iteration": 2.4750068187713623 }, { "auxiliary_loss_clip": 0.01115921, "auxiliary_loss_mlp": 0.01030522, "balance_loss_clip": 1.01654482, "balance_loss_mlp": 1.03955698, "epoch": 0.48242897940778595, "flos": 20078984707200.0, "grad_norm": 2.1429499857660947, "language_loss": 0.76952714, "learning_rate": 2.2105028963289683e-06, "loss": 0.79099154, "num_input_tokens_seen": 172503525, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.765625, "step": 8024, "time_per_iteration": 2.473592758178711 }, { "auxiliary_loss_clip": 0.0111622, "auxiliary_loss_mlp": 0.01036724, "balance_loss_clip": 1.02151263, "balance_loss_mlp": 1.03921342, "epoch": 0.4824891026604539, "flos": 23403989854080.0, "grad_norm": 1.5962447098069499, "language_loss": 0.75020492, "learning_rate": 2.2101155938688423e-06, "loss": 0.7717343, "num_input_tokens_seen": 172524360, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.76953125, "step": 8025, "time_per_iteration": 2.5010828971862793 }, { "auxiliary_loss_clip": 0.01116365, "auxiliary_loss_mlp": 0.01035185, "balance_loss_clip": 1.02099323, "balance_loss_mlp": 1.04032207, "epoch": 0.4825492259131219, "flos": 20368675895040.0, "grad_norm": 1.9675890565372756, "language_loss": 0.70801651, "learning_rate": 2.209728283441112e-06, "loss": 0.729532, "num_input_tokens_seen": 172541480, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.76171875, "step": 8026, "time_per_iteration": 2.441124677658081 }, { "auxiliary_loss_clip": 0.01121063, "auxiliary_loss_mlp": 0.01041189, "balance_loss_clip": 1.02513182, "balance_loss_mlp": 1.04163003, "epoch": 0.48260934916578985, "flos": 14319021519360.0, "grad_norm": 1.9949259068599778, "language_loss": 0.74742532, "learning_rate": 2.209340965060465e-06, "loss": 0.76904786, "num_input_tokens_seen": 172559005, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.796875, "step": 8027, "time_per_iteration": 2.4695675373077393 }, { "auxiliary_loss_clip": 0.01121669, "auxiliary_loss_mlp": 0.01034582, "balance_loss_clip": 1.0207901, "balance_loss_mlp": 1.04325223, "epoch": 0.4826694724184578, "flos": 22121152548480.0, "grad_norm": 1.69757010399945, "language_loss": 0.67692876, "learning_rate": 2.2089536387415868e-06, "loss": 0.69849122, "num_input_tokens_seen": 172578435, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.78515625, "step": 8028, "time_per_iteration": 2.4889469146728516 }, { "auxiliary_loss_clip": 0.0111844, "auxiliary_loss_mlp": 0.01034225, "balance_loss_clip": 1.02040887, "balance_loss_mlp": 1.04150295, "epoch": 0.48272959567112583, "flos": 16181169373440.0, "grad_norm": 1.5925347670792829, "language_loss": 0.72999322, "learning_rate": 2.2085663044991655e-06, "loss": 0.75151986, "num_input_tokens_seen": 172596095, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76953125, "step": 8029, "time_per_iteration": 2.4613423347473145 }, { "auxiliary_loss_clip": 0.01118497, "auxiliary_loss_mlp": 0.01030277, "balance_loss_clip": 1.01539934, "balance_loss_mlp": 1.04075444, "epoch": 0.4827897189237938, "flos": 23180445561600.0, "grad_norm": 2.2026563708766007, "language_loss": 0.84756243, "learning_rate": 2.2081789623478896e-06, "loss": 0.86905015, "num_input_tokens_seen": 172615255, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.77734375, "step": 8030, "time_per_iteration": 2.4766623973846436 }, { "auxiliary_loss_clip": 0.01114723, "auxiliary_loss_mlp": 0.01028693, "balance_loss_clip": 1.01546049, "balance_loss_mlp": 1.03903437, "epoch": 0.48284984217646176, "flos": 21652626522240.0, "grad_norm": 2.190842723076904, "language_loss": 0.74465024, "learning_rate": 2.2077916123024466e-06, "loss": 0.76608443, "num_input_tokens_seen": 172633185, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7578125, "step": 8031, "time_per_iteration": 2.484938144683838 }, { "auxiliary_loss_clip": 0.01119457, "auxiliary_loss_mlp": 0.01041328, "balance_loss_clip": 1.02588487, "balance_loss_mlp": 1.0395304, "epoch": 0.48290996542912973, "flos": 31467443304960.0, "grad_norm": 2.6623468230741114, "language_loss": 0.72066993, "learning_rate": 2.2074042543775245e-06, "loss": 0.7422778, "num_input_tokens_seen": 172654280, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.796875, "step": 8032, "time_per_iteration": 2.532898426055908 }, { "auxiliary_loss_clip": 0.0111512, "auxiliary_loss_mlp": 0.01033446, "balance_loss_clip": 1.01974845, "balance_loss_mlp": 1.03886592, "epoch": 0.4829700886817977, "flos": 24461954064000.0, "grad_norm": 1.3805526144552438, "language_loss": 0.73895425, "learning_rate": 2.2070168885878126e-06, "loss": 0.76043993, "num_input_tokens_seen": 172675545, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76171875, "step": 8033, "time_per_iteration": 2.506009101867676 }, { "auxiliary_loss_clip": 0.01120905, "auxiliary_loss_mlp": 0.01032831, "balance_loss_clip": 1.01858532, "balance_loss_mlp": 1.04142594, "epoch": 0.48303021193446566, "flos": 25702164904320.0, "grad_norm": 2.1513757780467495, "language_loss": 0.83563828, "learning_rate": 2.2066295149479996e-06, "loss": 0.85717559, "num_input_tokens_seen": 172696455, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.796875, "step": 8034, "time_per_iteration": 2.5217926502227783 }, { "auxiliary_loss_clip": 0.01112542, "auxiliary_loss_mlp": 0.01028247, "balance_loss_clip": 1.01486611, "balance_loss_mlp": 1.03912365, "epoch": 0.4830903351871336, "flos": 20085233673600.0, "grad_norm": 1.9114924349931661, "language_loss": 0.79225725, "learning_rate": 2.2062421334727744e-06, "loss": 0.81366515, "num_input_tokens_seen": 172716720, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 8035, "time_per_iteration": 2.4909250736236572 }, { "auxiliary_loss_clip": 0.0111738, "auxiliary_loss_mlp": 0.01039807, "balance_loss_clip": 1.02355933, "balance_loss_mlp": 1.04053593, "epoch": 0.4831504584398016, "flos": 39452216014080.0, "grad_norm": 2.0004269949941555, "language_loss": 0.69535577, "learning_rate": 2.2058547441768267e-06, "loss": 0.71692759, "num_input_tokens_seen": 172737435, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.76953125, "step": 8036, "time_per_iteration": 2.61885404586792 }, { "auxiliary_loss_clip": 0.01114285, "auxiliary_loss_mlp": 0.01034542, "balance_loss_clip": 1.0203445, "balance_loss_mlp": 1.03900993, "epoch": 0.48321058169246955, "flos": 20006588845440.0, "grad_norm": 1.917851872850579, "language_loss": 0.72967041, "learning_rate": 2.205467347074847e-06, "loss": 0.7511586, "num_input_tokens_seen": 172755700, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75390625, "step": 8037, "time_per_iteration": 2.47230863571167 }, { "auxiliary_loss_clip": 0.01122791, "auxiliary_loss_mlp": 0.01037229, "balance_loss_clip": 1.0218035, "balance_loss_mlp": 1.04129815, "epoch": 0.4832707049451375, "flos": 20741465197440.0, "grad_norm": 2.405848575435522, "language_loss": 0.69551349, "learning_rate": 2.205079942181525e-06, "loss": 0.71711373, "num_input_tokens_seen": 172775185, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.81640625, "step": 8038, "time_per_iteration": 2.4641830921173096 }, { "auxiliary_loss_clip": 0.01115785, "auxiliary_loss_mlp": 0.01035614, "balance_loss_clip": 1.02099919, "balance_loss_mlp": 1.03944159, "epoch": 0.4833308281978055, "flos": 33145584762240.0, "grad_norm": 1.4814517828928002, "language_loss": 0.79142058, "learning_rate": 2.20469252951155e-06, "loss": 0.81293464, "num_input_tokens_seen": 172796990, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.76171875, "step": 8039, "time_per_iteration": 2.5903093814849854 }, { "auxiliary_loss_clip": 0.01116467, "auxiliary_loss_mlp": 0.01031545, "balance_loss_clip": 1.01764536, "balance_loss_mlp": 1.03943396, "epoch": 0.48339095145047345, "flos": 19099234362240.0, "grad_norm": 2.018311726035209, "language_loss": 0.7782433, "learning_rate": 2.2043051090796143e-06, "loss": 0.79972345, "num_input_tokens_seen": 172814915, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76953125, "step": 8040, "time_per_iteration": 2.448634624481201 }, { "auxiliary_loss_clip": 0.01117882, "auxiliary_loss_mlp": 0.01037814, "balance_loss_clip": 1.02275777, "balance_loss_mlp": 1.04014456, "epoch": 0.4834510747031414, "flos": 34459448440320.0, "grad_norm": 3.2672132735760457, "language_loss": 0.75620985, "learning_rate": 2.203917680900409e-06, "loss": 0.77776676, "num_input_tokens_seen": 172837060, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.77734375, "step": 8041, "time_per_iteration": 2.6150619983673096 }, { "auxiliary_loss_clip": 0.01118, "auxiliary_loss_mlp": 0.01031839, "balance_loss_clip": 1.01801062, "balance_loss_mlp": 1.04309177, "epoch": 0.48351119795580944, "flos": 27380845065600.0, "grad_norm": 1.7950966260350876, "language_loss": 0.66651875, "learning_rate": 2.203530244988624e-06, "loss": 0.68801719, "num_input_tokens_seen": 172856545, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75, "step": 8042, "time_per_iteration": 2.527432441711426 }, { "auxiliary_loss_clip": 0.01038106, "auxiliary_loss_mlp": 0.01005335, "balance_loss_clip": 1.00396955, "balance_loss_mlp": 1.01317215, "epoch": 0.4835713212084774, "flos": 67143941291520.0, "grad_norm": 0.6909931647046667, "language_loss": 0.58566761, "learning_rate": 2.2031428013589517e-06, "loss": 0.60610199, "num_input_tokens_seen": 172923055, "router_z_loss_clip": 0.01367188, "router_z_loss_mlp": 0.24902344, "step": 8043, "time_per_iteration": 3.1924777030944824 }, { "auxiliary_loss_clip": 0.01118728, "auxiliary_loss_mlp": 0.01033536, "balance_loss_clip": 1.01855183, "balance_loss_mlp": 1.04004288, "epoch": 0.48363144446114537, "flos": 17967473660160.0, "grad_norm": 2.0357621750037147, "language_loss": 0.72037673, "learning_rate": 2.2027553500260847e-06, "loss": 0.74189937, "num_input_tokens_seen": 172940700, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.78515625, "step": 8044, "time_per_iteration": 2.4707188606262207 }, { "auxiliary_loss_clip": 0.01114993, "auxiliary_loss_mlp": 0.01031691, "balance_loss_clip": 1.01714158, "balance_loss_mlp": 1.04035819, "epoch": 0.48369156771381333, "flos": 20593513077120.0, "grad_norm": 1.5938733995459475, "language_loss": 0.761087, "learning_rate": 2.202367891004714e-06, "loss": 0.78255385, "num_input_tokens_seen": 172961125, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.74609375, "step": 8045, "time_per_iteration": 2.49039888381958 }, { "auxiliary_loss_clip": 0.01119814, "auxiliary_loss_mlp": 0.01034024, "balance_loss_clip": 1.01931345, "balance_loss_mlp": 1.04207027, "epoch": 0.4837516909664813, "flos": 22675075159680.0, "grad_norm": 1.5467497986617598, "language_loss": 0.69599807, "learning_rate": 2.201980424309533e-06, "loss": 0.71753651, "num_input_tokens_seen": 172980405, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.77734375, "step": 8046, "time_per_iteration": 2.5103113651275635 }, { "auxiliary_loss_clip": 0.01115041, "auxiliary_loss_mlp": 0.0103561, "balance_loss_clip": 1.02067268, "balance_loss_mlp": 1.03883553, "epoch": 0.48381181421914926, "flos": 25518625384320.0, "grad_norm": 2.3403665831052436, "language_loss": 0.8204965, "learning_rate": 2.2015929499552337e-06, "loss": 0.84200299, "num_input_tokens_seen": 172999105, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.76171875, "step": 8047, "time_per_iteration": 2.5070414543151855 }, { "auxiliary_loss_clip": 0.01113506, "auxiliary_loss_mlp": 0.0103136, "balance_loss_clip": 1.01761496, "balance_loss_mlp": 1.03859377, "epoch": 0.4838719374718172, "flos": 24207491139840.0, "grad_norm": 1.570074897946642, "language_loss": 0.80501151, "learning_rate": 2.2012054679565092e-06, "loss": 0.82646012, "num_input_tokens_seen": 173019935, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 8048, "time_per_iteration": 2.5245683193206787 }, { "auxiliary_loss_clip": 0.0112096, "auxiliary_loss_mlp": 0.01037127, "balance_loss_clip": 1.02233911, "balance_loss_mlp": 1.04126024, "epoch": 0.4839320607244852, "flos": 26724577628160.0, "grad_norm": 1.6448404949094513, "language_loss": 0.81308049, "learning_rate": 2.200817978328054e-06, "loss": 0.83466136, "num_input_tokens_seen": 173039700, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.796875, "step": 8049, "time_per_iteration": 4.043022871017456 }, { "auxiliary_loss_clip": 0.01113919, "auxiliary_loss_mlp": 0.01027349, "balance_loss_clip": 1.01489186, "balance_loss_mlp": 1.04146004, "epoch": 0.48399218397715316, "flos": 20448900921600.0, "grad_norm": 1.7116784969927838, "language_loss": 0.72805667, "learning_rate": 2.2004304810845602e-06, "loss": 0.74946928, "num_input_tokens_seen": 173059170, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7265625, "step": 8050, "time_per_iteration": 3.83951735496521 }, { "auxiliary_loss_clip": 0.0103866, "auxiliary_loss_mlp": 0.01008109, "balance_loss_clip": 1.00655901, "balance_loss_mlp": 1.01411343, "epoch": 0.4840523072298211, "flos": 67180570185600.0, "grad_norm": 0.7109004781294949, "language_loss": 0.56422812, "learning_rate": 2.200042976240723e-06, "loss": 0.58469582, "num_input_tokens_seen": 173119000, "router_z_loss_clip": 0.01544189, "router_z_loss_mlp": 0.24609375, "step": 8051, "time_per_iteration": 4.638932466506958 }, { "auxiliary_loss_clip": 0.01121939, "auxiliary_loss_mlp": 0.01032605, "balance_loss_clip": 1.01806164, "balance_loss_mlp": 1.04402924, "epoch": 0.4841124304824891, "flos": 22411490181120.0, "grad_norm": 1.9530705036353582, "language_loss": 0.75278544, "learning_rate": 2.199655463811236e-06, "loss": 0.77433085, "num_input_tokens_seen": 173137570, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78125, "step": 8052, "time_per_iteration": 3.8592231273651123 }, { "auxiliary_loss_clip": 0.01117098, "auxiliary_loss_mlp": 0.01033313, "balance_loss_clip": 1.01943731, "balance_loss_mlp": 1.04112411, "epoch": 0.48417255373515705, "flos": 13843959217920.0, "grad_norm": 2.356780475561872, "language_loss": 0.65966415, "learning_rate": 2.1992679438107936e-06, "loss": 0.68116832, "num_input_tokens_seen": 173154355, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 8053, "time_per_iteration": 2.41670823097229 }, { "auxiliary_loss_clip": 0.0111345, "auxiliary_loss_mlp": 0.01028285, "balance_loss_clip": 1.01485562, "balance_loss_mlp": 1.03933525, "epoch": 0.484232676987825, "flos": 31649689935360.0, "grad_norm": 2.751156451515161, "language_loss": 0.69176251, "learning_rate": 2.198880416254091e-06, "loss": 0.71317989, "num_input_tokens_seen": 173174845, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 8054, "time_per_iteration": 2.550940990447998 }, { "auxiliary_loss_clip": 0.01116775, "auxiliary_loss_mlp": 0.01029849, "balance_loss_clip": 1.01596129, "balance_loss_mlp": 1.04070151, "epoch": 0.48429280024049304, "flos": 24095377814400.0, "grad_norm": 1.5475689818222182, "language_loss": 0.69652557, "learning_rate": 2.1984928811558233e-06, "loss": 0.71799183, "num_input_tokens_seen": 173195025, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 8055, "time_per_iteration": 2.496981143951416 }, { "auxiliary_loss_clip": 0.01118304, "auxiliary_loss_mlp": 0.01032476, "balance_loss_clip": 1.01821828, "balance_loss_mlp": 1.0413835, "epoch": 0.484352923493161, "flos": 17530081747200.0, "grad_norm": 1.9343209343321894, "language_loss": 0.63302875, "learning_rate": 2.198105338530685e-06, "loss": 0.65453649, "num_input_tokens_seen": 173213065, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.765625, "step": 8056, "time_per_iteration": 2.49161434173584 }, { "auxiliary_loss_clip": 0.01116038, "auxiliary_loss_mlp": 0.01032097, "balance_loss_clip": 1.01741648, "balance_loss_mlp": 1.03970933, "epoch": 0.48441304674582897, "flos": 29166862043520.0, "grad_norm": 3.764117654337475, "language_loss": 0.67658383, "learning_rate": 2.1977177883933726e-06, "loss": 0.69806516, "num_input_tokens_seen": 173234545, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.76171875, "step": 8057, "time_per_iteration": 2.528738498687744 }, { "auxiliary_loss_clip": 0.01113861, "auxiliary_loss_mlp": 0.01033472, "balance_loss_clip": 1.01949501, "balance_loss_mlp": 1.0384289, "epoch": 0.48447316999849693, "flos": 15886701676800.0, "grad_norm": 1.668885554110616, "language_loss": 0.8192212, "learning_rate": 2.1973302307585827e-06, "loss": 0.84069455, "num_input_tokens_seen": 173252175, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75390625, "step": 8058, "time_per_iteration": 2.4483301639556885 }, { "auxiliary_loss_clip": 0.0112017, "auxiliary_loss_mlp": 0.01033746, "balance_loss_clip": 1.01955414, "balance_loss_mlp": 1.04200244, "epoch": 0.4845332932511649, "flos": 24381405815040.0, "grad_norm": 1.7562327758954324, "language_loss": 0.79710621, "learning_rate": 2.1969426656410097e-06, "loss": 0.81864536, "num_input_tokens_seen": 173268790, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.78125, "step": 8059, "time_per_iteration": 2.501234531402588 }, { "auxiliary_loss_clip": 0.01122482, "auxiliary_loss_mlp": 0.01039719, "balance_loss_clip": 1.02464509, "balance_loss_mlp": 1.04346132, "epoch": 0.48459341650383286, "flos": 37116478316160.0, "grad_norm": 4.000395566028756, "language_loss": 0.66806513, "learning_rate": 2.196555093055352e-06, "loss": 0.68968713, "num_input_tokens_seen": 173288030, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7890625, "step": 8060, "time_per_iteration": 2.5852231979370117 }, { "auxiliary_loss_clip": 0.01120539, "auxiliary_loss_mlp": 0.01036128, "balance_loss_clip": 1.02156091, "balance_loss_mlp": 1.04372144, "epoch": 0.48465353975650083, "flos": 22966777509120.0, "grad_norm": 5.022490999707136, "language_loss": 0.6740315, "learning_rate": 2.1961675130163046e-06, "loss": 0.69559824, "num_input_tokens_seen": 173305965, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.765625, "step": 8061, "time_per_iteration": 2.4858248233795166 }, { "auxiliary_loss_clip": 0.01120973, "auxiliary_loss_mlp": 0.01041442, "balance_loss_clip": 1.02635002, "balance_loss_mlp": 1.04365993, "epoch": 0.4847136630091688, "flos": 17707695523200.0, "grad_norm": 2.0450587453079914, "language_loss": 0.82464111, "learning_rate": 2.1957799255385653e-06, "loss": 0.84626526, "num_input_tokens_seen": 173321985, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7734375, "step": 8062, "time_per_iteration": 2.4542994499206543 }, { "auxiliary_loss_clip": 0.0111758, "auxiliary_loss_mlp": 0.01031809, "balance_loss_clip": 1.01799297, "balance_loss_mlp": 1.0430485, "epoch": 0.48477378626183676, "flos": 22018269018240.0, "grad_norm": 1.5221656532587575, "language_loss": 0.74538821, "learning_rate": 2.1953923306368325e-06, "loss": 0.76688206, "num_input_tokens_seen": 173341315, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.74609375, "step": 8063, "time_per_iteration": 2.4910919666290283 }, { "auxiliary_loss_clip": 0.01119545, "auxiliary_loss_mlp": 0.01031784, "balance_loss_clip": 1.01745534, "balance_loss_mlp": 1.04263139, "epoch": 0.4848339095145047, "flos": 27962956874880.0, "grad_norm": 1.7218965858820197, "language_loss": 0.78345186, "learning_rate": 2.1950047283258023e-06, "loss": 0.80496514, "num_input_tokens_seen": 173361055, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76953125, "step": 8064, "time_per_iteration": 2.516106367111206 }, { "auxiliary_loss_clip": 0.01114107, "auxiliary_loss_mlp": 0.01034862, "balance_loss_clip": 1.02252996, "balance_loss_mlp": 1.041996, "epoch": 0.4848940327671727, "flos": 21688752625920.0, "grad_norm": 2.0663207232371277, "language_loss": 0.78796196, "learning_rate": 2.194617118620173e-06, "loss": 0.8094517, "num_input_tokens_seen": 173379255, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.72265625, "step": 8065, "time_per_iteration": 2.466310977935791 }, { "auxiliary_loss_clip": 0.01110843, "auxiliary_loss_mlp": 0.01030942, "balance_loss_clip": 1.01784134, "balance_loss_mlp": 1.03933716, "epoch": 0.48495415601984065, "flos": 20631578515200.0, "grad_norm": 1.5919013956689088, "language_loss": 0.75862753, "learning_rate": 2.194229501534644e-06, "loss": 0.78004533, "num_input_tokens_seen": 173398370, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 8066, "time_per_iteration": 2.4829275608062744 }, { "auxiliary_loss_clip": 0.01114051, "auxiliary_loss_mlp": 0.0103207, "balance_loss_clip": 1.0190165, "balance_loss_mlp": 1.04134357, "epoch": 0.4850142792725086, "flos": 25628152930560.0, "grad_norm": 2.3591381915404153, "language_loss": 0.7211839, "learning_rate": 2.193841877083912e-06, "loss": 0.74264503, "num_input_tokens_seen": 173419595, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 8067, "time_per_iteration": 2.4960315227508545 }, { "auxiliary_loss_clip": 0.01117493, "auxiliary_loss_mlp": 0.01034235, "balance_loss_clip": 1.01984632, "balance_loss_mlp": 1.04185534, "epoch": 0.4850744025251766, "flos": 13771958405760.0, "grad_norm": 2.009879714347031, "language_loss": 0.79444969, "learning_rate": 2.1934542452826767e-06, "loss": 0.81596696, "num_input_tokens_seen": 173435390, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 8068, "time_per_iteration": 2.469069004058838 }, { "auxiliary_loss_clip": 0.01112121, "auxiliary_loss_mlp": 0.01032896, "balance_loss_clip": 1.02015281, "balance_loss_mlp": 1.03884065, "epoch": 0.4851345257778446, "flos": 20261339078400.0, "grad_norm": 1.3476981682011653, "language_loss": 0.84621418, "learning_rate": 2.193066606145638e-06, "loss": 0.86766434, "num_input_tokens_seen": 173454095, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73046875, "step": 8069, "time_per_iteration": 2.4744069576263428 }, { "auxiliary_loss_clip": 0.01114861, "auxiliary_loss_mlp": 0.01034974, "balance_loss_clip": 1.02186751, "balance_loss_mlp": 1.04038131, "epoch": 0.48519464903051257, "flos": 27089681420160.0, "grad_norm": 2.250357636102211, "language_loss": 0.77900124, "learning_rate": 2.192678959687493e-06, "loss": 0.80049956, "num_input_tokens_seen": 173475300, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.74609375, "step": 8070, "time_per_iteration": 2.5116312503814697 }, { "auxiliary_loss_clip": 0.01116536, "auxiliary_loss_mlp": 0.01031062, "balance_loss_clip": 1.01711428, "balance_loss_mlp": 1.04222894, "epoch": 0.48525477228318054, "flos": 17127235739520.0, "grad_norm": 1.9387698477934587, "language_loss": 0.78141534, "learning_rate": 2.192291305922943e-06, "loss": 0.80289137, "num_input_tokens_seen": 173492005, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7421875, "step": 8071, "time_per_iteration": 2.4412143230438232 }, { "auxiliary_loss_clip": 0.01114121, "auxiliary_loss_mlp": 0.01032576, "balance_loss_clip": 1.01798487, "balance_loss_mlp": 1.0386138, "epoch": 0.4853148955358485, "flos": 28180324028160.0, "grad_norm": 1.8136805720699474, "language_loss": 0.72449863, "learning_rate": 2.1919036448666873e-06, "loss": 0.74596554, "num_input_tokens_seen": 173511995, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75390625, "step": 8072, "time_per_iteration": 2.502866268157959 }, { "auxiliary_loss_clip": 0.01119855, "auxiliary_loss_mlp": 0.01037268, "balance_loss_clip": 1.02301061, "balance_loss_mlp": 1.04233384, "epoch": 0.48537501878851647, "flos": 17493309198720.0, "grad_norm": 2.9374802004250022, "language_loss": 0.87678272, "learning_rate": 2.1915159765334262e-06, "loss": 0.89835393, "num_input_tokens_seen": 173530215, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 8073, "time_per_iteration": 2.423600435256958 }, { "auxiliary_loss_clip": 0.01113372, "auxiliary_loss_mlp": 0.01029667, "balance_loss_clip": 1.01585627, "balance_loss_mlp": 1.0410651, "epoch": 0.48543514204118443, "flos": 28584857975040.0, "grad_norm": 1.7374868000952655, "language_loss": 0.61176503, "learning_rate": 2.19112830093786e-06, "loss": 0.63319546, "num_input_tokens_seen": 173550920, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.72265625, "step": 8074, "time_per_iteration": 2.5174379348754883 }, { "auxiliary_loss_clip": 0.01116487, "auxiliary_loss_mlp": 0.01036375, "balance_loss_clip": 1.02162266, "balance_loss_mlp": 1.0401324, "epoch": 0.4854952652938524, "flos": 20959981585920.0, "grad_norm": 1.6096621873849226, "language_loss": 0.72958398, "learning_rate": 2.19074061809469e-06, "loss": 0.75111258, "num_input_tokens_seen": 173569065, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.765625, "step": 8075, "time_per_iteration": 2.463266134262085 }, { "auxiliary_loss_clip": 0.01112514, "auxiliary_loss_mlp": 0.01037011, "balance_loss_clip": 1.0236423, "balance_loss_mlp": 1.04076099, "epoch": 0.48555538854652036, "flos": 66529543155840.0, "grad_norm": 1.7397109832484916, "language_loss": 0.81805438, "learning_rate": 2.1903529280186163e-06, "loss": 0.83954966, "num_input_tokens_seen": 173596085, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 8076, "time_per_iteration": 2.8516926765441895 }, { "auxiliary_loss_clip": 0.01115613, "auxiliary_loss_mlp": 0.01033824, "balance_loss_clip": 1.01814854, "balance_loss_mlp": 1.04048753, "epoch": 0.4856155117991883, "flos": 15924982596480.0, "grad_norm": 1.9111773578923301, "language_loss": 0.86622947, "learning_rate": 2.1899652307243407e-06, "loss": 0.88772386, "num_input_tokens_seen": 173613900, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.75, "step": 8077, "time_per_iteration": 2.428755044937134 }, { "auxiliary_loss_clip": 0.01040392, "auxiliary_loss_mlp": 0.01000924, "balance_loss_clip": 0.99935037, "balance_loss_mlp": 1.0154016, "epoch": 0.4856756350518563, "flos": 71047395060480.0, "grad_norm": 0.8994120349965166, "language_loss": 0.58503485, "learning_rate": 2.189577526226564e-06, "loss": 0.60544801, "num_input_tokens_seen": 173671305, "router_z_loss_clip": 0.01574707, "router_z_loss_mlp": 0.25, "step": 8078, "time_per_iteration": 3.0678598880767822 }, { "auxiliary_loss_clip": 0.01121254, "auxiliary_loss_mlp": 0.01033851, "balance_loss_clip": 1.01970053, "balance_loss_mlp": 1.04334617, "epoch": 0.48573575830452426, "flos": 29825679346560.0, "grad_norm": 1.7201283113521457, "language_loss": 0.72535706, "learning_rate": 2.1891898145399884e-06, "loss": 0.74690819, "num_input_tokens_seen": 173692070, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.77734375, "step": 8079, "time_per_iteration": 2.5431408882141113 }, { "auxiliary_loss_clip": 0.01120836, "auxiliary_loss_mlp": 0.01032747, "balance_loss_clip": 1.01852548, "balance_loss_mlp": 1.04410422, "epoch": 0.4857958815571922, "flos": 17639501552640.0, "grad_norm": 2.22000291750221, "language_loss": 0.80264342, "learning_rate": 2.1888020956793172e-06, "loss": 0.82417929, "num_input_tokens_seen": 173709785, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.765625, "step": 8080, "time_per_iteration": 2.4712486267089844 }, { "auxiliary_loss_clip": 0.01117786, "auxiliary_loss_mlp": 0.01034685, "balance_loss_clip": 1.02010608, "balance_loss_mlp": 1.04066324, "epoch": 0.4858560048098602, "flos": 21105491581440.0, "grad_norm": 1.9206383491739498, "language_loss": 0.83563268, "learning_rate": 2.188414369659251e-06, "loss": 0.85715735, "num_input_tokens_seen": 173728770, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7734375, "step": 8081, "time_per_iteration": 2.5006844997406006 }, { "auxiliary_loss_clip": 0.01116515, "auxiliary_loss_mlp": 0.01038306, "balance_loss_clip": 1.02254605, "balance_loss_mlp": 1.04062843, "epoch": 0.4859161280625282, "flos": 22090844448000.0, "grad_norm": 1.4457767360549099, "language_loss": 0.83083987, "learning_rate": 2.1880266364944924e-06, "loss": 0.85238802, "num_input_tokens_seen": 173747355, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.7578125, "step": 8082, "time_per_iteration": 2.4808433055877686 }, { "auxiliary_loss_clip": 0.01115759, "auxiliary_loss_mlp": 0.01036063, "balance_loss_clip": 1.02342725, "balance_loss_mlp": 1.04285908, "epoch": 0.4859762513151962, "flos": 17493452853120.0, "grad_norm": 2.0842087951039456, "language_loss": 0.87314296, "learning_rate": 2.187638896199746e-06, "loss": 0.89466119, "num_input_tokens_seen": 173764825, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 8083, "time_per_iteration": 2.4617881774902344 }, { "auxiliary_loss_clip": 0.01116086, "auxiliary_loss_mlp": 0.01041756, "balance_loss_clip": 1.02867293, "balance_loss_mlp": 1.04160953, "epoch": 0.48603637456786414, "flos": 18004246208640.0, "grad_norm": 1.8982925056269166, "language_loss": 0.80984914, "learning_rate": 2.1872511487897126e-06, "loss": 0.83142751, "num_input_tokens_seen": 173783215, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 8084, "time_per_iteration": 2.453606128692627 }, { "auxiliary_loss_clip": 0.01118589, "auxiliary_loss_mlp": 0.01037181, "balance_loss_clip": 1.02260733, "balance_loss_mlp": 1.04147124, "epoch": 0.4860964978205321, "flos": 22492038430080.0, "grad_norm": 1.9460368894231839, "language_loss": 0.68363237, "learning_rate": 2.186863394279098e-06, "loss": 0.70519006, "num_input_tokens_seen": 173801905, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7734375, "step": 8085, "time_per_iteration": 2.4958841800689697 }, { "auxiliary_loss_clip": 0.01117383, "auxiliary_loss_mlp": 0.01036823, "balance_loss_clip": 1.02272606, "balance_loss_mlp": 1.04194677, "epoch": 0.48615662107320007, "flos": 23372532518400.0, "grad_norm": 1.5268476865923855, "language_loss": 0.7730813, "learning_rate": 2.1864756326826046e-06, "loss": 0.79462337, "num_input_tokens_seen": 173824690, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 8086, "time_per_iteration": 2.5711147785186768 }, { "auxiliary_loss_clip": 0.011153, "auxiliary_loss_mlp": 0.0103053, "balance_loss_clip": 1.01668417, "balance_loss_mlp": 1.04049063, "epoch": 0.48621674432586803, "flos": 34418833136640.0, "grad_norm": 1.9099849366882347, "language_loss": 0.69827628, "learning_rate": 2.1860878640149355e-06, "loss": 0.71973455, "num_input_tokens_seen": 173844450, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.74609375, "step": 8087, "time_per_iteration": 2.5787978172302246 }, { "auxiliary_loss_clip": 0.01120715, "auxiliary_loss_mlp": 0.01040829, "balance_loss_clip": 1.0258323, "balance_loss_mlp": 1.04038012, "epoch": 0.486276867578536, "flos": 33107555237760.0, "grad_norm": 2.1086023844985067, "language_loss": 0.72990072, "learning_rate": 2.1857000882907974e-06, "loss": 0.75151616, "num_input_tokens_seen": 173864975, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.80078125, "step": 8088, "time_per_iteration": 2.5536105632781982 }, { "auxiliary_loss_clip": 0.01116046, "auxiliary_loss_mlp": 0.01036255, "balance_loss_clip": 1.02243233, "balance_loss_mlp": 1.04034352, "epoch": 0.48633699083120396, "flos": 21470703114240.0, "grad_norm": 1.4942591052858123, "language_loss": 0.75455296, "learning_rate": 2.185312305524892e-06, "loss": 0.77607596, "num_input_tokens_seen": 173883805, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 8089, "time_per_iteration": 2.4878945350646973 }, { "auxiliary_loss_clip": 0.01116842, "auxiliary_loss_mlp": 0.01035617, "balance_loss_clip": 1.02157378, "balance_loss_mlp": 1.03927088, "epoch": 0.48639711408387193, "flos": 20084335833600.0, "grad_norm": 1.5466273550028848, "language_loss": 0.84108424, "learning_rate": 2.184924515731926e-06, "loss": 0.86260891, "num_input_tokens_seen": 173903520, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.77734375, "step": 8090, "time_per_iteration": 2.460232973098755 }, { "auxiliary_loss_clip": 0.01113564, "auxiliary_loss_mlp": 0.01030736, "balance_loss_clip": 1.0174439, "balance_loss_mlp": 1.04123282, "epoch": 0.4864572373365399, "flos": 20778884190720.0, "grad_norm": 3.49771725397161, "language_loss": 0.76069874, "learning_rate": 2.1845367189266045e-06, "loss": 0.78214169, "num_input_tokens_seen": 173924255, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 8091, "time_per_iteration": 3.9437687397003174 }, { "auxiliary_loss_clip": 0.0111557, "auxiliary_loss_mlp": 0.01030503, "balance_loss_clip": 1.01680601, "balance_loss_mlp": 1.04002643, "epoch": 0.48651736058920786, "flos": 26025360503040.0, "grad_norm": 1.5101298669194358, "language_loss": 0.80439961, "learning_rate": 2.184148915123631e-06, "loss": 0.82586032, "num_input_tokens_seen": 173943285, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75390625, "step": 8092, "time_per_iteration": 5.3115668296813965 }, { "auxiliary_loss_clip": 0.01118, "auxiliary_loss_mlp": 0.01031004, "balance_loss_clip": 1.01697326, "balance_loss_mlp": 1.04179704, "epoch": 0.4865774838418758, "flos": 20485601642880.0, "grad_norm": 1.4177102068406024, "language_loss": 0.7169261, "learning_rate": 2.1837611043377126e-06, "loss": 0.73841608, "num_input_tokens_seen": 173962205, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76171875, "step": 8093, "time_per_iteration": 2.4654505252838135 }, { "auxiliary_loss_clip": 0.01114097, "auxiliary_loss_mlp": 0.01034543, "balance_loss_clip": 1.02134609, "balance_loss_mlp": 1.03946197, "epoch": 0.4866376070945438, "flos": 23547704169600.0, "grad_norm": 1.763111968881086, "language_loss": 0.67811841, "learning_rate": 2.1833732865835545e-06, "loss": 0.69960481, "num_input_tokens_seen": 173980945, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.74609375, "step": 8094, "time_per_iteration": 3.8845245838165283 }, { "auxiliary_loss_clip": 0.01121163, "auxiliary_loss_mlp": 0.01039183, "balance_loss_clip": 1.0243299, "balance_loss_mlp": 1.04257762, "epoch": 0.4866977303472118, "flos": 16690598012160.0, "grad_norm": 2.288342593314149, "language_loss": 0.67019188, "learning_rate": 2.1829854618758636e-06, "loss": 0.69179529, "num_input_tokens_seen": 173998860, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78515625, "step": 8095, "time_per_iteration": 2.4340085983276367 }, { "auxiliary_loss_clip": 0.0111566, "auxiliary_loss_mlp": 0.01033278, "balance_loss_clip": 1.0185256, "balance_loss_mlp": 1.0388757, "epoch": 0.4867578535998798, "flos": 17896011552000.0, "grad_norm": 1.9121407799324814, "language_loss": 0.7887997, "learning_rate": 2.182597630229345e-06, "loss": 0.81028903, "num_input_tokens_seen": 174016665, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.765625, "step": 8096, "time_per_iteration": 2.453040599822998 }, { "auxiliary_loss_clip": 0.01113818, "auxiliary_loss_mlp": 0.01031777, "balance_loss_clip": 1.01806831, "balance_loss_mlp": 1.03942251, "epoch": 0.48681797685254774, "flos": 22637799820800.0, "grad_norm": 2.150115321365913, "language_loss": 0.67705417, "learning_rate": 2.1822097916587067e-06, "loss": 0.69851011, "num_input_tokens_seen": 174034800, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 8097, "time_per_iteration": 2.4884796142578125 }, { "auxiliary_loss_clip": 0.01112414, "auxiliary_loss_mlp": 0.01032142, "balance_loss_clip": 1.01901674, "balance_loss_mlp": 1.03845751, "epoch": 0.4868781001052157, "flos": 20886077352960.0, "grad_norm": 1.5142605226682428, "language_loss": 0.71479034, "learning_rate": 2.1818219461786543e-06, "loss": 0.73623592, "num_input_tokens_seen": 174054445, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 8098, "time_per_iteration": 2.48382306098938 }, { "auxiliary_loss_clip": 0.01121584, "auxiliary_loss_mlp": 0.01037965, "balance_loss_clip": 1.02296233, "balance_loss_mlp": 1.04114866, "epoch": 0.48693822335788367, "flos": 41974940937600.0, "grad_norm": 1.7865352363780882, "language_loss": 0.66417539, "learning_rate": 2.1814340938038956e-06, "loss": 0.68577081, "num_input_tokens_seen": 174077890, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8046875, "step": 8099, "time_per_iteration": 2.650146961212158 }, { "auxiliary_loss_clip": 0.01111832, "auxiliary_loss_mlp": 0.01038522, "balance_loss_clip": 1.02502155, "balance_loss_mlp": 1.03734088, "epoch": 0.48699834661055164, "flos": 24243294021120.0, "grad_norm": 1.7049781568513194, "language_loss": 0.67233092, "learning_rate": 2.181046234549138e-06, "loss": 0.69383448, "num_input_tokens_seen": 174097460, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 8100, "time_per_iteration": 2.4986863136291504 }, { "auxiliary_loss_clip": 0.01111497, "auxiliary_loss_mlp": 0.01033643, "balance_loss_clip": 1.02063084, "balance_loss_mlp": 1.03890634, "epoch": 0.4870584698632196, "flos": 25923877603200.0, "grad_norm": 1.6713934198616613, "language_loss": 0.76567805, "learning_rate": 2.180658368429088e-06, "loss": 0.78712946, "num_input_tokens_seen": 174120775, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 8101, "time_per_iteration": 2.6112418174743652 }, { "auxiliary_loss_clip": 0.01038223, "auxiliary_loss_mlp": 0.01015434, "balance_loss_clip": 1.01407516, "balance_loss_mlp": 1.01325035, "epoch": 0.48711859311588757, "flos": 70211933648640.0, "grad_norm": 0.6904704501869987, "language_loss": 0.52336061, "learning_rate": 2.1802704954584565e-06, "loss": 0.54389721, "num_input_tokens_seen": 174189135, "router_z_loss_clip": 0.01361084, "router_z_loss_mlp": 0.25, "step": 8102, "time_per_iteration": 3.255979061126709 }, { "auxiliary_loss_clip": 0.0111616, "auxiliary_loss_mlp": 0.01030712, "balance_loss_clip": 1.01759267, "balance_loss_mlp": 1.04031062, "epoch": 0.48717871636855553, "flos": 12342964659840.0, "grad_norm": 2.6145730267355787, "language_loss": 0.73865342, "learning_rate": 2.1798826156519484e-06, "loss": 0.76012218, "num_input_tokens_seen": 174203250, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7578125, "step": 8103, "time_per_iteration": 2.45418119430542 }, { "auxiliary_loss_clip": 0.01118476, "auxiliary_loss_mlp": 0.01040474, "balance_loss_clip": 1.02587688, "balance_loss_mlp": 1.04216552, "epoch": 0.4872388396212235, "flos": 23477139901440.0, "grad_norm": 1.7674934182319775, "language_loss": 0.63102436, "learning_rate": 2.1794947290242737e-06, "loss": 0.65261382, "num_input_tokens_seen": 174224145, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.76171875, "step": 8104, "time_per_iteration": 2.5047998428344727 }, { "auxiliary_loss_clip": 0.01114851, "auxiliary_loss_mlp": 0.0103006, "balance_loss_clip": 1.01632142, "balance_loss_mlp": 1.04035294, "epoch": 0.48729896287389146, "flos": 31427582186880.0, "grad_norm": 1.507242263257496, "language_loss": 0.68819177, "learning_rate": 2.1791068355901413e-06, "loss": 0.70964086, "num_input_tokens_seen": 174244435, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.74609375, "step": 8105, "time_per_iteration": 2.5346500873565674 }, { "auxiliary_loss_clip": 0.0111054, "auxiliary_loss_mlp": 0.0102793, "balance_loss_clip": 1.01531744, "balance_loss_mlp": 1.03788829, "epoch": 0.4873590861265594, "flos": 19057936700160.0, "grad_norm": 1.6601375979752333, "language_loss": 0.73136342, "learning_rate": 2.178718935364259e-06, "loss": 0.75274813, "num_input_tokens_seen": 174262710, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 8106, "time_per_iteration": 2.4638218879699707 }, { "auxiliary_loss_clip": 0.01120307, "auxiliary_loss_mlp": 0.01032623, "balance_loss_clip": 1.01889014, "balance_loss_mlp": 1.04297137, "epoch": 0.4874192093792274, "flos": 24348296453760.0, "grad_norm": 2.0593613782346822, "language_loss": 0.76925629, "learning_rate": 2.1783310283613373e-06, "loss": 0.79078555, "num_input_tokens_seen": 174281545, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7734375, "step": 8107, "time_per_iteration": 2.512331485748291 }, { "auxiliary_loss_clip": 0.01113752, "auxiliary_loss_mlp": 0.01032879, "balance_loss_clip": 1.02027893, "balance_loss_mlp": 1.04210174, "epoch": 0.4874793326318954, "flos": 23112610727040.0, "grad_norm": 2.2210535610885067, "language_loss": 0.75101995, "learning_rate": 2.1779431145960853e-06, "loss": 0.77248621, "num_input_tokens_seen": 174300290, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 8108, "time_per_iteration": 2.4891204833984375 }, { "auxiliary_loss_clip": 0.01113195, "auxiliary_loss_mlp": 0.01030429, "balance_loss_clip": 1.01885939, "balance_loss_mlp": 1.04056907, "epoch": 0.4875394558845634, "flos": 19026156142080.0, "grad_norm": 1.8345971899079618, "language_loss": 0.73705149, "learning_rate": 2.177555194083212e-06, "loss": 0.7584877, "num_input_tokens_seen": 174318490, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.7265625, "step": 8109, "time_per_iteration": 2.4471352100372314 }, { "auxiliary_loss_clip": 0.011128, "auxiliary_loss_mlp": 0.01035021, "balance_loss_clip": 1.02118039, "balance_loss_mlp": 1.0400548, "epoch": 0.48759957913723134, "flos": 21433607343360.0, "grad_norm": 1.858281146499044, "language_loss": 0.78501874, "learning_rate": 2.177167266837428e-06, "loss": 0.80649698, "num_input_tokens_seen": 174335505, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7265625, "step": 8110, "time_per_iteration": 2.4650585651397705 }, { "auxiliary_loss_clip": 0.01115015, "auxiliary_loss_mlp": 0.01039193, "balance_loss_clip": 1.02550197, "balance_loss_mlp": 1.04082656, "epoch": 0.4876597023898993, "flos": 17748669962880.0, "grad_norm": 2.4007130059347404, "language_loss": 0.72341764, "learning_rate": 2.176779332873444e-06, "loss": 0.74495971, "num_input_tokens_seen": 174353990, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 8111, "time_per_iteration": 2.449171543121338 }, { "auxiliary_loss_clip": 0.01114549, "auxiliary_loss_mlp": 0.01037344, "balance_loss_clip": 1.02449322, "balance_loss_mlp": 1.04202378, "epoch": 0.4877198256425673, "flos": 17019647527680.0, "grad_norm": 2.3295944181180386, "language_loss": 0.75988901, "learning_rate": 2.17639139220597e-06, "loss": 0.78140795, "num_input_tokens_seen": 174373425, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 8112, "time_per_iteration": 2.474883794784546 }, { "auxiliary_loss_clip": 0.01120026, "auxiliary_loss_mlp": 0.01041786, "balance_loss_clip": 1.02756405, "balance_loss_mlp": 1.04128337, "epoch": 0.48777994889523524, "flos": 22384091082240.0, "grad_norm": 2.0483004392879907, "language_loss": 0.75115395, "learning_rate": 2.1760034448497166e-06, "loss": 0.77277213, "num_input_tokens_seen": 174393070, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 8113, "time_per_iteration": 2.514411211013794 }, { "auxiliary_loss_clip": 0.01038871, "auxiliary_loss_mlp": 0.01016183, "balance_loss_clip": 1.01487219, "balance_loss_mlp": 1.01405454, "epoch": 0.4878400721479032, "flos": 61241772159360.0, "grad_norm": 0.7923969671677855, "language_loss": 0.48897219, "learning_rate": 2.1756154908193943e-06, "loss": 0.50952268, "num_input_tokens_seen": 174446880, "router_z_loss_clip": 0.01312256, "router_z_loss_mlp": 0.24804688, "step": 8114, "time_per_iteration": 2.992821216583252 }, { "auxiliary_loss_clip": 0.01116433, "auxiliary_loss_mlp": 0.01044892, "balance_loss_clip": 1.03062224, "balance_loss_mlp": 1.0396626, "epoch": 0.48790019540057117, "flos": 24536612482560.0, "grad_norm": 1.3952381697309284, "language_loss": 0.76728868, "learning_rate": 2.1752275301297155e-06, "loss": 0.78890193, "num_input_tokens_seen": 174468485, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.765625, "step": 8115, "time_per_iteration": 2.5275793075561523 }, { "auxiliary_loss_clip": 0.01121506, "auxiliary_loss_mlp": 0.01036656, "balance_loss_clip": 1.0225234, "balance_loss_mlp": 1.04404521, "epoch": 0.48796031865323913, "flos": 21833939399040.0, "grad_norm": 2.2323977354699784, "language_loss": 0.71858764, "learning_rate": 2.1748395627953915e-06, "loss": 0.74016929, "num_input_tokens_seen": 174486360, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7734375, "step": 8116, "time_per_iteration": 2.4883882999420166 }, { "auxiliary_loss_clip": 0.01114102, "auxiliary_loss_mlp": 0.01036091, "balance_loss_clip": 1.02253103, "balance_loss_mlp": 1.04112625, "epoch": 0.4880204419059071, "flos": 18588907883520.0, "grad_norm": 1.7239930918404585, "language_loss": 0.63219762, "learning_rate": 2.1744515888311335e-06, "loss": 0.65369952, "num_input_tokens_seen": 174505075, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 8117, "time_per_iteration": 2.4608559608459473 }, { "auxiliary_loss_clip": 0.01113082, "auxiliary_loss_mlp": 0.01038506, "balance_loss_clip": 1.02466035, "balance_loss_mlp": 1.03874707, "epoch": 0.48808056515857506, "flos": 19172168928000.0, "grad_norm": 1.7343386512826908, "language_loss": 0.7984851, "learning_rate": 2.1740636082516533e-06, "loss": 0.82000101, "num_input_tokens_seen": 174523385, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 8118, "time_per_iteration": 2.484208583831787 }, { "auxiliary_loss_clip": 0.01116304, "auxiliary_loss_mlp": 0.01036459, "balance_loss_clip": 1.02218938, "balance_loss_mlp": 1.04014146, "epoch": 0.48814068841124303, "flos": 20120497850880.0, "grad_norm": 1.8020501557997988, "language_loss": 0.63242477, "learning_rate": 2.1736756210716645e-06, "loss": 0.65395242, "num_input_tokens_seen": 174542200, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76171875, "step": 8119, "time_per_iteration": 2.464061737060547 }, { "auxiliary_loss_clip": 0.01115475, "auxiliary_loss_mlp": 0.01033096, "balance_loss_clip": 1.02019143, "balance_loss_mlp": 1.04038501, "epoch": 0.488200811663911, "flos": 22965592360320.0, "grad_norm": 2.632488935086778, "language_loss": 0.72570729, "learning_rate": 2.173287627305878e-06, "loss": 0.74719304, "num_input_tokens_seen": 174563620, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75, "step": 8120, "time_per_iteration": 2.5226571559906006 }, { "auxiliary_loss_clip": 0.01117492, "auxiliary_loss_mlp": 0.01034019, "balance_loss_clip": 1.01970196, "balance_loss_mlp": 1.04084504, "epoch": 0.48826093491657896, "flos": 33910697387520.0, "grad_norm": 2.535940020655419, "language_loss": 0.63133073, "learning_rate": 2.1728996269690075e-06, "loss": 0.6528458, "num_input_tokens_seen": 174586465, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.765625, "step": 8121, "time_per_iteration": 2.5797505378723145 }, { "auxiliary_loss_clip": 0.01120106, "auxiliary_loss_mlp": 0.01035838, "balance_loss_clip": 1.02136612, "balance_loss_mlp": 1.04244494, "epoch": 0.488321058169247, "flos": 23070307484160.0, "grad_norm": 2.142230338704651, "language_loss": 0.83000243, "learning_rate": 2.1725116200757664e-06, "loss": 0.85156184, "num_input_tokens_seen": 174604035, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 8122, "time_per_iteration": 2.493511199951172 }, { "auxiliary_loss_clip": 0.01120184, "auxiliary_loss_mlp": 0.01039357, "balance_loss_clip": 1.02449703, "balance_loss_mlp": 1.04170632, "epoch": 0.48838118142191494, "flos": 19317714837120.0, "grad_norm": 3.6204552003182533, "language_loss": 0.85458469, "learning_rate": 2.172123606640866e-06, "loss": 0.87618005, "num_input_tokens_seen": 174621715, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78125, "step": 8123, "time_per_iteration": 2.4906840324401855 }, { "auxiliary_loss_clip": 0.01117947, "auxiliary_loss_mlp": 0.01032218, "balance_loss_clip": 1.01846755, "balance_loss_mlp": 1.04041779, "epoch": 0.4884413046745829, "flos": 25410678036480.0, "grad_norm": 1.8141569321724982, "language_loss": 0.85531944, "learning_rate": 2.1717355866790227e-06, "loss": 0.87682104, "num_input_tokens_seen": 174643835, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7734375, "step": 8124, "time_per_iteration": 2.5343923568725586 }, { "auxiliary_loss_clip": 0.01119519, "auxiliary_loss_mlp": 0.01032661, "balance_loss_clip": 1.01814747, "balance_loss_mlp": 1.04181898, "epoch": 0.4885014279272509, "flos": 20991546662400.0, "grad_norm": 1.8701418122919449, "language_loss": 0.79342175, "learning_rate": 2.171347560204948e-06, "loss": 0.81494355, "num_input_tokens_seen": 174660955, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.77734375, "step": 8125, "time_per_iteration": 2.481844186782837 }, { "auxiliary_loss_clip": 0.01115233, "auxiliary_loss_mlp": 0.01033545, "balance_loss_clip": 1.02031279, "balance_loss_mlp": 1.03961468, "epoch": 0.48856155117991884, "flos": 13771599269760.0, "grad_norm": 2.6060646565861636, "language_loss": 0.72321117, "learning_rate": 2.170959527233356e-06, "loss": 0.744699, "num_input_tokens_seen": 174678270, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7578125, "step": 8126, "time_per_iteration": 2.470646858215332 }, { "auxiliary_loss_clip": 0.01118671, "auxiliary_loss_mlp": 0.01035343, "balance_loss_clip": 1.02109158, "balance_loss_mlp": 1.04057813, "epoch": 0.4886216744325868, "flos": 32087764206720.0, "grad_norm": 1.9898254193493805, "language_loss": 0.68937802, "learning_rate": 2.1705714877789633e-06, "loss": 0.71091819, "num_input_tokens_seen": 174698360, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.78125, "step": 8127, "time_per_iteration": 2.565383195877075 }, { "auxiliary_loss_clip": 0.01118247, "auxiliary_loss_mlp": 0.01033562, "balance_loss_clip": 1.01920366, "balance_loss_mlp": 1.03908312, "epoch": 0.48868179768525477, "flos": 19610063631360.0, "grad_norm": 2.0266208966133332, "language_loss": 0.76214516, "learning_rate": 2.170183441856481e-06, "loss": 0.78366333, "num_input_tokens_seen": 174716755, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.79296875, "step": 8128, "time_per_iteration": 2.465668201446533 }, { "auxiliary_loss_clip": 0.01118206, "auxiliary_loss_mlp": 0.01032224, "balance_loss_clip": 1.0185802, "balance_loss_mlp": 1.04135203, "epoch": 0.48874192093792274, "flos": 21286912199040.0, "grad_norm": 1.5668540311844095, "language_loss": 0.76135004, "learning_rate": 2.1697953894806265e-06, "loss": 0.78285432, "num_input_tokens_seen": 174735560, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76953125, "step": 8129, "time_per_iteration": 2.4692280292510986 }, { "auxiliary_loss_clip": 0.01115115, "auxiliary_loss_mlp": 0.01030032, "balance_loss_clip": 1.01554179, "balance_loss_mlp": 1.0391295, "epoch": 0.4888020441905907, "flos": 14173439696640.0, "grad_norm": 2.064596374680056, "language_loss": 0.64782828, "learning_rate": 2.169407330666114e-06, "loss": 0.66927969, "num_input_tokens_seen": 174752730, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 8130, "time_per_iteration": 2.4613289833068848 }, { "auxiliary_loss_clip": 0.01113039, "auxiliary_loss_mlp": 0.01030313, "balance_loss_clip": 1.01688457, "balance_loss_mlp": 1.03874564, "epoch": 0.48886216744325867, "flos": 24097891766400.0, "grad_norm": 1.697585228275153, "language_loss": 0.71791673, "learning_rate": 2.169019265427658e-06, "loss": 0.7393502, "num_input_tokens_seen": 174772520, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 8131, "time_per_iteration": 2.4982430934906006 }, { "auxiliary_loss_clip": 0.01120008, "auxiliary_loss_mlp": 0.01039048, "balance_loss_clip": 1.02467167, "balance_loss_mlp": 1.04156947, "epoch": 0.48892229069592663, "flos": 38431419402240.0, "grad_norm": 1.3755511050538773, "language_loss": 0.69603503, "learning_rate": 2.1686311937799745e-06, "loss": 0.71762568, "num_input_tokens_seen": 174796540, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.78125, "step": 8132, "time_per_iteration": 4.141760587692261 }, { "auxiliary_loss_clip": 0.01116292, "auxiliary_loss_mlp": 0.01028369, "balance_loss_clip": 1.01421905, "balance_loss_mlp": 1.04157639, "epoch": 0.4889824139485946, "flos": 23843321101440.0, "grad_norm": 1.4218524768700902, "language_loss": 0.70400715, "learning_rate": 2.1682431157377797e-06, "loss": 0.72545385, "num_input_tokens_seen": 174817840, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.74609375, "step": 8133, "time_per_iteration": 3.907621383666992 }, { "auxiliary_loss_clip": 0.01115999, "auxiliary_loss_mlp": 0.01031644, "balance_loss_clip": 1.01816106, "balance_loss_mlp": 1.04128766, "epoch": 0.48904253720126256, "flos": 24425827960320.0, "grad_norm": 1.8485428095673921, "language_loss": 0.71038258, "learning_rate": 2.1678550313157883e-06, "loss": 0.73185897, "num_input_tokens_seen": 174837885, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75, "step": 8134, "time_per_iteration": 4.0537824630737305 }, { "auxiliary_loss_clip": 0.01121572, "auxiliary_loss_mlp": 0.01036846, "balance_loss_clip": 1.02192068, "balance_loss_mlp": 1.04239416, "epoch": 0.4891026604539306, "flos": 24170682677760.0, "grad_norm": 1.8005217005408616, "language_loss": 0.81050283, "learning_rate": 2.167466940528718e-06, "loss": 0.83208704, "num_input_tokens_seen": 174855240, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.79296875, "step": 8135, "time_per_iteration": 2.4998443126678467 }, { "auxiliary_loss_clip": 0.01112542, "auxiliary_loss_mlp": 0.01035192, "balance_loss_clip": 1.02219248, "balance_loss_mlp": 1.03816104, "epoch": 0.48916278370659855, "flos": 21470954509440.0, "grad_norm": 1.6494769711775763, "language_loss": 0.74576771, "learning_rate": 2.1670788433912843e-06, "loss": 0.76724505, "num_input_tokens_seen": 174875145, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.74609375, "step": 8136, "time_per_iteration": 3.9427106380462646 }, { "auxiliary_loss_clip": 0.01115771, "auxiliary_loss_mlp": 0.0103564, "balance_loss_clip": 1.02209163, "balance_loss_mlp": 1.04072893, "epoch": 0.4892229069592665, "flos": 22309755886080.0, "grad_norm": 1.6464721975098255, "language_loss": 0.73107493, "learning_rate": 2.166690739918204e-06, "loss": 0.75258905, "num_input_tokens_seen": 174894770, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75, "step": 8137, "time_per_iteration": 2.467883348464966 }, { "auxiliary_loss_clip": 0.01116472, "auxiliary_loss_mlp": 0.01033055, "balance_loss_clip": 1.01859522, "balance_loss_mlp": 1.03879201, "epoch": 0.4892830302119345, "flos": 12786856934400.0, "grad_norm": 1.8300248857300172, "language_loss": 0.74736667, "learning_rate": 2.1663026301241944e-06, "loss": 0.76886195, "num_input_tokens_seen": 174912780, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.77734375, "step": 8138, "time_per_iteration": 2.445791006088257 }, { "auxiliary_loss_clip": 0.0111853, "auxiliary_loss_mlp": 0.01034843, "balance_loss_clip": 1.021492, "balance_loss_mlp": 1.04273033, "epoch": 0.48934315346460244, "flos": 20813896972800.0, "grad_norm": 1.5774534511635436, "language_loss": 0.74438834, "learning_rate": 2.165914514023972e-06, "loss": 0.76592207, "num_input_tokens_seen": 174931250, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7578125, "step": 8139, "time_per_iteration": 2.4658358097076416 }, { "auxiliary_loss_clip": 0.01115398, "auxiliary_loss_mlp": 0.01035226, "balance_loss_clip": 1.02208292, "balance_loss_mlp": 1.03950512, "epoch": 0.4894032767172704, "flos": 19755537713280.0, "grad_norm": 1.8470316075936166, "language_loss": 0.62030303, "learning_rate": 2.165526391632255e-06, "loss": 0.64180923, "num_input_tokens_seen": 174951105, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7578125, "step": 8140, "time_per_iteration": 2.4708056449890137 }, { "auxiliary_loss_clip": 0.01119355, "auxiliary_loss_mlp": 0.01037585, "balance_loss_clip": 1.02298212, "balance_loss_mlp": 1.04089475, "epoch": 0.4894633999699384, "flos": 17818982835840.0, "grad_norm": 1.7793797591283502, "language_loss": 0.82369035, "learning_rate": 2.1651382629637608e-06, "loss": 0.84525973, "num_input_tokens_seen": 174969120, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78515625, "step": 8141, "time_per_iteration": 2.4572296142578125 }, { "auxiliary_loss_clip": 0.01122138, "auxiliary_loss_mlp": 0.01037191, "balance_loss_clip": 1.02245665, "balance_loss_mlp": 1.04365849, "epoch": 0.48952352322260634, "flos": 25523222325120.0, "grad_norm": 1.599314430987915, "language_loss": 0.72406721, "learning_rate": 2.1647501280332066e-06, "loss": 0.74566054, "num_input_tokens_seen": 174991295, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78125, "step": 8142, "time_per_iteration": 2.5494325160980225 }, { "auxiliary_loss_clip": 0.01114163, "auxiliary_loss_mlp": 0.01035062, "balance_loss_clip": 1.02178168, "balance_loss_mlp": 1.03924084, "epoch": 0.4895836464752743, "flos": 29055502903680.0, "grad_norm": 1.685696611122889, "language_loss": 0.66954756, "learning_rate": 2.1643619868553105e-06, "loss": 0.6910398, "num_input_tokens_seen": 175012830, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 8143, "time_per_iteration": 2.5100152492523193 }, { "auxiliary_loss_clip": 0.01113369, "auxiliary_loss_mlp": 0.0102958, "balance_loss_clip": 1.01596034, "balance_loss_mlp": 1.03876495, "epoch": 0.48964376972794227, "flos": 33546958312320.0, "grad_norm": 1.628785449025006, "language_loss": 0.75126535, "learning_rate": 2.163973839444793e-06, "loss": 0.77269483, "num_input_tokens_seen": 175035695, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.74609375, "step": 8144, "time_per_iteration": 2.58428692817688 }, { "auxiliary_loss_clip": 0.01116265, "auxiliary_loss_mlp": 0.01032141, "balance_loss_clip": 1.01861644, "balance_loss_mlp": 1.03997445, "epoch": 0.48970389298061023, "flos": 22054035985920.0, "grad_norm": 1.8030124404136285, "language_loss": 0.75714052, "learning_rate": 2.1635856858163695e-06, "loss": 0.77862459, "num_input_tokens_seen": 175056425, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.765625, "step": 8145, "time_per_iteration": 2.486999034881592 }, { "auxiliary_loss_clip": 0.01120098, "auxiliary_loss_mlp": 0.0103604, "balance_loss_clip": 1.02131772, "balance_loss_mlp": 1.04106903, "epoch": 0.4897640162332782, "flos": 20084299920000.0, "grad_norm": 1.924881169902541, "language_loss": 0.8031131, "learning_rate": 2.163197525984761e-06, "loss": 0.82467449, "num_input_tokens_seen": 175074800, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7890625, "step": 8146, "time_per_iteration": 2.4652910232543945 }, { "auxiliary_loss_clip": 0.01112647, "auxiliary_loss_mlp": 0.01030997, "balance_loss_clip": 1.01729441, "balance_loss_mlp": 1.03882754, "epoch": 0.48982413948594616, "flos": 23806225330560.0, "grad_norm": 1.8851922188357375, "language_loss": 0.74173337, "learning_rate": 2.162809359964687e-06, "loss": 0.76316983, "num_input_tokens_seen": 175094500, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73828125, "step": 8147, "time_per_iteration": 2.4960927963256836 }, { "auxiliary_loss_clip": 0.01117721, "auxiliary_loss_mlp": 0.01031554, "balance_loss_clip": 1.01743913, "balance_loss_mlp": 1.0419693, "epoch": 0.4898842627386142, "flos": 17639645207040.0, "grad_norm": 2.2932597926852494, "language_loss": 0.82452422, "learning_rate": 2.162421187770864e-06, "loss": 0.846017, "num_input_tokens_seen": 175112920, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 8148, "time_per_iteration": 2.482215404510498 }, { "auxiliary_loss_clip": 0.01110648, "auxiliary_loss_mlp": 0.01029935, "balance_loss_clip": 1.01754332, "balance_loss_mlp": 1.03865135, "epoch": 0.48994438599128215, "flos": 16617914841600.0, "grad_norm": 1.8994791325798888, "language_loss": 0.74071991, "learning_rate": 2.162033009418015e-06, "loss": 0.76212579, "num_input_tokens_seen": 175129910, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 8149, "time_per_iteration": 2.4313840866088867 }, { "auxiliary_loss_clip": 0.01121751, "auxiliary_loss_mlp": 0.0103161, "balance_loss_clip": 1.01660192, "balance_loss_mlp": 1.04269934, "epoch": 0.4900045092439501, "flos": 26614834600320.0, "grad_norm": 1.813996793045163, "language_loss": 0.76656961, "learning_rate": 2.1616448249208567e-06, "loss": 0.78810322, "num_input_tokens_seen": 175148705, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7890625, "step": 8150, "time_per_iteration": 2.5234296321868896 }, { "auxiliary_loss_clip": 0.01120247, "auxiliary_loss_mlp": 0.01037531, "balance_loss_clip": 1.02298713, "balance_loss_mlp": 1.04277337, "epoch": 0.4900646324966181, "flos": 19902125116800.0, "grad_norm": 2.117659411133752, "language_loss": 0.7244823, "learning_rate": 2.1612566342941106e-06, "loss": 0.74606013, "num_input_tokens_seen": 175167425, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7734375, "step": 8151, "time_per_iteration": 2.4544332027435303 }, { "auxiliary_loss_clip": 0.01042835, "auxiliary_loss_mlp": 0.01006779, "balance_loss_clip": 1.00538385, "balance_loss_mlp": 1.01754594, "epoch": 0.49012475574928605, "flos": 59189620337280.0, "grad_norm": 0.825868950156771, "language_loss": 0.54479784, "learning_rate": 2.1608684375524977e-06, "loss": 0.56529397, "num_input_tokens_seen": 175227985, "router_z_loss_clip": 0.01397705, "router_z_loss_mlp": 0.25390625, "step": 8152, "time_per_iteration": 3.0642991065979004 }, { "auxiliary_loss_clip": 0.011204, "auxiliary_loss_mlp": 0.01034387, "balance_loss_clip": 1.0204097, "balance_loss_mlp": 1.04135942, "epoch": 0.490184879001954, "flos": 45259797657600.0, "grad_norm": 1.7786484153510846, "language_loss": 0.61443758, "learning_rate": 2.1604802347107364e-06, "loss": 0.63598549, "num_input_tokens_seen": 175251895, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7890625, "step": 8153, "time_per_iteration": 2.6852927207946777 }, { "auxiliary_loss_clip": 0.01116397, "auxiliary_loss_mlp": 0.01037395, "balance_loss_clip": 1.02303672, "balance_loss_mlp": 1.03959262, "epoch": 0.490245002254622, "flos": 28002135634560.0, "grad_norm": 1.8268281154179937, "language_loss": 0.77368104, "learning_rate": 2.160092025783549e-06, "loss": 0.79521894, "num_input_tokens_seen": 175272770, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.76953125, "step": 8154, "time_per_iteration": 2.534818410873413 }, { "auxiliary_loss_clip": 0.01044144, "auxiliary_loss_mlp": 0.0100038, "balance_loss_clip": 0.9991585, "balance_loss_mlp": 1.01854837, "epoch": 0.49030512550728994, "flos": 58951318533120.0, "grad_norm": 0.9793539253258876, "language_loss": 0.67063463, "learning_rate": 2.1597038107856564e-06, "loss": 0.69107985, "num_input_tokens_seen": 175336320, "router_z_loss_clip": 0.01220703, "router_z_loss_mlp": 0.2578125, "step": 8155, "time_per_iteration": 3.1574912071228027 }, { "auxiliary_loss_clip": 0.01118919, "auxiliary_loss_mlp": 0.01029787, "balance_loss_clip": 1.01669228, "balance_loss_mlp": 1.0427916, "epoch": 0.4903652487599579, "flos": 19791843384960.0, "grad_norm": 2.0417792060667175, "language_loss": 0.768888, "learning_rate": 2.1593155897317784e-06, "loss": 0.79037505, "num_input_tokens_seen": 175353540, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.76171875, "step": 8156, "time_per_iteration": 2.4763660430908203 }, { "auxiliary_loss_clip": 0.01117166, "auxiliary_loss_mlp": 0.01034018, "balance_loss_clip": 1.02045763, "balance_loss_mlp": 1.04114246, "epoch": 0.49042537201262587, "flos": 21762082241280.0, "grad_norm": 2.0042625446088844, "language_loss": 0.83158374, "learning_rate": 2.1589273626366377e-06, "loss": 0.85309559, "num_input_tokens_seen": 175370445, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7578125, "step": 8157, "time_per_iteration": 2.459984064102173 }, { "auxiliary_loss_clip": 0.01116305, "auxiliary_loss_mlp": 0.01035197, "balance_loss_clip": 1.02092791, "balance_loss_mlp": 1.04047143, "epoch": 0.49048549526529384, "flos": 18953042008320.0, "grad_norm": 1.854301188315874, "language_loss": 0.79889607, "learning_rate": 2.158539129514956e-06, "loss": 0.82041109, "num_input_tokens_seen": 175389020, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7578125, "step": 8158, "time_per_iteration": 2.4604361057281494 }, { "auxiliary_loss_clip": 0.01120335, "auxiliary_loss_mlp": 0.0103188, "balance_loss_clip": 1.01784861, "balance_loss_mlp": 1.0419085, "epoch": 0.4905456185179618, "flos": 26906393295360.0, "grad_norm": 1.5366433257041348, "language_loss": 0.69118375, "learning_rate": 2.158150890381454e-06, "loss": 0.71270585, "num_input_tokens_seen": 175409545, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 8159, "time_per_iteration": 2.514648675918579 }, { "auxiliary_loss_clip": 0.01115927, "auxiliary_loss_mlp": 0.01031565, "balance_loss_clip": 1.01816583, "balance_loss_mlp": 1.04174972, "epoch": 0.49060574177062977, "flos": 20412343854720.0, "grad_norm": 1.9436126701937015, "language_loss": 0.73430526, "learning_rate": 2.157762645250854e-06, "loss": 0.75578022, "num_input_tokens_seen": 175429335, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 8160, "time_per_iteration": 2.4853298664093018 }, { "auxiliary_loss_clip": 0.01120276, "auxiliary_loss_mlp": 0.01041612, "balance_loss_clip": 1.02672243, "balance_loss_mlp": 1.04151845, "epoch": 0.4906658650232978, "flos": 17493704248320.0, "grad_norm": 1.9810695054184597, "language_loss": 0.71763128, "learning_rate": 2.1573743941378796e-06, "loss": 0.73925018, "num_input_tokens_seen": 175446955, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7890625, "step": 8161, "time_per_iteration": 2.440701961517334 }, { "auxiliary_loss_clip": 0.01119654, "auxiliary_loss_mlp": 0.01037688, "balance_loss_clip": 1.02396679, "balance_loss_mlp": 1.04423237, "epoch": 0.49072598827596575, "flos": 26614439550720.0, "grad_norm": 1.9902954444894985, "language_loss": 0.68641013, "learning_rate": 2.1569861370572517e-06, "loss": 0.70798355, "num_input_tokens_seen": 175468195, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75390625, "step": 8162, "time_per_iteration": 2.5225040912628174 }, { "auxiliary_loss_clip": 0.0112193, "auxiliary_loss_mlp": 0.01038492, "balance_loss_clip": 1.02323294, "balance_loss_mlp": 1.04239047, "epoch": 0.4907861115286337, "flos": 20412595249920.0, "grad_norm": 6.8981312506259185, "language_loss": 0.63444293, "learning_rate": 2.1565978740236944e-06, "loss": 0.65604717, "num_input_tokens_seen": 175487455, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.796875, "step": 8163, "time_per_iteration": 2.4479238986968994 }, { "auxiliary_loss_clip": 0.01114699, "auxiliary_loss_mlp": 0.01034123, "balance_loss_clip": 1.02046156, "balance_loss_mlp": 1.04149413, "epoch": 0.4908462347813017, "flos": 14064271286400.0, "grad_norm": 2.6359903234464053, "language_loss": 0.7721191, "learning_rate": 2.1562096050519293e-06, "loss": 0.79360729, "num_input_tokens_seen": 175504450, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 8164, "time_per_iteration": 2.4601590633392334 }, { "auxiliary_loss_clip": 0.01116313, "auxiliary_loss_mlp": 0.0103051, "balance_loss_clip": 1.01554954, "balance_loss_mlp": 1.03959179, "epoch": 0.49090635803396965, "flos": 18735100237440.0, "grad_norm": 1.6112400769553432, "language_loss": 0.76864976, "learning_rate": 2.1558213301566806e-06, "loss": 0.79011804, "num_input_tokens_seen": 175523600, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.765625, "step": 8165, "time_per_iteration": 2.4593679904937744 }, { "auxiliary_loss_clip": 0.01118104, "auxiliary_loss_mlp": 0.01033303, "balance_loss_clip": 1.01967716, "balance_loss_mlp": 1.04295969, "epoch": 0.4909664812866376, "flos": 20558500295040.0, "grad_norm": 1.7850205735927913, "language_loss": 0.77697271, "learning_rate": 2.1554330493526716e-06, "loss": 0.79848677, "num_input_tokens_seen": 175542720, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 8166, "time_per_iteration": 2.5040860176086426 }, { "auxiliary_loss_clip": 0.0104197, "auxiliary_loss_mlp": 0.01002633, "balance_loss_clip": 1.0012207, "balance_loss_mlp": 1.0163821, "epoch": 0.4910266045393056, "flos": 54684017948160.0, "grad_norm": 0.8045719553017218, "language_loss": 0.54226589, "learning_rate": 2.1550447626546253e-06, "loss": 0.56271195, "num_input_tokens_seen": 175598640, "router_z_loss_clip": 0.01409912, "router_z_loss_mlp": 0.25585938, "step": 8167, "time_per_iteration": 3.090040922164917 }, { "auxiliary_loss_clip": 0.01115509, "auxiliary_loss_mlp": 0.01033396, "balance_loss_clip": 1.01941276, "balance_loss_mlp": 1.04176211, "epoch": 0.49108672779197354, "flos": 16246454342400.0, "grad_norm": 2.5441791579751656, "language_loss": 0.86264813, "learning_rate": 2.1546564700772665e-06, "loss": 0.88413727, "num_input_tokens_seen": 175615675, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.734375, "step": 8168, "time_per_iteration": 2.442337989807129 }, { "auxiliary_loss_clip": 0.01114362, "auxiliary_loss_mlp": 0.01037179, "balance_loss_clip": 1.02385759, "balance_loss_mlp": 1.04127288, "epoch": 0.4911468510446415, "flos": 19825419623040.0, "grad_norm": 1.6632943902336328, "language_loss": 0.73562896, "learning_rate": 2.1542681716353193e-06, "loss": 0.75714433, "num_input_tokens_seen": 175632255, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 8169, "time_per_iteration": 2.44730281829834 }, { "auxiliary_loss_clip": 0.01113781, "auxiliary_loss_mlp": 0.01032593, "balance_loss_clip": 1.01924706, "balance_loss_mlp": 1.03876615, "epoch": 0.4912069742973095, "flos": 21212684743680.0, "grad_norm": 1.4653869378962114, "language_loss": 0.78000355, "learning_rate": 2.1538798673435068e-06, "loss": 0.80146724, "num_input_tokens_seen": 175651625, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 8170, "time_per_iteration": 2.4652912616729736 }, { "auxiliary_loss_clip": 0.01117577, "auxiliary_loss_mlp": 0.01039231, "balance_loss_clip": 1.02609468, "balance_loss_mlp": 1.04146087, "epoch": 0.49126709754997744, "flos": 19537129065600.0, "grad_norm": 2.8913963854271185, "language_loss": 0.76067591, "learning_rate": 2.1534915572165545e-06, "loss": 0.78224403, "num_input_tokens_seen": 175669265, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.76171875, "step": 8171, "time_per_iteration": 2.453106641769409 }, { "auxiliary_loss_clip": 0.01118295, "auxiliary_loss_mlp": 0.01041017, "balance_loss_clip": 1.02763009, "balance_loss_mlp": 1.04053664, "epoch": 0.4913272208026454, "flos": 12239686080000.0, "grad_norm": 2.327576313875411, "language_loss": 0.81479573, "learning_rate": 2.1531032412691875e-06, "loss": 0.83638889, "num_input_tokens_seen": 175686065, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.77734375, "step": 8172, "time_per_iteration": 2.45316219329834 }, { "auxiliary_loss_clip": 0.01041048, "auxiliary_loss_mlp": 0.01007411, "balance_loss_clip": 1.00615907, "balance_loss_mlp": 1.01535416, "epoch": 0.49138734405531337, "flos": 65465871661440.0, "grad_norm": 0.917780410820254, "language_loss": 0.53349447, "learning_rate": 2.1527149195161295e-06, "loss": 0.55397904, "num_input_tokens_seen": 175748595, "router_z_loss_clip": 0.01251221, "router_z_loss_mlp": 0.2578125, "step": 8173, "time_per_iteration": 3.0849156379699707 }, { "auxiliary_loss_clip": 0.01118188, "auxiliary_loss_mlp": 0.01037595, "balance_loss_clip": 1.02278304, "balance_loss_mlp": 1.04114234, "epoch": 0.4914474673079814, "flos": 18439052342400.0, "grad_norm": 1.8680470053832374, "language_loss": 0.62842381, "learning_rate": 2.152326591972107e-06, "loss": 0.64998162, "num_input_tokens_seen": 175766770, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7734375, "step": 8174, "time_per_iteration": 3.8790688514709473 }, { "auxiliary_loss_clip": 0.01116829, "auxiliary_loss_mlp": 0.01036087, "balance_loss_clip": 1.02237844, "balance_loss_mlp": 1.0408076, "epoch": 0.49150759056064935, "flos": 21685053525120.0, "grad_norm": 2.710270271473754, "language_loss": 0.69458079, "learning_rate": 2.1519382586518445e-06, "loss": 0.71610993, "num_input_tokens_seen": 175783605, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 8175, "time_per_iteration": 5.3887858390808105 }, { "auxiliary_loss_clip": 0.01117124, "auxiliary_loss_mlp": 0.01032336, "balance_loss_clip": 1.01885962, "balance_loss_mlp": 1.04146802, "epoch": 0.4915677138133173, "flos": 22382439056640.0, "grad_norm": 1.9053312562706544, "language_loss": 0.74238479, "learning_rate": 2.151549919570068e-06, "loss": 0.76387936, "num_input_tokens_seen": 175801390, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 8176, "time_per_iteration": 2.490095615386963 }, { "auxiliary_loss_clip": 0.01117407, "auxiliary_loss_mlp": 0.01040509, "balance_loss_clip": 1.02694941, "balance_loss_mlp": 1.04143953, "epoch": 0.4916278370659853, "flos": 18402890325120.0, "grad_norm": 1.763006918284623, "language_loss": 0.70549709, "learning_rate": 2.1511615747415036e-06, "loss": 0.72707629, "num_input_tokens_seen": 175819830, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7578125, "step": 8177, "time_per_iteration": 3.8919970989227295 }, { "auxiliary_loss_clip": 0.01040542, "auxiliary_loss_mlp": 0.01002362, "balance_loss_clip": 1.00094986, "balance_loss_mlp": 1.0151161, "epoch": 0.49168796031865325, "flos": 66609124715520.0, "grad_norm": 0.6805549426692385, "language_loss": 0.46247524, "learning_rate": 2.150773224180877e-06, "loss": 0.48290431, "num_input_tokens_seen": 175881765, "router_z_loss_clip": 0.01409912, "router_z_loss_mlp": 0.25390625, "step": 8178, "time_per_iteration": 3.0953187942504883 }, { "auxiliary_loss_clip": 0.01120671, "auxiliary_loss_mlp": 0.01034562, "balance_loss_clip": 1.01966047, "balance_loss_mlp": 1.04217935, "epoch": 0.4917480835713212, "flos": 20959335141120.0, "grad_norm": 2.3854263608066884, "language_loss": 0.65723026, "learning_rate": 2.1503848679029147e-06, "loss": 0.67878258, "num_input_tokens_seen": 175901795, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78515625, "step": 8179, "time_per_iteration": 2.488308906555176 }, { "auxiliary_loss_clip": 0.01120985, "auxiliary_loss_mlp": 0.01037782, "balance_loss_clip": 1.02276134, "balance_loss_mlp": 1.0406158, "epoch": 0.4918082068239892, "flos": 15772900412160.0, "grad_norm": 1.7889033982124598, "language_loss": 0.69758642, "learning_rate": 2.149996505922343e-06, "loss": 0.71917409, "num_input_tokens_seen": 175917770, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.8046875, "step": 8180, "time_per_iteration": 2.4277803897857666 }, { "auxiliary_loss_clip": 0.01114455, "auxiliary_loss_mlp": 0.01034462, "balance_loss_clip": 1.02026367, "balance_loss_mlp": 1.03977728, "epoch": 0.49186833007665715, "flos": 24604806453120.0, "grad_norm": 1.989026892999576, "language_loss": 0.84424233, "learning_rate": 2.1496081382538895e-06, "loss": 0.86573148, "num_input_tokens_seen": 175937000, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.74609375, "step": 8181, "time_per_iteration": 2.502122402191162 }, { "auxiliary_loss_clip": 0.01114447, "auxiliary_loss_mlp": 0.01032782, "balance_loss_clip": 1.01958513, "balance_loss_mlp": 1.04134059, "epoch": 0.4919284533293251, "flos": 22090557139200.0, "grad_norm": 2.7111562498386887, "language_loss": 0.72820312, "learning_rate": 2.1492197649122793e-06, "loss": 0.74967539, "num_input_tokens_seen": 175955170, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73046875, "step": 8182, "time_per_iteration": 2.470085620880127 }, { "auxiliary_loss_clip": 0.01116025, "auxiliary_loss_mlp": 0.01036109, "balance_loss_clip": 1.0220964, "balance_loss_mlp": 1.04018593, "epoch": 0.4919885765819931, "flos": 23368043318400.0, "grad_norm": 1.965361447652066, "language_loss": 0.7264024, "learning_rate": 2.1488313859122412e-06, "loss": 0.74792379, "num_input_tokens_seen": 175973725, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 8183, "time_per_iteration": 2.49202299118042 }, { "auxiliary_loss_clip": 0.01120474, "auxiliary_loss_mlp": 0.01030495, "balance_loss_clip": 1.01546288, "balance_loss_mlp": 1.04071975, "epoch": 0.49204869983466104, "flos": 21360493209600.0, "grad_norm": 2.045481043007384, "language_loss": 0.77392846, "learning_rate": 2.1484430012685015e-06, "loss": 0.79543811, "num_input_tokens_seen": 175993885, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.796875, "step": 8184, "time_per_iteration": 2.463823080062866 }, { "auxiliary_loss_clip": 0.01115949, "auxiliary_loss_mlp": 0.01034922, "balance_loss_clip": 1.02119493, "balance_loss_mlp": 1.04066598, "epoch": 0.492108823087329, "flos": 21142695093120.0, "grad_norm": 1.8090674130602424, "language_loss": 0.70809674, "learning_rate": 2.148054610995789e-06, "loss": 0.72960538, "num_input_tokens_seen": 176014210, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75390625, "step": 8185, "time_per_iteration": 2.477079153060913 }, { "auxiliary_loss_clip": 0.01118153, "auxiliary_loss_mlp": 0.01037631, "balance_loss_clip": 1.02203822, "balance_loss_mlp": 1.04090524, "epoch": 0.49216894633999697, "flos": 25116605389440.0, "grad_norm": 1.662234253812285, "language_loss": 0.75361121, "learning_rate": 2.147666215108831e-06, "loss": 0.77516901, "num_input_tokens_seen": 176033890, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.7734375, "step": 8186, "time_per_iteration": 2.47873592376709 }, { "auxiliary_loss_clip": 0.01118942, "auxiliary_loss_mlp": 0.01039203, "balance_loss_clip": 1.02450418, "balance_loss_mlp": 1.04175496, "epoch": 0.49222906959266494, "flos": 22637943475200.0, "grad_norm": 2.100829110880582, "language_loss": 0.68007863, "learning_rate": 2.1472778136223545e-06, "loss": 0.70166016, "num_input_tokens_seen": 176052720, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7734375, "step": 8187, "time_per_iteration": 2.4901037216186523 }, { "auxiliary_loss_clip": 0.01116306, "auxiliary_loss_mlp": 0.0103723, "balance_loss_clip": 1.02291894, "balance_loss_mlp": 1.04091311, "epoch": 0.49228919284533296, "flos": 20410548174720.0, "grad_norm": 1.4901378303539248, "language_loss": 0.66946959, "learning_rate": 2.1468894065510894e-06, "loss": 0.69100499, "num_input_tokens_seen": 176072545, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75390625, "step": 8188, "time_per_iteration": 2.45302677154541 }, { "auxiliary_loss_clip": 0.01118172, "auxiliary_loss_mlp": 0.01031152, "balance_loss_clip": 1.01781297, "balance_loss_mlp": 1.04154539, "epoch": 0.4923493160980009, "flos": 27122359818240.0, "grad_norm": 1.8164642375016473, "language_loss": 0.74870837, "learning_rate": 2.1465009939097623e-06, "loss": 0.77020156, "num_input_tokens_seen": 176091490, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.765625, "step": 8189, "time_per_iteration": 2.5116593837738037 }, { "auxiliary_loss_clip": 0.01114162, "auxiliary_loss_mlp": 0.01032926, "balance_loss_clip": 1.0188477, "balance_loss_mlp": 1.03919661, "epoch": 0.4924094393506689, "flos": 35736683224320.0, "grad_norm": 1.5559991159002275, "language_loss": 0.64347744, "learning_rate": 2.146112575713104e-06, "loss": 0.66494834, "num_input_tokens_seen": 176113200, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 8190, "time_per_iteration": 2.581907033920288 }, { "auxiliary_loss_clip": 0.01116048, "auxiliary_loss_mlp": 0.01027909, "balance_loss_clip": 1.01409316, "balance_loss_mlp": 1.04140341, "epoch": 0.49246956260333685, "flos": 20412487509120.0, "grad_norm": 1.9597595735435287, "language_loss": 0.7195428, "learning_rate": 2.1457241519758413e-06, "loss": 0.74098235, "num_input_tokens_seen": 176132485, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.74609375, "step": 8191, "time_per_iteration": 2.4710500240325928 }, { "auxiliary_loss_clip": 0.01116228, "auxiliary_loss_mlp": 0.01036591, "balance_loss_clip": 1.02264953, "balance_loss_mlp": 1.04018235, "epoch": 0.4925296858560048, "flos": 38976938231040.0, "grad_norm": 1.5863795040546156, "language_loss": 0.71672291, "learning_rate": 2.1453357227127043e-06, "loss": 0.73825109, "num_input_tokens_seen": 176155755, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 8192, "time_per_iteration": 2.614267110824585 }, { "auxiliary_loss_clip": 0.01038365, "auxiliary_loss_mlp": 0.01005082, "balance_loss_clip": 1.00377047, "balance_loss_mlp": 1.01300311, "epoch": 0.4925898091086728, "flos": 64278917712000.0, "grad_norm": 0.7466590918392981, "language_loss": 0.52104223, "learning_rate": 2.1449472879384224e-06, "loss": 0.54147667, "num_input_tokens_seen": 176216295, "router_z_loss_clip": 0.01312256, "router_z_loss_mlp": 0.25390625, "step": 8193, "time_per_iteration": 3.142343759536743 }, { "auxiliary_loss_clip": 0.01115649, "auxiliary_loss_mlp": 0.01042382, "balance_loss_clip": 1.02825022, "balance_loss_mlp": 1.04134881, "epoch": 0.49264993236134075, "flos": 23036372110080.0, "grad_norm": 1.6007182352330567, "language_loss": 0.7692492, "learning_rate": 2.1445588476677246e-06, "loss": 0.79082954, "num_input_tokens_seen": 176235925, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7421875, "step": 8194, "time_per_iteration": 2.4847679138183594 }, { "auxiliary_loss_clip": 0.01114981, "auxiliary_loss_mlp": 0.01034064, "balance_loss_clip": 1.0206058, "balance_loss_mlp": 1.03952122, "epoch": 0.4927100556140087, "flos": 24718212668160.0, "grad_norm": 1.8827178938688063, "language_loss": 0.70229447, "learning_rate": 2.144170401915341e-06, "loss": 0.72378492, "num_input_tokens_seen": 176253865, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 8195, "time_per_iteration": 2.4989418983459473 }, { "auxiliary_loss_clip": 0.01115697, "auxiliary_loss_mlp": 0.01031862, "balance_loss_clip": 1.01836181, "balance_loss_mlp": 1.03952885, "epoch": 0.4927701788666767, "flos": 23505544581120.0, "grad_norm": 1.8519198375895247, "language_loss": 0.8099426, "learning_rate": 2.143781950696001e-06, "loss": 0.83141816, "num_input_tokens_seen": 176271525, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76171875, "step": 8196, "time_per_iteration": 2.4859161376953125 }, { "auxiliary_loss_clip": 0.01118354, "auxiliary_loss_mlp": 0.0103622, "balance_loss_clip": 1.02156293, "balance_loss_mlp": 1.04001462, "epoch": 0.49283030211934464, "flos": 22928891639040.0, "grad_norm": 1.6536399919446345, "language_loss": 0.70626128, "learning_rate": 2.1433934940244356e-06, "loss": 0.72780699, "num_input_tokens_seen": 176290810, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 8197, "time_per_iteration": 2.4676008224487305 }, { "auxiliary_loss_clip": 0.01114233, "auxiliary_loss_mlp": 0.0103744, "balance_loss_clip": 1.02427888, "balance_loss_mlp": 1.03975868, "epoch": 0.4928904253720126, "flos": 16873024210560.0, "grad_norm": 1.9013326050167794, "language_loss": 0.84983689, "learning_rate": 2.143005031915374e-06, "loss": 0.87135363, "num_input_tokens_seen": 176309165, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 8198, "time_per_iteration": 2.427128791809082 }, { "auxiliary_loss_clip": 0.01119408, "auxiliary_loss_mlp": 0.01039317, "balance_loss_clip": 1.024225, "balance_loss_mlp": 1.04167461, "epoch": 0.4929505486246806, "flos": 14866551509760.0, "grad_norm": 2.0156114134747862, "language_loss": 0.7606529, "learning_rate": 2.1426165643835467e-06, "loss": 0.78224015, "num_input_tokens_seen": 176324960, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.77734375, "step": 8199, "time_per_iteration": 2.4604599475860596 }, { "auxiliary_loss_clip": 0.01118955, "auxiliary_loss_mlp": 0.01039427, "balance_loss_clip": 1.0241797, "balance_loss_mlp": 1.04056501, "epoch": 0.49301067187734854, "flos": 23842351434240.0, "grad_norm": 1.6803640385660075, "language_loss": 0.59877938, "learning_rate": 2.1422280914436864e-06, "loss": 0.62036324, "num_input_tokens_seen": 176346195, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.78515625, "step": 8200, "time_per_iteration": 2.503979206085205 }, { "auxiliary_loss_clip": 0.01110736, "auxiliary_loss_mlp": 0.01037252, "balance_loss_clip": 1.02368593, "balance_loss_mlp": 1.03829551, "epoch": 0.49307079513001656, "flos": 22491284244480.0, "grad_norm": 1.7580771436232812, "language_loss": 0.79255903, "learning_rate": 2.1418396131105213e-06, "loss": 0.81403899, "num_input_tokens_seen": 176366735, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 8201, "time_per_iteration": 2.4738004207611084 }, { "auxiliary_loss_clip": 0.01121366, "auxiliary_loss_mlp": 0.01040143, "balance_loss_clip": 1.02456212, "balance_loss_mlp": 1.04020905, "epoch": 0.4931309183826845, "flos": 15924587546880.0, "grad_norm": 3.107477733472762, "language_loss": 0.67817646, "learning_rate": 2.141451129398785e-06, "loss": 0.69979155, "num_input_tokens_seen": 176384475, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8125, "step": 8202, "time_per_iteration": 2.4346067905426025 }, { "auxiliary_loss_clip": 0.0111462, "auxiliary_loss_mlp": 0.01036049, "balance_loss_clip": 1.02219677, "balance_loss_mlp": 1.03882086, "epoch": 0.4931910416353525, "flos": 27309059735040.0, "grad_norm": 1.7719250637704713, "language_loss": 0.75476372, "learning_rate": 2.1410626403232076e-06, "loss": 0.77627039, "num_input_tokens_seen": 176402645, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 8203, "time_per_iteration": 2.5162031650543213 }, { "auxiliary_loss_clip": 0.01114828, "auxiliary_loss_mlp": 0.01036808, "balance_loss_clip": 1.02205038, "balance_loss_mlp": 1.03937793, "epoch": 0.49325116488802045, "flos": 20806139635200.0, "grad_norm": 2.0726230264309513, "language_loss": 0.80018514, "learning_rate": 2.1406741458985197e-06, "loss": 0.82170153, "num_input_tokens_seen": 176416715, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.75390625, "step": 8204, "time_per_iteration": 2.4386146068573 }, { "auxiliary_loss_clip": 0.01113776, "auxiliary_loss_mlp": 0.0103836, "balance_loss_clip": 1.02483606, "balance_loss_mlp": 1.04003596, "epoch": 0.4933112881406884, "flos": 19865963099520.0, "grad_norm": 1.9405116143923256, "language_loss": 0.66154397, "learning_rate": 2.140285646139455e-06, "loss": 0.68306535, "num_input_tokens_seen": 176435755, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73828125, "step": 8205, "time_per_iteration": 2.458055257797241 }, { "auxiliary_loss_clip": 0.01122024, "auxiliary_loss_mlp": 0.01036498, "balance_loss_clip": 1.02055967, "balance_loss_mlp": 1.04167807, "epoch": 0.4933714113933564, "flos": 21827977741440.0, "grad_norm": 1.8184988897826055, "language_loss": 0.66506767, "learning_rate": 2.139897141060744e-06, "loss": 0.6866529, "num_input_tokens_seen": 176453915, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.8046875, "step": 8206, "time_per_iteration": 2.461111068725586 }, { "auxiliary_loss_clip": 0.01116492, "auxiliary_loss_mlp": 0.0103325, "balance_loss_clip": 1.020262, "balance_loss_mlp": 1.0400331, "epoch": 0.49343153464602435, "flos": 27890130049920.0, "grad_norm": 1.6661287126151383, "language_loss": 0.76548797, "learning_rate": 2.1395086306771196e-06, "loss": 0.7869854, "num_input_tokens_seen": 176475175, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.765625, "step": 8207, "time_per_iteration": 2.5343070030212402 }, { "auxiliary_loss_clip": 0.01118598, "auxiliary_loss_mlp": 0.01039149, "balance_loss_clip": 1.02428412, "balance_loss_mlp": 1.04146338, "epoch": 0.4934916578986923, "flos": 24681080983680.0, "grad_norm": 2.296785390161655, "language_loss": 0.60195494, "learning_rate": 2.1391201150033147e-06, "loss": 0.62353241, "num_input_tokens_seen": 176494250, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.76953125, "step": 8208, "time_per_iteration": 2.4717674255371094 }, { "auxiliary_loss_clip": 0.01116929, "auxiliary_loss_mlp": 0.01037095, "balance_loss_clip": 1.02244449, "balance_loss_mlp": 1.04035342, "epoch": 0.4935517811513603, "flos": 23405139089280.0, "grad_norm": 1.8138378052885586, "language_loss": 0.78363025, "learning_rate": 2.1387315940540598e-06, "loss": 0.80517048, "num_input_tokens_seen": 176513325, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.765625, "step": 8209, "time_per_iteration": 2.4769105911254883 }, { "auxiliary_loss_clip": 0.01114308, "auxiliary_loss_mlp": 0.01035694, "balance_loss_clip": 1.02119875, "balance_loss_mlp": 1.03895473, "epoch": 0.49361190440402825, "flos": 21944508439680.0, "grad_norm": 2.47170699052365, "language_loss": 0.78775656, "learning_rate": 2.138343067844089e-06, "loss": 0.80925667, "num_input_tokens_seen": 176532915, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.75390625, "step": 8210, "time_per_iteration": 2.4469470977783203 }, { "auxiliary_loss_clip": 0.01119167, "auxiliary_loss_mlp": 0.01039682, "balance_loss_clip": 1.02431595, "balance_loss_mlp": 1.03968108, "epoch": 0.4936720276566962, "flos": 25115671635840.0, "grad_norm": 1.9322297328776914, "language_loss": 0.81254303, "learning_rate": 2.1379545363881363e-06, "loss": 0.83413148, "num_input_tokens_seen": 176552775, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.796875, "step": 8211, "time_per_iteration": 2.4884231090545654 }, { "auxiliary_loss_clip": 0.01117078, "auxiliary_loss_mlp": 0.01038202, "balance_loss_clip": 1.02396846, "balance_loss_mlp": 1.04000628, "epoch": 0.4937321509093642, "flos": 26358935132160.0, "grad_norm": 2.310409562912084, "language_loss": 0.91258943, "learning_rate": 2.137565999700933e-06, "loss": 0.93414223, "num_input_tokens_seen": 176572185, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76953125, "step": 8212, "time_per_iteration": 2.520482301712036 }, { "auxiliary_loss_clip": 0.01116107, "auxiliary_loss_mlp": 0.01036207, "balance_loss_clip": 1.022367, "balance_loss_mlp": 1.03928232, "epoch": 0.49379227416203214, "flos": 22961390469120.0, "grad_norm": 2.107580812489984, "language_loss": 0.64501894, "learning_rate": 2.1371774577972138e-06, "loss": 0.66654211, "num_input_tokens_seen": 176591490, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76953125, "step": 8213, "time_per_iteration": 2.4787516593933105 }, { "auxiliary_loss_clip": 0.01112257, "auxiliary_loss_mlp": 0.01033862, "balance_loss_clip": 1.01875782, "balance_loss_mlp": 1.03714156, "epoch": 0.49385239741470016, "flos": 32489101843200.0, "grad_norm": 1.845347019577406, "language_loss": 0.75960523, "learning_rate": 2.136788910691711e-06, "loss": 0.78106642, "num_input_tokens_seen": 176612715, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.75, "step": 8214, "time_per_iteration": 2.5562925338745117 }, { "auxiliary_loss_clip": 0.01116404, "auxiliary_loss_mlp": 0.01036923, "balance_loss_clip": 1.02252257, "balance_loss_mlp": 1.04064178, "epoch": 0.4939125206673681, "flos": 22492864442880.0, "grad_norm": 1.790170309903795, "language_loss": 0.84404212, "learning_rate": 2.1364003583991594e-06, "loss": 0.86557543, "num_input_tokens_seen": 176631950, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7578125, "step": 8215, "time_per_iteration": 2.47773814201355 }, { "auxiliary_loss_clip": 0.01109009, "auxiliary_loss_mlp": 0.01029782, "balance_loss_clip": 1.01667511, "balance_loss_mlp": 1.03752708, "epoch": 0.4939726439200361, "flos": 31176351486720.0, "grad_norm": 1.703953172692509, "language_loss": 0.83472419, "learning_rate": 2.136011800934292e-06, "loss": 0.85611212, "num_input_tokens_seen": 176653060, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 8216, "time_per_iteration": 5.353250503540039 }, { "auxiliary_loss_clip": 0.01113083, "auxiliary_loss_mlp": 0.01037485, "balance_loss_clip": 1.02351928, "balance_loss_mlp": 1.03868079, "epoch": 0.49403276717270406, "flos": 22674213233280.0, "grad_norm": 1.5341783206715691, "language_loss": 0.75084579, "learning_rate": 2.1356232383118442e-06, "loss": 0.77235144, "num_input_tokens_seen": 176673895, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.74609375, "step": 8217, "time_per_iteration": 4.043224573135376 }, { "auxiliary_loss_clip": 0.01110934, "auxiliary_loss_mlp": 0.01036811, "balance_loss_clip": 1.02214873, "balance_loss_mlp": 1.03865123, "epoch": 0.494092890425372, "flos": 20741070147840.0, "grad_norm": 1.5772237532517823, "language_loss": 0.79268718, "learning_rate": 2.1352346705465494e-06, "loss": 0.81416464, "num_input_tokens_seen": 176692550, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.72265625, "step": 8218, "time_per_iteration": 2.454793930053711 }, { "auxiliary_loss_clip": 0.01109104, "auxiliary_loss_mlp": 0.01039193, "balance_loss_clip": 1.0257467, "balance_loss_mlp": 1.03680968, "epoch": 0.49415301367804, "flos": 18369026778240.0, "grad_norm": 2.126967292555728, "language_loss": 0.76610947, "learning_rate": 2.134846097653142e-06, "loss": 0.78759241, "num_input_tokens_seen": 176709335, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 8219, "time_per_iteration": 3.8006560802459717 }, { "auxiliary_loss_clip": 0.0111497, "auxiliary_loss_mlp": 0.01035342, "balance_loss_clip": 1.02087641, "balance_loss_mlp": 1.03908145, "epoch": 0.49421313693070795, "flos": 17530620451200.0, "grad_norm": 1.6504999126826079, "language_loss": 0.62538981, "learning_rate": 2.134457519646357e-06, "loss": 0.64689296, "num_input_tokens_seen": 176727715, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 8220, "time_per_iteration": 2.442084550857544 }, { "auxiliary_loss_clip": 0.01112232, "auxiliary_loss_mlp": 0.01035466, "balance_loss_clip": 1.02097034, "balance_loss_mlp": 1.03782678, "epoch": 0.4942732601833759, "flos": 20812173120000.0, "grad_norm": 1.807292615734803, "language_loss": 0.72439742, "learning_rate": 2.1340689365409296e-06, "loss": 0.7458744, "num_input_tokens_seen": 176747530, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7421875, "step": 8221, "time_per_iteration": 2.4500558376312256 }, { "auxiliary_loss_clip": 0.01112507, "auxiliary_loss_mlp": 0.01034275, "balance_loss_clip": 1.02081656, "balance_loss_mlp": 1.04096389, "epoch": 0.4943333834360439, "flos": 15048941794560.0, "grad_norm": 1.6985919482814706, "language_loss": 0.79118311, "learning_rate": 2.133680348351595e-06, "loss": 0.81265092, "num_input_tokens_seen": 176765260, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 8222, "time_per_iteration": 2.4468953609466553 }, { "auxiliary_loss_clip": 0.01115224, "auxiliary_loss_mlp": 0.01036511, "balance_loss_clip": 1.02194321, "balance_loss_mlp": 1.04048777, "epoch": 0.49439350668871185, "flos": 16070420764800.0, "grad_norm": 2.3725606618032695, "language_loss": 0.71864176, "learning_rate": 2.133291755093088e-06, "loss": 0.74015903, "num_input_tokens_seen": 176781770, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.74609375, "step": 8223, "time_per_iteration": 2.411499261856079 }, { "auxiliary_loss_clip": 0.01115172, "auxiliary_loss_mlp": 0.01037946, "balance_loss_clip": 1.02311051, "balance_loss_mlp": 1.03835928, "epoch": 0.4944536299413798, "flos": 20880079781760.0, "grad_norm": 1.6067587289495917, "language_loss": 0.7521404, "learning_rate": 2.132903156780144e-06, "loss": 0.77367157, "num_input_tokens_seen": 176800655, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.76953125, "step": 8224, "time_per_iteration": 2.4679603576660156 }, { "auxiliary_loss_clip": 0.01116584, "auxiliary_loss_mlp": 0.01033247, "balance_loss_clip": 1.01903152, "balance_loss_mlp": 1.04082716, "epoch": 0.4945137531940478, "flos": 26608908856320.0, "grad_norm": 2.3051556479423994, "language_loss": 0.64183331, "learning_rate": 2.1325145534274997e-06, "loss": 0.66333163, "num_input_tokens_seen": 176820610, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7578125, "step": 8225, "time_per_iteration": 2.493340015411377 }, { "auxiliary_loss_clip": 0.0111306, "auxiliary_loss_mlp": 0.0103346, "balance_loss_clip": 1.01979303, "balance_loss_mlp": 1.03773594, "epoch": 0.49457387644671574, "flos": 23988148738560.0, "grad_norm": 4.570482013553007, "language_loss": 0.76522541, "learning_rate": 2.1321259450498893e-06, "loss": 0.78669065, "num_input_tokens_seen": 176840520, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75390625, "step": 8226, "time_per_iteration": 2.489394426345825 }, { "auxiliary_loss_clip": 0.01115278, "auxiliary_loss_mlp": 0.01040398, "balance_loss_clip": 1.02454329, "balance_loss_mlp": 1.03766108, "epoch": 0.49463399969938376, "flos": 26976598427520.0, "grad_norm": 1.6927319665407417, "language_loss": 0.71275079, "learning_rate": 2.131737331662051e-06, "loss": 0.73430753, "num_input_tokens_seen": 176860265, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.7734375, "step": 8227, "time_per_iteration": 2.4800338745117188 }, { "auxiliary_loss_clip": 0.0111913, "auxiliary_loss_mlp": 0.01037587, "balance_loss_clip": 1.02301979, "balance_loss_mlp": 1.04023695, "epoch": 0.49469412295205173, "flos": 29681534067840.0, "grad_norm": 1.6142580343702184, "language_loss": 0.71429294, "learning_rate": 2.131348713278718e-06, "loss": 0.73586011, "num_input_tokens_seen": 176882910, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7890625, "step": 8228, "time_per_iteration": 2.5281217098236084 }, { "auxiliary_loss_clip": 0.01110921, "auxiliary_loss_mlp": 0.01030443, "balance_loss_clip": 1.01609039, "balance_loss_mlp": 1.03766763, "epoch": 0.4947542462047197, "flos": 24131791226880.0, "grad_norm": 1.6902783238334051, "language_loss": 0.8364948, "learning_rate": 2.1309600899146304e-06, "loss": 0.85790837, "num_input_tokens_seen": 176903030, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.734375, "step": 8229, "time_per_iteration": 2.48775577545166 }, { "auxiliary_loss_clip": 0.01113662, "auxiliary_loss_mlp": 0.01037754, "balance_loss_clip": 1.02275705, "balance_loss_mlp": 1.03718281, "epoch": 0.49481436945738766, "flos": 20045049333120.0, "grad_norm": 2.4693906330048536, "language_loss": 0.74585009, "learning_rate": 2.1305714615845227e-06, "loss": 0.7673642, "num_input_tokens_seen": 176919025, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.765625, "step": 8230, "time_per_iteration": 2.4516665935516357 }, { "auxiliary_loss_clip": 0.01113774, "auxiliary_loss_mlp": 0.01030136, "balance_loss_clip": 1.01626015, "balance_loss_mlp": 1.03808224, "epoch": 0.4948744927100556, "flos": 15669550005120.0, "grad_norm": 1.9164456752279957, "language_loss": 0.79483098, "learning_rate": 2.1301828283031314e-06, "loss": 0.81627011, "num_input_tokens_seen": 176937945, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 8231, "time_per_iteration": 2.433120012283325 }, { "auxiliary_loss_clip": 0.01041088, "auxiliary_loss_mlp": 0.01000668, "balance_loss_clip": 0.99943972, "balance_loss_mlp": 1.01597142, "epoch": 0.4949346159627236, "flos": 68872071502080.0, "grad_norm": 0.7446201159267939, "language_loss": 0.6017924, "learning_rate": 2.1297941900851944e-06, "loss": 0.62220997, "num_input_tokens_seen": 177004575, "router_z_loss_clip": 0.01226807, "router_z_loss_mlp": 0.25, "step": 8232, "time_per_iteration": 3.2250823974609375 }, { "auxiliary_loss_clip": 0.01118857, "auxiliary_loss_mlp": 0.01038733, "balance_loss_clip": 1.02372503, "balance_loss_mlp": 1.04006052, "epoch": 0.49499473921539155, "flos": 24790285307520.0, "grad_norm": 2.041493599804292, "language_loss": 0.68690431, "learning_rate": 2.1294055469454496e-06, "loss": 0.7084803, "num_input_tokens_seen": 177024155, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7890625, "step": 8233, "time_per_iteration": 2.5022759437561035 }, { "auxiliary_loss_clip": 0.01110205, "auxiliary_loss_mlp": 0.01031221, "balance_loss_clip": 1.01599216, "balance_loss_mlp": 1.0362817, "epoch": 0.4950548624680595, "flos": 32707905540480.0, "grad_norm": 2.144140118219756, "language_loss": 0.66501909, "learning_rate": 2.129016898898633e-06, "loss": 0.68643332, "num_input_tokens_seen": 177046185, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.73828125, "step": 8234, "time_per_iteration": 2.5937395095825195 }, { "auxiliary_loss_clip": 0.01040461, "auxiliary_loss_mlp": 0.01000698, "balance_loss_clip": 0.99943429, "balance_loss_mlp": 1.01532578, "epoch": 0.4951149857207275, "flos": 50082173066880.0, "grad_norm": 0.8054832223377187, "language_loss": 0.57995635, "learning_rate": 2.128628245959482e-06, "loss": 0.6003679, "num_input_tokens_seen": 177099025, "router_z_loss_clip": 0.01263428, "router_z_loss_mlp": 0.25, "step": 8235, "time_per_iteration": 3.0124804973602295 }, { "auxiliary_loss_clip": 0.01114436, "auxiliary_loss_mlp": 0.01038986, "balance_loss_clip": 1.0242753, "balance_loss_mlp": 1.03768468, "epoch": 0.49517510897339545, "flos": 22236785406720.0, "grad_norm": 1.6759365337656846, "language_loss": 0.77153087, "learning_rate": 2.1282395881427355e-06, "loss": 0.79306507, "num_input_tokens_seen": 177118365, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.765625, "step": 8236, "time_per_iteration": 2.49576735496521 }, { "auxiliary_loss_clip": 0.01115199, "auxiliary_loss_mlp": 0.01034439, "balance_loss_clip": 1.0209502, "balance_loss_mlp": 1.04046166, "epoch": 0.4952352322260634, "flos": 25374120969600.0, "grad_norm": 1.6885811644155728, "language_loss": 0.72684264, "learning_rate": 2.1278509254631315e-06, "loss": 0.748339, "num_input_tokens_seen": 177136415, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.74609375, "step": 8237, "time_per_iteration": 2.5003414154052734 }, { "auxiliary_loss_clip": 0.01111438, "auxiliary_loss_mlp": 0.01036577, "balance_loss_clip": 1.02290964, "balance_loss_mlp": 1.03759575, "epoch": 0.4952953554787314, "flos": 24608721035520.0, "grad_norm": 2.0885668715572963, "language_loss": 0.76186532, "learning_rate": 2.127462257935406e-06, "loss": 0.78334546, "num_input_tokens_seen": 177155690, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73828125, "step": 8238, "time_per_iteration": 2.5036633014678955 }, { "auxiliary_loss_clip": 0.01116667, "auxiliary_loss_mlp": 0.01035865, "balance_loss_clip": 1.02129793, "balance_loss_mlp": 1.03898835, "epoch": 0.49535547873139935, "flos": 17311278049920.0, "grad_norm": 2.5243461777802425, "language_loss": 0.73869073, "learning_rate": 2.1270735855743008e-06, "loss": 0.76021612, "num_input_tokens_seen": 177173350, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.77734375, "step": 8239, "time_per_iteration": 2.418529987335205 }, { "auxiliary_loss_clip": 0.01115659, "auxiliary_loss_mlp": 0.01039512, "balance_loss_clip": 1.02328777, "balance_loss_mlp": 1.03776515, "epoch": 0.4954156019840673, "flos": 20740315962240.0, "grad_norm": 2.649709400777046, "language_loss": 0.78370202, "learning_rate": 2.126684908394552e-06, "loss": 0.80525374, "num_input_tokens_seen": 177191115, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.78125, "step": 8240, "time_per_iteration": 2.4899046421051025 }, { "auxiliary_loss_clip": 0.01111001, "auxiliary_loss_mlp": 0.01042305, "balance_loss_clip": 1.02878082, "balance_loss_mlp": 1.03906035, "epoch": 0.49547572523673533, "flos": 12820684567680.0, "grad_norm": 2.140877472098745, "language_loss": 0.85637283, "learning_rate": 2.126296226410898e-06, "loss": 0.87790585, "num_input_tokens_seen": 177206155, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 8241, "time_per_iteration": 2.4077234268188477 }, { "auxiliary_loss_clip": 0.01112754, "auxiliary_loss_mlp": 0.01037066, "balance_loss_clip": 1.02374399, "balance_loss_mlp": 1.03985023, "epoch": 0.4955358484894033, "flos": 15597046402560.0, "grad_norm": 1.8389619791711524, "language_loss": 0.77245057, "learning_rate": 2.1259075396380794e-06, "loss": 0.79394883, "num_input_tokens_seen": 177224815, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 8242, "time_per_iteration": 2.4675426483154297 }, { "auxiliary_loss_clip": 0.01111609, "auxiliary_loss_mlp": 0.01031272, "balance_loss_clip": 1.0175277, "balance_loss_mlp": 1.03753734, "epoch": 0.49559597174207126, "flos": 26464368528000.0, "grad_norm": 1.6114345845990228, "language_loss": 0.671143, "learning_rate": 2.125518848090833e-06, "loss": 0.69257176, "num_input_tokens_seen": 177244490, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 8243, "time_per_iteration": 2.489874839782715 }, { "auxiliary_loss_clip": 0.01112607, "auxiliary_loss_mlp": 0.01032024, "balance_loss_clip": 1.01849329, "balance_loss_mlp": 1.0399406, "epoch": 0.4956560949947392, "flos": 23148234040320.0, "grad_norm": 2.0450451640463894, "language_loss": 0.68251079, "learning_rate": 2.125130151783901e-06, "loss": 0.70395708, "num_input_tokens_seen": 177264340, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 8244, "time_per_iteration": 2.4797325134277344 }, { "auxiliary_loss_clip": 0.0111617, "auxiliary_loss_mlp": 0.01038084, "balance_loss_clip": 1.0225153, "balance_loss_mlp": 1.03964508, "epoch": 0.4957162182474072, "flos": 20773461237120.0, "grad_norm": 2.093995558464574, "language_loss": 0.75099349, "learning_rate": 2.12474145073202e-06, "loss": 0.77253604, "num_input_tokens_seen": 177283055, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.765625, "step": 8245, "time_per_iteration": 2.427910566329956 }, { "auxiliary_loss_clip": 0.01112676, "auxiliary_loss_mlp": 0.01034157, "balance_loss_clip": 1.02039433, "balance_loss_mlp": 1.03884435, "epoch": 0.49577634150007516, "flos": 18734202397440.0, "grad_norm": 1.9695084552850526, "language_loss": 0.82163674, "learning_rate": 2.1243527449499306e-06, "loss": 0.84310508, "num_input_tokens_seen": 177301140, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.734375, "step": 8246, "time_per_iteration": 2.4537436962127686 }, { "auxiliary_loss_clip": 0.0111678, "auxiliary_loss_mlp": 0.01040594, "balance_loss_clip": 1.02562118, "balance_loss_mlp": 1.03989065, "epoch": 0.4958364647527431, "flos": 25554176870400.0, "grad_norm": 1.810303609876012, "language_loss": 0.84007287, "learning_rate": 2.1239640344523733e-06, "loss": 0.86164665, "num_input_tokens_seen": 177323095, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.76953125, "step": 8247, "time_per_iteration": 2.4821836948394775 }, { "auxiliary_loss_clip": 0.01117365, "auxiliary_loss_mlp": 0.01030086, "balance_loss_clip": 1.01643074, "balance_loss_mlp": 1.04130435, "epoch": 0.4958965880054111, "flos": 24425325169920.0, "grad_norm": 1.9868667010135528, "language_loss": 0.83823925, "learning_rate": 2.123575319254087e-06, "loss": 0.85971379, "num_input_tokens_seen": 177339845, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 8248, "time_per_iteration": 2.5118985176086426 }, { "auxiliary_loss_clip": 0.01117467, "auxiliary_loss_mlp": 0.01032298, "balance_loss_clip": 1.01779628, "balance_loss_mlp": 1.04090452, "epoch": 0.49595671125807905, "flos": 25083460114560.0, "grad_norm": 1.8494633719015077, "language_loss": 0.73433936, "learning_rate": 2.123186599369812e-06, "loss": 0.75583696, "num_input_tokens_seen": 177359980, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.765625, "step": 8249, "time_per_iteration": 2.4982166290283203 }, { "auxiliary_loss_clip": 0.01119621, "auxiliary_loss_mlp": 0.01040993, "balance_loss_clip": 1.02605581, "balance_loss_mlp": 1.04202604, "epoch": 0.496016834510747, "flos": 16435883692800.0, "grad_norm": 1.6985213486093778, "language_loss": 0.75886917, "learning_rate": 2.122797874814289e-06, "loss": 0.78047526, "num_input_tokens_seen": 177378580, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7734375, "step": 8250, "time_per_iteration": 2.489072799682617 }, { "auxiliary_loss_clip": 0.01115633, "auxiliary_loss_mlp": 0.01037761, "balance_loss_clip": 1.02342045, "balance_loss_mlp": 1.0395478, "epoch": 0.496076957763415, "flos": 23437925228160.0, "grad_norm": 2.1649095861197445, "language_loss": 0.70423979, "learning_rate": 2.1224091456022585e-06, "loss": 0.72577369, "num_input_tokens_seen": 177398790, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7578125, "step": 8251, "time_per_iteration": 2.4485621452331543 }, { "auxiliary_loss_clip": 0.01114603, "auxiliary_loss_mlp": 0.01032721, "balance_loss_clip": 1.01953053, "balance_loss_mlp": 1.04016995, "epoch": 0.49613708101608295, "flos": 16909509450240.0, "grad_norm": 1.9733052554885584, "language_loss": 0.80111229, "learning_rate": 2.122020411748461e-06, "loss": 0.82258558, "num_input_tokens_seen": 177416515, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 8252, "time_per_iteration": 2.452056646347046 }, { "auxiliary_loss_clip": 0.01115391, "auxiliary_loss_mlp": 0.01031194, "balance_loss_clip": 1.01538658, "balance_loss_mlp": 1.0395596, "epoch": 0.4961972042687509, "flos": 16618094409600.0, "grad_norm": 1.8172057368926857, "language_loss": 0.80985254, "learning_rate": 2.1216316732676363e-06, "loss": 0.83131838, "num_input_tokens_seen": 177434425, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.7578125, "step": 8253, "time_per_iteration": 2.427774667739868 }, { "auxiliary_loss_clip": 0.01111696, "auxiliary_loss_mlp": 0.01031592, "balance_loss_clip": 1.01834202, "balance_loss_mlp": 1.0373559, "epoch": 0.49625732752141893, "flos": 28956749437440.0, "grad_norm": 3.9196141721635738, "language_loss": 0.67344689, "learning_rate": 2.1212429301745275e-06, "loss": 0.69487977, "num_input_tokens_seen": 177459675, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 8254, "time_per_iteration": 2.6141834259033203 }, { "auxiliary_loss_clip": 0.01114485, "auxiliary_loss_mlp": 0.01039987, "balance_loss_clip": 1.0260756, "balance_loss_mlp": 1.03767538, "epoch": 0.4963174507740869, "flos": 23112359331840.0, "grad_norm": 2.334910042113881, "language_loss": 0.74471742, "learning_rate": 2.1208541824838743e-06, "loss": 0.76626217, "num_input_tokens_seen": 177478895, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 8255, "time_per_iteration": 2.443011999130249 }, { "auxiliary_loss_clip": 0.011128, "auxiliary_loss_mlp": 0.0103602, "balance_loss_clip": 1.02214384, "balance_loss_mlp": 1.0388869, "epoch": 0.49637757402675486, "flos": 13917863450880.0, "grad_norm": 1.8611958726111117, "language_loss": 0.81835926, "learning_rate": 2.1204654302104183e-06, "loss": 0.83984751, "num_input_tokens_seen": 177494920, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 8256, "time_per_iteration": 2.446046829223633 }, { "auxiliary_loss_clip": 0.01110284, "auxiliary_loss_mlp": 0.01028477, "balance_loss_clip": 1.01517892, "balance_loss_mlp": 1.03763151, "epoch": 0.49643769727942283, "flos": 22309001700480.0, "grad_norm": 1.5915761392254684, "language_loss": 0.80684018, "learning_rate": 2.120076673368901e-06, "loss": 0.82822782, "num_input_tokens_seen": 177515455, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 8257, "time_per_iteration": 3.994826316833496 }, { "auxiliary_loss_clip": 0.01119828, "auxiliary_loss_mlp": 0.01037052, "balance_loss_clip": 1.02164435, "balance_loss_mlp": 1.0402801, "epoch": 0.4964978205320908, "flos": 19500248776320.0, "grad_norm": 1.940251480191283, "language_loss": 0.66184723, "learning_rate": 2.1196879119740647e-06, "loss": 0.68341601, "num_input_tokens_seen": 177534040, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.796875, "step": 8258, "time_per_iteration": 3.816655158996582 }, { "auxiliary_loss_clip": 0.01110715, "auxiliary_loss_mlp": 0.01028717, "balance_loss_clip": 1.01601481, "balance_loss_mlp": 1.03723621, "epoch": 0.49655794378475876, "flos": 23436524597760.0, "grad_norm": 1.4436608455263857, "language_loss": 0.7779755, "learning_rate": 2.1192991460406502e-06, "loss": 0.79936981, "num_input_tokens_seen": 177554510, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.734375, "step": 8259, "time_per_iteration": 3.974862575531006 }, { "auxiliary_loss_clip": 0.01116444, "auxiliary_loss_mlp": 0.01031569, "balance_loss_clip": 1.01750803, "balance_loss_mlp": 1.04208446, "epoch": 0.4966180670374267, "flos": 26831124345600.0, "grad_norm": 1.5158726875113002, "language_loss": 0.78473449, "learning_rate": 2.1189103755834e-06, "loss": 0.80621469, "num_input_tokens_seen": 177575780, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7421875, "step": 8260, "time_per_iteration": 2.542285442352295 }, { "auxiliary_loss_clip": 0.01115813, "auxiliary_loss_mlp": 0.01032367, "balance_loss_clip": 1.01806164, "balance_loss_mlp": 1.03955328, "epoch": 0.4966781902900947, "flos": 22009326531840.0, "grad_norm": 3.5625737769745247, "language_loss": 0.76584482, "learning_rate": 2.1185216006170573e-06, "loss": 0.78732663, "num_input_tokens_seen": 177588965, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.76171875, "step": 8261, "time_per_iteration": 3.785461664199829 }, { "auxiliary_loss_clip": 0.01111028, "auxiliary_loss_mlp": 0.01033651, "balance_loss_clip": 1.02001917, "balance_loss_mlp": 1.03894413, "epoch": 0.49673831354276266, "flos": 26213353309440.0, "grad_norm": 1.8407970109932408, "language_loss": 0.89803064, "learning_rate": 2.1181328211563627e-06, "loss": 0.9194774, "num_input_tokens_seen": 177608425, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 8262, "time_per_iteration": 2.5082244873046875 }, { "auxiliary_loss_clip": 0.01113832, "auxiliary_loss_mlp": 0.01031363, "balance_loss_clip": 1.01784503, "balance_loss_mlp": 1.04115605, "epoch": 0.4967984367954306, "flos": 23182277155200.0, "grad_norm": 1.4440848710762793, "language_loss": 0.73781747, "learning_rate": 2.11774403721606e-06, "loss": 0.75926948, "num_input_tokens_seen": 177628240, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 8263, "time_per_iteration": 2.4550654888153076 }, { "auxiliary_loss_clip": 0.01120012, "auxiliary_loss_mlp": 0.0103588, "balance_loss_clip": 1.01989985, "balance_loss_mlp": 1.04301453, "epoch": 0.4968585600480986, "flos": 19281445079040.0, "grad_norm": 2.2785396919830276, "language_loss": 0.69885176, "learning_rate": 2.1173552488108923e-06, "loss": 0.7204107, "num_input_tokens_seen": 177645920, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.76953125, "step": 8264, "time_per_iteration": 2.4635210037231445 }, { "auxiliary_loss_clip": 0.01116419, "auxiliary_loss_mlp": 0.01030156, "balance_loss_clip": 1.01594591, "balance_loss_mlp": 1.03899813, "epoch": 0.49691868330076655, "flos": 22528703237760.0, "grad_norm": 1.4352160769292626, "language_loss": 0.64967495, "learning_rate": 2.1169664559556007e-06, "loss": 0.67114067, "num_input_tokens_seen": 177667185, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 8265, "time_per_iteration": 2.4565696716308594 }, { "auxiliary_loss_clip": 0.01043036, "auxiliary_loss_mlp": 0.01025678, "balance_loss_clip": 1.02446258, "balance_loss_mlp": 1.01788056, "epoch": 0.4969788065534345, "flos": 66577128675840.0, "grad_norm": 0.840414268388477, "language_loss": 0.53519136, "learning_rate": 2.1165776586649304e-06, "loss": 0.55587852, "num_input_tokens_seen": 177733020, "router_z_loss_clip": 0.012146, "router_z_loss_mlp": 0.25195312, "step": 8266, "time_per_iteration": 3.1420063972473145 }, { "auxiliary_loss_clip": 0.01113768, "auxiliary_loss_mlp": 0.01030885, "balance_loss_clip": 1.01718771, "balance_loss_mlp": 1.04124928, "epoch": 0.49703892980610254, "flos": 24059503105920.0, "grad_norm": 1.6514245528116829, "language_loss": 0.79360276, "learning_rate": 2.1161888569536223e-06, "loss": 0.81504929, "num_input_tokens_seen": 177753370, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 8267, "time_per_iteration": 2.4802935123443604 }, { "auxiliary_loss_clip": 0.01117912, "auxiliary_loss_mlp": 0.01035549, "balance_loss_clip": 1.02076674, "balance_loss_mlp": 1.04165697, "epoch": 0.4970990530587705, "flos": 29126174912640.0, "grad_norm": 2.5026737253527775, "language_loss": 0.74909663, "learning_rate": 2.1158000508364223e-06, "loss": 0.77063131, "num_input_tokens_seen": 177771530, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.76171875, "step": 8268, "time_per_iteration": 2.5540566444396973 }, { "auxiliary_loss_clip": 0.01116758, "auxiliary_loss_mlp": 0.01036441, "balance_loss_clip": 1.02148008, "balance_loss_mlp": 1.04176521, "epoch": 0.49715917631143847, "flos": 46026167258880.0, "grad_norm": 1.467024514617923, "language_loss": 0.67825377, "learning_rate": 2.115411240328073e-06, "loss": 0.69978583, "num_input_tokens_seen": 177796355, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.75, "step": 8269, "time_per_iteration": 2.6730053424835205 }, { "auxiliary_loss_clip": 0.01112672, "auxiliary_loss_mlp": 0.01034132, "balance_loss_clip": 1.0200299, "balance_loss_mlp": 1.04065382, "epoch": 0.49721929956410643, "flos": 20191277600640.0, "grad_norm": 1.5651033491292154, "language_loss": 0.85674644, "learning_rate": 2.1150224254433167e-06, "loss": 0.87821448, "num_input_tokens_seen": 177814300, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.71875, "step": 8270, "time_per_iteration": 2.4933955669403076 }, { "auxiliary_loss_clip": 0.01117308, "auxiliary_loss_mlp": 0.01030783, "balance_loss_clip": 1.01847458, "balance_loss_mlp": 1.04107499, "epoch": 0.4972794228167744, "flos": 21653560275840.0, "grad_norm": 2.3085562362344683, "language_loss": 0.70626855, "learning_rate": 2.114633606196899e-06, "loss": 0.72774941, "num_input_tokens_seen": 177833615, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.76171875, "step": 8271, "time_per_iteration": 2.464296817779541 }, { "auxiliary_loss_clip": 0.01117935, "auxiliary_loss_mlp": 0.0103408, "balance_loss_clip": 1.0195787, "balance_loss_mlp": 1.04209399, "epoch": 0.49733954606944236, "flos": 24279743347200.0, "grad_norm": 1.3824909318374121, "language_loss": 0.78492033, "learning_rate": 2.1142447826035635e-06, "loss": 0.80644047, "num_input_tokens_seen": 177855315, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 8272, "time_per_iteration": 2.521013021469116 }, { "auxiliary_loss_clip": 0.01118899, "auxiliary_loss_mlp": 0.01035725, "balance_loss_clip": 1.02172971, "balance_loss_mlp": 1.04315889, "epoch": 0.4973996693221103, "flos": 37852575730560.0, "grad_norm": 2.202391559320881, "language_loss": 0.66996634, "learning_rate": 2.1138559546780544e-06, "loss": 0.69151253, "num_input_tokens_seen": 177875590, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 8273, "time_per_iteration": 2.5908265113830566 }, { "auxiliary_loss_clip": 0.01117779, "auxiliary_loss_mlp": 0.01034506, "balance_loss_clip": 1.02079129, "balance_loss_mlp": 1.04359031, "epoch": 0.4974597925747783, "flos": 21361426963200.0, "grad_norm": 1.7144460037901945, "language_loss": 0.77914226, "learning_rate": 2.1134671224351163e-06, "loss": 0.80066514, "num_input_tokens_seen": 177894175, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 8274, "time_per_iteration": 2.4996025562286377 }, { "auxiliary_loss_clip": 0.01119033, "auxiliary_loss_mlp": 0.01033775, "balance_loss_clip": 1.01902294, "balance_loss_mlp": 1.04168415, "epoch": 0.49751991582744626, "flos": 30738133560960.0, "grad_norm": 1.8002602677586688, "language_loss": 0.75702435, "learning_rate": 2.113078285889493e-06, "loss": 0.77855241, "num_input_tokens_seen": 177913920, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7734375, "step": 8275, "time_per_iteration": 2.5471386909484863 }, { "auxiliary_loss_clip": 0.01119019, "auxiliary_loss_mlp": 0.01036396, "balance_loss_clip": 1.02053499, "balance_loss_mlp": 1.04128587, "epoch": 0.4975800390801142, "flos": 14100541044480.0, "grad_norm": 2.108429742926106, "language_loss": 0.83666706, "learning_rate": 2.1126894450559303e-06, "loss": 0.85822117, "num_input_tokens_seen": 177930425, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.77734375, "step": 8276, "time_per_iteration": 2.464484691619873 }, { "auxiliary_loss_clip": 0.01112685, "auxiliary_loss_mlp": 0.01026793, "balance_loss_clip": 1.01394856, "balance_loss_mlp": 1.04137528, "epoch": 0.4976401623327822, "flos": 24207275658240.0, "grad_norm": 1.5136826973315805, "language_loss": 0.70195365, "learning_rate": 2.112300599949172e-06, "loss": 0.72334838, "num_input_tokens_seen": 177949885, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 8277, "time_per_iteration": 2.4827818870544434 }, { "auxiliary_loss_clip": 0.01113245, "auxiliary_loss_mlp": 0.01036341, "balance_loss_clip": 1.02205336, "balance_loss_mlp": 1.04036307, "epoch": 0.49770028558545015, "flos": 21136769349120.0, "grad_norm": 1.7869679370745442, "language_loss": 0.82220161, "learning_rate": 2.111911750583964e-06, "loss": 0.84369743, "num_input_tokens_seen": 177965720, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7265625, "step": 8278, "time_per_iteration": 2.501986503601074 }, { "auxiliary_loss_clip": 0.01117668, "auxiliary_loss_mlp": 0.01035606, "balance_loss_clip": 1.02125311, "balance_loss_mlp": 1.04138839, "epoch": 0.4977604088381181, "flos": 16763927627520.0, "grad_norm": 2.2392237522642477, "language_loss": 0.67628539, "learning_rate": 2.111522896975052e-06, "loss": 0.69781816, "num_input_tokens_seen": 177983190, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.76171875, "step": 8279, "time_per_iteration": 2.437373399734497 }, { "auxiliary_loss_clip": 0.01114836, "auxiliary_loss_mlp": 0.01033391, "balance_loss_clip": 1.01813197, "balance_loss_mlp": 1.03902125, "epoch": 0.49782053209078614, "flos": 15703521292800.0, "grad_norm": 2.2678694910948667, "language_loss": 0.71208143, "learning_rate": 2.1111340391371794e-06, "loss": 0.73356366, "num_input_tokens_seen": 178000155, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7578125, "step": 8280, "time_per_iteration": 2.4602718353271484 }, { "auxiliary_loss_clip": 0.01115744, "auxiliary_loss_mlp": 0.01036694, "balance_loss_clip": 1.02206743, "balance_loss_mlp": 1.04007101, "epoch": 0.4978806553434541, "flos": 24753692327040.0, "grad_norm": 2.318609786171507, "language_loss": 0.6459173, "learning_rate": 2.1107451770850936e-06, "loss": 0.66744173, "num_input_tokens_seen": 178021060, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7578125, "step": 8281, "time_per_iteration": 2.4895784854888916 }, { "auxiliary_loss_clip": 0.01119293, "auxiliary_loss_mlp": 0.01035731, "balance_loss_clip": 1.02033472, "balance_loss_mlp": 1.04258084, "epoch": 0.49794077859612207, "flos": 13115726881920.0, "grad_norm": 2.2486563492326543, "language_loss": 0.72695428, "learning_rate": 2.1103563108335387e-06, "loss": 0.74850452, "num_input_tokens_seen": 178038180, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.765625, "step": 8282, "time_per_iteration": 2.467311143875122 }, { "auxiliary_loss_clip": 0.0111192, "auxiliary_loss_mlp": 0.01032951, "balance_loss_clip": 1.02052879, "balance_loss_mlp": 1.03948808, "epoch": 0.49800090184879003, "flos": 27525133998720.0, "grad_norm": 1.8664115647974886, "language_loss": 0.73259521, "learning_rate": 2.109967440397263e-06, "loss": 0.75404394, "num_input_tokens_seen": 178057565, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.72265625, "step": 8283, "time_per_iteration": 2.511632204055786 }, { "auxiliary_loss_clip": 0.01114332, "auxiliary_loss_mlp": 0.01034848, "balance_loss_clip": 1.02088261, "balance_loss_mlp": 1.04090691, "epoch": 0.498061025101458, "flos": 19792489829760.0, "grad_norm": 1.5070323809716315, "language_loss": 0.78928673, "learning_rate": 2.1095785657910095e-06, "loss": 0.8107785, "num_input_tokens_seen": 178076965, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.734375, "step": 8284, "time_per_iteration": 2.4960784912109375 }, { "auxiliary_loss_clip": 0.01122522, "auxiliary_loss_mlp": 0.01036426, "balance_loss_clip": 1.02122676, "balance_loss_mlp": 1.04320157, "epoch": 0.49812114835412596, "flos": 29893909230720.0, "grad_norm": 2.8884412526539593, "language_loss": 0.74326932, "learning_rate": 2.109189687029526e-06, "loss": 0.76485872, "num_input_tokens_seen": 178095105, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.79296875, "step": 8285, "time_per_iteration": 2.5187630653381348 }, { "auxiliary_loss_clip": 0.0112086, "auxiliary_loss_mlp": 0.0103246, "balance_loss_clip": 1.01738, "balance_loss_mlp": 1.04593968, "epoch": 0.49818127160679393, "flos": 23147048891520.0, "grad_norm": 1.9407891176130976, "language_loss": 0.74202877, "learning_rate": 2.1088008041275598e-06, "loss": 0.76356196, "num_input_tokens_seen": 178114505, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.75, "step": 8286, "time_per_iteration": 2.5074687004089355 }, { "auxiliary_loss_clip": 0.01118548, "auxiliary_loss_mlp": 0.01040405, "balance_loss_clip": 1.02582538, "balance_loss_mlp": 1.04239154, "epoch": 0.4982413948594619, "flos": 21652806090240.0, "grad_norm": 1.6760695237522782, "language_loss": 0.85826725, "learning_rate": 2.1084119170998545e-06, "loss": 0.87985682, "num_input_tokens_seen": 178131595, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.76171875, "step": 8287, "time_per_iteration": 2.4759302139282227 }, { "auxiliary_loss_clip": 0.01116539, "auxiliary_loss_mlp": 0.01030079, "balance_loss_clip": 1.01577973, "balance_loss_mlp": 1.04122603, "epoch": 0.49830151811212986, "flos": 32486982940800.0, "grad_norm": 2.125089285512104, "language_loss": 0.72230303, "learning_rate": 2.108023025961159e-06, "loss": 0.74376923, "num_input_tokens_seen": 178152055, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75390625, "step": 8288, "time_per_iteration": 2.611133098602295 }, { "auxiliary_loss_clip": 0.01121199, "auxiliary_loss_mlp": 0.01040598, "balance_loss_clip": 1.02479672, "balance_loss_mlp": 1.04243946, "epoch": 0.4983616413647978, "flos": 18142358002560.0, "grad_norm": 7.152960193745452, "language_loss": 0.80235374, "learning_rate": 2.10763413072622e-06, "loss": 0.82397175, "num_input_tokens_seen": 178168150, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.7890625, "step": 8289, "time_per_iteration": 2.4453980922698975 }, { "auxiliary_loss_clip": 0.01115469, "auxiliary_loss_mlp": 0.01036245, "balance_loss_clip": 1.02171373, "balance_loss_mlp": 1.04044032, "epoch": 0.4984217646174658, "flos": 19718836992000.0, "grad_norm": 2.1639653357551634, "language_loss": 0.73690295, "learning_rate": 2.107245231409784e-06, "loss": 0.75842011, "num_input_tokens_seen": 178186150, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.75, "step": 8290, "time_per_iteration": 2.5069260597229004 }, { "auxiliary_loss_clip": 0.01119242, "auxiliary_loss_mlp": 0.01039284, "balance_loss_clip": 1.02301157, "balance_loss_mlp": 1.04278791, "epoch": 0.49848188787013376, "flos": 24936549488640.0, "grad_norm": 1.5536112395325348, "language_loss": 0.84222865, "learning_rate": 2.106856328026598e-06, "loss": 0.86381394, "num_input_tokens_seen": 178207665, "router_z_loss_clip": 0.16210938, "router_z_loss_mlp": 0.765625, "step": 8291, "time_per_iteration": 2.5105319023132324 }, { "auxiliary_loss_clip": 0.01123656, "auxiliary_loss_mlp": 0.01041404, "balance_loss_clip": 1.02563238, "balance_loss_mlp": 1.04386306, "epoch": 0.4985420111228017, "flos": 22382439056640.0, "grad_norm": 2.195548917179531, "language_loss": 0.66870713, "learning_rate": 2.106467420591409e-06, "loss": 0.69035774, "num_input_tokens_seen": 178226325, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.796875, "step": 8292, "time_per_iteration": 2.512373924255371 }, { "auxiliary_loss_clip": 0.01118381, "auxiliary_loss_mlp": 0.01034074, "balance_loss_clip": 1.02057385, "balance_loss_mlp": 1.04257357, "epoch": 0.4986021343754697, "flos": 16216469464320.0, "grad_norm": 1.649299288672035, "language_loss": 0.67167705, "learning_rate": 2.106078509118965e-06, "loss": 0.69320166, "num_input_tokens_seen": 178244960, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 8293, "time_per_iteration": 2.4563677310943604 }, { "auxiliary_loss_clip": 0.01118395, "auxiliary_loss_mlp": 0.01028857, "balance_loss_clip": 1.01499271, "balance_loss_mlp": 1.04204881, "epoch": 0.4986622576281377, "flos": 23403594804480.0, "grad_norm": 2.077467926211567, "language_loss": 0.82248455, "learning_rate": 2.1056895936240133e-06, "loss": 0.84395707, "num_input_tokens_seen": 178265400, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 8294, "time_per_iteration": 2.5378808975219727 }, { "auxiliary_loss_clip": 0.01117738, "auxiliary_loss_mlp": 0.01032335, "balance_loss_clip": 1.01729679, "balance_loss_mlp": 1.04165804, "epoch": 0.49872238088080567, "flos": 19974556892160.0, "grad_norm": 1.6890578951613153, "language_loss": 0.73026311, "learning_rate": 2.1053006741213016e-06, "loss": 0.75176382, "num_input_tokens_seen": 178284535, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.76171875, "step": 8295, "time_per_iteration": 2.468869686126709 }, { "auxiliary_loss_clip": 0.01115292, "auxiliary_loss_mlp": 0.0103762, "balance_loss_clip": 1.02360141, "balance_loss_mlp": 1.04152465, "epoch": 0.49878250413347364, "flos": 22893016930560.0, "grad_norm": 1.7404630790685025, "language_loss": 0.67807484, "learning_rate": 2.1049117506255775e-06, "loss": 0.69960403, "num_input_tokens_seen": 178302425, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.734375, "step": 8296, "time_per_iteration": 2.501727342605591 }, { "auxiliary_loss_clip": 0.0112032, "auxiliary_loss_mlp": 0.0103598, "balance_loss_clip": 1.02059579, "balance_loss_mlp": 1.04374075, "epoch": 0.4988426273861416, "flos": 32598449821440.0, "grad_norm": 2.570182236182127, "language_loss": 0.64788008, "learning_rate": 2.1045228231515895e-06, "loss": 0.66944301, "num_input_tokens_seen": 178323065, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.765625, "step": 8297, "time_per_iteration": 2.553701162338257 }, { "auxiliary_loss_clip": 0.01114774, "auxiliary_loss_mlp": 0.01033374, "balance_loss_clip": 1.01994467, "balance_loss_mlp": 1.0410533, "epoch": 0.49890275063880957, "flos": 20923604087040.0, "grad_norm": 2.0528633826677667, "language_loss": 0.69654369, "learning_rate": 2.1041338917140857e-06, "loss": 0.71802521, "num_input_tokens_seen": 178343985, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 8298, "time_per_iteration": 2.5411479473114014 }, { "auxiliary_loss_clip": 0.01113323, "auxiliary_loss_mlp": 0.01035085, "balance_loss_clip": 1.02119112, "balance_loss_mlp": 1.03952372, "epoch": 0.49896287389147753, "flos": 18624459369600.0, "grad_norm": 1.8126556030672698, "language_loss": 0.84323555, "learning_rate": 2.103744956327814e-06, "loss": 0.86471963, "num_input_tokens_seen": 178362345, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.73828125, "step": 8299, "time_per_iteration": 4.005002498626709 }, { "auxiliary_loss_clip": 0.01119387, "auxiliary_loss_mlp": 0.01040933, "balance_loss_clip": 1.02528048, "balance_loss_mlp": 1.04099274, "epoch": 0.4990229971441455, "flos": 24826555065600.0, "grad_norm": 3.9018634365688043, "language_loss": 0.69451833, "learning_rate": 2.1033560170075234e-06, "loss": 0.71612144, "num_input_tokens_seen": 178383190, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.78125, "step": 8300, "time_per_iteration": 3.9137964248657227 }, { "auxiliary_loss_clip": 0.01041827, "auxiliary_loss_mlp": 0.01009738, "balance_loss_clip": 1.00847995, "balance_loss_mlp": 1.01674008, "epoch": 0.49908312039681346, "flos": 71384525136000.0, "grad_norm": 0.7810823945274333, "language_loss": 0.51159495, "learning_rate": 2.1029670737679623e-06, "loss": 0.53211057, "num_input_tokens_seen": 178444250, "router_z_loss_clip": 0.01257324, "router_z_loss_mlp": 0.25, "step": 8301, "time_per_iteration": 3.1831741333007812 }, { "auxiliary_loss_clip": 0.01113901, "auxiliary_loss_mlp": 0.01041506, "balance_loss_clip": 1.0274756, "balance_loss_mlp": 1.04095662, "epoch": 0.4991432436494814, "flos": 19828651847040.0, "grad_norm": 1.640730248737862, "language_loss": 0.84652728, "learning_rate": 2.102578126623879e-06, "loss": 0.86808133, "num_input_tokens_seen": 178463250, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73046875, "step": 8302, "time_per_iteration": 3.8195011615753174 }, { "auxiliary_loss_clip": 0.01114607, "auxiliary_loss_mlp": 0.0102981, "balance_loss_clip": 1.01692963, "balance_loss_mlp": 1.04191482, "epoch": 0.4992033669021494, "flos": 15121912273920.0, "grad_norm": 3.685886368658136, "language_loss": 0.68979371, "learning_rate": 2.102189175590024e-06, "loss": 0.71123791, "num_input_tokens_seen": 178481340, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 8303, "time_per_iteration": 2.4430837631225586 }, { "auxiliary_loss_clip": 0.01117073, "auxiliary_loss_mlp": 0.01032304, "balance_loss_clip": 1.01773691, "balance_loss_mlp": 1.04052663, "epoch": 0.49926349015481736, "flos": 31207952476800.0, "grad_norm": 1.7348343780397109, "language_loss": 0.72627282, "learning_rate": 2.101800220681144e-06, "loss": 0.74776661, "num_input_tokens_seen": 178501545, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.765625, "step": 8304, "time_per_iteration": 2.573270082473755 }, { "auxiliary_loss_clip": 0.01117296, "auxiliary_loss_mlp": 0.01033801, "balance_loss_clip": 1.02041996, "balance_loss_mlp": 1.04236364, "epoch": 0.4993236134074853, "flos": 24900207903360.0, "grad_norm": 2.168512340580263, "language_loss": 0.80558884, "learning_rate": 2.10141126191199e-06, "loss": 0.8270998, "num_input_tokens_seen": 178519700, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 8305, "time_per_iteration": 2.5203888416290283 }, { "auxiliary_loss_clip": 0.01041568, "auxiliary_loss_mlp": 0.01003794, "balance_loss_clip": 1.00245917, "balance_loss_mlp": 1.0166719, "epoch": 0.4993837366601533, "flos": 70420573797120.0, "grad_norm": 0.7130003926061448, "language_loss": 0.56889629, "learning_rate": 2.1010222992973107e-06, "loss": 0.58934987, "num_input_tokens_seen": 178576740, "router_z_loss_clip": 0.0133667, "router_z_loss_mlp": 0.24902344, "step": 8306, "time_per_iteration": 3.219465970993042 }, { "auxiliary_loss_clip": 0.01117075, "auxiliary_loss_mlp": 0.01039105, "balance_loss_clip": 1.02308297, "balance_loss_mlp": 1.04238725, "epoch": 0.4994438599128213, "flos": 15961216440960.0, "grad_norm": 1.7383568926101765, "language_loss": 0.82759964, "learning_rate": 2.1006333328518556e-06, "loss": 0.84916145, "num_input_tokens_seen": 178594745, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.74609375, "step": 8307, "time_per_iteration": 2.4413645267486572 }, { "auxiliary_loss_clip": 0.0111472, "auxiliary_loss_mlp": 0.01038549, "balance_loss_clip": 1.02426171, "balance_loss_mlp": 1.04028726, "epoch": 0.4995039831654893, "flos": 27928303228800.0, "grad_norm": 1.759277962283655, "language_loss": 0.61162013, "learning_rate": 2.1002443625903748e-06, "loss": 0.63315284, "num_input_tokens_seen": 178614110, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7421875, "step": 8308, "time_per_iteration": 2.5417613983154297 }, { "auxiliary_loss_clip": 0.01110521, "auxiliary_loss_mlp": 0.01032478, "balance_loss_clip": 1.01907885, "balance_loss_mlp": 1.03821516, "epoch": 0.49956410641815724, "flos": 24204797619840.0, "grad_norm": 1.9941524150673025, "language_loss": 0.74687809, "learning_rate": 2.0998553885276168e-06, "loss": 0.76830804, "num_input_tokens_seen": 178634170, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 8309, "time_per_iteration": 2.513169765472412 }, { "auxiliary_loss_clip": 0.01113863, "auxiliary_loss_mlp": 0.01037152, "balance_loss_clip": 1.02359247, "balance_loss_mlp": 1.03922844, "epoch": 0.4996242296708252, "flos": 16180127879040.0, "grad_norm": 1.9119938173048243, "language_loss": 0.79698217, "learning_rate": 2.0994664106783335e-06, "loss": 0.81849235, "num_input_tokens_seen": 178651775, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.74609375, "step": 8310, "time_per_iteration": 2.489797592163086 }, { "auxiliary_loss_clip": 0.01115972, "auxiliary_loss_mlp": 0.01038712, "balance_loss_clip": 1.02519941, "balance_loss_mlp": 1.03967547, "epoch": 0.49968435292349317, "flos": 16873527000960.0, "grad_norm": 1.594980098240406, "language_loss": 0.70502198, "learning_rate": 2.0990774290572735e-06, "loss": 0.72656882, "num_input_tokens_seen": 178669720, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.765625, "step": 8311, "time_per_iteration": 2.4364116191864014 }, { "auxiliary_loss_clip": 0.01115833, "auxiliary_loss_mlp": 0.01040535, "balance_loss_clip": 1.02763629, "balance_loss_mlp": 1.04170942, "epoch": 0.49974447617616113, "flos": 14939521989120.0, "grad_norm": 1.9051287642146557, "language_loss": 0.77531165, "learning_rate": 2.098688443679187e-06, "loss": 0.79687536, "num_input_tokens_seen": 178686765, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 8312, "time_per_iteration": 2.4706408977508545 }, { "auxiliary_loss_clip": 0.01115876, "auxiliary_loss_mlp": 0.01039831, "balance_loss_clip": 1.02585971, "balance_loss_mlp": 1.04067039, "epoch": 0.4998045994288291, "flos": 26651535321600.0, "grad_norm": 3.599914716800203, "language_loss": 0.84661806, "learning_rate": 2.0982994545588256e-06, "loss": 0.86817515, "num_input_tokens_seen": 178705845, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75, "step": 8313, "time_per_iteration": 2.5014705657958984 }, { "auxiliary_loss_clip": 0.01115005, "auxiliary_loss_mlp": 0.01038281, "balance_loss_clip": 1.0243516, "balance_loss_mlp": 1.03959584, "epoch": 0.49986472268149706, "flos": 20953768533120.0, "grad_norm": 1.825317826647082, "language_loss": 0.80778915, "learning_rate": 2.097910461710939e-06, "loss": 0.82932198, "num_input_tokens_seen": 178723410, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75390625, "step": 8314, "time_per_iteration": 2.469806671142578 }, { "auxiliary_loss_clip": 0.01118407, "auxiliary_loss_mlp": 0.01051091, "balance_loss_clip": 1.0360465, "balance_loss_mlp": 1.04234123, "epoch": 0.49992484593416503, "flos": 22783884433920.0, "grad_norm": 1.7663483437532197, "language_loss": 0.7957086, "learning_rate": 2.0975214651502773e-06, "loss": 0.81740355, "num_input_tokens_seen": 178743560, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.76171875, "step": 8315, "time_per_iteration": 2.4694294929504395 }, { "auxiliary_loss_clip": 0.011154, "auxiliary_loss_mlp": 0.01042191, "balance_loss_clip": 1.02876842, "balance_loss_mlp": 1.04142761, "epoch": 0.499984969186833, "flos": 46786970252160.0, "grad_norm": 1.5482774896168647, "language_loss": 0.74861431, "learning_rate": 2.0971324648915926e-06, "loss": 0.77019024, "num_input_tokens_seen": 178767225, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73828125, "step": 8316, "time_per_iteration": 2.693337917327881 }, { "auxiliary_loss_clip": 0.01113132, "auxiliary_loss_mlp": 0.01037017, "balance_loss_clip": 1.02391613, "balance_loss_mlp": 1.04063594, "epoch": 0.500045092439501, "flos": 25556978131200.0, "grad_norm": 1.4264192178200945, "language_loss": 0.81285381, "learning_rate": 2.0967434609496343e-06, "loss": 0.83435524, "num_input_tokens_seen": 178786810, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 8317, "time_per_iteration": 2.5034725666046143 }, { "auxiliary_loss_clip": 0.01115333, "auxiliary_loss_mlp": 0.01044779, "balance_loss_clip": 1.03033042, "balance_loss_mlp": 1.03976297, "epoch": 0.5001052156921689, "flos": 20704764476160.0, "grad_norm": 1.6806376220317543, "language_loss": 0.8306278, "learning_rate": 2.0963544533391548e-06, "loss": 0.852229, "num_input_tokens_seen": 178805660, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 8318, "time_per_iteration": 2.482541084289551 }, { "auxiliary_loss_clip": 0.01115315, "auxiliary_loss_mlp": 0.01037102, "balance_loss_clip": 1.02340484, "balance_loss_mlp": 1.04073501, "epoch": 0.500165338944837, "flos": 21251109317760.0, "grad_norm": 1.8054918176911905, "language_loss": 0.81848037, "learning_rate": 2.0959654420749045e-06, "loss": 0.84000456, "num_input_tokens_seen": 178824780, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.74609375, "step": 8319, "time_per_iteration": 2.4444167613983154 }, { "auxiliary_loss_clip": 0.01113963, "auxiliary_loss_mlp": 0.01035574, "balance_loss_clip": 1.02269971, "balance_loss_mlp": 1.03922462, "epoch": 0.5002254621975049, "flos": 27854398995840.0, "grad_norm": 1.695320014371538, "language_loss": 0.71850705, "learning_rate": 2.095576427171635e-06, "loss": 0.74000239, "num_input_tokens_seen": 178845640, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75, "step": 8320, "time_per_iteration": 2.523510217666626 }, { "auxiliary_loss_clip": 0.01122329, "auxiliary_loss_mlp": 0.01049103, "balance_loss_clip": 1.03361821, "balance_loss_mlp": 1.04056215, "epoch": 0.5002855854501729, "flos": 15551941898880.0, "grad_norm": 3.2693182508874488, "language_loss": 0.76820993, "learning_rate": 2.0951874086440978e-06, "loss": 0.78992426, "num_input_tokens_seen": 178862290, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.8203125, "step": 8321, "time_per_iteration": 2.4336209297180176 }, { "auxiliary_loss_clip": 0.01116154, "auxiliary_loss_mlp": 0.01044844, "balance_loss_clip": 1.03127766, "balance_loss_mlp": 1.04109287, "epoch": 0.5003457087028408, "flos": 16107408794880.0, "grad_norm": 1.6133967299438439, "language_loss": 0.83094877, "learning_rate": 2.0947983865070455e-06, "loss": 0.85255873, "num_input_tokens_seen": 178879805, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75, "step": 8322, "time_per_iteration": 2.4583487510681152 }, { "auxiliary_loss_clip": 0.01118531, "auxiliary_loss_mlp": 0.01035499, "balance_loss_clip": 1.02177238, "balance_loss_mlp": 1.04203677, "epoch": 0.5004058319555088, "flos": 22710518904960.0, "grad_norm": 2.1960087374726225, "language_loss": 0.73611188, "learning_rate": 2.094409360775228e-06, "loss": 0.75765216, "num_input_tokens_seen": 178896985, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.765625, "step": 8323, "time_per_iteration": 2.4716873168945312 }, { "auxiliary_loss_clip": 0.01116546, "auxiliary_loss_mlp": 0.01040852, "balance_loss_clip": 1.02662456, "balance_loss_mlp": 1.04148471, "epoch": 0.5004659552081767, "flos": 30117956313600.0, "grad_norm": 1.4624395903909349, "language_loss": 0.69338584, "learning_rate": 2.0940203314633977e-06, "loss": 0.71495986, "num_input_tokens_seen": 178920605, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75, "step": 8324, "time_per_iteration": 2.564232110977173 }, { "auxiliary_loss_clip": 0.01112563, "auxiliary_loss_mlp": 0.01037107, "balance_loss_clip": 1.02332628, "balance_loss_mlp": 1.03804231, "epoch": 0.5005260784608447, "flos": 18624710764800.0, "grad_norm": 1.9707202701688515, "language_loss": 0.72056067, "learning_rate": 2.0936312985863077e-06, "loss": 0.74205738, "num_input_tokens_seen": 178937760, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.74609375, "step": 8325, "time_per_iteration": 2.4520485401153564 }, { "auxiliary_loss_clip": 0.01117244, "auxiliary_loss_mlp": 0.01041227, "balance_loss_clip": 1.02590847, "balance_loss_mlp": 1.04116678, "epoch": 0.5005862017135126, "flos": 24859987649280.0, "grad_norm": 1.6004204984580845, "language_loss": 0.73890865, "learning_rate": 2.093242262158709e-06, "loss": 0.76049334, "num_input_tokens_seen": 178957985, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.76171875, "step": 8326, "time_per_iteration": 2.499319553375244 }, { "auxiliary_loss_clip": 0.01113632, "auxiliary_loss_mlp": 0.0103429, "balance_loss_clip": 1.02063417, "balance_loss_mlp": 1.03929019, "epoch": 0.5006463249661807, "flos": 18734381965440.0, "grad_norm": 1.5028888562391658, "language_loss": 0.78449237, "learning_rate": 2.0928532221953544e-06, "loss": 0.80597162, "num_input_tokens_seen": 178977070, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 8327, "time_per_iteration": 2.450352430343628 }, { "auxiliary_loss_clip": 0.01118292, "auxiliary_loss_mlp": 0.01036061, "balance_loss_clip": 1.0220542, "balance_loss_mlp": 1.04194558, "epoch": 0.5007064482188487, "flos": 13042145871360.0, "grad_norm": 2.3544066821490666, "language_loss": 0.87939519, "learning_rate": 2.092464178710997e-06, "loss": 0.90093875, "num_input_tokens_seen": 178994175, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 8328, "time_per_iteration": 2.44118595123291 }, { "auxiliary_loss_clip": 0.0111928, "auxiliary_loss_mlp": 0.01034645, "balance_loss_clip": 1.02114511, "balance_loss_mlp": 1.04096413, "epoch": 0.5007665714715166, "flos": 21288671965440.0, "grad_norm": 2.2148188482538567, "language_loss": 0.74555129, "learning_rate": 2.092075131720388e-06, "loss": 0.76709056, "num_input_tokens_seen": 179013710, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.78515625, "step": 8329, "time_per_iteration": 2.4858736991882324 }, { "auxiliary_loss_clip": 0.0111411, "auxiliary_loss_mlp": 0.01032125, "balance_loss_clip": 1.01897633, "balance_loss_mlp": 1.04155219, "epoch": 0.5008266947241846, "flos": 29754576374400.0, "grad_norm": 1.6584417557245459, "language_loss": 0.79619557, "learning_rate": 2.091686081238281e-06, "loss": 0.81765795, "num_input_tokens_seen": 179035255, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 8330, "time_per_iteration": 2.56679630279541 }, { "auxiliary_loss_clip": 0.01041913, "auxiliary_loss_mlp": 0.01004727, "balance_loss_clip": 1.00340366, "balance_loss_mlp": 1.01671219, "epoch": 0.5008868179768525, "flos": 63557829204480.0, "grad_norm": 0.7353057948421156, "language_loss": 0.56090462, "learning_rate": 2.0912970272794282e-06, "loss": 0.58137101, "num_input_tokens_seen": 179090915, "router_z_loss_clip": 0.01324463, "router_z_loss_mlp": 0.25195312, "step": 8331, "time_per_iteration": 2.9265522956848145 }, { "auxiliary_loss_clip": 0.01113988, "auxiliary_loss_mlp": 0.01029975, "balance_loss_clip": 1.01721978, "balance_loss_mlp": 1.04140139, "epoch": 0.5009469412295205, "flos": 27375637593600.0, "grad_norm": 1.9920365823046453, "language_loss": 0.65490597, "learning_rate": 2.0909079698585833e-06, "loss": 0.67634559, "num_input_tokens_seen": 179109160, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 8332, "time_per_iteration": 2.5258240699768066 }, { "auxiliary_loss_clip": 0.011117, "auxiliary_loss_mlp": 0.01030615, "balance_loss_clip": 1.01799035, "balance_loss_mlp": 1.03922081, "epoch": 0.5010070644821885, "flos": 27378833904000.0, "grad_norm": 1.4247022586507423, "language_loss": 0.74181914, "learning_rate": 2.0905189089904993e-06, "loss": 0.76324224, "num_input_tokens_seen": 179130610, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 8333, "time_per_iteration": 2.5367937088012695 }, { "auxiliary_loss_clip": 0.01114626, "auxiliary_loss_mlp": 0.01030229, "balance_loss_clip": 1.01722336, "balance_loss_mlp": 1.03868842, "epoch": 0.5010671877348565, "flos": 20662748542080.0, "grad_norm": 1.90931643031356, "language_loss": 0.80273974, "learning_rate": 2.090129844689929e-06, "loss": 0.82418835, "num_input_tokens_seen": 179147860, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7578125, "step": 8334, "time_per_iteration": 2.468031167984009 }, { "auxiliary_loss_clip": 0.01041485, "auxiliary_loss_mlp": 0.0100025, "balance_loss_clip": 0.99895108, "balance_loss_mlp": 1.01632774, "epoch": 0.5011273109875244, "flos": 59128645000320.0, "grad_norm": 0.9124232519480245, "language_loss": 0.62710798, "learning_rate": 2.089740776971626e-06, "loss": 0.64752531, "num_input_tokens_seen": 179210490, "router_z_loss_clip": 0.01300049, "router_z_loss_mlp": 0.25, "step": 8335, "time_per_iteration": 3.072697877883911 }, { "auxiliary_loss_clip": 0.01108623, "auxiliary_loss_mlp": 0.01030623, "balance_loss_clip": 1.01756334, "balance_loss_mlp": 1.03653789, "epoch": 0.5011874342401924, "flos": 25336342840320.0, "grad_norm": 1.4383587009548053, "language_loss": 0.7966097, "learning_rate": 2.0893517058503435e-06, "loss": 0.81800222, "num_input_tokens_seen": 179231360, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 8336, "time_per_iteration": 2.497323751449585 }, { "auxiliary_loss_clip": 0.01112592, "auxiliary_loss_mlp": 0.01028216, "balance_loss_clip": 1.01464963, "balance_loss_mlp": 1.03816533, "epoch": 0.5012475574928603, "flos": 20229953569920.0, "grad_norm": 1.8034129008696327, "language_loss": 0.79603952, "learning_rate": 2.088962631340836e-06, "loss": 0.8174476, "num_input_tokens_seen": 179250625, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.74609375, "step": 8337, "time_per_iteration": 2.4898390769958496 }, { "auxiliary_loss_clip": 0.01116839, "auxiliary_loss_mlp": 0.0103227, "balance_loss_clip": 1.01815009, "balance_loss_mlp": 1.03878427, "epoch": 0.5013076807455283, "flos": 22710123855360.0, "grad_norm": 1.9277596284488927, "language_loss": 0.79551256, "learning_rate": 2.0885735534578555e-06, "loss": 0.81700361, "num_input_tokens_seen": 179267360, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.78125, "step": 8338, "time_per_iteration": 2.466203212738037 }, { "auxiliary_loss_clip": 0.01112189, "auxiliary_loss_mlp": 0.0102832, "balance_loss_clip": 1.01415193, "balance_loss_mlp": 1.03675103, "epoch": 0.5013678039981962, "flos": 24245161528320.0, "grad_norm": 1.6249719475880402, "language_loss": 0.85171759, "learning_rate": 2.0881844722161583e-06, "loss": 0.87312269, "num_input_tokens_seen": 179289810, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75390625, "step": 8339, "time_per_iteration": 2.5206573009490967 }, { "auxiliary_loss_clip": 0.0111087, "auxiliary_loss_mlp": 0.01031587, "balance_loss_clip": 1.01869416, "balance_loss_mlp": 1.03759408, "epoch": 0.5014279272508643, "flos": 26176688501760.0, "grad_norm": 1.5810241474986948, "language_loss": 0.70609522, "learning_rate": 2.0877953876304962e-06, "loss": 0.72751975, "num_input_tokens_seen": 179310620, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73046875, "step": 8340, "time_per_iteration": 2.497567653656006 }, { "auxiliary_loss_clip": 0.01118971, "auxiliary_loss_mlp": 0.01034159, "balance_loss_clip": 1.01994932, "balance_loss_mlp": 1.04141188, "epoch": 0.5014880505035323, "flos": 21430446946560.0, "grad_norm": 2.2270872482442274, "language_loss": 0.78277838, "learning_rate": 2.0874062997156245e-06, "loss": 0.80430967, "num_input_tokens_seen": 179329005, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.77734375, "step": 8341, "time_per_iteration": 5.405110597610474 }, { "auxiliary_loss_clip": 0.01117837, "auxiliary_loss_mlp": 0.01032458, "balance_loss_clip": 1.01772332, "balance_loss_mlp": 1.04071259, "epoch": 0.5015481737562002, "flos": 15770745596160.0, "grad_norm": 2.3218224192486145, "language_loss": 0.89374626, "learning_rate": 2.0870172084862975e-06, "loss": 0.91524923, "num_input_tokens_seen": 179343785, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7734375, "step": 8342, "time_per_iteration": 3.9802486896514893 }, { "auxiliary_loss_clip": 0.01112404, "auxiliary_loss_mlp": 0.01033624, "balance_loss_clip": 1.01922345, "balance_loss_mlp": 1.03781617, "epoch": 0.5016082970088682, "flos": 26830801123200.0, "grad_norm": 1.6288049309018406, "language_loss": 0.76376331, "learning_rate": 2.0866281139572682e-06, "loss": 0.78522354, "num_input_tokens_seen": 179364070, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.74609375, "step": 8343, "time_per_iteration": 2.5175940990448 }, { "auxiliary_loss_clip": 0.01110908, "auxiliary_loss_mlp": 0.01022654, "balance_loss_clip": 1.01048899, "balance_loss_mlp": 1.03920257, "epoch": 0.5016684202615361, "flos": 21470595373440.0, "grad_norm": 1.7266698365342055, "language_loss": 0.66746193, "learning_rate": 2.086239016143293e-06, "loss": 0.68879759, "num_input_tokens_seen": 179384225, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.71875, "step": 8344, "time_per_iteration": 3.918037176132202 }, { "auxiliary_loss_clip": 0.01115158, "auxiliary_loss_mlp": 0.01033846, "balance_loss_clip": 1.02077496, "balance_loss_mlp": 1.04016209, "epoch": 0.5017285435142042, "flos": 26246821806720.0, "grad_norm": 2.1925936891833984, "language_loss": 0.75325179, "learning_rate": 2.0858499150591258e-06, "loss": 0.77474189, "num_input_tokens_seen": 179402595, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75, "step": 8345, "time_per_iteration": 2.510291576385498 }, { "auxiliary_loss_clip": 0.01114466, "auxiliary_loss_mlp": 0.01033437, "balance_loss_clip": 1.01838088, "balance_loss_mlp": 1.04036975, "epoch": 0.5017886667668721, "flos": 20777555387520.0, "grad_norm": 4.0586229021795175, "language_loss": 0.79082072, "learning_rate": 2.0854608107195203e-06, "loss": 0.81229973, "num_input_tokens_seen": 179419635, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7421875, "step": 8346, "time_per_iteration": 2.4628536701202393 }, { "auxiliary_loss_clip": 0.01112717, "auxiliary_loss_mlp": 0.01030758, "balance_loss_clip": 1.01745403, "balance_loss_mlp": 1.03812718, "epoch": 0.5018487900195401, "flos": 20156408472960.0, "grad_norm": 1.7918720878743428, "language_loss": 0.69294852, "learning_rate": 2.0850717031392333e-06, "loss": 0.71438324, "num_input_tokens_seen": 179438770, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 8347, "time_per_iteration": 2.517810106277466 }, { "auxiliary_loss_clip": 0.01115408, "auxiliary_loss_mlp": 0.01032019, "balance_loss_clip": 1.01884604, "balance_loss_mlp": 1.03938651, "epoch": 0.501908913272208, "flos": 18150689957760.0, "grad_norm": 2.189453814676424, "language_loss": 0.71232504, "learning_rate": 2.0846825923330174e-06, "loss": 0.73379928, "num_input_tokens_seen": 179457475, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.76171875, "step": 8348, "time_per_iteration": 2.4794628620147705 }, { "auxiliary_loss_clip": 0.01111777, "auxiliary_loss_mlp": 0.01030058, "balance_loss_clip": 1.01797056, "balance_loss_mlp": 1.03973293, "epoch": 0.501969036524876, "flos": 23112287504640.0, "grad_norm": 1.5370405051502334, "language_loss": 0.74312794, "learning_rate": 2.0842934783156303e-06, "loss": 0.76454628, "num_input_tokens_seen": 179478140, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.71875, "step": 8349, "time_per_iteration": 2.503401279449463 }, { "auxiliary_loss_clip": 0.0111355, "auxiliary_loss_mlp": 0.01031129, "balance_loss_clip": 1.01741409, "balance_loss_mlp": 1.03864288, "epoch": 0.5020291597775439, "flos": 11363214314880.0, "grad_norm": 2.164838412404475, "language_loss": 0.63695014, "learning_rate": 2.0839043611018266e-06, "loss": 0.65839696, "num_input_tokens_seen": 179494325, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 8350, "time_per_iteration": 2.426642417907715 }, { "auxiliary_loss_clip": 0.01038382, "auxiliary_loss_mlp": 0.01000852, "balance_loss_clip": 0.99966019, "balance_loss_mlp": 1.01367772, "epoch": 0.5020892830302119, "flos": 64011094928640.0, "grad_norm": 0.9084906498998587, "language_loss": 0.59781402, "learning_rate": 2.0835152407063597e-06, "loss": 0.61820632, "num_input_tokens_seen": 179553545, "router_z_loss_clip": 0.01190186, "router_z_loss_mlp": 0.24707031, "step": 8351, "time_per_iteration": 3.2243235111236572 }, { "auxiliary_loss_clip": 0.01115611, "auxiliary_loss_mlp": 0.01030216, "balance_loss_clip": 1.01681733, "balance_loss_mlp": 1.03948808, "epoch": 0.5021494062828799, "flos": 23732859801600.0, "grad_norm": 1.7370022344223528, "language_loss": 0.7495271, "learning_rate": 2.0831261171439873e-06, "loss": 0.77098536, "num_input_tokens_seen": 179573645, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.76171875, "step": 8352, "time_per_iteration": 2.491844892501831 }, { "auxiliary_loss_clip": 0.0111454, "auxiliary_loss_mlp": 0.01032613, "balance_loss_clip": 1.01898718, "balance_loss_mlp": 1.04075503, "epoch": 0.5022095295355479, "flos": 21576747041280.0, "grad_norm": 2.205321644113436, "language_loss": 0.71810454, "learning_rate": 2.082736990429464e-06, "loss": 0.73957604, "num_input_tokens_seen": 179591435, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 8353, "time_per_iteration": 2.4554641246795654 }, { "auxiliary_loss_clip": 0.01117973, "auxiliary_loss_mlp": 0.01037176, "balance_loss_clip": 1.02234674, "balance_loss_mlp": 1.04320049, "epoch": 0.5022696527882159, "flos": 21397229844480.0, "grad_norm": 1.671622306678286, "language_loss": 0.74348617, "learning_rate": 2.0823478605775455e-06, "loss": 0.76503766, "num_input_tokens_seen": 179609955, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.75, "step": 8354, "time_per_iteration": 2.4677011966705322 }, { "auxiliary_loss_clip": 0.01113045, "auxiliary_loss_mlp": 0.01036417, "balance_loss_clip": 1.02285695, "balance_loss_mlp": 1.04006052, "epoch": 0.5023297760408838, "flos": 27160712565120.0, "grad_norm": 1.60198698668387, "language_loss": 0.72367239, "learning_rate": 2.0819587276029884e-06, "loss": 0.74516702, "num_input_tokens_seen": 179630875, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 8355, "time_per_iteration": 2.5074150562286377 }, { "auxiliary_loss_clip": 0.011158, "auxiliary_loss_mlp": 0.01038731, "balance_loss_clip": 1.02478981, "balance_loss_mlp": 1.03939486, "epoch": 0.5023898992935518, "flos": 26213820186240.0, "grad_norm": 1.603155912806754, "language_loss": 0.81412244, "learning_rate": 2.081569591520548e-06, "loss": 0.83566773, "num_input_tokens_seen": 179649835, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.765625, "step": 8356, "time_per_iteration": 2.512208938598633 }, { "auxiliary_loss_clip": 0.01116716, "auxiliary_loss_mlp": 0.01037344, "balance_loss_clip": 1.02267563, "balance_loss_mlp": 1.03808427, "epoch": 0.5024500225462197, "flos": 13440323111040.0, "grad_norm": 5.802004946581655, "language_loss": 0.76718986, "learning_rate": 2.0811804523449803e-06, "loss": 0.78873038, "num_input_tokens_seen": 179667605, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7890625, "step": 8357, "time_per_iteration": 2.4448840618133545 }, { "auxiliary_loss_clip": 0.01115273, "auxiliary_loss_mlp": 0.01038312, "balance_loss_clip": 1.02354813, "balance_loss_mlp": 1.0396564, "epoch": 0.5025101457988878, "flos": 21579584215680.0, "grad_norm": 2.091935407214148, "language_loss": 0.76287675, "learning_rate": 2.0807913100910417e-06, "loss": 0.78441262, "num_input_tokens_seen": 179686910, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7578125, "step": 8358, "time_per_iteration": 2.465686082839966 }, { "auxiliary_loss_clip": 0.01113381, "auxiliary_loss_mlp": 0.01036904, "balance_loss_clip": 1.02273595, "balance_loss_mlp": 1.03874815, "epoch": 0.5025702690515557, "flos": 24645134448000.0, "grad_norm": 2.1674776267929934, "language_loss": 0.72547036, "learning_rate": 2.0804021647734887e-06, "loss": 0.74697316, "num_input_tokens_seen": 179706395, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.74609375, "step": 8359, "time_per_iteration": 2.5107266902923584 }, { "auxiliary_loss_clip": 0.01112435, "auxiliary_loss_mlp": 0.01043083, "balance_loss_clip": 1.02993429, "balance_loss_mlp": 1.03912807, "epoch": 0.5026303923042237, "flos": 22090162089600.0, "grad_norm": 1.54465041337609, "language_loss": 0.77353895, "learning_rate": 2.080013016407077e-06, "loss": 0.79509419, "num_input_tokens_seen": 179725735, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 8360, "time_per_iteration": 2.465599536895752 }, { "auxiliary_loss_clip": 0.01114071, "auxiliary_loss_mlp": 0.01036218, "balance_loss_clip": 1.02305114, "balance_loss_mlp": 1.04059243, "epoch": 0.5026905155568916, "flos": 23697200574720.0, "grad_norm": 1.5643778482253186, "language_loss": 0.76931357, "learning_rate": 2.0796238650065645e-06, "loss": 0.79081655, "num_input_tokens_seen": 179746150, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 8361, "time_per_iteration": 2.4963316917419434 }, { "auxiliary_loss_clip": 0.01114525, "auxiliary_loss_mlp": 0.0103503, "balance_loss_clip": 1.02082014, "balance_loss_mlp": 1.03828549, "epoch": 0.5027506388095596, "flos": 25812410722560.0, "grad_norm": 1.5761492401561832, "language_loss": 0.85073519, "learning_rate": 2.0792347105867065e-06, "loss": 0.87223077, "num_input_tokens_seen": 179767550, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.76171875, "step": 8362, "time_per_iteration": 2.5017073154449463 }, { "auxiliary_loss_clip": 0.01112716, "auxiliary_loss_mlp": 0.01031425, "balance_loss_clip": 1.01799607, "balance_loss_mlp": 1.03701496, "epoch": 0.5028107620622275, "flos": 27526606456320.0, "grad_norm": 1.7005063009530021, "language_loss": 0.78536868, "learning_rate": 2.0788455531622605e-06, "loss": 0.80681002, "num_input_tokens_seen": 179790075, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 8363, "time_per_iteration": 2.5164289474487305 }, { "auxiliary_loss_clip": 0.01110089, "auxiliary_loss_mlp": 0.01032282, "balance_loss_clip": 1.01906216, "balance_loss_mlp": 1.03897631, "epoch": 0.5028708853148955, "flos": 24534278098560.0, "grad_norm": 2.497013259467419, "language_loss": 0.75863582, "learning_rate": 2.0784563927479838e-06, "loss": 0.78005958, "num_input_tokens_seen": 179806515, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 8364, "time_per_iteration": 2.4781854152679443 }, { "auxiliary_loss_clip": 0.01110693, "auxiliary_loss_mlp": 0.01031734, "balance_loss_clip": 1.01921105, "balance_loss_mlp": 1.03878367, "epoch": 0.5029310085675635, "flos": 20813609664000.0, "grad_norm": 1.8308355039386035, "language_loss": 0.69694239, "learning_rate": 2.0780672293586317e-06, "loss": 0.71836662, "num_input_tokens_seen": 179826450, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 8365, "time_per_iteration": 2.462435245513916 }, { "auxiliary_loss_clip": 0.01116775, "auxiliary_loss_mlp": 0.01033895, "balance_loss_clip": 1.01950669, "balance_loss_mlp": 1.03941476, "epoch": 0.5029911318202315, "flos": 22342470197760.0, "grad_norm": 1.643806431688009, "language_loss": 0.73417771, "learning_rate": 2.0776780630089635e-06, "loss": 0.75568438, "num_input_tokens_seen": 179846770, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 8366, "time_per_iteration": 2.484793186187744 }, { "auxiliary_loss_clip": 0.01113123, "auxiliary_loss_mlp": 0.01030848, "balance_loss_clip": 1.01840293, "balance_loss_mlp": 1.04083312, "epoch": 0.5030512550728995, "flos": 24352713826560.0, "grad_norm": 1.4259949143452488, "language_loss": 0.78209591, "learning_rate": 2.077288893713735e-06, "loss": 0.80353564, "num_input_tokens_seen": 179866585, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.72265625, "step": 8367, "time_per_iteration": 2.5002565383911133 }, { "auxiliary_loss_clip": 0.01112251, "auxiliary_loss_mlp": 0.01028761, "balance_loss_clip": 1.01609516, "balance_loss_mlp": 1.03844428, "epoch": 0.5031113783255674, "flos": 18259930195200.0, "grad_norm": 1.76840157257751, "language_loss": 0.69901627, "learning_rate": 2.0768997214877035e-06, "loss": 0.72042644, "num_input_tokens_seen": 179885575, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73828125, "step": 8368, "time_per_iteration": 2.4689207077026367 }, { "auxiliary_loss_clip": 0.01039374, "auxiliary_loss_mlp": 0.0100386, "balance_loss_clip": 1.00272775, "balance_loss_mlp": 1.01452982, "epoch": 0.5031715015782354, "flos": 57253173200640.0, "grad_norm": 0.8756338995145976, "language_loss": 0.63310838, "learning_rate": 2.0765105463456274e-06, "loss": 0.65354073, "num_input_tokens_seen": 179939650, "router_z_loss_clip": 0.01135254, "router_z_loss_mlp": 0.24804688, "step": 8369, "time_per_iteration": 3.0569350719451904 }, { "auxiliary_loss_clip": 0.01111444, "auxiliary_loss_mlp": 0.01032552, "balance_loss_clip": 1.02011895, "balance_loss_mlp": 1.03918386, "epoch": 0.5032316248309033, "flos": 27527360641920.0, "grad_norm": 2.0184845172878823, "language_loss": 0.60259664, "learning_rate": 2.076121368302263e-06, "loss": 0.62403667, "num_input_tokens_seen": 179961765, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.72265625, "step": 8370, "time_per_iteration": 2.5119822025299072 }, { "auxiliary_loss_clip": 0.01114917, "auxiliary_loss_mlp": 0.01036505, "balance_loss_clip": 1.02200365, "balance_loss_mlp": 1.03918242, "epoch": 0.5032917480835714, "flos": 34495825939200.0, "grad_norm": 1.6017538053395066, "language_loss": 0.68112934, "learning_rate": 2.0757321873723695e-06, "loss": 0.70264351, "num_input_tokens_seen": 179983015, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 8371, "time_per_iteration": 2.5806312561035156 }, { "auxiliary_loss_clip": 0.01113117, "auxiliary_loss_mlp": 0.01027712, "balance_loss_clip": 1.01347816, "balance_loss_mlp": 1.03888679, "epoch": 0.5033518713362393, "flos": 33656773167360.0, "grad_norm": 1.7066416104911353, "language_loss": 0.67610866, "learning_rate": 2.0753430035707042e-06, "loss": 0.69751698, "num_input_tokens_seen": 180003210, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7421875, "step": 8372, "time_per_iteration": 2.563067674636841 }, { "auxiliary_loss_clip": 0.01112513, "auxiliary_loss_mlp": 0.01034291, "balance_loss_clip": 1.02002764, "balance_loss_mlp": 1.03845799, "epoch": 0.5034119945889073, "flos": 28185495586560.0, "grad_norm": 1.588950902769608, "language_loss": 0.6692127, "learning_rate": 2.0749538169120235e-06, "loss": 0.69068074, "num_input_tokens_seen": 180025530, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7421875, "step": 8373, "time_per_iteration": 2.5176355838775635 }, { "auxiliary_loss_clip": 0.01109167, "auxiliary_loss_mlp": 0.0103156, "balance_loss_clip": 1.01829815, "balance_loss_mlp": 1.03704369, "epoch": 0.5034721178415752, "flos": 21358697529600.0, "grad_norm": 1.6313531034480053, "language_loss": 0.74593115, "learning_rate": 2.0745646274110872e-06, "loss": 0.7673384, "num_input_tokens_seen": 180043180, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 8374, "time_per_iteration": 2.489128589630127 }, { "auxiliary_loss_clip": 0.01113604, "auxiliary_loss_mlp": 0.01037329, "balance_loss_clip": 1.02303016, "balance_loss_mlp": 1.03902531, "epoch": 0.5035322410942432, "flos": 22674823764480.0, "grad_norm": 1.9021721525005548, "language_loss": 0.68257123, "learning_rate": 2.0741754350826525e-06, "loss": 0.70408058, "num_input_tokens_seen": 180062905, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.74609375, "step": 8375, "time_per_iteration": 2.473893880844116 }, { "auxiliary_loss_clip": 0.01118786, "auxiliary_loss_mlp": 0.01030061, "balance_loss_clip": 1.01538682, "balance_loss_mlp": 1.04092455, "epoch": 0.5035923643469111, "flos": 19828723674240.0, "grad_norm": 5.29257610333685, "language_loss": 0.78882694, "learning_rate": 2.0737862399414777e-06, "loss": 0.81031543, "num_input_tokens_seen": 180082000, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 8376, "time_per_iteration": 2.4727282524108887 }, { "auxiliary_loss_clip": 0.01115025, "auxiliary_loss_mlp": 0.01032214, "balance_loss_clip": 1.01870179, "balance_loss_mlp": 1.03800726, "epoch": 0.5036524875995791, "flos": 30514625182080.0, "grad_norm": 1.9161676593074903, "language_loss": 0.59723049, "learning_rate": 2.0733970420023213e-06, "loss": 0.61870289, "num_input_tokens_seen": 180101340, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76953125, "step": 8377, "time_per_iteration": 2.534480333328247 }, { "auxiliary_loss_clip": 0.01112038, "auxiliary_loss_mlp": 0.01032442, "balance_loss_clip": 1.01876879, "balance_loss_mlp": 1.03825366, "epoch": 0.5037126108522471, "flos": 14720574637440.0, "grad_norm": 2.29272202803208, "language_loss": 0.76149267, "learning_rate": 2.0730078412799425e-06, "loss": 0.78293741, "num_input_tokens_seen": 180119160, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73828125, "step": 8378, "time_per_iteration": 2.4427735805511475 }, { "auxiliary_loss_clip": 0.01112615, "auxiliary_loss_mlp": 0.01031512, "balance_loss_clip": 1.01843452, "balance_loss_mlp": 1.03907061, "epoch": 0.5037727341049151, "flos": 25297702784640.0, "grad_norm": 1.7797175082778274, "language_loss": 0.7454744, "learning_rate": 2.0726186377890985e-06, "loss": 0.76691568, "num_input_tokens_seen": 180138730, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 8379, "time_per_iteration": 2.4999351501464844 }, { "auxiliary_loss_clip": 0.01111587, "auxiliary_loss_mlp": 0.01028613, "balance_loss_clip": 1.01562548, "balance_loss_mlp": 1.04036796, "epoch": 0.5038328573575831, "flos": 28541764632960.0, "grad_norm": 1.9660501534125274, "language_loss": 0.6689781, "learning_rate": 2.072229431544548e-06, "loss": 0.6903801, "num_input_tokens_seen": 180158810, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 8380, "time_per_iteration": 2.531587839126587 }, { "auxiliary_loss_clip": 0.01110288, "auxiliary_loss_mlp": 0.01030212, "balance_loss_clip": 1.01747429, "balance_loss_mlp": 1.0386287, "epoch": 0.503892980610251, "flos": 31649869503360.0, "grad_norm": 1.9494608302760477, "language_loss": 0.63532877, "learning_rate": 2.071840222561051e-06, "loss": 0.65673375, "num_input_tokens_seen": 180179700, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 8381, "time_per_iteration": 2.54207444190979 }, { "auxiliary_loss_clip": 0.01109458, "auxiliary_loss_mlp": 0.01034977, "balance_loss_clip": 1.02237642, "balance_loss_mlp": 1.03790045, "epoch": 0.503953103862919, "flos": 27089358197760.0, "grad_norm": 1.4410111534703176, "language_loss": 0.67491752, "learning_rate": 2.071451010853365e-06, "loss": 0.6963619, "num_input_tokens_seen": 180199890, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 8382, "time_per_iteration": 3.9528520107269287 }, { "auxiliary_loss_clip": 0.01119095, "auxiliary_loss_mlp": 0.0103436, "balance_loss_clip": 1.02057958, "balance_loss_mlp": 1.04092908, "epoch": 0.5040132271155869, "flos": 15632957024640.0, "grad_norm": 2.0357458920644538, "language_loss": 0.62604642, "learning_rate": 2.0710617964362506e-06, "loss": 0.64758092, "num_input_tokens_seen": 180217840, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.78125, "step": 8383, "time_per_iteration": 3.805009126663208 }, { "auxiliary_loss_clip": 0.01108568, "auxiliary_loss_mlp": 0.01027454, "balance_loss_clip": 1.01522875, "balance_loss_mlp": 1.03772664, "epoch": 0.504073350368255, "flos": 13590106824960.0, "grad_norm": 2.6503211829672866, "language_loss": 0.66860437, "learning_rate": 2.070672579324465e-06, "loss": 0.68996465, "num_input_tokens_seen": 180236465, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 8384, "time_per_iteration": 2.432103395462036 }, { "auxiliary_loss_clip": 0.01111221, "auxiliary_loss_mlp": 0.01032852, "balance_loss_clip": 1.0206089, "balance_loss_mlp": 1.03810525, "epoch": 0.5041334736209229, "flos": 29058160510080.0, "grad_norm": 1.6157202115779967, "language_loss": 0.71154761, "learning_rate": 2.0702833595327674e-06, "loss": 0.73298836, "num_input_tokens_seen": 180258025, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.73046875, "step": 8385, "time_per_iteration": 4.069955110549927 }, { "auxiliary_loss_clip": 0.01108208, "auxiliary_loss_mlp": 0.01027144, "balance_loss_clip": 1.0149014, "balance_loss_mlp": 1.03797793, "epoch": 0.5041935968735909, "flos": 24608361899520.0, "grad_norm": 1.9518667489288892, "language_loss": 0.83199245, "learning_rate": 2.069894137075919e-06, "loss": 0.85334593, "num_input_tokens_seen": 180277825, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 8386, "time_per_iteration": 3.850539445877075 }, { "auxiliary_loss_clip": 0.01111686, "auxiliary_loss_mlp": 0.01031075, "balance_loss_clip": 1.0174135, "balance_loss_mlp": 1.03855205, "epoch": 0.5042537201262588, "flos": 26286934320000.0, "grad_norm": 2.50705260785185, "language_loss": 0.66666538, "learning_rate": 2.0695049119686766e-06, "loss": 0.68809295, "num_input_tokens_seen": 180300465, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 8387, "time_per_iteration": 2.4982149600982666 }, { "auxiliary_loss_clip": 0.0111264, "auxiliary_loss_mlp": 0.01028773, "balance_loss_clip": 1.01632154, "balance_loss_mlp": 1.04102695, "epoch": 0.5043138433789268, "flos": 22017371178240.0, "grad_norm": 1.454093905411696, "language_loss": 0.80227971, "learning_rate": 2.0691156842258016e-06, "loss": 0.82369381, "num_input_tokens_seen": 180321050, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.71875, "step": 8388, "time_per_iteration": 2.4811196327209473 }, { "auxiliary_loss_clip": 0.01110123, "auxiliary_loss_mlp": 0.01029309, "balance_loss_clip": 1.01695871, "balance_loss_mlp": 1.03796744, "epoch": 0.5043739666315947, "flos": 28767104605440.0, "grad_norm": 2.4545547975051973, "language_loss": 0.70301163, "learning_rate": 2.0687264538620537e-06, "loss": 0.72440588, "num_input_tokens_seen": 180338870, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71875, "step": 8389, "time_per_iteration": 2.5033912658691406 }, { "auxiliary_loss_clip": 0.01111759, "auxiliary_loss_mlp": 0.0103168, "balance_loss_clip": 1.0192461, "balance_loss_mlp": 1.0381918, "epoch": 0.5044340898842627, "flos": 27599253713280.0, "grad_norm": 1.728635399428524, "language_loss": 0.69943178, "learning_rate": 2.068337220892191e-06, "loss": 0.7208662, "num_input_tokens_seen": 180361285, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.734375, "step": 8390, "time_per_iteration": 2.5303730964660645 }, { "auxiliary_loss_clip": 0.01043028, "auxiliary_loss_mlp": 0.00999651, "balance_loss_clip": 0.99844712, "balance_loss_mlp": 1.0179379, "epoch": 0.5044942131369307, "flos": 67458050749440.0, "grad_norm": 0.880081342326619, "language_loss": 0.52974772, "learning_rate": 2.067947985330974e-06, "loss": 0.55017447, "num_input_tokens_seen": 180415170, "router_z_loss_clip": 0.01202393, "router_z_loss_mlp": 0.25, "step": 8391, "time_per_iteration": 2.9048616886138916 }, { "auxiliary_loss_clip": 0.01042869, "auxiliary_loss_mlp": 0.00999641, "balance_loss_clip": 0.99835342, "balance_loss_mlp": 1.0179323, "epoch": 0.5045543363895987, "flos": 58630849390080.0, "grad_norm": 0.8903192127405299, "language_loss": 0.60699552, "learning_rate": 2.0675587471931628e-06, "loss": 0.6274206, "num_input_tokens_seen": 180468060, "router_z_loss_clip": 0.01287842, "router_z_loss_mlp": 0.24902344, "step": 8392, "time_per_iteration": 2.9329240322113037 }, { "auxiliary_loss_clip": 0.01106467, "auxiliary_loss_mlp": 0.01028475, "balance_loss_clip": 1.0161314, "balance_loss_mlp": 1.03710306, "epoch": 0.5046144596422667, "flos": 22526620248960.0, "grad_norm": 1.74004296797788, "language_loss": 0.84524345, "learning_rate": 2.067169506493517e-06, "loss": 0.86659288, "num_input_tokens_seen": 180486610, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 8393, "time_per_iteration": 2.4611144065856934 }, { "auxiliary_loss_clip": 0.0111105, "auxiliary_loss_mlp": 0.01027615, "balance_loss_clip": 1.01496112, "balance_loss_mlp": 1.03899014, "epoch": 0.5046745828949346, "flos": 27454246508160.0, "grad_norm": 2.0632744761678734, "language_loss": 0.51010597, "learning_rate": 2.0667802632467974e-06, "loss": 0.53149265, "num_input_tokens_seen": 180508135, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 8394, "time_per_iteration": 2.52661395072937 }, { "auxiliary_loss_clip": 0.01109748, "auxiliary_loss_mlp": 0.010292, "balance_loss_clip": 1.0151993, "balance_loss_mlp": 1.03705668, "epoch": 0.5047347061476026, "flos": 17274541415040.0, "grad_norm": 1.532037718958513, "language_loss": 0.75055861, "learning_rate": 2.0663910174677627e-06, "loss": 0.7719481, "num_input_tokens_seen": 180527000, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7265625, "step": 8395, "time_per_iteration": 2.467611312866211 }, { "auxiliary_loss_clip": 0.01108493, "auxiliary_loss_mlp": 0.01031167, "balance_loss_clip": 1.01817358, "balance_loss_mlp": 1.03711832, "epoch": 0.5047948294002705, "flos": 16649515831680.0, "grad_norm": 1.9057037169968152, "language_loss": 0.68138874, "learning_rate": 2.0660017691711737e-06, "loss": 0.70278537, "num_input_tokens_seen": 180544715, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 8396, "time_per_iteration": 2.6034700870513916 }, { "auxiliary_loss_clip": 0.01113178, "auxiliary_loss_mlp": 0.01028203, "balance_loss_clip": 1.01581764, "balance_loss_mlp": 1.04080665, "epoch": 0.5048549526529386, "flos": 26865706164480.0, "grad_norm": 1.679793573641351, "language_loss": 0.78781044, "learning_rate": 2.065612518371792e-06, "loss": 0.80922425, "num_input_tokens_seen": 180565365, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7265625, "step": 8397, "time_per_iteration": 2.5333709716796875 }, { "auxiliary_loss_clip": 0.01107762, "auxiliary_loss_mlp": 0.0102717, "balance_loss_clip": 1.01520705, "balance_loss_mlp": 1.03630745, "epoch": 0.5049150759056065, "flos": 21833939399040.0, "grad_norm": 1.9593519673387718, "language_loss": 0.664042, "learning_rate": 2.065223265084376e-06, "loss": 0.68539131, "num_input_tokens_seen": 180586670, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.71484375, "step": 8398, "time_per_iteration": 2.5002169609069824 }, { "auxiliary_loss_clip": 0.01111477, "auxiliary_loss_mlp": 0.01027976, "balance_loss_clip": 1.01537561, "balance_loss_mlp": 1.0388118, "epoch": 0.5049751991582745, "flos": 21685807710720.0, "grad_norm": 1.6126558796863468, "language_loss": 0.71577662, "learning_rate": 2.064834009323688e-06, "loss": 0.73717117, "num_input_tokens_seen": 180605085, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 8399, "time_per_iteration": 2.5251853466033936 }, { "auxiliary_loss_clip": 0.0111391, "auxiliary_loss_mlp": 0.01042944, "balance_loss_clip": 1.02967572, "balance_loss_mlp": 1.03967786, "epoch": 0.5050353224109424, "flos": 21359379888000.0, "grad_norm": 1.7926479624788592, "language_loss": 0.81672907, "learning_rate": 2.0644447511044878e-06, "loss": 0.83829755, "num_input_tokens_seen": 180624370, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 8400, "time_per_iteration": 2.4616682529449463 }, { "auxiliary_loss_clip": 0.01111138, "auxiliary_loss_mlp": 0.01031329, "balance_loss_clip": 1.01849604, "balance_loss_mlp": 1.03922772, "epoch": 0.5050954456636104, "flos": 22820082364800.0, "grad_norm": 1.951059792846352, "language_loss": 0.78491354, "learning_rate": 2.0640554904415362e-06, "loss": 0.80633819, "num_input_tokens_seen": 180642450, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 8401, "time_per_iteration": 2.4490816593170166 }, { "auxiliary_loss_clip": 0.01111649, "auxiliary_loss_mlp": 0.01030582, "balance_loss_clip": 1.01793373, "balance_loss_mlp": 1.03732586, "epoch": 0.5051555689162783, "flos": 30448226891520.0, "grad_norm": 1.51514318511359, "language_loss": 0.70098478, "learning_rate": 2.063666227349593e-06, "loss": 0.7224071, "num_input_tokens_seen": 180665250, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7421875, "step": 8402, "time_per_iteration": 2.5514729022979736 }, { "auxiliary_loss_clip": 0.01108961, "auxiliary_loss_mlp": 0.01028256, "balance_loss_clip": 1.01574469, "balance_loss_mlp": 1.03604424, "epoch": 0.5052156921689464, "flos": 21287953693440.0, "grad_norm": 2.8571501233927923, "language_loss": 0.69525313, "learning_rate": 2.063276961843422e-06, "loss": 0.71662521, "num_input_tokens_seen": 180687425, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 8403, "time_per_iteration": 2.494257688522339 }, { "auxiliary_loss_clip": 0.01107704, "auxiliary_loss_mlp": 0.01041167, "balance_loss_clip": 1.02894795, "balance_loss_mlp": 1.03835535, "epoch": 0.5052758154216143, "flos": 25081305298560.0, "grad_norm": 1.3900312963060186, "language_loss": 0.85940546, "learning_rate": 2.062887693937781e-06, "loss": 0.88089418, "num_input_tokens_seen": 180708725, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 8404, "time_per_iteration": 2.5079638957977295 }, { "auxiliary_loss_clip": 0.01109287, "auxiliary_loss_mlp": 0.01035686, "balance_loss_clip": 1.02308011, "balance_loss_mlp": 1.0371896, "epoch": 0.5053359386742823, "flos": 20885502735360.0, "grad_norm": 2.6127160699733354, "language_loss": 0.75891864, "learning_rate": 2.0624984236474322e-06, "loss": 0.78036833, "num_input_tokens_seen": 180727990, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 8405, "time_per_iteration": 2.463789224624634 }, { "auxiliary_loss_clip": 0.01111867, "auxiliary_loss_mlp": 0.01027839, "balance_loss_clip": 1.01427317, "balance_loss_mlp": 1.03781176, "epoch": 0.5053960619269503, "flos": 37743335493120.0, "grad_norm": 1.76329772209441, "language_loss": 0.72810799, "learning_rate": 2.0621091509871378e-06, "loss": 0.74950504, "num_input_tokens_seen": 180749765, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 8406, "time_per_iteration": 2.6099300384521484 }, { "auxiliary_loss_clip": 0.01106558, "auxiliary_loss_mlp": 0.01035434, "balance_loss_clip": 1.0227325, "balance_loss_mlp": 1.03701639, "epoch": 0.5054561851796182, "flos": 23513840622720.0, "grad_norm": 2.3142034412343193, "language_loss": 0.76875401, "learning_rate": 2.0617198759716568e-06, "loss": 0.79017395, "num_input_tokens_seen": 180769580, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 8407, "time_per_iteration": 2.4905929565429688 }, { "auxiliary_loss_clip": 0.01110733, "auxiliary_loss_mlp": 0.01033511, "balance_loss_clip": 1.02135158, "balance_loss_mlp": 1.0367136, "epoch": 0.5055163084322862, "flos": 30410233280640.0, "grad_norm": 1.932649797539127, "language_loss": 0.63277155, "learning_rate": 2.0613305986157535e-06, "loss": 0.65421402, "num_input_tokens_seen": 180790295, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.73828125, "step": 8408, "time_per_iteration": 2.5501067638397217 }, { "auxiliary_loss_clip": 0.01109604, "auxiliary_loss_mlp": 0.01030023, "balance_loss_clip": 1.01699305, "balance_loss_mlp": 1.03789568, "epoch": 0.5055764316849541, "flos": 20259651139200.0, "grad_norm": 3.7616542633206818, "language_loss": 0.63726842, "learning_rate": 2.0609413189341865e-06, "loss": 0.6586647, "num_input_tokens_seen": 180807875, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 8409, "time_per_iteration": 2.4233083724975586 }, { "auxiliary_loss_clip": 0.01108886, "auxiliary_loss_mlp": 0.01023342, "balance_loss_clip": 1.01237488, "balance_loss_mlp": 1.03867185, "epoch": 0.5056365549376222, "flos": 26070895969920.0, "grad_norm": 1.3887726860257508, "language_loss": 0.70935351, "learning_rate": 2.0605520369417193e-06, "loss": 0.73067582, "num_input_tokens_seen": 180831300, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.703125, "step": 8410, "time_per_iteration": 2.521221160888672 }, { "auxiliary_loss_clip": 0.01111871, "auxiliary_loss_mlp": 0.0104107, "balance_loss_clip": 1.02750409, "balance_loss_mlp": 1.03872848, "epoch": 0.5056966781902901, "flos": 19279074781440.0, "grad_norm": 1.4686717881753206, "language_loss": 0.79293406, "learning_rate": 2.060162752653113e-06, "loss": 0.81446344, "num_input_tokens_seen": 180849055, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 8411, "time_per_iteration": 2.443452835083008 }, { "auxiliary_loss_clip": 0.01112069, "auxiliary_loss_mlp": 0.01034674, "balance_loss_clip": 1.02055943, "balance_loss_mlp": 1.03832984, "epoch": 0.5057568014429581, "flos": 21323325611520.0, "grad_norm": 1.8459021392202242, "language_loss": 0.81802285, "learning_rate": 2.0597734660831285e-06, "loss": 0.83949029, "num_input_tokens_seen": 180867395, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.73828125, "step": 8412, "time_per_iteration": 2.4527289867401123 }, { "auxiliary_loss_clip": 0.01113841, "auxiliary_loss_mlp": 0.01036604, "balance_loss_clip": 1.0235858, "balance_loss_mlp": 1.04183638, "epoch": 0.505816924695626, "flos": 17493596507520.0, "grad_norm": 1.8770921499395605, "language_loss": 0.80951667, "learning_rate": 2.0593841772465283e-06, "loss": 0.83102107, "num_input_tokens_seen": 180886670, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 8413, "time_per_iteration": 2.445239305496216 }, { "auxiliary_loss_clip": 0.0111295, "auxiliary_loss_mlp": 0.01035001, "balance_loss_clip": 1.02122068, "balance_loss_mlp": 1.03891683, "epoch": 0.505877047948294, "flos": 21142084561920.0, "grad_norm": 1.964716804562292, "language_loss": 0.80477142, "learning_rate": 2.0589948861580737e-06, "loss": 0.82625097, "num_input_tokens_seen": 180904645, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 8414, "time_per_iteration": 2.469911813735962 }, { "auxiliary_loss_clip": 0.01110387, "auxiliary_loss_mlp": 0.010343, "balance_loss_clip": 1.0207994, "balance_loss_mlp": 1.03654969, "epoch": 0.5059371712009619, "flos": 36350036887680.0, "grad_norm": 2.17936995405965, "language_loss": 0.62474161, "learning_rate": 2.058605592832528e-06, "loss": 0.6461885, "num_input_tokens_seen": 180922340, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73828125, "step": 8415, "time_per_iteration": 2.55080246925354 }, { "auxiliary_loss_clip": 0.0110971, "auxiliary_loss_mlp": 0.01030963, "balance_loss_clip": 1.0178622, "balance_loss_mlp": 1.03687811, "epoch": 0.50599729445363, "flos": 22673387220480.0, "grad_norm": 1.5506326483948085, "language_loss": 0.81995344, "learning_rate": 2.0582162972846515e-06, "loss": 0.84136021, "num_input_tokens_seen": 180941350, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 8416, "time_per_iteration": 2.4604897499084473 }, { "auxiliary_loss_clip": 0.01110496, "auxiliary_loss_mlp": 0.01036275, "balance_loss_clip": 1.02399647, "balance_loss_mlp": 1.03941607, "epoch": 0.5060574177062979, "flos": 22747866071040.0, "grad_norm": 1.9132049221813325, "language_loss": 0.79230285, "learning_rate": 2.0578269995292078e-06, "loss": 0.81377053, "num_input_tokens_seen": 180960720, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 8417, "time_per_iteration": 2.450028419494629 }, { "auxiliary_loss_clip": 0.01107116, "auxiliary_loss_mlp": 0.01031891, "balance_loss_clip": 1.01941609, "balance_loss_mlp": 1.037305, "epoch": 0.5061175409589659, "flos": 21653201139840.0, "grad_norm": 1.7565101319162508, "language_loss": 0.62651539, "learning_rate": 2.0574376995809588e-06, "loss": 0.64790547, "num_input_tokens_seen": 180979725, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69921875, "step": 8418, "time_per_iteration": 2.507774591445923 }, { "auxiliary_loss_clip": 0.01111808, "auxiliary_loss_mlp": 0.01030498, "balance_loss_clip": 1.01790929, "balance_loss_mlp": 1.03822184, "epoch": 0.5061776642116339, "flos": 21616249023360.0, "grad_norm": 2.1084598722720984, "language_loss": 0.77692771, "learning_rate": 2.0570483974546653e-06, "loss": 0.79835069, "num_input_tokens_seen": 180998980, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.734375, "step": 8419, "time_per_iteration": 2.4694955348968506 }, { "auxiliary_loss_clip": 0.01111498, "auxiliary_loss_mlp": 0.01034586, "balance_loss_clip": 1.02020955, "balance_loss_mlp": 1.03695929, "epoch": 0.5062377874643018, "flos": 24426294837120.0, "grad_norm": 1.8304611527212782, "language_loss": 0.77161348, "learning_rate": 2.0566590931650917e-06, "loss": 0.79307437, "num_input_tokens_seen": 181019165, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.74609375, "step": 8420, "time_per_iteration": 2.517573833465576 }, { "auxiliary_loss_clip": 0.01110982, "auxiliary_loss_mlp": 0.01033044, "balance_loss_clip": 1.01900756, "balance_loss_mlp": 1.03690529, "epoch": 0.5062979107169698, "flos": 22524429519360.0, "grad_norm": 2.094270214317825, "language_loss": 0.77380186, "learning_rate": 2.056269786726999e-06, "loss": 0.79524213, "num_input_tokens_seen": 181037110, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7421875, "step": 8421, "time_per_iteration": 2.4645168781280518 }, { "auxiliary_loss_clip": 0.01108438, "auxiliary_loss_mlp": 0.01028019, "balance_loss_clip": 1.01546049, "balance_loss_mlp": 1.03607476, "epoch": 0.5063580339696377, "flos": 24571984400640.0, "grad_norm": 1.5075950330770922, "language_loss": 0.66768223, "learning_rate": 2.0558804781551512e-06, "loss": 0.6890468, "num_input_tokens_seen": 181057775, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 8422, "time_per_iteration": 2.5040478706359863 }, { "auxiliary_loss_clip": 0.01110409, "auxiliary_loss_mlp": 0.010329, "balance_loss_clip": 1.01994801, "balance_loss_mlp": 1.03927517, "epoch": 0.5064181572223058, "flos": 22596143022720.0, "grad_norm": 2.072965552416029, "language_loss": 0.81839317, "learning_rate": 2.05549116746431e-06, "loss": 0.83982623, "num_input_tokens_seen": 181078260, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 8423, "time_per_iteration": 2.5178768634796143 }, { "auxiliary_loss_clip": 0.01112538, "auxiliary_loss_mlp": 0.01032862, "balance_loss_clip": 1.01865256, "balance_loss_mlp": 1.03757119, "epoch": 0.5064782804749737, "flos": 25994944661760.0, "grad_norm": 2.088166435746944, "language_loss": 0.75001544, "learning_rate": 2.055101854669237e-06, "loss": 0.77146947, "num_input_tokens_seen": 181098755, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75, "step": 8424, "time_per_iteration": 5.408450365066528 }, { "auxiliary_loss_clip": 0.0110702, "auxiliary_loss_mlp": 0.01033821, "balance_loss_clip": 1.02008808, "balance_loss_mlp": 1.03616691, "epoch": 0.5065384037276417, "flos": 28553041503360.0, "grad_norm": 1.6070635012487344, "language_loss": 0.71377426, "learning_rate": 2.0547125397846975e-06, "loss": 0.73518264, "num_input_tokens_seen": 181121570, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.70703125, "step": 8425, "time_per_iteration": 2.562913179397583 }, { "auxiliary_loss_clip": 0.01109844, "auxiliary_loss_mlp": 0.01035134, "balance_loss_clip": 1.02238417, "balance_loss_mlp": 1.03639817, "epoch": 0.5065985269803096, "flos": 22966023323520.0, "grad_norm": 2.6216581501808363, "language_loss": 0.79004031, "learning_rate": 2.0543232228254524e-06, "loss": 0.81149012, "num_input_tokens_seen": 181140240, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.734375, "step": 8426, "time_per_iteration": 3.9894330501556396 }, { "auxiliary_loss_clip": 0.01113838, "auxiliary_loss_mlp": 0.01037286, "balance_loss_clip": 1.02343941, "balance_loss_mlp": 1.03929806, "epoch": 0.5066586502329776, "flos": 21608563512960.0, "grad_norm": 2.1617012425693196, "language_loss": 0.77628058, "learning_rate": 2.053933903806265e-06, "loss": 0.79779184, "num_input_tokens_seen": 181158630, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.74609375, "step": 8427, "time_per_iteration": 3.8287479877471924 }, { "auxiliary_loss_clip": 0.01107416, "auxiliary_loss_mlp": 0.01028962, "balance_loss_clip": 1.01536024, "balance_loss_mlp": 1.03639376, "epoch": 0.5067187734856455, "flos": 20339912079360.0, "grad_norm": 1.7271022582311424, "language_loss": 0.71701968, "learning_rate": 2.0535445827418997e-06, "loss": 0.73838341, "num_input_tokens_seen": 181176405, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 8428, "time_per_iteration": 2.4726481437683105 }, { "auxiliary_loss_clip": 0.01107464, "auxiliary_loss_mlp": 0.01031259, "balance_loss_clip": 1.01890945, "balance_loss_mlp": 1.03553295, "epoch": 0.5067788967383136, "flos": 28841080665600.0, "grad_norm": 1.6223264796694072, "language_loss": 0.82967377, "learning_rate": 2.0531552596471168e-06, "loss": 0.85106099, "num_input_tokens_seen": 181197595, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71875, "step": 8429, "time_per_iteration": 2.504251480102539 }, { "auxiliary_loss_clip": 0.01115732, "auxiliary_loss_mlp": 0.01037044, "balance_loss_clip": 1.02221417, "balance_loss_mlp": 1.03920889, "epoch": 0.5068390199909815, "flos": 32450174478720.0, "grad_norm": 2.272382226637019, "language_loss": 0.73451394, "learning_rate": 2.052765934536682e-06, "loss": 0.75604165, "num_input_tokens_seen": 181218560, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.765625, "step": 8430, "time_per_iteration": 2.547329902648926 }, { "auxiliary_loss_clip": 0.01111478, "auxiliary_loss_mlp": 0.01031217, "balance_loss_clip": 1.01786518, "balance_loss_mlp": 1.03756213, "epoch": 0.5068991432436495, "flos": 23146582014720.0, "grad_norm": 1.626508050160054, "language_loss": 0.76840335, "learning_rate": 2.0523766074253575e-06, "loss": 0.78983021, "num_input_tokens_seen": 181237095, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 8431, "time_per_iteration": 2.4566595554351807 }, { "auxiliary_loss_clip": 0.01109547, "auxiliary_loss_mlp": 0.01031171, "balance_loss_clip": 1.01772404, "balance_loss_mlp": 1.03772449, "epoch": 0.5069592664963174, "flos": 19936096404480.0, "grad_norm": 1.531086739020954, "language_loss": 0.72239637, "learning_rate": 2.0519872783279074e-06, "loss": 0.74380362, "num_input_tokens_seen": 181255940, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 8432, "time_per_iteration": 2.461897850036621 }, { "auxiliary_loss_clip": 0.01040175, "auxiliary_loss_mlp": 0.01008426, "balance_loss_clip": 1.00723362, "balance_loss_mlp": 1.01479483, "epoch": 0.5070193897489854, "flos": 65793771941760.0, "grad_norm": 0.7561938016270429, "language_loss": 0.6370877, "learning_rate": 2.0515979472590945e-06, "loss": 0.6575737, "num_input_tokens_seen": 181316945, "router_z_loss_clip": 0.01190186, "router_z_loss_mlp": 0.25390625, "step": 8433, "time_per_iteration": 3.119961738586426 }, { "auxiliary_loss_clip": 0.01110538, "auxiliary_loss_mlp": 0.01039675, "balance_loss_clip": 1.02566218, "balance_loss_mlp": 1.03740585, "epoch": 0.5070795130016534, "flos": 17275331514240.0, "grad_norm": 1.8202412199564562, "language_loss": 0.77607757, "learning_rate": 2.051208614233681e-06, "loss": 0.79757965, "num_input_tokens_seen": 181335555, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.73046875, "step": 8434, "time_per_iteration": 2.443110466003418 }, { "auxiliary_loss_clip": 0.01112924, "auxiliary_loss_mlp": 0.01028098, "balance_loss_clip": 1.0145793, "balance_loss_mlp": 1.03845942, "epoch": 0.5071396362543213, "flos": 21069940095360.0, "grad_norm": 1.7421262323978361, "language_loss": 0.70980895, "learning_rate": 2.0508192792664326e-06, "loss": 0.73121917, "num_input_tokens_seen": 181354580, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 8435, "time_per_iteration": 2.469633102416992 }, { "auxiliary_loss_clip": 0.01112915, "auxiliary_loss_mlp": 0.01036451, "balance_loss_clip": 1.02214551, "balance_loss_mlp": 1.038661, "epoch": 0.5071997595069894, "flos": 23144822248320.0, "grad_norm": 1.9465997280343614, "language_loss": 0.72387815, "learning_rate": 2.050429942372112e-06, "loss": 0.74537182, "num_input_tokens_seen": 181374320, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7421875, "step": 8436, "time_per_iteration": 2.469853162765503 }, { "auxiliary_loss_clip": 0.01112726, "auxiliary_loss_mlp": 0.01030821, "balance_loss_clip": 1.01660573, "balance_loss_mlp": 1.03931046, "epoch": 0.5072598827596573, "flos": 22747183712640.0, "grad_norm": 1.5779101908289528, "language_loss": 0.83958566, "learning_rate": 2.050040603565483e-06, "loss": 0.8610211, "num_input_tokens_seen": 181392190, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.734375, "step": 8437, "time_per_iteration": 2.505343437194824 }, { "auxiliary_loss_clip": 0.01106761, "auxiliary_loss_mlp": 0.01027193, "balance_loss_clip": 1.01441991, "balance_loss_mlp": 1.03553879, "epoch": 0.5073200060123253, "flos": 22566301799040.0, "grad_norm": 1.515867463873253, "language_loss": 0.80540502, "learning_rate": 2.049651262861309e-06, "loss": 0.82674456, "num_input_tokens_seen": 181413890, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 8438, "time_per_iteration": 2.5166099071502686 }, { "auxiliary_loss_clip": 0.01112219, "auxiliary_loss_mlp": 0.01036026, "balance_loss_clip": 1.02149391, "balance_loss_mlp": 1.03805971, "epoch": 0.5073801292649932, "flos": 25806341324160.0, "grad_norm": 1.6391373183238316, "language_loss": 0.79711831, "learning_rate": 2.0492619202743543e-06, "loss": 0.81860077, "num_input_tokens_seen": 181433240, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7421875, "step": 8439, "time_per_iteration": 2.537835121154785 }, { "auxiliary_loss_clip": 0.01107569, "auxiliary_loss_mlp": 0.01030185, "balance_loss_clip": 1.01735842, "balance_loss_mlp": 1.03624701, "epoch": 0.5074402525176612, "flos": 25373941401600.0, "grad_norm": 1.592432165032107, "language_loss": 0.70854056, "learning_rate": 2.048872575819383e-06, "loss": 0.72991812, "num_input_tokens_seen": 181453535, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 8440, "time_per_iteration": 2.5027639865875244 }, { "auxiliary_loss_clip": 0.01112018, "auxiliary_loss_mlp": 0.01030647, "balance_loss_clip": 1.01781392, "balance_loss_mlp": 1.03755236, "epoch": 0.5075003757703291, "flos": 26064431521920.0, "grad_norm": 1.6366437576281043, "language_loss": 0.70861655, "learning_rate": 2.048483229511158e-06, "loss": 0.73004317, "num_input_tokens_seen": 181474195, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 8441, "time_per_iteration": 2.5249361991882324 }, { "auxiliary_loss_clip": 0.01112898, "auxiliary_loss_mlp": 0.01035254, "balance_loss_clip": 1.02094865, "balance_loss_mlp": 1.037673, "epoch": 0.5075604990229972, "flos": 21835447770240.0, "grad_norm": 1.702792296439091, "language_loss": 0.63930953, "learning_rate": 2.0480938813644445e-06, "loss": 0.6607911, "num_input_tokens_seen": 181494000, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75390625, "step": 8442, "time_per_iteration": 2.4597058296203613 }, { "auxiliary_loss_clip": 0.01108368, "auxiliary_loss_mlp": 0.01026621, "balance_loss_clip": 1.01444983, "balance_loss_mlp": 1.03760433, "epoch": 0.5076206222756651, "flos": 31978703537280.0, "grad_norm": 1.6370212345546489, "language_loss": 0.71624887, "learning_rate": 2.047704531394006e-06, "loss": 0.73759878, "num_input_tokens_seen": 181515955, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.70703125, "step": 8443, "time_per_iteration": 2.5650744438171387 }, { "auxiliary_loss_clip": 0.01113367, "auxiliary_loss_mlp": 0.01032228, "balance_loss_clip": 1.01809573, "balance_loss_mlp": 1.03801644, "epoch": 0.5076807455283331, "flos": 36904031326080.0, "grad_norm": 1.2781352498764593, "language_loss": 0.6196785, "learning_rate": 2.047315179614607e-06, "loss": 0.6411345, "num_input_tokens_seen": 181540225, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75390625, "step": 8444, "time_per_iteration": 2.6012251377105713 }, { "auxiliary_loss_clip": 0.01109721, "auxiliary_loss_mlp": 0.01027007, "balance_loss_clip": 1.01442456, "balance_loss_mlp": 1.03712082, "epoch": 0.507740868781001, "flos": 29862415981440.0, "grad_norm": 1.7246750418214463, "language_loss": 0.64076841, "learning_rate": 2.046925826041012e-06, "loss": 0.66213572, "num_input_tokens_seen": 181560125, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 8445, "time_per_iteration": 2.5298092365264893 }, { "auxiliary_loss_clip": 0.01037824, "auxiliary_loss_mlp": 0.01001763, "balance_loss_clip": 1.0004096, "balance_loss_mlp": 1.01293778, "epoch": 0.507800992033669, "flos": 61918974247680.0, "grad_norm": 0.8295472773494887, "language_loss": 0.61795473, "learning_rate": 2.0465364706879845e-06, "loss": 0.63835061, "num_input_tokens_seen": 181618830, "router_z_loss_clip": 0.0135498, "router_z_loss_mlp": 0.24902344, "step": 8446, "time_per_iteration": 3.1092782020568848 }, { "auxiliary_loss_clip": 0.01106514, "auxiliary_loss_mlp": 0.01027414, "balance_loss_clip": 1.0148015, "balance_loss_mlp": 1.03529894, "epoch": 0.507861115286337, "flos": 20700490757760.0, "grad_norm": 1.546497036361756, "language_loss": 0.81137335, "learning_rate": 2.04614711357029e-06, "loss": 0.83271265, "num_input_tokens_seen": 181637120, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 8447, "time_per_iteration": 2.466897487640381 }, { "auxiliary_loss_clip": 0.01110074, "auxiliary_loss_mlp": 0.01030223, "balance_loss_clip": 1.01772952, "balance_loss_mlp": 1.03916526, "epoch": 0.507921238539005, "flos": 30847050576000.0, "grad_norm": 1.3483586657059914, "language_loss": 0.70802444, "learning_rate": 2.0457577547026916e-06, "loss": 0.72942734, "num_input_tokens_seen": 181659965, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 8448, "time_per_iteration": 2.5211877822875977 }, { "auxiliary_loss_clip": 0.0110875, "auxiliary_loss_mlp": 0.01026564, "balance_loss_clip": 1.01501226, "balance_loss_mlp": 1.03797698, "epoch": 0.507981361791673, "flos": 35700197984640.0, "grad_norm": 6.331402688720886, "language_loss": 0.71863669, "learning_rate": 2.045368394099955e-06, "loss": 0.73998982, "num_input_tokens_seen": 181685290, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.70703125, "step": 8449, "time_per_iteration": 2.6140663623809814 }, { "auxiliary_loss_clip": 0.01106614, "auxiliary_loss_mlp": 0.01027582, "balance_loss_clip": 1.01488638, "balance_loss_mlp": 1.03487837, "epoch": 0.5080414850443409, "flos": 27161466750720.0, "grad_norm": 1.9041963904449937, "language_loss": 0.72907901, "learning_rate": 2.044979031776844e-06, "loss": 0.75042093, "num_input_tokens_seen": 181706080, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 8450, "time_per_iteration": 2.5106189250946045 }, { "auxiliary_loss_clip": 0.01112079, "auxiliary_loss_mlp": 0.01030083, "balance_loss_clip": 1.01726186, "balance_loss_mlp": 1.03860307, "epoch": 0.5081016082970089, "flos": 27085192220160.0, "grad_norm": 2.1904745702430253, "language_loss": 0.77243853, "learning_rate": 2.0445896677481234e-06, "loss": 0.7938602, "num_input_tokens_seen": 181724805, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 8451, "time_per_iteration": 2.5366475582122803 }, { "auxiliary_loss_clip": 0.01111076, "auxiliary_loss_mlp": 0.01032859, "balance_loss_clip": 1.02047908, "balance_loss_mlp": 1.03724408, "epoch": 0.5081617315496768, "flos": 22856531690880.0, "grad_norm": 1.659180395316095, "language_loss": 0.85214823, "learning_rate": 2.044200302028559e-06, "loss": 0.87358761, "num_input_tokens_seen": 181743725, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7421875, "step": 8452, "time_per_iteration": 2.463465690612793 }, { "auxiliary_loss_clip": 0.01114994, "auxiliary_loss_mlp": 0.01031085, "balance_loss_clip": 1.01707256, "balance_loss_mlp": 1.03885746, "epoch": 0.5082218548023448, "flos": 16281898087680.0, "grad_norm": 2.6441909408825897, "language_loss": 0.7754221, "learning_rate": 2.0438109346329143e-06, "loss": 0.79688287, "num_input_tokens_seen": 181757720, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.76171875, "step": 8453, "time_per_iteration": 2.4307780265808105 }, { "auxiliary_loss_clip": 0.01106652, "auxiliary_loss_mlp": 0.01028165, "balance_loss_clip": 1.01597607, "balance_loss_mlp": 1.0367794, "epoch": 0.5082819780550127, "flos": 24460768915200.0, "grad_norm": 1.5788090122924976, "language_loss": 0.75938851, "learning_rate": 2.0434215655759544e-06, "loss": 0.78073668, "num_input_tokens_seen": 181778545, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 8454, "time_per_iteration": 2.483469247817993 }, { "auxiliary_loss_clip": 0.01112322, "auxiliary_loss_mlp": 0.01032797, "balance_loss_clip": 1.01919544, "balance_loss_mlp": 1.03914773, "epoch": 0.5083421013076808, "flos": 23403271582080.0, "grad_norm": 1.5821916687442352, "language_loss": 0.89390349, "learning_rate": 2.0430321948724446e-06, "loss": 0.91535473, "num_input_tokens_seen": 181799495, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 8455, "time_per_iteration": 2.514932155609131 }, { "auxiliary_loss_clip": 0.01115577, "auxiliary_loss_mlp": 0.01033615, "balance_loss_clip": 1.01933956, "balance_loss_mlp": 1.03839993, "epoch": 0.5084022245603487, "flos": 23872695448320.0, "grad_norm": 1.619601687971489, "language_loss": 0.62211585, "learning_rate": 2.042642822537149e-06, "loss": 0.6436078, "num_input_tokens_seen": 181818400, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 8456, "time_per_iteration": 2.4658772945404053 }, { "auxiliary_loss_clip": 0.01037227, "auxiliary_loss_mlp": 0.010081, "balance_loss_clip": 1.00683069, "balance_loss_mlp": 1.01259625, "epoch": 0.5084623478130167, "flos": 62873336655360.0, "grad_norm": 0.8275539327075181, "language_loss": 0.62473452, "learning_rate": 2.0422534485848343e-06, "loss": 0.6451878, "num_input_tokens_seen": 181875975, "router_z_loss_clip": 0.01269531, "router_z_loss_mlp": 0.24609375, "step": 8457, "time_per_iteration": 3.0216870307922363 }, { "auxiliary_loss_clip": 0.01112427, "auxiliary_loss_mlp": 0.01030949, "balance_loss_clip": 1.01680434, "balance_loss_mlp": 1.03884053, "epoch": 0.5085224710656846, "flos": 22346133384960.0, "grad_norm": 1.7754337861894711, "language_loss": 0.67602766, "learning_rate": 2.0418640730302644e-06, "loss": 0.69746143, "num_input_tokens_seen": 181896450, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.734375, "step": 8458, "time_per_iteration": 2.5043575763702393 }, { "auxiliary_loss_clip": 0.01110933, "auxiliary_loss_mlp": 0.01031971, "balance_loss_clip": 1.01824391, "balance_loss_mlp": 1.03587556, "epoch": 0.5085825943183526, "flos": 26066263115520.0, "grad_norm": 2.048836954891979, "language_loss": 0.77311575, "learning_rate": 2.0414746958882043e-06, "loss": 0.79454482, "num_input_tokens_seen": 181916770, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 8459, "time_per_iteration": 2.52608323097229 }, { "auxiliary_loss_clip": 0.01119725, "auxiliary_loss_mlp": 0.01031969, "balance_loss_clip": 1.01785445, "balance_loss_mlp": 1.04286885, "epoch": 0.5086427175710206, "flos": 17420733768960.0, "grad_norm": 1.897668193103751, "language_loss": 0.80853438, "learning_rate": 2.0410853171734196e-06, "loss": 0.83005142, "num_input_tokens_seen": 181932710, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.76953125, "step": 8460, "time_per_iteration": 2.436455488204956 }, { "auxiliary_loss_clip": 0.01114444, "auxiliary_loss_mlp": 0.01033476, "balance_loss_clip": 1.02023149, "balance_loss_mlp": 1.04021418, "epoch": 0.5087028408236886, "flos": 20631758083200.0, "grad_norm": 1.8830591926212825, "language_loss": 0.68859631, "learning_rate": 2.0406959369006754e-06, "loss": 0.7100755, "num_input_tokens_seen": 181950665, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 8461, "time_per_iteration": 2.494175910949707 }, { "auxiliary_loss_clip": 0.01107964, "auxiliary_loss_mlp": 0.01029535, "balance_loss_clip": 1.01636815, "balance_loss_mlp": 1.0372746, "epoch": 0.5087629640763566, "flos": 25593822506880.0, "grad_norm": 1.5675404727702555, "language_loss": 0.75835198, "learning_rate": 2.0403065550847375e-06, "loss": 0.77972698, "num_input_tokens_seen": 181971270, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 8462, "time_per_iteration": 2.492732286453247 }, { "auxiliary_loss_clip": 0.01112444, "auxiliary_loss_mlp": 0.01032075, "balance_loss_clip": 1.01871192, "balance_loss_mlp": 1.03954363, "epoch": 0.5088230873290245, "flos": 13261631927040.0, "grad_norm": 3.3324597563310845, "language_loss": 0.81309307, "learning_rate": 2.0399171717403706e-06, "loss": 0.83453834, "num_input_tokens_seen": 181988410, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 8463, "time_per_iteration": 2.4811465740203857 }, { "auxiliary_loss_clip": 0.01110417, "auxiliary_loss_mlp": 0.01038554, "balance_loss_clip": 1.02548838, "balance_loss_mlp": 1.03750062, "epoch": 0.5088832105816925, "flos": 20043469134720.0, "grad_norm": 3.3790181709362166, "language_loss": 0.76314324, "learning_rate": 2.039527786882341e-06, "loss": 0.78463298, "num_input_tokens_seen": 182006530, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 8464, "time_per_iteration": 2.483546257019043 }, { "auxiliary_loss_clip": 0.01036708, "auxiliary_loss_mlp": 0.01002551, "balance_loss_clip": 1.00112617, "balance_loss_mlp": 1.01173997, "epoch": 0.5089433338343604, "flos": 67422179018880.0, "grad_norm": 0.6913471672539752, "language_loss": 0.59367502, "learning_rate": 2.0391384005254133e-06, "loss": 0.61406767, "num_input_tokens_seen": 182074240, "router_z_loss_clip": 0.01422119, "router_z_loss_mlp": 0.25, "step": 8465, "time_per_iteration": 3.2217323780059814 }, { "auxiliary_loss_clip": 0.01109429, "auxiliary_loss_mlp": 0.01029372, "balance_loss_clip": 1.01583576, "balance_loss_mlp": 1.03718209, "epoch": 0.5090034570870284, "flos": 22710339336960.0, "grad_norm": 2.110093784895323, "language_loss": 0.80560935, "learning_rate": 2.038749012684354e-06, "loss": 0.82699734, "num_input_tokens_seen": 182093360, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 8466, "time_per_iteration": 3.9802660942077637 }, { "auxiliary_loss_clip": 0.01107245, "auxiliary_loss_mlp": 0.01027598, "balance_loss_clip": 1.01447868, "balance_loss_mlp": 1.03521919, "epoch": 0.5090635803396963, "flos": 20445812352000.0, "grad_norm": 1.5123274424146314, "language_loss": 0.78499293, "learning_rate": 2.0383596233739286e-06, "loss": 0.80634129, "num_input_tokens_seen": 182110170, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 8467, "time_per_iteration": 4.016773462295532 }, { "auxiliary_loss_clip": 0.01108999, "auxiliary_loss_mlp": 0.01030493, "balance_loss_clip": 1.01786828, "balance_loss_mlp": 1.03806186, "epoch": 0.5091237035923644, "flos": 23768878164480.0, "grad_norm": 1.631608056282059, "language_loss": 0.74265552, "learning_rate": 2.0379702326089013e-06, "loss": 0.76405048, "num_input_tokens_seen": 182129570, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 8468, "time_per_iteration": 2.47305965423584 }, { "auxiliary_loss_clip": 0.01109329, "auxiliary_loss_mlp": 0.01029065, "balance_loss_clip": 1.01592851, "balance_loss_mlp": 1.03701818, "epoch": 0.5091838268450323, "flos": 18327908684160.0, "grad_norm": 1.649237513920041, "language_loss": 0.78019023, "learning_rate": 2.03758084040404e-06, "loss": 0.80157417, "num_input_tokens_seen": 182147565, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 8469, "time_per_iteration": 3.8346712589263916 }, { "auxiliary_loss_clip": 0.01114526, "auxiliary_loss_mlp": 0.01029915, "balance_loss_clip": 1.01593184, "balance_loss_mlp": 1.04135168, "epoch": 0.5092439500977003, "flos": 29057621806080.0, "grad_norm": 1.544664697676732, "language_loss": 0.69629782, "learning_rate": 2.037191446774109e-06, "loss": 0.7177422, "num_input_tokens_seen": 182169695, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.734375, "step": 8470, "time_per_iteration": 2.5180885791778564 }, { "auxiliary_loss_clip": 0.01113054, "auxiliary_loss_mlp": 0.01032364, "balance_loss_clip": 1.01867843, "balance_loss_mlp": 1.03811145, "epoch": 0.5093040733503682, "flos": 13553908894080.0, "grad_norm": 1.906189660872295, "language_loss": 0.73843092, "learning_rate": 2.0368020517338745e-06, "loss": 0.75988507, "num_input_tokens_seen": 182186385, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 8471, "time_per_iteration": 2.4498348236083984 }, { "auxiliary_loss_clip": 0.01036846, "auxiliary_loss_mlp": 0.01002031, "balance_loss_clip": 1.00074375, "balance_loss_mlp": 1.01192689, "epoch": 0.5093641966030362, "flos": 68906617407360.0, "grad_norm": 0.7916152370347378, "language_loss": 0.58095384, "learning_rate": 2.036412655298103e-06, "loss": 0.60134262, "num_input_tokens_seen": 182247095, "router_z_loss_clip": 0.01287842, "router_z_loss_mlp": 0.24902344, "step": 8472, "time_per_iteration": 3.0919601917266846 }, { "auxiliary_loss_clip": 0.01111157, "auxiliary_loss_mlp": 0.0103325, "balance_loss_clip": 1.02048874, "balance_loss_mlp": 1.03832757, "epoch": 0.5094243198557042, "flos": 21580948932480.0, "grad_norm": 1.8758445319123864, "language_loss": 0.69072127, "learning_rate": 2.03602325748156e-06, "loss": 0.71216536, "num_input_tokens_seen": 182266380, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 8473, "time_per_iteration": 2.499319314956665 }, { "auxiliary_loss_clip": 0.01112031, "auxiliary_loss_mlp": 0.01032046, "balance_loss_clip": 1.01920092, "balance_loss_mlp": 1.0392077, "epoch": 0.5094844431083722, "flos": 28840721529600.0, "grad_norm": 2.1901123480034745, "language_loss": 0.85185021, "learning_rate": 2.0356338582990105e-06, "loss": 0.87329096, "num_input_tokens_seen": 182284685, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 8474, "time_per_iteration": 2.5158891677856445 }, { "auxiliary_loss_clip": 0.01113521, "auxiliary_loss_mlp": 0.01031145, "balance_loss_clip": 1.01780522, "balance_loss_mlp": 1.03893769, "epoch": 0.5095445663610402, "flos": 14976114969600.0, "grad_norm": 2.342952364914395, "language_loss": 0.65359032, "learning_rate": 2.035244457765222e-06, "loss": 0.67503697, "num_input_tokens_seen": 182301810, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.74609375, "step": 8475, "time_per_iteration": 2.4949607849121094 }, { "auxiliary_loss_clip": 0.01117016, "auxiliary_loss_mlp": 0.01037362, "balance_loss_clip": 1.02362323, "balance_loss_mlp": 1.04002023, "epoch": 0.5096046896137081, "flos": 20777088510720.0, "grad_norm": 2.205270701284397, "language_loss": 0.81660455, "learning_rate": 2.0348550558949605e-06, "loss": 0.83814836, "num_input_tokens_seen": 182320285, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.76953125, "step": 8476, "time_per_iteration": 2.4616897106170654 }, { "auxiliary_loss_clip": 0.01114732, "auxiliary_loss_mlp": 0.01040679, "balance_loss_clip": 1.02416873, "balance_loss_mlp": 1.03832436, "epoch": 0.5096648128663761, "flos": 23185078416000.0, "grad_norm": 2.0483148837414267, "language_loss": 0.81132293, "learning_rate": 2.0344656527029917e-06, "loss": 0.83287704, "num_input_tokens_seen": 182339465, "router_z_loss_clip": 0.16503906, "router_z_loss_mlp": 0.765625, "step": 8477, "time_per_iteration": 2.516190528869629 }, { "auxiliary_loss_clip": 0.01114885, "auxiliary_loss_mlp": 0.01029021, "balance_loss_clip": 1.0143528, "balance_loss_mlp": 1.03955293, "epoch": 0.509724936119044, "flos": 22309432663680.0, "grad_norm": 1.843667573714867, "language_loss": 0.62319148, "learning_rate": 2.034076248204082e-06, "loss": 0.64463055, "num_input_tokens_seen": 182358375, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75390625, "step": 8478, "time_per_iteration": 2.45389723777771 }, { "auxiliary_loss_clip": 0.01112133, "auxiliary_loss_mlp": 0.01037355, "balance_loss_clip": 1.02435493, "balance_loss_mlp": 1.03926396, "epoch": 0.509785059371712, "flos": 26287077974400.0, "grad_norm": 1.6143224460346994, "language_loss": 0.6599189, "learning_rate": 2.0336868424129968e-06, "loss": 0.68141377, "num_input_tokens_seen": 182377935, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 8479, "time_per_iteration": 2.5110690593719482 }, { "auxiliary_loss_clip": 0.011112, "auxiliary_loss_mlp": 0.0102861, "balance_loss_clip": 1.01553881, "balance_loss_mlp": 1.03864121, "epoch": 0.50984518262438, "flos": 22964586779520.0, "grad_norm": 1.5187958846559544, "language_loss": 0.69819552, "learning_rate": 2.0332974353445037e-06, "loss": 0.71959364, "num_input_tokens_seen": 182396440, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 8480, "time_per_iteration": 2.460598945617676 }, { "auxiliary_loss_clip": 0.01113741, "auxiliary_loss_mlp": 0.01030883, "balance_loss_clip": 1.01753807, "balance_loss_mlp": 1.03730679, "epoch": 0.509905305877048, "flos": 26213389223040.0, "grad_norm": 2.020918907481138, "language_loss": 0.79524487, "learning_rate": 2.0329080270133688e-06, "loss": 0.8166911, "num_input_tokens_seen": 182415890, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.765625, "step": 8481, "time_per_iteration": 2.510950803756714 }, { "auxiliary_loss_clip": 0.01106606, "auxiliary_loss_mlp": 0.01032035, "balance_loss_clip": 1.01877928, "balance_loss_mlp": 1.0356493, "epoch": 0.5099654291297159, "flos": 20340055733760.0, "grad_norm": 1.68902380437563, "language_loss": 0.83213723, "learning_rate": 2.0325186174343578e-06, "loss": 0.85352367, "num_input_tokens_seen": 182434235, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 8482, "time_per_iteration": 2.4787683486938477 }, { "auxiliary_loss_clip": 0.01113488, "auxiliary_loss_mlp": 0.01033758, "balance_loss_clip": 1.01964402, "balance_loss_mlp": 1.03698575, "epoch": 0.5100255523823839, "flos": 29054820545280.0, "grad_norm": 1.7299526934287457, "language_loss": 0.85286129, "learning_rate": 2.032129206622238e-06, "loss": 0.87433368, "num_input_tokens_seen": 182454360, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 8483, "time_per_iteration": 2.5519909858703613 }, { "auxiliary_loss_clip": 0.01109747, "auxiliary_loss_mlp": 0.01031261, "balance_loss_clip": 1.01849413, "balance_loss_mlp": 1.03593183, "epoch": 0.5100856756350518, "flos": 22455912326400.0, "grad_norm": 2.72586035685644, "language_loss": 0.82938969, "learning_rate": 2.031739794591775e-06, "loss": 0.85079974, "num_input_tokens_seen": 182471940, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73828125, "step": 8484, "time_per_iteration": 2.473453998565674 }, { "auxiliary_loss_clip": 0.01111891, "auxiliary_loss_mlp": 0.01028569, "balance_loss_clip": 1.0146637, "balance_loss_mlp": 1.03833282, "epoch": 0.5101457988877198, "flos": 19171055606400.0, "grad_norm": 2.111791637218147, "language_loss": 0.8142004, "learning_rate": 2.031350381357736e-06, "loss": 0.83560503, "num_input_tokens_seen": 182490685, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 8485, "time_per_iteration": 2.4689781665802 }, { "auxiliary_loss_clip": 0.01107227, "auxiliary_loss_mlp": 0.0103156, "balance_loss_clip": 1.01904249, "balance_loss_mlp": 1.03671098, "epoch": 0.5102059221403878, "flos": 14866371941760.0, "grad_norm": 2.010096506610651, "language_loss": 0.7371999, "learning_rate": 2.0309609669348874e-06, "loss": 0.75858778, "num_input_tokens_seen": 182508325, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 8486, "time_per_iteration": 2.4506585597991943 }, { "auxiliary_loss_clip": 0.01113391, "auxiliary_loss_mlp": 0.01030571, "balance_loss_clip": 1.01758313, "balance_loss_mlp": 1.03852248, "epoch": 0.5102660453930558, "flos": 22961103160320.0, "grad_norm": 1.5040413670188366, "language_loss": 0.70062757, "learning_rate": 2.0305715513379953e-06, "loss": 0.72206718, "num_input_tokens_seen": 182527020, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.74609375, "step": 8487, "time_per_iteration": 2.465334892272949 }, { "auxiliary_loss_clip": 0.01110834, "auxiliary_loss_mlp": 0.01033078, "balance_loss_clip": 1.01880264, "balance_loss_mlp": 1.03831983, "epoch": 0.5103261686457238, "flos": 23149311448320.0, "grad_norm": 2.0669221361139076, "language_loss": 0.72741354, "learning_rate": 2.030182134581827e-06, "loss": 0.74885261, "num_input_tokens_seen": 182543505, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7265625, "step": 8488, "time_per_iteration": 2.454988956451416 }, { "auxiliary_loss_clip": 0.0111331, "auxiliary_loss_mlp": 0.01033803, "balance_loss_clip": 1.02079093, "balance_loss_mlp": 1.03868234, "epoch": 0.5103862918983917, "flos": 14319237000960.0, "grad_norm": 1.9251626369465318, "language_loss": 0.69469976, "learning_rate": 2.0297927166811503e-06, "loss": 0.71617091, "num_input_tokens_seen": 182562250, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.74609375, "step": 8489, "time_per_iteration": 2.455883502960205 }, { "auxiliary_loss_clip": 0.01110333, "auxiliary_loss_mlp": 0.01031591, "balance_loss_clip": 1.01876426, "balance_loss_mlp": 1.03634048, "epoch": 0.5104464151510597, "flos": 25848536826240.0, "grad_norm": 1.8249589534369304, "language_loss": 0.72878164, "learning_rate": 2.0294032976507297e-06, "loss": 0.75020087, "num_input_tokens_seen": 182581910, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 8490, "time_per_iteration": 2.4810383319854736 }, { "auxiliary_loss_clip": 0.01107581, "auxiliary_loss_mlp": 0.01027849, "balance_loss_clip": 1.01562405, "balance_loss_mlp": 1.03662312, "epoch": 0.5105065384037276, "flos": 21652913831040.0, "grad_norm": 1.6170287211106857, "language_loss": 0.8046326, "learning_rate": 2.0290138775053337e-06, "loss": 0.82598686, "num_input_tokens_seen": 182601350, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.70703125, "step": 8491, "time_per_iteration": 2.4603705406188965 }, { "auxiliary_loss_clip": 0.01105934, "auxiliary_loss_mlp": 0.01027252, "balance_loss_clip": 1.01527178, "balance_loss_mlp": 1.03592253, "epoch": 0.5105666616563956, "flos": 22491571553280.0, "grad_norm": 1.9997393956330343, "language_loss": 0.78954959, "learning_rate": 2.028624456259728e-06, "loss": 0.8108815, "num_input_tokens_seen": 182619660, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69921875, "step": 8492, "time_per_iteration": 2.461326837539673 }, { "auxiliary_loss_clip": 0.01113381, "auxiliary_loss_mlp": 0.01035825, "balance_loss_clip": 1.02237225, "balance_loss_mlp": 1.03799224, "epoch": 0.5106267849090635, "flos": 22455768672000.0, "grad_norm": 2.0161734084238554, "language_loss": 0.78247392, "learning_rate": 2.0282350339286804e-06, "loss": 0.80396599, "num_input_tokens_seen": 182639815, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 8493, "time_per_iteration": 2.4646358489990234 }, { "auxiliary_loss_clip": 0.01110988, "auxiliary_loss_mlp": 0.0103075, "balance_loss_clip": 1.01707077, "balance_loss_mlp": 1.03766632, "epoch": 0.5106869081617316, "flos": 23547093638400.0, "grad_norm": 1.8743938662788264, "language_loss": 0.8353163, "learning_rate": 2.0278456105269574e-06, "loss": 0.85673368, "num_input_tokens_seen": 182659655, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 8494, "time_per_iteration": 2.465471029281616 }, { "auxiliary_loss_clip": 0.01114922, "auxiliary_loss_mlp": 0.01033545, "balance_loss_clip": 1.02124226, "balance_loss_mlp": 1.04027557, "epoch": 0.5107470314143995, "flos": 26792987080320.0, "grad_norm": 2.2098879088244625, "language_loss": 0.78949106, "learning_rate": 2.027456186069326e-06, "loss": 0.81097567, "num_input_tokens_seen": 182677075, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.74609375, "step": 8495, "time_per_iteration": 2.4903807640075684 }, { "auxiliary_loss_clip": 0.01110438, "auxiliary_loss_mlp": 0.01033059, "balance_loss_clip": 1.01901579, "balance_loss_mlp": 1.03734779, "epoch": 0.5108071546670675, "flos": 25739691638400.0, "grad_norm": 1.5586542900550189, "language_loss": 0.78169024, "learning_rate": 2.0270667605705535e-06, "loss": 0.80312514, "num_input_tokens_seen": 182699625, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73046875, "step": 8496, "time_per_iteration": 2.5626049041748047 }, { "auxiliary_loss_clip": 0.01107956, "auxiliary_loss_mlp": 0.01026976, "balance_loss_clip": 1.01443493, "balance_loss_mlp": 1.03679276, "epoch": 0.5108672779197354, "flos": 18697537589760.0, "grad_norm": 1.9643189669440075, "language_loss": 0.78730583, "learning_rate": 2.0266773340454066e-06, "loss": 0.80865514, "num_input_tokens_seen": 182717020, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 8497, "time_per_iteration": 2.443103790283203 }, { "auxiliary_loss_clip": 0.01107919, "auxiliary_loss_mlp": 0.01029577, "balance_loss_clip": 1.01720333, "balance_loss_mlp": 1.03623605, "epoch": 0.5109274011724034, "flos": 26688164215680.0, "grad_norm": 1.6716234948303037, "language_loss": 0.81972349, "learning_rate": 2.0262879065086525e-06, "loss": 0.84109843, "num_input_tokens_seen": 182736955, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71875, "step": 8498, "time_per_iteration": 2.507328748703003 }, { "auxiliary_loss_clip": 0.01110103, "auxiliary_loss_mlp": 0.01030408, "balance_loss_clip": 1.0167166, "balance_loss_mlp": 1.03902066, "epoch": 0.5109875244250714, "flos": 22784028088320.0, "grad_norm": 2.2760280354352402, "language_loss": 0.70605195, "learning_rate": 2.0258984779750584e-06, "loss": 0.72745705, "num_input_tokens_seen": 182757620, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7109375, "step": 8499, "time_per_iteration": 2.4809844493865967 }, { "auxiliary_loss_clip": 0.01110921, "auxiliary_loss_mlp": 0.01033705, "balance_loss_clip": 1.0204072, "balance_loss_mlp": 1.03776169, "epoch": 0.5110476476777394, "flos": 35588515622400.0, "grad_norm": 1.4992255695779368, "language_loss": 0.72196239, "learning_rate": 2.0255090484593914e-06, "loss": 0.74340862, "num_input_tokens_seen": 182780195, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 8500, "time_per_iteration": 2.5689148902893066 }, { "auxiliary_loss_clip": 0.01115588, "auxiliary_loss_mlp": 0.01036342, "balance_loss_clip": 1.02223337, "balance_loss_mlp": 1.0384835, "epoch": 0.5111077709304074, "flos": 19280798634240.0, "grad_norm": 2.6926687714865376, "language_loss": 0.62181109, "learning_rate": 2.0251196179764183e-06, "loss": 0.6433304, "num_input_tokens_seen": 182795765, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.76953125, "step": 8501, "time_per_iteration": 2.450354814529419 }, { "auxiliary_loss_clip": 0.01109524, "auxiliary_loss_mlp": 0.01035888, "balance_loss_clip": 1.02228022, "balance_loss_mlp": 1.03513527, "epoch": 0.5111678941830753, "flos": 20668207409280.0, "grad_norm": 2.0395854648653726, "language_loss": 0.8746016, "learning_rate": 2.024730186540907e-06, "loss": 0.89605576, "num_input_tokens_seen": 182813120, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 8502, "time_per_iteration": 2.46606183052063 }, { "auxiliary_loss_clip": 0.01107574, "auxiliary_loss_mlp": 0.0103483, "balance_loss_clip": 1.02239656, "balance_loss_mlp": 1.03456473, "epoch": 0.5112280174357433, "flos": 26287903987200.0, "grad_norm": 1.5680557759416531, "language_loss": 0.82353508, "learning_rate": 2.0243407541676253e-06, "loss": 0.84495914, "num_input_tokens_seen": 182835745, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.73046875, "step": 8503, "time_per_iteration": 2.518941640853882 }, { "auxiliary_loss_clip": 0.0103894, "auxiliary_loss_mlp": 0.01004833, "balance_loss_clip": 1.00366437, "balance_loss_mlp": 1.01364207, "epoch": 0.5112881406884112, "flos": 59474247707520.0, "grad_norm": 0.8546124813145696, "language_loss": 0.63893324, "learning_rate": 2.023951320871339e-06, "loss": 0.65937096, "num_input_tokens_seen": 182892540, "router_z_loss_clip": 0.01165771, "router_z_loss_mlp": 0.25390625, "step": 8504, "time_per_iteration": 3.151326894760132 }, { "auxiliary_loss_clip": 0.01110311, "auxiliary_loss_mlp": 0.01030096, "balance_loss_clip": 1.01615441, "balance_loss_mlp": 1.03819108, "epoch": 0.5113482639410792, "flos": 26468857728000.0, "grad_norm": 1.8893307702612907, "language_loss": 0.84321165, "learning_rate": 2.023561886666816e-06, "loss": 0.86461568, "num_input_tokens_seen": 182911515, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71875, "step": 8505, "time_per_iteration": 2.5216240882873535 }, { "auxiliary_loss_clip": 0.01110158, "auxiliary_loss_mlp": 0.01028101, "balance_loss_clip": 1.01523221, "balance_loss_mlp": 1.03838658, "epoch": 0.5114083871937471, "flos": 29895848565120.0, "grad_norm": 1.7437476026868606, "language_loss": 0.75011909, "learning_rate": 2.0231724515688246e-06, "loss": 0.77150166, "num_input_tokens_seen": 182930860, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 8506, "time_per_iteration": 2.556093692779541 }, { "auxiliary_loss_clip": 0.01111603, "auxiliary_loss_mlp": 0.01031648, "balance_loss_clip": 1.01723552, "balance_loss_mlp": 1.03793764, "epoch": 0.5114685104464152, "flos": 24314576561280.0, "grad_norm": 1.833192127308066, "language_loss": 0.58298099, "learning_rate": 2.022783015592131e-06, "loss": 0.60441351, "num_input_tokens_seen": 182949960, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.73828125, "step": 8507, "time_per_iteration": 5.4705750942230225 }, { "auxiliary_loss_clip": 0.01115717, "auxiliary_loss_mlp": 0.01041607, "balance_loss_clip": 1.02709949, "balance_loss_mlp": 1.0413897, "epoch": 0.5115286336990831, "flos": 17019288391680.0, "grad_norm": 2.4181931375999652, "language_loss": 0.85993212, "learning_rate": 2.022393578751503e-06, "loss": 0.88150537, "num_input_tokens_seen": 182968085, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7421875, "step": 8508, "time_per_iteration": 2.4603660106658936 }, { "auxiliary_loss_clip": 0.01113852, "auxiliary_loss_mlp": 0.0103404, "balance_loss_clip": 1.01932955, "balance_loss_mlp": 1.03926897, "epoch": 0.5115887569517511, "flos": 23659386531840.0, "grad_norm": 1.6531568723679977, "language_loss": 0.7240572, "learning_rate": 2.022004141061709e-06, "loss": 0.74553609, "num_input_tokens_seen": 182987275, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.74609375, "step": 8509, "time_per_iteration": 2.482959032058716 }, { "auxiliary_loss_clip": 0.01109033, "auxiliary_loss_mlp": 0.01031443, "balance_loss_clip": 1.01890802, "balance_loss_mlp": 1.03809655, "epoch": 0.511648880204419, "flos": 16107193313280.0, "grad_norm": 1.7100640437378094, "language_loss": 0.76139706, "learning_rate": 2.0216147025375153e-06, "loss": 0.78280181, "num_input_tokens_seen": 183004700, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 8510, "time_per_iteration": 3.812084913253784 }, { "auxiliary_loss_clip": 0.01109305, "auxiliary_loss_mlp": 0.01036018, "balance_loss_clip": 1.02314961, "balance_loss_mlp": 1.03796113, "epoch": 0.511709003457087, "flos": 32634970974720.0, "grad_norm": 1.600941792010016, "language_loss": 0.70779872, "learning_rate": 2.0212252631936907e-06, "loss": 0.72925204, "num_input_tokens_seen": 183025830, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 8511, "time_per_iteration": 3.912564992904663 }, { "auxiliary_loss_clip": 0.01112845, "auxiliary_loss_mlp": 0.01031071, "balance_loss_clip": 1.01823843, "balance_loss_mlp": 1.04179549, "epoch": 0.511769126709755, "flos": 21762082241280.0, "grad_norm": 2.4022856825444094, "language_loss": 0.66787279, "learning_rate": 2.020835823045001e-06, "loss": 0.68931198, "num_input_tokens_seen": 183045140, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 8512, "time_per_iteration": 2.505683660507202 }, { "auxiliary_loss_clip": 0.01114252, "auxiliary_loss_mlp": 0.01037013, "balance_loss_clip": 1.02216506, "balance_loss_mlp": 1.03974724, "epoch": 0.511829249962423, "flos": 23915357827200.0, "grad_norm": 1.874464647095955, "language_loss": 0.67358148, "learning_rate": 2.0204463821062146e-06, "loss": 0.69509417, "num_input_tokens_seen": 183063935, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.74609375, "step": 8513, "time_per_iteration": 2.470604419708252 }, { "auxiliary_loss_clip": 0.01111616, "auxiliary_loss_mlp": 0.01033879, "balance_loss_clip": 1.01985359, "balance_loss_mlp": 1.04018712, "epoch": 0.511889373215091, "flos": 23727005884800.0, "grad_norm": 2.048543865462331, "language_loss": 0.68902397, "learning_rate": 2.0200569403921e-06, "loss": 0.7104789, "num_input_tokens_seen": 183084135, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.71484375, "step": 8514, "time_per_iteration": 2.5106353759765625 }, { "auxiliary_loss_clip": 0.01107328, "auxiliary_loss_mlp": 0.01031999, "balance_loss_clip": 1.0195415, "balance_loss_mlp": 1.03663659, "epoch": 0.5119494964677589, "flos": 28111519526400.0, "grad_norm": 1.5866149276504578, "language_loss": 0.65793413, "learning_rate": 2.019667497917424e-06, "loss": 0.67932737, "num_input_tokens_seen": 183104570, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.70703125, "step": 8515, "time_per_iteration": 2.5032811164855957 }, { "auxiliary_loss_clip": 0.01109589, "auxiliary_loss_mlp": 0.01030693, "balance_loss_clip": 1.01807463, "balance_loss_mlp": 1.038486, "epoch": 0.5120096197204269, "flos": 24973214296320.0, "grad_norm": 1.9893064378610223, "language_loss": 0.75609624, "learning_rate": 2.019278054696955e-06, "loss": 0.77749908, "num_input_tokens_seen": 183123850, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 8516, "time_per_iteration": 2.5161490440368652 }, { "auxiliary_loss_clip": 0.01115117, "auxiliary_loss_mlp": 0.01037575, "balance_loss_clip": 1.02430081, "balance_loss_mlp": 1.04169583, "epoch": 0.5120697429730948, "flos": 17968012364160.0, "grad_norm": 1.878753281249955, "language_loss": 0.77785158, "learning_rate": 2.0188886107454595e-06, "loss": 0.79937845, "num_input_tokens_seen": 183141725, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 8517, "time_per_iteration": 2.435818910598755 }, { "auxiliary_loss_clip": 0.01114955, "auxiliary_loss_mlp": 0.01031769, "balance_loss_clip": 1.01775002, "balance_loss_mlp": 1.03950405, "epoch": 0.5121298662257628, "flos": 23292343405440.0, "grad_norm": 1.7726088399801265, "language_loss": 0.73952383, "learning_rate": 2.0184991660777063e-06, "loss": 0.7609911, "num_input_tokens_seen": 183161300, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75390625, "step": 8518, "time_per_iteration": 2.509411334991455 }, { "auxiliary_loss_clip": 0.01113535, "auxiliary_loss_mlp": 0.01036004, "balance_loss_clip": 1.02267659, "balance_loss_mlp": 1.04041278, "epoch": 0.5121899894784308, "flos": 17311062568320.0, "grad_norm": 1.9647032132333628, "language_loss": 0.78161669, "learning_rate": 2.0181097207084625e-06, "loss": 0.80311203, "num_input_tokens_seen": 183180495, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 8519, "time_per_iteration": 2.4358136653900146 }, { "auxiliary_loss_clip": 0.01114202, "auxiliary_loss_mlp": 0.01034815, "balance_loss_clip": 1.02095127, "balance_loss_mlp": 1.04119635, "epoch": 0.5122501127310988, "flos": 24930085040640.0, "grad_norm": 1.5501472663546199, "language_loss": 0.79368627, "learning_rate": 2.017720274652497e-06, "loss": 0.81517643, "num_input_tokens_seen": 183200330, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.73046875, "step": 8520, "time_per_iteration": 2.5264978408813477 }, { "auxiliary_loss_clip": 0.01118942, "auxiliary_loss_mlp": 0.01042083, "balance_loss_clip": 1.02715778, "balance_loss_mlp": 1.04091287, "epoch": 0.5123102359837667, "flos": 18442859184000.0, "grad_norm": 1.808260964579926, "language_loss": 0.81336558, "learning_rate": 2.0173308279245765e-06, "loss": 0.83497578, "num_input_tokens_seen": 183218230, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.78125, "step": 8521, "time_per_iteration": 2.428609609603882 }, { "auxiliary_loss_clip": 0.0110843, "auxiliary_loss_mlp": 0.01029893, "balance_loss_clip": 1.01665449, "balance_loss_mlp": 1.03502893, "epoch": 0.5123703592364347, "flos": 26684860164480.0, "grad_norm": 1.9078578927148584, "language_loss": 0.68571103, "learning_rate": 2.0169413805394692e-06, "loss": 0.70709419, "num_input_tokens_seen": 183236735, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 8522, "time_per_iteration": 2.522139549255371 }, { "auxiliary_loss_clip": 0.01119816, "auxiliary_loss_mlp": 0.01037879, "balance_loss_clip": 1.02147615, "balance_loss_mlp": 1.04285979, "epoch": 0.5124304824891026, "flos": 28803948981120.0, "grad_norm": 2.1687297308204263, "language_loss": 0.61874616, "learning_rate": 2.0165519325119433e-06, "loss": 0.64032316, "num_input_tokens_seen": 183257550, "router_z_loss_clip": 0.1640625, "router_z_loss_mlp": 0.76953125, "step": 8523, "time_per_iteration": 2.5052363872528076 }, { "auxiliary_loss_clip": 0.0111333, "auxiliary_loss_mlp": 0.01034666, "balance_loss_clip": 1.02217329, "balance_loss_mlp": 1.03988004, "epoch": 0.5124906057417706, "flos": 21761830846080.0, "grad_norm": 2.2517542518839844, "language_loss": 0.7762084, "learning_rate": 2.0161624838567656e-06, "loss": 0.79768836, "num_input_tokens_seen": 183275515, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.734375, "step": 8524, "time_per_iteration": 2.4942336082458496 }, { "auxiliary_loss_clip": 0.0111276, "auxiliary_loss_mlp": 0.01031174, "balance_loss_clip": 1.01844811, "balance_loss_mlp": 1.04100108, "epoch": 0.5125507289944387, "flos": 18880538405760.0, "grad_norm": 1.9537199876315272, "language_loss": 0.74935997, "learning_rate": 2.015773034588706e-06, "loss": 0.77079934, "num_input_tokens_seen": 183293880, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 8525, "time_per_iteration": 2.4382824897766113 }, { "auxiliary_loss_clip": 0.01115218, "auxiliary_loss_mlp": 0.01034201, "balance_loss_clip": 1.02009845, "balance_loss_mlp": 1.04020309, "epoch": 0.5126108522471066, "flos": 35627838036480.0, "grad_norm": 1.5922108341011576, "language_loss": 0.74900413, "learning_rate": 2.015383584722531e-06, "loss": 0.77049828, "num_input_tokens_seen": 183315860, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 8526, "time_per_iteration": 2.587383270263672 }, { "auxiliary_loss_clip": 0.01114058, "auxiliary_loss_mlp": 0.01037145, "balance_loss_clip": 1.02365065, "balance_loss_mlp": 1.04089773, "epoch": 0.5126709754997746, "flos": 20190918464640.0, "grad_norm": 1.9803034500050993, "language_loss": 0.65264791, "learning_rate": 2.0149941342730088e-06, "loss": 0.67415994, "num_input_tokens_seen": 183335480, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 8527, "time_per_iteration": 2.4695582389831543 }, { "auxiliary_loss_clip": 0.01111295, "auxiliary_loss_mlp": 0.01034851, "balance_loss_clip": 1.0229001, "balance_loss_mlp": 1.04169536, "epoch": 0.5127310987524425, "flos": 18588548747520.0, "grad_norm": 1.5645161917225534, "language_loss": 0.74343389, "learning_rate": 2.014604683254908e-06, "loss": 0.76489544, "num_input_tokens_seen": 183354395, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 8528, "time_per_iteration": 2.4861130714416504 }, { "auxiliary_loss_clip": 0.01108401, "auxiliary_loss_mlp": 0.01033716, "balance_loss_clip": 1.02048326, "balance_loss_mlp": 1.03655422, "epoch": 0.5127912220051105, "flos": 22454691264000.0, "grad_norm": 2.0755533064463143, "language_loss": 0.83070821, "learning_rate": 2.014215231682995e-06, "loss": 0.85212934, "num_input_tokens_seen": 183372980, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 8529, "time_per_iteration": 2.4555225372314453 }, { "auxiliary_loss_clip": 0.01108386, "auxiliary_loss_mlp": 0.01034201, "balance_loss_clip": 1.02073002, "balance_loss_mlp": 1.0383023, "epoch": 0.5128513452577784, "flos": 19093703667840.0, "grad_norm": 1.894606523394408, "language_loss": 0.73860294, "learning_rate": 2.01382577957204e-06, "loss": 0.76002884, "num_input_tokens_seen": 183390160, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 8530, "time_per_iteration": 2.4559316635131836 }, { "auxiliary_loss_clip": 0.01045902, "auxiliary_loss_mlp": 0.01003076, "balance_loss_clip": 1.00172925, "balance_loss_mlp": 1.02066147, "epoch": 0.5129114685104464, "flos": 67892285243520.0, "grad_norm": 0.7539304319004639, "language_loss": 0.60807484, "learning_rate": 2.0134363269368095e-06, "loss": 0.6285646, "num_input_tokens_seen": 183455280, "router_z_loss_clip": 0.01348877, "router_z_loss_mlp": 0.25390625, "step": 8531, "time_per_iteration": 3.182746171951294 }, { "auxiliary_loss_clip": 0.01114223, "auxiliary_loss_mlp": 0.01027497, "balance_loss_clip": 1.01467061, "balance_loss_mlp": 1.04073465, "epoch": 0.5129715917631144, "flos": 20449152316800.0, "grad_norm": 1.8234267180313521, "language_loss": 0.77104783, "learning_rate": 2.0130468737920725e-06, "loss": 0.79246509, "num_input_tokens_seen": 183473955, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.734375, "step": 8532, "time_per_iteration": 2.4709830284118652 }, { "auxiliary_loss_clip": 0.01111474, "auxiliary_loss_mlp": 0.01031142, "balance_loss_clip": 1.01739764, "balance_loss_mlp": 1.03931284, "epoch": 0.5130317150157824, "flos": 35116146840960.0, "grad_norm": 1.74900780896073, "language_loss": 0.66890538, "learning_rate": 2.012657420152597e-06, "loss": 0.69033158, "num_input_tokens_seen": 183497195, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71875, "step": 8533, "time_per_iteration": 2.5780882835388184 }, { "auxiliary_loss_clip": 0.01114969, "auxiliary_loss_mlp": 0.0103687, "balance_loss_clip": 1.02267861, "balance_loss_mlp": 1.04050517, "epoch": 0.5130918382684503, "flos": 19791627903360.0, "grad_norm": 1.9304286806101099, "language_loss": 0.82239103, "learning_rate": 2.01226796603315e-06, "loss": 0.84390944, "num_input_tokens_seen": 183513675, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7421875, "step": 8534, "time_per_iteration": 2.4608373641967773 }, { "auxiliary_loss_clip": 0.01112478, "auxiliary_loss_mlp": 0.0103314, "balance_loss_clip": 1.01902521, "balance_loss_mlp": 1.03887069, "epoch": 0.5131519615211183, "flos": 26323096337280.0, "grad_norm": 1.59258350532245, "language_loss": 0.63601494, "learning_rate": 2.0118785114485017e-06, "loss": 0.65747118, "num_input_tokens_seen": 183535165, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.734375, "step": 8535, "time_per_iteration": 2.510847568511963 }, { "auxiliary_loss_clip": 0.01112365, "auxiliary_loss_mlp": 0.01025885, "balance_loss_clip": 1.01287341, "balance_loss_mlp": 1.04016542, "epoch": 0.5132120847737862, "flos": 19171917532800.0, "grad_norm": 1.5322874294926614, "language_loss": 0.69680333, "learning_rate": 2.011489056413418e-06, "loss": 0.71818584, "num_input_tokens_seen": 183553780, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 8536, "time_per_iteration": 2.4533607959747314 }, { "auxiliary_loss_clip": 0.01113909, "auxiliary_loss_mlp": 0.01031109, "balance_loss_clip": 1.01670849, "balance_loss_mlp": 1.03896117, "epoch": 0.5132722080264542, "flos": 20230420446720.0, "grad_norm": 1.896673888528655, "language_loss": 0.71570027, "learning_rate": 2.011099600942669e-06, "loss": 0.73715043, "num_input_tokens_seen": 183572285, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.75, "step": 8537, "time_per_iteration": 2.4522745609283447 }, { "auxiliary_loss_clip": 0.01113217, "auxiliary_loss_mlp": 0.01027711, "balance_loss_clip": 1.01378787, "balance_loss_mlp": 1.03791809, "epoch": 0.5133323312791223, "flos": 16469459930880.0, "grad_norm": 1.9647005313144503, "language_loss": 0.80154544, "learning_rate": 2.0107101450510214e-06, "loss": 0.82295465, "num_input_tokens_seen": 183589330, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75390625, "step": 8538, "time_per_iteration": 2.4331881999969482 }, { "auxiliary_loss_clip": 0.01108463, "auxiliary_loss_mlp": 0.01028086, "balance_loss_clip": 1.01463962, "balance_loss_mlp": 1.03560162, "epoch": 0.5133924545317902, "flos": 26068094709120.0, "grad_norm": 1.904293486414148, "language_loss": 0.78487593, "learning_rate": 2.0103206887532437e-06, "loss": 0.80624139, "num_input_tokens_seen": 183609205, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 8539, "time_per_iteration": 2.5632472038269043 }, { "auxiliary_loss_clip": 0.01110917, "auxiliary_loss_mlp": 0.01030694, "balance_loss_clip": 1.01687145, "balance_loss_mlp": 1.0374006, "epoch": 0.5134525777844582, "flos": 29131023248640.0, "grad_norm": 1.6376340805782719, "language_loss": 0.7612735, "learning_rate": 2.009931232064105e-06, "loss": 0.78268957, "num_input_tokens_seen": 183629985, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.734375, "step": 8540, "time_per_iteration": 2.569150686264038 }, { "auxiliary_loss_clip": 0.01115443, "auxiliary_loss_mlp": 0.01031823, "balance_loss_clip": 1.01741099, "balance_loss_mlp": 1.03947544, "epoch": 0.5135127010371261, "flos": 17454776883840.0, "grad_norm": 1.6558674929646051, "language_loss": 0.74669677, "learning_rate": 2.0095417749983724e-06, "loss": 0.76816946, "num_input_tokens_seen": 183648220, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76171875, "step": 8541, "time_per_iteration": 2.432936191558838 }, { "auxiliary_loss_clip": 0.01110877, "auxiliary_loss_mlp": 0.01035236, "balance_loss_clip": 1.02191412, "balance_loss_mlp": 1.03841221, "epoch": 0.5135728242897941, "flos": 21944975316480.0, "grad_norm": 1.606240650565503, "language_loss": 0.70543945, "learning_rate": 2.0091523175708162e-06, "loss": 0.72690058, "num_input_tokens_seen": 183668230, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 8542, "time_per_iteration": 2.484872341156006 }, { "auxiliary_loss_clip": 0.01112286, "auxiliary_loss_mlp": 0.01029304, "balance_loss_clip": 1.01623261, "balance_loss_mlp": 1.03876209, "epoch": 0.513632947542462, "flos": 22674859678080.0, "grad_norm": 1.9301895053629228, "language_loss": 0.79504836, "learning_rate": 2.0087628597962023e-06, "loss": 0.8164643, "num_input_tokens_seen": 183687800, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 8543, "time_per_iteration": 2.460184335708618 }, { "auxiliary_loss_clip": 0.01113934, "auxiliary_loss_mlp": 0.0103513, "balance_loss_clip": 1.02137291, "balance_loss_mlp": 1.04067588, "epoch": 0.51369307079513, "flos": 29457163762560.0, "grad_norm": 1.758717398358761, "language_loss": 0.67816305, "learning_rate": 2.008373401689299e-06, "loss": 0.69965363, "num_input_tokens_seen": 183709025, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 8544, "time_per_iteration": 2.534453868865967 }, { "auxiliary_loss_clip": 0.01112756, "auxiliary_loss_mlp": 0.01040578, "balance_loss_clip": 1.02748251, "balance_loss_mlp": 1.03802168, "epoch": 0.513753194047798, "flos": 18989347680000.0, "grad_norm": 2.263271413080841, "language_loss": 0.7256639, "learning_rate": 2.0079839432648765e-06, "loss": 0.74719727, "num_input_tokens_seen": 183725740, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.74609375, "step": 8545, "time_per_iteration": 2.4278788566589355 }, { "auxiliary_loss_clip": 0.0111307, "auxiliary_loss_mlp": 0.01039238, "balance_loss_clip": 1.02492714, "balance_loss_mlp": 1.03826571, "epoch": 0.513813317300466, "flos": 17821855923840.0, "grad_norm": 2.462362790324542, "language_loss": 0.82365966, "learning_rate": 2.0075944845377016e-06, "loss": 0.84518266, "num_input_tokens_seen": 183743995, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75, "step": 8546, "time_per_iteration": 2.458401918411255 }, { "auxiliary_loss_clip": 0.01112104, "auxiliary_loss_mlp": 0.0103135, "balance_loss_clip": 1.01762938, "balance_loss_mlp": 1.03877831, "epoch": 0.5138734405531339, "flos": 24061191045120.0, "grad_norm": 1.7868334218969393, "language_loss": 0.73112667, "learning_rate": 2.007205025522544e-06, "loss": 0.75256127, "num_input_tokens_seen": 183764150, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 8547, "time_per_iteration": 2.474294900894165 }, { "auxiliary_loss_clip": 0.0110923, "auxiliary_loss_mlp": 0.01040873, "balance_loss_clip": 1.02771258, "balance_loss_mlp": 1.03653228, "epoch": 0.5139335638058019, "flos": 26097253574400.0, "grad_norm": 1.93834127469755, "language_loss": 0.73544419, "learning_rate": 2.0068155662341702e-06, "loss": 0.75694519, "num_input_tokens_seen": 183783280, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 8548, "time_per_iteration": 2.501918315887451 }, { "auxiliary_loss_clip": 0.01110194, "auxiliary_loss_mlp": 0.01033387, "balance_loss_clip": 1.01971972, "balance_loss_mlp": 1.03711307, "epoch": 0.5139936870584698, "flos": 18917095472640.0, "grad_norm": 1.7611915435834924, "language_loss": 0.82248688, "learning_rate": 2.0064261066873495e-06, "loss": 0.84392267, "num_input_tokens_seen": 183800725, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 8549, "time_per_iteration": 5.3129706382751465 }, { "auxiliary_loss_clip": 0.01109504, "auxiliary_loss_mlp": 0.01026746, "balance_loss_clip": 1.01469445, "balance_loss_mlp": 1.03879786, "epoch": 0.5140538103111378, "flos": 16144001775360.0, "grad_norm": 2.3783100983938157, "language_loss": 0.72150326, "learning_rate": 2.0060366468968504e-06, "loss": 0.74286574, "num_input_tokens_seen": 183818735, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.70703125, "step": 8550, "time_per_iteration": 2.4209554195404053 }, { "auxiliary_loss_clip": 0.01113009, "auxiliary_loss_mlp": 0.01033737, "balance_loss_clip": 1.02022433, "balance_loss_mlp": 1.03786778, "epoch": 0.5141139335638057, "flos": 22420145358720.0, "grad_norm": 1.8093195552266856, "language_loss": 0.75083864, "learning_rate": 2.0056471868774408e-06, "loss": 0.77230608, "num_input_tokens_seen": 183840015, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75, "step": 8551, "time_per_iteration": 3.893537759780884 }, { "auxiliary_loss_clip": 0.01108928, "auxiliary_loss_mlp": 0.01027591, "balance_loss_clip": 1.01482964, "balance_loss_mlp": 1.03887582, "epoch": 0.5141740568164738, "flos": 27089645506560.0, "grad_norm": 1.8890397324460837, "language_loss": 0.69472969, "learning_rate": 2.0052577266438897e-06, "loss": 0.71609485, "num_input_tokens_seen": 183860145, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 8552, "time_per_iteration": 2.545227289199829 }, { "auxiliary_loss_clip": 0.01111483, "auxiliary_loss_mlp": 0.01030325, "balance_loss_clip": 1.01677132, "balance_loss_mlp": 1.03796887, "epoch": 0.5142341800691418, "flos": 24973250209920.0, "grad_norm": 2.7745876853823535, "language_loss": 0.75012839, "learning_rate": 2.004868266210965e-06, "loss": 0.77154648, "num_input_tokens_seen": 183880540, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 8553, "time_per_iteration": 3.88071346282959 }, { "auxiliary_loss_clip": 0.01111523, "auxiliary_loss_mlp": 0.01033612, "balance_loss_clip": 1.02027822, "balance_loss_mlp": 1.03879726, "epoch": 0.5142943033218097, "flos": 20704513080960.0, "grad_norm": 2.8164441304475116, "language_loss": 0.67868984, "learning_rate": 2.004478805593435e-06, "loss": 0.70014119, "num_input_tokens_seen": 183900895, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 8554, "time_per_iteration": 2.4795913696289062 }, { "auxiliary_loss_clip": 0.01114957, "auxiliary_loss_mlp": 0.01038835, "balance_loss_clip": 1.023314, "balance_loss_mlp": 1.03852248, "epoch": 0.5143544265744777, "flos": 22925479847040.0, "grad_norm": 1.7645246234569352, "language_loss": 0.73267084, "learning_rate": 2.004089344806068e-06, "loss": 0.75420874, "num_input_tokens_seen": 183920335, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.765625, "step": 8555, "time_per_iteration": 2.471083641052246 }, { "auxiliary_loss_clip": 0.01111241, "auxiliary_loss_mlp": 0.01032992, "balance_loss_clip": 1.02045715, "balance_loss_mlp": 1.03850853, "epoch": 0.5144145498271456, "flos": 15921391236480.0, "grad_norm": 2.3694441391614878, "language_loss": 0.74420297, "learning_rate": 2.003699883863633e-06, "loss": 0.76564527, "num_input_tokens_seen": 183936220, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 8556, "time_per_iteration": 2.5063934326171875 }, { "auxiliary_loss_clip": 0.01106385, "auxiliary_loss_mlp": 0.01028743, "balance_loss_clip": 1.01644707, "balance_loss_mlp": 1.03561902, "epoch": 0.5144746730798136, "flos": 19681238430720.0, "grad_norm": 1.951349769646018, "language_loss": 0.86072743, "learning_rate": 2.003310422780898e-06, "loss": 0.88207871, "num_input_tokens_seen": 183953250, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 8557, "time_per_iteration": 2.439638614654541 }, { "auxiliary_loss_clip": 0.01107383, "auxiliary_loss_mlp": 0.01031743, "balance_loss_clip": 1.01941681, "balance_loss_mlp": 1.03700733, "epoch": 0.5145347963324816, "flos": 23914711382400.0, "grad_norm": 11.53221339827504, "language_loss": 0.89149904, "learning_rate": 2.0029209615726307e-06, "loss": 0.91289032, "num_input_tokens_seen": 183973865, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 8558, "time_per_iteration": 2.5117084980010986 }, { "auxiliary_loss_clip": 0.01108336, "auxiliary_loss_mlp": 0.01030063, "balance_loss_clip": 1.01728988, "balance_loss_mlp": 1.03858066, "epoch": 0.5145949195851496, "flos": 18260002022400.0, "grad_norm": 2.1207445993996457, "language_loss": 0.65578341, "learning_rate": 2.002531500253602e-06, "loss": 0.67716742, "num_input_tokens_seen": 183992555, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 8559, "time_per_iteration": 2.4416136741638184 }, { "auxiliary_loss_clip": 0.01112349, "auxiliary_loss_mlp": 0.01034567, "balance_loss_clip": 1.02189505, "balance_loss_mlp": 1.04098582, "epoch": 0.5146550428378175, "flos": 26213425136640.0, "grad_norm": 1.6803917355141613, "language_loss": 0.63315296, "learning_rate": 2.002142038838577e-06, "loss": 0.65462208, "num_input_tokens_seen": 184010825, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 8560, "time_per_iteration": 2.5216917991638184 }, { "auxiliary_loss_clip": 0.0110935, "auxiliary_loss_mlp": 0.01029724, "balance_loss_clip": 1.01710534, "balance_loss_mlp": 1.03773975, "epoch": 0.5147151660904855, "flos": 22674177319680.0, "grad_norm": 1.9758642238612014, "language_loss": 0.70072412, "learning_rate": 2.0017525773423265e-06, "loss": 0.7221148, "num_input_tokens_seen": 184030155, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 8561, "time_per_iteration": 2.45658278465271 }, { "auxiliary_loss_clip": 0.01111214, "auxiliary_loss_mlp": 0.01026797, "balance_loss_clip": 1.01467955, "balance_loss_mlp": 1.03818369, "epoch": 0.5147752893431534, "flos": 24972388283520.0, "grad_norm": 1.6557542084817423, "language_loss": 0.66545773, "learning_rate": 2.0013631157796177e-06, "loss": 0.68683779, "num_input_tokens_seen": 184051440, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.73046875, "step": 8562, "time_per_iteration": 2.502317190170288 }, { "auxiliary_loss_clip": 0.01114042, "auxiliary_loss_mlp": 0.01031738, "balance_loss_clip": 1.0188278, "balance_loss_mlp": 1.04048514, "epoch": 0.5148354125958214, "flos": 22744669760640.0, "grad_norm": 1.694699336393273, "language_loss": 0.77537078, "learning_rate": 2.0009736541652188e-06, "loss": 0.79682857, "num_input_tokens_seen": 184070205, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 8563, "time_per_iteration": 2.475323438644409 }, { "auxiliary_loss_clip": 0.01116132, "auxiliary_loss_mlp": 0.01034397, "balance_loss_clip": 1.01928174, "balance_loss_mlp": 1.03891575, "epoch": 0.5148955358484893, "flos": 23068763199360.0, "grad_norm": 3.762715986443633, "language_loss": 0.82877272, "learning_rate": 2.0005841925139e-06, "loss": 0.85027802, "num_input_tokens_seen": 184087345, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.7734375, "step": 8564, "time_per_iteration": 2.4894604682922363 }, { "auxiliary_loss_clip": 0.01114437, "auxiliary_loss_mlp": 0.010324, "balance_loss_clip": 1.01877999, "balance_loss_mlp": 1.03850472, "epoch": 0.5149556591011574, "flos": 20340127560960.0, "grad_norm": 1.7461886099414659, "language_loss": 0.7332387, "learning_rate": 2.0001947308404283e-06, "loss": 0.7547071, "num_input_tokens_seen": 184107110, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 8565, "time_per_iteration": 2.459260940551758 }, { "auxiliary_loss_clip": 0.01116761, "auxiliary_loss_mlp": 0.01032747, "balance_loss_clip": 1.01807261, "balance_loss_mlp": 1.04038608, "epoch": 0.5150157823538254, "flos": 22638230784000.0, "grad_norm": 2.0217288474162745, "language_loss": 0.69154084, "learning_rate": 1.9998052691595715e-06, "loss": 0.71303588, "num_input_tokens_seen": 184127105, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.765625, "step": 8566, "time_per_iteration": 2.4960761070251465 }, { "auxiliary_loss_clip": 0.01113466, "auxiliary_loss_mlp": 0.01030309, "balance_loss_clip": 1.01735091, "balance_loss_mlp": 1.03759551, "epoch": 0.5150759056064933, "flos": 26067627832320.0, "grad_norm": 2.343677185600353, "language_loss": 0.77843875, "learning_rate": 1.9994158074861005e-06, "loss": 0.79987651, "num_input_tokens_seen": 184148060, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7578125, "step": 8567, "time_per_iteration": 2.4900355339050293 }, { "auxiliary_loss_clip": 0.01115335, "auxiliary_loss_mlp": 0.01031951, "balance_loss_clip": 1.01786661, "balance_loss_mlp": 1.03991866, "epoch": 0.5151360288591613, "flos": 25952641418880.0, "grad_norm": 2.83297119265593, "language_loss": 0.79244548, "learning_rate": 1.9990263458347806e-06, "loss": 0.81391835, "num_input_tokens_seen": 184166175, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 8568, "time_per_iteration": 2.5045273303985596 }, { "auxiliary_loss_clip": 0.01108589, "auxiliary_loss_mlp": 0.01030606, "balance_loss_clip": 1.01826143, "balance_loss_mlp": 1.03705287, "epoch": 0.5151961521118292, "flos": 18507246312960.0, "grad_norm": 2.7071439298086677, "language_loss": 0.90914917, "learning_rate": 1.9986368842203825e-06, "loss": 0.93054104, "num_input_tokens_seen": 184182600, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71484375, "step": 8569, "time_per_iteration": 2.442979097366333 }, { "auxiliary_loss_clip": 0.01114586, "auxiliary_loss_mlp": 0.01030415, "balance_loss_clip": 1.01720679, "balance_loss_mlp": 1.04014754, "epoch": 0.5152562753644973, "flos": 22233696837120.0, "grad_norm": 1.8425089609181875, "language_loss": 0.76489741, "learning_rate": 1.998247422657674e-06, "loss": 0.78634745, "num_input_tokens_seen": 184202020, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 8570, "time_per_iteration": 2.488217830657959 }, { "auxiliary_loss_clip": 0.01111648, "auxiliary_loss_mlp": 0.01036376, "balance_loss_clip": 1.02169585, "balance_loss_mlp": 1.03810096, "epoch": 0.5153163986171652, "flos": 38436555047040.0, "grad_norm": 1.7550946575851183, "language_loss": 0.73657668, "learning_rate": 1.9978579611614227e-06, "loss": 0.75805688, "num_input_tokens_seen": 184224850, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.734375, "step": 8571, "time_per_iteration": 2.623849868774414 }, { "auxiliary_loss_clip": 0.0104171, "auxiliary_loss_mlp": 0.01005585, "balance_loss_clip": 1.0043633, "balance_loss_mlp": 1.01669991, "epoch": 0.5153765218698332, "flos": 66384503015040.0, "grad_norm": 0.794727330329501, "language_loss": 0.52933991, "learning_rate": 1.9974684997463984e-06, "loss": 0.54981279, "num_input_tokens_seen": 184288520, "router_z_loss_clip": 0.01220703, "router_z_loss_mlp": 0.25, "step": 8572, "time_per_iteration": 3.1715617179870605 }, { "auxiliary_loss_clip": 0.01111876, "auxiliary_loss_mlp": 0.01032431, "balance_loss_clip": 1.02012229, "balance_loss_mlp": 1.04106629, "epoch": 0.5154366451225011, "flos": 24024669891840.0, "grad_norm": 1.6201767237350804, "language_loss": 0.76229882, "learning_rate": 1.9970790384273687e-06, "loss": 0.78374183, "num_input_tokens_seen": 184308565, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 8573, "time_per_iteration": 2.4976372718811035 }, { "auxiliary_loss_clip": 0.01108906, "auxiliary_loss_mlp": 0.01028178, "balance_loss_clip": 1.0148623, "balance_loss_mlp": 1.03766418, "epoch": 0.5154967683751691, "flos": 23468843859840.0, "grad_norm": 2.7325815599145, "language_loss": 0.77137983, "learning_rate": 1.996689577219102e-06, "loss": 0.79275066, "num_input_tokens_seen": 184326795, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 8574, "time_per_iteration": 2.478444814682007 }, { "auxiliary_loss_clip": 0.01109312, "auxiliary_loss_mlp": 0.01034618, "balance_loss_clip": 1.0223093, "balance_loss_mlp": 1.03762317, "epoch": 0.515556891627837, "flos": 23805650712960.0, "grad_norm": 1.6711988334078982, "language_loss": 0.85470617, "learning_rate": 1.996300116136367e-06, "loss": 0.87614548, "num_input_tokens_seen": 184345990, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71875, "step": 8575, "time_per_iteration": 2.4710488319396973 }, { "auxiliary_loss_clip": 0.01112357, "auxiliary_loss_mlp": 0.01032847, "balance_loss_clip": 1.01937675, "balance_loss_mlp": 1.03901696, "epoch": 0.515617014880505, "flos": 19828544106240.0, "grad_norm": 1.557923713430643, "language_loss": 0.77213687, "learning_rate": 1.995910655193932e-06, "loss": 0.79358894, "num_input_tokens_seen": 184366300, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 8576, "time_per_iteration": 2.503750801086426 }, { "auxiliary_loss_clip": 0.01116303, "auxiliary_loss_mlp": 0.01028077, "balance_loss_clip": 1.01421845, "balance_loss_mlp": 1.03988004, "epoch": 0.515677138133173, "flos": 14245907385600.0, "grad_norm": 4.683171036991821, "language_loss": 0.75588286, "learning_rate": 1.9955211944065654e-06, "loss": 0.77732664, "num_input_tokens_seen": 184383030, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 8577, "time_per_iteration": 2.4389729499816895 }, { "auxiliary_loss_clip": 0.01114103, "auxiliary_loss_mlp": 0.01036144, "balance_loss_clip": 1.0217731, "balance_loss_mlp": 1.03865027, "epoch": 0.515737261385841, "flos": 28289707920000.0, "grad_norm": 1.7558491583769629, "language_loss": 0.80755854, "learning_rate": 1.9951317337890353e-06, "loss": 0.82906103, "num_input_tokens_seen": 184403410, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75390625, "step": 8578, "time_per_iteration": 2.5527913570404053 }, { "auxiliary_loss_clip": 0.01108185, "auxiliary_loss_mlp": 0.01030421, "balance_loss_clip": 1.01739717, "balance_loss_mlp": 1.03679633, "epoch": 0.515797384638509, "flos": 27891925729920.0, "grad_norm": 1.6721744847044053, "language_loss": 0.76216614, "learning_rate": 1.9947422733561105e-06, "loss": 0.78355217, "num_input_tokens_seen": 184423830, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 8579, "time_per_iteration": 2.5099267959594727 }, { "auxiliary_loss_clip": 0.0111307, "auxiliary_loss_mlp": 0.01028447, "balance_loss_clip": 1.01582265, "balance_loss_mlp": 1.03955448, "epoch": 0.5158575078911769, "flos": 23040071210880.0, "grad_norm": 2.149794244840519, "language_loss": 0.79323196, "learning_rate": 1.994352813122559e-06, "loss": 0.81464708, "num_input_tokens_seen": 184445050, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.734375, "step": 8580, "time_per_iteration": 2.5091238021850586 }, { "auxiliary_loss_clip": 0.01116616, "auxiliary_loss_mlp": 0.01035889, "balance_loss_clip": 1.02225173, "balance_loss_mlp": 1.04116249, "epoch": 0.5159176311438449, "flos": 12641346938880.0, "grad_norm": 2.047325702008275, "language_loss": 0.72956192, "learning_rate": 1.99396335310315e-06, "loss": 0.75108695, "num_input_tokens_seen": 184460775, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75390625, "step": 8581, "time_per_iteration": 2.410463571548462 }, { "auxiliary_loss_clip": 0.01110053, "auxiliary_loss_mlp": 0.01029547, "balance_loss_clip": 1.01685143, "balance_loss_mlp": 1.03811359, "epoch": 0.5159777543965128, "flos": 15558154951680.0, "grad_norm": 4.174955666188024, "language_loss": 0.74705756, "learning_rate": 1.9935738933126508e-06, "loss": 0.7684536, "num_input_tokens_seen": 184477365, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 8582, "time_per_iteration": 2.4537370204925537 }, { "auxiliary_loss_clip": 0.01111689, "auxiliary_loss_mlp": 0.01030964, "balance_loss_clip": 1.01845896, "balance_loss_mlp": 1.03938663, "epoch": 0.5160378776491809, "flos": 23221671396480.0, "grad_norm": 1.9279635846320013, "language_loss": 0.66134524, "learning_rate": 1.99318443376583e-06, "loss": 0.6827718, "num_input_tokens_seen": 184497045, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 8583, "time_per_iteration": 2.4596776962280273 }, { "auxiliary_loss_clip": 0.01113577, "auxiliary_loss_mlp": 0.01033944, "balance_loss_clip": 1.02011514, "balance_loss_mlp": 1.03947246, "epoch": 0.5160980009018488, "flos": 21944616180480.0, "grad_norm": 1.7041559785677063, "language_loss": 0.75836504, "learning_rate": 1.9927949744774568e-06, "loss": 0.77984023, "num_input_tokens_seen": 184517675, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 8584, "time_per_iteration": 2.469353437423706 }, { "auxiliary_loss_clip": 0.0111513, "auxiliary_loss_mlp": 0.01039234, "balance_loss_clip": 1.02566826, "balance_loss_mlp": 1.03948486, "epoch": 0.5161581241545168, "flos": 22784064001920.0, "grad_norm": 1.9849653587090583, "language_loss": 0.78895247, "learning_rate": 1.9924055154622983e-06, "loss": 0.81049621, "num_input_tokens_seen": 184537745, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7578125, "step": 8585, "time_per_iteration": 2.4613702297210693 }, { "auxiliary_loss_clip": 0.01106511, "auxiliary_loss_mlp": 0.01033163, "balance_loss_clip": 1.02058053, "balance_loss_mlp": 1.03663325, "epoch": 0.5162182474071847, "flos": 19675384513920.0, "grad_norm": 2.57209972779393, "language_loss": 0.81517768, "learning_rate": 1.9920160567351238e-06, "loss": 0.83657432, "num_input_tokens_seen": 184553630, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 8586, "time_per_iteration": 2.4448437690734863 }, { "auxiliary_loss_clip": 0.01112055, "auxiliary_loss_mlp": 0.01029957, "balance_loss_clip": 1.01689196, "balance_loss_mlp": 1.03827071, "epoch": 0.5162783706598527, "flos": 20046198568320.0, "grad_norm": 10.241494050620446, "language_loss": 0.71682304, "learning_rate": 1.991626598310701e-06, "loss": 0.7382431, "num_input_tokens_seen": 184573530, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 8587, "time_per_iteration": 2.4672975540161133 }, { "auxiliary_loss_clip": 0.01042851, "auxiliary_loss_mlp": 0.01002559, "balance_loss_clip": 1.00130129, "balance_loss_mlp": 1.01823711, "epoch": 0.5163384939125206, "flos": 69959553713280.0, "grad_norm": 0.7334597396099594, "language_loss": 0.57844281, "learning_rate": 1.9912371402037984e-06, "loss": 0.59889686, "num_input_tokens_seen": 184637875, "router_z_loss_clip": 0.01257324, "router_z_loss_mlp": 0.24609375, "step": 8588, "time_per_iteration": 3.1161234378814697 }, { "auxiliary_loss_clip": 0.0111187, "auxiliary_loss_mlp": 0.01037517, "balance_loss_clip": 1.02342081, "balance_loss_mlp": 1.03804576, "epoch": 0.5163986171651886, "flos": 17417034668160.0, "grad_norm": 1.8750404607845972, "language_loss": 0.74986076, "learning_rate": 1.990847682429185e-06, "loss": 0.77135468, "num_input_tokens_seen": 184656125, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73828125, "step": 8589, "time_per_iteration": 2.464015245437622 }, { "auxiliary_loss_clip": 0.01112685, "auxiliary_loss_mlp": 0.01030683, "balance_loss_clip": 1.01817799, "balance_loss_mlp": 1.03855968, "epoch": 0.5164587404178566, "flos": 21322679166720.0, "grad_norm": 1.5577436071377218, "language_loss": 0.67415726, "learning_rate": 1.990458225001627e-06, "loss": 0.69559097, "num_input_tokens_seen": 184675920, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7421875, "step": 8590, "time_per_iteration": 3.922821044921875 }, { "auxiliary_loss_clip": 0.01043849, "auxiliary_loss_mlp": 0.01004448, "balance_loss_clip": 1.00310063, "balance_loss_mlp": 1.01895714, "epoch": 0.5165188636705246, "flos": 68057149691520.0, "grad_norm": 4.73788356992276, "language_loss": 0.55880523, "learning_rate": 1.990068767935895e-06, "loss": 0.57928824, "num_input_tokens_seen": 184730520, "router_z_loss_clip": 0.01348877, "router_z_loss_mlp": 0.24902344, "step": 8591, "time_per_iteration": 4.506302118301392 }, { "auxiliary_loss_clip": 0.0110492, "auxiliary_loss_mlp": 0.01027391, "balance_loss_clip": 1.01536322, "balance_loss_mlp": 1.03668499, "epoch": 0.5165789869231926, "flos": 19385657412480.0, "grad_norm": 1.6239239859497636, "language_loss": 0.81307447, "learning_rate": 1.9896793112467566e-06, "loss": 0.83439761, "num_input_tokens_seen": 184748340, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 8592, "time_per_iteration": 3.9242005348205566 }, { "auxiliary_loss_clip": 0.01110316, "auxiliary_loss_mlp": 0.01030171, "balance_loss_clip": 1.01798749, "balance_loss_mlp": 1.04004645, "epoch": 0.5166391101758605, "flos": 20960197067520.0, "grad_norm": 2.832972196996028, "language_loss": 0.83653939, "learning_rate": 1.989289854948979e-06, "loss": 0.85794425, "num_input_tokens_seen": 184766615, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.703125, "step": 8593, "time_per_iteration": 2.473665475845337 }, { "auxiliary_loss_clip": 0.01113174, "auxiliary_loss_mlp": 0.01032559, "balance_loss_clip": 1.01977396, "balance_loss_mlp": 1.04044783, "epoch": 0.5166992334285285, "flos": 29462407148160.0, "grad_norm": 1.6870383322749962, "language_loss": 0.69271255, "learning_rate": 1.9889003990573314e-06, "loss": 0.71416986, "num_input_tokens_seen": 184788075, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 8594, "time_per_iteration": 3.9646599292755127 }, { "auxiliary_loss_clip": 0.01110933, "auxiliary_loss_mlp": 0.01029899, "balance_loss_clip": 1.01648235, "balance_loss_mlp": 1.0390718, "epoch": 0.5167593566811964, "flos": 20304360593280.0, "grad_norm": 1.761669770047631, "language_loss": 0.77338862, "learning_rate": 1.988510943586582e-06, "loss": 0.79479694, "num_input_tokens_seen": 184808710, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 8595, "time_per_iteration": 2.5127756595611572 }, { "auxiliary_loss_clip": 0.01110868, "auxiliary_loss_mlp": 0.01036722, "balance_loss_clip": 1.02309012, "balance_loss_mlp": 1.03878319, "epoch": 0.5168194799338645, "flos": 14611370313600.0, "grad_norm": 1.716619315154339, "language_loss": 0.6540432, "learning_rate": 1.9881214885514986e-06, "loss": 0.67551911, "num_input_tokens_seen": 184826475, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 8596, "time_per_iteration": 2.457318067550659 }, { "auxiliary_loss_clip": 0.0111281, "auxiliary_loss_mlp": 0.01032387, "balance_loss_clip": 1.01774192, "balance_loss_mlp": 1.04080188, "epoch": 0.5168796031865324, "flos": 25007257411200.0, "grad_norm": 2.1634036684861813, "language_loss": 0.75252533, "learning_rate": 1.9877320339668492e-06, "loss": 0.7739774, "num_input_tokens_seen": 184845245, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.72265625, "step": 8597, "time_per_iteration": 2.5264010429382324 }, { "auxiliary_loss_clip": 0.01110435, "auxiliary_loss_mlp": 0.01024103, "balance_loss_clip": 1.01117527, "balance_loss_mlp": 1.03760624, "epoch": 0.5169397264392004, "flos": 26939969533440.0, "grad_norm": 1.6277582278941392, "language_loss": 0.81272644, "learning_rate": 1.987342579847403e-06, "loss": 0.83407182, "num_input_tokens_seen": 184866605, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 8598, "time_per_iteration": 2.561785936355591 }, { "auxiliary_loss_clip": 0.01112205, "auxiliary_loss_mlp": 0.01034496, "balance_loss_clip": 1.0209955, "balance_loss_mlp": 1.03863692, "epoch": 0.5169998496918683, "flos": 25407804948480.0, "grad_norm": 1.5861626112678857, "language_loss": 0.75498009, "learning_rate": 1.9869531262079273e-06, "loss": 0.77644706, "num_input_tokens_seen": 184886945, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 8599, "time_per_iteration": 2.53657603263855 }, { "auxiliary_loss_clip": 0.0111124, "auxiliary_loss_mlp": 0.01033193, "balance_loss_clip": 1.02046776, "balance_loss_mlp": 1.03950524, "epoch": 0.5170599729445363, "flos": 24680793674880.0, "grad_norm": 2.308111214015118, "language_loss": 0.71883363, "learning_rate": 1.9865636730631904e-06, "loss": 0.74027795, "num_input_tokens_seen": 184905590, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 8600, "time_per_iteration": 2.503061532974243 }, { "auxiliary_loss_clip": 0.01111697, "auxiliary_loss_mlp": 0.01034683, "balance_loss_clip": 1.02131963, "balance_loss_mlp": 1.04005456, "epoch": 0.5171200961972042, "flos": 20994455664000.0, "grad_norm": 1.470310191088619, "language_loss": 0.74843073, "learning_rate": 1.9861742204279602e-06, "loss": 0.76989454, "num_input_tokens_seen": 184925555, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 8601, "time_per_iteration": 2.488701820373535 }, { "auxiliary_loss_clip": 0.01111854, "auxiliary_loss_mlp": 0.01039943, "balance_loss_clip": 1.0260905, "balance_loss_mlp": 1.03869712, "epoch": 0.5171802194498722, "flos": 22745639427840.0, "grad_norm": 1.8939775002046408, "language_loss": 0.83757836, "learning_rate": 1.9857847683170045e-06, "loss": 0.85909641, "num_input_tokens_seen": 184944490, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.73046875, "step": 8602, "time_per_iteration": 2.4926843643188477 }, { "auxiliary_loss_clip": 0.01112767, "auxiliary_loss_mlp": 0.01032097, "balance_loss_clip": 1.01872754, "balance_loss_mlp": 1.03960192, "epoch": 0.5172403427025402, "flos": 28176732668160.0, "grad_norm": 2.543275542267953, "language_loss": 0.74377155, "learning_rate": 1.9853953167450926e-06, "loss": 0.76522022, "num_input_tokens_seen": 184963190, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 8603, "time_per_iteration": 2.54233980178833 }, { "auxiliary_loss_clip": 0.01113952, "auxiliary_loss_mlp": 0.01033927, "balance_loss_clip": 1.02078366, "balance_loss_mlp": 1.04011071, "epoch": 0.5173004659552082, "flos": 20337829090560.0, "grad_norm": 2.0337424734226035, "language_loss": 0.72324598, "learning_rate": 1.9850058657269915e-06, "loss": 0.74472475, "num_input_tokens_seen": 184981220, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73828125, "step": 8604, "time_per_iteration": 2.4845097064971924 }, { "auxiliary_loss_clip": 0.0111912, "auxiliary_loss_mlp": 0.01032463, "balance_loss_clip": 1.01811624, "balance_loss_mlp": 1.03983915, "epoch": 0.5173605892078762, "flos": 19063323740160.0, "grad_norm": 1.851030131565794, "language_loss": 0.85176194, "learning_rate": 1.984616415277469e-06, "loss": 0.87327778, "num_input_tokens_seen": 184998810, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.79296875, "step": 8605, "time_per_iteration": 2.490496873855591 }, { "auxiliary_loss_clip": 0.0110984, "auxiliary_loss_mlp": 0.01024308, "balance_loss_clip": 1.01191616, "balance_loss_mlp": 1.03705144, "epoch": 0.5174207124605441, "flos": 27995168396160.0, "grad_norm": 2.768199121776588, "language_loss": 0.64740086, "learning_rate": 1.984226965411294e-06, "loss": 0.6687423, "num_input_tokens_seen": 185021185, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7265625, "step": 8606, "time_per_iteration": 2.5307023525238037 }, { "auxiliary_loss_clip": 0.01110259, "auxiliary_loss_mlp": 0.01027939, "balance_loss_clip": 1.01513577, "balance_loss_mlp": 1.03863215, "epoch": 0.5174808357132121, "flos": 19496657416320.0, "grad_norm": 1.8205892364212544, "language_loss": 0.78166819, "learning_rate": 1.983837516143234e-06, "loss": 0.80305016, "num_input_tokens_seen": 185038465, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 8607, "time_per_iteration": 2.470287322998047 }, { "auxiliary_loss_clip": 0.01113132, "auxiliary_loss_mlp": 0.01035796, "balance_loss_clip": 1.02175868, "balance_loss_mlp": 1.03906393, "epoch": 0.51754095896588, "flos": 22784171742720.0, "grad_norm": 2.272160757398211, "language_loss": 0.72539282, "learning_rate": 1.983448067488057e-06, "loss": 0.74688208, "num_input_tokens_seen": 185057340, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7421875, "step": 8608, "time_per_iteration": 2.4992451667785645 }, { "auxiliary_loss_clip": 0.01118198, "auxiliary_loss_mlp": 0.01034361, "balance_loss_clip": 1.01980519, "balance_loss_mlp": 1.03998375, "epoch": 0.5176010822185481, "flos": 22669257156480.0, "grad_norm": 2.303039575582069, "language_loss": 0.86734682, "learning_rate": 1.983058619460531e-06, "loss": 0.88887244, "num_input_tokens_seen": 185074935, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.78125, "step": 8609, "time_per_iteration": 2.493030309677124 }, { "auxiliary_loss_clip": 0.01110403, "auxiliary_loss_mlp": 0.01032143, "balance_loss_clip": 1.01978636, "balance_loss_mlp": 1.03721142, "epoch": 0.517661205471216, "flos": 23951196622080.0, "grad_norm": 6.427984107309573, "language_loss": 0.73592424, "learning_rate": 1.9826691720754237e-06, "loss": 0.75734973, "num_input_tokens_seen": 185095050, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.73046875, "step": 8610, "time_per_iteration": 2.526716470718384 }, { "auxiliary_loss_clip": 0.01118736, "auxiliary_loss_mlp": 0.01034996, "balance_loss_clip": 1.02037477, "balance_loss_mlp": 1.04154444, "epoch": 0.517721328723884, "flos": 15596076735360.0, "grad_norm": 2.042914693743031, "language_loss": 0.6750198, "learning_rate": 1.9822797253475034e-06, "loss": 0.69655716, "num_input_tokens_seen": 185112275, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7734375, "step": 8611, "time_per_iteration": 2.452794313430786 }, { "auxiliary_loss_clip": 0.0111201, "auxiliary_loss_mlp": 0.01035307, "balance_loss_clip": 1.02187824, "balance_loss_mlp": 1.03810263, "epoch": 0.5177814519765519, "flos": 20960197067520.0, "grad_norm": 2.3357135833700635, "language_loss": 0.77290332, "learning_rate": 1.9818902792915373e-06, "loss": 0.79437649, "num_input_tokens_seen": 185132165, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73828125, "step": 8612, "time_per_iteration": 2.5295281410217285 }, { "auxiliary_loss_clip": 0.01113566, "auxiliary_loss_mlp": 0.01034589, "balance_loss_clip": 1.02068353, "balance_loss_mlp": 1.03892779, "epoch": 0.5178415752292199, "flos": 17967832796160.0, "grad_norm": 1.9928068518433646, "language_loss": 0.82048714, "learning_rate": 1.981500833922294e-06, "loss": 0.84196872, "num_input_tokens_seen": 185151025, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.74609375, "step": 8613, "time_per_iteration": 2.4714698791503906 }, { "auxiliary_loss_clip": 0.0111543, "auxiliary_loss_mlp": 0.01033562, "balance_loss_clip": 1.01916134, "balance_loss_mlp": 1.04172051, "epoch": 0.5179016984818878, "flos": 17821496787840.0, "grad_norm": 2.542284899426558, "language_loss": 0.66005737, "learning_rate": 1.981111389254541e-06, "loss": 0.68154728, "num_input_tokens_seen": 185168455, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.734375, "step": 8614, "time_per_iteration": 2.4702439308166504 }, { "auxiliary_loss_clip": 0.0111582, "auxiliary_loss_mlp": 0.01030773, "balance_loss_clip": 1.01686716, "balance_loss_mlp": 1.04030919, "epoch": 0.5179618217345558, "flos": 17820455293440.0, "grad_norm": 2.3399674992875075, "language_loss": 0.86657113, "learning_rate": 1.9807219453030453e-06, "loss": 0.88803709, "num_input_tokens_seen": 185184415, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75390625, "step": 8615, "time_per_iteration": 2.4673283100128174 }, { "auxiliary_loss_clip": 0.01112404, "auxiliary_loss_mlp": 0.01035479, "balance_loss_clip": 1.02249098, "balance_loss_mlp": 1.04012871, "epoch": 0.5180219449872238, "flos": 22522131048960.0, "grad_norm": 1.5536284164893202, "language_loss": 0.80930519, "learning_rate": 1.9803325020825763e-06, "loss": 0.83078408, "num_input_tokens_seen": 185202910, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 8616, "time_per_iteration": 2.4751172065734863 }, { "auxiliary_loss_clip": 0.01122524, "auxiliary_loss_mlp": 0.01042947, "balance_loss_clip": 1.02890968, "balance_loss_mlp": 1.0457412, "epoch": 0.5180820682398918, "flos": 23915465568000.0, "grad_norm": 2.21114838737362, "language_loss": 0.74992156, "learning_rate": 1.9799430596079e-06, "loss": 0.77157623, "num_input_tokens_seen": 185223085, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76953125, "step": 8617, "time_per_iteration": 2.5315589904785156 }, { "auxiliary_loss_clip": 0.0111554, "auxiliary_loss_mlp": 0.01037083, "balance_loss_clip": 1.02307618, "balance_loss_mlp": 1.04060531, "epoch": 0.5181421914925598, "flos": 16979930064000.0, "grad_norm": 2.066119253885327, "language_loss": 0.70148408, "learning_rate": 1.979553617893785e-06, "loss": 0.7230103, "num_input_tokens_seen": 185241295, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 8618, "time_per_iteration": 2.452545404434204 }, { "auxiliary_loss_clip": 0.01046536, "auxiliary_loss_mlp": 0.01005528, "balance_loss_clip": 1.0041151, "balance_loss_mlp": 1.021631, "epoch": 0.5182023147452277, "flos": 66059870872320.0, "grad_norm": 0.9394493580447819, "language_loss": 0.67270136, "learning_rate": 1.979164176954999e-06, "loss": 0.69322205, "num_input_tokens_seen": 185298295, "router_z_loss_clip": 0.01409912, "router_z_loss_mlp": 0.24902344, "step": 8619, "time_per_iteration": 3.066267490386963 }, { "auxiliary_loss_clip": 0.01110508, "auxiliary_loss_mlp": 0.01030581, "balance_loss_clip": 1.01778352, "balance_loss_mlp": 1.03952432, "epoch": 0.5182624379978957, "flos": 18187749815040.0, "grad_norm": 2.0791178655322837, "language_loss": 0.79467434, "learning_rate": 1.97877473680631e-06, "loss": 0.81608522, "num_input_tokens_seen": 185317000, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 8620, "time_per_iteration": 2.4609601497650146 }, { "auxiliary_loss_clip": 0.01113916, "auxiliary_loss_mlp": 0.0103596, "balance_loss_clip": 1.02275157, "balance_loss_mlp": 1.04217935, "epoch": 0.5183225612505636, "flos": 14026708638720.0, "grad_norm": 2.3879642570890725, "language_loss": 0.82007229, "learning_rate": 1.9783852974624846e-06, "loss": 0.84157109, "num_input_tokens_seen": 185331185, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 8621, "time_per_iteration": 2.454045295715332 }, { "auxiliary_loss_clip": 0.01115088, "auxiliary_loss_mlp": 0.01035101, "balance_loss_clip": 1.02262545, "balance_loss_mlp": 1.0413084, "epoch": 0.5183826845032317, "flos": 23659781581440.0, "grad_norm": 4.050287160567147, "language_loss": 0.6537112, "learning_rate": 1.9779958589382905e-06, "loss": 0.6752131, "num_input_tokens_seen": 185348955, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.73828125, "step": 8622, "time_per_iteration": 2.519860029220581 }, { "auxiliary_loss_clip": 0.01118362, "auxiliary_loss_mlp": 0.01038326, "balance_loss_clip": 1.02422404, "balance_loss_mlp": 1.04252625, "epoch": 0.5184428077558996, "flos": 15888605097600.0, "grad_norm": 2.787073337462933, "language_loss": 0.6087606, "learning_rate": 1.977606421248497e-06, "loss": 0.63032746, "num_input_tokens_seen": 185367330, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 8623, "time_per_iteration": 2.468337297439575 }, { "auxiliary_loss_clip": 0.01114875, "auxiliary_loss_mlp": 0.01029594, "balance_loss_clip": 1.01652241, "balance_loss_mlp": 1.04062712, "epoch": 0.5185029310085676, "flos": 21030833162880.0, "grad_norm": 1.8844440273891336, "language_loss": 0.76082057, "learning_rate": 1.9772169844078685e-06, "loss": 0.78226525, "num_input_tokens_seen": 185385060, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 8624, "time_per_iteration": 2.501816511154175 }, { "auxiliary_loss_clip": 0.01114383, "auxiliary_loss_mlp": 0.01034456, "balance_loss_clip": 1.02183735, "balance_loss_mlp": 1.04055619, "epoch": 0.5185630542612355, "flos": 26542690133760.0, "grad_norm": 1.7687597021274004, "language_loss": 0.71718556, "learning_rate": 1.9768275484311756e-06, "loss": 0.73867393, "num_input_tokens_seen": 185403745, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.73828125, "step": 8625, "time_per_iteration": 2.529489278793335 }, { "auxiliary_loss_clip": 0.01116489, "auxiliary_loss_mlp": 0.0103108, "balance_loss_clip": 1.01844335, "balance_loss_mlp": 1.04239535, "epoch": 0.5186231775139035, "flos": 20668422890880.0, "grad_norm": 1.9599175841743905, "language_loss": 0.67778027, "learning_rate": 1.976438113333184e-06, "loss": 0.69925594, "num_input_tokens_seen": 185422620, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7421875, "step": 8626, "time_per_iteration": 2.4955639839172363 }, { "auxiliary_loss_clip": 0.0111488, "auxiliary_loss_mlp": 0.01031928, "balance_loss_clip": 1.01860619, "balance_loss_mlp": 1.04246712, "epoch": 0.5186833007665714, "flos": 20885502735360.0, "grad_norm": 1.9971518144323805, "language_loss": 0.70642698, "learning_rate": 1.9760486791286612e-06, "loss": 0.72789502, "num_input_tokens_seen": 185439380, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 8627, "time_per_iteration": 2.480792284011841 }, { "auxiliary_loss_clip": 0.01119146, "auxiliary_loss_mlp": 0.0104007, "balance_loss_clip": 1.02696896, "balance_loss_mlp": 1.04334164, "epoch": 0.5187434240192395, "flos": 20886903365760.0, "grad_norm": 1.934686597018356, "language_loss": 0.73088378, "learning_rate": 1.9756592458323753e-06, "loss": 0.75247592, "num_input_tokens_seen": 185458830, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7578125, "step": 8628, "time_per_iteration": 2.5016868114471436 }, { "auxiliary_loss_clip": 0.01115942, "auxiliary_loss_mlp": 0.01032858, "balance_loss_clip": 1.02000701, "balance_loss_mlp": 1.04312491, "epoch": 0.5188035472719074, "flos": 19859929614720.0, "grad_norm": 2.1102032845852596, "language_loss": 0.77175689, "learning_rate": 1.9752698134590927e-06, "loss": 0.79324496, "num_input_tokens_seen": 185477270, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 8629, "time_per_iteration": 2.488492488861084 }, { "auxiliary_loss_clip": 0.01120496, "auxiliary_loss_mlp": 0.01032538, "balance_loss_clip": 1.0187571, "balance_loss_mlp": 1.04541373, "epoch": 0.5188636705245754, "flos": 21138313633920.0, "grad_norm": 2.1108709101403353, "language_loss": 0.75011933, "learning_rate": 1.9748803820235815e-06, "loss": 0.77164972, "num_input_tokens_seen": 185495795, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 8630, "time_per_iteration": 2.4853804111480713 }, { "auxiliary_loss_clip": 0.01115027, "auxiliary_loss_mlp": 0.01034846, "balance_loss_clip": 1.0206778, "balance_loss_mlp": 1.03999233, "epoch": 0.5189237937772434, "flos": 22419786222720.0, "grad_norm": 1.7044172710819334, "language_loss": 0.80009282, "learning_rate": 1.9744909515406093e-06, "loss": 0.82159156, "num_input_tokens_seen": 185514885, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75, "step": 8631, "time_per_iteration": 2.5085904598236084 }, { "auxiliary_loss_clip": 0.01115865, "auxiliary_loss_mlp": 0.01034195, "balance_loss_clip": 1.0205338, "balance_loss_mlp": 1.04077744, "epoch": 0.5189839170299113, "flos": 25446696399360.0, "grad_norm": 1.6165746816782243, "language_loss": 0.74606025, "learning_rate": 1.974101522024942e-06, "loss": 0.76756084, "num_input_tokens_seen": 185537155, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 8632, "time_per_iteration": 3.981816291809082 }, { "auxiliary_loss_clip": 0.01110376, "auxiliary_loss_mlp": 0.01033655, "balance_loss_clip": 1.0209173, "balance_loss_mlp": 1.0399437, "epoch": 0.5190440402825793, "flos": 18587722734720.0, "grad_norm": 2.196830239491887, "language_loss": 0.7880497, "learning_rate": 1.9737120934913477e-06, "loss": 0.80949003, "num_input_tokens_seen": 185555520, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 8633, "time_per_iteration": 4.004780292510986 }, { "auxiliary_loss_clip": 0.0111577, "auxiliary_loss_mlp": 0.01031687, "balance_loss_clip": 1.01881289, "balance_loss_mlp": 1.04120827, "epoch": 0.5191041635352472, "flos": 21908633731200.0, "grad_norm": 1.5743224915107734, "language_loss": 0.80770689, "learning_rate": 1.9733226659545936e-06, "loss": 0.82918143, "num_input_tokens_seen": 185573855, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.74609375, "step": 8634, "time_per_iteration": 2.518416166305542 }, { "auxiliary_loss_clip": 0.01111633, "auxiliary_loss_mlp": 0.01037575, "balance_loss_clip": 1.02449203, "balance_loss_mlp": 1.04002929, "epoch": 0.5191642867879153, "flos": 27527971173120.0, "grad_norm": 1.6166776160111922, "language_loss": 0.68903708, "learning_rate": 1.9729332394294467e-06, "loss": 0.71052921, "num_input_tokens_seen": 185595145, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 8635, "time_per_iteration": 3.9476163387298584 }, { "auxiliary_loss_clip": 0.01116572, "auxiliary_loss_mlp": 0.01033897, "balance_loss_clip": 1.02066457, "balance_loss_mlp": 1.04087138, "epoch": 0.5192244100405832, "flos": 15705999331200.0, "grad_norm": 5.611013378131567, "language_loss": 0.7766217, "learning_rate": 1.9725438139306742e-06, "loss": 0.79812634, "num_input_tokens_seen": 185613320, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7578125, "step": 8636, "time_per_iteration": 3.8288984298706055 }, { "auxiliary_loss_clip": 0.01116229, "auxiliary_loss_mlp": 0.01033984, "balance_loss_clip": 1.0206269, "balance_loss_mlp": 1.04104793, "epoch": 0.5192845332932512, "flos": 12057080313600.0, "grad_norm": 1.9810581737091888, "language_loss": 0.72032344, "learning_rate": 1.9721543894730425e-06, "loss": 0.74182558, "num_input_tokens_seen": 185630730, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 8637, "time_per_iteration": 2.464534044265747 }, { "auxiliary_loss_clip": 0.011118, "auxiliary_loss_mlp": 0.01032613, "balance_loss_clip": 1.01968503, "balance_loss_mlp": 1.04020882, "epoch": 0.5193446565459191, "flos": 18953185662720.0, "grad_norm": 2.2371476071225636, "language_loss": 0.75426114, "learning_rate": 1.9717649660713194e-06, "loss": 0.77570534, "num_input_tokens_seen": 185648515, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 8638, "time_per_iteration": 2.467827558517456 }, { "auxiliary_loss_clip": 0.011115, "auxiliary_loss_mlp": 0.01036218, "balance_loss_clip": 1.02368951, "balance_loss_mlp": 1.03910339, "epoch": 0.5194047797985871, "flos": 20374960775040.0, "grad_norm": 1.783952458931889, "language_loss": 0.75031573, "learning_rate": 1.971375543740272e-06, "loss": 0.77179289, "num_input_tokens_seen": 185665220, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 8639, "time_per_iteration": 2.4666168689727783 }, { "auxiliary_loss_clip": 0.01111459, "auxiliary_loss_mlp": 0.01028774, "balance_loss_clip": 1.01521349, "balance_loss_mlp": 1.03957212, "epoch": 0.519464903051255, "flos": 24353001135360.0, "grad_norm": 5.79931480121692, "language_loss": 0.77845693, "learning_rate": 1.9709861224946665e-06, "loss": 0.79985923, "num_input_tokens_seen": 185683750, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 8640, "time_per_iteration": 2.5187621116638184 }, { "auxiliary_loss_clip": 0.01113233, "auxiliary_loss_mlp": 0.01035207, "balance_loss_clip": 1.02214193, "balance_loss_mlp": 1.04155326, "epoch": 0.519525026303923, "flos": 14061829161600.0, "grad_norm": 2.1180302160753093, "language_loss": 0.66043657, "learning_rate": 1.97059670234927e-06, "loss": 0.68192095, "num_input_tokens_seen": 185700625, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 8641, "time_per_iteration": 2.475471019744873 }, { "auxiliary_loss_clip": 0.01112756, "auxiliary_loss_mlp": 0.01032307, "balance_loss_clip": 1.02012372, "balance_loss_mlp": 1.04102838, "epoch": 0.519585149556591, "flos": 28835873193600.0, "grad_norm": 1.7594417893103864, "language_loss": 0.76382738, "learning_rate": 1.97020728331885e-06, "loss": 0.78527808, "num_input_tokens_seen": 185721155, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71875, "step": 8642, "time_per_iteration": 2.5420658588409424 }, { "auxiliary_loss_clip": 0.01111556, "auxiliary_loss_mlp": 0.0102896, "balance_loss_clip": 1.01567972, "balance_loss_mlp": 1.03997803, "epoch": 0.519645272809259, "flos": 25373007648000.0, "grad_norm": 3.5990482584278034, "language_loss": 0.83225536, "learning_rate": 1.9698178654181726e-06, "loss": 0.85366058, "num_input_tokens_seen": 185740990, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 8643, "time_per_iteration": 2.5590665340423584 }, { "auxiliary_loss_clip": 0.01114925, "auxiliary_loss_mlp": 0.01036688, "balance_loss_clip": 1.02342021, "balance_loss_mlp": 1.03941131, "epoch": 0.519705396061927, "flos": 25372863993600.0, "grad_norm": 1.931455838370658, "language_loss": 0.70331049, "learning_rate": 1.969428448662004e-06, "loss": 0.72482663, "num_input_tokens_seen": 185762235, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7578125, "step": 8644, "time_per_iteration": 2.5478339195251465 }, { "auxiliary_loss_clip": 0.01110973, "auxiliary_loss_mlp": 0.01035367, "balance_loss_clip": 1.02252197, "balance_loss_mlp": 1.03802335, "epoch": 0.5197655193145949, "flos": 28476228268800.0, "grad_norm": 3.335735016965064, "language_loss": 0.8028186, "learning_rate": 1.9690390330651133e-06, "loss": 0.82428205, "num_input_tokens_seen": 185783415, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73046875, "step": 8645, "time_per_iteration": 2.5594382286071777 }, { "auxiliary_loss_clip": 0.01110591, "auxiliary_loss_mlp": 0.01029847, "balance_loss_clip": 1.01705003, "balance_loss_mlp": 1.03734589, "epoch": 0.5198256425672629, "flos": 20009138711040.0, "grad_norm": 2.03554137378436, "language_loss": 0.78123313, "learning_rate": 1.968649618642264e-06, "loss": 0.80263758, "num_input_tokens_seen": 185801345, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 8646, "time_per_iteration": 2.494795322418213 }, { "auxiliary_loss_clip": 0.01113713, "auxiliary_loss_mlp": 0.01036024, "balance_loss_clip": 1.02345371, "balance_loss_mlp": 1.04086947, "epoch": 0.5198857658199308, "flos": 19828867328640.0, "grad_norm": 2.434339148052994, "language_loss": 0.66306651, "learning_rate": 1.9682602054082252e-06, "loss": 0.68456388, "num_input_tokens_seen": 185820815, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 8647, "time_per_iteration": 2.4657015800476074 }, { "auxiliary_loss_clip": 0.01116115, "auxiliary_loss_mlp": 0.01031843, "balance_loss_clip": 1.01683426, "balance_loss_mlp": 1.04061842, "epoch": 0.5199458890725989, "flos": 24461918150400.0, "grad_norm": 2.49449834196274, "language_loss": 0.71791565, "learning_rate": 1.967870793377763e-06, "loss": 0.73939526, "num_input_tokens_seen": 185841450, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.75390625, "step": 8648, "time_per_iteration": 2.5414276123046875 }, { "auxiliary_loss_clip": 0.01115754, "auxiliary_loss_mlp": 0.01032219, "balance_loss_clip": 1.01766372, "balance_loss_mlp": 1.04177654, "epoch": 0.5200060123252668, "flos": 23404779953280.0, "grad_norm": 1.920449096945046, "language_loss": 0.64702451, "learning_rate": 1.967481382565642e-06, "loss": 0.66850424, "num_input_tokens_seen": 185859935, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.73828125, "step": 8649, "time_per_iteration": 2.5011398792266846 }, { "auxiliary_loss_clip": 0.01117831, "auxiliary_loss_mlp": 0.01031825, "balance_loss_clip": 1.01701927, "balance_loss_mlp": 1.04094696, "epoch": 0.5200661355779348, "flos": 17201355454080.0, "grad_norm": 2.1290766521556623, "language_loss": 0.70549703, "learning_rate": 1.9670919729866315e-06, "loss": 0.72699362, "num_input_tokens_seen": 185876795, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.76953125, "step": 8650, "time_per_iteration": 2.4539802074432373 }, { "auxiliary_loss_clip": 0.01110575, "auxiliary_loss_mlp": 0.01030863, "balance_loss_clip": 1.01748216, "balance_loss_mlp": 1.03816032, "epoch": 0.5201262588306027, "flos": 18515075477760.0, "grad_norm": 1.6684338628441373, "language_loss": 0.77508974, "learning_rate": 1.966702564655496e-06, "loss": 0.79650408, "num_input_tokens_seen": 185895570, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.72265625, "step": 8651, "time_per_iteration": 2.4755699634552 }, { "auxiliary_loss_clip": 0.01116971, "auxiliary_loss_mlp": 0.01037857, "balance_loss_clip": 1.02406454, "balance_loss_mlp": 1.04258704, "epoch": 0.5201863820832707, "flos": 18619395552000.0, "grad_norm": 2.0913801296244543, "language_loss": 0.78447008, "learning_rate": 1.966313157587003e-06, "loss": 0.80601841, "num_input_tokens_seen": 185913700, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 8652, "time_per_iteration": 2.5175511837005615 }, { "auxiliary_loss_clip": 0.01114033, "auxiliary_loss_mlp": 0.01033793, "balance_loss_clip": 1.01938701, "balance_loss_mlp": 1.04050112, "epoch": 0.5202465053359386, "flos": 22857142222080.0, "grad_norm": 1.9910956356566467, "language_loss": 0.70192873, "learning_rate": 1.9659237517959187e-06, "loss": 0.72340697, "num_input_tokens_seen": 185932460, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.734375, "step": 8653, "time_per_iteration": 2.496860980987549 }, { "auxiliary_loss_clip": 0.01115452, "auxiliary_loss_mlp": 0.0103977, "balance_loss_clip": 1.02582824, "balance_loss_mlp": 1.03962183, "epoch": 0.5203066285886067, "flos": 21981532383360.0, "grad_norm": 2.7154950216661473, "language_loss": 0.78230941, "learning_rate": 1.965534347297008e-06, "loss": 0.80386162, "num_input_tokens_seen": 185952030, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 8654, "time_per_iteration": 2.509542942047119 }, { "auxiliary_loss_clip": 0.01119134, "auxiliary_loss_mlp": 0.01042645, "balance_loss_clip": 1.02877486, "balance_loss_mlp": 1.04115188, "epoch": 0.5203667518412746, "flos": 20233329448320.0, "grad_norm": 2.173132816894377, "language_loss": 0.84196115, "learning_rate": 1.9651449441050393e-06, "loss": 0.86357886, "num_input_tokens_seen": 185973130, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.78125, "step": 8655, "time_per_iteration": 2.532470464706421 }, { "auxiliary_loss_clip": 0.01111088, "auxiliary_loss_mlp": 0.01031465, "balance_loss_clip": 1.0192517, "balance_loss_mlp": 1.03994668, "epoch": 0.5204268750939426, "flos": 15705460627200.0, "grad_norm": 2.090544961330516, "language_loss": 0.65839458, "learning_rate": 1.9647555422347777e-06, "loss": 0.67982012, "num_input_tokens_seen": 185990200, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 8656, "time_per_iteration": 2.4857122898101807 }, { "auxiliary_loss_clip": 0.01114371, "auxiliary_loss_mlp": 0.01034893, "balance_loss_clip": 1.02203608, "balance_loss_mlp": 1.04082298, "epoch": 0.5204869983466105, "flos": 27449469999360.0, "grad_norm": 1.7735811839908238, "language_loss": 0.73116028, "learning_rate": 1.9643661417009893e-06, "loss": 0.75265288, "num_input_tokens_seen": 186009880, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 8657, "time_per_iteration": 2.5244054794311523 }, { "auxiliary_loss_clip": 0.01112625, "auxiliary_loss_mlp": 0.01033837, "balance_loss_clip": 1.01966918, "balance_loss_mlp": 1.0401032, "epoch": 0.5205471215992785, "flos": 20595452411520.0, "grad_norm": 1.7861857510409676, "language_loss": 0.71496522, "learning_rate": 1.9639767425184408e-06, "loss": 0.73642987, "num_input_tokens_seen": 186026680, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7265625, "step": 8658, "time_per_iteration": 2.5008857250213623 }, { "auxiliary_loss_clip": 0.01111643, "auxiliary_loss_mlp": 0.01033563, "balance_loss_clip": 1.02022338, "balance_loss_mlp": 1.03807068, "epoch": 0.5206072448519465, "flos": 22127904305280.0, "grad_norm": 1.9204258463734276, "language_loss": 0.83331013, "learning_rate": 1.963587344701897e-06, "loss": 0.8547622, "num_input_tokens_seen": 186046920, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 8659, "time_per_iteration": 2.4873337745666504 }, { "auxiliary_loss_clip": 0.01118052, "auxiliary_loss_mlp": 0.0104001, "balance_loss_clip": 1.02495408, "balance_loss_mlp": 1.03946841, "epoch": 0.5206673681046144, "flos": 18330422636160.0, "grad_norm": 2.1161720665726875, "language_loss": 0.75998443, "learning_rate": 1.9631979482661253e-06, "loss": 0.78156507, "num_input_tokens_seen": 186062090, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.78125, "step": 8660, "time_per_iteration": 2.4625303745269775 }, { "auxiliary_loss_clip": 0.01112113, "auxiliary_loss_mlp": 0.01033916, "balance_loss_clip": 1.02139878, "balance_loss_mlp": 1.03970075, "epoch": 0.5207274913572825, "flos": 20230240878720.0, "grad_norm": 1.8108984581594672, "language_loss": 0.77431887, "learning_rate": 1.9628085532258906e-06, "loss": 0.79577911, "num_input_tokens_seen": 186081135, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 8661, "time_per_iteration": 2.47027587890625 }, { "auxiliary_loss_clip": 0.01112691, "auxiliary_loss_mlp": 0.01033392, "balance_loss_clip": 1.02079725, "balance_loss_mlp": 1.03764462, "epoch": 0.5207876146099504, "flos": 22127042378880.0, "grad_norm": 1.7440556460205952, "language_loss": 0.70474708, "learning_rate": 1.9624191595959603e-06, "loss": 0.72620797, "num_input_tokens_seen": 186099700, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.75, "step": 8662, "time_per_iteration": 2.522346019744873 }, { "auxiliary_loss_clip": 0.01110788, "auxiliary_loss_mlp": 0.01033589, "balance_loss_clip": 1.01965404, "balance_loss_mlp": 1.03881013, "epoch": 0.5208477378626184, "flos": 23878908501120.0, "grad_norm": 1.7989981299212767, "language_loss": 0.69608915, "learning_rate": 1.962029767391098e-06, "loss": 0.71753287, "num_input_tokens_seen": 186119740, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 8663, "time_per_iteration": 2.493913173675537 }, { "auxiliary_loss_clip": 0.01112446, "auxiliary_loss_mlp": 0.01031251, "balance_loss_clip": 1.01802468, "balance_loss_mlp": 1.03954804, "epoch": 0.5209078611152863, "flos": 20961525870720.0, "grad_norm": 1.594897106934998, "language_loss": 0.76893675, "learning_rate": 1.961640376626072e-06, "loss": 0.79037368, "num_input_tokens_seen": 186140645, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 8664, "time_per_iteration": 2.508207321166992 }, { "auxiliary_loss_clip": 0.01112344, "auxiliary_loss_mlp": 0.01035451, "balance_loss_clip": 1.02204585, "balance_loss_mlp": 1.03873527, "epoch": 0.5209679843679543, "flos": 20667740532480.0, "grad_norm": 1.9458458494083286, "language_loss": 0.75856805, "learning_rate": 1.961250987315646e-06, "loss": 0.78004599, "num_input_tokens_seen": 186160130, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 8665, "time_per_iteration": 2.4868359565734863 }, { "auxiliary_loss_clip": 0.01113251, "auxiliary_loss_mlp": 0.01029745, "balance_loss_clip": 1.01718616, "balance_loss_mlp": 1.04065442, "epoch": 0.5210281076206222, "flos": 20227295963520.0, "grad_norm": 1.7016501806363737, "language_loss": 0.72146082, "learning_rate": 1.960861599474586e-06, "loss": 0.74289072, "num_input_tokens_seen": 186179485, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 8666, "time_per_iteration": 2.486192226409912 }, { "auxiliary_loss_clip": 0.01119885, "auxiliary_loss_mlp": 0.01037435, "balance_loss_clip": 1.02150846, "balance_loss_mlp": 1.04017651, "epoch": 0.5210882308732903, "flos": 16069989801600.0, "grad_norm": 2.148748944603456, "language_loss": 0.68348712, "learning_rate": 1.9604722131176592e-06, "loss": 0.7050603, "num_input_tokens_seen": 186197140, "router_z_loss_clip": 0.15917969, "router_z_loss_mlp": 0.796875, "step": 8667, "time_per_iteration": 2.4650614261627197 }, { "auxiliary_loss_clip": 0.01108858, "auxiliary_loss_mlp": 0.0103427, "balance_loss_clip": 1.02112114, "balance_loss_mlp": 1.0377419, "epoch": 0.5211483541259582, "flos": 24825298089600.0, "grad_norm": 1.4998527329990143, "language_loss": 0.81279534, "learning_rate": 1.960082828259629e-06, "loss": 0.83422667, "num_input_tokens_seen": 186216800, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 8668, "time_per_iteration": 2.552173376083374 }, { "auxiliary_loss_clip": 0.01113378, "auxiliary_loss_mlp": 0.01028863, "balance_loss_clip": 1.01597095, "balance_loss_mlp": 1.03969193, "epoch": 0.5212084773786262, "flos": 20370651143040.0, "grad_norm": 2.1941738643110624, "language_loss": 0.63784903, "learning_rate": 1.9596934449152623e-06, "loss": 0.65927148, "num_input_tokens_seen": 186235320, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 8669, "time_per_iteration": 2.5028152465820312 }, { "auxiliary_loss_clip": 0.01114845, "auxiliary_loss_mlp": 0.01035098, "balance_loss_clip": 1.02160943, "balance_loss_mlp": 1.04224825, "epoch": 0.5212686006312941, "flos": 23145468693120.0, "grad_norm": 1.5773598935539628, "language_loss": 0.66609871, "learning_rate": 1.959304063099325e-06, "loss": 0.68759817, "num_input_tokens_seen": 186254460, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 8670, "time_per_iteration": 2.503096103668213 }, { "auxiliary_loss_clip": 0.01106894, "auxiliary_loss_mlp": 0.01032797, "balance_loss_clip": 1.0199585, "balance_loss_mlp": 1.03701425, "epoch": 0.5213287238839621, "flos": 27774030314880.0, "grad_norm": 2.79127736644402, "language_loss": 0.76119232, "learning_rate": 1.9589146828265806e-06, "loss": 0.7825892, "num_input_tokens_seen": 186269465, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 8671, "time_per_iteration": 2.5015687942504883 }, { "auxiliary_loss_clip": 0.01117997, "auxiliary_loss_mlp": 0.01036778, "balance_loss_clip": 1.02242565, "balance_loss_mlp": 1.0424943, "epoch": 0.5213888471366301, "flos": 19937676602880.0, "grad_norm": 3.791939686191562, "language_loss": 0.78204811, "learning_rate": 1.958525304111796e-06, "loss": 0.8035959, "num_input_tokens_seen": 186288660, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75390625, "step": 8672, "time_per_iteration": 2.515366554260254 }, { "auxiliary_loss_clip": 0.01108703, "auxiliary_loss_mlp": 0.01032257, "balance_loss_clip": 1.02003241, "balance_loss_mlp": 1.03714275, "epoch": 0.521448970389298, "flos": 16982731324800.0, "grad_norm": 2.000124830100279, "language_loss": 0.72162151, "learning_rate": 1.958135926969736e-06, "loss": 0.74303108, "num_input_tokens_seen": 186305760, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.71875, "step": 8673, "time_per_iteration": 2.478813648223877 }, { "auxiliary_loss_clip": 0.01110716, "auxiliary_loss_mlp": 0.01030856, "balance_loss_clip": 1.01774323, "balance_loss_mlp": 1.03815532, "epoch": 0.5215090936419661, "flos": 18989706816000.0, "grad_norm": 1.532355807954942, "language_loss": 0.74984658, "learning_rate": 1.957746551415166e-06, "loss": 0.77126223, "num_input_tokens_seen": 186324135, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 8674, "time_per_iteration": 3.8649098873138428 }, { "auxiliary_loss_clip": 0.01113484, "auxiliary_loss_mlp": 0.01034303, "balance_loss_clip": 1.01978338, "balance_loss_mlp": 1.03783238, "epoch": 0.521569216894634, "flos": 16143427157760.0, "grad_norm": 2.2703713472567286, "language_loss": 0.85901535, "learning_rate": 1.9573571774628506e-06, "loss": 0.88049322, "num_input_tokens_seen": 186340205, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 8675, "time_per_iteration": 3.9038851261138916 }, { "auxiliary_loss_clip": 0.01042705, "auxiliary_loss_mlp": 0.01005419, "balance_loss_clip": 1.00417888, "balance_loss_mlp": 1.01778984, "epoch": 0.521629340147302, "flos": 57579493282560.0, "grad_norm": 1.0366626193983552, "language_loss": 0.63115907, "learning_rate": 1.9569678051275556e-06, "loss": 0.6516403, "num_input_tokens_seen": 186396940, "router_z_loss_clip": 0.01239014, "router_z_loss_mlp": 0.24902344, "step": 8676, "time_per_iteration": 4.480421543121338 }, { "auxiliary_loss_clip": 0.01110281, "auxiliary_loss_mlp": 0.01028225, "balance_loss_clip": 1.01597619, "balance_loss_mlp": 1.03880572, "epoch": 0.5216894633999699, "flos": 26796901662720.0, "grad_norm": 1.6928616348868575, "language_loss": 0.68614173, "learning_rate": 1.956578434424046e-06, "loss": 0.7075268, "num_input_tokens_seen": 186418680, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.71484375, "step": 8677, "time_per_iteration": 3.9779155254364014 }, { "auxiliary_loss_clip": 0.01110604, "auxiliary_loss_mlp": 0.01029971, "balance_loss_clip": 1.01695299, "balance_loss_mlp": 1.03831673, "epoch": 0.5217495866526379, "flos": 26358719650560.0, "grad_norm": 1.720182197245193, "language_loss": 0.65139067, "learning_rate": 1.956189065367086e-06, "loss": 0.67279643, "num_input_tokens_seen": 186438265, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 8678, "time_per_iteration": 2.570556640625 }, { "auxiliary_loss_clip": 0.0111583, "auxiliary_loss_mlp": 0.0103165, "balance_loss_clip": 1.01750612, "balance_loss_mlp": 1.03983879, "epoch": 0.5218097099053058, "flos": 23584009841280.0, "grad_norm": 4.681085533072197, "language_loss": 0.68454838, "learning_rate": 1.9557996979714414e-06, "loss": 0.70602322, "num_input_tokens_seen": 186456870, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 8679, "time_per_iteration": 2.4985806941986084 }, { "auxiliary_loss_clip": 0.0111448, "auxiliary_loss_mlp": 0.0103207, "balance_loss_clip": 1.01898098, "balance_loss_mlp": 1.04052782, "epoch": 0.5218698331579739, "flos": 18077396256000.0, "grad_norm": 1.9051905298448104, "language_loss": 0.67116237, "learning_rate": 1.9554103322518764e-06, "loss": 0.69262791, "num_input_tokens_seen": 186476425, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 8680, "time_per_iteration": 2.50504469871521 }, { "auxiliary_loss_clip": 0.0111238, "auxiliary_loss_mlp": 0.01034744, "balance_loss_clip": 1.02112412, "balance_loss_mlp": 1.03856623, "epoch": 0.5219299564106418, "flos": 19281121856640.0, "grad_norm": 4.413352805734703, "language_loss": 0.82986867, "learning_rate": 1.955020968223156e-06, "loss": 0.85133994, "num_input_tokens_seen": 186492555, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73828125, "step": 8681, "time_per_iteration": 2.441587209701538 }, { "auxiliary_loss_clip": 0.01109289, "auxiliary_loss_mlp": 0.01029208, "balance_loss_clip": 1.01664925, "balance_loss_mlp": 1.03691995, "epoch": 0.5219900796633098, "flos": 26651355753600.0, "grad_norm": 1.832269154432525, "language_loss": 0.77952093, "learning_rate": 1.9546316059000454e-06, "loss": 0.80090594, "num_input_tokens_seen": 186513190, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 8682, "time_per_iteration": 2.5456383228302 }, { "auxiliary_loss_clip": 0.01111384, "auxiliary_loss_mlp": 0.01037974, "balance_loss_clip": 1.02527201, "balance_loss_mlp": 1.03954828, "epoch": 0.5220502029159777, "flos": 34312717382400.0, "grad_norm": 1.7580498953617274, "language_loss": 0.68738151, "learning_rate": 1.9542422452973082e-06, "loss": 0.70887506, "num_input_tokens_seen": 186534830, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 8683, "time_per_iteration": 2.5868868827819824 }, { "auxiliary_loss_clip": 0.01113441, "auxiliary_loss_mlp": 0.01038279, "balance_loss_clip": 1.0244329, "balance_loss_mlp": 1.03942752, "epoch": 0.5221103261686457, "flos": 22156488552960.0, "grad_norm": 1.7393586553049885, "language_loss": 0.75941384, "learning_rate": 1.9538528864297104e-06, "loss": 0.780931, "num_input_tokens_seen": 186554390, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 8684, "time_per_iteration": 2.5170607566833496 }, { "auxiliary_loss_clip": 0.01108097, "auxiliary_loss_mlp": 0.01027254, "balance_loss_clip": 1.01443899, "balance_loss_mlp": 1.0367192, "epoch": 0.5221704494213137, "flos": 19208402772480.0, "grad_norm": 4.7103776101427615, "language_loss": 0.76323897, "learning_rate": 1.9534635293120153e-06, "loss": 0.78459251, "num_input_tokens_seen": 186572360, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 8685, "time_per_iteration": 2.461514711380005 }, { "auxiliary_loss_clip": 0.01114884, "auxiliary_loss_mlp": 0.01038109, "balance_loss_clip": 1.0259552, "balance_loss_mlp": 1.04116178, "epoch": 0.5222305726739817, "flos": 19354056422400.0, "grad_norm": 1.892349492029246, "language_loss": 0.80828547, "learning_rate": 1.9530741739589876e-06, "loss": 0.82981539, "num_input_tokens_seen": 186590655, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.734375, "step": 8686, "time_per_iteration": 2.525926113128662 }, { "auxiliary_loss_clip": 0.01106328, "auxiliary_loss_mlp": 0.01032699, "balance_loss_clip": 1.02080798, "balance_loss_mlp": 1.03712523, "epoch": 0.5222906959266497, "flos": 27814789272960.0, "grad_norm": 2.0522051358645634, "language_loss": 0.70024788, "learning_rate": 1.9526848203853927e-06, "loss": 0.72163808, "num_input_tokens_seen": 186610345, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 8687, "time_per_iteration": 2.56673002243042 }, { "auxiliary_loss_clip": 0.01107157, "auxiliary_loss_mlp": 0.01027509, "balance_loss_clip": 1.01546252, "balance_loss_mlp": 1.03643429, "epoch": 0.5223508191793176, "flos": 12712988615040.0, "grad_norm": 2.341774810714778, "language_loss": 0.83166462, "learning_rate": 1.9522954686059936e-06, "loss": 0.85301131, "num_input_tokens_seen": 186624360, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.70703125, "step": 8688, "time_per_iteration": 2.5014114379882812 }, { "auxiliary_loss_clip": 0.01108435, "auxiliary_loss_mlp": 0.01029883, "balance_loss_clip": 1.01710367, "balance_loss_mlp": 1.0377028, "epoch": 0.5224109424319856, "flos": 15632238752640.0, "grad_norm": 2.628066893562655, "language_loss": 0.73743808, "learning_rate": 1.9519061186355558e-06, "loss": 0.75882125, "num_input_tokens_seen": 186638680, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 8689, "time_per_iteration": 2.474090337753296 }, { "auxiliary_loss_clip": 0.01108202, "auxiliary_loss_mlp": 0.01030381, "balance_loss_clip": 1.01727986, "balance_loss_mlp": 1.03817272, "epoch": 0.5224710656846535, "flos": 15742233175680.0, "grad_norm": 2.1487234976269187, "language_loss": 0.83032513, "learning_rate": 1.9515167704888417e-06, "loss": 0.85171098, "num_input_tokens_seen": 186655840, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 8690, "time_per_iteration": 2.510896682739258 }, { "auxiliary_loss_clip": 0.0111178, "auxiliary_loss_mlp": 0.01034961, "balance_loss_clip": 1.02085233, "balance_loss_mlp": 1.03920555, "epoch": 0.5225311889373215, "flos": 26030998938240.0, "grad_norm": 2.244617347157862, "language_loss": 0.79077721, "learning_rate": 1.9511274241806173e-06, "loss": 0.81224465, "num_input_tokens_seen": 186674150, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7265625, "step": 8691, "time_per_iteration": 2.5440003871917725 }, { "auxiliary_loss_clip": 0.01115741, "auxiliary_loss_mlp": 0.01035199, "balance_loss_clip": 1.02054238, "balance_loss_mlp": 1.04081845, "epoch": 0.5225913121899894, "flos": 18369278173440.0, "grad_norm": 2.6432421586779555, "language_loss": 0.76326358, "learning_rate": 1.950738079725646e-06, "loss": 0.78477305, "num_input_tokens_seen": 186690675, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75, "step": 8692, "time_per_iteration": 2.475457191467285 }, { "auxiliary_loss_clip": 0.0111038, "auxiliary_loss_mlp": 0.01030901, "balance_loss_clip": 1.01888442, "balance_loss_mlp": 1.04007792, "epoch": 0.5226514354426575, "flos": 29273516501760.0, "grad_norm": 2.6262484787946567, "language_loss": 0.7294752, "learning_rate": 1.950348737138691e-06, "loss": 0.75088799, "num_input_tokens_seen": 186710380, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 8693, "time_per_iteration": 2.5489518642425537 }, { "auxiliary_loss_clip": 0.01116799, "auxiliary_loss_mlp": 0.01041592, "balance_loss_clip": 1.0264641, "balance_loss_mlp": 1.04053342, "epoch": 0.5227115586953254, "flos": 22853299466880.0, "grad_norm": 2.410544257096586, "language_loss": 0.82230377, "learning_rate": 1.949959396434517e-06, "loss": 0.84388769, "num_input_tokens_seen": 186729135, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.76171875, "step": 8694, "time_per_iteration": 2.506779909133911 }, { "auxiliary_loss_clip": 0.01043261, "auxiliary_loss_mlp": 0.01003179, "balance_loss_clip": 1.00180233, "balance_loss_mlp": 1.01815772, "epoch": 0.5227716819479934, "flos": 57474419022720.0, "grad_norm": 0.7609704141282304, "language_loss": 0.55636793, "learning_rate": 1.949570057627888e-06, "loss": 0.57683235, "num_input_tokens_seen": 186791115, "router_z_loss_clip": 0.01379395, "router_z_loss_mlp": 0.25, "step": 8695, "time_per_iteration": 3.1304895877838135 }, { "auxiliary_loss_clip": 0.01114393, "auxiliary_loss_mlp": 0.01032015, "balance_loss_clip": 1.01888382, "balance_loss_mlp": 1.04195249, "epoch": 0.5228318052006613, "flos": 13808264077440.0, "grad_norm": 2.8894174484445765, "language_loss": 0.73763275, "learning_rate": 1.9491807207335672e-06, "loss": 0.7590968, "num_input_tokens_seen": 186808660, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 8696, "time_per_iteration": 2.4653878211975098 }, { "auxiliary_loss_clip": 0.01114752, "auxiliary_loss_mlp": 0.01032033, "balance_loss_clip": 1.01912236, "balance_loss_mlp": 1.04193544, "epoch": 0.5228919284533293, "flos": 15596184476160.0, "grad_norm": 4.530240804471715, "language_loss": 0.71645683, "learning_rate": 1.948791385766319e-06, "loss": 0.7379247, "num_input_tokens_seen": 186825900, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 8697, "time_per_iteration": 2.4399731159210205 }, { "auxiliary_loss_clip": 0.01112345, "auxiliary_loss_mlp": 0.01032947, "balance_loss_clip": 1.0206387, "balance_loss_mlp": 1.04168439, "epoch": 0.5229520517059973, "flos": 22491499726080.0, "grad_norm": 1.8434878766969842, "language_loss": 0.8088727, "learning_rate": 1.948402052740906e-06, "loss": 0.8303256, "num_input_tokens_seen": 186843735, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 8698, "time_per_iteration": 2.491896867752075 }, { "auxiliary_loss_clip": 0.01113352, "auxiliary_loss_mlp": 0.01033014, "balance_loss_clip": 1.02077675, "balance_loss_mlp": 1.04127169, "epoch": 0.5230121749586653, "flos": 22090880361600.0, "grad_norm": 1.743584331838792, "language_loss": 0.74331415, "learning_rate": 1.948012721672093e-06, "loss": 0.76477778, "num_input_tokens_seen": 186862440, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71875, "step": 8699, "time_per_iteration": 2.468580722808838 }, { "auxiliary_loss_clip": 0.01117339, "auxiliary_loss_mlp": 0.01031091, "balance_loss_clip": 1.01715565, "balance_loss_mlp": 1.04115653, "epoch": 0.5230722982113333, "flos": 22127150119680.0, "grad_norm": 1.9776382035556137, "language_loss": 0.73452371, "learning_rate": 1.947623392574642e-06, "loss": 0.75600803, "num_input_tokens_seen": 186880940, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.76171875, "step": 8700, "time_per_iteration": 2.5024948120117188 }, { "auxiliary_loss_clip": 0.01118765, "auxiliary_loss_mlp": 0.01034147, "balance_loss_clip": 1.02009273, "balance_loss_mlp": 1.04336643, "epoch": 0.5231324214640012, "flos": 25009268572800.0, "grad_norm": 2.1463569747157227, "language_loss": 0.669779, "learning_rate": 1.947234065463318e-06, "loss": 0.69130814, "num_input_tokens_seen": 186900785, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75390625, "step": 8701, "time_per_iteration": 2.500180721282959 }, { "auxiliary_loss_clip": 0.01112117, "auxiliary_loss_mlp": 0.01035241, "balance_loss_clip": 1.0222528, "balance_loss_mlp": 1.0401628, "epoch": 0.5231925447166692, "flos": 25740517651200.0, "grad_norm": 1.8037942007075478, "language_loss": 0.66527796, "learning_rate": 1.9468447403528826e-06, "loss": 0.6867516, "num_input_tokens_seen": 186920895, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 8702, "time_per_iteration": 2.528421401977539 }, { "auxiliary_loss_clip": 0.01113987, "auxiliary_loss_mlp": 0.01035364, "balance_loss_clip": 1.02201807, "balance_loss_mlp": 1.04197979, "epoch": 0.5232526679693371, "flos": 21433930565760.0, "grad_norm": 2.004876351628407, "language_loss": 0.76252186, "learning_rate": 1.946455417258101e-06, "loss": 0.78401542, "num_input_tokens_seen": 186940605, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 8703, "time_per_iteration": 2.505448341369629 }, { "auxiliary_loss_clip": 0.01120621, "auxiliary_loss_mlp": 0.01044911, "balance_loss_clip": 1.02912807, "balance_loss_mlp": 1.04267275, "epoch": 0.5233127912220051, "flos": 35298393471360.0, "grad_norm": 1.9625178977536983, "language_loss": 0.76784879, "learning_rate": 1.9460660961937348e-06, "loss": 0.78950405, "num_input_tokens_seen": 186960820, "router_z_loss_clip": 0.15820312, "router_z_loss_mlp": 0.78125, "step": 8704, "time_per_iteration": 2.6304900646209717 }, { "auxiliary_loss_clip": 0.01115257, "auxiliary_loss_mlp": 0.01040508, "balance_loss_clip": 1.02777052, "balance_loss_mlp": 1.04428875, "epoch": 0.523372914474673, "flos": 17051320344960.0, "grad_norm": 1.906473850553106, "language_loss": 0.77841878, "learning_rate": 1.9456767771745474e-06, "loss": 0.79997641, "num_input_tokens_seen": 186976240, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 8705, "time_per_iteration": 2.542334794998169 }, { "auxiliary_loss_clip": 0.01117281, "auxiliary_loss_mlp": 0.0103628, "balance_loss_clip": 1.02230906, "balance_loss_mlp": 1.04223514, "epoch": 0.5234330377273411, "flos": 18406302117120.0, "grad_norm": 2.3186133607958697, "language_loss": 0.69390815, "learning_rate": 1.9452874602153027e-06, "loss": 0.71544373, "num_input_tokens_seen": 186992855, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75, "step": 8706, "time_per_iteration": 2.5194759368896484 }, { "auxiliary_loss_clip": 0.01044833, "auxiliary_loss_mlp": 0.01007466, "balance_loss_clip": 1.00611329, "balance_loss_mlp": 1.01959634, "epoch": 0.523493160980009, "flos": 65850296970240.0, "grad_norm": 0.6855286322807183, "language_loss": 0.52486074, "learning_rate": 1.9448981453307623e-06, "loss": 0.54538375, "num_input_tokens_seen": 187051205, "router_z_loss_clip": 0.0135498, "router_z_loss_mlp": 0.25195312, "step": 8707, "time_per_iteration": 3.158616542816162 }, { "auxiliary_loss_clip": 0.01114745, "auxiliary_loss_mlp": 0.01042027, "balance_loss_clip": 1.02929521, "balance_loss_mlp": 1.04142272, "epoch": 0.523553284232677, "flos": 21872076664320.0, "grad_norm": 2.2937676087923253, "language_loss": 0.74377275, "learning_rate": 1.9445088325356904e-06, "loss": 0.76534051, "num_input_tokens_seen": 187070540, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.734375, "step": 8708, "time_per_iteration": 2.510052442550659 }, { "auxiliary_loss_clip": 0.01114947, "auxiliary_loss_mlp": 0.01032165, "balance_loss_clip": 1.0192306, "balance_loss_mlp": 1.04315758, "epoch": 0.5236134074853449, "flos": 20848191482880.0, "grad_norm": 1.822881249465185, "language_loss": 0.77461231, "learning_rate": 1.944119521844849e-06, "loss": 0.79608345, "num_input_tokens_seen": 187089975, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 8709, "time_per_iteration": 2.4965739250183105 }, { "auxiliary_loss_clip": 0.01119693, "auxiliary_loss_mlp": 0.01038, "balance_loss_clip": 1.02280116, "balance_loss_mlp": 1.0411036, "epoch": 0.5236735307380129, "flos": 25520421064320.0, "grad_norm": 2.358017258833778, "language_loss": 0.83312893, "learning_rate": 1.9437302132730003e-06, "loss": 0.85470581, "num_input_tokens_seen": 187108775, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.78515625, "step": 8710, "time_per_iteration": 2.5318691730499268 }, { "auxiliary_loss_clip": 0.01113694, "auxiliary_loss_mlp": 0.01029307, "balance_loss_clip": 1.01643836, "balance_loss_mlp": 1.04175472, "epoch": 0.523733653990681, "flos": 23583112001280.0, "grad_norm": 1.8741998506533648, "language_loss": 0.69521976, "learning_rate": 1.943340906834908e-06, "loss": 0.71664977, "num_input_tokens_seen": 187128830, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 8711, "time_per_iteration": 2.5036427974700928 }, { "auxiliary_loss_clip": 0.01114245, "auxiliary_loss_mlp": 0.01037456, "balance_loss_clip": 1.02391362, "balance_loss_mlp": 1.04013765, "epoch": 0.5237937772433489, "flos": 21106245767040.0, "grad_norm": 1.9200696763868157, "language_loss": 0.82918453, "learning_rate": 1.9429516025453345e-06, "loss": 0.85070151, "num_input_tokens_seen": 187149570, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 8712, "time_per_iteration": 2.51605224609375 }, { "auxiliary_loss_clip": 0.01117579, "auxiliary_loss_mlp": 0.01037639, "balance_loss_clip": 1.02323866, "balance_loss_mlp": 1.04199243, "epoch": 0.5238539004960169, "flos": 19172887200000.0, "grad_norm": 2.1595142298706214, "language_loss": 0.69737262, "learning_rate": 1.9425623004190415e-06, "loss": 0.71892476, "num_input_tokens_seen": 187170575, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 8713, "time_per_iteration": 2.5142645835876465 }, { "auxiliary_loss_clip": 0.01119656, "auxiliary_loss_mlp": 0.01038448, "balance_loss_clip": 1.02373743, "balance_loss_mlp": 1.04145122, "epoch": 0.5239140237486848, "flos": 17888218300800.0, "grad_norm": 3.119805523603748, "language_loss": 0.76779032, "learning_rate": 1.9421730004707925e-06, "loss": 0.78937131, "num_input_tokens_seen": 187187190, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.78125, "step": 8714, "time_per_iteration": 2.47251558303833 }, { "auxiliary_loss_clip": 0.01121236, "auxiliary_loss_mlp": 0.01033404, "balance_loss_clip": 1.01920652, "balance_loss_mlp": 1.04574347, "epoch": 0.5239741470013528, "flos": 17930413802880.0, "grad_norm": 2.0012951251835926, "language_loss": 0.76163757, "learning_rate": 1.9417837027153483e-06, "loss": 0.78318399, "num_input_tokens_seen": 187204350, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75390625, "step": 8715, "time_per_iteration": 3.8292572498321533 }, { "auxiliary_loss_clip": 0.01115592, "auxiliary_loss_mlp": 0.01033928, "balance_loss_clip": 1.02022529, "balance_loss_mlp": 1.043432, "epoch": 0.5240342702540207, "flos": 30993386584320.0, "grad_norm": 1.4474881273392899, "language_loss": 0.71086264, "learning_rate": 1.9413944071674723e-06, "loss": 0.7323578, "num_input_tokens_seen": 187225605, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 8716, "time_per_iteration": 4.037405729293823 }, { "auxiliary_loss_clip": 0.01117921, "auxiliary_loss_mlp": 0.01040038, "balance_loss_clip": 1.02763426, "balance_loss_mlp": 1.04408693, "epoch": 0.5240943935066887, "flos": 25005066681600.0, "grad_norm": 1.9563027355374873, "language_loss": 0.87073863, "learning_rate": 1.941005113841926e-06, "loss": 0.89231825, "num_input_tokens_seen": 187241335, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.73828125, "step": 8717, "time_per_iteration": 3.9686152935028076 }, { "auxiliary_loss_clip": 0.01119008, "auxiliary_loss_mlp": 0.01033645, "balance_loss_clip": 1.02022219, "balance_loss_mlp": 1.04473186, "epoch": 0.5241545167593566, "flos": 23659099223040.0, "grad_norm": 2.3693346460886113, "language_loss": 0.61371934, "learning_rate": 1.9406158227534723e-06, "loss": 0.63524592, "num_input_tokens_seen": 187259925, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 8718, "time_per_iteration": 2.531855344772339 }, { "auxiliary_loss_clip": 0.01122785, "auxiliary_loss_mlp": 0.01039231, "balance_loss_clip": 1.02549815, "balance_loss_mlp": 1.04673886, "epoch": 0.5242146400120247, "flos": 23400398494080.0, "grad_norm": 2.1360162835929546, "language_loss": 0.72110093, "learning_rate": 1.940226533916872e-06, "loss": 0.74272108, "num_input_tokens_seen": 187279035, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.76171875, "step": 8719, "time_per_iteration": 3.901686429977417 }, { "auxiliary_loss_clip": 0.01114623, "auxiliary_loss_mlp": 0.0103108, "balance_loss_clip": 1.01887298, "balance_loss_mlp": 1.04360032, "epoch": 0.5242747632646926, "flos": 17749065012480.0, "grad_norm": 1.7775710689229156, "language_loss": 0.73216599, "learning_rate": 1.9398372473468877e-06, "loss": 0.75362301, "num_input_tokens_seen": 187297555, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 8720, "time_per_iteration": 2.4657199382781982 }, { "auxiliary_loss_clip": 0.0111839, "auxiliary_loss_mlp": 0.01041209, "balance_loss_clip": 1.0273391, "balance_loss_mlp": 1.04485202, "epoch": 0.5243348865173606, "flos": 32597731549440.0, "grad_norm": 1.6443326383114776, "language_loss": 0.70295477, "learning_rate": 1.939447963058281e-06, "loss": 0.72455078, "num_input_tokens_seen": 187320265, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 8721, "time_per_iteration": 2.5863208770751953 }, { "auxiliary_loss_clip": 0.01116601, "auxiliary_loss_mlp": 0.01034041, "balance_loss_clip": 1.02020693, "balance_loss_mlp": 1.04330945, "epoch": 0.5243950097700285, "flos": 25484115392640.0, "grad_norm": 1.6913335675294574, "language_loss": 0.87027287, "learning_rate": 1.939058681065813e-06, "loss": 0.8917793, "num_input_tokens_seen": 187338045, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 8722, "time_per_iteration": 2.518418550491333 }, { "auxiliary_loss_clip": 0.01120509, "auxiliary_loss_mlp": 0.01033775, "balance_loss_clip": 1.02019715, "balance_loss_mlp": 1.04801118, "epoch": 0.5244551330226965, "flos": 15268391936640.0, "grad_norm": 1.7117482413914524, "language_loss": 0.7993077, "learning_rate": 1.938669401384247e-06, "loss": 0.82085055, "num_input_tokens_seen": 187356040, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.72265625, "step": 8723, "time_per_iteration": 2.480367422103882 }, { "auxiliary_loss_clip": 0.01122544, "auxiliary_loss_mlp": 0.01040434, "balance_loss_clip": 1.02605116, "balance_loss_mlp": 1.04683232, "epoch": 0.5245152562753645, "flos": 22237108629120.0, "grad_norm": 7.1326041721493665, "language_loss": 0.74959481, "learning_rate": 1.9382801240283426e-06, "loss": 0.77122462, "num_input_tokens_seen": 187374185, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7578125, "step": 8724, "time_per_iteration": 2.4886441230773926 }, { "auxiliary_loss_clip": 0.0112428, "auxiliary_loss_mlp": 0.01036457, "balance_loss_clip": 1.02086401, "balance_loss_mlp": 1.04554987, "epoch": 0.5245753795280325, "flos": 29426460612480.0, "grad_norm": 2.7172469979357716, "language_loss": 0.70513391, "learning_rate": 1.9378908490128625e-06, "loss": 0.72674125, "num_input_tokens_seen": 187396640, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.7890625, "step": 8725, "time_per_iteration": 2.602552652359009 }, { "auxiliary_loss_clip": 0.01048454, "auxiliary_loss_mlp": 0.01001514, "balance_loss_clip": 1.00015509, "balance_loss_mlp": 1.0234127, "epoch": 0.5246355027807005, "flos": 58834392785280.0, "grad_norm": 0.7554402387810583, "language_loss": 0.55683589, "learning_rate": 1.937501576352568e-06, "loss": 0.5773356, "num_input_tokens_seen": 187455945, "router_z_loss_clip": 0.01361084, "router_z_loss_mlp": 0.25, "step": 8726, "time_per_iteration": 3.1002719402313232 }, { "auxiliary_loss_clip": 0.01050178, "auxiliary_loss_mlp": 0.01001783, "balance_loss_clip": 1.00037086, "balance_loss_mlp": 1.02529407, "epoch": 0.5246956260333684, "flos": 64526592965760.0, "grad_norm": 0.7953807475117021, "language_loss": 0.58392918, "learning_rate": 1.937112306062219e-06, "loss": 0.6044488, "num_input_tokens_seen": 187519975, "router_z_loss_clip": 0.01409912, "router_z_loss_mlp": 0.24902344, "step": 8727, "time_per_iteration": 3.071842670440674 }, { "auxiliary_loss_clip": 0.01123772, "auxiliary_loss_mlp": 0.01039234, "balance_loss_clip": 1.02477407, "balance_loss_mlp": 1.0469842, "epoch": 0.5247557492860364, "flos": 24533631653760.0, "grad_norm": 1.3323340035456697, "language_loss": 0.70821369, "learning_rate": 1.9367230381565786e-06, "loss": 0.72984374, "num_input_tokens_seen": 187541775, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.76953125, "step": 8728, "time_per_iteration": 2.549575090408325 }, { "auxiliary_loss_clip": 0.01119091, "auxiliary_loss_mlp": 0.01027747, "balance_loss_clip": 1.01511645, "balance_loss_mlp": 1.04543173, "epoch": 0.5248158725387043, "flos": 18806131382400.0, "grad_norm": 1.6242953115099512, "language_loss": 0.69900477, "learning_rate": 1.9363337726504062e-06, "loss": 0.72047317, "num_input_tokens_seen": 187560425, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.73828125, "step": 8729, "time_per_iteration": 2.541978359222412 }, { "auxiliary_loss_clip": 0.01123449, "auxiliary_loss_mlp": 0.01035235, "balance_loss_clip": 1.02131152, "balance_loss_mlp": 1.047315, "epoch": 0.5248759957913723, "flos": 20955851521920.0, "grad_norm": 1.8286049657452164, "language_loss": 0.84054542, "learning_rate": 1.935944509558464e-06, "loss": 0.86213231, "num_input_tokens_seen": 187579930, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76171875, "step": 8730, "time_per_iteration": 2.509767532348633 }, { "auxiliary_loss_clip": 0.01121674, "auxiliary_loss_mlp": 0.0103478, "balance_loss_clip": 1.02127409, "balance_loss_mlp": 1.04756355, "epoch": 0.5249361190440403, "flos": 18660980522880.0, "grad_norm": 2.229741598850157, "language_loss": 0.79648477, "learning_rate": 1.9355552488955125e-06, "loss": 0.81804931, "num_input_tokens_seen": 187595365, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 8731, "time_per_iteration": 2.4711077213287354 }, { "auxiliary_loss_clip": 0.01116063, "auxiliary_loss_mlp": 0.01034798, "balance_loss_clip": 1.02176833, "balance_loss_mlp": 1.04474187, "epoch": 0.5249962422967083, "flos": 24863327614080.0, "grad_norm": 1.8443900062688836, "language_loss": 0.83056867, "learning_rate": 1.935165990676312e-06, "loss": 0.85207725, "num_input_tokens_seen": 187614715, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 8732, "time_per_iteration": 2.545229196548462 }, { "auxiliary_loss_clip": 0.01120319, "auxiliary_loss_mlp": 0.01034814, "balance_loss_clip": 1.02213621, "balance_loss_mlp": 1.04633355, "epoch": 0.5250563655493762, "flos": 15262681674240.0, "grad_norm": 1.6664867176346518, "language_loss": 0.77428228, "learning_rate": 1.9347767349156237e-06, "loss": 0.79583359, "num_input_tokens_seen": 187630745, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73828125, "step": 8733, "time_per_iteration": 2.463482618331909 }, { "auxiliary_loss_clip": 0.01123527, "auxiliary_loss_mlp": 0.01040158, "balance_loss_clip": 1.02568638, "balance_loss_mlp": 1.04645324, "epoch": 0.5251164888020442, "flos": 18625177641600.0, "grad_norm": 2.200827385176258, "language_loss": 0.81867611, "learning_rate": 1.934387481628208e-06, "loss": 0.84031296, "num_input_tokens_seen": 187648200, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 8734, "time_per_iteration": 2.484158992767334 }, { "auxiliary_loss_clip": 0.0111841, "auxiliary_loss_mlp": 0.01031788, "balance_loss_clip": 1.01828194, "balance_loss_mlp": 1.04693449, "epoch": 0.5251766120547121, "flos": 29710764760320.0, "grad_norm": 1.3583237785377642, "language_loss": 0.77086532, "learning_rate": 1.933998230828826e-06, "loss": 0.79236734, "num_input_tokens_seen": 187669205, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 8735, "time_per_iteration": 2.5475192070007324 }, { "auxiliary_loss_clip": 0.01122866, "auxiliary_loss_mlp": 0.01037906, "balance_loss_clip": 1.02527618, "balance_loss_mlp": 1.04800534, "epoch": 0.5252367353073801, "flos": 23440295525760.0, "grad_norm": 1.5941724617104784, "language_loss": 0.8051095, "learning_rate": 1.9336089825322376e-06, "loss": 0.82671726, "num_input_tokens_seen": 187690890, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.75, "step": 8736, "time_per_iteration": 2.532593250274658 }, { "auxiliary_loss_clip": 0.01122692, "auxiliary_loss_mlp": 0.01035341, "balance_loss_clip": 1.02170324, "balance_loss_mlp": 1.04749441, "epoch": 0.5252968585600482, "flos": 30810708990720.0, "grad_norm": 2.10757105440167, "language_loss": 0.69899893, "learning_rate": 1.9332197367532033e-06, "loss": 0.72057927, "num_input_tokens_seen": 187713045, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 8737, "time_per_iteration": 2.5543925762176514 }, { "auxiliary_loss_clip": 0.01119787, "auxiliary_loss_mlp": 0.01037535, "balance_loss_clip": 1.02378464, "balance_loss_mlp": 1.04524159, "epoch": 0.5253569818127161, "flos": 20628274464000.0, "grad_norm": 2.4394430737160566, "language_loss": 0.77368605, "learning_rate": 1.9328304935064833e-06, "loss": 0.79525924, "num_input_tokens_seen": 187733640, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 8738, "time_per_iteration": 2.541215181350708 }, { "auxiliary_loss_clip": 0.01046755, "auxiliary_loss_mlp": 0.01001829, "balance_loss_clip": 1.0004586, "balance_loss_mlp": 1.02191651, "epoch": 0.5254171050653841, "flos": 63428695810560.0, "grad_norm": 0.7366887578419921, "language_loss": 0.54490805, "learning_rate": 1.932441252806837e-06, "loss": 0.56539387, "num_input_tokens_seen": 187792930, "router_z_loss_clip": 0.01373291, "router_z_loss_mlp": 0.24804688, "step": 8739, "time_per_iteration": 3.071406126022339 }, { "auxiliary_loss_clip": 0.01118641, "auxiliary_loss_mlp": 0.01035283, "balance_loss_clip": 1.02212262, "balance_loss_mlp": 1.04490697, "epoch": 0.525477228318052, "flos": 34670782108800.0, "grad_norm": 1.7900189536156172, "language_loss": 0.84563482, "learning_rate": 1.9320520146690263e-06, "loss": 0.86717403, "num_input_tokens_seen": 187812495, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 8740, "time_per_iteration": 2.6373000144958496 }, { "auxiliary_loss_clip": 0.01118331, "auxiliary_loss_mlp": 0.0104221, "balance_loss_clip": 1.02861452, "balance_loss_mlp": 1.0446794, "epoch": 0.52553735157072, "flos": 17930844766080.0, "grad_norm": 10.091619273624573, "language_loss": 0.69474763, "learning_rate": 1.9316627791078093e-06, "loss": 0.71635306, "num_input_tokens_seen": 187829685, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 8741, "time_per_iteration": 2.518573522567749 }, { "auxiliary_loss_clip": 0.01124342, "auxiliary_loss_mlp": 0.01032543, "balance_loss_clip": 1.01845884, "balance_loss_mlp": 1.04782295, "epoch": 0.5255974748233879, "flos": 9940864584960.0, "grad_norm": 1.984914533704382, "language_loss": 0.66608018, "learning_rate": 1.931273546137947e-06, "loss": 0.68764901, "num_input_tokens_seen": 187846495, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.765625, "step": 8742, "time_per_iteration": 2.5119757652282715 }, { "auxiliary_loss_clip": 0.01123871, "auxiliary_loss_mlp": 0.01039597, "balance_loss_clip": 1.02514911, "balance_loss_mlp": 1.04578948, "epoch": 0.5256575980760559, "flos": 16868427269760.0, "grad_norm": 2.7145952012524526, "language_loss": 0.62846959, "learning_rate": 1.9308843157741983e-06, "loss": 0.65010428, "num_input_tokens_seen": 187862010, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.78125, "step": 8743, "time_per_iteration": 2.4541592597961426 }, { "auxiliary_loss_clip": 0.01044799, "auxiliary_loss_mlp": 0.0100123, "balance_loss_clip": 0.99985933, "balance_loss_mlp": 1.01998389, "epoch": 0.5257177213287239, "flos": 62386210362240.0, "grad_norm": 0.7787582128714843, "language_loss": 0.54246145, "learning_rate": 1.930495088031323e-06, "loss": 0.56292176, "num_input_tokens_seen": 187922730, "router_z_loss_clip": 0.01373291, "router_z_loss_mlp": 0.24804688, "step": 8744, "time_per_iteration": 3.23205828666687 }, { "auxiliary_loss_clip": 0.01124896, "auxiliary_loss_mlp": 0.01040979, "balance_loss_clip": 1.02546954, "balance_loss_mlp": 1.04528415, "epoch": 0.5257778445813919, "flos": 20776908942720.0, "grad_norm": 4.095576630030649, "language_loss": 0.75649011, "learning_rate": 1.9301058629240814e-06, "loss": 0.77814889, "num_input_tokens_seen": 187940160, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.796875, "step": 8745, "time_per_iteration": 2.4846372604370117 }, { "auxiliary_loss_clip": 0.01118396, "auxiliary_loss_mlp": 0.01036626, "balance_loss_clip": 1.02365613, "balance_loss_mlp": 1.04406583, "epoch": 0.5258379678340598, "flos": 17018606033280.0, "grad_norm": 2.119333448315327, "language_loss": 0.81271601, "learning_rate": 1.9297166404672324e-06, "loss": 0.83426625, "num_input_tokens_seen": 187958625, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7421875, "step": 8746, "time_per_iteration": 2.497954845428467 }, { "auxiliary_loss_clip": 0.01119619, "auxiliary_loss_mlp": 0.01034969, "balance_loss_clip": 1.02130735, "balance_loss_mlp": 1.0452733, "epoch": 0.5258980910867278, "flos": 21068754946560.0, "grad_norm": 2.2343336757466066, "language_loss": 0.75444126, "learning_rate": 1.9293274206755353e-06, "loss": 0.77598715, "num_input_tokens_seen": 187977575, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 8747, "time_per_iteration": 2.477175712585449 }, { "auxiliary_loss_clip": 0.01112847, "auxiliary_loss_mlp": 0.01031489, "balance_loss_clip": 1.01838732, "balance_loss_mlp": 1.04203951, "epoch": 0.5259582143393957, "flos": 18004461690240.0, "grad_norm": 1.8151768374784658, "language_loss": 0.82968163, "learning_rate": 1.9289382035637505e-06, "loss": 0.85112494, "num_input_tokens_seen": 187996650, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 8748, "time_per_iteration": 2.5083656311035156 }, { "auxiliary_loss_clip": 0.01117255, "auxiliary_loss_mlp": 0.0103911, "balance_loss_clip": 1.0244416, "balance_loss_mlp": 1.0415467, "epoch": 0.5260183375920637, "flos": 22783848520320.0, "grad_norm": 2.014283813235265, "language_loss": 0.80560374, "learning_rate": 1.9285489891466345e-06, "loss": 0.82716739, "num_input_tokens_seen": 188013510, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7578125, "step": 8749, "time_per_iteration": 2.4787395000457764 }, { "auxiliary_loss_clip": 0.0111868, "auxiliary_loss_mlp": 0.01035682, "balance_loss_clip": 1.02123439, "balance_loss_mlp": 1.04469132, "epoch": 0.5260784608447318, "flos": 27052406081280.0, "grad_norm": 1.7842741469084518, "language_loss": 0.72373521, "learning_rate": 1.9281597774389487e-06, "loss": 0.74527889, "num_input_tokens_seen": 188032085, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7421875, "step": 8750, "time_per_iteration": 2.5257062911987305 }, { "auxiliary_loss_clip": 0.01116428, "auxiliary_loss_mlp": 0.01034933, "balance_loss_clip": 1.02143896, "balance_loss_mlp": 1.04213929, "epoch": 0.5261385840973997, "flos": 20662820369280.0, "grad_norm": 1.465789382088893, "language_loss": 0.76183569, "learning_rate": 1.9277705684554517e-06, "loss": 0.78334928, "num_input_tokens_seen": 188050590, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 8751, "time_per_iteration": 2.4671671390533447 }, { "auxiliary_loss_clip": 0.01113482, "auxiliary_loss_mlp": 0.01035045, "balance_loss_clip": 1.02224803, "balance_loss_mlp": 1.04230642, "epoch": 0.5261987073500677, "flos": 23622649896960.0, "grad_norm": 1.699013586044293, "language_loss": 0.76031029, "learning_rate": 1.927381362210902e-06, "loss": 0.7817955, "num_input_tokens_seen": 188071620, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 8752, "time_per_iteration": 2.514378309249878 }, { "auxiliary_loss_clip": 0.01119508, "auxiliary_loss_mlp": 0.01029029, "balance_loss_clip": 1.01459908, "balance_loss_mlp": 1.04312468, "epoch": 0.5262588306027356, "flos": 27636241743360.0, "grad_norm": 4.475434160167183, "language_loss": 0.68003976, "learning_rate": 1.926992158720058e-06, "loss": 0.70152515, "num_input_tokens_seen": 188091740, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.765625, "step": 8753, "time_per_iteration": 2.5249836444854736 }, { "auxiliary_loss_clip": 0.01118291, "auxiliary_loss_mlp": 0.01034226, "balance_loss_clip": 1.02046955, "balance_loss_mlp": 1.04459453, "epoch": 0.5263189538554036, "flos": 21759711943680.0, "grad_norm": 2.1549271351042587, "language_loss": 0.83792603, "learning_rate": 1.9266029579976785e-06, "loss": 0.85945117, "num_input_tokens_seen": 188111165, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.73828125, "step": 8754, "time_per_iteration": 2.4896609783172607 }, { "auxiliary_loss_clip": 0.01120017, "auxiliary_loss_mlp": 0.0103707, "balance_loss_clip": 1.02320027, "balance_loss_mlp": 1.04373217, "epoch": 0.5263790771080715, "flos": 14276359140480.0, "grad_norm": 2.2473127340385135, "language_loss": 0.87547636, "learning_rate": 1.926213760058522e-06, "loss": 0.89704728, "num_input_tokens_seen": 188127825, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 8755, "time_per_iteration": 2.447956085205078 }, { "auxiliary_loss_clip": 0.01043623, "auxiliary_loss_mlp": 0.01009266, "balance_loss_clip": 1.00804961, "balance_loss_mlp": 1.01868033, "epoch": 0.5264392003607395, "flos": 65806413528960.0, "grad_norm": 0.832056727460198, "language_loss": 0.58850837, "learning_rate": 1.9258245649173477e-06, "loss": 0.60903728, "num_input_tokens_seen": 188194050, "router_z_loss_clip": 0.012146, "router_z_loss_mlp": 0.25, "step": 8756, "time_per_iteration": 3.1943345069885254 }, { "auxiliary_loss_clip": 0.01120085, "auxiliary_loss_mlp": 0.01034904, "balance_loss_clip": 1.01997912, "balance_loss_mlp": 1.04281926, "epoch": 0.5264993236134075, "flos": 21032413361280.0, "grad_norm": 1.6785682390099492, "language_loss": 0.70422989, "learning_rate": 1.925435372588913e-06, "loss": 0.72577977, "num_input_tokens_seen": 188212565, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7734375, "step": 8757, "time_per_iteration": 5.2899298667907715 }, { "auxiliary_loss_clip": 0.01116967, "auxiliary_loss_mlp": 0.01034665, "balance_loss_clip": 1.02074194, "balance_loss_mlp": 1.04167795, "epoch": 0.5265594468660755, "flos": 16618202150400.0, "grad_norm": 1.8079145513146915, "language_loss": 0.87855512, "learning_rate": 1.9250461830879768e-06, "loss": 0.90007138, "num_input_tokens_seen": 188229505, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75, "step": 8758, "time_per_iteration": 2.4889326095581055 }, { "auxiliary_loss_clip": 0.01119085, "auxiliary_loss_mlp": 0.01036642, "balance_loss_clip": 1.02166331, "balance_loss_mlp": 1.04218793, "epoch": 0.5266195701187434, "flos": 24134125610880.0, "grad_norm": 1.445053235325002, "language_loss": 0.76229179, "learning_rate": 1.9246569964292965e-06, "loss": 0.78384912, "num_input_tokens_seen": 188250395, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.76953125, "step": 8759, "time_per_iteration": 2.5068886280059814 }, { "auxiliary_loss_clip": 0.0111321, "auxiliary_loss_mlp": 0.01029128, "balance_loss_clip": 1.01549602, "balance_loss_mlp": 1.04097342, "epoch": 0.5266796933714114, "flos": 15844111125120.0, "grad_norm": 2.2820986238160423, "language_loss": 0.71513116, "learning_rate": 1.9242678126276307e-06, "loss": 0.7365545, "num_input_tokens_seen": 188266785, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 8760, "time_per_iteration": 3.9937403202056885 }, { "auxiliary_loss_clip": 0.01122209, "auxiliary_loss_mlp": 0.01038686, "balance_loss_clip": 1.02324867, "balance_loss_mlp": 1.04409146, "epoch": 0.5267398166240793, "flos": 20951434149120.0, "grad_norm": 2.662728598094361, "language_loss": 0.75467038, "learning_rate": 1.923878631697736e-06, "loss": 0.77627927, "num_input_tokens_seen": 188282525, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.78125, "step": 8761, "time_per_iteration": 3.8728690147399902 }, { "auxiliary_loss_clip": 0.01117977, "auxiliary_loss_mlp": 0.01031643, "balance_loss_clip": 1.01854157, "balance_loss_mlp": 1.04283297, "epoch": 0.5267999398767473, "flos": 20996394998400.0, "grad_norm": 1.7112671715666983, "language_loss": 0.70917761, "learning_rate": 1.923489453654373e-06, "loss": 0.73067385, "num_input_tokens_seen": 188301395, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75, "step": 8762, "time_per_iteration": 2.48710036277771 }, { "auxiliary_loss_clip": 0.01043896, "auxiliary_loss_mlp": 0.01001203, "balance_loss_clip": 0.99981463, "balance_loss_mlp": 1.01877356, "epoch": 0.5268600631294152, "flos": 66849401767680.0, "grad_norm": 0.9049231070930583, "language_loss": 0.6545372, "learning_rate": 1.9231002785122963e-06, "loss": 0.67498821, "num_input_tokens_seen": 188357665, "router_z_loss_clip": 0.01391602, "router_z_loss_mlp": 0.25195312, "step": 8763, "time_per_iteration": 3.006100654602051 }, { "auxiliary_loss_clip": 0.01118346, "auxiliary_loss_mlp": 0.01029918, "balance_loss_clip": 1.01575589, "balance_loss_mlp": 1.04303479, "epoch": 0.5269201863820833, "flos": 17165552572800.0, "grad_norm": 1.8395020219664164, "language_loss": 0.70959032, "learning_rate": 1.922711106286265e-06, "loss": 0.7310729, "num_input_tokens_seen": 188376935, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75390625, "step": 8764, "time_per_iteration": 2.485015869140625 }, { "auxiliary_loss_clip": 0.01117239, "auxiliary_loss_mlp": 0.01032199, "balance_loss_clip": 1.01717842, "balance_loss_mlp": 1.04163027, "epoch": 0.5269803096347513, "flos": 20522589672960.0, "grad_norm": 1.7274820600035985, "language_loss": 0.74596614, "learning_rate": 1.9223219369910368e-06, "loss": 0.76746053, "num_input_tokens_seen": 188394995, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7578125, "step": 8765, "time_per_iteration": 2.496598958969116 }, { "auxiliary_loss_clip": 0.01118418, "auxiliary_loss_mlp": 0.0103126, "balance_loss_clip": 1.01608431, "balance_loss_mlp": 1.04016423, "epoch": 0.5270404328874192, "flos": 27230989524480.0, "grad_norm": 1.5170646098375016, "language_loss": 0.85749209, "learning_rate": 1.9219327706413677e-06, "loss": 0.87898892, "num_input_tokens_seen": 188415475, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.78125, "step": 8766, "time_per_iteration": 2.553116798400879 }, { "auxiliary_loss_clip": 0.01121371, "auxiliary_loss_mlp": 0.01033888, "balance_loss_clip": 1.0192728, "balance_loss_mlp": 1.04479551, "epoch": 0.5271005561400872, "flos": 23110491824640.0, "grad_norm": 2.006042805817745, "language_loss": 0.78982985, "learning_rate": 1.921543607252017e-06, "loss": 0.81138241, "num_input_tokens_seen": 188435665, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.765625, "step": 8767, "time_per_iteration": 2.508882999420166 }, { "auxiliary_loss_clip": 0.01121188, "auxiliary_loss_mlp": 0.01032836, "balance_loss_clip": 1.01765513, "balance_loss_mlp": 1.04408836, "epoch": 0.5271606793927551, "flos": 22564793427840.0, "grad_norm": 2.0589190300895264, "language_loss": 0.73693061, "learning_rate": 1.9211544468377394e-06, "loss": 0.75847089, "num_input_tokens_seen": 188455405, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.76953125, "step": 8768, "time_per_iteration": 2.485488176345825 }, { "auxiliary_loss_clip": 0.01117189, "auxiliary_loss_mlp": 0.0103915, "balance_loss_clip": 1.02609658, "balance_loss_mlp": 1.0435605, "epoch": 0.5272208026454231, "flos": 18764259102720.0, "grad_norm": 2.828848350575803, "language_loss": 0.74045509, "learning_rate": 1.9207652894132933e-06, "loss": 0.76201844, "num_input_tokens_seen": 188472940, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 8769, "time_per_iteration": 2.4738335609436035 }, { "auxiliary_loss_clip": 0.01117928, "auxiliary_loss_mlp": 0.01039598, "balance_loss_clip": 1.02546597, "balance_loss_mlp": 1.04312158, "epoch": 0.5272809258980911, "flos": 20412164286720.0, "grad_norm": 1.734219895420066, "language_loss": 0.73650324, "learning_rate": 1.920376134993436e-06, "loss": 0.75807846, "num_input_tokens_seen": 188493035, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75, "step": 8770, "time_per_iteration": 2.4968323707580566 }, { "auxiliary_loss_clip": 0.01119579, "auxiliary_loss_mlp": 0.01033813, "balance_loss_clip": 1.01970494, "balance_loss_mlp": 1.04480219, "epoch": 0.5273410491507591, "flos": 28256742213120.0, "grad_norm": 1.871343170556907, "language_loss": 0.67833877, "learning_rate": 1.9199869835929224e-06, "loss": 0.69987273, "num_input_tokens_seen": 188513860, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.74609375, "step": 8771, "time_per_iteration": 2.562162399291992 }, { "auxiliary_loss_clip": 0.01116537, "auxiliary_loss_mlp": 0.01034597, "balance_loss_clip": 1.02061415, "balance_loss_mlp": 1.04312563, "epoch": 0.527401172403427, "flos": 22455158140800.0, "grad_norm": 4.731249627727066, "language_loss": 0.76457965, "learning_rate": 1.9195978352265115e-06, "loss": 0.78609109, "num_input_tokens_seen": 188533345, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.734375, "step": 8772, "time_per_iteration": 2.5199854373931885 }, { "auxiliary_loss_clip": 0.01121581, "auxiliary_loss_mlp": 0.01041724, "balance_loss_clip": 1.02724028, "balance_loss_mlp": 1.04426146, "epoch": 0.527461295656095, "flos": 21031084558080.0, "grad_norm": 2.4667193805850927, "language_loss": 0.65922904, "learning_rate": 1.9192086899089585e-06, "loss": 0.68086213, "num_input_tokens_seen": 188551550, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7734375, "step": 8773, "time_per_iteration": 2.4956278800964355 }, { "auxiliary_loss_clip": 0.01118645, "auxiliary_loss_mlp": 0.01039263, "balance_loss_clip": 1.02566135, "balance_loss_mlp": 1.04217052, "epoch": 0.5275214189087629, "flos": 26322018929280.0, "grad_norm": 1.7713253404664697, "language_loss": 0.86153132, "learning_rate": 1.91881954765502e-06, "loss": 0.8831104, "num_input_tokens_seen": 188571615, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.765625, "step": 8774, "time_per_iteration": 2.5252294540405273 }, { "auxiliary_loss_clip": 0.01117078, "auxiliary_loss_mlp": 0.01030391, "balance_loss_clip": 1.01731396, "balance_loss_mlp": 1.04310536, "epoch": 0.5275815421614309, "flos": 20047024581120.0, "grad_norm": 1.8226336274798058, "language_loss": 0.80382764, "learning_rate": 1.9184304084794523e-06, "loss": 0.8253023, "num_input_tokens_seen": 188591965, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 8775, "time_per_iteration": 2.5061867237091064 }, { "auxiliary_loss_clip": 0.01116626, "auxiliary_loss_mlp": 0.01037874, "balance_loss_clip": 1.02371836, "balance_loss_mlp": 1.04361951, "epoch": 0.5276416654140988, "flos": 21432206712960.0, "grad_norm": 1.85400203220341, "language_loss": 0.83511949, "learning_rate": 1.918041272397012e-06, "loss": 0.85666442, "num_input_tokens_seen": 188610675, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.73046875, "step": 8776, "time_per_iteration": 2.478348970413208 }, { "auxiliary_loss_clip": 0.01116749, "auxiliary_loss_mlp": 0.01033401, "balance_loss_clip": 1.01937652, "balance_loss_mlp": 1.04095769, "epoch": 0.5277017886667669, "flos": 17165085696000.0, "grad_norm": 1.8630436247111042, "language_loss": 0.67957342, "learning_rate": 1.9176521394224547e-06, "loss": 0.70107496, "num_input_tokens_seen": 188628235, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 8777, "time_per_iteration": 2.4787704944610596 }, { "auxiliary_loss_clip": 0.01117265, "auxiliary_loss_mlp": 0.01040132, "balance_loss_clip": 1.02660203, "balance_loss_mlp": 1.04393005, "epoch": 0.5277619119194349, "flos": 20448146736000.0, "grad_norm": 1.4924815395718427, "language_loss": 0.82281172, "learning_rate": 1.9172630095705358e-06, "loss": 0.84438574, "num_input_tokens_seen": 188648925, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 8778, "time_per_iteration": 2.5336453914642334 }, { "auxiliary_loss_clip": 0.01118918, "auxiliary_loss_mlp": 0.01037836, "balance_loss_clip": 1.02357268, "balance_loss_mlp": 1.04325795, "epoch": 0.5278220351721028, "flos": 24061083304320.0, "grad_norm": 2.2756641974076732, "language_loss": 0.79655653, "learning_rate": 1.916873882856013e-06, "loss": 0.81812412, "num_input_tokens_seen": 188668125, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7578125, "step": 8779, "time_per_iteration": 2.53379225730896 }, { "auxiliary_loss_clip": 0.01111453, "auxiliary_loss_mlp": 0.01032971, "balance_loss_clip": 1.02047169, "balance_loss_mlp": 1.03933752, "epoch": 0.5278821584247708, "flos": 24642907804800.0, "grad_norm": 2.064886583621472, "language_loss": 0.77114618, "learning_rate": 1.9164847592936406e-06, "loss": 0.79259044, "num_input_tokens_seen": 188684410, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 8780, "time_per_iteration": 2.5007083415985107 }, { "auxiliary_loss_clip": 0.01122767, "auxiliary_loss_mlp": 0.01035108, "balance_loss_clip": 1.02046919, "balance_loss_mlp": 1.04538572, "epoch": 0.5279422816774387, "flos": 35408244240000.0, "grad_norm": 1.638363422022183, "language_loss": 0.69603682, "learning_rate": 1.916095638898174e-06, "loss": 0.71761554, "num_input_tokens_seen": 188706130, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7734375, "step": 8781, "time_per_iteration": 2.6180124282836914 }, { "auxiliary_loss_clip": 0.01113296, "auxiliary_loss_mlp": 0.01033614, "balance_loss_clip": 1.02117467, "balance_loss_mlp": 1.04075837, "epoch": 0.5280024049301068, "flos": 22967028904320.0, "grad_norm": 2.13954061319265, "language_loss": 0.7230342, "learning_rate": 1.9157065216843696e-06, "loss": 0.74450332, "num_input_tokens_seen": 188725030, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.72265625, "step": 8782, "time_per_iteration": 2.48964262008667 }, { "auxiliary_loss_clip": 0.0111508, "auxiliary_loss_mlp": 0.01030296, "balance_loss_clip": 1.0171293, "balance_loss_mlp": 1.04166949, "epoch": 0.5280625281827747, "flos": 21507619317120.0, "grad_norm": 1.952459454925775, "language_loss": 0.68441498, "learning_rate": 1.915317407666982e-06, "loss": 0.70586872, "num_input_tokens_seen": 188744325, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 8783, "time_per_iteration": 2.5241167545318604 }, { "auxiliary_loss_clip": 0.01124231, "auxiliary_loss_mlp": 0.01041826, "balance_loss_clip": 1.02623308, "balance_loss_mlp": 1.04387999, "epoch": 0.5281226514354427, "flos": 31208167958400.0, "grad_norm": 2.15388749197512, "language_loss": 0.69104987, "learning_rate": 1.9149282968607674e-06, "loss": 0.71271044, "num_input_tokens_seen": 188765100, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.8046875, "step": 8784, "time_per_iteration": 2.58141827583313 }, { "auxiliary_loss_clip": 0.01121608, "auxiliary_loss_mlp": 0.01034022, "balance_loss_clip": 1.01884675, "balance_loss_mlp": 1.04171252, "epoch": 0.5281827746881106, "flos": 25077821679360.0, "grad_norm": 2.009828096886062, "language_loss": 0.74862099, "learning_rate": 1.91453918928048e-06, "loss": 0.77017736, "num_input_tokens_seen": 188783995, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.796875, "step": 8785, "time_per_iteration": 2.5172793865203857 }, { "auxiliary_loss_clip": 0.01119591, "auxiliary_loss_mlp": 0.01036281, "balance_loss_clip": 1.0212549, "balance_loss_mlp": 1.04410315, "epoch": 0.5282428979407786, "flos": 20631255292800.0, "grad_norm": 2.4484358298693283, "language_loss": 0.83465528, "learning_rate": 1.9141500849408745e-06, "loss": 0.85621393, "num_input_tokens_seen": 188803120, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.75390625, "step": 8786, "time_per_iteration": 2.5114121437072754 }, { "auxiliary_loss_clip": 0.01111834, "auxiliary_loss_mlp": 0.01028189, "balance_loss_clip": 1.0160234, "balance_loss_mlp": 1.04081583, "epoch": 0.5283030211934465, "flos": 22419391173120.0, "grad_norm": 8.430136171503893, "language_loss": 0.82702172, "learning_rate": 1.9137609838567076e-06, "loss": 0.84842193, "num_input_tokens_seen": 188820960, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.7109375, "step": 8787, "time_per_iteration": 2.5050129890441895 }, { "auxiliary_loss_clip": 0.01115927, "auxiliary_loss_mlp": 0.01030352, "balance_loss_clip": 1.01790667, "balance_loss_mlp": 1.04347694, "epoch": 0.5283631444461145, "flos": 23615467176960.0, "grad_norm": 1.7253419475116207, "language_loss": 0.83254755, "learning_rate": 1.9133718860427316e-06, "loss": 0.85401034, "num_input_tokens_seen": 188837165, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7265625, "step": 8788, "time_per_iteration": 2.4784083366394043 }, { "auxiliary_loss_clip": 0.01117273, "auxiliary_loss_mlp": 0.01040719, "balance_loss_clip": 1.02537656, "balance_loss_mlp": 1.04454327, "epoch": 0.5284232676987825, "flos": 32671994918400.0, "grad_norm": 1.8899444342857645, "language_loss": 0.74786568, "learning_rate": 1.9129827915137027e-06, "loss": 0.7694456, "num_input_tokens_seen": 188858555, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.7265625, "step": 8789, "time_per_iteration": 2.575312614440918 }, { "auxiliary_loss_clip": 0.01120232, "auxiliary_loss_mlp": 0.01039678, "balance_loss_clip": 1.02596879, "balance_loss_mlp": 1.04432237, "epoch": 0.5284833909514505, "flos": 26760919213440.0, "grad_norm": 1.5930097567638966, "language_loss": 0.70150256, "learning_rate": 1.9125937002843754e-06, "loss": 0.72310162, "num_input_tokens_seen": 188879050, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 8790, "time_per_iteration": 2.541870355606079 }, { "auxiliary_loss_clip": 0.01114207, "auxiliary_loss_mlp": 0.0103196, "balance_loss_clip": 1.01959205, "balance_loss_mlp": 1.04223907, "epoch": 0.5285435142041185, "flos": 22090700793600.0, "grad_norm": 1.6527372314214734, "language_loss": 0.78907013, "learning_rate": 1.9122046123695036e-06, "loss": 0.81053174, "num_input_tokens_seen": 188898885, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 8791, "time_per_iteration": 2.509197235107422 }, { "auxiliary_loss_clip": 0.01117764, "auxiliary_loss_mlp": 0.01026465, "balance_loss_clip": 1.01365614, "balance_loss_mlp": 1.04500914, "epoch": 0.5286036374567864, "flos": 20375463565440.0, "grad_norm": 2.509782665456707, "language_loss": 0.66247332, "learning_rate": 1.9118155277838423e-06, "loss": 0.68391562, "num_input_tokens_seen": 188917225, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 8792, "time_per_iteration": 2.463414192199707 }, { "auxiliary_loss_clip": 0.01114318, "auxiliary_loss_mlp": 0.01036454, "balance_loss_clip": 1.02373433, "balance_loss_mlp": 1.04159904, "epoch": 0.5286637607094544, "flos": 24352175122560.0, "grad_norm": 2.09203873864047, "language_loss": 0.80056131, "learning_rate": 1.9114264465421443e-06, "loss": 0.82206905, "num_input_tokens_seen": 188936120, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 8793, "time_per_iteration": 2.520329475402832 }, { "auxiliary_loss_clip": 0.0111844, "auxiliary_loss_mlp": 0.01039768, "balance_loss_clip": 1.02604151, "balance_loss_mlp": 1.04373872, "epoch": 0.5287238839621223, "flos": 17271165536640.0, "grad_norm": 1.9340686938417733, "language_loss": 0.85313845, "learning_rate": 1.9110373686591645e-06, "loss": 0.87472051, "num_input_tokens_seen": 188953405, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.74609375, "step": 8794, "time_per_iteration": 2.452751636505127 }, { "auxiliary_loss_clip": 0.01121551, "auxiliary_loss_mlp": 0.01038436, "balance_loss_clip": 1.02420235, "balance_loss_mlp": 1.04269588, "epoch": 0.5287840072147904, "flos": 17566890209280.0, "grad_norm": 2.640715957197445, "language_loss": 0.67548144, "learning_rate": 1.9106482941496564e-06, "loss": 0.69708133, "num_input_tokens_seen": 188971150, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7890625, "step": 8795, "time_per_iteration": 2.4716947078704834 }, { "auxiliary_loss_clip": 0.01118653, "auxiliary_loss_mlp": 0.0103235, "balance_loss_clip": 1.01881945, "balance_loss_mlp": 1.04263735, "epoch": 0.5288441304674583, "flos": 18552099421440.0, "grad_norm": 2.290540941375479, "language_loss": 0.80573404, "learning_rate": 1.910259223028374e-06, "loss": 0.82724404, "num_input_tokens_seen": 188989550, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.76171875, "step": 8796, "time_per_iteration": 2.469794988632202 }, { "auxiliary_loss_clip": 0.01122809, "auxiliary_loss_mlp": 0.01036611, "balance_loss_clip": 1.02261007, "balance_loss_mlp": 1.04694533, "epoch": 0.5289042537201263, "flos": 20814507504000.0, "grad_norm": 2.192701537235125, "language_loss": 0.68911463, "learning_rate": 1.909870155310071e-06, "loss": 0.7107088, "num_input_tokens_seen": 189008795, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 8797, "time_per_iteration": 2.5194718837738037 }, { "auxiliary_loss_clip": 0.01112908, "auxiliary_loss_mlp": 0.01034654, "balance_loss_clip": 1.02187443, "balance_loss_mlp": 1.04191923, "epoch": 0.5289643769727942, "flos": 15735265937280.0, "grad_norm": 1.652401988122534, "language_loss": 0.82495654, "learning_rate": 1.9094810910095005e-06, "loss": 0.84643215, "num_input_tokens_seen": 189025540, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 8798, "time_per_iteration": 2.461740732192993 }, { "auxiliary_loss_clip": 0.01119865, "auxiliary_loss_mlp": 0.01042486, "balance_loss_clip": 1.02833009, "balance_loss_mlp": 1.04291356, "epoch": 0.5290245002254622, "flos": 19537308633600.0, "grad_norm": 1.880319243618293, "language_loss": 0.70678663, "learning_rate": 1.9090920301414166e-06, "loss": 0.72841018, "num_input_tokens_seen": 189044885, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.76953125, "step": 8799, "time_per_iteration": 5.345048427581787 }, { "auxiliary_loss_clip": 0.01116801, "auxiliary_loss_mlp": 0.01034606, "balance_loss_clip": 1.02185667, "balance_loss_mlp": 1.04643488, "epoch": 0.5290846234781301, "flos": 15815131827840.0, "grad_norm": 2.0972991546249307, "language_loss": 0.69415027, "learning_rate": 1.9087029727205716e-06, "loss": 0.71566439, "num_input_tokens_seen": 189061280, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 8800, "time_per_iteration": 2.4621143341064453 }, { "auxiliary_loss_clip": 0.01045577, "auxiliary_loss_mlp": 0.01026115, "balance_loss_clip": 1.02460122, "balance_loss_mlp": 1.02019966, "epoch": 0.5291447467307981, "flos": 70057624821120.0, "grad_norm": 0.9675005556296223, "language_loss": 0.56987315, "learning_rate": 1.9083139187617193e-06, "loss": 0.59059012, "num_input_tokens_seen": 189114775, "router_z_loss_clip": 0.01513672, "router_z_loss_mlp": 0.25390625, "step": 8801, "time_per_iteration": 4.495891571044922 }, { "auxiliary_loss_clip": 0.01119383, "auxiliary_loss_mlp": 0.0103761, "balance_loss_clip": 1.02431774, "balance_loss_mlp": 1.04432774, "epoch": 0.529204869983466, "flos": 28364186770560.0, "grad_norm": 2.1348920093645263, "language_loss": 0.64487588, "learning_rate": 1.9079248682796123e-06, "loss": 0.66644585, "num_input_tokens_seen": 189134700, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75, "step": 8802, "time_per_iteration": 3.945561408996582 }, { "auxiliary_loss_clip": 0.01115851, "auxiliary_loss_mlp": 0.01030348, "balance_loss_clip": 1.01666856, "balance_loss_mlp": 1.04289985, "epoch": 0.5292649932361341, "flos": 33758830684800.0, "grad_norm": 1.7111853788590128, "language_loss": 0.68890649, "learning_rate": 1.907535821289003e-06, "loss": 0.71036845, "num_input_tokens_seen": 189155365, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 8803, "time_per_iteration": 2.589991569519043 }, { "auxiliary_loss_clip": 0.01114149, "auxiliary_loss_mlp": 0.01037188, "balance_loss_clip": 1.02383077, "balance_loss_mlp": 1.04209471, "epoch": 0.5293251164888021, "flos": 20447679859200.0, "grad_norm": 1.6206015177066289, "language_loss": 0.75958139, "learning_rate": 1.9071467778046458e-06, "loss": 0.78109473, "num_input_tokens_seen": 189173885, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 8804, "time_per_iteration": 2.498194456100464 }, { "auxiliary_loss_clip": 0.01047185, "auxiliary_loss_mlp": 0.01001387, "balance_loss_clip": 0.99996227, "balance_loss_mlp": 1.02208066, "epoch": 0.52938523974147, "flos": 66545312204160.0, "grad_norm": 0.7759827338995738, "language_loss": 0.52993745, "learning_rate": 1.906757737841291e-06, "loss": 0.55042315, "num_input_tokens_seen": 189236515, "router_z_loss_clip": 0.01422119, "router_z_loss_mlp": 0.25, "step": 8805, "time_per_iteration": 3.1853513717651367 }, { "auxiliary_loss_clip": 0.01048117, "auxiliary_loss_mlp": 0.01000937, "balance_loss_clip": 0.99945301, "balance_loss_mlp": 1.0225811, "epoch": 0.529445362994138, "flos": 67151734542720.0, "grad_norm": 0.7455281031928451, "language_loss": 0.63796425, "learning_rate": 1.906368701413693e-06, "loss": 0.65845478, "num_input_tokens_seen": 189300500, "router_z_loss_clip": 0.01483154, "router_z_loss_mlp": 0.25390625, "step": 8806, "time_per_iteration": 3.1099560260772705 }, { "auxiliary_loss_clip": 0.01120982, "auxiliary_loss_mlp": 0.01031828, "balance_loss_clip": 1.01876879, "balance_loss_mlp": 1.04311633, "epoch": 0.5295054862468059, "flos": 17749316407680.0, "grad_norm": 1.7107665903304856, "language_loss": 0.72375077, "learning_rate": 1.9059796685366026e-06, "loss": 0.74527884, "num_input_tokens_seen": 189319745, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.77734375, "step": 8807, "time_per_iteration": 2.471726894378662 }, { "auxiliary_loss_clip": 0.01115313, "auxiliary_loss_mlp": 0.01030169, "balance_loss_clip": 1.01811743, "balance_loss_mlp": 1.04346538, "epoch": 0.529565609499474, "flos": 11397401084160.0, "grad_norm": 2.6896304998241263, "language_loss": 0.69430733, "learning_rate": 1.9055906392247723e-06, "loss": 0.71576208, "num_input_tokens_seen": 189334550, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.71875, "step": 8808, "time_per_iteration": 2.453284502029419 }, { "auxiliary_loss_clip": 0.01114765, "auxiliary_loss_mlp": 0.01033721, "balance_loss_clip": 1.02099586, "balance_loss_mlp": 1.04184961, "epoch": 0.5296257327521419, "flos": 17196363463680.0, "grad_norm": 1.83306475241864, "language_loss": 0.87055069, "learning_rate": 1.9052016134929554e-06, "loss": 0.8920356, "num_input_tokens_seen": 189351735, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 8809, "time_per_iteration": 2.4557113647460938 }, { "auxiliary_loss_clip": 0.0112239, "auxiliary_loss_mlp": 0.01034182, "balance_loss_clip": 1.01951885, "balance_loss_mlp": 1.0442512, "epoch": 0.5296858560048099, "flos": 39964086777600.0, "grad_norm": 1.6816879677775856, "language_loss": 0.6383338, "learning_rate": 1.9048125913559016e-06, "loss": 0.65989947, "num_input_tokens_seen": 189373105, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 8810, "time_per_iteration": 2.631575345993042 }, { "auxiliary_loss_clip": 0.01115964, "auxiliary_loss_mlp": 0.01034861, "balance_loss_clip": 1.02196229, "balance_loss_mlp": 1.0439384, "epoch": 0.5297459792574778, "flos": 20961418129920.0, "grad_norm": 1.50863354963284, "language_loss": 0.67985308, "learning_rate": 1.9044235728283646e-06, "loss": 0.7013613, "num_input_tokens_seen": 189394615, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 8811, "time_per_iteration": 2.5398895740509033 }, { "auxiliary_loss_clip": 0.0104556, "auxiliary_loss_mlp": 0.01001183, "balance_loss_clip": 0.99981827, "balance_loss_mlp": 1.02017629, "epoch": 0.5298061025101458, "flos": 66523620389760.0, "grad_norm": 0.6720937206488773, "language_loss": 0.53372836, "learning_rate": 1.9040345579250953e-06, "loss": 0.55419582, "num_input_tokens_seen": 189459750, "router_z_loss_clip": 0.01367188, "router_z_loss_mlp": 0.25390625, "step": 8812, "time_per_iteration": 3.217832565307617 }, { "auxiliary_loss_clip": 0.01045606, "auxiliary_loss_mlp": 0.01002755, "balance_loss_clip": 1.00130069, "balance_loss_mlp": 1.02028763, "epoch": 0.5298662257628137, "flos": 67662994775040.0, "grad_norm": 0.746178135199671, "language_loss": 0.56284785, "learning_rate": 1.9036455466608453e-06, "loss": 0.58333147, "num_input_tokens_seen": 189527540, "router_z_loss_clip": 0.01452637, "router_z_loss_mlp": 0.25390625, "step": 8813, "time_per_iteration": 3.164418935775757 }, { "auxiliary_loss_clip": 0.01113626, "auxiliary_loss_mlp": 0.01029948, "balance_loss_clip": 1.01749647, "balance_loss_mlp": 1.04421186, "epoch": 0.5299263490154817, "flos": 19646405216640.0, "grad_norm": 1.5161587959359213, "language_loss": 0.8178165, "learning_rate": 1.9032565390503657e-06, "loss": 0.83925223, "num_input_tokens_seen": 189546900, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 8814, "time_per_iteration": 2.477621078491211 }, { "auxiliary_loss_clip": 0.01121858, "auxiliary_loss_mlp": 0.01028825, "balance_loss_clip": 1.01616454, "balance_loss_mlp": 1.04575992, "epoch": 0.5299864722681497, "flos": 22055005653120.0, "grad_norm": 2.1597504812969475, "language_loss": 0.84760964, "learning_rate": 1.9028675351084076e-06, "loss": 0.86911654, "num_input_tokens_seen": 189566490, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.76171875, "step": 8815, "time_per_iteration": 2.513021230697632 }, { "auxiliary_loss_clip": 0.01115641, "auxiliary_loss_mlp": 0.01033938, "balance_loss_clip": 1.02188015, "balance_loss_mlp": 1.04450154, "epoch": 0.5300465955208177, "flos": 21763698353280.0, "grad_norm": 1.8774041624735978, "language_loss": 0.66048396, "learning_rate": 1.9024785348497225e-06, "loss": 0.68197972, "num_input_tokens_seen": 189585580, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.7109375, "step": 8816, "time_per_iteration": 2.4953925609588623 }, { "auxiliary_loss_clip": 0.01118573, "auxiliary_loss_mlp": 0.01033403, "balance_loss_clip": 1.02061141, "balance_loss_mlp": 1.04493427, "epoch": 0.5301067187734857, "flos": 42996491735040.0, "grad_norm": 1.4441359602895893, "language_loss": 0.72243834, "learning_rate": 1.9020895382890611e-06, "loss": 0.743958, "num_input_tokens_seen": 189608485, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73828125, "step": 8817, "time_per_iteration": 2.6943371295928955 }, { "auxiliary_loss_clip": 0.01118297, "auxiliary_loss_mlp": 0.01029943, "balance_loss_clip": 1.01646686, "balance_loss_mlp": 1.04272139, "epoch": 0.5301668420261536, "flos": 20554298403840.0, "grad_norm": 1.6356425698638055, "language_loss": 0.65082145, "learning_rate": 1.9017005454411743e-06, "loss": 0.67230386, "num_input_tokens_seen": 189627815, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 8818, "time_per_iteration": 2.510770320892334 }, { "auxiliary_loss_clip": 0.01118533, "auxiliary_loss_mlp": 0.01027521, "balance_loss_clip": 1.01378822, "balance_loss_mlp": 1.04366052, "epoch": 0.5302269652788216, "flos": 17486665182720.0, "grad_norm": 4.110742452688514, "language_loss": 0.74871624, "learning_rate": 1.9013115563208126e-06, "loss": 0.77017677, "num_input_tokens_seen": 189644850, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 8819, "time_per_iteration": 2.475813865661621 }, { "auxiliary_loss_clip": 0.01118905, "auxiliary_loss_mlp": 0.01035676, "balance_loss_clip": 1.02207422, "balance_loss_mlp": 1.04251885, "epoch": 0.5302870885314895, "flos": 14574202715520.0, "grad_norm": 1.888712954637547, "language_loss": 0.82098067, "learning_rate": 1.9009225709427267e-06, "loss": 0.84252644, "num_input_tokens_seen": 189660945, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.765625, "step": 8820, "time_per_iteration": 2.4467666149139404 }, { "auxiliary_loss_clip": 0.01116787, "auxiliary_loss_mlp": 0.01033414, "balance_loss_clip": 1.02135026, "balance_loss_mlp": 1.04205692, "epoch": 0.5303472117841576, "flos": 23438032968960.0, "grad_norm": 1.4671786023515885, "language_loss": 0.72637177, "learning_rate": 1.9005335893216667e-06, "loss": 0.74787378, "num_input_tokens_seen": 189680425, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.75, "step": 8821, "time_per_iteration": 2.524567127227783 }, { "auxiliary_loss_clip": 0.01112912, "auxiliary_loss_mlp": 0.01028665, "balance_loss_clip": 1.01679802, "balance_loss_mlp": 1.04184866, "epoch": 0.5304073350368255, "flos": 22709010533760.0, "grad_norm": 1.5166539254250258, "language_loss": 0.74127573, "learning_rate": 1.9001446114723824e-06, "loss": 0.7626915, "num_input_tokens_seen": 189700375, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.7109375, "step": 8822, "time_per_iteration": 2.4856386184692383 }, { "auxiliary_loss_clip": 0.01116168, "auxiliary_loss_mlp": 0.01035322, "balance_loss_clip": 1.02138638, "balance_loss_mlp": 1.04221916, "epoch": 0.5304674582894935, "flos": 27928554624000.0, "grad_norm": 1.6044139123550045, "language_loss": 0.67248118, "learning_rate": 1.8997556374096257e-06, "loss": 0.69399607, "num_input_tokens_seen": 189721225, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.73828125, "step": 8823, "time_per_iteration": 2.5582680702209473 }, { "auxiliary_loss_clip": 0.01120741, "auxiliary_loss_mlp": 0.01040122, "balance_loss_clip": 1.02540565, "balance_loss_mlp": 1.04390121, "epoch": 0.5305275815421614, "flos": 21250642440960.0, "grad_norm": 1.5558841777458614, "language_loss": 0.69480592, "learning_rate": 1.8993666671481444e-06, "loss": 0.71641457, "num_input_tokens_seen": 189740170, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.765625, "step": 8824, "time_per_iteration": 2.4767231941223145 }, { "auxiliary_loss_clip": 0.01113234, "auxiliary_loss_mlp": 0.01031125, "balance_loss_clip": 1.01854873, "balance_loss_mlp": 1.04241621, "epoch": 0.5305877047948294, "flos": 17603088140160.0, "grad_norm": 2.2694005729222253, "language_loss": 0.76142138, "learning_rate": 1.898977700702689e-06, "loss": 0.78286499, "num_input_tokens_seen": 189757890, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 8825, "time_per_iteration": 2.4750595092773438 }, { "auxiliary_loss_clip": 0.01115874, "auxiliary_loss_mlp": 0.01039908, "balance_loss_clip": 1.02659798, "balance_loss_mlp": 1.04245615, "epoch": 0.5306478280474973, "flos": 15195493284480.0, "grad_norm": 2.1557344164622196, "language_loss": 0.85639483, "learning_rate": 1.8985887380880103e-06, "loss": 0.87795269, "num_input_tokens_seen": 189775390, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 8826, "time_per_iteration": 2.45245099067688 }, { "auxiliary_loss_clip": 0.01114534, "auxiliary_loss_mlp": 0.01035248, "balance_loss_clip": 1.02149165, "balance_loss_mlp": 1.04248071, "epoch": 0.5307079513001653, "flos": 15341218761600.0, "grad_norm": 1.7552764384428432, "language_loss": 0.64409649, "learning_rate": 1.8981997793188558e-06, "loss": 0.66559434, "num_input_tokens_seen": 189793975, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71875, "step": 8827, "time_per_iteration": 2.464615821838379 }, { "auxiliary_loss_clip": 0.0112296, "auxiliary_loss_mlp": 0.01038291, "balance_loss_clip": 1.02455235, "balance_loss_mlp": 1.04708052, "epoch": 0.5307680745528333, "flos": 43544452688640.0, "grad_norm": 1.5192515113523777, "language_loss": 0.60051191, "learning_rate": 1.8978108244099762e-06, "loss": 0.62212443, "num_input_tokens_seen": 189817870, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7578125, "step": 8828, "time_per_iteration": 2.6781697273254395 }, { "auxiliary_loss_clip": 0.01120332, "auxiliary_loss_mlp": 0.01034179, "balance_loss_clip": 1.02058887, "balance_loss_mlp": 1.04426777, "epoch": 0.5308281978055013, "flos": 20048928001920.0, "grad_norm": 2.3866724407994795, "language_loss": 0.81447709, "learning_rate": 1.8974218733761208e-06, "loss": 0.8360222, "num_input_tokens_seen": 189837905, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.76171875, "step": 8829, "time_per_iteration": 2.5069141387939453 }, { "auxiliary_loss_clip": 0.011174, "auxiliary_loss_mlp": 0.01033289, "balance_loss_clip": 1.01988935, "balance_loss_mlp": 1.04508424, "epoch": 0.5308883210581693, "flos": 20703938463360.0, "grad_norm": 2.126587528425003, "language_loss": 0.78062367, "learning_rate": 1.8970329262320375e-06, "loss": 0.80213058, "num_input_tokens_seen": 189856970, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.72265625, "step": 8830, "time_per_iteration": 2.4754996299743652 }, { "auxiliary_loss_clip": 0.01119735, "auxiliary_loss_mlp": 0.01033812, "balance_loss_clip": 1.02049053, "balance_loss_mlp": 1.04508936, "epoch": 0.5309484443108372, "flos": 14355506759040.0, "grad_norm": 2.257458226733369, "language_loss": 0.80675304, "learning_rate": 1.8966439829924768e-06, "loss": 0.8282885, "num_input_tokens_seen": 189872830, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.74609375, "step": 8831, "time_per_iteration": 2.458040952682495 }, { "auxiliary_loss_clip": 0.01117532, "auxiliary_loss_mlp": 0.01031191, "balance_loss_clip": 1.01776862, "balance_loss_mlp": 1.04413748, "epoch": 0.5310085675635052, "flos": 20010503427840.0, "grad_norm": 1.7479115813622146, "language_loss": 0.73477972, "learning_rate": 1.896255043672186e-06, "loss": 0.75626695, "num_input_tokens_seen": 189891635, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 8832, "time_per_iteration": 2.4728662967681885 }, { "auxiliary_loss_clip": 0.01122158, "auxiliary_loss_mlp": 0.01034064, "balance_loss_clip": 1.02028358, "balance_loss_mlp": 1.04580379, "epoch": 0.5310686908161731, "flos": 22127293774080.0, "grad_norm": 2.183589392369416, "language_loss": 0.75499851, "learning_rate": 1.8958661082859143e-06, "loss": 0.77656072, "num_input_tokens_seen": 189909050, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.765625, "step": 8833, "time_per_iteration": 2.5066747665405273 }, { "auxiliary_loss_clip": 0.01117053, "auxiliary_loss_mlp": 0.01033176, "balance_loss_clip": 1.01952052, "balance_loss_mlp": 1.04097915, "epoch": 0.5311288140688412, "flos": 24717889445760.0, "grad_norm": 2.0571678061176213, "language_loss": 0.73540378, "learning_rate": 1.8954771768484103e-06, "loss": 0.75690603, "num_input_tokens_seen": 189927405, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 8834, "time_per_iteration": 2.5172650814056396 }, { "auxiliary_loss_clip": 0.01122278, "auxiliary_loss_mlp": 0.01036633, "balance_loss_clip": 1.02211928, "balance_loss_mlp": 1.04307342, "epoch": 0.5311889373215091, "flos": 24097712198400.0, "grad_norm": 2.1358085883175173, "language_loss": 0.77795565, "learning_rate": 1.8950882493744226e-06, "loss": 0.79954475, "num_input_tokens_seen": 189947740, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7890625, "step": 8835, "time_per_iteration": 2.5379297733306885 }, { "auxiliary_loss_clip": 0.01117878, "auxiliary_loss_mlp": 0.01037561, "balance_loss_clip": 1.02298164, "balance_loss_mlp": 1.04303038, "epoch": 0.5312490605741771, "flos": 22017012042240.0, "grad_norm": 2.0720203354470983, "language_loss": 0.72308129, "learning_rate": 1.8946993258786985e-06, "loss": 0.74463564, "num_input_tokens_seen": 189966495, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.75, "step": 8836, "time_per_iteration": 2.4923410415649414 }, { "auxiliary_loss_clip": 0.01119733, "auxiliary_loss_mlp": 0.01036695, "balance_loss_clip": 1.02190077, "balance_loss_mlp": 1.04343998, "epoch": 0.531309183826845, "flos": 19390541662080.0, "grad_norm": 1.8228952344722504, "language_loss": 0.80780202, "learning_rate": 1.894310406375987e-06, "loss": 0.82936627, "num_input_tokens_seen": 189985325, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.76171875, "step": 8837, "time_per_iteration": 2.501821994781494 }, { "auxiliary_loss_clip": 0.01119724, "auxiliary_loss_mlp": 0.01030537, "balance_loss_clip": 1.01661944, "balance_loss_mlp": 1.04643309, "epoch": 0.531369307079513, "flos": 20190056538240.0, "grad_norm": 1.775333724384514, "language_loss": 0.86139107, "learning_rate": 1.893921490881035e-06, "loss": 0.88289368, "num_input_tokens_seen": 190003290, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 8838, "time_per_iteration": 2.477848529815674 }, { "auxiliary_loss_clip": 0.01117952, "auxiliary_loss_mlp": 0.01031527, "balance_loss_clip": 1.01859903, "balance_loss_mlp": 1.04417253, "epoch": 0.5314294303321809, "flos": 18880143356160.0, "grad_norm": 1.7072444554854174, "language_loss": 0.72997814, "learning_rate": 1.8935325794085906e-06, "loss": 0.75147301, "num_input_tokens_seen": 190023260, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.73828125, "step": 8839, "time_per_iteration": 2.5613791942596436 }, { "auxiliary_loss_clip": 0.01118719, "auxiliary_loss_mlp": 0.01038343, "balance_loss_clip": 1.02450323, "balance_loss_mlp": 1.04282832, "epoch": 0.531489553584849, "flos": 23040035297280.0, "grad_norm": 1.613831802668821, "language_loss": 0.76750016, "learning_rate": 1.8931436719734023e-06, "loss": 0.78907079, "num_input_tokens_seen": 190042035, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 8840, "time_per_iteration": 3.922839641571045 }, { "auxiliary_loss_clip": 0.01119022, "auxiliary_loss_mlp": 0.0103268, "balance_loss_clip": 1.01867306, "balance_loss_mlp": 1.04252744, "epoch": 0.5315496768375169, "flos": 19790478668160.0, "grad_norm": 1.9393989871912685, "language_loss": 0.77515459, "learning_rate": 1.892754768590216e-06, "loss": 0.79667163, "num_input_tokens_seen": 190057545, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.765625, "step": 8841, "time_per_iteration": 3.9528958797454834 }, { "auxiliary_loss_clip": 0.01044666, "auxiliary_loss_mlp": 0.01015075, "balance_loss_clip": 1.01378119, "balance_loss_mlp": 1.0198195, "epoch": 0.5316098000901849, "flos": 71023228185600.0, "grad_norm": 0.6964250166449903, "language_loss": 0.56786007, "learning_rate": 1.8923658692737793e-06, "loss": 0.58845747, "num_input_tokens_seen": 190123800, "router_z_loss_clip": 0.01293945, "router_z_loss_mlp": 0.24804688, "step": 8842, "time_per_iteration": 4.674586296081543 }, { "auxiliary_loss_clip": 0.01121902, "auxiliary_loss_mlp": 0.01042088, "balance_loss_clip": 1.0270915, "balance_loss_mlp": 1.04489374, "epoch": 0.5316699233428529, "flos": 16435560470400.0, "grad_norm": 2.078388610290352, "language_loss": 0.7374025, "learning_rate": 1.8919769740388407e-06, "loss": 0.75904238, "num_input_tokens_seen": 190141625, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.76953125, "step": 8843, "time_per_iteration": 2.4771926403045654 }, { "auxiliary_loss_clip": 0.01044726, "auxiliary_loss_mlp": 0.0100857, "balance_loss_clip": 1.00736034, "balance_loss_mlp": 1.0199126, "epoch": 0.5317300465955208, "flos": 67420814302080.0, "grad_norm": 0.9054065466801619, "language_loss": 0.61004496, "learning_rate": 1.891588082900145e-06, "loss": 0.63057792, "num_input_tokens_seen": 190198110, "router_z_loss_clip": 0.01208496, "router_z_loss_mlp": 0.24804688, "step": 8844, "time_per_iteration": 4.552812337875366 }, { "auxiliary_loss_clip": 0.010435, "auxiliary_loss_mlp": 0.01006504, "balance_loss_clip": 1.00530577, "balance_loss_mlp": 1.01873016, "epoch": 0.5317901698481888, "flos": 59508075340800.0, "grad_norm": 0.8395611016801989, "language_loss": 0.62187207, "learning_rate": 1.8911991958724411e-06, "loss": 0.64237207, "num_input_tokens_seen": 190259950, "router_z_loss_clip": 0.01196289, "router_z_loss_mlp": 0.24804688, "step": 8845, "time_per_iteration": 3.0916104316711426 }, { "auxiliary_loss_clip": 0.01119121, "auxiliary_loss_mlp": 0.01038961, "balance_loss_clip": 1.02408361, "balance_loss_mlp": 1.04453027, "epoch": 0.5318502931008567, "flos": 19129219240320.0, "grad_norm": 2.214702478321128, "language_loss": 0.75405258, "learning_rate": 1.890810312970474e-06, "loss": 0.77563339, "num_input_tokens_seen": 190278265, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.74609375, "step": 8846, "time_per_iteration": 2.4986953735351562 }, { "auxiliary_loss_clip": 0.01120657, "auxiliary_loss_mlp": 0.01035038, "balance_loss_clip": 1.02266383, "balance_loss_mlp": 1.04628658, "epoch": 0.5319104163535248, "flos": 24681045070080.0, "grad_norm": 1.7826971383270291, "language_loss": 0.74812949, "learning_rate": 1.8904214342089903e-06, "loss": 0.76968646, "num_input_tokens_seen": 190298400, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7421875, "step": 8847, "time_per_iteration": 2.52898907661438 }, { "auxiliary_loss_clip": 0.01116837, "auxiliary_loss_mlp": 0.0103173, "balance_loss_clip": 1.0187304, "balance_loss_mlp": 1.04292679, "epoch": 0.5319705396061927, "flos": 19385513758080.0, "grad_norm": 1.7346454652028944, "language_loss": 0.87694043, "learning_rate": 1.8900325596027378e-06, "loss": 0.89842618, "num_input_tokens_seen": 190316235, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7421875, "step": 8848, "time_per_iteration": 2.5047643184661865 }, { "auxiliary_loss_clip": 0.01120158, "auxiliary_loss_mlp": 0.01038883, "balance_loss_clip": 1.02374935, "balance_loss_mlp": 1.04464102, "epoch": 0.5320306628588607, "flos": 18259319664000.0, "grad_norm": 1.817788790473896, "language_loss": 0.74429679, "learning_rate": 1.8896436891664609e-06, "loss": 0.76588714, "num_input_tokens_seen": 190335060, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.75390625, "step": 8849, "time_per_iteration": 2.5043821334838867 }, { "auxiliary_loss_clip": 0.0112052, "auxiliary_loss_mlp": 0.01027291, "balance_loss_clip": 1.01334965, "balance_loss_mlp": 1.04337978, "epoch": 0.5320907861115286, "flos": 23732321097600.0, "grad_norm": 1.9531556734318731, "language_loss": 0.7987107, "learning_rate": 1.8892548229149066e-06, "loss": 0.82018888, "num_input_tokens_seen": 190353265, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.76953125, "step": 8850, "time_per_iteration": 2.520920515060425 }, { "auxiliary_loss_clip": 0.01118801, "auxiliary_loss_mlp": 0.01029111, "balance_loss_clip": 1.01636171, "balance_loss_mlp": 1.0433352, "epoch": 0.5321509093641966, "flos": 34495251321600.0, "grad_norm": 1.459768753694676, "language_loss": 0.54824734, "learning_rate": 1.888865960862821e-06, "loss": 0.56972647, "num_input_tokens_seen": 190376575, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.75390625, "step": 8851, "time_per_iteration": 2.5928921699523926 }, { "auxiliary_loss_clip": 0.01122119, "auxiliary_loss_mlp": 0.01032087, "balance_loss_clip": 1.0186758, "balance_loss_mlp": 1.04525352, "epoch": 0.5322110326168645, "flos": 20010934391040.0, "grad_norm": 1.638403653247659, "language_loss": 0.68204182, "learning_rate": 1.8884771030249484e-06, "loss": 0.7035839, "num_input_tokens_seen": 190395185, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.76953125, "step": 8852, "time_per_iteration": 2.4936492443084717 }, { "auxiliary_loss_clip": 0.01044021, "auxiliary_loss_mlp": 0.01003854, "balance_loss_clip": 1.00253677, "balance_loss_mlp": 1.01896894, "epoch": 0.5322711558695326, "flos": 64631164435200.0, "grad_norm": 0.8061556197898149, "language_loss": 0.62805158, "learning_rate": 1.8880882494160357e-06, "loss": 0.64853042, "num_input_tokens_seen": 190452595, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.25, "step": 8853, "time_per_iteration": 3.0439090728759766 }, { "auxiliary_loss_clip": 0.01120945, "auxiliary_loss_mlp": 0.01030734, "balance_loss_clip": 1.01696551, "balance_loss_mlp": 1.04355979, "epoch": 0.5323312791222005, "flos": 14939342421120.0, "grad_norm": 4.0351602594206675, "language_loss": 0.79631627, "learning_rate": 1.8876994000508278e-06, "loss": 0.81783307, "num_input_tokens_seen": 190469140, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7734375, "step": 8854, "time_per_iteration": 2.475710391998291 }, { "auxiliary_loss_clip": 0.01115043, "auxiliary_loss_mlp": 0.01030608, "balance_loss_clip": 1.01895511, "balance_loss_mlp": 1.04403663, "epoch": 0.5323914023748685, "flos": 23440834229760.0, "grad_norm": 1.977268737047586, "language_loss": 0.73516607, "learning_rate": 1.8873105549440698e-06, "loss": 0.75662255, "num_input_tokens_seen": 190489015, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.7109375, "step": 8855, "time_per_iteration": 2.516616106033325 }, { "auxiliary_loss_clip": 0.01116375, "auxiliary_loss_mlp": 0.01034094, "balance_loss_clip": 1.02197623, "balance_loss_mlp": 1.04228997, "epoch": 0.5324515256275365, "flos": 26286180134400.0, "grad_norm": 2.1935493855379793, "language_loss": 0.65071136, "learning_rate": 1.886921714110507e-06, "loss": 0.67221606, "num_input_tokens_seen": 190508065, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.7421875, "step": 8856, "time_per_iteration": 2.5486180782318115 }, { "auxiliary_loss_clip": 0.01123954, "auxiliary_loss_mlp": 0.01036773, "balance_loss_clip": 1.02249801, "balance_loss_mlp": 1.04627562, "epoch": 0.5325116488802044, "flos": 26870913636480.0, "grad_norm": 1.8566991133989412, "language_loss": 0.77603424, "learning_rate": 1.8865328775648842e-06, "loss": 0.79764158, "num_input_tokens_seen": 190527045, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.77734375, "step": 8857, "time_per_iteration": 2.522251605987549 }, { "auxiliary_loss_clip": 0.01117853, "auxiliary_loss_mlp": 0.01034846, "balance_loss_clip": 1.02082658, "balance_loss_mlp": 1.04340672, "epoch": 0.5325717721328724, "flos": 25884734757120.0, "grad_norm": 2.205798908368227, "language_loss": 0.71226478, "learning_rate": 1.8861440453219456e-06, "loss": 0.73379177, "num_input_tokens_seen": 190544075, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7421875, "step": 8858, "time_per_iteration": 2.5084009170532227 }, { "auxiliary_loss_clip": 0.01120707, "auxiliary_loss_mlp": 0.01036343, "balance_loss_clip": 1.02231836, "balance_loss_mlp": 1.0453378, "epoch": 0.5326318953855403, "flos": 21799321666560.0, "grad_norm": 2.0321690841945594, "language_loss": 0.69322622, "learning_rate": 1.8857552173964367e-06, "loss": 0.71479672, "num_input_tokens_seen": 190566030, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75390625, "step": 8859, "time_per_iteration": 2.5114657878875732 }, { "auxiliary_loss_clip": 0.01115224, "auxiliary_loss_mlp": 0.01028787, "balance_loss_clip": 1.01649094, "balance_loss_mlp": 1.0441941, "epoch": 0.5326920186382084, "flos": 20922921728640.0, "grad_norm": 1.5500041983308583, "language_loss": 0.69705337, "learning_rate": 1.8853663938031013e-06, "loss": 0.71849346, "num_input_tokens_seen": 190585605, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.7109375, "step": 8860, "time_per_iteration": 2.4774222373962402 }, { "auxiliary_loss_clip": 0.01118745, "auxiliary_loss_mlp": 0.01033473, "balance_loss_clip": 1.02100945, "balance_loss_mlp": 1.0457325, "epoch": 0.5327521418908763, "flos": 21433427775360.0, "grad_norm": 2.0240834781008834, "language_loss": 0.77896917, "learning_rate": 1.884977574556683e-06, "loss": 0.80049139, "num_input_tokens_seen": 190604625, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.73046875, "step": 8861, "time_per_iteration": 2.5136184692382812 }, { "auxiliary_loss_clip": 0.011185, "auxiliary_loss_mlp": 0.01037214, "balance_loss_clip": 1.02339756, "balance_loss_mlp": 1.04391801, "epoch": 0.5328122651435443, "flos": 21760250647680.0, "grad_norm": 1.703695958580097, "language_loss": 0.85721803, "learning_rate": 1.8845887596719279e-06, "loss": 0.87877518, "num_input_tokens_seen": 190625060, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.74609375, "step": 8862, "time_per_iteration": 2.509974718093872 }, { "auxiliary_loss_clip": 0.01118835, "auxiliary_loss_mlp": 0.01036455, "balance_loss_clip": 1.02170849, "balance_loss_mlp": 1.04240716, "epoch": 0.5328723883962122, "flos": 18296487262080.0, "grad_norm": 1.936152256569882, "language_loss": 0.61696827, "learning_rate": 1.8841999491635778e-06, "loss": 0.63852119, "num_input_tokens_seen": 190643150, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.765625, "step": 8863, "time_per_iteration": 2.443084716796875 }, { "auxiliary_loss_clip": 0.01119406, "auxiliary_loss_mlp": 0.0103644, "balance_loss_clip": 1.024436, "balance_loss_mlp": 1.04656112, "epoch": 0.5329325116488802, "flos": 25374911068800.0, "grad_norm": 2.5444054613089526, "language_loss": 0.73227537, "learning_rate": 1.883811143046377e-06, "loss": 0.75383377, "num_input_tokens_seen": 190662725, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7265625, "step": 8864, "time_per_iteration": 2.5090644359588623 }, { "auxiliary_loss_clip": 0.01119056, "auxiliary_loss_mlp": 0.01041891, "balance_loss_clip": 1.02898669, "balance_loss_mlp": 1.04455698, "epoch": 0.5329926349015481, "flos": 25592098654080.0, "grad_norm": 1.7248773549185348, "language_loss": 0.6431551, "learning_rate": 1.8834223413350702e-06, "loss": 0.66476452, "num_input_tokens_seen": 190683680, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 8865, "time_per_iteration": 2.52528715133667 }, { "auxiliary_loss_clip": 0.01118922, "auxiliary_loss_mlp": 0.01032902, "balance_loss_clip": 1.01972973, "balance_loss_mlp": 1.04401743, "epoch": 0.5330527581542162, "flos": 22889605138560.0, "grad_norm": 1.7445137034696736, "language_loss": 0.78531045, "learning_rate": 1.8830335440443989e-06, "loss": 0.80682874, "num_input_tokens_seen": 190703350, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.75, "step": 8866, "time_per_iteration": 2.5051591396331787 }, { "auxiliary_loss_clip": 0.01118213, "auxiliary_loss_mlp": 0.01036539, "balance_loss_clip": 1.02387357, "balance_loss_mlp": 1.04471016, "epoch": 0.5331128814068841, "flos": 16026752805120.0, "grad_norm": 1.7602443442286506, "language_loss": 0.73400617, "learning_rate": 1.882644751189108e-06, "loss": 0.75555372, "num_input_tokens_seen": 190721170, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.734375, "step": 8867, "time_per_iteration": 2.5891778469085693 }, { "auxiliary_loss_clip": 0.01120923, "auxiliary_loss_mlp": 0.01042723, "balance_loss_clip": 1.0283761, "balance_loss_mlp": 1.04538703, "epoch": 0.5331730046595521, "flos": 39344699629440.0, "grad_norm": 2.439453453247133, "language_loss": 0.72181541, "learning_rate": 1.88225596278394e-06, "loss": 0.74345183, "num_input_tokens_seen": 190743795, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7578125, "step": 8868, "time_per_iteration": 2.648637294769287 }, { "auxiliary_loss_clip": 0.01116318, "auxiliary_loss_mlp": 0.01031949, "balance_loss_clip": 1.01925969, "balance_loss_mlp": 1.04189479, "epoch": 0.5332331279122201, "flos": 24024382583040.0, "grad_norm": 2.456984200306027, "language_loss": 0.7877239, "learning_rate": 1.881867178843637e-06, "loss": 0.80920649, "num_input_tokens_seen": 190761560, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7421875, "step": 8869, "time_per_iteration": 2.514348268508911 }, { "auxiliary_loss_clip": 0.01121358, "auxiliary_loss_mlp": 0.01038856, "balance_loss_clip": 1.02542758, "balance_loss_mlp": 1.04401207, "epoch": 0.533293251164888, "flos": 17129318728320.0, "grad_norm": 1.9438020373714981, "language_loss": 0.75696725, "learning_rate": 1.8814783993829434e-06, "loss": 0.7785694, "num_input_tokens_seen": 190778875, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7734375, "step": 8870, "time_per_iteration": 2.4711408615112305 }, { "auxiliary_loss_clip": 0.01123893, "auxiliary_loss_mlp": 0.01039973, "balance_loss_clip": 1.02567935, "balance_loss_mlp": 1.04653001, "epoch": 0.533353374417556, "flos": 22126360020480.0, "grad_norm": 1.8004780936183982, "language_loss": 0.7519123, "learning_rate": 1.8810896244165997e-06, "loss": 0.77355093, "num_input_tokens_seen": 190799830, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 8871, "time_per_iteration": 2.5113799571990967 }, { "auxiliary_loss_clip": 0.01118945, "auxiliary_loss_mlp": 0.01030878, "balance_loss_clip": 1.01796174, "balance_loss_mlp": 1.04399633, "epoch": 0.533413497670224, "flos": 15011091838080.0, "grad_norm": 1.940726456006373, "language_loss": 0.72608125, "learning_rate": 1.8807008539593498e-06, "loss": 0.74757951, "num_input_tokens_seen": 190817155, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75, "step": 8872, "time_per_iteration": 2.457085132598877 }, { "auxiliary_loss_clip": 0.01120726, "auxiliary_loss_mlp": 0.01043808, "balance_loss_clip": 1.03015828, "balance_loss_mlp": 1.04708052, "epoch": 0.533473620922892, "flos": 19609955890560.0, "grad_norm": 5.002789261454498, "language_loss": 0.65295136, "learning_rate": 1.880312088025936e-06, "loss": 0.67459667, "num_input_tokens_seen": 190835240, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 8873, "time_per_iteration": 2.479086399078369 }, { "auxiliary_loss_clip": 0.01117182, "auxiliary_loss_mlp": 0.01044196, "balance_loss_clip": 1.03107738, "balance_loss_mlp": 1.04315412, "epoch": 0.5335337441755599, "flos": 14282644020480.0, "grad_norm": 2.7677408261371164, "language_loss": 0.80630058, "learning_rate": 1.879923326631099e-06, "loss": 0.82791436, "num_input_tokens_seen": 190851620, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 8874, "time_per_iteration": 2.490609884262085 }, { "auxiliary_loss_clip": 0.01117748, "auxiliary_loss_mlp": 0.01033826, "balance_loss_clip": 1.02025414, "balance_loss_mlp": 1.04353213, "epoch": 0.5335938674282279, "flos": 20814830726400.0, "grad_norm": 1.700377638370832, "language_loss": 0.6967932, "learning_rate": 1.879534569789582e-06, "loss": 0.71830893, "num_input_tokens_seen": 190870545, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 8875, "time_per_iteration": 2.4917213916778564 }, { "auxiliary_loss_clip": 0.01043704, "auxiliary_loss_mlp": 0.01004111, "balance_loss_clip": 1.00278747, "balance_loss_mlp": 1.0192616, "epoch": 0.5336539906808958, "flos": 71396448451200.0, "grad_norm": 0.7309069639402318, "language_loss": 0.59707028, "learning_rate": 1.879145817516126e-06, "loss": 0.61754841, "num_input_tokens_seen": 190931995, "router_z_loss_clip": 0.01324463, "router_z_loss_mlp": 0.24414062, "step": 8876, "time_per_iteration": 3.248817205429077 }, { "auxiliary_loss_clip": 0.01115862, "auxiliary_loss_mlp": 0.01039765, "balance_loss_clip": 1.02668142, "balance_loss_mlp": 1.04262936, "epoch": 0.5337141139335638, "flos": 20152996680960.0, "grad_norm": 1.7226977652056472, "language_loss": 0.74500775, "learning_rate": 1.8787570698254727e-06, "loss": 0.76656401, "num_input_tokens_seen": 190949890, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 8877, "time_per_iteration": 2.4786109924316406 }, { "auxiliary_loss_clip": 0.01041522, "auxiliary_loss_mlp": 0.0100389, "balance_loss_clip": 1.00265586, "balance_loss_mlp": 1.01709986, "epoch": 0.5337742371862317, "flos": 67728387484800.0, "grad_norm": 0.762362894352633, "language_loss": 0.5719884, "learning_rate": 1.8783683267323629e-06, "loss": 0.59244251, "num_input_tokens_seen": 191008480, "router_z_loss_clip": 0.0123291, "router_z_loss_mlp": 0.24414062, "step": 8878, "time_per_iteration": 3.017266035079956 }, { "auxiliary_loss_clip": 0.01120997, "auxiliary_loss_mlp": 0.0103581, "balance_loss_clip": 1.02146935, "balance_loss_mlp": 1.04353929, "epoch": 0.5338343604388998, "flos": 25008909436800.0, "grad_norm": 1.6379129692043144, "language_loss": 0.72321784, "learning_rate": 1.8779795882515395e-06, "loss": 0.7447859, "num_input_tokens_seen": 191028995, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7734375, "step": 8879, "time_per_iteration": 2.5468955039978027 }, { "auxiliary_loss_clip": 0.01120505, "auxiliary_loss_mlp": 0.01030317, "balance_loss_clip": 1.01649511, "balance_loss_mlp": 1.04437637, "epoch": 0.5338944836915677, "flos": 17601256546560.0, "grad_norm": 2.184556570147068, "language_loss": 0.83384323, "learning_rate": 1.8775908543977416e-06, "loss": 0.85535151, "num_input_tokens_seen": 191045285, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.76171875, "step": 8880, "time_per_iteration": 2.4733057022094727 }, { "auxiliary_loss_clip": 0.01113585, "auxiliary_loss_mlp": 0.01027529, "balance_loss_clip": 1.01542962, "balance_loss_mlp": 1.04152071, "epoch": 0.5339546069442357, "flos": 21724124544000.0, "grad_norm": 1.4606611868683788, "language_loss": 0.79139286, "learning_rate": 1.8772021251857107e-06, "loss": 0.81280398, "num_input_tokens_seen": 191066105, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.71875, "step": 8881, "time_per_iteration": 2.4974524974823 }, { "auxiliary_loss_clip": 0.01041542, "auxiliary_loss_mlp": 0.01001735, "balance_loss_clip": 1.00040615, "balance_loss_mlp": 1.01689792, "epoch": 0.5340147301969036, "flos": 69723583315200.0, "grad_norm": 0.7934478530881982, "language_loss": 0.59308589, "learning_rate": 1.8768134006301882e-06, "loss": 0.61351871, "num_input_tokens_seen": 191126315, "router_z_loss_clip": 0.01330566, "router_z_loss_mlp": 0.24609375, "step": 8882, "time_per_iteration": 5.920284986495972 }, { "auxiliary_loss_clip": 0.01039471, "auxiliary_loss_mlp": 0.01002247, "balance_loss_clip": 1.00088155, "balance_loss_mlp": 1.01483846, "epoch": 0.5340748534495716, "flos": 63880701580800.0, "grad_norm": 0.8606844607095098, "language_loss": 0.63638198, "learning_rate": 1.876424680745913e-06, "loss": 0.65679908, "num_input_tokens_seen": 191174240, "router_z_loss_clip": 0.01367188, "router_z_loss_mlp": 0.24609375, "step": 8883, "time_per_iteration": 2.9059062004089355 }, { "auxiliary_loss_clip": 0.01120903, "auxiliary_loss_mlp": 0.0103067, "balance_loss_clip": 1.01667523, "balance_loss_mlp": 1.04372263, "epoch": 0.5341349767022396, "flos": 28694313694080.0, "grad_norm": 7.680630951938044, "language_loss": 0.82148206, "learning_rate": 1.8760359655476272e-06, "loss": 0.84299785, "num_input_tokens_seen": 191193335, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76953125, "step": 8884, "time_per_iteration": 3.951962471008301 }, { "auxiliary_loss_clip": 0.01113898, "auxiliary_loss_mlp": 0.01032683, "balance_loss_clip": 1.01953435, "balance_loss_mlp": 1.04312801, "epoch": 0.5341950999549075, "flos": 16289691338880.0, "grad_norm": 8.656305550869138, "language_loss": 0.7245239, "learning_rate": 1.8756472550500695e-06, "loss": 0.74598968, "num_input_tokens_seen": 191210900, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 8885, "time_per_iteration": 2.4841573238372803 }, { "auxiliary_loss_clip": 0.01122354, "auxiliary_loss_mlp": 0.01032763, "balance_loss_clip": 1.01830888, "balance_loss_mlp": 1.0435307, "epoch": 0.5342552232075756, "flos": 14355650413440.0, "grad_norm": 2.4571959519979534, "language_loss": 0.7883085, "learning_rate": 1.87525854926798e-06, "loss": 0.80985969, "num_input_tokens_seen": 191226730, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7890625, "step": 8886, "time_per_iteration": 3.8620848655700684 }, { "auxiliary_loss_clip": 0.01118979, "auxiliary_loss_mlp": 0.01032205, "balance_loss_clip": 1.01723886, "balance_loss_mlp": 1.04328418, "epoch": 0.5343153464602435, "flos": 30297976300800.0, "grad_norm": 1.495996844882296, "language_loss": 0.74886918, "learning_rate": 1.8748698482160996e-06, "loss": 0.77038097, "num_input_tokens_seen": 191250435, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.7578125, "step": 8887, "time_per_iteration": 2.585944414138794 }, { "auxiliary_loss_clip": 0.01113357, "auxiliary_loss_mlp": 0.01029445, "balance_loss_clip": 1.01591468, "balance_loss_mlp": 1.04060507, "epoch": 0.5343754697129115, "flos": 15596292216960.0, "grad_norm": 2.252274550919765, "language_loss": 0.68589157, "learning_rate": 1.8744811519091663e-06, "loss": 0.70731962, "num_input_tokens_seen": 191268315, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 8888, "time_per_iteration": 2.479321002960205 }, { "auxiliary_loss_clip": 0.01124659, "auxiliary_loss_mlp": 0.01034837, "balance_loss_clip": 1.02091956, "balance_loss_mlp": 1.04406512, "epoch": 0.5344355929655794, "flos": 16909617191040.0, "grad_norm": 1.8717273695950447, "language_loss": 0.77278203, "learning_rate": 1.8740924603619208e-06, "loss": 0.79437697, "num_input_tokens_seen": 191287000, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.8046875, "step": 8889, "time_per_iteration": 2.5060360431671143 }, { "auxiliary_loss_clip": 0.0111614, "auxiliary_loss_mlp": 0.01038816, "balance_loss_clip": 1.02514911, "balance_loss_mlp": 1.04283881, "epoch": 0.5344957162182474, "flos": 16798186224000.0, "grad_norm": 2.14614812675172, "language_loss": 0.69277513, "learning_rate": 1.873703773589102e-06, "loss": 0.71432471, "num_input_tokens_seen": 191304565, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 8890, "time_per_iteration": 2.49084734916687 }, { "auxiliary_loss_clip": 0.01118822, "auxiliary_loss_mlp": 0.0103838, "balance_loss_clip": 1.0233717, "balance_loss_mlp": 1.04168999, "epoch": 0.5345558394709153, "flos": 12705590413440.0, "grad_norm": 2.560434929778089, "language_loss": 0.76966453, "learning_rate": 1.8733150916054483e-06, "loss": 0.79123652, "num_input_tokens_seen": 191318300, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7734375, "step": 8891, "time_per_iteration": 2.428384780883789 }, { "auxiliary_loss_clip": 0.01113032, "auxiliary_loss_mlp": 0.01032104, "balance_loss_clip": 1.01961648, "balance_loss_mlp": 1.04163587, "epoch": 0.5346159627235834, "flos": 22455050400000.0, "grad_norm": 1.7172392665030034, "language_loss": 0.74286282, "learning_rate": 1.872926414425699e-06, "loss": 0.76431417, "num_input_tokens_seen": 191337925, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.71484375, "step": 8892, "time_per_iteration": 2.4957733154296875 }, { "auxiliary_loss_clip": 0.01113814, "auxiliary_loss_mlp": 0.01034087, "balance_loss_clip": 1.02089083, "balance_loss_mlp": 1.0398798, "epoch": 0.5346760859762513, "flos": 22415763899520.0, "grad_norm": 1.8859117629961792, "language_loss": 0.88138974, "learning_rate": 1.8725377420645932e-06, "loss": 0.90286875, "num_input_tokens_seen": 191357120, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 8893, "time_per_iteration": 2.484266757965088 }, { "auxiliary_loss_clip": 0.01112165, "auxiliary_loss_mlp": 0.0103036, "balance_loss_clip": 1.01803946, "balance_loss_mlp": 1.04042566, "epoch": 0.5347362092289193, "flos": 22816131868800.0, "grad_norm": 2.511210867260509, "language_loss": 0.72819769, "learning_rate": 1.872149074536869e-06, "loss": 0.74962294, "num_input_tokens_seen": 191375395, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71875, "step": 8894, "time_per_iteration": 2.5297207832336426 }, { "auxiliary_loss_clip": 0.01114716, "auxiliary_loss_mlp": 0.01032004, "balance_loss_clip": 1.01863456, "balance_loss_mlp": 1.04184306, "epoch": 0.5347963324815872, "flos": 23219480666880.0, "grad_norm": 1.73999592735786, "language_loss": 0.74940884, "learning_rate": 1.8717604118572648e-06, "loss": 0.77087605, "num_input_tokens_seen": 191395595, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 8895, "time_per_iteration": 2.4852311611175537 }, { "auxiliary_loss_clip": 0.01112458, "auxiliary_loss_mlp": 0.01032249, "balance_loss_clip": 1.01892149, "balance_loss_mlp": 1.03930724, "epoch": 0.5348564557342552, "flos": 22601350494720.0, "grad_norm": 1.7832564636663533, "language_loss": 0.76842952, "learning_rate": 1.8713717540405178e-06, "loss": 0.78987658, "num_input_tokens_seen": 191413730, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 8896, "time_per_iteration": 2.510634422302246 }, { "auxiliary_loss_clip": 0.01110841, "auxiliary_loss_mlp": 0.01029426, "balance_loss_clip": 1.0157764, "balance_loss_mlp": 1.03902209, "epoch": 0.5349165789869232, "flos": 18002378701440.0, "grad_norm": 1.8323008838700756, "language_loss": 0.78737962, "learning_rate": 1.8709831011013676e-06, "loss": 0.80878234, "num_input_tokens_seen": 191432400, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 8897, "time_per_iteration": 2.4824063777923584 }, { "auxiliary_loss_clip": 0.01115724, "auxiliary_loss_mlp": 0.01031633, "balance_loss_clip": 1.01836479, "balance_loss_mlp": 1.04116094, "epoch": 0.5349767022395912, "flos": 17159770483200.0, "grad_norm": 1.8792842933891873, "language_loss": 0.75867033, "learning_rate": 1.8705944530545509e-06, "loss": 0.78014386, "num_input_tokens_seen": 191448855, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 8898, "time_per_iteration": 2.463249921798706 }, { "auxiliary_loss_clip": 0.01035879, "auxiliary_loss_mlp": 0.01006332, "balance_loss_clip": 1.00506806, "balance_loss_mlp": 1.01148403, "epoch": 0.5350368254922592, "flos": 70992058158720.0, "grad_norm": 0.9698097925793567, "language_loss": 0.57991576, "learning_rate": 1.8702058099148052e-06, "loss": 0.60033786, "num_input_tokens_seen": 191519690, "router_z_loss_clip": 0.01263428, "router_z_loss_mlp": 0.24414062, "step": 8899, "time_per_iteration": 3.269299268722534 }, { "auxiliary_loss_clip": 0.01109212, "auxiliary_loss_mlp": 0.01031625, "balance_loss_clip": 1.01929259, "balance_loss_mlp": 1.03823042, "epoch": 0.5350969487449271, "flos": 27417833095680.0, "grad_norm": 1.8032804412158803, "language_loss": 0.69977725, "learning_rate": 1.869817171696868e-06, "loss": 0.72118562, "num_input_tokens_seen": 191539380, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 8900, "time_per_iteration": 2.5819568634033203 }, { "auxiliary_loss_clip": 0.01114557, "auxiliary_loss_mlp": 0.01031344, "balance_loss_clip": 1.01810026, "balance_loss_mlp": 1.03974843, "epoch": 0.5351570719975951, "flos": 19316134638720.0, "grad_norm": 1.697671913616739, "language_loss": 0.71865296, "learning_rate": 1.8694285384154777e-06, "loss": 0.74011195, "num_input_tokens_seen": 191557400, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.74609375, "step": 8901, "time_per_iteration": 2.457063913345337 }, { "auxiliary_loss_clip": 0.01113862, "auxiliary_loss_mlp": 0.01031164, "balance_loss_clip": 1.01803327, "balance_loss_mlp": 1.03981018, "epoch": 0.535217195250263, "flos": 19828580019840.0, "grad_norm": 1.817515771869147, "language_loss": 0.77484989, "learning_rate": 1.8690399100853699e-06, "loss": 0.79630017, "num_input_tokens_seen": 191575860, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 8902, "time_per_iteration": 2.4975736141204834 }, { "auxiliary_loss_clip": 0.01109147, "auxiliary_loss_mlp": 0.01028826, "balance_loss_clip": 1.01691723, "balance_loss_mlp": 1.03955197, "epoch": 0.535277318502931, "flos": 22127868391680.0, "grad_norm": 1.5181669254592876, "language_loss": 0.70475477, "learning_rate": 1.868651286721281e-06, "loss": 0.72613448, "num_input_tokens_seen": 191595775, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 8903, "time_per_iteration": 2.4889912605285645 }, { "auxiliary_loss_clip": 0.0111498, "auxiliary_loss_mlp": 0.01034637, "balance_loss_clip": 1.02110696, "balance_loss_mlp": 1.03949833, "epoch": 0.5353374417555989, "flos": 25045897466880.0, "grad_norm": 1.6271514916772774, "language_loss": 0.72270685, "learning_rate": 1.86826266833795e-06, "loss": 0.74420303, "num_input_tokens_seen": 191617785, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 8904, "time_per_iteration": 2.5911006927490234 }, { "auxiliary_loss_clip": 0.01113986, "auxiliary_loss_mlp": 0.01038424, "balance_loss_clip": 1.0248878, "balance_loss_mlp": 1.04019809, "epoch": 0.535397565008267, "flos": 19388710068480.0, "grad_norm": 1.8900707819908358, "language_loss": 0.73430175, "learning_rate": 1.8678740549501103e-06, "loss": 0.75582588, "num_input_tokens_seen": 191636900, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73828125, "step": 8905, "time_per_iteration": 2.46809458732605 }, { "auxiliary_loss_clip": 0.01107036, "auxiliary_loss_mlp": 0.01035315, "balance_loss_clip": 1.02413368, "balance_loss_mlp": 1.03839958, "epoch": 0.5354576882609349, "flos": 21471205904640.0, "grad_norm": 1.5332745609243144, "language_loss": 0.83392215, "learning_rate": 1.8674854465725005e-06, "loss": 0.85534573, "num_input_tokens_seen": 191656720, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6875, "step": 8906, "time_per_iteration": 2.527952194213867 }, { "auxiliary_loss_clip": 0.01115308, "auxiliary_loss_mlp": 0.0103379, "balance_loss_clip": 1.02021766, "balance_loss_mlp": 1.0402565, "epoch": 0.5355178115136029, "flos": 20777519473920.0, "grad_norm": 2.0272093071747865, "language_loss": 0.73738325, "learning_rate": 1.8670968432198563e-06, "loss": 0.75887424, "num_input_tokens_seen": 191674445, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75, "step": 8907, "time_per_iteration": 2.4758636951446533 }, { "auxiliary_loss_clip": 0.01112727, "auxiliary_loss_mlp": 0.01031105, "balance_loss_clip": 1.01829624, "balance_loss_mlp": 1.03998387, "epoch": 0.5355779347662708, "flos": 23514020190720.0, "grad_norm": 1.9725357466099662, "language_loss": 0.7697798, "learning_rate": 1.866708244906912e-06, "loss": 0.79121816, "num_input_tokens_seen": 191695000, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 8908, "time_per_iteration": 2.5239524841308594 }, { "auxiliary_loss_clip": 0.01113595, "auxiliary_loss_mlp": 0.01034957, "balance_loss_clip": 1.02140331, "balance_loss_mlp": 1.03938818, "epoch": 0.5356380580189388, "flos": 20303211358080.0, "grad_norm": 2.4518762278549358, "language_loss": 0.74253237, "learning_rate": 1.8663196516484055e-06, "loss": 0.76401794, "num_input_tokens_seen": 191713295, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 8909, "time_per_iteration": 2.4699370861053467 }, { "auxiliary_loss_clip": 0.01114986, "auxiliary_loss_mlp": 0.01036405, "balance_loss_clip": 1.0241859, "balance_loss_mlp": 1.04211247, "epoch": 0.5356981812716068, "flos": 21361642444800.0, "grad_norm": 2.0948030328525657, "language_loss": 0.8387239, "learning_rate": 1.8659310634590702e-06, "loss": 0.86023778, "num_input_tokens_seen": 191732725, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.73046875, "step": 8910, "time_per_iteration": 2.528676986694336 }, { "auxiliary_loss_clip": 0.01113365, "auxiliary_loss_mlp": 0.01033237, "balance_loss_clip": 1.01965261, "balance_loss_mlp": 1.03921008, "epoch": 0.5357583045242748, "flos": 23111246010240.0, "grad_norm": 1.5796683825063367, "language_loss": 0.81630367, "learning_rate": 1.8655424803536427e-06, "loss": 0.83776969, "num_input_tokens_seen": 191753765, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 8911, "time_per_iteration": 2.513653516769409 }, { "auxiliary_loss_clip": 0.01112005, "auxiliary_loss_mlp": 0.01035819, "balance_loss_clip": 1.02368307, "balance_loss_mlp": 1.03961658, "epoch": 0.5358184277769428, "flos": 21141761339520.0, "grad_norm": 1.8544268736077427, "language_loss": 0.68465078, "learning_rate": 1.8651539023468585e-06, "loss": 0.70612901, "num_input_tokens_seen": 191773560, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.72265625, "step": 8912, "time_per_iteration": 2.5162582397460938 }, { "auxiliary_loss_clip": 0.01112399, "auxiliary_loss_mlp": 0.01036642, "balance_loss_clip": 1.02356482, "balance_loss_mlp": 1.04030979, "epoch": 0.5358785510296107, "flos": 16282400878080.0, "grad_norm": 1.8788159781154363, "language_loss": 0.71746874, "learning_rate": 1.8647653294534509e-06, "loss": 0.73895919, "num_input_tokens_seen": 191791255, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 8913, "time_per_iteration": 2.4568326473236084 }, { "auxiliary_loss_clip": 0.01118441, "auxiliary_loss_mlp": 0.01038353, "balance_loss_clip": 1.02510917, "balance_loss_mlp": 1.04178882, "epoch": 0.5359386742822787, "flos": 16976877408000.0, "grad_norm": 1.95675251571487, "language_loss": 0.72581315, "learning_rate": 1.864376761688156e-06, "loss": 0.74738109, "num_input_tokens_seen": 191809325, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.765625, "step": 8914, "time_per_iteration": 2.4735589027404785 }, { "auxiliary_loss_clip": 0.01119742, "auxiliary_loss_mlp": 0.01044587, "balance_loss_clip": 1.02959037, "balance_loss_mlp": 1.04199815, "epoch": 0.5359987975349466, "flos": 20812927305600.0, "grad_norm": 1.9890074954011263, "language_loss": 0.70562315, "learning_rate": 1.8639881990657079e-06, "loss": 0.72726643, "num_input_tokens_seen": 191829795, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.77734375, "step": 8915, "time_per_iteration": 2.517946720123291 }, { "auxiliary_loss_clip": 0.01114169, "auxiliary_loss_mlp": 0.01041134, "balance_loss_clip": 1.02775264, "balance_loss_mlp": 1.04141402, "epoch": 0.5360589207876146, "flos": 22199941031040.0, "grad_norm": 1.7592972656496546, "language_loss": 0.75076127, "learning_rate": 1.8635996416008408e-06, "loss": 0.77231431, "num_input_tokens_seen": 191850840, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 8916, "time_per_iteration": 2.523372173309326 }, { "auxiliary_loss_clip": 0.0111629, "auxiliary_loss_mlp": 0.01029869, "balance_loss_clip": 1.01694131, "balance_loss_mlp": 1.04096746, "epoch": 0.5361190440402825, "flos": 31394365084800.0, "grad_norm": 1.9353460353956113, "language_loss": 0.72235495, "learning_rate": 1.863211089308289e-06, "loss": 0.74381661, "num_input_tokens_seen": 191869520, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75, "step": 8917, "time_per_iteration": 2.557027816772461 }, { "auxiliary_loss_clip": 0.01115032, "auxiliary_loss_mlp": 0.0103896, "balance_loss_clip": 1.02479756, "balance_loss_mlp": 1.04184914, "epoch": 0.5361791672929506, "flos": 16069882060800.0, "grad_norm": 2.4299513566022166, "language_loss": 0.71323371, "learning_rate": 1.8628225422027865e-06, "loss": 0.73477364, "num_input_tokens_seen": 191887240, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.73046875, "step": 8918, "time_per_iteration": 2.484795093536377 }, { "auxiliary_loss_clip": 0.01116533, "auxiliary_loss_mlp": 0.01034945, "balance_loss_clip": 1.02197456, "balance_loss_mlp": 1.04321218, "epoch": 0.5362392905456185, "flos": 20740926493440.0, "grad_norm": 2.070527246239163, "language_loss": 0.74925494, "learning_rate": 1.862434000299067e-06, "loss": 0.77076972, "num_input_tokens_seen": 191905690, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 8919, "time_per_iteration": 2.4829142093658447 }, { "auxiliary_loss_clip": 0.01114331, "auxiliary_loss_mlp": 0.01036523, "balance_loss_clip": 1.02355886, "balance_loss_mlp": 1.03944528, "epoch": 0.5362994137982865, "flos": 17340077779200.0, "grad_norm": 3.5728822744503597, "language_loss": 0.71757466, "learning_rate": 1.862045463611864e-06, "loss": 0.73908317, "num_input_tokens_seen": 191920725, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.75, "step": 8920, "time_per_iteration": 2.4528157711029053 }, { "auxiliary_loss_clip": 0.0111559, "auxiliary_loss_mlp": 0.01032166, "balance_loss_clip": 1.01911294, "balance_loss_mlp": 1.04183865, "epoch": 0.5363595370509544, "flos": 42813957795840.0, "grad_norm": 1.4166692226550734, "language_loss": 0.68614501, "learning_rate": 1.8616569321559105e-06, "loss": 0.70762253, "num_input_tokens_seen": 191944645, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 8921, "time_per_iteration": 2.673659563064575 }, { "auxiliary_loss_clip": 0.01118541, "auxiliary_loss_mlp": 0.01036596, "balance_loss_clip": 1.02396631, "balance_loss_mlp": 1.04466319, "epoch": 0.5364196603036224, "flos": 19171953446400.0, "grad_norm": 6.470654639654153, "language_loss": 0.81881189, "learning_rate": 1.86126840594594e-06, "loss": 0.84036326, "num_input_tokens_seen": 191962265, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.73828125, "step": 8922, "time_per_iteration": 2.4964752197265625 }, { "auxiliary_loss_clip": 0.0111674, "auxiliary_loss_mlp": 0.01029035, "balance_loss_clip": 1.01644063, "balance_loss_mlp": 1.04181695, "epoch": 0.5364797835562904, "flos": 17931060247680.0, "grad_norm": 2.5560977915108682, "language_loss": 0.76747251, "learning_rate": 1.860879884996686e-06, "loss": 0.7889303, "num_input_tokens_seen": 191978850, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.75, "step": 8923, "time_per_iteration": 3.9496209621429443 }, { "auxiliary_loss_clip": 0.01120197, "auxiliary_loss_mlp": 0.01036619, "balance_loss_clip": 1.02311265, "balance_loss_mlp": 1.04373062, "epoch": 0.5365399068089584, "flos": 30228058477440.0, "grad_norm": 1.457378819065877, "language_loss": 0.70361531, "learning_rate": 1.8604913693228804e-06, "loss": 0.72518349, "num_input_tokens_seen": 192002000, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.765625, "step": 8924, "time_per_iteration": 4.04402494430542 }, { "auxiliary_loss_clip": 0.01120472, "auxiliary_loss_mlp": 0.01037466, "balance_loss_clip": 1.02280283, "balance_loss_mlp": 1.04420543, "epoch": 0.5366000300616264, "flos": 24891696380160.0, "grad_norm": 26.04793085940954, "language_loss": 0.86937845, "learning_rate": 1.8601028589392558e-06, "loss": 0.89095783, "num_input_tokens_seen": 192019100, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.76171875, "step": 8925, "time_per_iteration": 3.9576542377471924 }, { "auxiliary_loss_clip": 0.01118429, "auxiliary_loss_mlp": 0.01032812, "balance_loss_clip": 1.02000308, "balance_loss_mlp": 1.04213893, "epoch": 0.5366601533142943, "flos": 29826649013760.0, "grad_norm": 1.5949208126716004, "language_loss": 0.77835262, "learning_rate": 1.8597143538605455e-06, "loss": 0.79986507, "num_input_tokens_seen": 192041660, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.765625, "step": 8926, "time_per_iteration": 2.572772741317749 }, { "auxiliary_loss_clip": 0.01115237, "auxiliary_loss_mlp": 0.01030909, "balance_loss_clip": 1.01913726, "balance_loss_mlp": 1.04467916, "epoch": 0.5367202765669623, "flos": 27199352620800.0, "grad_norm": 1.837139223531635, "language_loss": 0.6680032, "learning_rate": 1.85932585410148e-06, "loss": 0.68946469, "num_input_tokens_seen": 192063540, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.703125, "step": 8927, "time_per_iteration": 3.9274778366088867 }, { "auxiliary_loss_clip": 0.01118456, "auxiliary_loss_mlp": 0.01030062, "balance_loss_clip": 1.01752114, "balance_loss_mlp": 1.04330504, "epoch": 0.5367803998196302, "flos": 20229953569920.0, "grad_norm": 1.710745310228728, "language_loss": 0.73089212, "learning_rate": 1.8589373596767929e-06, "loss": 0.75237727, "num_input_tokens_seen": 192081760, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.75, "step": 8928, "time_per_iteration": 2.5584731101989746 }, { "auxiliary_loss_clip": 0.01115208, "auxiliary_loss_mlp": 0.01031877, "balance_loss_clip": 1.01977742, "balance_loss_mlp": 1.04171801, "epoch": 0.5368405230722982, "flos": 32154629374080.0, "grad_norm": 2.161266857594222, "language_loss": 0.63180232, "learning_rate": 1.8585488706012154e-06, "loss": 0.65327322, "num_input_tokens_seen": 192101620, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.734375, "step": 8929, "time_per_iteration": 2.568904161453247 }, { "auxiliary_loss_clip": 0.01118617, "auxiliary_loss_mlp": 0.01035847, "balance_loss_clip": 1.0231514, "balance_loss_mlp": 1.04441726, "epoch": 0.5369006463249661, "flos": 26247935128320.0, "grad_norm": 3.2848458092128863, "language_loss": 0.66006923, "learning_rate": 1.8581603868894781e-06, "loss": 0.68161392, "num_input_tokens_seen": 192121805, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7421875, "step": 8930, "time_per_iteration": 2.5646612644195557 }, { "auxiliary_loss_clip": 0.01115186, "auxiliary_loss_mlp": 0.01032923, "balance_loss_clip": 1.01978636, "balance_loss_mlp": 1.04334915, "epoch": 0.5369607695776342, "flos": 26211306234240.0, "grad_norm": 1.4803475441124727, "language_loss": 0.67194867, "learning_rate": 1.8577719085563136e-06, "loss": 0.69342971, "num_input_tokens_seen": 192141765, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 8931, "time_per_iteration": 2.539999008178711 }, { "auxiliary_loss_clip": 0.01119947, "auxiliary_loss_mlp": 0.01032499, "balance_loss_clip": 1.01899242, "balance_loss_mlp": 1.04739261, "epoch": 0.5370208928303021, "flos": 25009017177600.0, "grad_norm": 1.6432763438085154, "language_loss": 0.75851446, "learning_rate": 1.8573834356164525e-06, "loss": 0.78003895, "num_input_tokens_seen": 192161560, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 8932, "time_per_iteration": 2.5524404048919678 }, { "auxiliary_loss_clip": 0.01120377, "auxiliary_loss_mlp": 0.01031922, "balance_loss_clip": 1.01880884, "balance_loss_mlp": 1.04754341, "epoch": 0.5370810160829701, "flos": 31792147274880.0, "grad_norm": 2.150127005572964, "language_loss": 0.66129982, "learning_rate": 1.8569949680846261e-06, "loss": 0.68282282, "num_input_tokens_seen": 192180190, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 8933, "time_per_iteration": 2.5617899894714355 }, { "auxiliary_loss_clip": 0.01114341, "auxiliary_loss_mlp": 0.01037028, "balance_loss_clip": 1.02479661, "balance_loss_mlp": 1.04485464, "epoch": 0.537141139335638, "flos": 23842602829440.0, "grad_norm": 1.7143264632057518, "language_loss": 0.82842505, "learning_rate": 1.856606505975565e-06, "loss": 0.84993875, "num_input_tokens_seen": 192198855, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 8934, "time_per_iteration": 2.540126085281372 }, { "auxiliary_loss_clip": 0.0111229, "auxiliary_loss_mlp": 0.01033549, "balance_loss_clip": 1.02089477, "balance_loss_mlp": 1.04249144, "epoch": 0.537201262588306, "flos": 18508826511360.0, "grad_norm": 1.8918510728692814, "language_loss": 0.79741073, "learning_rate": 1.856218049303999e-06, "loss": 0.81886911, "num_input_tokens_seen": 192216555, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 8935, "time_per_iteration": 2.453441619873047 }, { "auxiliary_loss_clip": 0.01115724, "auxiliary_loss_mlp": 0.01040309, "balance_loss_clip": 1.02751732, "balance_loss_mlp": 1.04275072, "epoch": 0.537261385840974, "flos": 25662950231040.0, "grad_norm": 1.9493176646413137, "language_loss": 0.84066629, "learning_rate": 1.855829598084659e-06, "loss": 0.86222661, "num_input_tokens_seen": 192236910, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73046875, "step": 8936, "time_per_iteration": 2.5328404903411865 }, { "auxiliary_loss_clip": 0.0111384, "auxiliary_loss_mlp": 0.01031599, "balance_loss_clip": 1.019315, "balance_loss_mlp": 1.04271638, "epoch": 0.537321509093642, "flos": 40735017406080.0, "grad_norm": 1.4068926229141123, "language_loss": 0.72722912, "learning_rate": 1.8554411523322754e-06, "loss": 0.74868345, "num_input_tokens_seen": 192260790, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.7109375, "step": 8937, "time_per_iteration": 2.660994291305542 }, { "auxiliary_loss_clip": 0.01117638, "auxiliary_loss_mlp": 0.01031959, "balance_loss_clip": 1.01847672, "balance_loss_mlp": 1.04195905, "epoch": 0.53738163234631, "flos": 17238487138560.0, "grad_norm": 8.913806998754065, "language_loss": 0.81589842, "learning_rate": 1.8550527120615778e-06, "loss": 0.83739436, "num_input_tokens_seen": 192277230, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 8938, "time_per_iteration": 2.4674694538116455 }, { "auxiliary_loss_clip": 0.01122422, "auxiliary_loss_mlp": 0.01033862, "balance_loss_clip": 1.02120769, "balance_loss_mlp": 1.04511309, "epoch": 0.5374417555989779, "flos": 12821977457280.0, "grad_norm": 2.606796662125918, "language_loss": 0.79971743, "learning_rate": 1.8546642772872957e-06, "loss": 0.82128024, "num_input_tokens_seen": 192292840, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7734375, "step": 8939, "time_per_iteration": 2.4224894046783447 }, { "auxiliary_loss_clip": 0.01043595, "auxiliary_loss_mlp": 0.01009461, "balance_loss_clip": 1.0081439, "balance_loss_mlp": 1.01951337, "epoch": 0.5375018788516459, "flos": 67256018703360.0, "grad_norm": 0.7086045794996787, "language_loss": 0.52486539, "learning_rate": 1.8542758480241589e-06, "loss": 0.54539591, "num_input_tokens_seen": 192358240, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.24023438, "step": 8940, "time_per_iteration": 3.105286121368408 }, { "auxiliary_loss_clip": 0.01115191, "auxiliary_loss_mlp": 0.01030427, "balance_loss_clip": 1.0181129, "balance_loss_mlp": 1.0441432, "epoch": 0.5375620021043138, "flos": 18114168804480.0, "grad_norm": 3.287279525374259, "language_loss": 0.71890688, "learning_rate": 1.8538874242868965e-06, "loss": 0.74036312, "num_input_tokens_seen": 192377370, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7109375, "step": 8941, "time_per_iteration": 2.492457151412964 }, { "auxiliary_loss_clip": 0.01116084, "auxiliary_loss_mlp": 0.01028944, "balance_loss_clip": 1.01675475, "balance_loss_mlp": 1.04540634, "epoch": 0.5376221253569818, "flos": 23149383275520.0, "grad_norm": 1.9382147584962028, "language_loss": 0.79518306, "learning_rate": 1.853499006090237e-06, "loss": 0.81663334, "num_input_tokens_seen": 192396450, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.70703125, "step": 8942, "time_per_iteration": 2.5178020000457764 }, { "auxiliary_loss_clip": 0.01121296, "auxiliary_loss_mlp": 0.01034681, "balance_loss_clip": 1.02133012, "balance_loss_mlp": 1.04563355, "epoch": 0.5376822486096497, "flos": 29972302663680.0, "grad_norm": 2.0673255492648033, "language_loss": 0.70416582, "learning_rate": 1.853110593448911e-06, "loss": 0.72572559, "num_input_tokens_seen": 192417390, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7578125, "step": 8943, "time_per_iteration": 2.5661802291870117 }, { "auxiliary_loss_clip": 0.01043393, "auxiliary_loss_mlp": 0.01001418, "balance_loss_clip": 0.99998802, "balance_loss_mlp": 1.01902771, "epoch": 0.5377423718623178, "flos": 54168950874240.0, "grad_norm": 0.8204781332863006, "language_loss": 0.5967859, "learning_rate": 1.852722186377645e-06, "loss": 0.61723399, "num_input_tokens_seen": 192478060, "router_z_loss_clip": 0.01428223, "router_z_loss_mlp": 0.24414062, "step": 8944, "time_per_iteration": 3.1200432777404785 }, { "auxiliary_loss_clip": 0.01122192, "auxiliary_loss_mlp": 0.01034492, "balance_loss_clip": 1.02050257, "balance_loss_mlp": 1.04421508, "epoch": 0.5378024951149857, "flos": 23257079228160.0, "grad_norm": 2.16506433042139, "language_loss": 0.78444624, "learning_rate": 1.852333784891169e-06, "loss": 0.80601305, "num_input_tokens_seen": 192495985, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.78125, "step": 8945, "time_per_iteration": 2.5085721015930176 }, { "auxiliary_loss_clip": 0.01116906, "auxiliary_loss_mlp": 0.01034421, "balance_loss_clip": 1.0217247, "balance_loss_mlp": 1.0430634, "epoch": 0.5378626183676537, "flos": 24024095274240.0, "grad_norm": 2.248837154687917, "language_loss": 0.68725997, "learning_rate": 1.8519453890042112e-06, "loss": 0.70877326, "num_input_tokens_seen": 192515445, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73828125, "step": 8946, "time_per_iteration": 2.550083875656128 }, { "auxiliary_loss_clip": 0.01113813, "auxiliary_loss_mlp": 0.01039061, "balance_loss_clip": 1.02645433, "balance_loss_mlp": 1.04208326, "epoch": 0.5379227416203216, "flos": 27161789973120.0, "grad_norm": 1.8637232334088638, "language_loss": 0.7694419, "learning_rate": 1.851556998731498e-06, "loss": 0.79097062, "num_input_tokens_seen": 192536530, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 8947, "time_per_iteration": 2.537663221359253 }, { "auxiliary_loss_clip": 0.01116008, "auxiliary_loss_mlp": 0.01031805, "balance_loss_clip": 1.0197413, "balance_loss_mlp": 1.04425621, "epoch": 0.5379828648729896, "flos": 24681619687680.0, "grad_norm": 1.8184909022460525, "language_loss": 0.60500062, "learning_rate": 1.8511686140877592e-06, "loss": 0.62647873, "num_input_tokens_seen": 192556075, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.71875, "step": 8948, "time_per_iteration": 2.543187379837036 }, { "auxiliary_loss_clip": 0.01117052, "auxiliary_loss_mlp": 0.01032256, "balance_loss_clip": 1.02023959, "balance_loss_mlp": 1.04446852, "epoch": 0.5380429881256577, "flos": 22523280284160.0, "grad_norm": 1.67129837165779, "language_loss": 0.79277951, "learning_rate": 1.8507802350877205e-06, "loss": 0.81427264, "num_input_tokens_seen": 192575535, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7265625, "step": 8949, "time_per_iteration": 2.514716625213623 }, { "auxiliary_loss_clip": 0.01114547, "auxiliary_loss_mlp": 0.01032147, "balance_loss_clip": 1.01954067, "balance_loss_mlp": 1.04412889, "epoch": 0.5381031113783256, "flos": 26979543342720.0, "grad_norm": 2.47141192224899, "language_loss": 0.78035438, "learning_rate": 1.850391861746111e-06, "loss": 0.80182129, "num_input_tokens_seen": 192594490, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 8950, "time_per_iteration": 2.546347141265869 }, { "auxiliary_loss_clip": 0.01114879, "auxiliary_loss_mlp": 0.01034574, "balance_loss_clip": 1.02228355, "balance_loss_mlp": 1.04424381, "epoch": 0.5381632346309936, "flos": 24754087376640.0, "grad_norm": 2.7799862926955665, "language_loss": 0.72841763, "learning_rate": 1.8500034940776573e-06, "loss": 0.74991214, "num_input_tokens_seen": 192615650, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 8951, "time_per_iteration": 2.579012632369995 }, { "auxiliary_loss_clip": 0.01115545, "auxiliary_loss_mlp": 0.01029358, "balance_loss_clip": 1.01649547, "balance_loss_mlp": 1.0419029, "epoch": 0.5382233578836615, "flos": 15560058372480.0, "grad_norm": 1.9050359225349431, "language_loss": 0.75485492, "learning_rate": 1.849615132097085e-06, "loss": 0.77630395, "num_input_tokens_seen": 192633840, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 8952, "time_per_iteration": 2.4798030853271484 }, { "auxiliary_loss_clip": 0.01115166, "auxiliary_loss_mlp": 0.01033257, "balance_loss_clip": 1.01998854, "balance_loss_mlp": 1.04296792, "epoch": 0.5382834811363295, "flos": 25084501608960.0, "grad_norm": 1.5733112312713924, "language_loss": 0.79923272, "learning_rate": 1.8492267758191228e-06, "loss": 0.82071698, "num_input_tokens_seen": 192655890, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 8953, "time_per_iteration": 2.5341644287109375 }, { "auxiliary_loss_clip": 0.01112307, "auxiliary_loss_mlp": 0.0103049, "balance_loss_clip": 1.01763964, "balance_loss_mlp": 1.04227626, "epoch": 0.5383436043889974, "flos": 13297901685120.0, "grad_norm": 1.9543703233965743, "language_loss": 0.80586648, "learning_rate": 1.8488384252584964e-06, "loss": 0.82729447, "num_input_tokens_seen": 192673025, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 8954, "time_per_iteration": 2.4511916637420654 }, { "auxiliary_loss_clip": 0.01116349, "auxiliary_loss_mlp": 0.01028656, "balance_loss_clip": 1.0157342, "balance_loss_mlp": 1.0438596, "epoch": 0.5384037276416654, "flos": 23039388852480.0, "grad_norm": 2.4297211472828097, "language_loss": 0.76189148, "learning_rate": 1.8484500804299318e-06, "loss": 0.78334159, "num_input_tokens_seen": 192692190, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 8955, "time_per_iteration": 2.4977145195007324 }, { "auxiliary_loss_clip": 0.01115511, "auxiliary_loss_mlp": 0.01035958, "balance_loss_clip": 1.02324486, "balance_loss_mlp": 1.04430068, "epoch": 0.5384638508943334, "flos": 20631147552000.0, "grad_norm": 1.8791227716132517, "language_loss": 0.78302646, "learning_rate": 1.8480617413481557e-06, "loss": 0.80454111, "num_input_tokens_seen": 192710380, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 8956, "time_per_iteration": 2.500532865524292 }, { "auxiliary_loss_clip": 0.01041493, "auxiliary_loss_mlp": 0.01002537, "balance_loss_clip": 1.00095201, "balance_loss_mlp": 1.01692533, "epoch": 0.5385239741470014, "flos": 66737683491840.0, "grad_norm": 0.85580488266657, "language_loss": 0.63494486, "learning_rate": 1.8476734080278932e-06, "loss": 0.6553852, "num_input_tokens_seen": 192768995, "router_z_loss_clip": 0.01586914, "router_z_loss_mlp": 0.24609375, "step": 8957, "time_per_iteration": 3.02303147315979 }, { "auxiliary_loss_clip": 0.01041078, "auxiliary_loss_mlp": 0.01001556, "balance_loss_clip": 0.99999446, "balance_loss_mlp": 1.01680636, "epoch": 0.5385840973996693, "flos": 64716058229760.0, "grad_norm": 0.7191779977586009, "language_loss": 0.51591015, "learning_rate": 1.8472850804838705e-06, "loss": 0.53633642, "num_input_tokens_seen": 192825585, "router_z_loss_clip": 0.01556396, "router_z_loss_mlp": 0.2421875, "step": 8958, "time_per_iteration": 3.13645339012146 }, { "auxiliary_loss_clip": 0.01120594, "auxiliary_loss_mlp": 0.01032892, "balance_loss_clip": 1.01889682, "balance_loss_mlp": 1.04612136, "epoch": 0.5386442206523373, "flos": 26141783460480.0, "grad_norm": 2.1433705072796623, "language_loss": 0.77270389, "learning_rate": 1.8468967587308128e-06, "loss": 0.79423869, "num_input_tokens_seen": 192847335, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7421875, "step": 8959, "time_per_iteration": 2.5167927742004395 }, { "auxiliary_loss_clip": 0.01116363, "auxiliary_loss_mlp": 0.01030296, "balance_loss_clip": 1.01751685, "balance_loss_mlp": 1.04230332, "epoch": 0.5387043439050052, "flos": 18251849635200.0, "grad_norm": 2.0979092147632676, "language_loss": 0.83984816, "learning_rate": 1.8465084427834455e-06, "loss": 0.86131477, "num_input_tokens_seen": 192862205, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7421875, "step": 8960, "time_per_iteration": 2.4654674530029297 }, { "auxiliary_loss_clip": 0.01117223, "auxiliary_loss_mlp": 0.01030486, "balance_loss_clip": 1.01812387, "balance_loss_mlp": 1.04449344, "epoch": 0.5387644671576732, "flos": 29788296266880.0, "grad_norm": 1.5872933234779472, "language_loss": 0.78624904, "learning_rate": 1.8461201326564933e-06, "loss": 0.80772614, "num_input_tokens_seen": 192883695, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7265625, "step": 8961, "time_per_iteration": 2.5528974533081055 }, { "auxiliary_loss_clip": 0.0111541, "auxiliary_loss_mlp": 0.01029821, "balance_loss_clip": 1.01667881, "balance_loss_mlp": 1.04318726, "epoch": 0.5388245904103413, "flos": 22374466237440.0, "grad_norm": 3.431973037394152, "language_loss": 0.84348047, "learning_rate": 1.845731828364681e-06, "loss": 0.86493284, "num_input_tokens_seen": 192900190, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 8962, "time_per_iteration": 2.5103280544281006 }, { "auxiliary_loss_clip": 0.01040037, "auxiliary_loss_mlp": 0.01007188, "balance_loss_clip": 1.0057162, "balance_loss_mlp": 1.01571894, "epoch": 0.5388847136630092, "flos": 69807794751360.0, "grad_norm": 0.7381636492423786, "language_loss": 0.54208583, "learning_rate": 1.8453435299227333e-06, "loss": 0.56255805, "num_input_tokens_seen": 192958675, "router_z_loss_clip": 0.01470947, "router_z_loss_mlp": 0.24316406, "step": 8963, "time_per_iteration": 3.0085015296936035 }, { "auxiliary_loss_clip": 0.01039665, "auxiliary_loss_mlp": 0.01002477, "balance_loss_clip": 1.00110579, "balance_loss_mlp": 1.01555264, "epoch": 0.5389448369156772, "flos": 69822303845760.0, "grad_norm": 0.8039031220282651, "language_loss": 0.63336778, "learning_rate": 1.8449552373453744e-06, "loss": 0.65378928, "num_input_tokens_seen": 193033135, "router_z_loss_clip": 0.01373291, "router_z_loss_mlp": 0.24121094, "step": 8964, "time_per_iteration": 3.2099177837371826 }, { "auxiliary_loss_clip": 0.01120261, "auxiliary_loss_mlp": 0.01031117, "balance_loss_clip": 1.01817703, "balance_loss_mlp": 1.04382837, "epoch": 0.5390049601683451, "flos": 31722444933120.0, "grad_norm": 1.5657051812960108, "language_loss": 0.6995858, "learning_rate": 1.8445669506473287e-06, "loss": 0.72109962, "num_input_tokens_seen": 193055570, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.765625, "step": 8965, "time_per_iteration": 3.9091827869415283 }, { "auxiliary_loss_clip": 0.01122031, "auxiliary_loss_mlp": 0.01034264, "balance_loss_clip": 1.02043521, "balance_loss_mlp": 1.04600954, "epoch": 0.5390650834210131, "flos": 18113486446080.0, "grad_norm": 2.52518174858001, "language_loss": 0.82154983, "learning_rate": 1.8441786698433192e-06, "loss": 0.84311277, "num_input_tokens_seen": 193073120, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76171875, "step": 8966, "time_per_iteration": 4.005359172821045 }, { "auxiliary_loss_clip": 0.0111509, "auxiliary_loss_mlp": 0.01030971, "balance_loss_clip": 1.01757181, "balance_loss_mlp": 1.04321587, "epoch": 0.539125206673681, "flos": 17416711445760.0, "grad_norm": 1.9461331691390682, "language_loss": 0.72068954, "learning_rate": 1.8437903949480706e-06, "loss": 0.74215019, "num_input_tokens_seen": 193090105, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 8967, "time_per_iteration": 3.9266467094421387 }, { "auxiliary_loss_clip": 0.01113113, "auxiliary_loss_mlp": 0.01030597, "balance_loss_clip": 1.01844382, "balance_loss_mlp": 1.04092717, "epoch": 0.539185329926349, "flos": 22198935450240.0, "grad_norm": 4.371629999057618, "language_loss": 0.81905603, "learning_rate": 1.8434021259763065e-06, "loss": 0.8404932, "num_input_tokens_seen": 193109325, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.72265625, "step": 8968, "time_per_iteration": 2.522430896759033 }, { "auxiliary_loss_clip": 0.01117219, "auxiliary_loss_mlp": 0.01033785, "balance_loss_clip": 1.01986718, "balance_loss_mlp": 1.04338408, "epoch": 0.539245453179017, "flos": 21434397442560.0, "grad_norm": 2.7448198677387587, "language_loss": 0.73939312, "learning_rate": 1.8430138629427484e-06, "loss": 0.76090318, "num_input_tokens_seen": 193130595, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.73828125, "step": 8969, "time_per_iteration": 3.8890225887298584 }, { "auxiliary_loss_clip": 0.01118235, "auxiliary_loss_mlp": 0.01027928, "balance_loss_clip": 1.01423073, "balance_loss_mlp": 1.04162598, "epoch": 0.539305576431685, "flos": 20735000749440.0, "grad_norm": 2.175365061544869, "language_loss": 0.8230983, "learning_rate": 1.8426256058621205e-06, "loss": 0.84455991, "num_input_tokens_seen": 193148930, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 8970, "time_per_iteration": 2.5054476261138916 }, { "auxiliary_loss_clip": 0.01114209, "auxiliary_loss_mlp": 0.01031591, "balance_loss_clip": 1.01902056, "balance_loss_mlp": 1.0428741, "epoch": 0.5393656996843529, "flos": 30920452018560.0, "grad_norm": 1.3989770443887715, "language_loss": 0.75613093, "learning_rate": 1.842237354749146e-06, "loss": 0.77758884, "num_input_tokens_seen": 193170140, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 8971, "time_per_iteration": 2.5537145137786865 }, { "auxiliary_loss_clip": 0.01041499, "auxiliary_loss_mlp": 0.01000597, "balance_loss_clip": 0.99922031, "balance_loss_mlp": 1.01728356, "epoch": 0.5394258229370209, "flos": 50317781351040.0, "grad_norm": 0.8835716026419879, "language_loss": 0.60421944, "learning_rate": 1.8418491096185465e-06, "loss": 0.62464035, "num_input_tokens_seen": 193227235, "router_z_loss_clip": 0.01379395, "router_z_loss_mlp": 0.2421875, "step": 8972, "time_per_iteration": 3.1177449226379395 }, { "auxiliary_loss_clip": 0.01114387, "auxiliary_loss_mlp": 0.01035975, "balance_loss_clip": 1.02261806, "balance_loss_mlp": 1.04194438, "epoch": 0.5394859461896888, "flos": 25411935012480.0, "grad_norm": 1.4529327612770093, "language_loss": 0.78650129, "learning_rate": 1.841460870485045e-06, "loss": 0.80800498, "num_input_tokens_seen": 193248435, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 8973, "time_per_iteration": 2.525822639465332 }, { "auxiliary_loss_clip": 0.01120761, "auxiliary_loss_mlp": 0.01039243, "balance_loss_clip": 1.02399588, "balance_loss_mlp": 1.0421617, "epoch": 0.5395460694423568, "flos": 25478476957440.0, "grad_norm": 2.7278720911742886, "language_loss": 0.74048704, "learning_rate": 1.8410726373633623e-06, "loss": 0.76208705, "num_input_tokens_seen": 193267490, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.78515625, "step": 8974, "time_per_iteration": 2.520037889480591 }, { "auxiliary_loss_clip": 0.01040759, "auxiliary_loss_mlp": 0.01003699, "balance_loss_clip": 1.00225663, "balance_loss_mlp": 1.0167259, "epoch": 0.5396061926950249, "flos": 53249493507840.0, "grad_norm": 0.7228188517502906, "language_loss": 0.51071155, "learning_rate": 1.8406844102682215e-06, "loss": 0.53115606, "num_input_tokens_seen": 193326050, "router_z_loss_clip": 0.0144043, "router_z_loss_mlp": 0.24023438, "step": 8975, "time_per_iteration": 3.078596591949463 }, { "auxiliary_loss_clip": 0.01115131, "auxiliary_loss_mlp": 0.01038129, "balance_loss_clip": 1.0243963, "balance_loss_mlp": 1.04320002, "epoch": 0.5396663159476928, "flos": 26725080418560.0, "grad_norm": 1.5881354078731686, "language_loss": 0.72288615, "learning_rate": 1.840296189214344e-06, "loss": 0.74441874, "num_input_tokens_seen": 193348785, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71875, "step": 8976, "time_per_iteration": 2.5577213764190674 }, { "auxiliary_loss_clip": 0.01115802, "auxiliary_loss_mlp": 0.01034608, "balance_loss_clip": 1.02133405, "balance_loss_mlp": 1.04251504, "epoch": 0.5397264392003608, "flos": 23253380127360.0, "grad_norm": 2.0482456387428343, "language_loss": 0.69686484, "learning_rate": 1.8399079742164509e-06, "loss": 0.71836889, "num_input_tokens_seen": 193367080, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 8977, "time_per_iteration": 2.5068633556365967 }, { "auxiliary_loss_clip": 0.01116695, "auxiliary_loss_mlp": 0.01031614, "balance_loss_clip": 1.01847112, "balance_loss_mlp": 1.04309773, "epoch": 0.5397865624530287, "flos": 18294188791680.0, "grad_norm": 1.7608013696399487, "language_loss": 0.72285229, "learning_rate": 1.8395197652892636e-06, "loss": 0.74433541, "num_input_tokens_seen": 193383715, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 8978, "time_per_iteration": 2.464766502380371 }, { "auxiliary_loss_clip": 0.01121641, "auxiliary_loss_mlp": 0.01036342, "balance_loss_clip": 1.02151251, "balance_loss_mlp": 1.04371405, "epoch": 0.5398466857056967, "flos": 15297514888320.0, "grad_norm": 1.9388287822242696, "language_loss": 0.74293792, "learning_rate": 1.8391315624475028e-06, "loss": 0.76451778, "num_input_tokens_seen": 193400560, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.78125, "step": 8979, "time_per_iteration": 2.490032911300659 }, { "auxiliary_loss_clip": 0.01121918, "auxiliary_loss_mlp": 0.01048499, "balance_loss_clip": 1.0343188, "balance_loss_mlp": 1.04426765, "epoch": 0.5399068089583646, "flos": 17821748183040.0, "grad_norm": 1.9318548963168263, "language_loss": 0.76908362, "learning_rate": 1.8387433657058892e-06, "loss": 0.79078782, "num_input_tokens_seen": 193418680, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.77734375, "step": 8980, "time_per_iteration": 2.479518413543701 }, { "auxiliary_loss_clip": 0.01116096, "auxiliary_loss_mlp": 0.01034663, "balance_loss_clip": 1.02185345, "balance_loss_mlp": 1.04193425, "epoch": 0.5399669322110326, "flos": 27381635164800.0, "grad_norm": 1.7457125947386196, "language_loss": 0.81932902, "learning_rate": 1.8383551750791431e-06, "loss": 0.84083664, "num_input_tokens_seen": 193439310, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7421875, "step": 8981, "time_per_iteration": 2.545738458633423 }, { "auxiliary_loss_clip": 0.0111785, "auxiliary_loss_mlp": 0.01031689, "balance_loss_clip": 1.01800966, "balance_loss_mlp": 1.04190028, "epoch": 0.5400270554637006, "flos": 20449116403200.0, "grad_norm": 1.781383031353321, "language_loss": 0.66917086, "learning_rate": 1.8379669905819857e-06, "loss": 0.69066632, "num_input_tokens_seen": 193458115, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 8982, "time_per_iteration": 2.530233860015869 }, { "auxiliary_loss_clip": 0.01117223, "auxiliary_loss_mlp": 0.01037313, "balance_loss_clip": 1.02524281, "balance_loss_mlp": 1.04400051, "epoch": 0.5400871787163686, "flos": 21689578638720.0, "grad_norm": 1.6614696681473735, "language_loss": 0.82801247, "learning_rate": 1.8375788122291358e-06, "loss": 0.84955776, "num_input_tokens_seen": 193477365, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.734375, "step": 8983, "time_per_iteration": 2.545880079269409 }, { "auxiliary_loss_clip": 0.01114841, "auxiliary_loss_mlp": 0.01037616, "balance_loss_clip": 1.02348959, "balance_loss_mlp": 1.0414288, "epoch": 0.5401473019690365, "flos": 19204739585280.0, "grad_norm": 1.9348969639167775, "language_loss": 0.7060473, "learning_rate": 1.8371906400353138e-06, "loss": 0.72757185, "num_input_tokens_seen": 193495595, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.734375, "step": 8984, "time_per_iteration": 2.4727253913879395 }, { "auxiliary_loss_clip": 0.0112039, "auxiliary_loss_mlp": 0.01035604, "balance_loss_clip": 1.02101851, "balance_loss_mlp": 1.04374909, "epoch": 0.5402074252217045, "flos": 20627376624000.0, "grad_norm": 1.6437205228573843, "language_loss": 0.79799682, "learning_rate": 1.8368024740152386e-06, "loss": 0.81955671, "num_input_tokens_seen": 193514035, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.765625, "step": 8985, "time_per_iteration": 2.4901938438415527 }, { "auxiliary_loss_clip": 0.01110287, "auxiliary_loss_mlp": 0.01030325, "balance_loss_clip": 1.0174861, "balance_loss_mlp": 1.04118133, "epoch": 0.5402675484743724, "flos": 24973465691520.0, "grad_norm": 1.5553678142988012, "language_loss": 0.78981161, "learning_rate": 1.83641431418363e-06, "loss": 0.81121767, "num_input_tokens_seen": 193535445, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 8986, "time_per_iteration": 2.53442645072937 }, { "auxiliary_loss_clip": 0.01114067, "auxiliary_loss_mlp": 0.01031819, "balance_loss_clip": 1.01855111, "balance_loss_mlp": 1.04095054, "epoch": 0.5403276717270404, "flos": 19459022941440.0, "grad_norm": 1.7872780353702846, "language_loss": 0.76656955, "learning_rate": 1.8360261605552075e-06, "loss": 0.78802848, "num_input_tokens_seen": 193554780, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 8987, "time_per_iteration": 2.496159553527832 }, { "auxiliary_loss_clip": 0.01114, "auxiliary_loss_mlp": 0.01030144, "balance_loss_clip": 1.01694202, "balance_loss_mlp": 1.04052234, "epoch": 0.5403877949797083, "flos": 18442140912000.0, "grad_norm": 1.7747759985064036, "language_loss": 0.70861906, "learning_rate": 1.8356380131446887e-06, "loss": 0.73006058, "num_input_tokens_seen": 193573580, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 8988, "time_per_iteration": 2.5013933181762695 }, { "auxiliary_loss_clip": 0.01116119, "auxiliary_loss_mlp": 0.01037876, "balance_loss_clip": 1.0241313, "balance_loss_mlp": 1.04193521, "epoch": 0.5404479182323764, "flos": 28292868316800.0, "grad_norm": 2.392641606962018, "language_loss": 0.68265146, "learning_rate": 1.8352498719667934e-06, "loss": 0.70419139, "num_input_tokens_seen": 193590490, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 8989, "time_per_iteration": 2.561068534851074 }, { "auxiliary_loss_clip": 0.01116645, "auxiliary_loss_mlp": 0.01039506, "balance_loss_clip": 1.02576709, "balance_loss_mlp": 1.04230022, "epoch": 0.5405080414850444, "flos": 23367325046400.0, "grad_norm": 2.2726673300128093, "language_loss": 0.77600533, "learning_rate": 1.8348617370362399e-06, "loss": 0.79756683, "num_input_tokens_seen": 193609900, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 8990, "time_per_iteration": 2.5088119506835938 }, { "auxiliary_loss_clip": 0.0111349, "auxiliary_loss_mlp": 0.01027722, "balance_loss_clip": 1.0158782, "balance_loss_mlp": 1.04073644, "epoch": 0.5405681647377123, "flos": 21106425335040.0, "grad_norm": 3.5747915968987325, "language_loss": 0.69607508, "learning_rate": 1.834473608367745e-06, "loss": 0.71748722, "num_input_tokens_seen": 193629775, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.7265625, "step": 8991, "time_per_iteration": 2.4906227588653564 }, { "auxiliary_loss_clip": 0.01116392, "auxiliary_loss_mlp": 0.0103594, "balance_loss_clip": 1.0222013, "balance_loss_mlp": 1.04255772, "epoch": 0.5406282879903803, "flos": 20449188230400.0, "grad_norm": 1.846581898361263, "language_loss": 0.76245135, "learning_rate": 1.8340854859760277e-06, "loss": 0.78397471, "num_input_tokens_seen": 193648070, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.73828125, "step": 8992, "time_per_iteration": 2.491255044937134 }, { "auxiliary_loss_clip": 0.0111782, "auxiliary_loss_mlp": 0.01032589, "balance_loss_clip": 1.01926136, "balance_loss_mlp": 1.04206944, "epoch": 0.5406884112430482, "flos": 14209493973120.0, "grad_norm": 3.3892792437763126, "language_loss": 0.76627117, "learning_rate": 1.8336973698758056e-06, "loss": 0.78777528, "num_input_tokens_seen": 193665060, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7578125, "step": 8993, "time_per_iteration": 2.4516823291778564 }, { "auxiliary_loss_clip": 0.01111174, "auxiliary_loss_mlp": 0.01027805, "balance_loss_clip": 1.01522267, "balance_loss_mlp": 1.04054666, "epoch": 0.5407485344957162, "flos": 23875568536320.0, "grad_norm": 2.050174651161395, "language_loss": 0.71054041, "learning_rate": 1.8333092600817959e-06, "loss": 0.73193026, "num_input_tokens_seen": 193683620, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 8994, "time_per_iteration": 2.5127058029174805 }, { "auxiliary_loss_clip": 0.01116232, "auxiliary_loss_mlp": 0.01029058, "balance_loss_clip": 1.01492, "balance_loss_mlp": 1.04183364, "epoch": 0.5408086577483842, "flos": 23148485435520.0, "grad_norm": 3.24793162356675, "language_loss": 0.7514379, "learning_rate": 1.8329211566087157e-06, "loss": 0.77289081, "num_input_tokens_seen": 193702990, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7421875, "step": 8995, "time_per_iteration": 2.523886203765869 }, { "auxiliary_loss_clip": 0.01112435, "auxiliary_loss_mlp": 0.01034209, "balance_loss_clip": 1.02191889, "balance_loss_mlp": 1.04109967, "epoch": 0.5408687810010522, "flos": 18771046773120.0, "grad_norm": 1.9968809881411413, "language_loss": 0.73673427, "learning_rate": 1.832533059471282e-06, "loss": 0.75820065, "num_input_tokens_seen": 193721785, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 8996, "time_per_iteration": 2.4846396446228027 }, { "auxiliary_loss_clip": 0.0111267, "auxiliary_loss_mlp": 0.0103944, "balance_loss_clip": 1.02661312, "balance_loss_mlp": 1.04254019, "epoch": 0.5409289042537201, "flos": 13881557779200.0, "grad_norm": 2.0331737226322466, "language_loss": 0.73328173, "learning_rate": 1.8321449686842115e-06, "loss": 0.75480282, "num_input_tokens_seen": 193740315, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 8997, "time_per_iteration": 2.4690542221069336 }, { "auxiliary_loss_clip": 0.01114612, "auxiliary_loss_mlp": 0.01032585, "balance_loss_clip": 1.01885176, "balance_loss_mlp": 1.04161596, "epoch": 0.5409890275063881, "flos": 14465357527680.0, "grad_norm": 2.20256775996143, "language_loss": 0.7213521, "learning_rate": 1.8317568842622207e-06, "loss": 0.74282402, "num_input_tokens_seen": 193757580, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 8998, "time_per_iteration": 2.456901788711548 }, { "auxiliary_loss_clip": 0.01113244, "auxiliary_loss_mlp": 0.01038013, "balance_loss_clip": 1.02476931, "balance_loss_mlp": 1.04083776, "epoch": 0.541049150759056, "flos": 48977449349760.0, "grad_norm": 1.6179049483543897, "language_loss": 0.70549375, "learning_rate": 1.8313688062200256e-06, "loss": 0.72700626, "num_input_tokens_seen": 193780965, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 8999, "time_per_iteration": 2.730201244354248 }, { "auxiliary_loss_clip": 0.01113981, "auxiliary_loss_mlp": 0.01034033, "balance_loss_clip": 1.02056789, "balance_loss_mlp": 1.04298377, "epoch": 0.541109274011724, "flos": 18147601388160.0, "grad_norm": 2.6936484427437724, "language_loss": 0.81359065, "learning_rate": 1.8309807345723422e-06, "loss": 0.83507085, "num_input_tokens_seen": 193797855, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 9000, "time_per_iteration": 2.4652605056762695 }, { "auxiliary_loss_clip": 0.01113783, "auxiliary_loss_mlp": 0.01033312, "balance_loss_clip": 1.0199244, "balance_loss_mlp": 1.0421052, "epoch": 0.541169397264392, "flos": 20522553759360.0, "grad_norm": 1.5655952488154659, "language_loss": 0.73045421, "learning_rate": 1.8305926693338863e-06, "loss": 0.75192523, "num_input_tokens_seen": 193817375, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 9001, "time_per_iteration": 2.5057809352874756 }, { "auxiliary_loss_clip": 0.01120352, "auxiliary_loss_mlp": 0.01038384, "balance_loss_clip": 1.02449048, "balance_loss_mlp": 1.04390192, "epoch": 0.54122952051706, "flos": 20044043752320.0, "grad_norm": 2.547816299747835, "language_loss": 0.85225546, "learning_rate": 1.8302046105193734e-06, "loss": 0.87384284, "num_input_tokens_seen": 193832205, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.765625, "step": 9002, "time_per_iteration": 2.4688374996185303 }, { "auxiliary_loss_clip": 0.01112962, "auxiliary_loss_mlp": 0.01035399, "balance_loss_clip": 1.02351952, "balance_loss_mlp": 1.04274845, "epoch": 0.541289643769728, "flos": 19062246332160.0, "grad_norm": 2.0545766737557773, "language_loss": 0.78204656, "learning_rate": 1.8298165581435183e-06, "loss": 0.80353016, "num_input_tokens_seen": 193849830, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.703125, "step": 9003, "time_per_iteration": 2.4871826171875 }, { "auxiliary_loss_clip": 0.01113243, "auxiliary_loss_mlp": 0.01031283, "balance_loss_clip": 1.01796103, "balance_loss_mlp": 1.0415976, "epoch": 0.5413497670223959, "flos": 22382295402240.0, "grad_norm": 2.0146428414399518, "language_loss": 0.69318539, "learning_rate": 1.8294285122210372e-06, "loss": 0.71463072, "num_input_tokens_seen": 193869945, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 9004, "time_per_iteration": 2.516348123550415 }, { "auxiliary_loss_clip": 0.01043145, "auxiliary_loss_mlp": 0.01003643, "balance_loss_clip": 1.0021944, "balance_loss_mlp": 1.01905513, "epoch": 0.5414098902750639, "flos": 70031734093440.0, "grad_norm": 1.0055377946636048, "language_loss": 0.59184253, "learning_rate": 1.8290404727666434e-06, "loss": 0.61231041, "num_input_tokens_seen": 193930860, "router_z_loss_clip": 0.01446533, "router_z_loss_mlp": 0.24121094, "step": 9005, "time_per_iteration": 3.2039148807525635 }, { "auxiliary_loss_clip": 0.01121455, "auxiliary_loss_mlp": 0.0103434, "balance_loss_clip": 1.02203727, "balance_loss_mlp": 1.04661345, "epoch": 0.5414700135277318, "flos": 21798962530560.0, "grad_norm": 1.9650408715231544, "language_loss": 0.78066683, "learning_rate": 1.8286524397950517e-06, "loss": 0.80222476, "num_input_tokens_seen": 193949075, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.75, "step": 9006, "time_per_iteration": 3.998430013656616 }, { "auxiliary_loss_clip": 0.01113512, "auxiliary_loss_mlp": 0.010387, "balance_loss_clip": 1.02701712, "balance_loss_mlp": 1.04226065, "epoch": 0.5415301367803999, "flos": 16907929251840.0, "grad_norm": 1.6970973700874337, "language_loss": 0.83325636, "learning_rate": 1.8282644133209777e-06, "loss": 0.85477841, "num_input_tokens_seen": 193967630, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.7109375, "step": 9007, "time_per_iteration": 2.4837162494659424 }, { "auxiliary_loss_clip": 0.01117679, "auxiliary_loss_mlp": 0.01034143, "balance_loss_clip": 1.02010632, "balance_loss_mlp": 1.04402673, "epoch": 0.5415902600330678, "flos": 25704176065920.0, "grad_norm": 2.0133987300557523, "language_loss": 0.6689961, "learning_rate": 1.8278763933591334e-06, "loss": 0.69051433, "num_input_tokens_seen": 193988730, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.734375, "step": 9008, "time_per_iteration": 4.011279582977295 }, { "auxiliary_loss_clip": 0.01120409, "auxiliary_loss_mlp": 0.0103611, "balance_loss_clip": 1.02141118, "balance_loss_mlp": 1.04378307, "epoch": 0.5416503832857358, "flos": 19208151377280.0, "grad_norm": 2.1942136461970043, "language_loss": 0.73642927, "learning_rate": 1.827488379924234e-06, "loss": 0.75799441, "num_input_tokens_seen": 194005160, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.765625, "step": 9009, "time_per_iteration": 3.9122354984283447 }, { "auxiliary_loss_clip": 0.01118566, "auxiliary_loss_mlp": 0.01034839, "balance_loss_clip": 1.02041519, "balance_loss_mlp": 1.04367805, "epoch": 0.5417105065384037, "flos": 12713706887040.0, "grad_norm": 2.3255793557675215, "language_loss": 0.87331879, "learning_rate": 1.8271003730309923e-06, "loss": 0.89485282, "num_input_tokens_seen": 194021700, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.75, "step": 9010, "time_per_iteration": 3.9431838989257812 }, { "auxiliary_loss_clip": 0.01116607, "auxiliary_loss_mlp": 0.01036619, "balance_loss_clip": 1.02338088, "balance_loss_mlp": 1.04351413, "epoch": 0.5417706297910717, "flos": 30335933998080.0, "grad_norm": 2.079990072076888, "language_loss": 0.65164411, "learning_rate": 1.826712372694122e-06, "loss": 0.67317635, "num_input_tokens_seen": 194042620, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 9011, "time_per_iteration": 2.5313854217529297 }, { "auxiliary_loss_clip": 0.01119153, "auxiliary_loss_mlp": 0.01036086, "balance_loss_clip": 1.02293706, "balance_loss_mlp": 1.045663, "epoch": 0.5418307530437396, "flos": 29020992912000.0, "grad_norm": 2.3203418353790837, "language_loss": 0.79422909, "learning_rate": 1.8263243789283362e-06, "loss": 0.81578147, "num_input_tokens_seen": 194061800, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 9012, "time_per_iteration": 2.5692765712738037 }, { "auxiliary_loss_clip": 0.01116503, "auxiliary_loss_mlp": 0.01032033, "balance_loss_clip": 1.01850843, "balance_loss_mlp": 1.04307282, "epoch": 0.5418908762964076, "flos": 16873455173760.0, "grad_norm": 1.9230337010662646, "language_loss": 0.74283779, "learning_rate": 1.8259363917483466e-06, "loss": 0.76432317, "num_input_tokens_seen": 194079890, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 9013, "time_per_iteration": 2.4561455249786377 }, { "auxiliary_loss_clip": 0.01119815, "auxiliary_loss_mlp": 0.01033497, "balance_loss_clip": 1.02016318, "balance_loss_mlp": 1.04367781, "epoch": 0.5419509995490756, "flos": 18949702043520.0, "grad_norm": 2.1458262627207287, "language_loss": 0.71991485, "learning_rate": 1.8255484111688667e-06, "loss": 0.74144793, "num_input_tokens_seen": 194097625, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.76171875, "step": 9014, "time_per_iteration": 2.494286298751831 }, { "auxiliary_loss_clip": 0.01117855, "auxiliary_loss_mlp": 0.0103383, "balance_loss_clip": 1.02063906, "balance_loss_mlp": 1.0450002, "epoch": 0.5420111228017436, "flos": 18077719478400.0, "grad_norm": 2.2572953092792716, "language_loss": 0.80664313, "learning_rate": 1.8251604372046085e-06, "loss": 0.82815993, "num_input_tokens_seen": 194116055, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 9015, "time_per_iteration": 2.4612131118774414 }, { "auxiliary_loss_clip": 0.0111858, "auxiliary_loss_mlp": 0.01036503, "balance_loss_clip": 1.02331221, "balance_loss_mlp": 1.04244947, "epoch": 0.5420712460544116, "flos": 19061779455360.0, "grad_norm": 2.7836279302606117, "language_loss": 0.81568408, "learning_rate": 1.8247724698702843e-06, "loss": 0.83723491, "num_input_tokens_seen": 194130365, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.76171875, "step": 9016, "time_per_iteration": 2.490182399749756 }, { "auxiliary_loss_clip": 0.01116749, "auxiliary_loss_mlp": 0.01027959, "balance_loss_clip": 1.01437545, "balance_loss_mlp": 1.04400468, "epoch": 0.5421313693070795, "flos": 18187103370240.0, "grad_norm": 2.276990802604585, "language_loss": 0.81246537, "learning_rate": 1.8243845091806053e-06, "loss": 0.83391243, "num_input_tokens_seen": 194148975, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 9017, "time_per_iteration": 2.4692001342773438 }, { "auxiliary_loss_clip": 0.01114754, "auxiliary_loss_mlp": 0.01033244, "balance_loss_clip": 1.01978517, "balance_loss_mlp": 1.04344356, "epoch": 0.5421914925597475, "flos": 13005947940480.0, "grad_norm": 1.9804580308134194, "language_loss": 0.77510935, "learning_rate": 1.8239965551502837e-06, "loss": 0.79658931, "num_input_tokens_seen": 194167185, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 9018, "time_per_iteration": 2.490483045578003 }, { "auxiliary_loss_clip": 0.01116746, "auxiliary_loss_mlp": 0.01040013, "balance_loss_clip": 1.0259881, "balance_loss_mlp": 1.04061222, "epoch": 0.5422516158124154, "flos": 46758457831680.0, "grad_norm": 1.443683408025406, "language_loss": 0.66276085, "learning_rate": 1.8236086077940303e-06, "loss": 0.68432844, "num_input_tokens_seen": 194192840, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76171875, "step": 9019, "time_per_iteration": 2.730062961578369 }, { "auxiliary_loss_clip": 0.01113251, "auxiliary_loss_mlp": 0.01028329, "balance_loss_clip": 1.01644945, "balance_loss_mlp": 1.04315269, "epoch": 0.5423117390650835, "flos": 31758642864000.0, "grad_norm": 1.6052942991072978, "language_loss": 0.69863391, "learning_rate": 1.8232206671265555e-06, "loss": 0.72004974, "num_input_tokens_seen": 194213150, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.703125, "step": 9020, "time_per_iteration": 2.5745019912719727 }, { "auxiliary_loss_clip": 0.01113286, "auxiliary_loss_mlp": 0.01038828, "balance_loss_clip": 1.02607274, "balance_loss_mlp": 1.04378581, "epoch": 0.5423718623177514, "flos": 27201974313600.0, "grad_norm": 1.656049737024096, "language_loss": 0.8023172, "learning_rate": 1.8228327331625717e-06, "loss": 0.82383829, "num_input_tokens_seen": 194234665, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 9021, "time_per_iteration": 2.5637011528015137 }, { "auxiliary_loss_clip": 0.01118705, "auxiliary_loss_mlp": 0.01037716, "balance_loss_clip": 1.02406108, "balance_loss_mlp": 1.04610693, "epoch": 0.5424319855704194, "flos": 23546447193600.0, "grad_norm": 2.4174588418849905, "language_loss": 0.78923512, "learning_rate": 1.822444805916788e-06, "loss": 0.81079936, "num_input_tokens_seen": 194253790, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 9022, "time_per_iteration": 2.522888660430908 }, { "auxiliary_loss_clip": 0.01115474, "auxiliary_loss_mlp": 0.01034699, "balance_loss_clip": 1.02190173, "balance_loss_mlp": 1.04338789, "epoch": 0.5424921088230873, "flos": 26615624699520.0, "grad_norm": 1.6271048342982368, "language_loss": 0.82140982, "learning_rate": 1.822056885403915e-06, "loss": 0.8429116, "num_input_tokens_seen": 194274950, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 9023, "time_per_iteration": 2.522810220718384 }, { "auxiliary_loss_clip": 0.01115081, "auxiliary_loss_mlp": 0.01029568, "balance_loss_clip": 1.01672316, "balance_loss_mlp": 1.04275537, "epoch": 0.5425522320757553, "flos": 23586811102080.0, "grad_norm": 1.641970936650883, "language_loss": 0.7138921, "learning_rate": 1.8216689716386627e-06, "loss": 0.73533857, "num_input_tokens_seen": 194296155, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.72265625, "step": 9024, "time_per_iteration": 2.5446090698242188 }, { "auxiliary_loss_clip": 0.01116801, "auxiliary_loss_mlp": 0.01032225, "balance_loss_clip": 1.0195713, "balance_loss_mlp": 1.04271913, "epoch": 0.5426123553284232, "flos": 30592264429440.0, "grad_norm": 1.7838489055509672, "language_loss": 0.65028059, "learning_rate": 1.8212810646357405e-06, "loss": 0.67177081, "num_input_tokens_seen": 194318025, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7421875, "step": 9025, "time_per_iteration": 2.555042266845703 }, { "auxiliary_loss_clip": 0.0112061, "auxiliary_loss_mlp": 0.01034699, "balance_loss_clip": 1.02221179, "balance_loss_mlp": 1.04562366, "epoch": 0.5426724785810912, "flos": 12495118671360.0, "grad_norm": 2.0542733160905216, "language_loss": 0.73694718, "learning_rate": 1.8208931644098591e-06, "loss": 0.75850022, "num_input_tokens_seen": 194336150, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.75, "step": 9026, "time_per_iteration": 2.455667734146118 }, { "auxiliary_loss_clip": 0.01116572, "auxiliary_loss_mlp": 0.0103782, "balance_loss_clip": 1.02303839, "balance_loss_mlp": 1.04179096, "epoch": 0.5427326018337592, "flos": 26064611089920.0, "grad_norm": 1.7366347435560072, "language_loss": 0.78441328, "learning_rate": 1.8205052709757265e-06, "loss": 0.8059572, "num_input_tokens_seen": 194355980, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.75, "step": 9027, "time_per_iteration": 2.5157744884490967 }, { "auxiliary_loss_clip": 0.01040183, "auxiliary_loss_mlp": 0.01000462, "balance_loss_clip": 0.99884129, "balance_loss_mlp": 1.01622367, "epoch": 0.5427927250864272, "flos": 65984745576960.0, "grad_norm": 0.7520319828844595, "language_loss": 0.5653733, "learning_rate": 1.8201173843480515e-06, "loss": 0.58577979, "num_input_tokens_seen": 194422660, "router_z_loss_clip": 0.01623535, "router_z_loss_mlp": 0.24023438, "step": 9028, "time_per_iteration": 3.146118640899658 }, { "auxiliary_loss_clip": 0.01116051, "auxiliary_loss_mlp": 0.01027586, "balance_loss_clip": 1.01387739, "balance_loss_mlp": 1.04205155, "epoch": 0.5428528483390952, "flos": 19975382904960.0, "grad_norm": 1.9571686844206755, "language_loss": 0.77813303, "learning_rate": 1.8197295045415442e-06, "loss": 0.79956949, "num_input_tokens_seen": 194438545, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 9029, "time_per_iteration": 2.4933743476867676 }, { "auxiliary_loss_clip": 0.01116118, "auxiliary_loss_mlp": 0.01028944, "balance_loss_clip": 1.01576531, "balance_loss_mlp": 1.04475379, "epoch": 0.5429129715917631, "flos": 21832323287040.0, "grad_norm": 1.7215900277374274, "language_loss": 0.82844955, "learning_rate": 1.8193416315709112e-06, "loss": 0.84990013, "num_input_tokens_seen": 194458060, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 9030, "time_per_iteration": 2.5423107147216797 }, { "auxiliary_loss_clip": 0.0111473, "auxiliary_loss_mlp": 0.01032394, "balance_loss_clip": 1.01926947, "balance_loss_mlp": 1.04344225, "epoch": 0.5429730948444311, "flos": 27782685492480.0, "grad_norm": 1.7596268179059567, "language_loss": 0.75139141, "learning_rate": 1.8189537654508623e-06, "loss": 0.77286267, "num_input_tokens_seen": 194477405, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 9031, "time_per_iteration": 2.531402111053467 }, { "auxiliary_loss_clip": 0.01112226, "auxiliary_loss_mlp": 0.01034135, "balance_loss_clip": 1.02273262, "balance_loss_mlp": 1.04292035, "epoch": 0.543033218097099, "flos": 26760452336640.0, "grad_norm": 1.9271286583499587, "language_loss": 0.85397923, "learning_rate": 1.8185659061961045e-06, "loss": 0.8754428, "num_input_tokens_seen": 194497085, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.69140625, "step": 9032, "time_per_iteration": 2.53528094291687 }, { "auxiliary_loss_clip": 0.01120526, "auxiliary_loss_mlp": 0.01034182, "balance_loss_clip": 1.02108133, "balance_loss_mlp": 1.04532123, "epoch": 0.5430933413497671, "flos": 22675254727680.0, "grad_norm": 1.7500427226586175, "language_loss": 0.73962134, "learning_rate": 1.8181780538213457e-06, "loss": 0.76116842, "num_input_tokens_seen": 194516785, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75, "step": 9033, "time_per_iteration": 2.4894630908966064 }, { "auxiliary_loss_clip": 0.0111461, "auxiliary_loss_mlp": 0.01038956, "balance_loss_clip": 1.02482402, "balance_loss_mlp": 1.04272664, "epoch": 0.543153464602435, "flos": 24607499973120.0, "grad_norm": 1.6335611963059096, "language_loss": 0.75836051, "learning_rate": 1.8177902083412935e-06, "loss": 0.77989614, "num_input_tokens_seen": 194536475, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.71875, "step": 9034, "time_per_iteration": 2.5332539081573486 }, { "auxiliary_loss_clip": 0.01114762, "auxiliary_loss_mlp": 0.01028198, "balance_loss_clip": 1.01577616, "balance_loss_mlp": 1.04363465, "epoch": 0.543213587855103, "flos": 19025725178880.0, "grad_norm": 1.7368283027983842, "language_loss": 0.84046441, "learning_rate": 1.817402369770655e-06, "loss": 0.86189401, "num_input_tokens_seen": 194554495, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 9035, "time_per_iteration": 2.4681320190429688 }, { "auxiliary_loss_clip": 0.01040543, "auxiliary_loss_mlp": 0.01001474, "balance_loss_clip": 0.99976921, "balance_loss_mlp": 1.01642048, "epoch": 0.5432737111077709, "flos": 65686435125120.0, "grad_norm": 0.7365952320608624, "language_loss": 0.55910885, "learning_rate": 1.8170145381241364e-06, "loss": 0.57952899, "num_input_tokens_seen": 194617620, "router_z_loss_clip": 0.01708984, "router_z_loss_mlp": 0.24121094, "step": 9036, "time_per_iteration": 3.090031147003174 }, { "auxiliary_loss_clip": 0.0111728, "auxiliary_loss_mlp": 0.01031513, "balance_loss_clip": 1.0184716, "balance_loss_mlp": 1.04304266, "epoch": 0.5433338343604389, "flos": 22091670460800.0, "grad_norm": 1.593688494102926, "language_loss": 0.75500107, "learning_rate": 1.8166267134164451e-06, "loss": 0.77648902, "num_input_tokens_seen": 194637690, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 9037, "time_per_iteration": 2.493072986602783 }, { "auxiliary_loss_clip": 0.01116814, "auxiliary_loss_mlp": 0.01034819, "balance_loss_clip": 1.02175951, "balance_loss_mlp": 1.04327703, "epoch": 0.5433939576131068, "flos": 34672649616000.0, "grad_norm": 3.3779367581089943, "language_loss": 0.66864812, "learning_rate": 1.8162388956622875e-06, "loss": 0.69016439, "num_input_tokens_seen": 194659520, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 9038, "time_per_iteration": 2.6108877658843994 }, { "auxiliary_loss_clip": 0.01111714, "auxiliary_loss_mlp": 0.01035182, "balance_loss_clip": 1.02274215, "balance_loss_mlp": 1.04051089, "epoch": 0.5434540808657748, "flos": 20303355012480.0, "grad_norm": 2.312019237946731, "language_loss": 0.78227973, "learning_rate": 1.8158510848763692e-06, "loss": 0.80374867, "num_input_tokens_seen": 194677645, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7109375, "step": 9039, "time_per_iteration": 2.478921413421631 }, { "auxiliary_loss_clip": 0.01115929, "auxiliary_loss_mlp": 0.01032718, "balance_loss_clip": 1.02012932, "balance_loss_mlp": 1.04359841, "epoch": 0.5435142041184428, "flos": 23112790295040.0, "grad_norm": 1.9250689355145993, "language_loss": 0.76758623, "learning_rate": 1.8154632810733962e-06, "loss": 0.78907275, "num_input_tokens_seen": 194697400, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 9040, "time_per_iteration": 2.5200350284576416 }, { "auxiliary_loss_clip": 0.01038179, "auxiliary_loss_mlp": 0.01002367, "balance_loss_clip": 1.00095391, "balance_loss_mlp": 1.01417911, "epoch": 0.5435743273711108, "flos": 64012746954240.0, "grad_norm": 0.6617321424928633, "language_loss": 0.52462912, "learning_rate": 1.815075484268074e-06, "loss": 0.54503453, "num_input_tokens_seen": 194761205, "router_z_loss_clip": 0.01409912, "router_z_loss_mlp": 0.24023438, "step": 9041, "time_per_iteration": 3.0845930576324463 }, { "auxiliary_loss_clip": 0.01114944, "auxiliary_loss_mlp": 0.01033346, "balance_loss_clip": 1.02041149, "balance_loss_mlp": 1.04297638, "epoch": 0.5436344506237788, "flos": 25118903859840.0, "grad_norm": 1.788915666184711, "language_loss": 0.76473475, "learning_rate": 1.8146876944751078e-06, "loss": 0.78621757, "num_input_tokens_seen": 194782445, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 9042, "time_per_iteration": 2.527573823928833 }, { "auxiliary_loss_clip": 0.01112397, "auxiliary_loss_mlp": 0.01030773, "balance_loss_clip": 1.01872683, "balance_loss_mlp": 1.04255176, "epoch": 0.5436945738764467, "flos": 19572967860480.0, "grad_norm": 1.629042710234195, "language_loss": 0.6756258, "learning_rate": 1.8142999117092033e-06, "loss": 0.69705755, "num_input_tokens_seen": 194800325, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69921875, "step": 9043, "time_per_iteration": 2.5189082622528076 }, { "auxiliary_loss_clip": 0.0111119, "auxiliary_loss_mlp": 0.0103513, "balance_loss_clip": 1.02231526, "balance_loss_mlp": 1.04150343, "epoch": 0.5437546971291147, "flos": 21142515525120.0, "grad_norm": 1.537724523182177, "language_loss": 0.84107834, "learning_rate": 1.8139121359850644e-06, "loss": 0.8625415, "num_input_tokens_seen": 194818675, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 9044, "time_per_iteration": 2.499469041824341 }, { "auxiliary_loss_clip": 0.01119006, "auxiliary_loss_mlp": 0.01030616, "balance_loss_clip": 1.01706803, "balance_loss_mlp": 1.0436759, "epoch": 0.5438148203817826, "flos": 25118688378240.0, "grad_norm": 1.6810396757155806, "language_loss": 0.61915469, "learning_rate": 1.8135243673173956e-06, "loss": 0.64065087, "num_input_tokens_seen": 194836595, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75390625, "step": 9045, "time_per_iteration": 2.5248751640319824 }, { "auxiliary_loss_clip": 0.01117776, "auxiliary_loss_mlp": 0.01033521, "balance_loss_clip": 1.0203898, "balance_loss_mlp": 1.04461503, "epoch": 0.5438749436344507, "flos": 23002939526400.0, "grad_norm": 2.533587769476954, "language_loss": 0.70187777, "learning_rate": 1.8131366057209023e-06, "loss": 0.72339076, "num_input_tokens_seen": 194857520, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 9046, "time_per_iteration": 2.4944844245910645 }, { "auxiliary_loss_clip": 0.01112557, "auxiliary_loss_mlp": 0.01027851, "balance_loss_clip": 1.0157336, "balance_loss_mlp": 1.04225039, "epoch": 0.5439350668871186, "flos": 15487016065920.0, "grad_norm": 1.5902810996446266, "language_loss": 0.77164549, "learning_rate": 1.8127488512102868e-06, "loss": 0.79304957, "num_input_tokens_seen": 194876020, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 9047, "time_per_iteration": 2.4723925590515137 }, { "auxiliary_loss_clip": 0.01118535, "auxiliary_loss_mlp": 0.01035488, "balance_loss_clip": 1.02280426, "balance_loss_mlp": 1.04553485, "epoch": 0.5439951901397866, "flos": 17238415311360.0, "grad_norm": 2.3042862973872826, "language_loss": 0.72420955, "learning_rate": 1.8123611038002547e-06, "loss": 0.74574977, "num_input_tokens_seen": 194894650, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73046875, "step": 9048, "time_per_iteration": 3.8063838481903076 }, { "auxiliary_loss_clip": 0.01116551, "auxiliary_loss_mlp": 0.01033489, "balance_loss_clip": 1.01994658, "balance_loss_mlp": 1.04541469, "epoch": 0.5440553133924545, "flos": 18661016436480.0, "grad_norm": 2.039359186348875, "language_loss": 0.92970753, "learning_rate": 1.8119733635055076e-06, "loss": 0.95120788, "num_input_tokens_seen": 194911935, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 9049, "time_per_iteration": 2.467409610748291 }, { "auxiliary_loss_clip": 0.01112207, "auxiliary_loss_mlp": 0.01036088, "balance_loss_clip": 1.0241853, "balance_loss_mlp": 1.04162347, "epoch": 0.5441154366451225, "flos": 27122934435840.0, "grad_norm": 1.8274043206647428, "language_loss": 0.74531019, "learning_rate": 1.8115856303407492e-06, "loss": 0.76679325, "num_input_tokens_seen": 194931620, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.70703125, "step": 9050, "time_per_iteration": 3.957066774368286 }, { "auxiliary_loss_clip": 0.01119009, "auxiliary_loss_mlp": 0.01029148, "balance_loss_clip": 1.01623774, "balance_loss_mlp": 1.04548097, "epoch": 0.5441755598977904, "flos": 25993867253760.0, "grad_norm": 2.4919114067194847, "language_loss": 0.6720506, "learning_rate": 1.8111979043206832e-06, "loss": 0.69353217, "num_input_tokens_seen": 194952560, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 9051, "time_per_iteration": 4.029027462005615 }, { "auxiliary_loss_clip": 0.01114057, "auxiliary_loss_mlp": 0.01029781, "balance_loss_clip": 1.01697803, "balance_loss_mlp": 1.04150999, "epoch": 0.5442356831504584, "flos": 32380041173760.0, "grad_norm": 1.86542808378144, "language_loss": 0.67568988, "learning_rate": 1.810810185460011e-06, "loss": 0.6971283, "num_input_tokens_seen": 194973915, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 9052, "time_per_iteration": 3.9474713802337646 }, { "auxiliary_loss_clip": 0.01117304, "auxiliary_loss_mlp": 0.01034369, "balance_loss_clip": 1.02092862, "balance_loss_mlp": 1.04375482, "epoch": 0.5442958064031264, "flos": 24164290056960.0, "grad_norm": 1.917351413838854, "language_loss": 0.9318893, "learning_rate": 1.810422473773436e-06, "loss": 0.9534061, "num_input_tokens_seen": 194990170, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 9053, "time_per_iteration": 2.5120623111724854 }, { "auxiliary_loss_clip": 0.01117925, "auxiliary_loss_mlp": 0.01037046, "balance_loss_clip": 1.02392161, "balance_loss_mlp": 1.04350328, "epoch": 0.5443559296557944, "flos": 18764690065920.0, "grad_norm": 7.13198529619226, "language_loss": 0.8350352, "learning_rate": 1.8100347692756595e-06, "loss": 0.85658491, "num_input_tokens_seen": 195006395, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 9054, "time_per_iteration": 2.4574623107910156 }, { "auxiliary_loss_clip": 0.01118361, "auxiliary_loss_mlp": 0.01035073, "balance_loss_clip": 1.02199614, "balance_loss_mlp": 1.04526746, "epoch": 0.5444160529084624, "flos": 22632556435200.0, "grad_norm": 2.1220486812596175, "language_loss": 0.68316352, "learning_rate": 1.8096470719813836e-06, "loss": 0.70469791, "num_input_tokens_seen": 195025080, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 9055, "time_per_iteration": 2.4985971450805664 }, { "auxiliary_loss_clip": 0.01040761, "auxiliary_loss_mlp": 0.00999, "balance_loss_clip": 0.99762326, "balance_loss_mlp": 1.01716208, "epoch": 0.5444761761611303, "flos": 69671909600640.0, "grad_norm": 0.774366329253037, "language_loss": 0.57645714, "learning_rate": 1.80925938190531e-06, "loss": 0.59685469, "num_input_tokens_seen": 195085725, "router_z_loss_clip": 0.01379395, "router_z_loss_mlp": 0.23632812, "step": 9056, "time_per_iteration": 3.095656394958496 }, { "auxiliary_loss_clip": 0.01115538, "auxiliary_loss_mlp": 0.01033276, "balance_loss_clip": 1.01980579, "balance_loss_mlp": 1.0407089, "epoch": 0.5445362994137983, "flos": 14278442129280.0, "grad_norm": 1.943668590567022, "language_loss": 0.69834059, "learning_rate": 1.8088716990621395e-06, "loss": 0.71982872, "num_input_tokens_seen": 195102585, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75, "step": 9057, "time_per_iteration": 2.470219373703003 }, { "auxiliary_loss_clip": 0.01116426, "auxiliary_loss_mlp": 0.01033774, "balance_loss_clip": 1.0207386, "balance_loss_mlp": 1.04473245, "epoch": 0.5445964226664662, "flos": 28986195611520.0, "grad_norm": 1.9474933690444955, "language_loss": 0.750853, "learning_rate": 1.8084840234665738e-06, "loss": 0.77235502, "num_input_tokens_seen": 195120055, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 9058, "time_per_iteration": 2.530695915222168 }, { "auxiliary_loss_clip": 0.01039199, "auxiliary_loss_mlp": 0.01002874, "balance_loss_clip": 1.00141931, "balance_loss_mlp": 1.01531947, "epoch": 0.5446565459191343, "flos": 68620230270720.0, "grad_norm": 0.793166641633107, "language_loss": 0.62689769, "learning_rate": 1.808096355133312e-06, "loss": 0.64731848, "num_input_tokens_seen": 195181045, "router_z_loss_clip": 0.01452637, "router_z_loss_mlp": 0.23828125, "step": 9059, "time_per_iteration": 3.183863639831543 }, { "auxiliary_loss_clip": 0.0111315, "auxiliary_loss_mlp": 0.01032124, "balance_loss_clip": 1.0192914, "balance_loss_mlp": 1.04187393, "epoch": 0.5447166691718022, "flos": 16216469464320.0, "grad_norm": 1.9583915635394573, "language_loss": 0.79218745, "learning_rate": 1.8077086940770572e-06, "loss": 0.81364012, "num_input_tokens_seen": 195198840, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 9060, "time_per_iteration": 2.473456621170044 }, { "auxiliary_loss_clip": 0.01115398, "auxiliary_loss_mlp": 0.01039274, "balance_loss_clip": 1.02611303, "balance_loss_mlp": 1.04241836, "epoch": 0.5447767924244702, "flos": 25849039616640.0, "grad_norm": 1.632993998420373, "language_loss": 0.79688549, "learning_rate": 1.8073210403125072e-06, "loss": 0.81843221, "num_input_tokens_seen": 195218720, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73046875, "step": 9061, "time_per_iteration": 2.554989814758301 }, { "auxiliary_loss_clip": 0.01116447, "auxiliary_loss_mlp": 0.0103106, "balance_loss_clip": 1.01862645, "balance_loss_mlp": 1.04504514, "epoch": 0.5448369156771381, "flos": 19677718897920.0, "grad_norm": 1.758950144252582, "language_loss": 0.87128961, "learning_rate": 1.8069333938543627e-06, "loss": 0.89276469, "num_input_tokens_seen": 195235770, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 9062, "time_per_iteration": 2.480968952178955 }, { "auxiliary_loss_clip": 0.01120076, "auxiliary_loss_mlp": 0.0103693, "balance_loss_clip": 1.02288675, "balance_loss_mlp": 1.04392016, "epoch": 0.5448970389298061, "flos": 19281804215040.0, "grad_norm": 1.8274859796700942, "language_loss": 0.82408011, "learning_rate": 1.8065457547173233e-06, "loss": 0.8456502, "num_input_tokens_seen": 195254870, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76171875, "step": 9063, "time_per_iteration": 2.4780032634735107 }, { "auxiliary_loss_clip": 0.01116642, "auxiliary_loss_mlp": 0.01034969, "balance_loss_clip": 1.02120626, "balance_loss_mlp": 1.04290581, "epoch": 0.544957162182474, "flos": 20991690316800.0, "grad_norm": 6.733366136414121, "language_loss": 0.64182329, "learning_rate": 1.8061581229160878e-06, "loss": 0.66333938, "num_input_tokens_seen": 195273390, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.73828125, "step": 9064, "time_per_iteration": 2.501648187637329 }, { "auxiliary_loss_clip": 0.01118305, "auxiliary_loss_mlp": 0.0103838, "balance_loss_clip": 1.02431965, "balance_loss_mlp": 1.04341686, "epoch": 0.545017285435142, "flos": 25374587846400.0, "grad_norm": 1.7168477342552984, "language_loss": 0.79766262, "learning_rate": 1.8057704984653566e-06, "loss": 0.81922948, "num_input_tokens_seen": 195295635, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.74609375, "step": 9065, "time_per_iteration": 2.5531468391418457 }, { "auxiliary_loss_clip": 0.01114182, "auxiliary_loss_mlp": 0.01030629, "balance_loss_clip": 1.01914334, "balance_loss_mlp": 1.04321456, "epoch": 0.54507740868781, "flos": 19134749934720.0, "grad_norm": 1.9215722803992044, "language_loss": 0.77529621, "learning_rate": 1.805382881379827e-06, "loss": 0.79674435, "num_input_tokens_seen": 195312545, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.7109375, "step": 9066, "time_per_iteration": 2.59243106842041 }, { "auxiliary_loss_clip": 0.01116868, "auxiliary_loss_mlp": 0.0103306, "balance_loss_clip": 1.01912427, "balance_loss_mlp": 1.04125798, "epoch": 0.545137531940478, "flos": 26249802635520.0, "grad_norm": 3.789690680775584, "language_loss": 0.7587924, "learning_rate": 1.8049952716741975e-06, "loss": 0.78029168, "num_input_tokens_seen": 195332955, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7578125, "step": 9067, "time_per_iteration": 2.5513970851898193 }, { "auxiliary_loss_clip": 0.01123552, "auxiliary_loss_mlp": 0.01038051, "balance_loss_clip": 1.02264273, "balance_loss_mlp": 1.04551435, "epoch": 0.545197655193146, "flos": 37555629995520.0, "grad_norm": 1.8677506397180113, "language_loss": 0.6345343, "learning_rate": 1.8046076693631682e-06, "loss": 0.65615034, "num_input_tokens_seen": 195355930, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.78125, "step": 9068, "time_per_iteration": 2.64430832862854 }, { "auxiliary_loss_clip": 0.01115656, "auxiliary_loss_mlp": 0.01041516, "balance_loss_clip": 1.02898121, "balance_loss_mlp": 1.0450021, "epoch": 0.5452577784458139, "flos": 26031250333440.0, "grad_norm": 2.1368874985822983, "language_loss": 0.72120142, "learning_rate": 1.8042200744614343e-06, "loss": 0.74277318, "num_input_tokens_seen": 195376445, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 9069, "time_per_iteration": 2.5179035663604736 }, { "auxiliary_loss_clip": 0.01114299, "auxiliary_loss_mlp": 0.01029486, "balance_loss_clip": 1.01755321, "balance_loss_mlp": 1.04446518, "epoch": 0.5453179016984819, "flos": 17639034675840.0, "grad_norm": 1.8377142817144614, "language_loss": 0.73996031, "learning_rate": 1.8038324869836957e-06, "loss": 0.7613982, "num_input_tokens_seen": 195393725, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 9070, "time_per_iteration": 2.463406562805176 }, { "auxiliary_loss_clip": 0.01114456, "auxiliary_loss_mlp": 0.01033785, "balance_loss_clip": 1.02011788, "balance_loss_mlp": 1.04214108, "epoch": 0.5453780249511498, "flos": 23216679406080.0, "grad_norm": 2.562203091929871, "language_loss": 0.6048131, "learning_rate": 1.8034449069446489e-06, "loss": 0.62629551, "num_input_tokens_seen": 195411380, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 9071, "time_per_iteration": 2.494718313217163 }, { "auxiliary_loss_clip": 0.01037064, "auxiliary_loss_mlp": 0.01004676, "balance_loss_clip": 1.00336456, "balance_loss_mlp": 1.01269853, "epoch": 0.5454381482038179, "flos": 68696504801280.0, "grad_norm": 0.7035955559227357, "language_loss": 0.57112861, "learning_rate": 1.80305733435899e-06, "loss": 0.591546, "num_input_tokens_seen": 195482015, "router_z_loss_clip": 0.01312256, "router_z_loss_mlp": 0.24414062, "step": 9072, "time_per_iteration": 3.1872055530548096 }, { "auxiliary_loss_clip": 0.01114286, "auxiliary_loss_mlp": 0.01034445, "balance_loss_clip": 1.02185607, "balance_loss_mlp": 1.04362845, "epoch": 0.5454982714564858, "flos": 13260626346240.0, "grad_norm": 1.6846484532911692, "language_loss": 0.69859236, "learning_rate": 1.8026697692414174e-06, "loss": 0.72007966, "num_input_tokens_seen": 195500440, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 9073, "time_per_iteration": 2.452394962310791 }, { "auxiliary_loss_clip": 0.01112378, "auxiliary_loss_mlp": 0.01041725, "balance_loss_clip": 1.02909517, "balance_loss_mlp": 1.04214442, "epoch": 0.5455583947091538, "flos": 21835878733440.0, "grad_norm": 1.8528982087307446, "language_loss": 0.71484673, "learning_rate": 1.802282211606627e-06, "loss": 0.73638779, "num_input_tokens_seen": 195520860, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 9074, "time_per_iteration": 2.5189473628997803 }, { "auxiliary_loss_clip": 0.01113794, "auxiliary_loss_mlp": 0.01039417, "balance_loss_clip": 1.02656043, "balance_loss_mlp": 1.04220462, "epoch": 0.5456185179618217, "flos": 17817438551040.0, "grad_norm": 2.0961699705153043, "language_loss": 0.68656451, "learning_rate": 1.8018946614693148e-06, "loss": 0.70809662, "num_input_tokens_seen": 195538615, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 9075, "time_per_iteration": 2.4658679962158203 }, { "auxiliary_loss_clip": 0.01113288, "auxiliary_loss_mlp": 0.01036397, "balance_loss_clip": 1.02433884, "balance_loss_mlp": 1.0422473, "epoch": 0.5456786412144897, "flos": 21069401391360.0, "grad_norm": 2.222082309325089, "language_loss": 0.80473125, "learning_rate": 1.8015071188441768e-06, "loss": 0.82622802, "num_input_tokens_seen": 195557460, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.7109375, "step": 9076, "time_per_iteration": 2.5182437896728516 }, { "auxiliary_loss_clip": 0.01113851, "auxiliary_loss_mlp": 0.0103562, "balance_loss_clip": 1.02315032, "balance_loss_mlp": 1.04156864, "epoch": 0.5457387644671576, "flos": 23294965098240.0, "grad_norm": 1.549975038116192, "language_loss": 0.80216849, "learning_rate": 1.8011195837459089e-06, "loss": 0.82366312, "num_input_tokens_seen": 195577985, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.72265625, "step": 9077, "time_per_iteration": 2.503553867340088 }, { "auxiliary_loss_clip": 0.01114921, "auxiliary_loss_mlp": 0.01031421, "balance_loss_clip": 1.01857066, "balance_loss_mlp": 1.04123366, "epoch": 0.5457988877198257, "flos": 21617039122560.0, "grad_norm": 2.156458606149096, "language_loss": 0.68222725, "learning_rate": 1.8007320561892064e-06, "loss": 0.70369065, "num_input_tokens_seen": 195597620, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 9078, "time_per_iteration": 2.5566020011901855 }, { "auxiliary_loss_clip": 0.01116642, "auxiliary_loss_mlp": 0.01036544, "balance_loss_clip": 1.02315664, "balance_loss_mlp": 1.04248977, "epoch": 0.5458590109724936, "flos": 23762485543680.0, "grad_norm": 1.7168402553423276, "language_loss": 0.81128854, "learning_rate": 1.800344536188764e-06, "loss": 0.83282036, "num_input_tokens_seen": 195615910, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 9079, "time_per_iteration": 2.4896490573883057 }, { "auxiliary_loss_clip": 0.01120778, "auxiliary_loss_mlp": 0.0103458, "balance_loss_clip": 1.01976216, "balance_loss_mlp": 1.04355931, "epoch": 0.5459191342251616, "flos": 24424283675520.0, "grad_norm": 4.213558642225152, "language_loss": 0.7582525, "learning_rate": 1.799957023759277e-06, "loss": 0.77980608, "num_input_tokens_seen": 195635620, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7734375, "step": 9080, "time_per_iteration": 2.5933287143707275 }, { "auxiliary_loss_clip": 0.01115893, "auxiliary_loss_mlp": 0.01034489, "balance_loss_clip": 1.02089882, "balance_loss_mlp": 1.0424192, "epoch": 0.5459792574778296, "flos": 23623009032960.0, "grad_norm": 2.183988079530516, "language_loss": 0.83666831, "learning_rate": 1.7995695189154392e-06, "loss": 0.85817218, "num_input_tokens_seen": 195652495, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 9081, "time_per_iteration": 2.4958925247192383 }, { "auxiliary_loss_clip": 0.01119112, "auxiliary_loss_mlp": 0.01031539, "balance_loss_clip": 1.01793718, "balance_loss_mlp": 1.04339314, "epoch": 0.5460393807304975, "flos": 19135540033920.0, "grad_norm": 1.729717427842751, "language_loss": 0.70241082, "learning_rate": 1.7991820216719461e-06, "loss": 0.72391737, "num_input_tokens_seen": 195671965, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 9082, "time_per_iteration": 2.5107476711273193 }, { "auxiliary_loss_clip": 0.01111019, "auxiliary_loss_mlp": 0.01027959, "balance_loss_clip": 1.01517367, "balance_loss_mlp": 1.04058933, "epoch": 0.5460995039831655, "flos": 35918534805120.0, "grad_norm": 1.5866309812547146, "language_loss": 0.6633448, "learning_rate": 1.7987945320434906e-06, "loss": 0.68473458, "num_input_tokens_seen": 195694725, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 9083, "time_per_iteration": 2.5844595432281494 }, { "auxiliary_loss_clip": 0.01112552, "auxiliary_loss_mlp": 0.01031938, "balance_loss_clip": 1.0194155, "balance_loss_mlp": 1.04217541, "epoch": 0.5461596272358334, "flos": 26759231274240.0, "grad_norm": 2.0317903853920876, "language_loss": 0.78684998, "learning_rate": 1.798407050044766e-06, "loss": 0.80829489, "num_input_tokens_seen": 195714090, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 9084, "time_per_iteration": 2.5362861156463623 }, { "auxiliary_loss_clip": 0.01117595, "auxiliary_loss_mlp": 0.01033256, "balance_loss_clip": 1.01995242, "balance_loss_mlp": 1.04369283, "epoch": 0.5462197504885015, "flos": 20886580143360.0, "grad_norm": 1.6703844682823652, "language_loss": 0.75151724, "learning_rate": 1.7980195756904675e-06, "loss": 0.77302575, "num_input_tokens_seen": 195733585, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 9085, "time_per_iteration": 2.467074155807495 }, { "auxiliary_loss_clip": 0.01115394, "auxiliary_loss_mlp": 0.01033194, "balance_loss_clip": 1.01981306, "balance_loss_mlp": 1.04122818, "epoch": 0.5462798737411694, "flos": 25804976607360.0, "grad_norm": 2.043351200579771, "language_loss": 0.74977303, "learning_rate": 1.7976321089952857e-06, "loss": 0.77125889, "num_input_tokens_seen": 195752820, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 9086, "time_per_iteration": 2.547938346862793 }, { "auxiliary_loss_clip": 0.01116247, "auxiliary_loss_mlp": 0.01031044, "balance_loss_clip": 1.01791298, "balance_loss_mlp": 1.04272437, "epoch": 0.5463399969938374, "flos": 25775027642880.0, "grad_norm": 1.5173669890926507, "language_loss": 0.77162635, "learning_rate": 1.7972446499739155e-06, "loss": 0.79309928, "num_input_tokens_seen": 195773740, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 9087, "time_per_iteration": 2.533409833908081 }, { "auxiliary_loss_clip": 0.01118786, "auxiliary_loss_mlp": 0.01039154, "balance_loss_clip": 1.02477098, "balance_loss_mlp": 1.04441345, "epoch": 0.5464001202465053, "flos": 18843298980480.0, "grad_norm": 1.9794371598040286, "language_loss": 0.77510965, "learning_rate": 1.7968571986410484e-06, "loss": 0.79668903, "num_input_tokens_seen": 195792125, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7421875, "step": 9088, "time_per_iteration": 2.5448343753814697 }, { "auxiliary_loss_clip": 0.01039988, "auxiliary_loss_mlp": 0.01001636, "balance_loss_clip": 1.00021732, "balance_loss_mlp": 1.0157485, "epoch": 0.5464602434991733, "flos": 69049541623680.0, "grad_norm": 0.722098508505742, "language_loss": 0.5774982, "learning_rate": 1.7964697550113758e-06, "loss": 0.5979144, "num_input_tokens_seen": 195854935, "router_z_loss_clip": 0.01416016, "router_z_loss_mlp": 0.2421875, "step": 9089, "time_per_iteration": 4.497862100601196 }, { "auxiliary_loss_clip": 0.01114796, "auxiliary_loss_mlp": 0.01035127, "balance_loss_clip": 1.02193701, "balance_loss_mlp": 1.04075909, "epoch": 0.5465203667518412, "flos": 27560039040000.0, "grad_norm": 1.8143719345992675, "language_loss": 0.76921743, "learning_rate": 1.7960823190995918e-06, "loss": 0.79071665, "num_input_tokens_seen": 195874715, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 9090, "time_per_iteration": 2.5433385372161865 }, { "auxiliary_loss_clip": 0.01117593, "auxiliary_loss_mlp": 0.01039229, "balance_loss_clip": 1.02470958, "balance_loss_mlp": 1.04109144, "epoch": 0.5465804900045093, "flos": 21210206705280.0, "grad_norm": 2.198229547644774, "language_loss": 0.73643887, "learning_rate": 1.7956948909203855e-06, "loss": 0.75800705, "num_input_tokens_seen": 195892610, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.765625, "step": 9091, "time_per_iteration": 3.973921775817871 }, { "auxiliary_loss_clip": 0.01117911, "auxiliary_loss_mlp": 0.01034418, "balance_loss_clip": 1.02075636, "balance_loss_mlp": 1.04418707, "epoch": 0.5466406132571772, "flos": 22488949860480.0, "grad_norm": 4.512407398039289, "language_loss": 0.78151542, "learning_rate": 1.7953074704884498e-06, "loss": 0.80303872, "num_input_tokens_seen": 195911085, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73828125, "step": 9092, "time_per_iteration": 3.9609265327453613 }, { "auxiliary_loss_clip": 0.01118047, "auxiliary_loss_mlp": 0.0103503, "balance_loss_clip": 1.02073646, "balance_loss_mlp": 1.0434854, "epoch": 0.5467007365098452, "flos": 17675843137920.0, "grad_norm": 2.14484553866767, "language_loss": 0.75822115, "learning_rate": 1.794920057818476e-06, "loss": 0.77975196, "num_input_tokens_seen": 195929845, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.74609375, "step": 9093, "time_per_iteration": 3.835545301437378 }, { "auxiliary_loss_clip": 0.01116562, "auxiliary_loss_mlp": 0.01036878, "balance_loss_clip": 1.02178621, "balance_loss_mlp": 1.04163241, "epoch": 0.5467608597625132, "flos": 15698852524800.0, "grad_norm": 2.7149484509907067, "language_loss": 0.69146395, "learning_rate": 1.7945326529251533e-06, "loss": 0.71299833, "num_input_tokens_seen": 195946350, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.75, "step": 9094, "time_per_iteration": 2.4411814212799072 }, { "auxiliary_loss_clip": 0.01116114, "auxiliary_loss_mlp": 0.01035673, "balance_loss_clip": 1.02305496, "balance_loss_mlp": 1.04308462, "epoch": 0.5468209830151811, "flos": 24312816794880.0, "grad_norm": 3.302154571952728, "language_loss": 0.68315625, "learning_rate": 1.7941452558231731e-06, "loss": 0.70467412, "num_input_tokens_seen": 195959840, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.73046875, "step": 9095, "time_per_iteration": 2.502910614013672 }, { "auxiliary_loss_clip": 0.01116914, "auxiliary_loss_mlp": 0.01036909, "balance_loss_clip": 1.02414203, "balance_loss_mlp": 1.04450834, "epoch": 0.5468811062678491, "flos": 29166323339520.0, "grad_norm": 1.5209592775209464, "language_loss": 0.66430748, "learning_rate": 1.7937578665272256e-06, "loss": 0.68584573, "num_input_tokens_seen": 195981125, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 9096, "time_per_iteration": 2.540752410888672 }, { "auxiliary_loss_clip": 0.01038828, "auxiliary_loss_mlp": 0.01000799, "balance_loss_clip": 0.99945754, "balance_loss_mlp": 1.01438379, "epoch": 0.546941229520517, "flos": 67867037982720.0, "grad_norm": 0.734190936628726, "language_loss": 0.57561165, "learning_rate": 1.7933704850520007e-06, "loss": 0.59600788, "num_input_tokens_seen": 196038880, "router_z_loss_clip": 0.01342773, "router_z_loss_mlp": 0.24414062, "step": 9097, "time_per_iteration": 3.2027549743652344 }, { "auxiliary_loss_clip": 0.0103798, "auxiliary_loss_mlp": 0.01003214, "balance_loss_clip": 1.00186718, "balance_loss_mlp": 1.01344299, "epoch": 0.5470013527731851, "flos": 58270306625280.0, "grad_norm": 0.9041327104950097, "language_loss": 0.64811265, "learning_rate": 1.7929831114121868e-06, "loss": 0.6685245, "num_input_tokens_seen": 196099215, "router_z_loss_clip": 0.01348877, "router_z_loss_mlp": 0.24609375, "step": 9098, "time_per_iteration": 3.040726900100708 }, { "auxiliary_loss_clip": 0.01117099, "auxiliary_loss_mlp": 0.01039027, "balance_loss_clip": 1.02503765, "balance_loss_mlp": 1.04231215, "epoch": 0.547061476025853, "flos": 22965915582720.0, "grad_norm": 1.6738957257249072, "language_loss": 0.73279905, "learning_rate": 1.7925957456224753e-06, "loss": 0.75436032, "num_input_tokens_seen": 196120370, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75, "step": 9099, "time_per_iteration": 2.5125980377197266 }, { "auxiliary_loss_clip": 0.0111407, "auxiliary_loss_mlp": 0.01036264, "balance_loss_clip": 1.023664, "balance_loss_mlp": 1.04197168, "epoch": 0.547121599278521, "flos": 29968244426880.0, "grad_norm": 1.7790778429556395, "language_loss": 0.72664773, "learning_rate": 1.7922083876975537e-06, "loss": 0.74815106, "num_input_tokens_seen": 196139075, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 9100, "time_per_iteration": 2.5129904747009277 }, { "auxiliary_loss_clip": 0.01114512, "auxiliary_loss_mlp": 0.01028376, "balance_loss_clip": 1.01465535, "balance_loss_mlp": 1.04237711, "epoch": 0.5471817225311889, "flos": 36535443914880.0, "grad_norm": 1.7051383198151553, "language_loss": 0.67945749, "learning_rate": 1.7918210376521102e-06, "loss": 0.70088637, "num_input_tokens_seen": 196159990, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 9101, "time_per_iteration": 2.60898494720459 }, { "auxiliary_loss_clip": 0.0111412, "auxiliary_loss_mlp": 0.01033276, "balance_loss_clip": 1.01988256, "balance_loss_mlp": 1.04097307, "epoch": 0.5472418457838569, "flos": 25775243124480.0, "grad_norm": 1.7664932937328022, "language_loss": 0.77809626, "learning_rate": 1.7914336955008343e-06, "loss": 0.79957026, "num_input_tokens_seen": 196180570, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 9102, "time_per_iteration": 2.501844644546509 }, { "auxiliary_loss_clip": 0.01115786, "auxiliary_loss_mlp": 0.01038367, "balance_loss_clip": 1.025051, "balance_loss_mlp": 1.04406619, "epoch": 0.5473019690365248, "flos": 27887687925120.0, "grad_norm": 1.5549150833499321, "language_loss": 0.71841663, "learning_rate": 1.791046361258413e-06, "loss": 0.73995811, "num_input_tokens_seen": 196200300, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 9103, "time_per_iteration": 2.5333845615386963 }, { "auxiliary_loss_clip": 0.01112454, "auxiliary_loss_mlp": 0.01032316, "balance_loss_clip": 1.01928651, "balance_loss_mlp": 1.04038858, "epoch": 0.5473620922891929, "flos": 57631490219520.0, "grad_norm": 1.4588572111973157, "language_loss": 0.65349925, "learning_rate": 1.7906590349395356e-06, "loss": 0.67494696, "num_input_tokens_seen": 196228525, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 9104, "time_per_iteration": 2.823676824569702 }, { "auxiliary_loss_clip": 0.01118506, "auxiliary_loss_mlp": 0.01029351, "balance_loss_clip": 1.01523662, "balance_loss_mlp": 1.04331946, "epoch": 0.5474222155418608, "flos": 19354056422400.0, "grad_norm": 2.4296432350501673, "language_loss": 0.81497312, "learning_rate": 1.790271716558888e-06, "loss": 0.83645171, "num_input_tokens_seen": 196247690, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 9105, "time_per_iteration": 2.4645512104034424 }, { "auxiliary_loss_clip": 0.01111296, "auxiliary_loss_mlp": 0.01029384, "balance_loss_clip": 1.0171237, "balance_loss_mlp": 1.03986621, "epoch": 0.5474823387945288, "flos": 25120448144640.0, "grad_norm": 2.1090463199056693, "language_loss": 0.80272031, "learning_rate": 1.7898844061311575e-06, "loss": 0.82412708, "num_input_tokens_seen": 196268555, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.71484375, "step": 9106, "time_per_iteration": 2.504774808883667 }, { "auxiliary_loss_clip": 0.01115113, "auxiliary_loss_mlp": 0.01037165, "balance_loss_clip": 1.02466631, "balance_loss_mlp": 1.04279768, "epoch": 0.5475424620471967, "flos": 18004174381440.0, "grad_norm": 2.0808692471398205, "language_loss": 0.69773424, "learning_rate": 1.7894971036710322e-06, "loss": 0.71925712, "num_input_tokens_seen": 196285585, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 9107, "time_per_iteration": 2.447679042816162 }, { "auxiliary_loss_clip": 0.01115175, "auxiliary_loss_mlp": 0.01029789, "balance_loss_clip": 1.01583505, "balance_loss_mlp": 1.04059601, "epoch": 0.5476025852998647, "flos": 22309324922880.0, "grad_norm": 1.731713211267309, "language_loss": 0.6349948, "learning_rate": 1.789109809193197e-06, "loss": 0.65644443, "num_input_tokens_seen": 196305085, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7421875, "step": 9108, "time_per_iteration": 2.456090211868286 }, { "auxiliary_loss_clip": 0.01114164, "auxiliary_loss_mlp": 0.01027007, "balance_loss_clip": 1.01509178, "balance_loss_mlp": 1.04204226, "epoch": 0.5476627085525327, "flos": 20120497850880.0, "grad_norm": 2.1623942956849933, "language_loss": 0.74978089, "learning_rate": 1.7887225227123396e-06, "loss": 0.77119255, "num_input_tokens_seen": 196323945, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.71875, "step": 9109, "time_per_iteration": 2.4685275554656982 }, { "auxiliary_loss_clip": 0.01114056, "auxiliary_loss_mlp": 0.01030863, "balance_loss_clip": 1.01695728, "balance_loss_mlp": 1.04383099, "epoch": 0.5477228318052006, "flos": 17712579772800.0, "grad_norm": 2.243656796247688, "language_loss": 0.77707738, "learning_rate": 1.7883352442431457e-06, "loss": 0.79852664, "num_input_tokens_seen": 196342200, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.703125, "step": 9110, "time_per_iteration": 2.488882541656494 }, { "auxiliary_loss_clip": 0.01112591, "auxiliary_loss_mlp": 0.01031778, "balance_loss_clip": 1.01952338, "balance_loss_mlp": 1.04171824, "epoch": 0.5477829550578687, "flos": 25848895962240.0, "grad_norm": 1.518921186459365, "language_loss": 0.71099496, "learning_rate": 1.7879479738002993e-06, "loss": 0.73243862, "num_input_tokens_seen": 196362940, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.70703125, "step": 9111, "time_per_iteration": 2.514758348464966 }, { "auxiliary_loss_clip": 0.01114629, "auxiliary_loss_mlp": 0.01041314, "balance_loss_clip": 1.02833259, "balance_loss_mlp": 1.04182506, "epoch": 0.5478430783105366, "flos": 23039676161280.0, "grad_norm": 1.9311180127539878, "language_loss": 0.71488929, "learning_rate": 1.7875607113984876e-06, "loss": 0.73644876, "num_input_tokens_seen": 196383070, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 9112, "time_per_iteration": 2.482107400894165 }, { "auxiliary_loss_clip": 0.01116437, "auxiliary_loss_mlp": 0.01032216, "balance_loss_clip": 1.01887691, "balance_loss_mlp": 1.04219937, "epoch": 0.5479032015632046, "flos": 16071210864000.0, "grad_norm": 2.3423559071791775, "language_loss": 0.88151556, "learning_rate": 1.7871734570523953e-06, "loss": 0.90300214, "num_input_tokens_seen": 196398485, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 9113, "time_per_iteration": 2.4451467990875244 }, { "auxiliary_loss_clip": 0.01116709, "auxiliary_loss_mlp": 0.01032852, "balance_loss_clip": 1.0192498, "balance_loss_mlp": 1.04372239, "epoch": 0.5479633248158725, "flos": 24278701852800.0, "grad_norm": 1.7953674338694647, "language_loss": 0.73333514, "learning_rate": 1.7867862107767067e-06, "loss": 0.75483078, "num_input_tokens_seen": 196417725, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 9114, "time_per_iteration": 2.4951181411743164 }, { "auxiliary_loss_clip": 0.0111157, "auxiliary_loss_mlp": 0.01035753, "balance_loss_clip": 1.02381372, "balance_loss_mlp": 1.04060566, "epoch": 0.5480234480685405, "flos": 26358216860160.0, "grad_norm": 1.712627631323614, "language_loss": 0.72372055, "learning_rate": 1.7863989725861066e-06, "loss": 0.74519384, "num_input_tokens_seen": 196437840, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.7109375, "step": 9115, "time_per_iteration": 2.509239435195923 }, { "auxiliary_loss_clip": 0.01115439, "auxiliary_loss_mlp": 0.01039038, "balance_loss_clip": 1.02533484, "balance_loss_mlp": 1.04036641, "epoch": 0.5480835713212084, "flos": 22055077480320.0, "grad_norm": 2.592525300737135, "language_loss": 0.72168511, "learning_rate": 1.7860117424952781e-06, "loss": 0.74322987, "num_input_tokens_seen": 196457300, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 9116, "time_per_iteration": 2.4694600105285645 }, { "auxiliary_loss_clip": 0.01114926, "auxiliary_loss_mlp": 0.0103506, "balance_loss_clip": 1.02209616, "balance_loss_mlp": 1.04237926, "epoch": 0.5481436945738765, "flos": 25301042749440.0, "grad_norm": 1.9436900381365771, "language_loss": 0.76137513, "learning_rate": 1.7856245205189063e-06, "loss": 0.78287494, "num_input_tokens_seen": 196476720, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 9117, "time_per_iteration": 2.5694448947906494 }, { "auxiliary_loss_clip": 0.01110032, "auxiliary_loss_mlp": 0.01030332, "balance_loss_clip": 1.01819038, "balance_loss_mlp": 1.04054213, "epoch": 0.5482038178265444, "flos": 33580857772800.0, "grad_norm": 1.6581129917765771, "language_loss": 0.6315043, "learning_rate": 1.785237306671674e-06, "loss": 0.65290797, "num_input_tokens_seen": 196496765, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 9118, "time_per_iteration": 2.5599164962768555 }, { "auxiliary_loss_clip": 0.0111857, "auxiliary_loss_mlp": 0.01032701, "balance_loss_clip": 1.01917124, "balance_loss_mlp": 1.04464579, "epoch": 0.5482639410792124, "flos": 19026192055680.0, "grad_norm": 1.7809049005794602, "language_loss": 0.78880227, "learning_rate": 1.7848501009682646e-06, "loss": 0.81031501, "num_input_tokens_seen": 196516220, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73828125, "step": 9119, "time_per_iteration": 2.46394419670105 }, { "auxiliary_loss_clip": 0.01113909, "auxiliary_loss_mlp": 0.01033254, "balance_loss_clip": 1.02152944, "balance_loss_mlp": 1.04354048, "epoch": 0.5483240643318803, "flos": 25410318900480.0, "grad_norm": 2.0446847869429976, "language_loss": 0.82198799, "learning_rate": 1.7844629034233604e-06, "loss": 0.84345961, "num_input_tokens_seen": 196533860, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.703125, "step": 9120, "time_per_iteration": 2.4936845302581787 }, { "auxiliary_loss_clip": 0.01117043, "auxiliary_loss_mlp": 0.01036519, "balance_loss_clip": 1.02292347, "balance_loss_mlp": 1.04405165, "epoch": 0.5483841875845483, "flos": 21466896272640.0, "grad_norm": 1.8014557795711441, "language_loss": 0.80283988, "learning_rate": 1.7840757140516455e-06, "loss": 0.82437551, "num_input_tokens_seen": 196551305, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 9121, "time_per_iteration": 2.487273693084717 }, { "auxiliary_loss_clip": 0.01116943, "auxiliary_loss_mlp": 0.0103663, "balance_loss_clip": 1.0228014, "balance_loss_mlp": 1.04162621, "epoch": 0.5484443108372163, "flos": 24747263792640.0, "grad_norm": 1.8826324161766694, "language_loss": 0.61544889, "learning_rate": 1.7836885328678008e-06, "loss": 0.63698459, "num_input_tokens_seen": 196569420, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75, "step": 9122, "time_per_iteration": 2.497748613357544 }, { "auxiliary_loss_clip": 0.01115597, "auxiliary_loss_mlp": 0.01036182, "balance_loss_clip": 1.0247618, "balance_loss_mlp": 1.04483724, "epoch": 0.5485044340898843, "flos": 25375377945600.0, "grad_norm": 1.7169426147179951, "language_loss": 0.71858954, "learning_rate": 1.7833013598865084e-06, "loss": 0.74010736, "num_input_tokens_seen": 196590610, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.70703125, "step": 9123, "time_per_iteration": 2.5315611362457275 }, { "auxiliary_loss_clip": 0.01115563, "auxiliary_loss_mlp": 0.01032203, "balance_loss_clip": 1.02036524, "balance_loss_mlp": 1.04349136, "epoch": 0.5485645573425523, "flos": 12641167370880.0, "grad_norm": 1.9234888611305427, "language_loss": 0.83244443, "learning_rate": 1.7829141951224505e-06, "loss": 0.85392213, "num_input_tokens_seen": 196606495, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.71875, "step": 9124, "time_per_iteration": 2.427971601486206 }, { "auxiliary_loss_clip": 0.01117113, "auxiliary_loss_mlp": 0.0103656, "balance_loss_clip": 1.02378654, "balance_loss_mlp": 1.04588127, "epoch": 0.5486246805952202, "flos": 28329425383680.0, "grad_norm": 1.8650590908904445, "language_loss": 0.80208629, "learning_rate": 1.7825270385903075e-06, "loss": 0.823623, "num_input_tokens_seen": 196626365, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 9125, "time_per_iteration": 2.522097587585449 }, { "auxiliary_loss_clip": 0.01117015, "auxiliary_loss_mlp": 0.01034079, "balance_loss_clip": 1.02132964, "balance_loss_mlp": 1.04315615, "epoch": 0.5486848038478882, "flos": 16800017817600.0, "grad_norm": 2.444211238937609, "language_loss": 0.7468133, "learning_rate": 1.7821398903047617e-06, "loss": 0.76832426, "num_input_tokens_seen": 196644465, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73828125, "step": 9126, "time_per_iteration": 2.4293060302734375 }, { "auxiliary_loss_clip": 0.01116454, "auxiliary_loss_mlp": 0.01036487, "balance_loss_clip": 1.02256966, "balance_loss_mlp": 1.04184723, "epoch": 0.5487449271005561, "flos": 17236224581760.0, "grad_norm": 2.597469651245624, "language_loss": 0.66960227, "learning_rate": 1.7817527502804928e-06, "loss": 0.69113171, "num_input_tokens_seen": 196659160, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.74609375, "step": 9127, "time_per_iteration": 2.4282541275024414 }, { "auxiliary_loss_clip": 0.01116758, "auxiliary_loss_mlp": 0.01040385, "balance_loss_clip": 1.02666974, "balance_loss_mlp": 1.04349184, "epoch": 0.5488050503532241, "flos": 17340867878400.0, "grad_norm": 1.7697651296464763, "language_loss": 0.83109909, "learning_rate": 1.781365618532181e-06, "loss": 0.85267049, "num_input_tokens_seen": 196677410, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 9128, "time_per_iteration": 2.4541125297546387 }, { "auxiliary_loss_clip": 0.01115616, "auxiliary_loss_mlp": 0.01036252, "balance_loss_clip": 1.02327561, "balance_loss_mlp": 1.0439471, "epoch": 0.548865173605892, "flos": 17239169496960.0, "grad_norm": 1.8238879890365487, "language_loss": 0.74469012, "learning_rate": 1.7809784950745078e-06, "loss": 0.76620877, "num_input_tokens_seen": 196696765, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 9129, "time_per_iteration": 2.4848790168762207 }, { "auxiliary_loss_clip": 0.01119588, "auxiliary_loss_mlp": 0.01033915, "balance_loss_clip": 1.01979506, "balance_loss_mlp": 1.04416895, "epoch": 0.5489252968585601, "flos": 17456716218240.0, "grad_norm": 2.780332236317759, "language_loss": 0.63545346, "learning_rate": 1.7805913799221511e-06, "loss": 0.6569885, "num_input_tokens_seen": 196714895, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75390625, "step": 9130, "time_per_iteration": 2.4805591106414795 }, { "auxiliary_loss_clip": 0.01119396, "auxiliary_loss_mlp": 0.01038573, "balance_loss_clip": 1.0246737, "balance_loss_mlp": 1.0445497, "epoch": 0.548985420111228, "flos": 26323383646080.0, "grad_norm": 1.831025617033898, "language_loss": 0.63052386, "learning_rate": 1.7802042730897915e-06, "loss": 0.65210354, "num_input_tokens_seen": 196735510, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.74609375, "step": 9131, "time_per_iteration": 3.859508752822876 }, { "auxiliary_loss_clip": 0.01118203, "auxiliary_loss_mlp": 0.01032629, "balance_loss_clip": 1.01828206, "balance_loss_mlp": 1.04366922, "epoch": 0.549045543363896, "flos": 18693730748160.0, "grad_norm": 2.1813001262305494, "language_loss": 0.74675757, "learning_rate": 1.7798171745921084e-06, "loss": 0.76826584, "num_input_tokens_seen": 196752855, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7421875, "step": 9132, "time_per_iteration": 3.9420430660247803 }, { "auxiliary_loss_clip": 0.01113699, "auxiliary_loss_mlp": 0.01030172, "balance_loss_clip": 1.01802468, "balance_loss_mlp": 1.03989708, "epoch": 0.5491056666165639, "flos": 24717386655360.0, "grad_norm": 2.460413295230496, "language_loss": 0.81022638, "learning_rate": 1.7794300844437795e-06, "loss": 0.83166504, "num_input_tokens_seen": 196772230, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.73828125, "step": 9133, "time_per_iteration": 2.5133535861968994 }, { "auxiliary_loss_clip": 0.01114108, "auxiliary_loss_mlp": 0.01037268, "balance_loss_clip": 1.02360034, "balance_loss_mlp": 1.04201555, "epoch": 0.5491657898692319, "flos": 21576926609280.0, "grad_norm": 1.8605002398356922, "language_loss": 0.70391572, "learning_rate": 1.7790430026594841e-06, "loss": 0.72542948, "num_input_tokens_seen": 196790405, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 9134, "time_per_iteration": 3.9344873428344727 }, { "auxiliary_loss_clip": 0.01116899, "auxiliary_loss_mlp": 0.01038379, "balance_loss_clip": 1.02539706, "balance_loss_mlp": 1.04224205, "epoch": 0.5492259131219, "flos": 50476432746240.0, "grad_norm": 8.950622485943029, "language_loss": 0.60955805, "learning_rate": 1.7786559292539004e-06, "loss": 0.63111085, "num_input_tokens_seen": 196813785, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.74609375, "step": 9135, "time_per_iteration": 4.067404747009277 }, { "auxiliary_loss_clip": 0.01115978, "auxiliary_loss_mlp": 0.01036683, "balance_loss_clip": 1.02221107, "balance_loss_mlp": 1.04156232, "epoch": 0.5492860363745679, "flos": 25119262995840.0, "grad_norm": 1.5070711286841572, "language_loss": 0.7227841, "learning_rate": 1.7782688642417058e-06, "loss": 0.74431074, "num_input_tokens_seen": 196834390, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7421875, "step": 9136, "time_per_iteration": 2.496532678604126 }, { "auxiliary_loss_clip": 0.01119036, "auxiliary_loss_mlp": 0.01035071, "balance_loss_clip": 1.02019405, "balance_loss_mlp": 1.04163027, "epoch": 0.5493461596272359, "flos": 22633777497600.0, "grad_norm": 2.4392954295819753, "language_loss": 0.67892766, "learning_rate": 1.7778818076375781e-06, "loss": 0.70046878, "num_input_tokens_seen": 196853290, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7734375, "step": 9137, "time_per_iteration": 2.476715087890625 }, { "auxiliary_loss_clip": 0.01039319, "auxiliary_loss_mlp": 0.01001304, "balance_loss_clip": 1.00010645, "balance_loss_mlp": 1.0147922, "epoch": 0.5494062828799038, "flos": 66151800754560.0, "grad_norm": 0.7388392643949616, "language_loss": 0.65356529, "learning_rate": 1.7774947594561947e-06, "loss": 0.67397153, "num_input_tokens_seen": 196913120, "router_z_loss_clip": 0.01196289, "router_z_loss_mlp": 0.24609375, "step": 9138, "time_per_iteration": 3.1299431324005127 }, { "auxiliary_loss_clip": 0.01116398, "auxiliary_loss_mlp": 0.01033456, "balance_loss_clip": 1.01965117, "balance_loss_mlp": 1.04230738, "epoch": 0.5494664061325718, "flos": 21105958458240.0, "grad_norm": 1.7803817970330453, "language_loss": 0.75186908, "learning_rate": 1.7771077197122321e-06, "loss": 0.77336758, "num_input_tokens_seen": 196931530, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 9139, "time_per_iteration": 2.490718126296997 }, { "auxiliary_loss_clip": 0.01114688, "auxiliary_loss_mlp": 0.01028289, "balance_loss_clip": 1.01515222, "balance_loss_mlp": 1.04207242, "epoch": 0.5495265293852397, "flos": 14392566616320.0, "grad_norm": 2.103612437840412, "language_loss": 0.71091306, "learning_rate": 1.7767206884203672e-06, "loss": 0.73234278, "num_input_tokens_seen": 196949430, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 9140, "time_per_iteration": 2.4816296100616455 }, { "auxiliary_loss_clip": 0.01113292, "auxiliary_loss_mlp": 0.01035961, "balance_loss_clip": 1.02254987, "balance_loss_mlp": 1.04017615, "epoch": 0.5495866526379077, "flos": 25549148966400.0, "grad_norm": 1.760131524715938, "language_loss": 0.7676717, "learning_rate": 1.7763336655952762e-06, "loss": 0.78916425, "num_input_tokens_seen": 196968265, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 9141, "time_per_iteration": 2.498983860015869 }, { "auxiliary_loss_clip": 0.01112181, "auxiliary_loss_mlp": 0.01029308, "balance_loss_clip": 1.01702356, "balance_loss_mlp": 1.04194641, "epoch": 0.5496467758905756, "flos": 21317256213120.0, "grad_norm": 3.785373725313325, "language_loss": 0.7497046, "learning_rate": 1.7759466512516346e-06, "loss": 0.77111948, "num_input_tokens_seen": 196984930, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 9142, "time_per_iteration": 2.4790468215942383 }, { "auxiliary_loss_clip": 0.01119331, "auxiliary_loss_mlp": 0.01036036, "balance_loss_clip": 1.02146292, "balance_loss_mlp": 1.04454851, "epoch": 0.5497068991432437, "flos": 22233086305920.0, "grad_norm": 2.0207684601995664, "language_loss": 0.7658174, "learning_rate": 1.7755596454041192e-06, "loss": 0.78737104, "num_input_tokens_seen": 197002320, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.75, "step": 9143, "time_per_iteration": 2.4702835083007812 }, { "auxiliary_loss_clip": 0.01113603, "auxiliary_loss_mlp": 0.01032527, "balance_loss_clip": 1.02005792, "balance_loss_mlp": 1.0411011, "epoch": 0.5497670223959116, "flos": 18479093028480.0, "grad_norm": 2.953654332715048, "language_loss": 0.80025494, "learning_rate": 1.7751726480674044e-06, "loss": 0.82171625, "num_input_tokens_seen": 197020825, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 9144, "time_per_iteration": 2.469268560409546 }, { "auxiliary_loss_clip": 0.01115805, "auxiliary_loss_mlp": 0.01029646, "balance_loss_clip": 1.01623511, "balance_loss_mlp": 1.04222286, "epoch": 0.5498271456485796, "flos": 29205107049600.0, "grad_norm": 2.8622161870894187, "language_loss": 0.71016383, "learning_rate": 1.7747856592561645e-06, "loss": 0.73161829, "num_input_tokens_seen": 197040450, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 9145, "time_per_iteration": 2.54114031791687 }, { "auxiliary_loss_clip": 0.01114759, "auxiliary_loss_mlp": 0.0103064, "balance_loss_clip": 1.01808739, "balance_loss_mlp": 1.04179358, "epoch": 0.5498872689012475, "flos": 34824372664320.0, "grad_norm": 2.13248550886365, "language_loss": 0.70319009, "learning_rate": 1.774398678985076e-06, "loss": 0.72464412, "num_input_tokens_seen": 197063930, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 9146, "time_per_iteration": 2.611448049545288 }, { "auxiliary_loss_clip": 0.01110196, "auxiliary_loss_mlp": 0.01029098, "balance_loss_clip": 1.01678956, "balance_loss_mlp": 1.04051948, "epoch": 0.5499473921539155, "flos": 25921938268800.0, "grad_norm": 1.7749693637634414, "language_loss": 0.64408439, "learning_rate": 1.7740117072688113e-06, "loss": 0.66547728, "num_input_tokens_seen": 197082660, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 9147, "time_per_iteration": 2.498788833618164 }, { "auxiliary_loss_clip": 0.01114604, "auxiliary_loss_mlp": 0.01032991, "balance_loss_clip": 1.02014041, "balance_loss_mlp": 1.04290152, "epoch": 0.5500075154065835, "flos": 22273701609600.0, "grad_norm": 2.6241971935862964, "language_loss": 0.81100309, "learning_rate": 1.7736247441220458e-06, "loss": 0.832479, "num_input_tokens_seen": 197100675, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 9148, "time_per_iteration": 2.496628761291504 }, { "auxiliary_loss_clip": 0.01114642, "auxiliary_loss_mlp": 0.01033622, "balance_loss_clip": 1.02069354, "balance_loss_mlp": 1.04256272, "epoch": 0.5500676386592515, "flos": 28037507552640.0, "grad_norm": 2.6856011812897247, "language_loss": 0.79190105, "learning_rate": 1.773237789559453e-06, "loss": 0.8133837, "num_input_tokens_seen": 197121320, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 9149, "time_per_iteration": 2.517721176147461 }, { "auxiliary_loss_clip": 0.01113678, "auxiliary_loss_mlp": 0.01027821, "balance_loss_clip": 1.01470768, "balance_loss_mlp": 1.04154873, "epoch": 0.5501277619119195, "flos": 23914819123200.0, "grad_norm": 2.0836091637679406, "language_loss": 0.72248071, "learning_rate": 1.7728508435957052e-06, "loss": 0.74389571, "num_input_tokens_seen": 197138965, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 9150, "time_per_iteration": 2.4972589015960693 }, { "auxiliary_loss_clip": 0.01114767, "auxiliary_loss_mlp": 0.01031034, "balance_loss_clip": 1.01711059, "balance_loss_mlp": 1.03949404, "epoch": 0.5501878851645874, "flos": 20923783655040.0, "grad_norm": 1.714967947462255, "language_loss": 0.75367171, "learning_rate": 1.772463906245477e-06, "loss": 0.77512968, "num_input_tokens_seen": 197156460, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75, "step": 9151, "time_per_iteration": 2.4570460319519043 }, { "auxiliary_loss_clip": 0.011138, "auxiliary_loss_mlp": 0.01026265, "balance_loss_clip": 1.01368237, "balance_loss_mlp": 1.04125404, "epoch": 0.5502480084172554, "flos": 20665298407680.0, "grad_norm": 2.847196451878592, "language_loss": 0.7615453, "learning_rate": 1.7720769775234394e-06, "loss": 0.78294599, "num_input_tokens_seen": 197175140, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 9152, "time_per_iteration": 2.4714276790618896 }, { "auxiliary_loss_clip": 0.01113619, "auxiliary_loss_mlp": 0.0103148, "balance_loss_clip": 1.01870632, "balance_loss_mlp": 1.04166591, "epoch": 0.5503081316699233, "flos": 26432552056320.0, "grad_norm": 2.4317744827698826, "language_loss": 0.82607514, "learning_rate": 1.7716900574442662e-06, "loss": 0.84752607, "num_input_tokens_seen": 197194345, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 9153, "time_per_iteration": 2.5056657791137695 }, { "auxiliary_loss_clip": 0.01112946, "auxiliary_loss_mlp": 0.01030898, "balance_loss_clip": 1.01772594, "balance_loss_mlp": 1.04115117, "epoch": 0.5503682549225913, "flos": 30629144718720.0, "grad_norm": 1.7009873790365153, "language_loss": 0.74038494, "learning_rate": 1.7713031460226294e-06, "loss": 0.76182336, "num_input_tokens_seen": 197215535, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 9154, "time_per_iteration": 2.601487636566162 }, { "auxiliary_loss_clip": 0.01118599, "auxiliary_loss_mlp": 0.0103285, "balance_loss_clip": 1.01893783, "balance_loss_mlp": 1.04191005, "epoch": 0.5504283781752592, "flos": 22565439872640.0, "grad_norm": 1.5480343553518292, "language_loss": 0.7246663, "learning_rate": 1.770916243273199e-06, "loss": 0.74618077, "num_input_tokens_seen": 197234945, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76953125, "step": 9155, "time_per_iteration": 2.463594675064087 }, { "auxiliary_loss_clip": 0.01036123, "auxiliary_loss_mlp": 0.01006081, "balance_loss_clip": 1.00477564, "balance_loss_mlp": 1.01174855, "epoch": 0.5504885014279273, "flos": 67901009270400.0, "grad_norm": 0.7496634378223606, "language_loss": 0.55359375, "learning_rate": 1.7705293492106483e-06, "loss": 0.57401586, "num_input_tokens_seen": 197302285, "router_z_loss_clip": 0.01306152, "router_z_loss_mlp": 0.24414062, "step": 9156, "time_per_iteration": 3.239382266998291 }, { "auxiliary_loss_clip": 0.01112372, "auxiliary_loss_mlp": 0.01034226, "balance_loss_clip": 1.0210948, "balance_loss_mlp": 1.04085135, "epoch": 0.5505486246805952, "flos": 22450058409600.0, "grad_norm": 1.656591944189634, "language_loss": 0.82492292, "learning_rate": 1.7701424638496475e-06, "loss": 0.84638888, "num_input_tokens_seen": 197321575, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 9157, "time_per_iteration": 2.472702980041504 }, { "auxiliary_loss_clip": 0.01120316, "auxiliary_loss_mlp": 0.0103513, "balance_loss_clip": 1.02009141, "balance_loss_mlp": 1.04362845, "epoch": 0.5506087479332632, "flos": 26906896085760.0, "grad_norm": 2.2428361492672275, "language_loss": 0.75500357, "learning_rate": 1.7697555872048677e-06, "loss": 0.77655804, "num_input_tokens_seen": 197340255, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.765625, "step": 9158, "time_per_iteration": 2.5536630153656006 }, { "auxiliary_loss_clip": 0.01111157, "auxiliary_loss_mlp": 0.01032206, "balance_loss_clip": 1.01936102, "balance_loss_mlp": 1.04141951, "epoch": 0.5506688711859311, "flos": 22930256355840.0, "grad_norm": 9.581363064411901, "language_loss": 0.69851005, "learning_rate": 1.769368719290979e-06, "loss": 0.7199437, "num_input_tokens_seen": 197360360, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 9159, "time_per_iteration": 2.4943268299102783 }, { "auxiliary_loss_clip": 0.01117686, "auxiliary_loss_mlp": 0.01035843, "balance_loss_clip": 1.02220559, "balance_loss_mlp": 1.04303145, "epoch": 0.5507289944385991, "flos": 29606408772480.0, "grad_norm": 1.6140701164853248, "language_loss": 0.68274307, "learning_rate": 1.7689818601226516e-06, "loss": 0.70427835, "num_input_tokens_seen": 197381905, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.74609375, "step": 9160, "time_per_iteration": 2.541928768157959 }, { "auxiliary_loss_clip": 0.01112068, "auxiliary_loss_mlp": 0.010322, "balance_loss_clip": 1.0195992, "balance_loss_mlp": 1.04137754, "epoch": 0.5507891176912671, "flos": 15334431091200.0, "grad_norm": 2.205406958038822, "language_loss": 0.71490109, "learning_rate": 1.7685950097145552e-06, "loss": 0.73634374, "num_input_tokens_seen": 197398555, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 9161, "time_per_iteration": 2.4322917461395264 }, { "auxiliary_loss_clip": 0.01115632, "auxiliary_loss_mlp": 0.01039368, "balance_loss_clip": 1.02590358, "balance_loss_mlp": 1.04232454, "epoch": 0.5508492409439351, "flos": 26578313447040.0, "grad_norm": 3.014611543043638, "language_loss": 0.69611746, "learning_rate": 1.768208168081359e-06, "loss": 0.71766746, "num_input_tokens_seen": 197419630, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 9162, "time_per_iteration": 2.516444206237793 }, { "auxiliary_loss_clip": 0.01115168, "auxiliary_loss_mlp": 0.01037697, "balance_loss_clip": 1.02420306, "balance_loss_mlp": 1.04312062, "epoch": 0.5509093641966031, "flos": 25443428261760.0, "grad_norm": 1.6543304571582198, "language_loss": 0.8579874, "learning_rate": 1.767821335237733e-06, "loss": 0.87951612, "num_input_tokens_seen": 197438480, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 9163, "time_per_iteration": 2.4890623092651367 }, { "auxiliary_loss_clip": 0.01116149, "auxiliary_loss_mlp": 0.01035431, "balance_loss_clip": 1.023206, "balance_loss_mlp": 1.04423594, "epoch": 0.550969487449271, "flos": 18698543170560.0, "grad_norm": 1.7979418778558796, "language_loss": 0.8058666, "learning_rate": 1.7674345111983441e-06, "loss": 0.82738239, "num_input_tokens_seen": 197456755, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71875, "step": 9164, "time_per_iteration": 2.456604480743408 }, { "auxiliary_loss_clip": 0.01119522, "auxiliary_loss_mlp": 0.01032433, "balance_loss_clip": 1.01847947, "balance_loss_mlp": 1.04421222, "epoch": 0.551029610701939, "flos": 22708723224960.0, "grad_norm": 2.1859674493167303, "language_loss": 0.73169214, "learning_rate": 1.767047695977863e-06, "loss": 0.75321174, "num_input_tokens_seen": 197475530, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.75, "step": 9165, "time_per_iteration": 2.4805643558502197 }, { "auxiliary_loss_clip": 0.01112094, "auxiliary_loss_mlp": 0.01029206, "balance_loss_clip": 1.01680827, "balance_loss_mlp": 1.04060864, "epoch": 0.5510897339546069, "flos": 12420496166400.0, "grad_norm": 3.6351215305389157, "language_loss": 0.78968048, "learning_rate": 1.7666608895909563e-06, "loss": 0.81109351, "num_input_tokens_seen": 197490835, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71484375, "step": 9166, "time_per_iteration": 2.44793438911438 }, { "auxiliary_loss_clip": 0.01116887, "auxiliary_loss_mlp": 0.0103023, "balance_loss_clip": 1.016765, "balance_loss_mlp": 1.04226398, "epoch": 0.5511498572072749, "flos": 18770579896320.0, "grad_norm": 2.0182589469039374, "language_loss": 0.76336414, "learning_rate": 1.7662740920522913e-06, "loss": 0.78483534, "num_input_tokens_seen": 197508770, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.74609375, "step": 9167, "time_per_iteration": 2.4668288230895996 }, { "auxiliary_loss_clip": 0.01113911, "auxiliary_loss_mlp": 0.01031958, "balance_loss_clip": 1.01760507, "balance_loss_mlp": 1.04194427, "epoch": 0.5512099804599428, "flos": 19573326996480.0, "grad_norm": 1.9166918886680946, "language_loss": 0.79415244, "learning_rate": 1.7658873033765374e-06, "loss": 0.81561112, "num_input_tokens_seen": 197527340, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.71875, "step": 9168, "time_per_iteration": 2.481132984161377 }, { "auxiliary_loss_clip": 0.01118919, "auxiliary_loss_mlp": 0.01038587, "balance_loss_clip": 1.02550364, "balance_loss_mlp": 1.04378569, "epoch": 0.5512701037126109, "flos": 26245600744320.0, "grad_norm": 1.6238671014843662, "language_loss": 0.68928838, "learning_rate": 1.7655005235783591e-06, "loss": 0.71086341, "num_input_tokens_seen": 197547280, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75, "step": 9169, "time_per_iteration": 2.5086874961853027 }, { "auxiliary_loss_clip": 0.01111253, "auxiliary_loss_mlp": 0.01025343, "balance_loss_clip": 1.01324344, "balance_loss_mlp": 1.04099751, "epoch": 0.5513302269652788, "flos": 21945406279680.0, "grad_norm": 2.048018880119986, "language_loss": 0.85293424, "learning_rate": 1.7651137526724251e-06, "loss": 0.87430024, "num_input_tokens_seen": 197565045, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 9170, "time_per_iteration": 2.489149808883667 }, { "auxiliary_loss_clip": 0.01037189, "auxiliary_loss_mlp": 0.01003837, "balance_loss_clip": 1.00253761, "balance_loss_mlp": 1.01246142, "epoch": 0.5513903502179468, "flos": 68235948616320.0, "grad_norm": 0.8206815066078308, "language_loss": 0.59856558, "learning_rate": 1.7647269906734017e-06, "loss": 0.61897588, "num_input_tokens_seen": 197625005, "router_z_loss_clip": 0.01300049, "router_z_loss_mlp": 0.24804688, "step": 9171, "time_per_iteration": 3.104214906692505 }, { "auxiliary_loss_clip": 0.01115308, "auxiliary_loss_mlp": 0.01034686, "balance_loss_clip": 1.02203143, "balance_loss_mlp": 1.04285359, "epoch": 0.5514504734706147, "flos": 18734238311040.0, "grad_norm": 3.0925402894232485, "language_loss": 0.70677686, "learning_rate": 1.7643402375959533e-06, "loss": 0.72827679, "num_input_tokens_seen": 197645050, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 9172, "time_per_iteration": 3.863513469696045 }, { "auxiliary_loss_clip": 0.01112924, "auxiliary_loss_mlp": 0.01038093, "balance_loss_clip": 1.02521181, "balance_loss_mlp": 1.04059696, "epoch": 0.5515105967232827, "flos": 22270972176000.0, "grad_norm": 1.7810676579509803, "language_loss": 0.75843996, "learning_rate": 1.7639534934547474e-06, "loss": 0.77995014, "num_input_tokens_seen": 197663910, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.72265625, "step": 9173, "time_per_iteration": 2.5157313346862793 }, { "auxiliary_loss_clip": 0.01111916, "auxiliary_loss_mlp": 0.0103627, "balance_loss_clip": 1.02275157, "balance_loss_mlp": 1.04097462, "epoch": 0.5515707199759508, "flos": 22557682535040.0, "grad_norm": 2.9035427668385227, "language_loss": 0.75218928, "learning_rate": 1.7635667582644484e-06, "loss": 0.77367115, "num_input_tokens_seen": 197681580, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 9174, "time_per_iteration": 4.013200044631958 }, { "auxiliary_loss_clip": 0.01117404, "auxiliary_loss_mlp": 0.01035221, "balance_loss_clip": 1.02189922, "balance_loss_mlp": 1.04322076, "epoch": 0.5516308432286187, "flos": 28291072636800.0, "grad_norm": 2.9437078046765266, "language_loss": 0.72721612, "learning_rate": 1.7631800320397217e-06, "loss": 0.7487424, "num_input_tokens_seen": 197702095, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 9175, "time_per_iteration": 2.512202739715576 }, { "auxiliary_loss_clip": 0.01116311, "auxiliary_loss_mlp": 0.0103432, "balance_loss_clip": 1.02118301, "balance_loss_mlp": 1.04283834, "epoch": 0.5516909664812867, "flos": 18764474584320.0, "grad_norm": 1.9609967768721295, "language_loss": 0.69372869, "learning_rate": 1.7627933147952318e-06, "loss": 0.71523499, "num_input_tokens_seen": 197720720, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 9176, "time_per_iteration": 5.364535808563232 }, { "auxiliary_loss_clip": 0.01113443, "auxiliary_loss_mlp": 0.01035325, "balance_loss_clip": 1.02231324, "balance_loss_mlp": 1.04208708, "epoch": 0.5517510897339546, "flos": 27740346336000.0, "grad_norm": 1.6417588298694628, "language_loss": 0.71302867, "learning_rate": 1.7624066065456435e-06, "loss": 0.73451632, "num_input_tokens_seen": 197741820, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 9177, "time_per_iteration": 2.551445960998535 }, { "auxiliary_loss_clip": 0.01117519, "auxiliary_loss_mlp": 0.01031861, "balance_loss_clip": 1.01904595, "balance_loss_mlp": 1.04475141, "epoch": 0.5518112129866226, "flos": 18404470523520.0, "grad_norm": 1.635958758205206, "language_loss": 0.80431277, "learning_rate": 1.7620199073056204e-06, "loss": 0.82580662, "num_input_tokens_seen": 197759160, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 9178, "time_per_iteration": 2.461122751235962 }, { "auxiliary_loss_clip": 0.01118436, "auxiliary_loss_mlp": 0.01039706, "balance_loss_clip": 1.02553821, "balance_loss_mlp": 1.0432992, "epoch": 0.5518713362392905, "flos": 25082670015360.0, "grad_norm": 1.6578190725019892, "language_loss": 0.74730963, "learning_rate": 1.761633217089826e-06, "loss": 0.7688911, "num_input_tokens_seen": 197779760, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75390625, "step": 9179, "time_per_iteration": 2.514556407928467 }, { "auxiliary_loss_clip": 0.01116904, "auxiliary_loss_mlp": 0.01039275, "balance_loss_clip": 1.02613831, "balance_loss_mlp": 1.04368877, "epoch": 0.5519314594919585, "flos": 36538999361280.0, "grad_norm": 3.0824917576053275, "language_loss": 0.7048521, "learning_rate": 1.761246535912924e-06, "loss": 0.72641397, "num_input_tokens_seen": 197801545, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 9180, "time_per_iteration": 2.598539352416992 }, { "auxiliary_loss_clip": 0.01116842, "auxiliary_loss_mlp": 0.01038036, "balance_loss_clip": 1.0243926, "balance_loss_mlp": 1.04316497, "epoch": 0.5519915827446265, "flos": 20448613612800.0, "grad_norm": 6.320344627244839, "language_loss": 0.67270768, "learning_rate": 1.7608598637895776e-06, "loss": 0.69425642, "num_input_tokens_seen": 197820760, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 9181, "time_per_iteration": 2.472245931625366 }, { "auxiliary_loss_clip": 0.0111645, "auxiliary_loss_mlp": 0.01035606, "balance_loss_clip": 1.02194488, "balance_loss_mlp": 1.04142547, "epoch": 0.5520517059972945, "flos": 23768052151680.0, "grad_norm": 2.4628018698361216, "language_loss": 0.7922281, "learning_rate": 1.7604732007344486e-06, "loss": 0.81374866, "num_input_tokens_seen": 197840195, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 9182, "time_per_iteration": 2.4962759017944336 }, { "auxiliary_loss_clip": 0.01115927, "auxiliary_loss_mlp": 0.01032776, "balance_loss_clip": 1.0194726, "balance_loss_mlp": 1.04250932, "epoch": 0.5521118292499624, "flos": 22196457411840.0, "grad_norm": 2.0090520789018083, "language_loss": 0.83142936, "learning_rate": 1.7600865467622003e-06, "loss": 0.85291636, "num_input_tokens_seen": 197859475, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 9183, "time_per_iteration": 2.46573805809021 }, { "auxiliary_loss_clip": 0.01113915, "auxiliary_loss_mlp": 0.01028796, "balance_loss_clip": 1.01599884, "balance_loss_mlp": 1.0421021, "epoch": 0.5521719525026304, "flos": 23583291569280.0, "grad_norm": 3.670203007833223, "language_loss": 0.67370683, "learning_rate": 1.7596999018874936e-06, "loss": 0.69513392, "num_input_tokens_seen": 197879395, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 9184, "time_per_iteration": 2.495980739593506 }, { "auxiliary_loss_clip": 0.01115076, "auxiliary_loss_mlp": 0.01030555, "balance_loss_clip": 1.01660717, "balance_loss_mlp": 1.04238749, "epoch": 0.5522320757552983, "flos": 26137617482880.0, "grad_norm": 2.3298503007396256, "language_loss": 0.76574546, "learning_rate": 1.7593132661249917e-06, "loss": 0.78720176, "num_input_tokens_seen": 197900815, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7265625, "step": 9185, "time_per_iteration": 2.5244855880737305 }, { "auxiliary_loss_clip": 0.01117599, "auxiliary_loss_mlp": 0.01038993, "balance_loss_clip": 1.02545691, "balance_loss_mlp": 1.04354346, "epoch": 0.5522921990079663, "flos": 24676160820480.0, "grad_norm": 1.8569323193587035, "language_loss": 0.74111086, "learning_rate": 1.7589266394893536e-06, "loss": 0.76267684, "num_input_tokens_seen": 197918985, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 9186, "time_per_iteration": 2.4767673015594482 }, { "auxiliary_loss_clip": 0.01120173, "auxiliary_loss_mlp": 0.0104138, "balance_loss_clip": 1.02861238, "balance_loss_mlp": 1.04540658, "epoch": 0.5523523222606344, "flos": 22748153379840.0, "grad_norm": 4.399593073266328, "language_loss": 0.66415381, "learning_rate": 1.7585400219952421e-06, "loss": 0.68576932, "num_input_tokens_seen": 197937725, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.74609375, "step": 9187, "time_per_iteration": 2.4953646659851074 }, { "auxiliary_loss_clip": 0.01116732, "auxiliary_loss_mlp": 0.01031404, "balance_loss_clip": 1.01782584, "balance_loss_mlp": 1.04361153, "epoch": 0.5524124455133023, "flos": 19755825022080.0, "grad_norm": 1.8581102347123235, "language_loss": 0.77631533, "learning_rate": 1.758153413657318e-06, "loss": 0.79779661, "num_input_tokens_seen": 197955635, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 9188, "time_per_iteration": 2.451159954071045 }, { "auxiliary_loss_clip": 0.01114883, "auxiliary_loss_mlp": 0.01033962, "balance_loss_clip": 1.02052093, "balance_loss_mlp": 1.04269409, "epoch": 0.5524725687659703, "flos": 23294821443840.0, "grad_norm": 1.852764048541848, "language_loss": 0.81518179, "learning_rate": 1.7577668144902394e-06, "loss": 0.83667028, "num_input_tokens_seen": 197974490, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 9189, "time_per_iteration": 2.5251717567443848 }, { "auxiliary_loss_clip": 0.01115314, "auxiliary_loss_mlp": 0.0103453, "balance_loss_clip": 1.02101207, "balance_loss_mlp": 1.04408765, "epoch": 0.5525326920186382, "flos": 24862178378880.0, "grad_norm": 1.672839187559558, "language_loss": 0.76280689, "learning_rate": 1.7573802245086684e-06, "loss": 0.78430533, "num_input_tokens_seen": 197995735, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 9190, "time_per_iteration": 2.525125503540039 }, { "auxiliary_loss_clip": 0.01118712, "auxiliary_loss_mlp": 0.01038134, "balance_loss_clip": 1.02317286, "balance_loss_mlp": 1.0418123, "epoch": 0.5525928152713062, "flos": 13735580906880.0, "grad_norm": 2.8316320364750203, "language_loss": 0.79356694, "learning_rate": 1.7569936437272627e-06, "loss": 0.81513536, "num_input_tokens_seen": 198009685, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.76953125, "step": 9191, "time_per_iteration": 2.4502787590026855 }, { "auxiliary_loss_clip": 0.01116417, "auxiliary_loss_mlp": 0.01033675, "balance_loss_clip": 1.02022207, "balance_loss_mlp": 1.04402375, "epoch": 0.5526529385239741, "flos": 13071592045440.0, "grad_norm": 4.177339766148944, "language_loss": 0.68919158, "learning_rate": 1.7566070721606829e-06, "loss": 0.71069252, "num_input_tokens_seen": 198026845, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 9192, "time_per_iteration": 2.4490036964416504 }, { "auxiliary_loss_clip": 0.01113215, "auxiliary_loss_mlp": 0.01031217, "balance_loss_clip": 1.01948118, "balance_loss_mlp": 1.042799, "epoch": 0.5527130617766421, "flos": 23148377694720.0, "grad_norm": 1.703248297748343, "language_loss": 0.77442098, "learning_rate": 1.756220509823588e-06, "loss": 0.7958653, "num_input_tokens_seen": 198045275, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.703125, "step": 9193, "time_per_iteration": 2.480211019515991 }, { "auxiliary_loss_clip": 0.0111427, "auxiliary_loss_mlp": 0.01034252, "balance_loss_clip": 1.02147245, "balance_loss_mlp": 1.04238188, "epoch": 0.55277318502931, "flos": 21285547482240.0, "grad_norm": 1.7789284804070735, "language_loss": 0.78545058, "learning_rate": 1.7558339567306344e-06, "loss": 0.80693579, "num_input_tokens_seen": 198065760, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 9194, "time_per_iteration": 2.4777791500091553 }, { "auxiliary_loss_clip": 0.01117903, "auxiliary_loss_mlp": 0.01033797, "balance_loss_clip": 1.02071941, "balance_loss_mlp": 1.04135585, "epoch": 0.5528333082819781, "flos": 38324549462400.0, "grad_norm": 1.8094602601872711, "language_loss": 0.69587851, "learning_rate": 1.7554474128964825e-06, "loss": 0.71739554, "num_input_tokens_seen": 198087595, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.765625, "step": 9195, "time_per_iteration": 2.622224807739258 }, { "auxiliary_loss_clip": 0.01120397, "auxiliary_loss_mlp": 0.01032711, "balance_loss_clip": 1.01838171, "balance_loss_mlp": 1.0428443, "epoch": 0.552893431534646, "flos": 13553621585280.0, "grad_norm": 2.0407553794087105, "language_loss": 0.74756515, "learning_rate": 1.7550608783357887e-06, "loss": 0.76909626, "num_input_tokens_seen": 198104620, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7734375, "step": 9196, "time_per_iteration": 2.4380996227264404 }, { "auxiliary_loss_clip": 0.01114002, "auxiliary_loss_mlp": 0.01029722, "balance_loss_clip": 1.0171752, "balance_loss_mlp": 1.04253078, "epoch": 0.552953554787314, "flos": 21939408708480.0, "grad_norm": 1.9697527622613797, "language_loss": 0.77019745, "learning_rate": 1.7546743530632115e-06, "loss": 0.79163468, "num_input_tokens_seen": 198123565, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 9197, "time_per_iteration": 2.4708290100097656 }, { "auxiliary_loss_clip": 0.01111528, "auxiliary_loss_mlp": 0.01029685, "balance_loss_clip": 1.01770997, "balance_loss_mlp": 1.04045892, "epoch": 0.5530136780399819, "flos": 43658002558080.0, "grad_norm": 1.9420469566867604, "language_loss": 0.76421082, "learning_rate": 1.754287837093407e-06, "loss": 0.78562295, "num_input_tokens_seen": 198148270, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 9198, "time_per_iteration": 2.6924846172332764 }, { "auxiliary_loss_clip": 0.01113122, "auxiliary_loss_mlp": 0.01027143, "balance_loss_clip": 1.01475692, "balance_loss_mlp": 1.04108584, "epoch": 0.5530738012926499, "flos": 25045502417280.0, "grad_norm": 1.5214958860386518, "language_loss": 0.79490733, "learning_rate": 1.7539013304410327e-06, "loss": 0.81630999, "num_input_tokens_seen": 198168810, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 9199, "time_per_iteration": 2.5236096382141113 }, { "auxiliary_loss_clip": 0.01113551, "auxiliary_loss_mlp": 0.01031498, "balance_loss_clip": 1.01920199, "balance_loss_mlp": 1.04096091, "epoch": 0.553133924545318, "flos": 16472081623680.0, "grad_norm": 1.859009967834953, "language_loss": 0.6396364, "learning_rate": 1.7535148331207443e-06, "loss": 0.66108692, "num_input_tokens_seen": 198186200, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7265625, "step": 9200, "time_per_iteration": 2.4430792331695557 }, { "auxiliary_loss_clip": 0.01119367, "auxiliary_loss_mlp": 0.01028229, "balance_loss_clip": 1.01447868, "balance_loss_mlp": 1.04436362, "epoch": 0.5531940477979859, "flos": 24606207083520.0, "grad_norm": 2.468666205970199, "language_loss": 0.66282159, "learning_rate": 1.7531283451471978e-06, "loss": 0.68429756, "num_input_tokens_seen": 198207050, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 9201, "time_per_iteration": 2.5203335285186768 }, { "auxiliary_loss_clip": 0.01115632, "auxiliary_loss_mlp": 0.01032638, "balance_loss_clip": 1.01908946, "balance_loss_mlp": 1.04386067, "epoch": 0.5532541710506539, "flos": 22159577122560.0, "grad_norm": 2.1819879772318815, "language_loss": 0.60491085, "learning_rate": 1.7527418665350502e-06, "loss": 0.62639356, "num_input_tokens_seen": 198224565, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 9202, "time_per_iteration": 2.4794793128967285 }, { "auxiliary_loss_clip": 0.01112428, "auxiliary_loss_mlp": 0.01028916, "balance_loss_clip": 1.01631546, "balance_loss_mlp": 1.04193318, "epoch": 0.5533142943033218, "flos": 21397265758080.0, "grad_norm": 1.753629315888245, "language_loss": 0.64474446, "learning_rate": 1.7523553972989548e-06, "loss": 0.6661579, "num_input_tokens_seen": 198244790, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 9203, "time_per_iteration": 2.4907898902893066 }, { "auxiliary_loss_clip": 0.01115286, "auxiliary_loss_mlp": 0.01025206, "balance_loss_clip": 1.0130589, "balance_loss_mlp": 1.04308689, "epoch": 0.5533744175559898, "flos": 23550541344000.0, "grad_norm": 1.6395515973132464, "language_loss": 0.63803256, "learning_rate": 1.7519689374535683e-06, "loss": 0.65943742, "num_input_tokens_seen": 198264375, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.72265625, "step": 9204, "time_per_iteration": 2.4883995056152344 }, { "auxiliary_loss_clip": 0.01111243, "auxiliary_loss_mlp": 0.01028442, "balance_loss_clip": 1.01612139, "balance_loss_mlp": 1.0407095, "epoch": 0.5534345408086577, "flos": 24061514267520.0, "grad_norm": 1.6535199741967663, "language_loss": 0.77264726, "learning_rate": 1.7515824870135445e-06, "loss": 0.79404414, "num_input_tokens_seen": 198283895, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 9205, "time_per_iteration": 2.5271224975585938 }, { "auxiliary_loss_clip": 0.01113135, "auxiliary_loss_mlp": 0.01032836, "balance_loss_clip": 1.02027178, "balance_loss_mlp": 1.04308319, "epoch": 0.5534946640613257, "flos": 33771831408000.0, "grad_norm": 1.6230700667393676, "language_loss": 0.72479534, "learning_rate": 1.751196045993537e-06, "loss": 0.74625504, "num_input_tokens_seen": 198310035, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 9206, "time_per_iteration": 2.6222331523895264 }, { "auxiliary_loss_clip": 0.01114645, "auxiliary_loss_mlp": 0.01032246, "balance_loss_clip": 1.02032495, "balance_loss_mlp": 1.04280829, "epoch": 0.5535547873139937, "flos": 15159223526400.0, "grad_norm": 2.3867970890596353, "language_loss": 0.75525558, "learning_rate": 1.7508096144082012e-06, "loss": 0.77672452, "num_input_tokens_seen": 198327810, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.71875, "step": 9207, "time_per_iteration": 2.4658191204071045 }, { "auxiliary_loss_clip": 0.01117874, "auxiliary_loss_mlp": 0.01030776, "balance_loss_clip": 1.01749647, "balance_loss_mlp": 1.04235077, "epoch": 0.5536149105666617, "flos": 16980863817600.0, "grad_norm": 3.5168598734092864, "language_loss": 0.6098851, "learning_rate": 1.750423192272189e-06, "loss": 0.63137162, "num_input_tokens_seen": 198343150, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.75390625, "step": 9208, "time_per_iteration": 2.4216222763061523 }, { "auxiliary_loss_clip": 0.01113606, "auxiliary_loss_mlp": 0.01031697, "balance_loss_clip": 1.01926327, "balance_loss_mlp": 1.04126024, "epoch": 0.5536750338193296, "flos": 18149935772160.0, "grad_norm": 2.171079491107158, "language_loss": 0.64267725, "learning_rate": 1.7500367796001547e-06, "loss": 0.66413027, "num_input_tokens_seen": 198360925, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.72265625, "step": 9209, "time_per_iteration": 2.4588983058929443 }, { "auxiliary_loss_clip": 0.01112267, "auxiliary_loss_mlp": 0.0103393, "balance_loss_clip": 1.02035213, "balance_loss_mlp": 1.04120827, "epoch": 0.5537351570719976, "flos": 22747794243840.0, "grad_norm": 3.3483537057324075, "language_loss": 0.82366329, "learning_rate": 1.7496503764067513e-06, "loss": 0.84512532, "num_input_tokens_seen": 198379265, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 9210, "time_per_iteration": 2.47125244140625 }, { "auxiliary_loss_clip": 0.0111259, "auxiliary_loss_mlp": 0.01026468, "balance_loss_clip": 1.01452327, "balance_loss_mlp": 1.04236639, "epoch": 0.5537952803246655, "flos": 26356026130560.0, "grad_norm": 2.1670649773680783, "language_loss": 0.72814345, "learning_rate": 1.74926398270663e-06, "loss": 0.74953407, "num_input_tokens_seen": 198399490, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.703125, "step": 9211, "time_per_iteration": 2.5329205989837646 }, { "auxiliary_loss_clip": 0.01116432, "auxiliary_loss_mlp": 0.01035468, "balance_loss_clip": 1.02110291, "balance_loss_mlp": 1.04215848, "epoch": 0.5538554035773335, "flos": 18037427397120.0, "grad_norm": 2.12428731789803, "language_loss": 0.6650939, "learning_rate": 1.7488775985144437e-06, "loss": 0.68661284, "num_input_tokens_seen": 198419110, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7421875, "step": 9212, "time_per_iteration": 2.4607326984405518 }, { "auxiliary_loss_clip": 0.01115413, "auxiliary_loss_mlp": 0.01028486, "balance_loss_clip": 1.01437128, "balance_loss_mlp": 1.04110682, "epoch": 0.5539155268300014, "flos": 31686247002240.0, "grad_norm": 1.375037754081733, "language_loss": 0.51506418, "learning_rate": 1.7484912238448443e-06, "loss": 0.5365032, "num_input_tokens_seen": 198441360, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7421875, "step": 9213, "time_per_iteration": 2.570133924484253 }, { "auxiliary_loss_clip": 0.01117736, "auxiliary_loss_mlp": 0.01027548, "balance_loss_clip": 1.01413143, "balance_loss_mlp": 1.04444444, "epoch": 0.5539756500826695, "flos": 15193769431680.0, "grad_norm": 2.2722463983249854, "language_loss": 0.86053586, "learning_rate": 1.7481048587124827e-06, "loss": 0.8819887, "num_input_tokens_seen": 198459835, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 9214, "time_per_iteration": 3.79738450050354 }, { "auxiliary_loss_clip": 0.01113779, "auxiliary_loss_mlp": 0.01029588, "balance_loss_clip": 1.01742911, "balance_loss_mlp": 1.04344904, "epoch": 0.5540357733353375, "flos": 26353117128960.0, "grad_norm": 2.0512067054092267, "language_loss": 0.69819027, "learning_rate": 1.7477185031320108e-06, "loss": 0.71962392, "num_input_tokens_seen": 198478955, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.703125, "step": 9215, "time_per_iteration": 2.517885684967041 }, { "auxiliary_loss_clip": 0.01115992, "auxiliary_loss_mlp": 0.0103112, "balance_loss_clip": 1.01804256, "balance_loss_mlp": 1.04294348, "epoch": 0.5540958965880054, "flos": 21323684747520.0, "grad_norm": 1.6187324751253274, "language_loss": 0.73246706, "learning_rate": 1.7473321571180773e-06, "loss": 0.7539382, "num_input_tokens_seen": 198499030, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 9216, "time_per_iteration": 3.970515727996826 }, { "auxiliary_loss_clip": 0.0111326, "auxiliary_loss_mlp": 0.01028673, "balance_loss_clip": 1.01551223, "balance_loss_mlp": 1.04316688, "epoch": 0.5541560198406734, "flos": 25666828899840.0, "grad_norm": 2.0140661745920236, "language_loss": 0.71470833, "learning_rate": 1.7469458206853345e-06, "loss": 0.73612773, "num_input_tokens_seen": 198520265, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69921875, "step": 9217, "time_per_iteration": 4.024096488952637 }, { "auxiliary_loss_clip": 0.01111433, "auxiliary_loss_mlp": 0.01025925, "balance_loss_clip": 1.01350951, "balance_loss_mlp": 1.04080367, "epoch": 0.5542161430933413, "flos": 21939624190080.0, "grad_norm": 3.5659430995070354, "language_loss": 0.78487873, "learning_rate": 1.7465594938484315e-06, "loss": 0.8062523, "num_input_tokens_seen": 198539645, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 9218, "time_per_iteration": 3.8553974628448486 }, { "auxiliary_loss_clip": 0.01116948, "auxiliary_loss_mlp": 0.01030966, "balance_loss_clip": 1.01719689, "balance_loss_mlp": 1.04236436, "epoch": 0.5542762663460093, "flos": 19571459489280.0, "grad_norm": 2.3553275467550363, "language_loss": 0.72354805, "learning_rate": 1.7461731766220176e-06, "loss": 0.74502712, "num_input_tokens_seen": 198558710, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.74609375, "step": 9219, "time_per_iteration": 2.4734766483306885 }, { "auxiliary_loss_clip": 0.01119621, "auxiliary_loss_mlp": 0.01041642, "balance_loss_clip": 1.02780747, "balance_loss_mlp": 1.04614651, "epoch": 0.5543363895986773, "flos": 19499063627520.0, "grad_norm": 1.5346652009007524, "language_loss": 0.71752274, "learning_rate": 1.7457868690207426e-06, "loss": 0.73913538, "num_input_tokens_seen": 198577050, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 9220, "time_per_iteration": 2.4745028018951416 }, { "auxiliary_loss_clip": 0.01114287, "auxiliary_loss_mlp": 0.01024087, "balance_loss_clip": 1.01236296, "balance_loss_mlp": 1.04343581, "epoch": 0.5543965128513453, "flos": 22635609091200.0, "grad_norm": 1.574814446580368, "language_loss": 0.79294491, "learning_rate": 1.7454005710592547e-06, "loss": 0.81432867, "num_input_tokens_seen": 198595290, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.70703125, "step": 9221, "time_per_iteration": 2.478616714477539 }, { "auxiliary_loss_clip": 0.01114609, "auxiliary_loss_mlp": 0.01032069, "balance_loss_clip": 1.01943254, "balance_loss_mlp": 1.04310346, "epoch": 0.5544566361040132, "flos": 25989952671360.0, "grad_norm": 1.9444788450672785, "language_loss": 0.83596551, "learning_rate": 1.7450142827522027e-06, "loss": 0.85743225, "num_input_tokens_seen": 198614110, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 9222, "time_per_iteration": 2.5047740936279297 }, { "auxiliary_loss_clip": 0.01119075, "auxiliary_loss_mlp": 0.01032322, "balance_loss_clip": 1.01895225, "balance_loss_mlp": 1.04439688, "epoch": 0.5545167593566812, "flos": 28257568225920.0, "grad_norm": 1.8831927477980768, "language_loss": 0.75859725, "learning_rate": 1.7446280041142344e-06, "loss": 0.78011119, "num_input_tokens_seen": 198633880, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.74609375, "step": 9223, "time_per_iteration": 2.5356080532073975 }, { "auxiliary_loss_clip": 0.01114319, "auxiliary_loss_mlp": 0.01029802, "balance_loss_clip": 1.01667082, "balance_loss_mlp": 1.04295921, "epoch": 0.5545768826093491, "flos": 28476551491200.0, "grad_norm": 1.9113718767331542, "language_loss": 0.81793368, "learning_rate": 1.7442417351599986e-06, "loss": 0.83937484, "num_input_tokens_seen": 198653505, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 9224, "time_per_iteration": 2.5210134983062744 }, { "auxiliary_loss_clip": 0.01118174, "auxiliary_loss_mlp": 0.01039502, "balance_loss_clip": 1.0266155, "balance_loss_mlp": 1.04448354, "epoch": 0.5546370058620171, "flos": 18478051534080.0, "grad_norm": 2.088551184105962, "language_loss": 0.57228118, "learning_rate": 1.743855475904141e-06, "loss": 0.59385788, "num_input_tokens_seen": 198671890, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 9225, "time_per_iteration": 2.463196039199829 }, { "auxiliary_loss_clip": 0.01115371, "auxiliary_loss_mlp": 0.01035635, "balance_loss_clip": 1.02295125, "balance_loss_mlp": 1.04194641, "epoch": 0.554697129114685, "flos": 22930507751040.0, "grad_norm": 1.9372348170417557, "language_loss": 0.67391795, "learning_rate": 1.7434692263613098e-06, "loss": 0.69542801, "num_input_tokens_seen": 198691995, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.734375, "step": 9226, "time_per_iteration": 2.4796149730682373 }, { "auxiliary_loss_clip": 0.01114457, "auxiliary_loss_mlp": 0.01033036, "balance_loss_clip": 1.02012539, "balance_loss_mlp": 1.04142725, "epoch": 0.5547572523673531, "flos": 21797166850560.0, "grad_norm": 1.589220727798793, "language_loss": 0.74777931, "learning_rate": 1.7430829865461518e-06, "loss": 0.76925421, "num_input_tokens_seen": 198712440, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73046875, "step": 9227, "time_per_iteration": 2.508902072906494 }, { "auxiliary_loss_clip": 0.01118402, "auxiliary_loss_mlp": 0.01035831, "balance_loss_clip": 1.02208674, "balance_loss_mlp": 1.04571688, "epoch": 0.5548173756200211, "flos": 22342829333760.0, "grad_norm": 1.5826642392750576, "language_loss": 0.73561114, "learning_rate": 1.7426967564733118e-06, "loss": 0.75715351, "num_input_tokens_seen": 198731515, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7265625, "step": 9228, "time_per_iteration": 2.4868860244750977 }, { "auxiliary_loss_clip": 0.01116461, "auxiliary_loss_mlp": 0.01031218, "balance_loss_clip": 1.01863599, "balance_loss_mlp": 1.04383457, "epoch": 0.554877498872689, "flos": 17858736213120.0, "grad_norm": 1.9125411435991615, "language_loss": 0.75774062, "learning_rate": 1.7423105361574373e-06, "loss": 0.77921736, "num_input_tokens_seen": 198749750, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 9229, "time_per_iteration": 2.4668397903442383 }, { "auxiliary_loss_clip": 0.01116503, "auxiliary_loss_mlp": 0.01043968, "balance_loss_clip": 1.03047311, "balance_loss_mlp": 1.04371381, "epoch": 0.554937622125357, "flos": 17238343484160.0, "grad_norm": 1.4565879901973524, "language_loss": 0.68741387, "learning_rate": 1.741924325613172e-06, "loss": 0.70901859, "num_input_tokens_seen": 198768320, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 9230, "time_per_iteration": 2.499490737915039 }, { "auxiliary_loss_clip": 0.01116235, "auxiliary_loss_mlp": 0.01034319, "balance_loss_clip": 1.02065814, "balance_loss_mlp": 1.0424428, "epoch": 0.5549977453780249, "flos": 25368087484800.0, "grad_norm": 3.6539832131190524, "language_loss": 0.68028712, "learning_rate": 1.741538124855163e-06, "loss": 0.70179272, "num_input_tokens_seen": 198787230, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73828125, "step": 9231, "time_per_iteration": 2.5236940383911133 }, { "auxiliary_loss_clip": 0.01120854, "auxiliary_loss_mlp": 0.01035134, "balance_loss_clip": 1.02106094, "balance_loss_mlp": 1.0448705, "epoch": 0.555057868630693, "flos": 25079114568960.0, "grad_norm": 1.9332637709746974, "language_loss": 0.78631854, "learning_rate": 1.7411519338980548e-06, "loss": 0.80787838, "num_input_tokens_seen": 198806720, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7578125, "step": 9232, "time_per_iteration": 2.51003360748291 }, { "auxiliary_loss_clip": 0.01113937, "auxiliary_loss_mlp": 0.010341, "balance_loss_clip": 1.02248287, "balance_loss_mlp": 1.04326284, "epoch": 0.5551179918833609, "flos": 26104220812800.0, "grad_norm": 1.7905014840364535, "language_loss": 0.82821584, "learning_rate": 1.7407657527564898e-06, "loss": 0.84969616, "num_input_tokens_seen": 198826235, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.70703125, "step": 9233, "time_per_iteration": 2.5329959392547607 }, { "auxiliary_loss_clip": 0.01118246, "auxiliary_loss_mlp": 0.01036834, "balance_loss_clip": 1.0239594, "balance_loss_mlp": 1.0428443, "epoch": 0.5551781151360289, "flos": 19384759572480.0, "grad_norm": 2.91791443384211, "language_loss": 0.74548429, "learning_rate": 1.7403795814451142e-06, "loss": 0.76703501, "num_input_tokens_seen": 198842655, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75390625, "step": 9234, "time_per_iteration": 2.4455130100250244 }, { "auxiliary_loss_clip": 0.01112928, "auxiliary_loss_mlp": 0.01028641, "balance_loss_clip": 1.01596332, "balance_loss_mlp": 1.04230332, "epoch": 0.5552382383886968, "flos": 21725956137600.0, "grad_norm": 1.9801802767946215, "language_loss": 0.64710498, "learning_rate": 1.7399934199785706e-06, "loss": 0.66852069, "num_input_tokens_seen": 198861210, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 9235, "time_per_iteration": 2.4959511756896973 }, { "auxiliary_loss_clip": 0.01116807, "auxiliary_loss_mlp": 0.01030203, "balance_loss_clip": 1.01700079, "balance_loss_mlp": 1.04320955, "epoch": 0.5552983616413648, "flos": 14356189117440.0, "grad_norm": 1.9491549747223238, "language_loss": 0.68074191, "learning_rate": 1.7396072683715029e-06, "loss": 0.70221204, "num_input_tokens_seen": 198880045, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73828125, "step": 9236, "time_per_iteration": 2.4424667358398438 }, { "auxiliary_loss_clip": 0.01111396, "auxiliary_loss_mlp": 0.01026475, "balance_loss_clip": 1.01391649, "balance_loss_mlp": 1.04188991, "epoch": 0.5553584848940327, "flos": 25478548784640.0, "grad_norm": 1.69568454057883, "language_loss": 0.86427814, "learning_rate": 1.7392211266385536e-06, "loss": 0.88565683, "num_input_tokens_seen": 198900210, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 9237, "time_per_iteration": 2.5258705615997314 }, { "auxiliary_loss_clip": 0.01111137, "auxiliary_loss_mlp": 0.01036686, "balance_loss_clip": 1.02390075, "balance_loss_mlp": 1.04098499, "epoch": 0.5554186081467007, "flos": 22163850840960.0, "grad_norm": 1.5775719727286062, "language_loss": 0.73259032, "learning_rate": 1.7388349947943652e-06, "loss": 0.75406861, "num_input_tokens_seen": 198919055, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 9238, "time_per_iteration": 2.498361110687256 }, { "auxiliary_loss_clip": 0.01114876, "auxiliary_loss_mlp": 0.01033439, "balance_loss_clip": 1.02054024, "balance_loss_mlp": 1.04109776, "epoch": 0.5554787313993687, "flos": 49746656125440.0, "grad_norm": 1.7473335609620306, "language_loss": 0.78377581, "learning_rate": 1.73844887285358e-06, "loss": 0.80525893, "num_input_tokens_seen": 198943505, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 9239, "time_per_iteration": 2.745922565460205 }, { "auxiliary_loss_clip": 0.01115393, "auxiliary_loss_mlp": 0.01030502, "balance_loss_clip": 1.01743102, "balance_loss_mlp": 1.04265726, "epoch": 0.5555388546520367, "flos": 22127365601280.0, "grad_norm": 1.4731943480894547, "language_loss": 0.79897827, "learning_rate": 1.7380627608308393e-06, "loss": 0.82043719, "num_input_tokens_seen": 198963590, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 9240, "time_per_iteration": 2.497924566268921 }, { "auxiliary_loss_clip": 0.0111285, "auxiliary_loss_mlp": 0.01032243, "balance_loss_clip": 1.01967263, "balance_loss_mlp": 1.04065382, "epoch": 0.5555989779047047, "flos": 24682122478080.0, "grad_norm": 1.7883392208454174, "language_loss": 0.65542471, "learning_rate": 1.737676658740786e-06, "loss": 0.67687559, "num_input_tokens_seen": 198982680, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 9241, "time_per_iteration": 2.5409610271453857 }, { "auxiliary_loss_clip": 0.01116371, "auxiliary_loss_mlp": 0.01031819, "balance_loss_clip": 1.01892042, "balance_loss_mlp": 1.04313993, "epoch": 0.5556591011573726, "flos": 16106510954880.0, "grad_norm": 2.333383850529747, "language_loss": 0.72845209, "learning_rate": 1.7372905665980594e-06, "loss": 0.74993402, "num_input_tokens_seen": 199000185, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73046875, "step": 9242, "time_per_iteration": 2.433149576187134 }, { "auxiliary_loss_clip": 0.01115904, "auxiliary_loss_mlp": 0.01033164, "balance_loss_clip": 1.01906776, "balance_loss_mlp": 1.04205728, "epoch": 0.5557192244100406, "flos": 12933695733120.0, "grad_norm": 1.6781778297273313, "language_loss": 0.63779801, "learning_rate": 1.7369044844173012e-06, "loss": 0.6592887, "num_input_tokens_seen": 199018380, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73828125, "step": 9243, "time_per_iteration": 2.4685864448547363 }, { "auxiliary_loss_clip": 0.01116711, "auxiliary_loss_mlp": 0.0103227, "balance_loss_clip": 1.01931763, "balance_loss_mlp": 1.04478645, "epoch": 0.5557793476627085, "flos": 23111712887040.0, "grad_norm": 2.0735433181717435, "language_loss": 0.7489531, "learning_rate": 1.7365184122131509e-06, "loss": 0.77044284, "num_input_tokens_seen": 199037115, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 9244, "time_per_iteration": 2.478861093521118 }, { "auxiliary_loss_clip": 0.01110331, "auxiliary_loss_mlp": 0.01030914, "balance_loss_clip": 1.01922607, "balance_loss_mlp": 1.04195428, "epoch": 0.5558394709153766, "flos": 21428040735360.0, "grad_norm": 2.250082901411742, "language_loss": 0.7486853, "learning_rate": 1.7361323500002486e-06, "loss": 0.77009773, "num_input_tokens_seen": 199053375, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 9245, "time_per_iteration": 2.4825141429901123 }, { "auxiliary_loss_clip": 0.0111852, "auxiliary_loss_mlp": 0.01035062, "balance_loss_clip": 1.02115619, "balance_loss_mlp": 1.04319751, "epoch": 0.5558995941680445, "flos": 25078324469760.0, "grad_norm": 2.272965824099301, "language_loss": 0.79942828, "learning_rate": 1.7357462977932348e-06, "loss": 0.8209641, "num_input_tokens_seen": 199070930, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75, "step": 9246, "time_per_iteration": 2.514052152633667 }, { "auxiliary_loss_clip": 0.0111357, "auxiliary_loss_mlp": 0.01032441, "balance_loss_clip": 1.01971531, "balance_loss_mlp": 1.04178834, "epoch": 0.5559597174207125, "flos": 20011149872640.0, "grad_norm": 1.9975467003339045, "language_loss": 0.74129272, "learning_rate": 1.7353602556067471e-06, "loss": 0.76275283, "num_input_tokens_seen": 199088675, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 9247, "time_per_iteration": 2.491034984588623 }, { "auxiliary_loss_clip": 0.0111528, "auxiliary_loss_mlp": 0.01031504, "balance_loss_clip": 1.0181402, "balance_loss_mlp": 1.04252934, "epoch": 0.5560198406733804, "flos": 16835677044480.0, "grad_norm": 2.4595184345023786, "language_loss": 0.76048243, "learning_rate": 1.7349742234554254e-06, "loss": 0.78195024, "num_input_tokens_seen": 199103075, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 9248, "time_per_iteration": 2.433497428894043 }, { "auxiliary_loss_clip": 0.01040125, "auxiliary_loss_mlp": 0.01008295, "balance_loss_clip": 1.0068047, "balance_loss_mlp": 1.01542258, "epoch": 0.5560799639260484, "flos": 70697051758080.0, "grad_norm": 0.8502452601014919, "language_loss": 0.59412986, "learning_rate": 1.7345882013539081e-06, "loss": 0.61461407, "num_input_tokens_seen": 199160325, "router_z_loss_clip": 0.01489258, "router_z_loss_mlp": 0.24707031, "step": 9249, "time_per_iteration": 3.2133476734161377 }, { "auxiliary_loss_clip": 0.01110887, "auxiliary_loss_mlp": 0.01031415, "balance_loss_clip": 1.01834369, "balance_loss_mlp": 1.03910029, "epoch": 0.5561400871787163, "flos": 23148593176320.0, "grad_norm": 1.9684731495169123, "language_loss": 0.79704952, "learning_rate": 1.734202189316832e-06, "loss": 0.8184725, "num_input_tokens_seen": 199179760, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 9250, "time_per_iteration": 2.477689743041992 }, { "auxiliary_loss_clip": 0.01114244, "auxiliary_loss_mlp": 0.01035946, "balance_loss_clip": 1.02157557, "balance_loss_mlp": 1.03980064, "epoch": 0.5562002104313843, "flos": 17566423332480.0, "grad_norm": 2.267478432090252, "language_loss": 0.69052964, "learning_rate": 1.733816187358836e-06, "loss": 0.7120316, "num_input_tokens_seen": 199196695, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7421875, "step": 9251, "time_per_iteration": 2.4462778568267822 }, { "auxiliary_loss_clip": 0.0111289, "auxiliary_loss_mlp": 0.01030026, "balance_loss_clip": 1.01730061, "balance_loss_mlp": 1.04096961, "epoch": 0.5562603336840523, "flos": 25045430590080.0, "grad_norm": 1.5654490544025774, "language_loss": 0.75661641, "learning_rate": 1.7334301954945569e-06, "loss": 0.77804554, "num_input_tokens_seen": 199217845, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 9252, "time_per_iteration": 2.503293514251709 }, { "auxiliary_loss_clip": 0.01116352, "auxiliary_loss_mlp": 0.01032517, "balance_loss_clip": 1.01934421, "balance_loss_mlp": 1.04245758, "epoch": 0.5563204569367203, "flos": 29059022436480.0, "grad_norm": 1.6150423679598176, "language_loss": 0.72749346, "learning_rate": 1.7330442137386313e-06, "loss": 0.74898207, "num_input_tokens_seen": 199239250, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 9253, "time_per_iteration": 2.541012763977051 }, { "auxiliary_loss_clip": 0.01116406, "auxiliary_loss_mlp": 0.0103303, "balance_loss_clip": 1.02062082, "balance_loss_mlp": 1.04434514, "epoch": 0.5563805801893883, "flos": 22090449398400.0, "grad_norm": 1.8802553722494815, "language_loss": 0.83172512, "learning_rate": 1.7326582421056965e-06, "loss": 0.85321951, "num_input_tokens_seen": 199258320, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 9254, "time_per_iteration": 2.467362880706787 }, { "auxiliary_loss_clip": 0.01039062, "auxiliary_loss_mlp": 0.01006302, "balance_loss_clip": 1.00488305, "balance_loss_mlp": 1.01440692, "epoch": 0.5564407034420562, "flos": 58636128689280.0, "grad_norm": 0.8746156334679054, "language_loss": 0.64812362, "learning_rate": 1.732272280610387e-06, "loss": 0.66857725, "num_input_tokens_seen": 199314840, "router_z_loss_clip": 0.01416016, "router_z_loss_mlp": 0.24609375, "step": 9255, "time_per_iteration": 4.29047703742981 }, { "auxiliary_loss_clip": 0.01114626, "auxiliary_loss_mlp": 0.01035537, "balance_loss_clip": 1.02301383, "balance_loss_mlp": 1.04453957, "epoch": 0.5565008266947242, "flos": 23112323418240.0, "grad_norm": 1.8813724807194026, "language_loss": 0.69576156, "learning_rate": 1.7318863292673399e-06, "loss": 0.71726316, "num_input_tokens_seen": 199335405, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 9256, "time_per_iteration": 2.5072548389434814 }, { "auxiliary_loss_clip": 0.01111225, "auxiliary_loss_mlp": 0.01028835, "balance_loss_clip": 1.01752257, "balance_loss_mlp": 1.04156792, "epoch": 0.5565609499473921, "flos": 21578399066880.0, "grad_norm": 1.7046985948330209, "language_loss": 0.75838798, "learning_rate": 1.73150038809119e-06, "loss": 0.77978861, "num_input_tokens_seen": 199354345, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6953125, "step": 9257, "time_per_iteration": 3.941596746444702 }, { "auxiliary_loss_clip": 0.01114027, "auxiliary_loss_mlp": 0.01036124, "balance_loss_clip": 1.02381563, "balance_loss_mlp": 1.04029822, "epoch": 0.5566210732000602, "flos": 18369637309440.0, "grad_norm": 2.66900511110641, "language_loss": 0.6076051, "learning_rate": 1.7311144570965724e-06, "loss": 0.62910664, "num_input_tokens_seen": 199372250, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.73828125, "step": 9258, "time_per_iteration": 2.4549896717071533 }, { "auxiliary_loss_clip": 0.01114663, "auxiliary_loss_mlp": 0.0103342, "balance_loss_clip": 1.02022338, "balance_loss_mlp": 1.04216456, "epoch": 0.5566811964527281, "flos": 25703350053120.0, "grad_norm": 1.837396847959513, "language_loss": 0.79240233, "learning_rate": 1.7307285362981215e-06, "loss": 0.81388319, "num_input_tokens_seen": 199392815, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 9259, "time_per_iteration": 3.919767379760742 }, { "auxiliary_loss_clip": 0.01112167, "auxiliary_loss_mlp": 0.01031525, "balance_loss_clip": 1.01847792, "balance_loss_mlp": 1.04024756, "epoch": 0.5567413197053961, "flos": 26943991856640.0, "grad_norm": 2.202288144140928, "language_loss": 0.8160553, "learning_rate": 1.7303426257104712e-06, "loss": 0.83749223, "num_input_tokens_seen": 199412375, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 9260, "time_per_iteration": 3.91021466255188 }, { "auxiliary_loss_clip": 0.01113308, "auxiliary_loss_mlp": 0.01039081, "balance_loss_clip": 1.0257709, "balance_loss_mlp": 1.04148936, "epoch": 0.556801442958064, "flos": 20850597694080.0, "grad_norm": 1.7730863852151375, "language_loss": 0.68568993, "learning_rate": 1.729956725348256e-06, "loss": 0.70721382, "num_input_tokens_seen": 199431490, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 9261, "time_per_iteration": 2.485494375228882 }, { "auxiliary_loss_clip": 0.01038202, "auxiliary_loss_mlp": 0.0100461, "balance_loss_clip": 1.00310218, "balance_loss_mlp": 1.01358318, "epoch": 0.556861566210732, "flos": 70498213044480.0, "grad_norm": 0.7373460419896273, "language_loss": 0.6111232, "learning_rate": 1.729570835226108e-06, "loss": 0.63155127, "num_input_tokens_seen": 199495855, "router_z_loss_clip": 0.01507568, "router_z_loss_mlp": 0.24609375, "step": 9262, "time_per_iteration": 3.110896348953247 }, { "auxiliary_loss_clip": 0.01115227, "auxiliary_loss_mlp": 0.01038274, "balance_loss_clip": 1.02541757, "balance_loss_mlp": 1.04182434, "epoch": 0.5569216894633999, "flos": 25337276593920.0, "grad_norm": 2.1043074889560573, "language_loss": 0.6463744, "learning_rate": 1.7291849553586622e-06, "loss": 0.66790938, "num_input_tokens_seen": 199515870, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 9263, "time_per_iteration": 2.546985626220703 }, { "auxiliary_loss_clip": 0.01113082, "auxiliary_loss_mlp": 0.01034698, "balance_loss_clip": 1.02232397, "balance_loss_mlp": 1.04114425, "epoch": 0.556981812716068, "flos": 22638733574400.0, "grad_norm": 2.367423282987471, "language_loss": 0.73377311, "learning_rate": 1.7287990857605497e-06, "loss": 0.75525087, "num_input_tokens_seen": 199535745, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 9264, "time_per_iteration": 2.5215842723846436 }, { "auxiliary_loss_clip": 0.01115343, "auxiliary_loss_mlp": 0.01026004, "balance_loss_clip": 1.01268816, "balance_loss_mlp": 1.04323375, "epoch": 0.5570419359687359, "flos": 11035852738560.0, "grad_norm": 1.9137370504697306, "language_loss": 0.76011109, "learning_rate": 1.7284132264464022e-06, "loss": 0.78152454, "num_input_tokens_seen": 199554035, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 9265, "time_per_iteration": 2.4777286052703857 }, { "auxiliary_loss_clip": 0.01109478, "auxiliary_loss_mlp": 0.01032277, "balance_loss_clip": 1.02057099, "balance_loss_mlp": 1.04123282, "epoch": 0.5571020592214039, "flos": 22823135020800.0, "grad_norm": 1.389394073449441, "language_loss": 0.70982409, "learning_rate": 1.7280273774308536e-06, "loss": 0.73124158, "num_input_tokens_seen": 199576120, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 9266, "time_per_iteration": 2.505434989929199 }, { "auxiliary_loss_clip": 0.01111872, "auxiliary_loss_mlp": 0.01032994, "balance_loss_clip": 1.02035213, "balance_loss_mlp": 1.0402292, "epoch": 0.5571621824740719, "flos": 22927778317440.0, "grad_norm": 2.097589251048726, "language_loss": 0.68151951, "learning_rate": 1.727641538728533e-06, "loss": 0.70296824, "num_input_tokens_seen": 199593780, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 9267, "time_per_iteration": 2.5230982303619385 }, { "auxiliary_loss_clip": 0.0110939, "auxiliary_loss_mlp": 0.01037859, "balance_loss_clip": 1.02608705, "balance_loss_mlp": 1.041026, "epoch": 0.5572223057267398, "flos": 22966705681920.0, "grad_norm": 1.8596051871185142, "language_loss": 0.74297678, "learning_rate": 1.7272557103540736e-06, "loss": 0.76444936, "num_input_tokens_seen": 199613220, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 9268, "time_per_iteration": 2.5302467346191406 }, { "auxiliary_loss_clip": 0.0111197, "auxiliary_loss_mlp": 0.0102689, "balance_loss_clip": 1.01501131, "balance_loss_mlp": 1.04143608, "epoch": 0.5572824289794078, "flos": 20960053413120.0, "grad_norm": 1.993490402722364, "language_loss": 0.75069201, "learning_rate": 1.726869892322104e-06, "loss": 0.77208066, "num_input_tokens_seen": 199632085, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.703125, "step": 9269, "time_per_iteration": 2.4800333976745605 }, { "auxiliary_loss_clip": 0.01112065, "auxiliary_loss_mlp": 0.0103207, "balance_loss_clip": 1.01963663, "balance_loss_mlp": 1.04024518, "epoch": 0.5573425522320757, "flos": 25042413847680.0, "grad_norm": 1.7054946762774839, "language_loss": 0.82774699, "learning_rate": 1.726484084647256e-06, "loss": 0.84918833, "num_input_tokens_seen": 199649295, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 9270, "time_per_iteration": 2.532728910446167 }, { "auxiliary_loss_clip": 0.01113342, "auxiliary_loss_mlp": 0.01031216, "balance_loss_clip": 1.01812696, "balance_loss_mlp": 1.04044271, "epoch": 0.5574026754847438, "flos": 23659637927040.0, "grad_norm": 2.216543199849054, "language_loss": 0.79517889, "learning_rate": 1.7260982873441591e-06, "loss": 0.81662446, "num_input_tokens_seen": 199668870, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 9271, "time_per_iteration": 2.480998992919922 }, { "auxiliary_loss_clip": 0.01114655, "auxiliary_loss_mlp": 0.01030051, "balance_loss_clip": 1.01721263, "balance_loss_mlp": 1.04214716, "epoch": 0.5574627987374117, "flos": 24782240661120.0, "grad_norm": 2.4523111607990926, "language_loss": 0.90093958, "learning_rate": 1.725712500427442e-06, "loss": 0.92238671, "num_input_tokens_seen": 199684870, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 9272, "time_per_iteration": 2.5226943492889404 }, { "auxiliary_loss_clip": 0.0111098, "auxiliary_loss_mlp": 0.01033355, "balance_loss_clip": 1.02070141, "balance_loss_mlp": 1.04184973, "epoch": 0.5575229219900797, "flos": 21834944979840.0, "grad_norm": 2.4665145853757022, "language_loss": 0.84158337, "learning_rate": 1.7253267239117347e-06, "loss": 0.86302674, "num_input_tokens_seen": 199701975, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 9273, "time_per_iteration": 2.465017318725586 }, { "auxiliary_loss_clip": 0.01113733, "auxiliary_loss_mlp": 0.01037183, "balance_loss_clip": 1.02321184, "balance_loss_mlp": 1.04217553, "epoch": 0.5575830452427476, "flos": 27815148408960.0, "grad_norm": 2.3866109091928562, "language_loss": 0.74126804, "learning_rate": 1.7249409578116655e-06, "loss": 0.76277721, "num_input_tokens_seen": 199721865, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71484375, "step": 9274, "time_per_iteration": 2.5395236015319824 }, { "auxiliary_loss_clip": 0.01119339, "auxiliary_loss_mlp": 0.01037452, "balance_loss_clip": 1.02306306, "balance_loss_mlp": 1.04265451, "epoch": 0.5576431684954156, "flos": 17812805696640.0, "grad_norm": 3.7269761740012557, "language_loss": 0.78813285, "learning_rate": 1.7245552021418629e-06, "loss": 0.80970073, "num_input_tokens_seen": 199736455, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.765625, "step": 9275, "time_per_iteration": 2.407517433166504 }, { "auxiliary_loss_clip": 0.01113463, "auxiliary_loss_mlp": 0.01029766, "balance_loss_clip": 1.01700509, "balance_loss_mlp": 1.04217482, "epoch": 0.5577032917480835, "flos": 15486872411520.0, "grad_norm": 1.8394560048159925, "language_loss": 0.7515232, "learning_rate": 1.7241694569169546e-06, "loss": 0.77295548, "num_input_tokens_seen": 199753125, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 9276, "time_per_iteration": 2.4610323905944824 }, { "auxiliary_loss_clip": 0.01109963, "auxiliary_loss_mlp": 0.01033984, "balance_loss_clip": 1.02130044, "balance_loss_mlp": 1.03938138, "epoch": 0.5577634150007516, "flos": 21579763783680.0, "grad_norm": 1.6631386889174335, "language_loss": 0.75468314, "learning_rate": 1.7237837221515678e-06, "loss": 0.77612257, "num_input_tokens_seen": 199771365, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 9277, "time_per_iteration": 2.4831480979919434 }, { "auxiliary_loss_clip": 0.01110235, "auxiliary_loss_mlp": 0.01034226, "balance_loss_clip": 1.02193606, "balance_loss_mlp": 1.04024518, "epoch": 0.5578235382534195, "flos": 21139750177920.0, "grad_norm": 1.8946065680644737, "language_loss": 0.71731532, "learning_rate": 1.7233979978603304e-06, "loss": 0.73875993, "num_input_tokens_seen": 199790035, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 9278, "time_per_iteration": 2.5374083518981934 }, { "auxiliary_loss_clip": 0.01115191, "auxiliary_loss_mlp": 0.01034144, "balance_loss_clip": 1.02028012, "balance_loss_mlp": 1.0411979, "epoch": 0.5578836615060875, "flos": 26505199313280.0, "grad_norm": 1.6604459506223255, "language_loss": 0.75793463, "learning_rate": 1.723012284057868e-06, "loss": 0.77942801, "num_input_tokens_seen": 199811125, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 9279, "time_per_iteration": 2.503469228744507 }, { "auxiliary_loss_clip": 0.01112146, "auxiliary_loss_mlp": 0.01032017, "balance_loss_clip": 1.01929724, "balance_loss_mlp": 1.04046082, "epoch": 0.5579437847587555, "flos": 20153786780160.0, "grad_norm": 2.0505000161399782, "language_loss": 0.67213136, "learning_rate": 1.7226265807588082e-06, "loss": 0.69357294, "num_input_tokens_seen": 199829915, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 9280, "time_per_iteration": 2.488511085510254 }, { "auxiliary_loss_clip": 0.01114854, "auxiliary_loss_mlp": 0.01036431, "balance_loss_clip": 1.0239861, "balance_loss_mlp": 1.04051161, "epoch": 0.5580039080114234, "flos": 26102281478400.0, "grad_norm": 2.090722461196662, "language_loss": 0.73040974, "learning_rate": 1.7222408879777763e-06, "loss": 0.75192261, "num_input_tokens_seen": 199850670, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7421875, "step": 9281, "time_per_iteration": 2.546299934387207 }, { "auxiliary_loss_clip": 0.01112711, "auxiliary_loss_mlp": 0.01034537, "balance_loss_clip": 1.02190709, "balance_loss_mlp": 1.0420115, "epoch": 0.5580640312640914, "flos": 13771671096960.0, "grad_norm": 5.831704602886785, "language_loss": 0.74634075, "learning_rate": 1.7218552057293974e-06, "loss": 0.76781321, "num_input_tokens_seen": 199867645, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 9282, "time_per_iteration": 2.457303524017334 }, { "auxiliary_loss_clip": 0.01111743, "auxiliary_loss_mlp": 0.01026914, "balance_loss_clip": 1.01416433, "balance_loss_mlp": 1.0415318, "epoch": 0.5581241545167593, "flos": 17675986792320.0, "grad_norm": 1.751850724252447, "language_loss": 0.66362751, "learning_rate": 1.721469534028297e-06, "loss": 0.68501413, "num_input_tokens_seen": 199886320, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 9283, "time_per_iteration": 2.4844443798065186 }, { "auxiliary_loss_clip": 0.01113736, "auxiliary_loss_mlp": 0.01028195, "balance_loss_clip": 1.01612544, "balance_loss_mlp": 1.04224968, "epoch": 0.5581842777694274, "flos": 19569161018880.0, "grad_norm": 2.3923746677467355, "language_loss": 0.8271212, "learning_rate": 1.7210838728890994e-06, "loss": 0.84854054, "num_input_tokens_seen": 199904895, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.71484375, "step": 9284, "time_per_iteration": 2.4892570972442627 }, { "auxiliary_loss_clip": 0.01113637, "auxiliary_loss_mlp": 0.01029954, "balance_loss_clip": 1.01688862, "balance_loss_mlp": 1.04150176, "epoch": 0.5582444010220953, "flos": 20595165102720.0, "grad_norm": 2.5389080522995826, "language_loss": 0.85526556, "learning_rate": 1.7206982223264304e-06, "loss": 0.87670147, "num_input_tokens_seen": 199921090, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 9285, "time_per_iteration": 2.463310480117798 }, { "auxiliary_loss_clip": 0.0111313, "auxiliary_loss_mlp": 0.01034377, "balance_loss_clip": 1.02178836, "balance_loss_mlp": 1.04111147, "epoch": 0.5583045242747633, "flos": 19135504120320.0, "grad_norm": 2.3513620887845876, "language_loss": 0.73987758, "learning_rate": 1.720312582354912e-06, "loss": 0.76135266, "num_input_tokens_seen": 199939925, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 9286, "time_per_iteration": 2.4750235080718994 }, { "auxiliary_loss_clip": 0.01113827, "auxiliary_loss_mlp": 0.01031805, "balance_loss_clip": 1.0188117, "balance_loss_mlp": 1.04172528, "epoch": 0.5583646475274312, "flos": 27454569730560.0, "grad_norm": 1.7012579838687754, "language_loss": 0.739452, "learning_rate": 1.7199269529891684e-06, "loss": 0.76090837, "num_input_tokens_seen": 199960015, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 9287, "time_per_iteration": 2.572502613067627 }, { "auxiliary_loss_clip": 0.01115728, "auxiliary_loss_mlp": 0.01031152, "balance_loss_clip": 1.01747286, "balance_loss_mlp": 1.04116559, "epoch": 0.5584247707800992, "flos": 23653784010240.0, "grad_norm": 2.175097768525649, "language_loss": 0.75106293, "learning_rate": 1.7195413342438233e-06, "loss": 0.77253169, "num_input_tokens_seen": 199980505, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.74609375, "step": 9288, "time_per_iteration": 2.531611680984497 }, { "auxiliary_loss_clip": 0.01115762, "auxiliary_loss_mlp": 0.01034161, "balance_loss_clip": 1.02030849, "balance_loss_mlp": 1.04330182, "epoch": 0.5584848940327671, "flos": 13698880185600.0, "grad_norm": 2.13509051128455, "language_loss": 0.77777851, "learning_rate": 1.7191557261334984e-06, "loss": 0.79927772, "num_input_tokens_seen": 199999020, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 9289, "time_per_iteration": 2.4595818519592285 }, { "auxiliary_loss_clip": 0.01120475, "auxiliary_loss_mlp": 0.01032321, "balance_loss_clip": 1.01865947, "balance_loss_mlp": 1.04398131, "epoch": 0.5585450172854352, "flos": 27016208150400.0, "grad_norm": 1.9245161633228025, "language_loss": 0.61420286, "learning_rate": 1.718770128672817e-06, "loss": 0.63573086, "num_input_tokens_seen": 200019020, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.765625, "step": 9290, "time_per_iteration": 2.5291640758514404 }, { "auxiliary_loss_clip": 0.0111438, "auxiliary_loss_mlp": 0.01031849, "balance_loss_clip": 1.01880169, "balance_loss_mlp": 1.04062927, "epoch": 0.5586051405381031, "flos": 23185653033600.0, "grad_norm": 2.2232985377880765, "language_loss": 0.67670929, "learning_rate": 1.7183845418764e-06, "loss": 0.69817162, "num_input_tokens_seen": 200038110, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 9291, "time_per_iteration": 2.4841156005859375 }, { "auxiliary_loss_clip": 0.01115955, "auxiliary_loss_mlp": 0.0103524, "balance_loss_clip": 1.02197182, "balance_loss_mlp": 1.04197621, "epoch": 0.5586652637907711, "flos": 20775544225920.0, "grad_norm": 2.2359727249128656, "language_loss": 0.84478199, "learning_rate": 1.7179989657588698e-06, "loss": 0.86629403, "num_input_tokens_seen": 200056210, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 9292, "time_per_iteration": 2.4834165573120117 }, { "auxiliary_loss_clip": 0.01112883, "auxiliary_loss_mlp": 0.01037444, "balance_loss_clip": 1.02508211, "balance_loss_mlp": 1.0426631, "epoch": 0.5587253870434391, "flos": 28219897837440.0, "grad_norm": 2.211532277610236, "language_loss": 0.72969091, "learning_rate": 1.7176134003348476e-06, "loss": 0.75119424, "num_input_tokens_seen": 200075620, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 9293, "time_per_iteration": 2.523271083831787 }, { "auxiliary_loss_clip": 0.01112262, "auxiliary_loss_mlp": 0.01034092, "balance_loss_clip": 1.02169991, "balance_loss_mlp": 1.04173756, "epoch": 0.558785510296107, "flos": 26615732440320.0, "grad_norm": 2.0784212179701194, "language_loss": 0.72595406, "learning_rate": 1.7172278456189523e-06, "loss": 0.74741757, "num_input_tokens_seen": 200095945, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 9294, "time_per_iteration": 2.5402626991271973 }, { "auxiliary_loss_clip": 0.01115282, "auxiliary_loss_mlp": 0.01034155, "balance_loss_clip": 1.02129197, "balance_loss_mlp": 1.04246402, "epoch": 0.558845633548775, "flos": 20156767608960.0, "grad_norm": 2.2746443769760836, "language_loss": 0.68604732, "learning_rate": 1.716842301625806e-06, "loss": 0.7075417, "num_input_tokens_seen": 200114185, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 9295, "time_per_iteration": 2.474567174911499 }, { "auxiliary_loss_clip": 0.01114709, "auxiliary_loss_mlp": 0.01032452, "balance_loss_clip": 1.0188148, "balance_loss_mlp": 1.04258311, "epoch": 0.5589057568014429, "flos": 24350774492160.0, "grad_norm": 2.324443571437637, "language_loss": 0.80500883, "learning_rate": 1.7164567683700281e-06, "loss": 0.82648039, "num_input_tokens_seen": 200135030, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 9296, "time_per_iteration": 2.510990619659424 }, { "auxiliary_loss_clip": 0.01112721, "auxiliary_loss_mlp": 0.01034055, "balance_loss_clip": 1.02041757, "balance_loss_mlp": 1.04154468, "epoch": 0.558965880054111, "flos": 21105168359040.0, "grad_norm": 1.5929514575679242, "language_loss": 0.65385568, "learning_rate": 1.7160712458662379e-06, "loss": 0.67532349, "num_input_tokens_seen": 200154290, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7109375, "step": 9297, "time_per_iteration": 3.8743350505828857 }, { "auxiliary_loss_clip": 0.01115601, "auxiliary_loss_mlp": 0.01037667, "balance_loss_clip": 1.02380311, "balance_loss_mlp": 1.04150307, "epoch": 0.5590260033067789, "flos": 18436071513600.0, "grad_norm": 2.386144730586436, "language_loss": 0.7495544, "learning_rate": 1.7156857341290544e-06, "loss": 0.77108711, "num_input_tokens_seen": 200171555, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 9298, "time_per_iteration": 2.4532532691955566 }, { "auxiliary_loss_clip": 0.01037771, "auxiliary_loss_mlp": 0.01003997, "balance_loss_clip": 1.00254905, "balance_loss_mlp": 1.01312888, "epoch": 0.5590861265594469, "flos": 70577432490240.0, "grad_norm": 0.7114719080691823, "language_loss": 0.52456284, "learning_rate": 1.7153002331730967e-06, "loss": 0.54498053, "num_input_tokens_seen": 200237010, "router_z_loss_clip": 0.01446533, "router_z_loss_mlp": 0.24609375, "step": 9299, "time_per_iteration": 4.56667685508728 }, { "auxiliary_loss_clip": 0.01111678, "auxiliary_loss_mlp": 0.01033192, "balance_loss_clip": 1.02088368, "balance_loss_mlp": 1.04201925, "epoch": 0.5591462498121148, "flos": 30664408896000.0, "grad_norm": 1.8831788804041527, "language_loss": 0.68919528, "learning_rate": 1.7149147430129824e-06, "loss": 0.71064395, "num_input_tokens_seen": 200260820, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 9300, "time_per_iteration": 2.571730613708496 }, { "auxiliary_loss_clip": 0.01114954, "auxiliary_loss_mlp": 0.01039398, "balance_loss_clip": 1.02561152, "balance_loss_mlp": 1.04123914, "epoch": 0.5592063730647828, "flos": 18150438562560.0, "grad_norm": 2.1829327708821604, "language_loss": 0.82370985, "learning_rate": 1.7145292636633293e-06, "loss": 0.84525347, "num_input_tokens_seen": 200278035, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.73828125, "step": 9301, "time_per_iteration": 5.298617839813232 }, { "auxiliary_loss_clip": 0.01112312, "auxiliary_loss_mlp": 0.01029621, "balance_loss_clip": 1.0167824, "balance_loss_mlp": 1.0404129, "epoch": 0.5592664963174507, "flos": 24060400945920.0, "grad_norm": 2.199871340240801, "language_loss": 0.67992789, "learning_rate": 1.714143795138756e-06, "loss": 0.70134723, "num_input_tokens_seen": 200297255, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 9302, "time_per_iteration": 2.498589038848877 }, { "auxiliary_loss_clip": 0.01116786, "auxiliary_loss_mlp": 0.0102746, "balance_loss_clip": 1.01408482, "balance_loss_mlp": 1.04238367, "epoch": 0.5593266195701188, "flos": 19827897661440.0, "grad_norm": 2.3385304824953463, "language_loss": 0.70651764, "learning_rate": 1.713758337453878e-06, "loss": 0.72796011, "num_input_tokens_seen": 200317505, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 9303, "time_per_iteration": 2.4778058528900146 }, { "auxiliary_loss_clip": 0.01111911, "auxiliary_loss_mlp": 0.01031431, "balance_loss_clip": 1.01967692, "balance_loss_mlp": 1.04348636, "epoch": 0.5593867428227867, "flos": 25300755440640.0, "grad_norm": 1.695532085349927, "language_loss": 0.72803319, "learning_rate": 1.7133728906233124e-06, "loss": 0.74946654, "num_input_tokens_seen": 200338350, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 9304, "time_per_iteration": 2.554558753967285 }, { "auxiliary_loss_clip": 0.01111712, "auxiliary_loss_mlp": 0.01028865, "balance_loss_clip": 1.01671195, "balance_loss_mlp": 1.04063964, "epoch": 0.5594468660754547, "flos": 12933013374720.0, "grad_norm": 2.0798968575637202, "language_loss": 0.78080159, "learning_rate": 1.7129874546616763e-06, "loss": 0.80220735, "num_input_tokens_seen": 200353965, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.7109375, "step": 9305, "time_per_iteration": 2.5053725242614746 }, { "auxiliary_loss_clip": 0.01108595, "auxiliary_loss_mlp": 0.01028933, "balance_loss_clip": 1.01723838, "balance_loss_mlp": 1.04069757, "epoch": 0.5595069893281227, "flos": 19062713208960.0, "grad_norm": 1.5842837894617905, "language_loss": 0.69969034, "learning_rate": 1.7126020295835836e-06, "loss": 0.72106558, "num_input_tokens_seen": 200373595, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 9306, "time_per_iteration": 2.4952902793884277 }, { "auxiliary_loss_clip": 0.01038283, "auxiliary_loss_mlp": 0.01007121, "balance_loss_clip": 1.00551176, "balance_loss_mlp": 1.01382649, "epoch": 0.5595671125807906, "flos": 70273375862400.0, "grad_norm": 0.9363105972455474, "language_loss": 0.60284334, "learning_rate": 1.7122166154036518e-06, "loss": 0.62329739, "num_input_tokens_seen": 200429155, "router_z_loss_clip": 0.01611328, "router_z_loss_mlp": 0.24414062, "step": 9307, "time_per_iteration": 3.215989351272583 }, { "auxiliary_loss_clip": 0.01110872, "auxiliary_loss_mlp": 0.010377, "balance_loss_clip": 1.0256654, "balance_loss_mlp": 1.04055786, "epoch": 0.5596272358334586, "flos": 20665513889280.0, "grad_norm": 1.9970531374946052, "language_loss": 0.74354446, "learning_rate": 1.7118312121364943e-06, "loss": 0.76503009, "num_input_tokens_seen": 200448290, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.703125, "step": 9308, "time_per_iteration": 2.491246223449707 }, { "auxiliary_loss_clip": 0.01113233, "auxiliary_loss_mlp": 0.01034996, "balance_loss_clip": 1.0214597, "balance_loss_mlp": 1.03988111, "epoch": 0.5596873590861265, "flos": 25041013217280.0, "grad_norm": 1.9558778460897208, "language_loss": 0.69912457, "learning_rate": 1.7114458197967257e-06, "loss": 0.72060692, "num_input_tokens_seen": 200466555, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 9309, "time_per_iteration": 2.5415008068084717 }, { "auxiliary_loss_clip": 0.01116187, "auxiliary_loss_mlp": 0.01031202, "balance_loss_clip": 1.01658726, "balance_loss_mlp": 1.04316163, "epoch": 0.5597474823387946, "flos": 25958387594880.0, "grad_norm": 3.416571025336888, "language_loss": 0.75071013, "learning_rate": 1.7110604383989613e-06, "loss": 0.77218401, "num_input_tokens_seen": 200485980, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7265625, "step": 9310, "time_per_iteration": 2.520329236984253 }, { "auxiliary_loss_clip": 0.011164, "auxiliary_loss_mlp": 0.01032772, "balance_loss_clip": 1.0188365, "balance_loss_mlp": 1.04308224, "epoch": 0.5598076055914625, "flos": 26177442687360.0, "grad_norm": 2.689447423297649, "language_loss": 0.69564301, "learning_rate": 1.7106750679578133e-06, "loss": 0.71713465, "num_input_tokens_seen": 200504555, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 9311, "time_per_iteration": 2.539008378982544 }, { "auxiliary_loss_clip": 0.01110642, "auxiliary_loss_mlp": 0.01029176, "balance_loss_clip": 1.01635468, "balance_loss_mlp": 1.03969836, "epoch": 0.5598677288441305, "flos": 11655778590720.0, "grad_norm": 2.45331110830501, "language_loss": 0.72111732, "learning_rate": 1.7102897084878962e-06, "loss": 0.74251544, "num_input_tokens_seen": 200522700, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 9312, "time_per_iteration": 2.44097638130188 }, { "auxiliary_loss_clip": 0.01113412, "auxiliary_loss_mlp": 0.01033867, "balance_loss_clip": 1.0206883, "balance_loss_mlp": 1.04269767, "epoch": 0.5599278520967984, "flos": 22966597941120.0, "grad_norm": 1.87817797533219, "language_loss": 0.88995564, "learning_rate": 1.709904360003822e-06, "loss": 0.91142845, "num_input_tokens_seen": 200541910, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 9313, "time_per_iteration": 2.504441261291504 }, { "auxiliary_loss_clip": 0.01114994, "auxiliary_loss_mlp": 0.01040013, "balance_loss_clip": 1.02688217, "balance_loss_mlp": 1.04447222, "epoch": 0.5599879753494664, "flos": 21215557831680.0, "grad_norm": 1.7456027682791684, "language_loss": 0.77838576, "learning_rate": 1.709519022520204e-06, "loss": 0.79993582, "num_input_tokens_seen": 200562600, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 9314, "time_per_iteration": 2.498920440673828 }, { "auxiliary_loss_clip": 0.01111128, "auxiliary_loss_mlp": 0.01029481, "balance_loss_clip": 1.01674342, "balance_loss_mlp": 1.04075491, "epoch": 0.5600480986021343, "flos": 31903219105920.0, "grad_norm": 1.700980289668449, "language_loss": 0.70243436, "learning_rate": 1.7091336960516537e-06, "loss": 0.72384048, "num_input_tokens_seen": 200584795, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 9315, "time_per_iteration": 2.5666885375976562 }, { "auxiliary_loss_clip": 0.01113023, "auxiliary_loss_mlp": 0.01035885, "balance_loss_clip": 1.02264094, "balance_loss_mlp": 1.03960979, "epoch": 0.5601082218548024, "flos": 28476048700800.0, "grad_norm": 3.5960214349939745, "language_loss": 0.66816056, "learning_rate": 1.7087483806127824e-06, "loss": 0.6896497, "num_input_tokens_seen": 200606945, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 9316, "time_per_iteration": 2.518397569656372 }, { "auxiliary_loss_clip": 0.01111631, "auxiliary_loss_mlp": 0.0103101, "balance_loss_clip": 1.0175755, "balance_loss_mlp": 1.04127169, "epoch": 0.5601683451074703, "flos": 24097173494400.0, "grad_norm": 2.22098032600635, "language_loss": 0.86557317, "learning_rate": 1.7083630762182022e-06, "loss": 0.88699961, "num_input_tokens_seen": 200626340, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 9317, "time_per_iteration": 2.522064447402954 }, { "auxiliary_loss_clip": 0.01114951, "auxiliary_loss_mlp": 0.01033846, "balance_loss_clip": 1.01924264, "balance_loss_mlp": 1.04074752, "epoch": 0.5602284683601383, "flos": 26356205698560.0, "grad_norm": 1.6703366409552192, "language_loss": 0.77166843, "learning_rate": 1.7079777828825233e-06, "loss": 0.79315639, "num_input_tokens_seen": 200644520, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7421875, "step": 9318, "time_per_iteration": 2.5217576026916504 }, { "auxiliary_loss_clip": 0.01109925, "auxiliary_loss_mlp": 0.01038771, "balance_loss_clip": 1.02618289, "balance_loss_mlp": 1.03876054, "epoch": 0.5602885916128063, "flos": 24496392228480.0, "grad_norm": 1.7998444664184954, "language_loss": 0.75779569, "learning_rate": 1.7075925006203558e-06, "loss": 0.77928263, "num_input_tokens_seen": 200664845, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 9319, "time_per_iteration": 2.5268754959106445 }, { "auxiliary_loss_clip": 0.01109065, "auxiliary_loss_mlp": 0.0103156, "balance_loss_clip": 1.01913834, "balance_loss_mlp": 1.03992999, "epoch": 0.5603487148654742, "flos": 27345006270720.0, "grad_norm": 1.468301715829458, "language_loss": 0.85646844, "learning_rate": 1.7072072294463101e-06, "loss": 0.87787473, "num_input_tokens_seen": 200686535, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 9320, "time_per_iteration": 2.5373339653015137 }, { "auxiliary_loss_clip": 0.01035895, "auxiliary_loss_mlp": 0.01002835, "balance_loss_clip": 1.00129688, "balance_loss_mlp": 1.01151466, "epoch": 0.5604088381181422, "flos": 54087756180480.0, "grad_norm": 0.7722624194865109, "language_loss": 0.52592731, "learning_rate": 1.706821969374996e-06, "loss": 0.5463146, "num_input_tokens_seen": 200736965, "router_z_loss_clip": 0.01531982, "router_z_loss_mlp": 0.24414062, "step": 9321, "time_per_iteration": 2.9150030612945557 }, { "auxiliary_loss_clip": 0.01111501, "auxiliary_loss_mlp": 0.01031537, "balance_loss_clip": 1.0190202, "balance_loss_mlp": 1.04290366, "epoch": 0.5604689613708101, "flos": 22236390357120.0, "grad_norm": 1.3427152699311087, "language_loss": 0.74304169, "learning_rate": 1.7064367204210216e-06, "loss": 0.76447213, "num_input_tokens_seen": 200757420, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 9322, "time_per_iteration": 2.4863529205322266 }, { "auxiliary_loss_clip": 0.0111114, "auxiliary_loss_mlp": 0.01032817, "balance_loss_clip": 1.01905417, "balance_loss_mlp": 1.04008746, "epoch": 0.5605290846234782, "flos": 35297782940160.0, "grad_norm": 2.204033153290923, "language_loss": 0.73717827, "learning_rate": 1.7060514825989963e-06, "loss": 0.75861788, "num_input_tokens_seen": 200779520, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7109375, "step": 9323, "time_per_iteration": 2.6106228828430176 }, { "auxiliary_loss_clip": 0.01114783, "auxiliary_loss_mlp": 0.01031048, "balance_loss_clip": 1.01754785, "balance_loss_mlp": 1.04219687, "epoch": 0.5605892078761461, "flos": 20263314326400.0, "grad_norm": 1.600280609258935, "language_loss": 0.61747712, "learning_rate": 1.7056662559235286e-06, "loss": 0.63893539, "num_input_tokens_seen": 200799485, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 9324, "time_per_iteration": 2.4843950271606445 }, { "auxiliary_loss_clip": 0.01110854, "auxiliary_loss_mlp": 0.01033088, "balance_loss_clip": 1.01982594, "balance_loss_mlp": 1.03977859, "epoch": 0.5606493311288141, "flos": 17308333134720.0, "grad_norm": 1.8150252048307716, "language_loss": 0.87080574, "learning_rate": 1.705281040409226e-06, "loss": 0.89224517, "num_input_tokens_seen": 200817540, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 9325, "time_per_iteration": 2.4656941890716553 }, { "auxiliary_loss_clip": 0.01112053, "auxiliary_loss_mlp": 0.01033336, "balance_loss_clip": 1.01962674, "balance_loss_mlp": 1.04019427, "epoch": 0.560709454381482, "flos": 21652985658240.0, "grad_norm": 1.806946056528597, "language_loss": 0.73891109, "learning_rate": 1.7048958360706952e-06, "loss": 0.76036501, "num_input_tokens_seen": 200838380, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 9326, "time_per_iteration": 2.488724946975708 }, { "auxiliary_loss_clip": 0.01115745, "auxiliary_loss_mlp": 0.01028743, "balance_loss_clip": 1.0150156, "balance_loss_mlp": 1.04154158, "epoch": 0.56076957763415, "flos": 20303355012480.0, "grad_norm": 3.5406035951469232, "language_loss": 0.78287232, "learning_rate": 1.7045106429225447e-06, "loss": 0.80431724, "num_input_tokens_seen": 200855640, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 9327, "time_per_iteration": 2.47180438041687 }, { "auxiliary_loss_clip": 0.01113146, "auxiliary_loss_mlp": 0.010311, "balance_loss_clip": 1.01733732, "balance_loss_mlp": 1.04291534, "epoch": 0.5608297008868179, "flos": 25045897466880.0, "grad_norm": 2.0163078843478317, "language_loss": 0.78555298, "learning_rate": 1.7041254609793795e-06, "loss": 0.80699539, "num_input_tokens_seen": 200876585, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.703125, "step": 9328, "time_per_iteration": 2.5112879276275635 }, { "auxiliary_loss_clip": 0.01110624, "auxiliary_loss_mlp": 0.01028813, "balance_loss_clip": 1.01598001, "balance_loss_mlp": 1.04019117, "epoch": 0.560889824139486, "flos": 19866825025920.0, "grad_norm": 1.806159371793623, "language_loss": 0.73551458, "learning_rate": 1.7037402902558066e-06, "loss": 0.75690901, "num_input_tokens_seen": 200898175, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 9329, "time_per_iteration": 2.528578519821167 }, { "auxiliary_loss_clip": 0.01113733, "auxiliary_loss_mlp": 0.01032224, "balance_loss_clip": 1.01827049, "balance_loss_mlp": 1.04021156, "epoch": 0.5609499473921539, "flos": 22929394429440.0, "grad_norm": 1.5953559945778835, "language_loss": 0.83740038, "learning_rate": 1.7033551307664324e-06, "loss": 0.85886002, "num_input_tokens_seen": 200917515, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.73828125, "step": 9330, "time_per_iteration": 2.477168321609497 }, { "auxiliary_loss_clip": 0.01037133, "auxiliary_loss_mlp": 0.01000848, "balance_loss_clip": 0.99927408, "balance_loss_mlp": 1.01268196, "epoch": 0.5610100706448219, "flos": 53035825455360.0, "grad_norm": 0.7245710671453369, "language_loss": 0.57857108, "learning_rate": 1.7029699825258603e-06, "loss": 0.59895086, "num_input_tokens_seen": 200978615, "router_z_loss_clip": 0.01574707, "router_z_loss_mlp": 0.24414062, "step": 9331, "time_per_iteration": 3.1180403232574463 }, { "auxiliary_loss_clip": 0.01111286, "auxiliary_loss_mlp": 0.01031444, "balance_loss_clip": 1.01856971, "balance_loss_mlp": 1.04010463, "epoch": 0.5610701938974898, "flos": 21834944979840.0, "grad_norm": 3.061457289765988, "language_loss": 0.81764519, "learning_rate": 1.7025848455486971e-06, "loss": 0.83907247, "num_input_tokens_seen": 200997745, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 9332, "time_per_iteration": 2.479275941848755 }, { "auxiliary_loss_clip": 0.01115566, "auxiliary_loss_mlp": 0.01032095, "balance_loss_clip": 1.01818371, "balance_loss_mlp": 1.04151559, "epoch": 0.5611303171501578, "flos": 17457183095040.0, "grad_norm": 2.000671800830906, "language_loss": 0.81898481, "learning_rate": 1.7021997198495454e-06, "loss": 0.84046137, "num_input_tokens_seen": 201016370, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 9333, "time_per_iteration": 2.4582881927490234 }, { "auxiliary_loss_clip": 0.01111631, "auxiliary_loss_mlp": 0.01028782, "balance_loss_clip": 1.0165273, "balance_loss_mlp": 1.04067516, "epoch": 0.5611904404028258, "flos": 22637799820800.0, "grad_norm": 2.254027217654311, "language_loss": 0.73350358, "learning_rate": 1.7018146054430108e-06, "loss": 0.75490773, "num_input_tokens_seen": 201034310, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.70703125, "step": 9334, "time_per_iteration": 2.4677934646606445 }, { "auxiliary_loss_clip": 0.0111433, "auxiliary_loss_mlp": 0.01035542, "balance_loss_clip": 1.0220356, "balance_loss_mlp": 1.04384637, "epoch": 0.5612505636554938, "flos": 14316327999360.0, "grad_norm": 3.0941710050335187, "language_loss": 0.71129888, "learning_rate": 1.7014295023436961e-06, "loss": 0.73279762, "num_input_tokens_seen": 201052030, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 9335, "time_per_iteration": 2.4398508071899414 }, { "auxiliary_loss_clip": 0.01112427, "auxiliary_loss_mlp": 0.01029665, "balance_loss_clip": 1.01689219, "balance_loss_mlp": 1.04090691, "epoch": 0.5613106869081618, "flos": 16508279554560.0, "grad_norm": 2.6763573027715806, "language_loss": 0.76877153, "learning_rate": 1.701044410566205e-06, "loss": 0.79019237, "num_input_tokens_seen": 201068445, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 9336, "time_per_iteration": 2.4356677532196045 }, { "auxiliary_loss_clip": 0.01110549, "auxiliary_loss_mlp": 0.01033843, "balance_loss_clip": 1.02182031, "balance_loss_mlp": 1.0411067, "epoch": 0.5613708101608297, "flos": 24058569352320.0, "grad_norm": 2.3516994242541536, "language_loss": 0.64514405, "learning_rate": 1.7006593301251393e-06, "loss": 0.66658795, "num_input_tokens_seen": 201082140, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 9337, "time_per_iteration": 2.504171848297119 }, { "auxiliary_loss_clip": 0.01036198, "auxiliary_loss_mlp": 0.01001293, "balance_loss_clip": 0.99969524, "balance_loss_mlp": 1.01176238, "epoch": 0.5614309334134977, "flos": 64905735997440.0, "grad_norm": 0.894482080430888, "language_loss": 0.62666118, "learning_rate": 1.700274261035102e-06, "loss": 0.64703608, "num_input_tokens_seen": 201137245, "router_z_loss_clip": 0.01599121, "router_z_loss_mlp": 0.24414062, "step": 9338, "time_per_iteration": 4.554414987564087 }, { "auxiliary_loss_clip": 0.01114006, "auxiliary_loss_mlp": 0.01033219, "balance_loss_clip": 1.02084506, "balance_loss_mlp": 1.04128993, "epoch": 0.5614910566661656, "flos": 32919849740160.0, "grad_norm": 3.1839143925468703, "language_loss": 0.65485847, "learning_rate": 1.6998892033106946e-06, "loss": 0.67633069, "num_input_tokens_seen": 201157270, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7265625, "step": 9339, "time_per_iteration": 2.5620715618133545 }, { "auxiliary_loss_clip": 0.01109792, "auxiliary_loss_mlp": 0.01032544, "balance_loss_clip": 1.01998544, "balance_loss_mlp": 1.04045975, "epoch": 0.5615511799188336, "flos": 18588871969920.0, "grad_norm": 4.061750882241537, "language_loss": 0.69668448, "learning_rate": 1.6995041569665184e-06, "loss": 0.71810782, "num_input_tokens_seen": 201174530, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 9340, "time_per_iteration": 2.488454580307007 }, { "auxiliary_loss_clip": 0.01110081, "auxiliary_loss_mlp": 0.01028855, "balance_loss_clip": 1.01690471, "balance_loss_mlp": 1.04257464, "epoch": 0.5616113031715015, "flos": 22820010537600.0, "grad_norm": 1.7271438703111057, "language_loss": 0.7730965, "learning_rate": 1.6991191220171756e-06, "loss": 0.79448581, "num_input_tokens_seen": 201194905, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 9341, "time_per_iteration": 4.028385877609253 }, { "auxiliary_loss_clip": 0.01111159, "auxiliary_loss_mlp": 0.01034135, "balance_loss_clip": 1.02107596, "balance_loss_mlp": 1.03937912, "epoch": 0.5616714264241696, "flos": 22345702421760.0, "grad_norm": 6.639812939028325, "language_loss": 0.79962963, "learning_rate": 1.6987340984772653e-06, "loss": 0.82108253, "num_input_tokens_seen": 201213715, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 9342, "time_per_iteration": 4.017202138900757 }, { "auxiliary_loss_clip": 0.01114877, "auxiliary_loss_mlp": 0.0103439, "balance_loss_clip": 1.02107406, "balance_loss_mlp": 1.04120553, "epoch": 0.5617315496768375, "flos": 18807783408000.0, "grad_norm": 2.0775067443208073, "language_loss": 0.76102889, "learning_rate": 1.6983490863613882e-06, "loss": 0.78252155, "num_input_tokens_seen": 201231415, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 9343, "time_per_iteration": 3.8614416122436523 }, { "auxiliary_loss_clip": 0.01112884, "auxiliary_loss_mlp": 0.0103878, "balance_loss_clip": 1.02560747, "balance_loss_mlp": 1.04323256, "epoch": 0.5617916729295055, "flos": 18369314087040.0, "grad_norm": 2.300448587002113, "language_loss": 0.6914897, "learning_rate": 1.6979640856841442e-06, "loss": 0.71300638, "num_input_tokens_seen": 201249625, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6953125, "step": 9344, "time_per_iteration": 2.477527141571045 }, { "auxiliary_loss_clip": 0.01110726, "auxiliary_loss_mlp": 0.01036878, "balance_loss_clip": 1.02365756, "balance_loss_mlp": 1.03964615, "epoch": 0.5618517961821734, "flos": 28179964892160.0, "grad_norm": 1.9789434096622018, "language_loss": 0.66159427, "learning_rate": 1.6975790964601318e-06, "loss": 0.6830703, "num_input_tokens_seen": 201271205, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 9345, "time_per_iteration": 2.524270534515381 }, { "auxiliary_loss_clip": 0.01111518, "auxiliary_loss_mlp": 0.01030564, "balance_loss_clip": 1.01845241, "balance_loss_mlp": 1.04118943, "epoch": 0.5619119194348414, "flos": 15486872411520.0, "grad_norm": 2.2035826966036876, "language_loss": 0.87126791, "learning_rate": 1.6971941187039512e-06, "loss": 0.89268875, "num_input_tokens_seen": 201287700, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 9346, "time_per_iteration": 2.4499659538269043 }, { "auxiliary_loss_clip": 0.01111616, "auxiliary_loss_mlp": 0.0103573, "balance_loss_clip": 1.02258778, "balance_loss_mlp": 1.0410341, "epoch": 0.5619720426875094, "flos": 29128652951040.0, "grad_norm": 2.4251131975106035, "language_loss": 0.59380114, "learning_rate": 1.6968091524301993e-06, "loss": 0.61527455, "num_input_tokens_seen": 201307530, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 9347, "time_per_iteration": 2.5156641006469727 }, { "auxiliary_loss_clip": 0.01113766, "auxiliary_loss_mlp": 0.01038462, "balance_loss_clip": 1.02524185, "balance_loss_mlp": 1.04229283, "epoch": 0.5620321659401774, "flos": 18003743418240.0, "grad_norm": 2.316974055796248, "language_loss": 0.68986237, "learning_rate": 1.6964241976534745e-06, "loss": 0.71138471, "num_input_tokens_seen": 201326210, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 9348, "time_per_iteration": 2.4588770866394043 }, { "auxiliary_loss_clip": 0.0111493, "auxiliary_loss_mlp": 0.01037148, "balance_loss_clip": 1.02347517, "balance_loss_mlp": 1.03977644, "epoch": 0.5620922891928454, "flos": 20594518657920.0, "grad_norm": 1.7923980982562566, "language_loss": 0.78783238, "learning_rate": 1.6960392543883754e-06, "loss": 0.80935323, "num_input_tokens_seen": 201346120, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 9349, "time_per_iteration": 2.463604211807251 }, { "auxiliary_loss_clip": 0.0111222, "auxiliary_loss_mlp": 0.01034574, "balance_loss_clip": 1.02121043, "balance_loss_mlp": 1.04095221, "epoch": 0.5621524124455133, "flos": 26287006147200.0, "grad_norm": 3.7182538434497796, "language_loss": 0.67434824, "learning_rate": 1.6956543226494975e-06, "loss": 0.69581622, "num_input_tokens_seen": 201365700, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 9350, "time_per_iteration": 2.5310251712799072 }, { "auxiliary_loss_clip": 0.01114099, "auxiliary_loss_mlp": 0.01041413, "balance_loss_clip": 1.02760875, "balance_loss_mlp": 1.04170144, "epoch": 0.5622125356981813, "flos": 12750299867520.0, "grad_norm": 1.995485265411952, "language_loss": 0.7848683, "learning_rate": 1.6952694024514381e-06, "loss": 0.80642343, "num_input_tokens_seen": 201382795, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.72265625, "step": 9351, "time_per_iteration": 2.4430031776428223 }, { "auxiliary_loss_clip": 0.01114872, "auxiliary_loss_mlp": 0.01036788, "balance_loss_clip": 1.02412248, "balance_loss_mlp": 1.0415988, "epoch": 0.5622726589508492, "flos": 23805327490560.0, "grad_norm": 1.8067480922798795, "language_loss": 0.59108484, "learning_rate": 1.6948844938087945e-06, "loss": 0.6126014, "num_input_tokens_seen": 201402780, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.734375, "step": 9352, "time_per_iteration": 2.5064120292663574 }, { "auxiliary_loss_clip": 0.01108175, "auxiliary_loss_mlp": 0.01034611, "balance_loss_clip": 1.02283263, "balance_loss_mlp": 1.04115927, "epoch": 0.5623327822035172, "flos": 24718212668160.0, "grad_norm": 1.3801173929246207, "language_loss": 0.71905959, "learning_rate": 1.6944995967361604e-06, "loss": 0.74048752, "num_input_tokens_seen": 201424140, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.671875, "step": 9353, "time_per_iteration": 2.51430344581604 }, { "auxiliary_loss_clip": 0.01113751, "auxiliary_loss_mlp": 0.01033314, "balance_loss_clip": 1.02066016, "balance_loss_mlp": 1.042207, "epoch": 0.5623929054561851, "flos": 14019274523520.0, "grad_norm": 3.475405276335413, "language_loss": 0.76829392, "learning_rate": 1.6941147112481327e-06, "loss": 0.78976452, "num_input_tokens_seen": 201439645, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 9354, "time_per_iteration": 2.457386016845703 }, { "auxiliary_loss_clip": 0.01114469, "auxiliary_loss_mlp": 0.01033084, "balance_loss_clip": 1.02003074, "balance_loss_mlp": 1.04082537, "epoch": 0.5624530287088532, "flos": 20704405340160.0, "grad_norm": 3.7782084696750236, "language_loss": 0.73170805, "learning_rate": 1.6937298373593056e-06, "loss": 0.75318354, "num_input_tokens_seen": 201459970, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 9355, "time_per_iteration": 2.499201774597168 }, { "auxiliary_loss_clip": 0.01111226, "auxiliary_loss_mlp": 0.01029401, "balance_loss_clip": 1.01675296, "balance_loss_mlp": 1.04024863, "epoch": 0.5625131519615211, "flos": 21470918595840.0, "grad_norm": 1.6507102602155703, "language_loss": 0.73615491, "learning_rate": 1.693344975084274e-06, "loss": 0.75756121, "num_input_tokens_seen": 201480055, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 9356, "time_per_iteration": 2.5217299461364746 }, { "auxiliary_loss_clip": 0.01112351, "auxiliary_loss_mlp": 0.01036466, "balance_loss_clip": 1.02361512, "balance_loss_mlp": 1.04253209, "epoch": 0.5625732752141891, "flos": 18698004466560.0, "grad_norm": 1.9259465557891282, "language_loss": 0.82926953, "learning_rate": 1.6929601244376318e-06, "loss": 0.85075778, "num_input_tokens_seen": 201497645, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 9357, "time_per_iteration": 2.477174758911133 }, { "auxiliary_loss_clip": 0.01111933, "auxiliary_loss_mlp": 0.01031838, "balance_loss_clip": 1.01978636, "balance_loss_mlp": 1.04158866, "epoch": 0.562633398466857, "flos": 16216900427520.0, "grad_norm": 2.2875984846230777, "language_loss": 0.72198862, "learning_rate": 1.6925752854339722e-06, "loss": 0.74342632, "num_input_tokens_seen": 201515455, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.703125, "step": 9358, "time_per_iteration": 2.4875569343566895 }, { "auxiliary_loss_clip": 0.01111385, "auxiliary_loss_mlp": 0.01041027, "balance_loss_clip": 1.02828336, "balance_loss_mlp": 1.04162061, "epoch": 0.562693521719525, "flos": 22491930689280.0, "grad_norm": 1.6509224175944501, "language_loss": 0.77513075, "learning_rate": 1.6921904580878885e-06, "loss": 0.79665482, "num_input_tokens_seen": 201534500, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 9359, "time_per_iteration": 2.4815189838409424 }, { "auxiliary_loss_clip": 0.01111083, "auxiliary_loss_mlp": 0.01029225, "balance_loss_clip": 1.01744747, "balance_loss_mlp": 1.04032505, "epoch": 0.562753644972193, "flos": 25331171281920.0, "grad_norm": 1.8998088051095934, "language_loss": 0.70238543, "learning_rate": 1.6918056424139736e-06, "loss": 0.7237885, "num_input_tokens_seen": 201553280, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.7109375, "step": 9360, "time_per_iteration": 2.523118495941162 }, { "auxiliary_loss_clip": 0.01039923, "auxiliary_loss_mlp": 0.00999552, "balance_loss_clip": 0.99785912, "balance_loss_mlp": 1.01539814, "epoch": 0.562813768224861, "flos": 67392622126080.0, "grad_norm": 0.7783570453007466, "language_loss": 0.55586541, "learning_rate": 1.6914208384268197e-06, "loss": 0.57626015, "num_input_tokens_seen": 201610030, "router_z_loss_clip": 0.01696777, "router_z_loss_mlp": 0.24511719, "step": 9361, "time_per_iteration": 3.0423877239227295 }, { "auxiliary_loss_clip": 0.01110717, "auxiliary_loss_mlp": 0.01034251, "balance_loss_clip": 1.02223444, "balance_loss_mlp": 1.04192472, "epoch": 0.562873891477529, "flos": 23331163029120.0, "grad_norm": 1.5100959163677259, "language_loss": 0.81988823, "learning_rate": 1.691036046141018e-06, "loss": 0.84133786, "num_input_tokens_seen": 201628370, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 9362, "time_per_iteration": 2.506805181503296 }, { "auxiliary_loss_clip": 0.01110173, "auxiliary_loss_mlp": 0.01032079, "balance_loss_clip": 1.01932931, "balance_loss_mlp": 1.0406177, "epoch": 0.5629340147301969, "flos": 38472824805120.0, "grad_norm": 1.8164609302607364, "language_loss": 0.74497008, "learning_rate": 1.6906512655711614e-06, "loss": 0.76639265, "num_input_tokens_seen": 201649790, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 9363, "time_per_iteration": 2.636268138885498 }, { "auxiliary_loss_clip": 0.01113572, "auxiliary_loss_mlp": 0.01032015, "balance_loss_clip": 1.01896203, "balance_loss_mlp": 1.04109478, "epoch": 0.5629941379828649, "flos": 29242023252480.0, "grad_norm": 1.631283822998653, "language_loss": 0.82881969, "learning_rate": 1.690266496731839e-06, "loss": 0.85027558, "num_input_tokens_seen": 201669175, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 9364, "time_per_iteration": 2.546603202819824 }, { "auxiliary_loss_clip": 0.01111793, "auxiliary_loss_mlp": 0.01031766, "balance_loss_clip": 1.01975024, "balance_loss_mlp": 1.04292989, "epoch": 0.5630542612355328, "flos": 19420885676160.0, "grad_norm": 3.3481994541972213, "language_loss": 0.65073806, "learning_rate": 1.689881739637642e-06, "loss": 0.67217362, "num_input_tokens_seen": 201687000, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 9365, "time_per_iteration": 2.453885316848755 }, { "auxiliary_loss_clip": 0.01116466, "auxiliary_loss_mlp": 0.01031701, "balance_loss_clip": 1.01858211, "balance_loss_mlp": 1.04114079, "epoch": 0.5631143844882008, "flos": 22266303408000.0, "grad_norm": 3.2763160719551263, "language_loss": 0.82095253, "learning_rate": 1.6894969943031611e-06, "loss": 0.84243423, "num_input_tokens_seen": 201703335, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75390625, "step": 9366, "time_per_iteration": 2.490981340408325 }, { "auxiliary_loss_clip": 0.01111201, "auxiliary_loss_mlp": 0.01028725, "balance_loss_clip": 1.01731658, "balance_loss_mlp": 1.04299581, "epoch": 0.5631745077408687, "flos": 22965305051520.0, "grad_norm": 1.7749452006420374, "language_loss": 0.7339164, "learning_rate": 1.6891122607429845e-06, "loss": 0.75531566, "num_input_tokens_seen": 201723495, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 9367, "time_per_iteration": 2.488301992416382 }, { "auxiliary_loss_clip": 0.01039911, "auxiliary_loss_mlp": 0.01001214, "balance_loss_clip": 0.99965268, "balance_loss_mlp": 1.01524675, "epoch": 0.5632346309935368, "flos": 65080515576960.0, "grad_norm": 0.6460614908709392, "language_loss": 0.53495109, "learning_rate": 1.6887275389717028e-06, "loss": 0.55536234, "num_input_tokens_seen": 201792615, "router_z_loss_clip": 0.015625, "router_z_loss_mlp": 0.24609375, "step": 9368, "time_per_iteration": 3.2451729774475098 }, { "auxiliary_loss_clip": 0.0111254, "auxiliary_loss_mlp": 0.01034176, "balance_loss_clip": 1.02149796, "balance_loss_mlp": 1.04275954, "epoch": 0.5632947542462047, "flos": 23002903612800.0, "grad_norm": 1.9667172709561995, "language_loss": 0.69065535, "learning_rate": 1.6883428290039046e-06, "loss": 0.71212244, "num_input_tokens_seen": 201812520, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 9369, "time_per_iteration": 2.497745990753174 }, { "auxiliary_loss_clip": 0.01109103, "auxiliary_loss_mlp": 0.01036323, "balance_loss_clip": 1.0236634, "balance_loss_mlp": 1.03838599, "epoch": 0.5633548774988727, "flos": 30482593228800.0, "grad_norm": 1.8702112202486805, "language_loss": 0.75813782, "learning_rate": 1.6879581308541763e-06, "loss": 0.77959204, "num_input_tokens_seen": 201834185, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 9370, "time_per_iteration": 2.543163299560547 }, { "auxiliary_loss_clip": 0.01113395, "auxiliary_loss_mlp": 0.0103361, "balance_loss_clip": 1.01951313, "balance_loss_mlp": 1.04119742, "epoch": 0.5634150007515406, "flos": 18515039564160.0, "grad_norm": 2.7986190731974383, "language_loss": 0.75503993, "learning_rate": 1.687573444537108e-06, "loss": 0.77651, "num_input_tokens_seen": 201851305, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.72265625, "step": 9371, "time_per_iteration": 2.440676212310791 }, { "auxiliary_loss_clip": 0.01109641, "auxiliary_loss_mlp": 0.01032472, "balance_loss_clip": 1.02008581, "balance_loss_mlp": 1.0400064, "epoch": 0.5634751240042086, "flos": 19244672530560.0, "grad_norm": 2.6603374933853994, "language_loss": 0.76154864, "learning_rate": 1.687188770067285e-06, "loss": 0.78296971, "num_input_tokens_seen": 201870350, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 9372, "time_per_iteration": 2.46854567527771 }, { "auxiliary_loss_clip": 0.01110236, "auxiliary_loss_mlp": 0.01029362, "balance_loss_clip": 1.01655912, "balance_loss_mlp": 1.04110706, "epoch": 0.5635352472568766, "flos": 12020630987520.0, "grad_norm": 2.888688295790797, "language_loss": 0.71603125, "learning_rate": 1.6868041074592956e-06, "loss": 0.73742723, "num_input_tokens_seen": 201886800, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 9373, "time_per_iteration": 2.4338390827178955 }, { "auxiliary_loss_clip": 0.01114932, "auxiliary_loss_mlp": 0.01032992, "balance_loss_clip": 1.01906884, "balance_loss_mlp": 1.04340553, "epoch": 0.5635953705095446, "flos": 21871645701120.0, "grad_norm": 2.2460474152249192, "language_loss": 0.82574528, "learning_rate": 1.6864194567277264e-06, "loss": 0.84722447, "num_input_tokens_seen": 201904730, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71484375, "step": 9374, "time_per_iteration": 2.4939842224121094 }, { "auxiliary_loss_clip": 0.01108863, "auxiliary_loss_mlp": 0.01025658, "balance_loss_clip": 1.01287317, "balance_loss_mlp": 1.03962994, "epoch": 0.5636554937622126, "flos": 27126166659840.0, "grad_norm": 2.656210306423321, "language_loss": 0.66162616, "learning_rate": 1.6860348178871618e-06, "loss": 0.68297136, "num_input_tokens_seen": 201924850, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 9375, "time_per_iteration": 2.568502426147461 }, { "auxiliary_loss_clip": 0.0111252, "auxiliary_loss_mlp": 0.01036568, "balance_loss_clip": 1.02398586, "balance_loss_mlp": 1.04028726, "epoch": 0.5637156170148805, "flos": 12926405272320.0, "grad_norm": 2.326661387127641, "language_loss": 0.81228089, "learning_rate": 1.6856501909521889e-06, "loss": 0.83377177, "num_input_tokens_seen": 201939500, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 9376, "time_per_iteration": 2.452390670776367 }, { "auxiliary_loss_clip": 0.01112964, "auxiliary_loss_mlp": 0.01031742, "balance_loss_clip": 1.01854599, "balance_loss_mlp": 1.03903282, "epoch": 0.5637757402675485, "flos": 45551033130240.0, "grad_norm": 1.5691460350385666, "language_loss": 0.68986118, "learning_rate": 1.6852655759373925e-06, "loss": 0.71130824, "num_input_tokens_seen": 201963000, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73828125, "step": 9377, "time_per_iteration": 2.6808927059173584 }, { "auxiliary_loss_clip": 0.01108559, "auxiliary_loss_mlp": 0.01031985, "balance_loss_clip": 1.01942027, "balance_loss_mlp": 1.04168475, "epoch": 0.5638358635202164, "flos": 20886041439360.0, "grad_norm": 1.4034721289170202, "language_loss": 0.74490267, "learning_rate": 1.6848809728573565e-06, "loss": 0.76630813, "num_input_tokens_seen": 201983145, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.671875, "step": 9378, "time_per_iteration": 2.494091749191284 }, { "auxiliary_loss_clip": 0.01114505, "auxiliary_loss_mlp": 0.01031606, "balance_loss_clip": 1.017694, "balance_loss_mlp": 1.03839982, "epoch": 0.5638959867728844, "flos": 18806562345600.0, "grad_norm": 2.634032994187629, "language_loss": 0.8199138, "learning_rate": 1.6844963817266656e-06, "loss": 0.84137499, "num_input_tokens_seen": 202000335, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76171875, "step": 9379, "time_per_iteration": 2.4513447284698486 }, { "auxiliary_loss_clip": 0.01111498, "auxiliary_loss_mlp": 0.01034827, "balance_loss_clip": 1.02195215, "balance_loss_mlp": 1.03909874, "epoch": 0.5639561100255523, "flos": 27490336698240.0, "grad_norm": 2.17740824468462, "language_loss": 0.71757281, "learning_rate": 1.6841118025599042e-06, "loss": 0.73903596, "num_input_tokens_seen": 202018275, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.72265625, "step": 9380, "time_per_iteration": 3.9448320865631104 }, { "auxiliary_loss_clip": 0.0111397, "auxiliary_loss_mlp": 0.01035206, "balance_loss_clip": 1.02085352, "balance_loss_mlp": 1.04129577, "epoch": 0.5640162332782204, "flos": 18076570243200.0, "grad_norm": 3.069010151794391, "language_loss": 0.74456364, "learning_rate": 1.6837272353716542e-06, "loss": 0.76605541, "num_input_tokens_seen": 202034330, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7265625, "step": 9381, "time_per_iteration": 2.504054307937622 }, { "auxiliary_loss_clip": 0.01112103, "auxiliary_loss_mlp": 0.01036162, "balance_loss_clip": 1.0233407, "balance_loss_mlp": 1.03962398, "epoch": 0.5640763565308883, "flos": 20884856290560.0, "grad_norm": 2.3041578020499283, "language_loss": 0.72324467, "learning_rate": 1.683342680176499e-06, "loss": 0.74472731, "num_input_tokens_seen": 202053100, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 9382, "time_per_iteration": 3.9510791301727295 }, { "auxiliary_loss_clip": 0.01036427, "auxiliary_loss_mlp": 0.01003102, "balance_loss_clip": 1.00162339, "balance_loss_mlp": 1.01190996, "epoch": 0.5641364797835563, "flos": 64447912224000.0, "grad_norm": 0.734529234789687, "language_loss": 0.54404962, "learning_rate": 1.682958136989022e-06, "loss": 0.56444502, "num_input_tokens_seen": 202120125, "router_z_loss_clip": 0.01477051, "router_z_loss_mlp": 0.24511719, "step": 9383, "time_per_iteration": 3.2444956302642822 }, { "auxiliary_loss_clip": 0.01113642, "auxiliary_loss_mlp": 0.01028551, "balance_loss_clip": 1.01488972, "balance_loss_mlp": 1.03953981, "epoch": 0.5641966030362242, "flos": 18660944609280.0, "grad_norm": 1.8088695165432662, "language_loss": 0.70519763, "learning_rate": 1.6825736058238033e-06, "loss": 0.7266196, "num_input_tokens_seen": 202138030, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 9384, "time_per_iteration": 3.8732049465179443 }, { "auxiliary_loss_clip": 0.0111142, "auxiliary_loss_mlp": 0.01031667, "balance_loss_clip": 1.01804161, "balance_loss_mlp": 1.03958178, "epoch": 0.5642567262888922, "flos": 22492325738880.0, "grad_norm": 2.30487065571341, "language_loss": 0.75682467, "learning_rate": 1.6821890866954263e-06, "loss": 0.77825552, "num_input_tokens_seen": 202155580, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 9385, "time_per_iteration": 2.5210015773773193 }, { "auxiliary_loss_clip": 0.01105488, "auxiliary_loss_mlp": 0.01029264, "balance_loss_clip": 1.0168364, "balance_loss_mlp": 1.03633988, "epoch": 0.5643168495415603, "flos": 13003972692480.0, "grad_norm": 27.72851274173686, "language_loss": 0.82330936, "learning_rate": 1.6818045796184703e-06, "loss": 0.84465688, "num_input_tokens_seen": 202170365, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 9386, "time_per_iteration": 2.431558132171631 }, { "auxiliary_loss_clip": 0.01115388, "auxiliary_loss_mlp": 0.01030654, "balance_loss_clip": 1.01681352, "balance_loss_mlp": 1.0409292, "epoch": 0.5643769727942282, "flos": 18588297352320.0, "grad_norm": 2.240039736009699, "language_loss": 0.70270199, "learning_rate": 1.681420084607516e-06, "loss": 0.7241624, "num_input_tokens_seen": 202189095, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.74609375, "step": 9387, "time_per_iteration": 2.437872886657715 }, { "auxiliary_loss_clip": 0.01113692, "auxiliary_loss_mlp": 0.01032362, "balance_loss_clip": 1.01950526, "balance_loss_mlp": 1.03995323, "epoch": 0.5644370960468962, "flos": 33806269572480.0, "grad_norm": 1.7088685486730677, "language_loss": 0.74891889, "learning_rate": 1.6810356016771452e-06, "loss": 0.77037942, "num_input_tokens_seen": 202213500, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 9388, "time_per_iteration": 2.602717161178589 }, { "auxiliary_loss_clip": 0.01107301, "auxiliary_loss_mlp": 0.01031641, "balance_loss_clip": 1.0200007, "balance_loss_mlp": 1.03860879, "epoch": 0.5644972192995641, "flos": 21214911386880.0, "grad_norm": 2.014076612535879, "language_loss": 0.82121086, "learning_rate": 1.6806511308419353e-06, "loss": 0.84260029, "num_input_tokens_seen": 202231920, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6875, "step": 9389, "time_per_iteration": 2.461094856262207 }, { "auxiliary_loss_clip": 0.01113823, "auxiliary_loss_mlp": 0.01035437, "balance_loss_clip": 1.02120376, "balance_loss_mlp": 1.04068494, "epoch": 0.5645573425522321, "flos": 18587722734720.0, "grad_norm": 2.516949235815702, "language_loss": 0.64028704, "learning_rate": 1.680266672116467e-06, "loss": 0.66177964, "num_input_tokens_seen": 202247600, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.73046875, "step": 9390, "time_per_iteration": 2.4636590480804443 }, { "auxiliary_loss_clip": 0.01111799, "auxiliary_loss_mlp": 0.0102539, "balance_loss_clip": 1.01354694, "balance_loss_mlp": 1.04197598, "epoch": 0.5646174658049, "flos": 18113809668480.0, "grad_norm": 2.0737892446410253, "language_loss": 0.91956109, "learning_rate": 1.6798822255153192e-06, "loss": 0.94093299, "num_input_tokens_seen": 202265350, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69921875, "step": 9391, "time_per_iteration": 2.441239833831787 }, { "auxiliary_loss_clip": 0.011161, "auxiliary_loss_mlp": 0.01037716, "balance_loss_clip": 1.02357769, "balance_loss_mlp": 1.04060483, "epoch": 0.564677589057568, "flos": 28329964087680.0, "grad_norm": 2.6860978605105124, "language_loss": 0.59938562, "learning_rate": 1.6794977910530684e-06, "loss": 0.62092376, "num_input_tokens_seen": 202284285, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7578125, "step": 9392, "time_per_iteration": 2.5284841060638428 }, { "auxiliary_loss_clip": 0.0111054, "auxiliary_loss_mlp": 0.0102687, "balance_loss_clip": 1.01276207, "balance_loss_mlp": 1.03920484, "epoch": 0.564737712310236, "flos": 22163743100160.0, "grad_norm": 2.4955777462890074, "language_loss": 0.81543839, "learning_rate": 1.6791133687442937e-06, "loss": 0.8368125, "num_input_tokens_seen": 202303450, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7109375, "step": 9393, "time_per_iteration": 2.4840214252471924 }, { "auxiliary_loss_clip": 0.01112276, "auxiliary_loss_mlp": 0.01027353, "balance_loss_clip": 1.0143708, "balance_loss_mlp": 1.04089451, "epoch": 0.564797835562904, "flos": 20959011918720.0, "grad_norm": 61.64321574956801, "language_loss": 0.87602866, "learning_rate": 1.6787289586035725e-06, "loss": 0.897425, "num_input_tokens_seen": 202322315, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 9394, "time_per_iteration": 2.4824743270874023 }, { "auxiliary_loss_clip": 0.01112558, "auxiliary_loss_mlp": 0.010301, "balance_loss_clip": 1.01765478, "balance_loss_mlp": 1.04257202, "epoch": 0.5648579588155719, "flos": 17420302805760.0, "grad_norm": 2.0946853194464787, "language_loss": 0.84906423, "learning_rate": 1.6783445606454814e-06, "loss": 0.87049073, "num_input_tokens_seen": 202339905, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69921875, "step": 9395, "time_per_iteration": 2.4442074298858643 }, { "auxiliary_loss_clip": 0.01036192, "auxiliary_loss_mlp": 0.01002915, "balance_loss_clip": 1.00130582, "balance_loss_mlp": 1.01141405, "epoch": 0.5649180820682399, "flos": 69929568835200.0, "grad_norm": 0.8081579240533192, "language_loss": 0.58308524, "learning_rate": 1.677960174884597e-06, "loss": 0.60347629, "num_input_tokens_seen": 202397320, "router_z_loss_clip": 0.01611328, "router_z_loss_mlp": 0.24804688, "step": 9396, "time_per_iteration": 3.104564905166626 }, { "auxiliary_loss_clip": 0.01114146, "auxiliary_loss_mlp": 0.01029186, "balance_loss_clip": 1.01623964, "balance_loss_mlp": 1.0407784, "epoch": 0.5649782053209078, "flos": 24973070641920.0, "grad_norm": 2.0309677025015698, "language_loss": 0.70177794, "learning_rate": 1.6775758013354943e-06, "loss": 0.72321129, "num_input_tokens_seen": 202416865, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 9397, "time_per_iteration": 2.5305724143981934 }, { "auxiliary_loss_clip": 0.01113869, "auxiliary_loss_mlp": 0.0103265, "balance_loss_clip": 1.02037692, "balance_loss_mlp": 1.04095793, "epoch": 0.5650383285735758, "flos": 21726602582400.0, "grad_norm": 1.8911197301778804, "language_loss": 0.67180252, "learning_rate": 1.67719144001275e-06, "loss": 0.6932677, "num_input_tokens_seen": 202436210, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.73046875, "step": 9398, "time_per_iteration": 2.5423498153686523 }, { "auxiliary_loss_clip": 0.01034946, "auxiliary_loss_mlp": 0.01001308, "balance_loss_clip": 0.99985915, "balance_loss_mlp": 1.01040351, "epoch": 0.5650984518262439, "flos": 65904484636800.0, "grad_norm": 0.7905538292869386, "language_loss": 0.58103395, "learning_rate": 1.6768070909309386e-06, "loss": 0.60139656, "num_input_tokens_seen": 202492925, "router_z_loss_clip": 0.01446533, "router_z_loss_mlp": 0.24609375, "step": 9399, "time_per_iteration": 3.0500576496124268 }, { "auxiliary_loss_clip": 0.01114008, "auxiliary_loss_mlp": 0.01033594, "balance_loss_clip": 1.01856124, "balance_loss_mlp": 1.03968811, "epoch": 0.5651585750789118, "flos": 21032592929280.0, "grad_norm": 2.0014883214045516, "language_loss": 0.73284054, "learning_rate": 1.6764227541046347e-06, "loss": 0.75431663, "num_input_tokens_seen": 202511905, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7421875, "step": 9400, "time_per_iteration": 2.470618963241577 }, { "auxiliary_loss_clip": 0.01115185, "auxiliary_loss_mlp": 0.01035165, "balance_loss_clip": 1.02080584, "balance_loss_mlp": 1.04065108, "epoch": 0.5652186983315798, "flos": 18551919853440.0, "grad_norm": 3.072566317439113, "language_loss": 0.60969853, "learning_rate": 1.676038429548412e-06, "loss": 0.63120198, "num_input_tokens_seen": 202529815, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.74609375, "step": 9401, "time_per_iteration": 2.455432653427124 }, { "auxiliary_loss_clip": 0.01111211, "auxiliary_loss_mlp": 0.01026315, "balance_loss_clip": 1.01351833, "balance_loss_mlp": 1.03910327, "epoch": 0.5652788215842477, "flos": 18478662065280.0, "grad_norm": 2.00623814347154, "language_loss": 0.8108173, "learning_rate": 1.6756541172768453e-06, "loss": 0.83219254, "num_input_tokens_seen": 202547710, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 9402, "time_per_iteration": 2.45658016204834 }, { "auxiliary_loss_clip": 0.01110907, "auxiliary_loss_mlp": 0.01033389, "balance_loss_clip": 1.02093804, "balance_loss_mlp": 1.04028404, "epoch": 0.5653389448369157, "flos": 30044052080640.0, "grad_norm": 1.6943993019089125, "language_loss": 0.7761212, "learning_rate": 1.6752698173045068e-06, "loss": 0.79756415, "num_input_tokens_seen": 202568835, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.70703125, "step": 9403, "time_per_iteration": 2.5703516006469727 }, { "auxiliary_loss_clip": 0.01112648, "auxiliary_loss_mlp": 0.01030742, "balance_loss_clip": 1.01781392, "balance_loss_mlp": 1.04006076, "epoch": 0.5653990680895836, "flos": 16727550128640.0, "grad_norm": 1.553095105727909, "language_loss": 0.69087845, "learning_rate": 1.6748855296459685e-06, "loss": 0.7123124, "num_input_tokens_seen": 202587385, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 9404, "time_per_iteration": 2.455705404281616 }, { "auxiliary_loss_clip": 0.01108629, "auxiliary_loss_mlp": 0.01028449, "balance_loss_clip": 1.01600909, "balance_loss_mlp": 1.03969526, "epoch": 0.5654591913422516, "flos": 14538256179840.0, "grad_norm": 2.2858989629273667, "language_loss": 0.67340422, "learning_rate": 1.6745012543158045e-06, "loss": 0.69477499, "num_input_tokens_seen": 202604815, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 9405, "time_per_iteration": 2.473007917404175 }, { "auxiliary_loss_clip": 0.01109531, "auxiliary_loss_mlp": 0.01028741, "balance_loss_clip": 1.01669455, "balance_loss_mlp": 1.04228926, "epoch": 0.5655193145949196, "flos": 26209905603840.0, "grad_norm": 7.402988794799892, "language_loss": 0.74528742, "learning_rate": 1.6741169913285852e-06, "loss": 0.76667011, "num_input_tokens_seen": 202623775, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 9406, "time_per_iteration": 2.5076804161071777 }, { "auxiliary_loss_clip": 0.01114922, "auxiliary_loss_mlp": 0.01030044, "balance_loss_clip": 1.01556015, "balance_loss_mlp": 1.04072762, "epoch": 0.5655794378475876, "flos": 25046579825280.0, "grad_norm": 2.125194650915194, "language_loss": 0.79538476, "learning_rate": 1.673732740698882e-06, "loss": 0.81683445, "num_input_tokens_seen": 202643375, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7421875, "step": 9407, "time_per_iteration": 2.5206079483032227 }, { "auxiliary_loss_clip": 0.01109995, "auxiliary_loss_mlp": 0.01031164, "balance_loss_clip": 1.01860535, "balance_loss_mlp": 1.04184484, "epoch": 0.5656395611002555, "flos": 31032852652800.0, "grad_norm": 4.383695986416881, "language_loss": 0.7086556, "learning_rate": 1.6733485024412666e-06, "loss": 0.73006713, "num_input_tokens_seen": 202668400, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 9408, "time_per_iteration": 2.5810441970825195 }, { "auxiliary_loss_clip": 0.01111591, "auxiliary_loss_mlp": 0.01031705, "balance_loss_clip": 1.01861596, "balance_loss_mlp": 1.04146421, "epoch": 0.5656996843529235, "flos": 20229522606720.0, "grad_norm": 2.021042530315427, "language_loss": 0.81149119, "learning_rate": 1.672964276570308e-06, "loss": 0.83292413, "num_input_tokens_seen": 202685125, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 9409, "time_per_iteration": 2.4779226779937744 }, { "auxiliary_loss_clip": 0.01111106, "auxiliary_loss_mlp": 0.01028338, "balance_loss_clip": 1.0155828, "balance_loss_mlp": 1.03952456, "epoch": 0.5657598076055914, "flos": 20996251344000.0, "grad_norm": 1.7257158317587353, "language_loss": 0.78435338, "learning_rate": 1.6725800631005776e-06, "loss": 0.80574787, "num_input_tokens_seen": 202703830, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 9410, "time_per_iteration": 2.465625524520874 }, { "auxiliary_loss_clip": 0.01113021, "auxiliary_loss_mlp": 0.01031156, "balance_loss_clip": 1.01867485, "balance_loss_mlp": 1.04150689, "epoch": 0.5658199308582594, "flos": 11545999649280.0, "grad_norm": 2.806602940846721, "language_loss": 0.83472019, "learning_rate": 1.6721958620466432e-06, "loss": 0.85616195, "num_input_tokens_seen": 202719835, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71484375, "step": 9411, "time_per_iteration": 2.4677646160125732 }, { "auxiliary_loss_clip": 0.01116613, "auxiliary_loss_mlp": 0.01033253, "balance_loss_clip": 1.01910913, "balance_loss_mlp": 1.04164386, "epoch": 0.5658800541109275, "flos": 14172146807040.0, "grad_norm": 2.193797632120436, "language_loss": 0.6743353, "learning_rate": 1.6718116734230749e-06, "loss": 0.69583398, "num_input_tokens_seen": 202736795, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75, "step": 9412, "time_per_iteration": 2.430816173553467 }, { "auxiliary_loss_clip": 0.01108829, "auxiliary_loss_mlp": 0.01024557, "balance_loss_clip": 1.01351786, "balance_loss_mlp": 1.04116917, "epoch": 0.5659401773635954, "flos": 27305073325440.0, "grad_norm": 1.6308521304459696, "language_loss": 0.58453608, "learning_rate": 1.6714274972444413e-06, "loss": 0.60586995, "num_input_tokens_seen": 202756900, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.67578125, "step": 9413, "time_per_iteration": 2.543147087097168 }, { "auxiliary_loss_clip": 0.01111084, "auxiliary_loss_mlp": 0.01030665, "balance_loss_clip": 1.01803422, "balance_loss_mlp": 1.04157329, "epoch": 0.5660003006162634, "flos": 16728196573440.0, "grad_norm": 1.5692796894760785, "language_loss": 0.69291013, "learning_rate": 1.6710433335253092e-06, "loss": 0.71432763, "num_input_tokens_seen": 202775145, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 9414, "time_per_iteration": 2.4550108909606934 }, { "auxiliary_loss_clip": 0.01107903, "auxiliary_loss_mlp": 0.01028653, "balance_loss_clip": 1.01695263, "balance_loss_mlp": 1.03944993, "epoch": 0.5660604238689313, "flos": 21653452535040.0, "grad_norm": 1.6557796068333213, "language_loss": 0.78434199, "learning_rate": 1.670659182280247e-06, "loss": 0.80570757, "num_input_tokens_seen": 202794505, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.68359375, "step": 9415, "time_per_iteration": 2.498218297958374 }, { "auxiliary_loss_clip": 0.01036843, "auxiliary_loss_mlp": 0.01001352, "balance_loss_clip": 1.00001097, "balance_loss_mlp": 1.01237798, "epoch": 0.5661205471215993, "flos": 68824022083200.0, "grad_norm": 0.6900890857734012, "language_loss": 0.49177578, "learning_rate": 1.670275043523822e-06, "loss": 0.51215768, "num_input_tokens_seen": 202858580, "router_z_loss_clip": 0.01342773, "router_z_loss_mlp": 0.24414062, "step": 9416, "time_per_iteration": 3.221097946166992 }, { "auxiliary_loss_clip": 0.01113833, "auxiliary_loss_mlp": 0.01033669, "balance_loss_clip": 1.02054977, "balance_loss_mlp": 1.04229641, "epoch": 0.5661806703742672, "flos": 28621774177920.0, "grad_norm": 2.0968840072807304, "language_loss": 0.62990344, "learning_rate": 1.6698909172706e-06, "loss": 0.65137845, "num_input_tokens_seen": 202878565, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 9417, "time_per_iteration": 2.5355217456817627 }, { "auxiliary_loss_clip": 0.01112434, "auxiliary_loss_mlp": 0.01030322, "balance_loss_clip": 1.01688719, "balance_loss_mlp": 1.04044271, "epoch": 0.5662407936269352, "flos": 21397948116480.0, "grad_norm": 1.6585633620936406, "language_loss": 0.69163871, "learning_rate": 1.6695068035351479e-06, "loss": 0.71306622, "num_input_tokens_seen": 202897350, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 9418, "time_per_iteration": 2.490325450897217 }, { "auxiliary_loss_clip": 0.01112373, "auxiliary_loss_mlp": 0.01034148, "balance_loss_clip": 1.01982474, "balance_loss_mlp": 1.04092169, "epoch": 0.5663009168796032, "flos": 25660005315840.0, "grad_norm": 2.008761000317829, "language_loss": 0.64779866, "learning_rate": 1.6691227023320304e-06, "loss": 0.66926384, "num_input_tokens_seen": 202916745, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.71484375, "step": 9419, "time_per_iteration": 2.559143543243408 }, { "auxiliary_loss_clip": 0.01037289, "auxiliary_loss_mlp": 0.01002847, "balance_loss_clip": 1.00148189, "balance_loss_mlp": 1.01284802, "epoch": 0.5663610401322712, "flos": 67930458422400.0, "grad_norm": 0.7677734998041831, "language_loss": 0.59670568, "learning_rate": 1.6687386136758135e-06, "loss": 0.61710703, "num_input_tokens_seen": 202982375, "router_z_loss_clip": 0.01367188, "router_z_loss_mlp": 0.24414062, "step": 9420, "time_per_iteration": 3.176530599594116 }, { "auxiliary_loss_clip": 0.01108733, "auxiliary_loss_mlp": 0.01031322, "balance_loss_clip": 1.019467, "balance_loss_mlp": 1.03944612, "epoch": 0.5664211633849391, "flos": 24609367480320.0, "grad_norm": 1.8196605914994202, "language_loss": 0.74122298, "learning_rate": 1.6683545375810618e-06, "loss": 0.76262355, "num_input_tokens_seen": 203002430, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6953125, "step": 9421, "time_per_iteration": 3.9690823554992676 }, { "auxiliary_loss_clip": 0.01113482, "auxiliary_loss_mlp": 0.01032835, "balance_loss_clip": 1.01959121, "balance_loss_mlp": 1.0405674, "epoch": 0.5664812866376071, "flos": 11648811352320.0, "grad_norm": 2.0325365170480993, "language_loss": 0.72949016, "learning_rate": 1.6679704740623389e-06, "loss": 0.75095338, "num_input_tokens_seen": 203019425, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 9422, "time_per_iteration": 2.4565155506134033 }, { "auxiliary_loss_clip": 0.01109452, "auxiliary_loss_mlp": 0.01031689, "balance_loss_clip": 1.01976168, "balance_loss_mlp": 1.04145885, "epoch": 0.566541409890275, "flos": 24643985212800.0, "grad_norm": 5.318030833870458, "language_loss": 0.8178252, "learning_rate": 1.6675864231342085e-06, "loss": 0.83923656, "num_input_tokens_seen": 203039035, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 9423, "time_per_iteration": 2.5085289478302 }, { "auxiliary_loss_clip": 0.01110315, "auxiliary_loss_mlp": 0.01030866, "balance_loss_clip": 1.01815224, "balance_loss_mlp": 1.04052019, "epoch": 0.566601533142943, "flos": 22270577126400.0, "grad_norm": 1.491220036538624, "language_loss": 0.80501318, "learning_rate": 1.6672023848112353e-06, "loss": 0.82642502, "num_input_tokens_seen": 203059320, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 9424, "time_per_iteration": 3.962303400039673 }, { "auxiliary_loss_clip": 0.01115015, "auxiliary_loss_mlp": 0.01030411, "balance_loss_clip": 1.01665449, "balance_loss_mlp": 1.04178727, "epoch": 0.5666616563956111, "flos": 29971656218880.0, "grad_norm": 2.1740022723141026, "language_loss": 0.78971803, "learning_rate": 1.6668183591079805e-06, "loss": 0.81117231, "num_input_tokens_seen": 203078490, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.734375, "step": 9425, "time_per_iteration": 2.5406227111816406 }, { "auxiliary_loss_clip": 0.01113078, "auxiliary_loss_mlp": 0.01030638, "balance_loss_clip": 1.01759684, "balance_loss_mlp": 1.04232121, "epoch": 0.566721779648279, "flos": 17781456101760.0, "grad_norm": 4.799608076288859, "language_loss": 0.58919108, "learning_rate": 1.6664343460390064e-06, "loss": 0.61062825, "num_input_tokens_seen": 203096065, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 9426, "time_per_iteration": 5.251490116119385 }, { "auxiliary_loss_clip": 0.01115393, "auxiliary_loss_mlp": 0.01026791, "balance_loss_clip": 1.0145241, "balance_loss_mlp": 1.04209399, "epoch": 0.566781902900947, "flos": 21033490769280.0, "grad_norm": 1.6847534047446335, "language_loss": 0.81752336, "learning_rate": 1.6660503456188764e-06, "loss": 0.83894521, "num_input_tokens_seen": 203115270, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.734375, "step": 9427, "time_per_iteration": 2.4749794006347656 }, { "auxiliary_loss_clip": 0.01111514, "auxiliary_loss_mlp": 0.01031947, "balance_loss_clip": 1.01947749, "balance_loss_mlp": 1.04260039, "epoch": 0.5668420261536149, "flos": 23148593176320.0, "grad_norm": 2.4583778761298842, "language_loss": 0.86196101, "learning_rate": 1.6656663578621498e-06, "loss": 0.88339561, "num_input_tokens_seen": 203134290, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 9428, "time_per_iteration": 2.4912588596343994 }, { "auxiliary_loss_clip": 0.011165, "auxiliary_loss_mlp": 0.0103327, "balance_loss_clip": 1.02003133, "balance_loss_mlp": 1.04256094, "epoch": 0.5669021494062829, "flos": 22601601889920.0, "grad_norm": 3.3724952216856017, "language_loss": 0.73205137, "learning_rate": 1.6652823827833886e-06, "loss": 0.75354904, "num_input_tokens_seen": 203152935, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 9429, "time_per_iteration": 2.487804889678955 }, { "auxiliary_loss_clip": 0.01112347, "auxiliary_loss_mlp": 0.01030323, "balance_loss_clip": 1.0166738, "balance_loss_mlp": 1.03998685, "epoch": 0.5669622726589508, "flos": 17381231786880.0, "grad_norm": 2.0678878981990763, "language_loss": 0.75519061, "learning_rate": 1.6648984203971538e-06, "loss": 0.77661735, "num_input_tokens_seen": 203170110, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 9430, "time_per_iteration": 2.4334824085235596 }, { "auxiliary_loss_clip": 0.0111265, "auxiliary_loss_mlp": 0.01032871, "balance_loss_clip": 1.01992536, "balance_loss_mlp": 1.04108977, "epoch": 0.5670223959116188, "flos": 18763253521920.0, "grad_norm": 4.637459744772556, "language_loss": 0.73085678, "learning_rate": 1.6645144707180032e-06, "loss": 0.752312, "num_input_tokens_seen": 203188825, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 9431, "time_per_iteration": 2.525184392929077 }, { "auxiliary_loss_clip": 0.01105326, "auxiliary_loss_mlp": 0.01026889, "balance_loss_clip": 1.01562405, "balance_loss_mlp": 1.04051018, "epoch": 0.5670825191642868, "flos": 13553334276480.0, "grad_norm": 1.6754277848786436, "language_loss": 0.73704916, "learning_rate": 1.6641305337604984e-06, "loss": 0.75837129, "num_input_tokens_seen": 203206860, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6484375, "step": 9432, "time_per_iteration": 2.470902681350708 }, { "auxiliary_loss_clip": 0.0111282, "auxiliary_loss_mlp": 0.01029039, "balance_loss_clip": 1.0169034, "balance_loss_mlp": 1.04173696, "epoch": 0.5671426424169548, "flos": 22054035985920.0, "grad_norm": 2.055567902688786, "language_loss": 0.78169441, "learning_rate": 1.663746609539197e-06, "loss": 0.80311298, "num_input_tokens_seen": 203225625, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.7109375, "step": 9433, "time_per_iteration": 2.52541184425354 }, { "auxiliary_loss_clip": 0.01115393, "auxiliary_loss_mlp": 0.01033222, "balance_loss_clip": 1.01813567, "balance_loss_mlp": 1.04121876, "epoch": 0.5672027656696227, "flos": 21323972056320.0, "grad_norm": 2.526069345808165, "language_loss": 0.64130956, "learning_rate": 1.6633626980686582e-06, "loss": 0.66279572, "num_input_tokens_seen": 203242920, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.7421875, "step": 9434, "time_per_iteration": 2.537450075149536 }, { "auxiliary_loss_clip": 0.01109173, "auxiliary_loss_mlp": 0.01025835, "balance_loss_clip": 1.01372933, "balance_loss_mlp": 1.03999317, "epoch": 0.5672628889222907, "flos": 23514056104320.0, "grad_norm": 2.212894497544199, "language_loss": 0.66635525, "learning_rate": 1.6629787993634399e-06, "loss": 0.68770534, "num_input_tokens_seen": 203261995, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 9435, "time_per_iteration": 2.520113945007324 }, { "auxiliary_loss_clip": 0.01108095, "auxiliary_loss_mlp": 0.01030448, "balance_loss_clip": 1.01789546, "balance_loss_mlp": 1.03895807, "epoch": 0.5673230121749586, "flos": 27121928855040.0, "grad_norm": 1.6360000698226478, "language_loss": 0.71757603, "learning_rate": 1.6625949134380984e-06, "loss": 0.73896146, "num_input_tokens_seen": 203280670, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 9436, "time_per_iteration": 2.5434813499450684 }, { "auxiliary_loss_clip": 0.01113175, "auxiliary_loss_mlp": 0.01027714, "balance_loss_clip": 1.01522088, "balance_loss_mlp": 1.04039609, "epoch": 0.5673831354276266, "flos": 31141985149440.0, "grad_norm": 1.697662267829749, "language_loss": 0.73882794, "learning_rate": 1.6622110403071921e-06, "loss": 0.76023674, "num_input_tokens_seen": 203304800, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 9437, "time_per_iteration": 2.5793557167053223 }, { "auxiliary_loss_clip": 0.01116131, "auxiliary_loss_mlp": 0.01031901, "balance_loss_clip": 1.01872802, "balance_loss_mlp": 1.04425931, "epoch": 0.5674432586802945, "flos": 27673193859840.0, "grad_norm": 2.3871987995895165, "language_loss": 0.60970235, "learning_rate": 1.661827179985277e-06, "loss": 0.63118267, "num_input_tokens_seen": 203324060, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 9438, "time_per_iteration": 2.5254993438720703 }, { "auxiliary_loss_clip": 0.01110402, "auxiliary_loss_mlp": 0.01030059, "balance_loss_clip": 1.01736307, "balance_loss_mlp": 1.03857625, "epoch": 0.5675033819329626, "flos": 26615157822720.0, "grad_norm": 2.9082706963340947, "language_loss": 0.74782401, "learning_rate": 1.661443332486909e-06, "loss": 0.76922858, "num_input_tokens_seen": 203344360, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 9439, "time_per_iteration": 2.5003769397735596 }, { "auxiliary_loss_clip": 0.01112881, "auxiliary_loss_mlp": 0.01031019, "balance_loss_clip": 1.01685715, "balance_loss_mlp": 1.04228377, "epoch": 0.5675635051856306, "flos": 19098372435840.0, "grad_norm": 1.9062962756013286, "language_loss": 0.84011823, "learning_rate": 1.6610594978266438e-06, "loss": 0.86155725, "num_input_tokens_seen": 203362115, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.70703125, "step": 9440, "time_per_iteration": 2.4670963287353516 }, { "auxiliary_loss_clip": 0.01115563, "auxiliary_loss_mlp": 0.01032725, "balance_loss_clip": 1.01930857, "balance_loss_mlp": 1.04085183, "epoch": 0.5676236284382985, "flos": 17566315591680.0, "grad_norm": 2.0164281505995505, "language_loss": 0.75295782, "learning_rate": 1.6606756760190365e-06, "loss": 0.77444071, "num_input_tokens_seen": 203380550, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.74609375, "step": 9441, "time_per_iteration": 2.4420886039733887 }, { "auxiliary_loss_clip": 0.01111308, "auxiliary_loss_mlp": 0.01031729, "balance_loss_clip": 1.01903939, "balance_loss_mlp": 1.04100871, "epoch": 0.5676837516909665, "flos": 15954069634560.0, "grad_norm": 2.020659403151086, "language_loss": 0.83003968, "learning_rate": 1.6602918670786413e-06, "loss": 0.85147005, "num_input_tokens_seen": 203396590, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 9442, "time_per_iteration": 2.437434673309326 }, { "auxiliary_loss_clip": 0.01108839, "auxiliary_loss_mlp": 0.01028324, "balance_loss_clip": 1.01603377, "balance_loss_mlp": 1.04304194, "epoch": 0.5677438749436344, "flos": 18295912644480.0, "grad_norm": 1.8986290183172225, "language_loss": 0.74338573, "learning_rate": 1.6599080710200126e-06, "loss": 0.76475739, "num_input_tokens_seen": 203414280, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.65625, "step": 9443, "time_per_iteration": 2.4737815856933594 }, { "auxiliary_loss_clip": 0.01113819, "auxiliary_loss_mlp": 0.01029751, "balance_loss_clip": 1.01663184, "balance_loss_mlp": 1.04224694, "epoch": 0.5678039981963025, "flos": 17931311642880.0, "grad_norm": 3.861924543144117, "language_loss": 0.7793147, "learning_rate": 1.6595242878577046e-06, "loss": 0.80075043, "num_input_tokens_seen": 203433280, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 9444, "time_per_iteration": 2.4521026611328125 }, { "auxiliary_loss_clip": 0.01116528, "auxiliary_loss_mlp": 0.01040622, "balance_loss_clip": 1.02712774, "balance_loss_mlp": 1.04309881, "epoch": 0.5678641214489704, "flos": 19316350120320.0, "grad_norm": 1.7365130795629657, "language_loss": 0.80673748, "learning_rate": 1.6591405176062687e-06, "loss": 0.828309, "num_input_tokens_seen": 203449935, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 9445, "time_per_iteration": 2.473750114440918 }, { "auxiliary_loss_clip": 0.01108678, "auxiliary_loss_mlp": 0.01028335, "balance_loss_clip": 1.01590753, "balance_loss_mlp": 1.03775895, "epoch": 0.5679242447016384, "flos": 27751084502400.0, "grad_norm": 1.54144929289284, "language_loss": 0.70689237, "learning_rate": 1.658756760280259e-06, "loss": 0.72826254, "num_input_tokens_seen": 203473025, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.70703125, "step": 9446, "time_per_iteration": 2.532636880874634 }, { "auxiliary_loss_clip": 0.01116041, "auxiliary_loss_mlp": 0.01031258, "balance_loss_clip": 1.01810312, "balance_loss_mlp": 1.04193997, "epoch": 0.5679843679543063, "flos": 23769093646080.0, "grad_norm": 1.8909450217605346, "language_loss": 0.73547721, "learning_rate": 1.6583730158942276e-06, "loss": 0.75695014, "num_input_tokens_seen": 203492895, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 9447, "time_per_iteration": 2.4976041316986084 }, { "auxiliary_loss_clip": 0.01115244, "auxiliary_loss_mlp": 0.01030843, "balance_loss_clip": 1.01741385, "balance_loss_mlp": 1.04125237, "epoch": 0.5680444912069743, "flos": 25591883172480.0, "grad_norm": 2.2632751275593677, "language_loss": 0.75087178, "learning_rate": 1.657989284462725e-06, "loss": 0.77233261, "num_input_tokens_seen": 203513710, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 9448, "time_per_iteration": 2.494204044342041 }, { "auxiliary_loss_clip": 0.01119144, "auxiliary_loss_mlp": 0.01037369, "balance_loss_clip": 1.02392244, "balance_loss_mlp": 1.04492521, "epoch": 0.5681046144596422, "flos": 23695799944320.0, "grad_norm": 2.6440947725633825, "language_loss": 0.76353049, "learning_rate": 1.6576055660003038e-06, "loss": 0.78509569, "num_input_tokens_seen": 203531630, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 9449, "time_per_iteration": 2.4894723892211914 }, { "auxiliary_loss_clip": 0.01113667, "auxiliary_loss_mlp": 0.01037674, "balance_loss_clip": 1.0246501, "balance_loss_mlp": 1.04153824, "epoch": 0.5681647377123102, "flos": 28000770917760.0, "grad_norm": 1.8714858185114862, "language_loss": 0.74821347, "learning_rate": 1.6572218605215128e-06, "loss": 0.76972687, "num_input_tokens_seen": 203551885, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 9450, "time_per_iteration": 2.5325851440429688 }, { "auxiliary_loss_clip": 0.01118445, "auxiliary_loss_mlp": 0.01033778, "balance_loss_clip": 1.0215174, "balance_loss_mlp": 1.04457009, "epoch": 0.5682248609649782, "flos": 22747758330240.0, "grad_norm": 1.8177288376369172, "language_loss": 0.66890806, "learning_rate": 1.6568381680409038e-06, "loss": 0.69043028, "num_input_tokens_seen": 203572250, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.73828125, "step": 9451, "time_per_iteration": 2.4877238273620605 }, { "auxiliary_loss_clip": 0.01119776, "auxiliary_loss_mlp": 0.01033891, "balance_loss_clip": 1.01922226, "balance_loss_mlp": 1.04150558, "epoch": 0.5682849842176462, "flos": 21288600138240.0, "grad_norm": 2.340122064415368, "language_loss": 0.72040343, "learning_rate": 1.656454488573026e-06, "loss": 0.74194014, "num_input_tokens_seen": 203590605, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.78125, "step": 9452, "time_per_iteration": 2.4691598415374756 }, { "auxiliary_loss_clip": 0.01109151, "auxiliary_loss_mlp": 0.01028351, "balance_loss_clip": 1.01614451, "balance_loss_mlp": 1.03946304, "epoch": 0.5683451074703142, "flos": 21141689512320.0, "grad_norm": 1.5900463229595891, "language_loss": 0.70271599, "learning_rate": 1.656070822132428e-06, "loss": 0.72409105, "num_input_tokens_seen": 203610080, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 9453, "time_per_iteration": 2.477879524230957 }, { "auxiliary_loss_clip": 0.01113677, "auxiliary_loss_mlp": 0.01031792, "balance_loss_clip": 1.01925123, "balance_loss_mlp": 1.04198897, "epoch": 0.5684052307229821, "flos": 22344481359360.0, "grad_norm": 1.7872880724484064, "language_loss": 0.69644195, "learning_rate": 1.6556871687336592e-06, "loss": 0.71789664, "num_input_tokens_seen": 203630060, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 9454, "time_per_iteration": 2.469447612762451 }, { "auxiliary_loss_clip": 0.0110917, "auxiliary_loss_mlp": 0.01031287, "balance_loss_clip": 1.01965201, "balance_loss_mlp": 1.03951347, "epoch": 0.5684653539756501, "flos": 21798639308160.0, "grad_norm": 1.954857605696603, "language_loss": 0.60478139, "learning_rate": 1.6553035283912671e-06, "loss": 0.62618589, "num_input_tokens_seen": 203649065, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6953125, "step": 9455, "time_per_iteration": 2.5011303424835205 }, { "auxiliary_loss_clip": 0.01120622, "auxiliary_loss_mlp": 0.01034168, "balance_loss_clip": 1.02085221, "balance_loss_mlp": 1.04425538, "epoch": 0.568525477228318, "flos": 22999635475200.0, "grad_norm": 2.7566913296536852, "language_loss": 0.73152781, "learning_rate": 1.6549199011198e-06, "loss": 0.75307572, "num_input_tokens_seen": 203667545, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.76171875, "step": 9456, "time_per_iteration": 2.4695181846618652 }, { "auxiliary_loss_clip": 0.0111257, "auxiliary_loss_mlp": 0.0103307, "balance_loss_clip": 1.02127409, "balance_loss_mlp": 1.04187763, "epoch": 0.568585600480986, "flos": 21392489249280.0, "grad_norm": 1.7033815033682134, "language_loss": 0.76765954, "learning_rate": 1.6545362869338048e-06, "loss": 0.78911591, "num_input_tokens_seen": 203686025, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.70703125, "step": 9457, "time_per_iteration": 2.497652053833008 }, { "auxiliary_loss_clip": 0.01114339, "auxiliary_loss_mlp": 0.01034457, "balance_loss_clip": 1.02048588, "balance_loss_mlp": 1.04076338, "epoch": 0.568645723733654, "flos": 30007351359360.0, "grad_norm": 1.820179356659835, "language_loss": 0.66581565, "learning_rate": 1.6541526858478285e-06, "loss": 0.68730366, "num_input_tokens_seen": 203705540, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.734375, "step": 9458, "time_per_iteration": 2.524172306060791 }, { "auxiliary_loss_clip": 0.0111359, "auxiliary_loss_mlp": 0.01027313, "balance_loss_clip": 1.01375842, "balance_loss_mlp": 1.04021847, "epoch": 0.568705846986322, "flos": 20412667077120.0, "grad_norm": 2.6435136560651213, "language_loss": 0.68699473, "learning_rate": 1.6537690978764167e-06, "loss": 0.70840371, "num_input_tokens_seen": 203723670, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 9459, "time_per_iteration": 2.4710185527801514 }, { "auxiliary_loss_clip": 0.0111714, "auxiliary_loss_mlp": 0.01032671, "balance_loss_clip": 1.01934934, "balance_loss_mlp": 1.04325342, "epoch": 0.5687659702389899, "flos": 17456752131840.0, "grad_norm": 2.9500583472825213, "language_loss": 0.77361333, "learning_rate": 1.6533855230341155e-06, "loss": 0.79511142, "num_input_tokens_seen": 203739705, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 9460, "time_per_iteration": 2.4256398677825928 }, { "auxiliary_loss_clip": 0.01113286, "auxiliary_loss_mlp": 0.01039452, "balance_loss_clip": 1.02574253, "balance_loss_mlp": 1.04046941, "epoch": 0.5688260934916579, "flos": 25406081095680.0, "grad_norm": 1.8174487108746324, "language_loss": 0.71824342, "learning_rate": 1.65300196133547e-06, "loss": 0.73977077, "num_input_tokens_seen": 203759000, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 9461, "time_per_iteration": 2.518259048461914 }, { "auxiliary_loss_clip": 0.01111633, "auxiliary_loss_mlp": 0.01034153, "balance_loss_clip": 1.02099204, "balance_loss_mlp": 1.04003704, "epoch": 0.5688862167443258, "flos": 21608024808960.0, "grad_norm": 2.4158681205063877, "language_loss": 0.73378646, "learning_rate": 1.6526184127950249e-06, "loss": 0.75524426, "num_input_tokens_seen": 203774295, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 9462, "time_per_iteration": 2.441709041595459 }, { "auxiliary_loss_clip": 0.01109073, "auxiliary_loss_mlp": 0.01026043, "balance_loss_clip": 1.0144918, "balance_loss_mlp": 1.04040778, "epoch": 0.5689463399969938, "flos": 22418996123520.0, "grad_norm": 2.290473857970571, "language_loss": 0.72837627, "learning_rate": 1.6522348774273246e-06, "loss": 0.74972737, "num_input_tokens_seen": 203792710, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6875, "step": 9463, "time_per_iteration": 3.8638012409210205 }, { "auxiliary_loss_clip": 0.01111443, "auxiliary_loss_mlp": 0.01033532, "balance_loss_clip": 1.02084208, "balance_loss_mlp": 1.03960657, "epoch": 0.5690064632496618, "flos": 18296810484480.0, "grad_norm": 2.7808319354122695, "language_loss": 0.74200511, "learning_rate": 1.6518513552469123e-06, "loss": 0.76345485, "num_input_tokens_seen": 203811645, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 9464, "time_per_iteration": 2.440990447998047 }, { "auxiliary_loss_clip": 0.01113393, "auxiliary_loss_mlp": 0.01037265, "balance_loss_clip": 1.02428317, "balance_loss_mlp": 1.04070055, "epoch": 0.5690665865023298, "flos": 21579260993280.0, "grad_norm": 1.5926701549066862, "language_loss": 0.84098524, "learning_rate": 1.6514678462683312e-06, "loss": 0.86249185, "num_input_tokens_seen": 203830040, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 9465, "time_per_iteration": 2.468217134475708 }, { "auxiliary_loss_clip": 0.01108014, "auxiliary_loss_mlp": 0.01029403, "balance_loss_clip": 1.01718438, "balance_loss_mlp": 1.03869224, "epoch": 0.5691267097549978, "flos": 24421446501120.0, "grad_norm": 2.735544689408568, "language_loss": 0.72001326, "learning_rate": 1.651084350506125e-06, "loss": 0.74138743, "num_input_tokens_seen": 203851245, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 9466, "time_per_iteration": 4.047702312469482 }, { "auxiliary_loss_clip": 0.01042813, "auxiliary_loss_mlp": 0.01004809, "balance_loss_clip": 1.0032233, "balance_loss_mlp": 1.01802552, "epoch": 0.5691868330076657, "flos": 61657906199040.0, "grad_norm": 0.7157241985481398, "language_loss": 0.55365288, "learning_rate": 1.6507008679748343e-06, "loss": 0.5741291, "num_input_tokens_seen": 203916400, "router_z_loss_clip": 0.01586914, "router_z_loss_mlp": 0.24804688, "step": 9467, "time_per_iteration": 4.5499467849731445 }, { "auxiliary_loss_clip": 0.01114742, "auxiliary_loss_mlp": 0.01035397, "balance_loss_clip": 1.02147913, "balance_loss_mlp": 1.04034829, "epoch": 0.5692469562603337, "flos": 21325193118720.0, "grad_norm": 3.273936135090662, "language_loss": 0.63534361, "learning_rate": 1.6503173986890023e-06, "loss": 0.65684497, "num_input_tokens_seen": 203935870, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 9468, "time_per_iteration": 3.9017369747161865 }, { "auxiliary_loss_clip": 0.01112611, "auxiliary_loss_mlp": 0.01031354, "balance_loss_clip": 1.01834774, "balance_loss_mlp": 1.04147482, "epoch": 0.5693070795130016, "flos": 23367899664000.0, "grad_norm": 2.409792339743999, "language_loss": 0.79365361, "learning_rate": 1.64993394266317e-06, "loss": 0.81509322, "num_input_tokens_seen": 203954950, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 9469, "time_per_iteration": 2.481050491333008 }, { "auxiliary_loss_clip": 0.0111734, "auxiliary_loss_mlp": 0.01038489, "balance_loss_clip": 1.02491069, "balance_loss_mlp": 1.04249418, "epoch": 0.5693672027656697, "flos": 18697250280960.0, "grad_norm": 2.2335900979567476, "language_loss": 0.69359338, "learning_rate": 1.6495504999118769e-06, "loss": 0.71515167, "num_input_tokens_seen": 203972715, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.74609375, "step": 9470, "time_per_iteration": 2.451258659362793 }, { "auxiliary_loss_clip": 0.01112961, "auxiliary_loss_mlp": 0.01036186, "balance_loss_clip": 1.02246523, "balance_loss_mlp": 1.04051518, "epoch": 0.5694273260183376, "flos": 20449188230400.0, "grad_norm": 12.301381434711313, "language_loss": 0.74815476, "learning_rate": 1.6491670704496644e-06, "loss": 0.76964617, "num_input_tokens_seen": 203990775, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 9471, "time_per_iteration": 2.453334331512451 }, { "auxiliary_loss_clip": 0.01112207, "auxiliary_loss_mlp": 0.01031599, "balance_loss_clip": 1.01853967, "balance_loss_mlp": 1.04161525, "epoch": 0.5694874492710056, "flos": 17603195880960.0, "grad_norm": 2.8849266801283866, "language_loss": 0.57182443, "learning_rate": 1.6487836542910716e-06, "loss": 0.59326249, "num_input_tokens_seen": 204008845, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 9472, "time_per_iteration": 2.4883742332458496 }, { "auxiliary_loss_clip": 0.01110708, "auxiliary_loss_mlp": 0.01035702, "balance_loss_clip": 1.02319729, "balance_loss_mlp": 1.04213703, "epoch": 0.5695475725236735, "flos": 13370836250880.0, "grad_norm": 1.9593521776451237, "language_loss": 0.73991466, "learning_rate": 1.648400251450638e-06, "loss": 0.76137871, "num_input_tokens_seen": 204023755, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 9473, "time_per_iteration": 2.4606587886810303 }, { "auxiliary_loss_clip": 0.01042373, "auxiliary_loss_mlp": 0.01002839, "balance_loss_clip": 1.00134325, "balance_loss_mlp": 1.01776385, "epoch": 0.5696076957763415, "flos": 68174398661760.0, "grad_norm": 0.6564693305236967, "language_loss": 0.57565176, "learning_rate": 1.6480168619429023e-06, "loss": 0.59610391, "num_input_tokens_seen": 204091255, "router_z_loss_clip": 0.01495361, "router_z_loss_mlp": 0.24609375, "step": 9474, "time_per_iteration": 3.131944417953491 }, { "auxiliary_loss_clip": 0.0111358, "auxiliary_loss_mlp": 0.01035761, "balance_loss_clip": 1.02248156, "balance_loss_mlp": 1.04343569, "epoch": 0.5696678190290094, "flos": 33838301525760.0, "grad_norm": 1.6493667553174798, "language_loss": 0.54311681, "learning_rate": 1.6476334857824017e-06, "loss": 0.5646103, "num_input_tokens_seen": 204113285, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 9475, "time_per_iteration": 2.579359769821167 }, { "auxiliary_loss_clip": 0.01116172, "auxiliary_loss_mlp": 0.01035527, "balance_loss_clip": 1.02187777, "balance_loss_mlp": 1.04277921, "epoch": 0.5697279422816774, "flos": 26356600748160.0, "grad_norm": 1.6583956803897304, "language_loss": 0.79639041, "learning_rate": 1.647250122983675e-06, "loss": 0.81790745, "num_input_tokens_seen": 204133045, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 9476, "time_per_iteration": 2.5131092071533203 }, { "auxiliary_loss_clip": 0.01118036, "auxiliary_loss_mlp": 0.01042663, "balance_loss_clip": 1.03006303, "balance_loss_mlp": 1.04436123, "epoch": 0.5697880655343454, "flos": 22930507751040.0, "grad_norm": 1.9578086010677462, "language_loss": 0.66279149, "learning_rate": 1.6468667735612592e-06, "loss": 0.68439847, "num_input_tokens_seen": 204152590, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.734375, "step": 9477, "time_per_iteration": 2.4784960746765137 }, { "auxiliary_loss_clip": 0.0111247, "auxiliary_loss_mlp": 0.01035007, "balance_loss_clip": 1.0216316, "balance_loss_mlp": 1.0397172, "epoch": 0.5698481887870134, "flos": 26761314263040.0, "grad_norm": 1.8094674518730645, "language_loss": 0.7080484, "learning_rate": 1.6464834375296906e-06, "loss": 0.72952318, "num_input_tokens_seen": 204171815, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 9478, "time_per_iteration": 2.5432496070861816 }, { "auxiliary_loss_clip": 0.01109596, "auxiliary_loss_mlp": 0.01031407, "balance_loss_clip": 1.01943827, "balance_loss_mlp": 1.04180062, "epoch": 0.5699083120396814, "flos": 15742269089280.0, "grad_norm": 3.2537404841566855, "language_loss": 0.69519961, "learning_rate": 1.6461001149035055e-06, "loss": 0.71660966, "num_input_tokens_seen": 204188535, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 9479, "time_per_iteration": 2.432931900024414 }, { "auxiliary_loss_clip": 0.01109573, "auxiliary_loss_mlp": 0.01035493, "balance_loss_clip": 1.02356052, "balance_loss_mlp": 1.04119682, "epoch": 0.5699684352923493, "flos": 19537272720000.0, "grad_norm": 1.487381235246349, "language_loss": 0.71727693, "learning_rate": 1.6457168056972392e-06, "loss": 0.73872757, "num_input_tokens_seen": 204208365, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 9480, "time_per_iteration": 2.4965054988861084 }, { "auxiliary_loss_clip": 0.01113023, "auxiliary_loss_mlp": 0.01030892, "balance_loss_clip": 1.01731968, "balance_loss_mlp": 1.04196966, "epoch": 0.5700285585450173, "flos": 16253349753600.0, "grad_norm": 2.253606096884184, "language_loss": 0.72192204, "learning_rate": 1.6453335099254276e-06, "loss": 0.74336123, "num_input_tokens_seen": 204226560, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 9481, "time_per_iteration": 2.427971839904785 }, { "auxiliary_loss_clip": 0.01115437, "auxiliary_loss_mlp": 0.01034816, "balance_loss_clip": 1.02135706, "balance_loss_mlp": 1.0444572, "epoch": 0.5700886817976852, "flos": 19864993432320.0, "grad_norm": 2.2533032004011697, "language_loss": 0.77874136, "learning_rate": 1.6449502276026041e-06, "loss": 0.80024385, "num_input_tokens_seen": 204245410, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 9482, "time_per_iteration": 2.4778075218200684 }, { "auxiliary_loss_clip": 0.01111487, "auxiliary_loss_mlp": 0.01029094, "balance_loss_clip": 1.01645827, "balance_loss_mlp": 1.04146361, "epoch": 0.5701488050503533, "flos": 23841704989440.0, "grad_norm": 1.6051877478940677, "language_loss": 0.77872908, "learning_rate": 1.6445669587433043e-06, "loss": 0.8001349, "num_input_tokens_seen": 204264840, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 9483, "time_per_iteration": 2.481675624847412 }, { "auxiliary_loss_clip": 0.01112883, "auxiliary_loss_mlp": 0.01041599, "balance_loss_clip": 1.02920127, "balance_loss_mlp": 1.04135466, "epoch": 0.5702089283030212, "flos": 23659673840640.0, "grad_norm": 1.5755953789617718, "language_loss": 0.81155467, "learning_rate": 1.6441837033620612e-06, "loss": 0.83309948, "num_input_tokens_seen": 204284335, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71484375, "step": 9484, "time_per_iteration": 2.5201056003570557 }, { "auxiliary_loss_clip": 0.01114275, "auxiliary_loss_mlp": 0.01033215, "balance_loss_clip": 1.01936257, "balance_loss_mlp": 1.04233241, "epoch": 0.5702690515556892, "flos": 27891171544320.0, "grad_norm": 3.800100146145576, "language_loss": 0.60830736, "learning_rate": 1.6438004614734073e-06, "loss": 0.62978226, "num_input_tokens_seen": 204302590, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 9485, "time_per_iteration": 2.517376661300659 }, { "auxiliary_loss_clip": 0.01112523, "auxiliary_loss_mlp": 0.01031578, "balance_loss_clip": 1.01825666, "balance_loss_mlp": 1.04035163, "epoch": 0.5703291748083571, "flos": 24023951619840.0, "grad_norm": 1.7944574847675527, "language_loss": 0.65041113, "learning_rate": 1.6434172330918757e-06, "loss": 0.67185211, "num_input_tokens_seen": 204323055, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 9486, "time_per_iteration": 2.490879774093628 }, { "auxiliary_loss_clip": 0.01036706, "auxiliary_loss_mlp": 0.01000677, "balance_loss_clip": 0.99922866, "balance_loss_mlp": 1.01234031, "epoch": 0.5703892980610251, "flos": 57023382919680.0, "grad_norm": 0.6671393783324548, "language_loss": 0.48001391, "learning_rate": 1.6430340182319978e-06, "loss": 0.50038773, "num_input_tokens_seen": 204386160, "router_z_loss_clip": 0.01446533, "router_z_loss_mlp": 0.24414062, "step": 9487, "time_per_iteration": 3.147695302963257 }, { "auxiliary_loss_clip": 0.01112072, "auxiliary_loss_mlp": 0.01029971, "balance_loss_clip": 1.01688838, "balance_loss_mlp": 1.04046845, "epoch": 0.570449421313693, "flos": 24351025887360.0, "grad_norm": 2.164973840845893, "language_loss": 0.85874486, "learning_rate": 1.6426508169083067e-06, "loss": 0.88016534, "num_input_tokens_seen": 204406315, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 9488, "time_per_iteration": 2.497343063354492 }, { "auxiliary_loss_clip": 0.01114023, "auxiliary_loss_mlp": 0.01032051, "balance_loss_clip": 1.01884294, "balance_loss_mlp": 1.03980017, "epoch": 0.570509544566361, "flos": 24828566227200.0, "grad_norm": 1.46658800050637, "language_loss": 0.79195285, "learning_rate": 1.6422676291353314e-06, "loss": 0.81341362, "num_input_tokens_seen": 204427645, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 9489, "time_per_iteration": 2.5052192211151123 }, { "auxiliary_loss_clip": 0.01112289, "auxiliary_loss_mlp": 0.0102857, "balance_loss_clip": 1.01663733, "balance_loss_mlp": 1.04101443, "epoch": 0.570569667819029, "flos": 21397301671680.0, "grad_norm": 2.349292049795531, "language_loss": 0.70040834, "learning_rate": 1.641884454927604e-06, "loss": 0.7218169, "num_input_tokens_seen": 204445910, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.71484375, "step": 9490, "time_per_iteration": 2.4751546382904053 }, { "auxiliary_loss_clip": 0.01111295, "auxiliary_loss_mlp": 0.01029329, "balance_loss_clip": 1.01684761, "balance_loss_mlp": 1.04082966, "epoch": 0.570629791071697, "flos": 23216751233280.0, "grad_norm": 1.8249247067703247, "language_loss": 0.76247871, "learning_rate": 1.6415012942996548e-06, "loss": 0.78388494, "num_input_tokens_seen": 204464680, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 9491, "time_per_iteration": 2.4648962020874023 }, { "auxiliary_loss_clip": 0.01037027, "auxiliary_loss_mlp": 0.01001853, "balance_loss_clip": 1.00040424, "balance_loss_mlp": 1.01264107, "epoch": 0.570689914324365, "flos": 65284666525440.0, "grad_norm": 0.8242796207769539, "language_loss": 0.57428044, "learning_rate": 1.641118147266011e-06, "loss": 0.59466922, "num_input_tokens_seen": 204525580, "router_z_loss_clip": 0.01446533, "router_z_loss_mlp": 0.24414062, "step": 9492, "time_per_iteration": 3.0847253799438477 }, { "auxiliary_loss_clip": 0.01112839, "auxiliary_loss_mlp": 0.01034934, "balance_loss_clip": 1.02137971, "balance_loss_mlp": 1.04184747, "epoch": 0.5707500375770329, "flos": 21141904993920.0, "grad_norm": 1.9967701636694792, "language_loss": 0.71714175, "learning_rate": 1.6407350138412035e-06, "loss": 0.73861957, "num_input_tokens_seen": 204541320, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 9493, "time_per_iteration": 2.486786127090454 }, { "auxiliary_loss_clip": 0.01115146, "auxiliary_loss_mlp": 0.01026969, "balance_loss_clip": 1.01417196, "balance_loss_mlp": 1.04121923, "epoch": 0.5708101608297009, "flos": 20812747737600.0, "grad_norm": 2.0509376441946445, "language_loss": 0.77778685, "learning_rate": 1.6403518940397606e-06, "loss": 0.79920799, "num_input_tokens_seen": 204560275, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73828125, "step": 9494, "time_per_iteration": 2.4640004634857178 }, { "auxiliary_loss_clip": 0.01115815, "auxiliary_loss_mlp": 0.01031855, "balance_loss_clip": 1.01809263, "balance_loss_mlp": 1.04012775, "epoch": 0.5708702840823688, "flos": 25812338895360.0, "grad_norm": 13.459266911800166, "language_loss": 0.80165523, "learning_rate": 1.6399687878762096e-06, "loss": 0.82313192, "num_input_tokens_seen": 204579430, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7578125, "step": 9495, "time_per_iteration": 2.514023542404175 }, { "auxiliary_loss_clip": 0.0111934, "auxiliary_loss_mlp": 0.01038777, "balance_loss_clip": 1.02342892, "balance_loss_mlp": 1.04279435, "epoch": 0.5709304073350369, "flos": 23651916503040.0, "grad_norm": 2.3842467916643253, "language_loss": 0.66667926, "learning_rate": 1.6395856953650784e-06, "loss": 0.6882605, "num_input_tokens_seen": 204597710, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.765625, "step": 9496, "time_per_iteration": 2.475843667984009 }, { "auxiliary_loss_clip": 0.01115572, "auxiliary_loss_mlp": 0.01038164, "balance_loss_clip": 1.02461004, "balance_loss_mlp": 1.04056621, "epoch": 0.5709905305877048, "flos": 16107552449280.0, "grad_norm": 2.5166337453073546, "language_loss": 0.69847977, "learning_rate": 1.6392026165208938e-06, "loss": 0.72001719, "num_input_tokens_seen": 204616140, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75, "step": 9497, "time_per_iteration": 2.4550628662109375 }, { "auxiliary_loss_clip": 0.01114413, "auxiliary_loss_mlp": 0.01030135, "balance_loss_clip": 1.01635396, "balance_loss_mlp": 1.04039598, "epoch": 0.5710506538403728, "flos": 24750819239040.0, "grad_norm": 1.8001761265519975, "language_loss": 0.81312442, "learning_rate": 1.638819551358182e-06, "loss": 0.83456981, "num_input_tokens_seen": 204636470, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 9498, "time_per_iteration": 2.5129733085632324 }, { "auxiliary_loss_clip": 0.01111854, "auxiliary_loss_mlp": 0.01034642, "balance_loss_clip": 1.02030087, "balance_loss_mlp": 1.03896868, "epoch": 0.5711107770930407, "flos": 21982250655360.0, "grad_norm": 1.740365463798655, "language_loss": 0.66181052, "learning_rate": 1.638436499891469e-06, "loss": 0.68327546, "num_input_tokens_seen": 204656640, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7265625, "step": 9499, "time_per_iteration": 2.4816792011260986 }, { "auxiliary_loss_clip": 0.01113084, "auxiliary_loss_mlp": 0.01032127, "balance_loss_clip": 1.01941967, "balance_loss_mlp": 1.041816, "epoch": 0.5711709003457087, "flos": 19574009354880.0, "grad_norm": 2.798199074907925, "language_loss": 0.7216745, "learning_rate": 1.6380534621352805e-06, "loss": 0.74312663, "num_input_tokens_seen": 204675475, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 9500, "time_per_iteration": 2.4505043029785156 }, { "auxiliary_loss_clip": 0.01115142, "auxiliary_loss_mlp": 0.01031059, "balance_loss_clip": 1.01757669, "balance_loss_mlp": 1.04039574, "epoch": 0.5712310235983766, "flos": 24242683489920.0, "grad_norm": 2.095561024540992, "language_loss": 0.76311815, "learning_rate": 1.6376704381041407e-06, "loss": 0.78458017, "num_input_tokens_seen": 204695385, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.74609375, "step": 9501, "time_per_iteration": 2.504357099533081 }, { "auxiliary_loss_clip": 0.01112695, "auxiliary_loss_mlp": 0.01034252, "balance_loss_clip": 1.02135348, "balance_loss_mlp": 1.03895903, "epoch": 0.5712911468510447, "flos": 20996143603200.0, "grad_norm": 1.748307087956676, "language_loss": 0.75397503, "learning_rate": 1.6372874278125742e-06, "loss": 0.77544451, "num_input_tokens_seen": 204714730, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 9502, "time_per_iteration": 2.45652174949646 }, { "auxiliary_loss_clip": 0.0110984, "auxiliary_loss_mlp": 0.01025499, "balance_loss_clip": 1.0128386, "balance_loss_mlp": 1.03896832, "epoch": 0.5713512701037126, "flos": 18916987731840.0, "grad_norm": 2.370513962044405, "language_loss": 0.82275271, "learning_rate": 1.636904431275105e-06, "loss": 0.84410614, "num_input_tokens_seen": 204735025, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 9503, "time_per_iteration": 2.474717378616333 }, { "auxiliary_loss_clip": 0.01109532, "auxiliary_loss_mlp": 0.01030217, "balance_loss_clip": 1.01796246, "balance_loss_mlp": 1.03846622, "epoch": 0.5714113933563806, "flos": 17413443308160.0, "grad_norm": 2.5057839493799294, "language_loss": 0.86129272, "learning_rate": 1.6365214485062553e-06, "loss": 0.88269019, "num_input_tokens_seen": 204751365, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.7109375, "step": 9504, "time_per_iteration": 2.428471326828003 }, { "auxiliary_loss_clip": 0.01109336, "auxiliary_loss_mlp": 0.01028468, "balance_loss_clip": 1.01564693, "balance_loss_mlp": 1.04018247, "epoch": 0.5714715166090486, "flos": 20193360589440.0, "grad_norm": 7.712701089747398, "language_loss": 0.75510383, "learning_rate": 1.6361384795205496e-06, "loss": 0.77648181, "num_input_tokens_seen": 204768980, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 9505, "time_per_iteration": 3.8000571727752686 }, { "auxiliary_loss_clip": 0.0110926, "auxiliary_loss_mlp": 0.01029471, "balance_loss_clip": 1.01744282, "balance_loss_mlp": 1.0384357, "epoch": 0.5715316398617165, "flos": 18551668458240.0, "grad_norm": 1.628251743342898, "language_loss": 0.81427312, "learning_rate": 1.635755524332509e-06, "loss": 0.83566046, "num_input_tokens_seen": 204788110, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.70703125, "step": 9506, "time_per_iteration": 2.434896945953369 }, { "auxiliary_loss_clip": 0.0110922, "auxiliary_loss_mlp": 0.01029539, "balance_loss_clip": 1.01709938, "balance_loss_mlp": 1.03883719, "epoch": 0.5715917631143845, "flos": 18478195188480.0, "grad_norm": 2.0052571843002958, "language_loss": 0.77202785, "learning_rate": 1.6353725829566552e-06, "loss": 0.79341549, "num_input_tokens_seen": 204807240, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 9507, "time_per_iteration": 3.922236442565918 }, { "auxiliary_loss_clip": 0.01111846, "auxiliary_loss_mlp": 0.0103889, "balance_loss_clip": 1.02443624, "balance_loss_mlp": 1.0394361, "epoch": 0.5716518863670524, "flos": 24020037037440.0, "grad_norm": 1.449589874077538, "language_loss": 0.68549091, "learning_rate": 1.63498965540751e-06, "loss": 0.70699823, "num_input_tokens_seen": 204826415, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.72265625, "step": 9508, "time_per_iteration": 2.4883358478546143 }, { "auxiliary_loss_clip": 0.0111349, "auxiliary_loss_mlp": 0.01031762, "balance_loss_clip": 1.01868486, "balance_loss_mlp": 1.04028916, "epoch": 0.5717120096197205, "flos": 17819485626240.0, "grad_norm": 2.201798431554281, "language_loss": 0.79598773, "learning_rate": 1.634606741699593e-06, "loss": 0.81744027, "num_input_tokens_seen": 204844305, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 9509, "time_per_iteration": 3.8796372413635254 }, { "auxiliary_loss_clip": 0.01110206, "auxiliary_loss_mlp": 0.01032222, "balance_loss_clip": 1.01918066, "balance_loss_mlp": 1.04050136, "epoch": 0.5717721328723884, "flos": 21866043179520.0, "grad_norm": 1.846036246508316, "language_loss": 0.71722674, "learning_rate": 1.6342238418474255e-06, "loss": 0.7386511, "num_input_tokens_seen": 204861765, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 9510, "time_per_iteration": 2.4535722732543945 }, { "auxiliary_loss_clip": 0.01112089, "auxiliary_loss_mlp": 0.0103182, "balance_loss_clip": 1.01959479, "balance_loss_mlp": 1.04046929, "epoch": 0.5718322561250564, "flos": 28437624126720.0, "grad_norm": 1.476025759839608, "language_loss": 0.69674218, "learning_rate": 1.6338409558655264e-06, "loss": 0.71818125, "num_input_tokens_seen": 204882505, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71484375, "step": 9511, "time_per_iteration": 2.525141477584839 }, { "auxiliary_loss_clip": 0.01111105, "auxiliary_loss_mlp": 0.01034477, "balance_loss_clip": 1.02214515, "balance_loss_mlp": 1.03995097, "epoch": 0.5718923793777243, "flos": 13551825905280.0, "grad_norm": 2.2089204579481914, "language_loss": 0.61406529, "learning_rate": 1.6334580837684152e-06, "loss": 0.63552111, "num_input_tokens_seen": 204899830, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7109375, "step": 9512, "time_per_iteration": 2.4464540481567383 }, { "auxiliary_loss_clip": 0.01109423, "auxiliary_loss_mlp": 0.01029478, "balance_loss_clip": 1.01698494, "balance_loss_mlp": 1.03933549, "epoch": 0.5719525026303923, "flos": 17822035491840.0, "grad_norm": 2.8452689400300906, "language_loss": 0.75786686, "learning_rate": 1.6330752255706104e-06, "loss": 0.77925587, "num_input_tokens_seen": 204918100, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 9513, "time_per_iteration": 2.474562168121338 }, { "auxiliary_loss_clip": 0.01036916, "auxiliary_loss_mlp": 0.01012741, "balance_loss_clip": 1.01147127, "balance_loss_mlp": 1.0125668, "epoch": 0.5720126258830602, "flos": 61298042814720.0, "grad_norm": 0.9011198051396002, "language_loss": 0.66849637, "learning_rate": 1.6326923812866288e-06, "loss": 0.68899292, "num_input_tokens_seen": 204972925, "router_z_loss_clip": 0.01269531, "router_z_loss_mlp": 0.24414062, "step": 9514, "time_per_iteration": 3.0670690536499023 }, { "auxiliary_loss_clip": 0.01113614, "auxiliary_loss_mlp": 0.01038165, "balance_loss_clip": 1.02510595, "balance_loss_mlp": 1.0404253, "epoch": 0.5720727491357283, "flos": 23988040997760.0, "grad_norm": 2.4158281424645787, "language_loss": 0.81331635, "learning_rate": 1.63230955093099e-06, "loss": 0.8348341, "num_input_tokens_seen": 204990910, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 9515, "time_per_iteration": 2.4668478965759277 }, { "auxiliary_loss_clip": 0.01106583, "auxiliary_loss_mlp": 0.01030672, "balance_loss_clip": 1.01866746, "balance_loss_mlp": 1.0375284, "epoch": 0.5721328723883962, "flos": 23405426398080.0, "grad_norm": 1.7199336150818552, "language_loss": 0.86145139, "learning_rate": 1.6319267345182092e-06, "loss": 0.88282394, "num_input_tokens_seen": 205010500, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 9516, "time_per_iteration": 2.4677958488464355 }, { "auxiliary_loss_clip": 0.01109225, "auxiliary_loss_mlp": 0.01030044, "balance_loss_clip": 1.01698518, "balance_loss_mlp": 1.03908134, "epoch": 0.5721929956410642, "flos": 18804910320000.0, "grad_norm": 1.7873334823331404, "language_loss": 0.87333059, "learning_rate": 1.6315439320628038e-06, "loss": 0.8947233, "num_input_tokens_seen": 205028560, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 9517, "time_per_iteration": 2.447237014770508 }, { "auxiliary_loss_clip": 0.01108214, "auxiliary_loss_mlp": 0.01029677, "balance_loss_clip": 1.01682651, "balance_loss_mlp": 1.03790748, "epoch": 0.5722531188937322, "flos": 27196659100800.0, "grad_norm": 2.059566277787707, "language_loss": 0.85229576, "learning_rate": 1.6311611435792893e-06, "loss": 0.87367469, "num_input_tokens_seen": 205048650, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 9518, "time_per_iteration": 2.502953052520752 }, { "auxiliary_loss_clip": 0.01107881, "auxiliary_loss_mlp": 0.01033171, "balance_loss_clip": 1.02105331, "balance_loss_mlp": 1.0399406, "epoch": 0.5723132421464001, "flos": 15195672852480.0, "grad_norm": 6.124421052780155, "language_loss": 0.7893824, "learning_rate": 1.6307783690821812e-06, "loss": 0.81079286, "num_input_tokens_seen": 205066480, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 9519, "time_per_iteration": 2.433673620223999 }, { "auxiliary_loss_clip": 0.01110045, "auxiliary_loss_mlp": 0.01027406, "balance_loss_clip": 1.01544976, "balance_loss_mlp": 1.04001558, "epoch": 0.5723733653990681, "flos": 27599433281280.0, "grad_norm": 2.41458676353816, "language_loss": 0.82792711, "learning_rate": 1.6303956085859944e-06, "loss": 0.84930164, "num_input_tokens_seen": 205087475, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.69921875, "step": 9520, "time_per_iteration": 2.511835813522339 }, { "auxiliary_loss_clip": 0.0111203, "auxiliary_loss_mlp": 0.01038656, "balance_loss_clip": 1.02507246, "balance_loss_mlp": 1.03949785, "epoch": 0.572433488651736, "flos": 18222870337920.0, "grad_norm": 2.9291170517524105, "language_loss": 0.72813106, "learning_rate": 1.630012862105243e-06, "loss": 0.7496379, "num_input_tokens_seen": 205106495, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.72265625, "step": 9521, "time_per_iteration": 2.434462785720825 }, { "auxiliary_loss_clip": 0.01109213, "auxiliary_loss_mlp": 0.01030728, "balance_loss_clip": 1.01822293, "balance_loss_mlp": 1.03883708, "epoch": 0.5724936119044041, "flos": 31249106484480.0, "grad_norm": 1.9872005967882318, "language_loss": 0.7817862, "learning_rate": 1.6296301296544415e-06, "loss": 0.80318564, "num_input_tokens_seen": 205128285, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 9522, "time_per_iteration": 2.530496597290039 }, { "auxiliary_loss_clip": 0.0110476, "auxiliary_loss_mlp": 0.01029504, "balance_loss_clip": 1.01829851, "balance_loss_mlp": 1.03736043, "epoch": 0.572553735157072, "flos": 19202189719680.0, "grad_norm": 1.6805296386086814, "language_loss": 0.71986413, "learning_rate": 1.629247411248102e-06, "loss": 0.74120677, "num_input_tokens_seen": 205146595, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.671875, "step": 9523, "time_per_iteration": 2.436922788619995 }, { "auxiliary_loss_clip": 0.0110596, "auxiliary_loss_mlp": 0.01027587, "balance_loss_clip": 1.01620805, "balance_loss_mlp": 1.03691483, "epoch": 0.57261385840974, "flos": 21214911386880.0, "grad_norm": 1.9919147448721162, "language_loss": 0.70168847, "learning_rate": 1.628864706900738e-06, "loss": 0.72302395, "num_input_tokens_seen": 205164295, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6875, "step": 9524, "time_per_iteration": 2.477329730987549 }, { "auxiliary_loss_clip": 0.01108763, "auxiliary_loss_mlp": 0.01028061, "balance_loss_clip": 1.01604426, "balance_loss_mlp": 1.03904593, "epoch": 0.5726739816624079, "flos": 33984529793280.0, "grad_norm": 1.520259229571165, "language_loss": 0.65132111, "learning_rate": 1.6284820166268615e-06, "loss": 0.67268932, "num_input_tokens_seen": 205185380, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69921875, "step": 9525, "time_per_iteration": 2.556112766265869 }, { "auxiliary_loss_clip": 0.01104417, "auxiliary_loss_mlp": 0.0102733, "balance_loss_clip": 1.01588595, "balance_loss_mlp": 1.0365113, "epoch": 0.5727341049150759, "flos": 24275972419200.0, "grad_norm": 1.662600915322265, "language_loss": 0.7234655, "learning_rate": 1.628099340440984e-06, "loss": 0.74478292, "num_input_tokens_seen": 205204895, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 9526, "time_per_iteration": 2.4729013442993164 }, { "auxiliary_loss_clip": 0.01106294, "auxiliary_loss_mlp": 0.01029259, "balance_loss_clip": 1.01752269, "balance_loss_mlp": 1.03850055, "epoch": 0.5727942281677438, "flos": 28400564269440.0, "grad_norm": 1.707037757886357, "language_loss": 0.80173755, "learning_rate": 1.6277166783576176e-06, "loss": 0.82309306, "num_input_tokens_seen": 205223440, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 9527, "time_per_iteration": 2.5081491470336914 }, { "auxiliary_loss_clip": 0.01107722, "auxiliary_loss_mlp": 0.01033862, "balance_loss_clip": 1.02061749, "balance_loss_mlp": 1.03913546, "epoch": 0.5728543514204119, "flos": 19536769929600.0, "grad_norm": 1.981861010679342, "language_loss": 0.72270298, "learning_rate": 1.6273340303912713e-06, "loss": 0.74411881, "num_input_tokens_seen": 205242800, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6875, "step": 9528, "time_per_iteration": 2.450188159942627 }, { "auxiliary_loss_clip": 0.01108766, "auxiliary_loss_mlp": 0.01033198, "balance_loss_clip": 1.02042425, "balance_loss_mlp": 1.03860438, "epoch": 0.5729144746730798, "flos": 21506757390720.0, "grad_norm": 1.7530182156084897, "language_loss": 0.85559285, "learning_rate": 1.6269513965564557e-06, "loss": 0.87701249, "num_input_tokens_seen": 205259465, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 9529, "time_per_iteration": 2.4553146362304688 }, { "auxiliary_loss_clip": 0.01034055, "auxiliary_loss_mlp": 0.00999369, "balance_loss_clip": 0.99803942, "balance_loss_mlp": 1.00985253, "epoch": 0.5729745979257478, "flos": 58681628242560.0, "grad_norm": 0.7667008294643769, "language_loss": 0.5617522, "learning_rate": 1.6265687768676813e-06, "loss": 0.58208644, "num_input_tokens_seen": 205314100, "router_z_loss_clip": 0.01330566, "router_z_loss_mlp": 0.2421875, "step": 9530, "time_per_iteration": 2.935969591140747 }, { "auxiliary_loss_clip": 0.01110027, "auxiliary_loss_mlp": 0.01025031, "balance_loss_clip": 1.01304448, "balance_loss_mlp": 1.03908491, "epoch": 0.5730347211784158, "flos": 18552099421440.0, "grad_norm": 2.680018691400595, "language_loss": 0.6626097, "learning_rate": 1.6261861713394553e-06, "loss": 0.68396026, "num_input_tokens_seen": 205333420, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 9531, "time_per_iteration": 2.444866180419922 }, { "auxiliary_loss_clip": 0.01109549, "auxiliary_loss_mlp": 0.01030407, "balance_loss_clip": 1.01760948, "balance_loss_mlp": 1.03937912, "epoch": 0.5730948444310837, "flos": 38031482396160.0, "grad_norm": 2.169424571323967, "language_loss": 0.75788212, "learning_rate": 1.6258035799862876e-06, "loss": 0.77928168, "num_input_tokens_seen": 205350995, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 9532, "time_per_iteration": 2.598036289215088 }, { "auxiliary_loss_clip": 0.01106979, "auxiliary_loss_mlp": 0.0102569, "balance_loss_clip": 1.01346469, "balance_loss_mlp": 1.03807068, "epoch": 0.5731549676837517, "flos": 25227066689280.0, "grad_norm": 1.3299137035624864, "language_loss": 0.79016495, "learning_rate": 1.625421002822686e-06, "loss": 0.81149161, "num_input_tokens_seen": 205372675, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69140625, "step": 9533, "time_per_iteration": 2.504906415939331 }, { "auxiliary_loss_clip": 0.01107447, "auxiliary_loss_mlp": 0.01025549, "balance_loss_clip": 1.01361632, "balance_loss_mlp": 1.0395515, "epoch": 0.5732150909364196, "flos": 23368222886400.0, "grad_norm": 1.8198394995184182, "language_loss": 0.85503793, "learning_rate": 1.6250384398631574e-06, "loss": 0.87636793, "num_input_tokens_seen": 205392590, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 9534, "time_per_iteration": 2.5547726154327393 }, { "auxiliary_loss_clip": 0.01108376, "auxiliary_loss_mlp": 0.01030646, "balance_loss_clip": 1.01761007, "balance_loss_mlp": 1.03846562, "epoch": 0.5732752141890877, "flos": 23079357711360.0, "grad_norm": 2.1546591209800585, "language_loss": 0.75234073, "learning_rate": 1.6246558911222085e-06, "loss": 0.77373093, "num_input_tokens_seen": 205414885, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 9535, "time_per_iteration": 2.5096161365509033 }, { "auxiliary_loss_clip": 0.01113087, "auxiliary_loss_mlp": 0.01033864, "balance_loss_clip": 1.02122164, "balance_loss_mlp": 1.04067922, "epoch": 0.5733353374417556, "flos": 24352282863360.0, "grad_norm": 2.4344345403101877, "language_loss": 0.71453106, "learning_rate": 1.624273356614346e-06, "loss": 0.7360006, "num_input_tokens_seen": 205434440, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 9536, "time_per_iteration": 2.4932491779327393 }, { "auxiliary_loss_clip": 0.01106654, "auxiliary_loss_mlp": 0.01028641, "balance_loss_clip": 1.01667273, "balance_loss_mlp": 1.03779936, "epoch": 0.5733954606944236, "flos": 27198849830400.0, "grad_norm": 1.9638060453487134, "language_loss": 0.70638388, "learning_rate": 1.6238908363540755e-06, "loss": 0.72773683, "num_input_tokens_seen": 205454225, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 9537, "time_per_iteration": 2.5039288997650146 }, { "auxiliary_loss_clip": 0.01108485, "auxiliary_loss_mlp": 0.01030408, "balance_loss_clip": 1.01867223, "balance_loss_mlp": 1.0386579, "epoch": 0.5734555839470915, "flos": 28765129357440.0, "grad_norm": 2.2106162387134294, "language_loss": 0.62624902, "learning_rate": 1.623508330355902e-06, "loss": 0.64763796, "num_input_tokens_seen": 205474750, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6953125, "step": 9538, "time_per_iteration": 2.5105390548706055 }, { "auxiliary_loss_clip": 0.0110896, "auxiliary_loss_mlp": 0.01036018, "balance_loss_clip": 1.02316737, "balance_loss_mlp": 1.0396142, "epoch": 0.5735157071997595, "flos": 22966813422720.0, "grad_norm": 2.0067035170914442, "language_loss": 0.83546621, "learning_rate": 1.6231258386343306e-06, "loss": 0.85691595, "num_input_tokens_seen": 205495495, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 9539, "time_per_iteration": 2.474705934524536 }, { "auxiliary_loss_clip": 0.01110832, "auxiliary_loss_mlp": 0.01033174, "balance_loss_clip": 1.02046585, "balance_loss_mlp": 1.03898096, "epoch": 0.5735758304524274, "flos": 18989455420800.0, "grad_norm": 1.9229840595913037, "language_loss": 0.72902596, "learning_rate": 1.6227433612038647e-06, "loss": 0.75046611, "num_input_tokens_seen": 205510070, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 9540, "time_per_iteration": 2.4558794498443604 }, { "auxiliary_loss_clip": 0.01105002, "auxiliary_loss_mlp": 0.0102616, "balance_loss_clip": 1.01475811, "balance_loss_mlp": 1.03619552, "epoch": 0.5736359537050955, "flos": 28397942576640.0, "grad_norm": 2.79956008361284, "language_loss": 0.80019057, "learning_rate": 1.6223608980790089e-06, "loss": 0.82150209, "num_input_tokens_seen": 205530190, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6875, "step": 9541, "time_per_iteration": 2.5024573802948 }, { "auxiliary_loss_clip": 0.01111687, "auxiliary_loss_mlp": 0.01032587, "balance_loss_clip": 1.01965272, "balance_loss_mlp": 1.03954053, "epoch": 0.5736960769577634, "flos": 15627210848640.0, "grad_norm": 2.4115517919022764, "language_loss": 0.6467942, "learning_rate": 1.6219784492742654e-06, "loss": 0.66823697, "num_input_tokens_seen": 205547380, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 9542, "time_per_iteration": 2.429417848587036 }, { "auxiliary_loss_clip": 0.0110797, "auxiliary_loss_mlp": 0.01030255, "balance_loss_clip": 1.01859093, "balance_loss_mlp": 1.0384506, "epoch": 0.5737562002104314, "flos": 18003994813440.0, "grad_norm": 2.1325418631001134, "language_loss": 0.83349258, "learning_rate": 1.6215960148041365e-06, "loss": 0.85487485, "num_input_tokens_seen": 205566540, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6953125, "step": 9543, "time_per_iteration": 2.4240927696228027 }, { "auxiliary_loss_clip": 0.01113609, "auxiliary_loss_mlp": 0.01030408, "balance_loss_clip": 1.01702106, "balance_loss_mlp": 1.04017472, "epoch": 0.5738163234630994, "flos": 20698192287360.0, "grad_norm": 3.1642854741385564, "language_loss": 0.73820555, "learning_rate": 1.6212135946831257e-06, "loss": 0.7596457, "num_input_tokens_seen": 205584200, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 9544, "time_per_iteration": 2.466977834701538 }, { "auxiliary_loss_clip": 0.01110446, "auxiliary_loss_mlp": 0.0103044, "balance_loss_clip": 1.01817918, "balance_loss_mlp": 1.03854227, "epoch": 0.5738764467157673, "flos": 23149311448320.0, "grad_norm": 5.221356490047204, "language_loss": 0.76260293, "learning_rate": 1.620831188925733e-06, "loss": 0.78401178, "num_input_tokens_seen": 205604675, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.71875, "step": 9545, "time_per_iteration": 2.4699151515960693 }, { "auxiliary_loss_clip": 0.01109885, "auxiliary_loss_mlp": 0.01031506, "balance_loss_clip": 1.01898885, "balance_loss_mlp": 1.03922987, "epoch": 0.5739365699684353, "flos": 29492930730240.0, "grad_norm": 2.2170085026570927, "language_loss": 0.5668714, "learning_rate": 1.620448797546459e-06, "loss": 0.58828533, "num_input_tokens_seen": 205624680, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 9546, "time_per_iteration": 3.934530019760132 }, { "auxiliary_loss_clip": 0.01108363, "auxiliary_loss_mlp": 0.01027828, "balance_loss_clip": 1.01543021, "balance_loss_mlp": 1.03688955, "epoch": 0.5739966932211032, "flos": 14027247342720.0, "grad_norm": 3.576953550098687, "language_loss": 0.76727957, "learning_rate": 1.6200664205598055e-06, "loss": 0.78864145, "num_input_tokens_seen": 205641950, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71484375, "step": 9547, "time_per_iteration": 2.445326089859009 }, { "auxiliary_loss_clip": 0.01109363, "auxiliary_loss_mlp": 0.0103252, "balance_loss_clip": 1.01966298, "balance_loss_mlp": 1.03816509, "epoch": 0.5740568164737713, "flos": 19062030850560.0, "grad_norm": 3.697768891441133, "language_loss": 0.74573994, "learning_rate": 1.6196840579802704e-06, "loss": 0.76715875, "num_input_tokens_seen": 205660130, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 9548, "time_per_iteration": 2.450277090072632 }, { "auxiliary_loss_clip": 0.01108058, "auxiliary_loss_mlp": 0.01030169, "balance_loss_clip": 1.0179621, "balance_loss_mlp": 1.03747582, "epoch": 0.5741169397264392, "flos": 22127832478080.0, "grad_norm": 2.180959783238089, "language_loss": 0.69308233, "learning_rate": 1.619301709822355e-06, "loss": 0.71446455, "num_input_tokens_seen": 205678895, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 9549, "time_per_iteration": 3.93107533454895 }, { "auxiliary_loss_clip": 0.01111004, "auxiliary_loss_mlp": 0.01030427, "balance_loss_clip": 1.01880395, "balance_loss_mlp": 1.04186773, "epoch": 0.5741770629791072, "flos": 24936836797440.0, "grad_norm": 1.5266175957712997, "language_loss": 0.79448307, "learning_rate": 1.6189193761005564e-06, "loss": 0.81589741, "num_input_tokens_seen": 205698450, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.69140625, "step": 9550, "time_per_iteration": 2.4914028644561768 }, { "auxiliary_loss_clip": 0.01110895, "auxiliary_loss_mlp": 0.01030127, "balance_loss_clip": 1.01780701, "balance_loss_mlp": 1.03974533, "epoch": 0.5742371862317751, "flos": 18801462614400.0, "grad_norm": 2.2726422173829346, "language_loss": 0.67772192, "learning_rate": 1.6185370568293727e-06, "loss": 0.69913214, "num_input_tokens_seen": 205714870, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 9551, "time_per_iteration": 3.9012069702148438 }, { "auxiliary_loss_clip": 0.01111966, "auxiliary_loss_mlp": 0.01033259, "balance_loss_clip": 1.02086759, "balance_loss_mlp": 1.03953719, "epoch": 0.5742973094844431, "flos": 24460661174400.0, "grad_norm": 1.8166283342195813, "language_loss": 0.71785104, "learning_rate": 1.6181547520233031e-06, "loss": 0.73930329, "num_input_tokens_seen": 205736045, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.72265625, "step": 9552, "time_per_iteration": 2.498046875 }, { "auxiliary_loss_clip": 0.01110985, "auxiliary_loss_mlp": 0.01032825, "balance_loss_clip": 1.02077866, "balance_loss_mlp": 1.04100657, "epoch": 0.574357432737111, "flos": 21652770176640.0, "grad_norm": 1.8775266542016773, "language_loss": 0.80321455, "learning_rate": 1.617772461696843e-06, "loss": 0.82465267, "num_input_tokens_seen": 205754445, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69921875, "step": 9553, "time_per_iteration": 2.444145917892456 }, { "auxiliary_loss_clip": 0.01110887, "auxiliary_loss_mlp": 0.01028634, "balance_loss_clip": 1.0161767, "balance_loss_mlp": 1.03753901, "epoch": 0.5744175559897791, "flos": 16544728880640.0, "grad_norm": 2.5286288102782297, "language_loss": 0.83549953, "learning_rate": 1.6173901858644895e-06, "loss": 0.85689473, "num_input_tokens_seen": 205770595, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.734375, "step": 9554, "time_per_iteration": 2.4200425148010254 }, { "auxiliary_loss_clip": 0.01111931, "auxiliary_loss_mlp": 0.01035276, "balance_loss_clip": 1.02243745, "balance_loss_mlp": 1.03942192, "epoch": 0.574477679242447, "flos": 24207598880640.0, "grad_norm": 1.5188232483908268, "language_loss": 0.70922446, "learning_rate": 1.6170079245407385e-06, "loss": 0.7306965, "num_input_tokens_seen": 205791935, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.72265625, "step": 9555, "time_per_iteration": 2.4868404865264893 }, { "auxiliary_loss_clip": 0.01110639, "auxiliary_loss_mlp": 0.01025389, "balance_loss_clip": 1.01216853, "balance_loss_mlp": 1.03975511, "epoch": 0.574537802495115, "flos": 14903000835840.0, "grad_norm": 2.2206201870171465, "language_loss": 0.73031282, "learning_rate": 1.6166256777400853e-06, "loss": 0.75167316, "num_input_tokens_seen": 205807260, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 9556, "time_per_iteration": 2.4538745880126953 }, { "auxiliary_loss_clip": 0.01110959, "auxiliary_loss_mlp": 0.01026889, "balance_loss_clip": 1.01446164, "balance_loss_mlp": 1.0404644, "epoch": 0.5745979257477829, "flos": 24934969290240.0, "grad_norm": 1.7178389168975325, "language_loss": 0.74169672, "learning_rate": 1.6162434454770248e-06, "loss": 0.76307523, "num_input_tokens_seen": 205826885, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 9557, "time_per_iteration": 2.4961044788360596 }, { "auxiliary_loss_clip": 0.01111907, "auxiliary_loss_mlp": 0.01032363, "balance_loss_clip": 1.02029324, "balance_loss_mlp": 1.04101586, "epoch": 0.5746580490004509, "flos": 17235757704960.0, "grad_norm": 1.6020444713879576, "language_loss": 0.67870861, "learning_rate": 1.6158612277660514e-06, "loss": 0.70015132, "num_input_tokens_seen": 205844630, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.7109375, "step": 9558, "time_per_iteration": 2.441063642501831 }, { "auxiliary_loss_clip": 0.01115081, "auxiliary_loss_mlp": 0.01039973, "balance_loss_clip": 1.02542984, "balance_loss_mlp": 1.03959775, "epoch": 0.5747181722531189, "flos": 13187871348480.0, "grad_norm": 2.021643185155347, "language_loss": 0.70569944, "learning_rate": 1.615479024621659e-06, "loss": 0.72724998, "num_input_tokens_seen": 205860960, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.75390625, "step": 9559, "time_per_iteration": 2.4289839267730713 }, { "auxiliary_loss_clip": 0.01109953, "auxiliary_loss_mlp": 0.01027468, "balance_loss_clip": 1.01677513, "balance_loss_mlp": 1.04077101, "epoch": 0.5747782955057869, "flos": 22963006581120.0, "grad_norm": 1.6087493220405502, "language_loss": 0.79238701, "learning_rate": 1.6150968360583398e-06, "loss": 0.81376123, "num_input_tokens_seen": 205880675, "router_z_loss_clip": 0.10693359, "router_z_loss_mlp": 0.6953125, "step": 9560, "time_per_iteration": 2.4940476417541504 }, { "auxiliary_loss_clip": 0.01109271, "auxiliary_loss_mlp": 0.01025661, "balance_loss_clip": 1.01326275, "balance_loss_mlp": 1.03858209, "epoch": 0.5748384187584549, "flos": 23403235668480.0, "grad_norm": 1.6671533037088837, "language_loss": 0.64352036, "learning_rate": 1.614714662090588e-06, "loss": 0.66486967, "num_input_tokens_seen": 205900050, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 9561, "time_per_iteration": 2.480365037918091 }, { "auxiliary_loss_clip": 0.01116831, "auxiliary_loss_mlp": 0.01039143, "balance_loss_clip": 1.02623296, "balance_loss_mlp": 1.04196227, "epoch": 0.5748985420111228, "flos": 17785514338560.0, "grad_norm": 1.75780453976018, "language_loss": 0.71197081, "learning_rate": 1.6143325027328945e-06, "loss": 0.73353046, "num_input_tokens_seen": 205918855, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.75, "step": 9562, "time_per_iteration": 2.4503402709960938 }, { "auxiliary_loss_clip": 0.01109747, "auxiliary_loss_mlp": 0.01031121, "balance_loss_clip": 1.01961744, "balance_loss_mlp": 1.03971243, "epoch": 0.5749586652637908, "flos": 19866250408320.0, "grad_norm": 1.5937973685348168, "language_loss": 0.84088552, "learning_rate": 1.613950357999751e-06, "loss": 0.8622942, "num_input_tokens_seen": 205936970, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.69921875, "step": 9563, "time_per_iteration": 2.4416441917419434 }, { "auxiliary_loss_clip": 0.01111926, "auxiliary_loss_mlp": 0.01036842, "balance_loss_clip": 1.02446783, "balance_loss_mlp": 1.03905344, "epoch": 0.5750187885164587, "flos": 21287235421440.0, "grad_norm": 2.046899339656097, "language_loss": 0.57513797, "learning_rate": 1.6135682279056488e-06, "loss": 0.59662569, "num_input_tokens_seen": 205954630, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7265625, "step": 9564, "time_per_iteration": 2.4802474975585938 }, { "auxiliary_loss_clip": 0.01103796, "auxiliary_loss_mlp": 0.01026466, "balance_loss_clip": 1.01536179, "balance_loss_mlp": 1.0373733, "epoch": 0.5750789117691267, "flos": 18804658924800.0, "grad_norm": 2.0048181022318534, "language_loss": 0.75837898, "learning_rate": 1.613186112465078e-06, "loss": 0.77968168, "num_input_tokens_seen": 205971510, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.6640625, "step": 9565, "time_per_iteration": 2.438419818878174 }, { "auxiliary_loss_clip": 0.01035276, "auxiliary_loss_mlp": 0.01010397, "balance_loss_clip": 1.00908554, "balance_loss_mlp": 1.01122594, "epoch": 0.5751390350217946, "flos": 70663224124800.0, "grad_norm": 0.742751519567327, "language_loss": 0.60781509, "learning_rate": 1.6128040116925287e-06, "loss": 0.62827188, "num_input_tokens_seen": 206035125, "router_z_loss_clip": 0.01312256, "router_z_loss_mlp": 0.24023438, "step": 9566, "time_per_iteration": 3.1779606342315674 }, { "auxiliary_loss_clip": 0.01108438, "auxiliary_loss_mlp": 0.01033654, "balance_loss_clip": 1.02191222, "balance_loss_mlp": 1.03944492, "epoch": 0.5751991582744627, "flos": 14246338348800.0, "grad_norm": 2.031778816959953, "language_loss": 0.7547251, "learning_rate": 1.6124219256024901e-06, "loss": 0.77614605, "num_input_tokens_seen": 206052075, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69140625, "step": 9567, "time_per_iteration": 2.429733991622925 }, { "auxiliary_loss_clip": 0.01108703, "auxiliary_loss_mlp": 0.01029057, "balance_loss_clip": 1.0174644, "balance_loss_mlp": 1.03915811, "epoch": 0.5752592815271306, "flos": 18328160079360.0, "grad_norm": 2.4210003600052215, "language_loss": 0.74827486, "learning_rate": 1.6120398542094504e-06, "loss": 0.76965255, "num_input_tokens_seen": 206069970, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6953125, "step": 9568, "time_per_iteration": 2.4621264934539795 }, { "auxiliary_loss_clip": 0.0110864, "auxiliary_loss_mlp": 0.01028676, "balance_loss_clip": 1.01660049, "balance_loss_mlp": 1.03826237, "epoch": 0.5753194047797986, "flos": 20922742160640.0, "grad_norm": 1.8234999334277118, "language_loss": 0.71307325, "learning_rate": 1.6116577975278994e-06, "loss": 0.73444635, "num_input_tokens_seen": 206088950, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.703125, "step": 9569, "time_per_iteration": 2.461378335952759 }, { "auxiliary_loss_clip": 0.01111181, "auxiliary_loss_mlp": 0.01037615, "balance_loss_clip": 1.02514553, "balance_loss_mlp": 1.03968, "epoch": 0.5753795280324665, "flos": 19281804215040.0, "grad_norm": 2.2001141994155464, "language_loss": 0.55296928, "learning_rate": 1.6112757555723223e-06, "loss": 0.57445717, "num_input_tokens_seen": 206107780, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71484375, "step": 9570, "time_per_iteration": 2.4501028060913086 }, { "auxiliary_loss_clip": 0.011057, "auxiliary_loss_mlp": 0.01030924, "balance_loss_clip": 1.01928329, "balance_loss_mlp": 1.03706419, "epoch": 0.5754396512851345, "flos": 21652877917440.0, "grad_norm": 1.6478582921819818, "language_loss": 0.64515221, "learning_rate": 1.6108937283572082e-06, "loss": 0.66651845, "num_input_tokens_seen": 206127445, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6875, "step": 9571, "time_per_iteration": 2.4898929595947266 }, { "auxiliary_loss_clip": 0.01106498, "auxiliary_loss_mlp": 0.01026386, "balance_loss_clip": 1.01434636, "balance_loss_mlp": 1.03712678, "epoch": 0.5754997745378025, "flos": 51021700179840.0, "grad_norm": 1.5326294422504827, "language_loss": 0.67246687, "learning_rate": 1.6105117158970434e-06, "loss": 0.69379574, "num_input_tokens_seen": 206152005, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 9572, "time_per_iteration": 2.751193046569824 }, { "auxiliary_loss_clip": 0.01109415, "auxiliary_loss_mlp": 0.01030871, "balance_loss_clip": 1.01877105, "balance_loss_mlp": 1.03975713, "epoch": 0.5755598977904705, "flos": 22856890826880.0, "grad_norm": 2.348210995960437, "language_loss": 0.72189826, "learning_rate": 1.6101297182063123e-06, "loss": 0.74330109, "num_input_tokens_seen": 206169875, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 9573, "time_per_iteration": 2.4647011756896973 }, { "auxiliary_loss_clip": 0.01106975, "auxiliary_loss_mlp": 0.01027154, "balance_loss_clip": 1.01656818, "balance_loss_mlp": 1.04150987, "epoch": 0.5756200210431385, "flos": 38472824805120.0, "grad_norm": 2.027365072967812, "language_loss": 0.76360786, "learning_rate": 1.6097477352995022e-06, "loss": 0.78494918, "num_input_tokens_seen": 206192635, "router_z_loss_clip": 0.10595703, "router_z_loss_mlp": 0.65625, "step": 9574, "time_per_iteration": 2.6359920501708984 }, { "auxiliary_loss_clip": 0.01113509, "auxiliary_loss_mlp": 0.01028249, "balance_loss_clip": 1.01524365, "balance_loss_mlp": 1.03973579, "epoch": 0.5756801442958064, "flos": 23910006700800.0, "grad_norm": 2.428378171892855, "language_loss": 0.66773522, "learning_rate": 1.6093657671910968e-06, "loss": 0.68915284, "num_input_tokens_seen": 206211485, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 9575, "time_per_iteration": 2.466050863265991 }, { "auxiliary_loss_clip": 0.0110721, "auxiliary_loss_mlp": 0.01029682, "balance_loss_clip": 1.01815486, "balance_loss_mlp": 1.04024267, "epoch": 0.5757402675484744, "flos": 21105276099840.0, "grad_norm": 1.6083859037852979, "language_loss": 0.79932767, "learning_rate": 1.6089838138955804e-06, "loss": 0.82069659, "num_input_tokens_seen": 206231740, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 9576, "time_per_iteration": 2.4760794639587402 }, { "auxiliary_loss_clip": 0.01106851, "auxiliary_loss_mlp": 0.01025425, "balance_loss_clip": 1.01429129, "balance_loss_mlp": 1.03859115, "epoch": 0.5758003908011423, "flos": 20559110826240.0, "grad_norm": 2.071235121566365, "language_loss": 0.69976699, "learning_rate": 1.6086018754274372e-06, "loss": 0.72108972, "num_input_tokens_seen": 206250975, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.68359375, "step": 9577, "time_per_iteration": 2.453691005706787 }, { "auxiliary_loss_clip": 0.01110413, "auxiliary_loss_mlp": 0.01028795, "balance_loss_clip": 1.0171181, "balance_loss_mlp": 1.03812075, "epoch": 0.5758605140538103, "flos": 16473015377280.0, "grad_norm": 1.8842721569512784, "language_loss": 0.66601455, "learning_rate": 1.6082199518011504e-06, "loss": 0.68740666, "num_input_tokens_seen": 206268800, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.72265625, "step": 9578, "time_per_iteration": 2.4527432918548584 }, { "auxiliary_loss_clip": 0.01106259, "auxiliary_loss_mlp": 0.01025032, "balance_loss_clip": 1.01371944, "balance_loss_mlp": 1.03808331, "epoch": 0.5759206373064782, "flos": 21287558643840.0, "grad_norm": 2.000422681581114, "language_loss": 0.72869837, "learning_rate": 1.6078380430312016e-06, "loss": 0.75001121, "num_input_tokens_seen": 206287190, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.68359375, "step": 9579, "time_per_iteration": 2.4495158195495605 }, { "auxiliary_loss_clip": 0.0111411, "auxiliary_loss_mlp": 0.01028539, "balance_loss_clip": 1.01540256, "balance_loss_mlp": 1.04019499, "epoch": 0.5759807605591463, "flos": 26067879227520.0, "grad_norm": 2.589405995238316, "language_loss": 0.64842147, "learning_rate": 1.6074561491320742e-06, "loss": 0.66984797, "num_input_tokens_seen": 206307020, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 9580, "time_per_iteration": 2.5077192783355713 }, { "auxiliary_loss_clip": 0.01110448, "auxiliary_loss_mlp": 0.01031314, "balance_loss_clip": 1.01874948, "balance_loss_mlp": 1.03967929, "epoch": 0.5760408838118142, "flos": 18873068376960.0, "grad_norm": 1.8115033849005462, "language_loss": 0.85345447, "learning_rate": 1.6070742701182486e-06, "loss": 0.87487215, "num_input_tokens_seen": 206324095, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 9581, "time_per_iteration": 2.433450937271118 }, { "auxiliary_loss_clip": 0.01117287, "auxiliary_loss_mlp": 0.0103435, "balance_loss_clip": 1.02146363, "balance_loss_mlp": 1.04293561, "epoch": 0.5761010070644822, "flos": 15378134964480.0, "grad_norm": 2.304382851067931, "language_loss": 0.67647463, "learning_rate": 1.6066924060042057e-06, "loss": 0.69799101, "num_input_tokens_seen": 206343210, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 9582, "time_per_iteration": 2.4683053493499756 }, { "auxiliary_loss_clip": 0.0103591, "auxiliary_loss_mlp": 0.01004178, "balance_loss_clip": 1.00287271, "balance_loss_mlp": 1.01189399, "epoch": 0.5761611303171501, "flos": 71471932882560.0, "grad_norm": 0.6459519362471372, "language_loss": 0.57294071, "learning_rate": 1.6063105568044271e-06, "loss": 0.59334159, "num_input_tokens_seen": 206415935, "router_z_loss_clip": 0.01306152, "router_z_loss_mlp": 0.24023438, "step": 9583, "time_per_iteration": 3.2418837547302246 }, { "auxiliary_loss_clip": 0.01111078, "auxiliary_loss_mlp": 0.01026595, "balance_loss_clip": 1.01449561, "balance_loss_mlp": 1.0401473, "epoch": 0.5762212535698181, "flos": 16246167033600.0, "grad_norm": 1.9392747453186419, "language_loss": 0.82634664, "learning_rate": 1.6059287225333912e-06, "loss": 0.84772336, "num_input_tokens_seen": 206431900, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.70703125, "step": 9584, "time_per_iteration": 2.464006185531616 }, { "auxiliary_loss_clip": 0.01034944, "auxiliary_loss_mlp": 0.01006021, "balance_loss_clip": 1.00464368, "balance_loss_mlp": 1.01090169, "epoch": 0.5762813768224861, "flos": 70185504216960.0, "grad_norm": 0.6889406611716864, "language_loss": 0.49560073, "learning_rate": 1.6055469032055773e-06, "loss": 0.5160104, "num_input_tokens_seen": 206501200, "router_z_loss_clip": 0.01379395, "router_z_loss_mlp": 0.24023438, "step": 9585, "time_per_iteration": 3.163912773132324 }, { "auxiliary_loss_clip": 0.01105479, "auxiliary_loss_mlp": 0.0102508, "balance_loss_clip": 1.01368952, "balance_loss_mlp": 1.03652287, "epoch": 0.5763415000751541, "flos": 20518028645760.0, "grad_norm": 1.648582095842301, "language_loss": 0.84843349, "learning_rate": 1.605165098835465e-06, "loss": 0.86973912, "num_input_tokens_seen": 206520575, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6875, "step": 9586, "time_per_iteration": 2.4828031063079834 }, { "auxiliary_loss_clip": 0.01107834, "auxiliary_loss_mlp": 0.0103016, "balance_loss_clip": 1.01729155, "balance_loss_mlp": 1.03765047, "epoch": 0.5764016233278221, "flos": 15815526877440.0, "grad_norm": 1.8170082229088473, "language_loss": 0.80173552, "learning_rate": 1.6047833094375308e-06, "loss": 0.82311547, "num_input_tokens_seen": 206538060, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 9587, "time_per_iteration": 2.4239609241485596 }, { "auxiliary_loss_clip": 0.0110823, "auxiliary_loss_mlp": 0.01029555, "balance_loss_clip": 1.01684093, "balance_loss_mlp": 1.03865647, "epoch": 0.57646174658049, "flos": 20772312001920.0, "grad_norm": 1.517983513272377, "language_loss": 0.65812171, "learning_rate": 1.6044015350262542e-06, "loss": 0.67949951, "num_input_tokens_seen": 206557320, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 9588, "time_per_iteration": 3.856775999069214 }, { "auxiliary_loss_clip": 0.0111094, "auxiliary_loss_mlp": 0.01031129, "balance_loss_clip": 1.01763403, "balance_loss_mlp": 1.03994811, "epoch": 0.576521869833158, "flos": 23549930812800.0, "grad_norm": 2.0329768465053206, "language_loss": 0.78551614, "learning_rate": 1.6040197756161104e-06, "loss": 0.80693686, "num_input_tokens_seen": 206575780, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 9589, "time_per_iteration": 2.51019549369812 }, { "auxiliary_loss_clip": 0.01104723, "auxiliary_loss_mlp": 0.01023869, "balance_loss_clip": 1.01249063, "balance_loss_mlp": 1.03694725, "epoch": 0.5765819930858259, "flos": 20266582464000.0, "grad_norm": 2.0347134751380698, "language_loss": 0.79653454, "learning_rate": 1.6036380312215762e-06, "loss": 0.81782043, "num_input_tokens_seen": 206594100, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.67578125, "step": 9590, "time_per_iteration": 2.4520761966705322 }, { "auxiliary_loss_clip": 0.0110813, "auxiliary_loss_mlp": 0.01025552, "balance_loss_clip": 1.01406014, "balance_loss_mlp": 1.03825295, "epoch": 0.5766421163384939, "flos": 23148772744320.0, "grad_norm": 2.6835170814497795, "language_loss": 0.63407224, "learning_rate": 1.6032563018571283e-06, "loss": 0.6554091, "num_input_tokens_seen": 206613325, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.69921875, "step": 9591, "time_per_iteration": 3.967024326324463 }, { "auxiliary_loss_clip": 0.0111127, "auxiliary_loss_mlp": 0.01035509, "balance_loss_clip": 1.02265859, "balance_loss_mlp": 1.04016495, "epoch": 0.5767022395911618, "flos": 25848895962240.0, "grad_norm": 1.9282858493892217, "language_loss": 0.78227139, "learning_rate": 1.6028745875372406e-06, "loss": 0.80373919, "num_input_tokens_seen": 206634265, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 9592, "time_per_iteration": 5.320594549179077 }, { "auxiliary_loss_clip": 0.01034987, "auxiliary_loss_mlp": 0.01004423, "balance_loss_clip": 1.00316525, "balance_loss_mlp": 1.01087749, "epoch": 0.5767623628438299, "flos": 68293299657600.0, "grad_norm": 0.7387188420140334, "language_loss": 0.59650403, "learning_rate": 1.6024928882763885e-06, "loss": 0.61689812, "num_input_tokens_seen": 206696990, "router_z_loss_clip": 0.01257324, "router_z_loss_mlp": 0.24121094, "step": 9593, "time_per_iteration": 3.229034185409546 }, { "auxiliary_loss_clip": 0.01112187, "auxiliary_loss_mlp": 0.01037459, "balance_loss_clip": 1.02456641, "balance_loss_mlp": 1.03929555, "epoch": 0.5768224860964978, "flos": 30188448754560.0, "grad_norm": 1.7107444445899418, "language_loss": 0.71320713, "learning_rate": 1.6021112040890463e-06, "loss": 0.7347036, "num_input_tokens_seen": 206717815, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 9594, "time_per_iteration": 2.5432887077331543 }, { "auxiliary_loss_clip": 0.0110977, "auxiliary_loss_mlp": 0.01030217, "balance_loss_clip": 1.0190233, "balance_loss_mlp": 1.03918672, "epoch": 0.5768826093491658, "flos": 17895041884800.0, "grad_norm": 2.062330033775566, "language_loss": 0.71147716, "learning_rate": 1.6017295349896863e-06, "loss": 0.73287702, "num_input_tokens_seen": 206735985, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.703125, "step": 9595, "time_per_iteration": 2.4784281253814697 }, { "auxiliary_loss_clip": 0.0110759, "auxiliary_loss_mlp": 0.01028638, "balance_loss_clip": 1.0157752, "balance_loss_mlp": 1.03734112, "epoch": 0.5769427326018337, "flos": 17457183095040.0, "grad_norm": 2.865569821706833, "language_loss": 0.6943264, "learning_rate": 1.6013478809927828e-06, "loss": 0.71568865, "num_input_tokens_seen": 206753370, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 9596, "time_per_iteration": 2.446455955505371 }, { "auxiliary_loss_clip": 0.01114769, "auxiliary_loss_mlp": 0.01036413, "balance_loss_clip": 1.02260828, "balance_loss_mlp": 1.03948927, "epoch": 0.5770028558545017, "flos": 39421728345600.0, "grad_norm": 1.9300973692295387, "language_loss": 0.67264473, "learning_rate": 1.6009662421128074e-06, "loss": 0.69415659, "num_input_tokens_seen": 206777645, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 9597, "time_per_iteration": 2.6546168327331543 }, { "auxiliary_loss_clip": 0.01109844, "auxiliary_loss_mlp": 0.0103011, "balance_loss_clip": 1.0172652, "balance_loss_mlp": 1.03956556, "epoch": 0.5770629791071697, "flos": 21536383132800.0, "grad_norm": 1.8676022557479972, "language_loss": 0.82024825, "learning_rate": 1.6005846183642323e-06, "loss": 0.84164774, "num_input_tokens_seen": 206794865, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 9598, "time_per_iteration": 2.469395637512207 }, { "auxiliary_loss_clip": 0.01111177, "auxiliary_loss_mlp": 0.01031332, "balance_loss_clip": 1.01818299, "balance_loss_mlp": 1.03978086, "epoch": 0.5771231023598377, "flos": 20886795624960.0, "grad_norm": 1.4196843189946122, "language_loss": 0.7287488, "learning_rate": 1.6002030097615277e-06, "loss": 0.75017393, "num_input_tokens_seen": 206814095, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 9599, "time_per_iteration": 2.486410617828369 }, { "auxiliary_loss_clip": 0.01106115, "auxiliary_loss_mlp": 0.01027457, "balance_loss_clip": 1.01532698, "balance_loss_mlp": 1.03821397, "epoch": 0.5771832256125057, "flos": 18077216688000.0, "grad_norm": 2.848866081204188, "language_loss": 0.78285307, "learning_rate": 1.5998214163191663e-06, "loss": 0.80418879, "num_input_tokens_seen": 206832245, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 9600, "time_per_iteration": 2.4442005157470703 }, { "auxiliary_loss_clip": 0.01112978, "auxiliary_loss_mlp": 0.01037787, "balance_loss_clip": 1.02523994, "balance_loss_mlp": 1.04091334, "epoch": 0.5772433488651736, "flos": 26359078786560.0, "grad_norm": 2.6948509283364377, "language_loss": 0.72270286, "learning_rate": 1.5994398380516163e-06, "loss": 0.7442106, "num_input_tokens_seen": 206851535, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 9601, "time_per_iteration": 2.530442714691162 }, { "auxiliary_loss_clip": 0.01111371, "auxiliary_loss_mlp": 0.01033396, "balance_loss_clip": 1.02129579, "balance_loss_mlp": 1.04185212, "epoch": 0.5773034721178416, "flos": 19680987035520.0, "grad_norm": 1.782552246002425, "language_loss": 0.68722647, "learning_rate": 1.599058274973348e-06, "loss": 0.70867419, "num_input_tokens_seen": 206870595, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 9602, "time_per_iteration": 2.4580752849578857 }, { "auxiliary_loss_clip": 0.01106383, "auxiliary_loss_mlp": 0.01032484, "balance_loss_clip": 1.02080703, "balance_loss_mlp": 1.03971994, "epoch": 0.5773635953705095, "flos": 25082885496960.0, "grad_norm": 1.469542656524274, "language_loss": 0.73064089, "learning_rate": 1.5986767270988297e-06, "loss": 0.75202954, "num_input_tokens_seen": 206892320, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.66796875, "step": 9603, "time_per_iteration": 2.538100481033325 }, { "auxiliary_loss_clip": 0.0110857, "auxiliary_loss_mlp": 0.01024794, "balance_loss_clip": 1.01274228, "balance_loss_mlp": 1.03927374, "epoch": 0.5774237186231775, "flos": 21032987978880.0, "grad_norm": 1.6367469334703957, "language_loss": 0.76394308, "learning_rate": 1.5982951944425298e-06, "loss": 0.78527677, "num_input_tokens_seen": 206912485, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69140625, "step": 9604, "time_per_iteration": 2.4656057357788086 }, { "auxiliary_loss_clip": 0.01113444, "auxiliary_loss_mlp": 0.0103292, "balance_loss_clip": 1.01975942, "balance_loss_mlp": 1.04121733, "epoch": 0.5774838418758454, "flos": 15231727128960.0, "grad_norm": 1.832036885074335, "language_loss": 0.83369672, "learning_rate": 1.5979136770189174e-06, "loss": 0.8551603, "num_input_tokens_seen": 206929100, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 9605, "time_per_iteration": 2.4753851890563965 }, { "auxiliary_loss_clip": 0.01116594, "auxiliary_loss_mlp": 0.01031012, "balance_loss_clip": 1.0161047, "balance_loss_mlp": 1.04141331, "epoch": 0.5775439651285135, "flos": 23582609210880.0, "grad_norm": 3.6981369174859537, "language_loss": 0.7842921, "learning_rate": 1.5975321748424581e-06, "loss": 0.80576813, "num_input_tokens_seen": 206947020, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.75, "step": 9606, "time_per_iteration": 2.4917142391204834 }, { "auxiliary_loss_clip": 0.0110921, "auxiliary_loss_mlp": 0.01030835, "balance_loss_clip": 1.01879513, "balance_loss_mlp": 1.03953075, "epoch": 0.5776040883811814, "flos": 18040515966720.0, "grad_norm": 2.2023087638839343, "language_loss": 0.74017298, "learning_rate": 1.597150687927619e-06, "loss": 0.76157343, "num_input_tokens_seen": 206964065, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 9607, "time_per_iteration": 2.454507350921631 }, { "auxiliary_loss_clip": 0.01113231, "auxiliary_loss_mlp": 0.01030912, "balance_loss_clip": 1.01847839, "balance_loss_mlp": 1.04146481, "epoch": 0.5776642116338494, "flos": 18624638937600.0, "grad_norm": 1.677103758776312, "language_loss": 0.69056451, "learning_rate": 1.5967692162888664e-06, "loss": 0.71200597, "num_input_tokens_seen": 206981940, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.71875, "step": 9608, "time_per_iteration": 2.4497146606445312 }, { "auxiliary_loss_clip": 0.01110679, "auxiliary_loss_mlp": 0.01029162, "balance_loss_clip": 1.01662719, "balance_loss_mlp": 1.03922629, "epoch": 0.5777243348865173, "flos": 28402539517440.0, "grad_norm": 3.905876272297977, "language_loss": 0.76407099, "learning_rate": 1.596387759940665e-06, "loss": 0.78546941, "num_input_tokens_seen": 207002365, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 9609, "time_per_iteration": 2.537377119064331 }, { "auxiliary_loss_clip": 0.01110161, "auxiliary_loss_mlp": 0.01033899, "balance_loss_clip": 1.02153718, "balance_loss_mlp": 1.03829765, "epoch": 0.5777844581391853, "flos": 24024705805440.0, "grad_norm": 1.7474119911497168, "language_loss": 0.77290118, "learning_rate": 1.5960063188974808e-06, "loss": 0.7943418, "num_input_tokens_seen": 207021195, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71875, "step": 9610, "time_per_iteration": 2.495175361633301 }, { "auxiliary_loss_clip": 0.0110962, "auxiliary_loss_mlp": 0.01029208, "balance_loss_clip": 1.01587999, "balance_loss_mlp": 1.03893816, "epoch": 0.5778445813918534, "flos": 17777361951360.0, "grad_norm": 2.695724720725423, "language_loss": 0.6867584, "learning_rate": 1.5956248931737777e-06, "loss": 0.70814657, "num_input_tokens_seen": 207037465, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.70703125, "step": 9611, "time_per_iteration": 2.462738037109375 }, { "auxiliary_loss_clip": 0.01108544, "auxiliary_loss_mlp": 0.01028117, "balance_loss_clip": 1.01527166, "balance_loss_mlp": 1.03848219, "epoch": 0.5779047046445213, "flos": 22233194046720.0, "grad_norm": 1.891702232959501, "language_loss": 0.83259338, "learning_rate": 1.5952434827840185e-06, "loss": 0.85395998, "num_input_tokens_seen": 207054230, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 9612, "time_per_iteration": 2.4650683403015137 }, { "auxiliary_loss_clip": 0.01110083, "auxiliary_loss_mlp": 0.01033051, "balance_loss_clip": 1.02046227, "balance_loss_mlp": 1.03975868, "epoch": 0.5779648278971893, "flos": 21434361528960.0, "grad_norm": 1.6812966513022263, "language_loss": 0.79673195, "learning_rate": 1.594862087742667e-06, "loss": 0.81816328, "num_input_tokens_seen": 207073150, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 9613, "time_per_iteration": 2.495490550994873 }, { "auxiliary_loss_clip": 0.01105168, "auxiliary_loss_mlp": 0.01030465, "balance_loss_clip": 1.01869321, "balance_loss_mlp": 1.0358361, "epoch": 0.5780249511498572, "flos": 19026120228480.0, "grad_norm": 1.9538354314016877, "language_loss": 0.77413177, "learning_rate": 1.5944807080641863e-06, "loss": 0.79548812, "num_input_tokens_seen": 207090375, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.69140625, "step": 9614, "time_per_iteration": 2.4442639350891113 }, { "auxiliary_loss_clip": 0.01112116, "auxiliary_loss_mlp": 0.01035242, "balance_loss_clip": 1.02298713, "balance_loss_mlp": 1.03977108, "epoch": 0.5780850744025252, "flos": 12124663752960.0, "grad_norm": 2.9074391382789346, "language_loss": 0.81165206, "learning_rate": 1.5940993437630375e-06, "loss": 0.83312571, "num_input_tokens_seen": 207106030, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.72265625, "step": 9615, "time_per_iteration": 2.4250171184539795 }, { "auxiliary_loss_clip": 0.01111512, "auxiliary_loss_mlp": 0.01029009, "balance_loss_clip": 1.01622939, "balance_loss_mlp": 1.04014874, "epoch": 0.5781451976551931, "flos": 25044425009280.0, "grad_norm": 1.819252227152331, "language_loss": 0.6736179, "learning_rate": 1.5937179948536825e-06, "loss": 0.69502318, "num_input_tokens_seen": 207125435, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 9616, "time_per_iteration": 2.5021090507507324 }, { "auxiliary_loss_clip": 0.01107972, "auxiliary_loss_mlp": 0.01027589, "balance_loss_clip": 1.01507211, "balance_loss_mlp": 1.0395267, "epoch": 0.5782053209078611, "flos": 19245606284160.0, "grad_norm": 1.6351815150842504, "language_loss": 0.78022957, "learning_rate": 1.5933366613505812e-06, "loss": 0.80158514, "num_input_tokens_seen": 207145095, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 9617, "time_per_iteration": 2.4664909839630127 }, { "auxiliary_loss_clip": 0.01109385, "auxiliary_loss_mlp": 0.01032053, "balance_loss_clip": 1.0190177, "balance_loss_mlp": 1.03952646, "epoch": 0.578265444160529, "flos": 25993831340160.0, "grad_norm": 1.9164663875502976, "language_loss": 0.75137925, "learning_rate": 1.5929553432681947e-06, "loss": 0.77279371, "num_input_tokens_seen": 207166045, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 9618, "time_per_iteration": 2.4896976947784424 }, { "auxiliary_loss_clip": 0.01107622, "auxiliary_loss_mlp": 0.01028395, "balance_loss_clip": 1.01643777, "balance_loss_mlp": 1.0383271, "epoch": 0.5783255674131971, "flos": 21798603394560.0, "grad_norm": 1.9484910186275477, "language_loss": 0.81110907, "learning_rate": 1.5925740406209826e-06, "loss": 0.83246922, "num_input_tokens_seen": 207185290, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 9619, "time_per_iteration": 2.470590353012085 }, { "auxiliary_loss_clip": 0.01111728, "auxiliary_loss_mlp": 0.01035459, "balance_loss_clip": 1.0231086, "balance_loss_mlp": 1.03987908, "epoch": 0.578385690665865, "flos": 24789746603520.0, "grad_norm": 1.8585617174731113, "language_loss": 0.72369063, "learning_rate": 1.5921927534234039e-06, "loss": 0.74516249, "num_input_tokens_seen": 207205505, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71875, "step": 9620, "time_per_iteration": 2.4897494316101074 }, { "auxiliary_loss_clip": 0.01113055, "auxiliary_loss_mlp": 0.01029305, "balance_loss_clip": 1.01660895, "balance_loss_mlp": 1.04146266, "epoch": 0.578445813918533, "flos": 21212864311680.0, "grad_norm": 1.6696710965710722, "language_loss": 0.76980007, "learning_rate": 1.591811481689916e-06, "loss": 0.79122365, "num_input_tokens_seen": 207225315, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 9621, "time_per_iteration": 2.488468885421753 }, { "auxiliary_loss_clip": 0.01110842, "auxiliary_loss_mlp": 0.01031986, "balance_loss_clip": 1.01922464, "balance_loss_mlp": 1.03883517, "epoch": 0.5785059371712009, "flos": 25046795306880.0, "grad_norm": 1.6217541866788685, "language_loss": 0.702847, "learning_rate": 1.5914302254349787e-06, "loss": 0.72427523, "num_input_tokens_seen": 207247690, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 9622, "time_per_iteration": 2.504818916320801 }, { "auxiliary_loss_clip": 0.01038974, "auxiliary_loss_mlp": 0.0100079, "balance_loss_clip": 0.99922818, "balance_loss_mlp": 1.014889, "epoch": 0.5785660604238689, "flos": 70843172284800.0, "grad_norm": 0.7814992273754627, "language_loss": 0.56002879, "learning_rate": 1.5910489846730476e-06, "loss": 0.58042639, "num_input_tokens_seen": 207301735, "router_z_loss_clip": 0.01556396, "router_z_loss_mlp": 0.24023438, "step": 9623, "time_per_iteration": 3.152193307876587 }, { "auxiliary_loss_clip": 0.01114956, "auxiliary_loss_mlp": 0.01033959, "balance_loss_clip": 1.0207628, "balance_loss_mlp": 1.04132164, "epoch": 0.578626183676537, "flos": 31649977244160.0, "grad_norm": 1.9853498962592315, "language_loss": 0.71720576, "learning_rate": 1.5906677594185799e-06, "loss": 0.73869491, "num_input_tokens_seen": 207321240, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 9624, "time_per_iteration": 2.608456611633301 }, { "auxiliary_loss_clip": 0.01113041, "auxiliary_loss_mlp": 0.01037806, "balance_loss_clip": 1.02449059, "balance_loss_mlp": 1.04216409, "epoch": 0.5786863069292049, "flos": 21865181253120.0, "grad_norm": 2.3791308696175473, "language_loss": 0.82507885, "learning_rate": 1.5902865496860322e-06, "loss": 0.8465873, "num_input_tokens_seen": 207339540, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 9625, "time_per_iteration": 2.4758989810943604 }, { "auxiliary_loss_clip": 0.01107745, "auxiliary_loss_mlp": 0.01035838, "balance_loss_clip": 1.02228999, "balance_loss_mlp": 1.03854346, "epoch": 0.5787464301818729, "flos": 23364954748800.0, "grad_norm": 2.5335168165925386, "language_loss": 0.70254087, "learning_rate": 1.5899053554898591e-06, "loss": 0.72397673, "num_input_tokens_seen": 207360470, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.69140625, "step": 9626, "time_per_iteration": 2.5126848220825195 }, { "auxiliary_loss_clip": 0.01107939, "auxiliary_loss_mlp": 0.0103668, "balance_loss_clip": 1.02428234, "balance_loss_mlp": 1.03876543, "epoch": 0.5788065534345408, "flos": 30004011394560.0, "grad_norm": 1.5119181992046082, "language_loss": 0.71555573, "learning_rate": 1.5895241768445166e-06, "loss": 0.7370019, "num_input_tokens_seen": 207383080, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 9627, "time_per_iteration": 2.539315938949585 }, { "auxiliary_loss_clip": 0.0110698, "auxiliary_loss_mlp": 0.01030061, "balance_loss_clip": 1.01786041, "balance_loss_mlp": 1.03803158, "epoch": 0.5788666766872088, "flos": 24527849564160.0, "grad_norm": 1.6471069107421876, "language_loss": 0.83702368, "learning_rate": 1.589143013764458e-06, "loss": 0.85839415, "num_input_tokens_seen": 207401000, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 9628, "time_per_iteration": 2.485278606414795 }, { "auxiliary_loss_clip": 0.011099, "auxiliary_loss_mlp": 0.01027913, "balance_loss_clip": 1.01527095, "balance_loss_mlp": 1.03828752, "epoch": 0.5789267999398767, "flos": 23732823888000.0, "grad_norm": 1.518024933902255, "language_loss": 0.7223528, "learning_rate": 1.5887618662641376e-06, "loss": 0.7437309, "num_input_tokens_seen": 207419230, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 9629, "time_per_iteration": 2.5038061141967773 }, { "auxiliary_loss_clip": 0.01111616, "auxiliary_loss_mlp": 0.01033917, "balance_loss_clip": 1.02103055, "balance_loss_mlp": 1.04079819, "epoch": 0.5789869231925447, "flos": 21135045496320.0, "grad_norm": 2.3172205832811956, "language_loss": 0.7449801, "learning_rate": 1.5883807343580087e-06, "loss": 0.76643538, "num_input_tokens_seen": 207437615, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 9630, "time_per_iteration": 3.957085609436035 }, { "auxiliary_loss_clip": 0.01105771, "auxiliary_loss_mlp": 0.01029002, "balance_loss_clip": 1.01693785, "balance_loss_mlp": 1.03775656, "epoch": 0.5790470464452127, "flos": 21209632087680.0, "grad_norm": 2.3040353683702874, "language_loss": 0.79181868, "learning_rate": 1.587999618060523e-06, "loss": 0.81316638, "num_input_tokens_seen": 207457270, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 9631, "time_per_iteration": 2.450317621231079 }, { "auxiliary_loss_clip": 0.01109377, "auxiliary_loss_mlp": 0.01027678, "balance_loss_clip": 1.01550698, "balance_loss_mlp": 1.03890228, "epoch": 0.5791071696978807, "flos": 23404384903680.0, "grad_norm": 1.5866925598844597, "language_loss": 0.7515747, "learning_rate": 1.5876185173861333e-06, "loss": 0.77294523, "num_input_tokens_seen": 207477890, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.703125, "step": 9632, "time_per_iteration": 3.943711996078491 }, { "auxiliary_loss_clip": 0.01107905, "auxiliary_loss_mlp": 0.01025562, "balance_loss_clip": 1.0126164, "balance_loss_mlp": 1.03804135, "epoch": 0.5791672929505486, "flos": 24206521472640.0, "grad_norm": 2.443864450845369, "language_loss": 0.79470444, "learning_rate": 1.5872374323492915e-06, "loss": 0.81603909, "num_input_tokens_seen": 207497670, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 9633, "time_per_iteration": 2.4856045246124268 }, { "auxiliary_loss_clip": 0.01118422, "auxiliary_loss_mlp": 0.01039835, "balance_loss_clip": 1.02601886, "balance_loss_mlp": 1.04214227, "epoch": 0.5792274162032166, "flos": 24348871071360.0, "grad_norm": 2.265408793588708, "language_loss": 0.77277231, "learning_rate": 1.5868563629644464e-06, "loss": 0.79435486, "num_input_tokens_seen": 207516105, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.76171875, "step": 9634, "time_per_iteration": 5.289331674575806 }, { "auxiliary_loss_clip": 0.01112989, "auxiliary_loss_mlp": 0.01038365, "balance_loss_clip": 1.0251801, "balance_loss_mlp": 1.04031539, "epoch": 0.5792875394558845, "flos": 20449403712000.0, "grad_norm": 2.1935556943761476, "language_loss": 0.63436222, "learning_rate": 1.5864753092460502e-06, "loss": 0.6558758, "num_input_tokens_seen": 207533685, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 9635, "time_per_iteration": 2.465245246887207 }, { "auxiliary_loss_clip": 0.01109146, "auxiliary_loss_mlp": 0.01035107, "balance_loss_clip": 1.02277493, "balance_loss_mlp": 1.04015374, "epoch": 0.5793476627085525, "flos": 24060329118720.0, "grad_norm": 1.5426285886779174, "language_loss": 0.77432859, "learning_rate": 1.5860942712085516e-06, "loss": 0.79577118, "num_input_tokens_seen": 207552840, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 9636, "time_per_iteration": 2.4837825298309326 }, { "auxiliary_loss_clip": 0.01105516, "auxiliary_loss_mlp": 0.01028722, "balance_loss_clip": 1.01716459, "balance_loss_mlp": 1.03809738, "epoch": 0.5794077859612206, "flos": 22054287381120.0, "grad_norm": 1.6457626172777502, "language_loss": 0.68533754, "learning_rate": 1.5857132488663998e-06, "loss": 0.70667994, "num_input_tokens_seen": 207572095, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 9637, "time_per_iteration": 2.4723284244537354 }, { "auxiliary_loss_clip": 0.01111943, "auxiliary_loss_mlp": 0.01031743, "balance_loss_clip": 1.01862359, "balance_loss_mlp": 1.03894913, "epoch": 0.5794679092138885, "flos": 11434855991040.0, "grad_norm": 2.7136668488225415, "language_loss": 0.72021002, "learning_rate": 1.585332242234043e-06, "loss": 0.74164689, "num_input_tokens_seen": 207587495, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 9638, "time_per_iteration": 2.41050124168396 }, { "auxiliary_loss_clip": 0.01111661, "auxiliary_loss_mlp": 0.01032654, "balance_loss_clip": 1.02033329, "balance_loss_mlp": 1.04190636, "epoch": 0.5795280324665565, "flos": 18880215183360.0, "grad_norm": 2.432551720183699, "language_loss": 0.7251792, "learning_rate": 1.5849512513259291e-06, "loss": 0.74662232, "num_input_tokens_seen": 207606795, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 9639, "time_per_iteration": 2.4575881958007812 }, { "auxiliary_loss_clip": 0.01112543, "auxiliary_loss_mlp": 0.0103367, "balance_loss_clip": 1.02097964, "balance_loss_mlp": 1.04130518, "epoch": 0.5795881557192244, "flos": 13005947940480.0, "grad_norm": 2.22672241147539, "language_loss": 0.69808263, "learning_rate": 1.5845702761565054e-06, "loss": 0.71954477, "num_input_tokens_seen": 207623620, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 9640, "time_per_iteration": 2.4257547855377197 }, { "auxiliary_loss_clip": 0.01117317, "auxiliary_loss_mlp": 0.01039, "balance_loss_clip": 1.02555323, "balance_loss_mlp": 1.0416162, "epoch": 0.5796482789718924, "flos": 19932397303680.0, "grad_norm": 2.299322367229048, "language_loss": 0.77839255, "learning_rate": 1.5841893167402183e-06, "loss": 0.79995573, "num_input_tokens_seen": 207639380, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 9641, "time_per_iteration": 2.448775291442871 }, { "auxiliary_loss_clip": 0.01110426, "auxiliary_loss_mlp": 0.01035428, "balance_loss_clip": 1.02320921, "balance_loss_mlp": 1.04004669, "epoch": 0.5797084022245603, "flos": 21650794928640.0, "grad_norm": 2.0324491762125296, "language_loss": 0.73717231, "learning_rate": 1.5838083730915143e-06, "loss": 0.75863087, "num_input_tokens_seen": 207657915, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 9642, "time_per_iteration": 2.463923692703247 }, { "auxiliary_loss_clip": 0.01109381, "auxiliary_loss_mlp": 0.01030373, "balance_loss_clip": 1.01793337, "balance_loss_mlp": 1.03935671, "epoch": 0.5797685254772283, "flos": 26031573555840.0, "grad_norm": 1.865434402291452, "language_loss": 0.7372582, "learning_rate": 1.5834274452248378e-06, "loss": 0.75865579, "num_input_tokens_seen": 207678620, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 9643, "time_per_iteration": 2.5115811824798584 }, { "auxiliary_loss_clip": 0.01113306, "auxiliary_loss_mlp": 0.01032901, "balance_loss_clip": 1.01985335, "balance_loss_mlp": 1.04093659, "epoch": 0.5798286487298963, "flos": 22705167778560.0, "grad_norm": 1.9422590154419204, "language_loss": 0.66829163, "learning_rate": 1.5830465331546352e-06, "loss": 0.68975371, "num_input_tokens_seen": 207696980, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 9644, "time_per_iteration": 2.4509196281433105 }, { "auxiliary_loss_clip": 0.0111542, "auxiliary_loss_mlp": 0.01030277, "balance_loss_clip": 1.01693702, "balance_loss_mlp": 1.04134965, "epoch": 0.5798887719825643, "flos": 23148988225920.0, "grad_norm": 2.477729527986968, "language_loss": 0.85385877, "learning_rate": 1.5826656368953496e-06, "loss": 0.87531579, "num_input_tokens_seen": 207714065, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 9645, "time_per_iteration": 2.4732000827789307 }, { "auxiliary_loss_clip": 0.01112305, "auxiliary_loss_mlp": 0.0103305, "balance_loss_clip": 1.02110553, "balance_loss_mlp": 1.0416193, "epoch": 0.5799488952352322, "flos": 24426043441920.0, "grad_norm": 4.871218339523012, "language_loss": 0.75553977, "learning_rate": 1.5822847564614244e-06, "loss": 0.77699339, "num_input_tokens_seen": 207734720, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.70703125, "step": 9646, "time_per_iteration": 2.5077292919158936 }, { "auxiliary_loss_clip": 0.01115884, "auxiliary_loss_mlp": 0.01033076, "balance_loss_clip": 1.01969528, "balance_loss_mlp": 1.04290104, "epoch": 0.5800090184879002, "flos": 38395903829760.0, "grad_norm": 1.971568857199776, "language_loss": 0.59618759, "learning_rate": 1.5819038918673038e-06, "loss": 0.61767721, "num_input_tokens_seen": 207755435, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 9647, "time_per_iteration": 2.6075873374938965 }, { "auxiliary_loss_clip": 0.0111327, "auxiliary_loss_mlp": 0.01036077, "balance_loss_clip": 1.02289832, "balance_loss_mlp": 1.04081511, "epoch": 0.5800691417405681, "flos": 19784840232960.0, "grad_norm": 1.842170873664999, "language_loss": 0.84536517, "learning_rate": 1.5815230431274288e-06, "loss": 0.86685866, "num_input_tokens_seen": 207773570, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 9648, "time_per_iteration": 2.4423940181732178 }, { "auxiliary_loss_clip": 0.01041759, "auxiliary_loss_mlp": 0.01000989, "balance_loss_clip": 0.99957001, "balance_loss_mlp": 1.01695764, "epoch": 0.5801292649932361, "flos": 70314565783680.0, "grad_norm": 0.8619952606090542, "language_loss": 0.62976933, "learning_rate": 1.581142210256242e-06, "loss": 0.65019679, "num_input_tokens_seen": 207830095, "router_z_loss_clip": 0.01416016, "router_z_loss_mlp": 0.24804688, "step": 9649, "time_per_iteration": 3.121187686920166 }, { "auxiliary_loss_clip": 0.011087, "auxiliary_loss_mlp": 0.01027549, "balance_loss_clip": 1.01576567, "balance_loss_mlp": 1.04058456, "epoch": 0.5801893882459042, "flos": 18734812928640.0, "grad_norm": 2.453888291440535, "language_loss": 0.82189816, "learning_rate": 1.5807613932681857e-06, "loss": 0.84326065, "num_input_tokens_seen": 207848555, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 9650, "time_per_iteration": 2.4706311225891113 }, { "auxiliary_loss_clip": 0.01115103, "auxiliary_loss_mlp": 0.01030706, "balance_loss_clip": 1.01787329, "balance_loss_mlp": 1.04091263, "epoch": 0.5802495114985721, "flos": 15596507698560.0, "grad_norm": 3.2390525389518943, "language_loss": 0.77865231, "learning_rate": 1.580380592177698e-06, "loss": 0.8001104, "num_input_tokens_seen": 207867060, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 9651, "time_per_iteration": 2.4428551197052 }, { "auxiliary_loss_clip": 0.01114588, "auxiliary_loss_mlp": 0.01040071, "balance_loss_clip": 1.02662373, "balance_loss_mlp": 1.04240048, "epoch": 0.5803096347512401, "flos": 18255405081600.0, "grad_norm": 2.116056724204624, "language_loss": 0.74527323, "learning_rate": 1.5799998069992213e-06, "loss": 0.76681978, "num_input_tokens_seen": 207884520, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 9652, "time_per_iteration": 2.460692882537842 }, { "auxiliary_loss_clip": 0.01111237, "auxiliary_loss_mlp": 0.01034599, "balance_loss_clip": 1.02090788, "balance_loss_mlp": 1.0380857, "epoch": 0.580369758003908, "flos": 22893160584960.0, "grad_norm": 2.5158496711328113, "language_loss": 0.7668494, "learning_rate": 1.579619037747193e-06, "loss": 0.78830779, "num_input_tokens_seen": 207905370, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 9653, "time_per_iteration": 2.5128321647644043 }, { "auxiliary_loss_clip": 0.01113117, "auxiliary_loss_mlp": 0.01032019, "balance_loss_clip": 1.01817846, "balance_loss_mlp": 1.040977, "epoch": 0.580429881256576, "flos": 18697681244160.0, "grad_norm": 2.5867117463546823, "language_loss": 0.74275744, "learning_rate": 1.5792382844360534e-06, "loss": 0.76420873, "num_input_tokens_seen": 207923790, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 9654, "time_per_iteration": 2.449796676635742 }, { "auxiliary_loss_clip": 0.01108581, "auxiliary_loss_mlp": 0.01034006, "balance_loss_clip": 1.02173352, "balance_loss_mlp": 1.04080427, "epoch": 0.5804900045092439, "flos": 24681978823680.0, "grad_norm": 1.8242423614326233, "language_loss": 0.70245004, "learning_rate": 1.5788575470802408e-06, "loss": 0.72387588, "num_input_tokens_seen": 207942335, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 9655, "time_per_iteration": 2.5064313411712646 }, { "auxiliary_loss_clip": 0.01116084, "auxiliary_loss_mlp": 0.01035911, "balance_loss_clip": 1.02249444, "balance_loss_mlp": 1.04032183, "epoch": 0.580550127761912, "flos": 23112790295040.0, "grad_norm": 2.004515046976396, "language_loss": 0.69328028, "learning_rate": 1.5784768256941915e-06, "loss": 0.71480024, "num_input_tokens_seen": 207961975, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 9656, "time_per_iteration": 2.4693775177001953 }, { "auxiliary_loss_clip": 0.0110741, "auxiliary_loss_mlp": 0.01026799, "balance_loss_clip": 1.01479483, "balance_loss_mlp": 1.0401783, "epoch": 0.5806102510145799, "flos": 18475681236480.0, "grad_norm": 1.8268245065943747, "language_loss": 0.71920729, "learning_rate": 1.5780961202923433e-06, "loss": 0.74054933, "num_input_tokens_seen": 207979520, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 9657, "time_per_iteration": 2.4433200359344482 }, { "auxiliary_loss_clip": 0.01113181, "auxiliary_loss_mlp": 0.01035515, "balance_loss_clip": 1.02141237, "balance_loss_mlp": 1.03925383, "epoch": 0.5806703742672479, "flos": 23915645136000.0, "grad_norm": 2.1874047045339755, "language_loss": 0.71380347, "learning_rate": 1.5777154308891328e-06, "loss": 0.73529053, "num_input_tokens_seen": 207998375, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7421875, "step": 9658, "time_per_iteration": 2.464815616607666 }, { "auxiliary_loss_clip": 0.01039075, "auxiliary_loss_mlp": 0.01000023, "balance_loss_clip": 0.99865848, "balance_loss_mlp": 1.01447344, "epoch": 0.5807304975199158, "flos": 66311999412480.0, "grad_norm": 0.6730365529126588, "language_loss": 0.53537321, "learning_rate": 1.5773347574989953e-06, "loss": 0.5557642, "num_input_tokens_seen": 208060605, "router_z_loss_clip": 0.01367188, "router_z_loss_mlp": 0.24609375, "step": 9659, "time_per_iteration": 3.136693239212036 }, { "auxiliary_loss_clip": 0.01113531, "auxiliary_loss_mlp": 0.01039882, "balance_loss_clip": 1.02645922, "balance_loss_mlp": 1.04059553, "epoch": 0.5807906207725838, "flos": 31722444933120.0, "grad_norm": 2.212602309170449, "language_loss": 0.62357724, "learning_rate": 1.576954100136366e-06, "loss": 0.64511144, "num_input_tokens_seen": 208080320, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 9660, "time_per_iteration": 2.556032657623291 }, { "auxiliary_loss_clip": 0.01112061, "auxiliary_loss_mlp": 0.01031621, "balance_loss_clip": 1.0182395, "balance_loss_mlp": 1.03816652, "epoch": 0.5808507440252517, "flos": 23801161512960.0, "grad_norm": 1.6274073939892408, "language_loss": 0.65480429, "learning_rate": 1.5765734588156797e-06, "loss": 0.6762411, "num_input_tokens_seen": 208099305, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73828125, "step": 9661, "time_per_iteration": 2.4766366481781006 }, { "auxiliary_loss_clip": 0.01106534, "auxiliary_loss_mlp": 0.01024145, "balance_loss_clip": 1.01268291, "balance_loss_mlp": 1.03951502, "epoch": 0.5809108672779197, "flos": 13698449222400.0, "grad_norm": 1.6128260699040193, "language_loss": 0.7474283, "learning_rate": 1.5761928335513704e-06, "loss": 0.76873505, "num_input_tokens_seen": 208116960, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.671875, "step": 9662, "time_per_iteration": 2.4503440856933594 }, { "auxiliary_loss_clip": 0.01036106, "auxiliary_loss_mlp": 0.00999332, "balance_loss_clip": 0.99778271, "balance_loss_mlp": 1.01192808, "epoch": 0.5809709905305876, "flos": 69134866381440.0, "grad_norm": 0.8880454805746495, "language_loss": 0.58382261, "learning_rate": 1.5758122243578709e-06, "loss": 0.60417706, "num_input_tokens_seen": 208182190, "router_z_loss_clip": 0.01544189, "router_z_loss_mlp": 0.2421875, "step": 9663, "time_per_iteration": 3.157176971435547 }, { "auxiliary_loss_clip": 0.01112154, "auxiliary_loss_mlp": 0.01029726, "balance_loss_clip": 1.01717925, "balance_loss_mlp": 1.04108799, "epoch": 0.5810311137832557, "flos": 19827538525440.0, "grad_norm": 2.2022431282099513, "language_loss": 0.82473105, "learning_rate": 1.5754316312496152e-06, "loss": 0.84614992, "num_input_tokens_seen": 208197015, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 9664, "time_per_iteration": 2.4764938354492188 }, { "auxiliary_loss_clip": 0.01110299, "auxiliary_loss_mlp": 0.01025254, "balance_loss_clip": 1.0124445, "balance_loss_mlp": 1.03744555, "epoch": 0.5810912370359237, "flos": 29238503719680.0, "grad_norm": 1.8057836590754222, "language_loss": 0.8109138, "learning_rate": 1.5750510542410337e-06, "loss": 0.83226931, "num_input_tokens_seen": 208215795, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 9665, "time_per_iteration": 2.5245449542999268 }, { "auxiliary_loss_clip": 0.01115928, "auxiliary_loss_mlp": 0.01033388, "balance_loss_clip": 1.01880836, "balance_loss_mlp": 1.04251647, "epoch": 0.5811513602885916, "flos": 22785572373120.0, "grad_norm": 2.1151790629076634, "language_loss": 0.81176031, "learning_rate": 1.5746704933465599e-06, "loss": 0.8332535, "num_input_tokens_seen": 208234655, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.734375, "step": 9666, "time_per_iteration": 2.559088945388794 }, { "auxiliary_loss_clip": 0.01109184, "auxiliary_loss_mlp": 0.01029514, "balance_loss_clip": 1.01790261, "balance_loss_mlp": 1.04012716, "epoch": 0.5812114835412596, "flos": 18734346051840.0, "grad_norm": 1.868909291876109, "language_loss": 0.79544389, "learning_rate": 1.5742899485806227e-06, "loss": 0.81683093, "num_input_tokens_seen": 208251300, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.69140625, "step": 9667, "time_per_iteration": 2.4436123371124268 }, { "auxiliary_loss_clip": 0.01115544, "auxiliary_loss_mlp": 0.01037211, "balance_loss_clip": 1.02303672, "balance_loss_mlp": 1.03896368, "epoch": 0.5812716067939275, "flos": 26431295080320.0, "grad_norm": 1.4040076566662059, "language_loss": 0.78547555, "learning_rate": 1.573909419957653e-06, "loss": 0.80700308, "num_input_tokens_seen": 208272685, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.765625, "step": 9668, "time_per_iteration": 2.5257434844970703 }, { "auxiliary_loss_clip": 0.01110457, "auxiliary_loss_mlp": 0.01033518, "balance_loss_clip": 1.02086377, "balance_loss_mlp": 1.03929901, "epoch": 0.5813317300465956, "flos": 43397865285120.0, "grad_norm": 1.882873933413512, "language_loss": 0.64843822, "learning_rate": 1.5735289074920819e-06, "loss": 0.66987801, "num_input_tokens_seen": 208294315, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 9669, "time_per_iteration": 2.6476855278015137 }, { "auxiliary_loss_clip": 0.01112352, "auxiliary_loss_mlp": 0.01037018, "balance_loss_clip": 1.02348804, "balance_loss_mlp": 1.04110193, "epoch": 0.5813918532992635, "flos": 24785472885120.0, "grad_norm": 2.116967107704159, "language_loss": 0.73262048, "learning_rate": 1.5731484111983363e-06, "loss": 0.75411415, "num_input_tokens_seen": 208315610, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 9670, "time_per_iteration": 2.508474111557007 }, { "auxiliary_loss_clip": 0.01112562, "auxiliary_loss_mlp": 0.01038121, "balance_loss_clip": 1.02512097, "balance_loss_mlp": 1.03915739, "epoch": 0.5814519765519315, "flos": 22857357703680.0, "grad_norm": 2.540778163054687, "language_loss": 0.78897732, "learning_rate": 1.5727679310908464e-06, "loss": 0.81048417, "num_input_tokens_seen": 208334725, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 9671, "time_per_iteration": 3.8214528560638428 }, { "auxiliary_loss_clip": 0.01114688, "auxiliary_loss_mlp": 0.01036228, "balance_loss_clip": 1.02156496, "balance_loss_mlp": 1.04092181, "epoch": 0.5815120998045994, "flos": 24060831909120.0, "grad_norm": 3.276370844541029, "language_loss": 0.61095172, "learning_rate": 1.5723874671840399e-06, "loss": 0.63246083, "num_input_tokens_seen": 208353825, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.73828125, "step": 9672, "time_per_iteration": 2.486713171005249 }, { "auxiliary_loss_clip": 0.01108719, "auxiliary_loss_mlp": 0.01031371, "balance_loss_clip": 1.01944995, "balance_loss_mlp": 1.0399785, "epoch": 0.5815722230572674, "flos": 24279491952000.0, "grad_norm": 2.3692736711928486, "language_loss": 0.81558573, "learning_rate": 1.572007019492342e-06, "loss": 0.83698666, "num_input_tokens_seen": 208374160, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 9673, "time_per_iteration": 2.4826881885528564 }, { "auxiliary_loss_clip": 0.01115212, "auxiliary_loss_mlp": 0.01037039, "balance_loss_clip": 1.0231272, "balance_loss_mlp": 1.04083991, "epoch": 0.5816323463099353, "flos": 22200371994240.0, "grad_norm": 2.0273466472823567, "language_loss": 0.88229084, "learning_rate": 1.5716265880301817e-06, "loss": 0.90381336, "num_input_tokens_seen": 208392105, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.74609375, "step": 9674, "time_per_iteration": 3.923724412918091 }, { "auxiliary_loss_clip": 0.0111201, "auxiliary_loss_mlp": 0.01031238, "balance_loss_clip": 1.01900697, "balance_loss_mlp": 1.04011869, "epoch": 0.5816924695626033, "flos": 24134448833280.0, "grad_norm": 1.610884094556974, "language_loss": 0.78807312, "learning_rate": 1.571246172811984e-06, "loss": 0.80950558, "num_input_tokens_seen": 208411755, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.71875, "step": 9675, "time_per_iteration": 3.9111058712005615 }, { "auxiliary_loss_clip": 0.01111415, "auxiliary_loss_mlp": 0.01033235, "balance_loss_clip": 1.02015746, "balance_loss_mlp": 1.04066789, "epoch": 0.5817525928152713, "flos": 21324223451520.0, "grad_norm": 2.3807224019647557, "language_loss": 0.70135242, "learning_rate": 1.5708657738521748e-06, "loss": 0.72279894, "num_input_tokens_seen": 208429995, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 9676, "time_per_iteration": 3.8910152912139893 }, { "auxiliary_loss_clip": 0.01111878, "auxiliary_loss_mlp": 0.01029297, "balance_loss_clip": 1.01643443, "balance_loss_mlp": 1.04049098, "epoch": 0.5818127160679393, "flos": 26934510666240.0, "grad_norm": 2.771357869328599, "language_loss": 0.6413132, "learning_rate": 1.5704853911651779e-06, "loss": 0.66272491, "num_input_tokens_seen": 208443655, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 9677, "time_per_iteration": 2.4707353115081787 }, { "auxiliary_loss_clip": 0.0103444, "auxiliary_loss_mlp": 0.00999554, "balance_loss_clip": 0.99828988, "balance_loss_mlp": 1.01017833, "epoch": 0.5818728393206073, "flos": 63918626342400.0, "grad_norm": 0.8100962938430272, "language_loss": 0.5424664, "learning_rate": 1.5701050247654182e-06, "loss": 0.56280637, "num_input_tokens_seen": 208498405, "router_z_loss_clip": 0.01263428, "router_z_loss_mlp": 0.2421875, "step": 9678, "time_per_iteration": 3.1510677337646484 }, { "auxiliary_loss_clip": 0.01034667, "auxiliary_loss_mlp": 0.00998467, "balance_loss_clip": 0.99714357, "balance_loss_mlp": 1.01031399, "epoch": 0.5819329625732752, "flos": 64954108638720.0, "grad_norm": 0.7764967354198962, "language_loss": 0.56208825, "learning_rate": 1.569724674667319e-06, "loss": 0.58241957, "num_input_tokens_seen": 208559075, "router_z_loss_clip": 0.01324463, "router_z_loss_mlp": 0.24414062, "step": 9679, "time_per_iteration": 2.988689661026001 }, { "auxiliary_loss_clip": 0.01107625, "auxiliary_loss_mlp": 0.01035063, "balance_loss_clip": 1.02345181, "balance_loss_mlp": 1.03741765, "epoch": 0.5819930858259432, "flos": 21215270522880.0, "grad_norm": 2.2494681806609496, "language_loss": 0.65989774, "learning_rate": 1.5693443408853032e-06, "loss": 0.6813246, "num_input_tokens_seen": 208577770, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.703125, "step": 9680, "time_per_iteration": 2.4583616256713867 }, { "auxiliary_loss_clip": 0.01110256, "auxiliary_loss_mlp": 0.0102679, "balance_loss_clip": 1.01485109, "balance_loss_mlp": 1.03940427, "epoch": 0.5820532090786111, "flos": 19458520151040.0, "grad_norm": 2.183289452606774, "language_loss": 0.83188725, "learning_rate": 1.5689640234337933e-06, "loss": 0.85325772, "num_input_tokens_seen": 208595110, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.7109375, "step": 9681, "time_per_iteration": 2.5193400382995605 }, { "auxiliary_loss_clip": 0.01110652, "auxiliary_loss_mlp": 0.01031424, "balance_loss_clip": 1.01844847, "balance_loss_mlp": 1.03996396, "epoch": 0.5821133323312792, "flos": 17712615686400.0, "grad_norm": 14.418684003311025, "language_loss": 0.76078731, "learning_rate": 1.5685837223272109e-06, "loss": 0.78220809, "num_input_tokens_seen": 208612080, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 9682, "time_per_iteration": 2.4571447372436523 }, { "auxiliary_loss_clip": 0.01112836, "auxiliary_loss_mlp": 0.01029005, "balance_loss_clip": 1.01638079, "balance_loss_mlp": 1.03965044, "epoch": 0.5821734555839471, "flos": 24571804832640.0, "grad_norm": 3.6921495698572224, "language_loss": 0.75295818, "learning_rate": 1.568203437579977e-06, "loss": 0.77437657, "num_input_tokens_seen": 208630235, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.734375, "step": 9683, "time_per_iteration": 2.544950246810913 }, { "auxiliary_loss_clip": 0.01112266, "auxiliary_loss_mlp": 0.01029028, "balance_loss_clip": 1.01612926, "balance_loss_mlp": 1.03811812, "epoch": 0.5822335788366151, "flos": 22382259488640.0, "grad_norm": 1.6201107133097843, "language_loss": 0.7407186, "learning_rate": 1.5678231692065116e-06, "loss": 0.76213151, "num_input_tokens_seen": 208647925, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 9684, "time_per_iteration": 2.4684383869171143 }, { "auxiliary_loss_clip": 0.01111829, "auxiliary_loss_mlp": 0.01038228, "balance_loss_clip": 1.02554369, "balance_loss_mlp": 1.03981054, "epoch": 0.582293702089283, "flos": 26722494639360.0, "grad_norm": 2.2260984808008213, "language_loss": 0.78223652, "learning_rate": 1.5674429172212348e-06, "loss": 0.80373704, "num_input_tokens_seen": 208666180, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 9685, "time_per_iteration": 2.5279390811920166 }, { "auxiliary_loss_clip": 0.01112551, "auxiliary_loss_mlp": 0.01038459, "balance_loss_clip": 1.02575731, "balance_loss_mlp": 1.04072082, "epoch": 0.582353825341951, "flos": 17348661129600.0, "grad_norm": 1.7157943319967235, "language_loss": 0.75520247, "learning_rate": 1.5670626816385667e-06, "loss": 0.7767126, "num_input_tokens_seen": 208684240, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 9686, "time_per_iteration": 2.457395076751709 }, { "auxiliary_loss_clip": 0.01035154, "auxiliary_loss_mlp": 0.01006711, "balance_loss_clip": 1.00528622, "balance_loss_mlp": 1.01077592, "epoch": 0.5824139485946189, "flos": 55473261534720.0, "grad_norm": 0.8141200252956406, "language_loss": 0.57411307, "learning_rate": 1.5666824624729244e-06, "loss": 0.59453171, "num_input_tokens_seen": 208736090, "router_z_loss_clip": 0.01422119, "router_z_loss_mlp": 0.24414062, "step": 9687, "time_per_iteration": 2.9166038036346436 }, { "auxiliary_loss_clip": 0.01110608, "auxiliary_loss_mlp": 0.01031774, "balance_loss_clip": 1.0184586, "balance_loss_mlp": 1.03863263, "epoch": 0.582474071847287, "flos": 20303031790080.0, "grad_norm": 2.1369515118154085, "language_loss": 0.70669401, "learning_rate": 1.566302259738727e-06, "loss": 0.72811782, "num_input_tokens_seen": 208754600, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 9688, "time_per_iteration": 2.447396993637085 }, { "auxiliary_loss_clip": 0.01110535, "auxiliary_loss_mlp": 0.01033468, "balance_loss_clip": 1.02181554, "balance_loss_mlp": 1.03934383, "epoch": 0.5825341950999549, "flos": 23878010661120.0, "grad_norm": 3.2389546551735484, "language_loss": 0.65782571, "learning_rate": 1.5659220734503918e-06, "loss": 0.67926574, "num_input_tokens_seen": 208773140, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.7109375, "step": 9689, "time_per_iteration": 2.549852132797241 }, { "auxiliary_loss_clip": 0.01111041, "auxiliary_loss_mlp": 0.01031949, "balance_loss_clip": 1.0188601, "balance_loss_mlp": 1.04157925, "epoch": 0.5825943183526229, "flos": 23113041690240.0, "grad_norm": 1.904279974409882, "language_loss": 0.73480862, "learning_rate": 1.5655419036223341e-06, "loss": 0.75623852, "num_input_tokens_seen": 208793410, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 9690, "time_per_iteration": 2.4745893478393555 }, { "auxiliary_loss_clip": 0.01114765, "auxiliary_loss_mlp": 0.01036113, "balance_loss_clip": 1.02187347, "balance_loss_mlp": 1.0420866, "epoch": 0.5826544416052909, "flos": 22857429530880.0, "grad_norm": 1.7271696265487009, "language_loss": 0.75803733, "learning_rate": 1.5651617502689717e-06, "loss": 0.77954602, "num_input_tokens_seen": 208811920, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7265625, "step": 9691, "time_per_iteration": 2.4725756645202637 }, { "auxiliary_loss_clip": 0.01110754, "auxiliary_loss_mlp": 0.01032844, "balance_loss_clip": 1.02026772, "balance_loss_mlp": 1.03877807, "epoch": 0.5827145648579588, "flos": 31501845555840.0, "grad_norm": 2.715332231510253, "language_loss": 0.80733192, "learning_rate": 1.5647816134047184e-06, "loss": 0.82876801, "num_input_tokens_seen": 208834720, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 9692, "time_per_iteration": 2.545501232147217 }, { "auxiliary_loss_clip": 0.0103483, "auxiliary_loss_mlp": 0.01005096, "balance_loss_clip": 1.00357568, "balance_loss_mlp": 1.01012063, "epoch": 0.5827746881106268, "flos": 69811817074560.0, "grad_norm": 0.7562310768313963, "language_loss": 0.56964839, "learning_rate": 1.5644014930439907e-06, "loss": 0.59004772, "num_input_tokens_seen": 208898415, "router_z_loss_clip": 0.01519775, "router_z_loss_mlp": 0.24707031, "step": 9693, "time_per_iteration": 3.0521082878112793 }, { "auxiliary_loss_clip": 0.01110307, "auxiliary_loss_mlp": 0.01032775, "balance_loss_clip": 1.02084231, "balance_loss_mlp": 1.03943598, "epoch": 0.5828348113632947, "flos": 23112395245440.0, "grad_norm": 1.9861003249779114, "language_loss": 0.79552364, "learning_rate": 1.5640213892012025e-06, "loss": 0.81695443, "num_input_tokens_seen": 208919045, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.70703125, "step": 9694, "time_per_iteration": 2.4758753776550293 }, { "auxiliary_loss_clip": 0.01107796, "auxiliary_loss_mlp": 0.01030825, "balance_loss_clip": 1.01965559, "balance_loss_mlp": 1.03999329, "epoch": 0.5828949346159628, "flos": 21873082245120.0, "grad_norm": 1.6817249060277086, "language_loss": 0.76259756, "learning_rate": 1.5636413018907656e-06, "loss": 0.78398383, "num_input_tokens_seen": 208939375, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6796875, "step": 9695, "time_per_iteration": 2.50099515914917 }, { "auxiliary_loss_clip": 0.01035311, "auxiliary_loss_mlp": 0.01005631, "balance_loss_clip": 1.00418878, "balance_loss_mlp": 1.01089156, "epoch": 0.5829550578686307, "flos": 65962553950080.0, "grad_norm": 0.7668239577984075, "language_loss": 0.54993445, "learning_rate": 1.563261231127095e-06, "loss": 0.57034385, "num_input_tokens_seen": 209004760, "router_z_loss_clip": 0.0144043, "router_z_loss_mlp": 0.24414062, "step": 9696, "time_per_iteration": 3.1771039962768555 }, { "auxiliary_loss_clip": 0.0111209, "auxiliary_loss_mlp": 0.01027546, "balance_loss_clip": 1.01498735, "balance_loss_mlp": 1.04098034, "epoch": 0.5830151811212987, "flos": 16289799079680.0, "grad_norm": 2.272069996549914, "language_loss": 0.76951098, "learning_rate": 1.5628811769246021e-06, "loss": 0.79090738, "num_input_tokens_seen": 209022930, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 9697, "time_per_iteration": 2.4725077152252197 }, { "auxiliary_loss_clip": 0.01112853, "auxiliary_loss_mlp": 0.01035684, "balance_loss_clip": 1.02184343, "balance_loss_mlp": 1.03911686, "epoch": 0.5830753043739666, "flos": 24168851084160.0, "grad_norm": 1.788746167752848, "language_loss": 0.77710283, "learning_rate": 1.5625011392976991e-06, "loss": 0.79858816, "num_input_tokens_seen": 209043740, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.73828125, "step": 9698, "time_per_iteration": 2.4823710918426514 }, { "auxiliary_loss_clip": 0.01113926, "auxiliary_loss_mlp": 0.01035176, "balance_loss_clip": 1.02231312, "balance_loss_mlp": 1.04292202, "epoch": 0.5831354276266346, "flos": 27059050097280.0, "grad_norm": 1.8984757164701698, "language_loss": 0.83799636, "learning_rate": 1.5621211182607966e-06, "loss": 0.85948741, "num_input_tokens_seen": 209068885, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 9699, "time_per_iteration": 2.6028878688812256 }, { "auxiliary_loss_clip": 0.0111097, "auxiliary_loss_mlp": 0.01027772, "balance_loss_clip": 1.01498127, "balance_loss_mlp": 1.03877342, "epoch": 0.5831955508793025, "flos": 23623475909760.0, "grad_norm": 2.7016661217101223, "language_loss": 0.66411543, "learning_rate": 1.561741113828305e-06, "loss": 0.68550289, "num_input_tokens_seen": 209087340, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 9700, "time_per_iteration": 2.47408390045166 }, { "auxiliary_loss_clip": 0.01111853, "auxiliary_loss_mlp": 0.01029544, "balance_loss_clip": 1.01654446, "balance_loss_mlp": 1.04010701, "epoch": 0.5832556741319705, "flos": 24973250209920.0, "grad_norm": 1.9737858051387698, "language_loss": 0.71677935, "learning_rate": 1.5613611260146344e-06, "loss": 0.73819333, "num_input_tokens_seen": 209108840, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 9701, "time_per_iteration": 2.5169265270233154 }, { "auxiliary_loss_clip": 0.01108214, "auxiliary_loss_mlp": 0.0103366, "balance_loss_clip": 1.02079773, "balance_loss_mlp": 1.03767359, "epoch": 0.5833157973846385, "flos": 23221563655680.0, "grad_norm": 1.8078806748521394, "language_loss": 0.85337651, "learning_rate": 1.5609811548341936e-06, "loss": 0.8747952, "num_input_tokens_seen": 209127985, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 9702, "time_per_iteration": 2.467512845993042 }, { "auxiliary_loss_clip": 0.01106094, "auxiliary_loss_mlp": 0.01032141, "balance_loss_clip": 1.02039254, "balance_loss_mlp": 1.03795576, "epoch": 0.5833759206373065, "flos": 21977941023360.0, "grad_norm": 1.633115058183703, "language_loss": 0.78070903, "learning_rate": 1.560601200301392e-06, "loss": 0.80209136, "num_input_tokens_seen": 209146885, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6796875, "step": 9703, "time_per_iteration": 2.4851810932159424 }, { "auxiliary_loss_clip": 0.01115484, "auxiliary_loss_mlp": 0.0102961, "balance_loss_clip": 1.01616979, "balance_loss_mlp": 1.0427177, "epoch": 0.5834360438899745, "flos": 21762405463680.0, "grad_norm": 12.677685792235689, "language_loss": 0.71127403, "learning_rate": 1.5602212624306366e-06, "loss": 0.73272496, "num_input_tokens_seen": 209166130, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 9704, "time_per_iteration": 2.4602229595184326 }, { "auxiliary_loss_clip": 0.01109919, "auxiliary_loss_mlp": 0.0103201, "balance_loss_clip": 1.02003002, "balance_loss_mlp": 1.03993726, "epoch": 0.5834961671426424, "flos": 15992566035840.0, "grad_norm": 1.7777645401467221, "language_loss": 0.81699651, "learning_rate": 1.559841341236335e-06, "loss": 0.8384158, "num_input_tokens_seen": 209183350, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.69921875, "step": 9705, "time_per_iteration": 2.450031042098999 }, { "auxiliary_loss_clip": 0.01109149, "auxiliary_loss_mlp": 0.01028916, "balance_loss_clip": 1.01682866, "balance_loss_mlp": 1.03923953, "epoch": 0.5835562903953104, "flos": 22818322598400.0, "grad_norm": 1.8949173703350273, "language_loss": 0.8057791, "learning_rate": 1.5594614367328937e-06, "loss": 0.82715976, "num_input_tokens_seen": 209203945, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 9706, "time_per_iteration": 2.4544901847839355 }, { "auxiliary_loss_clip": 0.01110937, "auxiliary_loss_mlp": 0.01030745, "balance_loss_clip": 1.01752424, "balance_loss_mlp": 1.04166508, "epoch": 0.5836164136479783, "flos": 48468056624640.0, "grad_norm": 1.8749725985539618, "language_loss": 0.74638903, "learning_rate": 1.5590815489347187e-06, "loss": 0.76780587, "num_input_tokens_seen": 209227080, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6953125, "step": 9707, "time_per_iteration": 2.680318593978882 }, { "auxiliary_loss_clip": 0.01107049, "auxiliary_loss_mlp": 0.01026916, "balance_loss_clip": 1.01522744, "balance_loss_mlp": 1.03860939, "epoch": 0.5836765369006464, "flos": 26905998245760.0, "grad_norm": 1.8671193073478343, "language_loss": 0.81570989, "learning_rate": 1.5587016778562163e-06, "loss": 0.83704954, "num_input_tokens_seen": 209248170, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 9708, "time_per_iteration": 2.5210907459259033 }, { "auxiliary_loss_clip": 0.01108951, "auxiliary_loss_mlp": 0.01028088, "balance_loss_clip": 1.01543415, "balance_loss_mlp": 1.04023838, "epoch": 0.5837366601533143, "flos": 20084048524800.0, "grad_norm": 4.376405756207999, "language_loss": 0.7861129, "learning_rate": 1.5583218235117896e-06, "loss": 0.80748326, "num_input_tokens_seen": 209267730, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 9709, "time_per_iteration": 2.476351499557495 }, { "auxiliary_loss_clip": 0.01036238, "auxiliary_loss_mlp": 0.00999942, "balance_loss_clip": 0.99845201, "balance_loss_mlp": 1.01194263, "epoch": 0.5837967834059823, "flos": 65363885971200.0, "grad_norm": 0.7664368712331949, "language_loss": 0.56528968, "learning_rate": 1.557941985915844e-06, "loss": 0.58565152, "num_input_tokens_seen": 209332510, "router_z_loss_clip": 0.01489258, "router_z_loss_mlp": 0.24316406, "step": 9710, "time_per_iteration": 3.1155688762664795 }, { "auxiliary_loss_clip": 0.0111058, "auxiliary_loss_mlp": 0.01027813, "balance_loss_clip": 1.01634538, "balance_loss_mlp": 1.04229593, "epoch": 0.5838569066586502, "flos": 25338641310720.0, "grad_norm": 2.505451462542728, "language_loss": 0.65356767, "learning_rate": 1.5575621650827833e-06, "loss": 0.67495161, "num_input_tokens_seen": 209353355, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 9711, "time_per_iteration": 2.5232901573181152 }, { "auxiliary_loss_clip": 0.01113228, "auxiliary_loss_mlp": 0.01037328, "balance_loss_clip": 1.0231185, "balance_loss_mlp": 1.0395304, "epoch": 0.5839170299113182, "flos": 22229243550720.0, "grad_norm": 1.9775305886990142, "language_loss": 0.7874189, "learning_rate": 1.5571823610270085e-06, "loss": 0.8089245, "num_input_tokens_seen": 209370960, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.73828125, "step": 9712, "time_per_iteration": 2.483675479888916 }, { "auxiliary_loss_clip": 0.01108301, "auxiliary_loss_mlp": 0.01027817, "balance_loss_clip": 1.01521015, "balance_loss_mlp": 1.03827679, "epoch": 0.5839771531639861, "flos": 22200012858240.0, "grad_norm": 1.6601264506019437, "language_loss": 0.7313168, "learning_rate": 1.5568025737629234e-06, "loss": 0.75267798, "num_input_tokens_seen": 209390955, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 9713, "time_per_iteration": 3.852423906326294 }, { "auxiliary_loss_clip": 0.01114217, "auxiliary_loss_mlp": 0.01027971, "balance_loss_clip": 1.01399946, "balance_loss_mlp": 1.04052341, "epoch": 0.5840372764166541, "flos": 22419355259520.0, "grad_norm": 2.7196179989137925, "language_loss": 0.69259548, "learning_rate": 1.5564228033049292e-06, "loss": 0.71401739, "num_input_tokens_seen": 209410260, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.73828125, "step": 9714, "time_per_iteration": 2.4897189140319824 }, { "auxiliary_loss_clip": 0.01110939, "auxiliary_loss_mlp": 0.01029258, "balance_loss_clip": 1.01665759, "balance_loss_mlp": 1.03887272, "epoch": 0.5840973996693221, "flos": 19828256797440.0, "grad_norm": 1.9695477595783764, "language_loss": 0.80466247, "learning_rate": 1.5560430496674268e-06, "loss": 0.82606447, "num_input_tokens_seen": 209429920, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 9715, "time_per_iteration": 2.4560186862945557 }, { "auxiliary_loss_clip": 0.01108949, "auxiliary_loss_mlp": 0.01029076, "balance_loss_clip": 1.01625466, "balance_loss_mlp": 1.03866458, "epoch": 0.5841575229219901, "flos": 21142982401920.0, "grad_norm": 1.987044885573296, "language_loss": 0.73424339, "learning_rate": 1.5556633128648167e-06, "loss": 0.75562358, "num_input_tokens_seen": 209449470, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 9716, "time_per_iteration": 3.922048807144165 }, { "auxiliary_loss_clip": 0.01106627, "auxiliary_loss_mlp": 0.01028367, "balance_loss_clip": 1.01648211, "balance_loss_mlp": 1.03883398, "epoch": 0.5842176461746581, "flos": 24640322025600.0, "grad_norm": 1.7575298147744367, "language_loss": 0.74868757, "learning_rate": 1.5552835929114976e-06, "loss": 0.77003753, "num_input_tokens_seen": 209467695, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6796875, "step": 9717, "time_per_iteration": 3.8580820560455322 }, { "auxiliary_loss_clip": 0.01109715, "auxiliary_loss_mlp": 0.01036596, "balance_loss_clip": 1.02379322, "balance_loss_mlp": 1.03939509, "epoch": 0.584277769427326, "flos": 19131158574720.0, "grad_norm": 7.737198550262455, "language_loss": 0.80136973, "learning_rate": 1.5549038898218697e-06, "loss": 0.82283282, "num_input_tokens_seen": 209484250, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 9718, "time_per_iteration": 3.8737521171569824 }, { "auxiliary_loss_clip": 0.0111039, "auxiliary_loss_mlp": 0.0102828, "balance_loss_clip": 1.01539969, "balance_loss_mlp": 1.04124475, "epoch": 0.584337892679994, "flos": 22675111073280.0, "grad_norm": 1.8416775733206856, "language_loss": 0.67601717, "learning_rate": 1.5545242036103306e-06, "loss": 0.69740397, "num_input_tokens_seen": 209502830, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 9719, "time_per_iteration": 2.4833056926727295 }, { "auxiliary_loss_clip": 0.01110143, "auxiliary_loss_mlp": 0.01030839, "balance_loss_clip": 1.01827478, "balance_loss_mlp": 1.03870702, "epoch": 0.5843980159326619, "flos": 31284083352960.0, "grad_norm": 3.308450370989217, "language_loss": 0.76085508, "learning_rate": 1.5541445342912786e-06, "loss": 0.78226489, "num_input_tokens_seen": 209525995, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 9720, "time_per_iteration": 2.548853635787964 }, { "auxiliary_loss_clip": 0.01109212, "auxiliary_loss_mlp": 0.01037679, "balance_loss_clip": 1.02505445, "balance_loss_mlp": 1.03860784, "epoch": 0.58445813918533, "flos": 22748117466240.0, "grad_norm": 1.8689394240382184, "language_loss": 0.8282609, "learning_rate": 1.5537648818791105e-06, "loss": 0.84972984, "num_input_tokens_seen": 209545895, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 9721, "time_per_iteration": 2.473586320877075 }, { "auxiliary_loss_clip": 0.0103706, "auxiliary_loss_mlp": 0.00999127, "balance_loss_clip": 0.99766654, "balance_loss_mlp": 1.01287925, "epoch": 0.5845182624379979, "flos": 60686556658560.0, "grad_norm": 0.9342818527346444, "language_loss": 0.71368343, "learning_rate": 1.5533852463882226e-06, "loss": 0.73404533, "num_input_tokens_seen": 209602315, "router_z_loss_clip": 0.0145874, "router_z_loss_mlp": 0.2421875, "step": 9722, "time_per_iteration": 3.123319387435913 }, { "auxiliary_loss_clip": 0.01107916, "auxiliary_loss_mlp": 0.01032992, "balance_loss_clip": 1.02073789, "balance_loss_mlp": 1.03861725, "epoch": 0.5845783856906659, "flos": 16362446336640.0, "grad_norm": 2.2198952983008744, "language_loss": 0.89248413, "learning_rate": 1.5530056278330113e-06, "loss": 0.91389322, "num_input_tokens_seen": 209617615, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 9723, "time_per_iteration": 2.4479823112487793 }, { "auxiliary_loss_clip": 0.01108942, "auxiliary_loss_mlp": 0.01031137, "balance_loss_clip": 1.01925194, "balance_loss_mlp": 1.03999209, "epoch": 0.5846385089433338, "flos": 20083402080000.0, "grad_norm": 1.6021140478622722, "language_loss": 0.68826258, "learning_rate": 1.5526260262278709e-06, "loss": 0.70966339, "num_input_tokens_seen": 209637005, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 9724, "time_per_iteration": 2.484903335571289 }, { "auxiliary_loss_clip": 0.01115412, "auxiliary_loss_mlp": 0.01034198, "balance_loss_clip": 1.02122843, "balance_loss_mlp": 1.04346883, "epoch": 0.5846986321960018, "flos": 17311062568320.0, "grad_norm": 1.9402841126976205, "language_loss": 0.86228871, "learning_rate": 1.552246441587197e-06, "loss": 0.88378477, "num_input_tokens_seen": 209653170, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 9725, "time_per_iteration": 2.4284353256225586 }, { "auxiliary_loss_clip": 0.01113789, "auxiliary_loss_mlp": 0.01035558, "balance_loss_clip": 1.02277899, "balance_loss_mlp": 1.04102278, "epoch": 0.5847587554486697, "flos": 17197907748480.0, "grad_norm": 1.9676380778801017, "language_loss": 0.82497668, "learning_rate": 1.5518668739253821e-06, "loss": 0.84647018, "num_input_tokens_seen": 209671275, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 9726, "time_per_iteration": 2.468161106109619 }, { "auxiliary_loss_clip": 0.01111529, "auxiliary_loss_mlp": 0.01033176, "balance_loss_clip": 1.0213387, "balance_loss_mlp": 1.0411346, "epoch": 0.5848188787013378, "flos": 24529106540160.0, "grad_norm": 1.753356857450541, "language_loss": 0.66874552, "learning_rate": 1.5514873232568206e-06, "loss": 0.69019252, "num_input_tokens_seen": 209690380, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.703125, "step": 9727, "time_per_iteration": 2.504074811935425 }, { "auxiliary_loss_clip": 0.01112327, "auxiliary_loss_mlp": 0.01041027, "balance_loss_clip": 1.02755618, "balance_loss_mlp": 1.04136407, "epoch": 0.5848790019540057, "flos": 20628382204800.0, "grad_norm": 2.005476930964716, "language_loss": 0.81605279, "learning_rate": 1.5511077895959055e-06, "loss": 0.83758634, "num_input_tokens_seen": 209708845, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 9728, "time_per_iteration": 2.4828991889953613 }, { "auxiliary_loss_clip": 0.0110767, "auxiliary_loss_mlp": 0.01030529, "balance_loss_clip": 1.01893544, "balance_loss_mlp": 1.03960752, "epoch": 0.5849391252066737, "flos": 22418852469120.0, "grad_norm": 2.254092526001647, "language_loss": 0.77488959, "learning_rate": 1.550728272957027e-06, "loss": 0.79627156, "num_input_tokens_seen": 209729000, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6796875, "step": 9729, "time_per_iteration": 2.4723353385925293 }, { "auxiliary_loss_clip": 0.01110196, "auxiliary_loss_mlp": 0.01031073, "balance_loss_clip": 1.0177995, "balance_loss_mlp": 1.03899133, "epoch": 0.5849992484593417, "flos": 25410929431680.0, "grad_norm": 11.755469811052905, "language_loss": 0.70439804, "learning_rate": 1.5503487733545782e-06, "loss": 0.72581077, "num_input_tokens_seen": 209747435, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 9730, "time_per_iteration": 2.543597936630249 }, { "auxiliary_loss_clip": 0.01114039, "auxiliary_loss_mlp": 0.01032889, "balance_loss_clip": 1.01906037, "balance_loss_mlp": 1.04090071, "epoch": 0.5850593717120096, "flos": 21065163586560.0, "grad_norm": 1.7288955547779334, "language_loss": 0.78996849, "learning_rate": 1.5499692908029482e-06, "loss": 0.81143779, "num_input_tokens_seen": 209764910, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.73046875, "step": 9731, "time_per_iteration": 2.4826629161834717 }, { "auxiliary_loss_clip": 0.011094, "auxiliary_loss_mlp": 0.01034243, "balance_loss_clip": 1.02099252, "balance_loss_mlp": 1.03950906, "epoch": 0.5851194949646776, "flos": 25301545539840.0, "grad_norm": 1.8096394036102925, "language_loss": 0.7080704, "learning_rate": 1.549589825316528e-06, "loss": 0.72950679, "num_input_tokens_seen": 209786115, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69921875, "step": 9732, "time_per_iteration": 2.5228824615478516 }, { "auxiliary_loss_clip": 0.01115442, "auxiliary_loss_mlp": 0.01033302, "balance_loss_clip": 1.01932478, "balance_loss_mlp": 1.04265058, "epoch": 0.5851796182173455, "flos": 23587242065280.0, "grad_norm": 1.886624383844322, "language_loss": 0.52613151, "learning_rate": 1.5492103769097075e-06, "loss": 0.54761893, "num_input_tokens_seen": 209806095, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7265625, "step": 9733, "time_per_iteration": 2.4952967166900635 }, { "auxiliary_loss_clip": 0.01111986, "auxiliary_loss_mlp": 0.01034356, "balance_loss_clip": 1.02182114, "balance_loss_mlp": 1.04135799, "epoch": 0.5852397414700136, "flos": 24822712310400.0, "grad_norm": 2.410809292374077, "language_loss": 0.87685943, "learning_rate": 1.5488309455968739e-06, "loss": 0.89832282, "num_input_tokens_seen": 209823650, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 9734, "time_per_iteration": 2.5157482624053955 }, { "auxiliary_loss_clip": 0.01105653, "auxiliary_loss_mlp": 0.01030817, "balance_loss_clip": 1.01935494, "balance_loss_mlp": 1.03957951, "epoch": 0.5852998647226815, "flos": 19937784343680.0, "grad_norm": 1.5274797615919657, "language_loss": 0.7225219, "learning_rate": 1.5484515313924163e-06, "loss": 0.74388659, "num_input_tokens_seen": 209843220, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.66015625, "step": 9735, "time_per_iteration": 2.480438470840454 }, { "auxiliary_loss_clip": 0.01112169, "auxiliary_loss_mlp": 0.01036296, "balance_loss_clip": 1.02281332, "balance_loss_mlp": 1.03932667, "epoch": 0.5853599879753495, "flos": 16720367408640.0, "grad_norm": 3.5524140575892984, "language_loss": 0.7463342, "learning_rate": 1.5480721343107217e-06, "loss": 0.76781887, "num_input_tokens_seen": 209854880, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 9736, "time_per_iteration": 2.441305637359619 }, { "auxiliary_loss_clip": 0.01108753, "auxiliary_loss_mlp": 0.01027999, "balance_loss_clip": 1.01601791, "balance_loss_mlp": 1.03963065, "epoch": 0.5854201112280174, "flos": 44456583680640.0, "grad_norm": 2.1542418872768443, "language_loss": 0.70423794, "learning_rate": 1.5476927543661772e-06, "loss": 0.72560549, "num_input_tokens_seen": 209877870, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 9737, "time_per_iteration": 2.6890180110931396 }, { "auxiliary_loss_clip": 0.011082, "auxiliary_loss_mlp": 0.01031451, "balance_loss_clip": 1.02022743, "balance_loss_mlp": 1.04079199, "epoch": 0.5854802344806854, "flos": 20339193807360.0, "grad_norm": 1.908772500747915, "language_loss": 0.82503635, "learning_rate": 1.547313391573169e-06, "loss": 0.84643286, "num_input_tokens_seen": 209896690, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.671875, "step": 9738, "time_per_iteration": 2.4934582710266113 }, { "auxiliary_loss_clip": 0.01113122, "auxiliary_loss_mlp": 0.01035893, "balance_loss_clip": 1.02266049, "balance_loss_mlp": 1.04022777, "epoch": 0.5855403577333533, "flos": 20921054221440.0, "grad_norm": 2.301646689715829, "language_loss": 0.6864475, "learning_rate": 1.546934045946082e-06, "loss": 0.70793772, "num_input_tokens_seen": 209914640, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 9739, "time_per_iteration": 2.4746670722961426 }, { "auxiliary_loss_clip": 0.01110544, "auxiliary_loss_mlp": 0.01025789, "balance_loss_clip": 1.01273561, "balance_loss_mlp": 1.03875828, "epoch": 0.5856004809860214, "flos": 20448649526400.0, "grad_norm": 2.357896202301621, "language_loss": 0.58606279, "learning_rate": 1.5465547174993017e-06, "loss": 0.60742611, "num_input_tokens_seen": 209933375, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 9740, "time_per_iteration": 2.5169425010681152 }, { "auxiliary_loss_clip": 0.01107947, "auxiliary_loss_mlp": 0.01031044, "balance_loss_clip": 1.01828218, "balance_loss_mlp": 1.03764546, "epoch": 0.5856606042386893, "flos": 19640766781440.0, "grad_norm": 1.8586168830591867, "language_loss": 0.75478852, "learning_rate": 1.5461754062472113e-06, "loss": 0.77617836, "num_input_tokens_seen": 209952055, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 9741, "time_per_iteration": 2.45943021774292 }, { "auxiliary_loss_clip": 0.01111568, "auxiliary_loss_mlp": 0.01031076, "balance_loss_clip": 1.01893437, "balance_loss_mlp": 1.04144967, "epoch": 0.5857207274913573, "flos": 21686166846720.0, "grad_norm": 2.8851854251243676, "language_loss": 0.75875926, "learning_rate": 1.5457961122041959e-06, "loss": 0.7801857, "num_input_tokens_seen": 209971190, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.703125, "step": 9742, "time_per_iteration": 2.5012052059173584 }, { "auxiliary_loss_clip": 0.01108308, "auxiliary_loss_mlp": 0.01029006, "balance_loss_clip": 1.01715064, "balance_loss_mlp": 1.03914046, "epoch": 0.5857808507440253, "flos": 23182708118400.0, "grad_norm": 1.7091057227404218, "language_loss": 0.75190645, "learning_rate": 1.5454168353846369e-06, "loss": 0.77327961, "num_input_tokens_seen": 209990695, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69140625, "step": 9743, "time_per_iteration": 2.50193190574646 }, { "auxiliary_loss_clip": 0.01107717, "auxiliary_loss_mlp": 0.01027663, "balance_loss_clip": 1.01611161, "balance_loss_mlp": 1.040061, "epoch": 0.5858409739966932, "flos": 27235299156480.0, "grad_norm": 1.7726540684649088, "language_loss": 0.80912125, "learning_rate": 1.5450375758029172e-06, "loss": 0.83047497, "num_input_tokens_seen": 210010210, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.67578125, "step": 9744, "time_per_iteration": 2.577277898788452 }, { "auxiliary_loss_clip": 0.01114038, "auxiliary_loss_mlp": 0.01032125, "balance_loss_clip": 1.01978064, "balance_loss_mlp": 1.04106998, "epoch": 0.5859010972493612, "flos": 27855512317440.0, "grad_norm": 2.299125093721049, "language_loss": 0.71778321, "learning_rate": 1.5446583334734183e-06, "loss": 0.73924488, "num_input_tokens_seen": 210030030, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.73046875, "step": 9745, "time_per_iteration": 2.516709327697754 }, { "auxiliary_loss_clip": 0.01038644, "auxiliary_loss_mlp": 0.01003124, "balance_loss_clip": 1.00190163, "balance_loss_mlp": 1.01450086, "epoch": 0.5859612205020291, "flos": 70007064428160.0, "grad_norm": 0.7741410060979045, "language_loss": 0.5328604, "learning_rate": 1.5442791084105204e-06, "loss": 0.55327797, "num_input_tokens_seen": 210094840, "router_z_loss_clip": 0.01220703, "router_z_loss_mlp": 0.2421875, "step": 9746, "time_per_iteration": 3.192962169647217 }, { "auxiliary_loss_clip": 0.01112658, "auxiliary_loss_mlp": 0.01033021, "balance_loss_clip": 1.01943159, "balance_loss_mlp": 1.04057217, "epoch": 0.5860213437546972, "flos": 24056019486720.0, "grad_norm": 2.2203862230967353, "language_loss": 0.73739302, "learning_rate": 1.5438999006286054e-06, "loss": 0.7588498, "num_input_tokens_seen": 210114660, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.72265625, "step": 9747, "time_per_iteration": 2.503321647644043 }, { "auxiliary_loss_clip": 0.01113025, "auxiliary_loss_mlp": 0.01030623, "balance_loss_clip": 1.01785612, "balance_loss_mlp": 1.04183495, "epoch": 0.5860814670073651, "flos": 18947583141120.0, "grad_norm": 2.0444292541902533, "language_loss": 0.81485862, "learning_rate": 1.543520710142051e-06, "loss": 0.83629507, "num_input_tokens_seen": 210132770, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 9748, "time_per_iteration": 2.458793878555298 }, { "auxiliary_loss_clip": 0.01112029, "auxiliary_loss_mlp": 0.01031609, "balance_loss_clip": 1.01890159, "balance_loss_mlp": 1.04114485, "epoch": 0.5861415902600331, "flos": 22561848512640.0, "grad_norm": 2.0515288216146983, "language_loss": 0.72244555, "learning_rate": 1.5431415369652375e-06, "loss": 0.74388194, "num_input_tokens_seen": 210151895, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 9749, "time_per_iteration": 2.4833738803863525 }, { "auxiliary_loss_clip": 0.01109341, "auxiliary_loss_mlp": 0.01032375, "balance_loss_clip": 1.01972651, "balance_loss_mlp": 1.0414083, "epoch": 0.586201713512701, "flos": 14392027912320.0, "grad_norm": 2.204843693482695, "language_loss": 0.74938273, "learning_rate": 1.5427623811125428e-06, "loss": 0.77079988, "num_input_tokens_seen": 210168040, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 9750, "time_per_iteration": 2.5035858154296875 }, { "auxiliary_loss_clip": 0.01109915, "auxiliary_loss_mlp": 0.01032888, "balance_loss_clip": 1.01966763, "balance_loss_mlp": 1.04059184, "epoch": 0.586261836765369, "flos": 19498560837120.0, "grad_norm": 1.6827128166993912, "language_loss": 0.70963049, "learning_rate": 1.542383242598344e-06, "loss": 0.73105848, "num_input_tokens_seen": 210187720, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69140625, "step": 9751, "time_per_iteration": 2.4706602096557617 }, { "auxiliary_loss_clip": 0.01112747, "auxiliary_loss_mlp": 0.01030886, "balance_loss_clip": 1.01706958, "balance_loss_mlp": 1.03976083, "epoch": 0.5863219600180369, "flos": 20701819560960.0, "grad_norm": 2.7726793852955383, "language_loss": 0.74478817, "learning_rate": 1.5420041214370184e-06, "loss": 0.7662245, "num_input_tokens_seen": 210206080, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.73046875, "step": 9752, "time_per_iteration": 2.4799933433532715 }, { "auxiliary_loss_clip": 0.01109392, "auxiliary_loss_mlp": 0.01031272, "balance_loss_clip": 1.01897585, "balance_loss_mlp": 1.04061604, "epoch": 0.586382083270705, "flos": 19792130693760.0, "grad_norm": 1.8336403903536056, "language_loss": 0.77276254, "learning_rate": 1.541625017642943e-06, "loss": 0.79416919, "num_input_tokens_seen": 210225660, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 9753, "time_per_iteration": 2.4715800285339355 }, { "auxiliary_loss_clip": 0.01106276, "auxiliary_loss_mlp": 0.01024787, "balance_loss_clip": 1.01322949, "balance_loss_mlp": 1.04006076, "epoch": 0.5864422065233729, "flos": 16500558130560.0, "grad_norm": 1.7581800664082976, "language_loss": 0.71688533, "learning_rate": 1.5412459312304927e-06, "loss": 0.73819602, "num_input_tokens_seen": 210242725, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.66015625, "step": 9754, "time_per_iteration": 2.4528353214263916 }, { "auxiliary_loss_clip": 0.01109956, "auxiliary_loss_mlp": 0.01029271, "balance_loss_clip": 1.01675999, "balance_loss_mlp": 1.03935814, "epoch": 0.5865023297760409, "flos": 20413277608320.0, "grad_norm": 1.6388839810672846, "language_loss": 0.72154081, "learning_rate": 1.540866862214043e-06, "loss": 0.74293303, "num_input_tokens_seen": 210263225, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 9755, "time_per_iteration": 3.8993325233459473 }, { "auxiliary_loss_clip": 0.01038857, "auxiliary_loss_mlp": 0.01007756, "balance_loss_clip": 1.00643861, "balance_loss_mlp": 1.01494539, "epoch": 0.5865624530287089, "flos": 63350769254400.0, "grad_norm": 0.745359266039599, "language_loss": 0.56908977, "learning_rate": 1.540487810607967e-06, "loss": 0.58955586, "num_input_tokens_seen": 210322310, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.23925781, "step": 9756, "time_per_iteration": 3.0772545337677 }, { "auxiliary_loss_clip": 0.01106621, "auxiliary_loss_mlp": 0.01028604, "balance_loss_clip": 1.01698732, "balance_loss_mlp": 1.03829372, "epoch": 0.5866225762813768, "flos": 27016279977600.0, "grad_norm": 2.1984164878214085, "language_loss": 0.75877565, "learning_rate": 1.5401087764266396e-06, "loss": 0.78012788, "num_input_tokens_seen": 210340845, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.68359375, "step": 9757, "time_per_iteration": 4.0956549644470215 }, { "auxiliary_loss_clip": 0.01038772, "auxiliary_loss_mlp": 0.01004633, "balance_loss_clip": 1.00332165, "balance_loss_mlp": 1.01472485, "epoch": 0.5866826995340448, "flos": 72987038507520.0, "grad_norm": 0.8524658861044603, "language_loss": 0.60559583, "learning_rate": 1.5397297596844337e-06, "loss": 0.62602985, "num_input_tokens_seen": 210397815, "router_z_loss_clip": 0.01312256, "router_z_loss_mlp": 0.24023438, "step": 9758, "time_per_iteration": 4.527001142501831 }, { "auxiliary_loss_clip": 0.01115956, "auxiliary_loss_mlp": 0.01028285, "balance_loss_clip": 1.0154109, "balance_loss_mlp": 1.04218388, "epoch": 0.5867428227867127, "flos": 21285727050240.0, "grad_norm": 2.215705615435462, "language_loss": 0.71993047, "learning_rate": 1.5393507603957212e-06, "loss": 0.74137288, "num_input_tokens_seen": 210413900, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 9759, "time_per_iteration": 2.48225474357605 }, { "auxiliary_loss_clip": 0.01112155, "auxiliary_loss_mlp": 0.01028756, "balance_loss_clip": 1.01735413, "balance_loss_mlp": 1.04215848, "epoch": 0.5868029460393808, "flos": 33468852188160.0, "grad_norm": 1.5234638709595243, "language_loss": 0.73267221, "learning_rate": 1.5389717785748742e-06, "loss": 0.75408137, "num_input_tokens_seen": 210434110, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.703125, "step": 9760, "time_per_iteration": 4.001410484313965 }, { "auxiliary_loss_clip": 0.01109115, "auxiliary_loss_mlp": 0.01029814, "balance_loss_clip": 1.01747584, "balance_loss_mlp": 1.03952289, "epoch": 0.5868630692920487, "flos": 17889475276800.0, "grad_norm": 1.8686130074193918, "language_loss": 0.72734904, "learning_rate": 1.5385928142362637e-06, "loss": 0.74873829, "num_input_tokens_seen": 210451685, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 9761, "time_per_iteration": 2.4741389751434326 }, { "auxiliary_loss_clip": 0.0111012, "auxiliary_loss_mlp": 0.01025863, "balance_loss_clip": 1.01241064, "balance_loss_mlp": 1.03767717, "epoch": 0.5869231925447167, "flos": 21035035054080.0, "grad_norm": 1.8982310725330813, "language_loss": 0.74948895, "learning_rate": 1.5382138673942597e-06, "loss": 0.77084875, "num_input_tokens_seen": 210470825, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 9762, "time_per_iteration": 2.5127594470977783 }, { "auxiliary_loss_clip": 0.01108674, "auxiliary_loss_mlp": 0.01031305, "balance_loss_clip": 1.01838291, "balance_loss_mlp": 1.03998375, "epoch": 0.5869833157973846, "flos": 74738219293440.0, "grad_norm": 1.4450952283945717, "language_loss": 0.72413623, "learning_rate": 1.5378349380632317e-06, "loss": 0.74553609, "num_input_tokens_seen": 210500075, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 9763, "time_per_iteration": 2.875074863433838 }, { "auxiliary_loss_clip": 0.01107394, "auxiliary_loss_mlp": 0.01028715, "balance_loss_clip": 1.01713943, "balance_loss_mlp": 1.03852868, "epoch": 0.5870434390500526, "flos": 17638998762240.0, "grad_norm": 1.6466812775696262, "language_loss": 0.80182439, "learning_rate": 1.53745602625755e-06, "loss": 0.82318544, "num_input_tokens_seen": 210518150, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.69140625, "step": 9764, "time_per_iteration": 2.4404242038726807 }, { "auxiliary_loss_clip": 0.01109297, "auxiliary_loss_mlp": 0.01029197, "balance_loss_clip": 1.01706183, "balance_loss_mlp": 1.03926277, "epoch": 0.5871035623027205, "flos": 21506146859520.0, "grad_norm": 1.7190233487741315, "language_loss": 0.78956461, "learning_rate": 1.5370771319915819e-06, "loss": 0.81094962, "num_input_tokens_seen": 210537760, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69921875, "step": 9765, "time_per_iteration": 2.4797918796539307 }, { "auxiliary_loss_clip": 0.01107408, "auxiliary_loss_mlp": 0.01030108, "balance_loss_clip": 1.01754379, "balance_loss_mlp": 1.03933525, "epoch": 0.5871636855553886, "flos": 13551861818880.0, "grad_norm": 1.8773824254178468, "language_loss": 0.83600211, "learning_rate": 1.5366982552796947e-06, "loss": 0.85737729, "num_input_tokens_seen": 210555515, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 9766, "time_per_iteration": 2.4390883445739746 }, { "auxiliary_loss_clip": 0.01112723, "auxiliary_loss_mlp": 0.01031929, "balance_loss_clip": 1.01950741, "balance_loss_mlp": 1.04036093, "epoch": 0.5872238088080565, "flos": 26212922346240.0, "grad_norm": 1.594914374348604, "language_loss": 0.69806767, "learning_rate": 1.536319396136257e-06, "loss": 0.71951425, "num_input_tokens_seen": 210575000, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.72265625, "step": 9767, "time_per_iteration": 2.525932550430298 }, { "auxiliary_loss_clip": 0.0111139, "auxiliary_loss_mlp": 0.01031418, "balance_loss_clip": 1.01862657, "balance_loss_mlp": 1.0393517, "epoch": 0.5872839320607245, "flos": 30665198995200.0, "grad_norm": 1.9523979613199367, "language_loss": 0.62881494, "learning_rate": 1.5359405545756336e-06, "loss": 0.65024304, "num_input_tokens_seen": 210595185, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 9768, "time_per_iteration": 2.5523431301116943 }, { "auxiliary_loss_clip": 0.01038387, "auxiliary_loss_mlp": 0.0100402, "balance_loss_clip": 1.00265527, "balance_loss_mlp": 1.01447439, "epoch": 0.5873440553133924, "flos": 60303570871680.0, "grad_norm": 0.7406139577454233, "language_loss": 0.53954065, "learning_rate": 1.5355617306121914e-06, "loss": 0.55996472, "num_input_tokens_seen": 210653210, "router_z_loss_clip": 0.01367188, "router_z_loss_mlp": 0.23828125, "step": 9769, "time_per_iteration": 3.1027791500091553 }, { "auxiliary_loss_clip": 0.01107504, "auxiliary_loss_mlp": 0.01031317, "balance_loss_clip": 1.01921129, "balance_loss_mlp": 1.03847384, "epoch": 0.5874041785660604, "flos": 21539292134400.0, "grad_norm": 1.4856619226184813, "language_loss": 0.70818931, "learning_rate": 1.5351829242602945e-06, "loss": 0.72957754, "num_input_tokens_seen": 210673750, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 9770, "time_per_iteration": 2.4819819927215576 }, { "auxiliary_loss_clip": 0.01107848, "auxiliary_loss_mlp": 0.01031617, "balance_loss_clip": 1.01896262, "balance_loss_mlp": 1.03915834, "epoch": 0.5874643018187284, "flos": 24388947671040.0, "grad_norm": 4.427670657482369, "language_loss": 0.67586803, "learning_rate": 1.5348041355343077e-06, "loss": 0.69726264, "num_input_tokens_seen": 210692960, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 9771, "time_per_iteration": 2.5247175693511963 }, { "auxiliary_loss_clip": 0.0111045, "auxiliary_loss_mlp": 0.010333, "balance_loss_clip": 1.02028203, "balance_loss_mlp": 1.03860855, "epoch": 0.5875244250713964, "flos": 28147717457280.0, "grad_norm": 1.765470753339395, "language_loss": 0.6619401, "learning_rate": 1.5344253644485954e-06, "loss": 0.68337762, "num_input_tokens_seen": 210714040, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 9772, "time_per_iteration": 2.5459935665130615 }, { "auxiliary_loss_clip": 0.01112149, "auxiliary_loss_mlp": 0.01040084, "balance_loss_clip": 1.02645195, "balance_loss_mlp": 1.03966427, "epoch": 0.5875845483240644, "flos": 25812410722560.0, "grad_norm": 1.6053410542514999, "language_loss": 0.74482727, "learning_rate": 1.534046611017519e-06, "loss": 0.76634955, "num_input_tokens_seen": 210733710, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 9773, "time_per_iteration": 2.5187265872955322 }, { "auxiliary_loss_clip": 0.011099, "auxiliary_loss_mlp": 0.01037099, "balance_loss_clip": 1.02368224, "balance_loss_mlp": 1.03965974, "epoch": 0.5876446715767323, "flos": 26906572863360.0, "grad_norm": 9.341056403573106, "language_loss": 0.5388391, "learning_rate": 1.5336678752554421e-06, "loss": 0.56030905, "num_input_tokens_seen": 210753580, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 9774, "time_per_iteration": 2.5381906032562256 }, { "auxiliary_loss_clip": 0.01111638, "auxiliary_loss_mlp": 0.01034075, "balance_loss_clip": 1.02114069, "balance_loss_mlp": 1.04108644, "epoch": 0.5877047948294003, "flos": 36684832579200.0, "grad_norm": 6.018393738290335, "language_loss": 0.64676327, "learning_rate": 1.5332891571767264e-06, "loss": 0.6682204, "num_input_tokens_seen": 210773495, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 9775, "time_per_iteration": 2.607436418533325 }, { "auxiliary_loss_clip": 0.01108359, "auxiliary_loss_mlp": 0.0103244, "balance_loss_clip": 1.01991153, "balance_loss_mlp": 1.03841639, "epoch": 0.5877649180820682, "flos": 26724721282560.0, "grad_norm": 2.2067455419457325, "language_loss": 0.73547173, "learning_rate": 1.5329104567957326e-06, "loss": 0.75687969, "num_input_tokens_seen": 210793645, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 9776, "time_per_iteration": 2.5528862476348877 }, { "auxiliary_loss_clip": 0.01110366, "auxiliary_loss_mlp": 0.01032548, "balance_loss_clip": 1.02060962, "balance_loss_mlp": 1.03989697, "epoch": 0.5878250413347362, "flos": 21032197879680.0, "grad_norm": 1.9637883344223015, "language_loss": 0.74363762, "learning_rate": 1.532531774126821e-06, "loss": 0.7650668, "num_input_tokens_seen": 210813415, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.703125, "step": 9777, "time_per_iteration": 2.4893763065338135 }, { "auxiliary_loss_clip": 0.01107428, "auxiliary_loss_mlp": 0.01033237, "balance_loss_clip": 1.02165604, "balance_loss_mlp": 1.03970361, "epoch": 0.5878851645874041, "flos": 25484259047040.0, "grad_norm": 1.5312065543450804, "language_loss": 0.74308598, "learning_rate": 1.5321531091843512e-06, "loss": 0.76449263, "num_input_tokens_seen": 210833850, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.67578125, "step": 9778, "time_per_iteration": 2.5331192016601562 }, { "auxiliary_loss_clip": 0.01108211, "auxiliary_loss_mlp": 0.01028092, "balance_loss_clip": 1.01575375, "balance_loss_mlp": 1.03851068, "epoch": 0.5879452878400722, "flos": 23769129559680.0, "grad_norm": 3.0477294254218203, "language_loss": 0.70225805, "learning_rate": 1.5317744619826824e-06, "loss": 0.72362101, "num_input_tokens_seen": 210853115, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 9779, "time_per_iteration": 2.614654541015625 }, { "auxiliary_loss_clip": 0.01111475, "auxiliary_loss_mlp": 0.01031348, "balance_loss_clip": 1.01847911, "balance_loss_mlp": 1.0389179, "epoch": 0.5880054110927401, "flos": 17824513530240.0, "grad_norm": 1.9579200159238788, "language_loss": 0.66981089, "learning_rate": 1.5313958325361727e-06, "loss": 0.69123912, "num_input_tokens_seen": 210872090, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 9780, "time_per_iteration": 2.5098438262939453 }, { "auxiliary_loss_clip": 0.01112709, "auxiliary_loss_mlp": 0.01036944, "balance_loss_clip": 1.02332413, "balance_loss_mlp": 1.04165888, "epoch": 0.5880655343454081, "flos": 19463404400640.0, "grad_norm": 6.955184654422184, "language_loss": 0.72563481, "learning_rate": 1.5310172208591807e-06, "loss": 0.74713135, "num_input_tokens_seen": 210888490, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7109375, "step": 9781, "time_per_iteration": 2.4576199054718018 }, { "auxiliary_loss_clip": 0.01109316, "auxiliary_loss_mlp": 0.01032917, "balance_loss_clip": 1.02085912, "balance_loss_mlp": 1.03880072, "epoch": 0.588125657598076, "flos": 21397588980480.0, "grad_norm": 1.3814264587076874, "language_loss": 0.70461893, "learning_rate": 1.5306386269660622e-06, "loss": 0.72604132, "num_input_tokens_seen": 210908220, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.70703125, "step": 9782, "time_per_iteration": 2.5048177242279053 }, { "auxiliary_loss_clip": 0.01111731, "auxiliary_loss_mlp": 0.01027564, "balance_loss_clip": 1.01497567, "balance_loss_mlp": 1.03839314, "epoch": 0.588185780850744, "flos": 16034653797120.0, "grad_norm": 2.3983687766776884, "language_loss": 0.70106637, "learning_rate": 1.5302600508711741e-06, "loss": 0.72245932, "num_input_tokens_seen": 210923945, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.734375, "step": 9783, "time_per_iteration": 2.4332449436187744 }, { "auxiliary_loss_clip": 0.01114352, "auxiliary_loss_mlp": 0.0103412, "balance_loss_clip": 1.02044702, "balance_loss_mlp": 1.04204559, "epoch": 0.588245904103412, "flos": 23728226947200.0, "grad_norm": 2.019425524894993, "language_loss": 0.69259179, "learning_rate": 1.5298814925888719e-06, "loss": 0.71407652, "num_input_tokens_seen": 210941955, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 9784, "time_per_iteration": 2.50590181350708 }, { "auxiliary_loss_clip": 0.01111205, "auxiliary_loss_mlp": 0.01031394, "balance_loss_clip": 1.01828694, "balance_loss_mlp": 1.03804004, "epoch": 0.58830602735608, "flos": 33802534558080.0, "grad_norm": 2.0855183225688982, "language_loss": 0.69433385, "learning_rate": 1.5295029521335102e-06, "loss": 0.71575975, "num_input_tokens_seen": 210963105, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 9785, "time_per_iteration": 2.580104351043701 }, { "auxiliary_loss_clip": 0.01107048, "auxiliary_loss_mlp": 0.01026113, "balance_loss_clip": 1.01435935, "balance_loss_mlp": 1.03796673, "epoch": 0.588366150608748, "flos": 17090714586240.0, "grad_norm": 1.947550938618884, "language_loss": 0.77649999, "learning_rate": 1.5291244295194448e-06, "loss": 0.79783165, "num_input_tokens_seen": 210978720, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 9786, "time_per_iteration": 2.470684289932251 }, { "auxiliary_loss_clip": 0.01111959, "auxiliary_loss_mlp": 0.01034781, "balance_loss_clip": 1.02206707, "balance_loss_mlp": 1.03997409, "epoch": 0.5884262738614159, "flos": 22127186033280.0, "grad_norm": 1.5788229788674277, "language_loss": 0.79463631, "learning_rate": 1.5287459247610276e-06, "loss": 0.8161037, "num_input_tokens_seen": 210998750, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 9787, "time_per_iteration": 2.491441249847412 }, { "auxiliary_loss_clip": 0.01109874, "auxiliary_loss_mlp": 0.01029558, "balance_loss_clip": 1.0179652, "balance_loss_mlp": 1.03917193, "epoch": 0.5884863971140839, "flos": 21031838743680.0, "grad_norm": 1.514778719316853, "language_loss": 0.66286242, "learning_rate": 1.5283674378726116e-06, "loss": 0.68425673, "num_input_tokens_seen": 211017550, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.70703125, "step": 9788, "time_per_iteration": 2.49059796333313 }, { "auxiliary_loss_clip": 0.01108034, "auxiliary_loss_mlp": 0.01030513, "balance_loss_clip": 1.0178355, "balance_loss_mlp": 1.03997767, "epoch": 0.5885465203667518, "flos": 23805112008960.0, "grad_norm": 2.9772357997852503, "language_loss": 0.80307341, "learning_rate": 1.5279889688685506e-06, "loss": 0.82445884, "num_input_tokens_seen": 211034135, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 9789, "time_per_iteration": 2.506199359893799 }, { "auxiliary_loss_clip": 0.01107571, "auxiliary_loss_mlp": 0.01028801, "balance_loss_clip": 1.01664758, "balance_loss_mlp": 1.03880668, "epoch": 0.5886066436194198, "flos": 18880574319360.0, "grad_norm": 1.664722659519154, "language_loss": 0.70399928, "learning_rate": 1.5276105177631944e-06, "loss": 0.72536296, "num_input_tokens_seen": 211053850, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 9790, "time_per_iteration": 2.5061287879943848 }, { "auxiliary_loss_clip": 0.01108737, "auxiliary_loss_mlp": 0.01032877, "balance_loss_clip": 1.02033663, "balance_loss_mlp": 1.0399307, "epoch": 0.5886667668720877, "flos": 24790141653120.0, "grad_norm": 1.9529757080485732, "language_loss": 0.82957393, "learning_rate": 1.527232084570895e-06, "loss": 0.85099006, "num_input_tokens_seen": 211072165, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 9791, "time_per_iteration": 2.505340099334717 }, { "auxiliary_loss_clip": 0.01112783, "auxiliary_loss_mlp": 0.01032911, "balance_loss_clip": 1.02044201, "balance_loss_mlp": 1.04171848, "epoch": 0.5887268901247558, "flos": 21614381516160.0, "grad_norm": 1.6401892327806689, "language_loss": 0.76695877, "learning_rate": 1.5268536693060026e-06, "loss": 0.78841573, "num_input_tokens_seen": 211089630, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7109375, "step": 9792, "time_per_iteration": 2.508950710296631 }, { "auxiliary_loss_clip": 0.01112727, "auxiliary_loss_mlp": 0.01032334, "balance_loss_clip": 1.01924503, "balance_loss_mlp": 1.0396843, "epoch": 0.5887870133774237, "flos": 20481722974080.0, "grad_norm": 1.962928187562476, "language_loss": 0.6929332, "learning_rate": 1.5264752719828662e-06, "loss": 0.71438378, "num_input_tokens_seen": 211106120, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 9793, "time_per_iteration": 2.5030288696289062 }, { "auxiliary_loss_clip": 0.01107768, "auxiliary_loss_mlp": 0.01030501, "balance_loss_clip": 1.01803207, "balance_loss_mlp": 1.03949165, "epoch": 0.5888471366300917, "flos": 19206283870080.0, "grad_norm": 1.8523016703882387, "language_loss": 0.6045562, "learning_rate": 1.5260968926158353e-06, "loss": 0.62593895, "num_input_tokens_seen": 211122450, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.68359375, "step": 9794, "time_per_iteration": 2.4731569290161133 }, { "auxiliary_loss_clip": 0.01111888, "auxiliary_loss_mlp": 0.01036071, "balance_loss_clip": 1.02356577, "balance_loss_mlp": 1.04120922, "epoch": 0.5889072598827596, "flos": 19972904866560.0, "grad_norm": 1.6544817859901537, "language_loss": 0.65232664, "learning_rate": 1.525718531219257e-06, "loss": 0.67380619, "num_input_tokens_seen": 211141765, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 9795, "time_per_iteration": 2.478891372680664 }, { "auxiliary_loss_clip": 0.01108274, "auxiliary_loss_mlp": 0.0103517, "balance_loss_clip": 1.023875, "balance_loss_mlp": 1.03956008, "epoch": 0.5889673831354276, "flos": 20741249715840.0, "grad_norm": 1.6639602903548205, "language_loss": 0.74102217, "learning_rate": 1.5253401878074801e-06, "loss": 0.7624566, "num_input_tokens_seen": 211160475, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6875, "step": 9796, "time_per_iteration": 4.0027196407318115 }, { "auxiliary_loss_clip": 0.01109857, "auxiliary_loss_mlp": 0.01031387, "balance_loss_clip": 1.01917446, "balance_loss_mlp": 1.04003739, "epoch": 0.5890275063880956, "flos": 25300935008640.0, "grad_norm": 1.575367995041608, "language_loss": 0.82875448, "learning_rate": 1.5249618623948507e-06, "loss": 0.85016692, "num_input_tokens_seen": 211180480, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 9797, "time_per_iteration": 2.517937421798706 }, { "auxiliary_loss_clip": 0.01105605, "auxiliary_loss_mlp": 0.01029852, "balance_loss_clip": 1.01748967, "balance_loss_mlp": 1.03802538, "epoch": 0.5890876296407636, "flos": 11765377964160.0, "grad_norm": 2.084173220170559, "language_loss": 0.788728, "learning_rate": 1.5245835549957152e-06, "loss": 0.81008255, "num_input_tokens_seen": 211198000, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.67578125, "step": 9798, "time_per_iteration": 2.459774971008301 }, { "auxiliary_loss_clip": 0.01108381, "auxiliary_loss_mlp": 0.01030884, "balance_loss_clip": 1.01938057, "balance_loss_mlp": 1.04013801, "epoch": 0.5891477528934316, "flos": 13589460380160.0, "grad_norm": 2.2422133402964075, "language_loss": 0.7441045, "learning_rate": 1.5242052656244186e-06, "loss": 0.76549721, "num_input_tokens_seen": 211214765, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.68359375, "step": 9799, "time_per_iteration": 3.8974077701568604 }, { "auxiliary_loss_clip": 0.01113373, "auxiliary_loss_mlp": 0.01031813, "balance_loss_clip": 1.01861644, "balance_loss_mlp": 1.04158783, "epoch": 0.5892078761460995, "flos": 15049193189760.0, "grad_norm": 2.220915120799846, "language_loss": 0.76432967, "learning_rate": 1.5238269942953064e-06, "loss": 0.7857815, "num_input_tokens_seen": 211232335, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 9800, "time_per_iteration": 3.8551273345947266 }, { "auxiliary_loss_clip": 0.01109989, "auxiliary_loss_mlp": 0.01035611, "balance_loss_clip": 1.02370787, "balance_loss_mlp": 1.03908563, "epoch": 0.5892679993987675, "flos": 15778215624960.0, "grad_norm": 2.1964011836553627, "language_loss": 0.78968596, "learning_rate": 1.523448741022722e-06, "loss": 0.81114191, "num_input_tokens_seen": 211249985, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.7109375, "step": 9801, "time_per_iteration": 3.882347822189331 }, { "auxiliary_loss_clip": 0.01111907, "auxiliary_loss_mlp": 0.01029334, "balance_loss_clip": 1.01669741, "balance_loss_mlp": 1.04029799, "epoch": 0.5893281226514354, "flos": 25265203954560.0, "grad_norm": 1.7434947300631087, "language_loss": 0.66093045, "learning_rate": 1.5230705058210088e-06, "loss": 0.68234289, "num_input_tokens_seen": 211268425, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 9802, "time_per_iteration": 2.5029234886169434 }, { "auxiliary_loss_clip": 0.01108988, "auxiliary_loss_mlp": 0.01027493, "balance_loss_clip": 1.01517272, "balance_loss_mlp": 1.04076314, "epoch": 0.5893882459041034, "flos": 19458232842240.0, "grad_norm": 6.211109823050366, "language_loss": 0.78112257, "learning_rate": 1.5226922887045108e-06, "loss": 0.80248743, "num_input_tokens_seen": 211286680, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 9803, "time_per_iteration": 2.4495177268981934 }, { "auxiliary_loss_clip": 0.01110624, "auxiliary_loss_mlp": 0.01033749, "balance_loss_clip": 1.02139854, "balance_loss_mlp": 1.04029226, "epoch": 0.5894483691567713, "flos": 20634056553600.0, "grad_norm": 1.4123020580515995, "language_loss": 0.73244911, "learning_rate": 1.5223140896875686e-06, "loss": 0.75389284, "num_input_tokens_seen": 211307700, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 9804, "time_per_iteration": 2.522810697555542 }, { "auxiliary_loss_clip": 0.01109035, "auxiliary_loss_mlp": 0.01029695, "balance_loss_clip": 1.0177083, "balance_loss_mlp": 1.0409646, "epoch": 0.5895084924094394, "flos": 17778223877760.0, "grad_norm": 1.5608208428241181, "language_loss": 0.75038546, "learning_rate": 1.5219359087845234e-06, "loss": 0.77177274, "num_input_tokens_seen": 211324835, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 9805, "time_per_iteration": 2.4414029121398926 }, { "auxiliary_loss_clip": 0.01115746, "auxiliary_loss_mlp": 0.0103001, "balance_loss_clip": 1.01656342, "balance_loss_mlp": 1.0412724, "epoch": 0.5895686156621073, "flos": 20121072468480.0, "grad_norm": 14.309771358972379, "language_loss": 0.77940166, "learning_rate": 1.5215577460097174e-06, "loss": 0.80085921, "num_input_tokens_seen": 211344130, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 9806, "time_per_iteration": 2.4943583011627197 }, { "auxiliary_loss_clip": 0.01108381, "auxiliary_loss_mlp": 0.01027888, "balance_loss_clip": 1.0151031, "balance_loss_mlp": 1.03847432, "epoch": 0.5896287389147753, "flos": 20850058990080.0, "grad_norm": 2.2336936077626586, "language_loss": 0.77050245, "learning_rate": 1.5211796013774887e-06, "loss": 0.79186511, "num_input_tokens_seen": 211362915, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 9807, "time_per_iteration": 2.483400583267212 }, { "auxiliary_loss_clip": 0.01114894, "auxiliary_loss_mlp": 0.01031733, "balance_loss_clip": 1.01898408, "balance_loss_mlp": 1.0426513, "epoch": 0.5896888621674432, "flos": 14537897043840.0, "grad_norm": 1.888483773063122, "language_loss": 0.74388754, "learning_rate": 1.5208014749021786e-06, "loss": 0.7653538, "num_input_tokens_seen": 211380700, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 9808, "time_per_iteration": 2.473557472229004 }, { "auxiliary_loss_clip": 0.01114953, "auxiliary_loss_mlp": 0.01029551, "balance_loss_clip": 1.01600921, "balance_loss_mlp": 1.04266047, "epoch": 0.5897489854201112, "flos": 20886759711360.0, "grad_norm": 2.3287800042843187, "language_loss": 0.7228564, "learning_rate": 1.5204233665981236e-06, "loss": 0.74430144, "num_input_tokens_seen": 211400095, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.72265625, "step": 9809, "time_per_iteration": 2.48396635055542 }, { "auxiliary_loss_clip": 0.01112622, "auxiliary_loss_mlp": 0.01034632, "balance_loss_clip": 1.0217756, "balance_loss_mlp": 1.04009247, "epoch": 0.5898091086727792, "flos": 20011149872640.0, "grad_norm": 2.698831207743503, "language_loss": 0.81837517, "learning_rate": 1.5200452764796627e-06, "loss": 0.83984774, "num_input_tokens_seen": 211417810, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 9810, "time_per_iteration": 2.474759340286255 }, { "auxiliary_loss_clip": 0.01109231, "auxiliary_loss_mlp": 0.01029424, "balance_loss_clip": 1.01738393, "balance_loss_mlp": 1.0405972, "epoch": 0.5898692319254472, "flos": 16253242012800.0, "grad_norm": 1.7305678550182477, "language_loss": 0.81321716, "learning_rate": 1.5196672045611336e-06, "loss": 0.83460373, "num_input_tokens_seen": 211436020, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 9811, "time_per_iteration": 2.4751315116882324 }, { "auxiliary_loss_clip": 0.01111874, "auxiliary_loss_mlp": 0.01029654, "balance_loss_clip": 1.01652348, "balance_loss_mlp": 1.04033804, "epoch": 0.5899293551781152, "flos": 20448541785600.0, "grad_norm": 2.0434095485583974, "language_loss": 0.76871181, "learning_rate": 1.5192891508568715e-06, "loss": 0.79012716, "num_input_tokens_seen": 211454335, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 9812, "time_per_iteration": 2.486018180847168 }, { "auxiliary_loss_clip": 0.01110077, "auxiliary_loss_mlp": 0.01028049, "balance_loss_clip": 1.0168016, "balance_loss_mlp": 1.04134655, "epoch": 0.5899894784307831, "flos": 13881701433600.0, "grad_norm": 1.800524307705089, "language_loss": 0.70475566, "learning_rate": 1.5189111153812133e-06, "loss": 0.72613692, "num_input_tokens_seen": 211472775, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6875, "step": 9813, "time_per_iteration": 2.454312562942505 }, { "auxiliary_loss_clip": 0.01109966, "auxiliary_loss_mlp": 0.01032088, "balance_loss_clip": 1.01939774, "balance_loss_mlp": 1.03929543, "epoch": 0.5900496016834511, "flos": 20083797129600.0, "grad_norm": 1.7018711205758559, "language_loss": 0.7212429, "learning_rate": 1.518533098148494e-06, "loss": 0.74266344, "num_input_tokens_seen": 211492195, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 9814, "time_per_iteration": 2.503653049468994 }, { "auxiliary_loss_clip": 0.01110768, "auxiliary_loss_mlp": 0.01027444, "balance_loss_clip": 1.01505864, "balance_loss_mlp": 1.04046059, "epoch": 0.590109724936119, "flos": 20259148348800.0, "grad_norm": 1.7209359057733846, "language_loss": 0.78204459, "learning_rate": 1.5181550991730476e-06, "loss": 0.80342674, "num_input_tokens_seen": 211510220, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 9815, "time_per_iteration": 2.4780569076538086 }, { "auxiliary_loss_clip": 0.01115592, "auxiliary_loss_mlp": 0.0103491, "balance_loss_clip": 1.02166033, "balance_loss_mlp": 1.04122019, "epoch": 0.590169848188787, "flos": 24235069806720.0, "grad_norm": 2.186824548329141, "language_loss": 0.75978518, "learning_rate": 1.5177771184692083e-06, "loss": 0.78129017, "num_input_tokens_seen": 211526260, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 9816, "time_per_iteration": 2.5013349056243896 }, { "auxiliary_loss_clip": 0.01113204, "auxiliary_loss_mlp": 0.01032295, "balance_loss_clip": 1.01999831, "balance_loss_mlp": 1.04376411, "epoch": 0.590229971441455, "flos": 17784724239360.0, "grad_norm": 2.476473566945619, "language_loss": 0.81086516, "learning_rate": 1.517399156051309e-06, "loss": 0.83232015, "num_input_tokens_seen": 211542890, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 9817, "time_per_iteration": 2.450209140777588 }, { "auxiliary_loss_clip": 0.01113195, "auxiliary_loss_mlp": 0.01033696, "balance_loss_clip": 1.02172685, "balance_loss_mlp": 1.04232502, "epoch": 0.590290094694123, "flos": 22236893147520.0, "grad_norm": 1.9220480400294617, "language_loss": 0.76501048, "learning_rate": 1.517021211933682e-06, "loss": 0.78647935, "num_input_tokens_seen": 211562685, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.7109375, "step": 9818, "time_per_iteration": 2.4945242404937744 }, { "auxiliary_loss_clip": 0.01108416, "auxiliary_loss_mlp": 0.01028963, "balance_loss_clip": 1.017501, "balance_loss_mlp": 1.04024017, "epoch": 0.5903502179467909, "flos": 19098623831040.0, "grad_norm": 4.39261438306437, "language_loss": 0.66907418, "learning_rate": 1.5166432861306592e-06, "loss": 0.69044793, "num_input_tokens_seen": 211579960, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6796875, "step": 9819, "time_per_iteration": 2.466085910797119 }, { "auxiliary_loss_clip": 0.01113405, "auxiliary_loss_mlp": 0.01030617, "balance_loss_clip": 1.01823676, "balance_loss_mlp": 1.0429498, "epoch": 0.5904103411994589, "flos": 24235500769920.0, "grad_norm": 2.2769828382568083, "language_loss": 0.78161985, "learning_rate": 1.5162653786565714e-06, "loss": 0.80306, "num_input_tokens_seen": 211599310, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 9820, "time_per_iteration": 2.512885808944702 }, { "auxiliary_loss_clip": 0.0104195, "auxiliary_loss_mlp": 0.0100024, "balance_loss_clip": 0.99870247, "balance_loss_mlp": 1.01811099, "epoch": 0.5904704644521268, "flos": 64876613045760.0, "grad_norm": 0.9312367970351948, "language_loss": 0.65101147, "learning_rate": 1.5158874895257487e-06, "loss": 0.67143333, "num_input_tokens_seen": 211658790, "router_z_loss_clip": 0.01531982, "router_z_loss_mlp": 0.23828125, "step": 9821, "time_per_iteration": 3.075815439224243 }, { "auxiliary_loss_clip": 0.01110064, "auxiliary_loss_mlp": 0.01028742, "balance_loss_clip": 1.01675606, "balance_loss_mlp": 1.04114461, "epoch": 0.5905305877047948, "flos": 19609991804160.0, "grad_norm": 1.9862617487901275, "language_loss": 0.61267674, "learning_rate": 1.515509618752521e-06, "loss": 0.63406479, "num_input_tokens_seen": 211677240, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 9822, "time_per_iteration": 2.477829694747925 }, { "auxiliary_loss_clip": 0.01113616, "auxiliary_loss_mlp": 0.01035771, "balance_loss_clip": 1.02299166, "balance_loss_mlp": 1.04124272, "epoch": 0.5905907109574628, "flos": 18989634988800.0, "grad_norm": 1.8409731862525665, "language_loss": 0.82693839, "learning_rate": 1.5151317663512173e-06, "loss": 0.84843218, "num_input_tokens_seen": 211695485, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 9823, "time_per_iteration": 2.470532178878784 }, { "auxiliary_loss_clip": 0.01109856, "auxiliary_loss_mlp": 0.01026853, "balance_loss_clip": 1.01383519, "balance_loss_mlp": 1.04111767, "epoch": 0.5906508342101308, "flos": 22200407907840.0, "grad_norm": 2.2807066467999015, "language_loss": 0.73237234, "learning_rate": 1.514753932336165e-06, "loss": 0.75373936, "num_input_tokens_seen": 211713090, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6875, "step": 9824, "time_per_iteration": 2.514829397201538 }, { "auxiliary_loss_clip": 0.01118656, "auxiliary_loss_mlp": 0.01030845, "balance_loss_clip": 1.01655209, "balance_loss_mlp": 1.04145479, "epoch": 0.5907109574627988, "flos": 20886687884160.0, "grad_norm": 2.6455114397326045, "language_loss": 0.82900167, "learning_rate": 1.514376116721693e-06, "loss": 0.85049677, "num_input_tokens_seen": 211732510, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76953125, "step": 9825, "time_per_iteration": 2.4701874256134033 }, { "auxiliary_loss_clip": 0.01107243, "auxiliary_loss_mlp": 0.01031209, "balance_loss_clip": 1.02043211, "balance_loss_mlp": 1.04061365, "epoch": 0.5907710807154667, "flos": 21506649649920.0, "grad_norm": 1.697002444305767, "language_loss": 0.76659739, "learning_rate": 1.5139983195221272e-06, "loss": 0.78798187, "num_input_tokens_seen": 211748695, "router_z_loss_clip": 0.10791016, "router_z_loss_mlp": 0.6640625, "step": 9826, "time_per_iteration": 2.4942615032196045 }, { "auxiliary_loss_clip": 0.01108351, "auxiliary_loss_mlp": 0.01025381, "balance_loss_clip": 1.01390731, "balance_loss_mlp": 1.03940058, "epoch": 0.5908312039681347, "flos": 22018376759040.0, "grad_norm": 2.2748781249314627, "language_loss": 0.72183537, "learning_rate": 1.513620540751793e-06, "loss": 0.74317271, "num_input_tokens_seen": 211768545, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6875, "step": 9827, "time_per_iteration": 2.481290102005005 }, { "auxiliary_loss_clip": 0.01110272, "auxiliary_loss_mlp": 0.01029526, "balance_loss_clip": 1.0177834, "balance_loss_mlp": 1.03977525, "epoch": 0.5908913272208026, "flos": 18479523991680.0, "grad_norm": 1.8434443146504778, "language_loss": 0.79447693, "learning_rate": 1.5132427804250178e-06, "loss": 0.81587487, "num_input_tokens_seen": 211786665, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.70703125, "step": 9828, "time_per_iteration": 2.4726181030273438 }, { "auxiliary_loss_clip": 0.01115142, "auxiliary_loss_mlp": 0.01031966, "balance_loss_clip": 1.01891804, "balance_loss_mlp": 1.04285634, "epoch": 0.5909514504734706, "flos": 12312189682560.0, "grad_norm": 2.1900288980440545, "language_loss": 0.87848794, "learning_rate": 1.5128650385561241e-06, "loss": 0.89995897, "num_input_tokens_seen": 211801215, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 9829, "time_per_iteration": 2.4968831539154053 }, { "auxiliary_loss_clip": 0.01040575, "auxiliary_loss_mlp": 0.01002429, "balance_loss_clip": 1.00123084, "balance_loss_mlp": 1.01695919, "epoch": 0.5910115737261386, "flos": 70213262451840.0, "grad_norm": 0.7650135388791943, "language_loss": 0.57908517, "learning_rate": 1.5124873151594376e-06, "loss": 0.5995152, "num_input_tokens_seen": 211857005, "router_z_loss_clip": 0.01196289, "router_z_loss_mlp": 0.23632812, "step": 9830, "time_per_iteration": 3.052504539489746 }, { "auxiliary_loss_clip": 0.01120262, "auxiliary_loss_mlp": 0.01032467, "balance_loss_clip": 1.018466, "balance_loss_mlp": 1.043962, "epoch": 0.5910716969788066, "flos": 22017766227840.0, "grad_norm": 3.29642005509467, "language_loss": 0.75759971, "learning_rate": 1.5121096102492812e-06, "loss": 0.779127, "num_input_tokens_seen": 211876675, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76171875, "step": 9831, "time_per_iteration": 2.4911952018737793 }, { "auxiliary_loss_clip": 0.01108381, "auxiliary_loss_mlp": 0.01030738, "balance_loss_clip": 1.01880527, "balance_loss_mlp": 1.04142046, "epoch": 0.5911318202314745, "flos": 21251648021760.0, "grad_norm": 1.9062479964787344, "language_loss": 0.77685964, "learning_rate": 1.5117319238399767e-06, "loss": 0.79825085, "num_input_tokens_seen": 211895725, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.66796875, "step": 9832, "time_per_iteration": 2.493577718734741 }, { "auxiliary_loss_clip": 0.01108812, "auxiliary_loss_mlp": 0.0102506, "balance_loss_clip": 1.01313925, "balance_loss_mlp": 1.03908861, "epoch": 0.5911919434841425, "flos": 17821604528640.0, "grad_norm": 1.9263945931948976, "language_loss": 0.8316856, "learning_rate": 1.511354255945847e-06, "loss": 0.85302436, "num_input_tokens_seen": 211913860, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69921875, "step": 9833, "time_per_iteration": 2.4483413696289062 }, { "auxiliary_loss_clip": 0.01111929, "auxiliary_loss_mlp": 0.01032798, "balance_loss_clip": 1.02017975, "balance_loss_mlp": 1.04101956, "epoch": 0.5912520667368104, "flos": 20374781207040.0, "grad_norm": 1.558244757340326, "language_loss": 0.74469125, "learning_rate": 1.5109766065812123e-06, "loss": 0.76613855, "num_input_tokens_seen": 211932880, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 9834, "time_per_iteration": 2.4864284992218018 }, { "auxiliary_loss_clip": 0.01110855, "auxiliary_loss_mlp": 0.01031969, "balance_loss_clip": 1.02008343, "balance_loss_mlp": 1.03942084, "epoch": 0.5913121899894784, "flos": 17930557457280.0, "grad_norm": 2.622916341760061, "language_loss": 0.78340477, "learning_rate": 1.5105989757603942e-06, "loss": 0.80483299, "num_input_tokens_seen": 211948625, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.71484375, "step": 9835, "time_per_iteration": 2.4559502601623535 }, { "auxiliary_loss_clip": 0.01113114, "auxiliary_loss_mlp": 0.01032822, "balance_loss_clip": 1.02034104, "balance_loss_mlp": 1.04172778, "epoch": 0.5913723132421465, "flos": 22126934638080.0, "grad_norm": 1.9665927183294967, "language_loss": 0.73698986, "learning_rate": 1.5102213634977117e-06, "loss": 0.7584492, "num_input_tokens_seen": 211965355, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71484375, "step": 9836, "time_per_iteration": 2.4870412349700928 }, { "auxiliary_loss_clip": 0.01112627, "auxiliary_loss_mlp": 0.01028178, "balance_loss_clip": 1.01585722, "balance_loss_mlp": 1.04148579, "epoch": 0.5914324364948144, "flos": 15697918771200.0, "grad_norm": 2.079250372635015, "language_loss": 0.82217306, "learning_rate": 1.5098437698074841e-06, "loss": 0.84358114, "num_input_tokens_seen": 211982245, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 9837, "time_per_iteration": 2.4547224044799805 }, { "auxiliary_loss_clip": 0.0111194, "auxiliary_loss_mlp": 0.01028924, "balance_loss_clip": 1.01609707, "balance_loss_mlp": 1.04068589, "epoch": 0.5914925597474824, "flos": 22747327367040.0, "grad_norm": 1.7466479535784147, "language_loss": 0.7950893, "learning_rate": 1.5094661947040304e-06, "loss": 0.81649792, "num_input_tokens_seen": 212000250, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 9838, "time_per_iteration": 3.8743014335632324 }, { "auxiliary_loss_clip": 0.01112287, "auxiliary_loss_mlp": 0.01034051, "balance_loss_clip": 1.02118874, "balance_loss_mlp": 1.04097676, "epoch": 0.5915526830001503, "flos": 18292788161280.0, "grad_norm": 1.8266967084639805, "language_loss": 0.69493443, "learning_rate": 1.5090886382016673e-06, "loss": 0.71639788, "num_input_tokens_seen": 212017505, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 9839, "time_per_iteration": 2.4741883277893066 }, { "auxiliary_loss_clip": 0.01113632, "auxiliary_loss_mlp": 0.01037891, "balance_loss_clip": 1.0249567, "balance_loss_mlp": 1.0413934, "epoch": 0.5916128062528183, "flos": 17019072910080.0, "grad_norm": 2.0907441229480344, "language_loss": 0.65717793, "learning_rate": 1.5087111003147124e-06, "loss": 0.67869312, "num_input_tokens_seen": 212034595, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 9840, "time_per_iteration": 2.4536566734313965 }, { "auxiliary_loss_clip": 0.01113015, "auxiliary_loss_mlp": 0.01030669, "balance_loss_clip": 1.01788402, "balance_loss_mlp": 1.04013467, "epoch": 0.5916729295054862, "flos": 24754231031040.0, "grad_norm": 1.918190967642561, "language_loss": 0.81710887, "learning_rate": 1.5083335810574813e-06, "loss": 0.83854568, "num_input_tokens_seen": 212055775, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73046875, "step": 9841, "time_per_iteration": 5.419496536254883 }, { "auxiliary_loss_clip": 0.0110917, "auxiliary_loss_mlp": 0.01028847, "balance_loss_clip": 1.01719427, "balance_loss_mlp": 1.03963482, "epoch": 0.5917330527581542, "flos": 15958199698560.0, "grad_norm": 1.6711802876609954, "language_loss": 0.69440275, "learning_rate": 1.507956080444291e-06, "loss": 0.71578294, "num_input_tokens_seen": 212074000, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 9842, "time_per_iteration": 3.9005470275878906 }, { "auxiliary_loss_clip": 0.01109791, "auxiliary_loss_mlp": 0.01034792, "balance_loss_clip": 1.0226326, "balance_loss_mlp": 1.0382719, "epoch": 0.5917931760108222, "flos": 23800730549760.0, "grad_norm": 1.7991493990688385, "language_loss": 0.82566303, "learning_rate": 1.5075785984894549e-06, "loss": 0.84710884, "num_input_tokens_seen": 212091415, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.71484375, "step": 9843, "time_per_iteration": 2.485326051712036 }, { "auxiliary_loss_clip": 0.01111344, "auxiliary_loss_mlp": 0.01032886, "balance_loss_clip": 1.01948118, "balance_loss_mlp": 1.04005837, "epoch": 0.5918532992634902, "flos": 23249609199360.0, "grad_norm": 2.5130651684064156, "language_loss": 0.8142674, "learning_rate": 1.5072011352072875e-06, "loss": 0.83570969, "num_input_tokens_seen": 212105255, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 9844, "time_per_iteration": 2.4590108394622803 }, { "auxiliary_loss_clip": 0.01113457, "auxiliary_loss_mlp": 0.01028294, "balance_loss_clip": 1.0154556, "balance_loss_mlp": 1.04209352, "epoch": 0.5919134225161581, "flos": 19499853726720.0, "grad_norm": 1.9201444122666764, "language_loss": 0.74508035, "learning_rate": 1.5068236906121032e-06, "loss": 0.76649785, "num_input_tokens_seen": 212122765, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 9845, "time_per_iteration": 2.4562079906463623 }, { "auxiliary_loss_clip": 0.01110915, "auxiliary_loss_mlp": 0.01026434, "balance_loss_clip": 1.01327372, "balance_loss_mlp": 1.03896022, "epoch": 0.5919735457688261, "flos": 38800940567040.0, "grad_norm": 1.7790461815348353, "language_loss": 0.64270324, "learning_rate": 1.506446264718213e-06, "loss": 0.66407675, "num_input_tokens_seen": 212143960, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 9846, "time_per_iteration": 2.6303658485412598 }, { "auxiliary_loss_clip": 0.01104484, "auxiliary_loss_mlp": 0.01025401, "balance_loss_clip": 1.01463008, "balance_loss_mlp": 1.03825474, "epoch": 0.592033669021494, "flos": 22163994495360.0, "grad_norm": 1.622296105455636, "language_loss": 0.76332253, "learning_rate": 1.506068857539931e-06, "loss": 0.78462136, "num_input_tokens_seen": 212162005, "router_z_loss_clip": 0.10791016, "router_z_loss_mlp": 0.6640625, "step": 9847, "time_per_iteration": 2.462524652481079 }, { "auxiliary_loss_clip": 0.01111406, "auxiliary_loss_mlp": 0.01029668, "balance_loss_clip": 1.01669765, "balance_loss_mlp": 1.03975368, "epoch": 0.592093792274162, "flos": 22710985781760.0, "grad_norm": 1.9120378939313671, "language_loss": 0.6224733, "learning_rate": 1.5056914690915667e-06, "loss": 0.64388406, "num_input_tokens_seen": 212181635, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 9848, "time_per_iteration": 2.538478136062622 }, { "auxiliary_loss_clip": 0.01112067, "auxiliary_loss_mlp": 0.01034137, "balance_loss_clip": 1.02157271, "balance_loss_mlp": 1.0397296, "epoch": 0.59215391552683, "flos": 22528954632960.0, "grad_norm": 1.8597379411489494, "language_loss": 0.75996149, "learning_rate": 1.5053140993874312e-06, "loss": 0.78142351, "num_input_tokens_seen": 212201615, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 9849, "time_per_iteration": 2.488365650177002 }, { "auxiliary_loss_clip": 0.01111862, "auxiliary_loss_mlp": 0.01036978, "balance_loss_clip": 1.02433014, "balance_loss_mlp": 1.04017437, "epoch": 0.592214038779498, "flos": 24499013921280.0, "grad_norm": 2.0100191251367194, "language_loss": 0.75636643, "learning_rate": 1.5049367484418353e-06, "loss": 0.77785486, "num_input_tokens_seen": 212219355, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 9850, "time_per_iteration": 2.5165421962738037 }, { "auxiliary_loss_clip": 0.01109407, "auxiliary_loss_mlp": 0.01035155, "balance_loss_clip": 1.02266777, "balance_loss_mlp": 1.03885317, "epoch": 0.592274162032166, "flos": 21831353619840.0, "grad_norm": 1.8953605671592144, "language_loss": 0.75287157, "learning_rate": 1.5045594162690868e-06, "loss": 0.7743172, "num_input_tokens_seen": 212236710, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 9851, "time_per_iteration": 2.467020034790039 }, { "auxiliary_loss_clip": 0.01111276, "auxiliary_loss_mlp": 0.0102905, "balance_loss_clip": 1.01662254, "balance_loss_mlp": 1.04015994, "epoch": 0.5923342852848339, "flos": 24608146417920.0, "grad_norm": 4.315815841972306, "language_loss": 0.70828468, "learning_rate": 1.5041821028834954e-06, "loss": 0.72968793, "num_input_tokens_seen": 212256195, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 9852, "time_per_iteration": 2.5314455032348633 }, { "auxiliary_loss_clip": 0.01114455, "auxiliary_loss_mlp": 0.01034681, "balance_loss_clip": 1.02179432, "balance_loss_mlp": 1.03997922, "epoch": 0.5923944085375019, "flos": 19938143479680.0, "grad_norm": 2.048520196580464, "language_loss": 0.8051753, "learning_rate": 1.5038048082993685e-06, "loss": 0.82666665, "num_input_tokens_seen": 212274085, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7421875, "step": 9853, "time_per_iteration": 2.466747283935547 }, { "auxiliary_loss_clip": 0.01106835, "auxiliary_loss_mlp": 0.01029935, "balance_loss_clip": 1.01793718, "balance_loss_mlp": 1.03837013, "epoch": 0.5924545317901698, "flos": 28658510812800.0, "grad_norm": 1.6327571071367244, "language_loss": 0.6780442, "learning_rate": 1.5034275325310124e-06, "loss": 0.69941193, "num_input_tokens_seen": 212295530, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 9854, "time_per_iteration": 2.5568132400512695 }, { "auxiliary_loss_clip": 0.01108571, "auxiliary_loss_mlp": 0.01027234, "balance_loss_clip": 1.01477671, "balance_loss_mlp": 1.03879309, "epoch": 0.5925146550428378, "flos": 19864885691520.0, "grad_norm": 1.7397979996497337, "language_loss": 0.8915025, "learning_rate": 1.5030502755927344e-06, "loss": 0.91286057, "num_input_tokens_seen": 212313770, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 9855, "time_per_iteration": 2.4645280838012695 }, { "auxiliary_loss_clip": 0.01108326, "auxiliary_loss_mlp": 0.0102977, "balance_loss_clip": 1.01815939, "balance_loss_mlp": 1.03989601, "epoch": 0.5925747782955058, "flos": 15122989681920.0, "grad_norm": 1.7363698273289534, "language_loss": 0.86628836, "learning_rate": 1.5026730374988397e-06, "loss": 0.88766932, "num_input_tokens_seen": 212331525, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.68359375, "step": 9856, "time_per_iteration": 2.474217414855957 }, { "auxiliary_loss_clip": 0.0110889, "auxiliary_loss_mlp": 0.01038247, "balance_loss_clip": 1.02595043, "balance_loss_mlp": 1.03777301, "epoch": 0.5926349015481738, "flos": 18405440190720.0, "grad_norm": 1.999823209008736, "language_loss": 0.77895176, "learning_rate": 1.5022958182636332e-06, "loss": 0.80042315, "num_input_tokens_seen": 212347295, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 9857, "time_per_iteration": 2.4308903217315674 }, { "auxiliary_loss_clip": 0.01112457, "auxiliary_loss_mlp": 0.01034708, "balance_loss_clip": 1.02262604, "balance_loss_mlp": 1.04230225, "epoch": 0.5926950248008417, "flos": 23111138269440.0, "grad_norm": 2.3893347496912307, "language_loss": 0.64561188, "learning_rate": 1.501918617901419e-06, "loss": 0.6670835, "num_input_tokens_seen": 212365750, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 9858, "time_per_iteration": 2.481412649154663 }, { "auxiliary_loss_clip": 0.01108481, "auxiliary_loss_mlp": 0.01030769, "balance_loss_clip": 1.01873446, "balance_loss_mlp": 1.03967714, "epoch": 0.5927551480535097, "flos": 28033916192640.0, "grad_norm": 2.444397430612805, "language_loss": 0.77152789, "learning_rate": 1.501541436426501e-06, "loss": 0.79292035, "num_input_tokens_seen": 212385300, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 9859, "time_per_iteration": 2.5218122005462646 }, { "auxiliary_loss_clip": 0.01113182, "auxiliary_loss_mlp": 0.01036686, "balance_loss_clip": 1.02275586, "balance_loss_mlp": 1.04073191, "epoch": 0.5928152713061776, "flos": 21798675221760.0, "grad_norm": 5.402319841292858, "language_loss": 0.75209618, "learning_rate": 1.5011642738531818e-06, "loss": 0.77359486, "num_input_tokens_seen": 212402140, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.72265625, "step": 9860, "time_per_iteration": 2.478499174118042 }, { "auxiliary_loss_clip": 0.01110224, "auxiliary_loss_mlp": 0.01034161, "balance_loss_clip": 1.0227412, "balance_loss_mlp": 1.04032385, "epoch": 0.5928753945588456, "flos": 24316839118080.0, "grad_norm": 1.7846192968568195, "language_loss": 0.76555634, "learning_rate": 1.500787130195763e-06, "loss": 0.78700024, "num_input_tokens_seen": 212421790, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6953125, "step": 9861, "time_per_iteration": 2.5009491443634033 }, { "auxiliary_loss_clip": 0.01107295, "auxiliary_loss_mlp": 0.01027295, "balance_loss_clip": 1.01580286, "balance_loss_mlp": 1.03793991, "epoch": 0.5929355178115137, "flos": 26464619923200.0, "grad_norm": 1.765294212961986, "language_loss": 0.70594931, "learning_rate": 1.5004100054685465e-06, "loss": 0.72729516, "num_input_tokens_seen": 212442115, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6953125, "step": 9862, "time_per_iteration": 2.5211644172668457 }, { "auxiliary_loss_clip": 0.01107992, "auxiliary_loss_mlp": 0.01031336, "balance_loss_clip": 1.0190928, "balance_loss_mlp": 1.03780198, "epoch": 0.5929956410641816, "flos": 24965995662720.0, "grad_norm": 2.251562288869921, "language_loss": 0.77718687, "learning_rate": 1.500032899685832e-06, "loss": 0.79858017, "num_input_tokens_seen": 212459535, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 9863, "time_per_iteration": 2.4995882511138916 }, { "auxiliary_loss_clip": 0.01111277, "auxiliary_loss_mlp": 0.01036495, "balance_loss_clip": 1.02416921, "balance_loss_mlp": 1.04082358, "epoch": 0.5930557643168496, "flos": 26208325405440.0, "grad_norm": 2.0597618367352055, "language_loss": 0.70488274, "learning_rate": 1.499655812861921e-06, "loss": 0.72636044, "num_input_tokens_seen": 212479385, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 9864, "time_per_iteration": 2.542147397994995 }, { "auxiliary_loss_clip": 0.01111283, "auxiliary_loss_mlp": 0.01034366, "balance_loss_clip": 1.02107453, "balance_loss_mlp": 1.03883219, "epoch": 0.5931158875695175, "flos": 27854937699840.0, "grad_norm": 1.6417254081892454, "language_loss": 0.67266029, "learning_rate": 1.4992787450111112e-06, "loss": 0.69411677, "num_input_tokens_seen": 212500060, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 9865, "time_per_iteration": 2.5210931301116943 }, { "auxiliary_loss_clip": 0.0111162, "auxiliary_loss_mlp": 0.01036475, "balance_loss_clip": 1.02318883, "balance_loss_mlp": 1.0392921, "epoch": 0.5931760108221855, "flos": 15413650536960.0, "grad_norm": 2.164830046437922, "language_loss": 0.78125858, "learning_rate": 1.4989016961477015e-06, "loss": 0.8027395, "num_input_tokens_seen": 212518590, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 9866, "time_per_iteration": 2.4724762439727783 }, { "auxiliary_loss_clip": 0.01108321, "auxiliary_loss_mlp": 0.01028799, "balance_loss_clip": 1.01727724, "balance_loss_mlp": 1.03970206, "epoch": 0.5932361340748534, "flos": 30188520581760.0, "grad_norm": 1.809223981381698, "language_loss": 0.72091663, "learning_rate": 1.4985246662859903e-06, "loss": 0.74228776, "num_input_tokens_seen": 212538190, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 9867, "time_per_iteration": 2.5337769985198975 }, { "auxiliary_loss_clip": 0.01110661, "auxiliary_loss_mlp": 0.01029827, "balance_loss_clip": 1.01654732, "balance_loss_mlp": 1.04095984, "epoch": 0.5932962573275214, "flos": 20157557708160.0, "grad_norm": 1.5394904763922772, "language_loss": 0.66685045, "learning_rate": 1.4981476554402732e-06, "loss": 0.68825531, "num_input_tokens_seen": 212557820, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69921875, "step": 9868, "time_per_iteration": 2.472921848297119 }, { "auxiliary_loss_clip": 0.01111012, "auxiliary_loss_mlp": 0.01031602, "balance_loss_clip": 1.01887083, "balance_loss_mlp": 1.03995502, "epoch": 0.5933563805801894, "flos": 25445906300160.0, "grad_norm": 1.685246741901288, "language_loss": 0.75623757, "learning_rate": 1.4977706636248478e-06, "loss": 0.77766371, "num_input_tokens_seen": 212577645, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 9869, "time_per_iteration": 2.49025297164917 }, { "auxiliary_loss_clip": 0.01113159, "auxiliary_loss_mlp": 0.01036544, "balance_loss_clip": 1.02321029, "balance_loss_mlp": 1.04122424, "epoch": 0.5934165038328574, "flos": 59995740337920.0, "grad_norm": 1.7939747217305848, "language_loss": 0.74280113, "learning_rate": 1.4973936908540091e-06, "loss": 0.76429814, "num_input_tokens_seen": 212603430, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 9870, "time_per_iteration": 2.8237669467926025 }, { "auxiliary_loss_clip": 0.0111215, "auxiliary_loss_mlp": 0.01026338, "balance_loss_clip": 1.01381516, "balance_loss_mlp": 1.04021597, "epoch": 0.5934766270855253, "flos": 24420548661120.0, "grad_norm": 2.092205368263428, "language_loss": 0.72254109, "learning_rate": 1.4970167371420517e-06, "loss": 0.74392605, "num_input_tokens_seen": 212620730, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 9871, "time_per_iteration": 2.4808154106140137 }, { "auxiliary_loss_clip": 0.01110924, "auxiliary_loss_mlp": 0.01032089, "balance_loss_clip": 1.01903558, "balance_loss_mlp": 1.03909481, "epoch": 0.5935367503381933, "flos": 23513158264320.0, "grad_norm": 2.150121946751004, "language_loss": 0.74339563, "learning_rate": 1.496639802503271e-06, "loss": 0.76482576, "num_input_tokens_seen": 212639745, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 9872, "time_per_iteration": 2.47792387008667 }, { "auxiliary_loss_clip": 0.01111696, "auxiliary_loss_mlp": 0.01038142, "balance_loss_clip": 1.0247612, "balance_loss_mlp": 1.03833306, "epoch": 0.5935968735908612, "flos": 18948337326720.0, "grad_norm": 2.054320903258224, "language_loss": 0.79073524, "learning_rate": 1.4962628869519583e-06, "loss": 0.81223363, "num_input_tokens_seen": 212655915, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 9873, "time_per_iteration": 2.425304651260376 }, { "auxiliary_loss_clip": 0.01110166, "auxiliary_loss_mlp": 0.01028437, "balance_loss_clip": 1.01566958, "balance_loss_mlp": 1.03932548, "epoch": 0.5936569968435292, "flos": 25483433034240.0, "grad_norm": 1.5751356307062072, "language_loss": 0.85109377, "learning_rate": 1.4958859905024078e-06, "loss": 0.8724798, "num_input_tokens_seen": 212676115, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 9874, "time_per_iteration": 2.5141680240631104 }, { "auxiliary_loss_clip": 0.01038492, "auxiliary_loss_mlp": 0.01003931, "balance_loss_clip": 1.0026139, "balance_loss_mlp": 1.0144912, "epoch": 0.5937171200961973, "flos": 66378361789440.0, "grad_norm": 0.7158422826122124, "language_loss": 0.59971178, "learning_rate": 1.4955091131689115e-06, "loss": 0.62013602, "num_input_tokens_seen": 212737560, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.24023438, "step": 9875, "time_per_iteration": 3.1623339653015137 }, { "auxiliary_loss_clip": 0.01113094, "auxiliary_loss_mlp": 0.01029498, "balance_loss_clip": 1.01566339, "balance_loss_mlp": 1.03817308, "epoch": 0.5937772433488652, "flos": 14903467712640.0, "grad_norm": 3.5213394096039607, "language_loss": 0.7807256, "learning_rate": 1.4951322549657594e-06, "loss": 0.8021515, "num_input_tokens_seen": 212755365, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75, "step": 9876, "time_per_iteration": 2.445312738418579 }, { "auxiliary_loss_clip": 0.01104027, "auxiliary_loss_mlp": 0.01026981, "balance_loss_clip": 1.01539958, "balance_loss_mlp": 1.03710341, "epoch": 0.5938373666015332, "flos": 22561489376640.0, "grad_norm": 1.5750542240110181, "language_loss": 0.7582646, "learning_rate": 1.494755415907243e-06, "loss": 0.77957469, "num_input_tokens_seen": 212773875, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.66796875, "step": 9877, "time_per_iteration": 2.488891124725342 }, { "auxiliary_loss_clip": 0.0111071, "auxiliary_loss_mlp": 0.01028492, "balance_loss_clip": 1.01483035, "balance_loss_mlp": 1.03798175, "epoch": 0.5938974898542011, "flos": 18440883936000.0, "grad_norm": 3.327021696938301, "language_loss": 0.80898345, "learning_rate": 1.4943785960076522e-06, "loss": 0.83037543, "num_input_tokens_seen": 212790590, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 9878, "time_per_iteration": 2.435521364212036 }, { "auxiliary_loss_clip": 0.01110167, "auxiliary_loss_mlp": 0.01033631, "balance_loss_clip": 1.02075028, "balance_loss_mlp": 1.03832448, "epoch": 0.5939576131068691, "flos": 45586728270720.0, "grad_norm": 4.274842017974542, "language_loss": 0.70883656, "learning_rate": 1.4940017952812754e-06, "loss": 0.73027456, "num_input_tokens_seen": 212812265, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 9879, "time_per_iteration": 4.097792863845825 }, { "auxiliary_loss_clip": 0.01107687, "auxiliary_loss_mlp": 0.01034701, "balance_loss_clip": 1.02189827, "balance_loss_mlp": 1.038059, "epoch": 0.594017736359537, "flos": 23587708942080.0, "grad_norm": 1.8569974668179734, "language_loss": 0.57785738, "learning_rate": 1.493625013742401e-06, "loss": 0.59928125, "num_input_tokens_seen": 212831915, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 9880, "time_per_iteration": 2.4881439208984375 }, { "auxiliary_loss_clip": 0.01108918, "auxiliary_loss_mlp": 0.01031235, "balance_loss_clip": 1.01809835, "balance_loss_mlp": 1.03818011, "epoch": 0.594077859612205, "flos": 29457235589760.0, "grad_norm": 2.14233751973336, "language_loss": 0.77188623, "learning_rate": 1.4932482514053177e-06, "loss": 0.79328775, "num_input_tokens_seen": 212851350, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 9881, "time_per_iteration": 2.521519422531128 }, { "auxiliary_loss_clip": 0.01107132, "auxiliary_loss_mlp": 0.01025335, "balance_loss_clip": 1.01295567, "balance_loss_mlp": 1.03627098, "epoch": 0.594137982864873, "flos": 16800089644800.0, "grad_norm": 3.2528852734397793, "language_loss": 0.82725251, "learning_rate": 1.4928715082843112e-06, "loss": 0.8485772, "num_input_tokens_seen": 212867995, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 9882, "time_per_iteration": 3.9285330772399902 }, { "auxiliary_loss_clip": 0.01109877, "auxiliary_loss_mlp": 0.01035341, "balance_loss_clip": 1.0227344, "balance_loss_mlp": 1.03937268, "epoch": 0.594198106117541, "flos": 12750263953920.0, "grad_norm": 2.3688970434431176, "language_loss": 0.7984165, "learning_rate": 1.492494784393667e-06, "loss": 0.81986862, "num_input_tokens_seen": 212885220, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 9883, "time_per_iteration": 3.8872077465057373 }, { "auxiliary_loss_clip": 0.01114751, "auxiliary_loss_mlp": 0.01029959, "balance_loss_clip": 1.01641083, "balance_loss_mlp": 1.04105163, "epoch": 0.5942582293702089, "flos": 20996538652800.0, "grad_norm": 31.626579498475987, "language_loss": 0.74327362, "learning_rate": 1.4921180797476725e-06, "loss": 0.76472068, "num_input_tokens_seen": 212903195, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73828125, "step": 9884, "time_per_iteration": 3.9310951232910156 }, { "auxiliary_loss_clip": 0.01110434, "auxiliary_loss_mlp": 0.01027714, "balance_loss_clip": 1.01492858, "balance_loss_mlp": 1.03987718, "epoch": 0.5943183526228769, "flos": 28291431772800.0, "grad_norm": 1.812234425776911, "language_loss": 0.65775728, "learning_rate": 1.4917413943606106e-06, "loss": 0.67913878, "num_input_tokens_seen": 212923340, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 9885, "time_per_iteration": 2.5392873287200928 }, { "auxiliary_loss_clip": 0.01109864, "auxiliary_loss_mlp": 0.01034717, "balance_loss_clip": 1.02175939, "balance_loss_mlp": 1.04006791, "epoch": 0.5943784758755448, "flos": 26614619118720.0, "grad_norm": 2.185872916841581, "language_loss": 0.76937521, "learning_rate": 1.4913647282467667e-06, "loss": 0.79082108, "num_input_tokens_seen": 212942755, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 9886, "time_per_iteration": 2.505887031555176 }, { "auxiliary_loss_clip": 0.0103647, "auxiliary_loss_mlp": 0.01003253, "balance_loss_clip": 1.00192964, "balance_loss_mlp": 1.01227427, "epoch": 0.5944385991282128, "flos": 64190935347840.0, "grad_norm": 0.8366396592904098, "language_loss": 0.64596587, "learning_rate": 1.490988081420423e-06, "loss": 0.66636312, "num_input_tokens_seen": 212999355, "router_z_loss_clip": 0.01324463, "router_z_loss_mlp": 0.2421875, "step": 9887, "time_per_iteration": 2.976010322570801 }, { "auxiliary_loss_clip": 0.01107582, "auxiliary_loss_mlp": 0.01027175, "balance_loss_clip": 1.01523626, "balance_loss_mlp": 1.03736711, "epoch": 0.5944987223808808, "flos": 19571998193280.0, "grad_norm": 1.7709799795499501, "language_loss": 0.69662499, "learning_rate": 1.4906114538958615e-06, "loss": 0.71797252, "num_input_tokens_seen": 213018570, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.703125, "step": 9888, "time_per_iteration": 2.4366507530212402 }, { "auxiliary_loss_clip": 0.01109581, "auxiliary_loss_mlp": 0.01029876, "balance_loss_clip": 1.01581573, "balance_loss_mlp": 1.03974974, "epoch": 0.5945588456335488, "flos": 26177586341760.0, "grad_norm": 1.5642198260824596, "language_loss": 0.79722077, "learning_rate": 1.490234845687366e-06, "loss": 0.81861532, "num_input_tokens_seen": 213037735, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.69921875, "step": 9889, "time_per_iteration": 2.517838478088379 }, { "auxiliary_loss_clip": 0.01107118, "auxiliary_loss_mlp": 0.01029633, "balance_loss_clip": 1.01717556, "balance_loss_mlp": 1.03678524, "epoch": 0.5946189688862168, "flos": 20446494710400.0, "grad_norm": 2.6195092261194053, "language_loss": 0.70617634, "learning_rate": 1.4898582568092154e-06, "loss": 0.72754383, "num_input_tokens_seen": 213057160, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 9890, "time_per_iteration": 2.4788219928741455 }, { "auxiliary_loss_clip": 0.01111102, "auxiliary_loss_mlp": 0.01030886, "balance_loss_clip": 1.01725483, "balance_loss_mlp": 1.03911042, "epoch": 0.5946790921388847, "flos": 13437521850240.0, "grad_norm": 2.6271005076541143, "language_loss": 0.69162428, "learning_rate": 1.489481687275691e-06, "loss": 0.71304417, "num_input_tokens_seen": 213073630, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 9891, "time_per_iteration": 2.4461023807525635 }, { "auxiliary_loss_clip": 0.01105958, "auxiliary_loss_mlp": 0.01032116, "balance_loss_clip": 1.01966476, "balance_loss_mlp": 1.0368917, "epoch": 0.5947392153915527, "flos": 20412272027520.0, "grad_norm": 3.56778289856807, "language_loss": 0.5320487, "learning_rate": 1.4891051371010726e-06, "loss": 0.55342937, "num_input_tokens_seen": 213092450, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69140625, "step": 9892, "time_per_iteration": 2.4631505012512207 }, { "auxiliary_loss_clip": 0.01036626, "auxiliary_loss_mlp": 0.01001675, "balance_loss_clip": 1.00035179, "balance_loss_mlp": 1.01260448, "epoch": 0.5947993386442206, "flos": 65619138994560.0, "grad_norm": 0.6578506086655437, "language_loss": 0.54574859, "learning_rate": 1.4887286062996375e-06, "loss": 0.56613159, "num_input_tokens_seen": 213155465, "router_z_loss_clip": 0.01324463, "router_z_loss_mlp": 0.24023438, "step": 9893, "time_per_iteration": 3.140949010848999 }, { "auxiliary_loss_clip": 0.01105434, "auxiliary_loss_mlp": 0.01029322, "balance_loss_clip": 1.01734161, "balance_loss_mlp": 1.03758669, "epoch": 0.5948594618968887, "flos": 23183103168000.0, "grad_norm": 2.0920142842558405, "language_loss": 0.75085139, "learning_rate": 1.4883520948856658e-06, "loss": 0.77219898, "num_input_tokens_seen": 213174875, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.67578125, "step": 9894, "time_per_iteration": 2.471151351928711 }, { "auxiliary_loss_clip": 0.01107548, "auxiliary_loss_mlp": 0.01029671, "balance_loss_clip": 1.01764297, "balance_loss_mlp": 1.03789854, "epoch": 0.5949195851495566, "flos": 13626771632640.0, "grad_norm": 1.7206885137834471, "language_loss": 0.77941561, "learning_rate": 1.487975602873434e-06, "loss": 0.80078775, "num_input_tokens_seen": 213192695, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 9895, "time_per_iteration": 2.452004909515381 }, { "auxiliary_loss_clip": 0.01112175, "auxiliary_loss_mlp": 0.01029842, "balance_loss_clip": 1.01685452, "balance_loss_mlp": 1.04031813, "epoch": 0.5949797084022246, "flos": 19751012599680.0, "grad_norm": 1.7840408179018044, "language_loss": 0.79089588, "learning_rate": 1.4875991302772182e-06, "loss": 0.81231606, "num_input_tokens_seen": 213211195, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 9896, "time_per_iteration": 2.490377426147461 }, { "auxiliary_loss_clip": 0.01109029, "auxiliary_loss_mlp": 0.01029623, "balance_loss_clip": 1.01689196, "balance_loss_mlp": 1.0382551, "epoch": 0.5950398316548925, "flos": 25773878407680.0, "grad_norm": 1.9158428938191807, "language_loss": 0.83400273, "learning_rate": 1.4872226771112954e-06, "loss": 0.85538924, "num_input_tokens_seen": 213231975, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 9897, "time_per_iteration": 2.5299782752990723 }, { "auxiliary_loss_clip": 0.01110943, "auxiliary_loss_mlp": 0.01029035, "balance_loss_clip": 1.01706576, "balance_loss_mlp": 1.03982496, "epoch": 0.5950999549075605, "flos": 23039029716480.0, "grad_norm": 1.8583304590679772, "language_loss": 0.70855546, "learning_rate": 1.486846243389939e-06, "loss": 0.72995532, "num_input_tokens_seen": 213249760, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 9898, "time_per_iteration": 2.471268892288208 }, { "auxiliary_loss_clip": 0.01113948, "auxiliary_loss_mlp": 0.01034767, "balance_loss_clip": 1.0207479, "balance_loss_mlp": 1.03890157, "epoch": 0.5951600781602284, "flos": 32446367637120.0, "grad_norm": 2.279046138970854, "language_loss": 0.64092767, "learning_rate": 1.4864698291274251e-06, "loss": 0.66241491, "num_input_tokens_seen": 213269890, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 9899, "time_per_iteration": 2.566434621810913 }, { "auxiliary_loss_clip": 0.01109023, "auxiliary_loss_mlp": 0.01028146, "balance_loss_clip": 1.01688695, "balance_loss_mlp": 1.03972173, "epoch": 0.5952202014128964, "flos": 23800874204160.0, "grad_norm": 1.8206162450395769, "language_loss": 0.71706641, "learning_rate": 1.4860934343380267e-06, "loss": 0.73843813, "num_input_tokens_seen": 213289400, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.69140625, "step": 9900, "time_per_iteration": 2.4790234565734863 }, { "auxiliary_loss_clip": 0.01107667, "auxiliary_loss_mlp": 0.01029297, "balance_loss_clip": 1.01669049, "balance_loss_mlp": 1.03954554, "epoch": 0.5952803246655644, "flos": 22492182084480.0, "grad_norm": 1.7141959153511768, "language_loss": 0.8472687, "learning_rate": 1.4857170590360169e-06, "loss": 0.8686384, "num_input_tokens_seen": 213308040, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 9901, "time_per_iteration": 2.498931407928467 }, { "auxiliary_loss_clip": 0.01036401, "auxiliary_loss_mlp": 0.01002021, "balance_loss_clip": 1.00076342, "balance_loss_mlp": 1.01258194, "epoch": 0.5953404479182324, "flos": 51234688851840.0, "grad_norm": 0.7937626049492518, "language_loss": 0.5818187, "learning_rate": 1.4853407032356674e-06, "loss": 0.60220289, "num_input_tokens_seen": 213358585, "router_z_loss_clip": 0.01257324, "router_z_loss_mlp": 0.23828125, "step": 9902, "time_per_iteration": 2.943591594696045 }, { "auxiliary_loss_clip": 0.01110299, "auxiliary_loss_mlp": 0.01028473, "balance_loss_clip": 1.01547337, "balance_loss_mlp": 1.0382421, "epoch": 0.5954005711709004, "flos": 23112682554240.0, "grad_norm": 1.9029607485323403, "language_loss": 0.76952612, "learning_rate": 1.4849643669512503e-06, "loss": 0.79091382, "num_input_tokens_seen": 213379585, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 9903, "time_per_iteration": 2.5336146354675293 }, { "auxiliary_loss_clip": 0.01109273, "auxiliary_loss_mlp": 0.01030993, "balance_loss_clip": 1.01873863, "balance_loss_mlp": 1.03885221, "epoch": 0.5954606944235683, "flos": 35954732736000.0, "grad_norm": 1.717386218357051, "language_loss": 0.77775455, "learning_rate": 1.4845880501970362e-06, "loss": 0.7991572, "num_input_tokens_seen": 213401465, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 9904, "time_per_iteration": 2.611473560333252 }, { "auxiliary_loss_clip": 0.01110735, "auxiliary_loss_mlp": 0.01033756, "balance_loss_clip": 1.02130461, "balance_loss_mlp": 1.03787875, "epoch": 0.5955208176762363, "flos": 30443665864320.0, "grad_norm": 1.786814548736816, "language_loss": 0.72599304, "learning_rate": 1.4842117529872942e-06, "loss": 0.74743795, "num_input_tokens_seen": 213422720, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7265625, "step": 9905, "time_per_iteration": 2.5407955646514893 }, { "auxiliary_loss_clip": 0.01109688, "auxiliary_loss_mlp": 0.01026551, "balance_loss_clip": 1.01362288, "balance_loss_mlp": 1.03864455, "epoch": 0.5955809409289042, "flos": 17640112083840.0, "grad_norm": 1.7898132038342704, "language_loss": 0.69597119, "learning_rate": 1.483835475336295e-06, "loss": 0.71733356, "num_input_tokens_seen": 213439480, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 9906, "time_per_iteration": 2.4691755771636963 }, { "auxiliary_loss_clip": 0.0110843, "auxiliary_loss_mlp": 0.0102869, "balance_loss_clip": 1.01621485, "balance_loss_mlp": 1.03814268, "epoch": 0.5956410641815723, "flos": 24279887001600.0, "grad_norm": 2.3933842540565835, "language_loss": 0.75257587, "learning_rate": 1.4834592172583057e-06, "loss": 0.77394712, "num_input_tokens_seen": 213458895, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 9907, "time_per_iteration": 2.4714343547821045 }, { "auxiliary_loss_clip": 0.01109831, "auxiliary_loss_mlp": 0.01031899, "balance_loss_clip": 1.01958442, "balance_loss_mlp": 1.03906655, "epoch": 0.5957011874342402, "flos": 35734277013120.0, "grad_norm": 1.7068007828568152, "language_loss": 0.66892743, "learning_rate": 1.483082978767595e-06, "loss": 0.69034469, "num_input_tokens_seen": 213481730, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 9908, "time_per_iteration": 2.643249034881592 }, { "auxiliary_loss_clip": 0.01109443, "auxiliary_loss_mlp": 0.01028431, "balance_loss_clip": 1.01645017, "balance_loss_mlp": 1.03988862, "epoch": 0.5957613106869082, "flos": 21245004005760.0, "grad_norm": 2.176021781712142, "language_loss": 0.76539248, "learning_rate": 1.4827067598784298e-06, "loss": 0.78677118, "num_input_tokens_seen": 213497225, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 9909, "time_per_iteration": 2.449671506881714 }, { "auxiliary_loss_clip": 0.01038062, "auxiliary_loss_mlp": 0.01003469, "balance_loss_clip": 1.0021162, "balance_loss_mlp": 1.01370811, "epoch": 0.5958214339395761, "flos": 65940969876480.0, "grad_norm": 0.952925764826085, "language_loss": 0.73493147, "learning_rate": 1.4823305606050753e-06, "loss": 0.75534678, "num_input_tokens_seen": 213556890, "router_z_loss_clip": 0.0135498, "router_z_loss_mlp": 0.24316406, "step": 9910, "time_per_iteration": 3.1675171852111816 }, { "auxiliary_loss_clip": 0.01109655, "auxiliary_loss_mlp": 0.01033727, "balance_loss_clip": 1.02100778, "balance_loss_mlp": 1.03858948, "epoch": 0.5958815571922441, "flos": 23218690567680.0, "grad_norm": 1.6339526685454064, "language_loss": 0.69658995, "learning_rate": 1.481954380961799e-06, "loss": 0.71802378, "num_input_tokens_seen": 213575800, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 9911, "time_per_iteration": 2.4655609130859375 }, { "auxiliary_loss_clip": 0.01116957, "auxiliary_loss_mlp": 0.01032139, "balance_loss_clip": 1.01838231, "balance_loss_mlp": 1.04200292, "epoch": 0.595941680444912, "flos": 16538623568640.0, "grad_norm": 1.952518938869758, "language_loss": 0.66123939, "learning_rate": 1.4815782209628631e-06, "loss": 0.68273032, "num_input_tokens_seen": 213592740, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 9912, "time_per_iteration": 2.451653242111206 }, { "auxiliary_loss_clip": 0.01109571, "auxiliary_loss_mlp": 0.0103514, "balance_loss_clip": 1.02178824, "balance_loss_mlp": 1.03831244, "epoch": 0.59600180369758, "flos": 27818883423360.0, "grad_norm": 2.099577852898227, "language_loss": 0.73446941, "learning_rate": 1.4812020806225337e-06, "loss": 0.75591648, "num_input_tokens_seen": 213611970, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 9913, "time_per_iteration": 2.505178213119507 }, { "auxiliary_loss_clip": 0.01110721, "auxiliary_loss_mlp": 0.01028488, "balance_loss_clip": 1.01542854, "balance_loss_mlp": 1.03606606, "epoch": 0.596061926950248, "flos": 29491566013440.0, "grad_norm": 1.9635744411025937, "language_loss": 0.7976799, "learning_rate": 1.4808259599550738e-06, "loss": 0.81907201, "num_input_tokens_seen": 213632230, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.75, "step": 9914, "time_per_iteration": 2.548900604248047 }, { "auxiliary_loss_clip": 0.01108948, "auxiliary_loss_mlp": 0.01030685, "balance_loss_clip": 1.01862061, "balance_loss_mlp": 1.03839159, "epoch": 0.596122050202916, "flos": 16836790366080.0, "grad_norm": 1.8823054851344336, "language_loss": 0.68045098, "learning_rate": 1.4804498589747448e-06, "loss": 0.70184731, "num_input_tokens_seen": 213649645, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.703125, "step": 9915, "time_per_iteration": 2.4587857723236084 }, { "auxiliary_loss_clip": 0.01109569, "auxiliary_loss_mlp": 0.01034116, "balance_loss_clip": 1.02134848, "balance_loss_mlp": 1.03755987, "epoch": 0.596182173455584, "flos": 20996646393600.0, "grad_norm": 1.589974138010706, "language_loss": 0.78691697, "learning_rate": 1.4800737776958095e-06, "loss": 0.80835384, "num_input_tokens_seen": 213668850, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 9916, "time_per_iteration": 2.4748101234436035 }, { "auxiliary_loss_clip": 0.01108911, "auxiliary_loss_mlp": 0.01032021, "balance_loss_clip": 1.01909828, "balance_loss_mlp": 1.03704607, "epoch": 0.5962422967082519, "flos": 16065680169600.0, "grad_norm": 2.0272379753717287, "language_loss": 0.83209151, "learning_rate": 1.4796977161325286e-06, "loss": 0.85350084, "num_input_tokens_seen": 213685695, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 9917, "time_per_iteration": 2.4241979122161865 }, { "auxiliary_loss_clip": 0.01109604, "auxiliary_loss_mlp": 0.01033235, "balance_loss_clip": 1.02127779, "balance_loss_mlp": 1.03953552, "epoch": 0.5963024199609199, "flos": 12166966995840.0, "grad_norm": 1.8074384865291186, "language_loss": 0.7728802, "learning_rate": 1.4793216742991625e-06, "loss": 0.79430854, "num_input_tokens_seen": 213703515, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.703125, "step": 9918, "time_per_iteration": 2.479313611984253 }, { "auxiliary_loss_clip": 0.0111191, "auxiliary_loss_mlp": 0.01035023, "balance_loss_clip": 1.02236271, "balance_loss_mlp": 1.04087806, "epoch": 0.5963625432135878, "flos": 28074280101120.0, "grad_norm": 1.6426476172797864, "language_loss": 0.78829342, "learning_rate": 1.4789456522099707e-06, "loss": 0.80976278, "num_input_tokens_seen": 213724170, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 9919, "time_per_iteration": 2.4977807998657227 }, { "auxiliary_loss_clip": 0.01109331, "auxiliary_loss_mlp": 0.01036632, "balance_loss_clip": 1.02316082, "balance_loss_mlp": 1.03836787, "epoch": 0.5964226664662559, "flos": 19860324664320.0, "grad_norm": 1.9921687046520282, "language_loss": 0.78059536, "learning_rate": 1.4785696498792122e-06, "loss": 0.802055, "num_input_tokens_seen": 213740620, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 9920, "time_per_iteration": 2.474970817565918 }, { "auxiliary_loss_clip": 0.0111201, "auxiliary_loss_mlp": 0.01030344, "balance_loss_clip": 1.01808357, "balance_loss_mlp": 1.04036605, "epoch": 0.5964827897189238, "flos": 12932618325120.0, "grad_norm": 2.281349197510238, "language_loss": 0.82478482, "learning_rate": 1.4781936673211446e-06, "loss": 0.84620833, "num_input_tokens_seen": 213755390, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.71875, "step": 9921, "time_per_iteration": 3.8270487785339355 }, { "auxiliary_loss_clip": 0.01108654, "auxiliary_loss_mlp": 0.01032135, "balance_loss_clip": 1.01900959, "balance_loss_mlp": 1.0375421, "epoch": 0.5965429129715918, "flos": 18150797698560.0, "grad_norm": 2.572174111084006, "language_loss": 0.80838692, "learning_rate": 1.4778177045500252e-06, "loss": 0.82979482, "num_input_tokens_seen": 213773225, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 9922, "time_per_iteration": 2.454310894012451 }, { "auxiliary_loss_clip": 0.01109596, "auxiliary_loss_mlp": 0.01027668, "balance_loss_clip": 1.01473391, "balance_loss_mlp": 1.03885818, "epoch": 0.5966030362242597, "flos": 21763231476480.0, "grad_norm": 3.868266138901831, "language_loss": 0.76717603, "learning_rate": 1.477441761580111e-06, "loss": 0.78854871, "num_input_tokens_seen": 213791860, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 9923, "time_per_iteration": 2.487457036972046 }, { "auxiliary_loss_clip": 0.01114246, "auxiliary_loss_mlp": 0.01033739, "balance_loss_clip": 1.0192554, "balance_loss_mlp": 1.03989375, "epoch": 0.5966631594769277, "flos": 18807208790400.0, "grad_norm": 1.8595287750665968, "language_loss": 0.7608546, "learning_rate": 1.4770658384256573e-06, "loss": 0.78233445, "num_input_tokens_seen": 213809455, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7421875, "step": 9924, "time_per_iteration": 3.943136692047119 }, { "auxiliary_loss_clip": 0.01105287, "auxiliary_loss_mlp": 0.01033039, "balance_loss_clip": 1.02005768, "balance_loss_mlp": 1.0373137, "epoch": 0.5967232827295956, "flos": 14064163545600.0, "grad_norm": 1.755591575673885, "language_loss": 0.66553974, "learning_rate": 1.4766899351009204e-06, "loss": 0.68692297, "num_input_tokens_seen": 213826615, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6796875, "step": 9925, "time_per_iteration": 3.8612771034240723 }, { "auxiliary_loss_clip": 0.01108955, "auxiliary_loss_mlp": 0.01029717, "balance_loss_clip": 1.01721156, "balance_loss_mlp": 1.04070795, "epoch": 0.5967834059822636, "flos": 17238235743360.0, "grad_norm": 2.11857931618354, "language_loss": 0.71557581, "learning_rate": 1.4763140516201528e-06, "loss": 0.7369625, "num_input_tokens_seen": 213844495, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 9926, "time_per_iteration": 3.9726085662841797 }, { "auxiliary_loss_clip": 0.01112569, "auxiliary_loss_mlp": 0.01032145, "balance_loss_clip": 1.01869202, "balance_loss_mlp": 1.04028094, "epoch": 0.5968435292349316, "flos": 42520244284800.0, "grad_norm": 1.9644294478841688, "language_loss": 0.70328188, "learning_rate": 1.4759381879976088e-06, "loss": 0.724729, "num_input_tokens_seen": 213869125, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 9927, "time_per_iteration": 2.6416285037994385 }, { "auxiliary_loss_clip": 0.01114194, "auxiliary_loss_mlp": 0.01031797, "balance_loss_clip": 1.01820123, "balance_loss_mlp": 1.03906322, "epoch": 0.5969036524875996, "flos": 37630898945280.0, "grad_norm": 2.3639562168917103, "language_loss": 0.64018422, "learning_rate": 1.4755623442475415e-06, "loss": 0.66164416, "num_input_tokens_seen": 213891115, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 9928, "time_per_iteration": 2.6064629554748535 }, { "auxiliary_loss_clip": 0.01105225, "auxiliary_loss_mlp": 0.01034129, "balance_loss_clip": 1.02220237, "balance_loss_mlp": 1.03582668, "epoch": 0.5969637757402676, "flos": 23148377694720.0, "grad_norm": 1.63387367118268, "language_loss": 0.6949389, "learning_rate": 1.4751865203842022e-06, "loss": 0.7163325, "num_input_tokens_seen": 213911925, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 9929, "time_per_iteration": 2.4623401165008545 }, { "auxiliary_loss_clip": 0.01108042, "auxiliary_loss_mlp": 0.01029862, "balance_loss_clip": 1.01819777, "balance_loss_mlp": 1.03965688, "epoch": 0.5970238989929355, "flos": 24020934877440.0, "grad_norm": 2.3713364041287424, "language_loss": 0.76137811, "learning_rate": 1.4748107164218431e-06, "loss": 0.78275716, "num_input_tokens_seen": 213930715, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.68359375, "step": 9930, "time_per_iteration": 2.4914870262145996 }, { "auxiliary_loss_clip": 0.01114278, "auxiliary_loss_mlp": 0.01034846, "balance_loss_clip": 1.02079749, "balance_loss_mlp": 1.04021454, "epoch": 0.5970840222456035, "flos": 19426883247360.0, "grad_norm": 1.7479014333824427, "language_loss": 0.69013357, "learning_rate": 1.4744349323747146e-06, "loss": 0.71162486, "num_input_tokens_seen": 213950015, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7421875, "step": 9931, "time_per_iteration": 2.4667880535125732 }, { "auxiliary_loss_clip": 0.01038609, "auxiliary_loss_mlp": 0.01013452, "balance_loss_clip": 1.0119915, "balance_loss_mlp": 1.0140034, "epoch": 0.5971441454982714, "flos": 62976615235200.0, "grad_norm": 0.8688826104721603, "language_loss": 0.64234936, "learning_rate": 1.474059168257065e-06, "loss": 0.66287005, "num_input_tokens_seen": 214003330, "router_z_loss_clip": 0.0145874, "router_z_loss_mlp": 0.24609375, "step": 9932, "time_per_iteration": 3.017200231552124 }, { "auxiliary_loss_clip": 0.01109531, "auxiliary_loss_mlp": 0.01032485, "balance_loss_clip": 1.01927662, "balance_loss_mlp": 1.03865421, "epoch": 0.5972042687509395, "flos": 20266223328000.0, "grad_norm": 1.8373969307028242, "language_loss": 0.74316645, "learning_rate": 1.4736834240831454e-06, "loss": 0.76458657, "num_input_tokens_seen": 214021680, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 9933, "time_per_iteration": 2.4575858116149902 }, { "auxiliary_loss_clip": 0.01038304, "auxiliary_loss_mlp": 0.01007892, "balance_loss_clip": 1.00654447, "balance_loss_mlp": 1.01405597, "epoch": 0.5972643920036074, "flos": 71652383832960.0, "grad_norm": 0.6772605189458272, "language_loss": 0.52045178, "learning_rate": 1.473307699867203e-06, "loss": 0.54091376, "num_input_tokens_seen": 214090265, "router_z_loss_clip": 0.01348877, "router_z_loss_mlp": 0.2421875, "step": 9934, "time_per_iteration": 3.1971559524536133 }, { "auxiliary_loss_clip": 0.01037574, "auxiliary_loss_mlp": 0.01003757, "balance_loss_clip": 1.00245762, "balance_loss_mlp": 1.01333404, "epoch": 0.5973245152562754, "flos": 56892702263040.0, "grad_norm": 0.8321014110970315, "language_loss": 0.54186565, "learning_rate": 1.4729319956234849e-06, "loss": 0.56227899, "num_input_tokens_seen": 214146375, "router_z_loss_clip": 0.01300049, "router_z_loss_mlp": 0.2421875, "step": 9935, "time_per_iteration": 3.007587432861328 }, { "auxiliary_loss_clip": 0.011085, "auxiliary_loss_mlp": 0.01032095, "balance_loss_clip": 1.01872528, "balance_loss_mlp": 1.03715467, "epoch": 0.5973846385089433, "flos": 24164361884160.0, "grad_norm": 3.067837026324408, "language_loss": 0.65886748, "learning_rate": 1.4725563113662394e-06, "loss": 0.68027347, "num_input_tokens_seen": 214165340, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 9936, "time_per_iteration": 2.4752798080444336 }, { "auxiliary_loss_clip": 0.0111205, "auxiliary_loss_mlp": 0.01032454, "balance_loss_clip": 1.02030706, "balance_loss_mlp": 1.03982019, "epoch": 0.5974447617616113, "flos": 17670599752320.0, "grad_norm": 2.037163935471797, "language_loss": 0.67565322, "learning_rate": 1.4721806471097103e-06, "loss": 0.69709826, "num_input_tokens_seen": 214181360, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.72265625, "step": 9937, "time_per_iteration": 2.4552249908447266 }, { "auxiliary_loss_clip": 0.01111835, "auxiliary_loss_mlp": 0.01027901, "balance_loss_clip": 1.01450753, "balance_loss_mlp": 1.03897023, "epoch": 0.5975048850142792, "flos": 22892514140160.0, "grad_norm": 2.650964683007535, "language_loss": 0.76849246, "learning_rate": 1.4718050028681442e-06, "loss": 0.78988987, "num_input_tokens_seen": 214198525, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 9938, "time_per_iteration": 2.4887917041778564 }, { "auxiliary_loss_clip": 0.01111113, "auxiliary_loss_mlp": 0.0102644, "balance_loss_clip": 1.01369619, "balance_loss_mlp": 1.03880596, "epoch": 0.5975650082669473, "flos": 24353108876160.0, "grad_norm": 1.6090665881028117, "language_loss": 0.75738609, "learning_rate": 1.4714293786557855e-06, "loss": 0.77876163, "num_input_tokens_seen": 214218710, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 9939, "time_per_iteration": 2.555166482925415 }, { "auxiliary_loss_clip": 0.01114904, "auxiliary_loss_mlp": 0.01028919, "balance_loss_clip": 1.01425672, "balance_loss_mlp": 1.03870964, "epoch": 0.5976251315196152, "flos": 20923352691840.0, "grad_norm": 3.4541496908690834, "language_loss": 0.69018763, "learning_rate": 1.471053774486878e-06, "loss": 0.71162587, "num_input_tokens_seen": 214237800, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.76171875, "step": 9940, "time_per_iteration": 2.466895341873169 }, { "auxiliary_loss_clip": 0.01106741, "auxiliary_loss_mlp": 0.01028797, "balance_loss_clip": 1.01706076, "balance_loss_mlp": 1.0373534, "epoch": 0.5976852547722832, "flos": 35844594658560.0, "grad_norm": 1.3748350783986827, "language_loss": 0.70276928, "learning_rate": 1.470678190375664e-06, "loss": 0.72412467, "num_input_tokens_seen": 214260355, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6953125, "step": 9941, "time_per_iteration": 2.603440761566162 }, { "auxiliary_loss_clip": 0.01106606, "auxiliary_loss_mlp": 0.01031217, "balance_loss_clip": 1.01778793, "balance_loss_mlp": 1.03634048, "epoch": 0.5977453780249512, "flos": 12855948744960.0, "grad_norm": 2.040880637891972, "language_loss": 0.77897739, "learning_rate": 1.470302626336386e-06, "loss": 0.80035561, "num_input_tokens_seen": 214277120, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 9942, "time_per_iteration": 2.439075469970703 }, { "auxiliary_loss_clip": 0.0110963, "auxiliary_loss_mlp": 0.0103598, "balance_loss_clip": 1.02264595, "balance_loss_mlp": 1.0366565, "epoch": 0.5978055012776191, "flos": 20959155573120.0, "grad_norm": 1.8358100589636517, "language_loss": 0.75632191, "learning_rate": 1.4699270823832857e-06, "loss": 0.77777803, "num_input_tokens_seen": 214295300, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 9943, "time_per_iteration": 2.4775474071502686 }, { "auxiliary_loss_clip": 0.01109025, "auxiliary_loss_mlp": 0.0102943, "balance_loss_clip": 1.01742578, "balance_loss_mlp": 1.03842926, "epoch": 0.5978656245302871, "flos": 34058003063040.0, "grad_norm": 1.7438744705044142, "language_loss": 0.61991906, "learning_rate": 1.4695515585306032e-06, "loss": 0.64130366, "num_input_tokens_seen": 214317050, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 9944, "time_per_iteration": 2.573763847351074 }, { "auxiliary_loss_clip": 0.01110921, "auxiliary_loss_mlp": 0.01032536, "balance_loss_clip": 1.01902318, "balance_loss_mlp": 1.03900135, "epoch": 0.597925747782955, "flos": 37373275624320.0, "grad_norm": 1.5855298740332595, "language_loss": 0.72491693, "learning_rate": 1.4691760547925795e-06, "loss": 0.74635148, "num_input_tokens_seen": 214337470, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 9945, "time_per_iteration": 2.621086359024048 }, { "auxiliary_loss_clip": 0.01108388, "auxiliary_loss_mlp": 0.01028792, "balance_loss_clip": 1.0156014, "balance_loss_mlp": 1.03727186, "epoch": 0.5979858710356231, "flos": 25374803328000.0, "grad_norm": 1.89879742509069, "language_loss": 0.67503297, "learning_rate": 1.4688005711834522e-06, "loss": 0.6964047, "num_input_tokens_seen": 214357975, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 9946, "time_per_iteration": 2.517078161239624 }, { "auxiliary_loss_clip": 0.01111684, "auxiliary_loss_mlp": 0.01028799, "balance_loss_clip": 1.01501846, "balance_loss_mlp": 1.0382154, "epoch": 0.598045994288291, "flos": 13698413308800.0, "grad_norm": 2.014013612789165, "language_loss": 0.88516062, "learning_rate": 1.468425107717461e-06, "loss": 0.90656549, "num_input_tokens_seen": 214374125, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.734375, "step": 9947, "time_per_iteration": 2.415515899658203 }, { "auxiliary_loss_clip": 0.0110504, "auxiliary_loss_mlp": 0.01031114, "balance_loss_clip": 1.01913333, "balance_loss_mlp": 1.03668261, "epoch": 0.598106117540959, "flos": 21981352815360.0, "grad_norm": 1.758111693502659, "language_loss": 0.71868527, "learning_rate": 1.4680496644088432e-06, "loss": 0.7400468, "num_input_tokens_seen": 214393395, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 9948, "time_per_iteration": 2.452672004699707 }, { "auxiliary_loss_clip": 0.01110302, "auxiliary_loss_mlp": 0.01033496, "balance_loss_clip": 1.0194416, "balance_loss_mlp": 1.03787339, "epoch": 0.5981662407936269, "flos": 20559362221440.0, "grad_norm": 1.9411452192663705, "language_loss": 0.89282, "learning_rate": 1.4676742412718347e-06, "loss": 0.914258, "num_input_tokens_seen": 214411550, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7265625, "step": 9949, "time_per_iteration": 2.4940104484558105 }, { "auxiliary_loss_clip": 0.01109118, "auxiliary_loss_mlp": 0.01029107, "balance_loss_clip": 1.0168699, "balance_loss_mlp": 1.03892589, "epoch": 0.5982263640462949, "flos": 14063840323200.0, "grad_norm": 1.8300157177394534, "language_loss": 0.70604181, "learning_rate": 1.467298838320673e-06, "loss": 0.72742403, "num_input_tokens_seen": 214429780, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 9950, "time_per_iteration": 2.4257495403289795 }, { "auxiliary_loss_clip": 0.01110341, "auxiliary_loss_mlp": 0.01032298, "balance_loss_clip": 1.01947677, "balance_loss_mlp": 1.03833318, "epoch": 0.5982864872989628, "flos": 17707228646400.0, "grad_norm": 1.8572609616405504, "language_loss": 0.77814245, "learning_rate": 1.4669234555695921e-06, "loss": 0.79956883, "num_input_tokens_seen": 214447775, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 9951, "time_per_iteration": 2.4655096530914307 }, { "auxiliary_loss_clip": 0.01111439, "auxiliary_loss_mlp": 0.01039691, "balance_loss_clip": 1.02531457, "balance_loss_mlp": 1.03854799, "epoch": 0.5983466105516309, "flos": 16764789553920.0, "grad_norm": 1.4271178946430663, "language_loss": 0.73857975, "learning_rate": 1.4665480930328275e-06, "loss": 0.76009107, "num_input_tokens_seen": 214467245, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.73046875, "step": 9952, "time_per_iteration": 2.4852542877197266 }, { "auxiliary_loss_clip": 0.01112528, "auxiliary_loss_mlp": 0.01031287, "balance_loss_clip": 1.01658857, "balance_loss_mlp": 1.039029, "epoch": 0.5984067338042988, "flos": 20042714949120.0, "grad_norm": 2.553209609940393, "language_loss": 0.78787184, "learning_rate": 1.466172750724613e-06, "loss": 0.80930996, "num_input_tokens_seen": 214484385, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.734375, "step": 9953, "time_per_iteration": 2.4686760902404785 }, { "auxiliary_loss_clip": 0.0110824, "auxiliary_loss_mlp": 0.01033305, "balance_loss_clip": 1.02084768, "balance_loss_mlp": 1.03775859, "epoch": 0.5984668570569668, "flos": 26319900026880.0, "grad_norm": 1.5791024756547312, "language_loss": 0.6983552, "learning_rate": 1.4657974286591807e-06, "loss": 0.71977067, "num_input_tokens_seen": 214503465, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 9954, "time_per_iteration": 2.507019281387329 }, { "auxiliary_loss_clip": 0.01109114, "auxiliary_loss_mlp": 0.01031419, "balance_loss_clip": 1.01855075, "balance_loss_mlp": 1.03784704, "epoch": 0.5985269803096348, "flos": 20593728558720.0, "grad_norm": 2.2148828342441207, "language_loss": 0.73281813, "learning_rate": 1.4654221268507637e-06, "loss": 0.75422347, "num_input_tokens_seen": 214520725, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 9955, "time_per_iteration": 2.489136219024658 }, { "auxiliary_loss_clip": 0.01109734, "auxiliary_loss_mlp": 0.01029499, "balance_loss_clip": 1.0168153, "balance_loss_mlp": 1.03833389, "epoch": 0.5985871035623027, "flos": 26865382942080.0, "grad_norm": 1.7851100062915646, "language_loss": 0.68913084, "learning_rate": 1.4650468453135934e-06, "loss": 0.71052325, "num_input_tokens_seen": 214540675, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 9956, "time_per_iteration": 2.4908125400543213 }, { "auxiliary_loss_clip": 0.01112252, "auxiliary_loss_mlp": 0.01030896, "balance_loss_clip": 1.01769352, "balance_loss_mlp": 1.03995645, "epoch": 0.5986472268149707, "flos": 19609704495360.0, "grad_norm": 1.9617316637067332, "language_loss": 0.73576915, "learning_rate": 1.4646715840618999e-06, "loss": 0.7572006, "num_input_tokens_seen": 214559910, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 9957, "time_per_iteration": 2.4457645416259766 }, { "auxiliary_loss_clip": 0.01106789, "auxiliary_loss_mlp": 0.01026938, "balance_loss_clip": 1.01458788, "balance_loss_mlp": 1.03816652, "epoch": 0.5987073500676386, "flos": 21794616984960.0, "grad_norm": 1.933202802412267, "language_loss": 0.84872055, "learning_rate": 1.4642963431099138e-06, "loss": 0.87005776, "num_input_tokens_seen": 214575960, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 9958, "time_per_iteration": 2.480257511138916 }, { "auxiliary_loss_clip": 0.01110815, "auxiliary_loss_mlp": 0.01033188, "balance_loss_clip": 1.01986659, "balance_loss_mlp": 1.03834724, "epoch": 0.5987674733203067, "flos": 24314361079680.0, "grad_norm": 1.9787751190289542, "language_loss": 0.66511071, "learning_rate": 1.463921122471864e-06, "loss": 0.68655074, "num_input_tokens_seen": 214594230, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 9959, "time_per_iteration": 2.520888090133667 }, { "auxiliary_loss_clip": 0.01111009, "auxiliary_loss_mlp": 0.01027844, "balance_loss_clip": 1.01514864, "balance_loss_mlp": 1.03938389, "epoch": 0.5988275965729746, "flos": 21320201128320.0, "grad_norm": 1.8542526320391997, "language_loss": 0.83894008, "learning_rate": 1.4635459221619796e-06, "loss": 0.86032867, "num_input_tokens_seen": 214613130, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 9960, "time_per_iteration": 2.4620521068573 }, { "auxiliary_loss_clip": 0.01107558, "auxiliary_loss_mlp": 0.01026575, "balance_loss_clip": 1.0140698, "balance_loss_mlp": 1.03751874, "epoch": 0.5988877198256426, "flos": 25118041933440.0, "grad_norm": 1.5443863019893136, "language_loss": 0.79471874, "learning_rate": 1.4631707421944868e-06, "loss": 0.81606007, "num_input_tokens_seen": 214634470, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 9961, "time_per_iteration": 2.5204639434814453 }, { "auxiliary_loss_clip": 0.0110891, "auxiliary_loss_mlp": 0.01029867, "balance_loss_clip": 1.01690865, "balance_loss_mlp": 1.03807497, "epoch": 0.5989478430783105, "flos": 26429104350720.0, "grad_norm": 1.9350006486224731, "language_loss": 0.67291307, "learning_rate": 1.4627955825836136e-06, "loss": 0.69430077, "num_input_tokens_seen": 214654030, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 9962, "time_per_iteration": 3.9604368209838867 }, { "auxiliary_loss_clip": 0.01110141, "auxiliary_loss_mlp": 0.01031771, "balance_loss_clip": 1.01851487, "balance_loss_mlp": 1.03855228, "epoch": 0.5990079663309785, "flos": 25778439434880.0, "grad_norm": 1.3332393961877003, "language_loss": 0.74212545, "learning_rate": 1.4624204433435857e-06, "loss": 0.76354456, "num_input_tokens_seen": 214676985, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 9963, "time_per_iteration": 2.5510687828063965 }, { "auxiliary_loss_clip": 0.01108086, "auxiliary_loss_mlp": 0.01028069, "balance_loss_clip": 1.01490831, "balance_loss_mlp": 1.03776479, "epoch": 0.5990680895836464, "flos": 36831779118720.0, "grad_norm": 1.6268432796261532, "language_loss": 0.67903018, "learning_rate": 1.4620453244886281e-06, "loss": 0.70039177, "num_input_tokens_seen": 214700105, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 9964, "time_per_iteration": 2.6029114723205566 }, { "auxiliary_loss_clip": 0.01107916, "auxiliary_loss_mlp": 0.01027694, "balance_loss_clip": 1.01469994, "balance_loss_mlp": 1.03908348, "epoch": 0.5991282128363145, "flos": 24133550993280.0, "grad_norm": 2.044623182678867, "language_loss": 0.77169192, "learning_rate": 1.4616702260329662e-06, "loss": 0.79304802, "num_input_tokens_seen": 214717885, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6875, "step": 9965, "time_per_iteration": 3.9768295288085938 }, { "auxiliary_loss_clip": 0.01108602, "auxiliary_loss_mlp": 0.01023818, "balance_loss_clip": 1.01134253, "balance_loss_mlp": 1.03675556, "epoch": 0.5991883360889824, "flos": 10304064956160.0, "grad_norm": 1.8620684563302794, "language_loss": 0.77354658, "learning_rate": 1.4612951479908229e-06, "loss": 0.79487079, "num_input_tokens_seen": 214733680, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.71875, "step": 9966, "time_per_iteration": 3.8718388080596924 }, { "auxiliary_loss_clip": 0.01110277, "auxiliary_loss_mlp": 0.01026418, "balance_loss_clip": 1.01404381, "balance_loss_mlp": 1.0398128, "epoch": 0.5992484593416504, "flos": 23951196622080.0, "grad_norm": 1.5221084687575825, "language_loss": 0.73513347, "learning_rate": 1.460920090376422e-06, "loss": 0.75650042, "num_input_tokens_seen": 214753285, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 9967, "time_per_iteration": 3.9698846340179443 }, { "auxiliary_loss_clip": 0.01116133, "auxiliary_loss_mlp": 0.01034978, "balance_loss_clip": 1.02073801, "balance_loss_mlp": 1.04044712, "epoch": 0.5993085825943184, "flos": 11944105061760.0, "grad_norm": 2.2407335567257443, "language_loss": 0.68819886, "learning_rate": 1.4605450532039847e-06, "loss": 0.70971, "num_input_tokens_seen": 214767810, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7578125, "step": 9968, "time_per_iteration": 2.4061129093170166 }, { "auxiliary_loss_clip": 0.01111932, "auxiliary_loss_mlp": 0.01034862, "balance_loss_clip": 1.02071238, "balance_loss_mlp": 1.03865016, "epoch": 0.5993687058469863, "flos": 19026838500480.0, "grad_norm": 4.098834493165161, "language_loss": 0.78963721, "learning_rate": 1.4601700364877334e-06, "loss": 0.81110513, "num_input_tokens_seen": 214786040, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.734375, "step": 9969, "time_per_iteration": 2.4604876041412354 }, { "auxiliary_loss_clip": 0.01108225, "auxiliary_loss_mlp": 0.01031713, "balance_loss_clip": 1.01808167, "balance_loss_mlp": 1.03622878, "epoch": 0.5994288290996543, "flos": 14282967242880.0, "grad_norm": 1.6884522111763698, "language_loss": 0.81282246, "learning_rate": 1.4597950402418889e-06, "loss": 0.83422172, "num_input_tokens_seen": 214803110, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 9970, "time_per_iteration": 2.422828197479248 }, { "auxiliary_loss_clip": 0.01111697, "auxiliary_loss_mlp": 0.0103978, "balance_loss_clip": 1.02429521, "balance_loss_mlp": 1.03758955, "epoch": 0.5994889523523222, "flos": 19206643006080.0, "grad_norm": 2.899283625234163, "language_loss": 0.62338316, "learning_rate": 1.4594200644806697e-06, "loss": 0.64489794, "num_input_tokens_seen": 214819945, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.7421875, "step": 9971, "time_per_iteration": 2.4461917877197266 }, { "auxiliary_loss_clip": 0.01106236, "auxiliary_loss_mlp": 0.01028027, "balance_loss_clip": 1.01543856, "balance_loss_mlp": 1.03759456, "epoch": 0.5995490756049903, "flos": 28037040675840.0, "grad_norm": 1.5642823100512633, "language_loss": 0.78976774, "learning_rate": 1.4590451092182962e-06, "loss": 0.81111044, "num_input_tokens_seen": 214838810, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 9972, "time_per_iteration": 2.497982978820801 }, { "auxiliary_loss_clip": 0.01115063, "auxiliary_loss_mlp": 0.01034572, "balance_loss_clip": 1.0203141, "balance_loss_mlp": 1.03872442, "epoch": 0.5996091988576582, "flos": 29052953038080.0, "grad_norm": 2.0422112643845924, "language_loss": 0.76239878, "learning_rate": 1.4586701744689864e-06, "loss": 0.78389513, "num_input_tokens_seen": 214857040, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76171875, "step": 9973, "time_per_iteration": 2.530993938446045 }, { "auxiliary_loss_clip": 0.01109831, "auxiliary_loss_mlp": 0.01033015, "balance_loss_clip": 1.01915097, "balance_loss_mlp": 1.03760183, "epoch": 0.5996693221103262, "flos": 20813968800000.0, "grad_norm": 2.0770851737013807, "language_loss": 0.65119755, "learning_rate": 1.4582952602469578e-06, "loss": 0.67262602, "num_input_tokens_seen": 214873375, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.72265625, "step": 9974, "time_per_iteration": 2.443135976791382 }, { "auxiliary_loss_clip": 0.01109167, "auxiliary_loss_mlp": 0.01028957, "balance_loss_clip": 1.01608884, "balance_loss_mlp": 1.03719664, "epoch": 0.5997294453629941, "flos": 23768914078080.0, "grad_norm": 1.3511808047566967, "language_loss": 0.74453098, "learning_rate": 1.457920366566428e-06, "loss": 0.76591223, "num_input_tokens_seen": 214893900, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 9975, "time_per_iteration": 2.5411829948425293 }, { "auxiliary_loss_clip": 0.01110213, "auxiliary_loss_mlp": 0.01029471, "balance_loss_clip": 1.01558936, "balance_loss_mlp": 1.03881836, "epoch": 0.5997895686156621, "flos": 20960017499520.0, "grad_norm": 2.3332598950396455, "language_loss": 0.77013743, "learning_rate": 1.457545493441611e-06, "loss": 0.7915343, "num_input_tokens_seen": 214912110, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71484375, "step": 9976, "time_per_iteration": 2.4805283546447754 }, { "auxiliary_loss_clip": 0.01109287, "auxiliary_loss_mlp": 0.0103759, "balance_loss_clip": 1.02376139, "balance_loss_mlp": 1.03782654, "epoch": 0.59984969186833, "flos": 28365443746560.0, "grad_norm": 3.1920374428891463, "language_loss": 0.7477442, "learning_rate": 1.4571706408867237e-06, "loss": 0.76921296, "num_input_tokens_seen": 214930140, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71484375, "step": 9977, "time_per_iteration": 2.515413761138916 }, { "auxiliary_loss_clip": 0.01110853, "auxiliary_loss_mlp": 0.01032194, "balance_loss_clip": 1.01895034, "balance_loss_mlp": 1.03838968, "epoch": 0.5999098151209981, "flos": 22565906749440.0, "grad_norm": 1.960680627977188, "language_loss": 0.69041187, "learning_rate": 1.4567958089159802e-06, "loss": 0.71184236, "num_input_tokens_seen": 214949200, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 9978, "time_per_iteration": 2.471813917160034 }, { "auxiliary_loss_clip": 0.01117215, "auxiliary_loss_mlp": 0.01033073, "balance_loss_clip": 1.01857734, "balance_loss_mlp": 1.04268265, "epoch": 0.599969938373666, "flos": 18768712389120.0, "grad_norm": 2.56853536756479, "language_loss": 0.81631315, "learning_rate": 1.456420997543594e-06, "loss": 0.83781606, "num_input_tokens_seen": 214965775, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.74609375, "step": 9979, "time_per_iteration": 2.4460017681121826 }, { "auxiliary_loss_clip": 0.01106602, "auxiliary_loss_mlp": 0.01036372, "balance_loss_clip": 1.02363491, "balance_loss_mlp": 1.03799534, "epoch": 0.600030061626334, "flos": 11327231865600.0, "grad_norm": 1.8325375216809727, "language_loss": 0.6999436, "learning_rate": 1.4560462067837782e-06, "loss": 0.72137332, "num_input_tokens_seen": 214982480, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 9980, "time_per_iteration": 2.4319136142730713 }, { "auxiliary_loss_clip": 0.01112826, "auxiliary_loss_mlp": 0.01033411, "balance_loss_clip": 1.01902199, "balance_loss_mlp": 1.03899074, "epoch": 0.600090184879002, "flos": 16578664254720.0, "grad_norm": 2.7939605222177426, "language_loss": 0.68311679, "learning_rate": 1.4556714366507445e-06, "loss": 0.70457911, "num_input_tokens_seen": 214998110, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.73828125, "step": 9981, "time_per_iteration": 2.4413087368011475 }, { "auxiliary_loss_clip": 0.01110119, "auxiliary_loss_mlp": 0.0103753, "balance_loss_clip": 1.02579379, "balance_loss_mlp": 1.03953016, "epoch": 0.6001503081316699, "flos": 23618627573760.0, "grad_norm": 2.1352098960201187, "language_loss": 0.78376544, "learning_rate": 1.4552966871587048e-06, "loss": 0.80524194, "num_input_tokens_seen": 215017995, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.703125, "step": 9982, "time_per_iteration": 2.500253200531006 }, { "auxiliary_loss_clip": 0.01110123, "auxiliary_loss_mlp": 0.0103499, "balance_loss_clip": 1.02127504, "balance_loss_mlp": 1.03929353, "epoch": 0.6002104313843379, "flos": 20667668705280.0, "grad_norm": 1.7258220328041802, "language_loss": 0.73212105, "learning_rate": 1.4549219583218686e-06, "loss": 0.75357223, "num_input_tokens_seen": 215038285, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.70703125, "step": 9983, "time_per_iteration": 2.5053844451904297 }, { "auxiliary_loss_clip": 0.01109833, "auxiliary_loss_mlp": 0.01034962, "balance_loss_clip": 1.02124071, "balance_loss_mlp": 1.03741717, "epoch": 0.6002705546370058, "flos": 22455229968000.0, "grad_norm": 2.147078404019152, "language_loss": 0.7839452, "learning_rate": 1.454547250154447e-06, "loss": 0.80539316, "num_input_tokens_seen": 215057825, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 9984, "time_per_iteration": 2.4685332775115967 }, { "auxiliary_loss_clip": 0.01110215, "auxiliary_loss_mlp": 0.01033179, "balance_loss_clip": 1.02044106, "balance_loss_mlp": 1.03897762, "epoch": 0.6003306778896739, "flos": 25191982080000.0, "grad_norm": 1.6104886646624763, "language_loss": 0.83108664, "learning_rate": 1.4541725626706485e-06, "loss": 0.85252053, "num_input_tokens_seen": 215077790, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 9985, "time_per_iteration": 2.634037733078003 }, { "auxiliary_loss_clip": 0.0111001, "auxiliary_loss_mlp": 0.01036129, "balance_loss_clip": 1.02348733, "balance_loss_mlp": 1.03870249, "epoch": 0.6003908011423418, "flos": 26687733252480.0, "grad_norm": 1.854009814094654, "language_loss": 0.71262944, "learning_rate": 1.4537978958846809e-06, "loss": 0.73409081, "num_input_tokens_seen": 215097650, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 9986, "time_per_iteration": 2.519221305847168 }, { "auxiliary_loss_clip": 0.01112701, "auxiliary_loss_mlp": 0.01033307, "balance_loss_clip": 1.01916242, "balance_loss_mlp": 1.04054022, "epoch": 0.6004509243950098, "flos": 22565080736640.0, "grad_norm": 1.384565899517498, "language_loss": 0.71867418, "learning_rate": 1.4534232498107514e-06, "loss": 0.74013424, "num_input_tokens_seen": 215118235, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.72265625, "step": 9987, "time_per_iteration": 2.476661443710327 }, { "auxiliary_loss_clip": 0.0110784, "auxiliary_loss_mlp": 0.01034226, "balance_loss_clip": 1.02115464, "balance_loss_mlp": 1.0377605, "epoch": 0.6005110476476777, "flos": 19719303868800.0, "grad_norm": 2.241504920741769, "language_loss": 0.8490386, "learning_rate": 1.4530486244630673e-06, "loss": 0.87045926, "num_input_tokens_seen": 215136755, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 9988, "time_per_iteration": 2.5398876667022705 }, { "auxiliary_loss_clip": 0.01108686, "auxiliary_loss_mlp": 0.01035787, "balance_loss_clip": 1.02211976, "balance_loss_mlp": 1.03833556, "epoch": 0.6005711709003457, "flos": 17712543859200.0, "grad_norm": 1.9237446923595327, "language_loss": 0.65364283, "learning_rate": 1.4526740198558346e-06, "loss": 0.67508757, "num_input_tokens_seen": 215155225, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 9989, "time_per_iteration": 2.440415859222412 }, { "auxiliary_loss_clip": 0.01108015, "auxiliary_loss_mlp": 0.01033269, "balance_loss_clip": 1.02073967, "balance_loss_mlp": 1.03750896, "epoch": 0.6006312941530136, "flos": 18514464946560.0, "grad_norm": 1.5537524782759444, "language_loss": 0.80086333, "learning_rate": 1.452299436003257e-06, "loss": 0.82227612, "num_input_tokens_seen": 215174815, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 9990, "time_per_iteration": 2.4548118114471436 }, { "auxiliary_loss_clip": 0.01112585, "auxiliary_loss_mlp": 0.01034583, "balance_loss_clip": 1.0215292, "balance_loss_mlp": 1.03869367, "epoch": 0.6006914174056817, "flos": 21390837223680.0, "grad_norm": 1.9108134343153167, "language_loss": 0.8277508, "learning_rate": 1.4519248729195403e-06, "loss": 0.84922248, "num_input_tokens_seen": 215192045, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 9991, "time_per_iteration": 2.452529191970825 }, { "auxiliary_loss_clip": 0.01107021, "auxiliary_loss_mlp": 0.01030513, "balance_loss_clip": 1.01785326, "balance_loss_mlp": 1.03819132, "epoch": 0.6007515406583496, "flos": 12750515349120.0, "grad_norm": 1.749712917675334, "language_loss": 0.82956862, "learning_rate": 1.4515503306188878e-06, "loss": 0.85094392, "num_input_tokens_seen": 215209885, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 9992, "time_per_iteration": 2.4366960525512695 }, { "auxiliary_loss_clip": 0.01108728, "auxiliary_loss_mlp": 0.01038405, "balance_loss_clip": 1.02468383, "balance_loss_mlp": 1.03929496, "epoch": 0.6008116639110176, "flos": 19206894401280.0, "grad_norm": 1.997923922093422, "language_loss": 0.66743624, "learning_rate": 1.4511758091155008e-06, "loss": 0.68890762, "num_input_tokens_seen": 215228150, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.6953125, "step": 9993, "time_per_iteration": 2.4322052001953125 }, { "auxiliary_loss_clip": 0.01111169, "auxiliary_loss_mlp": 0.01032014, "balance_loss_clip": 1.01884151, "balance_loss_mlp": 1.03948188, "epoch": 0.6008717871636855, "flos": 17055342668160.0, "grad_norm": 2.398727663090492, "language_loss": 0.81154716, "learning_rate": 1.4508013084235826e-06, "loss": 0.83297896, "num_input_tokens_seen": 215243755, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 9994, "time_per_iteration": 2.437717914581299 }, { "auxiliary_loss_clip": 0.01106487, "auxiliary_loss_mlp": 0.01027377, "balance_loss_clip": 1.0154202, "balance_loss_mlp": 1.03924727, "epoch": 0.6009319104163535, "flos": 20298686244480.0, "grad_norm": 1.8094908585902376, "language_loss": 0.72448564, "learning_rate": 1.4504268285573337e-06, "loss": 0.74582434, "num_input_tokens_seen": 215262130, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 9995, "time_per_iteration": 2.4325110912323 }, { "auxiliary_loss_clip": 0.01112169, "auxiliary_loss_mlp": 0.01031593, "balance_loss_clip": 1.01843858, "balance_loss_mlp": 1.03915226, "epoch": 0.6009920336690215, "flos": 21836776573440.0, "grad_norm": 1.8340172124337604, "language_loss": 0.80905139, "learning_rate": 1.4500523695309546e-06, "loss": 0.83048904, "num_input_tokens_seen": 215281785, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73046875, "step": 9996, "time_per_iteration": 2.4769127368927 }, { "auxiliary_loss_clip": 0.01110291, "auxiliary_loss_mlp": 0.01035502, "balance_loss_clip": 1.02188265, "balance_loss_mlp": 1.03987384, "epoch": 0.6010521569216895, "flos": 22596107109120.0, "grad_norm": 1.8611886314628563, "language_loss": 0.78681755, "learning_rate": 1.4496779313586447e-06, "loss": 0.80827546, "num_input_tokens_seen": 215297550, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.703125, "step": 9997, "time_per_iteration": 2.4368276596069336 }, { "auxiliary_loss_clip": 0.01112463, "auxiliary_loss_mlp": 0.01034008, "balance_loss_clip": 1.01992357, "balance_loss_mlp": 1.03908932, "epoch": 0.6011122801743575, "flos": 19171702051200.0, "grad_norm": 1.731692735751728, "language_loss": 0.73088539, "learning_rate": 1.4493035140546028e-06, "loss": 0.75235009, "num_input_tokens_seen": 215316360, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.734375, "step": 9998, "time_per_iteration": 2.484755039215088 }, { "auxiliary_loss_clip": 0.01107405, "auxiliary_loss_mlp": 0.01033096, "balance_loss_clip": 1.0203948, "balance_loss_mlp": 1.0381428, "epoch": 0.6011724034270254, "flos": 25010022758400.0, "grad_norm": 2.0334532156793985, "language_loss": 0.72368509, "learning_rate": 1.448929117633027e-06, "loss": 0.74509007, "num_input_tokens_seen": 215336405, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 9999, "time_per_iteration": 2.610144853591919 }, { "auxiliary_loss_clip": 0.01111697, "auxiliary_loss_mlp": 0.01036929, "balance_loss_clip": 1.0232141, "balance_loss_mlp": 1.03787816, "epoch": 0.6012325266796934, "flos": 21797669640960.0, "grad_norm": 2.0271798446727263, "language_loss": 0.78380394, "learning_rate": 1.4485547421081142e-06, "loss": 0.80529022, "num_input_tokens_seen": 215356590, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73828125, "step": 10000, "time_per_iteration": 2.478890895843506 }, { "auxiliary_loss_clip": 0.01116186, "auxiliary_loss_mlp": 0.01036332, "balance_loss_clip": 1.02181828, "balance_loss_mlp": 1.04142129, "epoch": 0.6012926499323613, "flos": 19573003774080.0, "grad_norm": 1.8681550520232428, "language_loss": 0.78030455, "learning_rate": 1.4481803874940608e-06, "loss": 0.8018297, "num_input_tokens_seen": 215374295, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.74609375, "step": 10001, "time_per_iteration": 2.4410669803619385 }, { "auxiliary_loss_clip": 0.01115054, "auxiliary_loss_mlp": 0.01031482, "balance_loss_clip": 1.01714098, "balance_loss_mlp": 1.04071712, "epoch": 0.6013527731850293, "flos": 34860786076800.0, "grad_norm": 2.5870821816399436, "language_loss": 0.59142494, "learning_rate": 1.4478060538050624e-06, "loss": 0.61289024, "num_input_tokens_seen": 215394535, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7421875, "step": 10002, "time_per_iteration": 2.58720064163208 }, { "auxiliary_loss_clip": 0.0111579, "auxiliary_loss_mlp": 0.01036995, "balance_loss_clip": 1.02235603, "balance_loss_mlp": 1.04262125, "epoch": 0.6014128964376972, "flos": 23291948355840.0, "grad_norm": 1.6640793785801153, "language_loss": 0.78106451, "learning_rate": 1.447431741055314e-06, "loss": 0.8025924, "num_input_tokens_seen": 215414355, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.73046875, "step": 10003, "time_per_iteration": 3.854046106338501 }, { "auxiliary_loss_clip": 0.01113936, "auxiliary_loss_mlp": 0.01038134, "balance_loss_clip": 1.02488375, "balance_loss_mlp": 1.04103792, "epoch": 0.6014730196903653, "flos": 24820916630400.0, "grad_norm": 2.271235209144909, "language_loss": 0.77740002, "learning_rate": 1.4470574492590091e-06, "loss": 0.79892075, "num_input_tokens_seen": 215428280, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 10004, "time_per_iteration": 2.4516589641571045 }, { "auxiliary_loss_clip": 0.0111042, "auxiliary_loss_mlp": 0.01030203, "balance_loss_clip": 1.01702452, "balance_loss_mlp": 1.03962207, "epoch": 0.6015331429430332, "flos": 23112359331840.0, "grad_norm": 1.5550896887567769, "language_loss": 0.72786111, "learning_rate": 1.4466831784303408e-06, "loss": 0.74926734, "num_input_tokens_seen": 215448970, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 10005, "time_per_iteration": 2.4708292484283447 }, { "auxiliary_loss_clip": 0.0110763, "auxiliary_loss_mlp": 0.01029901, "balance_loss_clip": 1.01681757, "balance_loss_mlp": 1.03945041, "epoch": 0.6015932661957012, "flos": 19201363706880.0, "grad_norm": 4.064002441878919, "language_loss": 0.7501964, "learning_rate": 1.4463089285835026e-06, "loss": 0.7715717, "num_input_tokens_seen": 215465260, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6796875, "step": 10006, "time_per_iteration": 2.4223599433898926 }, { "auxiliary_loss_clip": 0.01108565, "auxiliary_loss_mlp": 0.01032539, "balance_loss_clip": 1.0190506, "balance_loss_mlp": 1.03700924, "epoch": 0.6016533894483691, "flos": 18113630100480.0, "grad_norm": 2.2593119533170247, "language_loss": 0.74153692, "learning_rate": 1.445934699732685e-06, "loss": 0.76294792, "num_input_tokens_seen": 215482725, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 10007, "time_per_iteration": 3.9257686138153076 }, { "auxiliary_loss_clip": 0.01109728, "auxiliary_loss_mlp": 0.01029047, "balance_loss_clip": 1.01619005, "balance_loss_mlp": 1.03927422, "epoch": 0.6017135127010371, "flos": 16216900427520.0, "grad_norm": 1.988841396800588, "language_loss": 0.70115703, "learning_rate": 1.4455604918920785e-06, "loss": 0.72254479, "num_input_tokens_seen": 215500420, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 10008, "time_per_iteration": 5.244581937789917 }, { "auxiliary_loss_clip": 0.0110973, "auxiliary_loss_mlp": 0.01028182, "balance_loss_clip": 1.01577234, "balance_loss_mlp": 1.03893518, "epoch": 0.6017736359537051, "flos": 23444246021760.0, "grad_norm": 1.5060421782263085, "language_loss": 0.76511574, "learning_rate": 1.4451863050758748e-06, "loss": 0.78649485, "num_input_tokens_seen": 215522260, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 10009, "time_per_iteration": 2.5340538024902344 }, { "auxiliary_loss_clip": 0.01111976, "auxiliary_loss_mlp": 0.01034874, "balance_loss_clip": 1.02149892, "balance_loss_mlp": 1.04063201, "epoch": 0.601833759206373, "flos": 23514056104320.0, "grad_norm": 3.9417806387402763, "language_loss": 0.74008667, "learning_rate": 1.4448121392982608e-06, "loss": 0.76155519, "num_input_tokens_seen": 215541715, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 10010, "time_per_iteration": 2.527378559112549 }, { "auxiliary_loss_clip": 0.01040054, "auxiliary_loss_mlp": 0.01006534, "balance_loss_clip": 1.00517547, "balance_loss_mlp": 1.01508951, "epoch": 0.6018938824590411, "flos": 63991668648960.0, "grad_norm": 0.8046289717765454, "language_loss": 0.55122876, "learning_rate": 1.4444379945734268e-06, "loss": 0.57169461, "num_input_tokens_seen": 215603020, "router_z_loss_clip": 0.01361084, "router_z_loss_mlp": 0.25, "step": 10011, "time_per_iteration": 3.1601312160491943 }, { "auxiliary_loss_clip": 0.01111312, "auxiliary_loss_mlp": 0.01034291, "balance_loss_clip": 1.02157164, "balance_loss_mlp": 1.03906679, "epoch": 0.601954005711709, "flos": 34640007131520.0, "grad_norm": 1.540369270817469, "language_loss": 0.62304008, "learning_rate": 1.44406387091556e-06, "loss": 0.64449614, "num_input_tokens_seen": 215625115, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 10012, "time_per_iteration": 2.5873444080352783 }, { "auxiliary_loss_clip": 0.01110053, "auxiliary_loss_mlp": 0.01026745, "balance_loss_clip": 1.01474679, "balance_loss_mlp": 1.03999472, "epoch": 0.602014128964377, "flos": 19427062815360.0, "grad_norm": 2.236361532675874, "language_loss": 0.74965501, "learning_rate": 1.4436897683388462e-06, "loss": 0.77102292, "num_input_tokens_seen": 215643730, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69921875, "step": 10013, "time_per_iteration": 2.4313838481903076 }, { "auxiliary_loss_clip": 0.01105702, "auxiliary_loss_mlp": 0.01029295, "balance_loss_clip": 1.01691544, "balance_loss_mlp": 1.03849673, "epoch": 0.6020742522170449, "flos": 28329389470080.0, "grad_norm": 1.5986351306150175, "language_loss": 0.81185895, "learning_rate": 1.4433156868574732e-06, "loss": 0.83320892, "num_input_tokens_seen": 215664425, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.671875, "step": 10014, "time_per_iteration": 2.5305306911468506 }, { "auxiliary_loss_clip": 0.01105672, "auxiliary_loss_mlp": 0.01026948, "balance_loss_clip": 1.01411545, "balance_loss_mlp": 1.03853679, "epoch": 0.6021343754697129, "flos": 22747040058240.0, "grad_norm": 1.6152743686484738, "language_loss": 0.72503066, "learning_rate": 1.442941626485624e-06, "loss": 0.7463569, "num_input_tokens_seen": 215684280, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.671875, "step": 10015, "time_per_iteration": 2.463158369064331 }, { "auxiliary_loss_clip": 0.01039047, "auxiliary_loss_mlp": 0.01002568, "balance_loss_clip": 1.00116181, "balance_loss_mlp": 1.01403093, "epoch": 0.6021944987223808, "flos": 65752007402880.0, "grad_norm": 0.8487771932634287, "language_loss": 0.54845554, "learning_rate": 1.4425675872374848e-06, "loss": 0.56887168, "num_input_tokens_seen": 215739780, "router_z_loss_clip": 0.01403809, "router_z_loss_mlp": 0.25, "step": 10016, "time_per_iteration": 3.0069966316223145 }, { "auxiliary_loss_clip": 0.01109714, "auxiliary_loss_mlp": 0.01030226, "balance_loss_clip": 1.01712513, "balance_loss_mlp": 1.03952801, "epoch": 0.6022546219750489, "flos": 16105182151680.0, "grad_norm": 1.7816338521440518, "language_loss": 0.83211935, "learning_rate": 1.4421935691272381e-06, "loss": 0.85351872, "num_input_tokens_seen": 215757885, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 10017, "time_per_iteration": 2.4375669956207275 }, { "auxiliary_loss_clip": 0.01110108, "auxiliary_loss_mlp": 0.01030347, "balance_loss_clip": 1.017663, "balance_loss_mlp": 1.0415504, "epoch": 0.6023147452277168, "flos": 25512555985920.0, "grad_norm": 2.0815546284360003, "language_loss": 0.83737314, "learning_rate": 1.4418195721690677e-06, "loss": 0.85877764, "num_input_tokens_seen": 215776415, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 10018, "time_per_iteration": 2.471825361251831 }, { "auxiliary_loss_clip": 0.01111998, "auxiliary_loss_mlp": 0.01036846, "balance_loss_clip": 1.02301228, "balance_loss_mlp": 1.0384903, "epoch": 0.6023748684803848, "flos": 22636075968000.0, "grad_norm": 1.7313515337794514, "language_loss": 0.78114909, "learning_rate": 1.4414455963771549e-06, "loss": 0.80263758, "num_input_tokens_seen": 215794865, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 10019, "time_per_iteration": 2.4944865703582764 }, { "auxiliary_loss_clip": 0.01107653, "auxiliary_loss_mlp": 0.01032063, "balance_loss_clip": 1.01993966, "balance_loss_mlp": 1.03682065, "epoch": 0.6024349917330527, "flos": 26210444307840.0, "grad_norm": 2.0609804879589837, "language_loss": 0.73774385, "learning_rate": 1.441071641765681e-06, "loss": 0.75914097, "num_input_tokens_seen": 215816840, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.7109375, "step": 10020, "time_per_iteration": 2.50858473777771 }, { "auxiliary_loss_clip": 0.01110772, "auxiliary_loss_mlp": 0.01034745, "balance_loss_clip": 1.02162647, "balance_loss_mlp": 1.03967619, "epoch": 0.6024951149857207, "flos": 21251755762560.0, "grad_norm": 6.428636849624239, "language_loss": 0.64200354, "learning_rate": 1.4406977083488264e-06, "loss": 0.66345865, "num_input_tokens_seen": 215836100, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 10021, "time_per_iteration": 2.509437322616577 }, { "auxiliary_loss_clip": 0.01109029, "auxiliary_loss_mlp": 0.01032735, "balance_loss_clip": 1.01966953, "balance_loss_mlp": 1.03812838, "epoch": 0.6025552382383887, "flos": 26943453152640.0, "grad_norm": 1.382995565231143, "language_loss": 0.80534339, "learning_rate": 1.4403237961407704e-06, "loss": 0.82676101, "num_input_tokens_seen": 215858480, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 10022, "time_per_iteration": 2.5189368724823 }, { "auxiliary_loss_clip": 0.01113904, "auxiliary_loss_mlp": 0.01031023, "balance_loss_clip": 1.01786852, "balance_loss_mlp": 1.04033971, "epoch": 0.6026153614910567, "flos": 31684379495040.0, "grad_norm": 1.637851686818981, "language_loss": 0.66323328, "learning_rate": 1.439949905155693e-06, "loss": 0.68468249, "num_input_tokens_seen": 215879950, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 10023, "time_per_iteration": 2.5734333992004395 }, { "auxiliary_loss_clip": 0.0111234, "auxiliary_loss_mlp": 0.01033844, "balance_loss_clip": 1.02084398, "balance_loss_mlp": 1.0394938, "epoch": 0.6026754847437247, "flos": 29312731175040.0, "grad_norm": 2.0090279281873697, "language_loss": 0.73861533, "learning_rate": 1.4395760354077707e-06, "loss": 0.76007712, "num_input_tokens_seen": 215899830, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 10024, "time_per_iteration": 2.5080244541168213 }, { "auxiliary_loss_clip": 0.01108499, "auxiliary_loss_mlp": 0.01033433, "balance_loss_clip": 1.02005768, "balance_loss_mlp": 1.03855419, "epoch": 0.6027356079963926, "flos": 23586775188480.0, "grad_norm": 1.7755024428349238, "language_loss": 0.73077238, "learning_rate": 1.4392021869111815e-06, "loss": 0.75219172, "num_input_tokens_seen": 215920440, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.69921875, "step": 10025, "time_per_iteration": 2.486299753189087 }, { "auxiliary_loss_clip": 0.01114839, "auxiliary_loss_mlp": 0.01033958, "balance_loss_clip": 1.02043939, "balance_loss_mlp": 1.03905082, "epoch": 0.6027957312490606, "flos": 20813753318400.0, "grad_norm": 2.3146800484149757, "language_loss": 0.67561591, "learning_rate": 1.4388283596801016e-06, "loss": 0.69710386, "num_input_tokens_seen": 215940535, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7578125, "step": 10026, "time_per_iteration": 2.508047103881836 }, { "auxiliary_loss_clip": 0.01104217, "auxiliary_loss_mlp": 0.01033979, "balance_loss_clip": 1.02150929, "balance_loss_mlp": 1.03669381, "epoch": 0.6028558545017285, "flos": 19935773182080.0, "grad_norm": 1.8907679556414863, "language_loss": 0.80016589, "learning_rate": 1.4384545537287061e-06, "loss": 0.82154787, "num_input_tokens_seen": 215958045, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.67578125, "step": 10027, "time_per_iteration": 2.4571244716644287 }, { "auxiliary_loss_clip": 0.01112923, "auxiliary_loss_mlp": 0.01032852, "balance_loss_clip": 1.01981628, "balance_loss_mlp": 1.0397774, "epoch": 0.6029159777543965, "flos": 22820836550400.0, "grad_norm": 2.1010341421020993, "language_loss": 0.71509123, "learning_rate": 1.438080769071171e-06, "loss": 0.73654896, "num_input_tokens_seen": 215977330, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.73046875, "step": 10028, "time_per_iteration": 2.4732468128204346 }, { "auxiliary_loss_clip": 0.0111308, "auxiliary_loss_mlp": 0.01040629, "balance_loss_clip": 1.02706909, "balance_loss_mlp": 1.03943372, "epoch": 0.6029761010070644, "flos": 23587242065280.0, "grad_norm": 1.861859082268613, "language_loss": 0.8402155, "learning_rate": 1.437707005721669e-06, "loss": 0.86175257, "num_input_tokens_seen": 215997865, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 10029, "time_per_iteration": 2.482649326324463 }, { "auxiliary_loss_clip": 0.01105536, "auxiliary_loss_mlp": 0.01036607, "balance_loss_clip": 1.02377975, "balance_loss_mlp": 1.03618991, "epoch": 0.6030362242597325, "flos": 13662430859520.0, "grad_norm": 1.8254956941909273, "language_loss": 0.80328918, "learning_rate": 1.437333263694373e-06, "loss": 0.82471061, "num_input_tokens_seen": 216016230, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 10030, "time_per_iteration": 2.420524835586548 }, { "auxiliary_loss_clip": 0.01111083, "auxiliary_loss_mlp": 0.01033278, "balance_loss_clip": 1.02048039, "balance_loss_mlp": 1.03996909, "epoch": 0.6030963475124004, "flos": 24422883045120.0, "grad_norm": 1.5782630997836322, "language_loss": 0.71002841, "learning_rate": 1.4369595430034572e-06, "loss": 0.73147196, "num_input_tokens_seen": 216035785, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 10031, "time_per_iteration": 2.499713897705078 }, { "auxiliary_loss_clip": 0.01112592, "auxiliary_loss_mlp": 0.01037727, "balance_loss_clip": 1.02266455, "balance_loss_mlp": 1.03730714, "epoch": 0.6031564707650684, "flos": 29644043247360.0, "grad_norm": 1.746886347777059, "language_loss": 0.73386979, "learning_rate": 1.4365858436630912e-06, "loss": 0.75537294, "num_input_tokens_seen": 216059555, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.75390625, "step": 10032, "time_per_iteration": 2.5432543754577637 }, { "auxiliary_loss_clip": 0.01114456, "auxiliary_loss_mlp": 0.01033866, "balance_loss_clip": 1.01987648, "balance_loss_mlp": 1.04137886, "epoch": 0.6032165940177363, "flos": 16618776768000.0, "grad_norm": 1.814237704126161, "language_loss": 0.68514252, "learning_rate": 1.4362121656874465e-06, "loss": 0.70662576, "num_input_tokens_seen": 216077235, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.73046875, "step": 10033, "time_per_iteration": 2.441859722137451 }, { "auxiliary_loss_clip": 0.01111889, "auxiliary_loss_mlp": 0.01034924, "balance_loss_clip": 1.02165031, "balance_loss_mlp": 1.04121268, "epoch": 0.6032767172704043, "flos": 17488173553920.0, "grad_norm": 1.9038299755699248, "language_loss": 0.75625801, "learning_rate": 1.4358385090906934e-06, "loss": 0.77772617, "num_input_tokens_seen": 216094985, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 10034, "time_per_iteration": 2.459385871887207 }, { "auxiliary_loss_clip": 0.01111425, "auxiliary_loss_mlp": 0.01033671, "balance_loss_clip": 1.01992631, "balance_loss_mlp": 1.03878856, "epoch": 0.6033368405230723, "flos": 26832955939200.0, "grad_norm": 1.8528232493882504, "language_loss": 0.74467546, "learning_rate": 1.4354648738870004e-06, "loss": 0.76612639, "num_input_tokens_seen": 216115905, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7265625, "step": 10035, "time_per_iteration": 2.56459379196167 }, { "auxiliary_loss_clip": 0.01107589, "auxiliary_loss_mlp": 0.01027504, "balance_loss_clip": 1.01513553, "balance_loss_mlp": 1.03823614, "epoch": 0.6033969637757403, "flos": 16909904499840.0, "grad_norm": 1.5653691413253734, "language_loss": 0.86459589, "learning_rate": 1.435091260090536e-06, "loss": 0.88594675, "num_input_tokens_seen": 216132420, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 10036, "time_per_iteration": 2.438783884048462 }, { "auxiliary_loss_clip": 0.01111398, "auxiliary_loss_mlp": 0.01034027, "balance_loss_clip": 1.02007973, "balance_loss_mlp": 1.03934503, "epoch": 0.6034570870284083, "flos": 22930076787840.0, "grad_norm": 2.4681800641301104, "language_loss": 0.70625639, "learning_rate": 1.4347176677154676e-06, "loss": 0.7277106, "num_input_tokens_seen": 216149800, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71875, "step": 10037, "time_per_iteration": 2.493516445159912 }, { "auxiliary_loss_clip": 0.01109757, "auxiliary_loss_mlp": 0.01034586, "balance_loss_clip": 1.02143121, "balance_loss_mlp": 1.04028416, "epoch": 0.6035172102810762, "flos": 23366319465600.0, "grad_norm": 1.8784745126582418, "language_loss": 0.8508324, "learning_rate": 1.4343440967759616e-06, "loss": 0.87227583, "num_input_tokens_seen": 216168200, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6953125, "step": 10038, "time_per_iteration": 2.4624078273773193 }, { "auxiliary_loss_clip": 0.01113035, "auxiliary_loss_mlp": 0.01034628, "balance_loss_clip": 1.0210557, "balance_loss_mlp": 1.03962266, "epoch": 0.6035773335337442, "flos": 20887082933760.0, "grad_norm": 2.15015137050276, "language_loss": 0.76151413, "learning_rate": 1.4339705472861846e-06, "loss": 0.78299075, "num_input_tokens_seen": 216187105, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 10039, "time_per_iteration": 2.462874174118042 }, { "auxiliary_loss_clip": 0.01108261, "auxiliary_loss_mlp": 0.01028246, "balance_loss_clip": 1.01546121, "balance_loss_mlp": 1.0377667, "epoch": 0.6036374567864121, "flos": 24936298093440.0, "grad_norm": 1.7847972226837203, "language_loss": 0.71213257, "learning_rate": 1.433597019260301e-06, "loss": 0.73349762, "num_input_tokens_seen": 216205440, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 10040, "time_per_iteration": 2.4895224571228027 }, { "auxiliary_loss_clip": 0.01114904, "auxiliary_loss_mlp": 0.01031965, "balance_loss_clip": 1.01653337, "balance_loss_mlp": 1.03982008, "epoch": 0.6036975800390801, "flos": 23148269953920.0, "grad_norm": 2.254388165031413, "language_loss": 0.78300667, "learning_rate": 1.433223512712475e-06, "loss": 0.80447537, "num_input_tokens_seen": 216223130, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.75, "step": 10041, "time_per_iteration": 2.464939832687378 }, { "auxiliary_loss_clip": 0.01110157, "auxiliary_loss_mlp": 0.01029386, "balance_loss_clip": 1.01653504, "balance_loss_mlp": 1.03936267, "epoch": 0.603757703291748, "flos": 18660729127680.0, "grad_norm": 1.7843592755920021, "language_loss": 0.75767046, "learning_rate": 1.4328500276568704e-06, "loss": 0.77906591, "num_input_tokens_seen": 216240260, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 10042, "time_per_iteration": 2.4302637577056885 }, { "auxiliary_loss_clip": 0.01108332, "auxiliary_loss_mlp": 0.01031084, "balance_loss_clip": 1.0182507, "balance_loss_mlp": 1.03807247, "epoch": 0.6038178265444161, "flos": 19682603147520.0, "grad_norm": 1.8349010967935728, "language_loss": 0.84785879, "learning_rate": 1.4324765641076498e-06, "loss": 0.86925292, "num_input_tokens_seen": 216258510, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 10043, "time_per_iteration": 2.464634656906128 }, { "auxiliary_loss_clip": 0.01112103, "auxiliary_loss_mlp": 0.01038799, "balance_loss_clip": 1.02410018, "balance_loss_mlp": 1.03869367, "epoch": 0.603877949797084, "flos": 22638230784000.0, "grad_norm": 2.067739168042627, "language_loss": 0.69909704, "learning_rate": 1.432103122078974e-06, "loss": 0.72060603, "num_input_tokens_seen": 216277550, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.734375, "step": 10044, "time_per_iteration": 2.469123363494873 }, { "auxiliary_loss_clip": 0.01114938, "auxiliary_loss_mlp": 0.01033156, "balance_loss_clip": 1.01852274, "balance_loss_mlp": 1.04173279, "epoch": 0.603938073049752, "flos": 25447881548160.0, "grad_norm": 1.8512244529464965, "language_loss": 0.78179848, "learning_rate": 1.4317297015850057e-06, "loss": 0.8032794, "num_input_tokens_seen": 216296690, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.734375, "step": 10045, "time_per_iteration": 3.9312281608581543 }, { "auxiliary_loss_clip": 0.01107824, "auxiliary_loss_mlp": 0.01034409, "balance_loss_clip": 1.02112293, "balance_loss_mlp": 1.03809428, "epoch": 0.6039981963024199, "flos": 22340135813760.0, "grad_norm": 2.7298954941808846, "language_loss": 0.76826096, "learning_rate": 1.4313563026399036e-06, "loss": 0.78968334, "num_input_tokens_seen": 216316110, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 10046, "time_per_iteration": 2.499753952026367 }, { "auxiliary_loss_clip": 0.01107721, "auxiliary_loss_mlp": 0.01031483, "balance_loss_clip": 1.01897764, "balance_loss_mlp": 1.03680825, "epoch": 0.6040583195550879, "flos": 20703148364160.0, "grad_norm": 1.6599848001249633, "language_loss": 0.87147832, "learning_rate": 1.430982925257827e-06, "loss": 0.89287043, "num_input_tokens_seen": 216333855, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 10047, "time_per_iteration": 2.4604532718658447 }, { "auxiliary_loss_clip": 0.0110914, "auxiliary_loss_mlp": 0.01032708, "balance_loss_clip": 1.02022672, "balance_loss_mlp": 1.04024315, "epoch": 0.604118442807756, "flos": 27163118776320.0, "grad_norm": 1.709359511025717, "language_loss": 0.75459135, "learning_rate": 1.4306095694529358e-06, "loss": 0.77600986, "num_input_tokens_seen": 216354890, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 10048, "time_per_iteration": 2.536571741104126 }, { "auxiliary_loss_clip": 0.01117083, "auxiliary_loss_mlp": 0.01046089, "balance_loss_clip": 1.03009093, "balance_loss_mlp": 1.0400362, "epoch": 0.6041785660604239, "flos": 30881524654080.0, "grad_norm": 2.003273841531189, "language_loss": 0.66244602, "learning_rate": 1.430236235239386e-06, "loss": 0.68407774, "num_input_tokens_seen": 216376055, "router_z_loss_clip": 0.16015625, "router_z_loss_mlp": 0.76953125, "step": 10049, "time_per_iteration": 5.337399959564209 }, { "auxiliary_loss_clip": 0.01108066, "auxiliary_loss_mlp": 0.01031867, "balance_loss_clip": 1.01916516, "balance_loss_mlp": 1.03820753, "epoch": 0.6042386893130919, "flos": 19938215306880.0, "grad_norm": 1.677641020104941, "language_loss": 0.66762984, "learning_rate": 1.429862922631336e-06, "loss": 0.68902916, "num_input_tokens_seen": 216396295, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 10050, "time_per_iteration": 3.8696422576904297 }, { "auxiliary_loss_clip": 0.01113053, "auxiliary_loss_mlp": 0.01035631, "balance_loss_clip": 1.02241075, "balance_loss_mlp": 1.04121685, "epoch": 0.6042988125657598, "flos": 32415915882240.0, "grad_norm": 1.818094784514993, "language_loss": 0.69687945, "learning_rate": 1.4294896316429408e-06, "loss": 0.71836627, "num_input_tokens_seen": 216416605, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 10051, "time_per_iteration": 2.561283588409424 }, { "auxiliary_loss_clip": 0.01105833, "auxiliary_loss_mlp": 0.01031469, "balance_loss_clip": 1.01817119, "balance_loss_mlp": 1.03510356, "epoch": 0.6043589358184278, "flos": 17420805596160.0, "grad_norm": 1.9105852597736606, "language_loss": 0.64469659, "learning_rate": 1.4291163622883553e-06, "loss": 0.66606963, "num_input_tokens_seen": 216435130, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 10052, "time_per_iteration": 2.4176933765411377 }, { "auxiliary_loss_clip": 0.01110343, "auxiliary_loss_mlp": 0.01035013, "balance_loss_clip": 1.02120221, "balance_loss_mlp": 1.03843546, "epoch": 0.6044190590710957, "flos": 27672834723840.0, "grad_norm": 2.122043049681095, "language_loss": 0.68866205, "learning_rate": 1.4287431145817358e-06, "loss": 0.71011561, "num_input_tokens_seen": 216455640, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 10053, "time_per_iteration": 2.5080416202545166 }, { "auxiliary_loss_clip": 0.01040435, "auxiliary_loss_mlp": 0.01003848, "balance_loss_clip": 1.00254822, "balance_loss_mlp": 1.01589298, "epoch": 0.6044791823237637, "flos": 65316267515520.0, "grad_norm": 0.7361332583712783, "language_loss": 0.60425609, "learning_rate": 1.4283698885372336e-06, "loss": 0.62469894, "num_input_tokens_seen": 216518130, "router_z_loss_clip": 0.01300049, "router_z_loss_mlp": 0.24511719, "step": 10054, "time_per_iteration": 3.184220552444458 }, { "auxiliary_loss_clip": 0.01107969, "auxiliary_loss_mlp": 0.01031829, "balance_loss_clip": 1.0177269, "balance_loss_mlp": 1.03804696, "epoch": 0.6045393055764317, "flos": 24492369905280.0, "grad_norm": 1.8118417875498751, "language_loss": 0.8580547, "learning_rate": 1.4279966841690027e-06, "loss": 0.87945265, "num_input_tokens_seen": 216536845, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.69921875, "step": 10055, "time_per_iteration": 2.50118350982666 }, { "auxiliary_loss_clip": 0.01116679, "auxiliary_loss_mlp": 0.01042602, "balance_loss_clip": 1.02760577, "balance_loss_mlp": 1.04371405, "epoch": 0.6045994288290997, "flos": 19054345340160.0, "grad_norm": 3.208354633718491, "language_loss": 0.73351526, "learning_rate": 1.4276235014911952e-06, "loss": 0.75510806, "num_input_tokens_seen": 216551860, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.73046875, "step": 10056, "time_per_iteration": 2.4688563346862793 }, { "auxiliary_loss_clip": 0.01109892, "auxiliary_loss_mlp": 0.01035079, "balance_loss_clip": 1.02197766, "balance_loss_mlp": 1.04071641, "epoch": 0.6046595520817676, "flos": 26576697335040.0, "grad_norm": 1.4858236320121538, "language_loss": 0.79904759, "learning_rate": 1.4272503405179616e-06, "loss": 0.82049733, "num_input_tokens_seen": 216574775, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69140625, "step": 10057, "time_per_iteration": 2.5470964908599854 }, { "auxiliary_loss_clip": 0.01108744, "auxiliary_loss_mlp": 0.01032339, "balance_loss_clip": 1.01843309, "balance_loss_mlp": 1.03908074, "epoch": 0.6047196753344356, "flos": 13582277660160.0, "grad_norm": 2.472124056274514, "language_loss": 0.74924386, "learning_rate": 1.4268772012634527e-06, "loss": 0.77065468, "num_input_tokens_seen": 216590100, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.6953125, "step": 10058, "time_per_iteration": 2.417501926422119 }, { "auxiliary_loss_clip": 0.01106637, "auxiliary_loss_mlp": 0.01029391, "balance_loss_clip": 1.01652813, "balance_loss_mlp": 1.03722537, "epoch": 0.6047797985871035, "flos": 25520456977920.0, "grad_norm": 2.3957193792948, "language_loss": 0.71222407, "learning_rate": 1.4265040837418176e-06, "loss": 0.7335844, "num_input_tokens_seen": 216610145, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 10059, "time_per_iteration": 2.523494243621826 }, { "auxiliary_loss_clip": 0.01109325, "auxiliary_loss_mlp": 0.01030952, "balance_loss_clip": 1.01772594, "balance_loss_mlp": 1.03879499, "epoch": 0.6048399218397715, "flos": 20520147548160.0, "grad_norm": 1.6041796063644473, "language_loss": 0.76163769, "learning_rate": 1.4261309879672054e-06, "loss": 0.78304052, "num_input_tokens_seen": 216630625, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 10060, "time_per_iteration": 2.4786245822906494 }, { "auxiliary_loss_clip": 0.01107689, "auxiliary_loss_mlp": 0.01030879, "balance_loss_clip": 1.01790285, "balance_loss_mlp": 1.0382762, "epoch": 0.6049000450924396, "flos": 20408788408320.0, "grad_norm": 1.927182501614926, "language_loss": 0.73466569, "learning_rate": 1.4257579139537628e-06, "loss": 0.75605142, "num_input_tokens_seen": 216649255, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 10061, "time_per_iteration": 2.477839708328247 }, { "auxiliary_loss_clip": 0.0111153, "auxiliary_loss_mlp": 0.01030402, "balance_loss_clip": 1.01755154, "balance_loss_mlp": 1.0395472, "epoch": 0.6049601683451075, "flos": 20741357456640.0, "grad_norm": 2.091495719216888, "language_loss": 0.67494869, "learning_rate": 1.425384861715639e-06, "loss": 0.69636798, "num_input_tokens_seen": 216668100, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 10062, "time_per_iteration": 2.492882013320923 }, { "auxiliary_loss_clip": 0.01109379, "auxiliary_loss_mlp": 0.01041368, "balance_loss_clip": 1.02771854, "balance_loss_mlp": 1.03863633, "epoch": 0.6050202915977755, "flos": 20083114771200.0, "grad_norm": 2.174369098577527, "language_loss": 0.71665156, "learning_rate": 1.425011831266978e-06, "loss": 0.738159, "num_input_tokens_seen": 216686125, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.70703125, "step": 10063, "time_per_iteration": 2.465554714202881 }, { "auxiliary_loss_clip": 0.01107688, "auxiliary_loss_mlp": 0.01033789, "balance_loss_clip": 1.02072918, "balance_loss_mlp": 1.03916907, "epoch": 0.6050804148504434, "flos": 15960821391360.0, "grad_norm": 1.7803416820266695, "language_loss": 0.85271335, "learning_rate": 1.424638822621926e-06, "loss": 0.8741281, "num_input_tokens_seen": 216704265, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.68359375, "step": 10064, "time_per_iteration": 2.46276593208313 }, { "auxiliary_loss_clip": 0.01107179, "auxiliary_loss_mlp": 0.01030627, "balance_loss_clip": 1.01750767, "balance_loss_mlp": 1.03732347, "epoch": 0.6051405381031114, "flos": 17456644391040.0, "grad_norm": 2.217251400326383, "language_loss": 0.79765201, "learning_rate": 1.4242658357946278e-06, "loss": 0.81903005, "num_input_tokens_seen": 216721765, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 10065, "time_per_iteration": 2.4421896934509277 }, { "auxiliary_loss_clip": 0.01113776, "auxiliary_loss_mlp": 0.01033387, "balance_loss_clip": 1.0189805, "balance_loss_mlp": 1.0403254, "epoch": 0.6052006613557793, "flos": 11400130517760.0, "grad_norm": 2.742485301785903, "language_loss": 0.78707933, "learning_rate": 1.423892870799226e-06, "loss": 0.80855095, "num_input_tokens_seen": 216738295, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.734375, "step": 10066, "time_per_iteration": 2.434762716293335 }, { "auxiliary_loss_clip": 0.01109083, "auxiliary_loss_mlp": 0.01033098, "balance_loss_clip": 1.01960921, "balance_loss_mlp": 1.03880548, "epoch": 0.6052607846084473, "flos": 24750998807040.0, "grad_norm": 1.6472471123563739, "language_loss": 0.73138988, "learning_rate": 1.4235199276498655e-06, "loss": 0.75281173, "num_input_tokens_seen": 216759875, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 10067, "time_per_iteration": 2.5166802406311035 }, { "auxiliary_loss_clip": 0.01109572, "auxiliary_loss_mlp": 0.01030239, "balance_loss_clip": 1.01745963, "balance_loss_mlp": 1.03986394, "epoch": 0.6053209078611153, "flos": 20741141975040.0, "grad_norm": 1.404377629667343, "language_loss": 0.68994397, "learning_rate": 1.4231470063606863e-06, "loss": 0.7113421, "num_input_tokens_seen": 216780705, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 10068, "time_per_iteration": 2.5086042881011963 }, { "auxiliary_loss_clip": 0.01108308, "auxiliary_loss_mlp": 0.01032595, "balance_loss_clip": 1.01962519, "balance_loss_mlp": 1.03629851, "epoch": 0.6053810311137833, "flos": 18953149749120.0, "grad_norm": 2.0888407582476636, "language_loss": 0.87367427, "learning_rate": 1.4227741069458303e-06, "loss": 0.89508325, "num_input_tokens_seen": 216797625, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 10069, "time_per_iteration": 2.435866117477417 }, { "auxiliary_loss_clip": 0.01108344, "auxiliary_loss_mlp": 0.01027346, "balance_loss_clip": 1.01447129, "balance_loss_mlp": 1.03789401, "epoch": 0.6054411543664512, "flos": 23951124794880.0, "grad_norm": 1.5204068284930055, "language_loss": 0.8304255, "learning_rate": 1.4224012294194387e-06, "loss": 0.85178244, "num_input_tokens_seen": 216817610, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 10070, "time_per_iteration": 2.471174955368042 }, { "auxiliary_loss_clip": 0.01110262, "auxiliary_loss_mlp": 0.01031909, "balance_loss_clip": 1.01886737, "balance_loss_mlp": 1.03863358, "epoch": 0.6055012776191192, "flos": 20593979953920.0, "grad_norm": 1.5680918653065936, "language_loss": 0.86088777, "learning_rate": 1.4220283737956496e-06, "loss": 0.8823095, "num_input_tokens_seen": 216836835, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 10071, "time_per_iteration": 2.478111505508423 }, { "auxiliary_loss_clip": 0.01113271, "auxiliary_loss_mlp": 0.01036969, "balance_loss_clip": 1.02277684, "balance_loss_mlp": 1.04075289, "epoch": 0.6055614008717871, "flos": 30298191782400.0, "grad_norm": 1.6750244940447858, "language_loss": 0.76856387, "learning_rate": 1.421655540088603e-06, "loss": 0.79006624, "num_input_tokens_seen": 216856760, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7265625, "step": 10072, "time_per_iteration": 2.529010772705078 }, { "auxiliary_loss_clip": 0.01107451, "auxiliary_loss_mlp": 0.01030434, "balance_loss_clip": 1.01596248, "balance_loss_mlp": 1.03597212, "epoch": 0.6056215241244551, "flos": 27125017424640.0, "grad_norm": 3.1397999407182877, "language_loss": 0.74309337, "learning_rate": 1.4212827283124367e-06, "loss": 0.76447225, "num_input_tokens_seen": 216878795, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.71484375, "step": 10073, "time_per_iteration": 2.508399724960327 }, { "auxiliary_loss_clip": 0.01038945, "auxiliary_loss_mlp": 0.0100091, "balance_loss_clip": 0.99948579, "balance_loss_mlp": 1.01456356, "epoch": 0.6056816473771232, "flos": 56007323925120.0, "grad_norm": 0.8372125988059299, "language_loss": 0.55211401, "learning_rate": 1.4209099384812863e-06, "loss": 0.57251257, "num_input_tokens_seen": 216937800, "router_z_loss_clip": 0.01422119, "router_z_loss_mlp": 0.24414062, "step": 10074, "time_per_iteration": 3.130552291870117 }, { "auxiliary_loss_clip": 0.01109659, "auxiliary_loss_mlp": 0.01033162, "balance_loss_clip": 1.0203712, "balance_loss_mlp": 1.04014993, "epoch": 0.6057417706297911, "flos": 23549499849600.0, "grad_norm": 2.571047734876232, "language_loss": 0.82297194, "learning_rate": 1.4205371706092894e-06, "loss": 0.84440017, "num_input_tokens_seen": 216955280, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 10075, "time_per_iteration": 2.4773929119110107 }, { "auxiliary_loss_clip": 0.0111, "auxiliary_loss_mlp": 0.01026216, "balance_loss_clip": 1.01253712, "balance_loss_mlp": 1.03883457, "epoch": 0.6058018938824591, "flos": 27744296832000.0, "grad_norm": 2.124983613856651, "language_loss": 0.78412139, "learning_rate": 1.4201644247105813e-06, "loss": 0.80548358, "num_input_tokens_seen": 216976950, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7109375, "step": 10076, "time_per_iteration": 2.503326892852783 }, { "auxiliary_loss_clip": 0.01111754, "auxiliary_loss_mlp": 0.010323, "balance_loss_clip": 1.01871002, "balance_loss_mlp": 1.03857183, "epoch": 0.605862017135127, "flos": 22783381643520.0, "grad_norm": 2.125558215461618, "language_loss": 0.72470593, "learning_rate": 1.4197917007992964e-06, "loss": 0.74614656, "num_input_tokens_seen": 216996945, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 10077, "time_per_iteration": 2.4874937534332275 }, { "auxiliary_loss_clip": 0.01110545, "auxiliary_loss_mlp": 0.01034707, "balance_loss_clip": 1.02059245, "balance_loss_mlp": 1.03919911, "epoch": 0.605922140387795, "flos": 21215019127680.0, "grad_norm": 2.3099475409867725, "language_loss": 0.55238163, "learning_rate": 1.4194189988895682e-06, "loss": 0.57383418, "num_input_tokens_seen": 217016580, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.71484375, "step": 10078, "time_per_iteration": 2.470698118209839 }, { "auxiliary_loss_clip": 0.01111665, "auxiliary_loss_mlp": 0.01032024, "balance_loss_clip": 1.01870799, "balance_loss_mlp": 1.03859854, "epoch": 0.6059822636404629, "flos": 27268372604160.0, "grad_norm": 1.685293458217093, "language_loss": 0.70751065, "learning_rate": 1.4190463189955297e-06, "loss": 0.72894752, "num_input_tokens_seen": 217037300, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 10079, "time_per_iteration": 2.5233700275421143 }, { "auxiliary_loss_clip": 0.01108377, "auxiliary_loss_mlp": 0.01035247, "balance_loss_clip": 1.02212214, "balance_loss_mlp": 1.03815484, "epoch": 0.606042386893131, "flos": 20631327120000.0, "grad_norm": 1.8380494790365534, "language_loss": 0.62702447, "learning_rate": 1.4186736611313131e-06, "loss": 0.64846075, "num_input_tokens_seen": 217055805, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 10080, "time_per_iteration": 2.504793405532837 }, { "auxiliary_loss_clip": 0.01111026, "auxiliary_loss_mlp": 0.01032384, "balance_loss_clip": 1.01884723, "balance_loss_mlp": 1.03904247, "epoch": 0.6061025101457989, "flos": 23002293081600.0, "grad_norm": 2.0184105664583756, "language_loss": 0.70962381, "learning_rate": 1.4183010253110492e-06, "loss": 0.73105788, "num_input_tokens_seen": 217074175, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 10081, "time_per_iteration": 2.4910144805908203 }, { "auxiliary_loss_clip": 0.01109107, "auxiliary_loss_mlp": 0.01030973, "balance_loss_clip": 1.01714468, "balance_loss_mlp": 1.03907561, "epoch": 0.6061626333984669, "flos": 29898937134720.0, "grad_norm": 1.6884111768408763, "language_loss": 0.69583631, "learning_rate": 1.4179284115488691e-06, "loss": 0.71723717, "num_input_tokens_seen": 217095695, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.69921875, "step": 10082, "time_per_iteration": 2.518463134765625 }, { "auxiliary_loss_clip": 0.01111897, "auxiliary_loss_mlp": 0.01030679, "balance_loss_clip": 1.01748812, "balance_loss_mlp": 1.04140496, "epoch": 0.6062227566511348, "flos": 25009196745600.0, "grad_norm": 1.4507090830218798, "language_loss": 0.66070443, "learning_rate": 1.4175558198589015e-06, "loss": 0.68213022, "num_input_tokens_seen": 217116260, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 10083, "time_per_iteration": 2.5194742679595947 }, { "auxiliary_loss_clip": 0.0110938, "auxiliary_loss_mlp": 0.01029982, "balance_loss_clip": 1.0167259, "balance_loss_mlp": 1.03787458, "epoch": 0.6062828799038028, "flos": 19463943104640.0, "grad_norm": 5.109737468098994, "language_loss": 0.74039787, "learning_rate": 1.4171832502552764e-06, "loss": 0.76179147, "num_input_tokens_seen": 217134465, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 10084, "time_per_iteration": 2.4718873500823975 }, { "auxiliary_loss_clip": 0.01110553, "auxiliary_loss_mlp": 0.01034165, "balance_loss_clip": 1.02110577, "balance_loss_mlp": 1.03897238, "epoch": 0.6063430031564707, "flos": 13589568120960.0, "grad_norm": 2.3221412599597016, "language_loss": 0.72278458, "learning_rate": 1.4168107027521204e-06, "loss": 0.7442317, "num_input_tokens_seen": 217149920, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 10085, "time_per_iteration": 2.4795024394989014 }, { "auxiliary_loss_clip": 0.01111085, "auxiliary_loss_mlp": 0.01036225, "balance_loss_clip": 1.02350497, "balance_loss_mlp": 1.04062068, "epoch": 0.6064031264091387, "flos": 23255499029760.0, "grad_norm": 2.4653569836005893, "language_loss": 0.76237011, "learning_rate": 1.4164381773635605e-06, "loss": 0.78384328, "num_input_tokens_seen": 217168165, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 10086, "time_per_iteration": 3.847356081008911 }, { "auxiliary_loss_clip": 0.01108967, "auxiliary_loss_mlp": 0.010352, "balance_loss_clip": 1.02184272, "balance_loss_mlp": 1.0403707, "epoch": 0.6064632496618068, "flos": 22458462192000.0, "grad_norm": 1.2825580672012586, "language_loss": 0.72609115, "learning_rate": 1.4160656741037246e-06, "loss": 0.74753284, "num_input_tokens_seen": 217190070, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6875, "step": 10087, "time_per_iteration": 2.495419979095459 }, { "auxiliary_loss_clip": 0.01106966, "auxiliary_loss_mlp": 0.01031218, "balance_loss_clip": 1.01969695, "balance_loss_mlp": 1.03836656, "epoch": 0.6065233729144747, "flos": 25118652464640.0, "grad_norm": 1.6531064094247252, "language_loss": 0.83925653, "learning_rate": 1.4156931929867355e-06, "loss": 0.86063838, "num_input_tokens_seen": 217209370, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 10088, "time_per_iteration": 2.51468563079834 }, { "auxiliary_loss_clip": 0.01106948, "auxiliary_loss_mlp": 0.01029006, "balance_loss_clip": 1.01586914, "balance_loss_mlp": 1.03838992, "epoch": 0.6065834961671427, "flos": 23477355383040.0, "grad_norm": 2.1824812942561325, "language_loss": 0.71133196, "learning_rate": 1.4153207340267201e-06, "loss": 0.73269153, "num_input_tokens_seen": 217226990, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 10089, "time_per_iteration": 2.4783523082733154 }, { "auxiliary_loss_clip": 0.0111033, "auxiliary_loss_mlp": 0.01039128, "balance_loss_clip": 1.02637315, "balance_loss_mlp": 1.03975415, "epoch": 0.6066436194198106, "flos": 17019396132480.0, "grad_norm": 2.097647720142251, "language_loss": 0.8215, "learning_rate": 1.4149482972378009e-06, "loss": 0.84299457, "num_input_tokens_seen": 217244585, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 10090, "time_per_iteration": 3.9424209594726562 }, { "auxiliary_loss_clip": 0.01116716, "auxiliary_loss_mlp": 0.01040873, "balance_loss_clip": 1.02655602, "balance_loss_mlp": 1.04024363, "epoch": 0.6067037426724786, "flos": 18514752255360.0, "grad_norm": 2.481202141740781, "language_loss": 0.75621355, "learning_rate": 1.4145758826341e-06, "loss": 0.77778947, "num_input_tokens_seen": 217263435, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.765625, "step": 10091, "time_per_iteration": 3.921168327331543 }, { "auxiliary_loss_clip": 0.0110713, "auxiliary_loss_mlp": 0.01034234, "balance_loss_clip": 1.02096558, "balance_loss_mlp": 1.0381515, "epoch": 0.6067638659251465, "flos": 22345989730560.0, "grad_norm": 1.5116702831641824, "language_loss": 0.79245448, "learning_rate": 1.4142034902297415e-06, "loss": 0.81386805, "num_input_tokens_seen": 217283725, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69140625, "step": 10092, "time_per_iteration": 3.911318302154541 }, { "auxiliary_loss_clip": 0.01112106, "auxiliary_loss_mlp": 0.01034791, "balance_loss_clip": 1.02105808, "balance_loss_mlp": 1.03975344, "epoch": 0.6068239891778145, "flos": 12451019748480.0, "grad_norm": 2.196150065554169, "language_loss": 0.76061857, "learning_rate": 1.4138311200388444e-06, "loss": 0.78208756, "num_input_tokens_seen": 217301120, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 10093, "time_per_iteration": 2.4375240802764893 }, { "auxiliary_loss_clip": 0.01108675, "auxiliary_loss_mlp": 0.01032708, "balance_loss_clip": 1.01987529, "balance_loss_mlp": 1.04002786, "epoch": 0.6068841124304825, "flos": 23185868515200.0, "grad_norm": 1.9562063585547969, "language_loss": 0.8760258, "learning_rate": 1.4134587720755304e-06, "loss": 0.89743966, "num_input_tokens_seen": 217319585, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 10094, "time_per_iteration": 2.4703218936920166 }, { "auxiliary_loss_clip": 0.01108682, "auxiliary_loss_mlp": 0.01032519, "balance_loss_clip": 1.01882792, "balance_loss_mlp": 1.03834963, "epoch": 0.6069442356831505, "flos": 18587902302720.0, "grad_norm": 1.7549195895598093, "language_loss": 0.72144306, "learning_rate": 1.413086446353919e-06, "loss": 0.74285507, "num_input_tokens_seen": 217338880, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 10095, "time_per_iteration": 2.449951171875 }, { "auxiliary_loss_clip": 0.01107472, "auxiliary_loss_mlp": 0.01026889, "balance_loss_clip": 1.01428258, "balance_loss_mlp": 1.03786063, "epoch": 0.6070043589358184, "flos": 20960340721920.0, "grad_norm": 1.660509325486274, "language_loss": 0.76558721, "learning_rate": 1.4127141428881273e-06, "loss": 0.7869308, "num_input_tokens_seen": 217357480, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 10096, "time_per_iteration": 2.454023838043213 }, { "auxiliary_loss_clip": 0.01111653, "auxiliary_loss_mlp": 0.01041455, "balance_loss_clip": 1.02813387, "balance_loss_mlp": 1.03998995, "epoch": 0.6070644821884864, "flos": 11692443398400.0, "grad_norm": 2.984073926515532, "language_loss": 0.79713035, "learning_rate": 1.4123418616922749e-06, "loss": 0.81866145, "num_input_tokens_seen": 217374575, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 10097, "time_per_iteration": 2.4645721912384033 }, { "auxiliary_loss_clip": 0.01106076, "auxiliary_loss_mlp": 0.01030019, "balance_loss_clip": 1.0174247, "balance_loss_mlp": 1.03755605, "epoch": 0.6071246054411543, "flos": 19310568030720.0, "grad_norm": 1.3954255629690744, "language_loss": 0.67164493, "learning_rate": 1.411969602780478e-06, "loss": 0.69300592, "num_input_tokens_seen": 217392950, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 10098, "time_per_iteration": 2.4636571407318115 }, { "auxiliary_loss_clip": 0.0110792, "auxiliary_loss_mlp": 0.01030064, "balance_loss_clip": 1.01792848, "balance_loss_mlp": 1.03852105, "epoch": 0.6071847286938223, "flos": 17749029098880.0, "grad_norm": 2.0406825602750693, "language_loss": 0.80646384, "learning_rate": 1.4115973661668523e-06, "loss": 0.82784367, "num_input_tokens_seen": 217412145, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 10099, "time_per_iteration": 2.4635732173919678 }, { "auxiliary_loss_clip": 0.01112877, "auxiliary_loss_mlp": 0.01036307, "balance_loss_clip": 1.02218688, "balance_loss_mlp": 1.03845549, "epoch": 0.6072448519464904, "flos": 22637512512000.0, "grad_norm": 2.012553692037193, "language_loss": 0.71318877, "learning_rate": 1.4112251518655133e-06, "loss": 0.73468065, "num_input_tokens_seen": 217432080, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.74609375, "step": 10100, "time_per_iteration": 2.466815710067749 }, { "auxiliary_loss_clip": 0.01114551, "auxiliary_loss_mlp": 0.01036041, "balance_loss_clip": 1.0219686, "balance_loss_mlp": 1.04316807, "epoch": 0.6073049751991583, "flos": 19537308633600.0, "grad_norm": 2.070155254070884, "language_loss": 0.71223807, "learning_rate": 1.4108529598905764e-06, "loss": 0.73374403, "num_input_tokens_seen": 217450945, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.71484375, "step": 10101, "time_per_iteration": 2.4707014560699463 }, { "auxiliary_loss_clip": 0.01106057, "auxiliary_loss_mlp": 0.0103337, "balance_loss_clip": 1.02069235, "balance_loss_mlp": 1.03739572, "epoch": 0.6073650984518263, "flos": 28294233033600.0, "grad_norm": 1.8630587484706695, "language_loss": 0.69709921, "learning_rate": 1.410480790256154e-06, "loss": 0.71849346, "num_input_tokens_seen": 217473105, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 10102, "time_per_iteration": 2.5815696716308594 }, { "auxiliary_loss_clip": 0.01109968, "auxiliary_loss_mlp": 0.01033772, "balance_loss_clip": 1.02103484, "balance_loss_mlp": 1.03946114, "epoch": 0.6074252217044942, "flos": 25664422688640.0, "grad_norm": 2.175904141350396, "language_loss": 0.73704952, "learning_rate": 1.4101086429763589e-06, "loss": 0.75848693, "num_input_tokens_seen": 217491780, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 10103, "time_per_iteration": 2.5170559883117676 }, { "auxiliary_loss_clip": 0.01114639, "auxiliary_loss_mlp": 0.01035502, "balance_loss_clip": 1.02151847, "balance_loss_mlp": 1.04148793, "epoch": 0.6074853449571622, "flos": 22857106308480.0, "grad_norm": 1.7919900241880873, "language_loss": 0.76540285, "learning_rate": 1.4097365180653032e-06, "loss": 0.78690428, "num_input_tokens_seen": 217510605, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.734375, "step": 10104, "time_per_iteration": 2.4652647972106934 }, { "auxiliary_loss_clip": 0.01041534, "auxiliary_loss_mlp": 0.01003668, "balance_loss_clip": 1.0022788, "balance_loss_mlp": 1.01706886, "epoch": 0.6075454682098301, "flos": 67111406547840.0, "grad_norm": 0.7158492679118086, "language_loss": 0.56021559, "learning_rate": 1.4093644155370977e-06, "loss": 0.58066761, "num_input_tokens_seen": 217574815, "router_z_loss_clip": 0.01391602, "router_z_loss_mlp": 0.24414062, "step": 10105, "time_per_iteration": 3.09963059425354 }, { "auxiliary_loss_clip": 0.01042255, "auxiliary_loss_mlp": 0.01003832, "balance_loss_clip": 1.0025686, "balance_loss_mlp": 1.0175817, "epoch": 0.6076055914624982, "flos": 70712024751360.0, "grad_norm": 0.7705388349273179, "language_loss": 0.5682627, "learning_rate": 1.4089923354058533e-06, "loss": 0.58872354, "num_input_tokens_seen": 217632375, "router_z_loss_clip": 0.01263428, "router_z_loss_mlp": 0.24707031, "step": 10106, "time_per_iteration": 3.0412838459014893 }, { "auxiliary_loss_clip": 0.01107521, "auxiliary_loss_mlp": 0.01032447, "balance_loss_clip": 1.0197928, "balance_loss_mlp": 1.03870416, "epoch": 0.6076657147151661, "flos": 28364545906560.0, "grad_norm": 1.4550526391402587, "language_loss": 0.69078928, "learning_rate": 1.4086202776856784e-06, "loss": 0.71218902, "num_input_tokens_seen": 217653055, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 10107, "time_per_iteration": 2.5159411430358887 }, { "auxiliary_loss_clip": 0.01112925, "auxiliary_loss_mlp": 0.01032694, "balance_loss_clip": 1.01968193, "balance_loss_mlp": 1.04042339, "epoch": 0.6077258379678341, "flos": 15049767807360.0, "grad_norm": 1.8497497812838672, "language_loss": 0.81043756, "learning_rate": 1.4082482423906815e-06, "loss": 0.83189368, "num_input_tokens_seen": 217671520, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 10108, "time_per_iteration": 2.454620838165283 }, { "auxiliary_loss_clip": 0.01114457, "auxiliary_loss_mlp": 0.01033213, "balance_loss_clip": 1.0190866, "balance_loss_mlp": 1.04105318, "epoch": 0.607785961220502, "flos": 36167251553280.0, "grad_norm": 1.9678228377873115, "language_loss": 0.71650183, "learning_rate": 1.4078762295349714e-06, "loss": 0.73797858, "num_input_tokens_seen": 217691880, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.734375, "step": 10109, "time_per_iteration": 2.575307607650757 }, { "auxiliary_loss_clip": 0.01106441, "auxiliary_loss_mlp": 0.01029982, "balance_loss_clip": 1.0183053, "balance_loss_mlp": 1.03912067, "epoch": 0.60784608447317, "flos": 22524249951360.0, "grad_norm": 1.5896059115086063, "language_loss": 0.80128539, "learning_rate": 1.407504239132653e-06, "loss": 0.82264966, "num_input_tokens_seen": 217710530, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 10110, "time_per_iteration": 2.4778292179107666 }, { "auxiliary_loss_clip": 0.01110092, "auxiliary_loss_mlp": 0.01030099, "balance_loss_clip": 1.01722443, "balance_loss_mlp": 1.03936839, "epoch": 0.6079062077258379, "flos": 23841166285440.0, "grad_norm": 3.049920362599111, "language_loss": 0.71023244, "learning_rate": 1.4071322711978338e-06, "loss": 0.73163438, "num_input_tokens_seen": 217728650, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 10111, "time_per_iteration": 2.472273111343384 }, { "auxiliary_loss_clip": 0.01111892, "auxiliary_loss_mlp": 0.01029689, "balance_loss_clip": 1.01654017, "balance_loss_mlp": 1.04030895, "epoch": 0.6079663309785059, "flos": 23367037737600.0, "grad_norm": 1.6288671457980712, "language_loss": 0.65593266, "learning_rate": 1.4067603257446186e-06, "loss": 0.67734849, "num_input_tokens_seen": 217747135, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 10112, "time_per_iteration": 2.492504596710205 }, { "auxiliary_loss_clip": 0.01041282, "auxiliary_loss_mlp": 0.01001632, "balance_loss_clip": 1.00029683, "balance_loss_mlp": 1.01662147, "epoch": 0.6080264542311739, "flos": 71382873110400.0, "grad_norm": 0.6357118536173654, "language_loss": 0.49619204, "learning_rate": 1.4063884027871105e-06, "loss": 0.51662117, "num_input_tokens_seen": 217811860, "router_z_loss_clip": 0.0133667, "router_z_loss_mlp": 0.24609375, "step": 10113, "time_per_iteration": 3.1178805828094482 }, { "auxiliary_loss_clip": 0.01041039, "auxiliary_loss_mlp": 0.01001576, "balance_loss_clip": 1.00011599, "balance_loss_mlp": 1.01638055, "epoch": 0.6080865774838419, "flos": 66529833442560.0, "grad_norm": 0.8452049148647864, "language_loss": 0.57020473, "learning_rate": 1.4060165023394147e-06, "loss": 0.59063089, "num_input_tokens_seen": 217866510, "router_z_loss_clip": 0.0145874, "router_z_loss_mlp": 0.24609375, "step": 10114, "time_per_iteration": 3.0476622581481934 }, { "auxiliary_loss_clip": 0.01110771, "auxiliary_loss_mlp": 0.01027798, "balance_loss_clip": 1.01380253, "balance_loss_mlp": 1.03890049, "epoch": 0.6081467007365099, "flos": 19207935895680.0, "grad_norm": 1.964124900403479, "language_loss": 0.70231807, "learning_rate": 1.4056446244156317e-06, "loss": 0.72370374, "num_input_tokens_seen": 217885650, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71875, "step": 10115, "time_per_iteration": 2.4521360397338867 }, { "auxiliary_loss_clip": 0.01109283, "auxiliary_loss_mlp": 0.01029062, "balance_loss_clip": 1.01609254, "balance_loss_mlp": 1.03903663, "epoch": 0.6082068239891778, "flos": 24167737762560.0, "grad_norm": 1.6194327917751632, "language_loss": 0.72569549, "learning_rate": 1.4052727690298642e-06, "loss": 0.74707901, "num_input_tokens_seen": 217905300, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 10116, "time_per_iteration": 2.495603561401367 }, { "auxiliary_loss_clip": 0.01111649, "auxiliary_loss_mlp": 0.01034756, "balance_loss_clip": 1.0206356, "balance_loss_mlp": 1.03928101, "epoch": 0.6082669472418458, "flos": 37413316310400.0, "grad_norm": 1.7606139505584837, "language_loss": 0.5364005, "learning_rate": 1.4049009361962138e-06, "loss": 0.55786455, "num_input_tokens_seen": 217927845, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7265625, "step": 10117, "time_per_iteration": 2.593859910964966 }, { "auxiliary_loss_clip": 0.01109831, "auxiliary_loss_mlp": 0.01028678, "balance_loss_clip": 1.01613152, "balance_loss_mlp": 1.03934991, "epoch": 0.6083270704945137, "flos": 15085534775040.0, "grad_norm": 1.7504721009933664, "language_loss": 0.70105439, "learning_rate": 1.4045291259287786e-06, "loss": 0.72243953, "num_input_tokens_seen": 217946145, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 10118, "time_per_iteration": 2.4586071968078613 }, { "auxiliary_loss_clip": 0.01110049, "auxiliary_loss_mlp": 0.01028736, "balance_loss_clip": 1.01602292, "balance_loss_mlp": 1.04002023, "epoch": 0.6083871937471818, "flos": 20668458804480.0, "grad_norm": 2.283530794581281, "language_loss": 0.74808425, "learning_rate": 1.4041573382416588e-06, "loss": 0.76947212, "num_input_tokens_seen": 217965190, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 10119, "time_per_iteration": 2.476328134536743 }, { "auxiliary_loss_clip": 0.0110689, "auxiliary_loss_mlp": 0.01034637, "balance_loss_clip": 1.02195954, "balance_loss_mlp": 1.03798473, "epoch": 0.6084473169998497, "flos": 21506901045120.0, "grad_norm": 1.7542121562882413, "language_loss": 0.67583197, "learning_rate": 1.4037855731489525e-06, "loss": 0.69724727, "num_input_tokens_seen": 217983625, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 10120, "time_per_iteration": 2.4904532432556152 }, { "auxiliary_loss_clip": 0.01112016, "auxiliary_loss_mlp": 0.01031409, "balance_loss_clip": 1.01758063, "balance_loss_mlp": 1.03942585, "epoch": 0.6085074402525177, "flos": 26870051710080.0, "grad_norm": 1.8202773551527374, "language_loss": 0.74549806, "learning_rate": 1.4034138306647571e-06, "loss": 0.76693237, "num_input_tokens_seen": 218006005, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 10121, "time_per_iteration": 2.5369367599487305 }, { "auxiliary_loss_clip": 0.01108296, "auxiliary_loss_mlp": 0.01029614, "balance_loss_clip": 1.01778245, "balance_loss_mlp": 1.03880548, "epoch": 0.6085675635051856, "flos": 10889839952640.0, "grad_norm": 1.8884156194069897, "language_loss": 0.80781889, "learning_rate": 1.4030421108031685e-06, "loss": 0.829198, "num_input_tokens_seen": 218024195, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6953125, "step": 10122, "time_per_iteration": 2.4532783031463623 }, { "auxiliary_loss_clip": 0.01110858, "auxiliary_loss_mlp": 0.01031574, "balance_loss_clip": 1.01849031, "balance_loss_mlp": 1.04034925, "epoch": 0.6086276867578536, "flos": 34862186707200.0, "grad_norm": 1.5465834423927567, "language_loss": 0.56054837, "learning_rate": 1.402670413578284e-06, "loss": 0.58197272, "num_input_tokens_seen": 218047190, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 10123, "time_per_iteration": 2.5820226669311523 }, { "auxiliary_loss_clip": 0.01110599, "auxiliary_loss_mlp": 0.01035745, "balance_loss_clip": 1.02242351, "balance_loss_mlp": 1.04108882, "epoch": 0.6086878100105215, "flos": 20047706939520.0, "grad_norm": 2.3754951239350026, "language_loss": 0.73935473, "learning_rate": 1.4022987390041965e-06, "loss": 0.76081812, "num_input_tokens_seen": 218065945, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6953125, "step": 10124, "time_per_iteration": 2.480167865753174 }, { "auxiliary_loss_clip": 0.01109329, "auxiliary_loss_mlp": 0.01033833, "balance_loss_clip": 1.02079105, "balance_loss_mlp": 1.03898191, "epoch": 0.6087479332631895, "flos": 18332469711360.0, "grad_norm": 2.3487340116649618, "language_loss": 0.65698302, "learning_rate": 1.4019270870950006e-06, "loss": 0.67841464, "num_input_tokens_seen": 218085285, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 10125, "time_per_iteration": 2.471627950668335 }, { "auxiliary_loss_clip": 0.0110721, "auxiliary_loss_mlp": 0.01031086, "balance_loss_clip": 1.01847339, "balance_loss_mlp": 1.03825879, "epoch": 0.6088080565158575, "flos": 24493411399680.0, "grad_norm": 2.2798922904677057, "language_loss": 0.76776767, "learning_rate": 1.40155545786479e-06, "loss": 0.7891506, "num_input_tokens_seen": 218104735, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 10126, "time_per_iteration": 2.5224595069885254 }, { "auxiliary_loss_clip": 0.01112748, "auxiliary_loss_mlp": 0.01030699, "balance_loss_clip": 1.01700771, "balance_loss_mlp": 1.03881848, "epoch": 0.6088681797685255, "flos": 10269016260480.0, "grad_norm": 2.667153881289333, "language_loss": 0.71627259, "learning_rate": 1.4011838513276558e-06, "loss": 0.73770702, "num_input_tokens_seen": 218121855, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73828125, "step": 10127, "time_per_iteration": 2.4345340728759766 }, { "auxiliary_loss_clip": 0.01114874, "auxiliary_loss_mlp": 0.01031564, "balance_loss_clip": 1.01743746, "balance_loss_mlp": 1.04133999, "epoch": 0.6089283030211935, "flos": 21973703218560.0, "grad_norm": 2.5822542786013702, "language_loss": 0.73361242, "learning_rate": 1.400812267497691e-06, "loss": 0.75507683, "num_input_tokens_seen": 218137325, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.734375, "step": 10128, "time_per_iteration": 3.9302732944488525 }, { "auxiliary_loss_clip": 0.0110925, "auxiliary_loss_mlp": 0.01032514, "balance_loss_clip": 1.01952636, "balance_loss_mlp": 1.03930283, "epoch": 0.6089884262738614, "flos": 17785191116160.0, "grad_norm": 2.126831849710127, "language_loss": 0.73328239, "learning_rate": 1.4004407063889842e-06, "loss": 0.75470006, "num_input_tokens_seen": 218155530, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 10129, "time_per_iteration": 2.4380104541778564 }, { "auxiliary_loss_clip": 0.01110176, "auxiliary_loss_mlp": 0.01030568, "balance_loss_clip": 1.01715744, "balance_loss_mlp": 1.03999925, "epoch": 0.6090485495265294, "flos": 36910423946880.0, "grad_norm": 1.4981251463547844, "language_loss": 0.65428895, "learning_rate": 1.400069168015626e-06, "loss": 0.67569637, "num_input_tokens_seen": 218182535, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.69921875, "step": 10130, "time_per_iteration": 2.6562259197235107 }, { "auxiliary_loss_clip": 0.01106147, "auxiliary_loss_mlp": 0.01028195, "balance_loss_clip": 1.01663208, "balance_loss_mlp": 1.03767252, "epoch": 0.6091086727791973, "flos": 19899036547200.0, "grad_norm": 1.665856128923062, "language_loss": 0.77307332, "learning_rate": 1.3996976523917054e-06, "loss": 0.79441673, "num_input_tokens_seen": 218201740, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.68359375, "step": 10131, "time_per_iteration": 3.9150924682617188 }, { "auxiliary_loss_clip": 0.01109269, "auxiliary_loss_mlp": 0.01031931, "balance_loss_clip": 1.02009988, "balance_loss_mlp": 1.03995657, "epoch": 0.6091687960318654, "flos": 22163635359360.0, "grad_norm": 2.0254291303652123, "language_loss": 0.77440965, "learning_rate": 1.3993261595313093e-06, "loss": 0.79582167, "num_input_tokens_seen": 218219800, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69140625, "step": 10132, "time_per_iteration": 3.8652546405792236 }, { "auxiliary_loss_clip": 0.011065, "auxiliary_loss_mlp": 0.01035134, "balance_loss_clip": 1.02292085, "balance_loss_mlp": 1.03944802, "epoch": 0.6092289192845333, "flos": 21465280160640.0, "grad_norm": 1.7380709241515546, "language_loss": 0.75862849, "learning_rate": 1.3989546894485261e-06, "loss": 0.78004479, "num_input_tokens_seen": 218237585, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 10133, "time_per_iteration": 2.4922783374786377 }, { "auxiliary_loss_clip": 0.01107104, "auxiliary_loss_mlp": 0.010289, "balance_loss_clip": 1.0157932, "balance_loss_mlp": 1.03740931, "epoch": 0.6092890425372013, "flos": 28694924225280.0, "grad_norm": 2.0686978066192574, "language_loss": 0.64046395, "learning_rate": 1.3985832421574414e-06, "loss": 0.66182399, "num_input_tokens_seen": 218258700, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 10134, "time_per_iteration": 3.962519884109497 }, { "auxiliary_loss_clip": 0.01106976, "auxiliary_loss_mlp": 0.01028061, "balance_loss_clip": 1.01544297, "balance_loss_mlp": 1.03830302, "epoch": 0.6093491657898692, "flos": 20813178700800.0, "grad_norm": 1.8280987700651024, "language_loss": 0.78662217, "learning_rate": 1.3982118176721397e-06, "loss": 0.80797255, "num_input_tokens_seen": 218275655, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 10135, "time_per_iteration": 2.454740524291992 }, { "auxiliary_loss_clip": 0.01110506, "auxiliary_loss_mlp": 0.01028674, "balance_loss_clip": 1.01633549, "balance_loss_mlp": 1.0395112, "epoch": 0.6094092890425372, "flos": 25446983708160.0, "grad_norm": 2.4130119497029963, "language_loss": 0.72095144, "learning_rate": 1.3978404160067069e-06, "loss": 0.74234331, "num_input_tokens_seen": 218295720, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7109375, "step": 10136, "time_per_iteration": 2.5352706909179688 }, { "auxiliary_loss_clip": 0.01111551, "auxiliary_loss_mlp": 0.01030438, "balance_loss_clip": 1.017313, "balance_loss_mlp": 1.04036117, "epoch": 0.6094694122952051, "flos": 35621265847680.0, "grad_norm": 1.8127665742054446, "language_loss": 0.74319172, "learning_rate": 1.3974690371752253e-06, "loss": 0.76461154, "num_input_tokens_seen": 218316745, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 10137, "time_per_iteration": 2.5984833240509033 }, { "auxiliary_loss_clip": 0.01111267, "auxiliary_loss_mlp": 0.01035578, "balance_loss_clip": 1.02230406, "balance_loss_mlp": 1.03921723, "epoch": 0.6095295355478731, "flos": 24456962073600.0, "grad_norm": 2.7266175843029923, "language_loss": 0.80417293, "learning_rate": 1.3970976811917785e-06, "loss": 0.82564139, "num_input_tokens_seen": 218335385, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 10138, "time_per_iteration": 2.515761137008667 }, { "auxiliary_loss_clip": 0.01105044, "auxiliary_loss_mlp": 0.01033103, "balance_loss_clip": 1.02096188, "balance_loss_mlp": 1.03772414, "epoch": 0.6095896588005411, "flos": 15633208419840.0, "grad_norm": 1.7757092753430195, "language_loss": 0.81396371, "learning_rate": 1.3967263480704481e-06, "loss": 0.83534515, "num_input_tokens_seen": 218353320, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.671875, "step": 10139, "time_per_iteration": 2.4410223960876465 }, { "auxiliary_loss_clip": 0.0111323, "auxiliary_loss_mlp": 0.01035972, "balance_loss_clip": 1.0223043, "balance_loss_mlp": 1.04102635, "epoch": 0.6096497820532091, "flos": 15550577182080.0, "grad_norm": 2.5630482941300214, "language_loss": 0.8350507, "learning_rate": 1.396355037825315e-06, "loss": 0.85654271, "num_input_tokens_seen": 218365620, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 10140, "time_per_iteration": 2.40236496925354 }, { "auxiliary_loss_clip": 0.01110686, "auxiliary_loss_mlp": 0.01031782, "balance_loss_clip": 1.01859164, "balance_loss_mlp": 1.03868449, "epoch": 0.6097099053058771, "flos": 24204474397440.0, "grad_norm": 1.9189406449092918, "language_loss": 0.75886977, "learning_rate": 1.3959837504704592e-06, "loss": 0.78029442, "num_input_tokens_seen": 218383785, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 10141, "time_per_iteration": 2.48659086227417 }, { "auxiliary_loss_clip": 0.01109358, "auxiliary_loss_mlp": 0.01031314, "balance_loss_clip": 1.01867139, "balance_loss_mlp": 1.0385344, "epoch": 0.609770028558545, "flos": 19570238426880.0, "grad_norm": 2.1669333116787377, "language_loss": 0.76347464, "learning_rate": 1.3956124860199603e-06, "loss": 0.78488135, "num_input_tokens_seen": 218399055, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 10142, "time_per_iteration": 2.4309964179992676 }, { "auxiliary_loss_clip": 0.01109523, "auxiliary_loss_mlp": 0.01027442, "balance_loss_clip": 1.01408505, "balance_loss_mlp": 1.03939843, "epoch": 0.609830151811213, "flos": 23949185460480.0, "grad_norm": 1.7025884316550817, "language_loss": 0.7672103, "learning_rate": 1.3952412444878964e-06, "loss": 0.78857994, "num_input_tokens_seen": 218419120, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.703125, "step": 10143, "time_per_iteration": 2.488408088684082 }, { "auxiliary_loss_clip": 0.01109043, "auxiliary_loss_mlp": 0.01033871, "balance_loss_clip": 1.02037621, "balance_loss_mlp": 1.03840876, "epoch": 0.6098902750638809, "flos": 16179732829440.0, "grad_norm": 1.9081500698767269, "language_loss": 0.75106609, "learning_rate": 1.3948700258883448e-06, "loss": 0.77249521, "num_input_tokens_seen": 218435290, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.70703125, "step": 10144, "time_per_iteration": 2.4428560733795166 }, { "auxiliary_loss_clip": 0.01111139, "auxiliary_loss_mlp": 0.01033057, "balance_loss_clip": 1.01995623, "balance_loss_mlp": 1.03943741, "epoch": 0.609950398316549, "flos": 44526393763200.0, "grad_norm": 1.838490099886389, "language_loss": 0.72783744, "learning_rate": 1.394498830235383e-06, "loss": 0.74927938, "num_input_tokens_seen": 218457880, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 10145, "time_per_iteration": 2.6811647415161133 }, { "auxiliary_loss_clip": 0.011088, "auxiliary_loss_mlp": 0.01034684, "balance_loss_clip": 1.02210748, "balance_loss_mlp": 1.03919435, "epoch": 0.6100105215692169, "flos": 23221743223680.0, "grad_norm": 1.686123547239094, "language_loss": 0.69262213, "learning_rate": 1.3941276575430862e-06, "loss": 0.71405697, "num_input_tokens_seen": 218475930, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 10146, "time_per_iteration": 2.538388252258301 }, { "auxiliary_loss_clip": 0.01109471, "auxiliary_loss_mlp": 0.01031512, "balance_loss_clip": 1.0196929, "balance_loss_mlp": 1.04113555, "epoch": 0.6100706448218849, "flos": 15012564295680.0, "grad_norm": 1.5918451043735005, "language_loss": 0.76772499, "learning_rate": 1.3937565078255289e-06, "loss": 0.78913486, "num_input_tokens_seen": 218493675, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 10147, "time_per_iteration": 2.463700294494629 }, { "auxiliary_loss_clip": 0.01106921, "auxiliary_loss_mlp": 0.0103027, "balance_loss_clip": 1.0179739, "balance_loss_mlp": 1.03709972, "epoch": 0.6101307680745528, "flos": 19639976682240.0, "grad_norm": 1.7241230202612783, "language_loss": 0.78177953, "learning_rate": 1.393385381096786e-06, "loss": 0.80315149, "num_input_tokens_seen": 218511780, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 10148, "time_per_iteration": 2.46653413772583 }, { "auxiliary_loss_clip": 0.0111472, "auxiliary_loss_mlp": 0.01037474, "balance_loss_clip": 1.02252531, "balance_loss_mlp": 1.03969646, "epoch": 0.6101908913272208, "flos": 29935566028800.0, "grad_norm": 2.7238112663872864, "language_loss": 0.5374828, "learning_rate": 1.39301427737093e-06, "loss": 0.55900472, "num_input_tokens_seen": 218531850, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.75390625, "step": 10149, "time_per_iteration": 2.543297290802002 }, { "auxiliary_loss_clip": 0.01107077, "auxiliary_loss_mlp": 0.01034724, "balance_loss_clip": 1.02189755, "balance_loss_mlp": 1.04008794, "epoch": 0.6102510145798887, "flos": 21798639308160.0, "grad_norm": 1.895856525202593, "language_loss": 0.80110168, "learning_rate": 1.3926431966620333e-06, "loss": 0.82251972, "num_input_tokens_seen": 218551245, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.671875, "step": 10150, "time_per_iteration": 2.4768388271331787 }, { "auxiliary_loss_clip": 0.01113389, "auxiliary_loss_mlp": 0.01037718, "balance_loss_clip": 1.02387214, "balance_loss_mlp": 1.0415293, "epoch": 0.6103111378325567, "flos": 20706129192960.0, "grad_norm": 1.4440661316053982, "language_loss": 0.68946385, "learning_rate": 1.3922721389841684e-06, "loss": 0.71097493, "num_input_tokens_seen": 218571365, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 10151, "time_per_iteration": 2.4846391677856445 }, { "auxiliary_loss_clip": 0.01107701, "auxiliary_loss_mlp": 0.01031249, "balance_loss_clip": 1.01923299, "balance_loss_mlp": 1.03811622, "epoch": 0.6103712610852247, "flos": 29381643417600.0, "grad_norm": 1.6236376268592212, "language_loss": 0.70803809, "learning_rate": 1.3919011043514036e-06, "loss": 0.72942758, "num_input_tokens_seen": 218588315, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 10152, "time_per_iteration": 2.521799325942993 }, { "auxiliary_loss_clip": 0.01112506, "auxiliary_loss_mlp": 0.01032751, "balance_loss_clip": 1.01925611, "balance_loss_mlp": 1.04076624, "epoch": 0.6104313843378927, "flos": 20813035046400.0, "grad_norm": 1.8512679801569552, "language_loss": 0.78452981, "learning_rate": 1.391530092777811e-06, "loss": 0.80598235, "num_input_tokens_seen": 218605940, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 10153, "time_per_iteration": 2.466481924057007 }, { "auxiliary_loss_clip": 0.01111278, "auxiliary_loss_mlp": 0.01032794, "balance_loss_clip": 1.0194366, "balance_loss_mlp": 1.0398531, "epoch": 0.6104915075905607, "flos": 26578457101440.0, "grad_norm": 1.6393835768049378, "language_loss": 0.79079241, "learning_rate": 1.3911591042774573e-06, "loss": 0.81223309, "num_input_tokens_seen": 218626100, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 10154, "time_per_iteration": 2.517569065093994 }, { "auxiliary_loss_clip": 0.01109464, "auxiliary_loss_mlp": 0.01036013, "balance_loss_clip": 1.02348423, "balance_loss_mlp": 1.03993535, "epoch": 0.6105516308432286, "flos": 23915788790400.0, "grad_norm": 1.6826309102730153, "language_loss": 0.70479882, "learning_rate": 1.3907881388644116e-06, "loss": 0.72625363, "num_input_tokens_seen": 218645060, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 10155, "time_per_iteration": 2.5095114707946777 }, { "auxiliary_loss_clip": 0.01110413, "auxiliary_loss_mlp": 0.01032639, "balance_loss_clip": 1.01875734, "balance_loss_mlp": 1.04053855, "epoch": 0.6106117540958966, "flos": 31577365900800.0, "grad_norm": 1.5168037144303903, "language_loss": 0.71793336, "learning_rate": 1.3904171965527413e-06, "loss": 0.73936391, "num_input_tokens_seen": 218667690, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.69921875, "step": 10156, "time_per_iteration": 2.5469303131103516 }, { "auxiliary_loss_clip": 0.01107925, "auxiliary_loss_mlp": 0.01030422, "balance_loss_clip": 1.017524, "balance_loss_mlp": 1.04015589, "epoch": 0.6106718773485645, "flos": 19608160210560.0, "grad_norm": 1.574895373783371, "language_loss": 0.67392182, "learning_rate": 1.3900462773565114e-06, "loss": 0.69530529, "num_input_tokens_seen": 218687505, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6796875, "step": 10157, "time_per_iteration": 2.4849464893341064 }, { "auxiliary_loss_clip": 0.011084, "auxiliary_loss_mlp": 0.01028353, "balance_loss_clip": 1.0161159, "balance_loss_mlp": 1.03714442, "epoch": 0.6107320006012326, "flos": 17123895774720.0, "grad_norm": 1.8339333674959843, "language_loss": 0.7217164, "learning_rate": 1.3896753812897877e-06, "loss": 0.74308395, "num_input_tokens_seen": 218705315, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 10158, "time_per_iteration": 2.4442453384399414 }, { "auxiliary_loss_clip": 0.01110911, "auxiliary_loss_mlp": 0.01035066, "balance_loss_clip": 1.02256072, "balance_loss_mlp": 1.03942835, "epoch": 0.6107921238539005, "flos": 30148228500480.0, "grad_norm": 1.5865348531952432, "language_loss": 0.69174546, "learning_rate": 1.389304508366635e-06, "loss": 0.71320522, "num_input_tokens_seen": 218725735, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 10159, "time_per_iteration": 2.5501980781555176 }, { "auxiliary_loss_clip": 0.01111739, "auxiliary_loss_mlp": 0.01030493, "balance_loss_clip": 1.01714778, "balance_loss_mlp": 1.03980911, "epoch": 0.6108522471065685, "flos": 18440273404800.0, "grad_norm": 1.9842000579103176, "language_loss": 0.78711933, "learning_rate": 1.3889336586011167e-06, "loss": 0.80854166, "num_input_tokens_seen": 218743215, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 10160, "time_per_iteration": 2.4412128925323486 }, { "auxiliary_loss_clip": 0.01038685, "auxiliary_loss_mlp": 0.01005073, "balance_loss_clip": 1.00372016, "balance_loss_mlp": 1.01367116, "epoch": 0.6109123703592364, "flos": 64135454791680.0, "grad_norm": 0.8208095442079423, "language_loss": 0.61408305, "learning_rate": 1.388562832007295e-06, "loss": 0.63452059, "num_input_tokens_seen": 218806440, "router_z_loss_clip": 0.0135498, "router_z_loss_mlp": 0.25, "step": 10161, "time_per_iteration": 3.242446184158325 }, { "auxiliary_loss_clip": 0.01113412, "auxiliary_loss_mlp": 0.01034038, "balance_loss_clip": 1.02035284, "balance_loss_mlp": 1.04107654, "epoch": 0.6109724936119044, "flos": 20667848273280.0, "grad_norm": 1.7338460177233672, "language_loss": 0.76196897, "learning_rate": 1.3881920285992324e-06, "loss": 0.78344351, "num_input_tokens_seen": 218825720, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 10162, "time_per_iteration": 2.4687769412994385 }, { "auxiliary_loss_clip": 0.01109568, "auxiliary_loss_mlp": 0.01030673, "balance_loss_clip": 1.01738667, "balance_loss_mlp": 1.03935218, "epoch": 0.6110326168645723, "flos": 31351882273920.0, "grad_norm": 1.9288659338648346, "language_loss": 0.71700794, "learning_rate": 1.3878212483909888e-06, "loss": 0.73841035, "num_input_tokens_seen": 218847735, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 10163, "time_per_iteration": 2.5541741847991943 }, { "auxiliary_loss_clip": 0.01105755, "auxiliary_loss_mlp": 0.01026848, "balance_loss_clip": 1.01492691, "balance_loss_mlp": 1.03702223, "epoch": 0.6110927401172404, "flos": 25003378742400.0, "grad_norm": 1.9806088423959203, "language_loss": 0.59312278, "learning_rate": 1.387450491396625e-06, "loss": 0.61444879, "num_input_tokens_seen": 218866585, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 10164, "time_per_iteration": 2.4815800189971924 }, { "auxiliary_loss_clip": 0.01107428, "auxiliary_loss_mlp": 0.01031402, "balance_loss_clip": 1.01886082, "balance_loss_mlp": 1.03770721, "epoch": 0.6111528633699083, "flos": 26248078782720.0, "grad_norm": 1.7930362772709671, "language_loss": 0.75662565, "learning_rate": 1.3870797576302003e-06, "loss": 0.77801394, "num_input_tokens_seen": 218885560, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 10165, "time_per_iteration": 2.5102059841156006 }, { "auxiliary_loss_clip": 0.0110949, "auxiliary_loss_mlp": 0.0102919, "balance_loss_clip": 1.01633942, "balance_loss_mlp": 1.0420903, "epoch": 0.6112129866225763, "flos": 22382474970240.0, "grad_norm": 1.6733810144873267, "language_loss": 0.7939955, "learning_rate": 1.3867090471057719e-06, "loss": 0.81538224, "num_input_tokens_seen": 218905055, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.67578125, "step": 10166, "time_per_iteration": 2.4596261978149414 }, { "auxiliary_loss_clip": 0.01108377, "auxiliary_loss_mlp": 0.01028197, "balance_loss_clip": 1.01491666, "balance_loss_mlp": 1.03824782, "epoch": 0.6112731098752443, "flos": 25227892702080.0, "grad_norm": 1.771806251831892, "language_loss": 0.67535651, "learning_rate": 1.3863383598373987e-06, "loss": 0.69672227, "num_input_tokens_seen": 218924030, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 10167, "time_per_iteration": 2.5041956901550293 }, { "auxiliary_loss_clip": 0.01107218, "auxiliary_loss_mlp": 0.01032082, "balance_loss_clip": 1.02005386, "balance_loss_mlp": 1.03916907, "epoch": 0.6113332331279122, "flos": 22893160584960.0, "grad_norm": 1.8316999316072657, "language_loss": 0.79259443, "learning_rate": 1.3859676958391364e-06, "loss": 0.81398743, "num_input_tokens_seen": 218943750, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 10168, "time_per_iteration": 2.474557876586914 }, { "auxiliary_loss_clip": 0.0111387, "auxiliary_loss_mlp": 0.01034982, "balance_loss_clip": 1.02058768, "balance_loss_mlp": 1.03864813, "epoch": 0.6113933563805802, "flos": 18620329305600.0, "grad_norm": 3.011216461153966, "language_loss": 0.86174345, "learning_rate": 1.3855970551250398e-06, "loss": 0.883232, "num_input_tokens_seen": 218957585, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75390625, "step": 10169, "time_per_iteration": 2.434279680252075 }, { "auxiliary_loss_clip": 0.01105283, "auxiliary_loss_mlp": 0.01026768, "balance_loss_clip": 1.01509118, "balance_loss_mlp": 1.03699017, "epoch": 0.6114534796332481, "flos": 41866275317760.0, "grad_norm": 1.9322509101746224, "language_loss": 0.78651404, "learning_rate": 1.3852264377091652e-06, "loss": 0.80783451, "num_input_tokens_seen": 218980025, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 10170, "time_per_iteration": 4.1037211418151855 }, { "auxiliary_loss_clip": 0.01113666, "auxiliary_loss_mlp": 0.0103892, "balance_loss_clip": 1.02425766, "balance_loss_mlp": 1.03858817, "epoch": 0.6115136028859162, "flos": 21908454163200.0, "grad_norm": 3.506302861085165, "language_loss": 0.69224149, "learning_rate": 1.3848558436055651e-06, "loss": 0.71376741, "num_input_tokens_seen": 218998200, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75, "step": 10171, "time_per_iteration": 2.470973253250122 }, { "auxiliary_loss_clip": 0.01110627, "auxiliary_loss_mlp": 0.01034588, "balance_loss_clip": 1.01984763, "balance_loss_mlp": 1.03831339, "epoch": 0.6115737261385841, "flos": 28804846821120.0, "grad_norm": 1.7920013264252868, "language_loss": 0.79677325, "learning_rate": 1.3844852728282934e-06, "loss": 0.81822538, "num_input_tokens_seen": 219017910, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.72265625, "step": 10172, "time_per_iteration": 2.5319082736968994 }, { "auxiliary_loss_clip": 0.01113299, "auxiliary_loss_mlp": 0.01033628, "balance_loss_clip": 1.01997828, "balance_loss_mlp": 1.0394454, "epoch": 0.6116338493912521, "flos": 21251468453760.0, "grad_norm": 1.760208023958075, "language_loss": 0.66910601, "learning_rate": 1.3841147253914022e-06, "loss": 0.69057524, "num_input_tokens_seen": 219037730, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73828125, "step": 10173, "time_per_iteration": 3.9349374771118164 }, { "auxiliary_loss_clip": 0.011117, "auxiliary_loss_mlp": 0.01033244, "balance_loss_clip": 1.01943994, "balance_loss_mlp": 1.04023731, "epoch": 0.61169397264392, "flos": 17530189488000.0, "grad_norm": 2.2924724763532267, "language_loss": 0.55923903, "learning_rate": 1.3837442013089416e-06, "loss": 0.58068848, "num_input_tokens_seen": 219056755, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7109375, "step": 10174, "time_per_iteration": 3.836095094680786 }, { "auxiliary_loss_clip": 0.0111421, "auxiliary_loss_mlp": 0.01032455, "balance_loss_clip": 1.0187819, "balance_loss_mlp": 1.04221225, "epoch": 0.611754095896588, "flos": 23951555758080.0, "grad_norm": 2.2159142636956504, "language_loss": 0.65982831, "learning_rate": 1.3833737005949628e-06, "loss": 0.68129498, "num_input_tokens_seen": 219076985, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 10175, "time_per_iteration": 3.981830596923828 }, { "auxiliary_loss_clip": 0.01105498, "auxiliary_loss_mlp": 0.01023413, "balance_loss_clip": 1.01149845, "balance_loss_mlp": 1.03576207, "epoch": 0.6118142191492559, "flos": 25994872834560.0, "grad_norm": 1.9916158835010382, "language_loss": 0.82812774, "learning_rate": 1.3830032232635154e-06, "loss": 0.84941685, "num_input_tokens_seen": 219096050, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 10176, "time_per_iteration": 2.4993982315063477 }, { "auxiliary_loss_clip": 0.01110424, "auxiliary_loss_mlp": 0.01037058, "balance_loss_clip": 1.02263927, "balance_loss_mlp": 1.03986931, "epoch": 0.611874342401924, "flos": 24603190341120.0, "grad_norm": 4.638605353479907, "language_loss": 0.77402186, "learning_rate": 1.3826327693286474e-06, "loss": 0.7954967, "num_input_tokens_seen": 219112665, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.703125, "step": 10177, "time_per_iteration": 2.493602991104126 }, { "auxiliary_loss_clip": 0.01107915, "auxiliary_loss_mlp": 0.01032072, "balance_loss_clip": 1.0196743, "balance_loss_mlp": 1.03770018, "epoch": 0.6119344656545919, "flos": 15887132640000.0, "grad_norm": 1.8854758064039328, "language_loss": 0.75750554, "learning_rate": 1.3822623388044065e-06, "loss": 0.77890539, "num_input_tokens_seen": 219129120, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 10178, "time_per_iteration": 2.4230854511260986 }, { "auxiliary_loss_clip": 0.01110698, "auxiliary_loss_mlp": 0.01033941, "balance_loss_clip": 1.0201304, "balance_loss_mlp": 1.03999829, "epoch": 0.6119945889072599, "flos": 21652877917440.0, "grad_norm": 1.5460454857243817, "language_loss": 0.6717912, "learning_rate": 1.3818919317048402e-06, "loss": 0.6932376, "num_input_tokens_seen": 219148950, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.70703125, "step": 10179, "time_per_iteration": 2.470964193344116 }, { "auxiliary_loss_clip": 0.01111087, "auxiliary_loss_mlp": 0.01032325, "balance_loss_clip": 1.01967692, "balance_loss_mlp": 1.03994298, "epoch": 0.6120547121599279, "flos": 13772533023360.0, "grad_norm": 1.8184561916811592, "language_loss": 0.83619368, "learning_rate": 1.3815215480439933e-06, "loss": 0.85762787, "num_input_tokens_seen": 219165585, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 10180, "time_per_iteration": 2.434770107269287 }, { "auxiliary_loss_clip": 0.01109104, "auxiliary_loss_mlp": 0.01030567, "balance_loss_clip": 1.0160954, "balance_loss_mlp": 1.03982151, "epoch": 0.6121148354125958, "flos": 20079164275200.0, "grad_norm": 1.6408880729993458, "language_loss": 0.77899539, "learning_rate": 1.3811511878359113e-06, "loss": 0.80039209, "num_input_tokens_seen": 219183280, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.6953125, "step": 10181, "time_per_iteration": 2.4692201614379883 }, { "auxiliary_loss_clip": 0.01109778, "auxiliary_loss_mlp": 0.01031968, "balance_loss_clip": 1.01892662, "balance_loss_mlp": 1.03869581, "epoch": 0.6121749586652638, "flos": 13471313569920.0, "grad_norm": 1.989565677411676, "language_loss": 0.80681264, "learning_rate": 1.3807808510946384e-06, "loss": 0.82823014, "num_input_tokens_seen": 219197200, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 10182, "time_per_iteration": 2.4164745807647705 }, { "auxiliary_loss_clip": 0.01103291, "auxiliary_loss_mlp": 0.01027103, "balance_loss_clip": 1.01614761, "balance_loss_mlp": 1.03676319, "epoch": 0.6122350819179317, "flos": 20120533764480.0, "grad_norm": 1.5340565166191813, "language_loss": 0.8306728, "learning_rate": 1.3804105378342177e-06, "loss": 0.85197669, "num_input_tokens_seen": 219216825, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6640625, "step": 10183, "time_per_iteration": 2.485781669616699 }, { "auxiliary_loss_clip": 0.01037653, "auxiliary_loss_mlp": 0.01005456, "balance_loss_clip": 1.00399554, "balance_loss_mlp": 1.01305699, "epoch": 0.6122952051705998, "flos": 65429242767360.0, "grad_norm": 0.7163477767417878, "language_loss": 0.62830293, "learning_rate": 1.3800402480686914e-06, "loss": 0.64873397, "num_input_tokens_seen": 219283795, "router_z_loss_clip": 0.0145874, "router_z_loss_mlp": 0.24609375, "step": 10184, "time_per_iteration": 3.184569835662842 }, { "auxiliary_loss_clip": 0.01110884, "auxiliary_loss_mlp": 0.01028299, "balance_loss_clip": 1.0162648, "balance_loss_mlp": 1.04050934, "epoch": 0.6123553284232677, "flos": 20376253664640.0, "grad_norm": 2.0426033238619703, "language_loss": 0.82464451, "learning_rate": 1.379669981812101e-06, "loss": 0.84603631, "num_input_tokens_seen": 219302385, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.703125, "step": 10185, "time_per_iteration": 2.461763858795166 }, { "auxiliary_loss_clip": 0.01114194, "auxiliary_loss_mlp": 0.0103328, "balance_loss_clip": 1.02041745, "balance_loss_mlp": 1.04028928, "epoch": 0.6124154516759357, "flos": 23987645948160.0, "grad_norm": 1.8298748799845344, "language_loss": 0.74744219, "learning_rate": 1.3792997390784868e-06, "loss": 0.7689169, "num_input_tokens_seen": 219319765, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 10186, "time_per_iteration": 2.469943046569824 }, { "auxiliary_loss_clip": 0.01106935, "auxiliary_loss_mlp": 0.01030307, "balance_loss_clip": 1.01839185, "balance_loss_mlp": 1.03851628, "epoch": 0.6124755749286036, "flos": 21468799693440.0, "grad_norm": 1.8744262804388914, "language_loss": 0.78446782, "learning_rate": 1.3789295198818895e-06, "loss": 0.80584025, "num_input_tokens_seen": 219337440, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 10187, "time_per_iteration": 2.4671499729156494 }, { "auxiliary_loss_clip": 0.01105638, "auxiliary_loss_mlp": 0.01030428, "balance_loss_clip": 1.01749349, "balance_loss_mlp": 1.03583944, "epoch": 0.6125356981812716, "flos": 23879195809920.0, "grad_norm": 1.5969668177821024, "language_loss": 0.83173704, "learning_rate": 1.3785593242363462e-06, "loss": 0.85309768, "num_input_tokens_seen": 219357525, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 10188, "time_per_iteration": 2.496619701385498 }, { "auxiliary_loss_clip": 0.01107772, "auxiliary_loss_mlp": 0.01033236, "balance_loss_clip": 1.02016473, "balance_loss_mlp": 1.03735161, "epoch": 0.6125958214339395, "flos": 14425604150400.0, "grad_norm": 1.7339225996423004, "language_loss": 0.75436664, "learning_rate": 1.378189152155896e-06, "loss": 0.77577674, "num_input_tokens_seen": 219374855, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 10189, "time_per_iteration": 2.4580957889556885 }, { "auxiliary_loss_clip": 0.01109962, "auxiliary_loss_mlp": 0.01033081, "balance_loss_clip": 1.01999223, "balance_loss_mlp": 1.03925037, "epoch": 0.6126559446866076, "flos": 23259090389760.0, "grad_norm": 1.4980672955297663, "language_loss": 0.74297768, "learning_rate": 1.3778190036545758e-06, "loss": 0.76440811, "num_input_tokens_seen": 219394740, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 10190, "time_per_iteration": 2.4755806922912598 }, { "auxiliary_loss_clip": 0.01110666, "auxiliary_loss_mlp": 0.01032767, "balance_loss_clip": 1.01855779, "balance_loss_mlp": 1.03942978, "epoch": 0.6127160679392755, "flos": 26864808324480.0, "grad_norm": 1.5634670582031431, "language_loss": 0.68385816, "learning_rate": 1.3774488787464207e-06, "loss": 0.70529246, "num_input_tokens_seen": 219413755, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7109375, "step": 10191, "time_per_iteration": 2.4957354068756104 }, { "auxiliary_loss_clip": 0.01109133, "auxiliary_loss_mlp": 0.01033251, "balance_loss_clip": 1.01936901, "balance_loss_mlp": 1.03739905, "epoch": 0.6127761911919435, "flos": 26396425952640.0, "grad_norm": 2.3044399001912694, "language_loss": 0.74041653, "learning_rate": 1.377078777445467e-06, "loss": 0.76184034, "num_input_tokens_seen": 219433560, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 10192, "time_per_iteration": 2.527937173843384 }, { "auxiliary_loss_clip": 0.01106702, "auxiliary_loss_mlp": 0.01029608, "balance_loss_clip": 1.01704288, "balance_loss_mlp": 1.03806925, "epoch": 0.6128363144446115, "flos": 22634747164800.0, "grad_norm": 1.9412659627947801, "language_loss": 0.84037524, "learning_rate": 1.3767086997657478e-06, "loss": 0.86173832, "num_input_tokens_seen": 219452640, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 10193, "time_per_iteration": 2.483515739440918 }, { "auxiliary_loss_clip": 0.01108298, "auxiliary_loss_mlp": 0.01031771, "balance_loss_clip": 1.01852059, "balance_loss_mlp": 1.03735137, "epoch": 0.6128964376972794, "flos": 26759051706240.0, "grad_norm": 2.0829787430870508, "language_loss": 0.70260608, "learning_rate": 1.3763386457212979e-06, "loss": 0.72400677, "num_input_tokens_seen": 219468585, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 10194, "time_per_iteration": 2.5159008502960205 }, { "auxiliary_loss_clip": 0.0103593, "auxiliary_loss_mlp": 0.01004519, "balance_loss_clip": 1.00308263, "balance_loss_mlp": 1.01167917, "epoch": 0.6129565609499474, "flos": 65567929178880.0, "grad_norm": 0.819428499241974, "language_loss": 0.58658284, "learning_rate": 1.375968615326149e-06, "loss": 0.6069873, "num_input_tokens_seen": 219523015, "router_z_loss_clip": 0.01434326, "router_z_loss_mlp": 0.2421875, "step": 10195, "time_per_iteration": 2.9183690547943115 }, { "auxiliary_loss_clip": 0.01109128, "auxiliary_loss_mlp": 0.01038771, "balance_loss_clip": 1.02535963, "balance_loss_mlp": 1.0388509, "epoch": 0.6130166842026153, "flos": 16362087200640.0, "grad_norm": 2.0132136445639777, "language_loss": 0.69996154, "learning_rate": 1.3755986085943324e-06, "loss": 0.72144043, "num_input_tokens_seen": 219539980, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.703125, "step": 10196, "time_per_iteration": 2.452340602874756 }, { "auxiliary_loss_clip": 0.01108224, "auxiliary_loss_mlp": 0.01039508, "balance_loss_clip": 1.02682447, "balance_loss_mlp": 1.03864741, "epoch": 0.6130768074552834, "flos": 23652455207040.0, "grad_norm": 1.8493310120366608, "language_loss": 0.71487421, "learning_rate": 1.3752286255398788e-06, "loss": 0.73635161, "num_input_tokens_seen": 219556980, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 10197, "time_per_iteration": 2.4939522743225098 }, { "auxiliary_loss_clip": 0.01110856, "auxiliary_loss_mlp": 0.01041852, "balance_loss_clip": 1.028566, "balance_loss_mlp": 1.0386436, "epoch": 0.6131369307079513, "flos": 20047455544320.0, "grad_norm": 1.9366040014563235, "language_loss": 0.78800058, "learning_rate": 1.3748586661768191e-06, "loss": 0.80952764, "num_input_tokens_seen": 219576410, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 10198, "time_per_iteration": 2.4492194652557373 }, { "auxiliary_loss_clip": 0.01112893, "auxiliary_loss_mlp": 0.01036407, "balance_loss_clip": 1.02306116, "balance_loss_mlp": 1.03970051, "epoch": 0.6131970539606193, "flos": 22672166158080.0, "grad_norm": 1.6961163171903317, "language_loss": 0.74358881, "learning_rate": 1.374488730519181e-06, "loss": 0.76508176, "num_input_tokens_seen": 219597180, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 10199, "time_per_iteration": 2.4920334815979004 }, { "auxiliary_loss_clip": 0.01110617, "auxiliary_loss_mlp": 0.0103895, "balance_loss_clip": 1.02493119, "balance_loss_mlp": 1.03800559, "epoch": 0.6132571772132872, "flos": 26870913636480.0, "grad_norm": 1.9579125959155743, "language_loss": 0.60872424, "learning_rate": 1.374118818580993e-06, "loss": 0.63021982, "num_input_tokens_seen": 219617630, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7265625, "step": 10200, "time_per_iteration": 2.522376775741577 }, { "auxiliary_loss_clip": 0.01108654, "auxiliary_loss_mlp": 0.01038791, "balance_loss_clip": 1.02586913, "balance_loss_mlp": 1.03801978, "epoch": 0.6133173004659552, "flos": 22892657794560.0, "grad_norm": 2.0630824346588317, "language_loss": 0.68349731, "learning_rate": 1.3737489303762822e-06, "loss": 0.70497179, "num_input_tokens_seen": 219637025, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 10201, "time_per_iteration": 2.4838945865631104 }, { "auxiliary_loss_clip": 0.01104731, "auxiliary_loss_mlp": 0.01029094, "balance_loss_clip": 1.01546884, "balance_loss_mlp": 1.03516769, "epoch": 0.6133774237186231, "flos": 20485098852480.0, "grad_norm": 1.8631121602483969, "language_loss": 0.83308768, "learning_rate": 1.3733790659190746e-06, "loss": 0.85442591, "num_input_tokens_seen": 219656625, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.6953125, "step": 10202, "time_per_iteration": 2.4603257179260254 }, { "auxiliary_loss_clip": 0.01035187, "auxiliary_loss_mlp": 0.01002011, "balance_loss_clip": 1.00052071, "balance_loss_mlp": 1.01063132, "epoch": 0.6134375469712912, "flos": 69413065217280.0, "grad_norm": 0.9217884861799445, "language_loss": 0.67016375, "learning_rate": 1.3730092252233953e-06, "loss": 0.69053578, "num_input_tokens_seen": 219718090, "router_z_loss_clip": 0.01489258, "router_z_loss_mlp": 0.24609375, "step": 10203, "time_per_iteration": 3.106462001800537 }, { "auxiliary_loss_clip": 0.01109595, "auxiliary_loss_mlp": 0.01030196, "balance_loss_clip": 1.01652241, "balance_loss_mlp": 1.03817129, "epoch": 0.6134976702239591, "flos": 41281541815680.0, "grad_norm": 1.5848811799702904, "language_loss": 0.61115086, "learning_rate": 1.37263940830327e-06, "loss": 0.63254881, "num_input_tokens_seen": 219740100, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71484375, "step": 10204, "time_per_iteration": 2.6522445678710938 }, { "auxiliary_loss_clip": 0.01106449, "auxiliary_loss_mlp": 0.01029367, "balance_loss_clip": 1.01624227, "balance_loss_mlp": 1.03757632, "epoch": 0.6135577934766271, "flos": 22346600261760.0, "grad_norm": 1.8057907004154192, "language_loss": 0.72721827, "learning_rate": 1.3722696151727204e-06, "loss": 0.7485764, "num_input_tokens_seen": 219761225, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69140625, "step": 10205, "time_per_iteration": 2.5035159587860107 }, { "auxiliary_loss_clip": 0.01104361, "auxiliary_loss_mlp": 0.01024104, "balance_loss_clip": 1.01043034, "balance_loss_mlp": 1.03628445, "epoch": 0.6136179167292951, "flos": 23728155120000.0, "grad_norm": 1.7103257420130482, "language_loss": 0.75794178, "learning_rate": 1.3718998458457701e-06, "loss": 0.77922642, "num_input_tokens_seen": 219780085, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.6796875, "step": 10206, "time_per_iteration": 2.4901552200317383 }, { "auxiliary_loss_clip": 0.01108791, "auxiliary_loss_mlp": 0.01032557, "balance_loss_clip": 1.01855612, "balance_loss_mlp": 1.03725111, "epoch": 0.613678039981963, "flos": 26024678144640.0, "grad_norm": 2.060580881411255, "language_loss": 0.75643027, "learning_rate": 1.3715301003364407e-06, "loss": 0.77784377, "num_input_tokens_seen": 219797895, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71875, "step": 10207, "time_per_iteration": 2.496750593185425 }, { "auxiliary_loss_clip": 0.01107787, "auxiliary_loss_mlp": 0.01035637, "balance_loss_clip": 1.02286983, "balance_loss_mlp": 1.037498, "epoch": 0.613738163234631, "flos": 9859957200000.0, "grad_norm": 2.06238947309236, "language_loss": 0.82416278, "learning_rate": 1.3711603786587525e-06, "loss": 0.84559703, "num_input_tokens_seen": 219811295, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 10208, "time_per_iteration": 2.429934024810791 }, { "auxiliary_loss_clip": 0.01114564, "auxiliary_loss_mlp": 0.01034828, "balance_loss_clip": 1.02006936, "balance_loss_mlp": 1.0410552, "epoch": 0.613798286487299, "flos": 33182070001920.0, "grad_norm": 2.1888392935978604, "language_loss": 0.7264533, "learning_rate": 1.3707906808267265e-06, "loss": 0.74794716, "num_input_tokens_seen": 219832735, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.734375, "step": 10209, "time_per_iteration": 2.5635414123535156 }, { "auxiliary_loss_clip": 0.01108118, "auxiliary_loss_mlp": 0.01036348, "balance_loss_clip": 1.02315176, "balance_loss_mlp": 1.03894675, "epoch": 0.613858409739967, "flos": 25627901535360.0, "grad_norm": 2.3387604670827606, "language_loss": 0.74409544, "learning_rate": 1.37042100685438e-06, "loss": 0.76554006, "num_input_tokens_seen": 219852755, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69140625, "step": 10210, "time_per_iteration": 2.503241777420044 }, { "auxiliary_loss_clip": 0.01034829, "auxiliary_loss_mlp": 0.01003482, "balance_loss_clip": 1.00188494, "balance_loss_mlp": 1.01034927, "epoch": 0.6139185329926349, "flos": 67192313932800.0, "grad_norm": 0.8777240919063587, "language_loss": 0.64980459, "learning_rate": 1.3700513567557325e-06, "loss": 0.67018771, "num_input_tokens_seen": 219922785, "router_z_loss_clip": 0.01599121, "router_z_loss_mlp": 0.24511719, "step": 10211, "time_per_iteration": 3.2388522624969482 }, { "auxiliary_loss_clip": 0.01107922, "auxiliary_loss_mlp": 0.01035767, "balance_loss_clip": 1.02211797, "balance_loss_mlp": 1.03776288, "epoch": 0.6139786562453029, "flos": 21543637680000.0, "grad_norm": 1.786751901877575, "language_loss": 0.75593269, "learning_rate": 1.369681730544801e-06, "loss": 0.77736956, "num_input_tokens_seen": 219942215, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 10212, "time_per_iteration": 3.8231658935546875 }, { "auxiliary_loss_clip": 0.01109695, "auxiliary_loss_mlp": 0.01031742, "balance_loss_clip": 1.01888514, "balance_loss_mlp": 1.03941345, "epoch": 0.6140387794979708, "flos": 26068489758720.0, "grad_norm": 1.6385055182267794, "language_loss": 0.73901451, "learning_rate": 1.3693121282356009e-06, "loss": 0.76042885, "num_input_tokens_seen": 219963830, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 10213, "time_per_iteration": 2.505800724029541 }, { "auxiliary_loss_clip": 0.01114203, "auxiliary_loss_mlp": 0.01036356, "balance_loss_clip": 1.02187765, "balance_loss_mlp": 1.04006445, "epoch": 0.6140989027506388, "flos": 23694614795520.0, "grad_norm": 1.532631875821164, "language_loss": 0.73077559, "learning_rate": 1.3689425498421483e-06, "loss": 0.75228119, "num_input_tokens_seen": 219983815, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7421875, "step": 10214, "time_per_iteration": 2.531651020050049 }, { "auxiliary_loss_clip": 0.01111469, "auxiliary_loss_mlp": 0.01029181, "balance_loss_clip": 1.01557958, "balance_loss_mlp": 1.03888547, "epoch": 0.6141590260033067, "flos": 22231721589120.0, "grad_norm": 1.8017431033844284, "language_loss": 0.74473095, "learning_rate": 1.3685729953784572e-06, "loss": 0.76613742, "num_input_tokens_seen": 220003165, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 10215, "time_per_iteration": 3.9395408630371094 }, { "auxiliary_loss_clip": 0.01108844, "auxiliary_loss_mlp": 0.01030992, "balance_loss_clip": 1.01763487, "balance_loss_mlp": 1.03839397, "epoch": 0.6142191492559748, "flos": 23871653953920.0, "grad_norm": 2.254286297738362, "language_loss": 0.78521955, "learning_rate": 1.368203464858542e-06, "loss": 0.80661792, "num_input_tokens_seen": 220021015, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.703125, "step": 10216, "time_per_iteration": 5.350324392318726 }, { "auxiliary_loss_clip": 0.01109949, "auxiliary_loss_mlp": 0.01033294, "balance_loss_clip": 1.01889372, "balance_loss_mlp": 1.03980398, "epoch": 0.6142792725086427, "flos": 15042513260160.0, "grad_norm": 2.3529189477592687, "language_loss": 0.80046409, "learning_rate": 1.3678339582964147e-06, "loss": 0.82189655, "num_input_tokens_seen": 220035780, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.69921875, "step": 10217, "time_per_iteration": 2.4122838973999023 }, { "auxiliary_loss_clip": 0.01110016, "auxiliary_loss_mlp": 0.01031272, "balance_loss_clip": 1.01742625, "balance_loss_mlp": 1.03725708, "epoch": 0.6143393957613107, "flos": 23330947547520.0, "grad_norm": 2.281889006220928, "language_loss": 0.782915, "learning_rate": 1.3674644757060865e-06, "loss": 0.80432785, "num_input_tokens_seen": 220054280, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 10218, "time_per_iteration": 2.4808545112609863 }, { "auxiliary_loss_clip": 0.01110142, "auxiliary_loss_mlp": 0.0103205, "balance_loss_clip": 1.01856148, "balance_loss_mlp": 1.0396843, "epoch": 0.6143995190139786, "flos": 20117086058880.0, "grad_norm": 1.7317530947397526, "language_loss": 0.82123637, "learning_rate": 1.367095017101569e-06, "loss": 0.84265828, "num_input_tokens_seen": 220074120, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 10219, "time_per_iteration": 2.4706034660339355 }, { "auxiliary_loss_clip": 0.01110088, "auxiliary_loss_mlp": 0.01034344, "balance_loss_clip": 1.0205214, "balance_loss_mlp": 1.03754163, "epoch": 0.6144596422666466, "flos": 42303559489920.0, "grad_norm": 1.781155377850653, "language_loss": 0.67004752, "learning_rate": 1.3667255824968717e-06, "loss": 0.69149172, "num_input_tokens_seen": 220096320, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 10220, "time_per_iteration": 2.650542974472046 }, { "auxiliary_loss_clip": 0.01106212, "auxiliary_loss_mlp": 0.01026447, "balance_loss_clip": 1.01313758, "balance_loss_mlp": 1.03702009, "epoch": 0.6145197655193146, "flos": 21573622558080.0, "grad_norm": 1.9968841089123646, "language_loss": 0.71674979, "learning_rate": 1.3663561719060041e-06, "loss": 0.73807639, "num_input_tokens_seen": 220114850, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.69140625, "step": 10221, "time_per_iteration": 2.4610886573791504 }, { "auxiliary_loss_clip": 0.01106038, "auxiliary_loss_mlp": 0.01030074, "balance_loss_clip": 1.01713991, "balance_loss_mlp": 1.03623748, "epoch": 0.6145798887719826, "flos": 21471098163840.0, "grad_norm": 2.368017074508472, "language_loss": 0.79859591, "learning_rate": 1.3659867853429735e-06, "loss": 0.81995702, "num_input_tokens_seen": 220133395, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 10222, "time_per_iteration": 2.4716835021972656 }, { "auxiliary_loss_clip": 0.011123, "auxiliary_loss_mlp": 0.01031235, "balance_loss_clip": 1.0181396, "balance_loss_mlp": 1.04044533, "epoch": 0.6146400120246506, "flos": 20777016683520.0, "grad_norm": 1.989912580505323, "language_loss": 0.76164508, "learning_rate": 1.365617422821788e-06, "loss": 0.78308046, "num_input_tokens_seen": 220152790, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 10223, "time_per_iteration": 2.463156223297119 }, { "auxiliary_loss_clip": 0.01108928, "auxiliary_loss_mlp": 0.01035287, "balance_loss_clip": 1.02169728, "balance_loss_mlp": 1.0408442, "epoch": 0.6147001352773185, "flos": 13881306384000.0, "grad_norm": 2.168348433655558, "language_loss": 0.77909207, "learning_rate": 1.3652480843564535e-06, "loss": 0.80053419, "num_input_tokens_seen": 220169535, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.6796875, "step": 10224, "time_per_iteration": 2.4413139820098877 }, { "auxiliary_loss_clip": 0.01104371, "auxiliary_loss_mlp": 0.01030637, "balance_loss_clip": 1.01885343, "balance_loss_mlp": 1.03649187, "epoch": 0.6147602585299865, "flos": 56641791807360.0, "grad_norm": 1.1920036320579839, "language_loss": 0.66399741, "learning_rate": 1.3648787699609746e-06, "loss": 0.68534756, "num_input_tokens_seen": 220195305, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 10225, "time_per_iteration": 2.7928919792175293 }, { "auxiliary_loss_clip": 0.01110077, "auxiliary_loss_mlp": 0.01032095, "balance_loss_clip": 1.01864243, "balance_loss_mlp": 1.03800702, "epoch": 0.6148203817826544, "flos": 32817217605120.0, "grad_norm": 2.4675381159688823, "language_loss": 0.63511038, "learning_rate": 1.364509479649357e-06, "loss": 0.65653211, "num_input_tokens_seen": 220215040, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 10226, "time_per_iteration": 2.5743114948272705 }, { "auxiliary_loss_clip": 0.01109284, "auxiliary_loss_mlp": 0.01035253, "balance_loss_clip": 1.02113318, "balance_loss_mlp": 1.03844047, "epoch": 0.6148805050353224, "flos": 18332038748160.0, "grad_norm": 1.7121935232733418, "language_loss": 0.75546968, "learning_rate": 1.3641402134356037e-06, "loss": 0.77691507, "num_input_tokens_seen": 220234205, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7109375, "step": 10227, "time_per_iteration": 2.4677929878234863 }, { "auxiliary_loss_clip": 0.01111714, "auxiliary_loss_mlp": 0.01037469, "balance_loss_clip": 1.02169776, "balance_loss_mlp": 1.03869581, "epoch": 0.6149406282879903, "flos": 14063983977600.0, "grad_norm": 2.533906497206873, "language_loss": 0.62369347, "learning_rate": 1.3637709713337164e-06, "loss": 0.64518523, "num_input_tokens_seen": 220252730, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.73046875, "step": 10228, "time_per_iteration": 2.4731247425079346 }, { "auxiliary_loss_clip": 0.01107485, "auxiliary_loss_mlp": 0.01031156, "balance_loss_clip": 1.01845407, "balance_loss_mlp": 1.03841043, "epoch": 0.6150007515406584, "flos": 25190186400000.0, "grad_norm": 1.4758616319269142, "language_loss": 0.74539638, "learning_rate": 1.3634017533576985e-06, "loss": 0.76678276, "num_input_tokens_seen": 220273345, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 10229, "time_per_iteration": 2.5178537368774414 }, { "auxiliary_loss_clip": 0.01111114, "auxiliary_loss_mlp": 0.01037226, "balance_loss_clip": 1.02372003, "balance_loss_mlp": 1.04086769, "epoch": 0.6150608747933263, "flos": 21945262625280.0, "grad_norm": 1.91237858514134, "language_loss": 0.7794401, "learning_rate": 1.3630325595215493e-06, "loss": 0.80092353, "num_input_tokens_seen": 220293845, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 10230, "time_per_iteration": 2.4939777851104736 }, { "auxiliary_loss_clip": 0.01107863, "auxiliary_loss_mlp": 0.01031, "balance_loss_clip": 1.01819062, "balance_loss_mlp": 1.03726959, "epoch": 0.6151209980459943, "flos": 30117453523200.0, "grad_norm": 1.6933869419848022, "language_loss": 0.73012722, "learning_rate": 1.36266338983927e-06, "loss": 0.75151587, "num_input_tokens_seen": 220316070, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 10231, "time_per_iteration": 2.565169334411621 }, { "auxiliary_loss_clip": 0.01110656, "auxiliary_loss_mlp": 0.01035871, "balance_loss_clip": 1.02323508, "balance_loss_mlp": 1.03955781, "epoch": 0.6151811212986622, "flos": 30008356940160.0, "grad_norm": 1.7975166209043392, "language_loss": 0.70086068, "learning_rate": 1.362294244324858e-06, "loss": 0.72232592, "num_input_tokens_seen": 220335695, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 10232, "time_per_iteration": 2.5665791034698486 }, { "auxiliary_loss_clip": 0.01105674, "auxiliary_loss_mlp": 0.01031019, "balance_loss_clip": 1.01810288, "balance_loss_mlp": 1.03819585, "epoch": 0.6152412445513302, "flos": 18872888808960.0, "grad_norm": 2.224249845691212, "language_loss": 0.9176898, "learning_rate": 1.3619251229923126e-06, "loss": 0.93905675, "num_input_tokens_seen": 220353720, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.671875, "step": 10233, "time_per_iteration": 2.460482358932495 }, { "auxiliary_loss_clip": 0.01106305, "auxiliary_loss_mlp": 0.01031464, "balance_loss_clip": 1.02022243, "balance_loss_mlp": 1.03797388, "epoch": 0.6153013678039982, "flos": 25703601448320.0, "grad_norm": 2.2364793898085207, "language_loss": 0.71731985, "learning_rate": 1.3615560258556306e-06, "loss": 0.73869753, "num_input_tokens_seen": 220372515, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.68359375, "step": 10234, "time_per_iteration": 2.505098342895508 }, { "auxiliary_loss_clip": 0.01109641, "auxiliary_loss_mlp": 0.01032846, "balance_loss_clip": 1.0193994, "balance_loss_mlp": 1.03776312, "epoch": 0.6153614910566662, "flos": 28510271383680.0, "grad_norm": 1.9615870632243018, "language_loss": 0.67122006, "learning_rate": 1.3611869529288077e-06, "loss": 0.69264495, "num_input_tokens_seen": 220393490, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 10235, "time_per_iteration": 2.531832695007324 }, { "auxiliary_loss_clip": 0.01111306, "auxiliary_loss_mlp": 0.01031136, "balance_loss_clip": 1.01819003, "balance_loss_mlp": 1.03848219, "epoch": 0.6154216143093342, "flos": 23549787158400.0, "grad_norm": 2.6132413492178177, "language_loss": 0.8108533, "learning_rate": 1.3608179042258398e-06, "loss": 0.83227772, "num_input_tokens_seen": 220412855, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 10236, "time_per_iteration": 2.4886066913604736 }, { "auxiliary_loss_clip": 0.01112743, "auxiliary_loss_mlp": 0.0103042, "balance_loss_clip": 1.01730132, "balance_loss_mlp": 1.03982234, "epoch": 0.6154817375620021, "flos": 22748081552640.0, "grad_norm": 1.6676534946956936, "language_loss": 0.80718791, "learning_rate": 1.360448879760721e-06, "loss": 0.82861954, "num_input_tokens_seen": 220433440, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 10237, "time_per_iteration": 2.482149362564087 }, { "auxiliary_loss_clip": 0.01108705, "auxiliary_loss_mlp": 0.01037493, "balance_loss_clip": 1.02443957, "balance_loss_mlp": 1.0388, "epoch": 0.6155418608146701, "flos": 27162975121920.0, "grad_norm": 1.7352351167683864, "language_loss": 0.75913996, "learning_rate": 1.3600798795474449e-06, "loss": 0.78060198, "num_input_tokens_seen": 220453445, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 10238, "time_per_iteration": 2.51809024810791 }, { "auxiliary_loss_clip": 0.01038433, "auxiliary_loss_mlp": 0.01002813, "balance_loss_clip": 1.00156736, "balance_loss_mlp": 1.01392174, "epoch": 0.615601984067338, "flos": 68811165014400.0, "grad_norm": 0.7687518978631376, "language_loss": 0.57711136, "learning_rate": 1.3597109036000036e-06, "loss": 0.59752381, "num_input_tokens_seen": 220509730, "router_z_loss_clip": 0.01245117, "router_z_loss_mlp": 0.24511719, "step": 10239, "time_per_iteration": 3.1000664234161377 }, { "auxiliary_loss_clip": 0.01109924, "auxiliary_loss_mlp": 0.01034512, "balance_loss_clip": 1.02127993, "balance_loss_mlp": 1.03833115, "epoch": 0.615662107320006, "flos": 15517144598400.0, "grad_norm": 1.9479159776277448, "language_loss": 0.77052581, "learning_rate": 1.3593419519323892e-06, "loss": 0.79197013, "num_input_tokens_seen": 220527295, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 10240, "time_per_iteration": 2.442436933517456 }, { "auxiliary_loss_clip": 0.0111209, "auxiliary_loss_mlp": 0.01034578, "balance_loss_clip": 1.02076197, "balance_loss_mlp": 1.04112113, "epoch": 0.615722230572674, "flos": 21063691128960.0, "grad_norm": 3.3961241716979726, "language_loss": 0.72970343, "learning_rate": 1.3589730245585922e-06, "loss": 0.75117004, "num_input_tokens_seen": 220542730, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7109375, "step": 10241, "time_per_iteration": 2.4536383152008057 }, { "auxiliary_loss_clip": 0.01109662, "auxiliary_loss_mlp": 0.01028471, "balance_loss_clip": 1.01614511, "balance_loss_mlp": 1.04083407, "epoch": 0.615782353825342, "flos": 23256791919360.0, "grad_norm": 1.7587835407355044, "language_loss": 0.7211231, "learning_rate": 1.3586041214926018e-06, "loss": 0.74250448, "num_input_tokens_seen": 220562995, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 10242, "time_per_iteration": 2.4827182292938232 }, { "auxiliary_loss_clip": 0.01110396, "auxiliary_loss_mlp": 0.01030161, "balance_loss_clip": 1.0177573, "balance_loss_mlp": 1.04080963, "epoch": 0.6158424770780099, "flos": 21103911383040.0, "grad_norm": 2.4010242287375103, "language_loss": 0.72574848, "learning_rate": 1.3582352427484086e-06, "loss": 0.747154, "num_input_tokens_seen": 220581775, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 10243, "time_per_iteration": 2.4741344451904297 }, { "auxiliary_loss_clip": 0.01038404, "auxiliary_loss_mlp": 0.01003689, "balance_loss_clip": 1.0022105, "balance_loss_mlp": 1.01405263, "epoch": 0.6159026003306779, "flos": 70333276769280.0, "grad_norm": 0.7556792123645991, "language_loss": 0.56867945, "learning_rate": 1.3578663883399984e-06, "loss": 0.58910036, "num_input_tokens_seen": 220646395, "router_z_loss_clip": 0.01477051, "router_z_loss_mlp": 0.24414062, "step": 10244, "time_per_iteration": 3.1537094116210938 }, { "auxiliary_loss_clip": 0.01109606, "auxiliary_loss_mlp": 0.01033496, "balance_loss_clip": 1.02045441, "balance_loss_mlp": 1.0399363, "epoch": 0.6159627235833458, "flos": 33874355802240.0, "grad_norm": 1.603541062077456, "language_loss": 0.63938588, "learning_rate": 1.3574975582813593e-06, "loss": 0.66081691, "num_input_tokens_seen": 220668335, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 10245, "time_per_iteration": 2.5931477546691895 }, { "auxiliary_loss_clip": 0.01106924, "auxiliary_loss_mlp": 0.0102838, "balance_loss_clip": 1.01608992, "balance_loss_mlp": 1.03861308, "epoch": 0.6160228468360138, "flos": 26575440359040.0, "grad_norm": 2.102842823086868, "language_loss": 0.79074144, "learning_rate": 1.3571287525864771e-06, "loss": 0.81209445, "num_input_tokens_seen": 220688915, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 10246, "time_per_iteration": 2.5089187622070312 }, { "auxiliary_loss_clip": 0.01115103, "auxiliary_loss_mlp": 0.01047155, "balance_loss_clip": 1.03257537, "balance_loss_mlp": 1.04094601, "epoch": 0.6160829700886818, "flos": 17193274894080.0, "grad_norm": 3.304076652433213, "language_loss": 0.87696254, "learning_rate": 1.3567599712693368e-06, "loss": 0.89858508, "num_input_tokens_seen": 220703465, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7421875, "step": 10247, "time_per_iteration": 2.4515380859375 }, { "auxiliary_loss_clip": 0.01114085, "auxiliary_loss_mlp": 0.01029519, "balance_loss_clip": 1.01694286, "balance_loss_mlp": 1.04269075, "epoch": 0.6161430933413498, "flos": 23623547736960.0, "grad_norm": 1.8280567283539677, "language_loss": 0.79863125, "learning_rate": 1.3563912143439235e-06, "loss": 0.82006729, "num_input_tokens_seen": 220722090, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 10248, "time_per_iteration": 2.496302604675293 }, { "auxiliary_loss_clip": 0.01107439, "auxiliary_loss_mlp": 0.01027539, "balance_loss_clip": 1.01511765, "balance_loss_mlp": 1.03836513, "epoch": 0.6162032165940178, "flos": 23002436736000.0, "grad_norm": 2.1890348976937735, "language_loss": 0.86982399, "learning_rate": 1.3560224818242191e-06, "loss": 0.89117378, "num_input_tokens_seen": 220741075, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69140625, "step": 10249, "time_per_iteration": 2.497451066970825 }, { "auxiliary_loss_clip": 0.01109449, "auxiliary_loss_mlp": 0.01029155, "balance_loss_clip": 1.01530337, "balance_loss_mlp": 1.03935885, "epoch": 0.6162633398466857, "flos": 39421979740800.0, "grad_norm": 4.251263437617753, "language_loss": 0.68551815, "learning_rate": 1.3556537737242072e-06, "loss": 0.70690417, "num_input_tokens_seen": 220763395, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.703125, "step": 10250, "time_per_iteration": 2.6295716762542725 }, { "auxiliary_loss_clip": 0.01103947, "auxiliary_loss_mlp": 0.01026208, "balance_loss_clip": 1.0140965, "balance_loss_mlp": 1.03799653, "epoch": 0.6163234630993537, "flos": 19244672530560.0, "grad_norm": 2.70912797778771, "language_loss": 0.74299729, "learning_rate": 1.3552850900578692e-06, "loss": 0.76429886, "num_input_tokens_seen": 220780640, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.66015625, "step": 10251, "time_per_iteration": 2.4734303951263428 }, { "auxiliary_loss_clip": 0.01108577, "auxiliary_loss_mlp": 0.01026475, "balance_loss_clip": 1.01341605, "balance_loss_mlp": 1.03821301, "epoch": 0.6163835863520216, "flos": 15961791058560.0, "grad_norm": 2.1832235458024214, "language_loss": 0.68528795, "learning_rate": 1.3549164308391844e-06, "loss": 0.70663846, "num_input_tokens_seen": 220797960, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 10252, "time_per_iteration": 2.4881038665771484 }, { "auxiliary_loss_clip": 0.01037087, "auxiliary_loss_mlp": 0.01001756, "balance_loss_clip": 1.00030184, "balance_loss_mlp": 1.01274657, "epoch": 0.6164437096046896, "flos": 68103834393600.0, "grad_norm": 0.8900360244427675, "language_loss": 0.57833391, "learning_rate": 1.3545477960821333e-06, "loss": 0.59872234, "num_input_tokens_seen": 220856930, "router_z_loss_clip": 0.01452637, "router_z_loss_mlp": 0.24316406, "step": 10253, "time_per_iteration": 4.543191194534302 }, { "auxiliary_loss_clip": 0.01108124, "auxiliary_loss_mlp": 0.01031065, "balance_loss_clip": 1.01841724, "balance_loss_mlp": 1.03747427, "epoch": 0.6165038328573575, "flos": 21361211481600.0, "grad_norm": 2.0788264531465916, "language_loss": 0.79455441, "learning_rate": 1.3541791858006946e-06, "loss": 0.81594628, "num_input_tokens_seen": 220877595, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 10254, "time_per_iteration": 2.491300344467163 }, { "auxiliary_loss_clip": 0.0111166, "auxiliary_loss_mlp": 0.01029847, "balance_loss_clip": 1.01719332, "balance_loss_mlp": 1.0389477, "epoch": 0.6165639561100256, "flos": 21101972048640.0, "grad_norm": 1.772656401222073, "language_loss": 0.8038677, "learning_rate": 1.353810600008846e-06, "loss": 0.82528281, "num_input_tokens_seen": 220896880, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 10255, "time_per_iteration": 2.476555824279785 }, { "auxiliary_loss_clip": 0.01111456, "auxiliary_loss_mlp": 0.01030739, "balance_loss_clip": 1.017483, "balance_loss_mlp": 1.03997087, "epoch": 0.6166240793626935, "flos": 25338533569920.0, "grad_norm": 2.0246586827985813, "language_loss": 0.65682119, "learning_rate": 1.3534420387205646e-06, "loss": 0.67824316, "num_input_tokens_seen": 220916425, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 10256, "time_per_iteration": 3.974771022796631 }, { "auxiliary_loss_clip": 0.01108031, "auxiliary_loss_mlp": 0.01028779, "balance_loss_clip": 1.01651847, "balance_loss_mlp": 1.0395751, "epoch": 0.6166842026153615, "flos": 19682639061120.0, "grad_norm": 1.5930685248397694, "language_loss": 0.7222687, "learning_rate": 1.353073501949825e-06, "loss": 0.74363679, "num_input_tokens_seen": 220935050, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.68359375, "step": 10257, "time_per_iteration": 3.8876731395721436 }, { "auxiliary_loss_clip": 0.01112144, "auxiliary_loss_mlp": 0.01030411, "balance_loss_clip": 1.01738107, "balance_loss_mlp": 1.04078197, "epoch": 0.6167443258680294, "flos": 19318361281920.0, "grad_norm": 1.761984932189959, "language_loss": 0.72525299, "learning_rate": 1.3527049897106034e-06, "loss": 0.74667847, "num_input_tokens_seen": 220953085, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 10258, "time_per_iteration": 3.9051601886749268 }, { "auxiliary_loss_clip": 0.01109425, "auxiliary_loss_mlp": 0.01030784, "balance_loss_clip": 1.01781988, "balance_loss_mlp": 1.03921127, "epoch": 0.6168044491206974, "flos": 25265239868160.0, "grad_norm": 2.710842561853431, "language_loss": 0.64061487, "learning_rate": 1.3523365020168735e-06, "loss": 0.66201693, "num_input_tokens_seen": 220969050, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 10259, "time_per_iteration": 2.5038514137268066 }, { "auxiliary_loss_clip": 0.01108714, "auxiliary_loss_mlp": 0.01032599, "balance_loss_clip": 1.01954579, "balance_loss_mlp": 1.04080105, "epoch": 0.6168645723733654, "flos": 13219903301760.0, "grad_norm": 1.690139950394343, "language_loss": 0.71226352, "learning_rate": 1.3519680388826084e-06, "loss": 0.73367667, "num_input_tokens_seen": 220985825, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6796875, "step": 10260, "time_per_iteration": 2.4507458209991455 }, { "auxiliary_loss_clip": 0.01116725, "auxiliary_loss_mlp": 0.01036022, "balance_loss_clip": 1.02192545, "balance_loss_mlp": 1.04345191, "epoch": 0.6169246956260334, "flos": 26652038112000.0, "grad_norm": 1.9283366291465076, "language_loss": 0.68154693, "learning_rate": 1.3515996003217803e-06, "loss": 0.7030744, "num_input_tokens_seen": 221004465, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.734375, "step": 10261, "time_per_iteration": 2.5342960357666016 }, { "auxiliary_loss_clip": 0.01108066, "auxiliary_loss_mlp": 0.01035794, "balance_loss_clip": 1.02362299, "balance_loss_mlp": 1.0383203, "epoch": 0.6169848188787014, "flos": 23148413608320.0, "grad_norm": 2.6837280160522368, "language_loss": 0.7136153, "learning_rate": 1.3512311863483602e-06, "loss": 0.7350539, "num_input_tokens_seen": 221023260, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 10262, "time_per_iteration": 2.5013113021850586 }, { "auxiliary_loss_clip": 0.01111451, "auxiliary_loss_mlp": 0.010338, "balance_loss_clip": 1.02031732, "balance_loss_mlp": 1.04114783, "epoch": 0.6170449421313693, "flos": 23331917214720.0, "grad_norm": 2.0691451887506505, "language_loss": 0.70155901, "learning_rate": 1.3508627969763188e-06, "loss": 0.72301155, "num_input_tokens_seen": 221043090, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 10263, "time_per_iteration": 2.50441312789917 }, { "auxiliary_loss_clip": 0.01111982, "auxiliary_loss_mlp": 0.01029821, "balance_loss_clip": 1.01763797, "balance_loss_mlp": 1.04036021, "epoch": 0.6171050653840373, "flos": 15851617067520.0, "grad_norm": 2.11647818659475, "language_loss": 0.76255316, "learning_rate": 1.3504944322196244e-06, "loss": 0.78397119, "num_input_tokens_seen": 221061435, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71484375, "step": 10264, "time_per_iteration": 2.4436044692993164 }, { "auxiliary_loss_clip": 0.01109971, "auxiliary_loss_mlp": 0.01029712, "balance_loss_clip": 1.01608014, "balance_loss_mlp": 1.04006469, "epoch": 0.6171651886367052, "flos": 20045516209920.0, "grad_norm": 2.7691432226812376, "language_loss": 0.85337818, "learning_rate": 1.350126092092247e-06, "loss": 0.87477505, "num_input_tokens_seen": 221078705, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.69921875, "step": 10265, "time_per_iteration": 2.476705551147461 }, { "auxiliary_loss_clip": 0.0110713, "auxiliary_loss_mlp": 0.0103588, "balance_loss_clip": 1.02266598, "balance_loss_mlp": 1.0392735, "epoch": 0.6172253118893732, "flos": 26432695710720.0, "grad_norm": 1.808167619064542, "language_loss": 0.64786369, "learning_rate": 1.349757776608153e-06, "loss": 0.66929376, "num_input_tokens_seen": 221099245, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6796875, "step": 10266, "time_per_iteration": 2.515319347381592 }, { "auxiliary_loss_clip": 0.01107715, "auxiliary_loss_mlp": 0.0103236, "balance_loss_clip": 1.01987875, "balance_loss_mlp": 1.03785253, "epoch": 0.6172854351420412, "flos": 22632879657600.0, "grad_norm": 1.6531765845606698, "language_loss": 0.75445288, "learning_rate": 1.3493894857813094e-06, "loss": 0.77585363, "num_input_tokens_seen": 221116930, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69921875, "step": 10267, "time_per_iteration": 2.4851603507995605 }, { "auxiliary_loss_clip": 0.0111202, "auxiliary_loss_mlp": 0.01030541, "balance_loss_clip": 1.01716542, "balance_loss_mlp": 1.04013515, "epoch": 0.6173455583947092, "flos": 21212936138880.0, "grad_norm": 1.5939178571244794, "language_loss": 0.7468099, "learning_rate": 1.3490212196256818e-06, "loss": 0.7682355, "num_input_tokens_seen": 221137660, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 10268, "time_per_iteration": 2.4735655784606934 }, { "auxiliary_loss_clip": 0.01113091, "auxiliary_loss_mlp": 0.01030339, "balance_loss_clip": 1.0166117, "balance_loss_mlp": 1.03997886, "epoch": 0.6174056816473771, "flos": 19500284689920.0, "grad_norm": 1.6518994832552496, "language_loss": 0.75643212, "learning_rate": 1.3486529781552342e-06, "loss": 0.77786642, "num_input_tokens_seen": 221156225, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 10269, "time_per_iteration": 2.5103931427001953 }, { "auxiliary_loss_clip": 0.01107348, "auxiliary_loss_mlp": 0.01031795, "balance_loss_clip": 1.01892591, "balance_loss_mlp": 1.03755808, "epoch": 0.6174658049000451, "flos": 15997342544640.0, "grad_norm": 2.030526062923039, "language_loss": 0.76401496, "learning_rate": 1.3482847613839318e-06, "loss": 0.78540635, "num_input_tokens_seen": 221173820, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 10270, "time_per_iteration": 2.4267656803131104 }, { "auxiliary_loss_clip": 0.01109201, "auxiliary_loss_mlp": 0.01026598, "balance_loss_clip": 1.0130384, "balance_loss_mlp": 1.03849852, "epoch": 0.617525928152713, "flos": 21903893136000.0, "grad_norm": 1.738863944372062, "language_loss": 0.82408535, "learning_rate": 1.347916569325736e-06, "loss": 0.84544337, "num_input_tokens_seen": 221191815, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.70703125, "step": 10271, "time_per_iteration": 2.468019962310791 }, { "auxiliary_loss_clip": 0.01110693, "auxiliary_loss_mlp": 0.01036079, "balance_loss_clip": 1.02262616, "balance_loss_mlp": 1.03967273, "epoch": 0.617586051405381, "flos": 21105958458240.0, "grad_norm": 1.8392707385664127, "language_loss": 0.7714479, "learning_rate": 1.3475484019946093e-06, "loss": 0.79291558, "num_input_tokens_seen": 221211205, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 10272, "time_per_iteration": 2.4611446857452393 }, { "auxiliary_loss_clip": 0.01035754, "auxiliary_loss_mlp": 0.01001108, "balance_loss_clip": 0.99982041, "balance_loss_mlp": 1.01184726, "epoch": 0.617646174658049, "flos": 58610776665600.0, "grad_norm": 0.8197243020622667, "language_loss": 0.5913974, "learning_rate": 1.347180259404513e-06, "loss": 0.61176598, "num_input_tokens_seen": 221268430, "router_z_loss_clip": 0.01287842, "router_z_loss_mlp": 0.23925781, "step": 10273, "time_per_iteration": 2.954155921936035 }, { "auxiliary_loss_clip": 0.0110746, "auxiliary_loss_mlp": 0.01028328, "balance_loss_clip": 1.01479745, "balance_loss_mlp": 1.03839159, "epoch": 0.617706297910717, "flos": 13878684691200.0, "grad_norm": 2.8847651832146015, "language_loss": 0.72557163, "learning_rate": 1.3468121415694059e-06, "loss": 0.74692947, "num_input_tokens_seen": 221281930, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6875, "step": 10274, "time_per_iteration": 2.4454314708709717 }, { "auxiliary_loss_clip": 0.01108508, "auxiliary_loss_mlp": 0.01028941, "balance_loss_clip": 1.01626348, "balance_loss_mlp": 1.03918767, "epoch": 0.617766421163385, "flos": 19208438686080.0, "grad_norm": 2.68698777507479, "language_loss": 0.77543288, "learning_rate": 1.3464440485032484e-06, "loss": 0.79680735, "num_input_tokens_seen": 221301605, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 10275, "time_per_iteration": 2.4673304557800293 }, { "auxiliary_loss_clip": 0.0110868, "auxiliary_loss_mlp": 0.01032596, "balance_loss_clip": 1.01943552, "balance_loss_mlp": 1.03877008, "epoch": 0.6178265444160529, "flos": 22565978576640.0, "grad_norm": 1.7068598528246082, "language_loss": 0.79368865, "learning_rate": 1.346075980219998e-06, "loss": 0.8151015, "num_input_tokens_seen": 221320105, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69921875, "step": 10276, "time_per_iteration": 2.480604410171509 }, { "auxiliary_loss_clip": 0.01113087, "auxiliary_loss_mlp": 0.01035572, "balance_loss_clip": 1.02202439, "balance_loss_mlp": 1.04122388, "epoch": 0.6178866676687209, "flos": 11984289402240.0, "grad_norm": 1.9832773475912828, "language_loss": 0.809322, "learning_rate": 1.345707936733612e-06, "loss": 0.83080858, "num_input_tokens_seen": 221335915, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 10277, "time_per_iteration": 2.45855975151062 }, { "auxiliary_loss_clip": 0.01110826, "auxiliary_loss_mlp": 0.010338, "balance_loss_clip": 1.01976871, "balance_loss_mlp": 1.03892422, "epoch": 0.6179467909213888, "flos": 20991510748800.0, "grad_norm": 1.9388117525264115, "language_loss": 0.81372166, "learning_rate": 1.3453399180580466e-06, "loss": 0.83516783, "num_input_tokens_seen": 221353965, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.71875, "step": 10278, "time_per_iteration": 2.470733880996704 }, { "auxiliary_loss_clip": 0.01107959, "auxiliary_loss_mlp": 0.01034411, "balance_loss_clip": 1.02197707, "balance_loss_mlp": 1.0380857, "epoch": 0.6180069141740568, "flos": 25338102606720.0, "grad_norm": 1.7069814678295623, "language_loss": 0.73872066, "learning_rate": 1.3449719242072567e-06, "loss": 0.76014435, "num_input_tokens_seen": 221374080, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 10279, "time_per_iteration": 2.5169529914855957 }, { "auxiliary_loss_clip": 0.01106096, "auxiliary_loss_mlp": 0.01032029, "balance_loss_clip": 1.0195303, "balance_loss_mlp": 1.03668654, "epoch": 0.6180670374267248, "flos": 19645722858240.0, "grad_norm": 1.6998817142463263, "language_loss": 0.70723212, "learning_rate": 1.3446039551951975e-06, "loss": 0.72861338, "num_input_tokens_seen": 221392910, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 10280, "time_per_iteration": 2.4666311740875244 }, { "auxiliary_loss_clip": 0.01109058, "auxiliary_loss_mlp": 0.01035297, "balance_loss_clip": 1.02202868, "balance_loss_mlp": 1.03859282, "epoch": 0.6181271606793928, "flos": 19464876858240.0, "grad_norm": 1.4925383174883287, "language_loss": 0.72680128, "learning_rate": 1.3442360110358215e-06, "loss": 0.74824476, "num_input_tokens_seen": 221410990, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 10281, "time_per_iteration": 2.481048345565796 }, { "auxiliary_loss_clip": 0.01105391, "auxiliary_loss_mlp": 0.01033561, "balance_loss_clip": 1.02197409, "balance_loss_mlp": 1.03833699, "epoch": 0.6181872839320607, "flos": 25594289383680.0, "grad_norm": 1.4992684294510832, "language_loss": 0.76776814, "learning_rate": 1.3438680917430827e-06, "loss": 0.78915769, "num_input_tokens_seen": 221431020, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 10282, "time_per_iteration": 2.5103228092193604 }, { "auxiliary_loss_clip": 0.01110641, "auxiliary_loss_mlp": 0.01030368, "balance_loss_clip": 1.01470375, "balance_loss_mlp": 1.03886831, "epoch": 0.6182474071847287, "flos": 25551806572800.0, "grad_norm": 1.7720916989058642, "language_loss": 0.68866348, "learning_rate": 1.343500197330931e-06, "loss": 0.71007359, "num_input_tokens_seen": 221453235, "router_z_loss_clip": 0.15625, "router_z_loss_mlp": 0.71875, "step": 10283, "time_per_iteration": 2.5327603816986084 }, { "auxiliary_loss_clip": 0.01114189, "auxiliary_loss_mlp": 0.01031328, "balance_loss_clip": 1.01752377, "balance_loss_mlp": 1.03899479, "epoch": 0.6183075304373966, "flos": 22123738327680.0, "grad_norm": 2.0176737948254084, "language_loss": 0.7519964, "learning_rate": 1.3431323278133176e-06, "loss": 0.77345157, "num_input_tokens_seen": 221472560, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 10284, "time_per_iteration": 2.479076623916626 }, { "auxiliary_loss_clip": 0.0110587, "auxiliary_loss_mlp": 0.01039798, "balance_loss_clip": 1.02697694, "balance_loss_mlp": 1.039505, "epoch": 0.6183676536900646, "flos": 22455589104000.0, "grad_norm": 1.787203100221555, "language_loss": 0.75562811, "learning_rate": 1.3427644832041922e-06, "loss": 0.77708477, "num_input_tokens_seen": 221492835, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6640625, "step": 10285, "time_per_iteration": 2.5069291591644287 }, { "auxiliary_loss_clip": 0.01109554, "auxiliary_loss_mlp": 0.01033174, "balance_loss_clip": 1.0202632, "balance_loss_mlp": 1.03881204, "epoch": 0.6184277769427327, "flos": 23364128736000.0, "grad_norm": 1.4937128786021023, "language_loss": 0.72926134, "learning_rate": 1.342396663517503e-06, "loss": 0.75068861, "num_input_tokens_seen": 221511870, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 10286, "time_per_iteration": 2.498565196990967 }, { "auxiliary_loss_clip": 0.01107414, "auxiliary_loss_mlp": 0.01030729, "balance_loss_clip": 1.01837325, "balance_loss_mlp": 1.03836405, "epoch": 0.6184879001954006, "flos": 22711057608960.0, "grad_norm": 1.923499930902978, "language_loss": 0.75836742, "learning_rate": 1.342028868767199e-06, "loss": 0.77974886, "num_input_tokens_seen": 221529915, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 10287, "time_per_iteration": 2.496389627456665 }, { "auxiliary_loss_clip": 0.01108054, "auxiliary_loss_mlp": 0.01033014, "balance_loss_clip": 1.02003217, "balance_loss_mlp": 1.03883207, "epoch": 0.6185480234480686, "flos": 23841920471040.0, "grad_norm": 2.271020998281283, "language_loss": 0.72963506, "learning_rate": 1.3416610989672262e-06, "loss": 0.7510457, "num_input_tokens_seen": 221549745, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69140625, "step": 10288, "time_per_iteration": 2.486849784851074 }, { "auxiliary_loss_clip": 0.01102307, "auxiliary_loss_mlp": 0.01035756, "balance_loss_clip": 1.02335203, "balance_loss_mlp": 1.03531718, "epoch": 0.6186081467007365, "flos": 45477595774080.0, "grad_norm": 1.6921958784332394, "language_loss": 0.72867584, "learning_rate": 1.3412933541315296e-06, "loss": 0.75005639, "num_input_tokens_seen": 221572455, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.671875, "step": 10289, "time_per_iteration": 2.6961488723754883 }, { "auxiliary_loss_clip": 0.01111005, "auxiliary_loss_mlp": 0.0103293, "balance_loss_clip": 1.01913786, "balance_loss_mlp": 1.03842735, "epoch": 0.6186682699534045, "flos": 23550864566400.0, "grad_norm": 1.719960122284423, "language_loss": 0.79471231, "learning_rate": 1.340925634274056e-06, "loss": 0.81615162, "num_input_tokens_seen": 221591325, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 10290, "time_per_iteration": 2.508147716522217 }, { "auxiliary_loss_clip": 0.01111274, "auxiliary_loss_mlp": 0.01036013, "balance_loss_clip": 1.02306116, "balance_loss_mlp": 1.03875327, "epoch": 0.6187283932060724, "flos": 25774201630080.0, "grad_norm": 1.6333679030153363, "language_loss": 0.81693101, "learning_rate": 1.3405579394087475e-06, "loss": 0.83840394, "num_input_tokens_seen": 221611640, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 10291, "time_per_iteration": 2.5316998958587646 }, { "auxiliary_loss_clip": 0.01107884, "auxiliary_loss_mlp": 0.01037167, "balance_loss_clip": 1.02459061, "balance_loss_mlp": 1.0378325, "epoch": 0.6187885164587404, "flos": 25265203954560.0, "grad_norm": 1.6902590992030742, "language_loss": 0.77493429, "learning_rate": 1.3401902695495487e-06, "loss": 0.79638487, "num_input_tokens_seen": 221631225, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 10292, "time_per_iteration": 2.496661424636841 }, { "auxiliary_loss_clip": 0.01115421, "auxiliary_loss_mlp": 0.0104325, "balance_loss_clip": 1.02818155, "balance_loss_mlp": 1.04107511, "epoch": 0.6188486397114084, "flos": 26250772302720.0, "grad_norm": 2.0305497268429606, "language_loss": 0.73592883, "learning_rate": 1.339822624710401e-06, "loss": 0.75751555, "num_input_tokens_seen": 221651035, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.7421875, "step": 10293, "time_per_iteration": 2.52874755859375 }, { "auxiliary_loss_clip": 0.01110862, "auxiliary_loss_mlp": 0.0103752, "balance_loss_clip": 1.02404976, "balance_loss_mlp": 1.04043615, "epoch": 0.6189087629640764, "flos": 20923388605440.0, "grad_norm": 1.5656059941491434, "language_loss": 0.83366162, "learning_rate": 1.3394550049052454e-06, "loss": 0.85514545, "num_input_tokens_seen": 221671300, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 10294, "time_per_iteration": 2.4829177856445312 }, { "auxiliary_loss_clip": 0.01111249, "auxiliary_loss_mlp": 0.01036003, "balance_loss_clip": 1.02288961, "balance_loss_mlp": 1.03950202, "epoch": 0.6189688862167443, "flos": 14829814874880.0, "grad_norm": 2.999086323656149, "language_loss": 0.70964432, "learning_rate": 1.3390874101480225e-06, "loss": 0.73111677, "num_input_tokens_seen": 221687320, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 10295, "time_per_iteration": 3.804893732070923 }, { "auxiliary_loss_clip": 0.01110396, "auxiliary_loss_mlp": 0.0103524, "balance_loss_clip": 1.02174532, "balance_loss_mlp": 1.04025316, "epoch": 0.6190290094694123, "flos": 24285058560000.0, "grad_norm": 1.604126578892403, "language_loss": 0.70248544, "learning_rate": 1.3387198404526705e-06, "loss": 0.7239418, "num_input_tokens_seen": 221710175, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 10296, "time_per_iteration": 2.5596141815185547 }, { "auxiliary_loss_clip": 0.01114379, "auxiliary_loss_mlp": 0.01039363, "balance_loss_clip": 1.02459311, "balance_loss_mlp": 1.04183388, "epoch": 0.6190891327220802, "flos": 22529457423360.0, "grad_norm": 2.056301515947073, "language_loss": 0.7175653, "learning_rate": 1.3383522958331287e-06, "loss": 0.73910272, "num_input_tokens_seen": 221728145, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7265625, "step": 10297, "time_per_iteration": 2.477618932723999 }, { "auxiliary_loss_clip": 0.01037303, "auxiliary_loss_mlp": 0.01001697, "balance_loss_clip": 1.00025487, "balance_loss_mlp": 1.01352453, "epoch": 0.6191492559747482, "flos": 67729357152000.0, "grad_norm": 0.8911941847010001, "language_loss": 0.6414237, "learning_rate": 1.3379847763033345e-06, "loss": 0.66181374, "num_input_tokens_seen": 221786100, "router_z_loss_clip": 0.0144043, "router_z_loss_mlp": 0.23828125, "step": 10298, "time_per_iteration": 4.4624669551849365 }, { "auxiliary_loss_clip": 0.01111156, "auxiliary_loss_mlp": 0.01034643, "balance_loss_clip": 1.0219655, "balance_loss_mlp": 1.03876519, "epoch": 0.6192093792274163, "flos": 22346672088960.0, "grad_norm": 1.9140629107880496, "language_loss": 0.74013317, "learning_rate": 1.3376172818772236e-06, "loss": 0.7615912, "num_input_tokens_seen": 221806450, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 10299, "time_per_iteration": 3.899407148361206 }, { "auxiliary_loss_clip": 0.01116288, "auxiliary_loss_mlp": 0.01032241, "balance_loss_clip": 1.01860917, "balance_loss_mlp": 1.04131162, "epoch": 0.6192695024800842, "flos": 13553944807680.0, "grad_norm": 2.005551941041566, "language_loss": 0.68356997, "learning_rate": 1.337249812568732e-06, "loss": 0.70505524, "num_input_tokens_seen": 221823330, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75, "step": 10300, "time_per_iteration": 3.8915469646453857 }, { "auxiliary_loss_clip": 0.01113022, "auxiliary_loss_mlp": 0.01037898, "balance_loss_clip": 1.0244931, "balance_loss_mlp": 1.04059458, "epoch": 0.6193296257327522, "flos": 17415310815360.0, "grad_norm": 1.9847582808193918, "language_loss": 0.67105269, "learning_rate": 1.3368823683917939e-06, "loss": 0.69256192, "num_input_tokens_seen": 221839360, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 10301, "time_per_iteration": 2.444236993789673 }, { "auxiliary_loss_clip": 0.01109522, "auxiliary_loss_mlp": 0.01036328, "balance_loss_clip": 1.02316737, "balance_loss_mlp": 1.03758276, "epoch": 0.6193897489854201, "flos": 31101118450560.0, "grad_norm": 1.7881903012524933, "language_loss": 0.727808, "learning_rate": 1.3365149493603424e-06, "loss": 0.74926651, "num_input_tokens_seen": 221859465, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 10302, "time_per_iteration": 2.567885637283325 }, { "auxiliary_loss_clip": 0.01110853, "auxiliary_loss_mlp": 0.01031693, "balance_loss_clip": 1.0174768, "balance_loss_mlp": 1.03945029, "epoch": 0.6194498722380881, "flos": 19134031662720.0, "grad_norm": 1.7672661789481585, "language_loss": 0.80494565, "learning_rate": 1.3361475554883107e-06, "loss": 0.82637107, "num_input_tokens_seen": 221878555, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.71484375, "step": 10303, "time_per_iteration": 2.464831590652466 }, { "auxiliary_loss_clip": 0.01114414, "auxiliary_loss_mlp": 0.01032974, "balance_loss_clip": 1.01841295, "balance_loss_mlp": 1.04007745, "epoch": 0.619509995490756, "flos": 21835088634240.0, "grad_norm": 1.6575255276728729, "language_loss": 0.76122808, "learning_rate": 1.3357801867896307e-06, "loss": 0.78270197, "num_input_tokens_seen": 221898790, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7421875, "step": 10304, "time_per_iteration": 2.4792368412017822 }, { "auxiliary_loss_clip": 0.01114978, "auxiliary_loss_mlp": 0.01034547, "balance_loss_clip": 1.02080834, "balance_loss_mlp": 1.03979254, "epoch": 0.619570118743424, "flos": 23806548552960.0, "grad_norm": 2.519907654040588, "language_loss": 0.77339578, "learning_rate": 1.3354128432782324e-06, "loss": 0.794891, "num_input_tokens_seen": 221918875, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.75, "step": 10305, "time_per_iteration": 2.5015175342559814 }, { "auxiliary_loss_clip": 0.01118111, "auxiliary_loss_mlp": 0.01036221, "balance_loss_clip": 1.02069998, "balance_loss_mlp": 1.04247427, "epoch": 0.619630241996092, "flos": 21101612912640.0, "grad_norm": 1.8722563583208496, "language_loss": 0.78870285, "learning_rate": 1.335045524968045e-06, "loss": 0.81024623, "num_input_tokens_seen": 221937895, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.7578125, "step": 10306, "time_per_iteration": 2.501901149749756 }, { "auxiliary_loss_clip": 0.01105956, "auxiliary_loss_mlp": 0.01028232, "balance_loss_clip": 1.01591146, "balance_loss_mlp": 1.03775311, "epoch": 0.61969036524876, "flos": 27308269635840.0, "grad_norm": 1.7754054275524864, "language_loss": 0.8046478, "learning_rate": 1.3346782318729988e-06, "loss": 0.82598972, "num_input_tokens_seen": 221955920, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 10307, "time_per_iteration": 2.511784315109253 }, { "auxiliary_loss_clip": 0.01038309, "auxiliary_loss_mlp": 0.01001438, "balance_loss_clip": 1.00009084, "balance_loss_mlp": 1.01429594, "epoch": 0.6197504885014279, "flos": 51648955384320.0, "grad_norm": 0.8108320347645575, "language_loss": 0.59351546, "learning_rate": 1.3343109640070203e-06, "loss": 0.61391288, "num_input_tokens_seen": 222011405, "router_z_loss_clip": 0.01348877, "router_z_loss_mlp": 0.24023438, "step": 10308, "time_per_iteration": 3.115434169769287 }, { "auxiliary_loss_clip": 0.01106978, "auxiliary_loss_mlp": 0.01029319, "balance_loss_clip": 1.01755273, "balance_loss_mlp": 1.03847861, "epoch": 0.6198106117540959, "flos": 30557107992960.0, "grad_norm": 1.934769060624859, "language_loss": 0.68355793, "learning_rate": 1.333943721384037e-06, "loss": 0.70492089, "num_input_tokens_seen": 222034545, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 10309, "time_per_iteration": 2.55051851272583 }, { "auxiliary_loss_clip": 0.01109634, "auxiliary_loss_mlp": 0.01035905, "balance_loss_clip": 1.02278054, "balance_loss_mlp": 1.03935456, "epoch": 0.6198707350067638, "flos": 18909733184640.0, "grad_norm": 1.7895943746762923, "language_loss": 0.72184646, "learning_rate": 1.3335765040179746e-06, "loss": 0.74330181, "num_input_tokens_seen": 222052690, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 10310, "time_per_iteration": 2.4713330268859863 }, { "auxiliary_loss_clip": 0.01114816, "auxiliary_loss_mlp": 0.01042553, "balance_loss_clip": 1.02768779, "balance_loss_mlp": 1.04186964, "epoch": 0.6199308582594318, "flos": 21433858738560.0, "grad_norm": 5.877206471656714, "language_loss": 0.79096109, "learning_rate": 1.3332093119227573e-06, "loss": 0.81253481, "num_input_tokens_seen": 222069095, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7265625, "step": 10311, "time_per_iteration": 2.4704091548919678 }, { "auxiliary_loss_clip": 0.01110008, "auxiliary_loss_mlp": 0.01037623, "balance_loss_clip": 1.02406275, "balance_loss_mlp": 1.03739429, "epoch": 0.6199909815120999, "flos": 18407379525120.0, "grad_norm": 2.167714079779824, "language_loss": 0.72502208, "learning_rate": 1.3328421451123105e-06, "loss": 0.74649841, "num_input_tokens_seen": 222087360, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 10312, "time_per_iteration": 2.468641996383667 }, { "auxiliary_loss_clip": 0.011122, "auxiliary_loss_mlp": 0.01033286, "balance_loss_clip": 1.01993501, "balance_loss_mlp": 1.03952873, "epoch": 0.6200511047647678, "flos": 21466860359040.0, "grad_norm": 4.6883653694504, "language_loss": 0.71719646, "learning_rate": 1.3324750036005557e-06, "loss": 0.73865128, "num_input_tokens_seen": 222106130, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 10313, "time_per_iteration": 2.456779718399048 }, { "auxiliary_loss_clip": 0.01114631, "auxiliary_loss_mlp": 0.01036977, "balance_loss_clip": 1.02263606, "balance_loss_mlp": 1.04082537, "epoch": 0.6201112280174358, "flos": 18215903099520.0, "grad_norm": 10.962788250933702, "language_loss": 0.78322428, "learning_rate": 1.332107887401416e-06, "loss": 0.80474037, "num_input_tokens_seen": 222123125, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.73828125, "step": 10314, "time_per_iteration": 2.4671690464019775 }, { "auxiliary_loss_clip": 0.01110173, "auxiliary_loss_mlp": 0.01035638, "balance_loss_clip": 1.02222729, "balance_loss_mlp": 1.03776932, "epoch": 0.6201713512701037, "flos": 20011185786240.0, "grad_norm": 1.9444639971915936, "language_loss": 0.78338897, "learning_rate": 1.331740796528812e-06, "loss": 0.80484712, "num_input_tokens_seen": 222140655, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 10315, "time_per_iteration": 2.470388650894165 }, { "auxiliary_loss_clip": 0.01113214, "auxiliary_loss_mlp": 0.01037721, "balance_loss_clip": 1.02455461, "balance_loss_mlp": 1.04030371, "epoch": 0.6202314745227717, "flos": 22487692884480.0, "grad_norm": 64.20955972054011, "language_loss": 0.76136082, "learning_rate": 1.3313737309966641e-06, "loss": 0.78287017, "num_input_tokens_seen": 222160450, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 10316, "time_per_iteration": 2.499074935913086 }, { "auxiliary_loss_clip": 0.01111, "auxiliary_loss_mlp": 0.01034689, "balance_loss_clip": 1.02091455, "balance_loss_mlp": 1.03680658, "epoch": 0.6202915977754396, "flos": 26828682220800.0, "grad_norm": 1.870804805044937, "language_loss": 0.77463919, "learning_rate": 1.3310066908188915e-06, "loss": 0.79609609, "num_input_tokens_seen": 222179170, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 10317, "time_per_iteration": 2.5189895629882812 }, { "auxiliary_loss_clip": 0.01038133, "auxiliary_loss_mlp": 0.0100091, "balance_loss_clip": 0.99944413, "balance_loss_mlp": 1.0141356, "epoch": 0.6203517210281076, "flos": 62742694890240.0, "grad_norm": 0.6978222401090594, "language_loss": 0.59039545, "learning_rate": 1.3306396760094122e-06, "loss": 0.6107859, "num_input_tokens_seen": 222242660, "router_z_loss_clip": 0.01464844, "router_z_loss_mlp": 0.24023438, "step": 10318, "time_per_iteration": 3.1597800254821777 }, { "auxiliary_loss_clip": 0.01113881, "auxiliary_loss_mlp": 0.01039158, "balance_loss_clip": 1.02442336, "balance_loss_mlp": 1.04206705, "epoch": 0.6204118442807756, "flos": 23404277162880.0, "grad_norm": 1.6579519180271094, "language_loss": 0.77993572, "learning_rate": 1.330272686582143e-06, "loss": 0.80146611, "num_input_tokens_seen": 222262170, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.71875, "step": 10319, "time_per_iteration": 2.493651866912842 }, { "auxiliary_loss_clip": 0.01109684, "auxiliary_loss_mlp": 0.01034731, "balance_loss_clip": 1.02232778, "balance_loss_mlp": 1.03969622, "epoch": 0.6204719675334436, "flos": 20193647898240.0, "grad_norm": 3.26803305600964, "language_loss": 0.66077197, "learning_rate": 1.3299057225510013e-06, "loss": 0.68221617, "num_input_tokens_seen": 222280375, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 10320, "time_per_iteration": 2.505614757537842 }, { "auxiliary_loss_clip": 0.01108749, "auxiliary_loss_mlp": 0.01032266, "balance_loss_clip": 1.01960564, "balance_loss_mlp": 1.03883386, "epoch": 0.6205320907861115, "flos": 13188050916480.0, "grad_norm": 1.6710879810014507, "language_loss": 0.76309341, "learning_rate": 1.3295387839299013e-06, "loss": 0.78450352, "num_input_tokens_seen": 222297325, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 10321, "time_per_iteration": 2.442755937576294 }, { "auxiliary_loss_clip": 0.01106478, "auxiliary_loss_mlp": 0.01030374, "balance_loss_clip": 1.0176065, "balance_loss_mlp": 1.03673363, "epoch": 0.6205922140387795, "flos": 20668386977280.0, "grad_norm": 1.8839366462805587, "language_loss": 0.73829108, "learning_rate": 1.329171870732758e-06, "loss": 0.75965965, "num_input_tokens_seen": 222317095, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 10322, "time_per_iteration": 2.4916751384735107 }, { "auxiliary_loss_clip": 0.0110942, "auxiliary_loss_mlp": 0.01028527, "balance_loss_clip": 1.01618874, "balance_loss_mlp": 1.0389533, "epoch": 0.6206523372914474, "flos": 23877831093120.0, "grad_norm": 2.0657742362915754, "language_loss": 0.72990739, "learning_rate": 1.3288049829734845e-06, "loss": 0.75128686, "num_input_tokens_seen": 222337055, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 10323, "time_per_iteration": 2.5172324180603027 }, { "auxiliary_loss_clip": 0.01116803, "auxiliary_loss_mlp": 0.01034142, "balance_loss_clip": 1.0202899, "balance_loss_mlp": 1.04074907, "epoch": 0.6207124605441154, "flos": 13406603218560.0, "grad_norm": 2.6354525589683773, "language_loss": 0.58967882, "learning_rate": 1.3284381206659933e-06, "loss": 0.61118829, "num_input_tokens_seen": 222354515, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.76171875, "step": 10324, "time_per_iteration": 2.459557294845581 }, { "auxiliary_loss_clip": 0.01113008, "auxiliary_loss_mlp": 0.01034792, "balance_loss_clip": 1.02077937, "balance_loss_mlp": 1.04071367, "epoch": 0.6207725837967835, "flos": 18916341287040.0, "grad_norm": 1.9593017496683411, "language_loss": 0.75999904, "learning_rate": 1.3280712838241956e-06, "loss": 0.78147709, "num_input_tokens_seen": 222372755, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.72265625, "step": 10325, "time_per_iteration": 2.448392629623413 }, { "auxiliary_loss_clip": 0.01114203, "auxiliary_loss_mlp": 0.01035414, "balance_loss_clip": 1.02131748, "balance_loss_mlp": 1.04021144, "epoch": 0.6208327070494514, "flos": 23980211832960.0, "grad_norm": 1.827789621915997, "language_loss": 0.72587913, "learning_rate": 1.327704472462003e-06, "loss": 0.74737525, "num_input_tokens_seen": 222391380, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73828125, "step": 10326, "time_per_iteration": 2.5047378540039062 }, { "auxiliary_loss_clip": 0.01115577, "auxiliary_loss_mlp": 0.01040452, "balance_loss_clip": 1.02627242, "balance_loss_mlp": 1.04100966, "epoch": 0.6208928303021194, "flos": 22820405587200.0, "grad_norm": 2.783136545254907, "language_loss": 0.73890769, "learning_rate": 1.3273376865933234e-06, "loss": 0.76046801, "num_input_tokens_seen": 222411165, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7421875, "step": 10327, "time_per_iteration": 2.509276866912842 }, { "auxiliary_loss_clip": 0.01116522, "auxiliary_loss_mlp": 0.01036789, "balance_loss_clip": 1.02256107, "balance_loss_mlp": 1.04265428, "epoch": 0.6209529535547873, "flos": 17564519911680.0, "grad_norm": 2.755261579361231, "language_loss": 0.79480606, "learning_rate": 1.326970926232066e-06, "loss": 0.81633914, "num_input_tokens_seen": 222428110, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7421875, "step": 10328, "time_per_iteration": 2.438433885574341 }, { "auxiliary_loss_clip": 0.01112135, "auxiliary_loss_mlp": 0.01039887, "balance_loss_clip": 1.02565908, "balance_loss_mlp": 1.03901708, "epoch": 0.6210130768074553, "flos": 22011912311040.0, "grad_norm": 25.914956469013326, "language_loss": 0.77690762, "learning_rate": 1.3266041913921396e-06, "loss": 0.79842788, "num_input_tokens_seen": 222446385, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.73046875, "step": 10329, "time_per_iteration": 2.4615750312805176 }, { "auxiliary_loss_clip": 0.01038141, "auxiliary_loss_mlp": 0.010021, "balance_loss_clip": 1.00072277, "balance_loss_mlp": 1.01416302, "epoch": 0.6210732000601232, "flos": 63676873854720.0, "grad_norm": 0.8452091033328598, "language_loss": 0.62199998, "learning_rate": 1.3262374820874484e-06, "loss": 0.64240241, "num_input_tokens_seen": 222502150, "router_z_loss_clip": 0.01379395, "router_z_loss_mlp": 0.24023438, "step": 10330, "time_per_iteration": 3.0588021278381348 }, { "auxiliary_loss_clip": 0.01115923, "auxiliary_loss_mlp": 0.01039156, "balance_loss_clip": 1.02466059, "balance_loss_mlp": 1.04094648, "epoch": 0.6211333233127913, "flos": 24243365848320.0, "grad_norm": 2.2768166998362065, "language_loss": 0.77605945, "learning_rate": 1.3258707983319002e-06, "loss": 0.79761028, "num_input_tokens_seen": 222519880, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.75, "step": 10331, "time_per_iteration": 2.5065577030181885 }, { "auxiliary_loss_clip": 0.01115799, "auxiliary_loss_mlp": 0.01036993, "balance_loss_clip": 1.02281928, "balance_loss_mlp": 1.04137373, "epoch": 0.6211934465654592, "flos": 16943803960320.0, "grad_norm": 2.0105286170817034, "language_loss": 0.67641139, "learning_rate": 1.3255041401393992e-06, "loss": 0.69793934, "num_input_tokens_seen": 222538545, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7421875, "step": 10332, "time_per_iteration": 2.472261667251587 }, { "auxiliary_loss_clip": 0.01111449, "auxiliary_loss_mlp": 0.01033699, "balance_loss_clip": 1.02073467, "balance_loss_mlp": 1.03961515, "epoch": 0.6212535698181272, "flos": 15267386355840.0, "grad_norm": 2.131556718620782, "language_loss": 0.76650536, "learning_rate": 1.3251375075238476e-06, "loss": 0.78795683, "num_input_tokens_seen": 222556935, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 10333, "time_per_iteration": 2.457066297531128 }, { "auxiliary_loss_clip": 0.01110394, "auxiliary_loss_mlp": 0.0103558, "balance_loss_clip": 1.02278852, "balance_loss_mlp": 1.04050159, "epoch": 0.6213136930707951, "flos": 13443950384640.0, "grad_norm": 3.10819599482103, "language_loss": 0.69864297, "learning_rate": 1.3247709004991507e-06, "loss": 0.72010273, "num_input_tokens_seen": 222574035, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 10334, "time_per_iteration": 2.4555177688598633 }, { "auxiliary_loss_clip": 0.01109967, "auxiliary_loss_mlp": 0.01035078, "balance_loss_clip": 1.0231812, "balance_loss_mlp": 1.04022622, "epoch": 0.6213738163234631, "flos": 18111223889280.0, "grad_norm": 1.7501418224973795, "language_loss": 0.70258844, "learning_rate": 1.3244043190792078e-06, "loss": 0.72403884, "num_input_tokens_seen": 222592290, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 10335, "time_per_iteration": 2.457566976547241 }, { "auxiliary_loss_clip": 0.0110778, "auxiliary_loss_mlp": 0.01034272, "balance_loss_clip": 1.02150488, "balance_loss_mlp": 1.03831911, "epoch": 0.621433939576131, "flos": 25337348421120.0, "grad_norm": 1.5880513771311227, "language_loss": 0.80107594, "learning_rate": 1.3240377632779213e-06, "loss": 0.82249647, "num_input_tokens_seen": 222612805, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 10336, "time_per_iteration": 2.509439706802368 }, { "auxiliary_loss_clip": 0.01108925, "auxiliary_loss_mlp": 0.01031334, "balance_loss_clip": 1.01847744, "balance_loss_mlp": 1.03996885, "epoch": 0.621494062828799, "flos": 22565619440640.0, "grad_norm": 2.6252824890327355, "language_loss": 0.72932744, "learning_rate": 1.3236712331091907e-06, "loss": 0.75073004, "num_input_tokens_seen": 222632260, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 10337, "time_per_iteration": 3.938199520111084 }, { "auxiliary_loss_clip": 0.0111409, "auxiliary_loss_mlp": 0.01040249, "balance_loss_clip": 1.02660549, "balance_loss_mlp": 1.040133, "epoch": 0.621554186081467, "flos": 27417976750080.0, "grad_norm": 2.259054970047221, "language_loss": 0.63185012, "learning_rate": 1.3233047285869145e-06, "loss": 0.65339351, "num_input_tokens_seen": 222653570, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73828125, "step": 10338, "time_per_iteration": 2.5265092849731445 }, { "auxiliary_loss_clip": 0.01111945, "auxiliary_loss_mlp": 0.01038224, "balance_loss_clip": 1.02494431, "balance_loss_mlp": 1.04104543, "epoch": 0.621614309334135, "flos": 22346815743360.0, "grad_norm": 1.9538005575719646, "language_loss": 0.71559358, "learning_rate": 1.322938249724991e-06, "loss": 0.73709524, "num_input_tokens_seen": 222672480, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 10339, "time_per_iteration": 2.488358497619629 }, { "auxiliary_loss_clip": 0.0110864, "auxiliary_loss_mlp": 0.01033004, "balance_loss_clip": 1.01975417, "balance_loss_mlp": 1.03998685, "epoch": 0.621674432586803, "flos": 19281229597440.0, "grad_norm": 1.5436645913005036, "language_loss": 0.69911408, "learning_rate": 1.3225717965373166e-06, "loss": 0.72053057, "num_input_tokens_seen": 222691200, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6875, "step": 10340, "time_per_iteration": 5.415876388549805 }, { "auxiliary_loss_clip": 0.01105469, "auxiliary_loss_mlp": 0.01029536, "balance_loss_clip": 1.01691151, "balance_loss_mlp": 1.03636003, "epoch": 0.6217345558394709, "flos": 21609533180160.0, "grad_norm": 2.2345766313563207, "language_loss": 0.69143748, "learning_rate": 1.322205369037788e-06, "loss": 0.71278757, "num_input_tokens_seen": 222709975, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 10341, "time_per_iteration": 2.4785983562469482 }, { "auxiliary_loss_clip": 0.01113267, "auxiliary_loss_mlp": 0.01035517, "balance_loss_clip": 1.02111077, "balance_loss_mlp": 1.04124093, "epoch": 0.6217946790921389, "flos": 18004102554240.0, "grad_norm": 1.850860458667703, "language_loss": 0.80983579, "learning_rate": 1.321838967240299e-06, "loss": 0.83132368, "num_input_tokens_seen": 222729005, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.71875, "step": 10342, "time_per_iteration": 3.900778293609619 }, { "auxiliary_loss_clip": 0.01035668, "auxiliary_loss_mlp": 0.01006759, "balance_loss_clip": 1.00538826, "balance_loss_mlp": 1.01195014, "epoch": 0.6218548023448068, "flos": 61973631768960.0, "grad_norm": 0.7840028091980465, "language_loss": 0.57356495, "learning_rate": 1.3214725911587452e-06, "loss": 0.59398919, "num_input_tokens_seen": 222786090, "router_z_loss_clip": 0.01373291, "router_z_loss_mlp": 0.23730469, "step": 10343, "time_per_iteration": 3.0334553718566895 }, { "auxiliary_loss_clip": 0.01105857, "auxiliary_loss_mlp": 0.01025916, "balance_loss_clip": 1.01435852, "balance_loss_mlp": 1.03895378, "epoch": 0.6219149255974749, "flos": 25739152934400.0, "grad_norm": 1.8745790658165917, "language_loss": 0.72747743, "learning_rate": 1.3211062408070184e-06, "loss": 0.74879515, "num_input_tokens_seen": 222806100, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 10344, "time_per_iteration": 2.5171892642974854 }, { "auxiliary_loss_clip": 0.01110606, "auxiliary_loss_mlp": 0.0104075, "balance_loss_clip": 1.02854919, "balance_loss_mlp": 1.04027689, "epoch": 0.6219750488501428, "flos": 25411073086080.0, "grad_norm": 1.980447196270755, "language_loss": 0.60130394, "learning_rate": 1.3207399161990105e-06, "loss": 0.62281752, "num_input_tokens_seen": 222826575, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 10345, "time_per_iteration": 2.538729667663574 }, { "auxiliary_loss_clip": 0.01108421, "auxiliary_loss_mlp": 0.01031911, "balance_loss_clip": 1.01862502, "balance_loss_mlp": 1.03747725, "epoch": 0.6220351721028108, "flos": 20047383717120.0, "grad_norm": 1.8176276050377833, "language_loss": 0.7825951, "learning_rate": 1.320373617348614e-06, "loss": 0.80399841, "num_input_tokens_seen": 222845285, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 10346, "time_per_iteration": 2.451880693435669 }, { "auxiliary_loss_clip": 0.01112509, "auxiliary_loss_mlp": 0.01036326, "balance_loss_clip": 1.02234316, "balance_loss_mlp": 1.03914559, "epoch": 0.6220952953554787, "flos": 27488397363840.0, "grad_norm": 1.6055070502989133, "language_loss": 0.71226442, "learning_rate": 1.3200073442697171e-06, "loss": 0.73375273, "num_input_tokens_seen": 222864575, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.734375, "step": 10347, "time_per_iteration": 2.5218570232391357 }, { "auxiliary_loss_clip": 0.01106354, "auxiliary_loss_mlp": 0.01031194, "balance_loss_clip": 1.01814067, "balance_loss_mlp": 1.03655684, "epoch": 0.6221554186081467, "flos": 19207612673280.0, "grad_norm": 1.7071623897340398, "language_loss": 0.71850073, "learning_rate": 1.3196410969762108e-06, "loss": 0.73987627, "num_input_tokens_seen": 222884420, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 10348, "time_per_iteration": 2.4763247966766357 }, { "auxiliary_loss_clip": 0.01034621, "auxiliary_loss_mlp": 0.0100319, "balance_loss_clip": 1.00189078, "balance_loss_mlp": 1.01073885, "epoch": 0.6222155418608146, "flos": 62950939989120.0, "grad_norm": 0.8106479474018037, "language_loss": 0.54123425, "learning_rate": 1.3192748754819815e-06, "loss": 0.56161237, "num_input_tokens_seen": 222944690, "router_z_loss_clip": 0.01300049, "router_z_loss_mlp": 0.23828125, "step": 10349, "time_per_iteration": 3.0706193447113037 }, { "auxiliary_loss_clip": 0.01110964, "auxiliary_loss_mlp": 0.01029721, "balance_loss_clip": 1.01697147, "balance_loss_mlp": 1.03973317, "epoch": 0.6222756651134826, "flos": 22601099099520.0, "grad_norm": 2.840900618741387, "language_loss": 0.6995874, "learning_rate": 1.3189086798009173e-06, "loss": 0.72099423, "num_input_tokens_seen": 222962990, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 10350, "time_per_iteration": 2.4505414962768555 }, { "auxiliary_loss_clip": 0.01111432, "auxiliary_loss_mlp": 0.01032921, "balance_loss_clip": 1.01881814, "balance_loss_mlp": 1.0398289, "epoch": 0.6223357883661506, "flos": 21142228216320.0, "grad_norm": 2.043799427699858, "language_loss": 0.57479221, "learning_rate": 1.3185425099469046e-06, "loss": 0.59623575, "num_input_tokens_seen": 222980715, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.71875, "step": 10351, "time_per_iteration": 2.4907870292663574 }, { "auxiliary_loss_clip": 0.01033451, "auxiliary_loss_mlp": 0.01001371, "balance_loss_clip": 1.00003552, "balance_loss_mlp": 1.00983274, "epoch": 0.6223959116188186, "flos": 63765071700480.0, "grad_norm": 0.8055135105516309, "language_loss": 0.61168921, "learning_rate": 1.3181763659338276e-06, "loss": 0.6320374, "num_input_tokens_seen": 223040685, "router_z_loss_clip": 0.0133667, "router_z_loss_mlp": 0.23632812, "step": 10352, "time_per_iteration": 3.0414514541625977 }, { "auxiliary_loss_clip": 0.01105374, "auxiliary_loss_mlp": 0.01031219, "balance_loss_clip": 1.01817143, "balance_loss_mlp": 1.03664696, "epoch": 0.6224560348714866, "flos": 22565727181440.0, "grad_norm": 2.008241067988054, "language_loss": 0.81867707, "learning_rate": 1.3178102477755714e-06, "loss": 0.84004295, "num_input_tokens_seen": 223059000, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 10353, "time_per_iteration": 2.521456718444824 }, { "auxiliary_loss_clip": 0.01105075, "auxiliary_loss_mlp": 0.01030664, "balance_loss_clip": 1.01813531, "balance_loss_mlp": 1.03732455, "epoch": 0.6225161581241545, "flos": 24097748112000.0, "grad_norm": 2.0615384858516403, "language_loss": 0.75920814, "learning_rate": 1.3174441554860195e-06, "loss": 0.78056556, "num_input_tokens_seen": 223079345, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.67578125, "step": 10354, "time_per_iteration": 2.495419502258301 }, { "auxiliary_loss_clip": 0.0110798, "auxiliary_loss_mlp": 0.01027825, "balance_loss_clip": 1.01517153, "balance_loss_mlp": 1.03844774, "epoch": 0.6225762813768225, "flos": 20443513881600.0, "grad_norm": 1.4533578333936124, "language_loss": 0.78517675, "learning_rate": 1.3170780890790528e-06, "loss": 0.80653483, "num_input_tokens_seen": 223097880, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 10355, "time_per_iteration": 2.4792258739471436 }, { "auxiliary_loss_clip": 0.01109499, "auxiliary_loss_mlp": 0.0103396, "balance_loss_clip": 1.02099597, "balance_loss_mlp": 1.03936744, "epoch": 0.6226364046294904, "flos": 27198131558400.0, "grad_norm": 1.4637288045209036, "language_loss": 0.7819798, "learning_rate": 1.3167120485685538e-06, "loss": 0.8034144, "num_input_tokens_seen": 223118185, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 10356, "time_per_iteration": 2.511136293411255 }, { "auxiliary_loss_clip": 0.01115195, "auxiliary_loss_mlp": 0.01039009, "balance_loss_clip": 1.02437627, "balance_loss_mlp": 1.0397017, "epoch": 0.6226965278821585, "flos": 20445776438400.0, "grad_norm": 2.0198615427540334, "language_loss": 0.68328887, "learning_rate": 1.3163460339684024e-06, "loss": 0.70483088, "num_input_tokens_seen": 223137600, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75390625, "step": 10357, "time_per_iteration": 2.4662368297576904 }, { "auxiliary_loss_clip": 0.01114452, "auxiliary_loss_mlp": 0.01031653, "balance_loss_clip": 1.01685917, "balance_loss_mlp": 1.04028344, "epoch": 0.6227566511348264, "flos": 22162737519360.0, "grad_norm": 3.412656605576025, "language_loss": 0.757622, "learning_rate": 1.3159800452924778e-06, "loss": 0.77908301, "num_input_tokens_seen": 223154360, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7421875, "step": 10358, "time_per_iteration": 2.459169626235962 }, { "auxiliary_loss_clip": 0.01108349, "auxiliary_loss_mlp": 0.01032975, "balance_loss_clip": 1.01933146, "balance_loss_mlp": 1.03673506, "epoch": 0.6228167743874944, "flos": 18040875102720.0, "grad_norm": 2.077855390919318, "language_loss": 0.82187945, "learning_rate": 1.3156140825546588e-06, "loss": 0.84329265, "num_input_tokens_seen": 223172255, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71484375, "step": 10359, "time_per_iteration": 2.4475886821746826 }, { "auxiliary_loss_clip": 0.01105816, "auxiliary_loss_mlp": 0.01044225, "balance_loss_clip": 1.03047442, "balance_loss_mlp": 1.0371244, "epoch": 0.6228768976401623, "flos": 17742851959680.0, "grad_norm": 4.75210017596943, "language_loss": 0.73103678, "learning_rate": 1.315248145768822e-06, "loss": 0.75253713, "num_input_tokens_seen": 223186965, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.6875, "step": 10360, "time_per_iteration": 2.415532350540161 }, { "auxiliary_loss_clip": 0.01108254, "auxiliary_loss_mlp": 0.01038116, "balance_loss_clip": 1.02480042, "balance_loss_mlp": 1.0377121, "epoch": 0.6229370208928303, "flos": 17894934144000.0, "grad_norm": 2.0565439899525826, "language_loss": 0.78047663, "learning_rate": 1.3148822349488442e-06, "loss": 0.80194032, "num_input_tokens_seen": 223206045, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 10361, "time_per_iteration": 2.4467074871063232 }, { "auxiliary_loss_clip": 0.01111142, "auxiliary_loss_mlp": 0.01027837, "balance_loss_clip": 1.01522493, "balance_loss_mlp": 1.04110992, "epoch": 0.6229971441454982, "flos": 17347763289600.0, "grad_norm": 2.7220630169615956, "language_loss": 0.67700887, "learning_rate": 1.3145163501086005e-06, "loss": 0.69839859, "num_input_tokens_seen": 223224820, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 10362, "time_per_iteration": 2.4605681896209717 }, { "auxiliary_loss_clip": 0.01109872, "auxiliary_loss_mlp": 0.01034267, "balance_loss_clip": 1.02074862, "balance_loss_mlp": 1.03901589, "epoch": 0.6230572673981662, "flos": 29241376807680.0, "grad_norm": 2.0915050263794774, "language_loss": 0.67203271, "learning_rate": 1.3141504912619658e-06, "loss": 0.69347411, "num_input_tokens_seen": 223243205, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.70703125, "step": 10363, "time_per_iteration": 2.521583318710327 }, { "auxiliary_loss_clip": 0.01111973, "auxiliary_loss_mlp": 0.01035199, "balance_loss_clip": 1.02133512, "balance_loss_mlp": 1.03914869, "epoch": 0.6231173906508342, "flos": 16325961096960.0, "grad_norm": 1.912304402241763, "language_loss": 0.86450434, "learning_rate": 1.3137846584228127e-06, "loss": 0.88597608, "num_input_tokens_seen": 223261370, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 10364, "time_per_iteration": 2.432764768600464 }, { "auxiliary_loss_clip": 0.01033706, "auxiliary_loss_mlp": 0.0100468, "balance_loss_clip": 1.00325572, "balance_loss_mlp": 1.00981951, "epoch": 0.6231775139035022, "flos": 68702032517760.0, "grad_norm": 0.9445938698612639, "language_loss": 0.60843521, "learning_rate": 1.313418851605015e-06, "loss": 0.62881905, "num_input_tokens_seen": 223315050, "router_z_loss_clip": 0.01422119, "router_z_loss_mlp": 0.23828125, "step": 10365, "time_per_iteration": 3.087602138519287 }, { "auxiliary_loss_clip": 0.01115701, "auxiliary_loss_mlp": 0.01038262, "balance_loss_clip": 1.02281213, "balance_loss_mlp": 1.04061759, "epoch": 0.6232376371561702, "flos": 19821038163840.0, "grad_norm": 1.9789667499284258, "language_loss": 0.75476778, "learning_rate": 1.3130530708224427e-06, "loss": 0.7763074, "num_input_tokens_seen": 223332130, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.75, "step": 10366, "time_per_iteration": 2.475154161453247 }, { "auxiliary_loss_clip": 0.01113533, "auxiliary_loss_mlp": 0.01039359, "balance_loss_clip": 1.02523255, "balance_loss_mlp": 1.03996718, "epoch": 0.6232977604088381, "flos": 23258264376960.0, "grad_norm": 3.12225518207838, "language_loss": 0.76590806, "learning_rate": 1.3126873160889665e-06, "loss": 0.78743696, "num_input_tokens_seen": 223351605, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.734375, "step": 10367, "time_per_iteration": 2.4738852977752686 }, { "auxiliary_loss_clip": 0.0111015, "auxiliary_loss_mlp": 0.01036578, "balance_loss_clip": 1.02335179, "balance_loss_mlp": 1.04108524, "epoch": 0.6233578836615061, "flos": 21106425335040.0, "grad_norm": 1.4682509999615023, "language_loss": 0.78478462, "learning_rate": 1.312321587418457e-06, "loss": 0.80625188, "num_input_tokens_seen": 223372090, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6875, "step": 10368, "time_per_iteration": 2.493734836578369 }, { "auxiliary_loss_clip": 0.01113125, "auxiliary_loss_mlp": 0.01031401, "balance_loss_clip": 1.01797819, "balance_loss_mlp": 1.0406853, "epoch": 0.623418006914174, "flos": 23769416868480.0, "grad_norm": 2.0554668758537837, "language_loss": 0.68695903, "learning_rate": 1.3119558848247811e-06, "loss": 0.70840424, "num_input_tokens_seen": 223390110, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.72265625, "step": 10369, "time_per_iteration": 2.4823625087738037 }, { "auxiliary_loss_clip": 0.01113465, "auxiliary_loss_mlp": 0.01041764, "balance_loss_clip": 1.02735186, "balance_loss_mlp": 1.04098248, "epoch": 0.6234781301668421, "flos": 17890480857600.0, "grad_norm": 2.348763200066112, "language_loss": 0.88284099, "learning_rate": 1.3115902083218072e-06, "loss": 0.90439332, "num_input_tokens_seen": 223404205, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7265625, "step": 10370, "time_per_iteration": 2.418369770050049 }, { "auxiliary_loss_clip": 0.0110761, "auxiliary_loss_mlp": 0.01028436, "balance_loss_clip": 1.01500106, "balance_loss_mlp": 1.03772128, "epoch": 0.62353825341951, "flos": 26175503352960.0, "grad_norm": 1.463423449199534, "language_loss": 0.66097212, "learning_rate": 1.311224557923402e-06, "loss": 0.68233263, "num_input_tokens_seen": 223424855, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.69921875, "step": 10371, "time_per_iteration": 2.5164198875427246 }, { "auxiliary_loss_clip": 0.01103335, "auxiliary_loss_mlp": 0.01031089, "balance_loss_clip": 1.01965106, "balance_loss_mlp": 1.03635788, "epoch": 0.623598376672178, "flos": 31139902160640.0, "grad_norm": 1.394768611732998, "language_loss": 0.77407277, "learning_rate": 1.3108589336434298e-06, "loss": 0.79541707, "num_input_tokens_seen": 223447225, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.671875, "step": 10372, "time_per_iteration": 2.5687973499298096 }, { "auxiliary_loss_clip": 0.01108364, "auxiliary_loss_mlp": 0.01036508, "balance_loss_clip": 1.02250707, "balance_loss_mlp": 1.03621721, "epoch": 0.6236584999248459, "flos": 23730202195200.0, "grad_norm": 1.6006845093493067, "language_loss": 0.77463573, "learning_rate": 1.3104933354957568e-06, "loss": 0.79608446, "num_input_tokens_seen": 223467520, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.72265625, "step": 10373, "time_per_iteration": 2.4856221675872803 }, { "auxiliary_loss_clip": 0.01106125, "auxiliary_loss_mlp": 0.01028887, "balance_loss_clip": 1.01626861, "balance_loss_mlp": 1.037624, "epoch": 0.6237186231775139, "flos": 21762764599680.0, "grad_norm": 1.840717749992959, "language_loss": 0.69726485, "learning_rate": 1.3101277634942448e-06, "loss": 0.71861494, "num_input_tokens_seen": 223488130, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 10374, "time_per_iteration": 2.4743056297302246 }, { "auxiliary_loss_clip": 0.0111191, "auxiliary_loss_mlp": 0.01031186, "balance_loss_clip": 1.01821589, "balance_loss_mlp": 1.03960109, "epoch": 0.6237787464301818, "flos": 14939486075520.0, "grad_norm": 2.386179343765959, "language_loss": 0.77293324, "learning_rate": 1.3097622176527577e-06, "loss": 0.79436421, "num_input_tokens_seen": 223505105, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 10375, "time_per_iteration": 2.4280037879943848 }, { "auxiliary_loss_clip": 0.01108048, "auxiliary_loss_mlp": 0.01031475, "balance_loss_clip": 1.01871955, "balance_loss_mlp": 1.03868556, "epoch": 0.6238388696828499, "flos": 35590311302400.0, "grad_norm": 1.4842761205256525, "language_loss": 0.70337582, "learning_rate": 1.3093966979851566e-06, "loss": 0.72477102, "num_input_tokens_seen": 223528065, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 10376, "time_per_iteration": 2.6053268909454346 }, { "auxiliary_loss_clip": 0.01111773, "auxiliary_loss_mlp": 0.0103491, "balance_loss_clip": 1.02074206, "balance_loss_mlp": 1.03930473, "epoch": 0.6238989929355178, "flos": 23623511823360.0, "grad_norm": 1.621733064960336, "language_loss": 0.76649714, "learning_rate": 1.309031204505301e-06, "loss": 0.78796399, "num_input_tokens_seen": 223547305, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.72265625, "step": 10377, "time_per_iteration": 2.474682569503784 }, { "auxiliary_loss_clip": 0.01110693, "auxiliary_loss_mlp": 0.01031156, "balance_loss_clip": 1.01928878, "balance_loss_mlp": 1.03958082, "epoch": 0.6239591161881858, "flos": 22087468569600.0, "grad_norm": 1.7042956907473927, "language_loss": 0.68214715, "learning_rate": 1.308665737227052e-06, "loss": 0.70356566, "num_input_tokens_seen": 223567205, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.7109375, "step": 10378, "time_per_iteration": 3.925098419189453 }, { "auxiliary_loss_clip": 0.01107957, "auxiliary_loss_mlp": 0.01032684, "balance_loss_clip": 1.01966619, "balance_loss_mlp": 1.0373795, "epoch": 0.6240192394408538, "flos": 24535930124160.0, "grad_norm": 1.966724938952943, "language_loss": 0.7627753, "learning_rate": 1.3083002961642675e-06, "loss": 0.78418171, "num_input_tokens_seen": 223586560, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 10379, "time_per_iteration": 2.5111052989959717 }, { "auxiliary_loss_clip": 0.01106721, "auxiliary_loss_mlp": 0.01024243, "balance_loss_clip": 1.01143432, "balance_loss_mlp": 1.03668702, "epoch": 0.6240793626935217, "flos": 27931930502400.0, "grad_norm": 1.4171106980801047, "language_loss": 0.79408467, "learning_rate": 1.3079348813308051e-06, "loss": 0.81539434, "num_input_tokens_seen": 223610595, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 10380, "time_per_iteration": 2.5511858463287354 }, { "auxiliary_loss_clip": 0.01107576, "auxiliary_loss_mlp": 0.01030624, "balance_loss_clip": 1.01897681, "balance_loss_mlp": 1.03938878, "epoch": 0.6241394859461897, "flos": 22892514140160.0, "grad_norm": 1.7999136271168756, "language_loss": 0.79561228, "learning_rate": 1.3075694927405207e-06, "loss": 0.81699425, "num_input_tokens_seen": 223630230, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.68359375, "step": 10381, "time_per_iteration": 3.990893602371216 }, { "auxiliary_loss_clip": 0.01109494, "auxiliary_loss_mlp": 0.01033911, "balance_loss_clip": 1.02106047, "balance_loss_mlp": 1.03820264, "epoch": 0.6241996091988576, "flos": 12750766744320.0, "grad_norm": 1.891580400407987, "language_loss": 0.74602222, "learning_rate": 1.3072041304072718e-06, "loss": 0.76745629, "num_input_tokens_seen": 223648360, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 10382, "time_per_iteration": 3.814688205718994 }, { "auxiliary_loss_clip": 0.01106016, "auxiliary_loss_mlp": 0.0102785, "balance_loss_clip": 1.01499343, "balance_loss_mlp": 1.03753853, "epoch": 0.6242597324515257, "flos": 25851302173440.0, "grad_norm": 1.4211708996872512, "language_loss": 0.78552914, "learning_rate": 1.306838794344911e-06, "loss": 0.80686784, "num_input_tokens_seen": 223671255, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 10383, "time_per_iteration": 2.5239815711975098 }, { "auxiliary_loss_clip": 0.01107367, "auxiliary_loss_mlp": 0.01032286, "balance_loss_clip": 1.02000141, "balance_loss_mlp": 1.03847516, "epoch": 0.6243198557041936, "flos": 19937712516480.0, "grad_norm": 2.020288516694399, "language_loss": 0.75267655, "learning_rate": 1.3064734845672925e-06, "loss": 0.77407312, "num_input_tokens_seen": 223689860, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 10384, "time_per_iteration": 3.9721758365631104 }, { "auxiliary_loss_clip": 0.01110455, "auxiliary_loss_mlp": 0.01031242, "balance_loss_clip": 1.01790822, "balance_loss_mlp": 1.03872442, "epoch": 0.6243799789568616, "flos": 18406194376320.0, "grad_norm": 1.7301948611084994, "language_loss": 0.66419339, "learning_rate": 1.3061082010882694e-06, "loss": 0.68561035, "num_input_tokens_seen": 223707835, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 10385, "time_per_iteration": 2.430624485015869 }, { "auxiliary_loss_clip": 0.01034233, "auxiliary_loss_mlp": 0.01002102, "balance_loss_clip": 1.00070715, "balance_loss_mlp": 1.01035547, "epoch": 0.6244401022095295, "flos": 66027587523840.0, "grad_norm": 0.7584899504076541, "language_loss": 0.6195454, "learning_rate": 1.305742943921692e-06, "loss": 0.63990873, "num_input_tokens_seen": 223771875, "router_z_loss_clip": 0.01397705, "router_z_loss_mlp": 0.23828125, "step": 10386, "time_per_iteration": 3.1230597496032715 }, { "auxiliary_loss_clip": 0.01108515, "auxiliary_loss_mlp": 0.01033127, "balance_loss_clip": 1.01984107, "balance_loss_mlp": 1.03758001, "epoch": 0.6245002254621975, "flos": 24571266128640.0, "grad_norm": 2.6599465882864872, "language_loss": 0.71922052, "learning_rate": 1.3053777130814128e-06, "loss": 0.74063694, "num_input_tokens_seen": 223788895, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 10387, "time_per_iteration": 2.4692540168762207 }, { "auxiliary_loss_clip": 0.01115036, "auxiliary_loss_mlp": 0.01042812, "balance_loss_clip": 1.02737427, "balance_loss_mlp": 1.0396837, "epoch": 0.6245603487148654, "flos": 29168837291520.0, "grad_norm": 2.3483070694855592, "language_loss": 0.65487707, "learning_rate": 1.3050125085812798e-06, "loss": 0.6764555, "num_input_tokens_seen": 223810385, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.75390625, "step": 10388, "time_per_iteration": 2.5489320755004883 }, { "auxiliary_loss_clip": 0.01107594, "auxiliary_loss_mlp": 0.01025897, "balance_loss_clip": 1.01372552, "balance_loss_mlp": 1.03750122, "epoch": 0.6246204719675335, "flos": 14790097411200.0, "grad_norm": 2.551854966363755, "language_loss": 0.78823829, "learning_rate": 1.3046473304351417e-06, "loss": 0.80957317, "num_input_tokens_seen": 223826040, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 10389, "time_per_iteration": 2.4318134784698486 }, { "auxiliary_loss_clip": 0.01106751, "auxiliary_loss_mlp": 0.01032426, "balance_loss_clip": 1.01908016, "balance_loss_mlp": 1.03715754, "epoch": 0.6246805952202014, "flos": 12493538472960.0, "grad_norm": 1.8031878928673661, "language_loss": 0.60591477, "learning_rate": 1.3042821786568475e-06, "loss": 0.62730652, "num_input_tokens_seen": 223842300, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 10390, "time_per_iteration": 2.4633984565734863 }, { "auxiliary_loss_clip": 0.01111216, "auxiliary_loss_mlp": 0.01034984, "balance_loss_clip": 1.02148962, "balance_loss_mlp": 1.03898931, "epoch": 0.6247407184728694, "flos": 12786677366400.0, "grad_norm": 9.76456813690225, "language_loss": 0.77136934, "learning_rate": 1.3039170532602416e-06, "loss": 0.79283136, "num_input_tokens_seen": 223858320, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 10391, "time_per_iteration": 2.438699960708618 }, { "auxiliary_loss_clip": 0.01112007, "auxiliary_loss_mlp": 0.01029927, "balance_loss_clip": 1.01608109, "balance_loss_mlp": 1.04029059, "epoch": 0.6248008417255374, "flos": 40629188960640.0, "grad_norm": 1.7902763012189138, "language_loss": 0.64795315, "learning_rate": 1.3035519542591718e-06, "loss": 0.6693725, "num_input_tokens_seen": 223883545, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 10392, "time_per_iteration": 2.670650005340576 }, { "auxiliary_loss_clip": 0.01113206, "auxiliary_loss_mlp": 0.01032679, "balance_loss_clip": 1.01931, "balance_loss_mlp": 1.04039538, "epoch": 0.6248609649782053, "flos": 19902017376000.0, "grad_norm": 1.78668429024679, "language_loss": 0.76756585, "learning_rate": 1.3031868816674819e-06, "loss": 0.78902471, "num_input_tokens_seen": 223901445, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 10393, "time_per_iteration": 2.4761855602264404 }, { "auxiliary_loss_clip": 0.01114288, "auxiliary_loss_mlp": 0.01035418, "balance_loss_clip": 1.02102971, "balance_loss_mlp": 1.04153562, "epoch": 0.6249210882308733, "flos": 19682746801920.0, "grad_norm": 2.0274841385291644, "language_loss": 0.823291, "learning_rate": 1.3028218354990142e-06, "loss": 0.84478807, "num_input_tokens_seen": 223920170, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7265625, "step": 10394, "time_per_iteration": 2.470454454421997 }, { "auxiliary_loss_clip": 0.01112895, "auxiliary_loss_mlp": 0.01036702, "balance_loss_clip": 1.02253997, "balance_loss_mlp": 1.03995335, "epoch": 0.6249812114835412, "flos": 13990726189440.0, "grad_norm": 1.9308178706385413, "language_loss": 0.75148553, "learning_rate": 1.3024568157676128e-06, "loss": 0.77298152, "num_input_tokens_seen": 223936495, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.73046875, "step": 10395, "time_per_iteration": 2.421931028366089 }, { "auxiliary_loss_clip": 0.01111736, "auxiliary_loss_mlp": 0.01034737, "balance_loss_clip": 1.0209204, "balance_loss_mlp": 1.0384444, "epoch": 0.6250413347362093, "flos": 14530031965440.0, "grad_norm": 4.068712789064435, "language_loss": 0.72967649, "learning_rate": 1.302091822487119e-06, "loss": 0.75114119, "num_input_tokens_seen": 223950070, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 10396, "time_per_iteration": 2.422165632247925 }, { "auxiliary_loss_clip": 0.0111078, "auxiliary_loss_mlp": 0.01038328, "balance_loss_clip": 1.0251075, "balance_loss_mlp": 1.0399915, "epoch": 0.6251014579888772, "flos": 22963006581120.0, "grad_norm": 1.7009750921587905, "language_loss": 0.75642252, "learning_rate": 1.3017268556713732e-06, "loss": 0.77791357, "num_input_tokens_seen": 223970065, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 10397, "time_per_iteration": 2.4694855213165283 }, { "auxiliary_loss_clip": 0.01109214, "auxiliary_loss_mlp": 0.01032942, "balance_loss_clip": 1.0190239, "balance_loss_mlp": 1.03795838, "epoch": 0.6251615812415452, "flos": 28111232217600.0, "grad_norm": 2.3498378206256407, "language_loss": 0.75238633, "learning_rate": 1.3013619153342154e-06, "loss": 0.77380782, "num_input_tokens_seen": 223990315, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7109375, "step": 10398, "time_per_iteration": 2.5432076454162598 }, { "auxiliary_loss_clip": 0.01112222, "auxiliary_loss_mlp": 0.01031449, "balance_loss_clip": 1.0160408, "balance_loss_mlp": 1.03808391, "epoch": 0.6252217044942131, "flos": 26724469887360.0, "grad_norm": 1.7973095184461536, "language_loss": 0.73619568, "learning_rate": 1.300997001489483e-06, "loss": 0.75763232, "num_input_tokens_seen": 224009960, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.7421875, "step": 10399, "time_per_iteration": 2.5092644691467285 }, { "auxiliary_loss_clip": 0.01113852, "auxiliary_loss_mlp": 0.01035177, "balance_loss_clip": 1.02147985, "balance_loss_mlp": 1.04179537, "epoch": 0.6252818277468811, "flos": 20006768413440.0, "grad_norm": 1.7660336525103326, "language_loss": 0.74453908, "learning_rate": 1.3006321141510147e-06, "loss": 0.76602936, "num_input_tokens_seen": 224028870, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 10400, "time_per_iteration": 2.4911017417907715 }, { "auxiliary_loss_clip": 0.01035015, "auxiliary_loss_mlp": 0.01000677, "balance_loss_clip": 0.99925804, "balance_loss_mlp": 1.01100564, "epoch": 0.625341950999549, "flos": 59278285059840.0, "grad_norm": 0.836530238466171, "language_loss": 0.56510007, "learning_rate": 1.3002672533326465e-06, "loss": 0.58545697, "num_input_tokens_seen": 224094140, "router_z_loss_clip": 0.01416016, "router_z_loss_mlp": 0.24023438, "step": 10401, "time_per_iteration": 3.171463966369629 }, { "auxiliary_loss_clip": 0.01111197, "auxiliary_loss_mlp": 0.01033222, "balance_loss_clip": 1.01954877, "balance_loss_mlp": 1.0387218, "epoch": 0.625402074252217, "flos": 20157090831360.0, "grad_norm": 2.263407560837372, "language_loss": 0.83403397, "learning_rate": 1.2999024190482146e-06, "loss": 0.85547817, "num_input_tokens_seen": 224113235, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 10402, "time_per_iteration": 2.4692347049713135 }, { "auxiliary_loss_clip": 0.01108864, "auxiliary_loss_mlp": 0.01031478, "balance_loss_clip": 1.01828134, "balance_loss_mlp": 1.03804922, "epoch": 0.625462197504885, "flos": 29132531619840.0, "grad_norm": 2.01309756972675, "language_loss": 0.69205129, "learning_rate": 1.2995376113115527e-06, "loss": 0.71345466, "num_input_tokens_seen": 224134530, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 10403, "time_per_iteration": 2.5251967906951904 }, { "auxiliary_loss_clip": 0.01113632, "auxiliary_loss_mlp": 0.01031858, "balance_loss_clip": 1.01661694, "balance_loss_mlp": 1.04024124, "epoch": 0.625522320757553, "flos": 26104436294400.0, "grad_norm": 1.966431322028808, "language_loss": 0.71760577, "learning_rate": 1.2991728301364954e-06, "loss": 0.73906064, "num_input_tokens_seen": 224154170, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.734375, "step": 10404, "time_per_iteration": 2.5016887187957764 }, { "auxiliary_loss_clip": 0.01113038, "auxiliary_loss_mlp": 0.01037465, "balance_loss_clip": 1.023458, "balance_loss_mlp": 1.04149294, "epoch": 0.625582444010221, "flos": 20630967984000.0, "grad_norm": 1.9036668589310093, "language_loss": 0.69514048, "learning_rate": 1.2988080755368742e-06, "loss": 0.71664548, "num_input_tokens_seen": 224172730, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71484375, "step": 10405, "time_per_iteration": 2.4661593437194824 }, { "auxiliary_loss_clip": 0.01112454, "auxiliary_loss_mlp": 0.01033095, "balance_loss_clip": 1.01987493, "balance_loss_mlp": 1.04119015, "epoch": 0.6256425672628889, "flos": 20521512264960.0, "grad_norm": 1.6173779703907096, "language_loss": 0.78969377, "learning_rate": 1.2984433475265207e-06, "loss": 0.81114924, "num_input_tokens_seen": 224192620, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 10406, "time_per_iteration": 2.4644439220428467 }, { "auxiliary_loss_clip": 0.01111286, "auxiliary_loss_mlp": 0.01034742, "balance_loss_clip": 1.02115262, "balance_loss_mlp": 1.04016685, "epoch": 0.6257026905155569, "flos": 29529200488320.0, "grad_norm": 1.8986300604567592, "language_loss": 0.68898523, "learning_rate": 1.2980786461192666e-06, "loss": 0.71044552, "num_input_tokens_seen": 224214660, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 10407, "time_per_iteration": 2.514728546142578 }, { "auxiliary_loss_clip": 0.01108767, "auxiliary_loss_mlp": 0.01031418, "balance_loss_clip": 1.01899028, "balance_loss_mlp": 1.04034531, "epoch": 0.6257628137682248, "flos": 24024885373440.0, "grad_norm": 1.7814323390395428, "language_loss": 0.85426563, "learning_rate": 1.2977139713289398e-06, "loss": 0.87566745, "num_input_tokens_seen": 224234170, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 10408, "time_per_iteration": 2.5032167434692383 }, { "auxiliary_loss_clip": 0.01108732, "auxiliary_loss_mlp": 0.01036301, "balance_loss_clip": 1.02335477, "balance_loss_mlp": 1.03827918, "epoch": 0.6258229370208929, "flos": 20850956830080.0, "grad_norm": 1.811358419992253, "language_loss": 0.79535204, "learning_rate": 1.2973493231693699e-06, "loss": 0.81680238, "num_input_tokens_seen": 224253115, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 10409, "time_per_iteration": 2.457472085952759 }, { "auxiliary_loss_clip": 0.01110068, "auxiliary_loss_mlp": 0.01031335, "balance_loss_clip": 1.0179894, "balance_loss_mlp": 1.03963757, "epoch": 0.6258830602735608, "flos": 22231542021120.0, "grad_norm": 2.0556133706412845, "language_loss": 0.69718635, "learning_rate": 1.2969847016543845e-06, "loss": 0.71860039, "num_input_tokens_seen": 224271375, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 10410, "time_per_iteration": 2.4713923931121826 }, { "auxiliary_loss_clip": 0.01109916, "auxiliary_loss_mlp": 0.01028487, "balance_loss_clip": 1.01589847, "balance_loss_mlp": 1.04172003, "epoch": 0.6259431835262288, "flos": 25076887925760.0, "grad_norm": 2.396968708055094, "language_loss": 0.67611301, "learning_rate": 1.2966201067978086e-06, "loss": 0.69749701, "num_input_tokens_seen": 224290315, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 10411, "time_per_iteration": 2.4845142364501953 }, { "auxiliary_loss_clip": 0.01111304, "auxiliary_loss_mlp": 0.01034467, "balance_loss_clip": 1.0211637, "balance_loss_mlp": 1.03978467, "epoch": 0.6260033067788967, "flos": 28252288926720.0, "grad_norm": 1.7943057812506777, "language_loss": 0.69246751, "learning_rate": 1.2962555386134702e-06, "loss": 0.71392524, "num_input_tokens_seen": 224310545, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 10412, "time_per_iteration": 2.515306234359741 }, { "auxiliary_loss_clip": 0.01106069, "auxiliary_loss_mlp": 0.01033005, "balance_loss_clip": 1.02047622, "balance_loss_mlp": 1.03769827, "epoch": 0.6260634300315647, "flos": 23367432787200.0, "grad_norm": 2.1368858860656985, "language_loss": 0.69318616, "learning_rate": 1.2958909971151908e-06, "loss": 0.71457696, "num_input_tokens_seen": 224331115, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 10413, "time_per_iteration": 2.4780333042144775 }, { "auxiliary_loss_clip": 0.01112748, "auxiliary_loss_mlp": 0.01034449, "balance_loss_clip": 1.0200243, "balance_loss_mlp": 1.03847182, "epoch": 0.6261235532842326, "flos": 18035308494720.0, "grad_norm": 5.901927243432721, "language_loss": 0.80850893, "learning_rate": 1.295526482316796e-06, "loss": 0.82998091, "num_input_tokens_seen": 224347525, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7421875, "step": 10414, "time_per_iteration": 2.4437930583953857 }, { "auxiliary_loss_clip": 0.0111329, "auxiliary_loss_mlp": 0.01036559, "balance_loss_clip": 1.02339268, "balance_loss_mlp": 1.04284561, "epoch": 0.6261836765369007, "flos": 22011265866240.0, "grad_norm": 1.6518873177939606, "language_loss": 0.74470532, "learning_rate": 1.2951619942321083e-06, "loss": 0.76620382, "num_input_tokens_seen": 224367045, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 10415, "time_per_iteration": 2.46600341796875 }, { "auxiliary_loss_clip": 0.01110394, "auxiliary_loss_mlp": 0.01031459, "balance_loss_clip": 1.01876366, "balance_loss_mlp": 1.04018211, "epoch": 0.6262437997895686, "flos": 24936010784640.0, "grad_norm": 1.5530216070409641, "language_loss": 0.74567419, "learning_rate": 1.2947975328749472e-06, "loss": 0.7670927, "num_input_tokens_seen": 224388860, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 10416, "time_per_iteration": 2.519254446029663 }, { "auxiliary_loss_clip": 0.01108081, "auxiliary_loss_mlp": 0.0103131, "balance_loss_clip": 1.01851869, "balance_loss_mlp": 1.04023468, "epoch": 0.6263039230422366, "flos": 31608428186880.0, "grad_norm": 2.0362305403749676, "language_loss": 0.83890939, "learning_rate": 1.2944330982591352e-06, "loss": 0.86030328, "num_input_tokens_seen": 224409645, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6796875, "step": 10417, "time_per_iteration": 2.5368337631225586 }, { "auxiliary_loss_clip": 0.01108648, "auxiliary_loss_mlp": 0.01031598, "balance_loss_clip": 1.01783574, "balance_loss_mlp": 1.03799546, "epoch": 0.6263640462949046, "flos": 17639465639040.0, "grad_norm": 4.009205712986234, "language_loss": 0.57416111, "learning_rate": 1.2940686903984904e-06, "loss": 0.59556359, "num_input_tokens_seen": 224428530, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.70703125, "step": 10418, "time_per_iteration": 2.462143898010254 }, { "auxiliary_loss_clip": 0.0111552, "auxiliary_loss_mlp": 0.01041051, "balance_loss_clip": 1.02632856, "balance_loss_mlp": 1.04047263, "epoch": 0.6264241695475725, "flos": 19974951941760.0, "grad_norm": 2.1829994779526993, "language_loss": 0.85020351, "learning_rate": 1.2937043093068316e-06, "loss": 0.87176919, "num_input_tokens_seen": 224447175, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.75, "step": 10419, "time_per_iteration": 2.4513182640075684 }, { "auxiliary_loss_clip": 0.0111332, "auxiliary_loss_mlp": 0.0103275, "balance_loss_clip": 1.01987529, "balance_loss_mlp": 1.04185939, "epoch": 0.6264842928002405, "flos": 27344323912320.0, "grad_norm": 1.770448120326755, "language_loss": 0.6439352, "learning_rate": 1.2933399549979762e-06, "loss": 0.66539598, "num_input_tokens_seen": 224469445, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 10420, "time_per_iteration": 3.970446825027466 }, { "auxiliary_loss_clip": 0.01112862, "auxiliary_loss_mlp": 0.01031036, "balance_loss_clip": 1.01713037, "balance_loss_mlp": 1.04007363, "epoch": 0.6265444160529084, "flos": 22997265177600.0, "grad_norm": 3.0090774877671067, "language_loss": 0.86004633, "learning_rate": 1.292975627485741e-06, "loss": 0.88148534, "num_input_tokens_seen": 224486590, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 10421, "time_per_iteration": 2.467116355895996 }, { "auxiliary_loss_clip": 0.01111277, "auxiliary_loss_mlp": 0.01030133, "balance_loss_clip": 1.01687133, "balance_loss_mlp": 1.04033673, "epoch": 0.6266045393055765, "flos": 19938323047680.0, "grad_norm": 2.6663853949873637, "language_loss": 0.79491389, "learning_rate": 1.2926113267839403e-06, "loss": 0.81632805, "num_input_tokens_seen": 224502795, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 10422, "time_per_iteration": 2.4547059535980225 }, { "auxiliary_loss_clip": 0.01106665, "auxiliary_loss_mlp": 0.01027969, "balance_loss_clip": 1.01427221, "balance_loss_mlp": 1.03739667, "epoch": 0.6266646625582444, "flos": 24389091325440.0, "grad_norm": 2.639759884748808, "language_loss": 0.7500779, "learning_rate": 1.292247052906389e-06, "loss": 0.77142423, "num_input_tokens_seen": 224522300, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.69140625, "step": 10423, "time_per_iteration": 3.9444527626037598 }, { "auxiliary_loss_clip": 0.01108753, "auxiliary_loss_mlp": 0.01031539, "balance_loss_clip": 1.01833057, "balance_loss_mlp": 1.03848052, "epoch": 0.6267247858109124, "flos": 14683802088960.0, "grad_norm": 2.4228967308355345, "language_loss": 0.77827829, "learning_rate": 1.2918828058669004e-06, "loss": 0.79968119, "num_input_tokens_seen": 224538260, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 10424, "time_per_iteration": 3.90702748298645 }, { "auxiliary_loss_clip": 0.01108076, "auxiliary_loss_mlp": 0.01031178, "balance_loss_clip": 1.01681972, "balance_loss_mlp": 1.03895044, "epoch": 0.6267849090635803, "flos": 24929977299840.0, "grad_norm": 2.1827282606890313, "language_loss": 0.68832576, "learning_rate": 1.2915185856792868e-06, "loss": 0.70971829, "num_input_tokens_seen": 224559155, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.69140625, "step": 10425, "time_per_iteration": 3.9569754600524902 }, { "auxiliary_loss_clip": 0.01105614, "auxiliary_loss_mlp": 0.01030017, "balance_loss_clip": 1.01816714, "balance_loss_mlp": 1.03925312, "epoch": 0.6268450323162483, "flos": 25337851211520.0, "grad_norm": 7.118223653843939, "language_loss": 0.7446872, "learning_rate": 1.2911543923573598e-06, "loss": 0.76604348, "num_input_tokens_seen": 224578660, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6640625, "step": 10426, "time_per_iteration": 2.5440239906311035 }, { "auxiliary_loss_clip": 0.01110163, "auxiliary_loss_mlp": 0.01034996, "balance_loss_clip": 1.02156675, "balance_loss_mlp": 1.03910542, "epoch": 0.6269051555689162, "flos": 26177299032960.0, "grad_norm": 1.556129437473156, "language_loss": 0.8051455, "learning_rate": 1.290790225914929e-06, "loss": 0.82659715, "num_input_tokens_seen": 224599080, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 10427, "time_per_iteration": 2.537767171859741 }, { "auxiliary_loss_clip": 0.01111396, "auxiliary_loss_mlp": 0.01035352, "balance_loss_clip": 1.02182174, "balance_loss_mlp": 1.04004884, "epoch": 0.6269652788215843, "flos": 18256877539200.0, "grad_norm": 2.1697983067469657, "language_loss": 0.68460083, "learning_rate": 1.2904260863658034e-06, "loss": 0.70606828, "num_input_tokens_seen": 224614225, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 10428, "time_per_iteration": 2.416261911392212 }, { "auxiliary_loss_clip": 0.01110499, "auxiliary_loss_mlp": 0.01035324, "balance_loss_clip": 1.02301526, "balance_loss_mlp": 1.04111707, "epoch": 0.6270254020742522, "flos": 11765413877760.0, "grad_norm": 1.7011671380690185, "language_loss": 0.71669674, "learning_rate": 1.2900619737237928e-06, "loss": 0.73815501, "num_input_tokens_seen": 224632365, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 10429, "time_per_iteration": 2.4345695972442627 }, { "auxiliary_loss_clip": 0.01113197, "auxiliary_loss_mlp": 0.01032665, "balance_loss_clip": 1.01897359, "balance_loss_mlp": 1.04106116, "epoch": 0.6270855253269202, "flos": 23475631530240.0, "grad_norm": 1.5891254301279725, "language_loss": 0.79889721, "learning_rate": 1.2896978880027023e-06, "loss": 0.82035583, "num_input_tokens_seen": 224651125, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 10430, "time_per_iteration": 2.520785331726074 }, { "auxiliary_loss_clip": 0.01036912, "auxiliary_loss_mlp": 0.01000864, "balance_loss_clip": 0.99955863, "balance_loss_mlp": 1.01334929, "epoch": 0.6271456485795882, "flos": 70064520232320.0, "grad_norm": 1.0705216740902652, "language_loss": 0.59168446, "learning_rate": 1.2893338292163393e-06, "loss": 0.61206222, "num_input_tokens_seen": 224716115, "router_z_loss_clip": 0.01306152, "router_z_loss_mlp": 0.23632812, "step": 10431, "time_per_iteration": 3.2039883136749268 }, { "auxiliary_loss_clip": 0.01036849, "auxiliary_loss_mlp": 0.00999985, "balance_loss_clip": 0.99866754, "balance_loss_mlp": 1.01318359, "epoch": 0.6272057718322561, "flos": 65156718280320.0, "grad_norm": 0.8737727792357537, "language_loss": 0.63795441, "learning_rate": 1.2889697973785095e-06, "loss": 0.65832275, "num_input_tokens_seen": 224782930, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.23632812, "step": 10432, "time_per_iteration": 3.1638309955596924 }, { "auxiliary_loss_clip": 0.01106915, "auxiliary_loss_mlp": 0.01033397, "balance_loss_clip": 1.0215534, "balance_loss_mlp": 1.03748763, "epoch": 0.6272658950849241, "flos": 24389342720640.0, "grad_norm": 1.8356888460554484, "language_loss": 0.65055978, "learning_rate": 1.2886057925030153e-06, "loss": 0.67196292, "num_input_tokens_seen": 224802010, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 10433, "time_per_iteration": 2.492363452911377 }, { "auxiliary_loss_clip": 0.01113783, "auxiliary_loss_mlp": 0.01036093, "balance_loss_clip": 1.02239597, "balance_loss_mlp": 1.0406909, "epoch": 0.627326018337592, "flos": 17966001202560.0, "grad_norm": 2.3608827045429814, "language_loss": 0.6138984, "learning_rate": 1.2882418146036612e-06, "loss": 0.6353972, "num_input_tokens_seen": 224818875, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 10434, "time_per_iteration": 2.475783109664917 }, { "auxiliary_loss_clip": 0.01109317, "auxiliary_loss_mlp": 0.01026142, "balance_loss_clip": 1.01274884, "balance_loss_mlp": 1.03769326, "epoch": 0.6273861415902601, "flos": 20230097224320.0, "grad_norm": 2.6198729295289453, "language_loss": 0.84494126, "learning_rate": 1.2878778636942484e-06, "loss": 0.86629587, "num_input_tokens_seen": 224837790, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 10435, "time_per_iteration": 2.462059497833252 }, { "auxiliary_loss_clip": 0.01036885, "auxiliary_loss_mlp": 0.0100079, "balance_loss_clip": 0.99953806, "balance_loss_mlp": 1.01309681, "epoch": 0.627446264842928, "flos": 64953210798720.0, "grad_norm": 0.7364326979822798, "language_loss": 0.61523259, "learning_rate": 1.2875139397885786e-06, "loss": 0.63560927, "num_input_tokens_seen": 224899685, "router_z_loss_clip": 0.01251221, "router_z_loss_mlp": 0.23828125, "step": 10436, "time_per_iteration": 3.103950023651123 }, { "auxiliary_loss_clip": 0.0111346, "auxiliary_loss_mlp": 0.01038611, "balance_loss_clip": 1.02415705, "balance_loss_mlp": 1.04282618, "epoch": 0.627506388095596, "flos": 23584261236480.0, "grad_norm": 1.5062270008848424, "language_loss": 0.77521813, "learning_rate": 1.2871500429004523e-06, "loss": 0.79673886, "num_input_tokens_seen": 224918650, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.70703125, "step": 10437, "time_per_iteration": 2.48987078666687 }, { "auxiliary_loss_clip": 0.01037269, "auxiliary_loss_mlp": 0.01000223, "balance_loss_clip": 0.99896538, "balance_loss_mlp": 1.01348329, "epoch": 0.6275665113482639, "flos": 67583631674880.0, "grad_norm": 0.7464239608878397, "language_loss": 0.54246557, "learning_rate": 1.2867861730436667e-06, "loss": 0.56284058, "num_input_tokens_seen": 224981575, "router_z_loss_clip": 0.01257324, "router_z_loss_mlp": 0.23828125, "step": 10438, "time_per_iteration": 3.0323846340179443 }, { "auxiliary_loss_clip": 0.01108942, "auxiliary_loss_mlp": 0.01036739, "balance_loss_clip": 1.02328014, "balance_loss_mlp": 1.03875303, "epoch": 0.6276266346009319, "flos": 27636924101760.0, "grad_norm": 2.8730735625051573, "language_loss": 0.84390283, "learning_rate": 1.2864223302320214e-06, "loss": 0.86535966, "num_input_tokens_seen": 225000820, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 10439, "time_per_iteration": 2.576383352279663 }, { "auxiliary_loss_clip": 0.01112571, "auxiliary_loss_mlp": 0.01040677, "balance_loss_clip": 1.02664578, "balance_loss_mlp": 1.04029346, "epoch": 0.6276867578535998, "flos": 22746142218240.0, "grad_norm": 2.1126252581875433, "language_loss": 0.80449343, "learning_rate": 1.2860585144793128e-06, "loss": 0.82602596, "num_input_tokens_seen": 225017585, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.72265625, "step": 10440, "time_per_iteration": 2.46567702293396 }, { "auxiliary_loss_clip": 0.01104435, "auxiliary_loss_mlp": 0.01032005, "balance_loss_clip": 1.0205729, "balance_loss_mlp": 1.03796625, "epoch": 0.6277468811062679, "flos": 24644200694400.0, "grad_norm": 2.120247209399359, "language_loss": 0.74936783, "learning_rate": 1.285694725799337e-06, "loss": 0.77073222, "num_input_tokens_seen": 225039085, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6640625, "step": 10441, "time_per_iteration": 2.527190685272217 }, { "auxiliary_loss_clip": 0.01108047, "auxiliary_loss_mlp": 0.01032255, "balance_loss_clip": 1.01895118, "balance_loss_mlp": 1.03898573, "epoch": 0.6278070043589358, "flos": 19678975873920.0, "grad_norm": 3.2825728366706257, "language_loss": 0.72284961, "learning_rate": 1.2853309642058884e-06, "loss": 0.74425268, "num_input_tokens_seen": 225058105, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69140625, "step": 10442, "time_per_iteration": 2.4586033821105957 }, { "auxiliary_loss_clip": 0.01110655, "auxiliary_loss_mlp": 0.01033188, "balance_loss_clip": 1.02048016, "balance_loss_mlp": 1.04030132, "epoch": 0.6278671276116038, "flos": 22121834906880.0, "grad_norm": 1.5894108132890488, "language_loss": 0.72053778, "learning_rate": 1.284967229712762e-06, "loss": 0.74197614, "num_input_tokens_seen": 225077605, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 10443, "time_per_iteration": 2.5197594165802 }, { "auxiliary_loss_clip": 0.01109928, "auxiliary_loss_mlp": 0.01029933, "balance_loss_clip": 1.01733232, "balance_loss_mlp": 1.04007208, "epoch": 0.6279272508642717, "flos": 23038562839680.0, "grad_norm": 3.5099170209073027, "language_loss": 0.73615825, "learning_rate": 1.2846035223337492e-06, "loss": 0.75755692, "num_input_tokens_seen": 225097775, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 10444, "time_per_iteration": 2.487793207168579 }, { "auxiliary_loss_clip": 0.01108423, "auxiliary_loss_mlp": 0.01029027, "balance_loss_clip": 1.01599193, "balance_loss_mlp": 1.0400126, "epoch": 0.6279873741169397, "flos": 19824090819840.0, "grad_norm": 1.9833954465798487, "language_loss": 0.72626573, "learning_rate": 1.2842398420826423e-06, "loss": 0.74764025, "num_input_tokens_seen": 225115585, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.68359375, "step": 10445, "time_per_iteration": 2.4713735580444336 }, { "auxiliary_loss_clip": 0.01108572, "auxiliary_loss_mlp": 0.01028363, "balance_loss_clip": 1.01558995, "balance_loss_mlp": 1.03804874, "epoch": 0.6280474973696077, "flos": 23915393740800.0, "grad_norm": 1.5563215695601973, "language_loss": 0.69192874, "learning_rate": 1.2838761889732331e-06, "loss": 0.71329808, "num_input_tokens_seen": 225135575, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 10446, "time_per_iteration": 2.4865152835845947 }, { "auxiliary_loss_clip": 0.01114022, "auxiliary_loss_mlp": 0.01033886, "balance_loss_clip": 1.01961672, "balance_loss_mlp": 1.03946435, "epoch": 0.6281076206222757, "flos": 17967976450560.0, "grad_norm": 2.592974313730909, "language_loss": 0.73371857, "learning_rate": 1.2835125630193102e-06, "loss": 0.7551977, "num_input_tokens_seen": 225154230, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.74609375, "step": 10447, "time_per_iteration": 2.449342966079712 }, { "auxiliary_loss_clip": 0.01036885, "auxiliary_loss_mlp": 0.01006484, "balance_loss_clip": 1.00523794, "balance_loss_mlp": 1.01316047, "epoch": 0.6281677438749437, "flos": 66778370622720.0, "grad_norm": 0.6786470546238731, "language_loss": 0.5234862, "learning_rate": 1.2831489642346626e-06, "loss": 0.54391986, "num_input_tokens_seen": 225213650, "router_z_loss_clip": 0.01245117, "router_z_loss_mlp": 0.23730469, "step": 10448, "time_per_iteration": 2.9618377685546875 }, { "auxiliary_loss_clip": 0.01111594, "auxiliary_loss_mlp": 0.01042173, "balance_loss_clip": 1.02866066, "balance_loss_mlp": 1.04086566, "epoch": 0.6282278671276116, "flos": 11656173640320.0, "grad_norm": 2.2936438695906616, "language_loss": 0.91184938, "learning_rate": 1.282785392633079e-06, "loss": 0.93338704, "num_input_tokens_seen": 225230135, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.70703125, "step": 10449, "time_per_iteration": 2.4373273849487305 }, { "auxiliary_loss_clip": 0.01107139, "auxiliary_loss_mlp": 0.01029471, "balance_loss_clip": 1.01749659, "balance_loss_mlp": 1.03758502, "epoch": 0.6282879903802796, "flos": 42741597847680.0, "grad_norm": 11.744698082756367, "language_loss": 0.60245425, "learning_rate": 1.2824218482283438e-06, "loss": 0.62382036, "num_input_tokens_seen": 225253520, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 10450, "time_per_iteration": 2.6755800247192383 }, { "auxiliary_loss_clip": 0.01107022, "auxiliary_loss_mlp": 0.01032714, "balance_loss_clip": 1.02038777, "balance_loss_mlp": 1.03992224, "epoch": 0.6283481136329475, "flos": 20009210538240.0, "grad_norm": 1.7225703740283762, "language_loss": 0.76736522, "learning_rate": 1.2820583310342452e-06, "loss": 0.78876257, "num_input_tokens_seen": 225272460, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.671875, "step": 10451, "time_per_iteration": 2.4612250328063965 }, { "auxiliary_loss_clip": 0.01109049, "auxiliary_loss_mlp": 0.01033851, "balance_loss_clip": 1.02109528, "balance_loss_mlp": 1.03781199, "epoch": 0.6284082368856155, "flos": 21904431840000.0, "grad_norm": 2.583856604561038, "language_loss": 0.77807206, "learning_rate": 1.281694841064566e-06, "loss": 0.79950106, "num_input_tokens_seen": 225291700, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 10452, "time_per_iteration": 2.50205659866333 }, { "auxiliary_loss_clip": 0.01110444, "auxiliary_loss_mlp": 0.01033235, "balance_loss_clip": 1.02002645, "balance_loss_mlp": 1.04068196, "epoch": 0.6284683601382834, "flos": 25484187219840.0, "grad_norm": 2.241961248384879, "language_loss": 0.72623658, "learning_rate": 1.2813313783330904e-06, "loss": 0.74767339, "num_input_tokens_seen": 225311470, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 10453, "time_per_iteration": 2.5265655517578125 }, { "auxiliary_loss_clip": 0.01107889, "auxiliary_loss_mlp": 0.01031443, "balance_loss_clip": 1.01786506, "balance_loss_mlp": 1.03717637, "epoch": 0.6285284833909515, "flos": 16538695395840.0, "grad_norm": 1.6781555851440917, "language_loss": 0.80726206, "learning_rate": 1.2809679428536013e-06, "loss": 0.82865536, "num_input_tokens_seen": 225328385, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.70703125, "step": 10454, "time_per_iteration": 2.440023422241211 }, { "auxiliary_loss_clip": 0.01108073, "auxiliary_loss_mlp": 0.01032243, "balance_loss_clip": 1.02003598, "balance_loss_mlp": 1.04004586, "epoch": 0.6285886066436194, "flos": 22820692896000.0, "grad_norm": 2.200606025816854, "language_loss": 0.82060599, "learning_rate": 1.2806045346398792e-06, "loss": 0.84200919, "num_input_tokens_seen": 225348415, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 10455, "time_per_iteration": 2.4791154861450195 }, { "auxiliary_loss_clip": 0.01108444, "auxiliary_loss_mlp": 0.01031898, "balance_loss_clip": 1.01915407, "balance_loss_mlp": 1.03875446, "epoch": 0.6286487298962874, "flos": 24715734629760.0, "grad_norm": 1.5114530772324737, "language_loss": 0.81746638, "learning_rate": 1.280241153705706e-06, "loss": 0.83886975, "num_input_tokens_seen": 225367740, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 10456, "time_per_iteration": 2.511897087097168 }, { "auxiliary_loss_clip": 0.01112765, "auxiliary_loss_mlp": 0.01030949, "balance_loss_clip": 1.01726341, "balance_loss_mlp": 1.04086196, "epoch": 0.6287088531489553, "flos": 20740818752640.0, "grad_norm": 1.8203417926699477, "language_loss": 0.72191215, "learning_rate": 1.27987780006486e-06, "loss": 0.74334931, "num_input_tokens_seen": 225388405, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 10457, "time_per_iteration": 2.467102289199829 }, { "auxiliary_loss_clip": 0.01113187, "auxiliary_loss_mlp": 0.01031326, "balance_loss_clip": 1.01793873, "balance_loss_mlp": 1.03886509, "epoch": 0.6287689764016233, "flos": 23070630706560.0, "grad_norm": 1.8963274934482328, "language_loss": 0.7969172, "learning_rate": 1.2795144737311202e-06, "loss": 0.8183623, "num_input_tokens_seen": 225408360, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.74609375, "step": 10458, "time_per_iteration": 2.48306941986084 }, { "auxiliary_loss_clip": 0.0111225, "auxiliary_loss_mlp": 0.01032469, "balance_loss_clip": 1.01962996, "balance_loss_mlp": 1.04048431, "epoch": 0.6288290996542913, "flos": 32233669251840.0, "grad_norm": 1.5405265745890147, "language_loss": 0.61476803, "learning_rate": 1.2791511747182635e-06, "loss": 0.63621521, "num_input_tokens_seen": 225431310, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 10459, "time_per_iteration": 2.551814556121826 }, { "auxiliary_loss_clip": 0.01108984, "auxiliary_loss_mlp": 0.0103108, "balance_loss_clip": 1.01876557, "balance_loss_mlp": 1.03886867, "epoch": 0.6288892229069593, "flos": 24641327606400.0, "grad_norm": 1.6836787851334387, "language_loss": 0.78626025, "learning_rate": 1.2787879030400666e-06, "loss": 0.80766088, "num_input_tokens_seen": 225450385, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 10460, "time_per_iteration": 2.5022430419921875 }, { "auxiliary_loss_clip": 0.0110909, "auxiliary_loss_mlp": 0.01026008, "balance_loss_clip": 1.01319313, "balance_loss_mlp": 1.04007125, "epoch": 0.6289493461596273, "flos": 17858341163520.0, "grad_norm": 1.8405112680037778, "language_loss": 0.74394703, "learning_rate": 1.2784246587103047e-06, "loss": 0.76529801, "num_input_tokens_seen": 225467325, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 10461, "time_per_iteration": 2.439810037612915 }, { "auxiliary_loss_clip": 0.01106145, "auxiliary_loss_mlp": 0.01034249, "balance_loss_clip": 1.02180934, "balance_loss_mlp": 1.0381912, "epoch": 0.6290094694122952, "flos": 22345379199360.0, "grad_norm": 3.5789544172271897, "language_loss": 0.70111418, "learning_rate": 1.2780614417427523e-06, "loss": 0.72251821, "num_input_tokens_seen": 225487370, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6796875, "step": 10462, "time_per_iteration": 3.8653881549835205 }, { "auxiliary_loss_clip": 0.01102067, "auxiliary_loss_mlp": 0.01030195, "balance_loss_clip": 1.01879275, "balance_loss_mlp": 1.03690112, "epoch": 0.6290695926649632, "flos": 28402431776640.0, "grad_norm": 3.069850067960388, "language_loss": 0.72566527, "learning_rate": 1.2776982521511821e-06, "loss": 0.74698788, "num_input_tokens_seen": 225506915, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.65234375, "step": 10463, "time_per_iteration": 2.5044853687286377 }, { "auxiliary_loss_clip": 0.01108169, "auxiliary_loss_mlp": 0.01037004, "balance_loss_clip": 1.02378964, "balance_loss_mlp": 1.04118729, "epoch": 0.6291297159176311, "flos": 21505464501120.0, "grad_norm": 2.8674001192132033, "language_loss": 0.7274282, "learning_rate": 1.2773350899493665e-06, "loss": 0.74887991, "num_input_tokens_seen": 225525670, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.671875, "step": 10464, "time_per_iteration": 3.9691367149353027 }, { "auxiliary_loss_clip": 0.0110801, "auxiliary_loss_mlp": 0.0103061, "balance_loss_clip": 1.01848006, "balance_loss_mlp": 1.04052103, "epoch": 0.6291898391702991, "flos": 12203308581120.0, "grad_norm": 7.634071209239923, "language_loss": 0.69338429, "learning_rate": 1.2769719551510768e-06, "loss": 0.7147705, "num_input_tokens_seen": 225542235, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.67578125, "step": 10465, "time_per_iteration": 2.4502902030944824 }, { "auxiliary_loss_clip": 0.0103636, "auxiliary_loss_mlp": 0.0100588, "balance_loss_clip": 1.00460422, "balance_loss_mlp": 1.01260185, "epoch": 0.629249962422967, "flos": 69299479434240.0, "grad_norm": 0.698776121354466, "language_loss": 0.59707689, "learning_rate": 1.2766088477700832e-06, "loss": 0.61749923, "num_input_tokens_seen": 225607185, "router_z_loss_clip": 0.01275635, "router_z_loss_mlp": 0.23828125, "step": 10466, "time_per_iteration": 5.998225212097168 }, { "auxiliary_loss_clip": 0.01103073, "auxiliary_loss_mlp": 0.0103043, "balance_loss_clip": 1.018664, "balance_loss_mlp": 1.03498471, "epoch": 0.6293100856756351, "flos": 40077888042240.0, "grad_norm": 2.887407971187555, "language_loss": 0.6486913, "learning_rate": 1.276245767820154e-06, "loss": 0.67002636, "num_input_tokens_seen": 225628785, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6796875, "step": 10467, "time_per_iteration": 2.6103367805480957 }, { "auxiliary_loss_clip": 0.01036834, "auxiliary_loss_mlp": 0.0100381, "balance_loss_clip": 1.00261199, "balance_loss_mlp": 1.01299191, "epoch": 0.629370208928303, "flos": 67501108177920.0, "grad_norm": 4.882918528318643, "language_loss": 0.56865728, "learning_rate": 1.2758827153150586e-06, "loss": 0.5890637, "num_input_tokens_seen": 225678980, "router_z_loss_clip": 0.01196289, "router_z_loss_mlp": 0.23828125, "step": 10468, "time_per_iteration": 2.858672618865967 }, { "auxiliary_loss_clip": 0.01036305, "auxiliary_loss_mlp": 0.0100412, "balance_loss_clip": 1.00279105, "balance_loss_mlp": 1.0125773, "epoch": 0.629430332180971, "flos": 60660450449280.0, "grad_norm": 0.740056066184377, "language_loss": 0.57987279, "learning_rate": 1.2755196902685626e-06, "loss": 0.60027701, "num_input_tokens_seen": 225740295, "router_z_loss_clip": 0.01330566, "router_z_loss_mlp": 0.23730469, "step": 10469, "time_per_iteration": 3.0199429988861084 }, { "auxiliary_loss_clip": 0.01035486, "auxiliary_loss_mlp": 0.01005227, "balance_loss_clip": 1.00392187, "balance_loss_mlp": 1.01176834, "epoch": 0.6294904554336389, "flos": 66869764778880.0, "grad_norm": 0.6776689012108533, "language_loss": 0.52131182, "learning_rate": 1.2751566926944329e-06, "loss": 0.54171896, "num_input_tokens_seen": 225805615, "router_z_loss_clip": 0.01306152, "router_z_loss_mlp": 0.23730469, "step": 10470, "time_per_iteration": 3.133479595184326 }, { "auxiliary_loss_clip": 0.0110668, "auxiliary_loss_mlp": 0.01031426, "balance_loss_clip": 1.01886725, "balance_loss_mlp": 1.03798497, "epoch": 0.6295505786863069, "flos": 42522794150400.0, "grad_norm": 1.6348169873824816, "language_loss": 0.74331832, "learning_rate": 1.2747937226064342e-06, "loss": 0.76469946, "num_input_tokens_seen": 225826585, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 10471, "time_per_iteration": 2.6780121326446533 }, { "auxiliary_loss_clip": 0.01108693, "auxiliary_loss_mlp": 0.01028858, "balance_loss_clip": 1.01635349, "balance_loss_mlp": 1.03833032, "epoch": 0.629610701938975, "flos": 17384140788480.0, "grad_norm": 2.064083653553016, "language_loss": 0.62956792, "learning_rate": 1.2744307800183297e-06, "loss": 0.65094346, "num_input_tokens_seen": 225844095, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 10472, "time_per_iteration": 2.4360125064849854 }, { "auxiliary_loss_clip": 0.01113151, "auxiliary_loss_mlp": 0.01031, "balance_loss_clip": 1.01819086, "balance_loss_mlp": 1.04147148, "epoch": 0.6296708251916429, "flos": 24242934885120.0, "grad_norm": 1.8634082536484096, "language_loss": 0.6928494, "learning_rate": 1.2740678649438828e-06, "loss": 0.71429086, "num_input_tokens_seen": 225864310, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 10473, "time_per_iteration": 2.519169569015503 }, { "auxiliary_loss_clip": 0.01106656, "auxiliary_loss_mlp": 0.01029062, "balance_loss_clip": 1.01652145, "balance_loss_mlp": 1.03741825, "epoch": 0.6297309484443109, "flos": 19278536077440.0, "grad_norm": 1.6470608126928956, "language_loss": 0.74554503, "learning_rate": 1.2737049773968554e-06, "loss": 0.76690221, "num_input_tokens_seen": 225883830, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 10474, "time_per_iteration": 2.494860887527466 }, { "auxiliary_loss_clip": 0.01107271, "auxiliary_loss_mlp": 0.01029942, "balance_loss_clip": 1.01715708, "balance_loss_mlp": 1.03722644, "epoch": 0.6297910716969788, "flos": 30662685043200.0, "grad_norm": 1.5016171504367626, "language_loss": 0.66305053, "learning_rate": 1.2733421173910081e-06, "loss": 0.68442273, "num_input_tokens_seen": 225905755, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 10475, "time_per_iteration": 2.6105148792266846 }, { "auxiliary_loss_clip": 0.01104648, "auxiliary_loss_mlp": 0.01031734, "balance_loss_clip": 1.01962256, "balance_loss_mlp": 1.03766704, "epoch": 0.6298511949496468, "flos": 14423018371200.0, "grad_norm": 2.0505318929598384, "language_loss": 0.90155107, "learning_rate": 1.272979284940101e-06, "loss": 0.92291486, "num_input_tokens_seen": 225922155, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 10476, "time_per_iteration": 2.411698818206787 }, { "auxiliary_loss_clip": 0.01107606, "auxiliary_loss_mlp": 0.01032301, "balance_loss_clip": 1.02005243, "balance_loss_mlp": 1.03879511, "epoch": 0.6299113182023147, "flos": 23514163845120.0, "grad_norm": 1.8739789000837492, "language_loss": 0.75563163, "learning_rate": 1.2726164800578913e-06, "loss": 0.77703071, "num_input_tokens_seen": 225941060, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 10477, "time_per_iteration": 2.532975196838379 }, { "auxiliary_loss_clip": 0.01108365, "auxiliary_loss_mlp": 0.01028523, "balance_loss_clip": 1.01559496, "balance_loss_mlp": 1.03777957, "epoch": 0.6299714414549827, "flos": 22674500542080.0, "grad_norm": 1.8295792160716098, "language_loss": 0.7065475, "learning_rate": 1.272253702758138e-06, "loss": 0.72791636, "num_input_tokens_seen": 225960870, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 10478, "time_per_iteration": 2.4604392051696777 }, { "auxiliary_loss_clip": 0.011132, "auxiliary_loss_mlp": 0.01030262, "balance_loss_clip": 1.01654077, "balance_loss_mlp": 1.04027009, "epoch": 0.6300315647076506, "flos": 14501735026560.0, "grad_norm": 3.6046646543478227, "language_loss": 0.67813456, "learning_rate": 1.2718909530545974e-06, "loss": 0.69956923, "num_input_tokens_seen": 225977895, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 10479, "time_per_iteration": 2.4548745155334473 }, { "auxiliary_loss_clip": 0.01107155, "auxiliary_loss_mlp": 0.01030624, "balance_loss_clip": 1.0181365, "balance_loss_mlp": 1.03895867, "epoch": 0.6300916879603187, "flos": 21871681614720.0, "grad_norm": 1.6478589183229473, "language_loss": 0.73555237, "learning_rate": 1.2715282309610245e-06, "loss": 0.75693011, "num_input_tokens_seen": 225997835, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 10480, "time_per_iteration": 2.4778122901916504 }, { "auxiliary_loss_clip": 0.01108684, "auxiliary_loss_mlp": 0.01034792, "balance_loss_clip": 1.02107096, "balance_loss_mlp": 1.03810644, "epoch": 0.6301518112129866, "flos": 21834047139840.0, "grad_norm": 1.9672354998880652, "language_loss": 0.78884578, "learning_rate": 1.2711655364911744e-06, "loss": 0.8102805, "num_input_tokens_seen": 226017620, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 10481, "time_per_iteration": 2.4803125858306885 }, { "auxiliary_loss_clip": 0.01036044, "auxiliary_loss_mlp": 0.01003128, "balance_loss_clip": 1.00195372, "balance_loss_mlp": 1.01204014, "epoch": 0.6302119344656546, "flos": 44334237957120.0, "grad_norm": 0.9034377007940648, "language_loss": 0.61820507, "learning_rate": 1.2708028696588e-06, "loss": 0.63859677, "num_input_tokens_seen": 226068755, "router_z_loss_clip": 0.01171875, "router_z_loss_mlp": 0.24023438, "step": 10482, "time_per_iteration": 2.8491976261138916 }, { "auxiliary_loss_clip": 0.01113668, "auxiliary_loss_mlp": 0.01035856, "balance_loss_clip": 1.0218246, "balance_loss_mlp": 1.03907609, "epoch": 0.6302720577183225, "flos": 11217919800960.0, "grad_norm": 2.926593614708118, "language_loss": 0.83501279, "learning_rate": 1.2704402304776541e-06, "loss": 0.85650802, "num_input_tokens_seen": 226084395, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.74609375, "step": 10483, "time_per_iteration": 2.444866180419922 }, { "auxiliary_loss_clip": 0.01102304, "auxiliary_loss_mlp": 0.01031546, "balance_loss_clip": 1.01935744, "balance_loss_mlp": 1.03689075, "epoch": 0.6303321809709905, "flos": 27964932122880.0, "grad_norm": 1.6765198971547841, "language_loss": 0.72516328, "learning_rate": 1.270077618961487e-06, "loss": 0.7465018, "num_input_tokens_seen": 226105890, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.65625, "step": 10484, "time_per_iteration": 2.511073112487793 }, { "auxiliary_loss_clip": 0.01107483, "auxiliary_loss_mlp": 0.01028483, "balance_loss_clip": 1.01541734, "balance_loss_mlp": 1.03695345, "epoch": 0.6303923042236586, "flos": 28220759763840.0, "grad_norm": 1.660829230141056, "language_loss": 0.74200916, "learning_rate": 1.2697150351240506e-06, "loss": 0.76336884, "num_input_tokens_seen": 226126760, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 10485, "time_per_iteration": 2.5181684494018555 }, { "auxiliary_loss_clip": 0.01111878, "auxiliary_loss_mlp": 0.010374, "balance_loss_clip": 1.02447152, "balance_loss_mlp": 1.03923905, "epoch": 0.6304524274763265, "flos": 27631034271360.0, "grad_norm": 2.257643036715305, "language_loss": 0.81058991, "learning_rate": 1.269352478979093e-06, "loss": 0.83208275, "num_input_tokens_seen": 226147315, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 10486, "time_per_iteration": 2.535865068435669 }, { "auxiliary_loss_clip": 0.01107125, "auxiliary_loss_mlp": 0.01036514, "balance_loss_clip": 1.02453947, "balance_loss_mlp": 1.03801894, "epoch": 0.6305125507289945, "flos": 17311313963520.0, "grad_norm": 1.7986612844712189, "language_loss": 0.638762, "learning_rate": 1.2689899505403628e-06, "loss": 0.66019845, "num_input_tokens_seen": 226165935, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 10487, "time_per_iteration": 2.4504053592681885 }, { "auxiliary_loss_clip": 0.0110853, "auxiliary_loss_mlp": 0.01035265, "balance_loss_clip": 1.02278328, "balance_loss_mlp": 1.03939652, "epoch": 0.6305726739816624, "flos": 25808280658560.0, "grad_norm": 2.3923460312006632, "language_loss": 0.6685285, "learning_rate": 1.2686274498216065e-06, "loss": 0.68996644, "num_input_tokens_seen": 226186890, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69140625, "step": 10488, "time_per_iteration": 2.561828136444092 }, { "auxiliary_loss_clip": 0.01108919, "auxiliary_loss_mlp": 0.01030932, "balance_loss_clip": 1.01874256, "balance_loss_mlp": 1.03796232, "epoch": 0.6306327972343304, "flos": 21797454159360.0, "grad_norm": 5.542912007575892, "language_loss": 0.67412353, "learning_rate": 1.2682649768365706e-06, "loss": 0.69552201, "num_input_tokens_seen": 226206710, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 10489, "time_per_iteration": 2.5181775093078613 }, { "auxiliary_loss_clip": 0.01114095, "auxiliary_loss_mlp": 0.01034178, "balance_loss_clip": 1.02091575, "balance_loss_mlp": 1.03956461, "epoch": 0.6306929204869983, "flos": 20777375819520.0, "grad_norm": 1.6832246544789, "language_loss": 0.6898545, "learning_rate": 1.2679025315990007e-06, "loss": 0.71133721, "num_input_tokens_seen": 226225565, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.74609375, "step": 10490, "time_per_iteration": 2.478877305984497 }, { "auxiliary_loss_clip": 0.01108688, "auxiliary_loss_mlp": 0.0103682, "balance_loss_clip": 1.02394533, "balance_loss_mlp": 1.03804493, "epoch": 0.6307530437396663, "flos": 23654214973440.0, "grad_norm": 1.8407959170975712, "language_loss": 0.78335202, "learning_rate": 1.2675401141226393e-06, "loss": 0.80480707, "num_input_tokens_seen": 226243680, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 10491, "time_per_iteration": 2.5051143169403076 }, { "auxiliary_loss_clip": 0.011063, "auxiliary_loss_mlp": 0.01032804, "balance_loss_clip": 1.02098465, "balance_loss_mlp": 1.03774619, "epoch": 0.6308131669923343, "flos": 24719002767360.0, "grad_norm": 2.520091722156329, "language_loss": 0.55278724, "learning_rate": 1.2671777244212308e-06, "loss": 0.57417828, "num_input_tokens_seen": 226264345, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 10492, "time_per_iteration": 2.5169739723205566 }, { "auxiliary_loss_clip": 0.01110392, "auxiliary_loss_mlp": 0.01036233, "balance_loss_clip": 1.02329254, "balance_loss_mlp": 1.03930604, "epoch": 0.6308732902450023, "flos": 22565403959040.0, "grad_norm": 1.904623542263946, "language_loss": 0.64473748, "learning_rate": 1.2668153625085168e-06, "loss": 0.6662038, "num_input_tokens_seen": 226283165, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 10493, "time_per_iteration": 2.503619432449341 }, { "auxiliary_loss_clip": 0.01109106, "auxiliary_loss_mlp": 0.01031764, "balance_loss_clip": 1.01882386, "balance_loss_mlp": 1.03863907, "epoch": 0.6309334134976702, "flos": 24644200694400.0, "grad_norm": 1.786679643168085, "language_loss": 0.82872295, "learning_rate": 1.2664530283982367e-06, "loss": 0.85013163, "num_input_tokens_seen": 226304080, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 10494, "time_per_iteration": 2.5283806324005127 }, { "auxiliary_loss_clip": 0.01110437, "auxiliary_loss_mlp": 0.01032106, "balance_loss_clip": 1.01908803, "balance_loss_mlp": 1.03953671, "epoch": 0.6309935367503382, "flos": 41427949651200.0, "grad_norm": 2.3989021080494815, "language_loss": 0.79410309, "learning_rate": 1.2660907221041317e-06, "loss": 0.81552851, "num_input_tokens_seen": 226325925, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 10495, "time_per_iteration": 2.663057565689087 }, { "auxiliary_loss_clip": 0.01108033, "auxiliary_loss_mlp": 0.01033401, "balance_loss_clip": 1.02013326, "balance_loss_mlp": 1.03753543, "epoch": 0.6310536600030061, "flos": 15118931445120.0, "grad_norm": 3.8709866449806625, "language_loss": 0.70254999, "learning_rate": 1.2657284436399403e-06, "loss": 0.72396433, "num_input_tokens_seen": 226344190, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 10496, "time_per_iteration": 2.4246771335601807 }, { "auxiliary_loss_clip": 0.01108416, "auxiliary_loss_mlp": 0.01039669, "balance_loss_clip": 1.02624655, "balance_loss_mlp": 1.03705013, "epoch": 0.6311137832556741, "flos": 15231619388160.0, "grad_norm": 2.268627479045406, "language_loss": 0.79695719, "learning_rate": 1.2653661930193997e-06, "loss": 0.81843805, "num_input_tokens_seen": 226361520, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 10497, "time_per_iteration": 2.481557607650757 }, { "auxiliary_loss_clip": 0.01104826, "auxiliary_loss_mlp": 0.01032307, "balance_loss_clip": 1.02049351, "balance_loss_mlp": 1.03684807, "epoch": 0.6311739065083422, "flos": 22018664067840.0, "grad_norm": 3.811331494707248, "language_loss": 0.73970723, "learning_rate": 1.265003970256247e-06, "loss": 0.7610786, "num_input_tokens_seen": 226381920, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 10498, "time_per_iteration": 2.4644086360931396 }, { "auxiliary_loss_clip": 0.01108767, "auxiliary_loss_mlp": 0.01035541, "balance_loss_clip": 1.02313089, "balance_loss_mlp": 1.03814638, "epoch": 0.6312340297610101, "flos": 22710770300160.0, "grad_norm": 2.518638894915242, "language_loss": 0.69554996, "learning_rate": 1.264641775364217e-06, "loss": 0.71699297, "num_input_tokens_seen": 226400035, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 10499, "time_per_iteration": 2.494511842727661 }, { "auxiliary_loss_clip": 0.01106743, "auxiliary_loss_mlp": 0.01044282, "balance_loss_clip": 1.03198552, "balance_loss_mlp": 1.03912497, "epoch": 0.6312941530136781, "flos": 24280102483200.0, "grad_norm": 3.0677920516486776, "language_loss": 0.70079076, "learning_rate": 1.2642796083570448e-06, "loss": 0.72230101, "num_input_tokens_seen": 226418280, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.67578125, "step": 10500, "time_per_iteration": 2.4781413078308105 }, { "auxiliary_loss_clip": 0.01108919, "auxiliary_loss_mlp": 0.01033174, "balance_loss_clip": 1.02113926, "balance_loss_mlp": 1.03883445, "epoch": 0.631354276266346, "flos": 21725956137600.0, "grad_norm": 1.9969199659775594, "language_loss": 0.74419785, "learning_rate": 1.2639174692484634e-06, "loss": 0.7656188, "num_input_tokens_seen": 226436650, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 10501, "time_per_iteration": 2.4604618549346924 }, { "auxiliary_loss_clip": 0.01107557, "auxiliary_loss_mlp": 0.01034697, "balance_loss_clip": 1.02178025, "balance_loss_mlp": 1.03799176, "epoch": 0.631414399519014, "flos": 24025100855040.0, "grad_norm": 1.7390901718122023, "language_loss": 0.75278193, "learning_rate": 1.2635553580522053e-06, "loss": 0.77420449, "num_input_tokens_seen": 226456275, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 10502, "time_per_iteration": 2.498967170715332 }, { "auxiliary_loss_clip": 0.01112878, "auxiliary_loss_mlp": 0.0104496, "balance_loss_clip": 1.03156686, "balance_loss_mlp": 1.03998113, "epoch": 0.6314745227716819, "flos": 24315797623680.0, "grad_norm": 1.9290658253653394, "language_loss": 0.85601509, "learning_rate": 1.2631932747820022e-06, "loss": 0.87759346, "num_input_tokens_seen": 226473610, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 10503, "time_per_iteration": 3.927516460418701 }, { "auxiliary_loss_clip": 0.01109219, "auxiliary_loss_mlp": 0.01037198, "balance_loss_clip": 1.02418613, "balance_loss_mlp": 1.03775299, "epoch": 0.6315346460243499, "flos": 23366391292800.0, "grad_norm": 1.978269448022401, "language_loss": 0.86635721, "learning_rate": 1.2628312194515838e-06, "loss": 0.88782144, "num_input_tokens_seen": 226493665, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 10504, "time_per_iteration": 2.5201127529144287 }, { "auxiliary_loss_clip": 0.01113273, "auxiliary_loss_mlp": 0.01039299, "balance_loss_clip": 1.02554834, "balance_loss_mlp": 1.03960884, "epoch": 0.6315947692770179, "flos": 20260333497600.0, "grad_norm": 1.6599331248366516, "language_loss": 0.76350784, "learning_rate": 1.2624691920746793e-06, "loss": 0.78503358, "num_input_tokens_seen": 226511625, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 10505, "time_per_iteration": 2.468160629272461 }, { "auxiliary_loss_clip": 0.01110125, "auxiliary_loss_mlp": 0.01039655, "balance_loss_clip": 1.02629733, "balance_loss_mlp": 1.03847587, "epoch": 0.6316548925296859, "flos": 25265850399360.0, "grad_norm": 1.9355544727766203, "language_loss": 0.81843734, "learning_rate": 1.2621071926650166e-06, "loss": 0.83993506, "num_input_tokens_seen": 226530085, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 10506, "time_per_iteration": 3.9738810062408447 }, { "auxiliary_loss_clip": 0.01111227, "auxiliary_loss_mlp": 0.01037458, "balance_loss_clip": 1.02463698, "balance_loss_mlp": 1.04051805, "epoch": 0.6317150157823538, "flos": 22930579578240.0, "grad_norm": 1.8706751217992739, "language_loss": 0.74574977, "learning_rate": 1.2617452212363238e-06, "loss": 0.76723659, "num_input_tokens_seen": 226548115, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 10507, "time_per_iteration": 3.8381965160369873 }, { "auxiliary_loss_clip": 0.0111368, "auxiliary_loss_mlp": 0.01038732, "balance_loss_clip": 1.02539873, "balance_loss_mlp": 1.03996134, "epoch": 0.6317751390350218, "flos": 22527051212160.0, "grad_norm": 1.6075166927234434, "language_loss": 0.67452514, "learning_rate": 1.2613832778023258e-06, "loss": 0.69604933, "num_input_tokens_seen": 226567955, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 10508, "time_per_iteration": 3.920670986175537 }, { "auxiliary_loss_clip": 0.01109066, "auxiliary_loss_mlp": 0.01035705, "balance_loss_clip": 1.02308106, "balance_loss_mlp": 1.03861487, "epoch": 0.6318352622876897, "flos": 23294749616640.0, "grad_norm": 1.771612213791261, "language_loss": 0.71124113, "learning_rate": 1.2610213623767478e-06, "loss": 0.73268884, "num_input_tokens_seen": 226588205, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 10509, "time_per_iteration": 2.4810569286346436 }, { "auxiliary_loss_clip": 0.01107004, "auxiliary_loss_mlp": 0.01028702, "balance_loss_clip": 1.01666176, "balance_loss_mlp": 1.03703022, "epoch": 0.6318953855403577, "flos": 20704082117760.0, "grad_norm": 1.5609371519683335, "language_loss": 0.79308665, "learning_rate": 1.2606594749733143e-06, "loss": 0.81444371, "num_input_tokens_seen": 226606965, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69921875, "step": 10510, "time_per_iteration": 2.4796297550201416 }, { "auxiliary_loss_clip": 0.0111059, "auxiliary_loss_mlp": 0.0103502, "balance_loss_clip": 1.02166247, "balance_loss_mlp": 1.03840041, "epoch": 0.6319555087930258, "flos": 22820046451200.0, "grad_norm": 1.4735362394046363, "language_loss": 0.70709872, "learning_rate": 1.2602976156057469e-06, "loss": 0.72855484, "num_input_tokens_seen": 226627845, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.72265625, "step": 10511, "time_per_iteration": 2.499880075454712 }, { "auxiliary_loss_clip": 0.01106728, "auxiliary_loss_mlp": 0.01034126, "balance_loss_clip": 1.02173984, "balance_loss_mlp": 1.03803515, "epoch": 0.6320156320456937, "flos": 19970929618560.0, "grad_norm": 1.8802155641757052, "language_loss": 0.80108738, "learning_rate": 1.2599357842877684e-06, "loss": 0.82249588, "num_input_tokens_seen": 226645855, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 10512, "time_per_iteration": 2.4678502082824707 }, { "auxiliary_loss_clip": 0.01110757, "auxiliary_loss_mlp": 0.01034603, "balance_loss_clip": 1.02105534, "balance_loss_mlp": 1.03995585, "epoch": 0.6320757552983617, "flos": 27013406889600.0, "grad_norm": 1.9574158187536548, "language_loss": 0.70614648, "learning_rate": 1.2595739810330994e-06, "loss": 0.72759998, "num_input_tokens_seen": 226665375, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.70703125, "step": 10513, "time_per_iteration": 2.51491117477417 }, { "auxiliary_loss_clip": 0.01112921, "auxiliary_loss_mlp": 0.01029505, "balance_loss_clip": 1.01661229, "balance_loss_mlp": 1.03973341, "epoch": 0.6321358785510296, "flos": 23695943598720.0, "grad_norm": 1.6232774799978866, "language_loss": 0.66565537, "learning_rate": 1.259212205855459e-06, "loss": 0.68707961, "num_input_tokens_seen": 226685270, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 10514, "time_per_iteration": 2.4814751148223877 }, { "auxiliary_loss_clip": 0.01104845, "auxiliary_loss_mlp": 0.01030666, "balance_loss_clip": 1.01817286, "balance_loss_mlp": 1.03596818, "epoch": 0.6321960018036976, "flos": 25995231970560.0, "grad_norm": 1.7457183784968495, "language_loss": 0.74256468, "learning_rate": 1.2588504587685663e-06, "loss": 0.76391983, "num_input_tokens_seen": 226705325, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 10515, "time_per_iteration": 2.5203542709350586 }, { "auxiliary_loss_clip": 0.01103816, "auxiliary_loss_mlp": 0.01029747, "balance_loss_clip": 1.01786149, "balance_loss_mlp": 1.03615475, "epoch": 0.6322561250563655, "flos": 22821016118400.0, "grad_norm": 1.9120430492516356, "language_loss": 0.89898872, "learning_rate": 1.2584887397861379e-06, "loss": 0.92032433, "num_input_tokens_seen": 226723815, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.67578125, "step": 10516, "time_per_iteration": 2.4524080753326416 }, { "auxiliary_loss_clip": 0.01115528, "auxiliary_loss_mlp": 0.01033708, "balance_loss_clip": 1.01949787, "balance_loss_mlp": 1.04084098, "epoch": 0.6323162483090335, "flos": 18988413926400.0, "grad_norm": 1.7801921242202996, "language_loss": 0.8170436, "learning_rate": 1.2581270489218911e-06, "loss": 0.8385359, "num_input_tokens_seen": 226741550, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75, "step": 10517, "time_per_iteration": 2.4727320671081543 }, { "auxiliary_loss_clip": 0.01107596, "auxiliary_loss_mlp": 0.01037399, "balance_loss_clip": 1.02463818, "balance_loss_mlp": 1.03830814, "epoch": 0.6323763715617015, "flos": 19865173000320.0, "grad_norm": 1.7619897051146367, "language_loss": 0.77953702, "learning_rate": 1.257765386189541e-06, "loss": 0.80098695, "num_input_tokens_seen": 226761115, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 10518, "time_per_iteration": 2.4493486881256104 }, { "auxiliary_loss_clip": 0.0110543, "auxiliary_loss_mlp": 0.01033416, "balance_loss_clip": 1.02057099, "balance_loss_mlp": 1.03683996, "epoch": 0.6324364948143695, "flos": 22782699285120.0, "grad_norm": 1.6102068880363856, "language_loss": 0.85285759, "learning_rate": 1.2574037516028018e-06, "loss": 0.87424606, "num_input_tokens_seen": 226782225, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 10519, "time_per_iteration": 2.504232168197632 }, { "auxiliary_loss_clip": 0.01106602, "auxiliary_loss_mlp": 0.01033974, "balance_loss_clip": 1.02204108, "balance_loss_mlp": 1.03910041, "epoch": 0.6324966180670374, "flos": 22235923480320.0, "grad_norm": 1.823424150578825, "language_loss": 0.72047651, "learning_rate": 1.2570421451753867e-06, "loss": 0.74188221, "num_input_tokens_seen": 226802375, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 10520, "time_per_iteration": 2.479649782180786 }, { "auxiliary_loss_clip": 0.01106357, "auxiliary_loss_mlp": 0.01031293, "balance_loss_clip": 1.01883006, "balance_loss_mlp": 1.037395, "epoch": 0.6325567413197054, "flos": 21689183589120.0, "grad_norm": 1.9668674480028963, "language_loss": 0.71655452, "learning_rate": 1.2566805669210081e-06, "loss": 0.73793101, "num_input_tokens_seen": 226822165, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 10521, "time_per_iteration": 2.4505138397216797 }, { "auxiliary_loss_clip": 0.01109456, "auxiliary_loss_mlp": 0.01033038, "balance_loss_clip": 1.02013326, "balance_loss_mlp": 1.03858149, "epoch": 0.6326168645723733, "flos": 19937137898880.0, "grad_norm": 1.79434111707323, "language_loss": 0.72072482, "learning_rate": 1.256319016853377e-06, "loss": 0.74214977, "num_input_tokens_seen": 226841645, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 10522, "time_per_iteration": 2.4607534408569336 }, { "auxiliary_loss_clip": 0.01110467, "auxiliary_loss_mlp": 0.01031391, "balance_loss_clip": 1.01843286, "balance_loss_mlp": 1.03897023, "epoch": 0.6326769878250413, "flos": 20230348619520.0, "grad_norm": 2.090216721418117, "language_loss": 0.81546634, "learning_rate": 1.2559574949862023e-06, "loss": 0.83688498, "num_input_tokens_seen": 226860355, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 10523, "time_per_iteration": 2.4719507694244385 }, { "auxiliary_loss_clip": 0.01106491, "auxiliary_loss_mlp": 0.01025911, "balance_loss_clip": 1.0132091, "balance_loss_mlp": 1.03715134, "epoch": 0.6327371110777094, "flos": 20775759707520.0, "grad_norm": 2.1846925046759065, "language_loss": 0.74035227, "learning_rate": 1.255596001333195e-06, "loss": 0.76167631, "num_input_tokens_seen": 226878390, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 10524, "time_per_iteration": 2.437859535217285 }, { "auxiliary_loss_clip": 0.01114476, "auxiliary_loss_mlp": 0.01040041, "balance_loss_clip": 1.0255332, "balance_loss_mlp": 1.03809404, "epoch": 0.6327972343303773, "flos": 30336544529280.0, "grad_norm": 3.023809895069726, "language_loss": 0.8459723, "learning_rate": 1.2552345359080615e-06, "loss": 0.86751747, "num_input_tokens_seen": 226898420, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.765625, "step": 10525, "time_per_iteration": 2.5573883056640625 }, { "auxiliary_loss_clip": 0.01106428, "auxiliary_loss_mlp": 0.01024065, "balance_loss_clip": 1.01203716, "balance_loss_mlp": 1.0374459, "epoch": 0.6328573575830453, "flos": 17092258871040.0, "grad_norm": 1.7743016321377991, "language_loss": 0.67115593, "learning_rate": 1.2548730987245093e-06, "loss": 0.69246078, "num_input_tokens_seen": 226916305, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 10526, "time_per_iteration": 2.4276981353759766 }, { "auxiliary_loss_clip": 0.01112213, "auxiliary_loss_mlp": 0.01037322, "balance_loss_clip": 1.02361298, "balance_loss_mlp": 1.03957117, "epoch": 0.6329174808357132, "flos": 25047154442880.0, "grad_norm": 1.4334907699841486, "language_loss": 0.73795068, "learning_rate": 1.254511689796244e-06, "loss": 0.75944602, "num_input_tokens_seen": 226937705, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 10527, "time_per_iteration": 2.540118455886841 }, { "auxiliary_loss_clip": 0.01107104, "auxiliary_loss_mlp": 0.0103192, "balance_loss_clip": 1.02005315, "balance_loss_mlp": 1.03880429, "epoch": 0.6329776040883812, "flos": 16836826279680.0, "grad_norm": 2.299965997045254, "language_loss": 0.71816552, "learning_rate": 1.2541503091369693e-06, "loss": 0.73955572, "num_input_tokens_seen": 226954880, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.68359375, "step": 10528, "time_per_iteration": 2.4512946605682373 }, { "auxiliary_loss_clip": 0.01105374, "auxiliary_loss_mlp": 0.01027384, "balance_loss_clip": 1.01403296, "balance_loss_mlp": 1.03595173, "epoch": 0.6330377273410491, "flos": 13516705382400.0, "grad_norm": 6.803587053325699, "language_loss": 0.66977626, "learning_rate": 1.2537889567603905e-06, "loss": 0.69110382, "num_input_tokens_seen": 226972595, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6953125, "step": 10529, "time_per_iteration": 2.4375529289245605 }, { "auxiliary_loss_clip": 0.01112939, "auxiliary_loss_mlp": 0.01029379, "balance_loss_clip": 1.0161413, "balance_loss_mlp": 1.04057741, "epoch": 0.6330978505937171, "flos": 21538825257600.0, "grad_norm": 2.034517763533324, "language_loss": 0.75426495, "learning_rate": 1.2534276326802092e-06, "loss": 0.77568811, "num_input_tokens_seen": 226991910, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 10530, "time_per_iteration": 2.472109794616699 }, { "auxiliary_loss_clip": 0.01111815, "auxiliary_loss_mlp": 0.01030753, "balance_loss_clip": 1.01837301, "balance_loss_mlp": 1.04056168, "epoch": 0.6331579738463851, "flos": 25009484054400.0, "grad_norm": 1.7166222236948023, "language_loss": 0.73953485, "learning_rate": 1.2530663369101259e-06, "loss": 0.76096052, "num_input_tokens_seen": 227010175, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 10531, "time_per_iteration": 2.4969635009765625 }, { "auxiliary_loss_clip": 0.01104177, "auxiliary_loss_mlp": 0.0102971, "balance_loss_clip": 1.01672792, "balance_loss_mlp": 1.03639638, "epoch": 0.6332180970990531, "flos": 14976007228800.0, "grad_norm": 2.2713172375345345, "language_loss": 0.79624087, "learning_rate": 1.2527050694638432e-06, "loss": 0.81757975, "num_input_tokens_seen": 227025540, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6796875, "step": 10532, "time_per_iteration": 2.4349749088287354 }, { "auxiliary_loss_clip": 0.01104968, "auxiliary_loss_mlp": 0.01029358, "balance_loss_clip": 1.01817584, "balance_loss_mlp": 1.03674507, "epoch": 0.633278220351721, "flos": 22706963458560.0, "grad_norm": 1.789881165148796, "language_loss": 0.74832278, "learning_rate": 1.2523438303550582e-06, "loss": 0.76966608, "num_input_tokens_seen": 227045520, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.68359375, "step": 10533, "time_per_iteration": 2.4941935539245605 }, { "auxiliary_loss_clip": 0.01113405, "auxiliary_loss_mlp": 0.01038701, "balance_loss_clip": 1.02460504, "balance_loss_mlp": 1.03867173, "epoch": 0.633338343604389, "flos": 12602922364800.0, "grad_norm": 2.6066105580189785, "language_loss": 0.76855314, "learning_rate": 1.2519826195974706e-06, "loss": 0.79007411, "num_input_tokens_seen": 227059420, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 10534, "time_per_iteration": 2.414609432220459 }, { "auxiliary_loss_clip": 0.01107928, "auxiliary_loss_mlp": 0.01034088, "balance_loss_clip": 1.02177334, "balance_loss_mlp": 1.03888726, "epoch": 0.6333984668570569, "flos": 25960111447680.0, "grad_norm": 1.532720519756126, "language_loss": 0.85395688, "learning_rate": 1.251621437204777e-06, "loss": 0.87537706, "num_input_tokens_seen": 227081310, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 10535, "time_per_iteration": 2.538228750228882 }, { "auxiliary_loss_clip": 0.01110717, "auxiliary_loss_mlp": 0.01031078, "balance_loss_clip": 1.01781011, "balance_loss_mlp": 1.03924811, "epoch": 0.6334585901097249, "flos": 23659242877440.0, "grad_norm": 2.1940084975482748, "language_loss": 0.76389784, "learning_rate": 1.2512602831906733e-06, "loss": 0.78531581, "num_input_tokens_seen": 227100365, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 10536, "time_per_iteration": 2.4804952144622803 }, { "auxiliary_loss_clip": 0.01110927, "auxiliary_loss_mlp": 0.01028293, "balance_loss_clip": 1.01600862, "balance_loss_mlp": 1.04189301, "epoch": 0.633518713362393, "flos": 28760496503040.0, "grad_norm": 2.9722122142686125, "language_loss": 0.59855592, "learning_rate": 1.250899157568855e-06, "loss": 0.61994815, "num_input_tokens_seen": 227119680, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 10537, "time_per_iteration": 2.548002004623413 }, { "auxiliary_loss_clip": 0.01033258, "auxiliary_loss_mlp": 0.00999927, "balance_loss_clip": 0.99870539, "balance_loss_mlp": 1.00993168, "epoch": 0.6335788366150609, "flos": 70420322401920.0, "grad_norm": 0.8025840494352529, "language_loss": 0.52483314, "learning_rate": 1.2505380603530155e-06, "loss": 0.545165, "num_input_tokens_seen": 227184465, "router_z_loss_clip": 0.01220703, "router_z_loss_mlp": 0.23339844, "step": 10538, "time_per_iteration": 3.1523327827453613 }, { "auxiliary_loss_clip": 0.01113025, "auxiliary_loss_mlp": 0.01032182, "balance_loss_clip": 1.01903915, "balance_loss_mlp": 1.03961515, "epoch": 0.6336389598677289, "flos": 23732069702400.0, "grad_norm": 1.9087188155341115, "language_loss": 0.8324157, "learning_rate": 1.250176991556848e-06, "loss": 0.85386777, "num_input_tokens_seen": 227202185, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 10539, "time_per_iteration": 2.477435827255249 }, { "auxiliary_loss_clip": 0.01110091, "auxiliary_loss_mlp": 0.01031122, "balance_loss_clip": 1.01743054, "balance_loss_mlp": 1.03871977, "epoch": 0.6336990831203968, "flos": 29276676898560.0, "grad_norm": 1.6670939998407033, "language_loss": 0.86953062, "learning_rate": 1.2498159511940438e-06, "loss": 0.89094269, "num_input_tokens_seen": 227222020, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71484375, "step": 10540, "time_per_iteration": 2.5343339443206787 }, { "auxiliary_loss_clip": 0.01104645, "auxiliary_loss_mlp": 0.01032045, "balance_loss_clip": 1.02069032, "balance_loss_mlp": 1.0369966, "epoch": 0.6337592063730648, "flos": 29096836479360.0, "grad_norm": 1.6182076279732143, "language_loss": 0.72496808, "learning_rate": 1.2494549392782943e-06, "loss": 0.74633503, "num_input_tokens_seen": 227240885, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.67578125, "step": 10541, "time_per_iteration": 2.519993782043457 }, { "auxiliary_loss_clip": 0.01114278, "auxiliary_loss_mlp": 0.01032933, "balance_loss_clip": 1.01854479, "balance_loss_mlp": 1.0399816, "epoch": 0.6338193296257327, "flos": 34706477249280.0, "grad_norm": 2.5733440229326168, "language_loss": 0.84741008, "learning_rate": 1.2490939558232887e-06, "loss": 0.86888218, "num_input_tokens_seen": 227257880, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7421875, "step": 10542, "time_per_iteration": 2.563624143600464 }, { "auxiliary_loss_clip": 0.0110833, "auxiliary_loss_mlp": 0.0102978, "balance_loss_clip": 1.01583254, "balance_loss_mlp": 1.03840709, "epoch": 0.6338794528784008, "flos": 16687581269760.0, "grad_norm": 1.7183781609710653, "language_loss": 0.77647924, "learning_rate": 1.2487330008427153e-06, "loss": 0.79786026, "num_input_tokens_seen": 227274840, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.69921875, "step": 10543, "time_per_iteration": 2.4354817867279053 }, { "auxiliary_loss_clip": 0.01103271, "auxiliary_loss_mlp": 0.0103677, "balance_loss_clip": 1.02458715, "balance_loss_mlp": 1.03664207, "epoch": 0.6339395761310687, "flos": 22346600261760.0, "grad_norm": 1.725247680334166, "language_loss": 0.73664916, "learning_rate": 1.2483720743502618e-06, "loss": 0.75804961, "num_input_tokens_seen": 227294835, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6640625, "step": 10544, "time_per_iteration": 2.459559440612793 }, { "auxiliary_loss_clip": 0.01112602, "auxiliary_loss_mlp": 0.01039069, "balance_loss_clip": 1.02585459, "balance_loss_mlp": 1.03850317, "epoch": 0.6339996993837367, "flos": 18551812112640.0, "grad_norm": 1.8646619318599962, "language_loss": 0.6849438, "learning_rate": 1.2480111763596144e-06, "loss": 0.70646048, "num_input_tokens_seen": 227314935, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73828125, "step": 10545, "time_per_iteration": 3.8589413166046143 }, { "auxiliary_loss_clip": 0.01103765, "auxiliary_loss_mlp": 0.01038702, "balance_loss_clip": 1.02583385, "balance_loss_mlp": 1.03603077, "epoch": 0.6340598226364046, "flos": 12969498614400.0, "grad_norm": 2.2604211452073875, "language_loss": 0.70938253, "learning_rate": 1.2476503068844592e-06, "loss": 0.73080724, "num_input_tokens_seen": 227332905, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.67578125, "step": 10546, "time_per_iteration": 2.4463729858398438 }, { "auxiliary_loss_clip": 0.01103197, "auxiliary_loss_mlp": 0.01032524, "balance_loss_clip": 1.02069235, "balance_loss_mlp": 1.03673768, "epoch": 0.6341199458890726, "flos": 26687984647680.0, "grad_norm": 1.3178744426801774, "language_loss": 0.78196287, "learning_rate": 1.2472894659384792e-06, "loss": 0.80332005, "num_input_tokens_seen": 227354915, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6640625, "step": 10547, "time_per_iteration": 2.5522701740264893 }, { "auxiliary_loss_clip": 0.01110323, "auxiliary_loss_mlp": 0.01035505, "balance_loss_clip": 1.02275586, "balance_loss_mlp": 1.0380075, "epoch": 0.6341800691417405, "flos": 18734274224640.0, "grad_norm": 1.7820311956173358, "language_loss": 0.63255912, "learning_rate": 1.2469286535353578e-06, "loss": 0.65401745, "num_input_tokens_seen": 227372990, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 10548, "time_per_iteration": 5.3075783252716064 }, { "auxiliary_loss_clip": 0.0110789, "auxiliary_loss_mlp": 0.01031108, "balance_loss_clip": 1.01869261, "balance_loss_mlp": 1.03842056, "epoch": 0.6342401923944085, "flos": 26249443499520.0, "grad_norm": 1.6913158830350485, "language_loss": 0.61912507, "learning_rate": 1.2465678696887785e-06, "loss": 0.64051509, "num_input_tokens_seen": 227393270, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 10549, "time_per_iteration": 2.539170265197754 }, { "auxiliary_loss_clip": 0.01107351, "auxiliary_loss_mlp": 0.01030689, "balance_loss_clip": 1.01882207, "balance_loss_mlp": 1.03712261, "epoch": 0.6343003156470765, "flos": 24680937329280.0, "grad_norm": 1.68345054352903, "language_loss": 0.73938113, "learning_rate": 1.2462071144124197e-06, "loss": 0.76076156, "num_input_tokens_seen": 227413630, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.703125, "step": 10550, "time_per_iteration": 3.9527480602264404 }, { "auxiliary_loss_clip": 0.01033528, "auxiliary_loss_mlp": 0.00999718, "balance_loss_clip": 0.99856204, "balance_loss_mlp": 1.00982189, "epoch": 0.6343604388997445, "flos": 69805352626560.0, "grad_norm": 0.7077182489080777, "language_loss": 0.57685369, "learning_rate": 1.2458463877199638e-06, "loss": 0.59718615, "num_input_tokens_seen": 227476630, "router_z_loss_clip": 0.01153564, "router_z_loss_mlp": 0.23632812, "step": 10551, "time_per_iteration": 3.1190247535705566 }, { "auxiliary_loss_clip": 0.01105344, "auxiliary_loss_mlp": 0.01027378, "balance_loss_clip": 1.01554048, "balance_loss_mlp": 1.0373168, "epoch": 0.6344205621524125, "flos": 21982430223360.0, "grad_norm": 1.795678900882669, "language_loss": 0.67122012, "learning_rate": 1.2454856896250881e-06, "loss": 0.69254732, "num_input_tokens_seen": 227496060, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 10552, "time_per_iteration": 2.463135242462158 }, { "auxiliary_loss_clip": 0.0110912, "auxiliary_loss_mlp": 0.01027602, "balance_loss_clip": 1.01492453, "balance_loss_mlp": 1.03659761, "epoch": 0.6344806854050804, "flos": 20448865008000.0, "grad_norm": 1.7573234358219876, "language_loss": 0.82172668, "learning_rate": 1.24512502014147e-06, "loss": 0.84309387, "num_input_tokens_seen": 227513440, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 10553, "time_per_iteration": 2.4613542556762695 }, { "auxiliary_loss_clip": 0.01109364, "auxiliary_loss_mlp": 0.01031381, "balance_loss_clip": 1.01867867, "balance_loss_mlp": 1.03787613, "epoch": 0.6345408086577484, "flos": 40510611187200.0, "grad_norm": 2.427294479507878, "language_loss": 0.54552746, "learning_rate": 1.2447643792827879e-06, "loss": 0.56693488, "num_input_tokens_seen": 227535395, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 10554, "time_per_iteration": 2.6288931369781494 }, { "auxiliary_loss_clip": 0.01111431, "auxiliary_loss_mlp": 0.01031979, "balance_loss_clip": 1.0193429, "balance_loss_mlp": 1.04041147, "epoch": 0.6346009319104163, "flos": 21361319222400.0, "grad_norm": 1.9370475013097779, "language_loss": 0.70724022, "learning_rate": 1.2444037670627153e-06, "loss": 0.72867429, "num_input_tokens_seen": 227554545, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 10555, "time_per_iteration": 2.4495317935943604 }, { "auxiliary_loss_clip": 0.010328, "auxiliary_loss_mlp": 0.01001226, "balance_loss_clip": 1.00003397, "balance_loss_mlp": 1.00920868, "epoch": 0.6346610551630844, "flos": 71365419100800.0, "grad_norm": 0.7805932747694411, "language_loss": 0.55271077, "learning_rate": 1.2440431834949276e-06, "loss": 0.57305104, "num_input_tokens_seen": 227608575, "router_z_loss_clip": 0.01190186, "router_z_loss_mlp": 0.23632812, "step": 10556, "time_per_iteration": 3.0161614418029785 }, { "auxiliary_loss_clip": 0.0110908, "auxiliary_loss_mlp": 0.0103188, "balance_loss_clip": 1.01815355, "balance_loss_mlp": 1.0370568, "epoch": 0.6347211784157523, "flos": 25411504049280.0, "grad_norm": 2.037811037999834, "language_loss": 0.68452322, "learning_rate": 1.2436826285930985e-06, "loss": 0.70593286, "num_input_tokens_seen": 227628175, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.72265625, "step": 10557, "time_per_iteration": 2.4975717067718506 }, { "auxiliary_loss_clip": 0.01106714, "auxiliary_loss_mlp": 0.01031005, "balance_loss_clip": 1.0188334, "balance_loss_mlp": 1.03785157, "epoch": 0.6347813016684203, "flos": 15742735966080.0, "grad_norm": 1.7361822290569566, "language_loss": 0.70202804, "learning_rate": 1.2433221023709002e-06, "loss": 0.72340524, "num_input_tokens_seen": 227645330, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 10558, "time_per_iteration": 2.440763235092163 }, { "auxiliary_loss_clip": 0.01106923, "auxiliary_loss_mlp": 0.01030221, "balance_loss_clip": 1.01740026, "balance_loss_mlp": 1.03687966, "epoch": 0.6348414249210882, "flos": 21464777370240.0, "grad_norm": 1.6484036704431955, "language_loss": 0.78293884, "learning_rate": 1.2429616048420031e-06, "loss": 0.80431026, "num_input_tokens_seen": 227665250, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 10559, "time_per_iteration": 2.466792106628418 }, { "auxiliary_loss_clip": 0.01111439, "auxiliary_loss_mlp": 0.01034928, "balance_loss_clip": 1.02187502, "balance_loss_mlp": 1.0394392, "epoch": 0.6349015481737562, "flos": 21653057485440.0, "grad_norm": 1.7806093897361444, "language_loss": 0.68462276, "learning_rate": 1.242601136020078e-06, "loss": 0.7060864, "num_input_tokens_seen": 227685070, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 10560, "time_per_iteration": 2.521256446838379 }, { "auxiliary_loss_clip": 0.01106907, "auxiliary_loss_mlp": 0.01033237, "balance_loss_clip": 1.02083898, "balance_loss_mlp": 1.03735757, "epoch": 0.6349616714264241, "flos": 22194984954240.0, "grad_norm": 1.9334518273486303, "language_loss": 0.76960748, "learning_rate": 1.2422406959187939e-06, "loss": 0.79100889, "num_input_tokens_seen": 227704430, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 10561, "time_per_iteration": 2.501120090484619 }, { "auxiliary_loss_clip": 0.01108415, "auxiliary_loss_mlp": 0.01030962, "balance_loss_clip": 1.01814151, "balance_loss_mlp": 1.03743744, "epoch": 0.6350217946790921, "flos": 25410354814080.0, "grad_norm": 4.487248410122546, "language_loss": 0.72526592, "learning_rate": 1.2418802845518178e-06, "loss": 0.7466597, "num_input_tokens_seen": 227724920, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 10562, "time_per_iteration": 2.513721466064453 }, { "auxiliary_loss_clip": 0.01110812, "auxiliary_loss_mlp": 0.01027705, "balance_loss_clip": 1.01478243, "balance_loss_mlp": 1.03842854, "epoch": 0.63508191793176, "flos": 19718944732800.0, "grad_norm": 1.962743593422489, "language_loss": 0.80838907, "learning_rate": 1.2415199019328185e-06, "loss": 0.82977426, "num_input_tokens_seen": 227743400, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.72265625, "step": 10563, "time_per_iteration": 2.5127739906311035 }, { "auxiliary_loss_clip": 0.01111862, "auxiliary_loss_mlp": 0.01035346, "balance_loss_clip": 1.02227437, "balance_loss_mlp": 1.03994656, "epoch": 0.6351420411844281, "flos": 18186923802240.0, "grad_norm": 2.4701244857691513, "language_loss": 0.8077507, "learning_rate": 1.2411595480754597e-06, "loss": 0.8292228, "num_input_tokens_seen": 227759990, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 10564, "time_per_iteration": 2.474456548690796 }, { "auxiliary_loss_clip": 0.01111277, "auxiliary_loss_mlp": 0.01031262, "balance_loss_clip": 1.01833963, "balance_loss_mlp": 1.04064345, "epoch": 0.6352021644370961, "flos": 33726511422720.0, "grad_norm": 1.629817751624142, "language_loss": 0.72469759, "learning_rate": 1.240799222993407e-06, "loss": 0.74612296, "num_input_tokens_seen": 227780835, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 10565, "time_per_iteration": 2.571589708328247 }, { "auxiliary_loss_clip": 0.01109303, "auxiliary_loss_mlp": 0.01031723, "balance_loss_clip": 1.01771045, "balance_loss_mlp": 1.03791785, "epoch": 0.635262287689764, "flos": 20374781207040.0, "grad_norm": 1.9815913256690554, "language_loss": 0.6896615, "learning_rate": 1.240438926700324e-06, "loss": 0.71107179, "num_input_tokens_seen": 227798580, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7109375, "step": 10566, "time_per_iteration": 2.4944703578948975 }, { "auxiliary_loss_clip": 0.01104147, "auxiliary_loss_mlp": 0.01031324, "balance_loss_clip": 1.01921821, "balance_loss_mlp": 1.03687501, "epoch": 0.635322410942432, "flos": 27525421307520.0, "grad_norm": 2.25123934074912, "language_loss": 0.69727492, "learning_rate": 1.2400786592098725e-06, "loss": 0.7186296, "num_input_tokens_seen": 227819210, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 10567, "time_per_iteration": 2.507624387741089 }, { "auxiliary_loss_clip": 0.01106873, "auxiliary_loss_mlp": 0.01029009, "balance_loss_clip": 1.01726139, "balance_loss_mlp": 1.03946269, "epoch": 0.6353825341950999, "flos": 21543601766400.0, "grad_norm": 3.340158515609436, "language_loss": 0.8455627, "learning_rate": 1.2397184205357154e-06, "loss": 0.86692154, "num_input_tokens_seen": 227838340, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.671875, "step": 10568, "time_per_iteration": 2.5029964447021484 }, { "auxiliary_loss_clip": 0.01109878, "auxiliary_loss_mlp": 0.01034947, "balance_loss_clip": 1.02148271, "balance_loss_mlp": 1.03863013, "epoch": 0.635442657447768, "flos": 31759756185600.0, "grad_norm": 1.9624528652462325, "language_loss": 0.84052408, "learning_rate": 1.2393582106915113e-06, "loss": 0.86197227, "num_input_tokens_seen": 227859170, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 10569, "time_per_iteration": 2.5338263511657715 }, { "auxiliary_loss_clip": 0.01105901, "auxiliary_loss_mlp": 0.01027283, "balance_loss_clip": 1.01451552, "balance_loss_mlp": 1.03742254, "epoch": 0.6355027807004359, "flos": 19828831415040.0, "grad_norm": 1.8257495112577085, "language_loss": 0.69217759, "learning_rate": 1.2389980296909198e-06, "loss": 0.71350944, "num_input_tokens_seen": 227878545, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 10570, "time_per_iteration": 2.469287395477295 }, { "auxiliary_loss_clip": 0.01110561, "auxiliary_loss_mlp": 0.01035355, "balance_loss_clip": 1.0220933, "balance_loss_mlp": 1.03713894, "epoch": 0.6355629039531039, "flos": 30372383324160.0, "grad_norm": 1.659617285046582, "language_loss": 0.65717506, "learning_rate": 1.2386378775476e-06, "loss": 0.67863423, "num_input_tokens_seen": 227898875, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 10571, "time_per_iteration": 2.5302133560180664 }, { "auxiliary_loss_clip": 0.01113327, "auxiliary_loss_mlp": 0.01028117, "balance_loss_clip": 1.0154568, "balance_loss_mlp": 1.04081869, "epoch": 0.6356230272057718, "flos": 17932065828480.0, "grad_norm": 2.7114322042264276, "language_loss": 0.71118063, "learning_rate": 1.2382777542752074e-06, "loss": 0.73259509, "num_input_tokens_seen": 227917130, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 10572, "time_per_iteration": 2.467773914337158 }, { "auxiliary_loss_clip": 0.0110594, "auxiliary_loss_mlp": 0.01032467, "balance_loss_clip": 1.0208261, "balance_loss_mlp": 1.03767252, "epoch": 0.6356831504584398, "flos": 25375844822400.0, "grad_norm": 1.8369462784721309, "language_loss": 0.81334662, "learning_rate": 1.2379176598873992e-06, "loss": 0.83473074, "num_input_tokens_seen": 227939550, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.68359375, "step": 10573, "time_per_iteration": 2.513131618499756 }, { "auxiliary_loss_clip": 0.01109746, "auxiliary_loss_mlp": 0.01031074, "balance_loss_clip": 1.01852703, "balance_loss_mlp": 1.03824198, "epoch": 0.6357432737111077, "flos": 46500331720320.0, "grad_norm": 1.7837654840543298, "language_loss": 0.69169605, "learning_rate": 1.2375575943978303e-06, "loss": 0.71310419, "num_input_tokens_seen": 227962200, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 10574, "time_per_iteration": 2.703111410140991 }, { "auxiliary_loss_clip": 0.0110766, "auxiliary_loss_mlp": 0.0103056, "balance_loss_clip": 1.01778674, "balance_loss_mlp": 1.0384264, "epoch": 0.6358033969637757, "flos": 17274361847040.0, "grad_norm": 2.365099847049701, "language_loss": 0.86978346, "learning_rate": 1.2371975578201525e-06, "loss": 0.89116567, "num_input_tokens_seen": 227979270, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 10575, "time_per_iteration": 2.420783758163452 }, { "auxiliary_loss_clip": 0.01108606, "auxiliary_loss_mlp": 0.01033109, "balance_loss_clip": 1.02039528, "balance_loss_mlp": 1.0390811, "epoch": 0.6358635202164437, "flos": 27125520215040.0, "grad_norm": 1.7488080501564942, "language_loss": 0.71884525, "learning_rate": 1.2368375501680204e-06, "loss": 0.74026239, "num_input_tokens_seen": 228000550, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 10576, "time_per_iteration": 2.5070204734802246 }, { "auxiliary_loss_clip": 0.01109579, "auxiliary_loss_mlp": 0.01034165, "balance_loss_clip": 1.02153444, "balance_loss_mlp": 1.03802204, "epoch": 0.6359236434691117, "flos": 27525205825920.0, "grad_norm": 1.7795268142124478, "language_loss": 0.69306386, "learning_rate": 1.236477571455085e-06, "loss": 0.71450132, "num_input_tokens_seen": 228022005, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 10577, "time_per_iteration": 2.5078134536743164 }, { "auxiliary_loss_clip": 0.01107475, "auxiliary_loss_mlp": 0.01032632, "balance_loss_clip": 1.02004361, "balance_loss_mlp": 1.03794789, "epoch": 0.6359837667217797, "flos": 39348290989440.0, "grad_norm": 1.7589347703090759, "language_loss": 0.72120976, "learning_rate": 1.2361176216949964e-06, "loss": 0.74261081, "num_input_tokens_seen": 228043770, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 10578, "time_per_iteration": 2.605381965637207 }, { "auxiliary_loss_clip": 0.01032542, "auxiliary_loss_mlp": 0.01004013, "balance_loss_clip": 1.00278533, "balance_loss_mlp": 1.00907826, "epoch": 0.6360438899744476, "flos": 56413797206400.0, "grad_norm": 0.736744446145543, "language_loss": 0.54466343, "learning_rate": 1.2357577009014044e-06, "loss": 0.56502903, "num_input_tokens_seen": 228104985, "router_z_loss_clip": 0.01226807, "router_z_loss_mlp": 0.234375, "step": 10579, "time_per_iteration": 3.163695812225342 }, { "auxiliary_loss_clip": 0.01109323, "auxiliary_loss_mlp": 0.01031956, "balance_loss_clip": 1.01917112, "balance_loss_mlp": 1.03882611, "epoch": 0.6361040132271156, "flos": 24973106555520.0, "grad_norm": 1.670639707105212, "language_loss": 0.77723265, "learning_rate": 1.2353978090879568e-06, "loss": 0.79864544, "num_input_tokens_seen": 228125620, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 10580, "time_per_iteration": 2.5161354541778564 }, { "auxiliary_loss_clip": 0.01108307, "auxiliary_loss_mlp": 0.01024431, "balance_loss_clip": 1.01231933, "balance_loss_mlp": 1.03825855, "epoch": 0.6361641364797835, "flos": 23259198130560.0, "grad_norm": 1.9072662760907808, "language_loss": 0.65931773, "learning_rate": 1.235037946268301e-06, "loss": 0.68064511, "num_input_tokens_seen": 228143495, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 10581, "time_per_iteration": 2.471900224685669 }, { "auxiliary_loss_clip": 0.01106595, "auxiliary_loss_mlp": 0.01031591, "balance_loss_clip": 1.01933002, "balance_loss_mlp": 1.03671956, "epoch": 0.6362242597324516, "flos": 25994513698560.0, "grad_norm": 1.4848246290857803, "language_loss": 0.68440318, "learning_rate": 1.2346781124560828e-06, "loss": 0.70578504, "num_input_tokens_seen": 228166500, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69921875, "step": 10582, "time_per_iteration": 2.5253639221191406 }, { "auxiliary_loss_clip": 0.01109453, "auxiliary_loss_mlp": 0.01037302, "balance_loss_clip": 1.02455211, "balance_loss_mlp": 1.03804076, "epoch": 0.6362843829851195, "flos": 25703242312320.0, "grad_norm": 3.351679680646085, "language_loss": 0.8517313, "learning_rate": 1.2343183076649473e-06, "loss": 0.87319881, "num_input_tokens_seen": 228185325, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 10583, "time_per_iteration": 2.516744375228882 }, { "auxiliary_loss_clip": 0.01106693, "auxiliary_loss_mlp": 0.01031451, "balance_loss_clip": 1.01882052, "balance_loss_mlp": 1.03859043, "epoch": 0.6363445062377875, "flos": 20522912895360.0, "grad_norm": 1.6096487260875374, "language_loss": 0.75530165, "learning_rate": 1.233958531908538e-06, "loss": 0.77668309, "num_input_tokens_seen": 228204050, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 10584, "time_per_iteration": 2.5205557346343994 }, { "auxiliary_loss_clip": 0.01110135, "auxiliary_loss_mlp": 0.01037367, "balance_loss_clip": 1.02386034, "balance_loss_mlp": 1.038517, "epoch": 0.6364046294904554, "flos": 19463799450240.0, "grad_norm": 1.947997664801956, "language_loss": 0.72808957, "learning_rate": 1.2335987852004985e-06, "loss": 0.74956465, "num_input_tokens_seen": 228222430, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 10585, "time_per_iteration": 2.453080177307129 }, { "auxiliary_loss_clip": 0.01109818, "auxiliary_loss_mlp": 0.01030767, "balance_loss_clip": 1.01848805, "balance_loss_mlp": 1.03953886, "epoch": 0.6364647527431234, "flos": 20995892208000.0, "grad_norm": 1.9614657361228964, "language_loss": 0.82821989, "learning_rate": 1.2332390675544697e-06, "loss": 0.84962571, "num_input_tokens_seen": 228241925, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 10586, "time_per_iteration": 2.4778754711151123 }, { "auxiliary_loss_clip": 0.01108271, "auxiliary_loss_mlp": 0.01024573, "balance_loss_clip": 1.01255727, "balance_loss_mlp": 1.03847313, "epoch": 0.6365248759957913, "flos": 25770789838080.0, "grad_norm": 1.480254780478122, "language_loss": 0.72483957, "learning_rate": 1.2328793789840918e-06, "loss": 0.74616808, "num_input_tokens_seen": 228262535, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 10587, "time_per_iteration": 3.869584083557129 }, { "auxiliary_loss_clip": 0.01108313, "auxiliary_loss_mlp": 0.01027876, "balance_loss_clip": 1.01574659, "balance_loss_mlp": 1.03811729, "epoch": 0.6365849992484593, "flos": 22455589104000.0, "grad_norm": 2.5253469320385906, "language_loss": 0.76968658, "learning_rate": 1.2325197195030058e-06, "loss": 0.79104853, "num_input_tokens_seen": 228281340, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 10588, "time_per_iteration": 2.461717128753662 }, { "auxiliary_loss_clip": 0.01106373, "auxiliary_loss_mlp": 0.01030317, "balance_loss_clip": 1.01770508, "balance_loss_mlp": 1.03919697, "epoch": 0.6366451225011273, "flos": 19025689265280.0, "grad_norm": 1.4403492047226871, "language_loss": 0.80042386, "learning_rate": 1.2321600891248478e-06, "loss": 0.82179075, "num_input_tokens_seen": 228300865, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.671875, "step": 10589, "time_per_iteration": 2.504318952560425 }, { "auxiliary_loss_clip": 0.01108125, "auxiliary_loss_mlp": 0.0103128, "balance_loss_clip": 1.01875091, "balance_loss_mlp": 1.03951287, "epoch": 0.6367052457537953, "flos": 25228395492480.0, "grad_norm": 1.9074865655516668, "language_loss": 0.67251968, "learning_rate": 1.231800487863257e-06, "loss": 0.6939137, "num_input_tokens_seen": 228320815, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 10590, "time_per_iteration": 5.3896403312683105 }, { "auxiliary_loss_clip": 0.01115131, "auxiliary_loss_mlp": 0.0103559, "balance_loss_clip": 1.02242327, "balance_loss_mlp": 1.03962564, "epoch": 0.6367653690064633, "flos": 19208438686080.0, "grad_norm": 1.6390778451693737, "language_loss": 0.79137874, "learning_rate": 1.2314409157318685e-06, "loss": 0.81288594, "num_input_tokens_seen": 228339065, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7578125, "step": 10591, "time_per_iteration": 3.9956483840942383 }, { "auxiliary_loss_clip": 0.01107878, "auxiliary_loss_mlp": 0.01026106, "balance_loss_clip": 1.01431561, "balance_loss_mlp": 1.03984189, "epoch": 0.6368254922591312, "flos": 23546806329600.0, "grad_norm": 1.8391815115623311, "language_loss": 0.88887352, "learning_rate": 1.231081372744317e-06, "loss": 0.91021335, "num_input_tokens_seen": 228359210, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 10592, "time_per_iteration": 2.4796268939971924 }, { "auxiliary_loss_clip": 0.01104984, "auxiliary_loss_mlp": 0.01026846, "balance_loss_clip": 1.01537752, "balance_loss_mlp": 1.03756392, "epoch": 0.6368856155117992, "flos": 26467313443200.0, "grad_norm": 1.4101259997551798, "language_loss": 0.68244302, "learning_rate": 1.2307218589142376e-06, "loss": 0.70376134, "num_input_tokens_seen": 228379630, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.671875, "step": 10593, "time_per_iteration": 2.5092995166778564 }, { "auxiliary_loss_clip": 0.01104842, "auxiliary_loss_mlp": 0.01032353, "balance_loss_clip": 1.02048576, "balance_loss_mlp": 1.03581893, "epoch": 0.6369457387644671, "flos": 33692432394240.0, "grad_norm": 2.1093936568330625, "language_loss": 0.63296479, "learning_rate": 1.2303623742552618e-06, "loss": 0.65433675, "num_input_tokens_seen": 228401410, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 10594, "time_per_iteration": 2.5943009853363037 }, { "auxiliary_loss_clip": 0.01033499, "auxiliary_loss_mlp": 0.01000893, "balance_loss_clip": 0.99965924, "balance_loss_mlp": 1.00971079, "epoch": 0.6370058620171352, "flos": 70908600908160.0, "grad_norm": 0.7667338195286908, "language_loss": 0.54601645, "learning_rate": 1.230002918781022e-06, "loss": 0.56636035, "num_input_tokens_seen": 228470335, "router_z_loss_clip": 0.0123291, "router_z_loss_mlp": 0.23828125, "step": 10595, "time_per_iteration": 3.202338218688965 }, { "auxiliary_loss_clip": 0.01110217, "auxiliary_loss_mlp": 0.01037465, "balance_loss_clip": 1.02426863, "balance_loss_mlp": 1.03837919, "epoch": 0.6370659852698031, "flos": 21141940907520.0, "grad_norm": 1.9767310018411999, "language_loss": 0.66846108, "learning_rate": 1.2296434925051493e-06, "loss": 0.68993795, "num_input_tokens_seen": 228490765, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 10596, "time_per_iteration": 2.4853644371032715 }, { "auxiliary_loss_clip": 0.01109134, "auxiliary_loss_mlp": 0.01032229, "balance_loss_clip": 1.01981354, "balance_loss_mlp": 1.03918791, "epoch": 0.6371261085224711, "flos": 20193288762240.0, "grad_norm": 2.100613967966703, "language_loss": 0.78816503, "learning_rate": 1.2292840954412718e-06, "loss": 0.80957866, "num_input_tokens_seen": 228509700, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 10597, "time_per_iteration": 2.474771499633789 }, { "auxiliary_loss_clip": 0.01108535, "auxiliary_loss_mlp": 0.01030962, "balance_loss_clip": 1.01914239, "balance_loss_mlp": 1.03885424, "epoch": 0.637186231775139, "flos": 19683536901120.0, "grad_norm": 2.2207220969911132, "language_loss": 0.74813688, "learning_rate": 1.2289247276030189e-06, "loss": 0.76953185, "num_input_tokens_seen": 228529050, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 10598, "time_per_iteration": 2.45943546295166 }, { "auxiliary_loss_clip": 0.01108725, "auxiliary_loss_mlp": 0.0103218, "balance_loss_clip": 1.01990175, "balance_loss_mlp": 1.03841329, "epoch": 0.637246355027807, "flos": 13071196995840.0, "grad_norm": 2.3696997830682793, "language_loss": 0.68016255, "learning_rate": 1.2285653890040176e-06, "loss": 0.7015717, "num_input_tokens_seen": 228544665, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 10599, "time_per_iteration": 2.4350123405456543 }, { "auxiliary_loss_clip": 0.01110541, "auxiliary_loss_mlp": 0.0103324, "balance_loss_clip": 1.02012134, "balance_loss_mlp": 1.03833938, "epoch": 0.6373064782804749, "flos": 18222654856320.0, "grad_norm": 2.0272766301386524, "language_loss": 0.80545515, "learning_rate": 1.2282060796578942e-06, "loss": 0.82689297, "num_input_tokens_seen": 228562060, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 10600, "time_per_iteration": 2.4545114040374756 }, { "auxiliary_loss_clip": 0.01106275, "auxiliary_loss_mlp": 0.01029141, "balance_loss_clip": 1.01709509, "balance_loss_mlp": 1.03714395, "epoch": 0.637366601533143, "flos": 24498475217280.0, "grad_norm": 1.7460700288052287, "language_loss": 0.79880047, "learning_rate": 1.2278467995782732e-06, "loss": 0.82015467, "num_input_tokens_seen": 228582550, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69140625, "step": 10601, "time_per_iteration": 2.516674757003784 }, { "auxiliary_loss_clip": 0.01108062, "auxiliary_loss_mlp": 0.01027824, "balance_loss_clip": 1.01533127, "balance_loss_mlp": 1.03751421, "epoch": 0.6374267247858109, "flos": 26359042872960.0, "grad_norm": 7.100866341095163, "language_loss": 0.66825497, "learning_rate": 1.2274875487787797e-06, "loss": 0.68961382, "num_input_tokens_seen": 228604960, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 10602, "time_per_iteration": 2.5314557552337646 }, { "auxiliary_loss_clip": 0.01104293, "auxiliary_loss_mlp": 0.01025082, "balance_loss_clip": 1.01291704, "balance_loss_mlp": 1.0355953, "epoch": 0.6374868480384789, "flos": 20371728551040.0, "grad_norm": 1.7464195691126696, "language_loss": 0.79711437, "learning_rate": 1.2271283272730354e-06, "loss": 0.81840813, "num_input_tokens_seen": 228622195, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 10603, "time_per_iteration": 2.4775898456573486 }, { "auxiliary_loss_clip": 0.01108141, "auxiliary_loss_mlp": 0.01030908, "balance_loss_clip": 1.01833129, "balance_loss_mlp": 1.03868365, "epoch": 0.6375469712911469, "flos": 20996251344000.0, "grad_norm": 1.9269223480546371, "language_loss": 0.77119267, "learning_rate": 1.2267691350746621e-06, "loss": 0.79258311, "num_input_tokens_seen": 228639735, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 10604, "time_per_iteration": 2.4826955795288086 }, { "auxiliary_loss_clip": 0.01109463, "auxiliary_loss_mlp": 0.01030658, "balance_loss_clip": 1.01790881, "balance_loss_mlp": 1.0369494, "epoch": 0.6376070945438148, "flos": 19715748422400.0, "grad_norm": 1.7957623902793511, "language_loss": 0.76958257, "learning_rate": 1.226409972197281e-06, "loss": 0.7909838, "num_input_tokens_seen": 228658195, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 10605, "time_per_iteration": 2.497816801071167 }, { "auxiliary_loss_clip": 0.01108083, "auxiliary_loss_mlp": 0.0102694, "balance_loss_clip": 1.01358235, "balance_loss_mlp": 1.03743625, "epoch": 0.6376672177964828, "flos": 21506757390720.0, "grad_norm": 1.941353040774688, "language_loss": 0.65815246, "learning_rate": 1.2260508386545106e-06, "loss": 0.67950273, "num_input_tokens_seen": 228677415, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.70703125, "step": 10606, "time_per_iteration": 2.5156846046447754 }, { "auxiliary_loss_clip": 0.011042, "auxiliary_loss_mlp": 0.01032748, "balance_loss_clip": 1.02113688, "balance_loss_mlp": 1.03747344, "epoch": 0.6377273410491507, "flos": 18843873598080.0, "grad_norm": 1.6721579026043756, "language_loss": 0.7512548, "learning_rate": 1.225691734459971e-06, "loss": 0.77262425, "num_input_tokens_seen": 228696450, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66796875, "step": 10607, "time_per_iteration": 2.467153787612915 }, { "auxiliary_loss_clip": 0.01108748, "auxiliary_loss_mlp": 0.01034432, "balance_loss_clip": 1.02200484, "balance_loss_mlp": 1.0386405, "epoch": 0.6377874643018188, "flos": 53062970181120.0, "grad_norm": 1.6228178415421814, "language_loss": 0.66082758, "learning_rate": 1.225332659627278e-06, "loss": 0.68225938, "num_input_tokens_seen": 228721600, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 10608, "time_per_iteration": 2.751059055328369 }, { "auxiliary_loss_clip": 0.01034109, "auxiliary_loss_mlp": 0.01000248, "balance_loss_clip": 0.99901432, "balance_loss_mlp": 1.01057792, "epoch": 0.6378475875544867, "flos": 65135026465920.0, "grad_norm": 0.7261960131536032, "language_loss": 0.51917136, "learning_rate": 1.2249736141700475e-06, "loss": 0.5395149, "num_input_tokens_seen": 228784535, "router_z_loss_clip": 0.0123291, "router_z_loss_mlp": 0.23535156, "step": 10609, "time_per_iteration": 3.0521316528320312 }, { "auxiliary_loss_clip": 0.01102709, "auxiliary_loss_mlp": 0.01022675, "balance_loss_clip": 1.01199353, "balance_loss_mlp": 1.03595114, "epoch": 0.6379077108071547, "flos": 23002759958400.0, "grad_norm": 1.8571311740746281, "language_loss": 0.74867392, "learning_rate": 1.2246145981018965e-06, "loss": 0.76992786, "num_input_tokens_seen": 228804110, "router_z_loss_clip": 0.10693359, "router_z_loss_mlp": 0.66796875, "step": 10610, "time_per_iteration": 2.48069167137146 }, { "auxiliary_loss_clip": 0.01033576, "auxiliary_loss_mlp": 0.0100011, "balance_loss_clip": 0.99888784, "balance_loss_mlp": 1.01025796, "epoch": 0.6379678340598226, "flos": 67601947610880.0, "grad_norm": 0.8622621332642644, "language_loss": 0.63106871, "learning_rate": 1.2242556114364364e-06, "loss": 0.65140557, "num_input_tokens_seen": 228867705, "router_z_loss_clip": 0.01220703, "router_z_loss_mlp": 0.23339844, "step": 10611, "time_per_iteration": 3.1218037605285645 }, { "auxiliary_loss_clip": 0.01106595, "auxiliary_loss_mlp": 0.01027535, "balance_loss_clip": 1.01532757, "balance_loss_mlp": 1.03704727, "epoch": 0.6380279573124906, "flos": 29680061610240.0, "grad_norm": 2.7987075334911684, "language_loss": 0.72742712, "learning_rate": 1.223896654187282e-06, "loss": 0.74876845, "num_input_tokens_seen": 228889215, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 10612, "time_per_iteration": 2.5520246028900146 }, { "auxiliary_loss_clip": 0.01033253, "auxiliary_loss_mlp": 0.01000701, "balance_loss_clip": 0.99946707, "balance_loss_mlp": 1.01003194, "epoch": 0.6380880805651585, "flos": 66484046580480.0, "grad_norm": 0.7118384990621209, "language_loss": 0.57846189, "learning_rate": 1.2235377263680446e-06, "loss": 0.59880149, "num_input_tokens_seen": 228948465, "router_z_loss_clip": 0.0123291, "router_z_loss_mlp": 0.23242188, "step": 10613, "time_per_iteration": 3.0055367946624756 }, { "auxiliary_loss_clip": 0.01108775, "auxiliary_loss_mlp": 0.0102839, "balance_loss_clip": 1.01561046, "balance_loss_mlp": 1.03789043, "epoch": 0.6381482038178266, "flos": 23914998691200.0, "grad_norm": 1.6773591131304053, "language_loss": 0.74954641, "learning_rate": 1.2231788279923334e-06, "loss": 0.77091801, "num_input_tokens_seen": 228967955, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 10614, "time_per_iteration": 2.4929697513580322 }, { "auxiliary_loss_clip": 0.01109849, "auxiliary_loss_mlp": 0.01030534, "balance_loss_clip": 1.01829135, "balance_loss_mlp": 1.04033566, "epoch": 0.6382083270704945, "flos": 24243042625920.0, "grad_norm": 1.8120494180013538, "language_loss": 0.7968272, "learning_rate": 1.2228199590737599e-06, "loss": 0.81823099, "num_input_tokens_seen": 228985495, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 10615, "time_per_iteration": 2.4889464378356934 }, { "auxiliary_loss_clip": 0.01033563, "auxiliary_loss_mlp": 0.00999806, "balance_loss_clip": 0.99851888, "balance_loss_mlp": 1.01030827, "epoch": 0.6382684503231625, "flos": 70775552931840.0, "grad_norm": 0.6585361321818294, "language_loss": 0.55552292, "learning_rate": 1.2224611196259305e-06, "loss": 0.57585663, "num_input_tokens_seen": 229052995, "router_z_loss_clip": 0.01287842, "router_z_loss_mlp": 0.23242188, "step": 10616, "time_per_iteration": 3.1765146255493164 }, { "auxiliary_loss_clip": 0.01106737, "auxiliary_loss_mlp": 0.01030134, "balance_loss_clip": 1.01773632, "balance_loss_mlp": 1.03721356, "epoch": 0.6383285735758305, "flos": 16544836621440.0, "grad_norm": 1.6588984718450963, "language_loss": 0.84173894, "learning_rate": 1.2221023096624538e-06, "loss": 0.86310768, "num_input_tokens_seen": 229071030, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 10617, "time_per_iteration": 2.497755527496338 }, { "auxiliary_loss_clip": 0.01107564, "auxiliary_loss_mlp": 0.01037388, "balance_loss_clip": 1.02431107, "balance_loss_mlp": 1.03714621, "epoch": 0.6383886968284984, "flos": 14427651225600.0, "grad_norm": 1.918995345464964, "language_loss": 0.86991215, "learning_rate": 1.221743529196936e-06, "loss": 0.89136165, "num_input_tokens_seen": 229088275, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 10618, "time_per_iteration": 2.462632656097412 }, { "auxiliary_loss_clip": 0.01110485, "auxiliary_loss_mlp": 0.01031806, "balance_loss_clip": 1.02055264, "balance_loss_mlp": 1.03998637, "epoch": 0.6384488200811664, "flos": 17929659617280.0, "grad_norm": 2.145434279244774, "language_loss": 0.73481083, "learning_rate": 1.2213847782429806e-06, "loss": 0.75623369, "num_input_tokens_seen": 229105190, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.703125, "step": 10619, "time_per_iteration": 2.459038019180298 }, { "auxiliary_loss_clip": 0.01113159, "auxiliary_loss_mlp": 0.01036526, "balance_loss_clip": 1.02281094, "balance_loss_mlp": 1.04012179, "epoch": 0.6385089433338343, "flos": 18515578268160.0, "grad_norm": 2.047740264123826, "language_loss": 0.76756376, "learning_rate": 1.221026056814193e-06, "loss": 0.78906059, "num_input_tokens_seen": 229122290, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 10620, "time_per_iteration": 2.4632108211517334 }, { "auxiliary_loss_clip": 0.01105832, "auxiliary_loss_mlp": 0.01028952, "balance_loss_clip": 1.01704299, "balance_loss_mlp": 1.03748786, "epoch": 0.6385690665865024, "flos": 24753620499840.0, "grad_norm": 7.054060136313578, "language_loss": 0.70737922, "learning_rate": 1.2206673649241752e-06, "loss": 0.7287271, "num_input_tokens_seen": 229141620, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 10621, "time_per_iteration": 2.500906229019165 }, { "auxiliary_loss_clip": 0.01100212, "auxiliary_loss_mlp": 0.01027129, "balance_loss_clip": 1.01617384, "balance_loss_mlp": 1.03527153, "epoch": 0.6386291898391703, "flos": 20120569678080.0, "grad_norm": 1.6162223756039078, "language_loss": 0.77742946, "learning_rate": 1.220308702586529e-06, "loss": 0.79870284, "num_input_tokens_seen": 229161570, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6484375, "step": 10622, "time_per_iteration": 2.496746301651001 }, { "auxiliary_loss_clip": 0.01105074, "auxiliary_loss_mlp": 0.01027359, "balance_loss_clip": 1.01590276, "balance_loss_mlp": 1.03759861, "epoch": 0.6386893130918383, "flos": 16867278034560.0, "grad_norm": 1.8832222529310392, "language_loss": 0.75284863, "learning_rate": 1.2199500698148546e-06, "loss": 0.77417302, "num_input_tokens_seen": 229178465, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.671875, "step": 10623, "time_per_iteration": 2.464823007583618 }, { "auxiliary_loss_clip": 0.01101673, "auxiliary_loss_mlp": 0.01029966, "balance_loss_clip": 1.01904082, "balance_loss_mlp": 1.03521347, "epoch": 0.6387494363445062, "flos": 22966274718720.0, "grad_norm": 1.4946181355384822, "language_loss": 0.76567709, "learning_rate": 1.2195914666227527e-06, "loss": 0.78699338, "num_input_tokens_seen": 229198975, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6640625, "step": 10624, "time_per_iteration": 2.506103515625 }, { "auxiliary_loss_clip": 0.01106421, "auxiliary_loss_mlp": 0.01030181, "balance_loss_clip": 1.0187968, "balance_loss_mlp": 1.03799272, "epoch": 0.6388095595971742, "flos": 22857716839680.0, "grad_norm": 4.694088019714004, "language_loss": 0.80615985, "learning_rate": 1.21923289302382e-06, "loss": 0.82752591, "num_input_tokens_seen": 229218825, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.68359375, "step": 10625, "time_per_iteration": 2.4787535667419434 }, { "auxiliary_loss_clip": 0.0110843, "auxiliary_loss_mlp": 0.01037226, "balance_loss_clip": 1.02441072, "balance_loss_mlp": 1.03857327, "epoch": 0.6388696828498421, "flos": 17311529445120.0, "grad_norm": 4.204621975467642, "language_loss": 0.72894019, "learning_rate": 1.218874349031654e-06, "loss": 0.75039673, "num_input_tokens_seen": 229236060, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 10626, "time_per_iteration": 2.447652578353882 }, { "auxiliary_loss_clip": 0.01107753, "auxiliary_loss_mlp": 0.01030099, "balance_loss_clip": 1.01779664, "balance_loss_mlp": 1.03806388, "epoch": 0.6389298061025102, "flos": 17128636369920.0, "grad_norm": 2.2403684784736, "language_loss": 0.72754419, "learning_rate": 1.2185158346598517e-06, "loss": 0.74892271, "num_input_tokens_seen": 229255160, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 10627, "time_per_iteration": 2.43574857711792 }, { "auxiliary_loss_clip": 0.01112637, "auxiliary_loss_mlp": 0.01031795, "balance_loss_clip": 1.0181812, "balance_loss_mlp": 1.03899598, "epoch": 0.6389899293551781, "flos": 27710971989120.0, "grad_norm": 1.997352368142102, "language_loss": 0.67221975, "learning_rate": 1.2181573499220064e-06, "loss": 0.69366407, "num_input_tokens_seen": 229278705, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 10628, "time_per_iteration": 3.921736240386963 }, { "auxiliary_loss_clip": 0.01102799, "auxiliary_loss_mlp": 0.01031961, "balance_loss_clip": 1.02052891, "balance_loss_mlp": 1.03720045, "epoch": 0.6390500526078461, "flos": 21215701486080.0, "grad_norm": 2.2296593873037094, "language_loss": 0.67904735, "learning_rate": 1.2177988948317135e-06, "loss": 0.70039493, "num_input_tokens_seen": 229299990, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.65625, "step": 10629, "time_per_iteration": 2.519299268722534 }, { "auxiliary_loss_clip": 0.01112127, "auxiliary_loss_mlp": 0.01040054, "balance_loss_clip": 1.02554023, "balance_loss_mlp": 1.03825426, "epoch": 0.6391101758605141, "flos": 21581056673280.0, "grad_norm": 1.8486120389066536, "language_loss": 0.75373656, "learning_rate": 1.2174404694025646e-06, "loss": 0.77525842, "num_input_tokens_seen": 229319230, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.73828125, "step": 10630, "time_per_iteration": 2.4655585289001465 }, { "auxiliary_loss_clip": 0.01105854, "auxiliary_loss_mlp": 0.01031646, "balance_loss_clip": 1.0204165, "balance_loss_mlp": 1.03777933, "epoch": 0.639170299113182, "flos": 19900473091200.0, "grad_norm": 1.7311523343881328, "language_loss": 0.70627499, "learning_rate": 1.2170820736481511e-06, "loss": 0.72764999, "num_input_tokens_seen": 229338600, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6796875, "step": 10631, "time_per_iteration": 5.343527317047119 }, { "auxiliary_loss_clip": 0.01032991, "auxiliary_loss_mlp": 0.0101038, "balance_loss_clip": 1.00914049, "balance_loss_mlp": 1.00942636, "epoch": 0.63923042236585, "flos": 69877604833920.0, "grad_norm": 0.7728149523796198, "language_loss": 0.6305232, "learning_rate": 1.2167237075820646e-06, "loss": 0.65095693, "num_input_tokens_seen": 229402420, "router_z_loss_clip": 0.01239014, "router_z_loss_mlp": 0.23535156, "step": 10632, "time_per_iteration": 3.1328036785125732 }, { "auxiliary_loss_clip": 0.01105266, "auxiliary_loss_mlp": 0.0103621, "balance_loss_clip": 1.02380013, "balance_loss_mlp": 1.03748477, "epoch": 0.639290545618518, "flos": 22674823764480.0, "grad_norm": 2.41812773033167, "language_loss": 0.66833615, "learning_rate": 1.216365371217893e-06, "loss": 0.68975097, "num_input_tokens_seen": 229419185, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 10633, "time_per_iteration": 3.962742328643799 }, { "auxiliary_loss_clip": 0.01107647, "auxiliary_loss_mlp": 0.01032494, "balance_loss_clip": 1.02078795, "balance_loss_mlp": 1.03888309, "epoch": 0.639350668871186, "flos": 19829190551040.0, "grad_norm": 1.846367296058918, "language_loss": 0.82361376, "learning_rate": 1.216007064569225e-06, "loss": 0.84501523, "num_input_tokens_seen": 229436735, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 10634, "time_per_iteration": 2.459592819213867 }, { "auxiliary_loss_clip": 0.01108979, "auxiliary_loss_mlp": 0.01035157, "balance_loss_clip": 1.0219841, "balance_loss_mlp": 1.03940344, "epoch": 0.6394107921238539, "flos": 20553328736640.0, "grad_norm": 1.7841902279766413, "language_loss": 0.75073081, "learning_rate": 1.2156487876496483e-06, "loss": 0.77217215, "num_input_tokens_seen": 229455595, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6953125, "step": 10635, "time_per_iteration": 2.4490013122558594 }, { "auxiliary_loss_clip": 0.01107467, "auxiliary_loss_mlp": 0.01033583, "balance_loss_clip": 1.02114975, "balance_loss_mlp": 1.03727508, "epoch": 0.6394709153765219, "flos": 25774991729280.0, "grad_norm": 1.7952986212200572, "language_loss": 0.71720266, "learning_rate": 1.2152905404727475e-06, "loss": 0.73861313, "num_input_tokens_seen": 229476230, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 10636, "time_per_iteration": 2.525296211242676 }, { "auxiliary_loss_clip": 0.01110399, "auxiliary_loss_mlp": 0.01035326, "balance_loss_clip": 1.02228463, "balance_loss_mlp": 1.03907716, "epoch": 0.6395310386291898, "flos": 17530153574400.0, "grad_norm": 1.932695260373112, "language_loss": 0.73713917, "learning_rate": 1.2149323230521085e-06, "loss": 0.75859642, "num_input_tokens_seen": 229494300, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 10637, "time_per_iteration": 2.456481695175171 }, { "auxiliary_loss_clip": 0.01109083, "auxiliary_loss_mlp": 0.01034831, "balance_loss_clip": 1.02184319, "balance_loss_mlp": 1.03725743, "epoch": 0.6395911618818578, "flos": 18588225525120.0, "grad_norm": 1.815251610682601, "language_loss": 0.78015339, "learning_rate": 1.2145741354013143e-06, "loss": 0.80159259, "num_input_tokens_seen": 229512985, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 10638, "time_per_iteration": 2.458621025085449 }, { "auxiliary_loss_clip": 0.01105641, "auxiliary_loss_mlp": 0.01031401, "balance_loss_clip": 1.01874125, "balance_loss_mlp": 1.03692007, "epoch": 0.6396512851345257, "flos": 28366557068160.0, "grad_norm": 1.7284447634632336, "language_loss": 0.81697583, "learning_rate": 1.2142159775339478e-06, "loss": 0.83834624, "num_input_tokens_seen": 229534270, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 10639, "time_per_iteration": 2.5421488285064697 }, { "auxiliary_loss_clip": 0.01033789, "auxiliary_loss_mlp": 0.01007553, "balance_loss_clip": 1.00634336, "balance_loss_mlp": 1.01043463, "epoch": 0.6397114083871938, "flos": 70724307202560.0, "grad_norm": 0.8154365538236661, "language_loss": 0.59043598, "learning_rate": 1.21385784946359e-06, "loss": 0.61084938, "num_input_tokens_seen": 229596455, "router_z_loss_clip": 0.01208496, "router_z_loss_mlp": 0.23339844, "step": 10640, "time_per_iteration": 3.08282470703125 }, { "auxiliary_loss_clip": 0.01104542, "auxiliary_loss_mlp": 0.01028399, "balance_loss_clip": 1.01676965, "balance_loss_mlp": 1.0372057, "epoch": 0.6397715316398617, "flos": 18142537570560.0, "grad_norm": 1.8008264103460407, "language_loss": 0.78336871, "learning_rate": 1.2134997512038215e-06, "loss": 0.80469811, "num_input_tokens_seen": 229612860, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 10641, "time_per_iteration": 2.4895379543304443 }, { "auxiliary_loss_clip": 0.01113297, "auxiliary_loss_mlp": 0.01036817, "balance_loss_clip": 1.0235436, "balance_loss_mlp": 1.03908288, "epoch": 0.6398316548925297, "flos": 25739512070400.0, "grad_norm": 1.9644870304090525, "language_loss": 0.63675654, "learning_rate": 1.2131416827682209e-06, "loss": 0.65825772, "num_input_tokens_seen": 229633960, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 10642, "time_per_iteration": 2.5066561698913574 }, { "auxiliary_loss_clip": 0.01032907, "auxiliary_loss_mlp": 0.01005445, "balance_loss_clip": 1.00410962, "balance_loss_mlp": 1.00939882, "epoch": 0.6398917781451977, "flos": 71214234756480.0, "grad_norm": 0.9430715288144207, "language_loss": 0.55990386, "learning_rate": 1.2127836441703667e-06, "loss": 0.58028734, "num_input_tokens_seen": 229686730, "router_z_loss_clip": 0.0133667, "router_z_loss_mlp": 0.234375, "step": 10643, "time_per_iteration": 3.0370078086853027 }, { "auxiliary_loss_clip": 0.01112634, "auxiliary_loss_mlp": 0.01028289, "balance_loss_clip": 1.01602268, "balance_loss_mlp": 1.04041159, "epoch": 0.6399519013978656, "flos": 20521835487360.0, "grad_norm": 2.0699687601204606, "language_loss": 0.76783389, "learning_rate": 1.2124256354238358e-06, "loss": 0.7892431, "num_input_tokens_seen": 229704800, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.72265625, "step": 10644, "time_per_iteration": 2.4548373222351074 }, { "auxiliary_loss_clip": 0.01108737, "auxiliary_loss_mlp": 0.01028873, "balance_loss_clip": 1.01600409, "balance_loss_mlp": 1.03976333, "epoch": 0.6400120246505336, "flos": 24460840742400.0, "grad_norm": 1.9260437111422144, "language_loss": 0.82661045, "learning_rate": 1.212067656542203e-06, "loss": 0.84798658, "num_input_tokens_seen": 229725265, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 10645, "time_per_iteration": 2.5153019428253174 }, { "auxiliary_loss_clip": 0.01112147, "auxiliary_loss_mlp": 0.01037206, "balance_loss_clip": 1.02295482, "balance_loss_mlp": 1.03826046, "epoch": 0.6400721479032015, "flos": 28366090191360.0, "grad_norm": 1.8403243136361256, "language_loss": 0.73704481, "learning_rate": 1.2117097075390447e-06, "loss": 0.75853837, "num_input_tokens_seen": 229744840, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.734375, "step": 10646, "time_per_iteration": 2.510413646697998 }, { "auxiliary_loss_clip": 0.01109393, "auxiliary_loss_mlp": 0.01032341, "balance_loss_clip": 1.01900125, "balance_loss_mlp": 1.03834486, "epoch": 0.6401322711558696, "flos": 17816540711040.0, "grad_norm": 2.3229675953951663, "language_loss": 0.79858279, "learning_rate": 1.2113517884279327e-06, "loss": 0.82000017, "num_input_tokens_seen": 229759095, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.70703125, "step": 10647, "time_per_iteration": 2.433820962905884 }, { "auxiliary_loss_clip": 0.01108536, "auxiliary_loss_mlp": 0.01027996, "balance_loss_clip": 1.01579487, "balance_loss_mlp": 1.04038346, "epoch": 0.6401923944085375, "flos": 26030855283840.0, "grad_norm": 1.688846590933332, "language_loss": 0.75739276, "learning_rate": 1.2109938992224399e-06, "loss": 0.77875817, "num_input_tokens_seen": 229777750, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 10648, "time_per_iteration": 2.5061686038970947 }, { "auxiliary_loss_clip": 0.01107515, "auxiliary_loss_mlp": 0.01028672, "balance_loss_clip": 1.01652491, "balance_loss_mlp": 1.03747153, "epoch": 0.6402525176612055, "flos": 23586451966080.0, "grad_norm": 2.000409520204037, "language_loss": 0.78657067, "learning_rate": 1.210636039936138e-06, "loss": 0.8079325, "num_input_tokens_seen": 229796785, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69921875, "step": 10649, "time_per_iteration": 2.5053069591522217 }, { "auxiliary_loss_clip": 0.01108081, "auxiliary_loss_mlp": 0.01032265, "balance_loss_clip": 1.01839495, "balance_loss_mlp": 1.03852916, "epoch": 0.6403126409138734, "flos": 18041413806720.0, "grad_norm": 2.0769004868891847, "language_loss": 0.75509691, "learning_rate": 1.2102782105825956e-06, "loss": 0.77650034, "num_input_tokens_seen": 229815425, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.6953125, "step": 10650, "time_per_iteration": 2.43165922164917 }, { "auxiliary_loss_clip": 0.01108143, "auxiliary_loss_mlp": 0.0103028, "balance_loss_clip": 1.017066, "balance_loss_mlp": 1.03843307, "epoch": 0.6403727641665414, "flos": 21979485308160.0, "grad_norm": 1.5786201122638577, "language_loss": 0.70723969, "learning_rate": 1.2099204111753833e-06, "loss": 0.72862399, "num_input_tokens_seen": 229834545, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69921875, "step": 10651, "time_per_iteration": 2.5041744709014893 }, { "auxiliary_loss_clip": 0.01109217, "auxiliary_loss_mlp": 0.01036518, "balance_loss_clip": 1.02282083, "balance_loss_mlp": 1.03913426, "epoch": 0.6404328874192093, "flos": 24895539135360.0, "grad_norm": 2.7424643947237533, "language_loss": 0.64055955, "learning_rate": 1.2095626417280684e-06, "loss": 0.66201693, "num_input_tokens_seen": 229849175, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 10652, "time_per_iteration": 2.467289686203003 }, { "auxiliary_loss_clip": 0.01108718, "auxiliary_loss_mlp": 0.0102711, "balance_loss_clip": 1.01473665, "balance_loss_mlp": 1.03907657, "epoch": 0.6404930106718774, "flos": 17597198309760.0, "grad_norm": 1.9577421032067153, "language_loss": 0.79360402, "learning_rate": 1.2092049022542168e-06, "loss": 0.81496239, "num_input_tokens_seen": 229865400, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 10653, "time_per_iteration": 2.446779489517212 }, { "auxiliary_loss_clip": 0.01117062, "auxiliary_loss_mlp": 0.01044826, "balance_loss_clip": 1.0300858, "balance_loss_mlp": 1.03921223, "epoch": 0.6405531339245453, "flos": 20157880930560.0, "grad_norm": 2.902888528285601, "language_loss": 0.70095074, "learning_rate": 1.2088471927673952e-06, "loss": 0.72256964, "num_input_tokens_seen": 229882945, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.77734375, "step": 10654, "time_per_iteration": 2.4697301387786865 }, { "auxiliary_loss_clip": 0.01113612, "auxiliary_loss_mlp": 0.01035647, "balance_loss_clip": 1.02193189, "balance_loss_mlp": 1.03978717, "epoch": 0.6406132571772133, "flos": 21942281796480.0, "grad_norm": 1.7141935567221422, "language_loss": 0.72496492, "learning_rate": 1.2084895132811666e-06, "loss": 0.74645746, "num_input_tokens_seen": 229901590, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.73828125, "step": 10655, "time_per_iteration": 2.487516164779663 }, { "auxiliary_loss_clip": 0.01111727, "auxiliary_loss_mlp": 0.01032411, "balance_loss_clip": 1.01950109, "balance_loss_mlp": 1.03959107, "epoch": 0.6406733804298813, "flos": 28768002445440.0, "grad_norm": 3.3150579879601954, "language_loss": 0.82717836, "learning_rate": 1.2081318638090952e-06, "loss": 0.84861976, "num_input_tokens_seen": 229922535, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 10656, "time_per_iteration": 2.521318197250366 }, { "auxiliary_loss_clip": 0.01107755, "auxiliary_loss_mlp": 0.01031924, "balance_loss_clip": 1.01924014, "balance_loss_mlp": 1.03662384, "epoch": 0.6407335036825492, "flos": 17457183095040.0, "grad_norm": 2.6991234188683473, "language_loss": 0.72048944, "learning_rate": 1.2077742443647433e-06, "loss": 0.7418862, "num_input_tokens_seen": 229939575, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 10657, "time_per_iteration": 2.4444427490234375 }, { "auxiliary_loss_clip": 0.01109408, "auxiliary_loss_mlp": 0.01032264, "balance_loss_clip": 1.01952648, "balance_loss_mlp": 1.03872395, "epoch": 0.6407936269352172, "flos": 22125282612480.0, "grad_norm": 2.692444837980264, "language_loss": 0.7720347, "learning_rate": 1.2074166549616707e-06, "loss": 0.79345143, "num_input_tokens_seen": 229958840, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 10658, "time_per_iteration": 2.5132029056549072 }, { "auxiliary_loss_clip": 0.01110916, "auxiliary_loss_mlp": 0.01042211, "balance_loss_clip": 1.02846074, "balance_loss_mlp": 1.0388025, "epoch": 0.6408537501878852, "flos": 23110635479040.0, "grad_norm": 1.9103326123757516, "language_loss": 0.76515007, "learning_rate": 1.2070590956134386e-06, "loss": 0.78668141, "num_input_tokens_seen": 229979680, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.72265625, "step": 10659, "time_per_iteration": 2.508119583129883 }, { "auxiliary_loss_clip": 0.01109363, "auxiliary_loss_mlp": 0.01034711, "balance_loss_clip": 1.02122295, "balance_loss_mlp": 1.03842735, "epoch": 0.6409138734405532, "flos": 16472440759680.0, "grad_norm": 1.7869142600551242, "language_loss": 0.78356671, "learning_rate": 1.2067015663336046e-06, "loss": 0.80500746, "num_input_tokens_seen": 229996830, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.70703125, "step": 10660, "time_per_iteration": 2.4433555603027344 }, { "auxiliary_loss_clip": 0.01115028, "auxiliary_loss_mlp": 0.01034756, "balance_loss_clip": 1.02070165, "balance_loss_mlp": 1.04044247, "epoch": 0.6409739966932211, "flos": 22777922776320.0, "grad_norm": 2.047016688689759, "language_loss": 0.68576843, "learning_rate": 1.206344067135727e-06, "loss": 0.70726633, "num_input_tokens_seen": 230015115, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.74609375, "step": 10661, "time_per_iteration": 2.492021322250366 }, { "auxiliary_loss_clip": 0.01108959, "auxiliary_loss_mlp": 0.01033159, "balance_loss_clip": 1.02101183, "balance_loss_mlp": 1.04050446, "epoch": 0.6410341199458891, "flos": 25152049134720.0, "grad_norm": 1.6899642785544975, "language_loss": 0.76173449, "learning_rate": 1.205986598033362e-06, "loss": 0.78315568, "num_input_tokens_seen": 230035515, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 10662, "time_per_iteration": 2.5405397415161133 }, { "auxiliary_loss_clip": 0.01108788, "auxiliary_loss_mlp": 0.01029326, "balance_loss_clip": 1.01642728, "balance_loss_mlp": 1.03811634, "epoch": 0.641094243198557, "flos": 27046193028480.0, "grad_norm": 1.9348968120625392, "language_loss": 0.69676512, "learning_rate": 1.2056291590400644e-06, "loss": 0.7181462, "num_input_tokens_seen": 230054355, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 10663, "time_per_iteration": 2.522814989089966 }, { "auxiliary_loss_clip": 0.01111872, "auxiliary_loss_mlp": 0.01041682, "balance_loss_clip": 1.0267992, "balance_loss_mlp": 1.03948224, "epoch": 0.641154366451225, "flos": 25374551932800.0, "grad_norm": 2.3392846925544784, "language_loss": 0.68276763, "learning_rate": 1.205271750169389e-06, "loss": 0.70430321, "num_input_tokens_seen": 230074605, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.72265625, "step": 10664, "time_per_iteration": 2.514796257019043 }, { "auxiliary_loss_clip": 0.01105483, "auxiliary_loss_mlp": 0.01031496, "balance_loss_clip": 1.01928306, "balance_loss_mlp": 1.03690004, "epoch": 0.6412144897038929, "flos": 25153342024320.0, "grad_norm": 2.0056299056687243, "language_loss": 0.66019833, "learning_rate": 1.2049143714348881e-06, "loss": 0.68156815, "num_input_tokens_seen": 230093820, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 10665, "time_per_iteration": 2.4977853298187256 }, { "auxiliary_loss_clip": 0.0110629, "auxiliary_loss_mlp": 0.0102655, "balance_loss_clip": 1.01396108, "balance_loss_mlp": 1.03761744, "epoch": 0.641274612956561, "flos": 23440762402560.0, "grad_norm": 1.8171438499981225, "language_loss": 0.64422798, "learning_rate": 1.2045570228501145e-06, "loss": 0.66555637, "num_input_tokens_seen": 230114285, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 10666, "time_per_iteration": 2.491750478744507 }, { "auxiliary_loss_clip": 0.01109972, "auxiliary_loss_mlp": 0.01030043, "balance_loss_clip": 1.01707268, "balance_loss_mlp": 1.03847289, "epoch": 0.6413347362092289, "flos": 19427493778560.0, "grad_norm": 1.4858731064701032, "language_loss": 0.70786619, "learning_rate": 1.2041997044286176e-06, "loss": 0.72926635, "num_input_tokens_seen": 230132760, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 10667, "time_per_iteration": 2.4761977195739746 }, { "auxiliary_loss_clip": 0.01119481, "auxiliary_loss_mlp": 0.01041536, "balance_loss_clip": 1.02665842, "balance_loss_mlp": 1.04172242, "epoch": 0.6413948594618969, "flos": 17196578945280.0, "grad_norm": 2.6142739179799075, "language_loss": 0.77760446, "learning_rate": 1.2038424161839484e-06, "loss": 0.7992146, "num_input_tokens_seen": 230149690, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.77734375, "step": 10668, "time_per_iteration": 2.4274189472198486 }, { "auxiliary_loss_clip": 0.01111093, "auxiliary_loss_mlp": 0.01032484, "balance_loss_clip": 1.01976991, "balance_loss_mlp": 1.04073393, "epoch": 0.6414549827145648, "flos": 22269787027200.0, "grad_norm": 1.5527287464260957, "language_loss": 0.6766271, "learning_rate": 1.2034851581296544e-06, "loss": 0.6980629, "num_input_tokens_seen": 230166950, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 10669, "time_per_iteration": 2.468535900115967 }, { "auxiliary_loss_clip": 0.01117297, "auxiliary_loss_mlp": 0.01040034, "balance_loss_clip": 1.0262233, "balance_loss_mlp": 1.04193163, "epoch": 0.6415151059672328, "flos": 19640192163840.0, "grad_norm": 1.766116057433525, "language_loss": 0.78488946, "learning_rate": 1.2031279302792825e-06, "loss": 0.80646271, "num_input_tokens_seen": 230184785, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75390625, "step": 10670, "time_per_iteration": 3.779387950897217 }, { "auxiliary_loss_clip": 0.01112352, "auxiliary_loss_mlp": 0.01032963, "balance_loss_clip": 1.01968932, "balance_loss_mlp": 1.03867745, "epoch": 0.6415752292199008, "flos": 14865833237760.0, "grad_norm": 2.4238075313918688, "language_loss": 0.88358802, "learning_rate": 1.20277073264638e-06, "loss": 0.90504122, "num_input_tokens_seen": 230201385, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 10671, "time_per_iteration": 2.43880558013916 }, { "auxiliary_loss_clip": 0.01107072, "auxiliary_loss_mlp": 0.01026607, "balance_loss_clip": 1.01446009, "balance_loss_mlp": 1.0392586, "epoch": 0.6416353524725688, "flos": 13735580906880.0, "grad_norm": 1.4958595150214367, "language_loss": 0.69050586, "learning_rate": 1.2024135652444907e-06, "loss": 0.71184266, "num_input_tokens_seen": 230220380, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6796875, "step": 10672, "time_per_iteration": 2.4558165073394775 }, { "auxiliary_loss_clip": 0.01114046, "auxiliary_loss_mlp": 0.01030379, "balance_loss_clip": 1.01574063, "balance_loss_mlp": 1.03926301, "epoch": 0.6416954757252368, "flos": 24534924543360.0, "grad_norm": 1.8783191195913307, "language_loss": 0.73880386, "learning_rate": 1.2020564280871593e-06, "loss": 0.76024818, "num_input_tokens_seen": 230239845, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75, "step": 10673, "time_per_iteration": 5.382788896560669 }, { "auxiliary_loss_clip": 0.01110674, "auxiliary_loss_mlp": 0.01035795, "balance_loss_clip": 1.02185369, "balance_loss_mlp": 1.03940725, "epoch": 0.6417555989779047, "flos": 27710002321920.0, "grad_norm": 3.172371177068489, "language_loss": 0.69165707, "learning_rate": 1.2016993211879283e-06, "loss": 0.71312177, "num_input_tokens_seen": 230262420, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71484375, "step": 10674, "time_per_iteration": 2.5626885890960693 }, { "auxiliary_loss_clip": 0.01114839, "auxiliary_loss_mlp": 0.01028052, "balance_loss_clip": 1.01434851, "balance_loss_mlp": 1.03943181, "epoch": 0.6418157222305727, "flos": 20556632787840.0, "grad_norm": 1.8890760420187966, "language_loss": 0.66222823, "learning_rate": 1.201342244560338e-06, "loss": 0.68365717, "num_input_tokens_seen": 230279950, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.75390625, "step": 10675, "time_per_iteration": 3.9398510456085205 }, { "auxiliary_loss_clip": 0.01112244, "auxiliary_loss_mlp": 0.01038274, "balance_loss_clip": 1.02568591, "balance_loss_mlp": 1.04134727, "epoch": 0.6418758454832406, "flos": 22601530062720.0, "grad_norm": 1.8912519954242004, "language_loss": 0.66399312, "learning_rate": 1.2009851982179307e-06, "loss": 0.68549836, "num_input_tokens_seen": 230299705, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 10676, "time_per_iteration": 2.472107172012329 }, { "auxiliary_loss_clip": 0.01112907, "auxiliary_loss_mlp": 0.01033683, "balance_loss_clip": 1.01887739, "balance_loss_mlp": 1.04045415, "epoch": 0.6419359687359086, "flos": 27375098889600.0, "grad_norm": 2.19189719015843, "language_loss": 0.7508806, "learning_rate": 1.2006281821742446e-06, "loss": 0.7723465, "num_input_tokens_seen": 230320030, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7265625, "step": 10677, "time_per_iteration": 2.52361798286438 }, { "auxiliary_loss_clip": 0.01032003, "auxiliary_loss_mlp": 0.01003437, "balance_loss_clip": 1.00220966, "balance_loss_mlp": 1.00847101, "epoch": 0.6419960919885765, "flos": 67251924552960.0, "grad_norm": 0.7652890383448974, "language_loss": 0.60619891, "learning_rate": 1.200271196442818e-06, "loss": 0.6265533, "num_input_tokens_seen": 230381495, "router_z_loss_clip": 0.01226807, "router_z_loss_mlp": 0.23535156, "step": 10678, "time_per_iteration": 3.1504175662994385 }, { "auxiliary_loss_clip": 0.01109698, "auxiliary_loss_mlp": 0.010355, "balance_loss_clip": 1.02303636, "balance_loss_mlp": 1.0406791, "epoch": 0.6420562152412446, "flos": 19901873721600.0, "grad_norm": 1.7038569146720155, "language_loss": 0.67457652, "learning_rate": 1.1999142410371875e-06, "loss": 0.69602841, "num_input_tokens_seen": 230401385, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 10679, "time_per_iteration": 2.4821126461029053 }, { "auxiliary_loss_clip": 0.01112346, "auxiliary_loss_mlp": 0.01032177, "balance_loss_clip": 1.01827717, "balance_loss_mlp": 1.04019499, "epoch": 0.6421163384939125, "flos": 24790177566720.0, "grad_norm": 1.7751272886666056, "language_loss": 0.7312296, "learning_rate": 1.1995573159708897e-06, "loss": 0.75267488, "num_input_tokens_seen": 230421340, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 10680, "time_per_iteration": 2.486523389816284 }, { "auxiliary_loss_clip": 0.01110456, "auxiliary_loss_mlp": 0.01030264, "balance_loss_clip": 1.01805091, "balance_loss_mlp": 1.03930199, "epoch": 0.6421764617465805, "flos": 25592816926080.0, "grad_norm": 1.8290291435215638, "language_loss": 0.67368984, "learning_rate": 1.1992004212574582e-06, "loss": 0.69509709, "num_input_tokens_seen": 230441270, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 10681, "time_per_iteration": 2.49690318107605 }, { "auxiliary_loss_clip": 0.01107706, "auxiliary_loss_mlp": 0.01029833, "balance_loss_clip": 1.01729774, "balance_loss_mlp": 1.03791499, "epoch": 0.6422365849992484, "flos": 14134727813760.0, "grad_norm": 1.9812031193723647, "language_loss": 0.74854308, "learning_rate": 1.198843556910427e-06, "loss": 0.76991844, "num_input_tokens_seen": 230457455, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 10682, "time_per_iteration": 2.4290771484375 }, { "auxiliary_loss_clip": 0.01105442, "auxiliary_loss_mlp": 0.01034453, "balance_loss_clip": 1.02247858, "balance_loss_mlp": 1.03762805, "epoch": 0.6422967082519164, "flos": 22383911514240.0, "grad_norm": 1.5235980281399637, "language_loss": 0.79109257, "learning_rate": 1.1984867229433287e-06, "loss": 0.81249154, "num_input_tokens_seen": 230478955, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 10683, "time_per_iteration": 2.5206985473632812 }, { "auxiliary_loss_clip": 0.01112943, "auxiliary_loss_mlp": 0.01037606, "balance_loss_clip": 1.02361083, "balance_loss_mlp": 1.04006696, "epoch": 0.6423568315045844, "flos": 14647927380480.0, "grad_norm": 1.753117249060229, "language_loss": 0.67085034, "learning_rate": 1.1981299193696941e-06, "loss": 0.69235581, "num_input_tokens_seen": 230496425, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7265625, "step": 10684, "time_per_iteration": 2.4562807083129883 }, { "auxiliary_loss_clip": 0.01110739, "auxiliary_loss_mlp": 0.01033303, "balance_loss_clip": 1.02010059, "balance_loss_mlp": 1.03894556, "epoch": 0.6424169547572524, "flos": 26833925606400.0, "grad_norm": 2.738835278400085, "language_loss": 0.71627712, "learning_rate": 1.1977731462030533e-06, "loss": 0.73771763, "num_input_tokens_seen": 230516245, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 10685, "time_per_iteration": 2.5095717906951904 }, { "auxiliary_loss_clip": 0.01108372, "auxiliary_loss_mlp": 0.0103486, "balance_loss_clip": 1.02246225, "balance_loss_mlp": 1.03936255, "epoch": 0.6424770780099204, "flos": 22707430335360.0, "grad_norm": 4.061371965213237, "language_loss": 0.75569129, "learning_rate": 1.197416403456935e-06, "loss": 0.77712357, "num_input_tokens_seen": 230534745, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 10686, "time_per_iteration": 2.485393762588501 }, { "auxiliary_loss_clip": 0.0111407, "auxiliary_loss_mlp": 0.0103335, "balance_loss_clip": 1.01927173, "balance_loss_mlp": 1.04051995, "epoch": 0.6425372012625883, "flos": 28469512425600.0, "grad_norm": 2.017822590232474, "language_loss": 0.68703389, "learning_rate": 1.197059691144867e-06, "loss": 0.70850807, "num_input_tokens_seen": 230555895, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73828125, "step": 10687, "time_per_iteration": 2.531339645385742 }, { "auxiliary_loss_clip": 0.01112791, "auxiliary_loss_mlp": 0.01030727, "balance_loss_clip": 1.01790023, "balance_loss_mlp": 1.04084265, "epoch": 0.6425973245152563, "flos": 29351694453120.0, "grad_norm": 1.9419084279772603, "language_loss": 0.66302109, "learning_rate": 1.1967030092803767e-06, "loss": 0.68445623, "num_input_tokens_seen": 230577460, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 10688, "time_per_iteration": 2.516011953353882 }, { "auxiliary_loss_clip": 0.01109882, "auxiliary_loss_mlp": 0.01031771, "balance_loss_clip": 1.01862812, "balance_loss_mlp": 1.03872395, "epoch": 0.6426574477679242, "flos": 16430388912000.0, "grad_norm": 1.9307266646590866, "language_loss": 0.7293824, "learning_rate": 1.1963463578769876e-06, "loss": 0.75079894, "num_input_tokens_seen": 230595030, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 10689, "time_per_iteration": 2.4560153484344482 }, { "auxiliary_loss_clip": 0.01108576, "auxiliary_loss_mlp": 0.01031815, "balance_loss_clip": 1.0201925, "balance_loss_mlp": 1.04023004, "epoch": 0.6427175710205922, "flos": 21835914647040.0, "grad_norm": 1.9467806091215603, "language_loss": 0.72013128, "learning_rate": 1.195989736948226e-06, "loss": 0.74153519, "num_input_tokens_seen": 230615135, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.68359375, "step": 10690, "time_per_iteration": 2.4599037170410156 }, { "auxiliary_loss_clip": 0.01108087, "auxiliary_loss_mlp": 0.01030303, "balance_loss_clip": 1.0173924, "balance_loss_mlp": 1.03871036, "epoch": 0.6427776942732601, "flos": 17786627660160.0, "grad_norm": 1.7050916796588256, "language_loss": 0.77753747, "learning_rate": 1.1956331465076143e-06, "loss": 0.79892135, "num_input_tokens_seen": 230631965, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 10691, "time_per_iteration": 2.4632585048675537 }, { "auxiliary_loss_clip": 0.0111283, "auxiliary_loss_mlp": 0.01036169, "balance_loss_clip": 1.02295482, "balance_loss_mlp": 1.04039598, "epoch": 0.6428378175259282, "flos": 15085893911040.0, "grad_norm": 1.6958254035243754, "language_loss": 0.74262679, "learning_rate": 1.1952765865686738e-06, "loss": 0.76411682, "num_input_tokens_seen": 230649565, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 10692, "time_per_iteration": 2.4653842449188232 }, { "auxiliary_loss_clip": 0.01110446, "auxiliary_loss_mlp": 0.01033854, "balance_loss_clip": 1.02080011, "balance_loss_mlp": 1.03977346, "epoch": 0.6428979407785961, "flos": 23841776816640.0, "grad_norm": 2.1645964631796053, "language_loss": 0.61385453, "learning_rate": 1.1949200571449263e-06, "loss": 0.63529754, "num_input_tokens_seen": 230669265, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 10693, "time_per_iteration": 2.503519296646118 }, { "auxiliary_loss_clip": 0.01113375, "auxiliary_loss_mlp": 0.01027865, "balance_loss_clip": 1.01460886, "balance_loss_mlp": 1.03901803, "epoch": 0.6429580640312641, "flos": 32926852892160.0, "grad_norm": 1.670225637121633, "language_loss": 0.59664464, "learning_rate": 1.1945635582498903e-06, "loss": 0.61805701, "num_input_tokens_seen": 230690575, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7421875, "step": 10694, "time_per_iteration": 2.565997362136841 }, { "auxiliary_loss_clip": 0.01112626, "auxiliary_loss_mlp": 0.01033717, "balance_loss_clip": 1.02095008, "balance_loss_mlp": 1.04063606, "epoch": 0.643018187283932, "flos": 21068359896960.0, "grad_norm": 1.8711575164350338, "language_loss": 0.79858571, "learning_rate": 1.1942070898970853e-06, "loss": 0.82004917, "num_input_tokens_seen": 230709420, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 10695, "time_per_iteration": 2.47935152053833 }, { "auxiliary_loss_clip": 0.01111776, "auxiliary_loss_mlp": 0.01044303, "balance_loss_clip": 1.03053379, "balance_loss_mlp": 1.03969491, "epoch": 0.6430783105366, "flos": 26724649455360.0, "grad_norm": 1.6347846213001387, "language_loss": 0.73605967, "learning_rate": 1.1938506521000285e-06, "loss": 0.75762045, "num_input_tokens_seen": 230729350, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71875, "step": 10696, "time_per_iteration": 2.5380868911743164 }, { "auxiliary_loss_clip": 0.01108733, "auxiliary_loss_mlp": 0.01029712, "balance_loss_clip": 1.01707602, "balance_loss_mlp": 1.0405817, "epoch": 0.643138433789268, "flos": 23696841438720.0, "grad_norm": 1.8718055441399906, "language_loss": 0.7549237, "learning_rate": 1.1934942448722347e-06, "loss": 0.77630818, "num_input_tokens_seen": 230749220, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 10697, "time_per_iteration": 2.515162706375122 }, { "auxiliary_loss_clip": 0.01107477, "auxiliary_loss_mlp": 0.01028928, "balance_loss_clip": 1.01696539, "balance_loss_mlp": 1.03819728, "epoch": 0.643198557041936, "flos": 34202184255360.0, "grad_norm": 1.5313648485253533, "language_loss": 0.65831047, "learning_rate": 1.1931378682272208e-06, "loss": 0.67967451, "num_input_tokens_seen": 230770245, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 10698, "time_per_iteration": 2.6010425090789795 }, { "auxiliary_loss_clip": 0.01031415, "auxiliary_loss_mlp": 0.00999585, "balance_loss_clip": 0.99825567, "balance_loss_mlp": 1.00810838, "epoch": 0.643258680294604, "flos": 67626473621760.0, "grad_norm": 0.8417992604903307, "language_loss": 0.63477182, "learning_rate": 1.1927815221784996e-06, "loss": 0.65508187, "num_input_tokens_seen": 230837030, "router_z_loss_clip": 0.01330566, "router_z_loss_mlp": 0.23339844, "step": 10699, "time_per_iteration": 3.0850367546081543 }, { "auxiliary_loss_clip": 0.01106774, "auxiliary_loss_mlp": 0.01027751, "balance_loss_clip": 1.01622355, "balance_loss_mlp": 1.03892183, "epoch": 0.6433188035472719, "flos": 25185984508800.0, "grad_norm": 1.6280978432642625, "language_loss": 0.69365168, "learning_rate": 1.1924252067395838e-06, "loss": 0.71499693, "num_input_tokens_seen": 230856845, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 10700, "time_per_iteration": 2.4846959114074707 }, { "auxiliary_loss_clip": 0.01108317, "auxiliary_loss_mlp": 0.01027356, "balance_loss_clip": 1.01457667, "balance_loss_mlp": 1.03780818, "epoch": 0.6433789267999399, "flos": 24973573432320.0, "grad_norm": 1.8675954985345418, "language_loss": 0.73270148, "learning_rate": 1.1920689219239855e-06, "loss": 0.75405824, "num_input_tokens_seen": 230878785, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 10701, "time_per_iteration": 2.5205368995666504 }, { "auxiliary_loss_clip": 0.01110393, "auxiliary_loss_mlp": 0.01032937, "balance_loss_clip": 1.01857233, "balance_loss_mlp": 1.03698564, "epoch": 0.6434390500526078, "flos": 17566028282880.0, "grad_norm": 1.962066732054482, "language_loss": 0.82208264, "learning_rate": 1.1917126677452144e-06, "loss": 0.84351599, "num_input_tokens_seen": 230895445, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.734375, "step": 10702, "time_per_iteration": 2.43410062789917 }, { "auxiliary_loss_clip": 0.01106448, "auxiliary_loss_mlp": 0.01035362, "balance_loss_clip": 1.02270818, "balance_loss_mlp": 1.03748357, "epoch": 0.6434991733052758, "flos": 20843594542080.0, "grad_norm": 1.9910937926223056, "language_loss": 0.75001782, "learning_rate": 1.1913564442167798e-06, "loss": 0.77143592, "num_input_tokens_seen": 230911375, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 10703, "time_per_iteration": 2.4631872177124023 }, { "auxiliary_loss_clip": 0.01031844, "auxiliary_loss_mlp": 0.01000564, "balance_loss_clip": 0.99922317, "balance_loss_mlp": 1.00846267, "epoch": 0.6435592965579437, "flos": 66094596345600.0, "grad_norm": 0.6550104164619244, "language_loss": 0.54693598, "learning_rate": 1.1910002513521898e-06, "loss": 0.56726009, "num_input_tokens_seen": 230975990, "router_z_loss_clip": 0.01342773, "router_z_loss_mlp": 0.234375, "step": 10704, "time_per_iteration": 3.09263277053833 }, { "auxiliary_loss_clip": 0.01109079, "auxiliary_loss_mlp": 0.01023677, "balance_loss_clip": 1.01207197, "balance_loss_mlp": 1.03865075, "epoch": 0.6436194198106118, "flos": 23768842250880.0, "grad_norm": 1.9531780399185408, "language_loss": 0.76878536, "learning_rate": 1.1906440891649519e-06, "loss": 0.79011291, "num_input_tokens_seen": 230997110, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.703125, "step": 10705, "time_per_iteration": 2.501845598220825 }, { "auxiliary_loss_clip": 0.01109105, "auxiliary_loss_mlp": 0.01035983, "balance_loss_clip": 1.02308488, "balance_loss_mlp": 1.03787947, "epoch": 0.6436795430632797, "flos": 20230312705920.0, "grad_norm": 2.152327071418697, "language_loss": 0.79185414, "learning_rate": 1.1902879576685708e-06, "loss": 0.81330502, "num_input_tokens_seen": 231015590, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 10706, "time_per_iteration": 2.476933479309082 }, { "auxiliary_loss_clip": 0.01110188, "auxiliary_loss_mlp": 0.01030127, "balance_loss_clip": 1.01653743, "balance_loss_mlp": 1.03878188, "epoch": 0.6437396663159477, "flos": 20301846641280.0, "grad_norm": 2.044548534429957, "language_loss": 0.79975498, "learning_rate": 1.1899318568765518e-06, "loss": 0.82115811, "num_input_tokens_seen": 231033800, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71484375, "step": 10707, "time_per_iteration": 2.4489805698394775 }, { "auxiliary_loss_clip": 0.01109266, "auxiliary_loss_mlp": 0.01031492, "balance_loss_clip": 1.0187006, "balance_loss_mlp": 1.03845501, "epoch": 0.6437997895686156, "flos": 23878585278720.0, "grad_norm": 1.587662383176893, "language_loss": 0.85459316, "learning_rate": 1.1895757868023978e-06, "loss": 0.87600076, "num_input_tokens_seen": 231053160, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 10708, "time_per_iteration": 2.533839225769043 }, { "auxiliary_loss_clip": 0.01117821, "auxiliary_loss_mlp": 0.01041778, "balance_loss_clip": 1.0268712, "balance_loss_mlp": 1.04147553, "epoch": 0.6438599128212836, "flos": 18989275852800.0, "grad_norm": 2.576215033430853, "language_loss": 0.65572071, "learning_rate": 1.1892197474596106e-06, "loss": 0.67731667, "num_input_tokens_seen": 231069470, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.765625, "step": 10709, "time_per_iteration": 2.4439516067504883 }, { "auxiliary_loss_clip": 0.01107749, "auxiliary_loss_mlp": 0.01030104, "balance_loss_clip": 1.01790857, "balance_loss_mlp": 1.03797674, "epoch": 0.6439200360739517, "flos": 24096347481600.0, "grad_norm": 1.7752965606323974, "language_loss": 0.80624914, "learning_rate": 1.1888637388616929e-06, "loss": 0.82762766, "num_input_tokens_seen": 231088205, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 10710, "time_per_iteration": 2.503913640975952 }, { "auxiliary_loss_clip": 0.01106065, "auxiliary_loss_mlp": 0.01030132, "balance_loss_clip": 1.01771688, "balance_loss_mlp": 1.03648365, "epoch": 0.6439801593266196, "flos": 31902141697920.0, "grad_norm": 2.778449941744068, "language_loss": 0.65900946, "learning_rate": 1.1885077610221425e-06, "loss": 0.68037146, "num_input_tokens_seen": 231107850, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 10711, "time_per_iteration": 2.552530288696289 }, { "auxiliary_loss_clip": 0.01111209, "auxiliary_loss_mlp": 0.01029251, "balance_loss_clip": 1.01601315, "balance_loss_mlp": 1.0402925, "epoch": 0.6440402825792876, "flos": 27125879351040.0, "grad_norm": 1.6451437090995145, "language_loss": 0.78911686, "learning_rate": 1.1881518139544597e-06, "loss": 0.81052142, "num_input_tokens_seen": 231127200, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 10712, "time_per_iteration": 3.8508834838867188 }, { "auxiliary_loss_clip": 0.01111156, "auxiliary_loss_mlp": 0.01034606, "balance_loss_clip": 1.02149868, "balance_loss_mlp": 1.03868842, "epoch": 0.6441004058319555, "flos": 20667704618880.0, "grad_norm": 1.6287152453624016, "language_loss": 0.82604408, "learning_rate": 1.1877958976721417e-06, "loss": 0.84750175, "num_input_tokens_seen": 231146360, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 10713, "time_per_iteration": 2.460935115814209 }, { "auxiliary_loss_clip": 0.01105728, "auxiliary_loss_mlp": 0.01033066, "balance_loss_clip": 1.0208832, "balance_loss_mlp": 1.03862, "epoch": 0.6441605290846235, "flos": 26026006947840.0, "grad_norm": 1.631891035182514, "language_loss": 0.78488505, "learning_rate": 1.187440012188684e-06, "loss": 0.80627298, "num_input_tokens_seen": 231168350, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 10714, "time_per_iteration": 2.506553888320923 }, { "auxiliary_loss_clip": 0.01107009, "auxiliary_loss_mlp": 0.01028944, "balance_loss_clip": 1.01698089, "balance_loss_mlp": 1.03848696, "epoch": 0.6442206523372914, "flos": 24899489631360.0, "grad_norm": 1.4270921325626458, "language_loss": 0.81739044, "learning_rate": 1.187084157517583e-06, "loss": 0.83875, "num_input_tokens_seen": 231188385, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 10715, "time_per_iteration": 5.44550895690918 }, { "auxiliary_loss_clip": 0.01107897, "auxiliary_loss_mlp": 0.01028222, "balance_loss_clip": 1.01528168, "balance_loss_mlp": 1.0366807, "epoch": 0.6442807755899594, "flos": 25156322853120.0, "grad_norm": 1.8584611667424735, "language_loss": 0.81794167, "learning_rate": 1.186728333672332e-06, "loss": 0.83930284, "num_input_tokens_seen": 231209880, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 10716, "time_per_iteration": 3.991039514541626 }, { "auxiliary_loss_clip": 0.01111055, "auxiliary_loss_mlp": 0.01034183, "balance_loss_clip": 1.01985395, "balance_loss_mlp": 1.0383817, "epoch": 0.6443408988426274, "flos": 27344503480320.0, "grad_norm": 2.05604149607838, "language_loss": 0.78104997, "learning_rate": 1.186372540666424e-06, "loss": 0.80250233, "num_input_tokens_seen": 231230765, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7265625, "step": 10717, "time_per_iteration": 2.5343880653381348 }, { "auxiliary_loss_clip": 0.01106162, "auxiliary_loss_mlp": 0.01032473, "balance_loss_clip": 1.0200932, "balance_loss_mlp": 1.03887892, "epoch": 0.6444010220952954, "flos": 27928339142400.0, "grad_norm": 2.1048105016624405, "language_loss": 0.68436241, "learning_rate": 1.1860167785133513e-06, "loss": 0.70574868, "num_input_tokens_seen": 231252350, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.671875, "step": 10718, "time_per_iteration": 2.533416748046875 }, { "auxiliary_loss_clip": 0.01031515, "auxiliary_loss_mlp": 0.00999758, "balance_loss_clip": 0.99844724, "balance_loss_mlp": 1.00792348, "epoch": 0.6444611453479633, "flos": 71215024855680.0, "grad_norm": 0.7517907661876009, "language_loss": 0.49614078, "learning_rate": 1.185661047226603e-06, "loss": 0.5164535, "num_input_tokens_seen": 231313865, "router_z_loss_clip": 0.01312256, "router_z_loss_mlp": 0.23632812, "step": 10719, "time_per_iteration": 3.2554805278778076 }, { "auxiliary_loss_clip": 0.0111153, "auxiliary_loss_mlp": 0.01031861, "balance_loss_clip": 1.01811659, "balance_loss_mlp": 1.04003143, "epoch": 0.6445212686006313, "flos": 22705131864960.0, "grad_norm": 1.7753938621813492, "language_loss": 0.781672, "learning_rate": 1.18530534681967e-06, "loss": 0.80310595, "num_input_tokens_seen": 231331710, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71875, "step": 10720, "time_per_iteration": 2.4764764308929443 }, { "auxiliary_loss_clip": 0.01106969, "auxiliary_loss_mlp": 0.01029814, "balance_loss_clip": 1.01653457, "balance_loss_mlp": 1.03720355, "epoch": 0.6445813918532992, "flos": 21178821196800.0, "grad_norm": 1.7457153213950065, "language_loss": 0.77258563, "learning_rate": 1.18494967730604e-06, "loss": 0.79395348, "num_input_tokens_seen": 231350705, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69921875, "step": 10721, "time_per_iteration": 2.465686082839966 }, { "auxiliary_loss_clip": 0.01106636, "auxiliary_loss_mlp": 0.01032601, "balance_loss_clip": 1.01890337, "balance_loss_mlp": 1.0364902, "epoch": 0.6446415151059672, "flos": 25191910252800.0, "grad_norm": 2.0427095575324397, "language_loss": 0.73191965, "learning_rate": 1.1845940386991995e-06, "loss": 0.75331205, "num_input_tokens_seen": 231369550, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 10722, "time_per_iteration": 2.4951999187469482 }, { "auxiliary_loss_clip": 0.0110577, "auxiliary_loss_mlp": 0.01029311, "balance_loss_clip": 1.01742542, "balance_loss_mlp": 1.0377903, "epoch": 0.6447016383586353, "flos": 25302227898240.0, "grad_norm": 1.5212689760252143, "language_loss": 0.78102565, "learning_rate": 1.184238431012635e-06, "loss": 0.80237651, "num_input_tokens_seen": 231389285, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6796875, "step": 10723, "time_per_iteration": 2.4908270835876465 }, { "auxiliary_loss_clip": 0.01111175, "auxiliary_loss_mlp": 0.01036262, "balance_loss_clip": 1.02282667, "balance_loss_mlp": 1.03901172, "epoch": 0.6447617616113032, "flos": 27703142824320.0, "grad_norm": 3.4829164169835387, "language_loss": 0.58830684, "learning_rate": 1.1838828542598312e-06, "loss": 0.60978115, "num_input_tokens_seen": 231408820, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 10724, "time_per_iteration": 2.5055549144744873 }, { "auxiliary_loss_clip": 0.01105127, "auxiliary_loss_mlp": 0.01029495, "balance_loss_clip": 1.01769948, "balance_loss_mlp": 1.03820467, "epoch": 0.6448218848639712, "flos": 23039101543680.0, "grad_norm": 3.4864563003749125, "language_loss": 0.83696342, "learning_rate": 1.183527308454271e-06, "loss": 0.85830963, "num_input_tokens_seen": 231428100, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 10725, "time_per_iteration": 2.46917986869812 }, { "auxiliary_loss_clip": 0.01106041, "auxiliary_loss_mlp": 0.01033594, "balance_loss_clip": 1.02074361, "balance_loss_mlp": 1.03654027, "epoch": 0.6448820081166391, "flos": 24496104919680.0, "grad_norm": 1.9245096623349747, "language_loss": 0.82264423, "learning_rate": 1.1831717936094368e-06, "loss": 0.84404057, "num_input_tokens_seen": 231445810, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 10726, "time_per_iteration": 2.491119861602783 }, { "auxiliary_loss_clip": 0.01111107, "auxiliary_loss_mlp": 0.0103046, "balance_loss_clip": 1.01735258, "balance_loss_mlp": 1.03870714, "epoch": 0.6449421313693071, "flos": 22419283432320.0, "grad_norm": 1.8553410117075726, "language_loss": 0.81261802, "learning_rate": 1.1828163097388108e-06, "loss": 0.83403373, "num_input_tokens_seen": 231463570, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 10727, "time_per_iteration": 2.46008038520813 }, { "auxiliary_loss_clip": 0.01113715, "auxiliary_loss_mlp": 0.0103291, "balance_loss_clip": 1.01936758, "balance_loss_mlp": 1.03880298, "epoch": 0.645002254621975, "flos": 20225715765120.0, "grad_norm": 1.8889108211792365, "language_loss": 0.79114687, "learning_rate": 1.1824608568558717e-06, "loss": 0.81261313, "num_input_tokens_seen": 231482155, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.75, "step": 10728, "time_per_iteration": 2.4912142753601074 }, { "auxiliary_loss_clip": 0.01108952, "auxiliary_loss_mlp": 0.01035983, "balance_loss_clip": 1.02214921, "balance_loss_mlp": 1.03743255, "epoch": 0.645062377874643, "flos": 27855440490240.0, "grad_norm": 1.6537546069062319, "language_loss": 0.74440694, "learning_rate": 1.1821054349740988e-06, "loss": 0.76585633, "num_input_tokens_seen": 231502465, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71484375, "step": 10729, "time_per_iteration": 2.5433566570281982 }, { "auxiliary_loss_clip": 0.0110971, "auxiliary_loss_mlp": 0.01035441, "balance_loss_clip": 1.02191651, "balance_loss_mlp": 1.03813553, "epoch": 0.645122501127311, "flos": 25301509626240.0, "grad_norm": 1.8623279249544717, "language_loss": 0.66381532, "learning_rate": 1.1817500441069706e-06, "loss": 0.68526685, "num_input_tokens_seen": 231522740, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 10730, "time_per_iteration": 2.519268035888672 }, { "auxiliary_loss_clip": 0.0110931, "auxiliary_loss_mlp": 0.01031147, "balance_loss_clip": 1.01722956, "balance_loss_mlp": 1.03834188, "epoch": 0.645182624379979, "flos": 18807352444800.0, "grad_norm": 1.9336366096788216, "language_loss": 0.64025789, "learning_rate": 1.1813946842679614e-06, "loss": 0.66166246, "num_input_tokens_seen": 231542050, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7109375, "step": 10731, "time_per_iteration": 2.518444538116455 }, { "auxiliary_loss_clip": 0.01106279, "auxiliary_loss_mlp": 0.01035018, "balance_loss_clip": 1.02247727, "balance_loss_mlp": 1.03756392, "epoch": 0.6452427476326469, "flos": 18332182402560.0, "grad_norm": 1.6151542576887004, "language_loss": 0.68217665, "learning_rate": 1.1810393554705492e-06, "loss": 0.70358962, "num_input_tokens_seen": 231560380, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 10732, "time_per_iteration": 2.477874517440796 }, { "auxiliary_loss_clip": 0.01104719, "auxiliary_loss_mlp": 0.01034163, "balance_loss_clip": 1.02199793, "balance_loss_mlp": 1.03700161, "epoch": 0.6453028708853149, "flos": 22784746360320.0, "grad_norm": 1.7267150197184467, "language_loss": 0.75542289, "learning_rate": 1.1806840577282055e-06, "loss": 0.77681166, "num_input_tokens_seen": 231580810, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 10733, "time_per_iteration": 2.4766860008239746 }, { "auxiliary_loss_clip": 0.01113212, "auxiliary_loss_mlp": 0.01038296, "balance_loss_clip": 1.02510583, "balance_loss_mlp": 1.040277, "epoch": 0.6453629941379828, "flos": 23945989150080.0, "grad_norm": 2.379703653426538, "language_loss": 0.67057896, "learning_rate": 1.1803287910544048e-06, "loss": 0.69209397, "num_input_tokens_seen": 231600585, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73046875, "step": 10734, "time_per_iteration": 2.503826856613159 }, { "auxiliary_loss_clip": 0.0110506, "auxiliary_loss_mlp": 0.01037877, "balance_loss_clip": 1.02590847, "balance_loss_mlp": 1.03938365, "epoch": 0.6454231173906508, "flos": 17676381841920.0, "grad_norm": 2.3673002667029217, "language_loss": 0.73104382, "learning_rate": 1.1799735554626191e-06, "loss": 0.75247318, "num_input_tokens_seen": 231618765, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.65625, "step": 10735, "time_per_iteration": 2.4536991119384766 }, { "auxiliary_loss_clip": 0.01107147, "auxiliary_loss_mlp": 0.0103635, "balance_loss_clip": 1.02311814, "balance_loss_mlp": 1.03785801, "epoch": 0.6454832406433189, "flos": 23292774368640.0, "grad_norm": 2.1275773928193096, "language_loss": 0.75234151, "learning_rate": 1.1796183509663176e-06, "loss": 0.77377641, "num_input_tokens_seen": 231638525, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69140625, "step": 10736, "time_per_iteration": 2.502656936645508 }, { "auxiliary_loss_clip": 0.01111516, "auxiliary_loss_mlp": 0.0103087, "balance_loss_clip": 1.01747048, "balance_loss_mlp": 1.04022646, "epoch": 0.6455433638959868, "flos": 20157198572160.0, "grad_norm": 2.4091306977849025, "language_loss": 0.70347428, "learning_rate": 1.1792631775789708e-06, "loss": 0.7248981, "num_input_tokens_seen": 231656785, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 10737, "time_per_iteration": 2.480369806289673 }, { "auxiliary_loss_clip": 0.01032085, "auxiliary_loss_mlp": 0.01004227, "balance_loss_clip": 1.00307024, "balance_loss_mlp": 1.00852561, "epoch": 0.6456034871486548, "flos": 66532922012160.0, "grad_norm": 0.7863633197491522, "language_loss": 0.58502042, "learning_rate": 1.1789080353140464e-06, "loss": 0.60538352, "num_input_tokens_seen": 231719075, "router_z_loss_clip": 0.01153564, "router_z_loss_mlp": 0.23632812, "step": 10738, "time_per_iteration": 3.148820400238037 }, { "auxiliary_loss_clip": 0.01104725, "auxiliary_loss_mlp": 0.01027262, "balance_loss_clip": 1.01461387, "balance_loss_mlp": 1.0363977, "epoch": 0.6456636104013227, "flos": 24206090509440.0, "grad_norm": 3.4275730854825914, "language_loss": 0.74818063, "learning_rate": 1.1785529241850118e-06, "loss": 0.76950049, "num_input_tokens_seen": 231737810, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 10739, "time_per_iteration": 2.4781484603881836 }, { "auxiliary_loss_clip": 0.01109437, "auxiliary_loss_mlp": 0.01029041, "balance_loss_clip": 1.01597023, "balance_loss_mlp": 1.03785491, "epoch": 0.6457237336539907, "flos": 23624086440960.0, "grad_norm": 1.862057883559801, "language_loss": 0.71803832, "learning_rate": 1.1781978442053324e-06, "loss": 0.73942316, "num_input_tokens_seen": 231756140, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 10740, "time_per_iteration": 2.4999382495880127 }, { "auxiliary_loss_clip": 0.01031477, "auxiliary_loss_mlp": 0.01004182, "balance_loss_clip": 1.00287104, "balance_loss_mlp": 1.00794542, "epoch": 0.6457838569066586, "flos": 65846023251840.0, "grad_norm": 0.6646543206159855, "language_loss": 0.55280614, "learning_rate": 1.1778427953884733e-06, "loss": 0.57316267, "num_input_tokens_seen": 231823665, "router_z_loss_clip": 0.01312256, "router_z_loss_mlp": 0.23535156, "step": 10741, "time_per_iteration": 3.1193253993988037 }, { "auxiliary_loss_clip": 0.01104534, "auxiliary_loss_mlp": 0.01031308, "balance_loss_clip": 1.01937532, "balance_loss_mlp": 1.0367403, "epoch": 0.6458439801593266, "flos": 22381972179840.0, "grad_norm": 1.8457133114534516, "language_loss": 0.81027603, "learning_rate": 1.1774877777478977e-06, "loss": 0.83163446, "num_input_tokens_seen": 231844500, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 10742, "time_per_iteration": 2.529048204421997 }, { "auxiliary_loss_clip": 0.01103364, "auxiliary_loss_mlp": 0.01026758, "balance_loss_clip": 1.01485515, "balance_loss_mlp": 1.03688157, "epoch": 0.6459041034119946, "flos": 24789243813120.0, "grad_norm": 1.788443911184807, "language_loss": 0.81767225, "learning_rate": 1.1771327912970678e-06, "loss": 0.8389734, "num_input_tokens_seen": 231864510, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6640625, "step": 10743, "time_per_iteration": 2.5371081829071045 }, { "auxiliary_loss_clip": 0.01104942, "auxiliary_loss_mlp": 0.01032082, "balance_loss_clip": 1.01963103, "balance_loss_mlp": 1.03668261, "epoch": 0.6459642266646626, "flos": 18325358818560.0, "grad_norm": 1.911893835407344, "language_loss": 0.71970153, "learning_rate": 1.1767778360494453e-06, "loss": 0.74107176, "num_input_tokens_seen": 231881555, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 10744, "time_per_iteration": 2.4431087970733643 }, { "auxiliary_loss_clip": 0.01106146, "auxiliary_loss_mlp": 0.01025506, "balance_loss_clip": 1.01366258, "balance_loss_mlp": 1.03690553, "epoch": 0.6460243499173305, "flos": 43581368891520.0, "grad_norm": 1.8837723169590412, "language_loss": 0.66936058, "learning_rate": 1.1764229120184896e-06, "loss": 0.69067711, "num_input_tokens_seen": 231905945, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69140625, "step": 10745, "time_per_iteration": 2.6756458282470703 }, { "auxiliary_loss_clip": 0.01106573, "auxiliary_loss_mlp": 0.01031565, "balance_loss_clip": 1.01889277, "balance_loss_mlp": 1.03760862, "epoch": 0.6460844731699985, "flos": 19244026085760.0, "grad_norm": 2.2948825742476604, "language_loss": 0.73593223, "learning_rate": 1.1760680192176597e-06, "loss": 0.75731361, "num_input_tokens_seen": 231922535, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 10746, "time_per_iteration": 2.4403059482574463 }, { "auxiliary_loss_clip": 0.01109555, "auxiliary_loss_mlp": 0.01032496, "balance_loss_clip": 1.02077222, "balance_loss_mlp": 1.03895259, "epoch": 0.6461445964226664, "flos": 27453348668160.0, "grad_norm": 1.5581253375923396, "language_loss": 0.66378355, "learning_rate": 1.175713157660413e-06, "loss": 0.68520403, "num_input_tokens_seen": 231944800, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.70703125, "step": 10747, "time_per_iteration": 2.5446040630340576 }, { "auxiliary_loss_clip": 0.01107511, "auxiliary_loss_mlp": 0.0103578, "balance_loss_clip": 1.02350783, "balance_loss_mlp": 1.03862405, "epoch": 0.6462047196753344, "flos": 20295489934080.0, "grad_norm": 1.7062216568093844, "language_loss": 0.67039257, "learning_rate": 1.1753583273602056e-06, "loss": 0.69182551, "num_input_tokens_seen": 231962970, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 10748, "time_per_iteration": 2.475830554962158 }, { "auxiliary_loss_clip": 0.01110096, "auxiliary_loss_mlp": 0.01040463, "balance_loss_clip": 1.02673006, "balance_loss_mlp": 1.03856039, "epoch": 0.6462648429280025, "flos": 22018340845440.0, "grad_norm": 1.8504692441175445, "language_loss": 0.76284027, "learning_rate": 1.1750035283304937e-06, "loss": 0.78434581, "num_input_tokens_seen": 231981195, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71484375, "step": 10749, "time_per_iteration": 2.480470895767212 }, { "auxiliary_loss_clip": 0.01108476, "auxiliary_loss_mlp": 0.01032438, "balance_loss_clip": 1.01968896, "balance_loss_mlp": 1.03742337, "epoch": 0.6463249661806704, "flos": 27781141207680.0, "grad_norm": 1.7481565380235886, "language_loss": 0.76942289, "learning_rate": 1.17464876058473e-06, "loss": 0.79083204, "num_input_tokens_seen": 232001735, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 10750, "time_per_iteration": 2.5329768657684326 }, { "auxiliary_loss_clip": 0.01111728, "auxiliary_loss_mlp": 0.01031844, "balance_loss_clip": 1.0175209, "balance_loss_mlp": 1.03916276, "epoch": 0.6463850894333384, "flos": 22050588280320.0, "grad_norm": 2.619243009580856, "language_loss": 0.68426239, "learning_rate": 1.1742940241363683e-06, "loss": 0.70569813, "num_input_tokens_seen": 232019830, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7265625, "step": 10751, "time_per_iteration": 2.472379684448242 }, { "auxiliary_loss_clip": 0.01108079, "auxiliary_loss_mlp": 0.01031687, "balance_loss_clip": 1.01874721, "balance_loss_mlp": 1.03716791, "epoch": 0.6464452126860063, "flos": 21106245767040.0, "grad_norm": 5.548069736822815, "language_loss": 0.71193302, "learning_rate": 1.1739393189988604e-06, "loss": 0.73333061, "num_input_tokens_seen": 232039625, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 10752, "time_per_iteration": 2.4918019771575928 }, { "auxiliary_loss_clip": 0.011098, "auxiliary_loss_mlp": 0.0103807, "balance_loss_clip": 1.02301967, "balance_loss_mlp": 1.03784311, "epoch": 0.6465053359386743, "flos": 16028045694720.0, "grad_norm": 2.0843981517747108, "language_loss": 0.78041565, "learning_rate": 1.1735846451856554e-06, "loss": 0.80189437, "num_input_tokens_seen": 232055855, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.71875, "step": 10753, "time_per_iteration": 3.8358747959136963 }, { "auxiliary_loss_clip": 0.01107165, "auxiliary_loss_mlp": 0.01039049, "balance_loss_clip": 1.02638936, "balance_loss_mlp": 1.03797185, "epoch": 0.6465654591913422, "flos": 23398674641280.0, "grad_norm": 1.89969110416043, "language_loss": 0.85172009, "learning_rate": 1.1732300027102041e-06, "loss": 0.87318218, "num_input_tokens_seen": 232073475, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 10754, "time_per_iteration": 2.479722499847412 }, { "auxiliary_loss_clip": 0.0110598, "auxiliary_loss_mlp": 0.01034547, "balance_loss_clip": 1.02173793, "balance_loss_mlp": 1.03734136, "epoch": 0.6466255824440102, "flos": 15377273038080.0, "grad_norm": 2.245356799271513, "language_loss": 0.59860992, "learning_rate": 1.1728753915859541e-06, "loss": 0.62001514, "num_input_tokens_seen": 232091090, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 10755, "time_per_iteration": 2.567092180252075 }, { "auxiliary_loss_clip": 0.01106272, "auxiliary_loss_mlp": 0.0103026, "balance_loss_clip": 1.01710486, "balance_loss_mlp": 1.03707409, "epoch": 0.6466857056966782, "flos": 16252846963200.0, "grad_norm": 2.735468429429184, "language_loss": 0.68167186, "learning_rate": 1.1725208118263518e-06, "loss": 0.70303714, "num_input_tokens_seen": 232107320, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69140625, "step": 10756, "time_per_iteration": 3.945844888687134 }, { "auxiliary_loss_clip": 0.01114834, "auxiliary_loss_mlp": 0.01031786, "balance_loss_clip": 1.01854825, "balance_loss_mlp": 1.04006624, "epoch": 0.6467458289493462, "flos": 21178246579200.0, "grad_norm": 2.6155522872436805, "language_loss": 0.74189782, "learning_rate": 1.172166263444844e-06, "loss": 0.76336408, "num_input_tokens_seen": 232123930, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.74609375, "step": 10757, "time_per_iteration": 3.831233024597168 }, { "auxiliary_loss_clip": 0.01106894, "auxiliary_loss_mlp": 0.01028908, "balance_loss_clip": 1.01636696, "balance_loss_mlp": 1.03936541, "epoch": 0.6468059522020141, "flos": 17968299672960.0, "grad_norm": 1.4943031844043217, "language_loss": 0.74587744, "learning_rate": 1.1718117464548734e-06, "loss": 0.7672354, "num_input_tokens_seen": 232142905, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.67578125, "step": 10758, "time_per_iteration": 3.9099209308624268 }, { "auxiliary_loss_clip": 0.0110755, "auxiliary_loss_mlp": 0.01035816, "balance_loss_clip": 1.02217817, "balance_loss_mlp": 1.03753519, "epoch": 0.6468660754546821, "flos": 17890157635200.0, "grad_norm": 2.1870442870340585, "language_loss": 0.67719167, "learning_rate": 1.1714572608698845e-06, "loss": 0.69862533, "num_input_tokens_seen": 232162230, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.69921875, "step": 10759, "time_per_iteration": 2.4746170043945312 }, { "auxiliary_loss_clip": 0.01111542, "auxiliary_loss_mlp": 0.01032145, "balance_loss_clip": 1.01856148, "balance_loss_mlp": 1.03886485, "epoch": 0.64692619870735, "flos": 22600991358720.0, "grad_norm": 1.6582050152090513, "language_loss": 0.75583792, "learning_rate": 1.1711028067033197e-06, "loss": 0.77727485, "num_input_tokens_seen": 232182700, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 10760, "time_per_iteration": 2.478400945663452 }, { "auxiliary_loss_clip": 0.01105154, "auxiliary_loss_mlp": 0.01029319, "balance_loss_clip": 1.01735032, "balance_loss_mlp": 1.03647351, "epoch": 0.646986321960018, "flos": 49600786993920.0, "grad_norm": 3.8058406737327544, "language_loss": 0.65396273, "learning_rate": 1.1707483839686194e-06, "loss": 0.67530739, "num_input_tokens_seen": 232208235, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 10761, "time_per_iteration": 2.741025686264038 }, { "auxiliary_loss_clip": 0.01109972, "auxiliary_loss_mlp": 0.01033115, "balance_loss_clip": 1.01903665, "balance_loss_mlp": 1.03890181, "epoch": 0.6470464452126861, "flos": 21908454163200.0, "grad_norm": 3.3313059539277354, "language_loss": 0.69817531, "learning_rate": 1.1703939926792235e-06, "loss": 0.71960616, "num_input_tokens_seen": 232228720, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7109375, "step": 10762, "time_per_iteration": 2.4831440448760986 }, { "auxiliary_loss_clip": 0.01110552, "auxiliary_loss_mlp": 0.01034483, "balance_loss_clip": 1.02113104, "balance_loss_mlp": 1.03808749, "epoch": 0.647106568465354, "flos": 18106124158080.0, "grad_norm": 2.030576338172616, "language_loss": 0.82738817, "learning_rate": 1.1700396328485705e-06, "loss": 0.84883857, "num_input_tokens_seen": 232244655, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 10763, "time_per_iteration": 2.458674669265747 }, { "auxiliary_loss_clip": 0.01034196, "auxiliary_loss_mlp": 0.0100159, "balance_loss_clip": 1.00028515, "balance_loss_mlp": 1.01058936, "epoch": 0.647166691718022, "flos": 69480038125440.0, "grad_norm": 0.7176780984568062, "language_loss": 0.57815194, "learning_rate": 1.1696853044900978e-06, "loss": 0.59850979, "num_input_tokens_seen": 232308685, "router_z_loss_clip": 0.01306152, "router_z_loss_mlp": 0.23632812, "step": 10764, "time_per_iteration": 3.26828932762146 }, { "auxiliary_loss_clip": 0.01107306, "auxiliary_loss_mlp": 0.01037445, "balance_loss_clip": 1.02428997, "balance_loss_mlp": 1.03799152, "epoch": 0.6472268149706899, "flos": 34095170661120.0, "grad_norm": 1.936301686513869, "language_loss": 0.60324299, "learning_rate": 1.1693310076172413e-06, "loss": 0.62469047, "num_input_tokens_seen": 232327520, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69140625, "step": 10765, "time_per_iteration": 2.617234230041504 }, { "auxiliary_loss_clip": 0.01108127, "auxiliary_loss_mlp": 0.01029271, "balance_loss_clip": 1.01655698, "balance_loss_mlp": 1.0388155, "epoch": 0.6472869382233579, "flos": 28111232217600.0, "grad_norm": 1.8421355440509188, "language_loss": 0.63040954, "learning_rate": 1.168976742243437e-06, "loss": 0.65178353, "num_input_tokens_seen": 232349025, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 10766, "time_per_iteration": 2.578542709350586 }, { "auxiliary_loss_clip": 0.01107836, "auxiliary_loss_mlp": 0.01030595, "balance_loss_clip": 1.01763725, "balance_loss_mlp": 1.0381465, "epoch": 0.6473470614760258, "flos": 22492146170880.0, "grad_norm": 2.150038287492542, "language_loss": 0.75682449, "learning_rate": 1.1686225083821174e-06, "loss": 0.77820885, "num_input_tokens_seen": 232367835, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 10767, "time_per_iteration": 2.5109622478485107 }, { "auxiliary_loss_clip": 0.01106214, "auxiliary_loss_mlp": 0.01036895, "balance_loss_clip": 1.02394319, "balance_loss_mlp": 1.03685331, "epoch": 0.6474071847286939, "flos": 14538938538240.0, "grad_norm": 1.9983812912944643, "language_loss": 0.77763504, "learning_rate": 1.1682683060467153e-06, "loss": 0.79906619, "num_input_tokens_seen": 232385840, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 10768, "time_per_iteration": 2.454463481903076 }, { "auxiliary_loss_clip": 0.01106483, "auxiliary_loss_mlp": 0.01028449, "balance_loss_clip": 1.01599765, "balance_loss_mlp": 1.0368377, "epoch": 0.6474673079813618, "flos": 24098214988800.0, "grad_norm": 1.8333798944313913, "language_loss": 0.71773696, "learning_rate": 1.167914135250663e-06, "loss": 0.73908627, "num_input_tokens_seen": 232406205, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 10769, "time_per_iteration": 2.5105559825897217 }, { "auxiliary_loss_clip": 0.01105643, "auxiliary_loss_mlp": 0.01034167, "balance_loss_clip": 1.02207351, "balance_loss_mlp": 1.03842521, "epoch": 0.6475274312340298, "flos": 14976186796800.0, "grad_norm": 1.9942983056552164, "language_loss": 0.71750069, "learning_rate": 1.1675599960073895e-06, "loss": 0.73889881, "num_input_tokens_seen": 232424995, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 10770, "time_per_iteration": 2.4667558670043945 }, { "auxiliary_loss_clip": 0.01112804, "auxiliary_loss_mlp": 0.01033217, "balance_loss_clip": 1.01940703, "balance_loss_mlp": 1.03870702, "epoch": 0.6475875544866977, "flos": 25045322849280.0, "grad_norm": 1.7705365051392292, "language_loss": 0.73219091, "learning_rate": 1.167205888330325e-06, "loss": 0.75365114, "num_input_tokens_seen": 232445870, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7421875, "step": 10771, "time_per_iteration": 2.541400671005249 }, { "auxiliary_loss_clip": 0.01108273, "auxiliary_loss_mlp": 0.01036387, "balance_loss_clip": 1.02369177, "balance_loss_mlp": 1.03840053, "epoch": 0.6476476777393657, "flos": 16472153450880.0, "grad_norm": 2.0108763697585537, "language_loss": 0.74153304, "learning_rate": 1.1668518122328958e-06, "loss": 0.76297963, "num_input_tokens_seen": 232464285, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 10772, "time_per_iteration": 2.443758010864258 }, { "auxiliary_loss_clip": 0.01106681, "auxiliary_loss_mlp": 0.01031825, "balance_loss_clip": 1.02033889, "balance_loss_mlp": 1.03799748, "epoch": 0.6477078009920336, "flos": 25812267068160.0, "grad_norm": 2.011854484442523, "language_loss": 0.83162129, "learning_rate": 1.1664977677285305e-06, "loss": 0.8530063, "num_input_tokens_seen": 232485815, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 10773, "time_per_iteration": 2.5443406105041504 }, { "auxiliary_loss_clip": 0.01105137, "auxiliary_loss_mlp": 0.01027622, "balance_loss_clip": 1.01545024, "balance_loss_mlp": 1.037292, "epoch": 0.6477679242447016, "flos": 17676130446720.0, "grad_norm": 1.5538373841390276, "language_loss": 0.78297484, "learning_rate": 1.1661437548306524e-06, "loss": 0.80430245, "num_input_tokens_seen": 232504875, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 10774, "time_per_iteration": 2.4495205879211426 }, { "auxiliary_loss_clip": 0.01109117, "auxiliary_loss_mlp": 0.0103758, "balance_loss_clip": 1.02460372, "balance_loss_mlp": 1.03761768, "epoch": 0.6478280474973696, "flos": 21032305620480.0, "grad_norm": 3.2080077937827083, "language_loss": 0.68637377, "learning_rate": 1.1657897735526867e-06, "loss": 0.70784074, "num_input_tokens_seen": 232521945, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 10775, "time_per_iteration": 2.4761321544647217 }, { "auxiliary_loss_clip": 0.01110319, "auxiliary_loss_mlp": 0.01037953, "balance_loss_clip": 1.02527475, "balance_loss_mlp": 1.03801596, "epoch": 0.6478881707500376, "flos": 21616931381760.0, "grad_norm": 2.804131931183854, "language_loss": 0.65626693, "learning_rate": 1.1654358239080574e-06, "loss": 0.67774963, "num_input_tokens_seen": 232541500, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 10776, "time_per_iteration": 2.462465763092041 }, { "auxiliary_loss_clip": 0.01109595, "auxiliary_loss_mlp": 0.01037762, "balance_loss_clip": 1.02386236, "balance_loss_mlp": 1.03745818, "epoch": 0.6479482940027056, "flos": 18442571875200.0, "grad_norm": 2.618981416292911, "language_loss": 0.78730267, "learning_rate": 1.1650819059101839e-06, "loss": 0.80877626, "num_input_tokens_seen": 232559720, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.72265625, "step": 10777, "time_per_iteration": 2.4623208045959473 }, { "auxiliary_loss_clip": 0.01109503, "auxiliary_loss_mlp": 0.0103568, "balance_loss_clip": 1.02231014, "balance_loss_mlp": 1.03954148, "epoch": 0.6480084172553735, "flos": 22164066322560.0, "grad_norm": 1.8212691327467565, "language_loss": 0.73962116, "learning_rate": 1.1647280195724896e-06, "loss": 0.76107299, "num_input_tokens_seen": 232579370, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.69921875, "step": 10778, "time_per_iteration": 2.4926185607910156 }, { "auxiliary_loss_clip": 0.01104963, "auxiliary_loss_mlp": 0.0102938, "balance_loss_clip": 1.01653516, "balance_loss_mlp": 1.03610969, "epoch": 0.6480685405080415, "flos": 24316228586880.0, "grad_norm": 1.556536213831054, "language_loss": 0.78007448, "learning_rate": 1.1643741649083923e-06, "loss": 0.80141789, "num_input_tokens_seen": 232600495, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 10779, "time_per_iteration": 2.5201730728149414 }, { "auxiliary_loss_clip": 0.01034369, "auxiliary_loss_mlp": 0.01001426, "balance_loss_clip": 1.00013816, "balance_loss_mlp": 1.01100457, "epoch": 0.6481286637607094, "flos": 59891207760000.0, "grad_norm": 0.7936396683590478, "language_loss": 0.59420919, "learning_rate": 1.1640203419313095e-06, "loss": 0.61456716, "num_input_tokens_seen": 232663165, "router_z_loss_clip": 0.01287842, "router_z_loss_mlp": 0.234375, "step": 10780, "time_per_iteration": 3.0975563526153564 }, { "auxiliary_loss_clip": 0.01105712, "auxiliary_loss_mlp": 0.01029638, "balance_loss_clip": 1.01692438, "balance_loss_mlp": 1.03746033, "epoch": 0.6481887870133775, "flos": 25484187219840.0, "grad_norm": 2.258838026864883, "language_loss": 0.7922104, "learning_rate": 1.1636665506546599e-06, "loss": 0.81356394, "num_input_tokens_seen": 232683385, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 10781, "time_per_iteration": 2.502180576324463 }, { "auxiliary_loss_clip": 0.01113775, "auxiliary_loss_mlp": 0.0103469, "balance_loss_clip": 1.02019382, "balance_loss_mlp": 1.04124689, "epoch": 0.6482489102660454, "flos": 19930206574080.0, "grad_norm": 2.0483685980652626, "language_loss": 0.78584349, "learning_rate": 1.1633127910918578e-06, "loss": 0.80732822, "num_input_tokens_seen": 232699095, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7265625, "step": 10782, "time_per_iteration": 2.470151424407959 }, { "auxiliary_loss_clip": 0.01112096, "auxiliary_loss_mlp": 0.01033551, "balance_loss_clip": 1.02000904, "balance_loss_mlp": 1.04020417, "epoch": 0.6483090335187134, "flos": 26979471515520.0, "grad_norm": 2.205803407181666, "language_loss": 0.63679671, "learning_rate": 1.1629590632563187e-06, "loss": 0.65825319, "num_input_tokens_seen": 232717920, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 10783, "time_per_iteration": 2.5138847827911377 }, { "auxiliary_loss_clip": 0.01113533, "auxiliary_loss_mlp": 0.01031989, "balance_loss_clip": 1.01787484, "balance_loss_mlp": 1.04063165, "epoch": 0.6483691567713813, "flos": 25077965333760.0, "grad_norm": 3.1881724635578212, "language_loss": 0.88539195, "learning_rate": 1.1626053671614561e-06, "loss": 0.90684718, "num_input_tokens_seen": 232737605, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7265625, "step": 10784, "time_per_iteration": 2.53607177734375 }, { "auxiliary_loss_clip": 0.01107036, "auxiliary_loss_mlp": 0.01033159, "balance_loss_clip": 1.01902032, "balance_loss_mlp": 1.03786016, "epoch": 0.6484292800240493, "flos": 16105972250880.0, "grad_norm": 3.2303591164688648, "language_loss": 0.73604095, "learning_rate": 1.1622517028206815e-06, "loss": 0.75744289, "num_input_tokens_seen": 232755110, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.69140625, "step": 10785, "time_per_iteration": 2.4572558403015137 }, { "auxiliary_loss_clip": 0.01106173, "auxiliary_loss_mlp": 0.0103431, "balance_loss_clip": 1.02180505, "balance_loss_mlp": 1.03835058, "epoch": 0.6484894032767172, "flos": 28840398307200.0, "grad_norm": 1.4227527420242205, "language_loss": 0.69173217, "learning_rate": 1.1618980702474071e-06, "loss": 0.71313703, "num_input_tokens_seen": 232779040, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 10786, "time_per_iteration": 2.59014630317688 }, { "auxiliary_loss_clip": 0.01105729, "auxiliary_loss_mlp": 0.0103116, "balance_loss_clip": 1.01803517, "balance_loss_mlp": 1.03571498, "epoch": 0.6485495265293852, "flos": 30227052896640.0, "grad_norm": 2.0146507846914976, "language_loss": 0.71548307, "learning_rate": 1.161544469455041e-06, "loss": 0.73685193, "num_input_tokens_seen": 232800515, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 10787, "time_per_iteration": 2.5449352264404297 }, { "auxiliary_loss_clip": 0.01111832, "auxiliary_loss_mlp": 0.01031388, "balance_loss_clip": 1.01799464, "balance_loss_mlp": 1.03964043, "epoch": 0.6486096497820532, "flos": 20082181017600.0, "grad_norm": 2.185021632379537, "language_loss": 0.84256023, "learning_rate": 1.1611909004569934e-06, "loss": 0.86399245, "num_input_tokens_seen": 232818450, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 10788, "time_per_iteration": 2.4717960357666016 }, { "auxiliary_loss_clip": 0.01110521, "auxiliary_loss_mlp": 0.01030331, "balance_loss_clip": 1.01701498, "balance_loss_mlp": 1.04014707, "epoch": 0.6486697730347212, "flos": 17129067333120.0, "grad_norm": 2.10438624249505, "language_loss": 0.77628982, "learning_rate": 1.1608373632666708e-06, "loss": 0.79769838, "num_input_tokens_seen": 232834785, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 10789, "time_per_iteration": 2.4559502601623535 }, { "auxiliary_loss_clip": 0.01105918, "auxiliary_loss_mlp": 0.01027695, "balance_loss_clip": 1.01548767, "balance_loss_mlp": 1.0378145, "epoch": 0.6487298962873892, "flos": 38911940570880.0, "grad_norm": 1.8829674855080747, "language_loss": 0.7583828, "learning_rate": 1.160483857897479e-06, "loss": 0.779719, "num_input_tokens_seen": 232856050, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 10790, "time_per_iteration": 2.661173105239868 }, { "auxiliary_loss_clip": 0.01107732, "auxiliary_loss_mlp": 0.01034181, "balance_loss_clip": 1.02242732, "balance_loss_mlp": 1.03932583, "epoch": 0.6487900195400571, "flos": 11947840076160.0, "grad_norm": 2.2856178602347024, "language_loss": 0.60075974, "learning_rate": 1.160130384362823e-06, "loss": 0.62217891, "num_input_tokens_seen": 232873945, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.68359375, "step": 10791, "time_per_iteration": 2.447105884552002 }, { "auxiliary_loss_clip": 0.01107061, "auxiliary_loss_mlp": 0.01030286, "balance_loss_clip": 1.0172323, "balance_loss_mlp": 1.03641224, "epoch": 0.6488501427927251, "flos": 22344445445760.0, "grad_norm": 1.7165224941643527, "language_loss": 0.85765755, "learning_rate": 1.1597769426761082e-06, "loss": 0.87903106, "num_input_tokens_seen": 232892160, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 10792, "time_per_iteration": 2.4879608154296875 }, { "auxiliary_loss_clip": 0.01113163, "auxiliary_loss_mlp": 0.01037695, "balance_loss_clip": 1.02382493, "balance_loss_mlp": 1.03999889, "epoch": 0.648910266045393, "flos": 22236282616320.0, "grad_norm": 2.107560742045376, "language_loss": 0.7828331, "learning_rate": 1.159423532850735e-06, "loss": 0.80434167, "num_input_tokens_seen": 232911725, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 10793, "time_per_iteration": 2.4990127086639404 }, { "auxiliary_loss_clip": 0.01112447, "auxiliary_loss_mlp": 0.01028241, "balance_loss_clip": 1.0151875, "balance_loss_mlp": 1.04027236, "epoch": 0.6489703892980611, "flos": 25301258231040.0, "grad_norm": 2.2654421539371024, "language_loss": 0.74615717, "learning_rate": 1.1590701549001055e-06, "loss": 0.76756406, "num_input_tokens_seen": 232929085, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 10794, "time_per_iteration": 2.5210764408111572 }, { "auxiliary_loss_clip": 0.01106573, "auxiliary_loss_mlp": 0.01030567, "balance_loss_clip": 1.01743579, "balance_loss_mlp": 1.03593791, "epoch": 0.649030512550729, "flos": 24571912573440.0, "grad_norm": 1.7412052113821472, "language_loss": 0.69663316, "learning_rate": 1.158716808837621e-06, "loss": 0.71800452, "num_input_tokens_seen": 232949455, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 10795, "time_per_iteration": 3.844149112701416 }, { "auxiliary_loss_clip": 0.01111517, "auxiliary_loss_mlp": 0.01034594, "balance_loss_clip": 1.02120066, "balance_loss_mlp": 1.03974485, "epoch": 0.649090635803397, "flos": 26244702904320.0, "grad_norm": 3.1596995057318447, "language_loss": 0.53420323, "learning_rate": 1.158363494676679e-06, "loss": 0.5556643, "num_input_tokens_seen": 232969445, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 10796, "time_per_iteration": 2.517526388168335 }, { "auxiliary_loss_clip": 0.01109023, "auxiliary_loss_mlp": 0.01028752, "balance_loss_clip": 1.01683688, "balance_loss_mlp": 1.03835964, "epoch": 0.6491507590560649, "flos": 24937375501440.0, "grad_norm": 1.463763242815647, "language_loss": 0.77876556, "learning_rate": 1.1580102124306775e-06, "loss": 0.80014336, "num_input_tokens_seen": 232988900, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.70703125, "step": 10797, "time_per_iteration": 2.516563892364502 }, { "auxiliary_loss_clip": 0.011073, "auxiliary_loss_mlp": 0.01027784, "balance_loss_clip": 1.01587522, "balance_loss_mlp": 1.04028153, "epoch": 0.6492108823087329, "flos": 19499781899520.0, "grad_norm": 2.212491731818239, "language_loss": 0.7080878, "learning_rate": 1.1576569621130134e-06, "loss": 0.72943866, "num_input_tokens_seen": 233005060, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.66796875, "step": 10798, "time_per_iteration": 5.3616509437561035 }, { "auxiliary_loss_clip": 0.01106216, "auxiliary_loss_mlp": 0.01026934, "balance_loss_clip": 1.01491773, "balance_loss_mlp": 1.03664827, "epoch": 0.6492710055614008, "flos": 19719303868800.0, "grad_norm": 1.8864635625179624, "language_loss": 0.770715, "learning_rate": 1.1573037437370811e-06, "loss": 0.79204655, "num_input_tokens_seen": 233023375, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 10799, "time_per_iteration": 2.4687206745147705 }, { "auxiliary_loss_clip": 0.0110894, "auxiliary_loss_mlp": 0.01034003, "balance_loss_clip": 1.02069879, "balance_loss_mlp": 1.03612101, "epoch": 0.6493311288140688, "flos": 24317018686080.0, "grad_norm": 2.3320863667146905, "language_loss": 0.71953225, "learning_rate": 1.1569505573162755e-06, "loss": 0.74096167, "num_input_tokens_seen": 233043130, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 10800, "time_per_iteration": 3.980989694595337 }, { "auxiliary_loss_clip": 0.01035631, "auxiliary_loss_mlp": 0.01002293, "balance_loss_clip": 1.00101721, "balance_loss_mlp": 1.01228857, "epoch": 0.6493912520667368, "flos": 70934635290240.0, "grad_norm": 0.7658881360065667, "language_loss": 0.60305607, "learning_rate": 1.1565974028639897e-06, "loss": 0.62343538, "num_input_tokens_seen": 233110560, "router_z_loss_clip": 0.01275635, "router_z_loss_mlp": 0.23339844, "step": 10801, "time_per_iteration": 3.2015693187713623 }, { "auxiliary_loss_clip": 0.01114214, "auxiliary_loss_mlp": 0.01036439, "balance_loss_clip": 1.02314687, "balance_loss_mlp": 1.04177523, "epoch": 0.6494513753194048, "flos": 25337779384320.0, "grad_norm": 1.7194800318256012, "language_loss": 0.78660363, "learning_rate": 1.156244280393614e-06, "loss": 0.80811012, "num_input_tokens_seen": 233130080, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 10802, "time_per_iteration": 2.492337465286255 }, { "auxiliary_loss_clip": 0.01106786, "auxiliary_loss_mlp": 0.01033834, "balance_loss_clip": 1.02076876, "balance_loss_mlp": 1.03644121, "epoch": 0.6495114985720728, "flos": 24681978823680.0, "grad_norm": 1.9163092527560692, "language_loss": 0.74679124, "learning_rate": 1.155891189918541e-06, "loss": 0.76819742, "num_input_tokens_seen": 233150235, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 10803, "time_per_iteration": 2.4886293411254883 }, { "auxiliary_loss_clip": 0.0110871, "auxiliary_loss_mlp": 0.01029056, "balance_loss_clip": 1.01674759, "balance_loss_mlp": 1.03733802, "epoch": 0.6495716218247407, "flos": 23651162317440.0, "grad_norm": 4.0700825068875846, "language_loss": 0.69804311, "learning_rate": 1.1555381314521578e-06, "loss": 0.71942079, "num_input_tokens_seen": 233166710, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 10804, "time_per_iteration": 2.540541410446167 }, { "auxiliary_loss_clip": 0.01108275, "auxiliary_loss_mlp": 0.01030794, "balance_loss_clip": 1.01764536, "balance_loss_mlp": 1.03848195, "epoch": 0.6496317450774087, "flos": 22346169298560.0, "grad_norm": 1.8728281096139379, "language_loss": 0.72574013, "learning_rate": 1.1551851050078537e-06, "loss": 0.74713081, "num_input_tokens_seen": 233185445, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 10805, "time_per_iteration": 2.4835376739501953 }, { "auxiliary_loss_clip": 0.01107389, "auxiliary_loss_mlp": 0.01029201, "balance_loss_clip": 1.01701808, "balance_loss_mlp": 1.03666186, "epoch": 0.6496918683300766, "flos": 30518647505280.0, "grad_norm": 2.337449268982092, "language_loss": 0.64676809, "learning_rate": 1.1548321105990155e-06, "loss": 0.66813397, "num_input_tokens_seen": 233205805, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.70703125, "step": 10806, "time_per_iteration": 2.543793201446533 }, { "auxiliary_loss_clip": 0.01108764, "auxiliary_loss_mlp": 0.01028529, "balance_loss_clip": 1.01545715, "balance_loss_mlp": 1.03672254, "epoch": 0.6497519915827447, "flos": 12458992567680.0, "grad_norm": 2.994436448336905, "language_loss": 0.78460693, "learning_rate": 1.1544791482390275e-06, "loss": 0.80597985, "num_input_tokens_seen": 233224215, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 10807, "time_per_iteration": 2.4462039470672607 }, { "auxiliary_loss_clip": 0.01035024, "auxiliary_loss_mlp": 0.00999264, "balance_loss_clip": 0.99800682, "balance_loss_mlp": 1.01189446, "epoch": 0.6498121148354126, "flos": 69093748287360.0, "grad_norm": 0.7855490246660245, "language_loss": 0.58895844, "learning_rate": 1.1541262179412745e-06, "loss": 0.60930139, "num_input_tokens_seen": 233294440, "router_z_loss_clip": 0.01257324, "router_z_loss_mlp": 0.23144531, "step": 10808, "time_per_iteration": 3.2602851390838623 }, { "auxiliary_loss_clip": 0.01108104, "auxiliary_loss_mlp": 0.01026347, "balance_loss_clip": 1.01391387, "balance_loss_mlp": 1.04093528, "epoch": 0.6498722380880806, "flos": 36897135914880.0, "grad_norm": 1.7625745139882487, "language_loss": 0.63562107, "learning_rate": 1.1537733197191415e-06, "loss": 0.65696555, "num_input_tokens_seen": 233316125, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.671875, "step": 10809, "time_per_iteration": 2.6186513900756836 }, { "auxiliary_loss_clip": 0.01107335, "auxiliary_loss_mlp": 0.01028849, "balance_loss_clip": 1.0167557, "balance_loss_mlp": 1.03915203, "epoch": 0.6499323613407485, "flos": 29017760688000.0, "grad_norm": 1.6502245896748922, "language_loss": 0.81725782, "learning_rate": 1.153420453586008e-06, "loss": 0.83861971, "num_input_tokens_seen": 233336140, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 10810, "time_per_iteration": 2.5347070693969727 }, { "auxiliary_loss_clip": 0.01105663, "auxiliary_loss_mlp": 0.01034105, "balance_loss_clip": 1.0227561, "balance_loss_mlp": 1.03736818, "epoch": 0.6499924845934165, "flos": 20119240874880.0, "grad_norm": 1.7749100155101065, "language_loss": 0.71648645, "learning_rate": 1.1530676195552561e-06, "loss": 0.7378841, "num_input_tokens_seen": 233356095, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6796875, "step": 10811, "time_per_iteration": 2.4957919120788574 }, { "auxiliary_loss_clip": 0.01106764, "auxiliary_loss_mlp": 0.01028076, "balance_loss_clip": 1.01663184, "balance_loss_mlp": 1.04049218, "epoch": 0.6500526078460844, "flos": 24421338760320.0, "grad_norm": 1.6143276858585265, "language_loss": 0.77959734, "learning_rate": 1.1527148176402649e-06, "loss": 0.80094576, "num_input_tokens_seen": 233376830, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6640625, "step": 10812, "time_per_iteration": 2.513507604598999 }, { "auxiliary_loss_clip": 0.01108881, "auxiliary_loss_mlp": 0.01030674, "balance_loss_clip": 1.01793098, "balance_loss_mlp": 1.03820395, "epoch": 0.6501127310987524, "flos": 23331019374720.0, "grad_norm": 1.799419773145345, "language_loss": 0.85316944, "learning_rate": 1.152362047854413e-06, "loss": 0.87456501, "num_input_tokens_seen": 233395275, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 10813, "time_per_iteration": 2.4940686225891113 }, { "auxiliary_loss_clip": 0.01106411, "auxiliary_loss_mlp": 0.01029948, "balance_loss_clip": 1.01775861, "balance_loss_mlp": 1.03768015, "epoch": 0.6501728543514204, "flos": 18697824898560.0, "grad_norm": 1.665440091751803, "language_loss": 0.79732227, "learning_rate": 1.1520093102110764e-06, "loss": 0.81868583, "num_input_tokens_seen": 233413345, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 10814, "time_per_iteration": 2.451429605484009 }, { "auxiliary_loss_clip": 0.01110729, "auxiliary_loss_mlp": 0.01033413, "balance_loss_clip": 1.02058041, "balance_loss_mlp": 1.03943849, "epoch": 0.6502329776040884, "flos": 44199858199680.0, "grad_norm": 1.608362071701497, "language_loss": 0.65662628, "learning_rate": 1.1516566047236328e-06, "loss": 0.67806768, "num_input_tokens_seen": 233436105, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 10815, "time_per_iteration": 2.6737194061279297 }, { "auxiliary_loss_clip": 0.01112851, "auxiliary_loss_mlp": 0.01034201, "balance_loss_clip": 1.01961529, "balance_loss_mlp": 1.03947783, "epoch": 0.6502931008567564, "flos": 14574741419520.0, "grad_norm": 9.057363619596853, "language_loss": 0.75456947, "learning_rate": 1.1513039314054546e-06, "loss": 0.77603996, "num_input_tokens_seen": 233452320, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.734375, "step": 10816, "time_per_iteration": 2.4371259212493896 }, { "auxiliary_loss_clip": 0.01109133, "auxiliary_loss_mlp": 0.01031782, "balance_loss_clip": 1.01930046, "balance_loss_mlp": 1.04077125, "epoch": 0.6503532241094243, "flos": 21395003201280.0, "grad_norm": 1.727135532749028, "language_loss": 0.73189735, "learning_rate": 1.1509512902699174e-06, "loss": 0.75330651, "num_input_tokens_seen": 233469920, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 10817, "time_per_iteration": 2.5143096446990967 }, { "auxiliary_loss_clip": 0.01107795, "auxiliary_loss_mlp": 0.01036973, "balance_loss_clip": 1.02353263, "balance_loss_mlp": 1.03724647, "epoch": 0.6504133473620923, "flos": 74740840986240.0, "grad_norm": 1.5807069995200123, "language_loss": 0.72026145, "learning_rate": 1.1505986813303916e-06, "loss": 0.74170917, "num_input_tokens_seen": 233499780, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 10818, "time_per_iteration": 2.925630807876587 }, { "auxiliary_loss_clip": 0.0111131, "auxiliary_loss_mlp": 0.0103284, "balance_loss_clip": 1.01969671, "balance_loss_mlp": 1.03926921, "epoch": 0.6504734706147602, "flos": 19713270384000.0, "grad_norm": 2.0271330102191887, "language_loss": 0.65389103, "learning_rate": 1.150246104600249e-06, "loss": 0.67533255, "num_input_tokens_seen": 233518235, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 10819, "time_per_iteration": 2.4760854244232178 }, { "auxiliary_loss_clip": 0.01109423, "auxiliary_loss_mlp": 0.01035426, "balance_loss_clip": 1.02241397, "balance_loss_mlp": 1.03872621, "epoch": 0.6505335938674283, "flos": 25556870390400.0, "grad_norm": 2.163915270305164, "language_loss": 0.83845127, "learning_rate": 1.14989356009286e-06, "loss": 0.85989976, "num_input_tokens_seen": 233535215, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 10820, "time_per_iteration": 2.514347553253174 }, { "auxiliary_loss_clip": 0.01111633, "auxiliary_loss_mlp": 0.01030329, "balance_loss_clip": 1.01679289, "balance_loss_mlp": 1.03926861, "epoch": 0.6505937171200962, "flos": 17821424960640.0, "grad_norm": 2.1271806194874547, "language_loss": 0.7790966, "learning_rate": 1.1495410478215914e-06, "loss": 0.80051619, "num_input_tokens_seen": 233552775, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 10821, "time_per_iteration": 2.449648380279541 }, { "auxiliary_loss_clip": 0.01105374, "auxiliary_loss_mlp": 0.0103097, "balance_loss_clip": 1.0200386, "balance_loss_mlp": 1.03874767, "epoch": 0.6506538403727642, "flos": 20668135582080.0, "grad_norm": 1.4289820849843977, "language_loss": 0.80155343, "learning_rate": 1.1491885677998126e-06, "loss": 0.82291681, "num_input_tokens_seen": 233572080, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6640625, "step": 10822, "time_per_iteration": 2.4741694927215576 }, { "auxiliary_loss_clip": 0.01106308, "auxiliary_loss_mlp": 0.01029894, "balance_loss_clip": 1.01759148, "balance_loss_mlp": 1.03749681, "epoch": 0.6507139636254321, "flos": 11721422695680.0, "grad_norm": 1.9114533449446458, "language_loss": 0.86970192, "learning_rate": 1.1488361200408883e-06, "loss": 0.89106393, "num_input_tokens_seen": 233589155, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 10823, "time_per_iteration": 2.4827497005462646 }, { "auxiliary_loss_clip": 0.01108495, "auxiliary_loss_mlp": 0.0103436, "balance_loss_clip": 1.02163422, "balance_loss_mlp": 1.0377425, "epoch": 0.6507740868781001, "flos": 26761745226240.0, "grad_norm": 1.8057978081003, "language_loss": 0.66437984, "learning_rate": 1.148483704558183e-06, "loss": 0.68580836, "num_input_tokens_seen": 233608180, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 10824, "time_per_iteration": 2.514775037765503 }, { "auxiliary_loss_clip": 0.01109035, "auxiliary_loss_mlp": 0.01030624, "balance_loss_clip": 1.01789236, "balance_loss_mlp": 1.03727782, "epoch": 0.650834210130768, "flos": 16471722487680.0, "grad_norm": 3.1837270122983514, "language_loss": 0.87370187, "learning_rate": 1.1481313213650607e-06, "loss": 0.89509845, "num_input_tokens_seen": 233625750, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 10825, "time_per_iteration": 2.463925838470459 }, { "auxiliary_loss_clip": 0.01109501, "auxiliary_loss_mlp": 0.01028788, "balance_loss_clip": 1.01519203, "balance_loss_mlp": 1.03692472, "epoch": 0.650894333383436, "flos": 17128672283520.0, "grad_norm": 2.150268254556653, "language_loss": 0.73352706, "learning_rate": 1.147778970474885e-06, "loss": 0.75490993, "num_input_tokens_seen": 233644235, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 10826, "time_per_iteration": 2.452253580093384 }, { "auxiliary_loss_clip": 0.01108198, "auxiliary_loss_mlp": 0.01031953, "balance_loss_clip": 1.02011013, "balance_loss_mlp": 1.0391283, "epoch": 0.650954456636104, "flos": 18734238311040.0, "grad_norm": 2.183720153819671, "language_loss": 0.68814504, "learning_rate": 1.1474266519010157e-06, "loss": 0.70954657, "num_input_tokens_seen": 233662845, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69140625, "step": 10827, "time_per_iteration": 2.4712235927581787 }, { "auxiliary_loss_clip": 0.01108171, "auxiliary_loss_mlp": 0.0103234, "balance_loss_clip": 1.02055633, "balance_loss_mlp": 1.03721523, "epoch": 0.651014579888772, "flos": 24528244613760.0, "grad_norm": 2.7678206096785543, "language_loss": 0.76969695, "learning_rate": 1.1470743656568136e-06, "loss": 0.79110205, "num_input_tokens_seen": 233681990, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.7109375, "step": 10828, "time_per_iteration": 2.480942487716675 }, { "auxiliary_loss_clip": 0.01108655, "auxiliary_loss_mlp": 0.01033322, "balance_loss_clip": 1.02131748, "balance_loss_mlp": 1.0401566, "epoch": 0.65107470314144, "flos": 24061083304320.0, "grad_norm": 1.8061988266669866, "language_loss": 0.89042127, "learning_rate": 1.1467221117556362e-06, "loss": 0.91184098, "num_input_tokens_seen": 233698930, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 10829, "time_per_iteration": 2.5020177364349365 }, { "auxiliary_loss_clip": 0.0103397, "auxiliary_loss_mlp": 0.01002769, "balance_loss_clip": 1.00161302, "balance_loss_mlp": 1.01100373, "epoch": 0.6511348263941079, "flos": 72480734352000.0, "grad_norm": 0.8652033904078616, "language_loss": 0.55351645, "learning_rate": 1.1463698902108428e-06, "loss": 0.57388383, "num_input_tokens_seen": 233769825, "router_z_loss_clip": 0.01153564, "router_z_loss_mlp": 0.23046875, "step": 10830, "time_per_iteration": 3.24208927154541 }, { "auxiliary_loss_clip": 0.01108942, "auxiliary_loss_mlp": 0.01029225, "balance_loss_clip": 1.01624262, "balance_loss_mlp": 1.03686571, "epoch": 0.6511949496467759, "flos": 23367684182400.0, "grad_norm": 1.9503042451307278, "language_loss": 0.74687588, "learning_rate": 1.1460177010357878e-06, "loss": 0.76825762, "num_input_tokens_seen": 233787095, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 10831, "time_per_iteration": 2.5191776752471924 }, { "auxiliary_loss_clip": 0.01034282, "auxiliary_loss_mlp": 0.01006894, "balance_loss_clip": 1.00567842, "balance_loss_mlp": 1.01129556, "epoch": 0.6512550728994438, "flos": 67333191073920.0, "grad_norm": 0.7064237944724023, "language_loss": 0.51048493, "learning_rate": 1.145665544243828e-06, "loss": 0.53089666, "num_input_tokens_seen": 233853050, "router_z_loss_clip": 0.012146, "router_z_loss_mlp": 0.23046875, "step": 10832, "time_per_iteration": 3.1691601276397705 }, { "auxiliary_loss_clip": 0.01108643, "auxiliary_loss_mlp": 0.01034956, "balance_loss_clip": 1.02230191, "balance_loss_mlp": 1.03646398, "epoch": 0.6513151961521119, "flos": 21141689512320.0, "grad_norm": 3.6822805335296263, "language_loss": 0.83899146, "learning_rate": 1.145313419848316e-06, "loss": 0.86042744, "num_input_tokens_seen": 233871385, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 10833, "time_per_iteration": 2.4681127071380615 }, { "auxiliary_loss_clip": 0.01109115, "auxiliary_loss_mlp": 0.01031701, "balance_loss_clip": 1.01960087, "balance_loss_mlp": 1.03968, "epoch": 0.6513753194047798, "flos": 15158828476800.0, "grad_norm": 2.3756054491097753, "language_loss": 0.83542085, "learning_rate": 1.1449613278626049e-06, "loss": 0.85682893, "num_input_tokens_seen": 233888175, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 10834, "time_per_iteration": 2.441984176635742 }, { "auxiliary_loss_clip": 0.01110203, "auxiliary_loss_mlp": 0.01032559, "balance_loss_clip": 1.02019072, "balance_loss_mlp": 1.03953695, "epoch": 0.6514354426574478, "flos": 30226621933440.0, "grad_norm": 1.6380459918504975, "language_loss": 0.77212679, "learning_rate": 1.1446092683000455e-06, "loss": 0.79355443, "num_input_tokens_seen": 233911470, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 10835, "time_per_iteration": 2.5519213676452637 }, { "auxiliary_loss_clip": 0.01110028, "auxiliary_loss_mlp": 0.01033927, "balance_loss_clip": 1.02172589, "balance_loss_mlp": 1.04024792, "epoch": 0.6514955659101157, "flos": 24205587719040.0, "grad_norm": 3.366002831548709, "language_loss": 0.77193862, "learning_rate": 1.1442572411739882e-06, "loss": 0.79337817, "num_input_tokens_seen": 233932135, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 10836, "time_per_iteration": 2.5013158321380615 }, { "auxiliary_loss_clip": 0.0110886, "auxiliary_loss_mlp": 0.01035967, "balance_loss_clip": 1.02385545, "balance_loss_mlp": 1.0386256, "epoch": 0.6515556891627837, "flos": 12377761960320.0, "grad_norm": 2.0745183411910837, "language_loss": 0.82322794, "learning_rate": 1.143905246497783e-06, "loss": 0.8446762, "num_input_tokens_seen": 233947880, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 10837, "time_per_iteration": 3.8153164386749268 }, { "auxiliary_loss_clip": 0.01106483, "auxiliary_loss_mlp": 0.01031541, "balance_loss_clip": 1.01826108, "balance_loss_mlp": 1.03902555, "epoch": 0.6516158124154516, "flos": 49601217957120.0, "grad_norm": 2.025015165834269, "language_loss": 0.58275878, "learning_rate": 1.1435532842847758e-06, "loss": 0.60413903, "num_input_tokens_seen": 233971475, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.671875, "step": 10838, "time_per_iteration": 2.7360892295837402 }, { "auxiliary_loss_clip": 0.01033199, "auxiliary_loss_mlp": 0.0099902, "balance_loss_clip": 0.99782753, "balance_loss_mlp": 1.01022863, "epoch": 0.6516759356681197, "flos": 59702748076800.0, "grad_norm": 0.7977294592334148, "language_loss": 0.60825384, "learning_rate": 1.1432013545483147e-06, "loss": 0.62857604, "num_input_tokens_seen": 234030690, "router_z_loss_clip": 0.01190186, "router_z_loss_mlp": 0.23046875, "step": 10839, "time_per_iteration": 3.1402430534362793 }, { "auxiliary_loss_clip": 0.01106728, "auxiliary_loss_mlp": 0.01029233, "balance_loss_clip": 1.01775289, "balance_loss_mlp": 1.03814375, "epoch": 0.6517360589207876, "flos": 37450807130880.0, "grad_norm": 1.9425979190836746, "language_loss": 0.67864478, "learning_rate": 1.1428494573017439e-06, "loss": 0.7000044, "num_input_tokens_seen": 234052470, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6875, "step": 10840, "time_per_iteration": 5.420520782470703 }, { "auxiliary_loss_clip": 0.01105777, "auxiliary_loss_mlp": 0.01032074, "balance_loss_clip": 1.0208509, "balance_loss_mlp": 1.03630102, "epoch": 0.6517961821734556, "flos": 25374911068800.0, "grad_norm": 8.750035263299907, "language_loss": 0.7369231, "learning_rate": 1.1424975925584071e-06, "loss": 0.75830156, "num_input_tokens_seen": 234071495, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6953125, "step": 10841, "time_per_iteration": 4.063343286514282 }, { "auxiliary_loss_clip": 0.01108299, "auxiliary_loss_mlp": 0.01034623, "balance_loss_clip": 1.02189112, "balance_loss_mlp": 1.03751493, "epoch": 0.6518563054261236, "flos": 28766996864640.0, "grad_norm": 1.5606840677245852, "language_loss": 0.62682486, "learning_rate": 1.142145760331648e-06, "loss": 0.64825404, "num_input_tokens_seen": 234092325, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 10842, "time_per_iteration": 2.584127902984619 }, { "auxiliary_loss_clip": 0.01033472, "auxiliary_loss_mlp": 0.01003816, "balance_loss_clip": 1.00264728, "balance_loss_mlp": 1.01041174, "epoch": 0.6519164286787915, "flos": 68924750797440.0, "grad_norm": 0.8429529663028874, "language_loss": 0.56078821, "learning_rate": 1.141793960634807e-06, "loss": 0.58116108, "num_input_tokens_seen": 234148005, "router_z_loss_clip": 0.01165771, "router_z_loss_mlp": 0.23046875, "step": 10843, "time_per_iteration": 2.934452772140503 }, { "auxiliary_loss_clip": 0.01110673, "auxiliary_loss_mlp": 0.01034123, "balance_loss_clip": 1.02062845, "balance_loss_mlp": 1.03807664, "epoch": 0.6519765519314595, "flos": 20441933683200.0, "grad_norm": 1.7956121230734146, "language_loss": 0.82760072, "learning_rate": 1.1414421934812253e-06, "loss": 0.84904867, "num_input_tokens_seen": 234164280, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 10844, "time_per_iteration": 2.4826202392578125 }, { "auxiliary_loss_clip": 0.01107571, "auxiliary_loss_mlp": 0.01028377, "balance_loss_clip": 1.01540673, "balance_loss_mlp": 1.03740072, "epoch": 0.6520366751841274, "flos": 28402970480640.0, "grad_norm": 1.832847138031111, "language_loss": 0.59910333, "learning_rate": 1.1410904588842421e-06, "loss": 0.62046283, "num_input_tokens_seen": 234185090, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 10845, "time_per_iteration": 2.5572547912597656 }, { "auxiliary_loss_clip": 0.01107076, "auxiliary_loss_mlp": 0.01033329, "balance_loss_clip": 1.02100253, "balance_loss_mlp": 1.03741229, "epoch": 0.6520967984367955, "flos": 22273414300800.0, "grad_norm": 1.8632085147057122, "language_loss": 0.79599261, "learning_rate": 1.140738756857194e-06, "loss": 0.8173967, "num_input_tokens_seen": 234204050, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 10846, "time_per_iteration": 2.5444259643554688 }, { "auxiliary_loss_clip": 0.01034099, "auxiliary_loss_mlp": 0.01003475, "balance_loss_clip": 1.00224102, "balance_loss_mlp": 1.01093328, "epoch": 0.6521569216894634, "flos": 68917140092160.0, "grad_norm": 0.7114877508378367, "language_loss": 0.60169256, "learning_rate": 1.1403870874134192e-06, "loss": 0.62206829, "num_input_tokens_seen": 234269790, "router_z_loss_clip": 0.0123291, "router_z_loss_mlp": 0.23242188, "step": 10847, "time_per_iteration": 3.242751121520996 }, { "auxiliary_loss_clip": 0.01111742, "auxiliary_loss_mlp": 0.01037527, "balance_loss_clip": 1.02520108, "balance_loss_mlp": 1.03953421, "epoch": 0.6522170449421314, "flos": 29130520458240.0, "grad_norm": 1.8538522134201552, "language_loss": 0.80750656, "learning_rate": 1.1400354505662514e-06, "loss": 0.82899928, "num_input_tokens_seen": 234290135, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.72265625, "step": 10848, "time_per_iteration": 2.5909860134124756 }, { "auxiliary_loss_clip": 0.01105483, "auxiliary_loss_mlp": 0.01032477, "balance_loss_clip": 1.02066922, "balance_loss_mlp": 1.03699398, "epoch": 0.6522771681947993, "flos": 26651930371200.0, "grad_norm": 2.1807741393553304, "language_loss": 0.74687922, "learning_rate": 1.1396838463290263e-06, "loss": 0.76825881, "num_input_tokens_seen": 234309535, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 10849, "time_per_iteration": 2.5675833225250244 }, { "auxiliary_loss_clip": 0.01104612, "auxiliary_loss_mlp": 0.01031183, "balance_loss_clip": 1.01961362, "balance_loss_mlp": 1.03742659, "epoch": 0.6523372914474673, "flos": 25739763465600.0, "grad_norm": 1.9815440235887667, "language_loss": 0.68244898, "learning_rate": 1.1393322747150752e-06, "loss": 0.703807, "num_input_tokens_seen": 234328755, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 10850, "time_per_iteration": 2.559821128845215 }, { "auxiliary_loss_clip": 0.01105842, "auxiliary_loss_mlp": 0.01027748, "balance_loss_clip": 1.01554716, "balance_loss_mlp": 1.03860712, "epoch": 0.6523974147001352, "flos": 24827345164800.0, "grad_norm": 1.830520786857338, "language_loss": 0.6690526, "learning_rate": 1.1389807357377313e-06, "loss": 0.6903885, "num_input_tokens_seen": 234348655, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 10851, "time_per_iteration": 2.5585734844207764 }, { "auxiliary_loss_clip": 0.01109983, "auxiliary_loss_mlp": 0.01030156, "balance_loss_clip": 1.01802659, "balance_loss_mlp": 1.03844893, "epoch": 0.6524575379528033, "flos": 26317637470080.0, "grad_norm": 2.452649186866783, "language_loss": 0.74032921, "learning_rate": 1.1386292294103235e-06, "loss": 0.76173055, "num_input_tokens_seen": 234367445, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.71484375, "step": 10852, "time_per_iteration": 2.5640361309051514 }, { "auxiliary_loss_clip": 0.01110893, "auxiliary_loss_mlp": 0.01026581, "balance_loss_clip": 1.01328945, "balance_loss_mlp": 1.03877115, "epoch": 0.6525176612054712, "flos": 19494143464320.0, "grad_norm": 2.1794285540743323, "language_loss": 0.66904336, "learning_rate": 1.1382777557461812e-06, "loss": 0.69041812, "num_input_tokens_seen": 234384825, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 10853, "time_per_iteration": 2.4792747497558594 }, { "auxiliary_loss_clip": 0.01033951, "auxiliary_loss_mlp": 0.01001091, "balance_loss_clip": 0.99986929, "balance_loss_mlp": 1.01073027, "epoch": 0.6525777844581392, "flos": 71706894721920.0, "grad_norm": 0.7222997724411392, "language_loss": 0.63044065, "learning_rate": 1.137926314758634e-06, "loss": 0.65079105, "num_input_tokens_seen": 234450630, "router_z_loss_clip": 0.01220703, "router_z_loss_mlp": 0.23242188, "step": 10854, "time_per_iteration": 3.2331767082214355 }, { "auxiliary_loss_clip": 0.01106887, "auxiliary_loss_mlp": 0.01033494, "balance_loss_clip": 1.0205245, "balance_loss_mlp": 1.03741074, "epoch": 0.6526379077108072, "flos": 26653115520000.0, "grad_norm": 1.6981315622831266, "language_loss": 0.77771229, "learning_rate": 1.1375749064610072e-06, "loss": 0.79911613, "num_input_tokens_seen": 234473505, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 10855, "time_per_iteration": 2.599789619445801 }, { "auxiliary_loss_clip": 0.01103876, "auxiliary_loss_mlp": 0.01027245, "balance_loss_clip": 1.01519299, "balance_loss_mlp": 1.03629184, "epoch": 0.6526980309634751, "flos": 22820369673600.0, "grad_norm": 1.9389841317844347, "language_loss": 0.79197043, "learning_rate": 1.1372235308666256e-06, "loss": 0.8132816, "num_input_tokens_seen": 234492485, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.671875, "step": 10856, "time_per_iteration": 2.564028263092041 }, { "auxiliary_loss_clip": 0.01106854, "auxiliary_loss_mlp": 0.01029019, "balance_loss_clip": 1.01556015, "balance_loss_mlp": 1.03749228, "epoch": 0.6527581542161431, "flos": 28365048696960.0, "grad_norm": 2.051499283640502, "language_loss": 0.73560202, "learning_rate": 1.136872187988815e-06, "loss": 0.75696069, "num_input_tokens_seen": 234512645, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6953125, "step": 10857, "time_per_iteration": 2.5529654026031494 }, { "auxiliary_loss_clip": 0.011075, "auxiliary_loss_mlp": 0.01028311, "balance_loss_clip": 1.01703334, "balance_loss_mlp": 1.03800905, "epoch": 0.652818277468811, "flos": 18369206346240.0, "grad_norm": 2.3557217876756837, "language_loss": 0.63153172, "learning_rate": 1.1365208778408965e-06, "loss": 0.65288985, "num_input_tokens_seen": 234529310, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6953125, "step": 10858, "time_per_iteration": 2.47424578666687 }, { "auxiliary_loss_clip": 0.01102742, "auxiliary_loss_mlp": 0.01033483, "balance_loss_clip": 1.02143133, "balance_loss_mlp": 1.03497243, "epoch": 0.6528784007214791, "flos": 18036170421120.0, "grad_norm": 4.50816139970141, "language_loss": 0.78538918, "learning_rate": 1.1361696004361939e-06, "loss": 0.80675143, "num_input_tokens_seen": 234546685, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 10859, "time_per_iteration": 2.4799752235412598 }, { "auxiliary_loss_clip": 0.01109759, "auxiliary_loss_mlp": 0.01027112, "balance_loss_clip": 1.01436865, "balance_loss_mlp": 1.03810287, "epoch": 0.652938523974147, "flos": 22382008093440.0, "grad_norm": 1.5404866822288297, "language_loss": 0.67793387, "learning_rate": 1.1358183557880256e-06, "loss": 0.69930255, "num_input_tokens_seen": 234566255, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 10860, "time_per_iteration": 2.486701011657715 }, { "auxiliary_loss_clip": 0.01112511, "auxiliary_loss_mlp": 0.01028779, "balance_loss_clip": 1.01626778, "balance_loss_mlp": 1.03965282, "epoch": 0.652998647226815, "flos": 16764035368320.0, "grad_norm": 2.1572359263906473, "language_loss": 0.66304749, "learning_rate": 1.135467143909712e-06, "loss": 0.6844604, "num_input_tokens_seen": 234585405, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.73046875, "step": 10861, "time_per_iteration": 2.497873306274414 }, { "auxiliary_loss_clip": 0.01109357, "auxiliary_loss_mlp": 0.01035716, "balance_loss_clip": 1.0216074, "balance_loss_mlp": 1.03829026, "epoch": 0.6530587704794829, "flos": 35772522019200.0, "grad_norm": 1.8035318249460983, "language_loss": 0.65509737, "learning_rate": 1.135115964814572e-06, "loss": 0.67654812, "num_input_tokens_seen": 234608095, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7109375, "step": 10862, "time_per_iteration": 2.641155958175659 }, { "auxiliary_loss_clip": 0.01106178, "auxiliary_loss_mlp": 0.01034375, "balance_loss_clip": 1.02240622, "balance_loss_mlp": 1.03792572, "epoch": 0.6531188937321509, "flos": 19316134638720.0, "grad_norm": 2.0177593969074223, "language_loss": 0.77266723, "learning_rate": 1.13476481851592e-06, "loss": 0.79407275, "num_input_tokens_seen": 234627335, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.68359375, "step": 10863, "time_per_iteration": 2.494912624359131 }, { "auxiliary_loss_clip": 0.01106301, "auxiliary_loss_mlp": 0.01030867, "balance_loss_clip": 1.01923203, "balance_loss_mlp": 1.0372076, "epoch": 0.6531790169848188, "flos": 22893771116160.0, "grad_norm": 1.6763092299432478, "language_loss": 0.75042838, "learning_rate": 1.1344137050270739e-06, "loss": 0.77180004, "num_input_tokens_seen": 234646540, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.69140625, "step": 10864, "time_per_iteration": 2.5081727504730225 }, { "auxiliary_loss_clip": 0.01105442, "auxiliary_loss_mlp": 0.01037818, "balance_loss_clip": 1.02586722, "balance_loss_mlp": 1.03671718, "epoch": 0.6532391402374869, "flos": 29563530912000.0, "grad_norm": 2.012265563417881, "language_loss": 0.86176401, "learning_rate": 1.1340626243613458e-06, "loss": 0.88319659, "num_input_tokens_seen": 234665470, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 10865, "time_per_iteration": 2.588056802749634 }, { "auxiliary_loss_clip": 0.01112298, "auxiliary_loss_mlp": 0.01037855, "balance_loss_clip": 1.02535534, "balance_loss_mlp": 1.03936815, "epoch": 0.6532992634901548, "flos": 23105463920640.0, "grad_norm": 1.6658079023508177, "language_loss": 0.81453264, "learning_rate": 1.133711576532051e-06, "loss": 0.83603412, "num_input_tokens_seen": 234683955, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 10866, "time_per_iteration": 2.5610415935516357 }, { "auxiliary_loss_clip": 0.01106683, "auxiliary_loss_mlp": 0.01031405, "balance_loss_clip": 1.01956177, "balance_loss_mlp": 1.03870511, "epoch": 0.6533593867428228, "flos": 26067340523520.0, "grad_norm": 1.6091908543791453, "language_loss": 0.82305467, "learning_rate": 1.1333605615524995e-06, "loss": 0.84443557, "num_input_tokens_seen": 234704595, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6796875, "step": 10867, "time_per_iteration": 2.546996593475342 }, { "auxiliary_loss_clip": 0.01107036, "auxiliary_loss_mlp": 0.01028729, "balance_loss_clip": 1.016868, "balance_loss_mlp": 1.03664756, "epoch": 0.6534195099954908, "flos": 21212469262080.0, "grad_norm": 2.0846178026975166, "language_loss": 0.81009912, "learning_rate": 1.1330095794360016e-06, "loss": 0.83145678, "num_input_tokens_seen": 234724090, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.703125, "step": 10868, "time_per_iteration": 2.526038646697998 }, { "auxiliary_loss_clip": 0.01110335, "auxiliary_loss_mlp": 0.01032771, "balance_loss_clip": 1.01929986, "balance_loss_mlp": 1.03888226, "epoch": 0.6534796332481587, "flos": 19646584784640.0, "grad_norm": 1.8199215261281256, "language_loss": 0.79661834, "learning_rate": 1.1326586301958675e-06, "loss": 0.81804937, "num_input_tokens_seen": 234742560, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 10869, "time_per_iteration": 2.4861555099487305 }, { "auxiliary_loss_clip": 0.01110975, "auxiliary_loss_mlp": 0.0103739, "balance_loss_clip": 1.0248785, "balance_loss_mlp": 1.04037595, "epoch": 0.6535397565008267, "flos": 24022479162240.0, "grad_norm": 2.0587748631002762, "language_loss": 0.72267383, "learning_rate": 1.1323077138454063e-06, "loss": 0.74415749, "num_input_tokens_seen": 234762315, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 10870, "time_per_iteration": 2.518962860107422 }, { "auxiliary_loss_clip": 0.01108944, "auxiliary_loss_mlp": 0.01038231, "balance_loss_clip": 1.02570224, "balance_loss_mlp": 1.03973234, "epoch": 0.6535998797534947, "flos": 24602759377920.0, "grad_norm": 4.426203570656342, "language_loss": 0.74868155, "learning_rate": 1.1319568303979221e-06, "loss": 0.77015328, "num_input_tokens_seen": 234781300, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 10871, "time_per_iteration": 2.517719268798828 }, { "auxiliary_loss_clip": 0.01105808, "auxiliary_loss_mlp": 0.01031913, "balance_loss_clip": 1.02029634, "balance_loss_mlp": 1.03915858, "epoch": 0.6536600030061627, "flos": 23364164649600.0, "grad_norm": 1.757837214199013, "language_loss": 0.55903757, "learning_rate": 1.1316059798667227e-06, "loss": 0.58041471, "num_input_tokens_seen": 234801040, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6640625, "step": 10872, "time_per_iteration": 2.4947800636291504 }, { "auxiliary_loss_clip": 0.01106072, "auxiliary_loss_mlp": 0.01034783, "balance_loss_clip": 1.02273655, "balance_loss_mlp": 1.03844285, "epoch": 0.6537201262588306, "flos": 23878477537920.0, "grad_norm": 2.6311828827936607, "language_loss": 0.75160813, "learning_rate": 1.1312551622651112e-06, "loss": 0.77301669, "num_input_tokens_seen": 234821415, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.67578125, "step": 10873, "time_per_iteration": 2.5110702514648438 }, { "auxiliary_loss_clip": 0.01108137, "auxiliary_loss_mlp": 0.01035102, "balance_loss_clip": 1.0224781, "balance_loss_mlp": 1.03811443, "epoch": 0.6537802495114986, "flos": 24354760901760.0, "grad_norm": 1.780438522980636, "language_loss": 0.75536603, "learning_rate": 1.1309043776063917e-06, "loss": 0.77679837, "num_input_tokens_seen": 234843795, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 10874, "time_per_iteration": 2.53537917137146 }, { "auxiliary_loss_clip": 0.01108942, "auxiliary_loss_mlp": 0.01034428, "balance_loss_clip": 1.02180934, "balance_loss_mlp": 1.03933322, "epoch": 0.6538403727641665, "flos": 27996892248960.0, "grad_norm": 1.8295738965747759, "language_loss": 0.81593513, "learning_rate": 1.1305536259038642e-06, "loss": 0.83736885, "num_input_tokens_seen": 234862350, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 10875, "time_per_iteration": 2.532822847366333 }, { "auxiliary_loss_clip": 0.01106969, "auxiliary_loss_mlp": 0.01039035, "balance_loss_clip": 1.02705491, "balance_loss_mlp": 1.03652835, "epoch": 0.6539004960168345, "flos": 27563594486400.0, "grad_norm": 3.193427392529548, "language_loss": 0.70077121, "learning_rate": 1.1302029071708314e-06, "loss": 0.72223121, "num_input_tokens_seen": 234881790, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 10876, "time_per_iteration": 2.5192670822143555 }, { "auxiliary_loss_clip": 0.01106406, "auxiliary_loss_mlp": 0.01033386, "balance_loss_clip": 1.02076149, "balance_loss_mlp": 1.03717291, "epoch": 0.6539606192695024, "flos": 14530067879040.0, "grad_norm": 2.1348682954237517, "language_loss": 0.79612041, "learning_rate": 1.1298522214205908e-06, "loss": 0.81751835, "num_input_tokens_seen": 234897775, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 10877, "time_per_iteration": 2.4695682525634766 }, { "auxiliary_loss_clip": 0.01109948, "auxiliary_loss_mlp": 0.01026314, "balance_loss_clip": 1.01401198, "balance_loss_mlp": 1.03955197, "epoch": 0.6540207425221705, "flos": 21616356764160.0, "grad_norm": 2.466805196959218, "language_loss": 0.79735821, "learning_rate": 1.1295015686664408e-06, "loss": 0.81872088, "num_input_tokens_seen": 234918395, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 10878, "time_per_iteration": 3.941572427749634 }, { "auxiliary_loss_clip": 0.01107675, "auxiliary_loss_mlp": 0.01031752, "balance_loss_clip": 1.01894283, "balance_loss_mlp": 1.03773141, "epoch": 0.6540808657748384, "flos": 17668983640320.0, "grad_norm": 2.918533949138185, "language_loss": 0.84580779, "learning_rate": 1.1291509489216797e-06, "loss": 0.8672021, "num_input_tokens_seen": 234936260, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 10879, "time_per_iteration": 2.4479870796203613 }, { "auxiliary_loss_clip": 0.01110505, "auxiliary_loss_mlp": 0.01033179, "balance_loss_clip": 1.01992249, "balance_loss_mlp": 1.03811014, "epoch": 0.6541409890275064, "flos": 14538292093440.0, "grad_norm": 2.40025470475374, "language_loss": 0.71710944, "learning_rate": 1.128800362199601e-06, "loss": 0.73854625, "num_input_tokens_seen": 234952110, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 10880, "time_per_iteration": 2.52485990524292 }, { "auxiliary_loss_clip": 0.01106573, "auxiliary_loss_mlp": 0.01031723, "balance_loss_clip": 1.01885486, "balance_loss_mlp": 1.03759992, "epoch": 0.6542011122801744, "flos": 17165301177600.0, "grad_norm": 1.9765375180204972, "language_loss": 0.84349585, "learning_rate": 1.1284498085135005e-06, "loss": 0.86487877, "num_input_tokens_seen": 234970810, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 10881, "time_per_iteration": 4.0232744216918945 }, { "auxiliary_loss_clip": 0.01111237, "auxiliary_loss_mlp": 0.01033176, "balance_loss_clip": 1.01924086, "balance_loss_mlp": 1.03919184, "epoch": 0.6542612355328423, "flos": 18186600579840.0, "grad_norm": 2.0918787640803913, "language_loss": 0.7809689, "learning_rate": 1.1280992878766699e-06, "loss": 0.80241305, "num_input_tokens_seen": 234989565, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.72265625, "step": 10882, "time_per_iteration": 3.904881000518799 }, { "auxiliary_loss_clip": 0.0111236, "auxiliary_loss_mlp": 0.01028771, "balance_loss_clip": 1.01536584, "balance_loss_mlp": 1.04088008, "epoch": 0.6543213587855103, "flos": 19792453916160.0, "grad_norm": 1.7176130464109816, "language_loss": 0.81961048, "learning_rate": 1.1277488003024024e-06, "loss": 0.84102172, "num_input_tokens_seen": 235007955, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 10883, "time_per_iteration": 3.9522042274475098 }, { "auxiliary_loss_clip": 0.01113853, "auxiliary_loss_mlp": 0.01034464, "balance_loss_clip": 1.02134538, "balance_loss_mlp": 1.04190314, "epoch": 0.6543814820381783, "flos": 21105096531840.0, "grad_norm": 3.3003124173231586, "language_loss": 0.85349131, "learning_rate": 1.127398345803988e-06, "loss": 0.87497449, "num_input_tokens_seen": 235024860, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 10884, "time_per_iteration": 2.505702495574951 }, { "auxiliary_loss_clip": 0.01110232, "auxiliary_loss_mlp": 0.01032336, "balance_loss_clip": 1.01952696, "balance_loss_mlp": 1.03971934, "epoch": 0.6544416052908463, "flos": 20194042947840.0, "grad_norm": 2.2476653916746825, "language_loss": 0.79829443, "learning_rate": 1.127047924394715e-06, "loss": 0.81972009, "num_input_tokens_seen": 235043815, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 10885, "time_per_iteration": 2.489607334136963 }, { "auxiliary_loss_clip": 0.01107354, "auxiliary_loss_mlp": 0.01028186, "balance_loss_clip": 1.0156455, "balance_loss_mlp": 1.03818893, "epoch": 0.6545017285435142, "flos": 23368258800000.0, "grad_norm": 2.373037539901046, "language_loss": 0.72212642, "learning_rate": 1.1266975360878722e-06, "loss": 0.74348181, "num_input_tokens_seen": 235062985, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 10886, "time_per_iteration": 2.516099452972412 }, { "auxiliary_loss_clip": 0.01106317, "auxiliary_loss_mlp": 0.01029785, "balance_loss_clip": 1.01779234, "balance_loss_mlp": 1.0374608, "epoch": 0.6545618517961822, "flos": 19134714021120.0, "grad_norm": 1.9408318895963232, "language_loss": 0.77897727, "learning_rate": 1.1263471808967468e-06, "loss": 0.80033839, "num_input_tokens_seen": 235081670, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 10887, "time_per_iteration": 2.4967010021209717 }, { "auxiliary_loss_clip": 0.01108927, "auxiliary_loss_mlp": 0.01033665, "balance_loss_clip": 1.02111232, "balance_loss_mlp": 1.03969526, "epoch": 0.6546219750488501, "flos": 14938624149120.0, "grad_norm": 1.8120654481239047, "language_loss": 0.79185939, "learning_rate": 1.1259968588346234e-06, "loss": 0.81328529, "num_input_tokens_seen": 235098510, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 10888, "time_per_iteration": 2.476341485977173 }, { "auxiliary_loss_clip": 0.01104439, "auxiliary_loss_mlp": 0.01033846, "balance_loss_clip": 1.02193701, "balance_loss_mlp": 1.03696978, "epoch": 0.6546820983015181, "flos": 36320518886400.0, "grad_norm": 1.6453031622227068, "language_loss": 0.66525066, "learning_rate": 1.1256465699147874e-06, "loss": 0.68663347, "num_input_tokens_seen": 235119990, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 10889, "time_per_iteration": 2.5969460010528564 }, { "auxiliary_loss_clip": 0.01106876, "auxiliary_loss_mlp": 0.01030447, "balance_loss_clip": 1.0170536, "balance_loss_mlp": 1.03660047, "epoch": 0.654742221554186, "flos": 20411446014720.0, "grad_norm": 1.514414816485577, "language_loss": 0.79770082, "learning_rate": 1.1252963141505203e-06, "loss": 0.81907403, "num_input_tokens_seen": 235139255, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.703125, "step": 10890, "time_per_iteration": 2.512082815170288 }, { "auxiliary_loss_clip": 0.01109219, "auxiliary_loss_mlp": 0.01030692, "balance_loss_clip": 1.01770425, "balance_loss_mlp": 1.03800166, "epoch": 0.6548023448068541, "flos": 24863650836480.0, "grad_norm": 2.8149986499846205, "language_loss": 0.65185213, "learning_rate": 1.1249460915551052e-06, "loss": 0.67325121, "num_input_tokens_seen": 235158455, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 10891, "time_per_iteration": 2.5205302238464355 }, { "auxiliary_loss_clip": 0.01107079, "auxiliary_loss_mlp": 0.01032153, "balance_loss_clip": 1.02036357, "balance_loss_mlp": 1.0372057, "epoch": 0.654862468059522, "flos": 21427573858560.0, "grad_norm": 1.9659113721718064, "language_loss": 0.79456544, "learning_rate": 1.1245959021418214e-06, "loss": 0.81595778, "num_input_tokens_seen": 235177350, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69921875, "step": 10892, "time_per_iteration": 2.5197689533233643 }, { "auxiliary_loss_clip": 0.01113212, "auxiliary_loss_mlp": 0.01032843, "balance_loss_clip": 1.02020073, "balance_loss_mlp": 1.03980875, "epoch": 0.65492259131219, "flos": 26577846570240.0, "grad_norm": 1.9396451654923905, "language_loss": 0.77796984, "learning_rate": 1.1242457459239497e-06, "loss": 0.79943037, "num_input_tokens_seen": 235196435, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.734375, "step": 10893, "time_per_iteration": 2.541672468185425 }, { "auxiliary_loss_clip": 0.0111261, "auxiliary_loss_mlp": 0.01034493, "balance_loss_clip": 1.02049756, "balance_loss_mlp": 1.03981328, "epoch": 0.6549827145648579, "flos": 21501334437120.0, "grad_norm": 1.9227852711757343, "language_loss": 0.70327926, "learning_rate": 1.123895622914766e-06, "loss": 0.72475028, "num_input_tokens_seen": 235215430, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73046875, "step": 10894, "time_per_iteration": 2.496302843093872 }, { "auxiliary_loss_clip": 0.01112552, "auxiliary_loss_mlp": 0.01031856, "balance_loss_clip": 1.01902914, "balance_loss_mlp": 1.03952527, "epoch": 0.6550428378175259, "flos": 22594275515520.0, "grad_norm": 3.5270926273274137, "language_loss": 0.63013172, "learning_rate": 1.123545533127549e-06, "loss": 0.6515758, "num_input_tokens_seen": 235232015, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73046875, "step": 10895, "time_per_iteration": 2.4691097736358643 }, { "auxiliary_loss_clip": 0.0110809, "auxiliary_loss_mlp": 0.01039123, "balance_loss_clip": 1.02632618, "balance_loss_mlp": 1.03771186, "epoch": 0.655102961070194, "flos": 12823809050880.0, "grad_norm": 1.935782349527319, "language_loss": 0.78897679, "learning_rate": 1.1231954765755722e-06, "loss": 0.81044894, "num_input_tokens_seen": 235248115, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 10896, "time_per_iteration": 2.4819672107696533 }, { "auxiliary_loss_clip": 0.01108538, "auxiliary_loss_mlp": 0.01033871, "balance_loss_clip": 1.02143705, "balance_loss_mlp": 1.03960657, "epoch": 0.6551630843228619, "flos": 24791075406720.0, "grad_norm": 1.8949943304915902, "language_loss": 0.70751226, "learning_rate": 1.1228454532721111e-06, "loss": 0.72893637, "num_input_tokens_seen": 235270785, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6875, "step": 10897, "time_per_iteration": 2.532294273376465 }, { "auxiliary_loss_clip": 0.01109745, "auxiliary_loss_mlp": 0.01029425, "balance_loss_clip": 1.01631212, "balance_loss_mlp": 1.03706694, "epoch": 0.6552232075755299, "flos": 16724461559040.0, "grad_norm": 1.672538663957022, "language_loss": 0.75365263, "learning_rate": 1.1224954632304391e-06, "loss": 0.77504432, "num_input_tokens_seen": 235287905, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 10898, "time_per_iteration": 2.479722499847412 }, { "auxiliary_loss_clip": 0.01110058, "auxiliary_loss_mlp": 0.0103564, "balance_loss_clip": 1.0233134, "balance_loss_mlp": 1.04011071, "epoch": 0.6552833308281978, "flos": 22016473338240.0, "grad_norm": 3.297116530900636, "language_loss": 0.7289747, "learning_rate": 1.122145506463827e-06, "loss": 0.75043172, "num_input_tokens_seen": 235305525, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 10899, "time_per_iteration": 2.4794118404388428 }, { "auxiliary_loss_clip": 0.01109489, "auxiliary_loss_mlp": 0.01027935, "balance_loss_clip": 1.01534033, "balance_loss_mlp": 1.03887236, "epoch": 0.6553434540808658, "flos": 24863399441280.0, "grad_norm": 2.667842621539825, "language_loss": 0.56286961, "learning_rate": 1.1217955829855443e-06, "loss": 0.58424389, "num_input_tokens_seen": 235324415, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 10900, "time_per_iteration": 2.5404460430145264 }, { "auxiliary_loss_clip": 0.01109795, "auxiliary_loss_mlp": 0.01036331, "balance_loss_clip": 1.02240777, "balance_loss_mlp": 1.03925693, "epoch": 0.6554035773335337, "flos": 23221060865280.0, "grad_norm": 1.7750164537413065, "language_loss": 0.76977289, "learning_rate": 1.1214456928088622e-06, "loss": 0.79123414, "num_input_tokens_seen": 235341595, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.703125, "step": 10901, "time_per_iteration": 2.4923367500305176 }, { "auxiliary_loss_clip": 0.01106764, "auxiliary_loss_mlp": 0.01031496, "balance_loss_clip": 1.01835346, "balance_loss_mlp": 1.03738809, "epoch": 0.6554637005862017, "flos": 22783597125120.0, "grad_norm": 1.8374004099018362, "language_loss": 0.73406363, "learning_rate": 1.1210958359470463e-06, "loss": 0.7554462, "num_input_tokens_seen": 235361700, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6953125, "step": 10902, "time_per_iteration": 2.6181020736694336 }, { "auxiliary_loss_clip": 0.01106503, "auxiliary_loss_mlp": 0.01031424, "balance_loss_clip": 1.01878214, "balance_loss_mlp": 1.03793716, "epoch": 0.6555238238388696, "flos": 21507224267520.0, "grad_norm": 1.6983211526873454, "language_loss": 0.67994606, "learning_rate": 1.1207460124133645e-06, "loss": 0.70132542, "num_input_tokens_seen": 235382065, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 10903, "time_per_iteration": 2.5117557048797607 }, { "auxiliary_loss_clip": 0.01112383, "auxiliary_loss_mlp": 0.01032902, "balance_loss_clip": 1.0192523, "balance_loss_mlp": 1.03792238, "epoch": 0.6555839470915377, "flos": 30519473518080.0, "grad_norm": 1.7956134542667463, "language_loss": 0.66436535, "learning_rate": 1.1203962222210832e-06, "loss": 0.6858182, "num_input_tokens_seen": 235402130, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.74609375, "step": 10904, "time_per_iteration": 2.576237440109253 }, { "auxiliary_loss_clip": 0.01108341, "auxiliary_loss_mlp": 0.01036696, "balance_loss_clip": 1.02254593, "balance_loss_mlp": 1.0361644, "epoch": 0.6556440703442056, "flos": 24642943718400.0, "grad_norm": 2.332132098110459, "language_loss": 0.90850985, "learning_rate": 1.120046465383464e-06, "loss": 0.92996019, "num_input_tokens_seen": 235420435, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.71875, "step": 10905, "time_per_iteration": 2.497692584991455 }, { "auxiliary_loss_clip": 0.01106478, "auxiliary_loss_mlp": 0.01033484, "balance_loss_clip": 1.02152121, "balance_loss_mlp": 1.03849232, "epoch": 0.6557041935968736, "flos": 23732464752000.0, "grad_norm": 4.589383940800598, "language_loss": 0.75626349, "learning_rate": 1.1196967419137721e-06, "loss": 0.77766311, "num_input_tokens_seen": 235439960, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 10906, "time_per_iteration": 2.5342676639556885 }, { "auxiliary_loss_clip": 0.0111217, "auxiliary_loss_mlp": 0.01041376, "balance_loss_clip": 1.02817369, "balance_loss_mlp": 1.0393889, "epoch": 0.6557643168495415, "flos": 11102753819520.0, "grad_norm": 2.821657906585587, "language_loss": 0.74714696, "learning_rate": 1.119347051825267e-06, "loss": 0.76868248, "num_input_tokens_seen": 235457495, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 10907, "time_per_iteration": 2.5235438346862793 }, { "auxiliary_loss_clip": 0.01107212, "auxiliary_loss_mlp": 0.01030854, "balance_loss_clip": 1.0168525, "balance_loss_mlp": 1.0363512, "epoch": 0.6558244401022095, "flos": 30191034533760.0, "grad_norm": 23.49674056604676, "language_loss": 0.72286218, "learning_rate": 1.118997395131211e-06, "loss": 0.74424279, "num_input_tokens_seen": 235479525, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.70703125, "step": 10908, "time_per_iteration": 2.561155319213867 }, { "auxiliary_loss_clip": 0.01110646, "auxiliary_loss_mlp": 0.01034643, "balance_loss_clip": 1.02109504, "balance_loss_mlp": 1.03973234, "epoch": 0.6558845633548775, "flos": 17931060247680.0, "grad_norm": 4.521255930392177, "language_loss": 0.80670953, "learning_rate": 1.118647771844861e-06, "loss": 0.82816243, "num_input_tokens_seen": 235496305, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 10909, "time_per_iteration": 2.4450881481170654 }, { "auxiliary_loss_clip": 0.01109972, "auxiliary_loss_mlp": 0.0103601, "balance_loss_clip": 1.02218199, "balance_loss_mlp": 1.03860915, "epoch": 0.6559446866075455, "flos": 21904144531200.0, "grad_norm": 4.481988221817564, "language_loss": 0.63851452, "learning_rate": 1.1182981819794767e-06, "loss": 0.65997434, "num_input_tokens_seen": 235512545, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71484375, "step": 10910, "time_per_iteration": 2.5067121982574463 }, { "auxiliary_loss_clip": 0.01113585, "auxiliary_loss_mlp": 0.01037623, "balance_loss_clip": 1.0233295, "balance_loss_mlp": 1.03840709, "epoch": 0.6560048098602135, "flos": 14127976056960.0, "grad_norm": 3.1359325626872345, "language_loss": 0.75928891, "learning_rate": 1.117948625548313e-06, "loss": 0.78080094, "num_input_tokens_seen": 235526045, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75, "step": 10911, "time_per_iteration": 2.4128270149230957 }, { "auxiliary_loss_clip": 0.01103625, "auxiliary_loss_mlp": 0.01030114, "balance_loss_clip": 1.01797819, "balance_loss_mlp": 1.03618979, "epoch": 0.6560649331128814, "flos": 18807567926400.0, "grad_norm": 1.628127633260983, "language_loss": 0.75730497, "learning_rate": 1.1175991025646265e-06, "loss": 0.77864242, "num_input_tokens_seen": 235545285, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.671875, "step": 10912, "time_per_iteration": 2.5262534618377686 }, { "auxiliary_loss_clip": 0.01117821, "auxiliary_loss_mlp": 0.01033708, "balance_loss_clip": 1.01997495, "balance_loss_mlp": 1.04187918, "epoch": 0.6561250563655494, "flos": 17053618815360.0, "grad_norm": 1.6328209320158233, "language_loss": 0.77518171, "learning_rate": 1.1172496130416697e-06, "loss": 0.79669702, "num_input_tokens_seen": 235563150, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7578125, "step": 10913, "time_per_iteration": 2.459015369415283 }, { "auxiliary_loss_clip": 0.01104672, "auxiliary_loss_mlp": 0.01028545, "balance_loss_clip": 1.01687992, "balance_loss_mlp": 1.0374893, "epoch": 0.6561851796182173, "flos": 22637656166400.0, "grad_norm": 1.7761018951648653, "language_loss": 0.71095413, "learning_rate": 1.1169001569926961e-06, "loss": 0.73228621, "num_input_tokens_seen": 235582535, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 10914, "time_per_iteration": 2.495417594909668 }, { "auxiliary_loss_clip": 0.01108152, "auxiliary_loss_mlp": 0.01032916, "balance_loss_clip": 1.01976728, "balance_loss_mlp": 1.03778577, "epoch": 0.6562453028708853, "flos": 19239213663360.0, "grad_norm": 17.584166303447493, "language_loss": 0.73638618, "learning_rate": 1.116550734430958e-06, "loss": 0.75779682, "num_input_tokens_seen": 235601490, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 10915, "time_per_iteration": 2.502565622329712 }, { "auxiliary_loss_clip": 0.01105111, "auxiliary_loss_mlp": 0.01029626, "balance_loss_clip": 1.01641107, "balance_loss_mlp": 1.03619707, "epoch": 0.6563054261235532, "flos": 23801305167360.0, "grad_norm": 1.6011685848998005, "language_loss": 0.79826534, "learning_rate": 1.1162013453697042e-06, "loss": 0.81961268, "num_input_tokens_seen": 235619165, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6875, "step": 10916, "time_per_iteration": 2.535168170928955 }, { "auxiliary_loss_clip": 0.01107346, "auxiliary_loss_mlp": 0.0103451, "balance_loss_clip": 1.02238059, "balance_loss_mlp": 1.03717268, "epoch": 0.6563655493762213, "flos": 19240039676160.0, "grad_norm": 2.362494416025812, "language_loss": 0.76197195, "learning_rate": 1.1158519898221831e-06, "loss": 0.78339046, "num_input_tokens_seen": 235637115, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 10917, "time_per_iteration": 2.45943284034729 }, { "auxiliary_loss_clip": 0.0110879, "auxiliary_loss_mlp": 0.01030573, "balance_loss_clip": 1.01763868, "balance_loss_mlp": 1.03914785, "epoch": 0.6564256726288892, "flos": 25556439427200.0, "grad_norm": 2.422375404330114, "language_loss": 0.69562012, "learning_rate": 1.1155026678016445e-06, "loss": 0.71701372, "num_input_tokens_seen": 235656330, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 10918, "time_per_iteration": 2.5595390796661377 }, { "auxiliary_loss_clip": 0.01105085, "auxiliary_loss_mlp": 0.01034606, "balance_loss_clip": 1.02241051, "balance_loss_mlp": 1.03791428, "epoch": 0.6564857958815572, "flos": 22200623389440.0, "grad_norm": 1.5887927050491764, "language_loss": 0.76004922, "learning_rate": 1.115153379321332e-06, "loss": 0.78144616, "num_input_tokens_seen": 235674510, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 10919, "time_per_iteration": 2.4854886531829834 }, { "auxiliary_loss_clip": 0.01030919, "auxiliary_loss_mlp": 0.01002134, "balance_loss_clip": 1.00091195, "balance_loss_mlp": 1.00800943, "epoch": 0.6565459191342251, "flos": 58123144604160.0, "grad_norm": 0.7316157944232614, "language_loss": 0.53080773, "learning_rate": 1.1148041243944931e-06, "loss": 0.55113828, "num_input_tokens_seen": 235735050, "router_z_loss_clip": 0.01220703, "router_z_loss_mlp": 0.22949219, "step": 10920, "time_per_iteration": 4.534180164337158 }, { "auxiliary_loss_clip": 0.0110783, "auxiliary_loss_mlp": 0.01031015, "balance_loss_clip": 1.01794946, "balance_loss_mlp": 1.03948379, "epoch": 0.6566060423868931, "flos": 30809631582720.0, "grad_norm": 2.00843888491834, "language_loss": 0.65689534, "learning_rate": 1.1144549030343697e-06, "loss": 0.67828381, "num_input_tokens_seen": 235757545, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.68359375, "step": 10921, "time_per_iteration": 2.5977606773376465 }, { "auxiliary_loss_clip": 0.01105857, "auxiliary_loss_mlp": 0.01034897, "balance_loss_clip": 1.02052021, "balance_loss_mlp": 1.03589189, "epoch": 0.6566661656395612, "flos": 23367432787200.0, "grad_norm": 1.8071396854621078, "language_loss": 0.81338441, "learning_rate": 1.114105715254205e-06, "loss": 0.83479196, "num_input_tokens_seen": 235777265, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.69921875, "step": 10922, "time_per_iteration": 2.5123133659362793 }, { "auxiliary_loss_clip": 0.01108405, "auxiliary_loss_mlp": 0.01033991, "balance_loss_clip": 1.02070546, "balance_loss_mlp": 1.03821754, "epoch": 0.6567262888922291, "flos": 25735597488000.0, "grad_norm": 2.4586205704591704, "language_loss": 0.71288204, "learning_rate": 1.1137565610672414e-06, "loss": 0.73430598, "num_input_tokens_seen": 235796565, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 10923, "time_per_iteration": 5.370471954345703 }, { "auxiliary_loss_clip": 0.01110993, "auxiliary_loss_mlp": 0.01032631, "balance_loss_clip": 1.01939917, "balance_loss_mlp": 1.03971243, "epoch": 0.6567864121448971, "flos": 17123716206720.0, "grad_norm": 2.8001620054919627, "language_loss": 0.80892491, "learning_rate": 1.1134074404867169e-06, "loss": 0.83036113, "num_input_tokens_seen": 235814805, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 10924, "time_per_iteration": 2.4638712406158447 }, { "auxiliary_loss_clip": 0.0110768, "auxiliary_loss_mlp": 0.01030427, "balance_loss_clip": 1.01813674, "balance_loss_mlp": 1.03852022, "epoch": 0.656846535397565, "flos": 22419319345920.0, "grad_norm": 1.4593700524980056, "language_loss": 0.72480893, "learning_rate": 1.1130583535258717e-06, "loss": 0.74618995, "num_input_tokens_seen": 235833405, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 10925, "time_per_iteration": 3.9484317302703857 }, { "auxiliary_loss_clip": 0.01109559, "auxiliary_loss_mlp": 0.01029002, "balance_loss_clip": 1.01677728, "balance_loss_mlp": 1.03885722, "epoch": 0.656906658650233, "flos": 17704535126400.0, "grad_norm": 2.2278112101724408, "language_loss": 0.72836995, "learning_rate": 1.112709300197942e-06, "loss": 0.74975562, "num_input_tokens_seen": 235848530, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.70703125, "step": 10926, "time_per_iteration": 2.4256162643432617 }, { "auxiliary_loss_clip": 0.0111029, "auxiliary_loss_mlp": 0.01031943, "balance_loss_clip": 1.01866305, "balance_loss_mlp": 1.03762174, "epoch": 0.6569667819029009, "flos": 21175158009600.0, "grad_norm": 1.6902908666909684, "language_loss": 0.72448254, "learning_rate": 1.1123602805161656e-06, "loss": 0.74590486, "num_input_tokens_seen": 235867225, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 10927, "time_per_iteration": 2.4528143405914307 }, { "auxiliary_loss_clip": 0.01031268, "auxiliary_loss_mlp": 0.00999418, "balance_loss_clip": 0.99821991, "balance_loss_mlp": 1.00830364, "epoch": 0.6570269051555689, "flos": 68761897511040.0, "grad_norm": 0.7515171853566558, "language_loss": 0.64456975, "learning_rate": 1.112011294493775e-06, "loss": 0.66487658, "num_input_tokens_seen": 235932925, "router_z_loss_clip": 0.01196289, "router_z_loss_mlp": 0.23046875, "step": 10928, "time_per_iteration": 3.099628448486328 }, { "auxiliary_loss_clip": 0.0110745, "auxiliary_loss_mlp": 0.01032463, "balance_loss_clip": 1.01935637, "balance_loss_mlp": 1.03729987, "epoch": 0.6570870284082369, "flos": 26319289495680.0, "grad_norm": 1.7223993339235601, "language_loss": 0.77633488, "learning_rate": 1.1116623421440063e-06, "loss": 0.79773408, "num_input_tokens_seen": 235952680, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 10929, "time_per_iteration": 2.516256809234619 }, { "auxiliary_loss_clip": 0.01106978, "auxiliary_loss_mlp": 0.01033119, "balance_loss_clip": 1.02057767, "balance_loss_mlp": 1.03758907, "epoch": 0.6571471516609049, "flos": 26174749167360.0, "grad_norm": 1.8569848893397345, "language_loss": 0.65047902, "learning_rate": 1.1113134234800895e-06, "loss": 0.67188001, "num_input_tokens_seen": 235972075, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 10930, "time_per_iteration": 2.5210072994232178 }, { "auxiliary_loss_clip": 0.01108032, "auxiliary_loss_mlp": 0.01029379, "balance_loss_clip": 1.01615274, "balance_loss_mlp": 1.03724647, "epoch": 0.6572072749135728, "flos": 20376253664640.0, "grad_norm": 2.1494446043400046, "language_loss": 0.70895898, "learning_rate": 1.110964538515258e-06, "loss": 0.73033309, "num_input_tokens_seen": 235990340, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 10931, "time_per_iteration": 2.4855666160583496 }, { "auxiliary_loss_clip": 0.01110286, "auxiliary_loss_mlp": 0.01038923, "balance_loss_clip": 1.02616167, "balance_loss_mlp": 1.0380702, "epoch": 0.6572673981662408, "flos": 17128744110720.0, "grad_norm": 2.317804955749278, "language_loss": 0.68979955, "learning_rate": 1.1106156872627393e-06, "loss": 0.71129167, "num_input_tokens_seen": 236007470, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 10932, "time_per_iteration": 2.436858654022217 }, { "auxiliary_loss_clip": 0.01107023, "auxiliary_loss_mlp": 0.01028793, "balance_loss_clip": 1.01632977, "balance_loss_mlp": 1.03728902, "epoch": 0.6573275214189087, "flos": 41275113281280.0, "grad_norm": 2.4822938252892004, "language_loss": 0.80126607, "learning_rate": 1.1102668697357626e-06, "loss": 0.82262427, "num_input_tokens_seen": 236029030, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 10933, "time_per_iteration": 2.657244920730591 }, { "auxiliary_loss_clip": 0.01111429, "auxiliary_loss_mlp": 0.01032723, "balance_loss_clip": 1.01946688, "balance_loss_mlp": 1.04011059, "epoch": 0.6573876446715767, "flos": 22890143842560.0, "grad_norm": 2.746904551697718, "language_loss": 0.73893601, "learning_rate": 1.1099180859475571e-06, "loss": 0.76037753, "num_input_tokens_seen": 236047160, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 10934, "time_per_iteration": 2.4801204204559326 }, { "auxiliary_loss_clip": 0.011067, "auxiliary_loss_mlp": 0.01032782, "balance_loss_clip": 1.02015758, "balance_loss_mlp": 1.0381639, "epoch": 0.6574477679242448, "flos": 44018150273280.0, "grad_norm": 1.9684051812868868, "language_loss": 0.76248282, "learning_rate": 1.1095693359113454e-06, "loss": 0.78387761, "num_input_tokens_seen": 236069215, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 10935, "time_per_iteration": 2.7002432346343994 }, { "auxiliary_loss_clip": 0.01108778, "auxiliary_loss_mlp": 0.01038897, "balance_loss_clip": 1.02460968, "balance_loss_mlp": 1.03820002, "epoch": 0.6575078911769127, "flos": 24571517523840.0, "grad_norm": 1.825626409981691, "language_loss": 0.78362656, "learning_rate": 1.1092206196403538e-06, "loss": 0.8051033, "num_input_tokens_seen": 236088335, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.70703125, "step": 10936, "time_per_iteration": 2.521125078201294 }, { "auxiliary_loss_clip": 0.01104015, "auxiliary_loss_mlp": 0.01032755, "balance_loss_clip": 1.02047682, "balance_loss_mlp": 1.03659296, "epoch": 0.6575680144295807, "flos": 20924035050240.0, "grad_norm": 2.0043638621124917, "language_loss": 0.69274884, "learning_rate": 1.1088719371478056e-06, "loss": 0.71411657, "num_input_tokens_seen": 236108540, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.67578125, "step": 10937, "time_per_iteration": 2.516578435897827 }, { "auxiliary_loss_clip": 0.01107903, "auxiliary_loss_mlp": 0.01030381, "balance_loss_clip": 1.0177207, "balance_loss_mlp": 1.03870201, "epoch": 0.6576281376822486, "flos": 10925642833920.0, "grad_norm": 2.4033542746275884, "language_loss": 0.69207561, "learning_rate": 1.1085232884469236e-06, "loss": 0.71345842, "num_input_tokens_seen": 236124495, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 10938, "time_per_iteration": 2.4423913955688477 }, { "auxiliary_loss_clip": 0.01110189, "auxiliary_loss_mlp": 0.01031134, "balance_loss_clip": 1.0181762, "balance_loss_mlp": 1.03917289, "epoch": 0.6576882609349166, "flos": 19281552819840.0, "grad_norm": 2.243166635135183, "language_loss": 0.71404886, "learning_rate": 1.108174673550927e-06, "loss": 0.73546207, "num_input_tokens_seen": 236142550, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 10939, "time_per_iteration": 2.5541913509368896 }, { "auxiliary_loss_clip": 0.01111538, "auxiliary_loss_mlp": 0.0103219, "balance_loss_clip": 1.01863015, "balance_loss_mlp": 1.03853226, "epoch": 0.6577483841875845, "flos": 20220544206720.0, "grad_norm": 2.4331322845930026, "language_loss": 0.7773695, "learning_rate": 1.107826092473037e-06, "loss": 0.79880673, "num_input_tokens_seen": 236156620, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 10940, "time_per_iteration": 2.5200467109680176 }, { "auxiliary_loss_clip": 0.01111952, "auxiliary_loss_mlp": 0.01031264, "balance_loss_clip": 1.01819885, "balance_loss_mlp": 1.03831851, "epoch": 0.6578085074402525, "flos": 34751078962560.0, "grad_norm": 2.484769127516377, "language_loss": 0.68543112, "learning_rate": 1.107477545226471e-06, "loss": 0.70686328, "num_input_tokens_seen": 236177095, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 10941, "time_per_iteration": 2.724806308746338 }, { "auxiliary_loss_clip": 0.01105971, "auxiliary_loss_mlp": 0.01028302, "balance_loss_clip": 1.01548743, "balance_loss_mlp": 1.03649306, "epoch": 0.6578686306929205, "flos": 23470998675840.0, "grad_norm": 1.8344256648018353, "language_loss": 0.67576146, "learning_rate": 1.1071290318244448e-06, "loss": 0.69710422, "num_input_tokens_seen": 236194695, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 10942, "time_per_iteration": 2.533339738845825 }, { "auxiliary_loss_clip": 0.01114564, "auxiliary_loss_mlp": 0.01036354, "balance_loss_clip": 1.02225709, "balance_loss_mlp": 1.03910661, "epoch": 0.6579287539455885, "flos": 18077073033600.0, "grad_norm": 2.0974539055089307, "language_loss": 0.71202409, "learning_rate": 1.1067805522801753e-06, "loss": 0.73353326, "num_input_tokens_seen": 236213885, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75390625, "step": 10943, "time_per_iteration": 2.4584808349609375 }, { "auxiliary_loss_clip": 0.01107581, "auxiliary_loss_mlp": 0.01030136, "balance_loss_clip": 1.01707709, "balance_loss_mlp": 1.03819847, "epoch": 0.6579888771982564, "flos": 28661383900800.0, "grad_norm": 2.7814766668603697, "language_loss": 0.59220612, "learning_rate": 1.1064321066068778e-06, "loss": 0.61358327, "num_input_tokens_seen": 236237315, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 10944, "time_per_iteration": 2.568106174468994 }, { "auxiliary_loss_clip": 0.01111406, "auxiliary_loss_mlp": 0.01036325, "balance_loss_clip": 1.02268767, "balance_loss_mlp": 1.03726506, "epoch": 0.6580490004509244, "flos": 25046543911680.0, "grad_norm": 1.7110993044371416, "language_loss": 0.72381061, "learning_rate": 1.1060836948177646e-06, "loss": 0.7452879, "num_input_tokens_seen": 236256345, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 10945, "time_per_iteration": 2.511751413345337 }, { "auxiliary_loss_clip": 0.01108051, "auxiliary_loss_mlp": 0.01028741, "balance_loss_clip": 1.01659381, "balance_loss_mlp": 1.03887069, "epoch": 0.6581091237035923, "flos": 43508793461760.0, "grad_norm": 1.8010267314291535, "language_loss": 0.70683694, "learning_rate": 1.105735316926046e-06, "loss": 0.72820485, "num_input_tokens_seen": 236281890, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 10946, "time_per_iteration": 2.660630226135254 }, { "auxiliary_loss_clip": 0.01109025, "auxiliary_loss_mlp": 0.01034974, "balance_loss_clip": 1.02174795, "balance_loss_mlp": 1.03902411, "epoch": 0.6581692469562603, "flos": 22415404763520.0, "grad_norm": 1.8379890689155924, "language_loss": 0.8189621, "learning_rate": 1.105386972944934e-06, "loss": 0.84040207, "num_input_tokens_seen": 236298370, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 10947, "time_per_iteration": 2.4927470684051514 }, { "auxiliary_loss_clip": 0.0110898, "auxiliary_loss_mlp": 0.01030097, "balance_loss_clip": 1.0173595, "balance_loss_mlp": 1.03671837, "epoch": 0.6582293702089284, "flos": 24859772167680.0, "grad_norm": 2.738814253785704, "language_loss": 0.77596319, "learning_rate": 1.1050386628876385e-06, "loss": 0.79735398, "num_input_tokens_seen": 236317380, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 10948, "time_per_iteration": 2.490525007247925 }, { "auxiliary_loss_clip": 0.01108807, "auxiliary_loss_mlp": 0.01029711, "balance_loss_clip": 1.0170331, "balance_loss_mlp": 1.03919411, "epoch": 0.6582894934615963, "flos": 23039676161280.0, "grad_norm": 2.1858662233560433, "language_loss": 0.79256308, "learning_rate": 1.1046903867673655e-06, "loss": 0.81394827, "num_input_tokens_seen": 236336210, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 10949, "time_per_iteration": 2.491427421569824 }, { "auxiliary_loss_clip": 0.01029805, "auxiliary_loss_mlp": 0.01005038, "balance_loss_clip": 1.00371456, "balance_loss_mlp": 1.00685954, "epoch": 0.6583496167142643, "flos": 72551980978560.0, "grad_norm": 0.7337648734979042, "language_loss": 0.61839086, "learning_rate": 1.104342144597323e-06, "loss": 0.63873929, "num_input_tokens_seen": 236403090, "router_z_loss_clip": 0.01324463, "router_z_loss_mlp": 0.22949219, "step": 10950, "time_per_iteration": 3.149624824523926 }, { "auxiliary_loss_clip": 0.01102613, "auxiliary_loss_mlp": 0.01034642, "balance_loss_clip": 1.02311456, "balance_loss_mlp": 1.03560519, "epoch": 0.6584097399669322, "flos": 13078846592640.0, "grad_norm": 2.2730487712162644, "language_loss": 0.67053795, "learning_rate": 1.1039939363907178e-06, "loss": 0.69191051, "num_input_tokens_seen": 236420475, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 10951, "time_per_iteration": 2.4728853702545166 }, { "auxiliary_loss_clip": 0.0110814, "auxiliary_loss_mlp": 0.01031955, "balance_loss_clip": 1.01963496, "balance_loss_mlp": 1.0386827, "epoch": 0.6584698632196002, "flos": 28693164458880.0, "grad_norm": 5.023765027774519, "language_loss": 0.76610112, "learning_rate": 1.1036457621607504e-06, "loss": 0.78750211, "num_input_tokens_seen": 236441915, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 10952, "time_per_iteration": 2.5352039337158203 }, { "auxiliary_loss_clip": 0.01107006, "auxiliary_loss_mlp": 0.01032617, "balance_loss_clip": 1.01983738, "balance_loss_mlp": 1.03869629, "epoch": 0.6585299864722681, "flos": 14319272914560.0, "grad_norm": 1.7208814438128317, "language_loss": 0.7357136, "learning_rate": 1.1032976219206257e-06, "loss": 0.75710988, "num_input_tokens_seen": 236460340, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6796875, "step": 10953, "time_per_iteration": 2.456048011779785 }, { "auxiliary_loss_clip": 0.01107247, "auxiliary_loss_mlp": 0.01036279, "balance_loss_clip": 1.02325535, "balance_loss_mlp": 1.03748286, "epoch": 0.6585901097249361, "flos": 26797907243520.0, "grad_norm": 1.9985961974733564, "language_loss": 0.78660738, "learning_rate": 1.102949515683546e-06, "loss": 0.80804265, "num_input_tokens_seen": 236478280, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 10954, "time_per_iteration": 2.5138072967529297 }, { "auxiliary_loss_clip": 0.01109679, "auxiliary_loss_mlp": 0.0103331, "balance_loss_clip": 1.0205071, "balance_loss_mlp": 1.03907204, "epoch": 0.658650232977604, "flos": 18733124989440.0, "grad_norm": 2.725245131313929, "language_loss": 0.69660592, "learning_rate": 1.1026014434627096e-06, "loss": 0.71803576, "num_input_tokens_seen": 236493225, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 10955, "time_per_iteration": 2.4453752040863037 }, { "auxiliary_loss_clip": 0.01103202, "auxiliary_loss_mlp": 0.01028654, "balance_loss_clip": 1.01754963, "balance_loss_mlp": 1.03683591, "epoch": 0.6587103562302721, "flos": 24753440931840.0, "grad_norm": 2.973694928786223, "language_loss": 0.80874336, "learning_rate": 1.1022534052713172e-06, "loss": 0.83006185, "num_input_tokens_seen": 236514420, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.6640625, "step": 10956, "time_per_iteration": 2.504312038421631 }, { "auxiliary_loss_clip": 0.01108468, "auxiliary_loss_mlp": 0.01043661, "balance_loss_clip": 1.03043437, "balance_loss_mlp": 1.03908277, "epoch": 0.65877047948294, "flos": 22346133384960.0, "grad_norm": 2.7844548891521823, "language_loss": 0.81378233, "learning_rate": 1.1019054011225648e-06, "loss": 0.8353036, "num_input_tokens_seen": 236532785, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6953125, "step": 10957, "time_per_iteration": 2.4922027587890625 }, { "auxiliary_loss_clip": 0.01105537, "auxiliary_loss_mlp": 0.01026872, "balance_loss_clip": 1.01584494, "balance_loss_mlp": 1.03784883, "epoch": 0.658830602735608, "flos": 45180542298240.0, "grad_norm": 1.7893753653254645, "language_loss": 0.757725, "learning_rate": 1.1015574310296506e-06, "loss": 0.77904916, "num_input_tokens_seen": 236553330, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6796875, "step": 10958, "time_per_iteration": 2.6708521842956543 }, { "auxiliary_loss_clip": 0.01104521, "auxiliary_loss_mlp": 0.01031443, "balance_loss_clip": 1.01932573, "balance_loss_mlp": 1.03753543, "epoch": 0.6588907259882759, "flos": 19901622326400.0, "grad_norm": 3.9599825000390725, "language_loss": 0.74964058, "learning_rate": 1.1012094950057678e-06, "loss": 0.77100021, "num_input_tokens_seen": 236572960, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.66796875, "step": 10959, "time_per_iteration": 2.4776597023010254 }, { "auxiliary_loss_clip": 0.0110672, "auxiliary_loss_mlp": 0.01029808, "balance_loss_clip": 1.01770258, "balance_loss_mlp": 1.03818738, "epoch": 0.6589508492409439, "flos": 24133766474880.0, "grad_norm": 1.543546500150155, "language_loss": 0.64966178, "learning_rate": 1.1008615930641107e-06, "loss": 0.67102706, "num_input_tokens_seen": 236594090, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 10960, "time_per_iteration": 2.5928003787994385 }, { "auxiliary_loss_clip": 0.01111499, "auxiliary_loss_mlp": 0.01031002, "balance_loss_clip": 1.01785898, "balance_loss_mlp": 1.03813493, "epoch": 0.659010972493612, "flos": 18222906251520.0, "grad_norm": 2.2343963012881103, "language_loss": 0.82350445, "learning_rate": 1.1005137252178734e-06, "loss": 0.84492946, "num_input_tokens_seen": 236610190, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 10961, "time_per_iteration": 2.4731028079986572 }, { "auxiliary_loss_clip": 0.01109113, "auxiliary_loss_mlp": 0.01029398, "balance_loss_clip": 1.01688147, "balance_loss_mlp": 1.03980255, "epoch": 0.6590710957462799, "flos": 27600007898880.0, "grad_norm": 1.7179014080076158, "language_loss": 0.73136407, "learning_rate": 1.1001658914802453e-06, "loss": 0.7527492, "num_input_tokens_seen": 236631575, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 10962, "time_per_iteration": 3.9564719200134277 }, { "auxiliary_loss_clip": 0.0110747, "auxiliary_loss_mlp": 0.01032341, "balance_loss_clip": 1.01995468, "balance_loss_mlp": 1.03721762, "epoch": 0.6591312189989479, "flos": 20302959962880.0, "grad_norm": 2.0918644507341937, "language_loss": 0.79944861, "learning_rate": 1.0998180918644165e-06, "loss": 0.82084674, "num_input_tokens_seen": 236649815, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 10963, "time_per_iteration": 2.4560279846191406 }, { "auxiliary_loss_clip": 0.01104311, "auxiliary_loss_mlp": 0.01028994, "balance_loss_clip": 1.01682866, "balance_loss_mlp": 1.0365448, "epoch": 0.6591913422516158, "flos": 12312943868160.0, "grad_norm": 1.7092515475922827, "language_loss": 0.78258157, "learning_rate": 1.0994703263835754e-06, "loss": 0.80391461, "num_input_tokens_seen": 236668335, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6796875, "step": 10964, "time_per_iteration": 2.506929636001587 }, { "auxiliary_loss_clip": 0.01107355, "auxiliary_loss_mlp": 0.01036876, "balance_loss_clip": 1.02431166, "balance_loss_mlp": 1.03556907, "epoch": 0.6592514655042838, "flos": 25884591102720.0, "grad_norm": 1.5900518729978919, "language_loss": 0.74010897, "learning_rate": 1.0991225950509106e-06, "loss": 0.76155126, "num_input_tokens_seen": 236688945, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 10965, "time_per_iteration": 5.361037492752075 }, { "auxiliary_loss_clip": 0.01111932, "auxiliary_loss_mlp": 0.01034319, "balance_loss_clip": 1.02157593, "balance_loss_mlp": 1.0387311, "epoch": 0.6593115887569517, "flos": 14063624841600.0, "grad_norm": 2.0044535460798265, "language_loss": 0.73676491, "learning_rate": 1.0987748978796067e-06, "loss": 0.75822735, "num_input_tokens_seen": 236707055, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73046875, "step": 10966, "time_per_iteration": 3.9106318950653076 }, { "auxiliary_loss_clip": 0.0110678, "auxiliary_loss_mlp": 0.01030984, "balance_loss_clip": 1.01781094, "balance_loss_mlp": 1.0369643, "epoch": 0.6593717120096197, "flos": 24717925359360.0, "grad_norm": 1.6817504447668312, "language_loss": 0.76810485, "learning_rate": 1.0984272348828487e-06, "loss": 0.78948247, "num_input_tokens_seen": 236725900, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69921875, "step": 10967, "time_per_iteration": 2.5005862712860107 }, { "auxiliary_loss_clip": 0.01028961, "auxiliary_loss_mlp": 0.01004896, "balance_loss_clip": 1.00367379, "balance_loss_mlp": 1.00604916, "epoch": 0.6594318352622877, "flos": 55558083502080.0, "grad_norm": 0.7029522770484711, "language_loss": 0.48481488, "learning_rate": 1.0980796060738221e-06, "loss": 0.50515342, "num_input_tokens_seen": 236788415, "router_z_loss_clip": 0.01220703, "router_z_loss_mlp": 0.22851562, "step": 10968, "time_per_iteration": 3.081014633178711 }, { "auxiliary_loss_clip": 0.01107861, "auxiliary_loss_mlp": 0.01031134, "balance_loss_clip": 1.01856947, "balance_loss_mlp": 1.03718519, "epoch": 0.6594919585149557, "flos": 17456931699840.0, "grad_norm": 2.222554949478774, "language_loss": 0.79236591, "learning_rate": 1.0977320114657058e-06, "loss": 0.81375587, "num_input_tokens_seen": 236805155, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 10969, "time_per_iteration": 2.4245498180389404 }, { "auxiliary_loss_clip": 0.01105515, "auxiliary_loss_mlp": 0.01031746, "balance_loss_clip": 1.01959252, "balance_loss_mlp": 1.03692496, "epoch": 0.6595520817676236, "flos": 18223229473920.0, "grad_norm": 1.8642189051496507, "language_loss": 0.65293169, "learning_rate": 1.0973844510716817e-06, "loss": 0.67430431, "num_input_tokens_seen": 236824360, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.68359375, "step": 10970, "time_per_iteration": 2.4703292846679688 }, { "auxiliary_loss_clip": 0.01106486, "auxiliary_loss_mlp": 0.01030262, "balance_loss_clip": 1.01780474, "balance_loss_mlp": 1.03694808, "epoch": 0.6596122050202916, "flos": 22199761463040.0, "grad_norm": 1.5926470464592783, "language_loss": 0.76593012, "learning_rate": 1.0970369249049308e-06, "loss": 0.78729761, "num_input_tokens_seen": 236844640, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 10971, "time_per_iteration": 2.4631528854370117 }, { "auxiliary_loss_clip": 0.01109801, "auxiliary_loss_mlp": 0.01031325, "balance_loss_clip": 1.01907015, "balance_loss_mlp": 1.03825521, "epoch": 0.6596723282729595, "flos": 14173834746240.0, "grad_norm": 2.7220837033378213, "language_loss": 0.70233428, "learning_rate": 1.096689432978629e-06, "loss": 0.72374558, "num_input_tokens_seen": 236861160, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71484375, "step": 10972, "time_per_iteration": 2.439361810684204 }, { "auxiliary_loss_clip": 0.01105893, "auxiliary_loss_mlp": 0.0102557, "balance_loss_clip": 1.01285028, "balance_loss_mlp": 1.03728902, "epoch": 0.6597324515256275, "flos": 30553193410560.0, "grad_norm": 3.31681556695527, "language_loss": 0.5573402, "learning_rate": 1.0963419753059556e-06, "loss": 0.57865489, "num_input_tokens_seen": 236880465, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 10973, "time_per_iteration": 2.542724609375 }, { "auxiliary_loss_clip": 0.01112865, "auxiliary_loss_mlp": 0.01040822, "balance_loss_clip": 1.02800107, "balance_loss_mlp": 1.03869367, "epoch": 0.6597925747782956, "flos": 17639860688640.0, "grad_norm": 2.7256472566900403, "language_loss": 0.78494203, "learning_rate": 1.0959945519000839e-06, "loss": 0.80647892, "num_input_tokens_seen": 236897730, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7421875, "step": 10974, "time_per_iteration": 2.4487810134887695 }, { "auxiliary_loss_clip": 0.01107631, "auxiliary_loss_mlp": 0.01034754, "balance_loss_clip": 1.02199244, "balance_loss_mlp": 1.03749013, "epoch": 0.6598526980309635, "flos": 22819112697600.0, "grad_norm": 2.4135023062445944, "language_loss": 0.6887269, "learning_rate": 1.0956471627741906e-06, "loss": 0.71015072, "num_input_tokens_seen": 236917300, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 10975, "time_per_iteration": 2.4694764614105225 }, { "auxiliary_loss_clip": 0.01104874, "auxiliary_loss_mlp": 0.01030691, "balance_loss_clip": 1.01857972, "balance_loss_mlp": 1.03481221, "epoch": 0.6599128212836315, "flos": 21068036674560.0, "grad_norm": 1.798872314007565, "language_loss": 0.70638543, "learning_rate": 1.0952998079414464e-06, "loss": 0.72774106, "num_input_tokens_seen": 236935590, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 10976, "time_per_iteration": 2.4748218059539795 }, { "auxiliary_loss_clip": 0.01103231, "auxiliary_loss_mlp": 0.01028807, "balance_loss_clip": 1.01606393, "balance_loss_mlp": 1.03571713, "epoch": 0.6599729445362994, "flos": 22163527618560.0, "grad_norm": 1.6310694459886914, "language_loss": 0.67288923, "learning_rate": 1.0949524874150243e-06, "loss": 0.69420964, "num_input_tokens_seen": 236952830, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.67578125, "step": 10977, "time_per_iteration": 2.4651057720184326 }, { "auxiliary_loss_clip": 0.01111792, "auxiliary_loss_mlp": 0.01032482, "balance_loss_clip": 1.01896942, "balance_loss_mlp": 1.03881788, "epoch": 0.6600330677889674, "flos": 18150079426560.0, "grad_norm": 1.916617366679434, "language_loss": 0.81541419, "learning_rate": 1.0946052012080952e-06, "loss": 0.83685696, "num_input_tokens_seen": 236971930, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 10978, "time_per_iteration": 2.4798078536987305 }, { "auxiliary_loss_clip": 0.01111993, "auxiliary_loss_mlp": 0.01036084, "balance_loss_clip": 1.02267265, "balance_loss_mlp": 1.03967857, "epoch": 0.6600931910416353, "flos": 18150115340160.0, "grad_norm": 3.750393395255361, "language_loss": 0.67368633, "learning_rate": 1.0942579493338278e-06, "loss": 0.69516706, "num_input_tokens_seen": 236989920, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.72265625, "step": 10979, "time_per_iteration": 2.4380598068237305 }, { "auxiliary_loss_clip": 0.01107913, "auxiliary_loss_mlp": 0.01028932, "balance_loss_clip": 1.01559794, "balance_loss_mlp": 1.0367167, "epoch": 0.6601533142943034, "flos": 17420733768960.0, "grad_norm": 2.137456459385985, "language_loss": 0.73555851, "learning_rate": 1.0939107318053889e-06, "loss": 0.75692701, "num_input_tokens_seen": 237006570, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 10980, "time_per_iteration": 2.4530770778656006 }, { "auxiliary_loss_clip": 0.01104367, "auxiliary_loss_mlp": 0.01032866, "balance_loss_clip": 1.0212369, "balance_loss_mlp": 1.03758907, "epoch": 0.6602134375469713, "flos": 28219574615040.0, "grad_norm": 1.908408932425044, "language_loss": 0.72912157, "learning_rate": 1.0935635486359459e-06, "loss": 0.75049394, "num_input_tokens_seen": 237028415, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66796875, "step": 10981, "time_per_iteration": 2.522961378097534 }, { "auxiliary_loss_clip": 0.01107801, "auxiliary_loss_mlp": 0.01033858, "balance_loss_clip": 1.02128172, "balance_loss_mlp": 1.03659391, "epoch": 0.6602735607996393, "flos": 29418056830080.0, "grad_norm": 2.281292098394521, "language_loss": 0.68680203, "learning_rate": 1.0932163998386647e-06, "loss": 0.70821857, "num_input_tokens_seen": 237046595, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 10982, "time_per_iteration": 2.5634684562683105 }, { "auxiliary_loss_clip": 0.01107112, "auxiliary_loss_mlp": 0.01031258, "balance_loss_clip": 1.0189321, "balance_loss_mlp": 1.03917789, "epoch": 0.6603336840523072, "flos": 18588045957120.0, "grad_norm": 2.9569882991852006, "language_loss": 0.69639027, "learning_rate": 1.0928692854267075e-06, "loss": 0.71777403, "num_input_tokens_seen": 237066150, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 10983, "time_per_iteration": 2.462987184524536 }, { "auxiliary_loss_clip": 0.01108154, "auxiliary_loss_mlp": 0.01030552, "balance_loss_clip": 1.01724267, "balance_loss_mlp": 1.03780079, "epoch": 0.6603938073049752, "flos": 33254860913280.0, "grad_norm": 1.685234884357833, "language_loss": 0.70257747, "learning_rate": 1.092522205413239e-06, "loss": 0.72396457, "num_input_tokens_seen": 237087060, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 10984, "time_per_iteration": 2.5941550731658936 }, { "auxiliary_loss_clip": 0.01105805, "auxiliary_loss_mlp": 0.01033066, "balance_loss_clip": 1.02088296, "balance_loss_mlp": 1.03746653, "epoch": 0.6604539305576431, "flos": 17384284442880.0, "grad_norm": 1.7833192958934505, "language_loss": 0.84329498, "learning_rate": 1.0921751598114193e-06, "loss": 0.86468375, "num_input_tokens_seen": 237103825, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.68359375, "step": 10985, "time_per_iteration": 2.4354610443115234 }, { "auxiliary_loss_clip": 0.01108455, "auxiliary_loss_mlp": 0.01029488, "balance_loss_clip": 1.01670289, "balance_loss_mlp": 1.03807878, "epoch": 0.6605140538103111, "flos": 21251145231360.0, "grad_norm": 2.332961237150381, "language_loss": 0.73972684, "learning_rate": 1.0918281486344077e-06, "loss": 0.76110631, "num_input_tokens_seen": 237121740, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 10986, "time_per_iteration": 2.477796792984009 }, { "auxiliary_loss_clip": 0.01105423, "auxiliary_loss_mlp": 0.01030368, "balance_loss_clip": 1.01779747, "balance_loss_mlp": 1.03814757, "epoch": 0.6605741770629792, "flos": 13881701433600.0, "grad_norm": 1.8956774783695614, "language_loss": 0.79300976, "learning_rate": 1.0914811718953636e-06, "loss": 0.81436765, "num_input_tokens_seen": 237139565, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.671875, "step": 10987, "time_per_iteration": 2.4355878829956055 }, { "auxiliary_loss_clip": 0.01030398, "auxiliary_loss_mlp": 0.01007057, "balance_loss_clip": 1.00593042, "balance_loss_mlp": 1.0073874, "epoch": 0.6606343003156471, "flos": 69316215171840.0, "grad_norm": 0.8212668572573368, "language_loss": 0.54125619, "learning_rate": 1.0911342296074454e-06, "loss": 0.56163073, "num_input_tokens_seen": 237201055, "router_z_loss_clip": 0.0112915, "router_z_loss_mlp": 0.23046875, "step": 10988, "time_per_iteration": 3.173571825027466 }, { "auxiliary_loss_clip": 0.01105961, "auxiliary_loss_mlp": 0.01033502, "balance_loss_clip": 1.0219264, "balance_loss_mlp": 1.03810453, "epoch": 0.6606944235683151, "flos": 27272394927360.0, "grad_norm": 2.0787563316385227, "language_loss": 0.77527189, "learning_rate": 1.0907873217838077e-06, "loss": 0.7966665, "num_input_tokens_seen": 237221805, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6796875, "step": 10989, "time_per_iteration": 2.5067026615142822 }, { "auxiliary_loss_clip": 0.01109093, "auxiliary_loss_mlp": 0.01035691, "balance_loss_clip": 1.02329338, "balance_loss_mlp": 1.04057479, "epoch": 0.660754546820983, "flos": 13772820332160.0, "grad_norm": 2.611827901142045, "language_loss": 0.77377725, "learning_rate": 1.0904404484376064e-06, "loss": 0.79522514, "num_input_tokens_seen": 237238270, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 10990, "time_per_iteration": 2.4343137741088867 }, { "auxiliary_loss_clip": 0.01108444, "auxiliary_loss_mlp": 0.01031427, "balance_loss_clip": 1.0182128, "balance_loss_mlp": 1.03718424, "epoch": 0.660814670073651, "flos": 15705209232000.0, "grad_norm": 2.0081777268807373, "language_loss": 0.60689813, "learning_rate": 1.0900936095819937e-06, "loss": 0.62829685, "num_input_tokens_seen": 237255400, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 10991, "time_per_iteration": 2.4352924823760986 }, { "auxiliary_loss_clip": 0.01110357, "auxiliary_loss_mlp": 0.01035393, "balance_loss_clip": 1.02222633, "balance_loss_mlp": 1.03889239, "epoch": 0.6608747933263189, "flos": 20850023076480.0, "grad_norm": 3.1932214730594173, "language_loss": 0.68336785, "learning_rate": 1.0897468052301234e-06, "loss": 0.7048254, "num_input_tokens_seen": 237273105, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 10992, "time_per_iteration": 2.495715618133545 }, { "auxiliary_loss_clip": 0.01108708, "auxiliary_loss_mlp": 0.0103196, "balance_loss_clip": 1.0187695, "balance_loss_mlp": 1.03635442, "epoch": 0.660934916578987, "flos": 20632117219200.0, "grad_norm": 2.243033876310702, "language_loss": 0.87774527, "learning_rate": 1.0894000353951444e-06, "loss": 0.89915192, "num_input_tokens_seen": 237292650, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 10993, "time_per_iteration": 2.4764902591705322 }, { "auxiliary_loss_clip": 0.01114548, "auxiliary_loss_mlp": 0.01032281, "balance_loss_clip": 1.01787508, "balance_loss_mlp": 1.03959179, "epoch": 0.6609950398316549, "flos": 25113588647040.0, "grad_norm": 1.8271248485894336, "language_loss": 0.66568959, "learning_rate": 1.0890533000902078e-06, "loss": 0.68715787, "num_input_tokens_seen": 237312865, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.75, "step": 10994, "time_per_iteration": 2.5487875938415527 }, { "auxiliary_loss_clip": 0.01110049, "auxiliary_loss_mlp": 0.01031423, "balance_loss_clip": 1.01821446, "balance_loss_mlp": 1.03897035, "epoch": 0.6610551630843229, "flos": 18661196004480.0, "grad_norm": 1.9207551066256636, "language_loss": 0.77001113, "learning_rate": 1.0887065993284626e-06, "loss": 0.79142582, "num_input_tokens_seen": 237331210, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 10995, "time_per_iteration": 2.4789645671844482 }, { "auxiliary_loss_clip": 0.01107103, "auxiliary_loss_mlp": 0.01028839, "balance_loss_clip": 1.01707315, "balance_loss_mlp": 1.03727043, "epoch": 0.6611152863369908, "flos": 23258192549760.0, "grad_norm": 2.07566661304008, "language_loss": 0.74482334, "learning_rate": 1.088359933123053e-06, "loss": 0.76618278, "num_input_tokens_seen": 237349455, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.69921875, "step": 10996, "time_per_iteration": 2.48313307762146 }, { "auxiliary_loss_clip": 0.01110792, "auxiliary_loss_mlp": 0.01035582, "balance_loss_clip": 1.02258801, "balance_loss_mlp": 1.04048252, "epoch": 0.6611754095896588, "flos": 22159720776960.0, "grad_norm": 4.279438205887045, "language_loss": 0.68903136, "learning_rate": 1.088013301487126e-06, "loss": 0.71049505, "num_input_tokens_seen": 237367100, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 10997, "time_per_iteration": 2.4704787731170654 }, { "auxiliary_loss_clip": 0.01110367, "auxiliary_loss_mlp": 0.01031632, "balance_loss_clip": 1.01931763, "balance_loss_mlp": 1.03807187, "epoch": 0.6612355328423267, "flos": 13991228979840.0, "grad_norm": 3.19008708245182, "language_loss": 0.68501306, "learning_rate": 1.0876667044338269e-06, "loss": 0.70643306, "num_input_tokens_seen": 237384840, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.72265625, "step": 10998, "time_per_iteration": 2.449951171875 }, { "auxiliary_loss_clip": 0.01030216, "auxiliary_loss_mlp": 0.0100342, "balance_loss_clip": 1.0021857, "balance_loss_mlp": 1.00737691, "epoch": 0.6612956560949947, "flos": 61453716359040.0, "grad_norm": 0.6822188143295895, "language_loss": 0.51099908, "learning_rate": 1.087320141976297e-06, "loss": 0.53133547, "num_input_tokens_seen": 237443355, "router_z_loss_clip": 0.0123291, "router_z_loss_mlp": 0.22851562, "step": 10999, "time_per_iteration": 3.0440003871917725 }, { "auxiliary_loss_clip": 0.01112242, "auxiliary_loss_mlp": 0.01032685, "balance_loss_clip": 1.02014422, "balance_loss_mlp": 1.03938925, "epoch": 0.6613557793476627, "flos": 21616644072960.0, "grad_norm": 2.752830605299225, "language_loss": 0.7029984, "learning_rate": 1.086973614127679e-06, "loss": 0.72444773, "num_input_tokens_seen": 237459205, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 11000, "time_per_iteration": 2.4910266399383545 }, { "auxiliary_loss_clip": 0.0110568, "auxiliary_loss_mlp": 0.01033176, "balance_loss_clip": 1.02109385, "balance_loss_mlp": 1.03818917, "epoch": 0.6614159026003307, "flos": 34020117192960.0, "grad_norm": 2.0785816043452754, "language_loss": 0.65302813, "learning_rate": 1.0866271209011133e-06, "loss": 0.67441666, "num_input_tokens_seen": 237483580, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.67578125, "step": 11001, "time_per_iteration": 2.584505081176758 }, { "auxiliary_loss_clip": 0.01107473, "auxiliary_loss_mlp": 0.0102794, "balance_loss_clip": 1.01537561, "balance_loss_mlp": 1.03787231, "epoch": 0.6614760258529987, "flos": 24097281235200.0, "grad_norm": 1.75197393205497, "language_loss": 0.73019946, "learning_rate": 1.086280662309739e-06, "loss": 0.7515536, "num_input_tokens_seen": 237502860, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 11002, "time_per_iteration": 2.4898569583892822 }, { "auxiliary_loss_clip": 0.01107488, "auxiliary_loss_mlp": 0.0102964, "balance_loss_clip": 1.01710558, "balance_loss_mlp": 1.03866935, "epoch": 0.6615361491056666, "flos": 14903790935040.0, "grad_norm": 2.215635936428966, "language_loss": 0.78568184, "learning_rate": 1.0859342383666928e-06, "loss": 0.80705309, "num_input_tokens_seen": 237521030, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 11003, "time_per_iteration": 3.8027560710906982 }, { "auxiliary_loss_clip": 0.01110233, "auxiliary_loss_mlp": 0.0103408, "balance_loss_clip": 1.02027595, "balance_loss_mlp": 1.03932285, "epoch": 0.6615962723583346, "flos": 15304877176320.0, "grad_norm": 2.037717055231791, "language_loss": 0.68778926, "learning_rate": 1.0855878490851119e-06, "loss": 0.70923239, "num_input_tokens_seen": 237539585, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.70703125, "step": 11004, "time_per_iteration": 2.479607105255127 }, { "auxiliary_loss_clip": 0.01110876, "auxiliary_loss_mlp": 0.01034511, "balance_loss_clip": 1.02059913, "balance_loss_mlp": 1.03815937, "epoch": 0.6616563956110025, "flos": 18732586285440.0, "grad_norm": 2.862739040100052, "language_loss": 0.69340408, "learning_rate": 1.085241494478132e-06, "loss": 0.71485794, "num_input_tokens_seen": 237557655, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 11005, "time_per_iteration": 2.446227788925171 }, { "auxiliary_loss_clip": 0.01107456, "auxiliary_loss_mlp": 0.01029647, "balance_loss_clip": 1.01729679, "balance_loss_mlp": 1.03819489, "epoch": 0.6617165188636706, "flos": 24495063425280.0, "grad_norm": 1.5322091437348906, "language_loss": 0.78281236, "learning_rate": 1.0848951745588855e-06, "loss": 0.80418342, "num_input_tokens_seen": 237577000, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 11006, "time_per_iteration": 3.9993278980255127 }, { "auxiliary_loss_clip": 0.01107915, "auxiliary_loss_mlp": 0.01033551, "balance_loss_clip": 1.02014565, "balance_loss_mlp": 1.03838682, "epoch": 0.6617766421163385, "flos": 22379673709440.0, "grad_norm": 1.8736454980085409, "language_loss": 0.76107001, "learning_rate": 1.0845488893405068e-06, "loss": 0.78248465, "num_input_tokens_seen": 237597960, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6953125, "step": 11007, "time_per_iteration": 3.8562850952148438 }, { "auxiliary_loss_clip": 0.01108343, "auxiliary_loss_mlp": 0.01027367, "balance_loss_clip": 1.01462388, "balance_loss_mlp": 1.03895175, "epoch": 0.6618367653690065, "flos": 20850418126080.0, "grad_norm": 1.953491114451764, "language_loss": 0.7858752, "learning_rate": 1.0842026388361248e-06, "loss": 0.80723226, "num_input_tokens_seen": 237616385, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 11008, "time_per_iteration": 3.9196956157684326 }, { "auxiliary_loss_clip": 0.01111624, "auxiliary_loss_mlp": 0.01032481, "balance_loss_clip": 1.01870072, "balance_loss_mlp": 1.03786993, "epoch": 0.6618968886216744, "flos": 17712328377600.0, "grad_norm": 2.60568766665287, "language_loss": 0.81799388, "learning_rate": 1.0838564230588715e-06, "loss": 0.83943486, "num_input_tokens_seen": 237634930, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.73828125, "step": 11009, "time_per_iteration": 2.4282426834106445 }, { "auxiliary_loss_clip": 0.01030899, "auxiliary_loss_mlp": 0.01004174, "balance_loss_clip": 1.00296998, "balance_loss_mlp": 1.00818622, "epoch": 0.6619570118743424, "flos": 67035347498880.0, "grad_norm": 0.98165629868704, "language_loss": 0.67295843, "learning_rate": 1.0835102420218735e-06, "loss": 0.69330913, "num_input_tokens_seen": 237693175, "router_z_loss_clip": 0.01202393, "router_z_loss_mlp": 0.22753906, "step": 11010, "time_per_iteration": 3.027082681655884 }, { "auxiliary_loss_clip": 0.01108932, "auxiliary_loss_mlp": 0.01031419, "balance_loss_clip": 1.01772761, "balance_loss_mlp": 1.03743064, "epoch": 0.6620171351270103, "flos": 18660908695680.0, "grad_norm": 1.896956238873653, "language_loss": 0.71454, "learning_rate": 1.0831640957382593e-06, "loss": 0.73594344, "num_input_tokens_seen": 237713160, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71484375, "step": 11011, "time_per_iteration": 2.4634323120117188 }, { "auxiliary_loss_clip": 0.01108595, "auxiliary_loss_mlp": 0.01033168, "balance_loss_clip": 1.02109826, "balance_loss_mlp": 1.03952551, "epoch": 0.6620772583796783, "flos": 24170503109760.0, "grad_norm": 1.5058031294303482, "language_loss": 0.72521627, "learning_rate": 1.0828179842211557e-06, "loss": 0.74663389, "num_input_tokens_seen": 237733600, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 11012, "time_per_iteration": 2.5569021701812744 }, { "auxiliary_loss_clip": 0.01103392, "auxiliary_loss_mlp": 0.01033247, "balance_loss_clip": 1.02155232, "balance_loss_mlp": 1.03815341, "epoch": 0.6621373816323463, "flos": 23623547736960.0, "grad_norm": 1.868339921364771, "language_loss": 0.7955299, "learning_rate": 1.0824719074836845e-06, "loss": 0.81689632, "num_input_tokens_seen": 237752135, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.65234375, "step": 11013, "time_per_iteration": 2.499840497970581 }, { "auxiliary_loss_clip": 0.01110057, "auxiliary_loss_mlp": 0.01027001, "balance_loss_clip": 1.01425135, "balance_loss_mlp": 1.04027319, "epoch": 0.6621975048850143, "flos": 18442212739200.0, "grad_norm": 2.131763456313516, "language_loss": 0.70515323, "learning_rate": 1.082125865538971e-06, "loss": 0.72652382, "num_input_tokens_seen": 237770735, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 11014, "time_per_iteration": 2.460519313812256 }, { "auxiliary_loss_clip": 0.01105776, "auxiliary_loss_mlp": 0.01028774, "balance_loss_clip": 1.01743126, "balance_loss_mlp": 1.03886175, "epoch": 0.6622576281376823, "flos": 14063876236800.0, "grad_norm": 2.0628304751952333, "language_loss": 0.77081847, "learning_rate": 1.081779858400137e-06, "loss": 0.79216397, "num_input_tokens_seen": 237789005, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66796875, "step": 11015, "time_per_iteration": 2.4477405548095703 }, { "auxiliary_loss_clip": 0.01105654, "auxiliary_loss_mlp": 0.01027067, "balance_loss_clip": 1.01376891, "balance_loss_mlp": 1.03746271, "epoch": 0.6623177513903502, "flos": 17018965169280.0, "grad_norm": 1.6625920058912549, "language_loss": 0.82426918, "learning_rate": 1.0814338860803021e-06, "loss": 0.84559637, "num_input_tokens_seen": 237807740, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.68359375, "step": 11016, "time_per_iteration": 2.463674306869507 }, { "auxiliary_loss_clip": 0.01107366, "auxiliary_loss_mlp": 0.01032792, "balance_loss_clip": 1.02015567, "balance_loss_mlp": 1.03596878, "epoch": 0.6623778746430182, "flos": 17271021882240.0, "grad_norm": 2.104002649492122, "language_loss": 0.70133281, "learning_rate": 1.0810879485925864e-06, "loss": 0.72273433, "num_input_tokens_seen": 237826340, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 11017, "time_per_iteration": 2.454094648361206 }, { "auxiliary_loss_clip": 0.01105842, "auxiliary_loss_mlp": 0.01032212, "balance_loss_clip": 1.01946282, "balance_loss_mlp": 1.03666222, "epoch": 0.6624379978956861, "flos": 48792688767360.0, "grad_norm": 1.7428939676239024, "language_loss": 0.77511781, "learning_rate": 1.0807420459501084e-06, "loss": 0.79649842, "num_input_tokens_seen": 237848305, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 11018, "time_per_iteration": 2.7041175365448 }, { "auxiliary_loss_clip": 0.01106371, "auxiliary_loss_mlp": 0.01034801, "balance_loss_clip": 1.02205133, "balance_loss_mlp": 1.03684139, "epoch": 0.6624981211483542, "flos": 18952431477120.0, "grad_norm": 2.0407086215095047, "language_loss": 0.83683914, "learning_rate": 1.0803961781659841e-06, "loss": 0.85825086, "num_input_tokens_seen": 237867020, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 11019, "time_per_iteration": 2.482696056365967 }, { "auxiliary_loss_clip": 0.01106367, "auxiliary_loss_mlp": 0.01026535, "balance_loss_clip": 1.01454902, "balance_loss_mlp": 1.03819823, "epoch": 0.6625582444010221, "flos": 23256576437760.0, "grad_norm": 1.87230933269204, "language_loss": 0.71926874, "learning_rate": 1.080050345253328e-06, "loss": 0.74059772, "num_input_tokens_seen": 237886710, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 11020, "time_per_iteration": 2.4997854232788086 }, { "auxiliary_loss_clip": 0.01111804, "auxiliary_loss_mlp": 0.01028222, "balance_loss_clip": 1.01466238, "balance_loss_mlp": 1.03835058, "epoch": 0.6626183676536901, "flos": 21394823633280.0, "grad_norm": 1.775644249393655, "language_loss": 0.72755527, "learning_rate": 1.0797045472252554e-06, "loss": 0.74895555, "num_input_tokens_seen": 237904795, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 11021, "time_per_iteration": 2.4669196605682373 }, { "auxiliary_loss_clip": 0.01108529, "auxiliary_loss_mlp": 0.01029664, "balance_loss_clip": 1.01705205, "balance_loss_mlp": 1.03873086, "epoch": 0.662678490906358, "flos": 14571293713920.0, "grad_norm": 2.128056537148125, "language_loss": 0.82909113, "learning_rate": 1.0793587840948793e-06, "loss": 0.85047305, "num_input_tokens_seen": 237921320, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 11022, "time_per_iteration": 2.443058967590332 }, { "auxiliary_loss_clip": 0.0111459, "auxiliary_loss_mlp": 0.01032705, "balance_loss_clip": 1.01827466, "balance_loss_mlp": 1.03884673, "epoch": 0.662738614159026, "flos": 15992350554240.0, "grad_norm": 2.3628128024353203, "language_loss": 0.72835213, "learning_rate": 1.0790130558753099e-06, "loss": 0.74982506, "num_input_tokens_seen": 237933525, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7578125, "step": 11023, "time_per_iteration": 2.4095587730407715 }, { "auxiliary_loss_clip": 0.01105834, "auxiliary_loss_mlp": 0.01028424, "balance_loss_clip": 1.01652122, "balance_loss_mlp": 1.03706479, "epoch": 0.6627987374116939, "flos": 19536338966400.0, "grad_norm": 2.5662162327496363, "language_loss": 0.74845767, "learning_rate": 1.0786673625796574e-06, "loss": 0.76980031, "num_input_tokens_seen": 237953395, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 11024, "time_per_iteration": 2.467073678970337 }, { "auxiliary_loss_clip": 0.0111002, "auxiliary_loss_mlp": 0.01030797, "balance_loss_clip": 1.0179106, "balance_loss_mlp": 1.03940117, "epoch": 0.662858860664362, "flos": 15702838934400.0, "grad_norm": 2.939193846714845, "language_loss": 0.69919533, "learning_rate": 1.0783217042210306e-06, "loss": 0.72060347, "num_input_tokens_seen": 237971445, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 11025, "time_per_iteration": 2.4289627075195312 }, { "auxiliary_loss_clip": 0.01110383, "auxiliary_loss_mlp": 0.01033811, "balance_loss_clip": 1.02084148, "balance_loss_mlp": 1.04029799, "epoch": 0.6629189839170299, "flos": 20154289570560.0, "grad_norm": 1.9005817931514053, "language_loss": 0.79051518, "learning_rate": 1.0779760808125379e-06, "loss": 0.81195706, "num_input_tokens_seen": 237989965, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 11026, "time_per_iteration": 2.4728474617004395 }, { "auxiliary_loss_clip": 0.0110924, "auxiliary_loss_mlp": 0.01030575, "balance_loss_clip": 1.01868427, "balance_loss_mlp": 1.04079473, "epoch": 0.6629791071696979, "flos": 20915415786240.0, "grad_norm": 1.823320364100066, "language_loss": 0.763309, "learning_rate": 1.0776304923672842e-06, "loss": 0.78470719, "num_input_tokens_seen": 238006820, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 11027, "time_per_iteration": 2.4553375244140625 }, { "auxiliary_loss_clip": 0.01108753, "auxiliary_loss_mlp": 0.01032724, "balance_loss_clip": 1.01933122, "balance_loss_mlp": 1.03850412, "epoch": 0.6630392304223659, "flos": 20846898593280.0, "grad_norm": 2.509777655983932, "language_loss": 0.70170236, "learning_rate": 1.0772849388983742e-06, "loss": 0.72311711, "num_input_tokens_seen": 238022560, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.703125, "step": 11028, "time_per_iteration": 2.4771921634674072 }, { "auxiliary_loss_clip": 0.01107395, "auxiliary_loss_mlp": 0.01032147, "balance_loss_clip": 1.02100635, "balance_loss_mlp": 1.03775167, "epoch": 0.6630993536750338, "flos": 20995820380800.0, "grad_norm": 2.8843116827052104, "language_loss": 0.79525334, "learning_rate": 1.0769394204189138e-06, "loss": 0.81664878, "num_input_tokens_seen": 238041895, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6953125, "step": 11029, "time_per_iteration": 2.4789843559265137 }, { "auxiliary_loss_clip": 0.01108587, "auxiliary_loss_mlp": 0.01032641, "balance_loss_clip": 1.01920605, "balance_loss_mlp": 1.03690624, "epoch": 0.6631594769277018, "flos": 18259032355200.0, "grad_norm": 2.7525126816722705, "language_loss": 0.76269603, "learning_rate": 1.0765939369420012e-06, "loss": 0.78410828, "num_input_tokens_seen": 238060445, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 11030, "time_per_iteration": 2.4530739784240723 }, { "auxiliary_loss_clip": 0.01114753, "auxiliary_loss_mlp": 0.01033497, "balance_loss_clip": 1.02022874, "balance_loss_mlp": 1.04113817, "epoch": 0.6632196001803697, "flos": 17820491207040.0, "grad_norm": 2.318678699781523, "language_loss": 0.75479269, "learning_rate": 1.0762484884807391e-06, "loss": 0.77627516, "num_input_tokens_seen": 238077080, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 11031, "time_per_iteration": 2.4642672538757324 }, { "auxiliary_loss_clip": 0.01110011, "auxiliary_loss_mlp": 0.01034506, "balance_loss_clip": 1.02133381, "balance_loss_mlp": 1.03841817, "epoch": 0.6632797234330378, "flos": 12670182581760.0, "grad_norm": 2.7023723326936073, "language_loss": 0.74936867, "learning_rate": 1.075903075048228e-06, "loss": 0.77081388, "num_input_tokens_seen": 238091045, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 11032, "time_per_iteration": 2.415372848510742 }, { "auxiliary_loss_clip": 0.01106453, "auxiliary_loss_mlp": 0.01030554, "balance_loss_clip": 1.01886559, "balance_loss_mlp": 1.03710341, "epoch": 0.6633398466857057, "flos": 23584728113280.0, "grad_norm": 1.775230935051167, "language_loss": 0.80374157, "learning_rate": 1.0755576966575635e-06, "loss": 0.82511163, "num_input_tokens_seen": 238110220, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6953125, "step": 11033, "time_per_iteration": 2.5245182514190674 }, { "auxiliary_loss_clip": 0.0110721, "auxiliary_loss_mlp": 0.01032091, "balance_loss_clip": 1.01920986, "balance_loss_mlp": 1.03708994, "epoch": 0.6633999699383737, "flos": 20631686256000.0, "grad_norm": 1.8466243522856105, "language_loss": 0.80397105, "learning_rate": 1.0752123533218451e-06, "loss": 0.82536411, "num_input_tokens_seen": 238130400, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 11034, "time_per_iteration": 2.5083839893341064 }, { "auxiliary_loss_clip": 0.01105926, "auxiliary_loss_mlp": 0.01029211, "balance_loss_clip": 1.01725459, "balance_loss_mlp": 1.03743148, "epoch": 0.6634600931910416, "flos": 21797095023360.0, "grad_norm": 1.7195303881507087, "language_loss": 0.75598991, "learning_rate": 1.074867045054166e-06, "loss": 0.77734131, "num_input_tokens_seen": 238148165, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.68359375, "step": 11035, "time_per_iteration": 2.5240862369537354 }, { "auxiliary_loss_clip": 0.01107441, "auxiliary_loss_mlp": 0.01026944, "balance_loss_clip": 1.01445627, "balance_loss_mlp": 1.03590047, "epoch": 0.6635202164437096, "flos": 18732873594240.0, "grad_norm": 1.697467663535619, "language_loss": 0.82809567, "learning_rate": 1.074521771867622e-06, "loss": 0.84943956, "num_input_tokens_seen": 238166360, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 11036, "time_per_iteration": 2.4807868003845215 }, { "auxiliary_loss_clip": 0.01030271, "auxiliary_loss_mlp": 0.01001773, "balance_loss_clip": 1.00044966, "balance_loss_mlp": 1.00744355, "epoch": 0.6635803396963775, "flos": 60222771227520.0, "grad_norm": 0.7819289438501623, "language_loss": 0.52291012, "learning_rate": 1.0741765337753044e-06, "loss": 0.54323053, "num_input_tokens_seen": 238227630, "router_z_loss_clip": 0.01324463, "router_z_loss_mlp": 0.22851562, "step": 11037, "time_per_iteration": 3.092477560043335 }, { "auxiliary_loss_clip": 0.01108559, "auxiliary_loss_mlp": 0.01039663, "balance_loss_clip": 1.02651453, "balance_loss_mlp": 1.03863478, "epoch": 0.6636404629490456, "flos": 29167041611520.0, "grad_norm": 2.180690436021762, "language_loss": 0.79362512, "learning_rate": 1.0738313307903052e-06, "loss": 0.81510735, "num_input_tokens_seen": 238248435, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 11038, "time_per_iteration": 2.550063133239746 }, { "auxiliary_loss_clip": 0.01108812, "auxiliary_loss_mlp": 0.01038202, "balance_loss_clip": 1.02448082, "balance_loss_mlp": 1.03886032, "epoch": 0.6637005862017135, "flos": 38907702766080.0, "grad_norm": 1.8562458179010226, "language_loss": 0.6393953, "learning_rate": 1.073486162925716e-06, "loss": 0.66086543, "num_input_tokens_seen": 238268755, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.69921875, "step": 11039, "time_per_iteration": 2.6581645011901855 }, { "auxiliary_loss_clip": 0.01109706, "auxiliary_loss_mlp": 0.01028797, "balance_loss_clip": 1.01591063, "balance_loss_mlp": 1.03770852, "epoch": 0.6637607094543815, "flos": 22783345729920.0, "grad_norm": 1.6398306958946767, "language_loss": 0.64018822, "learning_rate": 1.0731410301946237e-06, "loss": 0.66157323, "num_input_tokens_seen": 238290120, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 11040, "time_per_iteration": 2.4953670501708984 }, { "auxiliary_loss_clip": 0.01105005, "auxiliary_loss_mlp": 0.01033004, "balance_loss_clip": 1.02100015, "balance_loss_mlp": 1.03651321, "epoch": 0.6638208327070495, "flos": 18114096977280.0, "grad_norm": 5.978376482679355, "language_loss": 0.7207408, "learning_rate": 1.0727959326101161e-06, "loss": 0.74212098, "num_input_tokens_seen": 238309290, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.68359375, "step": 11041, "time_per_iteration": 2.452242612838745 }, { "auxiliary_loss_clip": 0.01105263, "auxiliary_loss_mlp": 0.01038521, "balance_loss_clip": 1.02554464, "balance_loss_mlp": 1.03642929, "epoch": 0.6638809559597174, "flos": 29424880414080.0, "grad_norm": 4.491632259577376, "language_loss": 0.61653459, "learning_rate": 1.0724508701852806e-06, "loss": 0.63797235, "num_input_tokens_seen": 238327280, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6875, "step": 11042, "time_per_iteration": 2.5278611183166504 }, { "auxiliary_loss_clip": 0.01110267, "auxiliary_loss_mlp": 0.01027915, "balance_loss_clip": 1.01428962, "balance_loss_mlp": 1.03715539, "epoch": 0.6639410792123854, "flos": 28072699902720.0, "grad_norm": 2.1147036929688543, "language_loss": 0.68496662, "learning_rate": 1.0721058429331998e-06, "loss": 0.70634842, "num_input_tokens_seen": 238346330, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 11043, "time_per_iteration": 2.493032455444336 }, { "auxiliary_loss_clip": 0.01104222, "auxiliary_loss_mlp": 0.01029598, "balance_loss_clip": 1.01841044, "balance_loss_mlp": 1.03847182, "epoch": 0.6640012024650533, "flos": 25556367600000.0, "grad_norm": 1.6404504655666765, "language_loss": 0.83788794, "learning_rate": 1.0717608508669587e-06, "loss": 0.85922611, "num_input_tokens_seen": 238364650, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.65625, "step": 11044, "time_per_iteration": 2.4970338344573975 }, { "auxiliary_loss_clip": 0.01107383, "auxiliary_loss_mlp": 0.01031559, "balance_loss_clip": 1.01874411, "balance_loss_mlp": 1.03820586, "epoch": 0.6640613257177214, "flos": 14866946559360.0, "grad_norm": 2.5344522065763706, "language_loss": 0.69381094, "learning_rate": 1.0714158939996392e-06, "loss": 0.71520036, "num_input_tokens_seen": 238381630, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 11045, "time_per_iteration": 3.872126817703247 }, { "auxiliary_loss_clip": 0.01109827, "auxiliary_loss_mlp": 0.01029825, "balance_loss_clip": 1.01684308, "balance_loss_mlp": 1.03882611, "epoch": 0.6641214489703893, "flos": 23221096778880.0, "grad_norm": 7.0237076839565855, "language_loss": 0.64587641, "learning_rate": 1.0710709723443235e-06, "loss": 0.66727293, "num_input_tokens_seen": 238402595, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 11046, "time_per_iteration": 2.5036330223083496 }, { "auxiliary_loss_clip": 0.01106516, "auxiliary_loss_mlp": 0.01029818, "balance_loss_clip": 1.01731908, "balance_loss_mlp": 1.03798163, "epoch": 0.6641815722230573, "flos": 37742617221120.0, "grad_norm": 1.5586943385733685, "language_loss": 0.71537232, "learning_rate": 1.070726085914088e-06, "loss": 0.73673558, "num_input_tokens_seen": 238426860, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 11047, "time_per_iteration": 2.6183254718780518 }, { "auxiliary_loss_clip": 0.0110976, "auxiliary_loss_mlp": 0.01035735, "balance_loss_clip": 1.02296782, "balance_loss_mlp": 1.04025459, "epoch": 0.6642416954757252, "flos": 17931132074880.0, "grad_norm": 2.1001713239174364, "language_loss": 0.76933491, "learning_rate": 1.0703812347220126e-06, "loss": 0.79078984, "num_input_tokens_seen": 238443990, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 11048, "time_per_iteration": 5.265036582946777 }, { "auxiliary_loss_clip": 0.01029557, "auxiliary_loss_mlp": 0.01002402, "balance_loss_clip": 1.00122154, "balance_loss_mlp": 1.00679922, "epoch": 0.6643018187283932, "flos": 51995384104320.0, "grad_norm": 0.7616697163381695, "language_loss": 0.55056143, "learning_rate": 1.0700364187811745e-06, "loss": 0.57088101, "num_input_tokens_seen": 238503045, "router_z_loss_clip": 0.01177979, "router_z_loss_mlp": 0.22753906, "step": 11049, "time_per_iteration": 3.1019442081451416 }, { "auxiliary_loss_clip": 0.01107166, "auxiliary_loss_mlp": 0.01029205, "balance_loss_clip": 1.01769495, "balance_loss_mlp": 1.03826499, "epoch": 0.6643619419810611, "flos": 30226657847040.0, "grad_norm": 1.6555189889986732, "language_loss": 0.64210212, "learning_rate": 1.069691638104648e-06, "loss": 0.66346574, "num_input_tokens_seen": 238527320, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 11050, "time_per_iteration": 3.966386318206787 }, { "auxiliary_loss_clip": 0.01104265, "auxiliary_loss_mlp": 0.01029962, "balance_loss_clip": 1.01850629, "balance_loss_mlp": 1.03689027, "epoch": 0.6644220652337292, "flos": 22966131064320.0, "grad_norm": 2.3351286372771933, "language_loss": 0.79025298, "learning_rate": 1.0693468927055085e-06, "loss": 0.81159526, "num_input_tokens_seen": 238546030, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.671875, "step": 11051, "time_per_iteration": 2.5016870498657227 }, { "auxiliary_loss_clip": 0.01108027, "auxiliary_loss_mlp": 0.01035788, "balance_loss_clip": 1.02371264, "balance_loss_mlp": 1.03888679, "epoch": 0.6644821884863971, "flos": 21142228216320.0, "grad_norm": 1.777670083590484, "language_loss": 0.85477412, "learning_rate": 1.0690021825968276e-06, "loss": 0.87621224, "num_input_tokens_seen": 238564175, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 11052, "time_per_iteration": 2.470715045928955 }, { "auxiliary_loss_clip": 0.01108698, "auxiliary_loss_mlp": 0.0103735, "balance_loss_clip": 1.02371287, "balance_loss_mlp": 1.037287, "epoch": 0.6645423117390651, "flos": 20192821885440.0, "grad_norm": 2.2934322717044067, "language_loss": 0.7416625, "learning_rate": 1.0686575077916776e-06, "loss": 0.76312292, "num_input_tokens_seen": 238581010, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7109375, "step": 11053, "time_per_iteration": 2.508918046951294 }, { "auxiliary_loss_clip": 0.01103449, "auxiliary_loss_mlp": 0.01027732, "balance_loss_clip": 1.01555479, "balance_loss_mlp": 1.03663635, "epoch": 0.6646024349917331, "flos": 24351959640960.0, "grad_norm": 1.982409256612542, "language_loss": 0.79457563, "learning_rate": 1.0683128683031278e-06, "loss": 0.81588745, "num_input_tokens_seen": 238601365, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.66796875, "step": 11054, "time_per_iteration": 2.5568015575408936 }, { "auxiliary_loss_clip": 0.0110399, "auxiliary_loss_mlp": 0.01029265, "balance_loss_clip": 1.01798761, "balance_loss_mlp": 1.03648579, "epoch": 0.664662558244401, "flos": 18806706000000.0, "grad_norm": 1.5614740622116519, "language_loss": 0.73993158, "learning_rate": 1.0679682641442472e-06, "loss": 0.76126415, "num_input_tokens_seen": 238619850, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.67578125, "step": 11055, "time_per_iteration": 2.4705071449279785 }, { "auxiliary_loss_clip": 0.01108566, "auxiliary_loss_mlp": 0.01037847, "balance_loss_clip": 1.02429271, "balance_loss_mlp": 1.03805208, "epoch": 0.664722681497069, "flos": 18952790613120.0, "grad_norm": 1.7225241808510952, "language_loss": 0.73033518, "learning_rate": 1.0676236953281042e-06, "loss": 0.75179935, "num_input_tokens_seen": 238637635, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.703125, "step": 11056, "time_per_iteration": 2.499225378036499 }, { "auxiliary_loss_clip": 0.0110472, "auxiliary_loss_mlp": 0.01028819, "balance_loss_clip": 1.01655281, "balance_loss_mlp": 1.03651702, "epoch": 0.6647828047497369, "flos": 19571279921280.0, "grad_norm": 2.4233024862476933, "language_loss": 0.69715881, "learning_rate": 1.0672791618677641e-06, "loss": 0.71849424, "num_input_tokens_seen": 238656200, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6796875, "step": 11057, "time_per_iteration": 2.594269037246704 }, { "auxiliary_loss_clip": 0.01108137, "auxiliary_loss_mlp": 0.01030015, "balance_loss_clip": 1.0175879, "balance_loss_mlp": 1.0385623, "epoch": 0.664842928002405, "flos": 23149455102720.0, "grad_norm": 1.8183742852290545, "language_loss": 0.8038398, "learning_rate": 1.066934663776291e-06, "loss": 0.8252213, "num_input_tokens_seen": 238675005, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 11058, "time_per_iteration": 2.6490964889526367 }, { "auxiliary_loss_clip": 0.01030761, "auxiliary_loss_mlp": 0.0100215, "balance_loss_clip": 1.00093389, "balance_loss_mlp": 1.00806212, "epoch": 0.6649030512550729, "flos": 65244913148160.0, "grad_norm": 0.8132393347229856, "language_loss": 0.6263634, "learning_rate": 1.0665902010667496e-06, "loss": 0.64669251, "num_input_tokens_seen": 238731425, "router_z_loss_clip": 0.012146, "router_z_loss_mlp": 0.22753906, "step": 11059, "time_per_iteration": 3.1937127113342285 }, { "auxiliary_loss_clip": 0.01105136, "auxiliary_loss_mlp": 0.01033857, "balance_loss_clip": 1.02236509, "balance_loss_mlp": 1.03695631, "epoch": 0.6649631745077409, "flos": 20194797133440.0, "grad_norm": 1.4263525214297237, "language_loss": 0.78994739, "learning_rate": 1.0662457737522008e-06, "loss": 0.81133729, "num_input_tokens_seen": 238752020, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.68359375, "step": 11060, "time_per_iteration": 2.6296141147613525 }, { "auxiliary_loss_clip": 0.0110901, "auxiliary_loss_mlp": 0.01031527, "balance_loss_clip": 1.01838374, "balance_loss_mlp": 1.03891134, "epoch": 0.6650232977604088, "flos": 17238558965760.0, "grad_norm": 5.3493261967804555, "language_loss": 0.78706086, "learning_rate": 1.0659013818457055e-06, "loss": 0.8084662, "num_input_tokens_seen": 238769665, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69921875, "step": 11061, "time_per_iteration": 2.623476505279541 }, { "auxiliary_loss_clip": 0.01105286, "auxiliary_loss_mlp": 0.01026302, "balance_loss_clip": 1.01442897, "balance_loss_mlp": 1.03753257, "epoch": 0.6650834210130768, "flos": 10006867825920.0, "grad_norm": 2.3178280834216736, "language_loss": 0.57226086, "learning_rate": 1.0655570253603243e-06, "loss": 0.59357679, "num_input_tokens_seen": 238782180, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6796875, "step": 11062, "time_per_iteration": 2.5989537239074707 }, { "auxiliary_loss_clip": 0.01110792, "auxiliary_loss_mlp": 0.01027486, "balance_loss_clip": 1.01388431, "balance_loss_mlp": 1.03720164, "epoch": 0.6651435442657447, "flos": 10452088903680.0, "grad_norm": 2.153453205317903, "language_loss": 0.75670546, "learning_rate": 1.0652127043091144e-06, "loss": 0.77808821, "num_input_tokens_seen": 238800315, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 11063, "time_per_iteration": 2.5186660289764404 }, { "auxiliary_loss_clip": 0.01107501, "auxiliary_loss_mlp": 0.01034299, "balance_loss_clip": 1.02255678, "balance_loss_mlp": 1.03831482, "epoch": 0.6652036675184128, "flos": 22344229964160.0, "grad_norm": 1.3006665876044534, "language_loss": 0.70685345, "learning_rate": 1.0648684187051316e-06, "loss": 0.72827148, "num_input_tokens_seen": 238822250, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.69140625, "step": 11064, "time_per_iteration": 2.578334093093872 }, { "auxiliary_loss_clip": 0.01031252, "auxiliary_loss_mlp": 0.01001481, "balance_loss_clip": 1.000319, "balance_loss_mlp": 1.00872016, "epoch": 0.6652637907710807, "flos": 52909633998720.0, "grad_norm": 0.8350189938312718, "language_loss": 0.63082236, "learning_rate": 1.0645241685614322e-06, "loss": 0.65114969, "num_input_tokens_seen": 238877190, "router_z_loss_clip": 0.01159668, "router_z_loss_mlp": 0.22558594, "step": 11065, "time_per_iteration": 3.0466699600219727 }, { "auxiliary_loss_clip": 0.01106585, "auxiliary_loss_mlp": 0.01029391, "balance_loss_clip": 1.01663566, "balance_loss_mlp": 1.03717029, "epoch": 0.6653239140237487, "flos": 23104637907840.0, "grad_norm": 2.8553736552855074, "language_loss": 0.62160635, "learning_rate": 1.0641799538910708e-06, "loss": 0.64296615, "num_input_tokens_seen": 238896010, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 11066, "time_per_iteration": 2.516749620437622 }, { "auxiliary_loss_clip": 0.01107814, "auxiliary_loss_mlp": 0.01029887, "balance_loss_clip": 1.01628542, "balance_loss_mlp": 1.0370667, "epoch": 0.6653840372764167, "flos": 25959393175680.0, "grad_norm": 1.471056931744988, "language_loss": 0.69787812, "learning_rate": 1.0638357747070985e-06, "loss": 0.71925509, "num_input_tokens_seen": 238918990, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.70703125, "step": 11067, "time_per_iteration": 2.537781000137329 }, { "auxiliary_loss_clip": 0.01030441, "auxiliary_loss_mlp": 0.01001352, "balance_loss_clip": 1.00018406, "balance_loss_mlp": 1.00769758, "epoch": 0.6654441605290846, "flos": 66041985899520.0, "grad_norm": 0.9317227740203229, "language_loss": 0.72073001, "learning_rate": 1.0634916310225684e-06, "loss": 0.74104792, "num_input_tokens_seen": 238975735, "router_z_loss_clip": 0.01165771, "router_z_loss_mlp": 0.22753906, "step": 11068, "time_per_iteration": 3.10286283493042 }, { "auxiliary_loss_clip": 0.01030315, "auxiliary_loss_mlp": 0.01002603, "balance_loss_clip": 1.00142848, "balance_loss_mlp": 1.00745726, "epoch": 0.6655042837817526, "flos": 65196112521600.0, "grad_norm": 6.049066027588001, "language_loss": 0.5778625, "learning_rate": 1.0631475228505285e-06, "loss": 0.59819168, "num_input_tokens_seen": 239042360, "router_z_loss_clip": 0.01171875, "router_z_loss_mlp": 0.22851562, "step": 11069, "time_per_iteration": 3.2452445030212402 }, { "auxiliary_loss_clip": 0.01030406, "auxiliary_loss_mlp": 0.01003096, "balance_loss_clip": 1.00192165, "balance_loss_mlp": 1.00764728, "epoch": 0.6655644070344205, "flos": 69008746752000.0, "grad_norm": 0.7561241666498124, "language_loss": 0.63511777, "learning_rate": 1.062803450204029e-06, "loss": 0.65545285, "num_input_tokens_seen": 239109410, "router_z_loss_clip": 0.01171875, "router_z_loss_mlp": 0.22851562, "step": 11070, "time_per_iteration": 3.1939773559570312 }, { "auxiliary_loss_clip": 0.01104541, "auxiliary_loss_mlp": 0.01028953, "balance_loss_clip": 1.01646614, "balance_loss_mlp": 1.0348773, "epoch": 0.6656245302870886, "flos": 36315562809600.0, "grad_norm": 1.9126791996992223, "language_loss": 0.58869064, "learning_rate": 1.062459413096116e-06, "loss": 0.61002564, "num_input_tokens_seen": 239135345, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 11071, "time_per_iteration": 2.647639513015747 }, { "auxiliary_loss_clip": 0.01109493, "auxiliary_loss_mlp": 0.01026473, "balance_loss_clip": 1.01462996, "balance_loss_mlp": 1.03967977, "epoch": 0.6656846535397565, "flos": 21794832466560.0, "grad_norm": 2.8288410469501923, "language_loss": 0.73031485, "learning_rate": 1.0621154115398364e-06, "loss": 0.75167453, "num_input_tokens_seen": 239154340, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 11072, "time_per_iteration": 2.516045331954956 }, { "auxiliary_loss_clip": 0.01106582, "auxiliary_loss_mlp": 0.01028524, "balance_loss_clip": 1.01502311, "balance_loss_mlp": 1.03855145, "epoch": 0.6657447767924245, "flos": 37487615592960.0, "grad_norm": 6.45908629977969, "language_loss": 0.70960271, "learning_rate": 1.0617714455482353e-06, "loss": 0.73095381, "num_input_tokens_seen": 239177815, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6796875, "step": 11073, "time_per_iteration": 2.7273833751678467 }, { "auxiliary_loss_clip": 0.01108315, "auxiliary_loss_mlp": 0.0102984, "balance_loss_clip": 1.01684678, "balance_loss_mlp": 1.03714991, "epoch": 0.6658049000450924, "flos": 16837688206080.0, "grad_norm": 2.244420748234324, "language_loss": 0.55871397, "learning_rate": 1.061427515134354e-06, "loss": 0.58009553, "num_input_tokens_seen": 239195735, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 11074, "time_per_iteration": 2.4698662757873535 }, { "auxiliary_loss_clip": 0.01108011, "auxiliary_loss_mlp": 0.01031287, "balance_loss_clip": 1.01905036, "balance_loss_mlp": 1.03871047, "epoch": 0.6658650232977604, "flos": 33510975863040.0, "grad_norm": 1.5244188293555654, "language_loss": 0.72291219, "learning_rate": 1.061083620311235e-06, "loss": 0.74430513, "num_input_tokens_seen": 239217535, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 11075, "time_per_iteration": 2.5931472778320312 }, { "auxiliary_loss_clip": 0.01104945, "auxiliary_loss_mlp": 0.0103107, "balance_loss_clip": 1.0188396, "balance_loss_mlp": 1.03718364, "epoch": 0.6659251465504283, "flos": 37706311549440.0, "grad_norm": 1.5469952808400622, "language_loss": 0.66238034, "learning_rate": 1.0607397610919202e-06, "loss": 0.6837405, "num_input_tokens_seen": 239241975, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6796875, "step": 11076, "time_per_iteration": 2.6119394302368164 }, { "auxiliary_loss_clip": 0.01106738, "auxiliary_loss_mlp": 0.01033526, "balance_loss_clip": 1.02062154, "balance_loss_mlp": 1.03784621, "epoch": 0.6659852698030964, "flos": 24893420232960.0, "grad_norm": 1.6745963083185385, "language_loss": 0.75289828, "learning_rate": 1.0603959374894468e-06, "loss": 0.77430093, "num_input_tokens_seen": 239262025, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 11077, "time_per_iteration": 2.529440402984619 }, { "auxiliary_loss_clip": 0.01106169, "auxiliary_loss_mlp": 0.01030391, "balance_loss_clip": 1.0178982, "balance_loss_mlp": 1.03620756, "epoch": 0.6660453930557643, "flos": 24352821567360.0, "grad_norm": 2.8535787796751113, "language_loss": 0.66832078, "learning_rate": 1.0600521495168538e-06, "loss": 0.68968642, "num_input_tokens_seen": 239282775, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 11078, "time_per_iteration": 2.5172946453094482 }, { "auxiliary_loss_clip": 0.01108305, "auxiliary_loss_mlp": 0.01032331, "balance_loss_clip": 1.0188303, "balance_loss_mlp": 1.03717351, "epoch": 0.6661055163084323, "flos": 10597814380800.0, "grad_norm": 2.6118508174029493, "language_loss": 0.69491923, "learning_rate": 1.0597083971871783e-06, "loss": 0.71632552, "num_input_tokens_seen": 239299775, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 11079, "time_per_iteration": 2.461369752883911 }, { "auxiliary_loss_clip": 0.01104238, "auxiliary_loss_mlp": 0.01024657, "balance_loss_clip": 1.01253986, "balance_loss_mlp": 1.03565717, "epoch": 0.6661656395611003, "flos": 24057491944320.0, "grad_norm": 1.5015598190890809, "language_loss": 0.80130917, "learning_rate": 1.0593646805134544e-06, "loss": 0.8225981, "num_input_tokens_seen": 239319660, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 11080, "time_per_iteration": 2.4773950576782227 }, { "auxiliary_loss_clip": 0.01101711, "auxiliary_loss_mlp": 0.01028112, "balance_loss_clip": 1.01651287, "balance_loss_mlp": 1.03598952, "epoch": 0.6662257628137682, "flos": 23036192542080.0, "grad_norm": 1.8875204362841749, "language_loss": 0.78092015, "learning_rate": 1.0590209995087157e-06, "loss": 0.80221844, "num_input_tokens_seen": 239339215, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.65625, "step": 11081, "time_per_iteration": 2.4692630767822266 }, { "auxiliary_loss_clip": 0.01109432, "auxiliary_loss_mlp": 0.01032629, "balance_loss_clip": 1.01943815, "balance_loss_mlp": 1.03856635, "epoch": 0.6662858860664362, "flos": 24754446512640.0, "grad_norm": 2.3857208058740667, "language_loss": 0.80057502, "learning_rate": 1.0586773541859946e-06, "loss": 0.82199568, "num_input_tokens_seen": 239358545, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 11082, "time_per_iteration": 2.511591672897339 }, { "auxiliary_loss_clip": 0.01105583, "auxiliary_loss_mlp": 0.01034882, "balance_loss_clip": 1.02322364, "balance_loss_mlp": 1.03652477, "epoch": 0.6663460093191041, "flos": 20009066883840.0, "grad_norm": 1.5878864190793578, "language_loss": 0.84050912, "learning_rate": 1.0583337445583234e-06, "loss": 0.8619138, "num_input_tokens_seen": 239376665, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.69140625, "step": 11083, "time_per_iteration": 2.466935634613037 }, { "auxiliary_loss_clip": 0.01113283, "auxiliary_loss_mlp": 0.01033854, "balance_loss_clip": 1.02073538, "balance_loss_mlp": 1.04024887, "epoch": 0.6664061325717722, "flos": 17821389047040.0, "grad_norm": 2.097747357832693, "language_loss": 0.85429895, "learning_rate": 1.057990170638731e-06, "loss": 0.87577027, "num_input_tokens_seen": 239394345, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 11084, "time_per_iteration": 2.4428212642669678 }, { "auxiliary_loss_clip": 0.01109408, "auxiliary_loss_mlp": 0.01031816, "balance_loss_clip": 1.0186851, "balance_loss_mlp": 1.03754354, "epoch": 0.6664662558244401, "flos": 18076893465600.0, "grad_norm": 11.77772605283859, "language_loss": 0.73222125, "learning_rate": 1.0576466324402452e-06, "loss": 0.7536335, "num_input_tokens_seen": 239410605, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 11085, "time_per_iteration": 2.436018228530884 }, { "auxiliary_loss_clip": 0.01104704, "auxiliary_loss_mlp": 0.01032796, "balance_loss_clip": 1.02024341, "balance_loss_mlp": 1.03636503, "epoch": 0.6665263790771081, "flos": 21574197175680.0, "grad_norm": 2.7970301126847557, "language_loss": 0.80699968, "learning_rate": 1.057303129975894e-06, "loss": 0.82837468, "num_input_tokens_seen": 239427155, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 11086, "time_per_iteration": 2.47737455368042 }, { "auxiliary_loss_clip": 0.01107607, "auxiliary_loss_mlp": 0.010303, "balance_loss_clip": 1.01709723, "balance_loss_mlp": 1.0382061, "epoch": 0.666586502329776, "flos": 24206629213440.0, "grad_norm": 2.200231072609223, "language_loss": 0.74519026, "learning_rate": 1.056959663258702e-06, "loss": 0.76656932, "num_input_tokens_seen": 239445510, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6953125, "step": 11087, "time_per_iteration": 3.859083414077759 }, { "auxiliary_loss_clip": 0.01106243, "auxiliary_loss_mlp": 0.01031471, "balance_loss_clip": 1.01847744, "balance_loss_mlp": 1.03707612, "epoch": 0.666646625582444, "flos": 22200515648640.0, "grad_norm": 1.6783892300357133, "language_loss": 0.64890337, "learning_rate": 1.0566162323016939e-06, "loss": 0.67028052, "num_input_tokens_seen": 239464805, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69140625, "step": 11088, "time_per_iteration": 2.459411382675171 }, { "auxiliary_loss_clip": 0.01108907, "auxiliary_loss_mlp": 0.01026498, "balance_loss_clip": 1.01378465, "balance_loss_mlp": 1.03842247, "epoch": 0.6667067488351119, "flos": 18259930195200.0, "grad_norm": 1.9267886983006304, "language_loss": 0.64214611, "learning_rate": 1.0562728371178928e-06, "loss": 0.66350013, "num_input_tokens_seen": 239483890, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 11089, "time_per_iteration": 3.890876531600952 }, { "auxiliary_loss_clip": 0.01104688, "auxiliary_loss_mlp": 0.01028444, "balance_loss_clip": 1.01655316, "balance_loss_mlp": 1.03615332, "epoch": 0.66676687208778, "flos": 17236547804160.0, "grad_norm": 3.2204701721197324, "language_loss": 0.80418706, "learning_rate": 1.0559294777203221e-06, "loss": 0.82551837, "num_input_tokens_seen": 239500080, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 11090, "time_per_iteration": 3.8365468978881836 }, { "auxiliary_loss_clip": 0.01109105, "auxiliary_loss_mlp": 0.01031837, "balance_loss_clip": 1.0192008, "balance_loss_mlp": 1.03771353, "epoch": 0.6668269953404479, "flos": 19752197748480.0, "grad_norm": 2.3498547733431536, "language_loss": 0.77467537, "learning_rate": 1.0555861541219984e-06, "loss": 0.79608476, "num_input_tokens_seen": 239517335, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 11091, "time_per_iteration": 3.988166570663452 }, { "auxiliary_loss_clip": 0.01105221, "auxiliary_loss_mlp": 0.01031314, "balance_loss_clip": 1.01852298, "balance_loss_mlp": 1.03652406, "epoch": 0.6668871185931159, "flos": 20558428467840.0, "grad_norm": 1.7899860143081128, "language_loss": 0.79401523, "learning_rate": 1.0552428663359425e-06, "loss": 0.81538057, "num_input_tokens_seen": 239536240, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 11092, "time_per_iteration": 2.4658937454223633 }, { "auxiliary_loss_clip": 0.01031034, "auxiliary_loss_mlp": 0.01004768, "balance_loss_clip": 1.0034802, "balance_loss_mlp": 1.00822949, "epoch": 0.6669472418457839, "flos": 58088167735680.0, "grad_norm": 0.7514407936242234, "language_loss": 0.57683766, "learning_rate": 1.0548996143751724e-06, "loss": 0.59719568, "num_input_tokens_seen": 239598000, "router_z_loss_clip": 0.01287842, "router_z_loss_mlp": 0.22851562, "step": 11093, "time_per_iteration": 3.1267800331115723 }, { "auxiliary_loss_clip": 0.01105964, "auxiliary_loss_mlp": 0.01027412, "balance_loss_clip": 1.01511598, "balance_loss_mlp": 1.03742254, "epoch": 0.6670073650984518, "flos": 26065113880320.0, "grad_norm": 1.7462899015899078, "language_loss": 0.76670241, "learning_rate": 1.054556398252703e-06, "loss": 0.78803623, "num_input_tokens_seen": 239617650, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 11094, "time_per_iteration": 2.564671277999878 }, { "auxiliary_loss_clip": 0.01106184, "auxiliary_loss_mlp": 0.01029891, "balance_loss_clip": 1.01695657, "balance_loss_mlp": 1.03621411, "epoch": 0.6670674883511198, "flos": 32416849635840.0, "grad_norm": 1.894712568980706, "language_loss": 0.73144621, "learning_rate": 1.05421321798155e-06, "loss": 0.7528069, "num_input_tokens_seen": 239639825, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 11095, "time_per_iteration": 2.615229368209839 }, { "auxiliary_loss_clip": 0.01108274, "auxiliary_loss_mlp": 0.01035277, "balance_loss_clip": 1.02271271, "balance_loss_mlp": 1.03921342, "epoch": 0.6671276116037878, "flos": 18037786533120.0, "grad_norm": 2.264844504855073, "language_loss": 0.73286641, "learning_rate": 1.053870073574727e-06, "loss": 0.75430191, "num_input_tokens_seen": 239656300, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 11096, "time_per_iteration": 2.4767355918884277 }, { "auxiliary_loss_clip": 0.01104814, "auxiliary_loss_mlp": 0.01032462, "balance_loss_clip": 1.02033257, "balance_loss_mlp": 1.03670573, "epoch": 0.6671877348564558, "flos": 23767046570880.0, "grad_norm": 2.1495748533830823, "language_loss": 0.64223361, "learning_rate": 1.0535269650452456e-06, "loss": 0.66360629, "num_input_tokens_seen": 239676655, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 11097, "time_per_iteration": 2.513183116912842 }, { "auxiliary_loss_clip": 0.0110789, "auxiliary_loss_mlp": 0.01032025, "balance_loss_clip": 1.01910865, "balance_loss_mlp": 1.03696311, "epoch": 0.6672478581091237, "flos": 20918360701440.0, "grad_norm": 2.022545707207145, "language_loss": 0.75676197, "learning_rate": 1.0531838924061158e-06, "loss": 0.77816117, "num_input_tokens_seen": 239695430, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 11098, "time_per_iteration": 2.5086288452148438 }, { "auxiliary_loss_clip": 0.01109799, "auxiliary_loss_mlp": 0.01031033, "balance_loss_clip": 1.01941597, "balance_loss_mlp": 1.03919816, "epoch": 0.6673079813617917, "flos": 27855799626240.0, "grad_norm": 2.003945487035022, "language_loss": 0.74441409, "learning_rate": 1.0528408556703476e-06, "loss": 0.76582241, "num_input_tokens_seen": 239717070, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.703125, "step": 11099, "time_per_iteration": 2.5566952228546143 }, { "auxiliary_loss_clip": 0.01104264, "auxiliary_loss_mlp": 0.0103136, "balance_loss_clip": 1.01928973, "balance_loss_mlp": 1.03645337, "epoch": 0.6673681046144596, "flos": 21616859554560.0, "grad_norm": 2.1831450800509957, "language_loss": 0.78554487, "learning_rate": 1.0524978548509502e-06, "loss": 0.8069011, "num_input_tokens_seen": 239737105, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 11100, "time_per_iteration": 2.4922075271606445 }, { "auxiliary_loss_clip": 0.01106115, "auxiliary_loss_mlp": 0.01038592, "balance_loss_clip": 1.02640879, "balance_loss_mlp": 1.03740764, "epoch": 0.6674282278671276, "flos": 20889884194560.0, "grad_norm": 3.804100982016281, "language_loss": 0.60385787, "learning_rate": 1.0521548899609288e-06, "loss": 0.62530488, "num_input_tokens_seen": 239757835, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 11101, "time_per_iteration": 2.512406349182129 }, { "auxiliary_loss_clip": 0.01111992, "auxiliary_loss_mlp": 0.01039257, "balance_loss_clip": 1.02469003, "balance_loss_mlp": 1.03783143, "epoch": 0.6674883511197955, "flos": 23624194181760.0, "grad_norm": 1.6876247022398307, "language_loss": 0.71610707, "learning_rate": 1.0518119610132884e-06, "loss": 0.73761952, "num_input_tokens_seen": 239775425, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7421875, "step": 11102, "time_per_iteration": 2.5045864582061768 }, { "auxiliary_loss_clip": 0.01106213, "auxiliary_loss_mlp": 0.0102959, "balance_loss_clip": 1.01718605, "balance_loss_mlp": 1.03596282, "epoch": 0.6675484743724636, "flos": 19609668581760.0, "grad_norm": 2.104101334365758, "language_loss": 0.84678674, "learning_rate": 1.051469068021034e-06, "loss": 0.86814475, "num_input_tokens_seen": 239794605, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 11103, "time_per_iteration": 2.4771313667297363 }, { "auxiliary_loss_clip": 0.01108134, "auxiliary_loss_mlp": 0.01023924, "balance_loss_clip": 1.0113951, "balance_loss_mlp": 1.03670359, "epoch": 0.6676085976251315, "flos": 14319452482560.0, "grad_norm": 1.9559876820747213, "language_loss": 0.77691138, "learning_rate": 1.0511262109971668e-06, "loss": 0.79823196, "num_input_tokens_seen": 239812135, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71484375, "step": 11104, "time_per_iteration": 2.4801206588745117 }, { "auxiliary_loss_clip": 0.01109951, "auxiliary_loss_mlp": 0.01026832, "balance_loss_clip": 1.01488674, "balance_loss_mlp": 1.03765011, "epoch": 0.6676687208777995, "flos": 38104596529920.0, "grad_norm": 1.6543378130179613, "language_loss": 0.58286315, "learning_rate": 1.0507833899546889e-06, "loss": 0.604231, "num_input_tokens_seen": 239835845, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.72265625, "step": 11105, "time_per_iteration": 2.6281402111053467 }, { "auxiliary_loss_clip": 0.01112067, "auxiliary_loss_mlp": 0.01033697, "balance_loss_clip": 1.01936197, "balance_loss_mlp": 1.03786755, "epoch": 0.6677288441304675, "flos": 23981576549760.0, "grad_norm": 1.6982733917477035, "language_loss": 0.73144388, "learning_rate": 1.0504406049066e-06, "loss": 0.75290143, "num_input_tokens_seen": 239853820, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7421875, "step": 11106, "time_per_iteration": 2.4925055503845215 }, { "auxiliary_loss_clip": 0.0110666, "auxiliary_loss_mlp": 0.01027363, "balance_loss_clip": 1.01491165, "balance_loss_mlp": 1.03668213, "epoch": 0.6677889673831354, "flos": 24170682677760.0, "grad_norm": 1.8794645251074884, "language_loss": 0.76223421, "learning_rate": 1.0500978558659e-06, "loss": 0.78357446, "num_input_tokens_seen": 239873365, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 11107, "time_per_iteration": 2.5395209789276123 }, { "auxiliary_loss_clip": 0.0110226, "auxiliary_loss_mlp": 0.0103073, "balance_loss_clip": 1.01828456, "balance_loss_mlp": 1.03575802, "epoch": 0.6678490906358034, "flos": 22309648145280.0, "grad_norm": 4.651093549490472, "language_loss": 0.9022491, "learning_rate": 1.049755142845583e-06, "loss": 0.92357898, "num_input_tokens_seen": 239891215, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6640625, "step": 11108, "time_per_iteration": 2.5031163692474365 }, { "auxiliary_loss_clip": 0.01104711, "auxiliary_loss_mlp": 0.01025483, "balance_loss_clip": 1.01453328, "balance_loss_mlp": 1.03629613, "epoch": 0.6679092138884714, "flos": 36898752026880.0, "grad_norm": 2.1757044763934084, "language_loss": 0.8277908, "learning_rate": 1.049412465858646e-06, "loss": 0.84909272, "num_input_tokens_seen": 239913490, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.68359375, "step": 11109, "time_per_iteration": 2.6700780391693115 }, { "auxiliary_loss_clip": 0.01107024, "auxiliary_loss_mlp": 0.01030416, "balance_loss_clip": 1.01755309, "balance_loss_mlp": 1.0374856, "epoch": 0.6679693371411394, "flos": 18150294908160.0, "grad_norm": 5.987841553036309, "language_loss": 0.68916386, "learning_rate": 1.0490698249180847e-06, "loss": 0.71053827, "num_input_tokens_seen": 239931565, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 11110, "time_per_iteration": 2.4740641117095947 }, { "auxiliary_loss_clip": 0.01108547, "auxiliary_loss_mlp": 0.01035251, "balance_loss_clip": 1.02141654, "balance_loss_mlp": 1.03670287, "epoch": 0.6680294603938073, "flos": 27198167472000.0, "grad_norm": 1.9151522376739134, "language_loss": 0.74124235, "learning_rate": 1.04872722003689e-06, "loss": 0.76268035, "num_input_tokens_seen": 239952395, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 11111, "time_per_iteration": 2.5325820446014404 }, { "auxiliary_loss_clip": 0.01104301, "auxiliary_loss_mlp": 0.01027283, "balance_loss_clip": 1.01501703, "balance_loss_mlp": 1.03586388, "epoch": 0.6680895836464753, "flos": 21725309692800.0, "grad_norm": 2.118536933907767, "language_loss": 0.65256721, "learning_rate": 1.0483846512280553e-06, "loss": 0.67388308, "num_input_tokens_seen": 239968910, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 11112, "time_per_iteration": 2.488095998764038 }, { "auxiliary_loss_clip": 0.01107149, "auxiliary_loss_mlp": 0.01028966, "balance_loss_clip": 1.01613939, "balance_loss_mlp": 1.03693676, "epoch": 0.6681497068991432, "flos": 19646477043840.0, "grad_norm": 2.0581098298663223, "language_loss": 0.63075686, "learning_rate": 1.048042118504569e-06, "loss": 0.65211797, "num_input_tokens_seen": 239987680, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 11113, "time_per_iteration": 2.462076425552368 }, { "auxiliary_loss_clip": 0.01105341, "auxiliary_loss_mlp": 0.01029378, "balance_loss_clip": 1.01748705, "balance_loss_mlp": 1.03727043, "epoch": 0.6682098301518112, "flos": 17419153570560.0, "grad_norm": 2.1384296534395224, "language_loss": 0.65989065, "learning_rate": 1.047699621879422e-06, "loss": 0.68123782, "num_input_tokens_seen": 240005790, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 11114, "time_per_iteration": 2.5108909606933594 }, { "auxiliary_loss_clip": 0.01105001, "auxiliary_loss_mlp": 0.01034456, "balance_loss_clip": 1.02170086, "balance_loss_mlp": 1.03594375, "epoch": 0.6682699534044791, "flos": 22599016110720.0, "grad_norm": 1.5796228297642143, "language_loss": 0.78589958, "learning_rate": 1.0473571613655998e-06, "loss": 0.80729413, "num_input_tokens_seen": 240025895, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 11115, "time_per_iteration": 2.478928565979004 }, { "auxiliary_loss_clip": 0.01105588, "auxiliary_loss_mlp": 0.01029102, "balance_loss_clip": 1.01681113, "balance_loss_mlp": 1.03494453, "epoch": 0.6683300766571472, "flos": 24863686750080.0, "grad_norm": 1.8055512433788103, "language_loss": 0.79291606, "learning_rate": 1.0470147369760896e-06, "loss": 0.81426299, "num_input_tokens_seen": 240044880, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 11116, "time_per_iteration": 2.524583101272583 }, { "auxiliary_loss_clip": 0.01108839, "auxiliary_loss_mlp": 0.01035562, "balance_loss_clip": 1.02210975, "balance_loss_mlp": 1.03775394, "epoch": 0.6683901999098151, "flos": 27126633536640.0, "grad_norm": 5.210496786842362, "language_loss": 0.79632097, "learning_rate": 1.0466723487238768e-06, "loss": 0.817765, "num_input_tokens_seen": 240065785, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 11117, "time_per_iteration": 2.5254862308502197 }, { "auxiliary_loss_clip": 0.0110923, "auxiliary_loss_mlp": 0.01032734, "balance_loss_clip": 1.01807117, "balance_loss_mlp": 1.03801262, "epoch": 0.6684503231624831, "flos": 20739023072640.0, "grad_norm": 2.824147884452571, "language_loss": 0.65514088, "learning_rate": 1.0463299966219441e-06, "loss": 0.67656052, "num_input_tokens_seen": 240085130, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7109375, "step": 11118, "time_per_iteration": 2.4775328636169434 }, { "auxiliary_loss_clip": 0.01105399, "auxiliary_loss_mlp": 0.01028313, "balance_loss_clip": 1.01652277, "balance_loss_mlp": 1.03727317, "epoch": 0.668510446415151, "flos": 21762189982080.0, "grad_norm": 2.0181546350002892, "language_loss": 0.69047427, "learning_rate": 1.0459876806832727e-06, "loss": 0.71181142, "num_input_tokens_seen": 240105495, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 11119, "time_per_iteration": 2.490779399871826 }, { "auxiliary_loss_clip": 0.01107332, "auxiliary_loss_mlp": 0.01032982, "balance_loss_clip": 1.02050114, "balance_loss_mlp": 1.03763592, "epoch": 0.668570569667819, "flos": 30191250015360.0, "grad_norm": 1.7176718215654194, "language_loss": 0.67347586, "learning_rate": 1.0456454009208448e-06, "loss": 0.69487906, "num_input_tokens_seen": 240125455, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 11120, "time_per_iteration": 2.5669262409210205 }, { "auxiliary_loss_clip": 0.01107146, "auxiliary_loss_mlp": 0.01032078, "balance_loss_clip": 1.01858926, "balance_loss_mlp": 1.03723311, "epoch": 0.668630692920487, "flos": 24170646764160.0, "grad_norm": 1.7840599702019335, "language_loss": 0.72267997, "learning_rate": 1.045303157347638e-06, "loss": 0.7440722, "num_input_tokens_seen": 240143870, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.69921875, "step": 11121, "time_per_iteration": 2.504338026046753 }, { "auxiliary_loss_clip": 0.01107995, "auxiliary_loss_mlp": 0.01035874, "balance_loss_clip": 1.02277255, "balance_loss_mlp": 1.03651547, "epoch": 0.668690816173155, "flos": 17457147181440.0, "grad_norm": 2.969631657721932, "language_loss": 0.70621264, "learning_rate": 1.0449609499766316e-06, "loss": 0.7276513, "num_input_tokens_seen": 240161020, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 11122, "time_per_iteration": 2.451129674911499 }, { "auxiliary_loss_clip": 0.01105893, "auxiliary_loss_mlp": 0.01036675, "balance_loss_clip": 1.02380657, "balance_loss_mlp": 1.03657305, "epoch": 0.668750939425823, "flos": 25005102595200.0, "grad_norm": 1.831959698218794, "language_loss": 0.71814227, "learning_rate": 1.0446187788208015e-06, "loss": 0.739568, "num_input_tokens_seen": 240179820, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 11123, "time_per_iteration": 2.499751567840576 }, { "auxiliary_loss_clip": 0.01111565, "auxiliary_loss_mlp": 0.01032259, "balance_loss_clip": 1.01900339, "balance_loss_mlp": 1.03982997, "epoch": 0.6688110626784909, "flos": 24096778444800.0, "grad_norm": 2.8263332823732963, "language_loss": 0.7924937, "learning_rate": 1.0442766438931244e-06, "loss": 0.813932, "num_input_tokens_seen": 240200130, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 11124, "time_per_iteration": 2.528935432434082 }, { "auxiliary_loss_clip": 0.01107514, "auxiliary_loss_mlp": 0.01037685, "balance_loss_clip": 1.02509642, "balance_loss_mlp": 1.03827107, "epoch": 0.6688711859311589, "flos": 21759532375680.0, "grad_norm": 2.0909285047947157, "language_loss": 0.74293667, "learning_rate": 1.0439345452065716e-06, "loss": 0.76438868, "num_input_tokens_seen": 240217945, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 11125, "time_per_iteration": 2.469022750854492 }, { "auxiliary_loss_clip": 0.01109969, "auxiliary_loss_mlp": 0.01037036, "balance_loss_clip": 1.02460229, "balance_loss_mlp": 1.0394727, "epoch": 0.6689313091838268, "flos": 22929645824640.0, "grad_norm": 2.7843435277929385, "language_loss": 0.6676504, "learning_rate": 1.043592482774116e-06, "loss": 0.68912041, "num_input_tokens_seen": 240237220, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 11126, "time_per_iteration": 2.505650043487549 }, { "auxiliary_loss_clip": 0.01106205, "auxiliary_loss_mlp": 0.01027046, "balance_loss_clip": 1.01447535, "balance_loss_mlp": 1.03609622, "epoch": 0.6689914324364948, "flos": 20886149180160.0, "grad_norm": 1.732450093926321, "language_loss": 0.70850253, "learning_rate": 1.0432504566087305e-06, "loss": 0.72983503, "num_input_tokens_seen": 240256000, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 11127, "time_per_iteration": 2.5132710933685303 }, { "auxiliary_loss_clip": 0.01111183, "auxiliary_loss_mlp": 0.01034716, "balance_loss_clip": 1.02064323, "balance_loss_mlp": 1.03724217, "epoch": 0.6690515556891627, "flos": 22748225207040.0, "grad_norm": 2.4530968914382294, "language_loss": 0.8039906, "learning_rate": 1.0429084667233827e-06, "loss": 0.82544959, "num_input_tokens_seen": 240275845, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7421875, "step": 11128, "time_per_iteration": 2.5211281776428223 }, { "auxiliary_loss_clip": 0.01108118, "auxiliary_loss_mlp": 0.01028262, "balance_loss_clip": 1.01555395, "balance_loss_mlp": 1.03705442, "epoch": 0.6691116789418308, "flos": 23331450337920.0, "grad_norm": 12.768072774199117, "language_loss": 0.80856752, "learning_rate": 1.0425665131310427e-06, "loss": 0.82993138, "num_input_tokens_seen": 240294095, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 11129, "time_per_iteration": 3.929518938064575 }, { "auxiliary_loss_clip": 0.0110115, "auxiliary_loss_mlp": 0.01031424, "balance_loss_clip": 1.01992035, "balance_loss_mlp": 1.035007, "epoch": 0.6691718021944987, "flos": 32447014081920.0, "grad_norm": 3.520340568411051, "language_loss": 0.70420915, "learning_rate": 1.0422245958446762e-06, "loss": 0.72553492, "num_input_tokens_seen": 240313460, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66015625, "step": 11130, "time_per_iteration": 2.5938332080841064 }, { "auxiliary_loss_clip": 0.01104445, "auxiliary_loss_mlp": 0.01032505, "balance_loss_clip": 1.02107251, "balance_loss_mlp": 1.03734303, "epoch": 0.6692319254471667, "flos": 23731602825600.0, "grad_norm": 1.7542222964090748, "language_loss": 0.70243621, "learning_rate": 1.0418827148772486e-06, "loss": 0.72380567, "num_input_tokens_seen": 240333540, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.671875, "step": 11131, "time_per_iteration": 4.065020799636841 }, { "auxiliary_loss_clip": 0.01107415, "auxiliary_loss_mlp": 0.01030621, "balance_loss_clip": 1.01657867, "balance_loss_mlp": 1.03648293, "epoch": 0.6692920486998346, "flos": 14427902620800.0, "grad_norm": 2.212760705092777, "language_loss": 0.65518034, "learning_rate": 1.0415408702417243e-06, "loss": 0.67656064, "num_input_tokens_seen": 240350085, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.70703125, "step": 11132, "time_per_iteration": 3.8601136207580566 }, { "auxiliary_loss_clip": 0.01109186, "auxiliary_loss_mlp": 0.01033329, "balance_loss_clip": 1.01994741, "balance_loss_mlp": 1.03778553, "epoch": 0.6693521719525026, "flos": 21507475662720.0, "grad_norm": 1.5659178180540498, "language_loss": 0.74920368, "learning_rate": 1.0411990619510661e-06, "loss": 0.77062887, "num_input_tokens_seen": 240370015, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 11133, "time_per_iteration": 3.9657044410705566 }, { "auxiliary_loss_clip": 0.01113127, "auxiliary_loss_mlp": 0.01034779, "balance_loss_clip": 1.02064717, "balance_loss_mlp": 1.04012942, "epoch": 0.6694122952051706, "flos": 25406943022080.0, "grad_norm": 2.3110763643176235, "language_loss": 0.66605163, "learning_rate": 1.0408572900182363e-06, "loss": 0.6875307, "num_input_tokens_seen": 240390770, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.73046875, "step": 11134, "time_per_iteration": 2.5282070636749268 }, { "auxiliary_loss_clip": 0.01114835, "auxiliary_loss_mlp": 0.01037148, "balance_loss_clip": 1.02293229, "balance_loss_mlp": 1.0402025, "epoch": 0.6694724184578386, "flos": 25661729168640.0, "grad_norm": 3.1868670683869236, "language_loss": 0.77560413, "learning_rate": 1.0405155544561943e-06, "loss": 0.79712397, "num_input_tokens_seen": 240409590, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.74609375, "step": 11135, "time_per_iteration": 2.4972383975982666 }, { "auxiliary_loss_clip": 0.01104603, "auxiliary_loss_mlp": 0.01028075, "balance_loss_clip": 1.01536179, "balance_loss_mlp": 1.03695869, "epoch": 0.6695325417105066, "flos": 17709311635200.0, "grad_norm": 1.6328350342261015, "language_loss": 0.74242342, "learning_rate": 1.040173855277898e-06, "loss": 0.7637502, "num_input_tokens_seen": 240428180, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 11136, "time_per_iteration": 2.467092275619507 }, { "auxiliary_loss_clip": 0.01113726, "auxiliary_loss_mlp": 0.01036049, "balance_loss_clip": 1.02186358, "balance_loss_mlp": 1.0408479, "epoch": 0.6695926649631745, "flos": 24460050643200.0, "grad_norm": 2.3748331188683607, "language_loss": 0.62191129, "learning_rate": 1.0398321924963061e-06, "loss": 0.64340907, "num_input_tokens_seen": 240447815, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.73046875, "step": 11137, "time_per_iteration": 2.50019907951355 }, { "auxiliary_loss_clip": 0.01107128, "auxiliary_loss_mlp": 0.01028289, "balance_loss_clip": 1.01546168, "balance_loss_mlp": 1.03762162, "epoch": 0.6696527882158425, "flos": 24280138396800.0, "grad_norm": 2.2984154713192964, "language_loss": 0.6582042, "learning_rate": 1.0394905661243724e-06, "loss": 0.6795584, "num_input_tokens_seen": 240468635, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 11138, "time_per_iteration": 2.5055837631225586 }, { "auxiliary_loss_clip": 0.01103236, "auxiliary_loss_mlp": 0.0103145, "balance_loss_clip": 1.01940966, "balance_loss_mlp": 1.03584838, "epoch": 0.6697129114685104, "flos": 23002759958400.0, "grad_norm": 1.8367835277998967, "language_loss": 0.73039734, "learning_rate": 1.039148976175053e-06, "loss": 0.75174421, "num_input_tokens_seen": 240488550, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.67578125, "step": 11139, "time_per_iteration": 2.4913976192474365 }, { "auxiliary_loss_clip": 0.01104798, "auxiliary_loss_mlp": 0.01030451, "balance_loss_clip": 1.01920998, "balance_loss_mlp": 1.03787935, "epoch": 0.6697730347211784, "flos": 22638123043200.0, "grad_norm": 2.0159350061242844, "language_loss": 0.69952869, "learning_rate": 1.0388074226613016e-06, "loss": 0.72088116, "num_input_tokens_seen": 240508330, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.66796875, "step": 11140, "time_per_iteration": 2.501112461090088 }, { "auxiliary_loss_clip": 0.01107309, "auxiliary_loss_mlp": 0.01027806, "balance_loss_clip": 1.01425147, "balance_loss_mlp": 1.03560519, "epoch": 0.6698331579738463, "flos": 28877242682880.0, "grad_norm": 2.3892885115706566, "language_loss": 0.7589823, "learning_rate": 1.0384659055960691e-06, "loss": 0.78033346, "num_input_tokens_seen": 240528470, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71484375, "step": 11141, "time_per_iteration": 2.534273862838745 }, { "auxiliary_loss_clip": 0.01109253, "auxiliary_loss_mlp": 0.010355, "balance_loss_clip": 1.02215457, "balance_loss_mlp": 1.03763008, "epoch": 0.6698932812265144, "flos": 24207096090240.0, "grad_norm": 2.3395037652377275, "language_loss": 0.82466602, "learning_rate": 1.0381244249923052e-06, "loss": 0.84611356, "num_input_tokens_seen": 240547815, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 11142, "time_per_iteration": 2.5172672271728516 }, { "auxiliary_loss_clip": 0.01103939, "auxiliary_loss_mlp": 0.01028553, "balance_loss_clip": 1.01570797, "balance_loss_mlp": 1.03515339, "epoch": 0.6699534044791823, "flos": 22090269830400.0, "grad_norm": 1.67291664665868, "language_loss": 0.70302588, "learning_rate": 1.037782980862959e-06, "loss": 0.72435081, "num_input_tokens_seen": 240567765, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 11143, "time_per_iteration": 2.4809927940368652 }, { "auxiliary_loss_clip": 0.01103748, "auxiliary_loss_mlp": 0.0103159, "balance_loss_clip": 1.0197171, "balance_loss_mlp": 1.03656793, "epoch": 0.6700135277318503, "flos": 25192377129600.0, "grad_norm": 1.8237190145574542, "language_loss": 0.70291716, "learning_rate": 1.0374415732209796e-06, "loss": 0.72427058, "num_input_tokens_seen": 240590750, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 11144, "time_per_iteration": 2.5396203994750977 }, { "auxiliary_loss_clip": 0.01105855, "auxiliary_loss_mlp": 0.01030203, "balance_loss_clip": 1.01742339, "balance_loss_mlp": 1.03705311, "epoch": 0.6700736509845182, "flos": 23440187784960.0, "grad_norm": 1.6576467225342442, "language_loss": 0.74512255, "learning_rate": 1.0371002020793114e-06, "loss": 0.76648307, "num_input_tokens_seen": 240608875, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 11145, "time_per_iteration": 2.506235361099243 }, { "auxiliary_loss_clip": 0.01109438, "auxiliary_loss_mlp": 0.01026923, "balance_loss_clip": 1.01396537, "balance_loss_mlp": 1.03690076, "epoch": 0.6701337742371862, "flos": 24389953251840.0, "grad_norm": 1.5318883887718997, "language_loss": 0.7102778, "learning_rate": 1.0367588674509008e-06, "loss": 0.73164141, "num_input_tokens_seen": 240628565, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 11146, "time_per_iteration": 2.5330729484558105 }, { "auxiliary_loss_clip": 0.01102419, "auxiliary_loss_mlp": 0.01030882, "balance_loss_clip": 1.01841903, "balance_loss_mlp": 1.03609133, "epoch": 0.6701938974898543, "flos": 14793652857600.0, "grad_norm": 1.7954453838876667, "language_loss": 0.78088182, "learning_rate": 1.0364175693486905e-06, "loss": 0.80221486, "num_input_tokens_seen": 240646325, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6640625, "step": 11147, "time_per_iteration": 2.444033145904541 }, { "auxiliary_loss_clip": 0.01108868, "auxiliary_loss_mlp": 0.01034427, "balance_loss_clip": 1.02166581, "balance_loss_mlp": 1.03916919, "epoch": 0.6702540207425222, "flos": 20154002261760.0, "grad_norm": 1.7093328420994889, "language_loss": 0.6997906, "learning_rate": 1.0360763077856218e-06, "loss": 0.72122359, "num_input_tokens_seen": 240666145, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 11148, "time_per_iteration": 2.504119396209717 }, { "auxiliary_loss_clip": 0.01107283, "auxiliary_loss_mlp": 0.01033207, "balance_loss_clip": 1.02080309, "balance_loss_mlp": 1.03643346, "epoch": 0.6703141439951902, "flos": 21214157201280.0, "grad_norm": 1.750620052319715, "language_loss": 0.70258075, "learning_rate": 1.035735082774636e-06, "loss": 0.72398561, "num_input_tokens_seen": 240685570, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 11149, "time_per_iteration": 2.473633050918579 }, { "auxiliary_loss_clip": 0.01107552, "auxiliary_loss_mlp": 0.01030347, "balance_loss_clip": 1.01855707, "balance_loss_mlp": 1.03620791, "epoch": 0.6703742672478581, "flos": 23112538899840.0, "grad_norm": 1.7924023785611098, "language_loss": 0.73829645, "learning_rate": 1.0353938943286727e-06, "loss": 0.75967544, "num_input_tokens_seen": 240706945, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.7109375, "step": 11150, "time_per_iteration": 2.532252550125122 }, { "auxiliary_loss_clip": 0.0110836, "auxiliary_loss_mlp": 0.01030446, "balance_loss_clip": 1.01844144, "balance_loss_mlp": 1.03823185, "epoch": 0.6704343905005261, "flos": 22528918719360.0, "grad_norm": 1.8347886418493078, "language_loss": 0.78198266, "learning_rate": 1.035052742460671e-06, "loss": 0.80337071, "num_input_tokens_seen": 240727990, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 11151, "time_per_iteration": 2.5034401416778564 }, { "auxiliary_loss_clip": 0.01032511, "auxiliary_loss_mlp": 0.01002946, "balance_loss_clip": 1.00173593, "balance_loss_mlp": 1.00961804, "epoch": 0.670494513753194, "flos": 64793158773120.0, "grad_norm": 0.8019590663727099, "language_loss": 0.5550859, "learning_rate": 1.0347116271835643e-06, "loss": 0.57544047, "num_input_tokens_seen": 240790380, "router_z_loss_clip": 0.01208496, "router_z_loss_mlp": 0.22949219, "step": 11152, "time_per_iteration": 3.2407264709472656 }, { "auxiliary_loss_clip": 0.01107559, "auxiliary_loss_mlp": 0.01034012, "balance_loss_clip": 1.02110171, "balance_loss_mlp": 1.03670037, "epoch": 0.670554637005862, "flos": 23511506238720.0, "grad_norm": 1.669055250192932, "language_loss": 0.80894446, "learning_rate": 1.0343705485102896e-06, "loss": 0.83036017, "num_input_tokens_seen": 240811545, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 11153, "time_per_iteration": 2.522915840148926 }, { "auxiliary_loss_clip": 0.01107162, "auxiliary_loss_mlp": 0.01033363, "balance_loss_clip": 1.02124524, "balance_loss_mlp": 1.03671694, "epoch": 0.67061476025853, "flos": 19463404400640.0, "grad_norm": 1.644498413684819, "language_loss": 0.76030409, "learning_rate": 1.0340295064537814e-06, "loss": 0.78170931, "num_input_tokens_seen": 240831380, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 11154, "time_per_iteration": 2.505408525466919 }, { "auxiliary_loss_clip": 0.01113685, "auxiliary_loss_mlp": 0.01036222, "balance_loss_clip": 1.02344298, "balance_loss_mlp": 1.0399189, "epoch": 0.670674883511198, "flos": 20519967980160.0, "grad_norm": 1.4738377421713358, "language_loss": 0.75990897, "learning_rate": 1.0336885010269702e-06, "loss": 0.78140807, "num_input_tokens_seen": 240851855, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73828125, "step": 11155, "time_per_iteration": 2.4782090187072754 }, { "auxiliary_loss_clip": 0.01109733, "auxiliary_loss_mlp": 0.01036741, "balance_loss_clip": 1.02399731, "balance_loss_mlp": 1.03966033, "epoch": 0.6707350067638659, "flos": 25483971738240.0, "grad_norm": 1.9738478778623014, "language_loss": 0.82189167, "learning_rate": 1.0333475322427878e-06, "loss": 0.84335637, "num_input_tokens_seen": 240869980, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 11156, "time_per_iteration": 2.49833607673645 }, { "auxiliary_loss_clip": 0.01105816, "auxiliary_loss_mlp": 0.01030183, "balance_loss_clip": 1.01803017, "balance_loss_mlp": 1.0366981, "epoch": 0.6707951300165339, "flos": 22273450214400.0, "grad_norm": 2.4001029028400924, "language_loss": 0.74889207, "learning_rate": 1.033006600114165e-06, "loss": 0.77025211, "num_input_tokens_seen": 240888680, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69140625, "step": 11157, "time_per_iteration": 2.480508327484131 }, { "auxiliary_loss_clip": 0.01111103, "auxiliary_loss_mlp": 0.01036768, "balance_loss_clip": 1.02358305, "balance_loss_mlp": 1.03984034, "epoch": 0.6708552532692018, "flos": 23984593292160.0, "grad_norm": 1.8704740759161578, "language_loss": 0.74489397, "learning_rate": 1.0326657046540282e-06, "loss": 0.76637268, "num_input_tokens_seen": 240909050, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 11158, "time_per_iteration": 2.4905169010162354 }, { "auxiliary_loss_clip": 0.01110159, "auxiliary_loss_mlp": 0.01035779, "balance_loss_clip": 1.02266634, "balance_loss_mlp": 1.03803742, "epoch": 0.6709153765218698, "flos": 24937519155840.0, "grad_norm": 1.4550523431614384, "language_loss": 0.81759262, "learning_rate": 1.0323248458753044e-06, "loss": 0.83905196, "num_input_tokens_seen": 240930035, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 11159, "time_per_iteration": 2.5014193058013916 }, { "auxiliary_loss_clip": 0.01107189, "auxiliary_loss_mlp": 0.01031468, "balance_loss_clip": 1.01908171, "balance_loss_mlp": 1.03726411, "epoch": 0.6709754997745379, "flos": 17530225401600.0, "grad_norm": 2.7021410296074726, "language_loss": 0.76749492, "learning_rate": 1.0319840237909193e-06, "loss": 0.78888148, "num_input_tokens_seen": 240948895, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 11160, "time_per_iteration": 2.460641860961914 }, { "auxiliary_loss_clip": 0.01104567, "auxiliary_loss_mlp": 0.01025906, "balance_loss_clip": 1.01405072, "balance_loss_mlp": 1.03634691, "epoch": 0.6710356230272058, "flos": 22090880361600.0, "grad_norm": 1.873006071274597, "language_loss": 0.73644084, "learning_rate": 1.0316432384137978e-06, "loss": 0.75774562, "num_input_tokens_seen": 240967770, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 11161, "time_per_iteration": 2.4830782413482666 }, { "auxiliary_loss_clip": 0.0111118, "auxiliary_loss_mlp": 0.01035736, "balance_loss_clip": 1.02209282, "balance_loss_mlp": 1.03791034, "epoch": 0.6710957462798738, "flos": 24206449645440.0, "grad_norm": 1.792944870246261, "language_loss": 0.68477583, "learning_rate": 1.0313024897568618e-06, "loss": 0.70624501, "num_input_tokens_seen": 240988985, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 11162, "time_per_iteration": 2.5308618545532227 }, { "auxiliary_loss_clip": 0.01106001, "auxiliary_loss_mlp": 0.0103667, "balance_loss_clip": 1.02467752, "balance_loss_mlp": 1.03682303, "epoch": 0.6711558695325417, "flos": 19093955063040.0, "grad_norm": 1.996231325278758, "language_loss": 0.70250511, "learning_rate": 1.030961777833032e-06, "loss": 0.72393179, "num_input_tokens_seen": 241005455, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 11163, "time_per_iteration": 2.4452366828918457 }, { "auxiliary_loss_clip": 0.01106256, "auxiliary_loss_mlp": 0.01029726, "balance_loss_clip": 1.01769233, "balance_loss_mlp": 1.03853595, "epoch": 0.6712159927852097, "flos": 25557875971200.0, "grad_norm": 1.69350101519667, "language_loss": 0.75403976, "learning_rate": 1.0306211026552291e-06, "loss": 0.77539963, "num_input_tokens_seen": 241026175, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.67578125, "step": 11164, "time_per_iteration": 2.5194344520568848 }, { "auxiliary_loss_clip": 0.01107428, "auxiliary_loss_mlp": 0.0103342, "balance_loss_clip": 1.02078342, "balance_loss_mlp": 1.03778839, "epoch": 0.6712761160378776, "flos": 22228812587520.0, "grad_norm": 3.690494569130768, "language_loss": 0.65178502, "learning_rate": 1.0302804642363704e-06, "loss": 0.67319345, "num_input_tokens_seen": 241044040, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 11165, "time_per_iteration": 2.459359645843506 }, { "auxiliary_loss_clip": 0.01106008, "auxiliary_loss_mlp": 0.01029036, "balance_loss_clip": 1.01708508, "balance_loss_mlp": 1.03774107, "epoch": 0.6713362392905456, "flos": 22455517276800.0, "grad_norm": 2.246812859876882, "language_loss": 0.71693295, "learning_rate": 1.0299398625893738e-06, "loss": 0.73828334, "num_input_tokens_seen": 241063615, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 11166, "time_per_iteration": 2.519472122192383 }, { "auxiliary_loss_clip": 0.01105179, "auxiliary_loss_mlp": 0.01030376, "balance_loss_clip": 1.01889014, "balance_loss_mlp": 1.03741419, "epoch": 0.6713963625432136, "flos": 25630200005760.0, "grad_norm": 1.9510407869747175, "language_loss": 0.76651418, "learning_rate": 1.0295992977271546e-06, "loss": 0.78786969, "num_input_tokens_seen": 241082520, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6796875, "step": 11167, "time_per_iteration": 2.5103580951690674 }, { "auxiliary_loss_clip": 0.0110539, "auxiliary_loss_mlp": 0.01033923, "balance_loss_clip": 1.02150166, "balance_loss_mlp": 1.03531015, "epoch": 0.6714564857958816, "flos": 35006475640320.0, "grad_norm": 2.1499402991440144, "language_loss": 0.68766022, "learning_rate": 1.029258769662629e-06, "loss": 0.70905334, "num_input_tokens_seen": 241103505, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 11168, "time_per_iteration": 2.5936782360076904 }, { "auxiliary_loss_clip": 0.01110834, "auxiliary_loss_mlp": 0.01038859, "balance_loss_clip": 1.02515006, "balance_loss_mlp": 1.03814209, "epoch": 0.6715166090485495, "flos": 26279931168000.0, "grad_norm": 2.28997414567093, "language_loss": 0.73248017, "learning_rate": 1.0289182784087068e-06, "loss": 0.75397706, "num_input_tokens_seen": 241122885, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 11169, "time_per_iteration": 2.5021510124206543 }, { "auxiliary_loss_clip": 0.01109128, "auxiliary_loss_mlp": 0.01034884, "balance_loss_clip": 1.02115154, "balance_loss_mlp": 1.03745508, "epoch": 0.6715767323012175, "flos": 15924156583680.0, "grad_norm": 2.2859496805243236, "language_loss": 0.75942421, "learning_rate": 1.0285778239783005e-06, "loss": 0.78086436, "num_input_tokens_seen": 241140865, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 11170, "time_per_iteration": 2.4556586742401123 }, { "auxiliary_loss_clip": 0.01108648, "auxiliary_loss_mlp": 0.01027353, "balance_loss_clip": 1.01443648, "balance_loss_mlp": 1.03638566, "epoch": 0.6716368555538854, "flos": 17491441691520.0, "grad_norm": 2.1776182583509605, "language_loss": 0.74501663, "learning_rate": 1.0282374063843212e-06, "loss": 0.76637661, "num_input_tokens_seen": 241158225, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.72265625, "step": 11171, "time_per_iteration": 3.927302360534668 }, { "auxiliary_loss_clip": 0.01109523, "auxiliary_loss_mlp": 0.01037062, "balance_loss_clip": 1.02397835, "balance_loss_mlp": 1.03750837, "epoch": 0.6716969788065534, "flos": 16761521416320.0, "grad_norm": 1.6448245152116814, "language_loss": 0.86686987, "learning_rate": 1.0278970256396762e-06, "loss": 0.8883357, "num_input_tokens_seen": 241175215, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 11172, "time_per_iteration": 3.931138277053833 }, { "auxiliary_loss_clip": 0.01105575, "auxiliary_loss_mlp": 0.01033204, "balance_loss_clip": 1.02012062, "balance_loss_mlp": 1.03549659, "epoch": 0.6717571020592215, "flos": 22709800632960.0, "grad_norm": 1.718785336307734, "language_loss": 0.63662314, "learning_rate": 1.0275566817572733e-06, "loss": 0.65801096, "num_input_tokens_seen": 241195250, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 11173, "time_per_iteration": 2.4978184700012207 }, { "auxiliary_loss_clip": 0.01114357, "auxiliary_loss_mlp": 0.01036417, "balance_loss_clip": 1.02235079, "balance_loss_mlp": 1.03881192, "epoch": 0.6718172253118894, "flos": 18734094656640.0, "grad_norm": 2.673548875688645, "language_loss": 0.71891487, "learning_rate": 1.02721637475002e-06, "loss": 0.74042261, "num_input_tokens_seen": 241210720, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75390625, "step": 11174, "time_per_iteration": 3.8572685718536377 }, { "auxiliary_loss_clip": 0.01103482, "auxiliary_loss_mlp": 0.0103147, "balance_loss_clip": 1.018911, "balance_loss_mlp": 1.03537512, "epoch": 0.6718773485645574, "flos": 15632526061440.0, "grad_norm": 3.5249249144251693, "language_loss": 0.69011521, "learning_rate": 1.0268761046308178e-06, "loss": 0.71146476, "num_input_tokens_seen": 241227395, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 11175, "time_per_iteration": 3.939481258392334 }, { "auxiliary_loss_clip": 0.01106697, "auxiliary_loss_mlp": 0.01037365, "balance_loss_clip": 1.02520609, "balance_loss_mlp": 1.03796875, "epoch": 0.6719374718172253, "flos": 19354774694400.0, "grad_norm": 1.9643481764651398, "language_loss": 0.73819745, "learning_rate": 1.0265358714125714e-06, "loss": 0.75963807, "num_input_tokens_seen": 241246355, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 11176, "time_per_iteration": 2.4779720306396484 }, { "auxiliary_loss_clip": 0.01107362, "auxiliary_loss_mlp": 0.01032542, "balance_loss_clip": 1.01893401, "balance_loss_mlp": 1.03588295, "epoch": 0.6719975950698933, "flos": 21981316901760.0, "grad_norm": 2.20178978511626, "language_loss": 0.72784948, "learning_rate": 1.026195675108182e-06, "loss": 0.7492485, "num_input_tokens_seen": 241264180, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71484375, "step": 11177, "time_per_iteration": 2.4799065589904785 }, { "auxiliary_loss_clip": 0.01106893, "auxiliary_loss_mlp": 0.01033006, "balance_loss_clip": 1.01928473, "balance_loss_mlp": 1.03589535, "epoch": 0.6720577183225612, "flos": 25228072270080.0, "grad_norm": 2.249101625574139, "language_loss": 0.76189822, "learning_rate": 1.025855515730551e-06, "loss": 0.78329718, "num_input_tokens_seen": 241282245, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7109375, "step": 11178, "time_per_iteration": 2.49354887008667 }, { "auxiliary_loss_clip": 0.01108762, "auxiliary_loss_mlp": 0.01031964, "balance_loss_clip": 1.02009678, "balance_loss_mlp": 1.03778648, "epoch": 0.6721178415752292, "flos": 16945886949120.0, "grad_norm": 9.219029259352489, "language_loss": 0.7032578, "learning_rate": 1.0255153932925766e-06, "loss": 0.72466505, "num_input_tokens_seen": 241300745, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.7109375, "step": 11179, "time_per_iteration": 2.455116033554077 }, { "auxiliary_loss_clip": 0.01105343, "auxiliary_loss_mlp": 0.01029394, "balance_loss_clip": 1.01733637, "balance_loss_mlp": 1.03689909, "epoch": 0.6721779648278972, "flos": 21541375123200.0, "grad_norm": 1.8538742815978615, "language_loss": 0.74105, "learning_rate": 1.0251753078071557e-06, "loss": 0.76239741, "num_input_tokens_seen": 241319320, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.68359375, "step": 11180, "time_per_iteration": 2.488527297973633 }, { "auxiliary_loss_clip": 0.01106566, "auxiliary_loss_mlp": 0.0102847, "balance_loss_clip": 1.01656699, "balance_loss_mlp": 1.03731585, "epoch": 0.6722380880805652, "flos": 22605444645120.0, "grad_norm": 1.4083861013459196, "language_loss": 0.75086582, "learning_rate": 1.0248352592871848e-06, "loss": 0.77221614, "num_input_tokens_seen": 241342225, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 11181, "time_per_iteration": 2.548858880996704 }, { "auxiliary_loss_clip": 0.01107657, "auxiliary_loss_mlp": 0.01027074, "balance_loss_clip": 1.015576, "balance_loss_mlp": 1.03600454, "epoch": 0.6722982113332331, "flos": 15925269905280.0, "grad_norm": 2.27741956002802, "language_loss": 0.74602497, "learning_rate": 1.0244952477455585e-06, "loss": 0.76737225, "num_input_tokens_seen": 241358240, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.71875, "step": 11182, "time_per_iteration": 2.446963310241699 }, { "auxiliary_loss_clip": 0.01103578, "auxiliary_loss_mlp": 0.01033199, "balance_loss_clip": 1.02106333, "balance_loss_mlp": 1.03574336, "epoch": 0.6723583345859011, "flos": 20596170683520.0, "grad_norm": 1.8489701343611773, "language_loss": 0.69840264, "learning_rate": 1.0241552731951699e-06, "loss": 0.71977031, "num_input_tokens_seen": 241378420, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 11183, "time_per_iteration": 2.5043418407440186 }, { "auxiliary_loss_clip": 0.01107727, "auxiliary_loss_mlp": 0.0103058, "balance_loss_clip": 1.01733029, "balance_loss_mlp": 1.03658605, "epoch": 0.672418457838569, "flos": 21725848396800.0, "grad_norm": 1.6185881780908584, "language_loss": 0.78025377, "learning_rate": 1.0238153356489112e-06, "loss": 0.80163682, "num_input_tokens_seen": 241397185, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 11184, "time_per_iteration": 2.5080652236938477 }, { "auxiliary_loss_clip": 0.01116804, "auxiliary_loss_mlp": 0.01034111, "balance_loss_clip": 1.01983607, "balance_loss_mlp": 1.04060841, "epoch": 0.672478581091237, "flos": 21470379891840.0, "grad_norm": 2.0603115283026985, "language_loss": 0.66113949, "learning_rate": 1.0234754351196743e-06, "loss": 0.68264866, "num_input_tokens_seen": 241415785, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.76171875, "step": 11185, "time_per_iteration": 2.48827862739563 }, { "auxiliary_loss_clip": 0.01105065, "auxiliary_loss_mlp": 0.01032438, "balance_loss_clip": 1.01955795, "balance_loss_mlp": 1.03485978, "epoch": 0.6725387043439051, "flos": 30846763267200.0, "grad_norm": 1.6913465870217568, "language_loss": 0.80253679, "learning_rate": 1.023135571620345e-06, "loss": 0.82391179, "num_input_tokens_seen": 241437390, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 11186, "time_per_iteration": 2.556504726409912 }, { "auxiliary_loss_clip": 0.01105483, "auxiliary_loss_mlp": 0.01035869, "balance_loss_clip": 1.02412736, "balance_loss_mlp": 1.03771639, "epoch": 0.672598827596573, "flos": 24055947659520.0, "grad_norm": 1.5834990233275004, "language_loss": 0.80382186, "learning_rate": 1.022795745163813e-06, "loss": 0.82523537, "num_input_tokens_seen": 241458085, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6796875, "step": 11187, "time_per_iteration": 2.5407962799072266 }, { "auxiliary_loss_clip": 0.01114015, "auxiliary_loss_mlp": 0.01035343, "balance_loss_clip": 1.02119279, "balance_loss_mlp": 1.04023087, "epoch": 0.672658950849241, "flos": 21871861182720.0, "grad_norm": 11.134680353983487, "language_loss": 0.70826787, "learning_rate": 1.022455955762965e-06, "loss": 0.72976148, "num_input_tokens_seen": 241476880, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.73828125, "step": 11188, "time_per_iteration": 2.502082586288452 }, { "auxiliary_loss_clip": 0.0110491, "auxiliary_loss_mlp": 0.01033522, "balance_loss_clip": 1.02110028, "balance_loss_mlp": 1.03718913, "epoch": 0.6727190741019089, "flos": 23222102359680.0, "grad_norm": 1.7805194553661834, "language_loss": 0.75381058, "learning_rate": 1.0221162034306842e-06, "loss": 0.77519488, "num_input_tokens_seen": 241496535, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.67578125, "step": 11189, "time_per_iteration": 2.518986701965332 }, { "auxiliary_loss_clip": 0.01110613, "auxiliary_loss_mlp": 0.01031006, "balance_loss_clip": 1.01692724, "balance_loss_mlp": 1.03704369, "epoch": 0.6727791973545769, "flos": 15778610674560.0, "grad_norm": 2.491409731806071, "language_loss": 0.75337851, "learning_rate": 1.0217764881798562e-06, "loss": 0.7747947, "num_input_tokens_seen": 241513465, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.734375, "step": 11190, "time_per_iteration": 2.451683282852173 }, { "auxiliary_loss_clip": 0.01106402, "auxiliary_loss_mlp": 0.01032569, "balance_loss_clip": 1.01950336, "balance_loss_mlp": 1.03649259, "epoch": 0.6728393206072448, "flos": 21249852341760.0, "grad_norm": 1.4610574336906652, "language_loss": 0.76957488, "learning_rate": 1.0214368100233612e-06, "loss": 0.79096454, "num_input_tokens_seen": 241534125, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 11191, "time_per_iteration": 2.4983670711517334 }, { "auxiliary_loss_clip": 0.01106435, "auxiliary_loss_mlp": 0.01031703, "balance_loss_clip": 1.01895976, "balance_loss_mlp": 1.03759861, "epoch": 0.6728994438599128, "flos": 32123279779200.0, "grad_norm": 1.7984120187380328, "language_loss": 0.86626589, "learning_rate": 1.0210971689740802e-06, "loss": 0.88764727, "num_input_tokens_seen": 241556340, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 11192, "time_per_iteration": 2.5673885345458984 }, { "auxiliary_loss_clip": 0.01109336, "auxiliary_loss_mlp": 0.01040718, "balance_loss_clip": 1.02700925, "balance_loss_mlp": 1.03820515, "epoch": 0.6729595671125808, "flos": 23112359331840.0, "grad_norm": 2.1906629384458247, "language_loss": 0.75817621, "learning_rate": 1.0207575650448923e-06, "loss": 0.7796768, "num_input_tokens_seen": 241575185, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7109375, "step": 11193, "time_per_iteration": 2.511418342590332 }, { "auxiliary_loss_clip": 0.01107723, "auxiliary_loss_mlp": 0.0103296, "balance_loss_clip": 1.01997781, "balance_loss_mlp": 1.03765583, "epoch": 0.6730196903652488, "flos": 14611406227200.0, "grad_norm": 2.07737532850052, "language_loss": 0.78804302, "learning_rate": 1.0204179982486758e-06, "loss": 0.80944985, "num_input_tokens_seen": 241592970, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 11194, "time_per_iteration": 2.4536545276641846 }, { "auxiliary_loss_clip": 0.0110736, "auxiliary_loss_mlp": 0.0102842, "balance_loss_clip": 1.01658225, "balance_loss_mlp": 1.03572273, "epoch": 0.6730798136179167, "flos": 21105922544640.0, "grad_norm": 2.1254719548273098, "language_loss": 0.90119171, "learning_rate": 1.0200784685983075e-06, "loss": 0.92254949, "num_input_tokens_seen": 241610245, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.71875, "step": 11195, "time_per_iteration": 2.512122869491577 }, { "auxiliary_loss_clip": 0.01106418, "auxiliary_loss_mlp": 0.01032858, "balance_loss_clip": 1.02036524, "balance_loss_mlp": 1.03642511, "epoch": 0.6731399368705847, "flos": 28986267438720.0, "grad_norm": 2.3633114118130836, "language_loss": 0.72419918, "learning_rate": 1.019738976106662e-06, "loss": 0.74559194, "num_input_tokens_seen": 241630350, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 11196, "time_per_iteration": 2.5541954040527344 }, { "auxiliary_loss_clip": 0.01032507, "auxiliary_loss_mlp": 0.01000331, "balance_loss_clip": 0.99909705, "balance_loss_mlp": 1.0093832, "epoch": 0.6732000601232526, "flos": 64743708723840.0, "grad_norm": 0.7803086104742867, "language_loss": 0.56738538, "learning_rate": 1.0193995207866123e-06, "loss": 0.58771378, "num_input_tokens_seen": 241692380, "router_z_loss_clip": 0.0123291, "router_z_loss_mlp": 0.23144531, "step": 11197, "time_per_iteration": 3.0391812324523926 }, { "auxiliary_loss_clip": 0.01107073, "auxiliary_loss_mlp": 0.01029607, "balance_loss_clip": 1.0177753, "balance_loss_mlp": 1.03943813, "epoch": 0.6732601833759206, "flos": 17201642762880.0, "grad_norm": 2.619500887344701, "language_loss": 0.75724465, "learning_rate": 1.0190601026510312e-06, "loss": 0.77861142, "num_input_tokens_seen": 241710430, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.67578125, "step": 11198, "time_per_iteration": 2.4701125621795654 }, { "auxiliary_loss_clip": 0.01107651, "auxiliary_loss_mlp": 0.01030484, "balance_loss_clip": 1.01748466, "balance_loss_mlp": 1.03625345, "epoch": 0.6733203066285887, "flos": 18658861620480.0, "grad_norm": 2.049420416040986, "language_loss": 0.81923956, "learning_rate": 1.0187207217127892e-06, "loss": 0.84062088, "num_input_tokens_seen": 241724775, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 11199, "time_per_iteration": 2.463623046875 }, { "auxiliary_loss_clip": 0.01108689, "auxiliary_loss_mlp": 0.01032495, "balance_loss_clip": 1.01922154, "balance_loss_mlp": 1.03701138, "epoch": 0.6733804298812566, "flos": 35809330481280.0, "grad_norm": 1.7699266943932601, "language_loss": 0.71278012, "learning_rate": 1.0183813779847552e-06, "loss": 0.73419201, "num_input_tokens_seen": 241744440, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 11200, "time_per_iteration": 2.6149840354919434 }, { "auxiliary_loss_clip": 0.01111045, "auxiliary_loss_mlp": 0.010358, "balance_loss_clip": 1.02268076, "balance_loss_mlp": 1.03987336, "epoch": 0.6734405531339246, "flos": 61638833099520.0, "grad_norm": 1.9930047588487219, "language_loss": 0.64362621, "learning_rate": 1.0180420714797987e-06, "loss": 0.66509473, "num_input_tokens_seen": 241771705, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 11201, "time_per_iteration": 2.8690435886383057 }, { "auxiliary_loss_clip": 0.01109249, "auxiliary_loss_mlp": 0.01034965, "balance_loss_clip": 1.02178001, "balance_loss_mlp": 1.0364573, "epoch": 0.6735006763865925, "flos": 20522338277760.0, "grad_norm": 1.829463256722492, "language_loss": 0.63712704, "learning_rate": 1.0177028022107856e-06, "loss": 0.65856922, "num_input_tokens_seen": 241790830, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 11202, "time_per_iteration": 2.4935238361358643 }, { "auxiliary_loss_clip": 0.01107211, "auxiliary_loss_mlp": 0.010268, "balance_loss_clip": 1.0147835, "balance_loss_mlp": 1.03662276, "epoch": 0.6735607996392605, "flos": 13918869031680.0, "grad_norm": 1.7458471566531866, "language_loss": 0.74787867, "learning_rate": 1.0173635701905796e-06, "loss": 0.7692188, "num_input_tokens_seen": 241808165, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.70703125, "step": 11203, "time_per_iteration": 2.461855888366699 }, { "auxiliary_loss_clip": 0.01112754, "auxiliary_loss_mlp": 0.01031156, "balance_loss_clip": 1.01644564, "balance_loss_mlp": 1.038499, "epoch": 0.6736209228919284, "flos": 18807244704000.0, "grad_norm": 2.1235247071711805, "language_loss": 0.67809319, "learning_rate": 1.0170243754320456e-06, "loss": 0.69953233, "num_input_tokens_seen": 241826925, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7421875, "step": 11204, "time_per_iteration": 2.4853532314300537 }, { "auxiliary_loss_clip": 0.01113542, "auxiliary_loss_mlp": 0.01030642, "balance_loss_clip": 1.01715398, "balance_loss_mlp": 1.03999424, "epoch": 0.6736810461445965, "flos": 20373129181440.0, "grad_norm": 1.6671046946292503, "language_loss": 0.73912907, "learning_rate": 1.0166852179480465e-06, "loss": 0.76057094, "num_input_tokens_seen": 241845525, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 11205, "time_per_iteration": 2.4999985694885254 }, { "auxiliary_loss_clip": 0.01103245, "auxiliary_loss_mlp": 0.01030592, "balance_loss_clip": 1.01817584, "balance_loss_mlp": 1.03503036, "epoch": 0.6737411693972644, "flos": 30007530927360.0, "grad_norm": 2.7814147098947917, "language_loss": 0.71496928, "learning_rate": 1.0163460977514416e-06, "loss": 0.73630762, "num_input_tokens_seen": 241866815, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 11206, "time_per_iteration": 2.5566763877868652 }, { "auxiliary_loss_clip": 0.01115483, "auxiliary_loss_mlp": 0.01036862, "balance_loss_clip": 1.02332544, "balance_loss_mlp": 1.0400188, "epoch": 0.6738012926499324, "flos": 25447342844160.0, "grad_norm": 2.0775689359155884, "language_loss": 0.67621368, "learning_rate": 1.016007014855092e-06, "loss": 0.6977371, "num_input_tokens_seen": 241887050, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.75390625, "step": 11207, "time_per_iteration": 2.52756404876709 }, { "auxiliary_loss_clip": 0.01105706, "auxiliary_loss_mlp": 0.01033468, "balance_loss_clip": 1.02141643, "balance_loss_mlp": 1.03791189, "epoch": 0.6738614159026003, "flos": 20776873029120.0, "grad_norm": 2.378824706950663, "language_loss": 0.7391144, "learning_rate": 1.0156679692718553e-06, "loss": 0.76050621, "num_input_tokens_seen": 241904280, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 11208, "time_per_iteration": 2.4783082008361816 }, { "auxiliary_loss_clip": 0.01105011, "auxiliary_loss_mlp": 0.01033948, "balance_loss_clip": 1.01978636, "balance_loss_mlp": 1.03444338, "epoch": 0.6739215391552683, "flos": 19566898462080.0, "grad_norm": 1.9933426164429375, "language_loss": 0.75387633, "learning_rate": 1.0153289610145867e-06, "loss": 0.77526593, "num_input_tokens_seen": 241919190, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.70703125, "step": 11209, "time_per_iteration": 2.4699959754943848 }, { "auxiliary_loss_clip": 0.01101061, "auxiliary_loss_mlp": 0.01030307, "balance_loss_clip": 1.01869631, "balance_loss_mlp": 1.03433228, "epoch": 0.6739816624079362, "flos": 24388193485440.0, "grad_norm": 1.6758077104768934, "language_loss": 0.66421449, "learning_rate": 1.0149899900961428e-06, "loss": 0.68552816, "num_input_tokens_seen": 241940525, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66796875, "step": 11210, "time_per_iteration": 2.505439281463623 }, { "auxiliary_loss_clip": 0.01101838, "auxiliary_loss_mlp": 0.01032353, "balance_loss_clip": 1.02099812, "balance_loss_mlp": 1.03473496, "epoch": 0.6740417856606042, "flos": 22528164533760.0, "grad_norm": 2.1160967869489284, "language_loss": 0.79779339, "learning_rate": 1.014651056529377e-06, "loss": 0.81913525, "num_input_tokens_seen": 241959290, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 11211, "time_per_iteration": 2.4899239540100098 }, { "auxiliary_loss_clip": 0.01104019, "auxiliary_loss_mlp": 0.01033123, "balance_loss_clip": 1.0203433, "balance_loss_mlp": 1.03629839, "epoch": 0.6741019089132723, "flos": 25775458606080.0, "grad_norm": 1.4072883188421867, "language_loss": 0.76769042, "learning_rate": 1.014312160327143e-06, "loss": 0.78906178, "num_input_tokens_seen": 241980715, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.67578125, "step": 11212, "time_per_iteration": 3.932854175567627 }, { "auxiliary_loss_clip": 0.01106777, "auxiliary_loss_mlp": 0.01028895, "balance_loss_clip": 1.01619875, "balance_loss_mlp": 1.03581345, "epoch": 0.6741620321659402, "flos": 21105671149440.0, "grad_norm": 1.7371841709584877, "language_loss": 0.77804774, "learning_rate": 1.0139733015022905e-06, "loss": 0.79940444, "num_input_tokens_seen": 241999985, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 11213, "time_per_iteration": 2.4843270778656006 }, { "auxiliary_loss_clip": 0.01109487, "auxiliary_loss_mlp": 0.01031621, "balance_loss_clip": 1.01849639, "balance_loss_mlp": 1.03738821, "epoch": 0.6742221554186082, "flos": 20740423703040.0, "grad_norm": 1.779871632538324, "language_loss": 0.67660415, "learning_rate": 1.0136344800676685e-06, "loss": 0.69801521, "num_input_tokens_seen": 242018990, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 11214, "time_per_iteration": 3.943329095840454 }, { "auxiliary_loss_clip": 0.01106103, "auxiliary_loss_mlp": 0.01033062, "balance_loss_clip": 1.02043772, "balance_loss_mlp": 1.03604317, "epoch": 0.6742822786712761, "flos": 37774146384000.0, "grad_norm": 2.4543968993305927, "language_loss": 0.72577852, "learning_rate": 1.0132956960361263e-06, "loss": 0.74717015, "num_input_tokens_seen": 242039340, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 11215, "time_per_iteration": 2.6019439697265625 }, { "auxiliary_loss_clip": 0.01107723, "auxiliary_loss_mlp": 0.01031335, "balance_loss_clip": 1.01937246, "balance_loss_mlp": 1.03630292, "epoch": 0.6743424019239441, "flos": 37263891732480.0, "grad_norm": 5.467049431767433, "language_loss": 0.66989553, "learning_rate": 1.0129569494205096e-06, "loss": 0.69128609, "num_input_tokens_seen": 242062215, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.71484375, "step": 11216, "time_per_iteration": 5.446830749511719 }, { "auxiliary_loss_clip": 0.01031989, "auxiliary_loss_mlp": 0.01000147, "balance_loss_clip": 0.99889582, "balance_loss_mlp": 1.0088141, "epoch": 0.674402525176612, "flos": 65997746300160.0, "grad_norm": 0.6820017371303746, "language_loss": 0.56322348, "learning_rate": 1.0126182402336646e-06, "loss": 0.58354485, "num_input_tokens_seen": 242131130, "router_z_loss_clip": 0.01251221, "router_z_loss_mlp": 0.23242188, "step": 11217, "time_per_iteration": 3.212021589279175 }, { "auxiliary_loss_clip": 0.01104448, "auxiliary_loss_mlp": 0.01030385, "balance_loss_clip": 1.01755834, "balance_loss_mlp": 1.03578985, "epoch": 0.67446264842928, "flos": 26461208131200.0, "grad_norm": 2.139530879025732, "language_loss": 0.74541366, "learning_rate": 1.0122795684884363e-06, "loss": 0.76676202, "num_input_tokens_seen": 242149720, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 11218, "time_per_iteration": 2.5419299602508545 }, { "auxiliary_loss_clip": 0.01107612, "auxiliary_loss_mlp": 0.0104307, "balance_loss_clip": 1.0282042, "balance_loss_mlp": 1.03679752, "epoch": 0.674522771681948, "flos": 23732392924800.0, "grad_norm": 1.8461738358063335, "language_loss": 0.65755016, "learning_rate": 1.0119409341976639e-06, "loss": 0.679057, "num_input_tokens_seen": 242168875, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.70703125, "step": 11219, "time_per_iteration": 2.576225996017456 }, { "auxiliary_loss_clip": 0.01108781, "auxiliary_loss_mlp": 0.01036225, "balance_loss_clip": 1.02355313, "balance_loss_mlp": 1.03667998, "epoch": 0.674582894934616, "flos": 24754338771840.0, "grad_norm": 2.102954230334043, "language_loss": 0.74722183, "learning_rate": 1.0116023373741904e-06, "loss": 0.76867187, "num_input_tokens_seen": 242188465, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 11220, "time_per_iteration": 2.571817398071289 }, { "auxiliary_loss_clip": 0.01107182, "auxiliary_loss_mlp": 0.0103173, "balance_loss_clip": 1.01880789, "balance_loss_mlp": 1.03613508, "epoch": 0.6746430181872839, "flos": 24826626892800.0, "grad_norm": 1.6735880891627657, "language_loss": 0.70387745, "learning_rate": 1.0112637780308554e-06, "loss": 0.72526658, "num_input_tokens_seen": 242208675, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 11221, "time_per_iteration": 2.6797821521759033 }, { "auxiliary_loss_clip": 0.01105809, "auxiliary_loss_mlp": 0.0103022, "balance_loss_clip": 1.01851964, "balance_loss_mlp": 1.03664863, "epoch": 0.6747031414399519, "flos": 16873491087360.0, "grad_norm": 1.8810157983188966, "language_loss": 0.5807879, "learning_rate": 1.010925256180498e-06, "loss": 0.60214818, "num_input_tokens_seen": 242227440, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69140625, "step": 11222, "time_per_iteration": 2.482290744781494 }, { "auxiliary_loss_clip": 0.01106543, "auxiliary_loss_mlp": 0.01034535, "balance_loss_clip": 1.02169657, "balance_loss_mlp": 1.03634965, "epoch": 0.6747632646926198, "flos": 22784925928320.0, "grad_norm": 2.135122561816687, "language_loss": 0.77028441, "learning_rate": 1.0105867718359528e-06, "loss": 0.79169524, "num_input_tokens_seen": 242245240, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 11223, "time_per_iteration": 2.494016647338867 }, { "auxiliary_loss_clip": 0.01108663, "auxiliary_loss_mlp": 0.01033451, "balance_loss_clip": 1.02056456, "balance_loss_mlp": 1.03749633, "epoch": 0.6748233879452878, "flos": 20046090827520.0, "grad_norm": 3.262042074894253, "language_loss": 0.75375724, "learning_rate": 1.0102483250100574e-06, "loss": 0.77517843, "num_input_tokens_seen": 242263435, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 11224, "time_per_iteration": 2.513310194015503 }, { "auxiliary_loss_clip": 0.01102786, "auxiliary_loss_mlp": 0.01028955, "balance_loss_clip": 1.01816607, "balance_loss_mlp": 1.03590286, "epoch": 0.6748835111979558, "flos": 23002831785600.0, "grad_norm": 1.6201400607050023, "language_loss": 0.63209224, "learning_rate": 1.0099099157156445e-06, "loss": 0.65340966, "num_input_tokens_seen": 242282765, "router_z_loss_clip": 0.10791016, "router_z_loss_mlp": 0.66796875, "step": 11225, "time_per_iteration": 2.506288766860962 }, { "auxiliary_loss_clip": 0.01102747, "auxiliary_loss_mlp": 0.01029814, "balance_loss_clip": 1.01836991, "balance_loss_mlp": 1.0362246, "epoch": 0.6749436344506238, "flos": 12197311009920.0, "grad_norm": 1.905725286694616, "language_loss": 0.64141786, "learning_rate": 1.0095715439655462e-06, "loss": 0.66274345, "num_input_tokens_seen": 242298980, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6640625, "step": 11226, "time_per_iteration": 2.47607684135437 }, { "auxiliary_loss_clip": 0.01108499, "auxiliary_loss_mlp": 0.01029359, "balance_loss_clip": 1.01694369, "balance_loss_mlp": 1.03827572, "epoch": 0.6750037577032918, "flos": 11873720361600.0, "grad_norm": 10.222782113085993, "language_loss": 0.72121918, "learning_rate": 1.0092332097725945e-06, "loss": 0.74259776, "num_input_tokens_seen": 242315420, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 11227, "time_per_iteration": 2.4590940475463867 }, { "auxiliary_loss_clip": 0.01104419, "auxiliary_loss_mlp": 0.01030358, "balance_loss_clip": 1.01750755, "balance_loss_mlp": 1.03563643, "epoch": 0.6750638809559597, "flos": 17019611614080.0, "grad_norm": 2.5020486694960082, "language_loss": 0.71566647, "learning_rate": 1.0088949131496183e-06, "loss": 0.73701423, "num_input_tokens_seen": 242332805, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 11228, "time_per_iteration": 2.48179030418396 }, { "auxiliary_loss_clip": 0.01031156, "auxiliary_loss_mlp": 0.01000095, "balance_loss_clip": 0.99884337, "balance_loss_mlp": 1.00818074, "epoch": 0.6751240042086277, "flos": 70951011891840.0, "grad_norm": 0.7576265307354519, "language_loss": 0.5324806, "learning_rate": 1.0085566541094482e-06, "loss": 0.55279303, "num_input_tokens_seen": 242396160, "router_z_loss_clip": 0.01251221, "router_z_loss_mlp": 0.23046875, "step": 11229, "time_per_iteration": 3.1435372829437256 }, { "auxiliary_loss_clip": 0.01103697, "auxiliary_loss_mlp": 0.01029325, "balance_loss_clip": 1.0169332, "balance_loss_mlp": 1.0347954, "epoch": 0.6751841274612956, "flos": 22675146986880.0, "grad_norm": 1.6966875441754417, "language_loss": 0.79984629, "learning_rate": 1.0082184326649072e-06, "loss": 0.82117653, "num_input_tokens_seen": 242414660, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 11230, "time_per_iteration": 2.516610622406006 }, { "auxiliary_loss_clip": 0.0110354, "auxiliary_loss_mlp": 0.01029939, "balance_loss_clip": 1.01876903, "balance_loss_mlp": 1.03601313, "epoch": 0.6752442507139637, "flos": 21288636051840.0, "grad_norm": 1.68311691134965, "language_loss": 0.65645885, "learning_rate": 1.0078802488288228e-06, "loss": 0.67779368, "num_input_tokens_seen": 242434225, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.67578125, "step": 11231, "time_per_iteration": 2.490896463394165 }, { "auxiliary_loss_clip": 0.01113073, "auxiliary_loss_mlp": 0.01036425, "balance_loss_clip": 1.02180958, "balance_loss_mlp": 1.0391891, "epoch": 0.6753043739666316, "flos": 28256921781120.0, "grad_norm": 1.8406639092097539, "language_loss": 0.66496491, "learning_rate": 1.0075421026140198e-06, "loss": 0.68645984, "num_input_tokens_seen": 242454355, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.73828125, "step": 11232, "time_per_iteration": 2.5514533519744873 }, { "auxiliary_loss_clip": 0.01103226, "auxiliary_loss_mlp": 0.01027759, "balance_loss_clip": 1.01552796, "balance_loss_mlp": 1.03498912, "epoch": 0.6753644972192996, "flos": 21360349555200.0, "grad_norm": 2.11413009785757, "language_loss": 0.7253027, "learning_rate": 1.0072039940333188e-06, "loss": 0.74661261, "num_input_tokens_seen": 242474935, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 11233, "time_per_iteration": 2.5031919479370117 }, { "auxiliary_loss_clip": 0.01106998, "auxiliary_loss_mlp": 0.01034142, "balance_loss_clip": 1.02084446, "balance_loss_mlp": 1.03669262, "epoch": 0.6754246204719675, "flos": 26541971861760.0, "grad_norm": 1.584374152979816, "language_loss": 0.76642132, "learning_rate": 1.0068659230995418e-06, "loss": 0.78783268, "num_input_tokens_seen": 242495530, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 11234, "time_per_iteration": 2.5680525302886963 }, { "auxiliary_loss_clip": 0.01106564, "auxiliary_loss_mlp": 0.01031158, "balance_loss_clip": 1.01811647, "balance_loss_mlp": 1.03739643, "epoch": 0.6754847437246355, "flos": 25556690822400.0, "grad_norm": 1.5927111205937532, "language_loss": 0.75382948, "learning_rate": 1.0065278898255101e-06, "loss": 0.77520669, "num_input_tokens_seen": 242514550, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69140625, "step": 11235, "time_per_iteration": 2.5300021171569824 }, { "auxiliary_loss_clip": 0.01031611, "auxiliary_loss_mlp": 0.01000265, "balance_loss_clip": 0.99894828, "balance_loss_mlp": 1.00866485, "epoch": 0.6755448669773034, "flos": 59513318726400.0, "grad_norm": 0.7794925790304612, "language_loss": 0.51322091, "learning_rate": 1.0061898942240387e-06, "loss": 0.53353965, "num_input_tokens_seen": 242569200, "router_z_loss_clip": 0.01318359, "router_z_loss_mlp": 0.23046875, "step": 11236, "time_per_iteration": 3.087106227874756 }, { "auxiliary_loss_clip": 0.01106997, "auxiliary_loss_mlp": 0.01030047, "balance_loss_clip": 1.01568222, "balance_loss_mlp": 1.03748691, "epoch": 0.6756049902299714, "flos": 23294534135040.0, "grad_norm": 2.238348927899092, "language_loss": 0.75681126, "learning_rate": 1.0058519363079464e-06, "loss": 0.77818167, "num_input_tokens_seen": 242586950, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.6953125, "step": 11237, "time_per_iteration": 2.491596221923828 }, { "auxiliary_loss_clip": 0.01106756, "auxiliary_loss_mlp": 0.01035901, "balance_loss_clip": 1.02365196, "balance_loss_mlp": 1.03784406, "epoch": 0.6756651134826394, "flos": 31575426566400.0, "grad_norm": 2.126775756525287, "language_loss": 0.77511942, "learning_rate": 1.0055140160900482e-06, "loss": 0.79654598, "num_input_tokens_seen": 242607380, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 11238, "time_per_iteration": 2.578688144683838 }, { "auxiliary_loss_clip": 0.01110413, "auxiliary_loss_mlp": 0.01036345, "balance_loss_clip": 1.02285647, "balance_loss_mlp": 1.03691113, "epoch": 0.6757252367353074, "flos": 27272287186560.0, "grad_norm": 2.1196989641933293, "language_loss": 0.66598296, "learning_rate": 1.0051761335831587e-06, "loss": 0.68745053, "num_input_tokens_seen": 242628025, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 11239, "time_per_iteration": 2.5037033557891846 }, { "auxiliary_loss_clip": 0.0110614, "auxiliary_loss_mlp": 0.01027506, "balance_loss_clip": 1.0152154, "balance_loss_mlp": 1.0375874, "epoch": 0.6757853599879754, "flos": 16830900535680.0, "grad_norm": 3.997501947127396, "language_loss": 0.83021903, "learning_rate": 1.0048382888000898e-06, "loss": 0.85155547, "num_input_tokens_seen": 242643825, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 11240, "time_per_iteration": 2.448331594467163 }, { "auxiliary_loss_clip": 0.01114836, "auxiliary_loss_mlp": 0.01033432, "balance_loss_clip": 1.01791656, "balance_loss_mlp": 1.03953743, "epoch": 0.6758454832406433, "flos": 23220055284480.0, "grad_norm": 2.079852139790936, "language_loss": 0.74148774, "learning_rate": 1.0045004817536525e-06, "loss": 0.76297045, "num_input_tokens_seen": 242661820, "router_z_loss_clip": 0.15527344, "router_z_loss_mlp": 0.75390625, "step": 11241, "time_per_iteration": 2.4776222705841064 }, { "auxiliary_loss_clip": 0.01108909, "auxiliary_loss_mlp": 0.01034012, "balance_loss_clip": 1.02086878, "balance_loss_mlp": 1.0383532, "epoch": 0.6759056064933113, "flos": 16289547684480.0, "grad_norm": 2.5920704281649662, "language_loss": 0.8033911, "learning_rate": 1.0041627124566572e-06, "loss": 0.82482028, "num_input_tokens_seen": 242679890, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 11242, "time_per_iteration": 2.453878402709961 }, { "auxiliary_loss_clip": 0.01106047, "auxiliary_loss_mlp": 0.01032105, "balance_loss_clip": 1.01959395, "balance_loss_mlp": 1.03598714, "epoch": 0.6759657297459792, "flos": 25922297404800.0, "grad_norm": 2.3511107250696015, "language_loss": 0.72972345, "learning_rate": 1.0038249809219109e-06, "loss": 0.75110507, "num_input_tokens_seen": 242699495, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 11243, "time_per_iteration": 2.5111629962921143 }, { "auxiliary_loss_clip": 0.01105328, "auxiliary_loss_mlp": 0.01034866, "balance_loss_clip": 1.02289748, "balance_loss_mlp": 1.03601646, "epoch": 0.6760258529986473, "flos": 23000820624000.0, "grad_norm": 1.6050711475045614, "language_loss": 0.72968352, "learning_rate": 1.003487287162221e-06, "loss": 0.75108546, "num_input_tokens_seen": 242719500, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.69140625, "step": 11244, "time_per_iteration": 2.5161867141723633 }, { "auxiliary_loss_clip": 0.01110907, "auxiliary_loss_mlp": 0.01042076, "balance_loss_clip": 1.02871907, "balance_loss_mlp": 1.03882778, "epoch": 0.6760859762513152, "flos": 20959335141120.0, "grad_norm": 1.979770319824069, "language_loss": 0.85858583, "learning_rate": 1.003149631190393e-06, "loss": 0.88011569, "num_input_tokens_seen": 242738325, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.72265625, "step": 11245, "time_per_iteration": 2.485445499420166 }, { "auxiliary_loss_clip": 0.01113038, "auxiliary_loss_mlp": 0.01030639, "balance_loss_clip": 1.01718032, "balance_loss_mlp": 1.03906584, "epoch": 0.6761460995039832, "flos": 23622937205760.0, "grad_norm": 3.3377286673369624, "language_loss": 0.73352605, "learning_rate": 1.0028120130192327e-06, "loss": 0.7549628, "num_input_tokens_seen": 242756620, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 11246, "time_per_iteration": 2.5174400806427 }, { "auxiliary_loss_clip": 0.01106018, "auxiliary_loss_mlp": 0.01029099, "balance_loss_clip": 1.01634336, "balance_loss_mlp": 1.0356853, "epoch": 0.6762062227566511, "flos": 20770875457920.0, "grad_norm": 2.2717840094175448, "language_loss": 0.88205647, "learning_rate": 1.002474432661539e-06, "loss": 0.90340769, "num_input_tokens_seen": 242774505, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 11247, "time_per_iteration": 2.445093870162964 }, { "auxiliary_loss_clip": 0.01032559, "auxiliary_loss_mlp": 0.01003361, "balance_loss_clip": 1.00216329, "balance_loss_mlp": 1.0095613, "epoch": 0.6762663460093191, "flos": 52818099166080.0, "grad_norm": 0.8325116639652579, "language_loss": 0.54093868, "learning_rate": 1.002136890130115e-06, "loss": 0.56129789, "num_input_tokens_seen": 242828645, "router_z_loss_clip": 0.01196289, "router_z_loss_mlp": 0.23046875, "step": 11248, "time_per_iteration": 3.0910041332244873 }, { "auxiliary_loss_clip": 0.01103596, "auxiliary_loss_mlp": 0.01028769, "balance_loss_clip": 1.01681268, "balance_loss_mlp": 1.03728759, "epoch": 0.676326469261987, "flos": 23696302734720.0, "grad_norm": 2.46218960470515, "language_loss": 0.7371549, "learning_rate": 1.001799385437761e-06, "loss": 0.75847852, "num_input_tokens_seen": 242850100, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6640625, "step": 11249, "time_per_iteration": 2.495955228805542 }, { "auxiliary_loss_clip": 0.01107595, "auxiliary_loss_mlp": 0.01040473, "balance_loss_clip": 1.02669215, "balance_loss_mlp": 1.03543973, "epoch": 0.676386592514655, "flos": 14063732582400.0, "grad_norm": 2.521318284533348, "language_loss": 0.73949337, "learning_rate": 1.0014619185972732e-06, "loss": 0.76097411, "num_input_tokens_seen": 242867775, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71875, "step": 11250, "time_per_iteration": 2.4274630546569824 }, { "auxiliary_loss_clip": 0.01109734, "auxiliary_loss_mlp": 0.01029624, "balance_loss_clip": 1.01666594, "balance_loss_mlp": 1.0379492, "epoch": 0.676446715767323, "flos": 20412236113920.0, "grad_norm": 2.336536276118632, "language_loss": 0.75054634, "learning_rate": 1.0011244896214497e-06, "loss": 0.77193993, "num_input_tokens_seen": 242886865, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 11251, "time_per_iteration": 2.4613425731658936 }, { "auxiliary_loss_clip": 0.01106461, "auxiliary_loss_mlp": 0.01031747, "balance_loss_clip": 1.01911712, "balance_loss_mlp": 1.03803873, "epoch": 0.676506839019991, "flos": 21288241002240.0, "grad_norm": 1.659132409524101, "language_loss": 0.70187509, "learning_rate": 1.0007870985230873e-06, "loss": 0.72325712, "num_input_tokens_seen": 242906705, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 11252, "time_per_iteration": 2.4923176765441895 }, { "auxiliary_loss_clip": 0.01108785, "auxiliary_loss_mlp": 0.01029645, "balance_loss_clip": 1.01709199, "balance_loss_mlp": 1.03851199, "epoch": 0.676566962272659, "flos": 29932477459200.0, "grad_norm": 1.7499814166360832, "language_loss": 0.66951132, "learning_rate": 1.0004497453149765e-06, "loss": 0.69089556, "num_input_tokens_seen": 242925215, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 11253, "time_per_iteration": 3.9206161499023438 }, { "auxiliary_loss_clip": 0.01110741, "auxiliary_loss_mlp": 0.01033864, "balance_loss_clip": 1.01991034, "balance_loss_mlp": 1.03850532, "epoch": 0.6766270855253269, "flos": 17931203902080.0, "grad_norm": 1.6154297898707968, "language_loss": 0.76897967, "learning_rate": 1.0001124300099115e-06, "loss": 0.79042578, "num_input_tokens_seen": 242944750, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.72265625, "step": 11254, "time_per_iteration": 2.4939041137695312 }, { "auxiliary_loss_clip": 0.01106039, "auxiliary_loss_mlp": 0.01034631, "balance_loss_clip": 1.02153039, "balance_loss_mlp": 1.03507221, "epoch": 0.6766872087779949, "flos": 23104853389440.0, "grad_norm": 2.1740711006419025, "language_loss": 0.72321278, "learning_rate": 9.997751526206835e-07, "loss": 0.74461949, "num_input_tokens_seen": 242963860, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 11255, "time_per_iteration": 2.4832921028137207 }, { "auxiliary_loss_clip": 0.01106418, "auxiliary_loss_mlp": 0.01037386, "balance_loss_clip": 1.02427924, "balance_loss_mlp": 1.03534293, "epoch": 0.6767473320306628, "flos": 26213137827840.0, "grad_norm": 2.0837327459678145, "language_loss": 0.75098407, "learning_rate": 9.994379131600828e-07, "loss": 0.77242208, "num_input_tokens_seen": 242983050, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 11256, "time_per_iteration": 3.851762294769287 }, { "auxiliary_loss_clip": 0.0110991, "auxiliary_loss_mlp": 0.01032852, "balance_loss_clip": 1.01995993, "balance_loss_mlp": 1.03818536, "epoch": 0.6768074552833309, "flos": 18368739469440.0, "grad_norm": 2.677158230543233, "language_loss": 0.66215122, "learning_rate": 9.991007116408965e-07, "loss": 0.68357885, "num_input_tokens_seen": 243001125, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 11257, "time_per_iteration": 3.8712568283081055 }, { "auxiliary_loss_clip": 0.0110315, "auxiliary_loss_mlp": 0.0103253, "balance_loss_clip": 1.02018023, "balance_loss_mlp": 1.03529096, "epoch": 0.6768675785359988, "flos": 23039927556480.0, "grad_norm": 1.9966635211972614, "language_loss": 0.75684047, "learning_rate": 9.987635480759109e-07, "loss": 0.77819729, "num_input_tokens_seen": 243021865, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 11258, "time_per_iteration": 3.9953978061676025 }, { "auxiliary_loss_clip": 0.01105089, "auxiliary_loss_mlp": 0.01032003, "balance_loss_clip": 1.01984918, "balance_loss_mlp": 1.03751004, "epoch": 0.6769277017886668, "flos": 33036524092800.0, "grad_norm": 1.7022558083077006, "language_loss": 0.6655966, "learning_rate": 9.984264224779127e-07, "loss": 0.68696749, "num_input_tokens_seen": 243042970, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.67578125, "step": 11259, "time_per_iteration": 2.5860884189605713 }, { "auxiliary_loss_clip": 0.01106885, "auxiliary_loss_mlp": 0.01033904, "balance_loss_clip": 1.02117276, "balance_loss_mlp": 1.03648365, "epoch": 0.6769878250413347, "flos": 20848406964480.0, "grad_norm": 2.325105180121351, "language_loss": 0.85756558, "learning_rate": 9.980893348596839e-07, "loss": 0.87897348, "num_input_tokens_seen": 243058470, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 11260, "time_per_iteration": 2.486344337463379 }, { "auxiliary_loss_clip": 0.01108535, "auxiliary_loss_mlp": 0.01034701, "balance_loss_clip": 1.02116454, "balance_loss_mlp": 1.03594089, "epoch": 0.6770479482940027, "flos": 15595968994560.0, "grad_norm": 2.5101240154870013, "language_loss": 0.77453589, "learning_rate": 9.977522852340081e-07, "loss": 0.79596829, "num_input_tokens_seen": 243076630, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 11261, "time_per_iteration": 2.492295980453491 }, { "auxiliary_loss_clip": 0.01105521, "auxiliary_loss_mlp": 0.01035114, "balance_loss_clip": 1.02237058, "balance_loss_mlp": 1.03482938, "epoch": 0.6771080715466706, "flos": 18621011664000.0, "grad_norm": 2.1912213174774564, "language_loss": 0.88162202, "learning_rate": 9.97415273613666e-07, "loss": 0.90302837, "num_input_tokens_seen": 243092260, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 11262, "time_per_iteration": 2.485959768295288 }, { "auxiliary_loss_clip": 0.01110275, "auxiliary_loss_mlp": 0.01032361, "balance_loss_clip": 1.01887822, "balance_loss_mlp": 1.03822839, "epoch": 0.6771681947993387, "flos": 12495441893760.0, "grad_norm": 2.0572366026467446, "language_loss": 0.74409908, "learning_rate": 9.97078300011439e-07, "loss": 0.76552546, "num_input_tokens_seen": 243109405, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 11263, "time_per_iteration": 2.4855144023895264 }, { "auxiliary_loss_clip": 0.01110164, "auxiliary_loss_mlp": 0.01035665, "balance_loss_clip": 1.02131176, "balance_loss_mlp": 1.0369904, "epoch": 0.6772283180520066, "flos": 22236964974720.0, "grad_norm": 3.3707162018594525, "language_loss": 0.68615574, "learning_rate": 9.967413644401016e-07, "loss": 0.707614, "num_input_tokens_seen": 243128135, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.73046875, "step": 11264, "time_per_iteration": 2.533845901489258 }, { "auxiliary_loss_clip": 0.01109386, "auxiliary_loss_mlp": 0.01034005, "balance_loss_clip": 1.02080846, "balance_loss_mlp": 1.03916883, "epoch": 0.6772884413046746, "flos": 16143139848960.0, "grad_norm": 2.2953281181778737, "language_loss": 0.73103356, "learning_rate": 9.964044669124324e-07, "loss": 0.75246751, "num_input_tokens_seen": 243146785, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 11265, "time_per_iteration": 2.490309000015259 }, { "auxiliary_loss_clip": 0.0110536, "auxiliary_loss_mlp": 0.01029353, "balance_loss_clip": 1.01712191, "balance_loss_mlp": 1.03639686, "epoch": 0.6773485645573426, "flos": 19135755515520.0, "grad_norm": 3.8766752666553796, "language_loss": 0.61234176, "learning_rate": 9.96067607441207e-07, "loss": 0.63368887, "num_input_tokens_seen": 243165275, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 11266, "time_per_iteration": 2.4693305492401123 }, { "auxiliary_loss_clip": 0.011091, "auxiliary_loss_mlp": 0.01035198, "balance_loss_clip": 1.02205491, "balance_loss_mlp": 1.03819442, "epoch": 0.6774086878100105, "flos": 14136918543360.0, "grad_norm": 2.513159113952113, "language_loss": 0.70262682, "learning_rate": 9.957307860391976e-07, "loss": 0.72406983, "num_input_tokens_seen": 243182845, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 11267, "time_per_iteration": 2.489597797393799 }, { "auxiliary_loss_clip": 0.01106104, "auxiliary_loss_mlp": 0.01034799, "balance_loss_clip": 1.02211499, "balance_loss_mlp": 1.03597987, "epoch": 0.6774688110626785, "flos": 22197067943040.0, "grad_norm": 2.1181696415916864, "language_loss": 0.7104162, "learning_rate": 9.953940027191785e-07, "loss": 0.73182523, "num_input_tokens_seen": 243201475, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 11268, "time_per_iteration": 2.512035846710205 }, { "auxiliary_loss_clip": 0.01111199, "auxiliary_loss_mlp": 0.0103286, "balance_loss_clip": 1.0195446, "balance_loss_mlp": 1.04069448, "epoch": 0.6775289343153464, "flos": 23039963470080.0, "grad_norm": 1.5549557805489282, "language_loss": 0.76915848, "learning_rate": 9.950572574939194e-07, "loss": 0.79059911, "num_input_tokens_seen": 243221850, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 11269, "time_per_iteration": 2.5382328033447266 }, { "auxiliary_loss_clip": 0.01110603, "auxiliary_loss_mlp": 0.01040679, "balance_loss_clip": 1.0270654, "balance_loss_mlp": 1.03751445, "epoch": 0.6775890575680145, "flos": 18293506433280.0, "grad_norm": 2.639910909725484, "language_loss": 0.74267566, "learning_rate": 9.94720550376189e-07, "loss": 0.76418847, "num_input_tokens_seen": 243239855, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 11270, "time_per_iteration": 2.4742109775543213 }, { "auxiliary_loss_clip": 0.01110047, "auxiliary_loss_mlp": 0.01035922, "balance_loss_clip": 1.02283919, "balance_loss_mlp": 1.0396235, "epoch": 0.6776491808206824, "flos": 25336450581120.0, "grad_norm": 2.231540471347116, "language_loss": 0.73108268, "learning_rate": 9.94383881378756e-07, "loss": 0.75254238, "num_input_tokens_seen": 243260085, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 11271, "time_per_iteration": 2.659428358078003 }, { "auxiliary_loss_clip": 0.0110883, "auxiliary_loss_mlp": 0.01035551, "balance_loss_clip": 1.02333164, "balance_loss_mlp": 1.03868723, "epoch": 0.6777093040733504, "flos": 26028233591040.0, "grad_norm": 2.1645742572798676, "language_loss": 0.67491651, "learning_rate": 9.94047250514387e-07, "loss": 0.69636035, "num_input_tokens_seen": 243280065, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 11272, "time_per_iteration": 2.528419256210327 }, { "auxiliary_loss_clip": 0.01111613, "auxiliary_loss_mlp": 0.01035201, "balance_loss_clip": 1.02069306, "balance_loss_mlp": 1.03840709, "epoch": 0.6777694273260183, "flos": 18003599763840.0, "grad_norm": 1.822319980750794, "language_loss": 0.74163198, "learning_rate": 9.937106577958481e-07, "loss": 0.76310015, "num_input_tokens_seen": 243297775, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.734375, "step": 11273, "time_per_iteration": 2.481768846511841 }, { "auxiliary_loss_clip": 0.01105606, "auxiliary_loss_mlp": 0.01038686, "balance_loss_clip": 1.02556133, "balance_loss_mlp": 1.03718269, "epoch": 0.6778295505786863, "flos": 23441085624960.0, "grad_norm": 2.3410086096453915, "language_loss": 0.69931, "learning_rate": 9.933741032359015e-07, "loss": 0.72075295, "num_input_tokens_seen": 243315760, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.68359375, "step": 11274, "time_per_iteration": 2.499032974243164 }, { "auxiliary_loss_clip": 0.01109883, "auxiliary_loss_mlp": 0.01033123, "balance_loss_clip": 1.01998055, "balance_loss_mlp": 1.038638, "epoch": 0.6778896738313542, "flos": 19098408349440.0, "grad_norm": 1.7216324468553508, "language_loss": 0.65696067, "learning_rate": 9.930375868473093e-07, "loss": 0.67839068, "num_input_tokens_seen": 243335715, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 11275, "time_per_iteration": 2.5201220512390137 }, { "auxiliary_loss_clip": 0.01109735, "auxiliary_loss_mlp": 0.01032357, "balance_loss_clip": 1.0203526, "balance_loss_mlp": 1.03964901, "epoch": 0.6779497970840223, "flos": 26103933504000.0, "grad_norm": 1.5455711759281696, "language_loss": 0.72705817, "learning_rate": 9.927011086428335e-07, "loss": 0.74847913, "num_input_tokens_seen": 243356935, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 11276, "time_per_iteration": 2.5430259704589844 }, { "auxiliary_loss_clip": 0.01108195, "auxiliary_loss_mlp": 0.01031845, "balance_loss_clip": 1.01828444, "balance_loss_mlp": 1.03891432, "epoch": 0.6780099203366902, "flos": 19719232041600.0, "grad_norm": 2.1722830724768296, "language_loss": 0.77124059, "learning_rate": 9.923646686352317e-07, "loss": 0.79264104, "num_input_tokens_seen": 243375625, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.69140625, "step": 11277, "time_per_iteration": 2.5087802410125732 }, { "auxiliary_loss_clip": 0.0111021, "auxiliary_loss_mlp": 0.01029927, "balance_loss_clip": 1.01740372, "balance_loss_mlp": 1.03835881, "epoch": 0.6780700435893582, "flos": 18214538382720.0, "grad_norm": 2.893174137116648, "language_loss": 0.83586669, "learning_rate": 9.920282668372627e-07, "loss": 0.8572681, "num_input_tokens_seen": 243390195, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 11278, "time_per_iteration": 2.4755544662475586 }, { "auxiliary_loss_clip": 0.01106532, "auxiliary_loss_mlp": 0.01032096, "balance_loss_clip": 1.02022886, "balance_loss_mlp": 1.03820753, "epoch": 0.6781301668420262, "flos": 25376239872000.0, "grad_norm": 1.9627599251822683, "language_loss": 0.70466971, "learning_rate": 9.916919032616844e-07, "loss": 0.72605598, "num_input_tokens_seen": 243411690, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.68359375, "step": 11279, "time_per_iteration": 2.5955452919006348 }, { "auxiliary_loss_clip": 0.01108817, "auxiliary_loss_mlp": 0.01034197, "balance_loss_clip": 1.02058935, "balance_loss_mlp": 1.03791308, "epoch": 0.6781902900946941, "flos": 24020432087040.0, "grad_norm": 1.8460389629644276, "language_loss": 0.74337518, "learning_rate": 9.913555779212485e-07, "loss": 0.76480532, "num_input_tokens_seen": 243430280, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.70703125, "step": 11280, "time_per_iteration": 2.646674156188965 }, { "auxiliary_loss_clip": 0.011114, "auxiliary_loss_mlp": 0.01030782, "balance_loss_clip": 1.01739514, "balance_loss_mlp": 1.0380317, "epoch": 0.6782504133473621, "flos": 19646764352640.0, "grad_norm": 4.941099768528501, "language_loss": 0.70227504, "learning_rate": 9.910192908287104e-07, "loss": 0.72369683, "num_input_tokens_seen": 243448690, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 11281, "time_per_iteration": 2.518293857574463 }, { "auxiliary_loss_clip": 0.01106995, "auxiliary_loss_mlp": 0.01032422, "balance_loss_clip": 1.02029896, "balance_loss_mlp": 1.03942382, "epoch": 0.67831053660003, "flos": 24932742647040.0, "grad_norm": 2.2390917744403542, "language_loss": 0.63645732, "learning_rate": 9.906830419968217e-07, "loss": 0.65785158, "num_input_tokens_seen": 243470695, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.67578125, "step": 11282, "time_per_iteration": 2.6233091354370117 }, { "auxiliary_loss_clip": 0.01114291, "auxiliary_loss_mlp": 0.01037673, "balance_loss_clip": 1.02337348, "balance_loss_mlp": 1.04052854, "epoch": 0.6783706598526981, "flos": 31208383440000.0, "grad_norm": 2.704020973084398, "language_loss": 0.74652803, "learning_rate": 9.90346831438334e-07, "loss": 0.76804769, "num_input_tokens_seen": 243493345, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.734375, "step": 11283, "time_per_iteration": 2.6442441940307617 }, { "auxiliary_loss_clip": 0.01106776, "auxiliary_loss_mlp": 0.01030172, "balance_loss_clip": 1.01763701, "balance_loss_mlp": 1.03754604, "epoch": 0.678430783105366, "flos": 35441317687680.0, "grad_norm": 1.5883648334596814, "language_loss": 0.56753474, "learning_rate": 9.900106591659948e-07, "loss": 0.5889042, "num_input_tokens_seen": 243515670, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 11284, "time_per_iteration": 2.6611545085906982 }, { "auxiliary_loss_clip": 0.01107426, "auxiliary_loss_mlp": 0.01029009, "balance_loss_clip": 1.01628375, "balance_loss_mlp": 1.03684509, "epoch": 0.678490906358034, "flos": 14428800460800.0, "grad_norm": 2.271404951434019, "language_loss": 0.75381857, "learning_rate": 9.896745251925535e-07, "loss": 0.77518296, "num_input_tokens_seen": 243533625, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 11285, "time_per_iteration": 2.540663719177246 }, { "auxiliary_loss_clip": 0.01108166, "auxiliary_loss_mlp": 0.01029265, "balance_loss_clip": 1.01653337, "balance_loss_mlp": 1.03948617, "epoch": 0.6785510296107019, "flos": 24311236596480.0, "grad_norm": 1.797518320800618, "language_loss": 0.6643039, "learning_rate": 9.893384295307557e-07, "loss": 0.68567824, "num_input_tokens_seen": 243553040, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 11286, "time_per_iteration": 2.552558660507202 }, { "auxiliary_loss_clip": 0.01108425, "auxiliary_loss_mlp": 0.01027641, "balance_loss_clip": 1.01467657, "balance_loss_mlp": 1.03628373, "epoch": 0.6786111528633699, "flos": 26977244872320.0, "grad_norm": 2.772065206225887, "language_loss": 0.52773142, "learning_rate": 9.890023721933447e-07, "loss": 0.54909205, "num_input_tokens_seen": 243572590, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 11287, "time_per_iteration": 2.588517904281616 }, { "auxiliary_loss_clip": 0.01109167, "auxiliary_loss_mlp": 0.01034637, "balance_loss_clip": 1.0220964, "balance_loss_mlp": 1.03902745, "epoch": 0.6786712761160378, "flos": 24317557390080.0, "grad_norm": 1.4739961759675757, "language_loss": 0.77362037, "learning_rate": 9.886663531930655e-07, "loss": 0.79505837, "num_input_tokens_seen": 243594140, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 11288, "time_per_iteration": 2.6067616939544678 }, { "auxiliary_loss_clip": 0.01113623, "auxiliary_loss_mlp": 0.01034842, "balance_loss_clip": 1.02192616, "balance_loss_mlp": 1.04142916, "epoch": 0.6787313993687059, "flos": 22930435923840.0, "grad_norm": 1.9635139477067138, "language_loss": 0.73575467, "learning_rate": 9.883303725426593e-07, "loss": 0.75723928, "num_input_tokens_seen": 243615170, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.72265625, "step": 11289, "time_per_iteration": 2.586158275604248 }, { "auxiliary_loss_clip": 0.01109236, "auxiliary_loss_mlp": 0.01034062, "balance_loss_clip": 1.02073979, "balance_loss_mlp": 1.03780532, "epoch": 0.6787915226213738, "flos": 26868435598080.0, "grad_norm": 1.5516498741511124, "language_loss": 0.79920554, "learning_rate": 9.879944302548682e-07, "loss": 0.82063854, "num_input_tokens_seen": 243635675, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 11290, "time_per_iteration": 2.5768144130706787 }, { "auxiliary_loss_clip": 0.01107671, "auxiliary_loss_mlp": 0.01032974, "balance_loss_clip": 1.02022481, "balance_loss_mlp": 1.03969026, "epoch": 0.6788516458740418, "flos": 20008851402240.0, "grad_norm": 1.5370250734535056, "language_loss": 0.74960089, "learning_rate": 9.87658526342428e-07, "loss": 0.7710073, "num_input_tokens_seen": 243654950, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6796875, "step": 11291, "time_per_iteration": 2.5334761142730713 }, { "auxiliary_loss_clip": 0.01110861, "auxiliary_loss_mlp": 0.01035117, "balance_loss_clip": 1.02187252, "balance_loss_mlp": 1.03879023, "epoch": 0.6789117691267098, "flos": 28727099832960.0, "grad_norm": 1.85890201955672, "language_loss": 0.75797415, "learning_rate": 9.873226608180785e-07, "loss": 0.77943391, "num_input_tokens_seen": 243674970, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 11292, "time_per_iteration": 2.6241934299468994 }, { "auxiliary_loss_clip": 0.01109334, "auxiliary_loss_mlp": 0.01032452, "balance_loss_clip": 1.01885593, "balance_loss_mlp": 1.03861094, "epoch": 0.6789718923793777, "flos": 23403451150080.0, "grad_norm": 1.922345628454067, "language_loss": 0.84568644, "learning_rate": 9.869868336945556e-07, "loss": 0.86710429, "num_input_tokens_seen": 243693440, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.70703125, "step": 11293, "time_per_iteration": 2.598207712173462 }, { "auxiliary_loss_clip": 0.01116883, "auxiliary_loss_mlp": 0.01037903, "balance_loss_clip": 1.02339554, "balance_loss_mlp": 1.0419389, "epoch": 0.6790320156320457, "flos": 20448865008000.0, "grad_norm": 3.4223287913170877, "language_loss": 0.78951603, "learning_rate": 9.866510449845929e-07, "loss": 0.81106389, "num_input_tokens_seen": 243710055, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.75, "step": 11294, "time_per_iteration": 2.522017240524292 }, { "auxiliary_loss_clip": 0.01108226, "auxiliary_loss_mlp": 0.01029054, "balance_loss_clip": 1.01707923, "balance_loss_mlp": 1.03751063, "epoch": 0.6790921388847136, "flos": 24167199058560.0, "grad_norm": 1.7226781127495538, "language_loss": 0.79077786, "learning_rate": 9.86315294700924e-07, "loss": 0.81215066, "num_input_tokens_seen": 243728635, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.70703125, "step": 11295, "time_per_iteration": 3.9924862384796143 }, { "auxiliary_loss_clip": 0.0110466, "auxiliary_loss_mlp": 0.01029905, "balance_loss_clip": 1.01895523, "balance_loss_mlp": 1.03678524, "epoch": 0.6791522621373817, "flos": 21908095027200.0, "grad_norm": 1.883328340563914, "language_loss": 0.71178907, "learning_rate": 9.859795828562823e-07, "loss": 0.73313475, "num_input_tokens_seen": 243748330, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6796875, "step": 11296, "time_per_iteration": 2.5438928604125977 }, { "auxiliary_loss_clip": 0.01107654, "auxiliary_loss_mlp": 0.01029161, "balance_loss_clip": 1.0166142, "balance_loss_mlp": 1.03723931, "epoch": 0.6792123853900496, "flos": 24826519152000.0, "grad_norm": 1.6504054933150918, "language_loss": 0.70788121, "learning_rate": 9.856439094633949e-07, "loss": 0.72924936, "num_input_tokens_seen": 243769380, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 11297, "time_per_iteration": 4.767991781234741 }, { "auxiliary_loss_clip": 0.01113757, "auxiliary_loss_mlp": 0.01032089, "balance_loss_clip": 1.0179987, "balance_loss_mlp": 1.04024601, "epoch": 0.6792725086427176, "flos": 17566279678080.0, "grad_norm": 2.418683531045301, "language_loss": 0.6656245, "learning_rate": 9.853082745349918e-07, "loss": 0.68708301, "num_input_tokens_seen": 243785510, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.734375, "step": 11298, "time_per_iteration": 2.528794288635254 }, { "auxiliary_loss_clip": 0.01110825, "auxiliary_loss_mlp": 0.01027044, "balance_loss_clip": 1.01496768, "balance_loss_mlp": 1.03926373, "epoch": 0.6793326318953855, "flos": 26941837040640.0, "grad_norm": 1.8441835652798362, "language_loss": 0.71451867, "learning_rate": 9.84972678083801e-07, "loss": 0.73589742, "num_input_tokens_seen": 243805545, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.71484375, "step": 11299, "time_per_iteration": 4.01851224899292 }, { "auxiliary_loss_clip": 0.01111513, "auxiliary_loss_mlp": 0.01030906, "balance_loss_clip": 1.01753056, "balance_loss_mlp": 1.03992724, "epoch": 0.6793927551480535, "flos": 24318275662080.0, "grad_norm": 1.4675167077692381, "language_loss": 0.76897466, "learning_rate": 9.846371201225488e-07, "loss": 0.79039884, "num_input_tokens_seen": 243825185, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 11300, "time_per_iteration": 4.085808277130127 }, { "auxiliary_loss_clip": 0.01109099, "auxiliary_loss_mlp": 0.01027663, "balance_loss_clip": 1.01490724, "balance_loss_mlp": 1.03874266, "epoch": 0.6794528784007214, "flos": 11436615757440.0, "grad_norm": 2.1098538871362296, "language_loss": 0.63198721, "learning_rate": 9.843016006639577e-07, "loss": 0.65335476, "num_input_tokens_seen": 243841600, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 11301, "time_per_iteration": 2.5362157821655273 }, { "auxiliary_loss_clip": 0.01109043, "auxiliary_loss_mlp": 0.01029201, "balance_loss_clip": 1.01679182, "balance_loss_mlp": 1.03817201, "epoch": 0.6795130016533895, "flos": 25229688382080.0, "grad_norm": 2.6500381750767428, "language_loss": 0.83225363, "learning_rate": 9.839661197207525e-07, "loss": 0.85363603, "num_input_tokens_seen": 243862250, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 11302, "time_per_iteration": 2.581904411315918 }, { "auxiliary_loss_clip": 0.0110991, "auxiliary_loss_mlp": 0.01031325, "balance_loss_clip": 1.01853347, "balance_loss_mlp": 1.03765583, "epoch": 0.6795731249060574, "flos": 18296415434880.0, "grad_norm": 1.9609334203442141, "language_loss": 0.69615853, "learning_rate": 9.83630677305654e-07, "loss": 0.7175709, "num_input_tokens_seen": 243880560, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 11303, "time_per_iteration": 2.4891412258148193 }, { "auxiliary_loss_clip": 0.01113115, "auxiliary_loss_mlp": 0.01033198, "balance_loss_clip": 1.02022207, "balance_loss_mlp": 1.03950012, "epoch": 0.6796332481587254, "flos": 20300374183680.0, "grad_norm": 1.884805756886911, "language_loss": 0.69812131, "learning_rate": 9.832952734313813e-07, "loss": 0.71958447, "num_input_tokens_seen": 243900635, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.734375, "step": 11304, "time_per_iteration": 2.5294954776763916 }, { "auxiliary_loss_clip": 0.01113144, "auxiliary_loss_mlp": 0.0103196, "balance_loss_clip": 1.01851368, "balance_loss_mlp": 1.04149735, "epoch": 0.6796933714113934, "flos": 23586847015680.0, "grad_norm": 1.9566509563075116, "language_loss": 0.73122042, "learning_rate": 9.829599081106536e-07, "loss": 0.75267148, "num_input_tokens_seen": 243920160, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 11305, "time_per_iteration": 2.550581693649292 }, { "auxiliary_loss_clip": 0.01109694, "auxiliary_loss_mlp": 0.01026787, "balance_loss_clip": 1.01347685, "balance_loss_mlp": 1.0389961, "epoch": 0.6797534946640613, "flos": 27119917693440.0, "grad_norm": 2.765839427313283, "language_loss": 0.66062713, "learning_rate": 9.826245813561882e-07, "loss": 0.68199193, "num_input_tokens_seen": 243939015, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 11306, "time_per_iteration": 2.5621652603149414 }, { "auxiliary_loss_clip": 0.01107072, "auxiliary_loss_mlp": 0.01028276, "balance_loss_clip": 1.01485944, "balance_loss_mlp": 1.03766418, "epoch": 0.6798136179167293, "flos": 22127437428480.0, "grad_norm": 1.6786471641291734, "language_loss": 0.80190659, "learning_rate": 9.822892931807021e-07, "loss": 0.82326007, "num_input_tokens_seen": 243958470, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6953125, "step": 11307, "time_per_iteration": 2.506791830062866 }, { "auxiliary_loss_clip": 0.011091, "auxiliary_loss_mlp": 0.01030154, "balance_loss_clip": 1.0176909, "balance_loss_mlp": 1.03952563, "epoch": 0.6798737411693972, "flos": 17488640430720.0, "grad_norm": 1.9287190076777136, "language_loss": 0.8894251, "learning_rate": 9.819540435969066e-07, "loss": 0.91081768, "num_input_tokens_seen": 243975450, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 11308, "time_per_iteration": 2.482381820678711 }, { "auxiliary_loss_clip": 0.01109093, "auxiliary_loss_mlp": 0.01033283, "balance_loss_clip": 1.01962161, "balance_loss_mlp": 1.03774953, "epoch": 0.6799338644220653, "flos": 22892262744960.0, "grad_norm": 2.045137778258209, "language_loss": 0.71125364, "learning_rate": 9.816188326175154e-07, "loss": 0.73267734, "num_input_tokens_seen": 243994355, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71484375, "step": 11309, "time_per_iteration": 2.517667770385742 }, { "auxiliary_loss_clip": 0.01109751, "auxiliary_loss_mlp": 0.01035685, "balance_loss_clip": 1.02258325, "balance_loss_mlp": 1.03894687, "epoch": 0.6799939876747332, "flos": 23180409648000.0, "grad_norm": 2.058732020345659, "language_loss": 0.84536588, "learning_rate": 9.812836602552411e-07, "loss": 0.86682016, "num_input_tokens_seen": 244011620, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 11310, "time_per_iteration": 2.5169167518615723 }, { "auxiliary_loss_clip": 0.01105755, "auxiliary_loss_mlp": 0.01026534, "balance_loss_clip": 1.01482749, "balance_loss_mlp": 1.03837967, "epoch": 0.6800541109274012, "flos": 19499925553920.0, "grad_norm": 2.1518827316903866, "language_loss": 0.83563709, "learning_rate": 9.80948526522792e-07, "loss": 0.85696006, "num_input_tokens_seen": 244029925, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 11311, "time_per_iteration": 2.4799344539642334 }, { "auxiliary_loss_clip": 0.01112176, "auxiliary_loss_mlp": 0.01029899, "balance_loss_clip": 1.01558185, "balance_loss_mlp": 1.0375973, "epoch": 0.6801142341800691, "flos": 22277652105600.0, "grad_norm": 1.6852926147374168, "language_loss": 0.76190042, "learning_rate": 9.806134314328767e-07, "loss": 0.78332126, "num_input_tokens_seen": 244051225, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.74609375, "step": 11312, "time_per_iteration": 2.528733730316162 }, { "auxiliary_loss_clip": 0.0103378, "auxiliary_loss_mlp": 0.00999854, "balance_loss_clip": 0.99873984, "balance_loss_mlp": 1.01113725, "epoch": 0.6801743574327371, "flos": 68714817759360.0, "grad_norm": 0.6685720123683349, "language_loss": 0.57208097, "learning_rate": 9.802783749982038e-07, "loss": 0.59241736, "num_input_tokens_seen": 244115930, "router_z_loss_clip": 0.01116943, "router_z_loss_mlp": 0.2265625, "step": 11313, "time_per_iteration": 3.251452684402466 }, { "auxiliary_loss_clip": 0.01107935, "auxiliary_loss_mlp": 0.01027492, "balance_loss_clip": 1.01443255, "balance_loss_mlp": 1.03668427, "epoch": 0.680234480685405, "flos": 29460467813760.0, "grad_norm": 2.1919977829886435, "language_loss": 0.68998313, "learning_rate": 9.799433572314754e-07, "loss": 0.71133745, "num_input_tokens_seen": 244137320, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 11314, "time_per_iteration": 2.5782482624053955 }, { "auxiliary_loss_clip": 0.01105227, "auxiliary_loss_mlp": 0.01031186, "balance_loss_clip": 1.01927674, "balance_loss_mlp": 1.03616524, "epoch": 0.6802946039380731, "flos": 15916866122880.0, "grad_norm": 1.7952033108361685, "language_loss": 0.81305987, "learning_rate": 9.796083781453972e-07, "loss": 0.83442402, "num_input_tokens_seen": 244152755, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 11315, "time_per_iteration": 2.489570140838623 }, { "auxiliary_loss_clip": 0.01107458, "auxiliary_loss_mlp": 0.01028933, "balance_loss_clip": 1.01598096, "balance_loss_mlp": 1.0372715, "epoch": 0.680354727190741, "flos": 22018664067840.0, "grad_norm": 5.2609713069385196, "language_loss": 0.70070845, "learning_rate": 9.792734377526718e-07, "loss": 0.72207236, "num_input_tokens_seen": 244171480, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 11316, "time_per_iteration": 2.5972156524658203 }, { "auxiliary_loss_clip": 0.0110647, "auxiliary_loss_mlp": 0.01027232, "balance_loss_clip": 1.01518631, "balance_loss_mlp": 1.03746796, "epoch": 0.680414850443409, "flos": 18441494467200.0, "grad_norm": 2.107204682202291, "language_loss": 0.6684404, "learning_rate": 9.789385360660003e-07, "loss": 0.68977743, "num_input_tokens_seen": 244187920, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69140625, "step": 11317, "time_per_iteration": 2.481715679168701 }, { "auxiliary_loss_clip": 0.01111129, "auxiliary_loss_mlp": 0.01040792, "balance_loss_clip": 1.02828681, "balance_loss_mlp": 1.04072511, "epoch": 0.680474973696077, "flos": 26358611909760.0, "grad_norm": 1.547177482654414, "language_loss": 0.74923217, "learning_rate": 9.78603673098082e-07, "loss": 0.77075136, "num_input_tokens_seen": 244209565, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 11318, "time_per_iteration": 2.550485372543335 }, { "auxiliary_loss_clip": 0.01102964, "auxiliary_loss_mlp": 0.01028175, "balance_loss_clip": 1.01670718, "balance_loss_mlp": 1.03553402, "epoch": 0.6805350969487449, "flos": 18333116156160.0, "grad_norm": 1.8607735643043288, "language_loss": 0.68031967, "learning_rate": 9.782688488616143e-07, "loss": 0.70163107, "num_input_tokens_seen": 244228015, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.67578125, "step": 11319, "time_per_iteration": 2.4957804679870605 }, { "auxiliary_loss_clip": 0.011057, "auxiliary_loss_mlp": 0.01034216, "balance_loss_clip": 1.02114463, "balance_loss_mlp": 1.03655803, "epoch": 0.6805952202014129, "flos": 19937497034880.0, "grad_norm": 1.6396629379561805, "language_loss": 0.76709163, "learning_rate": 9.779340633692945e-07, "loss": 0.78849077, "num_input_tokens_seen": 244245615, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69140625, "step": 11320, "time_per_iteration": 2.5003960132598877 }, { "auxiliary_loss_clip": 0.01107321, "auxiliary_loss_mlp": 0.01031761, "balance_loss_clip": 1.01922011, "balance_loss_mlp": 1.03754282, "epoch": 0.6806553434540809, "flos": 25224301342080.0, "grad_norm": 1.771950200181747, "language_loss": 0.74865454, "learning_rate": 9.77599316633817e-07, "loss": 0.7700454, "num_input_tokens_seen": 244263625, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 11321, "time_per_iteration": 2.5387935638427734 }, { "auxiliary_loss_clip": 0.01110331, "auxiliary_loss_mlp": 0.01036087, "balance_loss_clip": 1.02337909, "balance_loss_mlp": 1.03981352, "epoch": 0.6807154667067489, "flos": 17785586165760.0, "grad_norm": 1.8376301496204563, "language_loss": 0.72936141, "learning_rate": 9.772646086678758e-07, "loss": 0.75082564, "num_input_tokens_seen": 244282745, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 11322, "time_per_iteration": 2.498506546020508 }, { "auxiliary_loss_clip": 0.01107458, "auxiliary_loss_mlp": 0.01030755, "balance_loss_clip": 1.01788664, "balance_loss_mlp": 1.03662395, "epoch": 0.6807755899594168, "flos": 22199905117440.0, "grad_norm": 1.9468908073652365, "language_loss": 0.78468609, "learning_rate": 9.769299394841638e-07, "loss": 0.80606824, "num_input_tokens_seen": 244303770, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 11323, "time_per_iteration": 2.523904323577881 }, { "auxiliary_loss_clip": 0.01032646, "auxiliary_loss_mlp": 0.00999849, "balance_loss_clip": 0.99859685, "balance_loss_mlp": 1.01007664, "epoch": 0.6808357132120848, "flos": 68631073200000.0, "grad_norm": 0.7530588162158103, "language_loss": 0.57104129, "learning_rate": 9.765953090953714e-07, "loss": 0.59136623, "num_input_tokens_seen": 244355910, "router_z_loss_clip": 0.01251221, "router_z_loss_mlp": 0.22558594, "step": 11324, "time_per_iteration": 2.9425013065338135 }, { "auxiliary_loss_clip": 0.01110193, "auxiliary_loss_mlp": 0.01033075, "balance_loss_clip": 1.01978898, "balance_loss_mlp": 1.03927231, "epoch": 0.6808958364647527, "flos": 23843357015040.0, "grad_norm": 1.8739969512875536, "language_loss": 0.6829558, "learning_rate": 9.76260717514186e-07, "loss": 0.7043885, "num_input_tokens_seen": 244376610, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 11325, "time_per_iteration": 2.6009202003479004 }, { "auxiliary_loss_clip": 0.01111038, "auxiliary_loss_mlp": 0.01032524, "balance_loss_clip": 1.01921439, "balance_loss_mlp": 1.03730226, "epoch": 0.6809559597174207, "flos": 17711717846400.0, "grad_norm": 2.351498004834643, "language_loss": 0.70732796, "learning_rate": 9.759261647532974e-07, "loss": 0.72876358, "num_input_tokens_seen": 244393000, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 11326, "time_per_iteration": 2.4920318126678467 }, { "auxiliary_loss_clip": 0.01106654, "auxiliary_loss_mlp": 0.01031211, "balance_loss_clip": 1.0186944, "balance_loss_mlp": 1.03613663, "epoch": 0.6810160829700886, "flos": 22491894775680.0, "grad_norm": 2.1196652151598907, "language_loss": 0.73350644, "learning_rate": 9.75591650825392e-07, "loss": 0.75488508, "num_input_tokens_seen": 244409515, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 11327, "time_per_iteration": 2.5139641761779785 }, { "auxiliary_loss_clip": 0.01104127, "auxiliary_loss_mlp": 0.01028556, "balance_loss_clip": 1.01622343, "balance_loss_mlp": 1.03608775, "epoch": 0.6810762062227567, "flos": 16832875783680.0, "grad_norm": 1.9950126198347615, "language_loss": 0.77489549, "learning_rate": 9.752571757431526e-07, "loss": 0.79622233, "num_input_tokens_seen": 244427165, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6796875, "step": 11328, "time_per_iteration": 2.5242247581481934 }, { "auxiliary_loss_clip": 0.01108309, "auxiliary_loss_mlp": 0.01029555, "balance_loss_clip": 1.01708603, "balance_loss_mlp": 1.03759027, "epoch": 0.6811363294754246, "flos": 12714676554240.0, "grad_norm": 2.097284681019287, "language_loss": 0.64469755, "learning_rate": 9.74922739519265e-07, "loss": 0.6660763, "num_input_tokens_seen": 244445705, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.70703125, "step": 11329, "time_per_iteration": 2.49226975440979 }, { "auxiliary_loss_clip": 0.01109051, "auxiliary_loss_mlp": 0.01029969, "balance_loss_clip": 1.01726735, "balance_loss_mlp": 1.03792787, "epoch": 0.6811964527280926, "flos": 17711969241600.0, "grad_norm": 2.128849819942074, "language_loss": 0.78936458, "learning_rate": 9.745883421664096e-07, "loss": 0.81075478, "num_input_tokens_seen": 244460415, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 11330, "time_per_iteration": 2.4577629566192627 }, { "auxiliary_loss_clip": 0.01108367, "auxiliary_loss_mlp": 0.01031748, "balance_loss_clip": 1.01865244, "balance_loss_mlp": 1.0385766, "epoch": 0.6812565759807605, "flos": 24863471268480.0, "grad_norm": 1.71679882361841, "language_loss": 0.64036161, "learning_rate": 9.742539836972665e-07, "loss": 0.66176277, "num_input_tokens_seen": 244480555, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 11331, "time_per_iteration": 2.5381624698638916 }, { "auxiliary_loss_clip": 0.01107048, "auxiliary_loss_mlp": 0.01032885, "balance_loss_clip": 1.02015984, "balance_loss_mlp": 1.0373348, "epoch": 0.6813166992334285, "flos": 17166019449600.0, "grad_norm": 1.616525917882628, "language_loss": 0.72562796, "learning_rate": 9.739196641245148e-07, "loss": 0.74702728, "num_input_tokens_seen": 244498540, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 11332, "time_per_iteration": 2.4744319915771484 }, { "auxiliary_loss_clip": 0.01109227, "auxiliary_loss_mlp": 0.01030613, "balance_loss_clip": 1.01806617, "balance_loss_mlp": 1.03787708, "epoch": 0.6813768224860965, "flos": 18843550375680.0, "grad_norm": 2.6661680600546167, "language_loss": 0.74796814, "learning_rate": 9.735853834608326e-07, "loss": 0.7693665, "num_input_tokens_seen": 244517015, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 11333, "time_per_iteration": 2.4813268184661865 }, { "auxiliary_loss_clip": 0.01111347, "auxiliary_loss_mlp": 0.01032741, "balance_loss_clip": 1.0194608, "balance_loss_mlp": 1.03862929, "epoch": 0.6814369457387645, "flos": 24532733813760.0, "grad_norm": 1.5778487842990805, "language_loss": 0.72214234, "learning_rate": 9.732511417188963e-07, "loss": 0.7435832, "num_input_tokens_seen": 244537450, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 11334, "time_per_iteration": 2.516874074935913 }, { "auxiliary_loss_clip": 0.01106669, "auxiliary_loss_mlp": 0.01031317, "balance_loss_clip": 1.01919985, "balance_loss_mlp": 1.03813815, "epoch": 0.6814970689914325, "flos": 18222978078720.0, "grad_norm": 1.8180815853424372, "language_loss": 0.85613048, "learning_rate": 9.729169389113791e-07, "loss": 0.87751031, "num_input_tokens_seen": 244555640, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 11335, "time_per_iteration": 2.49741792678833 }, { "auxiliary_loss_clip": 0.011008, "auxiliary_loss_mlp": 0.01028231, "balance_loss_clip": 1.01695406, "balance_loss_mlp": 1.03498578, "epoch": 0.6815571922441004, "flos": 25228790542080.0, "grad_norm": 1.8294042309912464, "language_loss": 0.82122695, "learning_rate": 9.725827750509542e-07, "loss": 0.84251726, "num_input_tokens_seen": 244574005, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.66015625, "step": 11336, "time_per_iteration": 2.5183560848236084 }, { "auxiliary_loss_clip": 0.01102768, "auxiliary_loss_mlp": 0.01028789, "balance_loss_clip": 1.01716042, "balance_loss_mlp": 1.03539276, "epoch": 0.6816173154967684, "flos": 19456078026240.0, "grad_norm": 1.7848649750699428, "language_loss": 0.81273079, "learning_rate": 9.72248650150294e-07, "loss": 0.83404636, "num_input_tokens_seen": 244591395, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 11337, "time_per_iteration": 3.9278619289398193 }, { "auxiliary_loss_clip": 0.01103568, "auxiliary_loss_mlp": 0.01028852, "balance_loss_clip": 1.01725316, "balance_loss_mlp": 1.03648448, "epoch": 0.6816774387494363, "flos": 17931455297280.0, "grad_norm": 1.897236985099453, "language_loss": 0.72582984, "learning_rate": 9.719145642220673e-07, "loss": 0.74715412, "num_input_tokens_seen": 244610400, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 11338, "time_per_iteration": 2.5249578952789307 }, { "auxiliary_loss_clip": 0.01108311, "auxiliary_loss_mlp": 0.01034783, "balance_loss_clip": 1.02223015, "balance_loss_mlp": 1.03860879, "epoch": 0.6817375620021043, "flos": 22233014478720.0, "grad_norm": 1.6967678613331059, "language_loss": 0.77475601, "learning_rate": 9.715805172789435e-07, "loss": 0.79618698, "num_input_tokens_seen": 244630400, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 11339, "time_per_iteration": 4.021308422088623 }, { "auxiliary_loss_clip": 0.01107594, "auxiliary_loss_mlp": 0.01033848, "balance_loss_clip": 1.02089548, "balance_loss_mlp": 1.03776598, "epoch": 0.6817976852547722, "flos": 25374408278400.0, "grad_norm": 2.18780933766799, "language_loss": 0.70662618, "learning_rate": 9.712465093335901e-07, "loss": 0.72804058, "num_input_tokens_seen": 244649155, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 11340, "time_per_iteration": 3.9878902435302734 }, { "auxiliary_loss_clip": 0.01110168, "auxiliary_loss_mlp": 0.01033339, "balance_loss_clip": 1.02063727, "balance_loss_mlp": 1.03796363, "epoch": 0.6818578085074403, "flos": 22265764704000.0, "grad_norm": 2.290681157068438, "language_loss": 0.83547252, "learning_rate": 9.709125403986722e-07, "loss": 0.85690761, "num_input_tokens_seen": 244665470, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 11341, "time_per_iteration": 3.966890335083008 }, { "auxiliary_loss_clip": 0.01107655, "auxiliary_loss_mlp": 0.01037153, "balance_loss_clip": 1.02312207, "balance_loss_mlp": 1.03685713, "epoch": 0.6819179317601082, "flos": 19318145800320.0, "grad_norm": 4.134359869884818, "language_loss": 0.68171495, "learning_rate": 9.705786104868531e-07, "loss": 0.70316303, "num_input_tokens_seen": 244684390, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.70703125, "step": 11342, "time_per_iteration": 2.5256543159484863 }, { "auxiliary_loss_clip": 0.01105699, "auxiliary_loss_mlp": 0.01031266, "balance_loss_clip": 1.01824212, "balance_loss_mlp": 1.03630328, "epoch": 0.6819780550127762, "flos": 21104126864640.0, "grad_norm": 1.6792267261220684, "language_loss": 0.74794626, "learning_rate": 9.702447196107963e-07, "loss": 0.7693159, "num_input_tokens_seen": 244703370, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 11343, "time_per_iteration": 2.5323517322540283 }, { "auxiliary_loss_clip": 0.01109431, "auxiliary_loss_mlp": 0.0103568, "balance_loss_clip": 1.02341366, "balance_loss_mlp": 1.03914857, "epoch": 0.6820381782654441, "flos": 29716403195520.0, "grad_norm": 1.681584833669912, "language_loss": 0.79587138, "learning_rate": 9.699108677831639e-07, "loss": 0.81732249, "num_input_tokens_seen": 244723325, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 11344, "time_per_iteration": 2.557729721069336 }, { "auxiliary_loss_clip": 0.01105063, "auxiliary_loss_mlp": 0.01032181, "balance_loss_clip": 1.01945579, "balance_loss_mlp": 1.03542674, "epoch": 0.6820983015181121, "flos": 29242130993280.0, "grad_norm": 2.0356072303659736, "language_loss": 0.66817391, "learning_rate": 9.695770550166136e-07, "loss": 0.68954635, "num_input_tokens_seen": 244745650, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 11345, "time_per_iteration": 2.5878660678863525 }, { "auxiliary_loss_clip": 0.0110994, "auxiliary_loss_mlp": 0.01032708, "balance_loss_clip": 1.01963675, "balance_loss_mlp": 1.03800559, "epoch": 0.6821584247707801, "flos": 18871775487360.0, "grad_norm": 2.5901238349989604, "language_loss": 0.65586984, "learning_rate": 9.692432813238054e-07, "loss": 0.67729634, "num_input_tokens_seen": 244760270, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 11346, "time_per_iteration": 2.477426767349243 }, { "auxiliary_loss_clip": 0.01105092, "auxiliary_loss_mlp": 0.01030564, "balance_loss_clip": 1.01745105, "balance_loss_mlp": 1.03480387, "epoch": 0.6822185480234481, "flos": 21324582587520.0, "grad_norm": 2.8417983974471697, "language_loss": 0.78416121, "learning_rate": 9.689095467173952e-07, "loss": 0.80551779, "num_input_tokens_seen": 244779565, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 11347, "time_per_iteration": 2.510979413986206 }, { "auxiliary_loss_clip": 0.01031625, "auxiliary_loss_mlp": 0.01005049, "balance_loss_clip": 1.00380349, "balance_loss_mlp": 1.00889802, "epoch": 0.6822786712761161, "flos": 63488306430720.0, "grad_norm": 0.8195052962305517, "language_loss": 0.52516973, "learning_rate": 9.685758512100378e-07, "loss": 0.54553646, "num_input_tokens_seen": 244838480, "router_z_loss_clip": 0.01245117, "router_z_loss_mlp": 0.22753906, "step": 11348, "time_per_iteration": 3.102978467941284 }, { "auxiliary_loss_clip": 0.01103515, "auxiliary_loss_mlp": 0.01033534, "balance_loss_clip": 1.02172637, "balance_loss_mlp": 1.03543699, "epoch": 0.682338794528784, "flos": 21068934514560.0, "grad_norm": 1.9120413763582218, "language_loss": 0.79941308, "learning_rate": 9.682421948143873e-07, "loss": 0.82078362, "num_input_tokens_seen": 244855265, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 11349, "time_per_iteration": 2.482494354248047 }, { "auxiliary_loss_clip": 0.011129, "auxiliary_loss_mlp": 0.01030726, "balance_loss_clip": 1.01601529, "balance_loss_mlp": 1.0379281, "epoch": 0.682398917781452, "flos": 36283243547520.0, "grad_norm": 1.8529058520904222, "language_loss": 0.7381472, "learning_rate": 9.67908577543096e-07, "loss": 0.75958341, "num_input_tokens_seen": 244875555, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75, "step": 11350, "time_per_iteration": 2.6342267990112305 }, { "auxiliary_loss_clip": 0.01106317, "auxiliary_loss_mlp": 0.01030296, "balance_loss_clip": 1.017326, "balance_loss_mlp": 1.03772235, "epoch": 0.6824590410341199, "flos": 24859197550080.0, "grad_norm": 2.1094779843849207, "language_loss": 0.79440618, "learning_rate": 9.675749994088161e-07, "loss": 0.81577229, "num_input_tokens_seen": 244895270, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.68359375, "step": 11351, "time_per_iteration": 2.5367515087127686 }, { "auxiliary_loss_clip": 0.01105114, "auxiliary_loss_mlp": 0.01036019, "balance_loss_clip": 1.02381158, "balance_loss_mlp": 1.03563786, "epoch": 0.6825191642867879, "flos": 22452392793600.0, "grad_norm": 1.7668134225181724, "language_loss": 0.73064363, "learning_rate": 9.672414604241954e-07, "loss": 0.75205493, "num_input_tokens_seen": 244914535, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 11352, "time_per_iteration": 2.5251822471618652 }, { "auxiliary_loss_clip": 0.01108936, "auxiliary_loss_mlp": 0.01034192, "balance_loss_clip": 1.02085209, "balance_loss_mlp": 1.03704333, "epoch": 0.6825792875394558, "flos": 29424377623680.0, "grad_norm": 1.6959544520913747, "language_loss": 0.80513042, "learning_rate": 9.669079606018814e-07, "loss": 0.82656169, "num_input_tokens_seen": 244936095, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 11353, "time_per_iteration": 2.5599148273468018 }, { "auxiliary_loss_clip": 0.01105479, "auxiliary_loss_mlp": 0.01028207, "balance_loss_clip": 1.01516533, "balance_loss_mlp": 1.03594756, "epoch": 0.6826394107921239, "flos": 18770974945920.0, "grad_norm": 1.6852141820608249, "language_loss": 0.78448898, "learning_rate": 9.665744999545218e-07, "loss": 0.80582583, "num_input_tokens_seen": 244955290, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 11354, "time_per_iteration": 2.5134215354919434 }, { "auxiliary_loss_clip": 0.01104796, "auxiliary_loss_mlp": 0.01025587, "balance_loss_clip": 1.01335621, "balance_loss_mlp": 1.03615308, "epoch": 0.6826995340447918, "flos": 16617591619200.0, "grad_norm": 2.106412901520974, "language_loss": 0.62385589, "learning_rate": 9.662410784947599e-07, "loss": 0.64515972, "num_input_tokens_seen": 244972935, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 11355, "time_per_iteration": 2.48833966255188 }, { "auxiliary_loss_clip": 0.01104456, "auxiliary_loss_mlp": 0.01028075, "balance_loss_clip": 1.01563537, "balance_loss_mlp": 1.03487241, "epoch": 0.6827596572974598, "flos": 20848299223680.0, "grad_norm": 2.4642615347827084, "language_loss": 0.82127285, "learning_rate": 9.659076962352398e-07, "loss": 0.8425982, "num_input_tokens_seen": 244989440, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 11356, "time_per_iteration": 2.4887895584106445 }, { "auxiliary_loss_clip": 0.01110911, "auxiliary_loss_mlp": 0.01030973, "balance_loss_clip": 1.01813412, "balance_loss_mlp": 1.03901339, "epoch": 0.6828197805501277, "flos": 22748081552640.0, "grad_norm": 1.8146447678135593, "language_loss": 0.78685427, "learning_rate": 9.655743531886052e-07, "loss": 0.80827308, "num_input_tokens_seen": 245007830, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 11357, "time_per_iteration": 2.502143144607544 }, { "auxiliary_loss_clip": 0.01030752, "auxiliary_loss_mlp": 0.01002328, "balance_loss_clip": 1.0010947, "balance_loss_mlp": 1.00810838, "epoch": 0.6828799038027957, "flos": 71646565829760.0, "grad_norm": 0.8344304288596955, "language_loss": 0.59656703, "learning_rate": 9.65241049367493e-07, "loss": 0.61689782, "num_input_tokens_seen": 245070720, "router_z_loss_clip": 0.0123291, "router_z_loss_mlp": 0.2265625, "step": 11358, "time_per_iteration": 3.215270519256592 }, { "auxiliary_loss_clip": 0.01112361, "auxiliary_loss_mlp": 0.01038384, "balance_loss_clip": 1.02462721, "balance_loss_mlp": 1.03809834, "epoch": 0.6829400270554637, "flos": 19829154637440.0, "grad_norm": 1.794859939816337, "language_loss": 0.78738141, "learning_rate": 9.64907784784544e-07, "loss": 0.80888891, "num_input_tokens_seen": 245089070, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 11359, "time_per_iteration": 2.5210952758789062 }, { "auxiliary_loss_clip": 0.01105741, "auxiliary_loss_mlp": 0.01032901, "balance_loss_clip": 1.02016973, "balance_loss_mlp": 1.03519499, "epoch": 0.6830001503081317, "flos": 21980634543360.0, "grad_norm": 2.5544099227886115, "language_loss": 0.81569225, "learning_rate": 9.645745594523958e-07, "loss": 0.83707863, "num_input_tokens_seen": 245106500, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 11360, "time_per_iteration": 2.523651361465454 }, { "auxiliary_loss_clip": 0.01110439, "auxiliary_loss_mlp": 0.01034414, "balance_loss_clip": 1.02131271, "balance_loss_mlp": 1.03911614, "epoch": 0.6830602735607997, "flos": 24316767290880.0, "grad_norm": 1.907089158919198, "language_loss": 0.75405282, "learning_rate": 9.642413733836844e-07, "loss": 0.77550137, "num_input_tokens_seen": 245125260, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 11361, "time_per_iteration": 2.51149582862854 }, { "auxiliary_loss_clip": 0.01030798, "auxiliary_loss_mlp": 0.01000867, "balance_loss_clip": 0.99968702, "balance_loss_mlp": 1.00819874, "epoch": 0.6831203968134676, "flos": 57690062323200.0, "grad_norm": 0.8855732103915935, "language_loss": 0.59685123, "learning_rate": 9.639082265910437e-07, "loss": 0.61716789, "num_input_tokens_seen": 245188730, "router_z_loss_clip": 0.01177979, "router_z_loss_mlp": 0.2265625, "step": 11362, "time_per_iteration": 3.216209650039673 }, { "auxiliary_loss_clip": 0.01109185, "auxiliary_loss_mlp": 0.01033822, "balance_loss_clip": 1.02005351, "balance_loss_mlp": 1.03634477, "epoch": 0.6831805200661356, "flos": 14388436552320.0, "grad_norm": 2.38713924198078, "language_loss": 0.74967414, "learning_rate": 9.635751190871074e-07, "loss": 0.77110422, "num_input_tokens_seen": 245205065, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7265625, "step": 11363, "time_per_iteration": 2.447655200958252 }, { "auxiliary_loss_clip": 0.01106072, "auxiliary_loss_mlp": 0.01037934, "balance_loss_clip": 1.02484465, "balance_loss_mlp": 1.03610826, "epoch": 0.6832406433188035, "flos": 22820297846400.0, "grad_norm": 2.8542591586229724, "language_loss": 0.89477086, "learning_rate": 9.632420508845063e-07, "loss": 0.91621089, "num_input_tokens_seen": 245224265, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 11364, "time_per_iteration": 2.504002571105957 }, { "auxiliary_loss_clip": 0.01105646, "auxiliary_loss_mlp": 0.01029611, "balance_loss_clip": 1.01742148, "balance_loss_mlp": 1.0368681, "epoch": 0.6833007665714715, "flos": 17561718650880.0, "grad_norm": 3.1414951600892174, "language_loss": 0.88209498, "learning_rate": 9.629090219958697e-07, "loss": 0.90344757, "num_input_tokens_seen": 245243360, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 11365, "time_per_iteration": 2.474613904953003 }, { "auxiliary_loss_clip": 0.01112824, "auxiliary_loss_mlp": 0.01039461, "balance_loss_clip": 1.02593064, "balance_loss_mlp": 1.03898787, "epoch": 0.6833608898241395, "flos": 22445928345600.0, "grad_norm": 2.280646441475554, "language_loss": 0.81430721, "learning_rate": 9.625760324338272e-07, "loss": 0.83583009, "num_input_tokens_seen": 245256350, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73828125, "step": 11366, "time_per_iteration": 2.4938483238220215 }, { "auxiliary_loss_clip": 0.01107778, "auxiliary_loss_mlp": 0.01029247, "balance_loss_clip": 1.01645541, "balance_loss_mlp": 1.03550065, "epoch": 0.6834210130768075, "flos": 24534637234560.0, "grad_norm": 1.639486233915981, "language_loss": 0.76820862, "learning_rate": 9.622430822110062e-07, "loss": 0.7895788, "num_input_tokens_seen": 245277575, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 11367, "time_per_iteration": 2.526376485824585 }, { "auxiliary_loss_clip": 0.01107907, "auxiliary_loss_mlp": 0.01038011, "balance_loss_clip": 1.02467775, "balance_loss_mlp": 1.03717387, "epoch": 0.6834811363294754, "flos": 20047132321920.0, "grad_norm": 1.6525800149134207, "language_loss": 0.68794966, "learning_rate": 9.619101713400312e-07, "loss": 0.70940882, "num_input_tokens_seen": 245296615, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 11368, "time_per_iteration": 2.52290678024292 }, { "auxiliary_loss_clip": 0.01107867, "auxiliary_loss_mlp": 0.01030788, "balance_loss_clip": 1.01845598, "balance_loss_mlp": 1.03708768, "epoch": 0.6835412595821434, "flos": 24790752184320.0, "grad_norm": 2.159687580219934, "language_loss": 0.73540014, "learning_rate": 9.615772998335261e-07, "loss": 0.7567867, "num_input_tokens_seen": 245316275, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 11369, "time_per_iteration": 2.514136791229248 }, { "auxiliary_loss_clip": 0.0110944, "auxiliary_loss_mlp": 0.01030263, "balance_loss_clip": 1.01722121, "balance_loss_mlp": 1.03775585, "epoch": 0.6836013828348113, "flos": 19500356517120.0, "grad_norm": 2.341396177376178, "language_loss": 0.79082847, "learning_rate": 9.612444677041138e-07, "loss": 0.81222552, "num_input_tokens_seen": 245334595, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 11370, "time_per_iteration": 2.4662857055664062 }, { "auxiliary_loss_clip": 0.01030298, "auxiliary_loss_mlp": 0.01005817, "balance_loss_clip": 1.00465453, "balance_loss_mlp": 1.00777638, "epoch": 0.6836615060874793, "flos": 58363999251840.0, "grad_norm": 0.7457753226119347, "language_loss": 0.59863675, "learning_rate": 9.609116749644162e-07, "loss": 0.61899781, "num_input_tokens_seen": 245389750, "router_z_loss_clip": 0.01159668, "router_z_loss_mlp": 0.22558594, "step": 11371, "time_per_iteration": 3.005389451980591 }, { "auxiliary_loss_clip": 0.01104707, "auxiliary_loss_mlp": 0.01031299, "balance_loss_clip": 1.0192529, "balance_loss_mlp": 1.03661656, "epoch": 0.6837216293401474, "flos": 12166895168640.0, "grad_norm": 1.5321350319379903, "language_loss": 0.63996935, "learning_rate": 9.605789216270511e-07, "loss": 0.66132939, "num_input_tokens_seen": 245407530, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 11372, "time_per_iteration": 2.470477819442749 }, { "auxiliary_loss_clip": 0.0110632, "auxiliary_loss_mlp": 0.01027377, "balance_loss_clip": 1.01481235, "balance_loss_mlp": 1.03660822, "epoch": 0.6837817525928153, "flos": 22127581082880.0, "grad_norm": 1.7846447645250942, "language_loss": 0.71944988, "learning_rate": 9.602462077046375e-07, "loss": 0.74078679, "num_input_tokens_seen": 245427000, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 11373, "time_per_iteration": 2.5070149898529053 }, { "auxiliary_loss_clip": 0.01030812, "auxiliary_loss_mlp": 0.0100196, "balance_loss_clip": 1.00073802, "balance_loss_mlp": 1.00803924, "epoch": 0.6838418758454833, "flos": 65005928985600.0, "grad_norm": 1.22263627024579, "language_loss": 0.56566083, "learning_rate": 9.599135332097935e-07, "loss": 0.58598864, "num_input_tokens_seen": 245491620, "router_z_loss_clip": 0.01220703, "router_z_loss_mlp": 0.22851562, "step": 11374, "time_per_iteration": 3.298868417739868 }, { "auxiliary_loss_clip": 0.01110381, "auxiliary_loss_mlp": 0.0102623, "balance_loss_clip": 1.01285434, "balance_loss_mlp": 1.03883696, "epoch": 0.6839019990981512, "flos": 21030833162880.0, "grad_norm": 2.002521727186855, "language_loss": 0.73995733, "learning_rate": 9.595808981551312e-07, "loss": 0.76132345, "num_input_tokens_seen": 245511285, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 11375, "time_per_iteration": 2.4950809478759766 }, { "auxiliary_loss_clip": 0.01106791, "auxiliary_loss_mlp": 0.01032109, "balance_loss_clip": 1.0193119, "balance_loss_mlp": 1.03717232, "epoch": 0.6839621223508192, "flos": 24935543907840.0, "grad_norm": 1.7076253095217129, "language_loss": 0.70505494, "learning_rate": 9.592483025532651e-07, "loss": 0.72644389, "num_input_tokens_seen": 245532910, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 11376, "time_per_iteration": 2.6261513233184814 }, { "auxiliary_loss_clip": 0.01108616, "auxiliary_loss_mlp": 0.01030914, "balance_loss_clip": 1.01778889, "balance_loss_mlp": 1.03625429, "epoch": 0.6840222456034871, "flos": 26358827391360.0, "grad_norm": 1.8091882700115285, "language_loss": 0.74723458, "learning_rate": 9.58915746416808e-07, "loss": 0.76862991, "num_input_tokens_seen": 245550540, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 11377, "time_per_iteration": 2.6033542156219482 }, { "auxiliary_loss_clip": 0.01031172, "auxiliary_loss_mlp": 0.01001719, "balance_loss_clip": 1.00047326, "balance_loss_mlp": 1.00861931, "epoch": 0.6840823688561551, "flos": 65988336936960.0, "grad_norm": 0.7203357740501614, "language_loss": 0.56844509, "learning_rate": 9.585832297583707e-07, "loss": 0.58877403, "num_input_tokens_seen": 245619570, "router_z_loss_clip": 0.01245117, "router_z_loss_mlp": 0.2265625, "step": 11378, "time_per_iteration": 4.640397787094116 }, { "auxiliary_loss_clip": 0.01107724, "auxiliary_loss_mlp": 0.01034039, "balance_loss_clip": 1.0204668, "balance_loss_mlp": 1.03658009, "epoch": 0.684142492108823, "flos": 21397588980480.0, "grad_norm": 6.687565045362469, "language_loss": 0.78443599, "learning_rate": 9.58250752590561e-07, "loss": 0.80585361, "num_input_tokens_seen": 245637980, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71484375, "step": 11379, "time_per_iteration": 2.5359179973602295 }, { "auxiliary_loss_clip": 0.01102434, "auxiliary_loss_mlp": 0.01028111, "balance_loss_clip": 1.0165596, "balance_loss_mlp": 1.03687549, "epoch": 0.6842026153614911, "flos": 18801426700800.0, "grad_norm": 1.8827889406301384, "language_loss": 0.69469273, "learning_rate": 9.57918314925988e-07, "loss": 0.71599823, "num_input_tokens_seen": 245655690, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.65234375, "step": 11380, "time_per_iteration": 3.905214548110962 }, { "auxiliary_loss_clip": 0.01105631, "auxiliary_loss_mlp": 0.01033292, "balance_loss_clip": 1.01982737, "balance_loss_mlp": 1.03563285, "epoch": 0.684262738614159, "flos": 19646405216640.0, "grad_norm": 2.1628776998110824, "language_loss": 0.78500551, "learning_rate": 9.575859167772568e-07, "loss": 0.8063947, "num_input_tokens_seen": 245671525, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.69921875, "step": 11381, "time_per_iteration": 2.5025997161865234 }, { "auxiliary_loss_clip": 0.01030836, "auxiliary_loss_mlp": 0.01001511, "balance_loss_clip": 1.0003134, "balance_loss_mlp": 1.00835025, "epoch": 0.684322861866827, "flos": 62354462739840.0, "grad_norm": 0.8767006786014401, "language_loss": 0.67201543, "learning_rate": 9.572535581569713e-07, "loss": 0.69233894, "num_input_tokens_seen": 245724115, "router_z_loss_clip": 0.01196289, "router_z_loss_mlp": 0.22460938, "step": 11382, "time_per_iteration": 4.407053709030151 }, { "auxiliary_loss_clip": 0.0103121, "auxiliary_loss_mlp": 0.0100201, "balance_loss_clip": 1.00090146, "balance_loss_mlp": 1.00863683, "epoch": 0.6843829851194949, "flos": 65805048812160.0, "grad_norm": 0.8260100637438665, "language_loss": 0.58143473, "learning_rate": 9.569212390777356e-07, "loss": 0.60176682, "num_input_tokens_seen": 245789245, "router_z_loss_clip": 0.0111084, "router_z_loss_mlp": 0.2265625, "step": 11383, "time_per_iteration": 4.547998428344727 }, { "auxiliary_loss_clip": 0.01105363, "auxiliary_loss_mlp": 0.01027959, "balance_loss_clip": 1.01599669, "balance_loss_mlp": 1.03536105, "epoch": 0.6844431083721629, "flos": 27855153181440.0, "grad_norm": 2.236565225726592, "language_loss": 0.79830092, "learning_rate": 9.565889595521517e-07, "loss": 0.8196342, "num_input_tokens_seen": 245812420, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.69921875, "step": 11384, "time_per_iteration": 2.5414226055145264 }, { "auxiliary_loss_clip": 0.01109069, "auxiliary_loss_mlp": 0.01033354, "balance_loss_clip": 1.02024078, "balance_loss_mlp": 1.03539324, "epoch": 0.684503231624831, "flos": 18255010032000.0, "grad_norm": 2.9255416231512705, "language_loss": 0.77392054, "learning_rate": 9.562567195928187e-07, "loss": 0.79534471, "num_input_tokens_seen": 245829135, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 11385, "time_per_iteration": 2.475586414337158 }, { "auxiliary_loss_clip": 0.01114332, "auxiliary_loss_mlp": 0.01035181, "balance_loss_clip": 1.02117395, "balance_loss_mlp": 1.03816795, "epoch": 0.6845633548774989, "flos": 17639681120640.0, "grad_norm": 2.132205790772256, "language_loss": 0.84351718, "learning_rate": 9.55924519212335e-07, "loss": 0.86501229, "num_input_tokens_seen": 245847140, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.76171875, "step": 11386, "time_per_iteration": 2.454742193222046 }, { "auxiliary_loss_clip": 0.01108218, "auxiliary_loss_mlp": 0.01040126, "balance_loss_clip": 1.02788329, "balance_loss_mlp": 1.03784966, "epoch": 0.6846234781301669, "flos": 20807576179200.0, "grad_norm": 2.4590938214315923, "language_loss": 0.83508241, "learning_rate": 9.555923584232984e-07, "loss": 0.85656583, "num_input_tokens_seen": 245862855, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 11387, "time_per_iteration": 2.5064265727996826 }, { "auxiliary_loss_clip": 0.01106052, "auxiliary_loss_mlp": 0.01029397, "balance_loss_clip": 1.01667142, "balance_loss_mlp": 1.03667247, "epoch": 0.6846836013828348, "flos": 36101176485120.0, "grad_norm": 1.9258144213762285, "language_loss": 0.72032905, "learning_rate": 9.552602372383047e-07, "loss": 0.74168354, "num_input_tokens_seen": 245885415, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 11388, "time_per_iteration": 2.613675355911255 }, { "auxiliary_loss_clip": 0.01104804, "auxiliary_loss_mlp": 0.01025755, "balance_loss_clip": 1.01392913, "balance_loss_mlp": 1.0357995, "epoch": 0.6847437246355028, "flos": 43142468607360.0, "grad_norm": 2.004336529587629, "language_loss": 0.63040423, "learning_rate": 9.549281556699469e-07, "loss": 0.65170979, "num_input_tokens_seen": 245906285, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69140625, "step": 11389, "time_per_iteration": 2.644287109375 }, { "auxiliary_loss_clip": 0.01030717, "auxiliary_loss_mlp": 0.01001343, "balance_loss_clip": 1.00022209, "balance_loss_mlp": 1.00827312, "epoch": 0.6848038478881707, "flos": 71663729552640.0, "grad_norm": 0.7287349837725787, "language_loss": 0.55912238, "learning_rate": 9.54596113730818e-07, "loss": 0.57944298, "num_input_tokens_seen": 245967620, "router_z_loss_clip": 0.01123047, "router_z_loss_mlp": 0.22460938, "step": 11390, "time_per_iteration": 3.185800790786743 }, { "auxiliary_loss_clip": 0.01109018, "auxiliary_loss_mlp": 0.01031655, "balance_loss_clip": 1.0186255, "balance_loss_mlp": 1.03875124, "epoch": 0.6848639711408387, "flos": 19937820257280.0, "grad_norm": 2.4149402101852426, "language_loss": 0.87753105, "learning_rate": 9.542641114335109e-07, "loss": 0.89893782, "num_input_tokens_seen": 245985075, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 11391, "time_per_iteration": 2.5405311584472656 }, { "auxiliary_loss_clip": 0.01111264, "auxiliary_loss_mlp": 0.01035322, "balance_loss_clip": 1.02278125, "balance_loss_mlp": 1.03892064, "epoch": 0.6849240943935067, "flos": 26867501844480.0, "grad_norm": 1.591622747564408, "language_loss": 0.79184031, "learning_rate": 9.539321487906117e-07, "loss": 0.81330615, "num_input_tokens_seen": 246003560, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 11392, "time_per_iteration": 2.5354504585266113 }, { "auxiliary_loss_clip": 0.01103961, "auxiliary_loss_mlp": 0.01031072, "balance_loss_clip": 1.01878726, "balance_loss_mlp": 1.03611994, "epoch": 0.6849842176461747, "flos": 13735365425280.0, "grad_norm": 2.3158031972625195, "language_loss": 0.70664251, "learning_rate": 9.536002258147104e-07, "loss": 0.72799289, "num_input_tokens_seen": 246019600, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 11393, "time_per_iteration": 2.490950584411621 }, { "auxiliary_loss_clip": 0.01109879, "auxiliary_loss_mlp": 0.01030113, "balance_loss_clip": 1.01671398, "balance_loss_mlp": 1.0369153, "epoch": 0.6850443408988426, "flos": 24973070641920.0, "grad_norm": 1.6318636369470223, "language_loss": 0.64754552, "learning_rate": 9.532683425183936e-07, "loss": 0.66894543, "num_input_tokens_seen": 246038920, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 11394, "time_per_iteration": 2.5242345333099365 }, { "auxiliary_loss_clip": 0.0110973, "auxiliary_loss_mlp": 0.01035299, "balance_loss_clip": 1.02232325, "balance_loss_mlp": 1.03801799, "epoch": 0.6851044641515106, "flos": 27744225004800.0, "grad_norm": 1.804710074044168, "language_loss": 0.80717933, "learning_rate": 9.529364989142468e-07, "loss": 0.82862961, "num_input_tokens_seen": 246060490, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 11395, "time_per_iteration": 2.5570127964019775 }, { "auxiliary_loss_clip": 0.01108483, "auxiliary_loss_mlp": 0.01032004, "balance_loss_clip": 1.01844978, "balance_loss_mlp": 1.03768325, "epoch": 0.6851645874041785, "flos": 24351061800960.0, "grad_norm": 1.9948427176861732, "language_loss": 0.7269119, "learning_rate": 9.526046950148527e-07, "loss": 0.74831676, "num_input_tokens_seen": 246081465, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.70703125, "step": 11396, "time_per_iteration": 2.53149676322937 }, { "auxiliary_loss_clip": 0.01111314, "auxiliary_loss_mlp": 0.01028895, "balance_loss_clip": 1.01531732, "balance_loss_mlp": 1.03812265, "epoch": 0.6852247106568465, "flos": 15077849264640.0, "grad_norm": 3.1668830811117523, "language_loss": 0.79290271, "learning_rate": 9.522729308327931e-07, "loss": 0.81430477, "num_input_tokens_seen": 246096110, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 11397, "time_per_iteration": 2.4634461402893066 }, { "auxiliary_loss_clip": 0.01106425, "auxiliary_loss_mlp": 0.01029689, "balance_loss_clip": 1.01689827, "balance_loss_mlp": 1.03561246, "epoch": 0.6852848339095146, "flos": 18770005278720.0, "grad_norm": 3.248580963413722, "language_loss": 0.71462286, "learning_rate": 9.519412063806493e-07, "loss": 0.73598397, "num_input_tokens_seen": 246114785, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 11398, "time_per_iteration": 2.5322468280792236 }, { "auxiliary_loss_clip": 0.0110413, "auxiliary_loss_mlp": 0.0103441, "balance_loss_clip": 1.02218497, "balance_loss_mlp": 1.03551555, "epoch": 0.6853449571621825, "flos": 27854363082240.0, "grad_norm": 1.642965653727057, "language_loss": 0.70706713, "learning_rate": 9.516095216709996e-07, "loss": 0.72845256, "num_input_tokens_seen": 246136375, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 11399, "time_per_iteration": 2.545442581176758 }, { "auxiliary_loss_clip": 0.01109931, "auxiliary_loss_mlp": 0.01032245, "balance_loss_clip": 1.02001452, "balance_loss_mlp": 1.03866923, "epoch": 0.6854050804148505, "flos": 18150510389760.0, "grad_norm": 1.7105812570838608, "language_loss": 0.70271873, "learning_rate": 9.512778767164217e-07, "loss": 0.72414052, "num_input_tokens_seen": 246155090, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.7109375, "step": 11400, "time_per_iteration": 2.490413188934326 }, { "auxiliary_loss_clip": 0.011176, "auxiliary_loss_mlp": 0.01036649, "balance_loss_clip": 1.02036476, "balance_loss_mlp": 1.03938293, "epoch": 0.6854652036675184, "flos": 16326212492160.0, "grad_norm": 1.836624453854817, "language_loss": 0.78043127, "learning_rate": 9.509462715294927e-07, "loss": 0.80197376, "num_input_tokens_seen": 246172645, "router_z_loss_clip": 0.16308594, "router_z_loss_mlp": 0.78125, "step": 11401, "time_per_iteration": 2.475350856781006 }, { "auxiliary_loss_clip": 0.01104985, "auxiliary_loss_mlp": 0.01031051, "balance_loss_clip": 1.01917148, "balance_loss_mlp": 1.03560734, "epoch": 0.6855253269201864, "flos": 14940814878720.0, "grad_norm": 2.0288315402347266, "language_loss": 0.75340986, "learning_rate": 9.50614706122786e-07, "loss": 0.77477026, "num_input_tokens_seen": 246189055, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 11402, "time_per_iteration": 2.4370131492614746 }, { "auxiliary_loss_clip": 0.01109731, "auxiliary_loss_mlp": 0.0103403, "balance_loss_clip": 1.02058876, "balance_loss_mlp": 1.03756523, "epoch": 0.6855854501728543, "flos": 23037736826880.0, "grad_norm": 1.8155854279018737, "language_loss": 0.72839236, "learning_rate": 9.502831805088742e-07, "loss": 0.74983001, "num_input_tokens_seen": 246207990, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 11403, "time_per_iteration": 2.489625930786133 }, { "auxiliary_loss_clip": 0.01105592, "auxiliary_loss_mlp": 0.01032663, "balance_loss_clip": 1.01991367, "balance_loss_mlp": 1.03657711, "epoch": 0.6856455734255223, "flos": 13253623194240.0, "grad_norm": 2.358829601723102, "language_loss": 0.8128897, "learning_rate": 9.499516947003294e-07, "loss": 0.83427227, "num_input_tokens_seen": 246221595, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 11404, "time_per_iteration": 2.470682144165039 }, { "auxiliary_loss_clip": 0.01108266, "auxiliary_loss_mlp": 0.01033701, "balance_loss_clip": 1.02113676, "balance_loss_mlp": 1.03858685, "epoch": 0.6857056966781903, "flos": 23333461499520.0, "grad_norm": 1.977447314707514, "language_loss": 0.77929449, "learning_rate": 9.496202487097222e-07, "loss": 0.80071414, "num_input_tokens_seen": 246242970, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 11405, "time_per_iteration": 2.528158187866211 }, { "auxiliary_loss_clip": 0.01031491, "auxiliary_loss_mlp": 0.01003741, "balance_loss_clip": 1.00245309, "balance_loss_mlp": 1.00877583, "epoch": 0.6857658199308583, "flos": 61852647784320.0, "grad_norm": 0.8447927972577445, "language_loss": 0.61054927, "learning_rate": 9.492888425496199e-07, "loss": 0.63090158, "num_input_tokens_seen": 246300405, "router_z_loss_clip": 0.01287842, "router_z_loss_mlp": 0.22753906, "step": 11406, "time_per_iteration": 3.1497533321380615 }, { "auxiliary_loss_clip": 0.01106642, "auxiliary_loss_mlp": 0.01031817, "balance_loss_clip": 1.0176791, "balance_loss_mlp": 1.03560674, "epoch": 0.6858259431835262, "flos": 16654543735680.0, "grad_norm": 1.749746381086764, "language_loss": 0.77064711, "learning_rate": 9.489574762325907e-07, "loss": 0.79203176, "num_input_tokens_seen": 246318780, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7109375, "step": 11407, "time_per_iteration": 2.4566848278045654 }, { "auxiliary_loss_clip": 0.01110005, "auxiliary_loss_mlp": 0.01035312, "balance_loss_clip": 1.02151334, "balance_loss_mlp": 1.03772271, "epoch": 0.6858860664361942, "flos": 21872974504320.0, "grad_norm": 3.9434374220222623, "language_loss": 0.71001232, "learning_rate": 9.486261497711991e-07, "loss": 0.73146546, "num_input_tokens_seen": 246339405, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.72265625, "step": 11408, "time_per_iteration": 2.5000545978546143 }, { "auxiliary_loss_clip": 0.01109807, "auxiliary_loss_mlp": 0.01028396, "balance_loss_clip": 1.01503849, "balance_loss_mlp": 1.03666151, "epoch": 0.6859461896888621, "flos": 15267637751040.0, "grad_norm": 1.9498480639985294, "language_loss": 0.6986106, "learning_rate": 9.482948631780087e-07, "loss": 0.71999264, "num_input_tokens_seen": 246357055, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 11409, "time_per_iteration": 2.466468572616577 }, { "auxiliary_loss_clip": 0.01103211, "auxiliary_loss_mlp": 0.01027921, "balance_loss_clip": 1.01623297, "balance_loss_mlp": 1.03713369, "epoch": 0.6860063129415301, "flos": 18620293392000.0, "grad_norm": 1.5855722393579326, "language_loss": 0.78205949, "learning_rate": 9.479636164655825e-07, "loss": 0.80337083, "num_input_tokens_seen": 246374050, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6640625, "step": 11410, "time_per_iteration": 2.4663662910461426 }, { "auxiliary_loss_clip": 0.01109878, "auxiliary_loss_mlp": 0.01032542, "balance_loss_clip": 1.01863027, "balance_loss_mlp": 1.0360074, "epoch": 0.6860664361941982, "flos": 23951376190080.0, "grad_norm": 2.292164283927493, "language_loss": 0.71672511, "learning_rate": 9.476324096464821e-07, "loss": 0.73814929, "num_input_tokens_seen": 246392910, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.73828125, "step": 11411, "time_per_iteration": 2.520991325378418 }, { "auxiliary_loss_clip": 0.01109136, "auxiliary_loss_mlp": 0.01030611, "balance_loss_clip": 1.01714647, "balance_loss_mlp": 1.03782511, "epoch": 0.6861265594468661, "flos": 20407782827520.0, "grad_norm": 1.9094605114540268, "language_loss": 0.7035259, "learning_rate": 9.473012427332654e-07, "loss": 0.72492331, "num_input_tokens_seen": 246411540, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 11412, "time_per_iteration": 2.485689878463745 }, { "auxiliary_loss_clip": 0.01108419, "auxiliary_loss_mlp": 0.01028934, "balance_loss_clip": 1.01581502, "balance_loss_mlp": 1.03787434, "epoch": 0.6861866826995341, "flos": 11428571111040.0, "grad_norm": 2.945440205143658, "language_loss": 0.71482217, "learning_rate": 9.469701157384919e-07, "loss": 0.7361958, "num_input_tokens_seen": 246423295, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 11413, "time_per_iteration": 2.466501235961914 }, { "auxiliary_loss_clip": 0.01107314, "auxiliary_loss_mlp": 0.01032252, "balance_loss_clip": 1.01992583, "balance_loss_mlp": 1.03668523, "epoch": 0.686246805952202, "flos": 15997593939840.0, "grad_norm": 1.8794009096366198, "language_loss": 0.73567748, "learning_rate": 9.466390286747164e-07, "loss": 0.75707316, "num_input_tokens_seen": 246441045, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 11414, "time_per_iteration": 2.495603561401367 }, { "auxiliary_loss_clip": 0.01111899, "auxiliary_loss_mlp": 0.01029197, "balance_loss_clip": 1.01603091, "balance_loss_mlp": 1.03948832, "epoch": 0.68630692920487, "flos": 19826712512640.0, "grad_norm": 2.4419362296286518, "language_loss": 0.86764103, "learning_rate": 9.46307981554495e-07, "loss": 0.88905203, "num_input_tokens_seen": 246456905, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 11415, "time_per_iteration": 2.472893238067627 }, { "auxiliary_loss_clip": 0.01110904, "auxiliary_loss_mlp": 0.01033701, "balance_loss_clip": 1.02037919, "balance_loss_mlp": 1.03805113, "epoch": 0.6863670524575379, "flos": 26286216048000.0, "grad_norm": 1.6933877756342963, "language_loss": 0.66956431, "learning_rate": 9.459769743903801e-07, "loss": 0.69101036, "num_input_tokens_seen": 246477545, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 11416, "time_per_iteration": 2.556488275527954 }, { "auxiliary_loss_clip": 0.01107252, "auxiliary_loss_mlp": 0.01032883, "balance_loss_clip": 1.02033067, "balance_loss_mlp": 1.0368706, "epoch": 0.686427175710206, "flos": 19173138595200.0, "grad_norm": 1.5773686310696566, "language_loss": 0.76180542, "learning_rate": 9.456460071949237e-07, "loss": 0.78320682, "num_input_tokens_seen": 246496705, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 11417, "time_per_iteration": 2.4993557929992676 }, { "auxiliary_loss_clip": 0.01108901, "auxiliary_loss_mlp": 0.01028995, "balance_loss_clip": 1.01641822, "balance_loss_mlp": 1.03821373, "epoch": 0.6864872989628739, "flos": 18916628595840.0, "grad_norm": 1.7433212700525926, "language_loss": 0.77432168, "learning_rate": 9.45315079980678e-07, "loss": 0.79570067, "num_input_tokens_seen": 246514860, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 11418, "time_per_iteration": 2.525848865509033 }, { "auxiliary_loss_clip": 0.01106948, "auxiliary_loss_mlp": 0.01029109, "balance_loss_clip": 1.01702666, "balance_loss_mlp": 1.03710222, "epoch": 0.6865474222155419, "flos": 25956196865280.0, "grad_norm": 1.845631955057709, "language_loss": 0.76355612, "learning_rate": 9.449841927601887e-07, "loss": 0.7849167, "num_input_tokens_seen": 246536145, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69921875, "step": 11419, "time_per_iteration": 2.5200443267822266 }, { "auxiliary_loss_clip": 0.01105649, "auxiliary_loss_mlp": 0.0103135, "balance_loss_clip": 1.01932812, "balance_loss_mlp": 1.03608918, "epoch": 0.6866075454682098, "flos": 18478087447680.0, "grad_norm": 2.908540635801603, "language_loss": 0.71777982, "learning_rate": 9.446533455460044e-07, "loss": 0.73914981, "num_input_tokens_seen": 246553265, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 11420, "time_per_iteration": 3.7813560962677 }, { "auxiliary_loss_clip": 0.01104537, "auxiliary_loss_mlp": 0.01023626, "balance_loss_clip": 1.01197934, "balance_loss_mlp": 1.03528905, "epoch": 0.6866676687208778, "flos": 34239998298240.0, "grad_norm": 1.5512786049973764, "language_loss": 0.74540418, "learning_rate": 9.443225383506712e-07, "loss": 0.76668578, "num_input_tokens_seen": 246575130, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6953125, "step": 11421, "time_per_iteration": 2.5743215084075928 }, { "auxiliary_loss_clip": 0.01103801, "auxiliary_loss_mlp": 0.01030464, "balance_loss_clip": 1.01791131, "balance_loss_mlp": 1.03566098, "epoch": 0.6867277919735457, "flos": 21721754246400.0, "grad_norm": 1.9435591756560326, "language_loss": 0.7731483, "learning_rate": 9.439917711867338e-07, "loss": 0.79449093, "num_input_tokens_seen": 246593095, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 11422, "time_per_iteration": 3.9630112648010254 }, { "auxiliary_loss_clip": 0.01108517, "auxiliary_loss_mlp": 0.01036569, "balance_loss_clip": 1.02378988, "balance_loss_mlp": 1.03769481, "epoch": 0.6867879152262137, "flos": 24097999507200.0, "grad_norm": 1.8078680336682924, "language_loss": 0.77059865, "learning_rate": 9.436610440667334e-07, "loss": 0.79204947, "num_input_tokens_seen": 246612165, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 11423, "time_per_iteration": 2.4956119060516357 }, { "auxiliary_loss_clip": 0.01110457, "auxiliary_loss_mlp": 0.01028858, "balance_loss_clip": 1.01557779, "balance_loss_mlp": 1.03930378, "epoch": 0.6868480384788818, "flos": 21615818060160.0, "grad_norm": 1.4995708509692869, "language_loss": 0.72647387, "learning_rate": 9.433303570032129e-07, "loss": 0.74786699, "num_input_tokens_seen": 246632065, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 11424, "time_per_iteration": 3.9073362350463867 }, { "auxiliary_loss_clip": 0.01107964, "auxiliary_loss_mlp": 0.01026883, "balance_loss_clip": 1.01462865, "balance_loss_mlp": 1.03728557, "epoch": 0.6869081617315497, "flos": 26286144220800.0, "grad_norm": 1.9252633960139767, "language_loss": 0.65004051, "learning_rate": 9.429997100087112e-07, "loss": 0.67138898, "num_input_tokens_seen": 246651245, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.70703125, "step": 11425, "time_per_iteration": 4.022274017333984 }, { "auxiliary_loss_clip": 0.01106074, "auxiliary_loss_mlp": 0.01024942, "balance_loss_clip": 1.01239502, "balance_loss_mlp": 1.03805244, "epoch": 0.6869682849842177, "flos": 21105096531840.0, "grad_norm": 1.4826073273093692, "language_loss": 0.71969157, "learning_rate": 9.426691030957657e-07, "loss": 0.74100173, "num_input_tokens_seen": 246672225, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 11426, "time_per_iteration": 2.4864938259124756 }, { "auxiliary_loss_clip": 0.01106797, "auxiliary_loss_mlp": 0.01025243, "balance_loss_clip": 1.01301765, "balance_loss_mlp": 1.03712714, "epoch": 0.6870284082368856, "flos": 17092653920640.0, "grad_norm": 2.070465443441417, "language_loss": 0.84989911, "learning_rate": 9.423385362769136e-07, "loss": 0.87121952, "num_input_tokens_seen": 246688385, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 11427, "time_per_iteration": 2.4440536499023438 }, { "auxiliary_loss_clip": 0.01105402, "auxiliary_loss_mlp": 0.01029703, "balance_loss_clip": 1.01752007, "balance_loss_mlp": 1.03680897, "epoch": 0.6870885314895536, "flos": 27308090067840.0, "grad_norm": 1.5082568300549288, "language_loss": 0.76211202, "learning_rate": 9.420080095646909e-07, "loss": 0.78346306, "num_input_tokens_seen": 246710730, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 11428, "time_per_iteration": 2.523721694946289 }, { "auxiliary_loss_clip": 0.01109415, "auxiliary_loss_mlp": 0.01035783, "balance_loss_clip": 1.02165055, "balance_loss_mlp": 1.03699517, "epoch": 0.6871486547422215, "flos": 20814543417600.0, "grad_norm": 1.8423163334397372, "language_loss": 0.72935891, "learning_rate": 9.4167752297163e-07, "loss": 0.75081092, "num_input_tokens_seen": 246730350, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7265625, "step": 11429, "time_per_iteration": 2.4921765327453613 }, { "auxiliary_loss_clip": 0.01109006, "auxiliary_loss_mlp": 0.01025825, "balance_loss_clip": 1.01384449, "balance_loss_mlp": 1.03846538, "epoch": 0.6872087779948896, "flos": 30154118330880.0, "grad_norm": 1.9866055750506793, "language_loss": 0.83089054, "learning_rate": 9.413470765102643e-07, "loss": 0.85223883, "num_input_tokens_seen": 246751700, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 11430, "time_per_iteration": 2.5618045330047607 }, { "auxiliary_loss_clip": 0.01105468, "auxiliary_loss_mlp": 0.01031156, "balance_loss_clip": 1.0188055, "balance_loss_mlp": 1.03592992, "epoch": 0.6872689012475575, "flos": 20704584908160.0, "grad_norm": 2.188071909789964, "language_loss": 0.69814277, "learning_rate": 9.410166701931225e-07, "loss": 0.71950895, "num_input_tokens_seen": 246769860, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 11431, "time_per_iteration": 2.4917168617248535 }, { "auxiliary_loss_clip": 0.01107675, "auxiliary_loss_mlp": 0.01026573, "balance_loss_clip": 1.0142529, "balance_loss_mlp": 1.03632331, "epoch": 0.6873290245002255, "flos": 25520852027520.0, "grad_norm": 1.8898576641958733, "language_loss": 0.80225074, "learning_rate": 9.406863040327355e-07, "loss": 0.82359326, "num_input_tokens_seen": 246789905, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71484375, "step": 11432, "time_per_iteration": 2.5151402950286865 }, { "auxiliary_loss_clip": 0.01103137, "auxiliary_loss_mlp": 0.01027789, "balance_loss_clip": 1.0160526, "balance_loss_mlp": 1.03623402, "epoch": 0.6873891477528934, "flos": 25191479289600.0, "grad_norm": 1.4836220456832678, "language_loss": 0.68044353, "learning_rate": 9.403559780416295e-07, "loss": 0.70175278, "num_input_tokens_seen": 246808815, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.671875, "step": 11433, "time_per_iteration": 2.5383527278900146 }, { "auxiliary_loss_clip": 0.01109058, "auxiliary_loss_mlp": 0.01036444, "balance_loss_clip": 1.02418399, "balance_loss_mlp": 1.03972149, "epoch": 0.6874492710055614, "flos": 35152380685440.0, "grad_norm": 2.1292400992490923, "language_loss": 0.73272097, "learning_rate": 9.400256922323309e-07, "loss": 0.75417602, "num_input_tokens_seen": 246829775, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 11434, "time_per_iteration": 2.6066670417785645 }, { "auxiliary_loss_clip": 0.01107969, "auxiliary_loss_mlp": 0.01029463, "balance_loss_clip": 1.0167377, "balance_loss_mlp": 1.03888822, "epoch": 0.6875093942582293, "flos": 17822215059840.0, "grad_norm": 1.8439992034155757, "language_loss": 0.80191886, "learning_rate": 9.396954466173657e-07, "loss": 0.82329321, "num_input_tokens_seen": 246848045, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 11435, "time_per_iteration": 2.4736509323120117 }, { "auxiliary_loss_clip": 0.01107898, "auxiliary_loss_mlp": 0.01032667, "balance_loss_clip": 1.01941061, "balance_loss_mlp": 1.03668857, "epoch": 0.6875695175108973, "flos": 20704548994560.0, "grad_norm": 2.3676624155654657, "language_loss": 0.80777401, "learning_rate": 9.393652412092538e-07, "loss": 0.82917964, "num_input_tokens_seen": 246866095, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 11436, "time_per_iteration": 2.482677459716797 }, { "auxiliary_loss_clip": 0.01101345, "auxiliary_loss_mlp": 0.01026352, "balance_loss_clip": 1.01586747, "balance_loss_mlp": 1.03606844, "epoch": 0.6876296407635654, "flos": 25374013228800.0, "grad_norm": 1.7728618555693785, "language_loss": 0.81960195, "learning_rate": 9.390350760205183e-07, "loss": 0.8408789, "num_input_tokens_seen": 246883975, "router_z_loss_clip": 0.10498047, "router_z_loss_mlp": 0.65625, "step": 11437, "time_per_iteration": 2.5549371242523193 }, { "auxiliary_loss_clip": 0.01115621, "auxiliary_loss_mlp": 0.01037866, "balance_loss_clip": 1.02403212, "balance_loss_mlp": 1.04003894, "epoch": 0.6876897640162333, "flos": 23222317841280.0, "grad_norm": 3.0363100721010152, "language_loss": 0.78155482, "learning_rate": 9.387049510636793e-07, "loss": 0.80308974, "num_input_tokens_seen": 246901560, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7578125, "step": 11438, "time_per_iteration": 2.5670888423919678 }, { "auxiliary_loss_clip": 0.01101391, "auxiliary_loss_mlp": 0.01029006, "balance_loss_clip": 1.01681113, "balance_loss_mlp": 1.03576529, "epoch": 0.6877498872689013, "flos": 27124335066240.0, "grad_norm": 1.620459647208661, "language_loss": 0.72423792, "learning_rate": 9.383748663512554e-07, "loss": 0.74554187, "num_input_tokens_seen": 246922655, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.65625, "step": 11439, "time_per_iteration": 2.5485408306121826 }, { "auxiliary_loss_clip": 0.01106609, "auxiliary_loss_mlp": 0.01026865, "balance_loss_clip": 1.01431799, "balance_loss_mlp": 1.03721023, "epoch": 0.6878100105215692, "flos": 11581658876160.0, "grad_norm": 2.1129087799855175, "language_loss": 0.75503993, "learning_rate": 9.380448218957623e-07, "loss": 0.7763747, "num_input_tokens_seen": 246940100, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 11440, "time_per_iteration": 2.4382894039154053 }, { "auxiliary_loss_clip": 0.01103606, "auxiliary_loss_mlp": 0.01031508, "balance_loss_clip": 1.01961672, "balance_loss_mlp": 1.03506565, "epoch": 0.6878701337742372, "flos": 20303175444480.0, "grad_norm": 2.144302356180433, "language_loss": 0.72539604, "learning_rate": 9.377148177097167e-07, "loss": 0.74674714, "num_input_tokens_seen": 246958545, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 11441, "time_per_iteration": 2.47316312789917 }, { "auxiliary_loss_clip": 0.01110534, "auxiliary_loss_mlp": 0.01031638, "balance_loss_clip": 1.01785183, "balance_loss_mlp": 1.03790319, "epoch": 0.6879302570269051, "flos": 13840080549120.0, "grad_norm": 1.676254659257239, "language_loss": 0.66232783, "learning_rate": 9.373848538056317e-07, "loss": 0.68374956, "num_input_tokens_seen": 246974805, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7265625, "step": 11442, "time_per_iteration": 2.459696054458618 }, { "auxiliary_loss_clip": 0.01107303, "auxiliary_loss_mlp": 0.01029538, "balance_loss_clip": 1.01771879, "balance_loss_mlp": 1.03827178, "epoch": 0.6879903802795732, "flos": 21324654414720.0, "grad_norm": 2.0309908701379658, "language_loss": 0.6921196, "learning_rate": 9.370549301960189e-07, "loss": 0.71348798, "num_input_tokens_seen": 246992505, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 11443, "time_per_iteration": 2.479602813720703 }, { "auxiliary_loss_clip": 0.01111228, "auxiliary_loss_mlp": 0.01032679, "balance_loss_clip": 1.01950669, "balance_loss_mlp": 1.03975606, "epoch": 0.6880505035322411, "flos": 25152049134720.0, "grad_norm": 1.4718630269971016, "language_loss": 0.76349372, "learning_rate": 9.367250468933893e-07, "loss": 0.78493273, "num_input_tokens_seen": 247013370, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 11444, "time_per_iteration": 2.4970717430114746 }, { "auxiliary_loss_clip": 0.01104338, "auxiliary_loss_mlp": 0.01031468, "balance_loss_clip": 1.01938653, "balance_loss_mlp": 1.03597021, "epoch": 0.6881106267849091, "flos": 23215530170880.0, "grad_norm": 4.309891614374378, "language_loss": 0.76682645, "learning_rate": 9.363952039102536e-07, "loss": 0.78818452, "num_input_tokens_seen": 247029855, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 11445, "time_per_iteration": 2.462977886199951 }, { "auxiliary_loss_clip": 0.01033036, "auxiliary_loss_mlp": 0.00999162, "balance_loss_clip": 0.99804705, "balance_loss_mlp": 1.01003814, "epoch": 0.688170750037577, "flos": 48484397312640.0, "grad_norm": 0.8247120323586611, "language_loss": 0.58405006, "learning_rate": 9.360654012591183e-07, "loss": 0.60437202, "num_input_tokens_seen": 247085030, "router_z_loss_clip": 0.01116943, "router_z_loss_mlp": 0.23046875, "step": 11446, "time_per_iteration": 3.1224071979522705 }, { "auxiliary_loss_clip": 0.01107351, "auxiliary_loss_mlp": 0.01030938, "balance_loss_clip": 1.01796794, "balance_loss_mlp": 1.03536153, "epoch": 0.688230873290245, "flos": 22783633038720.0, "grad_norm": 3.105864196209497, "language_loss": 0.75995195, "learning_rate": 9.357356389524886e-07, "loss": 0.78133482, "num_input_tokens_seen": 247104840, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 11447, "time_per_iteration": 2.4896886348724365 }, { "auxiliary_loss_clip": 0.01107433, "auxiliary_loss_mlp": 0.01030482, "balance_loss_clip": 1.01790524, "balance_loss_mlp": 1.03646266, "epoch": 0.6882909965429129, "flos": 22455660931200.0, "grad_norm": 2.2622182954814636, "language_loss": 0.73068941, "learning_rate": 9.354059170028705e-07, "loss": 0.75206858, "num_input_tokens_seen": 247121905, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 11448, "time_per_iteration": 2.462573528289795 }, { "auxiliary_loss_clip": 0.01109262, "auxiliary_loss_mlp": 0.0103163, "balance_loss_clip": 1.01837373, "balance_loss_mlp": 1.03640509, "epoch": 0.688351119795581, "flos": 26214143408640.0, "grad_norm": 1.7342815799299744, "language_loss": 0.75031257, "learning_rate": 9.350762354227673e-07, "loss": 0.77172148, "num_input_tokens_seen": 247142375, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 11449, "time_per_iteration": 2.5237550735473633 }, { "auxiliary_loss_clip": 0.01104154, "auxiliary_loss_mlp": 0.01030534, "balance_loss_clip": 1.01894689, "balance_loss_mlp": 1.03625774, "epoch": 0.6884112430482489, "flos": 22565260304640.0, "grad_norm": 2.3717518800534902, "language_loss": 0.70259941, "learning_rate": 9.34746594224679e-07, "loss": 0.72394633, "num_input_tokens_seen": 247161095, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 11450, "time_per_iteration": 2.531675338745117 }, { "auxiliary_loss_clip": 0.01110602, "auxiliary_loss_mlp": 0.01033262, "balance_loss_clip": 1.01954734, "balance_loss_mlp": 1.03676403, "epoch": 0.6884713663009169, "flos": 17341047446400.0, "grad_norm": 9.534911555310945, "language_loss": 0.75839579, "learning_rate": 9.344169934211068e-07, "loss": 0.77983439, "num_input_tokens_seen": 247178565, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73828125, "step": 11451, "time_per_iteration": 2.5293076038360596 }, { "auxiliary_loss_clip": 0.0110756, "auxiliary_loss_mlp": 0.01027218, "balance_loss_clip": 1.0147543, "balance_loss_mlp": 1.03644657, "epoch": 0.6885314895535849, "flos": 26470832976000.0, "grad_norm": 73.51499450548802, "language_loss": 0.69588202, "learning_rate": 9.340874330245505e-07, "loss": 0.71722984, "num_input_tokens_seen": 247202345, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7109375, "step": 11452, "time_per_iteration": 2.593148708343506 }, { "auxiliary_loss_clip": 0.01105773, "auxiliary_loss_mlp": 0.01033263, "balance_loss_clip": 1.01928008, "balance_loss_mlp": 1.03636634, "epoch": 0.6885916128062528, "flos": 20521548178560.0, "grad_norm": 1.789387133659592, "language_loss": 0.71784532, "learning_rate": 9.337579130475042e-07, "loss": 0.73923564, "num_input_tokens_seen": 247219240, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.6953125, "step": 11453, "time_per_iteration": 2.507223606109619 }, { "auxiliary_loss_clip": 0.01032537, "auxiliary_loss_mlp": 0.01001499, "balance_loss_clip": 1.00022948, "balance_loss_mlp": 1.0095799, "epoch": 0.6886517360589208, "flos": 70715795679360.0, "grad_norm": 0.7871704124410601, "language_loss": 0.50713658, "learning_rate": 9.334284335024644e-07, "loss": 0.52747703, "num_input_tokens_seen": 247272010, "router_z_loss_clip": 0.01269531, "router_z_loss_mlp": 0.22949219, "step": 11454, "time_per_iteration": 2.9675393104553223 }, { "auxiliary_loss_clip": 0.01103997, "auxiliary_loss_mlp": 0.01031708, "balance_loss_clip": 1.01937556, "balance_loss_mlp": 1.03716683, "epoch": 0.6887118593115887, "flos": 17893533513600.0, "grad_norm": 2.0529814215112334, "language_loss": 0.75953001, "learning_rate": 9.330989944019263e-07, "loss": 0.78088707, "num_input_tokens_seen": 247290630, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.66796875, "step": 11455, "time_per_iteration": 2.4968721866607666 }, { "auxiliary_loss_clip": 0.01107544, "auxiliary_loss_mlp": 0.01034334, "balance_loss_clip": 1.02113748, "balance_loss_mlp": 1.0353471, "epoch": 0.6887719825642568, "flos": 17453017117440.0, "grad_norm": 2.9801078287482565, "language_loss": 0.73090541, "learning_rate": 9.327695957583803e-07, "loss": 0.7523241, "num_input_tokens_seen": 247304800, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 11456, "time_per_iteration": 2.529562473297119 }, { "auxiliary_loss_clip": 0.01105171, "auxiliary_loss_mlp": 0.01032388, "balance_loss_clip": 1.02024603, "balance_loss_mlp": 1.03750992, "epoch": 0.6888321058169247, "flos": 23070199743360.0, "grad_norm": 1.8802746665954255, "language_loss": 0.80918509, "learning_rate": 9.32440237584319e-07, "loss": 0.83056068, "num_input_tokens_seen": 247323450, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6796875, "step": 11457, "time_per_iteration": 2.501974105834961 }, { "auxiliary_loss_clip": 0.01111547, "auxiliary_loss_mlp": 0.01031906, "balance_loss_clip": 1.01879883, "balance_loss_mlp": 1.03951657, "epoch": 0.6888922290695927, "flos": 23368833417600.0, "grad_norm": 1.696695563129203, "language_loss": 0.7635653, "learning_rate": 9.321109198922301e-07, "loss": 0.78499985, "num_input_tokens_seen": 247343845, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 11458, "time_per_iteration": 2.49700665473938 }, { "auxiliary_loss_clip": 0.01107196, "auxiliary_loss_mlp": 0.01030963, "balance_loss_clip": 1.01895225, "balance_loss_mlp": 1.03733766, "epoch": 0.6889523523222606, "flos": 17631636474240.0, "grad_norm": 3.6641350286973142, "language_loss": 0.68420261, "learning_rate": 9.31781642694603e-07, "loss": 0.70558417, "num_input_tokens_seen": 247356650, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69921875, "step": 11459, "time_per_iteration": 2.443974256515503 }, { "auxiliary_loss_clip": 0.0110809, "auxiliary_loss_mlp": 0.0102982, "balance_loss_clip": 1.01826239, "balance_loss_mlp": 1.03812718, "epoch": 0.6890124755749286, "flos": 25228144097280.0, "grad_norm": 1.7377828735563214, "language_loss": 0.68667263, "learning_rate": 9.314524060039221e-07, "loss": 0.70805174, "num_input_tokens_seen": 247377340, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.69921875, "step": 11460, "time_per_iteration": 2.5197598934173584 }, { "auxiliary_loss_clip": 0.01111373, "auxiliary_loss_mlp": 0.0103339, "balance_loss_clip": 1.01935351, "balance_loss_mlp": 1.03652632, "epoch": 0.6890725988275965, "flos": 20230240878720.0, "grad_norm": 1.8571776890567553, "language_loss": 0.7706989, "learning_rate": 9.311232098326731e-07, "loss": 0.79214656, "num_input_tokens_seen": 247395805, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75, "step": 11461, "time_per_iteration": 2.493889093399048 }, { "auxiliary_loss_clip": 0.01107576, "auxiliary_loss_mlp": 0.01032708, "balance_loss_clip": 1.01993465, "balance_loss_mlp": 1.0378952, "epoch": 0.6891327220802645, "flos": 14535311264640.0, "grad_norm": 1.9939230122302078, "language_loss": 0.69665885, "learning_rate": 9.307940541933401e-07, "loss": 0.71806169, "num_input_tokens_seen": 247413165, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 11462, "time_per_iteration": 3.9050729274749756 }, { "auxiliary_loss_clip": 0.01108795, "auxiliary_loss_mlp": 0.01027182, "balance_loss_clip": 1.01460516, "balance_loss_mlp": 1.03757882, "epoch": 0.6891928453329325, "flos": 21139139646720.0, "grad_norm": 1.5876447816275354, "language_loss": 0.87000215, "learning_rate": 9.304649390984034e-07, "loss": 0.89136189, "num_input_tokens_seen": 247433140, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 11463, "time_per_iteration": 2.5119800567626953 }, { "auxiliary_loss_clip": 0.01102986, "auxiliary_loss_mlp": 0.01029152, "balance_loss_clip": 1.01786304, "balance_loss_mlp": 1.03689909, "epoch": 0.6892529685856005, "flos": 17858520731520.0, "grad_norm": 1.6141355467526606, "language_loss": 0.685516, "learning_rate": 9.301358645603428e-07, "loss": 0.70683736, "num_input_tokens_seen": 247451265, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.66015625, "step": 11464, "time_per_iteration": 3.9340484142303467 }, { "auxiliary_loss_clip": 0.01107195, "auxiliary_loss_mlp": 0.01035092, "balance_loss_clip": 1.02261651, "balance_loss_mlp": 1.03777742, "epoch": 0.6893130918382685, "flos": 29934811843200.0, "grad_norm": 1.8002170819505463, "language_loss": 0.64839977, "learning_rate": 9.298068305916373e-07, "loss": 0.66982263, "num_input_tokens_seen": 247471645, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 11465, "time_per_iteration": 4.011198043823242 }, { "auxiliary_loss_clip": 0.01109888, "auxiliary_loss_mlp": 0.01034922, "balance_loss_clip": 1.02239895, "balance_loss_mlp": 1.03793907, "epoch": 0.6893732150909364, "flos": 24388516707840.0, "grad_norm": 1.4660764921209042, "language_loss": 0.7239145, "learning_rate": 9.294778372047649e-07, "loss": 0.74536258, "num_input_tokens_seen": 247491170, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 11466, "time_per_iteration": 3.9702131748199463 }, { "auxiliary_loss_clip": 0.01107208, "auxiliary_loss_mlp": 0.01031396, "balance_loss_clip": 1.01904023, "balance_loss_mlp": 1.03679228, "epoch": 0.6894333383436044, "flos": 16982874979200.0, "grad_norm": 2.003176436029408, "language_loss": 0.72407842, "learning_rate": 9.291488844121995e-07, "loss": 0.74546444, "num_input_tokens_seen": 247509005, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 11467, "time_per_iteration": 2.4983019828796387 }, { "auxiliary_loss_clip": 0.01108881, "auxiliary_loss_mlp": 0.0103734, "balance_loss_clip": 1.02312422, "balance_loss_mlp": 1.03613448, "epoch": 0.6894934615962723, "flos": 18985540838400.0, "grad_norm": 1.9763998029152374, "language_loss": 0.81008244, "learning_rate": 9.288199722264156e-07, "loss": 0.83154464, "num_input_tokens_seen": 247527050, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7265625, "step": 11468, "time_per_iteration": 2.4782116413116455 }, { "auxiliary_loss_clip": 0.01110242, "auxiliary_loss_mlp": 0.01037485, "balance_loss_clip": 1.02458096, "balance_loss_mlp": 1.03818893, "epoch": 0.6895535848489404, "flos": 34531664734080.0, "grad_norm": 1.4794751879631582, "language_loss": 0.65971255, "learning_rate": 9.284911006598875e-07, "loss": 0.68118984, "num_input_tokens_seen": 247547765, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 11469, "time_per_iteration": 2.6263999938964844 }, { "auxiliary_loss_clip": 0.01032474, "auxiliary_loss_mlp": 0.01002302, "balance_loss_clip": 1.00112748, "balance_loss_mlp": 1.00977612, "epoch": 0.6896137081016083, "flos": 50075852273280.0, "grad_norm": 0.8127171109223372, "language_loss": 0.55279517, "learning_rate": 9.281622697250824e-07, "loss": 0.57314301, "num_input_tokens_seen": 247603515, "router_z_loss_clip": 0.01171875, "router_z_loss_mlp": 0.2265625, "step": 11470, "time_per_iteration": 2.995250940322876 }, { "auxiliary_loss_clip": 0.0110337, "auxiliary_loss_mlp": 0.01032689, "balance_loss_clip": 1.02182353, "balance_loss_mlp": 1.03714275, "epoch": 0.6896738313542763, "flos": 19938215306880.0, "grad_norm": 1.9738222463542718, "language_loss": 0.77971047, "learning_rate": 9.278334794344715e-07, "loss": 0.80107105, "num_input_tokens_seen": 247622110, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.6640625, "step": 11471, "time_per_iteration": 2.5188634395599365 }, { "auxiliary_loss_clip": 0.01106087, "auxiliary_loss_mlp": 0.01031854, "balance_loss_clip": 1.01938462, "balance_loss_mlp": 1.03624845, "epoch": 0.6897339546069442, "flos": 21725489260800.0, "grad_norm": 1.8172878886273136, "language_loss": 0.78464496, "learning_rate": 9.275047298005232e-07, "loss": 0.80602437, "num_input_tokens_seen": 247641905, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 11472, "time_per_iteration": 2.575376033782959 }, { "auxiliary_loss_clip": 0.01103494, "auxiliary_loss_mlp": 0.01031784, "balance_loss_clip": 1.02019644, "balance_loss_mlp": 1.03477716, "epoch": 0.6897940778596122, "flos": 19826497031040.0, "grad_norm": 1.7757583814383615, "language_loss": 0.76412863, "learning_rate": 9.271760208357024e-07, "loss": 0.78548139, "num_input_tokens_seen": 247660945, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6875, "step": 11473, "time_per_iteration": 2.5217020511627197 }, { "auxiliary_loss_clip": 0.0110959, "auxiliary_loss_mlp": 0.01034217, "balance_loss_clip": 1.02127707, "balance_loss_mlp": 1.03775358, "epoch": 0.6898542011122801, "flos": 17310056987520.0, "grad_norm": 5.333658191951938, "language_loss": 0.75970709, "learning_rate": 9.268473525524751e-07, "loss": 0.7811451, "num_input_tokens_seen": 247678395, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 11474, "time_per_iteration": 2.539232015609741 }, { "auxiliary_loss_clip": 0.01108556, "auxiliary_loss_mlp": 0.01034249, "balance_loss_clip": 1.02135062, "balance_loss_mlp": 1.03844464, "epoch": 0.6899143243649482, "flos": 24754051463040.0, "grad_norm": 2.3274002141678976, "language_loss": 0.74260545, "learning_rate": 9.26518724963303e-07, "loss": 0.7640335, "num_input_tokens_seen": 247698380, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 11475, "time_per_iteration": 2.5549674034118652 }, { "auxiliary_loss_clip": 0.01106052, "auxiliary_loss_mlp": 0.01033763, "balance_loss_clip": 1.02109742, "balance_loss_mlp": 1.03589249, "epoch": 0.6899744476176161, "flos": 17234536642560.0, "grad_norm": 2.2057740657012013, "language_loss": 0.88622725, "learning_rate": 9.261901380806491e-07, "loss": 0.90762538, "num_input_tokens_seen": 247716370, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 11476, "time_per_iteration": 2.4877536296844482 }, { "auxiliary_loss_clip": 0.01104826, "auxiliary_loss_mlp": 0.01034129, "balance_loss_clip": 1.0217731, "balance_loss_mlp": 1.03577995, "epoch": 0.6900345708702841, "flos": 25410678036480.0, "grad_norm": 1.468867896867333, "language_loss": 0.70216656, "learning_rate": 9.258615919169724e-07, "loss": 0.72355616, "num_input_tokens_seen": 247737335, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69140625, "step": 11477, "time_per_iteration": 2.548609972000122 }, { "auxiliary_loss_clip": 0.01111925, "auxiliary_loss_mlp": 0.01041049, "balance_loss_clip": 1.02750707, "balance_loss_mlp": 1.0379169, "epoch": 0.6900946941229521, "flos": 23434190213760.0, "grad_norm": 2.5900874170808272, "language_loss": 0.68485641, "learning_rate": 9.255330864847313e-07, "loss": 0.70638615, "num_input_tokens_seen": 247756680, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73828125, "step": 11478, "time_per_iteration": 2.495340585708618 }, { "auxiliary_loss_clip": 0.01109015, "auxiliary_loss_mlp": 0.01032871, "balance_loss_clip": 1.02063441, "balance_loss_mlp": 1.03746259, "epoch": 0.69015481737562, "flos": 17820096157440.0, "grad_norm": 1.8877984194919752, "language_loss": 0.76570845, "learning_rate": 9.252046217963843e-07, "loss": 0.78712732, "num_input_tokens_seen": 247774265, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.71875, "step": 11479, "time_per_iteration": 2.4742300510406494 }, { "auxiliary_loss_clip": 0.01109523, "auxiliary_loss_mlp": 0.01030993, "balance_loss_clip": 1.01809454, "balance_loss_mlp": 1.03876841, "epoch": 0.690214940628288, "flos": 17456500736640.0, "grad_norm": 1.9453776616877636, "language_loss": 0.7917257, "learning_rate": 9.248761978643856e-07, "loss": 0.81313086, "num_input_tokens_seen": 247792395, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 11480, "time_per_iteration": 2.463789463043213 }, { "auxiliary_loss_clip": 0.01107401, "auxiliary_loss_mlp": 0.01033166, "balance_loss_clip": 1.02044606, "balance_loss_mlp": 1.03900981, "epoch": 0.6902750638809559, "flos": 29566691308800.0, "grad_norm": 1.6438100436846328, "language_loss": 0.75795221, "learning_rate": 9.245478147011885e-07, "loss": 0.77935785, "num_input_tokens_seen": 247811985, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 11481, "time_per_iteration": 2.537415027618408 }, { "auxiliary_loss_clip": 0.01106627, "auxiliary_loss_mlp": 0.01032356, "balance_loss_clip": 1.02007139, "balance_loss_mlp": 1.03676224, "epoch": 0.690335187133624, "flos": 25557121785600.0, "grad_norm": 1.9609208342981288, "language_loss": 0.69562364, "learning_rate": 9.24219472319246e-07, "loss": 0.71701348, "num_input_tokens_seen": 247831880, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 11482, "time_per_iteration": 2.515141487121582 }, { "auxiliary_loss_clip": 0.01108028, "auxiliary_loss_mlp": 0.01030554, "balance_loss_clip": 1.01837134, "balance_loss_mlp": 1.03756404, "epoch": 0.6903953103862919, "flos": 22488447070080.0, "grad_norm": 1.5428733248011082, "language_loss": 0.82743692, "learning_rate": 9.238911707310096e-07, "loss": 0.84882271, "num_input_tokens_seen": 247851170, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 11483, "time_per_iteration": 2.4833295345306396 }, { "auxiliary_loss_clip": 0.01107781, "auxiliary_loss_mlp": 0.01030751, "balance_loss_clip": 1.01919317, "balance_loss_mlp": 1.03659773, "epoch": 0.6904554336389599, "flos": 26100521712000.0, "grad_norm": 2.5173929807411706, "language_loss": 0.66157162, "learning_rate": 9.235629099489273e-07, "loss": 0.68295693, "num_input_tokens_seen": 247868950, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.7109375, "step": 11484, "time_per_iteration": 2.545733690261841 }, { "auxiliary_loss_clip": 0.01103972, "auxiliary_loss_mlp": 0.01034914, "balance_loss_clip": 1.02279663, "balance_loss_mlp": 1.03550291, "epoch": 0.6905155568916278, "flos": 31171754545920.0, "grad_norm": 1.9885313171778016, "language_loss": 0.73539615, "learning_rate": 9.232346899854479e-07, "loss": 0.75678504, "num_input_tokens_seen": 247889805, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 11485, "time_per_iteration": 2.5505173206329346 }, { "auxiliary_loss_clip": 0.01109507, "auxiliary_loss_mlp": 0.01039023, "balance_loss_clip": 1.02577305, "balance_loss_mlp": 1.037745, "epoch": 0.6905756801442958, "flos": 17639681120640.0, "grad_norm": 1.8751111996103218, "language_loss": 0.84995055, "learning_rate": 9.22906510853017e-07, "loss": 0.87143582, "num_input_tokens_seen": 247908585, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 11486, "time_per_iteration": 2.5601634979248047 }, { "auxiliary_loss_clip": 0.0110728, "auxiliary_loss_mlp": 0.010347, "balance_loss_clip": 1.02198017, "balance_loss_mlp": 1.03711021, "epoch": 0.6906358033969637, "flos": 22343691260160.0, "grad_norm": 1.5744166866308553, "language_loss": 0.72837454, "learning_rate": 9.225783725640786e-07, "loss": 0.74979436, "num_input_tokens_seen": 247928480, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 11487, "time_per_iteration": 2.4930951595306396 }, { "auxiliary_loss_clip": 0.01031329, "auxiliary_loss_mlp": 0.01002813, "balance_loss_clip": 1.00161481, "balance_loss_mlp": 1.00840306, "epoch": 0.6906959266496318, "flos": 69747789081600.0, "grad_norm": 0.9073293975019875, "language_loss": 0.66753203, "learning_rate": 9.222502751310759e-07, "loss": 0.68787342, "num_input_tokens_seen": 247988855, "router_z_loss_clip": 0.01196289, "router_z_loss_mlp": 0.22949219, "step": 11488, "time_per_iteration": 3.1650314331054688 }, { "auxiliary_loss_clip": 0.01110813, "auxiliary_loss_mlp": 0.010389, "balance_loss_clip": 1.02466083, "balance_loss_mlp": 1.03661203, "epoch": 0.6907560499022997, "flos": 21434253788160.0, "grad_norm": 2.24713352806222, "language_loss": 0.74939585, "learning_rate": 9.219222185664519e-07, "loss": 0.77089298, "num_input_tokens_seen": 248007685, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7421875, "step": 11489, "time_per_iteration": 2.5021841526031494 }, { "auxiliary_loss_clip": 0.01109227, "auxiliary_loss_mlp": 0.01039454, "balance_loss_clip": 1.0258106, "balance_loss_mlp": 1.03768098, "epoch": 0.6908161731549677, "flos": 14392207480320.0, "grad_norm": 2.3026225206404503, "language_loss": 0.62340283, "learning_rate": 9.215942028826445e-07, "loss": 0.64488971, "num_input_tokens_seen": 248025145, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71484375, "step": 11490, "time_per_iteration": 2.4963576793670654 }, { "auxiliary_loss_clip": 0.01107092, "auxiliary_loss_mlp": 0.01028169, "balance_loss_clip": 1.01631367, "balance_loss_mlp": 1.03619707, "epoch": 0.6908762964076357, "flos": 20010970304640.0, "grad_norm": 4.820278239237171, "language_loss": 0.72569615, "learning_rate": 9.212662280920937e-07, "loss": 0.74704874, "num_input_tokens_seen": 248043750, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.7109375, "step": 11491, "time_per_iteration": 2.4998276233673096 }, { "auxiliary_loss_clip": 0.01105006, "auxiliary_loss_mlp": 0.01037143, "balance_loss_clip": 1.02385736, "balance_loss_mlp": 1.03583467, "epoch": 0.6909364196603036, "flos": 28769079853440.0, "grad_norm": 1.4665731213053206, "language_loss": 0.70530152, "learning_rate": 9.20938294207235e-07, "loss": 0.72672307, "num_input_tokens_seen": 248065765, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69140625, "step": 11492, "time_per_iteration": 2.5732550621032715 }, { "auxiliary_loss_clip": 0.01111415, "auxiliary_loss_mlp": 0.01033819, "balance_loss_clip": 1.02027094, "balance_loss_mlp": 1.03781199, "epoch": 0.6909965429129716, "flos": 22528128620160.0, "grad_norm": 1.693984609951376, "language_loss": 0.74651092, "learning_rate": 9.206104012405049e-07, "loss": 0.76796323, "num_input_tokens_seen": 248083810, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73828125, "step": 11493, "time_per_iteration": 2.4982750415802 }, { "auxiliary_loss_clip": 0.01107374, "auxiliary_loss_mlp": 0.01028593, "balance_loss_clip": 1.01608825, "balance_loss_mlp": 1.0380435, "epoch": 0.6910566661656395, "flos": 18405942981120.0, "grad_norm": 1.7187655049705557, "language_loss": 0.74663258, "learning_rate": 9.20282549204336e-07, "loss": 0.76799232, "num_input_tokens_seen": 248103185, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 11494, "time_per_iteration": 2.4617819786071777 }, { "auxiliary_loss_clip": 0.01106077, "auxiliary_loss_mlp": 0.01032845, "balance_loss_clip": 1.02029228, "balance_loss_mlp": 1.03648484, "epoch": 0.6911167894183076, "flos": 30773972355840.0, "grad_norm": 1.6772118899202495, "language_loss": 0.68257141, "learning_rate": 9.19954738111161e-07, "loss": 0.70396066, "num_input_tokens_seen": 248125665, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 11495, "time_per_iteration": 2.5608572959899902 }, { "auxiliary_loss_clip": 0.0110594, "auxiliary_loss_mlp": 0.0103061, "balance_loss_clip": 1.0179981, "balance_loss_mlp": 1.03551435, "epoch": 0.6911769126709755, "flos": 13735724561280.0, "grad_norm": 1.7382671013911568, "language_loss": 0.74143016, "learning_rate": 9.196269679734119e-07, "loss": 0.76279563, "num_input_tokens_seen": 248142545, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 11496, "time_per_iteration": 2.4704604148864746 }, { "auxiliary_loss_clip": 0.01104168, "auxiliary_loss_mlp": 0.01031012, "balance_loss_clip": 1.0191623, "balance_loss_mlp": 1.03493738, "epoch": 0.6912370359236435, "flos": 17566854295680.0, "grad_norm": 1.6894445417876147, "language_loss": 0.79818326, "learning_rate": 9.19299238803515e-07, "loss": 0.81953502, "num_input_tokens_seen": 248160225, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6953125, "step": 11497, "time_per_iteration": 2.4747936725616455 }, { "auxiliary_loss_clip": 0.0110996, "auxiliary_loss_mlp": 0.01034301, "balance_loss_clip": 1.02161145, "balance_loss_mlp": 1.03736746, "epoch": 0.6912971591763114, "flos": 22090772620800.0, "grad_norm": 2.3016297228144995, "language_loss": 0.80562592, "learning_rate": 9.189715506138993e-07, "loss": 0.82706857, "num_input_tokens_seen": 248180430, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 11498, "time_per_iteration": 2.4895853996276855 }, { "auxiliary_loss_clip": 0.01103221, "auxiliary_loss_mlp": 0.01031765, "balance_loss_clip": 1.019135, "balance_loss_mlp": 1.03540564, "epoch": 0.6913572824289794, "flos": 29971476650880.0, "grad_norm": 1.4819680224501806, "language_loss": 0.85861272, "learning_rate": 9.186439034169915e-07, "loss": 0.87996256, "num_input_tokens_seen": 248202365, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 11499, "time_per_iteration": 2.5787806510925293 }, { "auxiliary_loss_clip": 0.01104089, "auxiliary_loss_mlp": 0.01029624, "balance_loss_clip": 1.01713085, "balance_loss_mlp": 1.03590918, "epoch": 0.6914174056816473, "flos": 20448936835200.0, "grad_norm": 1.798353192591592, "language_loss": 0.75544697, "learning_rate": 9.183162972252145e-07, "loss": 0.77678412, "num_input_tokens_seen": 248221750, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 11500, "time_per_iteration": 2.4886157512664795 }, { "auxiliary_loss_clip": 0.01108187, "auxiliary_loss_mlp": 0.01031685, "balance_loss_clip": 1.01876259, "balance_loss_mlp": 1.03690016, "epoch": 0.6914775289343154, "flos": 21282530739840.0, "grad_norm": 1.9258153464916024, "language_loss": 0.76848161, "learning_rate": 9.179887320509921e-07, "loss": 0.78988039, "num_input_tokens_seen": 248239535, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 11501, "time_per_iteration": 2.5198476314544678 }, { "auxiliary_loss_clip": 0.01109386, "auxiliary_loss_mlp": 0.01033918, "balance_loss_clip": 1.02134097, "balance_loss_mlp": 1.03729248, "epoch": 0.6915376521869833, "flos": 23878118401920.0, "grad_norm": 1.9010838982700609, "language_loss": 0.73358643, "learning_rate": 9.176612079067458e-07, "loss": 0.75501943, "num_input_tokens_seen": 248259055, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 11502, "time_per_iteration": 2.4997735023498535 }, { "auxiliary_loss_clip": 0.011099, "auxiliary_loss_mlp": 0.01032713, "balance_loss_clip": 1.0186224, "balance_loss_mlp": 1.03773475, "epoch": 0.6915977754396513, "flos": 11510268595200.0, "grad_norm": 1.9209637489056748, "language_loss": 0.73590386, "learning_rate": 9.173337248048953e-07, "loss": 0.75733, "num_input_tokens_seen": 248276765, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.72265625, "step": 11503, "time_per_iteration": 2.4846794605255127 }, { "auxiliary_loss_clip": 0.01106564, "auxiliary_loss_mlp": 0.01027607, "balance_loss_clip": 1.01536465, "balance_loss_mlp": 1.03685069, "epoch": 0.6916578986923193, "flos": 22601278667520.0, "grad_norm": 1.7575861825286487, "language_loss": 0.77379751, "learning_rate": 9.170062827578575e-07, "loss": 0.79513925, "num_input_tokens_seen": 248295310, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 11504, "time_per_iteration": 3.8954572677612305 }, { "auxiliary_loss_clip": 0.01106356, "auxiliary_loss_mlp": 0.01025996, "balance_loss_clip": 1.01397967, "balance_loss_mlp": 1.03623581, "epoch": 0.6917180219449872, "flos": 23477355383040.0, "grad_norm": 2.1517773279568426, "language_loss": 0.73911041, "learning_rate": 9.166788817780499e-07, "loss": 0.76043391, "num_input_tokens_seen": 248315230, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 11505, "time_per_iteration": 3.945406675338745 }, { "auxiliary_loss_clip": 0.01104803, "auxiliary_loss_mlp": 0.01031604, "balance_loss_clip": 1.01874089, "balance_loss_mlp": 1.03562772, "epoch": 0.6917781451976552, "flos": 23732536579200.0, "grad_norm": 2.285326367404767, "language_loss": 0.87520885, "learning_rate": 9.163515218778886e-07, "loss": 0.89657295, "num_input_tokens_seen": 248332980, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 11506, "time_per_iteration": 2.5382699966430664 }, { "auxiliary_loss_clip": 0.01105913, "auxiliary_loss_mlp": 0.01024064, "balance_loss_clip": 1.01216102, "balance_loss_mlp": 1.03749812, "epoch": 0.6918382684503231, "flos": 31466760946560.0, "grad_norm": 2.2948964700629855, "language_loss": 0.70421749, "learning_rate": 9.160242030697856e-07, "loss": 0.72551727, "num_input_tokens_seen": 248352865, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 11507, "time_per_iteration": 4.109834909439087 }, { "auxiliary_loss_clip": 0.01107751, "auxiliary_loss_mlp": 0.01031765, "balance_loss_clip": 1.01927197, "balance_loss_mlp": 1.03529, "epoch": 0.6918983917029912, "flos": 21650471706240.0, "grad_norm": 1.8089737529925707, "language_loss": 0.77120233, "learning_rate": 9.156969253661538e-07, "loss": 0.79259741, "num_input_tokens_seen": 248371125, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 11508, "time_per_iteration": 3.9753615856170654 }, { "auxiliary_loss_clip": 0.01103743, "auxiliary_loss_mlp": 0.01030532, "balance_loss_clip": 1.01892734, "balance_loss_mlp": 1.0367862, "epoch": 0.6919585149556591, "flos": 25550082720000.0, "grad_norm": 1.8629681543901775, "language_loss": 0.74724865, "learning_rate": 9.153696887794027e-07, "loss": 0.7685914, "num_input_tokens_seen": 248390455, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 11509, "time_per_iteration": 2.5506937503814697 }, { "auxiliary_loss_clip": 0.01108074, "auxiliary_loss_mlp": 0.01031579, "balance_loss_clip": 1.01943791, "balance_loss_mlp": 1.03849649, "epoch": 0.6920186382083271, "flos": 23659781581440.0, "grad_norm": 1.4577583390711442, "language_loss": 0.64117092, "learning_rate": 9.150424933219425e-07, "loss": 0.66256744, "num_input_tokens_seen": 248411305, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 11510, "time_per_iteration": 2.5139148235321045 }, { "auxiliary_loss_clip": 0.01112096, "auxiliary_loss_mlp": 0.01032597, "balance_loss_clip": 1.01903093, "balance_loss_mlp": 1.03868294, "epoch": 0.692078761460995, "flos": 19061959023360.0, "grad_norm": 2.0218276675097906, "language_loss": 0.75414306, "learning_rate": 9.147153390061788e-07, "loss": 0.77558994, "num_input_tokens_seen": 248430190, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 11511, "time_per_iteration": 2.4820427894592285 }, { "auxiliary_loss_clip": 0.01105293, "auxiliary_loss_mlp": 0.01032125, "balance_loss_clip": 1.02076411, "balance_loss_mlp": 1.03665996, "epoch": 0.692138884713663, "flos": 29023291382400.0, "grad_norm": 1.6059805326675651, "language_loss": 0.62832367, "learning_rate": 9.143882258445184e-07, "loss": 0.64969778, "num_input_tokens_seen": 248450830, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6875, "step": 11512, "time_per_iteration": 2.550964593887329 }, { "auxiliary_loss_clip": 0.0110822, "auxiliary_loss_mlp": 0.01034673, "balance_loss_clip": 1.02173328, "balance_loss_mlp": 1.03634655, "epoch": 0.6921990079663309, "flos": 14757849976320.0, "grad_norm": 1.8717711017805108, "language_loss": 0.82933259, "learning_rate": 9.140611538493666e-07, "loss": 0.85076153, "num_input_tokens_seen": 248468585, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 11513, "time_per_iteration": 2.4744181632995605 }, { "auxiliary_loss_clip": 0.01106272, "auxiliary_loss_mlp": 0.01031701, "balance_loss_clip": 1.02001297, "balance_loss_mlp": 1.03763223, "epoch": 0.692259131218999, "flos": 23841848643840.0, "grad_norm": 1.5435301188207344, "language_loss": 0.78508741, "learning_rate": 9.137341230331233e-07, "loss": 0.80646718, "num_input_tokens_seen": 248490535, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6875, "step": 11514, "time_per_iteration": 2.5218331813812256 }, { "auxiliary_loss_clip": 0.01107974, "auxiliary_loss_mlp": 0.01031235, "balance_loss_clip": 1.01866472, "balance_loss_mlp": 1.03621531, "epoch": 0.6923192544716669, "flos": 19135073157120.0, "grad_norm": 2.386308485337056, "language_loss": 0.75070906, "learning_rate": 9.134071334081907e-07, "loss": 0.7721011, "num_input_tokens_seen": 248508575, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 11515, "time_per_iteration": 2.453014612197876 }, { "auxiliary_loss_clip": 0.01105066, "auxiliary_loss_mlp": 0.01033102, "balance_loss_clip": 1.02143145, "balance_loss_mlp": 1.03734136, "epoch": 0.6923793777243349, "flos": 28074639237120.0, "grad_norm": 3.0275743652579266, "language_loss": 0.53709483, "learning_rate": 9.130801849869694e-07, "loss": 0.55847657, "num_input_tokens_seen": 248527025, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 11516, "time_per_iteration": 2.5193932056427 }, { "auxiliary_loss_clip": 0.01104838, "auxiliary_loss_mlp": 0.01032879, "balance_loss_clip": 1.02023077, "balance_loss_mlp": 1.03760433, "epoch": 0.6924395009770029, "flos": 16581250033920.0, "grad_norm": 1.9372561424486021, "language_loss": 0.73143065, "learning_rate": 9.127532777818557e-07, "loss": 0.75280774, "num_input_tokens_seen": 248544275, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.671875, "step": 11517, "time_per_iteration": 2.478374719619751 }, { "auxiliary_loss_clip": 0.01109668, "auxiliary_loss_mlp": 0.0103572, "balance_loss_clip": 1.0224756, "balance_loss_mlp": 1.03831339, "epoch": 0.6924996242296708, "flos": 16655297921280.0, "grad_norm": 6.201717601225418, "language_loss": 0.76402074, "learning_rate": 9.124264118052465e-07, "loss": 0.78547466, "num_input_tokens_seen": 248561870, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 11518, "time_per_iteration": 2.4864730834960938 }, { "auxiliary_loss_clip": 0.0110928, "auxiliary_loss_mlp": 0.01032933, "balance_loss_clip": 1.01886642, "balance_loss_mlp": 1.03590691, "epoch": 0.6925597474823388, "flos": 34754167532160.0, "grad_norm": 1.4450453402758574, "language_loss": 0.64563251, "learning_rate": 9.120995870695376e-07, "loss": 0.66705465, "num_input_tokens_seen": 248588190, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.734375, "step": 11519, "time_per_iteration": 2.6196677684783936 }, { "auxiliary_loss_clip": 0.01106108, "auxiliary_loss_mlp": 0.01034339, "balance_loss_clip": 1.02180398, "balance_loss_mlp": 1.0356946, "epoch": 0.6926198707350067, "flos": 21871717528320.0, "grad_norm": 1.840867532236424, "language_loss": 0.6230973, "learning_rate": 9.117728035871212e-07, "loss": 0.64450175, "num_input_tokens_seen": 248606460, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 11520, "time_per_iteration": 2.488924264907837 }, { "auxiliary_loss_clip": 0.01113561, "auxiliary_loss_mlp": 0.01038826, "balance_loss_clip": 1.02460384, "balance_loss_mlp": 1.03802836, "epoch": 0.6926799939876748, "flos": 13006271162880.0, "grad_norm": 1.986579996866222, "language_loss": 0.77489436, "learning_rate": 9.114460613703887e-07, "loss": 0.79641819, "num_input_tokens_seen": 248623715, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7578125, "step": 11521, "time_per_iteration": 2.4995930194854736 }, { "auxiliary_loss_clip": 0.01111747, "auxiliary_loss_mlp": 0.01033478, "balance_loss_clip": 1.01970339, "balance_loss_mlp": 1.0375607, "epoch": 0.6927401172403427, "flos": 16761234107520.0, "grad_norm": 1.865792121619875, "language_loss": 0.81992435, "learning_rate": 9.111193604317304e-07, "loss": 0.84137654, "num_input_tokens_seen": 248640575, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 11522, "time_per_iteration": 2.4910128116607666 }, { "auxiliary_loss_clip": 0.01109391, "auxiliary_loss_mlp": 0.01036478, "balance_loss_clip": 1.02381837, "balance_loss_mlp": 1.03915322, "epoch": 0.6928002404930107, "flos": 25705648523520.0, "grad_norm": 1.5105632512450349, "language_loss": 0.7661252, "learning_rate": 9.107927007835361e-07, "loss": 0.78758395, "num_input_tokens_seen": 248663535, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 11523, "time_per_iteration": 2.5781004428863525 }, { "auxiliary_loss_clip": 0.01104723, "auxiliary_loss_mlp": 0.01034879, "balance_loss_clip": 1.02288628, "balance_loss_mlp": 1.03631127, "epoch": 0.6928603637456786, "flos": 18588261438720.0, "grad_norm": 1.810321759036602, "language_loss": 0.68195915, "learning_rate": 9.104660824381915e-07, "loss": 0.70335519, "num_input_tokens_seen": 248681125, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 11524, "time_per_iteration": 2.4728848934173584 }, { "auxiliary_loss_clip": 0.01107723, "auxiliary_loss_mlp": 0.01033875, "balance_loss_clip": 1.0207386, "balance_loss_mlp": 1.03642082, "epoch": 0.6929204869983466, "flos": 22200874784640.0, "grad_norm": 1.9277263153515545, "language_loss": 0.64482754, "learning_rate": 9.101395054080815e-07, "loss": 0.66624355, "num_input_tokens_seen": 248700555, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 11525, "time_per_iteration": 2.5034189224243164 }, { "auxiliary_loss_clip": 0.01110385, "auxiliary_loss_mlp": 0.01042712, "balance_loss_clip": 1.02972984, "balance_loss_mlp": 1.03950608, "epoch": 0.6929806102510145, "flos": 17894754576000.0, "grad_norm": 2.4610617400096535, "language_loss": 0.70538712, "learning_rate": 9.098129697055907e-07, "loss": 0.7269181, "num_input_tokens_seen": 248716095, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 11526, "time_per_iteration": 2.469182014465332 }, { "auxiliary_loss_clip": 0.01104028, "auxiliary_loss_mlp": 0.01032657, "balance_loss_clip": 1.02024722, "balance_loss_mlp": 1.0354178, "epoch": 0.6930407335036826, "flos": 19755178577280.0, "grad_norm": 1.8975138021226707, "language_loss": 0.75896698, "learning_rate": 9.094864753431022e-07, "loss": 0.78033376, "num_input_tokens_seen": 248735330, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 11527, "time_per_iteration": 2.580960750579834 }, { "auxiliary_loss_clip": 0.01105178, "auxiliary_loss_mlp": 0.01030968, "balance_loss_clip": 1.019297, "balance_loss_mlp": 1.03623867, "epoch": 0.6931008567563505, "flos": 21544248211200.0, "grad_norm": 1.5550061538434163, "language_loss": 0.79152787, "learning_rate": 9.091600223329952e-07, "loss": 0.81288934, "num_input_tokens_seen": 248754530, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6875, "step": 11528, "time_per_iteration": 2.505136489868164 }, { "auxiliary_loss_clip": 0.01102968, "auxiliary_loss_mlp": 0.01030457, "balance_loss_clip": 1.01801801, "balance_loss_mlp": 1.03544724, "epoch": 0.6931609800090185, "flos": 26250018117120.0, "grad_norm": 1.5095500761962333, "language_loss": 0.76137769, "learning_rate": 9.088336106876491e-07, "loss": 0.78271186, "num_input_tokens_seen": 248775825, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.671875, "step": 11529, "time_per_iteration": 2.5665831565856934 }, { "auxiliary_loss_clip": 0.01106185, "auxiliary_loss_mlp": 0.01034942, "balance_loss_clip": 1.02240682, "balance_loss_mlp": 1.03749728, "epoch": 0.6932211032616865, "flos": 32343376366080.0, "grad_norm": 1.70186739798677, "language_loss": 0.72682798, "learning_rate": 9.085072404194436e-07, "loss": 0.74823928, "num_input_tokens_seen": 248796180, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 11530, "time_per_iteration": 2.6095962524414062 }, { "auxiliary_loss_clip": 0.01113867, "auxiliary_loss_mlp": 0.01033753, "balance_loss_clip": 1.01904249, "balance_loss_mlp": 1.039217, "epoch": 0.6932812265143544, "flos": 22049079909120.0, "grad_norm": 2.8067538010664803, "language_loss": 0.78323889, "learning_rate": 9.081809115407513e-07, "loss": 0.80471504, "num_input_tokens_seen": 248814735, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.74609375, "step": 11531, "time_per_iteration": 2.5165255069732666 }, { "auxiliary_loss_clip": 0.01104527, "auxiliary_loss_mlp": 0.01033024, "balance_loss_clip": 1.02143693, "balance_loss_mlp": 1.03637886, "epoch": 0.6933413497670224, "flos": 26256626219520.0, "grad_norm": 1.4087430076220195, "language_loss": 0.69441783, "learning_rate": 9.078546240639484e-07, "loss": 0.71579337, "num_input_tokens_seen": 248839140, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6796875, "step": 11532, "time_per_iteration": 2.5643656253814697 }, { "auxiliary_loss_clip": 0.01110586, "auxiliary_loss_mlp": 0.01030248, "balance_loss_clip": 1.01695609, "balance_loss_mlp": 1.03863323, "epoch": 0.6934014730196904, "flos": 19573003774080.0, "grad_norm": 1.3186054554783475, "language_loss": 0.66872001, "learning_rate": 9.075283780014082e-07, "loss": 0.69012839, "num_input_tokens_seen": 248858300, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 11533, "time_per_iteration": 2.477243185043335 }, { "auxiliary_loss_clip": 0.01109549, "auxiliary_loss_mlp": 0.01035766, "balance_loss_clip": 1.0226115, "balance_loss_mlp": 1.03748918, "epoch": 0.6934615962723584, "flos": 22119249127680.0, "grad_norm": 2.521177043703024, "language_loss": 0.58634931, "learning_rate": 9.072021733655007e-07, "loss": 0.60780251, "num_input_tokens_seen": 248876310, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 11534, "time_per_iteration": 2.450310230255127 }, { "auxiliary_loss_clip": 0.0110476, "auxiliary_loss_mlp": 0.01031823, "balance_loss_clip": 1.01841211, "balance_loss_mlp": 1.03494644, "epoch": 0.6935217195250263, "flos": 21360816432000.0, "grad_norm": 2.063121324740967, "language_loss": 0.71274447, "learning_rate": 9.068760101685971e-07, "loss": 0.73411024, "num_input_tokens_seen": 248895650, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.69921875, "step": 11535, "time_per_iteration": 2.5601589679718018 }, { "auxiliary_loss_clip": 0.01032582, "auxiliary_loss_mlp": 0.01000515, "balance_loss_clip": 0.99933499, "balance_loss_mlp": 1.00957251, "epoch": 0.6935818427776943, "flos": 64063813115520.0, "grad_norm": 0.7192064993406301, "language_loss": 0.5906772, "learning_rate": 9.065498884230638e-07, "loss": 0.61100817, "num_input_tokens_seen": 248963920, "router_z_loss_clip": 0.01177979, "router_z_loss_mlp": 0.23046875, "step": 11536, "time_per_iteration": 3.2346200942993164 }, { "auxiliary_loss_clip": 0.01113179, "auxiliary_loss_mlp": 0.01033237, "balance_loss_clip": 1.01976681, "balance_loss_mlp": 1.03894031, "epoch": 0.6936419660303622, "flos": 20302564913280.0, "grad_norm": 1.8981940386079004, "language_loss": 0.72956192, "learning_rate": 9.062238081412692e-07, "loss": 0.75102609, "num_input_tokens_seen": 248983380, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 11537, "time_per_iteration": 2.4809365272521973 }, { "auxiliary_loss_clip": 0.01032682, "auxiliary_loss_mlp": 0.01001795, "balance_loss_clip": 1.0005554, "balance_loss_mlp": 1.00974178, "epoch": 0.6937020892830302, "flos": 67182581347200.0, "grad_norm": 0.7501054316233988, "language_loss": 0.55574936, "learning_rate": 9.058977693355767e-07, "loss": 0.57609409, "num_input_tokens_seen": 249044680, "router_z_loss_clip": 0.01239014, "router_z_loss_mlp": 0.22949219, "step": 11538, "time_per_iteration": 3.115967273712158 }, { "auxiliary_loss_clip": 0.01102815, "auxiliary_loss_mlp": 0.01034338, "balance_loss_clip": 1.02248275, "balance_loss_mlp": 1.03651297, "epoch": 0.6937622125356981, "flos": 23878190229120.0, "grad_norm": 1.6649779098939845, "language_loss": 0.77754509, "learning_rate": 9.055717720183505e-07, "loss": 0.79891658, "num_input_tokens_seen": 249061060, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6640625, "step": 11539, "time_per_iteration": 2.555866241455078 }, { "auxiliary_loss_clip": 0.01106996, "auxiliary_loss_mlp": 0.01027936, "balance_loss_clip": 1.0162003, "balance_loss_mlp": 1.0374918, "epoch": 0.6938223357883662, "flos": 28730619365760.0, "grad_norm": 2.626652472671407, "language_loss": 0.64306986, "learning_rate": 9.05245816201953e-07, "loss": 0.66441917, "num_input_tokens_seen": 249081430, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6953125, "step": 11540, "time_per_iteration": 2.6665117740631104 }, { "auxiliary_loss_clip": 0.01106862, "auxiliary_loss_mlp": 0.01032653, "balance_loss_clip": 1.02066612, "balance_loss_mlp": 1.0380106, "epoch": 0.6938824590410341, "flos": 28655027193600.0, "grad_norm": 1.428007889435206, "language_loss": 0.86827612, "learning_rate": 9.049199018987437e-07, "loss": 0.88967133, "num_input_tokens_seen": 249103020, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 11541, "time_per_iteration": 2.599390983581543 }, { "auxiliary_loss_clip": 0.01108469, "auxiliary_loss_mlp": 0.01034326, "balance_loss_clip": 1.02192843, "balance_loss_mlp": 1.03837204, "epoch": 0.6939425822937021, "flos": 18983062800000.0, "grad_norm": 2.0265558760632407, "language_loss": 0.84033191, "learning_rate": 9.04594029121081e-07, "loss": 0.86175978, "num_input_tokens_seen": 249120810, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 11542, "time_per_iteration": 2.4569170475006104 }, { "auxiliary_loss_clip": 0.01109091, "auxiliary_loss_mlp": 0.01034863, "balance_loss_clip": 1.02129138, "balance_loss_mlp": 1.0376761, "epoch": 0.6940027055463701, "flos": 23075838178560.0, "grad_norm": 1.791332403728557, "language_loss": 0.75313401, "learning_rate": 9.04268197881323e-07, "loss": 0.77457356, "num_input_tokens_seen": 249138050, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71484375, "step": 11543, "time_per_iteration": 2.5353658199310303 }, { "auxiliary_loss_clip": 0.01105407, "auxiliary_loss_mlp": 0.01035859, "balance_loss_clip": 1.02349138, "balance_loss_mlp": 1.03631997, "epoch": 0.694062828799038, "flos": 18186564666240.0, "grad_norm": 1.8760114309938896, "language_loss": 0.7648685, "learning_rate": 9.039424081918241e-07, "loss": 0.78628111, "num_input_tokens_seen": 249155570, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 11544, "time_per_iteration": 2.511396884918213 }, { "auxiliary_loss_clip": 0.01110675, "auxiliary_loss_mlp": 0.01036108, "balance_loss_clip": 1.02388334, "balance_loss_mlp": 1.03926635, "epoch": 0.694122952051706, "flos": 17821532701440.0, "grad_norm": 1.9546481573011296, "language_loss": 0.71387756, "learning_rate": 9.036166600649388e-07, "loss": 0.73534542, "num_input_tokens_seen": 249172960, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71484375, "step": 11545, "time_per_iteration": 2.5245540142059326 }, { "auxiliary_loss_clip": 0.01104478, "auxiliary_loss_mlp": 0.01030682, "balance_loss_clip": 1.01914263, "balance_loss_mlp": 1.03746367, "epoch": 0.694183075304374, "flos": 21215306436480.0, "grad_norm": 1.83315071062533, "language_loss": 0.79383886, "learning_rate": 9.0329095351302e-07, "loss": 0.81519043, "num_input_tokens_seen": 249192450, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 11546, "time_per_iteration": 3.905905246734619 }, { "auxiliary_loss_clip": 0.01108388, "auxiliary_loss_mlp": 0.01031543, "balance_loss_clip": 1.01943099, "balance_loss_mlp": 1.03815198, "epoch": 0.694243198557042, "flos": 24060508686720.0, "grad_norm": 1.5830628637635995, "language_loss": 0.78673846, "learning_rate": 9.029652885484194e-07, "loss": 0.80813777, "num_input_tokens_seen": 249214320, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 11547, "time_per_iteration": 3.9810709953308105 }, { "auxiliary_loss_clip": 0.01107091, "auxiliary_loss_mlp": 0.01035554, "balance_loss_clip": 1.02258396, "balance_loss_mlp": 1.03807068, "epoch": 0.6943033218097099, "flos": 21141869080320.0, "grad_norm": 3.147230822743412, "language_loss": 0.80473363, "learning_rate": 9.026396651834834e-07, "loss": 0.82616007, "num_input_tokens_seen": 249230925, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69140625, "step": 11548, "time_per_iteration": 2.5192525386810303 }, { "auxiliary_loss_clip": 0.01031971, "auxiliary_loss_mlp": 0.0100229, "balance_loss_clip": 1.00102675, "balance_loss_mlp": 1.00900149, "epoch": 0.6943634450623779, "flos": 57812015975040.0, "grad_norm": 0.6911462431240716, "language_loss": 0.53724062, "learning_rate": 9.023140834305613e-07, "loss": 0.55758321, "num_input_tokens_seen": 249293975, "router_z_loss_clip": 0.01263428, "router_z_loss_mlp": 0.23046875, "step": 11549, "time_per_iteration": 4.631623029708862 }, { "auxiliary_loss_clip": 0.01106708, "auxiliary_loss_mlp": 0.01030141, "balance_loss_clip": 1.01755297, "balance_loss_mlp": 1.03697085, "epoch": 0.6944235683150458, "flos": 30590684231040.0, "grad_norm": 2.0223806849076342, "language_loss": 0.73624182, "learning_rate": 9.01988543302e-07, "loss": 0.75761032, "num_input_tokens_seen": 249315285, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 11550, "time_per_iteration": 3.984189033508301 }, { "auxiliary_loss_clip": 0.01111113, "auxiliary_loss_mlp": 0.01038173, "balance_loss_clip": 1.02528644, "balance_loss_mlp": 1.03917456, "epoch": 0.6944836915677138, "flos": 19719447523200.0, "grad_norm": 4.007534541839477, "language_loss": 0.74450302, "learning_rate": 9.016630448101425e-07, "loss": 0.76599586, "num_input_tokens_seen": 249333505, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 11551, "time_per_iteration": 2.5137908458709717 }, { "auxiliary_loss_clip": 0.01109631, "auxiliary_loss_mlp": 0.01036428, "balance_loss_clip": 1.02350605, "balance_loss_mlp": 1.03872323, "epoch": 0.6945438148203817, "flos": 24863579009280.0, "grad_norm": 1.65186218000735, "language_loss": 0.84441406, "learning_rate": 9.01337587967333e-07, "loss": 0.86587465, "num_input_tokens_seen": 249354180, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 11552, "time_per_iteration": 2.523967981338501 }, { "auxiliary_loss_clip": 0.01107369, "auxiliary_loss_mlp": 0.01036743, "balance_loss_clip": 1.0242914, "balance_loss_mlp": 1.03754365, "epoch": 0.6946039380730498, "flos": 33326646243840.0, "grad_norm": 1.5975731425078683, "language_loss": 0.67424834, "learning_rate": 9.010121727859117e-07, "loss": 0.69568944, "num_input_tokens_seen": 249377035, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 11553, "time_per_iteration": 2.5670220851898193 }, { "auxiliary_loss_clip": 0.01114174, "auxiliary_loss_mlp": 0.01031999, "balance_loss_clip": 1.01890421, "balance_loss_mlp": 1.03974533, "epoch": 0.6946640613257177, "flos": 20850956830080.0, "grad_norm": 1.6781735209042912, "language_loss": 0.7942006, "learning_rate": 9.006867992782195e-07, "loss": 0.81566232, "num_input_tokens_seen": 249396155, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.74609375, "step": 11554, "time_per_iteration": 2.459717035293579 }, { "auxiliary_loss_clip": 0.01108577, "auxiliary_loss_mlp": 0.01033311, "balance_loss_clip": 1.02059782, "balance_loss_mlp": 1.03710306, "epoch": 0.6947241845783857, "flos": 19354846521600.0, "grad_norm": 1.7425306915610181, "language_loss": 0.72956342, "learning_rate": 9.003614674565934e-07, "loss": 0.75098228, "num_input_tokens_seen": 249414555, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 11555, "time_per_iteration": 2.451042890548706 }, { "auxiliary_loss_clip": 0.01106625, "auxiliary_loss_mlp": 0.01032406, "balance_loss_clip": 1.02036619, "balance_loss_mlp": 1.03617334, "epoch": 0.6947843078310536, "flos": 27120240915840.0, "grad_norm": 1.7792468390630454, "language_loss": 0.78371716, "learning_rate": 9.000361773333705e-07, "loss": 0.80510747, "num_input_tokens_seen": 249433570, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.703125, "step": 11556, "time_per_iteration": 2.4956870079040527 }, { "auxiliary_loss_clip": 0.01107015, "auxiliary_loss_mlp": 0.01036312, "balance_loss_clip": 1.02358627, "balance_loss_mlp": 1.03614187, "epoch": 0.6948444310837216, "flos": 28585109370240.0, "grad_norm": 2.6636912866428064, "language_loss": 0.60487032, "learning_rate": 8.997109289208869e-07, "loss": 0.62630361, "num_input_tokens_seen": 249453735, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 11557, "time_per_iteration": 2.521998882293701 }, { "auxiliary_loss_clip": 0.01105606, "auxiliary_loss_mlp": 0.01037486, "balance_loss_clip": 1.02556539, "balance_loss_mlp": 1.03756475, "epoch": 0.6949045543363896, "flos": 15669262696320.0, "grad_norm": 1.7258981586996966, "language_loss": 0.85593045, "learning_rate": 8.993857222314752e-07, "loss": 0.87736142, "num_input_tokens_seen": 249470805, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 11558, "time_per_iteration": 2.4287500381469727 }, { "auxiliary_loss_clip": 0.01109766, "auxiliary_loss_mlp": 0.01034011, "balance_loss_clip": 1.0203315, "balance_loss_mlp": 1.0385592, "epoch": 0.6949646775890576, "flos": 23259413612160.0, "grad_norm": 1.4984746541358533, "language_loss": 0.70446193, "learning_rate": 8.990605572774664e-07, "loss": 0.7258997, "num_input_tokens_seen": 249491150, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7109375, "step": 11559, "time_per_iteration": 2.5001654624938965 }, { "auxiliary_loss_clip": 0.01106584, "auxiliary_loss_mlp": 0.01031344, "balance_loss_clip": 1.01942873, "balance_loss_mlp": 1.0373975, "epoch": 0.6950248008417256, "flos": 22382546797440.0, "grad_norm": 2.429955946111377, "language_loss": 0.79135472, "learning_rate": 8.987354340711921e-07, "loss": 0.81273401, "num_input_tokens_seen": 249511560, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 11560, "time_per_iteration": 2.4635932445526123 }, { "auxiliary_loss_clip": 0.01105696, "auxiliary_loss_mlp": 0.01033782, "balance_loss_clip": 1.02193856, "balance_loss_mlp": 1.03704143, "epoch": 0.6950849240943935, "flos": 23477355383040.0, "grad_norm": 1.5398141523117426, "language_loss": 0.76719826, "learning_rate": 8.9841035262498e-07, "loss": 0.78859305, "num_input_tokens_seen": 249531910, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 11561, "time_per_iteration": 2.485562801361084 }, { "auxiliary_loss_clip": 0.01105285, "auxiliary_loss_mlp": 0.01032924, "balance_loss_clip": 1.01941776, "balance_loss_mlp": 1.03621864, "epoch": 0.6951450473470615, "flos": 17420554200960.0, "grad_norm": 1.8485912182450623, "language_loss": 0.78662622, "learning_rate": 8.980853129511577e-07, "loss": 0.80800831, "num_input_tokens_seen": 249550300, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.69140625, "step": 11562, "time_per_iteration": 2.4363996982574463 }, { "auxiliary_loss_clip": 0.01107389, "auxiliary_loss_mlp": 0.01032412, "balance_loss_clip": 1.01966214, "balance_loss_mlp": 1.03665709, "epoch": 0.6952051705997294, "flos": 20485745297280.0, "grad_norm": 2.352556296938915, "language_loss": 0.69790959, "learning_rate": 8.977603150620515e-07, "loss": 0.7193076, "num_input_tokens_seen": 249567740, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 11563, "time_per_iteration": 2.4951791763305664 }, { "auxiliary_loss_clip": 0.0110252, "auxiliary_loss_mlp": 0.01026103, "balance_loss_clip": 1.01449227, "balance_loss_mlp": 1.03608704, "epoch": 0.6952652938523974, "flos": 13989541040640.0, "grad_norm": 2.0809352772178045, "language_loss": 0.73545969, "learning_rate": 8.974353589699846e-07, "loss": 0.75674587, "num_input_tokens_seen": 249582700, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6640625, "step": 11564, "time_per_iteration": 2.426995277404785 }, { "auxiliary_loss_clip": 0.01120587, "auxiliary_loss_mlp": 0.01038828, "balance_loss_clip": 1.02305686, "balance_loss_mlp": 1.04113078, "epoch": 0.6953254171050653, "flos": 30953956429440.0, "grad_norm": 2.0282882571615253, "language_loss": 0.72056019, "learning_rate": 8.971104446872785e-07, "loss": 0.74215436, "num_input_tokens_seen": 249602920, "router_z_loss_clip": 0.15722656, "router_z_loss_mlp": 0.79296875, "step": 11565, "time_per_iteration": 2.593733310699463 }, { "auxiliary_loss_clip": 0.01031351, "auxiliary_loss_mlp": 0.01004083, "balance_loss_clip": 1.0028795, "balance_loss_mlp": 1.00831079, "epoch": 0.6953855403577334, "flos": 61670257499520.0, "grad_norm": 0.9303788171394213, "language_loss": 0.58456099, "learning_rate": 8.96785572226255e-07, "loss": 0.60491526, "num_input_tokens_seen": 249660400, "router_z_loss_clip": 0.01202393, "router_z_loss_mlp": 0.23046875, "step": 11566, "time_per_iteration": 2.9493050575256348 }, { "auxiliary_loss_clip": 0.01108761, "auxiliary_loss_mlp": 0.01029446, "balance_loss_clip": 1.01619589, "balance_loss_mlp": 1.03615022, "epoch": 0.6954456636104013, "flos": 23039029716480.0, "grad_norm": 1.954485845708565, "language_loss": 0.74349773, "learning_rate": 8.964607415992338e-07, "loss": 0.76487982, "num_input_tokens_seen": 249679335, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 11567, "time_per_iteration": 2.499102830886841 }, { "auxiliary_loss_clip": 0.0110294, "auxiliary_loss_mlp": 0.01033012, "balance_loss_clip": 1.02040529, "balance_loss_mlp": 1.03515673, "epoch": 0.6955057868630693, "flos": 23918518224000.0, "grad_norm": 1.2731831090041141, "language_loss": 0.76890737, "learning_rate": 8.961359528185313e-07, "loss": 0.79026687, "num_input_tokens_seen": 249701805, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.67578125, "step": 11568, "time_per_iteration": 2.538996458053589 }, { "auxiliary_loss_clip": 0.01106784, "auxiliary_loss_mlp": 0.01032795, "balance_loss_clip": 1.02095723, "balance_loss_mlp": 1.03819346, "epoch": 0.6955659101157372, "flos": 22594634651520.0, "grad_norm": 2.08899342135793, "language_loss": 0.72729868, "learning_rate": 8.958112058964649e-07, "loss": 0.74869448, "num_input_tokens_seen": 249720550, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 11569, "time_per_iteration": 2.4985954761505127 }, { "auxiliary_loss_clip": 0.01108973, "auxiliary_loss_mlp": 0.01029089, "balance_loss_clip": 1.01656604, "balance_loss_mlp": 1.03828943, "epoch": 0.6956260333684052, "flos": 24572523104640.0, "grad_norm": 1.6694690769822338, "language_loss": 0.77064538, "learning_rate": 8.954865008453471e-07, "loss": 0.79202604, "num_input_tokens_seen": 249740325, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 11570, "time_per_iteration": 2.4960439205169678 }, { "auxiliary_loss_clip": 0.01106522, "auxiliary_loss_mlp": 0.01038553, "balance_loss_clip": 1.02527308, "balance_loss_mlp": 1.03543901, "epoch": 0.6956861566210732, "flos": 25846058787840.0, "grad_norm": 2.0570035089412113, "language_loss": 0.74207413, "learning_rate": 8.95161837677493e-07, "loss": 0.76352489, "num_input_tokens_seen": 249760570, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 11571, "time_per_iteration": 2.5189380645751953 }, { "auxiliary_loss_clip": 0.01101419, "auxiliary_loss_mlp": 0.01029025, "balance_loss_clip": 1.01706243, "balance_loss_mlp": 1.03565288, "epoch": 0.6957462798737412, "flos": 15301393557120.0, "grad_norm": 2.032868884282255, "language_loss": 0.74957782, "learning_rate": 8.948372164052118e-07, "loss": 0.77088219, "num_input_tokens_seen": 249778290, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.66015625, "step": 11572, "time_per_iteration": 2.433657646179199 }, { "auxiliary_loss_clip": 0.01103768, "auxiliary_loss_mlp": 0.01027712, "balance_loss_clip": 1.01542747, "balance_loss_mlp": 1.03353488, "epoch": 0.6958064031264092, "flos": 36246830135040.0, "grad_norm": 1.8777807535844036, "language_loss": 0.6978358, "learning_rate": 8.94512637040814e-07, "loss": 0.7191506, "num_input_tokens_seen": 249800925, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 11573, "time_per_iteration": 2.5794599056243896 }, { "auxiliary_loss_clip": 0.01110507, "auxiliary_loss_mlp": 0.01035972, "balance_loss_clip": 1.02226293, "balance_loss_mlp": 1.03799534, "epoch": 0.6958665263790771, "flos": 19208725994880.0, "grad_norm": 1.81248449380109, "language_loss": 0.74881709, "learning_rate": 8.941880995966095e-07, "loss": 0.77028191, "num_input_tokens_seen": 249820500, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 11574, "time_per_iteration": 2.446157693862915 }, { "auxiliary_loss_clip": 0.0110962, "auxiliary_loss_mlp": 0.01031882, "balance_loss_clip": 1.0195322, "balance_loss_mlp": 1.03771794, "epoch": 0.6959266496317451, "flos": 21795838047360.0, "grad_norm": 2.219780673715803, "language_loss": 0.74656796, "learning_rate": 8.938636040849014e-07, "loss": 0.76798296, "num_input_tokens_seen": 249839845, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71875, "step": 11575, "time_per_iteration": 2.4947290420532227 }, { "auxiliary_loss_clip": 0.01106274, "auxiliary_loss_mlp": 0.01030482, "balance_loss_clip": 1.01707125, "balance_loss_mlp": 1.03550887, "epoch": 0.695986772884413, "flos": 20558248899840.0, "grad_norm": 1.935435626992992, "language_loss": 0.78910816, "learning_rate": 8.935391505179966e-07, "loss": 0.81047577, "num_input_tokens_seen": 249857400, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.70703125, "step": 11576, "time_per_iteration": 2.4521548748016357 }, { "auxiliary_loss_clip": 0.011081, "auxiliary_loss_mlp": 0.01028202, "balance_loss_clip": 1.01642382, "balance_loss_mlp": 1.03556335, "epoch": 0.696046896137081, "flos": 14936217937920.0, "grad_norm": 2.6864280519110424, "language_loss": 0.56394732, "learning_rate": 8.932147389081985e-07, "loss": 0.58531034, "num_input_tokens_seen": 249871645, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.7265625, "step": 11577, "time_per_iteration": 2.4219250679016113 }, { "auxiliary_loss_clip": 0.01100078, "auxiliary_loss_mlp": 0.01023323, "balance_loss_clip": 1.01276064, "balance_loss_mlp": 1.03450298, "epoch": 0.696107019389749, "flos": 30740216549760.0, "grad_norm": 1.3484244745771883, "language_loss": 0.76770186, "learning_rate": 8.928903692678081e-07, "loss": 0.78893584, "num_input_tokens_seen": 249894215, "router_z_loss_clip": 0.10546875, "router_z_loss_mlp": 0.65625, "step": 11578, "time_per_iteration": 2.5817482471466064 }, { "auxiliary_loss_clip": 0.01106571, "auxiliary_loss_mlp": 0.01031936, "balance_loss_clip": 1.01943696, "balance_loss_mlp": 1.03695083, "epoch": 0.696167142642417, "flos": 20776729374720.0, "grad_norm": 2.989838641485972, "language_loss": 0.79728603, "learning_rate": 8.925660416091254e-07, "loss": 0.81867111, "num_input_tokens_seen": 249912850, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 11579, "time_per_iteration": 2.4877140522003174 }, { "auxiliary_loss_clip": 0.0110076, "auxiliary_loss_mlp": 0.01026252, "balance_loss_clip": 1.01349103, "balance_loss_mlp": 1.03339708, "epoch": 0.6962272658950849, "flos": 22565152563840.0, "grad_norm": 1.9354874654961298, "language_loss": 0.72807837, "learning_rate": 8.922417559444502e-07, "loss": 0.74934852, "num_input_tokens_seen": 249932650, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.671875, "step": 11580, "time_per_iteration": 2.4572839736938477 }, { "auxiliary_loss_clip": 0.01108809, "auxiliary_loss_mlp": 0.01029627, "balance_loss_clip": 1.01638913, "balance_loss_mlp": 1.03735995, "epoch": 0.6962873891477529, "flos": 22200156512640.0, "grad_norm": 2.141635183700316, "language_loss": 0.65437579, "learning_rate": 8.919175122860787e-07, "loss": 0.67576015, "num_input_tokens_seen": 249951205, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 11581, "time_per_iteration": 2.4733734130859375 }, { "auxiliary_loss_clip": 0.01106748, "auxiliary_loss_mlp": 0.0102875, "balance_loss_clip": 1.01672196, "balance_loss_mlp": 1.0364604, "epoch": 0.6963475124004208, "flos": 12489695717760.0, "grad_norm": 3.5121591272746575, "language_loss": 0.7630614, "learning_rate": 8.915933106463056e-07, "loss": 0.78441644, "num_input_tokens_seen": 249967045, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 11582, "time_per_iteration": 2.411475896835327 }, { "auxiliary_loss_clip": 0.01103894, "auxiliary_loss_mlp": 0.01029714, "balance_loss_clip": 1.01849055, "balance_loss_mlp": 1.03535962, "epoch": 0.6964076356530888, "flos": 17165085696000.0, "grad_norm": 2.0250942033855353, "language_loss": 0.70077789, "learning_rate": 8.91269151037425e-07, "loss": 0.72211397, "num_input_tokens_seen": 249984565, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6875, "step": 11583, "time_per_iteration": 2.429786443710327 }, { "auxiliary_loss_clip": 0.01107506, "auxiliary_loss_mlp": 0.01032869, "balance_loss_clip": 1.01987553, "balance_loss_mlp": 1.03773272, "epoch": 0.6964677589057569, "flos": 19937317466880.0, "grad_norm": 1.8527887780768872, "language_loss": 0.82147127, "learning_rate": 8.909450334717301e-07, "loss": 0.84287506, "num_input_tokens_seen": 250004235, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 11584, "time_per_iteration": 2.4968011379241943 }, { "auxiliary_loss_clip": 0.01109012, "auxiliary_loss_mlp": 0.01033417, "balance_loss_clip": 1.01999986, "balance_loss_mlp": 1.03777301, "epoch": 0.6965278821584248, "flos": 22784064001920.0, "grad_norm": 2.2662167862763143, "language_loss": 0.79488957, "learning_rate": 8.906209579615107e-07, "loss": 0.81631392, "num_input_tokens_seen": 250017645, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 11585, "time_per_iteration": 2.5188000202178955 }, { "auxiliary_loss_clip": 0.01100887, "auxiliary_loss_mlp": 0.01031317, "balance_loss_clip": 1.01996851, "balance_loss_mlp": 1.03471398, "epoch": 0.6965880054110928, "flos": 20047563285120.0, "grad_norm": 1.6399589142492563, "language_loss": 0.77620828, "learning_rate": 8.90296924519055e-07, "loss": 0.79753035, "num_input_tokens_seen": 250037640, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6640625, "step": 11586, "time_per_iteration": 2.4649851322174072 }, { "auxiliary_loss_clip": 0.01098419, "auxiliary_loss_mlp": 0.01027973, "balance_loss_clip": 1.01652265, "balance_loss_mlp": 1.03402805, "epoch": 0.6966481286637607, "flos": 21908238681600.0, "grad_norm": 2.374695927988077, "language_loss": 0.78572869, "learning_rate": 8.899729331566519e-07, "loss": 0.80699259, "num_input_tokens_seen": 250056490, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.640625, "step": 11587, "time_per_iteration": 3.8454766273498535 }, { "auxiliary_loss_clip": 0.01102131, "auxiliary_loss_mlp": 0.01029989, "balance_loss_clip": 1.01774597, "balance_loss_mlp": 1.0360229, "epoch": 0.6967082519164287, "flos": 15633172506240.0, "grad_norm": 3.2467997744780996, "language_loss": 0.72333866, "learning_rate": 8.896489838865857e-07, "loss": 0.7446599, "num_input_tokens_seen": 250074285, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.66015625, "step": 11588, "time_per_iteration": 3.9260740280151367 }, { "auxiliary_loss_clip": 0.01103348, "auxiliary_loss_mlp": 0.01026145, "balance_loss_clip": 1.01483822, "balance_loss_mlp": 1.03511143, "epoch": 0.6967683751690966, "flos": 24024598064640.0, "grad_norm": 3.086389645556106, "language_loss": 0.75015152, "learning_rate": 8.893250767211413e-07, "loss": 0.77144647, "num_input_tokens_seen": 250093350, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6796875, "step": 11589, "time_per_iteration": 2.490678548812866 }, { "auxiliary_loss_clip": 0.01105563, "auxiliary_loss_mlp": 0.01029227, "balance_loss_clip": 1.01743102, "balance_loss_mlp": 1.03659725, "epoch": 0.6968284984217646, "flos": 31024700265600.0, "grad_norm": 1.9932207120205732, "language_loss": 0.63739419, "learning_rate": 8.890012116726012e-07, "loss": 0.65874213, "num_input_tokens_seen": 250114170, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 11590, "time_per_iteration": 3.970590353012085 }, { "auxiliary_loss_clip": 0.01030936, "auxiliary_loss_mlp": 0.01001421, "balance_loss_clip": 1.00029469, "balance_loss_mlp": 1.00789666, "epoch": 0.6968886216744326, "flos": 67622990002560.0, "grad_norm": 0.7578302816263013, "language_loss": 0.61241734, "learning_rate": 8.88677388753248e-07, "loss": 0.63274091, "num_input_tokens_seen": 250178250, "router_z_loss_clip": 0.0112915, "router_z_loss_mlp": 0.23046875, "step": 11591, "time_per_iteration": 4.598282337188721 }, { "auxiliary_loss_clip": 0.01108028, "auxiliary_loss_mlp": 0.01031857, "balance_loss_clip": 1.01898265, "balance_loss_mlp": 1.03857064, "epoch": 0.6969487449271006, "flos": 24863686750080.0, "grad_norm": 1.916349607072282, "language_loss": 0.69491619, "learning_rate": 8.883536079753582e-07, "loss": 0.71631503, "num_input_tokens_seen": 250198420, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 11592, "time_per_iteration": 2.4940149784088135 }, { "auxiliary_loss_clip": 0.01105649, "auxiliary_loss_mlp": 0.01024769, "balance_loss_clip": 1.01361656, "balance_loss_mlp": 1.03791761, "epoch": 0.6970088681797685, "flos": 28767858791040.0, "grad_norm": 1.683857475169259, "language_loss": 0.62303495, "learning_rate": 8.880298693512109e-07, "loss": 0.64433914, "num_input_tokens_seen": 250220650, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.67578125, "step": 11593, "time_per_iteration": 2.511277675628662 }, { "auxiliary_loss_clip": 0.01101056, "auxiliary_loss_mlp": 0.01026206, "balance_loss_clip": 1.01507831, "balance_loss_mlp": 1.03569424, "epoch": 0.6970689914324365, "flos": 27308556944640.0, "grad_norm": 1.9769173744561106, "language_loss": 0.54652143, "learning_rate": 8.877061728930832e-07, "loss": 0.56779402, "num_input_tokens_seen": 250241750, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.65625, "step": 11594, "time_per_iteration": 2.537341833114624 }, { "auxiliary_loss_clip": 0.01104126, "auxiliary_loss_mlp": 0.0102831, "balance_loss_clip": 1.0164907, "balance_loss_mlp": 1.03577256, "epoch": 0.6971291146851044, "flos": 19136258305920.0, "grad_norm": 1.827017869004038, "language_loss": 0.76876462, "learning_rate": 8.87382518613248e-07, "loss": 0.79008889, "num_input_tokens_seen": 250259445, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 11595, "time_per_iteration": 2.452754020690918 }, { "auxiliary_loss_clip": 0.01108335, "auxiliary_loss_mlp": 0.01028324, "balance_loss_clip": 1.01561081, "balance_loss_mlp": 1.03727508, "epoch": 0.6971892379377724, "flos": 14610508387200.0, "grad_norm": 2.595376296063146, "language_loss": 0.71886688, "learning_rate": 8.870589065239793e-07, "loss": 0.74023342, "num_input_tokens_seen": 250275640, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 11596, "time_per_iteration": 2.4375991821289062 }, { "auxiliary_loss_clip": 0.01108472, "auxiliary_loss_mlp": 0.01034083, "balance_loss_clip": 1.02203703, "balance_loss_mlp": 1.03944623, "epoch": 0.6972493611904405, "flos": 22307457415680.0, "grad_norm": 1.6102174234590882, "language_loss": 0.75945091, "learning_rate": 8.867353366375492e-07, "loss": 0.78087646, "num_input_tokens_seen": 250296435, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 11597, "time_per_iteration": 2.516221046447754 }, { "auxiliary_loss_clip": 0.01102728, "auxiliary_loss_mlp": 0.01030682, "balance_loss_clip": 1.01870191, "balance_loss_mlp": 1.03501022, "epoch": 0.6973094844431084, "flos": 17420374632960.0, "grad_norm": 1.9620207950802067, "language_loss": 0.74794024, "learning_rate": 8.864118089662267e-07, "loss": 0.76927435, "num_input_tokens_seen": 250314035, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 11598, "time_per_iteration": 2.43990159034729 }, { "auxiliary_loss_clip": 0.01108278, "auxiliary_loss_mlp": 0.01032601, "balance_loss_clip": 1.01975596, "balance_loss_mlp": 1.03724766, "epoch": 0.6973696076957764, "flos": 27235370983680.0, "grad_norm": 3.3770792450124127, "language_loss": 0.89825821, "learning_rate": 8.860883235222791e-07, "loss": 0.91966701, "num_input_tokens_seen": 250332995, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 11599, "time_per_iteration": 2.527092695236206 }, { "auxiliary_loss_clip": 0.01112273, "auxiliary_loss_mlp": 0.01033989, "balance_loss_clip": 1.02068543, "balance_loss_mlp": 1.03924727, "epoch": 0.6974297309484443, "flos": 22018089450240.0, "grad_norm": 2.033037723905669, "language_loss": 0.70171398, "learning_rate": 8.85764880317974e-07, "loss": 0.7231766, "num_input_tokens_seen": 250352120, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 11600, "time_per_iteration": 2.4615187644958496 }, { "auxiliary_loss_clip": 0.01104405, "auxiliary_loss_mlp": 0.01031151, "balance_loss_clip": 1.01902711, "balance_loss_mlp": 1.03510499, "epoch": 0.6974898542011123, "flos": 28366449327360.0, "grad_norm": 1.9433637125138177, "language_loss": 0.76420867, "learning_rate": 8.854414793655771e-07, "loss": 0.78556418, "num_input_tokens_seen": 250371705, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 11601, "time_per_iteration": 2.538672685623169 }, { "auxiliary_loss_clip": 0.01100036, "auxiliary_loss_mlp": 0.01027741, "balance_loss_clip": 1.01660669, "balance_loss_mlp": 1.03419936, "epoch": 0.6975499774537802, "flos": 15232050351360.0, "grad_norm": 1.7908867620747397, "language_loss": 0.71893954, "learning_rate": 8.851181206773508e-07, "loss": 0.74021733, "num_input_tokens_seen": 250390485, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.66015625, "step": 11602, "time_per_iteration": 2.439824342727661 }, { "auxiliary_loss_clip": 0.01104345, "auxiliary_loss_mlp": 0.01034924, "balance_loss_clip": 1.0235455, "balance_loss_mlp": 1.03549159, "epoch": 0.6976101007064482, "flos": 22157422306560.0, "grad_norm": 2.108498230317644, "language_loss": 0.76562029, "learning_rate": 8.847948042655567e-07, "loss": 0.78701299, "num_input_tokens_seen": 250407020, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6875, "step": 11603, "time_per_iteration": 2.486558675765991 }, { "auxiliary_loss_clip": 0.01104828, "auxiliary_loss_mlp": 0.01028874, "balance_loss_clip": 1.0172447, "balance_loss_mlp": 1.036448, "epoch": 0.6976702239591162, "flos": 22273522041600.0, "grad_norm": 1.7528199198844627, "language_loss": 0.62372947, "learning_rate": 8.844715301424557e-07, "loss": 0.6450665, "num_input_tokens_seen": 250425880, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.68359375, "step": 11604, "time_per_iteration": 2.473111391067505 }, { "auxiliary_loss_clip": 0.01106289, "auxiliary_loss_mlp": 0.01031715, "balance_loss_clip": 1.01839328, "balance_loss_mlp": 1.03634167, "epoch": 0.6977303472117842, "flos": 25848608653440.0, "grad_norm": 2.7441363199418327, "language_loss": 0.81493753, "learning_rate": 8.841482983203057e-07, "loss": 0.8363176, "num_input_tokens_seen": 250442925, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 11605, "time_per_iteration": 2.506160259246826 }, { "auxiliary_loss_clip": 0.01104722, "auxiliary_loss_mlp": 0.01032547, "balance_loss_clip": 1.02088857, "balance_loss_mlp": 1.03575468, "epoch": 0.6977904704644521, "flos": 20959586536320.0, "grad_norm": 1.59248414443943, "language_loss": 0.70605981, "learning_rate": 8.838251088113638e-07, "loss": 0.72743249, "num_input_tokens_seen": 250461220, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.69140625, "step": 11606, "time_per_iteration": 2.4771409034729004 }, { "auxiliary_loss_clip": 0.01107602, "auxiliary_loss_mlp": 0.01032704, "balance_loss_clip": 1.02030003, "balance_loss_mlp": 1.03655982, "epoch": 0.6978505937171201, "flos": 22055041566720.0, "grad_norm": 2.9988430135592474, "language_loss": 0.82380277, "learning_rate": 8.835019616278856e-07, "loss": 0.84520578, "num_input_tokens_seen": 250480975, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 11607, "time_per_iteration": 2.500164270401001 }, { "auxiliary_loss_clip": 0.01110099, "auxiliary_loss_mlp": 0.01030617, "balance_loss_clip": 1.01774788, "balance_loss_mlp": 1.03788257, "epoch": 0.697910716969788, "flos": 20043720529920.0, "grad_norm": 2.005530420050378, "language_loss": 0.79189777, "learning_rate": 8.831788567821265e-07, "loss": 0.81330496, "num_input_tokens_seen": 250497980, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 11608, "time_per_iteration": 2.455535888671875 }, { "auxiliary_loss_clip": 0.01105063, "auxiliary_loss_mlp": 0.01031337, "balance_loss_clip": 1.01929045, "balance_loss_mlp": 1.0350641, "epoch": 0.697970840222456, "flos": 15888245961600.0, "grad_norm": 1.9575002009852538, "language_loss": 0.90115511, "learning_rate": 8.828557942863357e-07, "loss": 0.92251909, "num_input_tokens_seen": 250511910, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69921875, "step": 11609, "time_per_iteration": 2.458540439605713 }, { "auxiliary_loss_clip": 0.01106272, "auxiliary_loss_mlp": 0.01028351, "balance_loss_clip": 1.01597118, "balance_loss_mlp": 1.0353148, "epoch": 0.698030963475124, "flos": 21215629658880.0, "grad_norm": 2.053763114133784, "language_loss": 0.64128983, "learning_rate": 8.82532774152765e-07, "loss": 0.66263604, "num_input_tokens_seen": 250531090, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 11610, "time_per_iteration": 2.5363504886627197 }, { "auxiliary_loss_clip": 0.01103232, "auxiliary_loss_mlp": 0.01031994, "balance_loss_clip": 1.02091908, "balance_loss_mlp": 1.03466535, "epoch": 0.698091086727792, "flos": 33759728524800.0, "grad_norm": 1.8484636901089424, "language_loss": 0.84402442, "learning_rate": 8.822097963936643e-07, "loss": 0.86537671, "num_input_tokens_seen": 250551565, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.68359375, "step": 11611, "time_per_iteration": 2.6133389472961426 }, { "auxiliary_loss_clip": 0.01106796, "auxiliary_loss_mlp": 0.01035286, "balance_loss_clip": 1.02325797, "balance_loss_mlp": 1.03582168, "epoch": 0.69815120998046, "flos": 15887850912000.0, "grad_norm": 2.0911638078349424, "language_loss": 0.71476245, "learning_rate": 8.818868610212793e-07, "loss": 0.73618329, "num_input_tokens_seen": 250569625, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 11612, "time_per_iteration": 2.509822368621826 }, { "auxiliary_loss_clip": 0.01103444, "auxiliary_loss_mlp": 0.01030273, "balance_loss_clip": 1.01823902, "balance_loss_mlp": 1.0355221, "epoch": 0.6982113332331279, "flos": 18947044437120.0, "grad_norm": 2.2880146712861125, "language_loss": 0.81013566, "learning_rate": 8.815639680478573e-07, "loss": 0.83147275, "num_input_tokens_seen": 250586960, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 11613, "time_per_iteration": 2.5158021450042725 }, { "auxiliary_loss_clip": 0.01104371, "auxiliary_loss_mlp": 0.01031167, "balance_loss_clip": 1.02052188, "balance_loss_mlp": 1.03688025, "epoch": 0.6982714564857959, "flos": 24389594115840.0, "grad_norm": 2.6664353934029275, "language_loss": 0.7554872, "learning_rate": 8.812411174856411e-07, "loss": 0.77684259, "num_input_tokens_seen": 250605080, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.671875, "step": 11614, "time_per_iteration": 2.551365852355957 }, { "auxiliary_loss_clip": 0.01102383, "auxiliary_loss_mlp": 0.01030268, "balance_loss_clip": 1.01811469, "balance_loss_mlp": 1.03422534, "epoch": 0.6983315797384638, "flos": 20083725302400.0, "grad_norm": 2.07441829908564, "language_loss": 0.7705943, "learning_rate": 8.809183093468746e-07, "loss": 0.79192084, "num_input_tokens_seen": 250623965, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6796875, "step": 11615, "time_per_iteration": 2.52093243598938 }, { "auxiliary_loss_clip": 0.01101641, "auxiliary_loss_mlp": 0.01030631, "balance_loss_clip": 1.01888311, "balance_loss_mlp": 1.03531909, "epoch": 0.6983917029911318, "flos": 13512431664000.0, "grad_norm": 1.8349025591528112, "language_loss": 0.73146605, "learning_rate": 8.80595543643797e-07, "loss": 0.75278878, "num_input_tokens_seen": 250640675, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 11616, "time_per_iteration": 2.440202474594116 }, { "auxiliary_loss_clip": 0.01106499, "auxiliary_loss_mlp": 0.01032276, "balance_loss_clip": 1.02070022, "balance_loss_mlp": 1.03901982, "epoch": 0.6984518262437998, "flos": 22018412672640.0, "grad_norm": 1.627087657914141, "language_loss": 0.84199589, "learning_rate": 8.802728203886487e-07, "loss": 0.86338365, "num_input_tokens_seen": 250660295, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.67578125, "step": 11617, "time_per_iteration": 2.490809917449951 }, { "auxiliary_loss_clip": 0.01110099, "auxiliary_loss_mlp": 0.01040127, "balance_loss_clip": 1.02733612, "balance_loss_mlp": 1.03945529, "epoch": 0.6985119494964678, "flos": 18770615809920.0, "grad_norm": 2.6103203724654294, "language_loss": 0.59673822, "learning_rate": 8.799501395936682e-07, "loss": 0.61824048, "num_input_tokens_seen": 250678155, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 11618, "time_per_iteration": 2.4799909591674805 }, { "auxiliary_loss_clip": 0.01104797, "auxiliary_loss_mlp": 0.01034248, "balance_loss_clip": 1.02250552, "balance_loss_mlp": 1.03602076, "epoch": 0.6985720727491357, "flos": 22382834106240.0, "grad_norm": 1.6763066262887567, "language_loss": 0.83316934, "learning_rate": 8.796275012710903e-07, "loss": 0.85455978, "num_input_tokens_seen": 250697230, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 11619, "time_per_iteration": 2.502897262573242 }, { "auxiliary_loss_clip": 0.01100084, "auxiliary_loss_mlp": 0.0102783, "balance_loss_clip": 1.01715517, "balance_loss_mlp": 1.0343039, "epoch": 0.6986321960018037, "flos": 39567884785920.0, "grad_norm": 1.7113516553151407, "language_loss": 0.67163968, "learning_rate": 8.793049054331494e-07, "loss": 0.69291884, "num_input_tokens_seen": 250719865, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.65625, "step": 11620, "time_per_iteration": 2.6628575325012207 }, { "auxiliary_loss_clip": 0.01105374, "auxiliary_loss_mlp": 0.01030084, "balance_loss_clip": 1.01796007, "balance_loss_mlp": 1.03542542, "epoch": 0.6986923192544716, "flos": 17967725055360.0, "grad_norm": 1.996502144333865, "language_loss": 0.72766769, "learning_rate": 8.789823520920794e-07, "loss": 0.74902231, "num_input_tokens_seen": 250736565, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 11621, "time_per_iteration": 2.473581314086914 }, { "auxiliary_loss_clip": 0.01108304, "auxiliary_loss_mlp": 0.01038912, "balance_loss_clip": 1.0265975, "balance_loss_mlp": 1.03750336, "epoch": 0.6987524425071396, "flos": 25594325297280.0, "grad_norm": 1.6970593655051551, "language_loss": 0.68584043, "learning_rate": 8.7865984126011e-07, "loss": 0.70731258, "num_input_tokens_seen": 250757235, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 11622, "time_per_iteration": 2.520883083343506 }, { "auxiliary_loss_clip": 0.01100064, "auxiliary_loss_mlp": 0.01026911, "balance_loss_clip": 1.01563358, "balance_loss_mlp": 1.03425586, "epoch": 0.6988125657598077, "flos": 17530081747200.0, "grad_norm": 1.8258648285875874, "language_loss": 0.62511802, "learning_rate": 8.783373729494721e-07, "loss": 0.64638776, "num_input_tokens_seen": 250775585, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.65625, "step": 11623, "time_per_iteration": 2.4651739597320557 }, { "auxiliary_loss_clip": 0.01108496, "auxiliary_loss_mlp": 0.01026776, "balance_loss_clip": 1.01385427, "balance_loss_mlp": 1.035851, "epoch": 0.6988726890124756, "flos": 39165721136640.0, "grad_norm": 2.0582917622659074, "language_loss": 0.60771811, "learning_rate": 8.780149471723932e-07, "loss": 0.62907088, "num_input_tokens_seen": 250795725, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 11624, "time_per_iteration": 2.615926504135132 }, { "auxiliary_loss_clip": 0.0110558, "auxiliary_loss_mlp": 0.01038582, "balance_loss_clip": 1.02589798, "balance_loss_mlp": 1.03431857, "epoch": 0.6989328122651436, "flos": 20193468330240.0, "grad_norm": 2.1556771375037265, "language_loss": 0.78578514, "learning_rate": 8.776925639411017e-07, "loss": 0.80722678, "num_input_tokens_seen": 250814555, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 11625, "time_per_iteration": 2.4747252464294434 }, { "auxiliary_loss_clip": 0.01101565, "auxiliary_loss_mlp": 0.01028041, "balance_loss_clip": 1.01682329, "balance_loss_mlp": 1.03572524, "epoch": 0.6989929355178115, "flos": 21834873152640.0, "grad_norm": 1.8622218247762776, "language_loss": 0.6574744, "learning_rate": 8.773702232678188e-07, "loss": 0.67877042, "num_input_tokens_seen": 250833105, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.66015625, "step": 11626, "time_per_iteration": 2.4700565338134766 }, { "auxiliary_loss_clip": 0.01106694, "auxiliary_loss_mlp": 0.01033063, "balance_loss_clip": 1.02036095, "balance_loss_mlp": 1.03691936, "epoch": 0.6990530587704795, "flos": 26322880855680.0, "grad_norm": 2.7600879923606496, "language_loss": 0.70462799, "learning_rate": 8.770479251647697e-07, "loss": 0.72602558, "num_input_tokens_seen": 250852570, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 11627, "time_per_iteration": 2.5493886470794678 }, { "auxiliary_loss_clip": 0.01101707, "auxiliary_loss_mlp": 0.01024778, "balance_loss_clip": 1.0136857, "balance_loss_mlp": 1.03660667, "epoch": 0.6991131820231474, "flos": 19828975069440.0, "grad_norm": 1.7939847120766723, "language_loss": 0.62709284, "learning_rate": 8.767256696441768e-07, "loss": 0.64835769, "num_input_tokens_seen": 250870500, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.6484375, "step": 11628, "time_per_iteration": 3.860811710357666 }, { "auxiliary_loss_clip": 0.0110495, "auxiliary_loss_mlp": 0.01034964, "balance_loss_clip": 1.02250671, "balance_loss_mlp": 1.03492296, "epoch": 0.6991733052758154, "flos": 33984817102080.0, "grad_norm": 2.1904195179624226, "language_loss": 0.67744625, "learning_rate": 8.764034567182581e-07, "loss": 0.69884539, "num_input_tokens_seen": 250892745, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 11629, "time_per_iteration": 2.5997002124786377 }, { "auxiliary_loss_clip": 0.01105732, "auxiliary_loss_mlp": 0.01032007, "balance_loss_clip": 1.01898336, "balance_loss_mlp": 1.0377816, "epoch": 0.6992334285284834, "flos": 15633136592640.0, "grad_norm": 1.8735634418597642, "language_loss": 0.7253778, "learning_rate": 8.760812863992337e-07, "loss": 0.74675524, "num_input_tokens_seen": 250910225, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6796875, "step": 11630, "time_per_iteration": 4.600784540176392 }, { "auxiliary_loss_clip": 0.0110575, "auxiliary_loss_mlp": 0.01032704, "balance_loss_clip": 1.02119458, "balance_loss_mlp": 1.03829384, "epoch": 0.6992935517811514, "flos": 21726279360000.0, "grad_norm": 1.9185037103923603, "language_loss": 0.74058038, "learning_rate": 8.757591586993196e-07, "loss": 0.76196492, "num_input_tokens_seen": 250929715, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.67578125, "step": 11631, "time_per_iteration": 2.5253591537475586 }, { "auxiliary_loss_clip": 0.01110744, "auxiliary_loss_mlp": 0.0103067, "balance_loss_clip": 1.01771748, "balance_loss_mlp": 1.0394721, "epoch": 0.6993536750338193, "flos": 20115254465280.0, "grad_norm": 2.1195273025246, "language_loss": 0.89592528, "learning_rate": 8.7543707363073e-07, "loss": 0.91733938, "num_input_tokens_seen": 250944230, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 11632, "time_per_iteration": 3.865445613861084 }, { "auxiliary_loss_clip": 0.01109291, "auxiliary_loss_mlp": 0.01035348, "balance_loss_clip": 1.02373147, "balance_loss_mlp": 1.03841043, "epoch": 0.6994137982864873, "flos": 22010547594240.0, "grad_norm": 20.062492804141456, "language_loss": 0.80157268, "learning_rate": 8.751150312056792e-07, "loss": 0.82301909, "num_input_tokens_seen": 250961865, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.7109375, "step": 11633, "time_per_iteration": 3.9196760654449463 }, { "auxiliary_loss_clip": 0.01109246, "auxiliary_loss_mlp": 0.01032532, "balance_loss_clip": 1.01915073, "balance_loss_mlp": 1.03712249, "epoch": 0.6994739215391552, "flos": 25519020433920.0, "grad_norm": 1.9944002088465944, "language_loss": 0.66937685, "learning_rate": 8.747930314363794e-07, "loss": 0.69079465, "num_input_tokens_seen": 250982025, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 11634, "time_per_iteration": 2.539992332458496 }, { "auxiliary_loss_clip": 0.0103058, "auxiliary_loss_mlp": 0.01000169, "balance_loss_clip": 0.99899524, "balance_loss_mlp": 1.00771785, "epoch": 0.6995340447918232, "flos": 59128357691520.0, "grad_norm": 0.7024348217087161, "language_loss": 0.5323118, "learning_rate": 8.744710743350412e-07, "loss": 0.55261928, "num_input_tokens_seen": 251046900, "router_z_loss_clip": 0.01171875, "router_z_loss_mlp": 0.22851562, "step": 11635, "time_per_iteration": 3.2302350997924805 }, { "auxiliary_loss_clip": 0.01105008, "auxiliary_loss_mlp": 0.01028574, "balance_loss_clip": 1.01603889, "balance_loss_mlp": 1.03646111, "epoch": 0.6995941680444913, "flos": 17967832796160.0, "grad_norm": 1.689460297165207, "language_loss": 0.82142043, "learning_rate": 8.741491599138726e-07, "loss": 0.84275621, "num_input_tokens_seen": 251065050, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 11636, "time_per_iteration": 2.481799602508545 }, { "auxiliary_loss_clip": 0.01106561, "auxiliary_loss_mlp": 0.01027122, "balance_loss_clip": 1.01539135, "balance_loss_mlp": 1.03690076, "epoch": 0.6996542912971592, "flos": 21980095839360.0, "grad_norm": 1.8875159407001074, "language_loss": 0.82955056, "learning_rate": 8.738272881850801e-07, "loss": 0.85088736, "num_input_tokens_seen": 251083355, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6953125, "step": 11637, "time_per_iteration": 2.5126452445983887 }, { "auxiliary_loss_clip": 0.01104996, "auxiliary_loss_mlp": 0.0103168, "balance_loss_clip": 1.01978874, "balance_loss_mlp": 1.0366888, "epoch": 0.6997144145498272, "flos": 11686158518400.0, "grad_norm": 1.971711267030947, "language_loss": 0.67729729, "learning_rate": 8.735054591608704e-07, "loss": 0.69866407, "num_input_tokens_seen": 251096420, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 11638, "time_per_iteration": 2.4696929454803467 }, { "auxiliary_loss_clip": 0.01109971, "auxiliary_loss_mlp": 0.01032524, "balance_loss_clip": 1.0190351, "balance_loss_mlp": 1.03743172, "epoch": 0.6997745378024951, "flos": 29607162958080.0, "grad_norm": 5.277572796530957, "language_loss": 0.77738595, "learning_rate": 8.731836728534459e-07, "loss": 0.79881084, "num_input_tokens_seen": 251115410, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 11639, "time_per_iteration": 2.5701446533203125 }, { "auxiliary_loss_clip": 0.01107768, "auxiliary_loss_mlp": 0.01035685, "balance_loss_clip": 1.02283466, "balance_loss_mlp": 1.03797746, "epoch": 0.6998346610551631, "flos": 20886616056960.0, "grad_norm": 2.855678772765348, "language_loss": 0.8247366, "learning_rate": 8.728619292750093e-07, "loss": 0.84617114, "num_input_tokens_seen": 251133530, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 11640, "time_per_iteration": 2.5337862968444824 }, { "auxiliary_loss_clip": 0.01103587, "auxiliary_loss_mlp": 0.01030167, "balance_loss_clip": 1.0182817, "balance_loss_mlp": 1.03483486, "epoch": 0.699894784307831, "flos": 27163046949120.0, "grad_norm": 1.9881432316705654, "language_loss": 0.75453281, "learning_rate": 8.725402284377619e-07, "loss": 0.77587032, "num_input_tokens_seen": 251153985, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 11641, "time_per_iteration": 2.546868085861206 }, { "auxiliary_loss_clip": 0.01107747, "auxiliary_loss_mlp": 0.01023652, "balance_loss_clip": 1.01092041, "balance_loss_mlp": 1.03826416, "epoch": 0.699954907560499, "flos": 20923640000640.0, "grad_norm": 2.789434768542272, "language_loss": 0.77294958, "learning_rate": 8.722185703539022e-07, "loss": 0.7942636, "num_input_tokens_seen": 251173225, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 11642, "time_per_iteration": 2.5453855991363525 }, { "auxiliary_loss_clip": 0.01110752, "auxiliary_loss_mlp": 0.01034414, "balance_loss_clip": 1.02029991, "balance_loss_mlp": 1.03836322, "epoch": 0.700015030813167, "flos": 28657792540800.0, "grad_norm": 2.678451798098785, "language_loss": 0.74602783, "learning_rate": 8.718969550356266e-07, "loss": 0.76747948, "num_input_tokens_seen": 251192485, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.72265625, "step": 11643, "time_per_iteration": 2.579407215118408 }, { "auxiliary_loss_clip": 0.01106499, "auxiliary_loss_mlp": 0.01024238, "balance_loss_clip": 1.01183414, "balance_loss_mlp": 1.03601778, "epoch": 0.700075154065835, "flos": 29205286617600.0, "grad_norm": 1.542412017965712, "language_loss": 0.60515893, "learning_rate": 8.715753824951315e-07, "loss": 0.62646627, "num_input_tokens_seen": 251214965, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 11644, "time_per_iteration": 2.5638370513916016 }, { "auxiliary_loss_clip": 0.01102883, "auxiliary_loss_mlp": 0.01028748, "balance_loss_clip": 1.01713145, "balance_loss_mlp": 1.03594184, "epoch": 0.7001352773185029, "flos": 23112431159040.0, "grad_norm": 1.6848288531719757, "language_loss": 0.81862295, "learning_rate": 8.712538527446119e-07, "loss": 0.83993924, "num_input_tokens_seen": 251234500, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 11645, "time_per_iteration": 2.519289016723633 }, { "auxiliary_loss_clip": 0.01105499, "auxiliary_loss_mlp": 0.010292, "balance_loss_clip": 1.01719522, "balance_loss_mlp": 1.03760517, "epoch": 0.7001954005711709, "flos": 21322858734720.0, "grad_norm": 2.063202820788479, "language_loss": 0.68544221, "learning_rate": 8.709323657962584e-07, "loss": 0.70678914, "num_input_tokens_seen": 251254360, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 11646, "time_per_iteration": 2.510496139526367 }, { "auxiliary_loss_clip": 0.01104564, "auxiliary_loss_mlp": 0.0103116, "balance_loss_clip": 1.01946568, "balance_loss_mlp": 1.03687799, "epoch": 0.7002555238238388, "flos": 24535822383360.0, "grad_norm": 1.635311952703229, "language_loss": 0.71109068, "learning_rate": 8.706109216622635e-07, "loss": 0.73244798, "num_input_tokens_seen": 251274790, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 11647, "time_per_iteration": 2.5083720684051514 }, { "auxiliary_loss_clip": 0.01109082, "auxiliary_loss_mlp": 0.01032022, "balance_loss_clip": 1.01906967, "balance_loss_mlp": 1.03893876, "epoch": 0.7003156470765068, "flos": 39056552726400.0, "grad_norm": 1.6290699106726747, "language_loss": 0.71390605, "learning_rate": 8.702895203548155e-07, "loss": 0.73531699, "num_input_tokens_seen": 251296275, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 11648, "time_per_iteration": 2.649841547012329 }, { "auxiliary_loss_clip": 0.01102602, "auxiliary_loss_mlp": 0.01027275, "balance_loss_clip": 1.01541996, "balance_loss_mlp": 1.03416109, "epoch": 0.7003757703291749, "flos": 28804092635520.0, "grad_norm": 1.5191008311060383, "language_loss": 0.77512163, "learning_rate": 8.699681618861014e-07, "loss": 0.7964204, "num_input_tokens_seen": 251317375, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.68359375, "step": 11649, "time_per_iteration": 2.537336826324463 }, { "auxiliary_loss_clip": 0.01104475, "auxiliary_loss_mlp": 0.01030071, "balance_loss_clip": 1.01815653, "balance_loss_mlp": 1.0362035, "epoch": 0.7004358935818428, "flos": 15953854152960.0, "grad_norm": 1.6925912203793392, "language_loss": 0.78723931, "learning_rate": 8.69646846268308e-07, "loss": 0.80858475, "num_input_tokens_seen": 251333570, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 11650, "time_per_iteration": 2.4698374271392822 }, { "auxiliary_loss_clip": 0.01102083, "auxiliary_loss_mlp": 0.01027526, "balance_loss_clip": 1.01567698, "balance_loss_mlp": 1.03346944, "epoch": 0.7004960168345108, "flos": 20411984718720.0, "grad_norm": 9.879918291569789, "language_loss": 0.78166789, "learning_rate": 8.693255735136194e-07, "loss": 0.80296397, "num_input_tokens_seen": 251351070, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 11651, "time_per_iteration": 2.466766357421875 }, { "auxiliary_loss_clip": 0.01108823, "auxiliary_loss_mlp": 0.01033147, "balance_loss_clip": 1.02161932, "balance_loss_mlp": 1.03840399, "epoch": 0.7005561400871787, "flos": 17347547808000.0, "grad_norm": 1.5632645783887438, "language_loss": 0.69482505, "learning_rate": 8.690043436342198e-07, "loss": 0.7162447, "num_input_tokens_seen": 251370005, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.703125, "step": 11652, "time_per_iteration": 2.4746286869049072 }, { "auxiliary_loss_clip": 0.01105629, "auxiliary_loss_mlp": 0.01029945, "balance_loss_clip": 1.01775599, "balance_loss_mlp": 1.03691578, "epoch": 0.7006162633398467, "flos": 25302120157440.0, "grad_norm": 1.567385862091198, "language_loss": 0.74338013, "learning_rate": 8.686831566422874e-07, "loss": 0.76473588, "num_input_tokens_seen": 251391210, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 11653, "time_per_iteration": 2.501417636871338 }, { "auxiliary_loss_clip": 0.01108198, "auxiliary_loss_mlp": 0.01030478, "balance_loss_clip": 1.01701939, "balance_loss_mlp": 1.03731084, "epoch": 0.7006763865925146, "flos": 20668997508480.0, "grad_norm": 1.9475289223270138, "language_loss": 0.70464575, "learning_rate": 8.68362012550003e-07, "loss": 0.7260325, "num_input_tokens_seen": 251411505, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 11654, "time_per_iteration": 2.5089621543884277 }, { "auxiliary_loss_clip": 0.01107388, "auxiliary_loss_mlp": 0.01033533, "balance_loss_clip": 1.01992559, "balance_loss_mlp": 1.03654993, "epoch": 0.7007365098451827, "flos": 20046449963520.0, "grad_norm": 3.216629528887452, "language_loss": 0.72965902, "learning_rate": 8.680409113695453e-07, "loss": 0.75106823, "num_input_tokens_seen": 251428975, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.70703125, "step": 11655, "time_per_iteration": 2.467848777770996 }, { "auxiliary_loss_clip": 0.01113745, "auxiliary_loss_mlp": 0.01034558, "balance_loss_clip": 1.02101016, "balance_loss_mlp": 1.03966093, "epoch": 0.7007966330978506, "flos": 20777375819520.0, "grad_norm": 1.88466202269225, "language_loss": 0.7021004, "learning_rate": 8.677198531130889e-07, "loss": 0.7235834, "num_input_tokens_seen": 251446940, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 11656, "time_per_iteration": 2.522388219833374 }, { "auxiliary_loss_clip": 0.01103401, "auxiliary_loss_mlp": 0.01030549, "balance_loss_clip": 1.01930749, "balance_loss_mlp": 1.03542674, "epoch": 0.7008567563505186, "flos": 29638189330560.0, "grad_norm": 1.832822096264156, "language_loss": 0.78046346, "learning_rate": 8.673988377928092e-07, "loss": 0.80180287, "num_input_tokens_seen": 251466205, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6796875, "step": 11657, "time_per_iteration": 2.554647207260132 }, { "auxiliary_loss_clip": 0.01111844, "auxiliary_loss_mlp": 0.01035414, "balance_loss_clip": 1.02209282, "balance_loss_mlp": 1.03819036, "epoch": 0.7009168796031865, "flos": 17092007475840.0, "grad_norm": 2.083594090431315, "language_loss": 0.78783327, "learning_rate": 8.670778654208797e-07, "loss": 0.80930591, "num_input_tokens_seen": 251484820, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 11658, "time_per_iteration": 2.4819703102111816 }, { "auxiliary_loss_clip": 0.01102322, "auxiliary_loss_mlp": 0.01024148, "balance_loss_clip": 1.01222098, "balance_loss_mlp": 1.03513384, "epoch": 0.7009770028558545, "flos": 20448972748800.0, "grad_norm": 1.858985537155984, "language_loss": 0.82521027, "learning_rate": 8.667569360094713e-07, "loss": 0.84647501, "num_input_tokens_seen": 251502670, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 11659, "time_per_iteration": 2.4927616119384766 }, { "auxiliary_loss_clip": 0.01105211, "auxiliary_loss_mlp": 0.01027831, "balance_loss_clip": 1.01613629, "balance_loss_mlp": 1.0372653, "epoch": 0.7010371261085224, "flos": 19245139407360.0, "grad_norm": 2.2306697554116344, "language_loss": 0.6953814, "learning_rate": 8.664360495707526e-07, "loss": 0.71671188, "num_input_tokens_seen": 251521630, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 11660, "time_per_iteration": 2.473585605621338 }, { "auxiliary_loss_clip": 0.0110835, "auxiliary_loss_mlp": 0.01033129, "balance_loss_clip": 1.01997459, "balance_loss_mlp": 1.03662157, "epoch": 0.7010972493611904, "flos": 22127581082880.0, "grad_norm": 1.8053060253164344, "language_loss": 0.80922222, "learning_rate": 8.661152061168924e-07, "loss": 0.83063698, "num_input_tokens_seen": 251540105, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 11661, "time_per_iteration": 2.4922735691070557 }, { "auxiliary_loss_clip": 0.01104094, "auxiliary_loss_mlp": 0.01031192, "balance_loss_clip": 1.01882982, "balance_loss_mlp": 1.03511846, "epoch": 0.7011573726138585, "flos": 31391132860800.0, "grad_norm": 1.770050832867085, "language_loss": 0.78982389, "learning_rate": 8.657944056600579e-07, "loss": 0.81117666, "num_input_tokens_seen": 251560530, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 11662, "time_per_iteration": 2.5876922607421875 }, { "auxiliary_loss_clip": 0.01107484, "auxiliary_loss_mlp": 0.01025968, "balance_loss_clip": 1.01351058, "balance_loss_mlp": 1.03653312, "epoch": 0.7012174958665264, "flos": 18150582216960.0, "grad_norm": 3.417831166832664, "language_loss": 0.83396673, "learning_rate": 8.654736482124134e-07, "loss": 0.8553012, "num_input_tokens_seen": 251577930, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7109375, "step": 11663, "time_per_iteration": 2.533363103866577 }, { "auxiliary_loss_clip": 0.01032818, "auxiliary_loss_mlp": 0.01002511, "balance_loss_clip": 1.0014267, "balance_loss_mlp": 1.00991344, "epoch": 0.7012776191191944, "flos": 60651256567680.0, "grad_norm": 0.8530263283952876, "language_loss": 0.53758955, "learning_rate": 8.651529337861209e-07, "loss": 0.55794287, "num_input_tokens_seen": 251638820, "router_z_loss_clip": 0.01086426, "router_z_loss_mlp": 0.22949219, "step": 11664, "time_per_iteration": 3.117719888687134 }, { "auxiliary_loss_clip": 0.01108977, "auxiliary_loss_mlp": 0.01029316, "balance_loss_clip": 1.01651883, "balance_loss_mlp": 1.03790414, "epoch": 0.7013377423718623, "flos": 27198598435200.0, "grad_norm": 2.1148833320898053, "language_loss": 0.7914294, "learning_rate": 8.64832262393344e-07, "loss": 0.81281233, "num_input_tokens_seen": 251658070, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 11665, "time_per_iteration": 2.595078945159912 }, { "auxiliary_loss_clip": 0.01105142, "auxiliary_loss_mlp": 0.01028994, "balance_loss_clip": 1.0174551, "balance_loss_mlp": 1.03623533, "epoch": 0.7013978656245303, "flos": 16543543731840.0, "grad_norm": 2.8957486419694174, "language_loss": 0.76850301, "learning_rate": 8.645116340462404e-07, "loss": 0.78984439, "num_input_tokens_seen": 251671575, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 11666, "time_per_iteration": 2.469961404800415 }, { "auxiliary_loss_clip": 0.01106364, "auxiliary_loss_mlp": 0.01028246, "balance_loss_clip": 1.01599145, "balance_loss_mlp": 1.03774261, "epoch": 0.7014579888771982, "flos": 23143780753920.0, "grad_norm": 1.995769952828367, "language_loss": 0.81341422, "learning_rate": 8.641910487569695e-07, "loss": 0.83476031, "num_input_tokens_seen": 251689350, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 11667, "time_per_iteration": 2.5135459899902344 }, { "auxiliary_loss_clip": 0.01105174, "auxiliary_loss_mlp": 0.01031476, "balance_loss_clip": 1.01901245, "balance_loss_mlp": 1.03648329, "epoch": 0.7015181121298663, "flos": 25082095397760.0, "grad_norm": 2.3765921336154463, "language_loss": 0.66015655, "learning_rate": 8.638705065376879e-07, "loss": 0.68152308, "num_input_tokens_seen": 251704635, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6875, "step": 11668, "time_per_iteration": 2.5081043243408203 }, { "auxiliary_loss_clip": 0.01108802, "auxiliary_loss_mlp": 0.01026915, "balance_loss_clip": 1.01454151, "balance_loss_mlp": 1.03715384, "epoch": 0.7015782353825342, "flos": 23327894891520.0, "grad_norm": 1.701541631505573, "language_loss": 0.76566321, "learning_rate": 8.635500074005519e-07, "loss": 0.78702033, "num_input_tokens_seen": 251723035, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71875, "step": 11669, "time_per_iteration": 2.501075267791748 }, { "auxiliary_loss_clip": 0.01033601, "auxiliary_loss_mlp": 0.01000434, "balance_loss_clip": 0.99922389, "balance_loss_mlp": 1.01076329, "epoch": 0.7016383586352022, "flos": 70397161107840.0, "grad_norm": 0.6925305208552336, "language_loss": 0.54463887, "learning_rate": 8.632295513577122e-07, "loss": 0.5649792, "num_input_tokens_seen": 251791630, "router_z_loss_clip": 0.01208496, "router_z_loss_mlp": 0.22851562, "step": 11670, "time_per_iteration": 4.567712306976318 }, { "auxiliary_loss_clip": 0.01106185, "auxiliary_loss_mlp": 0.01034486, "balance_loss_clip": 1.02181375, "balance_loss_mlp": 1.03747153, "epoch": 0.7016984818878701, "flos": 19792274348160.0, "grad_norm": 1.66887482589161, "language_loss": 0.81800985, "learning_rate": 8.629091384213218e-07, "loss": 0.8394165, "num_input_tokens_seen": 251809840, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 11671, "time_per_iteration": 2.5856833457946777 }, { "auxiliary_loss_clip": 0.01109665, "auxiliary_loss_mlp": 0.0103269, "balance_loss_clip": 1.01998234, "balance_loss_mlp": 1.03905821, "epoch": 0.7017586051405381, "flos": 12896923184640.0, "grad_norm": 2.072588827436551, "language_loss": 0.74964523, "learning_rate": 8.625887686035313e-07, "loss": 0.77106881, "num_input_tokens_seen": 251827550, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 11672, "time_per_iteration": 4.805166006088257 }, { "auxiliary_loss_clip": 0.01105231, "auxiliary_loss_mlp": 0.01029535, "balance_loss_clip": 1.01708341, "balance_loss_mlp": 1.03600955, "epoch": 0.701818728393206, "flos": 18332828847360.0, "grad_norm": 1.973334674213208, "language_loss": 0.87062687, "learning_rate": 8.622684419164883e-07, "loss": 0.89197457, "num_input_tokens_seen": 251844880, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 11673, "time_per_iteration": 2.4724910259246826 }, { "auxiliary_loss_clip": 0.01102985, "auxiliary_loss_mlp": 0.01025846, "balance_loss_clip": 1.01347852, "balance_loss_mlp": 1.03561926, "epoch": 0.701878851645874, "flos": 17384212615680.0, "grad_norm": 2.010750165209318, "language_loss": 0.73084015, "learning_rate": 8.619481583723399e-07, "loss": 0.75212848, "num_input_tokens_seen": 251861025, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.671875, "step": 11674, "time_per_iteration": 3.943964958190918 }, { "auxiliary_loss_clip": 0.01105858, "auxiliary_loss_mlp": 0.01032028, "balance_loss_clip": 1.02023816, "balance_loss_mlp": 1.03893304, "epoch": 0.701938974898542, "flos": 23915501481600.0, "grad_norm": 1.6683895562776783, "language_loss": 0.72153181, "learning_rate": 8.616279179832329e-07, "loss": 0.74291068, "num_input_tokens_seen": 251880175, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 11675, "time_per_iteration": 3.9637176990509033 }, { "auxiliary_loss_clip": 0.01108781, "auxiliary_loss_mlp": 0.01028641, "balance_loss_clip": 1.01580834, "balance_loss_mlp": 1.03880703, "epoch": 0.70199909815121, "flos": 21795586652160.0, "grad_norm": 2.355792396677909, "language_loss": 0.51412523, "learning_rate": 8.613077207613078e-07, "loss": 0.53549945, "num_input_tokens_seen": 251899005, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 11676, "time_per_iteration": 2.523552417755127 }, { "auxiliary_loss_clip": 0.01033305, "auxiliary_loss_mlp": 0.00998819, "balance_loss_clip": 0.9976328, "balance_loss_mlp": 1.01033926, "epoch": 0.702059221403878, "flos": 71715047109120.0, "grad_norm": 0.7350221906606006, "language_loss": 0.59262747, "learning_rate": 8.609875667187079e-07, "loss": 0.61294872, "num_input_tokens_seen": 251966790, "router_z_loss_clip": 0.01184082, "router_z_loss_mlp": 0.23046875, "step": 11677, "time_per_iteration": 3.2056033611297607 }, { "auxiliary_loss_clip": 0.01107131, "auxiliary_loss_mlp": 0.01029713, "balance_loss_clip": 1.01702893, "balance_loss_mlp": 1.03646779, "epoch": 0.7021193446565459, "flos": 28111052649600.0, "grad_norm": 2.6380036652448053, "language_loss": 0.62259996, "learning_rate": 8.606674558675737e-07, "loss": 0.64396834, "num_input_tokens_seen": 251989315, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 11678, "time_per_iteration": 2.582467555999756 }, { "auxiliary_loss_clip": 0.01106727, "auxiliary_loss_mlp": 0.0103405, "balance_loss_clip": 1.02157497, "balance_loss_mlp": 1.037709, "epoch": 0.7021794679092139, "flos": 22924905229440.0, "grad_norm": 1.8111370969259477, "language_loss": 0.79645258, "learning_rate": 8.603473882200444e-07, "loss": 0.81786036, "num_input_tokens_seen": 252006620, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 11679, "time_per_iteration": 2.5305778980255127 }, { "auxiliary_loss_clip": 0.01106295, "auxiliary_loss_mlp": 0.01035625, "balance_loss_clip": 1.02445507, "balance_loss_mlp": 1.03881693, "epoch": 0.7022395911618818, "flos": 18077827219200.0, "grad_norm": 2.3963777176864793, "language_loss": 0.70763308, "learning_rate": 8.600273637882567e-07, "loss": 0.72905231, "num_input_tokens_seen": 252024570, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.67578125, "step": 11680, "time_per_iteration": 2.5152008533477783 }, { "auxiliary_loss_clip": 0.01109989, "auxiliary_loss_mlp": 0.01030507, "balance_loss_clip": 1.01755464, "balance_loss_mlp": 1.03795016, "epoch": 0.7022997144145499, "flos": 16034294661120.0, "grad_norm": 2.4534718742211457, "language_loss": 0.75014967, "learning_rate": 8.597073825843446e-07, "loss": 0.77155465, "num_input_tokens_seen": 252042775, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 11681, "time_per_iteration": 2.478156328201294 }, { "auxiliary_loss_clip": 0.01107177, "auxiliary_loss_mlp": 0.01034752, "balance_loss_clip": 1.02325463, "balance_loss_mlp": 1.03742886, "epoch": 0.7023598376672178, "flos": 26468678160000.0, "grad_norm": 1.5336490314853448, "language_loss": 0.77126986, "learning_rate": 8.593874446204434e-07, "loss": 0.7926892, "num_input_tokens_seen": 252063690, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6953125, "step": 11682, "time_per_iteration": 2.5590810775756836 }, { "auxiliary_loss_clip": 0.01110001, "auxiliary_loss_mlp": 0.010337, "balance_loss_clip": 1.02072358, "balance_loss_mlp": 1.03829467, "epoch": 0.7024199609198858, "flos": 17055917285760.0, "grad_norm": 3.0919935402020418, "language_loss": 0.73594391, "learning_rate": 8.590675499086841e-07, "loss": 0.75738096, "num_input_tokens_seen": 252080335, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 11683, "time_per_iteration": 2.455183744430542 }, { "auxiliary_loss_clip": 0.0110802, "auxiliary_loss_mlp": 0.01029612, "balance_loss_clip": 1.01659393, "balance_loss_mlp": 1.03927267, "epoch": 0.7024800841725537, "flos": 25849039616640.0, "grad_norm": 2.2590768866942574, "language_loss": 0.71875417, "learning_rate": 8.587476984611976e-07, "loss": 0.74013054, "num_input_tokens_seen": 252101075, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6875, "step": 11684, "time_per_iteration": 2.5627694129943848 }, { "auxiliary_loss_clip": 0.01106918, "auxiliary_loss_mlp": 0.01032794, "balance_loss_clip": 1.0199964, "balance_loss_mlp": 1.03758669, "epoch": 0.7025402074252217, "flos": 23513014609920.0, "grad_norm": 3.0089345878457205, "language_loss": 0.72178155, "learning_rate": 8.584278902901128e-07, "loss": 0.74317867, "num_input_tokens_seen": 252120510, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 11685, "time_per_iteration": 2.5015435218811035 }, { "auxiliary_loss_clip": 0.01106856, "auxiliary_loss_mlp": 0.01030317, "balance_loss_clip": 1.01853871, "balance_loss_mlp": 1.03678465, "epoch": 0.7026003306778896, "flos": 20150985519360.0, "grad_norm": 2.0196217005495245, "language_loss": 0.84503329, "learning_rate": 8.581081254075582e-07, "loss": 0.86640501, "num_input_tokens_seen": 252137590, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.69921875, "step": 11686, "time_per_iteration": 2.4825117588043213 }, { "auxiliary_loss_clip": 0.01032191, "auxiliary_loss_mlp": 0.01000451, "balance_loss_clip": 0.99931282, "balance_loss_mlp": 1.00936222, "epoch": 0.7026604539305576, "flos": 64772400712320.0, "grad_norm": 0.9924969654304008, "language_loss": 0.69936085, "learning_rate": 8.577884038256566e-07, "loss": 0.71968728, "num_input_tokens_seen": 252199830, "router_z_loss_clip": 0.01141357, "router_z_loss_mlp": 0.22851562, "step": 11687, "time_per_iteration": 3.2638673782348633 }, { "auxiliary_loss_clip": 0.01107652, "auxiliary_loss_mlp": 0.01027861, "balance_loss_clip": 1.0155642, "balance_loss_mlp": 1.03790402, "epoch": 0.7027205771832256, "flos": 21871466133120.0, "grad_norm": 1.9338734143460796, "language_loss": 0.77217543, "learning_rate": 8.574687255565329e-07, "loss": 0.79353058, "num_input_tokens_seen": 252217200, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 11688, "time_per_iteration": 2.530214786529541 }, { "auxiliary_loss_clip": 0.01106206, "auxiliary_loss_mlp": 0.01031712, "balance_loss_clip": 1.01923728, "balance_loss_mlp": 1.0371182, "epoch": 0.7027807004358936, "flos": 23367791923200.0, "grad_norm": 2.828596195827914, "language_loss": 0.68649787, "learning_rate": 8.571490906123107e-07, "loss": 0.70787704, "num_input_tokens_seen": 252236105, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 11689, "time_per_iteration": 2.498386859893799 }, { "auxiliary_loss_clip": 0.01108154, "auxiliary_loss_mlp": 0.01038436, "balance_loss_clip": 1.02499509, "balance_loss_mlp": 1.03648269, "epoch": 0.7028408236885616, "flos": 15304266645120.0, "grad_norm": 2.368864445702962, "language_loss": 0.8032167, "learning_rate": 8.568294990051086e-07, "loss": 0.82468259, "num_input_tokens_seen": 252253315, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 11690, "time_per_iteration": 2.5046420097351074 }, { "auxiliary_loss_clip": 0.01107431, "auxiliary_loss_mlp": 0.01038303, "balance_loss_clip": 1.02611423, "balance_loss_mlp": 1.03818679, "epoch": 0.7029009469412295, "flos": 22018197191040.0, "grad_norm": 2.0204524412485103, "language_loss": 0.76026469, "learning_rate": 8.56509950747047e-07, "loss": 0.78172207, "num_input_tokens_seen": 252272765, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 11691, "time_per_iteration": 2.5132791996002197 }, { "auxiliary_loss_clip": 0.01107422, "auxiliary_loss_mlp": 0.01028044, "balance_loss_clip": 1.01587272, "balance_loss_mlp": 1.03865004, "epoch": 0.7029610701938975, "flos": 21835519597440.0, "grad_norm": 1.8849955536949305, "language_loss": 0.82141125, "learning_rate": 8.561904458502429e-07, "loss": 0.84276593, "num_input_tokens_seen": 252290510, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 11692, "time_per_iteration": 2.52038311958313 }, { "auxiliary_loss_clip": 0.01106814, "auxiliary_loss_mlp": 0.01027288, "balance_loss_clip": 1.01450896, "balance_loss_mlp": 1.03716469, "epoch": 0.7030211934465654, "flos": 19135647774720.0, "grad_norm": 1.5679484510457862, "language_loss": 0.76323932, "learning_rate": 8.558709843268111e-07, "loss": 0.78458035, "num_input_tokens_seen": 252309365, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 11693, "time_per_iteration": 2.5113704204559326 }, { "auxiliary_loss_clip": 0.0110895, "auxiliary_loss_mlp": 0.01034425, "balance_loss_clip": 1.02187836, "balance_loss_mlp": 1.03985739, "epoch": 0.7030813166992335, "flos": 38546010766080.0, "grad_norm": 1.3796819455739011, "language_loss": 0.68028337, "learning_rate": 8.55551566188866e-07, "loss": 0.70171708, "num_input_tokens_seen": 252333010, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 11694, "time_per_iteration": 2.666712999343872 }, { "auxiliary_loss_clip": 0.01105153, "auxiliary_loss_mlp": 0.01030584, "balance_loss_clip": 1.0177927, "balance_loss_mlp": 1.03533316, "epoch": 0.7031414399519014, "flos": 14720897859840.0, "grad_norm": 2.18705615270724, "language_loss": 0.75504017, "learning_rate": 8.552321914485203e-07, "loss": 0.77639753, "num_input_tokens_seen": 252351330, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 11695, "time_per_iteration": 2.474745750427246 }, { "auxiliary_loss_clip": 0.01109752, "auxiliary_loss_mlp": 0.01040626, "balance_loss_clip": 1.02753711, "balance_loss_mlp": 1.03806186, "epoch": 0.7032015632045694, "flos": 14027247342720.0, "grad_norm": 1.888568193633857, "language_loss": 0.73657346, "learning_rate": 8.549128601178852e-07, "loss": 0.75807726, "num_input_tokens_seen": 252369580, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 11696, "time_per_iteration": 2.4920551776885986 }, { "auxiliary_loss_clip": 0.01108943, "auxiliary_loss_mlp": 0.01029584, "balance_loss_clip": 1.01623845, "balance_loss_mlp": 1.03793621, "epoch": 0.7032616864572373, "flos": 27637175496960.0, "grad_norm": 1.6066163245235572, "language_loss": 0.75696862, "learning_rate": 8.545935722090693e-07, "loss": 0.77835381, "num_input_tokens_seen": 252390525, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 11697, "time_per_iteration": 2.5486221313476562 }, { "auxiliary_loss_clip": 0.01111592, "auxiliary_loss_mlp": 0.01035778, "balance_loss_clip": 1.02111554, "balance_loss_mlp": 1.03990841, "epoch": 0.7033218097099053, "flos": 17967294092160.0, "grad_norm": 2.103576480638904, "language_loss": 0.80890703, "learning_rate": 8.542743277341793e-07, "loss": 0.83038068, "num_input_tokens_seen": 252407470, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.71875, "step": 11698, "time_per_iteration": 2.4873170852661133 }, { "auxiliary_loss_clip": 0.01107237, "auxiliary_loss_mlp": 0.01034953, "balance_loss_clip": 1.02206683, "balance_loss_mlp": 1.03654575, "epoch": 0.7033819329625732, "flos": 19501721233920.0, "grad_norm": 1.8610901579244707, "language_loss": 0.84717786, "learning_rate": 8.539551267053222e-07, "loss": 0.86859977, "num_input_tokens_seen": 252427025, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 11699, "time_per_iteration": 2.536909580230713 }, { "auxiliary_loss_clip": 0.01106899, "auxiliary_loss_mlp": 0.01029247, "balance_loss_clip": 1.01597357, "balance_loss_mlp": 1.03792048, "epoch": 0.7034420562152413, "flos": 23987645948160.0, "grad_norm": 2.127678052745967, "language_loss": 0.78950107, "learning_rate": 8.53635969134601e-07, "loss": 0.81086248, "num_input_tokens_seen": 252445410, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6875, "step": 11700, "time_per_iteration": 2.51025128364563 }, { "auxiliary_loss_clip": 0.01106324, "auxiliary_loss_mlp": 0.01028227, "balance_loss_clip": 1.01500022, "balance_loss_mlp": 1.03580666, "epoch": 0.7035021794679092, "flos": 35043427756800.0, "grad_norm": 1.7312580638963355, "language_loss": 0.74572957, "learning_rate": 8.533168550341186e-07, "loss": 0.76707512, "num_input_tokens_seen": 252463905, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 11701, "time_per_iteration": 2.6283490657806396 }, { "auxiliary_loss_clip": 0.01110631, "auxiliary_loss_mlp": 0.01033736, "balance_loss_clip": 1.01977038, "balance_loss_mlp": 1.0389626, "epoch": 0.7035623027205772, "flos": 10997428164480.0, "grad_norm": 2.3042788572115502, "language_loss": 0.83780569, "learning_rate": 8.529977844159769e-07, "loss": 0.85924935, "num_input_tokens_seen": 252478655, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71875, "step": 11702, "time_per_iteration": 2.4499974250793457 }, { "auxiliary_loss_clip": 0.01106255, "auxiliary_loss_mlp": 0.01037442, "balance_loss_clip": 1.02420926, "balance_loss_mlp": 1.03647733, "epoch": 0.7036224259732452, "flos": 23623727304960.0, "grad_norm": 1.7675694642497881, "language_loss": 0.61238885, "learning_rate": 8.526787572922738e-07, "loss": 0.63382578, "num_input_tokens_seen": 252498740, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 11703, "time_per_iteration": 2.531778335571289 }, { "auxiliary_loss_clip": 0.01104052, "auxiliary_loss_mlp": 0.01030477, "balance_loss_clip": 1.01740563, "balance_loss_mlp": 1.03397393, "epoch": 0.7036825492259131, "flos": 31686175175040.0, "grad_norm": 1.8212697226705883, "language_loss": 0.61404991, "learning_rate": 8.523597736751067e-07, "loss": 0.63539523, "num_input_tokens_seen": 252517800, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 11704, "time_per_iteration": 2.697134256362915 }, { "auxiliary_loss_clip": 0.01101739, "auxiliary_loss_mlp": 0.01031385, "balance_loss_clip": 1.02019143, "balance_loss_mlp": 1.0350275, "epoch": 0.7037426724785811, "flos": 30192866127360.0, "grad_norm": 2.2162640201300925, "language_loss": 0.70967281, "learning_rate": 8.520408335765719e-07, "loss": 0.73100406, "num_input_tokens_seen": 252539620, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.66796875, "step": 11705, "time_per_iteration": 2.5919349193573 }, { "auxiliary_loss_clip": 0.01104637, "auxiliary_loss_mlp": 0.01032809, "balance_loss_clip": 1.02035737, "balance_loss_mlp": 1.03658736, "epoch": 0.703802795731249, "flos": 24311523905280.0, "grad_norm": 1.9570938461906935, "language_loss": 0.618155, "learning_rate": 8.517219370087645e-07, "loss": 0.63952947, "num_input_tokens_seen": 252557300, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6796875, "step": 11706, "time_per_iteration": 2.499753952026367 }, { "auxiliary_loss_clip": 0.0110741, "auxiliary_loss_mlp": 0.01030257, "balance_loss_clip": 1.01804996, "balance_loss_mlp": 1.03714228, "epoch": 0.7038629189839171, "flos": 22528954632960.0, "grad_norm": 6.721402740896111, "language_loss": 0.67652529, "learning_rate": 8.514030839837756e-07, "loss": 0.69790196, "num_input_tokens_seen": 252576715, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 11707, "time_per_iteration": 2.5129034519195557 }, { "auxiliary_loss_clip": 0.01103169, "auxiliary_loss_mlp": 0.01029751, "balance_loss_clip": 1.01782966, "balance_loss_mlp": 1.03592014, "epoch": 0.703923042236585, "flos": 26250484993920.0, "grad_norm": 2.0619190043436033, "language_loss": 0.76341081, "learning_rate": 8.510842745136974e-07, "loss": 0.78474003, "num_input_tokens_seen": 252596190, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 11708, "time_per_iteration": 2.527982234954834 }, { "auxiliary_loss_clip": 0.01104889, "auxiliary_loss_mlp": 0.01029324, "balance_loss_clip": 1.0175941, "balance_loss_mlp": 1.03754365, "epoch": 0.703983165489253, "flos": 19390254353280.0, "grad_norm": 2.5976902039680807, "language_loss": 0.72298443, "learning_rate": 8.50765508610619e-07, "loss": 0.74432653, "num_input_tokens_seen": 252613410, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 11709, "time_per_iteration": 2.492220163345337 }, { "auxiliary_loss_clip": 0.01104323, "auxiliary_loss_mlp": 0.01028437, "balance_loss_clip": 1.01619387, "balance_loss_mlp": 1.03601444, "epoch": 0.7040432887419209, "flos": 16683630773760.0, "grad_norm": 2.5950548985878146, "language_loss": 0.79009259, "learning_rate": 8.504467862866267e-07, "loss": 0.8114202, "num_input_tokens_seen": 252629150, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.68359375, "step": 11710, "time_per_iteration": 2.455881118774414 }, { "auxiliary_loss_clip": 0.01108249, "auxiliary_loss_mlp": 0.01035135, "balance_loss_clip": 1.02216554, "balance_loss_mlp": 1.03788424, "epoch": 0.7041034119945889, "flos": 21141402203520.0, "grad_norm": 1.5507096325813736, "language_loss": 0.769943, "learning_rate": 8.501281075538076e-07, "loss": 0.79137683, "num_input_tokens_seen": 252648225, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 11711, "time_per_iteration": 2.5098276138305664 }, { "auxiliary_loss_clip": 0.01103803, "auxiliary_loss_mlp": 0.01030254, "balance_loss_clip": 1.01884007, "balance_loss_mlp": 1.03539157, "epoch": 0.7041635352472568, "flos": 16910299549440.0, "grad_norm": 2.239432929251287, "language_loss": 0.74233305, "learning_rate": 8.498094724242457e-07, "loss": 0.7636736, "num_input_tokens_seen": 252665380, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.68359375, "step": 11712, "time_per_iteration": 3.863802909851074 }, { "auxiliary_loss_clip": 0.01032315, "auxiliary_loss_mlp": 0.01005538, "balance_loss_clip": 1.00437593, "balance_loss_mlp": 1.00927615, "epoch": 0.7042236584999249, "flos": 71681219475840.0, "grad_norm": 0.8944276508594787, "language_loss": 0.64657736, "learning_rate": 8.494908809100247e-07, "loss": 0.66695583, "num_input_tokens_seen": 252727950, "router_z_loss_clip": 0.01159668, "router_z_loss_mlp": 0.23046875, "step": 11713, "time_per_iteration": 4.795130491256714 }, { "auxiliary_loss_clip": 0.01105321, "auxiliary_loss_mlp": 0.01028316, "balance_loss_clip": 1.01653767, "balance_loss_mlp": 1.03509498, "epoch": 0.7042837817525928, "flos": 28658187590400.0, "grad_norm": 2.136950987068817, "language_loss": 0.73153108, "learning_rate": 8.49172333023225e-07, "loss": 0.75286746, "num_input_tokens_seen": 252746770, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.703125, "step": 11714, "time_per_iteration": 2.5705320835113525 }, { "auxiliary_loss_clip": 0.01104086, "auxiliary_loss_mlp": 0.01034727, "balance_loss_clip": 1.02218056, "balance_loss_mlp": 1.03567743, "epoch": 0.7043439050052608, "flos": 19753562465280.0, "grad_norm": 1.7363983919516786, "language_loss": 0.79564583, "learning_rate": 8.488538287759248e-07, "loss": 0.81703401, "num_input_tokens_seen": 252765610, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 11715, "time_per_iteration": 3.9430460929870605 }, { "auxiliary_loss_clip": 0.01107412, "auxiliary_loss_mlp": 0.01035793, "balance_loss_clip": 1.02312684, "balance_loss_mlp": 1.03607082, "epoch": 0.7044040282579288, "flos": 11538529620480.0, "grad_norm": 2.8424847507328552, "language_loss": 0.71338427, "learning_rate": 8.485353681802037e-07, "loss": 0.73481625, "num_input_tokens_seen": 252781610, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 11716, "time_per_iteration": 3.9443275928497314 }, { "auxiliary_loss_clip": 0.01109804, "auxiliary_loss_mlp": 0.01031894, "balance_loss_clip": 1.01934743, "balance_loss_mlp": 1.03772509, "epoch": 0.7044641515105967, "flos": 33656126722560.0, "grad_norm": 1.90540834340183, "language_loss": 0.66253996, "learning_rate": 8.482169512481358e-07, "loss": 0.68395698, "num_input_tokens_seen": 252800600, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 11717, "time_per_iteration": 2.6124470233917236 }, { "auxiliary_loss_clip": 0.01106491, "auxiliary_loss_mlp": 0.01030748, "balance_loss_clip": 1.01867783, "balance_loss_mlp": 1.0371412, "epoch": 0.7045242747632647, "flos": 26723859356160.0, "grad_norm": 1.6768388903641251, "language_loss": 0.7444309, "learning_rate": 8.478985779917967e-07, "loss": 0.76580322, "num_input_tokens_seen": 252822310, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69140625, "step": 11718, "time_per_iteration": 2.5420444011688232 }, { "auxiliary_loss_clip": 0.01105311, "auxiliary_loss_mlp": 0.01030685, "balance_loss_clip": 1.01937807, "balance_loss_mlp": 1.0370971, "epoch": 0.7045843980159326, "flos": 26797655848320.0, "grad_norm": 2.3754292428852484, "language_loss": 0.80227381, "learning_rate": 8.475802484232606e-07, "loss": 0.82363379, "num_input_tokens_seen": 252842355, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.68359375, "step": 11719, "time_per_iteration": 2.5388777256011963 }, { "auxiliary_loss_clip": 0.01106443, "auxiliary_loss_mlp": 0.01038637, "balance_loss_clip": 1.02561343, "balance_loss_mlp": 1.03734207, "epoch": 0.7046445212686007, "flos": 41574824363520.0, "grad_norm": 1.815611913907029, "language_loss": 0.65576136, "learning_rate": 8.472619625545951e-07, "loss": 0.67721218, "num_input_tokens_seen": 252866785, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69140625, "step": 11720, "time_per_iteration": 2.676215171813965 }, { "auxiliary_loss_clip": 0.01108949, "auxiliary_loss_mlp": 0.0103154, "balance_loss_clip": 1.01851666, "balance_loss_mlp": 1.03792071, "epoch": 0.7047046445212686, "flos": 15560166113280.0, "grad_norm": 2.1489974470020643, "language_loss": 0.79792809, "learning_rate": 8.46943720397872e-07, "loss": 0.81933302, "num_input_tokens_seen": 252881870, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 11721, "time_per_iteration": 2.4680728912353516 }, { "auxiliary_loss_clip": 0.01031879, "auxiliary_loss_mlp": 0.01003995, "balance_loss_clip": 1.00283837, "balance_loss_mlp": 1.00882161, "epoch": 0.7047647677739366, "flos": 70410269571840.0, "grad_norm": 0.778322926065859, "language_loss": 0.64831644, "learning_rate": 8.466255219651582e-07, "loss": 0.66867518, "num_input_tokens_seen": 252951300, "router_z_loss_clip": 0.01153564, "router_z_loss_mlp": 0.23046875, "step": 11722, "time_per_iteration": 3.2612884044647217 }, { "auxiliary_loss_clip": 0.01106573, "auxiliary_loss_mlp": 0.01031965, "balance_loss_clip": 1.02047944, "balance_loss_mlp": 1.03863955, "epoch": 0.7048248910266045, "flos": 23660032976640.0, "grad_norm": 1.8719161054718927, "language_loss": 0.65799642, "learning_rate": 8.463073672685211e-07, "loss": 0.67938185, "num_input_tokens_seen": 252971400, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6796875, "step": 11723, "time_per_iteration": 2.512153387069702 }, { "auxiliary_loss_clip": 0.01107913, "auxiliary_loss_mlp": 0.01028691, "balance_loss_clip": 1.01598334, "balance_loss_mlp": 1.03754735, "epoch": 0.7048850142792725, "flos": 21397158017280.0, "grad_norm": 2.0293597941741464, "language_loss": 0.80856228, "learning_rate": 8.459892563200235e-07, "loss": 0.8299284, "num_input_tokens_seen": 252989475, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 11724, "time_per_iteration": 2.511220693588257 }, { "auxiliary_loss_clip": 0.01106046, "auxiliary_loss_mlp": 0.01035319, "balance_loss_clip": 1.02293885, "balance_loss_mlp": 1.03563762, "epoch": 0.7049451375319404, "flos": 21648101408640.0, "grad_norm": 1.7239118810886125, "language_loss": 0.73237193, "learning_rate": 8.456711891317296e-07, "loss": 0.75378555, "num_input_tokens_seen": 253007220, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 11725, "time_per_iteration": 2.5122690200805664 }, { "auxiliary_loss_clip": 0.01108453, "auxiliary_loss_mlp": 0.01026646, "balance_loss_clip": 1.01372361, "balance_loss_mlp": 1.03735065, "epoch": 0.7050052607846085, "flos": 14866802904960.0, "grad_norm": 2.6558594544461593, "language_loss": 0.78526032, "learning_rate": 8.453531657156998e-07, "loss": 0.8066113, "num_input_tokens_seen": 253025410, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 11726, "time_per_iteration": 2.4851834774017334 }, { "auxiliary_loss_clip": 0.01105113, "auxiliary_loss_mlp": 0.01031386, "balance_loss_clip": 1.0195365, "balance_loss_mlp": 1.03521681, "epoch": 0.7050653840372764, "flos": 19241763528960.0, "grad_norm": 1.7486489375381844, "language_loss": 0.70439792, "learning_rate": 8.450351860839931e-07, "loss": 0.72576296, "num_input_tokens_seen": 253043305, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69921875, "step": 11727, "time_per_iteration": 2.479898452758789 }, { "auxiliary_loss_clip": 0.01095921, "auxiliary_loss_mlp": 0.01022835, "balance_loss_clip": 1.01184952, "balance_loss_mlp": 1.03198314, "epoch": 0.7051255072899444, "flos": 27780422935680.0, "grad_norm": 1.8123115680164956, "language_loss": 0.68774325, "learning_rate": 8.44717250248668e-07, "loss": 0.70893085, "num_input_tokens_seen": 253062790, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.640625, "step": 11728, "time_per_iteration": 2.584160566329956 }, { "auxiliary_loss_clip": 0.01105786, "auxiliary_loss_mlp": 0.01029473, "balance_loss_clip": 1.01749849, "balance_loss_mlp": 1.03751791, "epoch": 0.7051856305426124, "flos": 27892033470720.0, "grad_norm": 2.1224680364805333, "language_loss": 0.73691058, "learning_rate": 8.443993582217803e-07, "loss": 0.75826323, "num_input_tokens_seen": 253082055, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 11729, "time_per_iteration": 2.5715911388397217 }, { "auxiliary_loss_clip": 0.01110862, "auxiliary_loss_mlp": 0.01030512, "balance_loss_clip": 1.0172081, "balance_loss_mlp": 1.03775632, "epoch": 0.7052457537952803, "flos": 25043563082880.0, "grad_norm": 2.22133252580969, "language_loss": 0.78291833, "learning_rate": 8.440815100153862e-07, "loss": 0.80433208, "num_input_tokens_seen": 253102575, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 11730, "time_per_iteration": 2.548097610473633 }, { "auxiliary_loss_clip": 0.01104817, "auxiliary_loss_mlp": 0.01034876, "balance_loss_clip": 1.02257395, "balance_loss_mlp": 1.03491879, "epoch": 0.7053058770479483, "flos": 21871717528320.0, "grad_norm": 3.2161698494411293, "language_loss": 0.62725908, "learning_rate": 8.437637056415359e-07, "loss": 0.64865601, "num_input_tokens_seen": 253121290, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 11731, "time_per_iteration": 2.5264692306518555 }, { "auxiliary_loss_clip": 0.01107405, "auxiliary_loss_mlp": 0.01030588, "balance_loss_clip": 1.0174334, "balance_loss_mlp": 1.03677511, "epoch": 0.7053660003006162, "flos": 16398716094720.0, "grad_norm": 2.972927742956743, "language_loss": 0.74414766, "learning_rate": 8.434459451122815e-07, "loss": 0.76552761, "num_input_tokens_seen": 253139720, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 11732, "time_per_iteration": 2.4965715408325195 }, { "auxiliary_loss_clip": 0.01103633, "auxiliary_loss_mlp": 0.01025655, "balance_loss_clip": 1.01380014, "balance_loss_mlp": 1.03659534, "epoch": 0.7054261235532843, "flos": 22711560399360.0, "grad_norm": 2.141872630728023, "language_loss": 0.71324503, "learning_rate": 8.431282284396735e-07, "loss": 0.7345379, "num_input_tokens_seen": 253160250, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 11733, "time_per_iteration": 2.511979818344116 }, { "auxiliary_loss_clip": 0.01102638, "auxiliary_loss_mlp": 0.01031791, "balance_loss_clip": 1.01944113, "balance_loss_mlp": 1.03457141, "epoch": 0.7054862468059522, "flos": 13589711775360.0, "grad_norm": 1.9729153519410683, "language_loss": 0.73472071, "learning_rate": 8.428105556357583e-07, "loss": 0.75606501, "num_input_tokens_seen": 253178710, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6796875, "step": 11734, "time_per_iteration": 2.5134148597717285 }, { "auxiliary_loss_clip": 0.0110984, "auxiliary_loss_mlp": 0.01034546, "balance_loss_clip": 1.02151597, "balance_loss_mlp": 1.03703141, "epoch": 0.7055463700586202, "flos": 15880704105600.0, "grad_norm": 2.7102351575184978, "language_loss": 0.69484031, "learning_rate": 8.424929267125829e-07, "loss": 0.71628416, "num_input_tokens_seen": 253194805, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 11735, "time_per_iteration": 2.464202642440796 }, { "auxiliary_loss_clip": 0.01106982, "auxiliary_loss_mlp": 0.01035323, "balance_loss_clip": 1.02129185, "balance_loss_mlp": 1.03683376, "epoch": 0.7056064933112881, "flos": 23076161400960.0, "grad_norm": 1.7643227380068982, "language_loss": 0.72382838, "learning_rate": 8.421753416821933e-07, "loss": 0.74525136, "num_input_tokens_seen": 253213895, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.703125, "step": 11736, "time_per_iteration": 2.5361907482147217 }, { "auxiliary_loss_clip": 0.01104647, "auxiliary_loss_mlp": 0.01024169, "balance_loss_clip": 1.01280248, "balance_loss_mlp": 1.03756332, "epoch": 0.7056666165639561, "flos": 24057168721920.0, "grad_norm": 1.9095559680896819, "language_loss": 0.69326216, "learning_rate": 8.41857800556629e-07, "loss": 0.71455038, "num_input_tokens_seen": 253231620, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.671875, "step": 11737, "time_per_iteration": 2.518463373184204 }, { "auxiliary_loss_clip": 0.01108165, "auxiliary_loss_mlp": 0.01034916, "balance_loss_clip": 1.02202976, "balance_loss_mlp": 1.03748488, "epoch": 0.705726739816624, "flos": 17493237371520.0, "grad_norm": 1.955312005688383, "language_loss": 0.67453861, "learning_rate": 8.415403033479332e-07, "loss": 0.69596934, "num_input_tokens_seen": 253249590, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 11738, "time_per_iteration": 2.4825899600982666 }, { "auxiliary_loss_clip": 0.011054, "auxiliary_loss_mlp": 0.01032318, "balance_loss_clip": 1.01888943, "balance_loss_mlp": 1.03636014, "epoch": 0.7057868630692921, "flos": 51350426472960.0, "grad_norm": 1.8627541802752225, "language_loss": 0.75505579, "learning_rate": 8.41222850068145e-07, "loss": 0.77643299, "num_input_tokens_seen": 253273870, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.69140625, "step": 11739, "time_per_iteration": 2.749648332595825 }, { "auxiliary_loss_clip": 0.0110273, "auxiliary_loss_mlp": 0.01027991, "balance_loss_clip": 1.01574826, "balance_loss_mlp": 1.03612089, "epoch": 0.70584698632196, "flos": 26102963836800.0, "grad_norm": 2.121003424374266, "language_loss": 0.71436083, "learning_rate": 8.409054407293032e-07, "loss": 0.73566806, "num_input_tokens_seen": 253293720, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6640625, "step": 11740, "time_per_iteration": 2.548598527908325 }, { "auxiliary_loss_clip": 0.01106276, "auxiliary_loss_mlp": 0.01025139, "balance_loss_clip": 1.01378465, "balance_loss_mlp": 1.03792, "epoch": 0.705907109574628, "flos": 21543134889600.0, "grad_norm": 1.6577636614352285, "language_loss": 0.81935859, "learning_rate": 8.405880753434434e-07, "loss": 0.84067273, "num_input_tokens_seen": 253313700, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.68359375, "step": 11741, "time_per_iteration": 2.5073835849761963 }, { "auxiliary_loss_clip": 0.01105448, "auxiliary_loss_mlp": 0.01030009, "balance_loss_clip": 1.01726556, "balance_loss_mlp": 1.03587115, "epoch": 0.705967232827296, "flos": 22710842127360.0, "grad_norm": 1.759289341385379, "language_loss": 0.78157043, "learning_rate": 8.402707539225993e-07, "loss": 0.80292499, "num_input_tokens_seen": 253332425, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 11742, "time_per_iteration": 2.50445818901062 }, { "auxiliary_loss_clip": 0.01108454, "auxiliary_loss_mlp": 0.01031695, "balance_loss_clip": 1.01875508, "balance_loss_mlp": 1.03698194, "epoch": 0.7060273560799639, "flos": 28691225124480.0, "grad_norm": 2.3279313718821624, "language_loss": 0.64294463, "learning_rate": 8.39953476478805e-07, "loss": 0.6643461, "num_input_tokens_seen": 253353620, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 11743, "time_per_iteration": 2.5573887825012207 }, { "auxiliary_loss_clip": 0.01106437, "auxiliary_loss_mlp": 0.01031193, "balance_loss_clip": 1.01802075, "balance_loss_mlp": 1.03593886, "epoch": 0.7060874793326319, "flos": 15706178899200.0, "grad_norm": 1.8786483297781156, "language_loss": 0.65622693, "learning_rate": 8.396362430240902e-07, "loss": 0.67760319, "num_input_tokens_seen": 253370930, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 11744, "time_per_iteration": 2.4724485874176025 }, { "auxiliary_loss_clip": 0.01103904, "auxiliary_loss_mlp": 0.01033263, "balance_loss_clip": 1.02078152, "balance_loss_mlp": 1.03653002, "epoch": 0.7061476025852998, "flos": 21506757390720.0, "grad_norm": 2.391609576727409, "language_loss": 0.63683987, "learning_rate": 8.393190535704857e-07, "loss": 0.65821147, "num_input_tokens_seen": 253389810, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.671875, "step": 11745, "time_per_iteration": 2.5098531246185303 }, { "auxiliary_loss_clip": 0.01105719, "auxiliary_loss_mlp": 0.01032066, "balance_loss_clip": 1.0196507, "balance_loss_mlp": 1.03638959, "epoch": 0.7062077258379679, "flos": 28181832399360.0, "grad_norm": 1.931459935548851, "language_loss": 0.71610463, "learning_rate": 8.390019081300188e-07, "loss": 0.73748243, "num_input_tokens_seen": 253408685, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 11746, "time_per_iteration": 2.5833208560943604 }, { "auxiliary_loss_clip": 0.01106874, "auxiliary_loss_mlp": 0.0102842, "balance_loss_clip": 1.01617742, "balance_loss_mlp": 1.03694963, "epoch": 0.7062678490906358, "flos": 27853680723840.0, "grad_norm": 1.4315723892247072, "language_loss": 0.79152429, "learning_rate": 8.386848067147175e-07, "loss": 0.81287718, "num_input_tokens_seen": 253429685, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 11747, "time_per_iteration": 2.552320718765259 }, { "auxiliary_loss_clip": 0.01103034, "auxiliary_loss_mlp": 0.01031868, "balance_loss_clip": 1.02063274, "balance_loss_mlp": 1.03576374, "epoch": 0.7063279723433038, "flos": 23184862934400.0, "grad_norm": 3.2436838952006717, "language_loss": 0.65653783, "learning_rate": 8.383677493366031e-07, "loss": 0.67788684, "num_input_tokens_seen": 253448260, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.671875, "step": 11748, "time_per_iteration": 2.513094425201416 }, { "auxiliary_loss_clip": 0.01103848, "auxiliary_loss_mlp": 0.01036211, "balance_loss_clip": 1.02314568, "balance_loss_mlp": 1.03475881, "epoch": 0.7063880955959717, "flos": 20188655907840.0, "grad_norm": 2.88867653273361, "language_loss": 0.79095131, "learning_rate": 8.380507360077003e-07, "loss": 0.81235188, "num_input_tokens_seen": 253467725, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69140625, "step": 11749, "time_per_iteration": 2.490346908569336 }, { "auxiliary_loss_clip": 0.01031665, "auxiliary_loss_mlp": 0.01003962, "balance_loss_clip": 1.00272202, "balance_loss_mlp": 1.00857735, "epoch": 0.7064482188486397, "flos": 63668182763520.0, "grad_norm": 0.7969870199592879, "language_loss": 0.54024523, "learning_rate": 8.377337667400304e-07, "loss": 0.56060153, "num_input_tokens_seen": 253526940, "router_z_loss_clip": 0.01239014, "router_z_loss_mlp": 0.23046875, "step": 11750, "time_per_iteration": 3.091571569442749 }, { "auxiliary_loss_clip": 0.0110635, "auxiliary_loss_mlp": 0.01034024, "balance_loss_clip": 1.02131033, "balance_loss_mlp": 1.03756475, "epoch": 0.7065083421013076, "flos": 25191227894400.0, "grad_norm": 1.7046776798866428, "language_loss": 0.78525943, "learning_rate": 8.37416841545612e-07, "loss": 0.80666316, "num_input_tokens_seen": 253546160, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 11751, "time_per_iteration": 2.524388074874878 }, { "auxiliary_loss_clip": 0.01100705, "auxiliary_loss_mlp": 0.01028562, "balance_loss_clip": 1.01703405, "balance_loss_mlp": 1.03386784, "epoch": 0.7065684653539757, "flos": 22893699288960.0, "grad_norm": 1.8250878337607341, "language_loss": 0.68152463, "learning_rate": 8.370999604364634e-07, "loss": 0.7028172, "num_input_tokens_seen": 253565505, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 11752, "time_per_iteration": 2.5087268352508545 }, { "auxiliary_loss_clip": 0.01103829, "auxiliary_loss_mlp": 0.01037184, "balance_loss_clip": 1.02507257, "balance_loss_mlp": 1.03619635, "epoch": 0.7066285886066436, "flos": 23550254035200.0, "grad_norm": 2.1201846577434083, "language_loss": 0.7697351, "learning_rate": 8.367831234246025e-07, "loss": 0.79114521, "num_input_tokens_seen": 253585125, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 11753, "time_per_iteration": 3.9362142086029053 }, { "auxiliary_loss_clip": 0.01102124, "auxiliary_loss_mlp": 0.01030483, "balance_loss_clip": 1.01827025, "balance_loss_mlp": 1.03602743, "epoch": 0.7066887118593116, "flos": 21069293650560.0, "grad_norm": 1.5199275681747615, "language_loss": 0.71026623, "learning_rate": 8.364663305220405e-07, "loss": 0.7315923, "num_input_tokens_seen": 253604815, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6640625, "step": 11754, "time_per_iteration": 2.557443857192993 }, { "auxiliary_loss_clip": 0.01104738, "auxiliary_loss_mlp": 0.01034141, "balance_loss_clip": 1.02121902, "balance_loss_mlp": 1.03561211, "epoch": 0.7067488351119796, "flos": 21176307244800.0, "grad_norm": 2.2437927801684743, "language_loss": 0.89876676, "learning_rate": 8.361495817407919e-07, "loss": 0.92015553, "num_input_tokens_seen": 253622855, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 11755, "time_per_iteration": 4.159185409545898 }, { "auxiliary_loss_clip": 0.01103258, "auxiliary_loss_mlp": 0.01034933, "balance_loss_clip": 1.02252936, "balance_loss_mlp": 1.03499746, "epoch": 0.7068089583646475, "flos": 20449224144000.0, "grad_norm": 1.6505282441104998, "language_loss": 0.79702443, "learning_rate": 8.358328770928678e-07, "loss": 0.81840634, "num_input_tokens_seen": 253642760, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 11756, "time_per_iteration": 2.498427629470825 }, { "auxiliary_loss_clip": 0.01030637, "auxiliary_loss_mlp": 0.01002019, "balance_loss_clip": 1.00093985, "balance_loss_mlp": 1.00762248, "epoch": 0.7068690816173155, "flos": 59109179829120.0, "grad_norm": 0.9308942435207268, "language_loss": 0.60424054, "learning_rate": 8.355162165902785e-07, "loss": 0.62456709, "num_input_tokens_seen": 253695685, "router_z_loss_clip": 0.01080322, "router_z_loss_mlp": 0.23046875, "step": 11757, "time_per_iteration": 4.340214967727661 }, { "auxiliary_loss_clip": 0.01106066, "auxiliary_loss_mlp": 0.01033696, "balance_loss_clip": 1.02150095, "balance_loss_mlp": 1.03741765, "epoch": 0.7069292048699835, "flos": 16251554073600.0, "grad_norm": 2.2701270876346884, "language_loss": 0.80501378, "learning_rate": 8.351996002450307e-07, "loss": 0.82641149, "num_input_tokens_seen": 253713305, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 11758, "time_per_iteration": 3.9029741287231445 }, { "auxiliary_loss_clip": 0.0110115, "auxiliary_loss_mlp": 0.01034797, "balance_loss_clip": 1.02233911, "balance_loss_mlp": 1.03382182, "epoch": 0.7069893281226515, "flos": 41172768455040.0, "grad_norm": 2.0221701577591378, "language_loss": 0.77541518, "learning_rate": 8.348830280691304e-07, "loss": 0.79677469, "num_input_tokens_seen": 253736100, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.671875, "step": 11759, "time_per_iteration": 2.6973133087158203 }, { "auxiliary_loss_clip": 0.01103668, "auxiliary_loss_mlp": 0.01031738, "balance_loss_clip": 1.01901221, "balance_loss_mlp": 1.03487062, "epoch": 0.7070494513753194, "flos": 24207275658240.0, "grad_norm": 1.5017638976319723, "language_loss": 0.67600024, "learning_rate": 8.34566500074583e-07, "loss": 0.69735432, "num_input_tokens_seen": 253757350, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 11760, "time_per_iteration": 2.5262889862060547 }, { "auxiliary_loss_clip": 0.01106573, "auxiliary_loss_mlp": 0.01031957, "balance_loss_clip": 1.02011371, "balance_loss_mlp": 1.03673363, "epoch": 0.7071095746279874, "flos": 20185675079040.0, "grad_norm": 2.3716433854176016, "language_loss": 0.80249959, "learning_rate": 8.342500162733899e-07, "loss": 0.82388484, "num_input_tokens_seen": 253772855, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69921875, "step": 11761, "time_per_iteration": 2.491403818130493 }, { "auxiliary_loss_clip": 0.01104656, "auxiliary_loss_mlp": 0.01033067, "balance_loss_clip": 1.01979935, "balance_loss_mlp": 1.03509521, "epoch": 0.7071696978806553, "flos": 18183045133440.0, "grad_norm": 3.7434783249704684, "language_loss": 0.75248641, "learning_rate": 8.33933576677553e-07, "loss": 0.77386367, "num_input_tokens_seen": 253790360, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 11762, "time_per_iteration": 2.4582550525665283 }, { "auxiliary_loss_clip": 0.0110383, "auxiliary_loss_mlp": 0.01027888, "balance_loss_clip": 1.01590753, "balance_loss_mlp": 1.03662455, "epoch": 0.7072298211333233, "flos": 24131719399680.0, "grad_norm": 1.6976573689324679, "language_loss": 0.76771379, "learning_rate": 8.336171812990724e-07, "loss": 0.78903103, "num_input_tokens_seen": 253810585, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 11763, "time_per_iteration": 2.5434863567352295 }, { "auxiliary_loss_clip": 0.01105443, "auxiliary_loss_mlp": 0.01037425, "balance_loss_clip": 1.02432942, "balance_loss_mlp": 1.03603196, "epoch": 0.7072899443859912, "flos": 27198418867200.0, "grad_norm": 2.26215703203335, "language_loss": 0.78526556, "learning_rate": 8.333008301499453e-07, "loss": 0.80669427, "num_input_tokens_seen": 253829080, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 11764, "time_per_iteration": 2.5950381755828857 }, { "auxiliary_loss_clip": 0.01107844, "auxiliary_loss_mlp": 0.01039598, "balance_loss_clip": 1.02613926, "balance_loss_mlp": 1.03634763, "epoch": 0.7073500676386593, "flos": 16435596384000.0, "grad_norm": 1.6451698478935277, "language_loss": 0.79764682, "learning_rate": 8.32984523242167e-07, "loss": 0.81912124, "num_input_tokens_seen": 253846780, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 11765, "time_per_iteration": 2.4817936420440674 }, { "auxiliary_loss_clip": 0.0110134, "auxiliary_loss_mlp": 0.01025974, "balance_loss_clip": 1.01497734, "balance_loss_mlp": 1.03457522, "epoch": 0.7074101908913272, "flos": 27673732563840.0, "grad_norm": 1.6183332951405391, "language_loss": 0.68209255, "learning_rate": 8.326682605877324e-07, "loss": 0.70336568, "num_input_tokens_seen": 253867075, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.66796875, "step": 11766, "time_per_iteration": 2.5375938415527344 }, { "auxiliary_loss_clip": 0.01104798, "auxiliary_loss_mlp": 0.01041329, "balance_loss_clip": 1.02845395, "balance_loss_mlp": 1.03545868, "epoch": 0.7074703141439952, "flos": 22238078296320.0, "grad_norm": 2.353321436796727, "language_loss": 0.64040685, "learning_rate": 8.323520421986352e-07, "loss": 0.6618681, "num_input_tokens_seen": 253885790, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 11767, "time_per_iteration": 2.5338377952575684 }, { "auxiliary_loss_clip": 0.01104008, "auxiliary_loss_mlp": 0.01030029, "balance_loss_clip": 1.01764917, "balance_loss_mlp": 1.03436804, "epoch": 0.7075304373966632, "flos": 29643217234560.0, "grad_norm": 2.365872843253839, "language_loss": 0.52935034, "learning_rate": 8.320358680868646e-07, "loss": 0.55069071, "num_input_tokens_seen": 253907070, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 11768, "time_per_iteration": 2.5883901119232178 }, { "auxiliary_loss_clip": 0.0110218, "auxiliary_loss_mlp": 0.01032702, "balance_loss_clip": 1.0213474, "balance_loss_mlp": 1.03495204, "epoch": 0.7075905606493311, "flos": 19755214490880.0, "grad_norm": 2.100132953837532, "language_loss": 0.75548923, "learning_rate": 8.317197382644119e-07, "loss": 0.77683806, "num_input_tokens_seen": 253927290, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 11769, "time_per_iteration": 2.520831346511841 }, { "auxiliary_loss_clip": 0.01030353, "auxiliary_loss_mlp": 0.01005323, "balance_loss_clip": 1.00420809, "balance_loss_mlp": 1.00703812, "epoch": 0.7076506839019991, "flos": 65716132694400.0, "grad_norm": 0.8516619722630145, "language_loss": 0.62066931, "learning_rate": 8.314036527432637e-07, "loss": 0.64102608, "num_input_tokens_seen": 253983440, "router_z_loss_clip": 0.01116943, "router_z_loss_mlp": 0.23339844, "step": 11770, "time_per_iteration": 3.068150281906128 }, { "auxiliary_loss_clip": 0.01105683, "auxiliary_loss_mlp": 0.01036874, "balance_loss_clip": 1.02441049, "balance_loss_mlp": 1.03496075, "epoch": 0.707710807154667, "flos": 23765286804480.0, "grad_norm": 2.0170783191930433, "language_loss": 0.76260638, "learning_rate": 8.310876115354055e-07, "loss": 0.78403187, "num_input_tokens_seen": 254003825, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 11771, "time_per_iteration": 2.528618335723877 }, { "auxiliary_loss_clip": 0.01101291, "auxiliary_loss_mlp": 0.01031159, "balance_loss_clip": 1.01913095, "balance_loss_mlp": 1.03527367, "epoch": 0.7077709304073351, "flos": 21251360712960.0, "grad_norm": 1.8559185164441563, "language_loss": 0.71313381, "learning_rate": 8.307716146528221e-07, "loss": 0.73445833, "num_input_tokens_seen": 254023345, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.66015625, "step": 11772, "time_per_iteration": 2.558241367340088 }, { "auxiliary_loss_clip": 0.01107355, "auxiliary_loss_mlp": 0.01030507, "balance_loss_clip": 1.01811564, "balance_loss_mlp": 1.03615975, "epoch": 0.707831053660003, "flos": 20740746925440.0, "grad_norm": 1.9743914539704992, "language_loss": 0.69856739, "learning_rate": 8.30455662107496e-07, "loss": 0.71994597, "num_input_tokens_seen": 254041815, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 11773, "time_per_iteration": 2.55560302734375 }, { "auxiliary_loss_clip": 0.01104329, "auxiliary_loss_mlp": 0.01033938, "balance_loss_clip": 1.02173662, "balance_loss_mlp": 1.03521061, "epoch": 0.707891176912671, "flos": 21980993679360.0, "grad_norm": 1.5202293045569144, "language_loss": 0.7027868, "learning_rate": 8.301397539114095e-07, "loss": 0.72416943, "num_input_tokens_seen": 254062065, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 11774, "time_per_iteration": 2.5282015800476074 }, { "auxiliary_loss_clip": 0.0110142, "auxiliary_loss_mlp": 0.01028661, "balance_loss_clip": 1.01715088, "balance_loss_mlp": 1.0356853, "epoch": 0.7079513001653389, "flos": 21068970428160.0, "grad_norm": 2.1807713806339346, "language_loss": 0.74515295, "learning_rate": 8.298238900765407e-07, "loss": 0.76645374, "num_input_tokens_seen": 254080605, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.65625, "step": 11775, "time_per_iteration": 2.496699571609497 }, { "auxiliary_loss_clip": 0.01106702, "auxiliary_loss_mlp": 0.01031923, "balance_loss_clip": 1.01936388, "balance_loss_mlp": 1.03750277, "epoch": 0.7080114234180069, "flos": 18040659621120.0, "grad_norm": 3.2051283596160807, "language_loss": 0.87008333, "learning_rate": 8.295080706148665e-07, "loss": 0.89146954, "num_input_tokens_seen": 254098710, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 11776, "time_per_iteration": 2.5099000930786133 }, { "auxiliary_loss_clip": 0.01103215, "auxiliary_loss_mlp": 0.01028889, "balance_loss_clip": 1.01760554, "balance_loss_mlp": 1.03455877, "epoch": 0.7080715466706748, "flos": 15122271409920.0, "grad_norm": 1.5381151671979476, "language_loss": 0.75045645, "learning_rate": 8.291922955383641e-07, "loss": 0.77177751, "num_input_tokens_seen": 254117200, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6875, "step": 11777, "time_per_iteration": 2.496500015258789 }, { "auxiliary_loss_clip": 0.01111878, "auxiliary_loss_mlp": 0.01030523, "balance_loss_clip": 1.01798832, "balance_loss_mlp": 1.03896654, "epoch": 0.7081316699233429, "flos": 14422802889600.0, "grad_norm": 2.090633957891779, "language_loss": 0.8215732, "learning_rate": 8.288765648590066e-07, "loss": 0.84299719, "num_input_tokens_seen": 254132115, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 11778, "time_per_iteration": 2.493990898132324 }, { "auxiliary_loss_clip": 0.0109957, "auxiliary_loss_mlp": 0.01030528, "balance_loss_clip": 1.01972151, "balance_loss_mlp": 1.03470302, "epoch": 0.7081917931760108, "flos": 23222389668480.0, "grad_norm": 1.5900881301033623, "language_loss": 0.85107362, "learning_rate": 8.285608785887673e-07, "loss": 0.87237459, "num_input_tokens_seen": 254152285, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.6484375, "step": 11779, "time_per_iteration": 2.5414087772369385 }, { "auxiliary_loss_clip": 0.01107796, "auxiliary_loss_mlp": 0.01036449, "balance_loss_clip": 1.02418208, "balance_loss_mlp": 1.03778613, "epoch": 0.7082519164286788, "flos": 39308429871360.0, "grad_norm": 2.9601342825665986, "language_loss": 0.71964914, "learning_rate": 8.28245236739618e-07, "loss": 0.74109161, "num_input_tokens_seen": 254172805, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 11780, "time_per_iteration": 2.6603808403015137 }, { "auxiliary_loss_clip": 0.01102748, "auxiliary_loss_mlp": 0.01031454, "balance_loss_clip": 1.01913357, "balance_loss_mlp": 1.03522825, "epoch": 0.7083120396813467, "flos": 21651154064640.0, "grad_norm": 1.5598367191240878, "language_loss": 0.72948438, "learning_rate": 8.279296393235256e-07, "loss": 0.75082636, "num_input_tokens_seen": 254191890, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.67578125, "step": 11781, "time_per_iteration": 2.506486177444458 }, { "auxiliary_loss_clip": 0.01104126, "auxiliary_loss_mlp": 0.01031212, "balance_loss_clip": 1.01960111, "balance_loss_mlp": 1.03606296, "epoch": 0.7083721629340147, "flos": 17567033863680.0, "grad_norm": 68.16159159329074, "language_loss": 0.77406347, "learning_rate": 8.276140863524585e-07, "loss": 0.79541683, "num_input_tokens_seen": 254210150, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 11782, "time_per_iteration": 2.484408140182495 }, { "auxiliary_loss_clip": 0.01102529, "auxiliary_loss_mlp": 0.01028925, "balance_loss_clip": 1.01774323, "balance_loss_mlp": 1.03493237, "epoch": 0.7084322861866827, "flos": 29350509304320.0, "grad_norm": 1.5541843608368089, "language_loss": 0.69721776, "learning_rate": 8.272985778383828e-07, "loss": 0.71853232, "num_input_tokens_seen": 254233015, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.67578125, "step": 11783, "time_per_iteration": 2.557821273803711 }, { "auxiliary_loss_clip": 0.01107564, "auxiliary_loss_mlp": 0.01030599, "balance_loss_clip": 1.01808178, "balance_loss_mlp": 1.03677368, "epoch": 0.7084924094393507, "flos": 20194294343040.0, "grad_norm": 1.9572819817406835, "language_loss": 0.79248059, "learning_rate": 8.269831137932632e-07, "loss": 0.8138622, "num_input_tokens_seen": 254251345, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 11784, "time_per_iteration": 2.5052404403686523 }, { "auxiliary_loss_clip": 0.01103132, "auxiliary_loss_mlp": 0.01030147, "balance_loss_clip": 1.0178442, "balance_loss_mlp": 1.03528905, "epoch": 0.7085525326920187, "flos": 23477211728640.0, "grad_norm": 1.7020382619666605, "language_loss": 0.76987803, "learning_rate": 8.266676942290609e-07, "loss": 0.79121077, "num_input_tokens_seen": 254269905, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 11785, "time_per_iteration": 2.51263165473938 }, { "auxiliary_loss_clip": 0.01103086, "auxiliary_loss_mlp": 0.01030807, "balance_loss_clip": 1.01832032, "balance_loss_mlp": 1.03508043, "epoch": 0.7086126559446866, "flos": 25958818558080.0, "grad_norm": 1.6687542016236836, "language_loss": 0.78004098, "learning_rate": 8.26352319157738e-07, "loss": 0.80137992, "num_input_tokens_seen": 254289990, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 11786, "time_per_iteration": 2.5584001541137695 }, { "auxiliary_loss_clip": 0.01105339, "auxiliary_loss_mlp": 0.01024468, "balance_loss_clip": 1.01233244, "balance_loss_mlp": 1.03547478, "epoch": 0.7086727791973546, "flos": 26724793109760.0, "grad_norm": 2.2585022348152113, "language_loss": 0.79263926, "learning_rate": 8.260369885912526e-07, "loss": 0.81393731, "num_input_tokens_seen": 254309085, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69921875, "step": 11787, "time_per_iteration": 2.5540547370910645 }, { "auxiliary_loss_clip": 0.0110537, "auxiliary_loss_mlp": 0.01031026, "balance_loss_clip": 1.01906323, "balance_loss_mlp": 1.03652692, "epoch": 0.7087329024500225, "flos": 21683365585920.0, "grad_norm": 2.598828317823078, "language_loss": 0.76699233, "learning_rate": 8.257217025415615e-07, "loss": 0.7883563, "num_input_tokens_seen": 254327045, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 11788, "time_per_iteration": 2.5249390602111816 }, { "auxiliary_loss_clip": 0.01108779, "auxiliary_loss_mlp": 0.01031498, "balance_loss_clip": 1.01852214, "balance_loss_mlp": 1.03680003, "epoch": 0.7087930257026905, "flos": 17931060247680.0, "grad_norm": 1.991078558426026, "language_loss": 0.68077612, "learning_rate": 8.254064610206212e-07, "loss": 0.7021789, "num_input_tokens_seen": 254344585, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 11789, "time_per_iteration": 2.475250244140625 }, { "auxiliary_loss_clip": 0.01106474, "auxiliary_loss_mlp": 0.01032679, "balance_loss_clip": 1.01930356, "balance_loss_mlp": 1.03509068, "epoch": 0.7088531489553584, "flos": 18911528864640.0, "grad_norm": 1.5972407952149708, "language_loss": 0.77327859, "learning_rate": 8.250912640403858e-07, "loss": 0.79467016, "num_input_tokens_seen": 254362470, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 11790, "time_per_iteration": 2.500255584716797 }, { "auxiliary_loss_clip": 0.011083, "auxiliary_loss_mlp": 0.0103167, "balance_loss_clip": 1.01842606, "balance_loss_mlp": 1.03538871, "epoch": 0.7089132722080265, "flos": 27380880979200.0, "grad_norm": 1.8673248438113634, "language_loss": 0.71210766, "learning_rate": 8.247761116128085e-07, "loss": 0.73350734, "num_input_tokens_seen": 254383190, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 11791, "time_per_iteration": 2.569969892501831 }, { "auxiliary_loss_clip": 0.01103903, "auxiliary_loss_mlp": 0.01034282, "balance_loss_clip": 1.02121019, "balance_loss_mlp": 1.03491557, "epoch": 0.7089733954606944, "flos": 22162917087360.0, "grad_norm": 1.6605315111540355, "language_loss": 0.82326138, "learning_rate": 8.244610037498376e-07, "loss": 0.84464329, "num_input_tokens_seen": 254403115, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 11792, "time_per_iteration": 2.5263919830322266 }, { "auxiliary_loss_clip": 0.01105984, "auxiliary_loss_mlp": 0.01029394, "balance_loss_clip": 1.01703227, "balance_loss_mlp": 1.0342114, "epoch": 0.7090335187133624, "flos": 24425827960320.0, "grad_norm": 2.5589211927793336, "language_loss": 0.64938807, "learning_rate": 8.241459404634232e-07, "loss": 0.6707418, "num_input_tokens_seen": 254421875, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71875, "step": 11793, "time_per_iteration": 2.5917446613311768 }, { "auxiliary_loss_clip": 0.01104652, "auxiliary_loss_mlp": 0.01030389, "balance_loss_clip": 1.0187602, "balance_loss_mlp": 1.03639483, "epoch": 0.7090936419660303, "flos": 21835232288640.0, "grad_norm": 3.023093862042506, "language_loss": 0.70494509, "learning_rate": 8.238309217655133e-07, "loss": 0.72629547, "num_input_tokens_seen": 254440765, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 11794, "time_per_iteration": 2.7550230026245117 }, { "auxiliary_loss_clip": 0.01106181, "auxiliary_loss_mlp": 0.01031356, "balance_loss_clip": 1.02012622, "balance_loss_mlp": 1.0384568, "epoch": 0.7091537652186983, "flos": 20082360585600.0, "grad_norm": 1.7002818017048846, "language_loss": 0.75841606, "learning_rate": 8.23515947668052e-07, "loss": 0.77979136, "num_input_tokens_seen": 254459480, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.67578125, "step": 11795, "time_per_iteration": 3.857973575592041 }, { "auxiliary_loss_clip": 0.01105578, "auxiliary_loss_mlp": 0.01033032, "balance_loss_clip": 1.02100933, "balance_loss_mlp": 1.03644657, "epoch": 0.7092138884713663, "flos": 13151565676800.0, "grad_norm": 2.125789595565805, "language_loss": 0.74899524, "learning_rate": 8.232010181829838e-07, "loss": 0.77038133, "num_input_tokens_seen": 254473985, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 11796, "time_per_iteration": 4.084138631820679 }, { "auxiliary_loss_clip": 0.01112688, "auxiliary_loss_mlp": 0.01034667, "balance_loss_clip": 1.02030206, "balance_loss_mlp": 1.03877068, "epoch": 0.7092740117240343, "flos": 21645982506240.0, "grad_norm": 2.0446403460621547, "language_loss": 0.7444253, "learning_rate": 8.228861333222523e-07, "loss": 0.76589882, "num_input_tokens_seen": 254492135, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.73828125, "step": 11797, "time_per_iteration": 2.535463571548462 }, { "auxiliary_loss_clip": 0.01105354, "auxiliary_loss_mlp": 0.01031233, "balance_loss_clip": 1.01856697, "balance_loss_mlp": 1.03653955, "epoch": 0.7093341349767023, "flos": 21032521102080.0, "grad_norm": 1.4546783000785264, "language_loss": 0.79689008, "learning_rate": 8.225712930977953e-07, "loss": 0.81825596, "num_input_tokens_seen": 254512865, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 11798, "time_per_iteration": 2.5307047367095947 }, { "auxiliary_loss_clip": 0.01103951, "auxiliary_loss_mlp": 0.0103351, "balance_loss_clip": 1.02095723, "balance_loss_mlp": 1.03496289, "epoch": 0.7093942582293702, "flos": 22017658487040.0, "grad_norm": 2.0248993210440216, "language_loss": 0.66996425, "learning_rate": 8.222564975215529e-07, "loss": 0.6913389, "num_input_tokens_seen": 254532605, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 11799, "time_per_iteration": 3.90830135345459 }, { "auxiliary_loss_clip": 0.01104559, "auxiliary_loss_mlp": 0.01029872, "balance_loss_clip": 1.01709902, "balance_loss_mlp": 1.03526723, "epoch": 0.7094543814820382, "flos": 27235586465280.0, "grad_norm": 1.6608974664405132, "language_loss": 0.81436491, "learning_rate": 8.219417466054622e-07, "loss": 0.83570921, "num_input_tokens_seen": 254553780, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 11800, "time_per_iteration": 3.9843637943267822 }, { "auxiliary_loss_clip": 0.01102476, "auxiliary_loss_mlp": 0.01031706, "balance_loss_clip": 1.0197376, "balance_loss_mlp": 1.03491998, "epoch": 0.7095145047347061, "flos": 12089148180480.0, "grad_norm": 2.082341159753139, "language_loss": 0.86806571, "learning_rate": 8.21627040361459e-07, "loss": 0.88940752, "num_input_tokens_seen": 254567510, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.67578125, "step": 11801, "time_per_iteration": 2.467495918273926 }, { "auxiliary_loss_clip": 0.01104181, "auxiliary_loss_mlp": 0.0103651, "balance_loss_clip": 1.02441645, "balance_loss_mlp": 1.03471315, "epoch": 0.7095746279873741, "flos": 19383789905280.0, "grad_norm": 2.1335271210430222, "language_loss": 0.7635951, "learning_rate": 8.213123788014758e-07, "loss": 0.78500199, "num_input_tokens_seen": 254585565, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 11802, "time_per_iteration": 2.537458896636963 }, { "auxiliary_loss_clip": 0.01108086, "auxiliary_loss_mlp": 0.01039093, "balance_loss_clip": 1.02672482, "balance_loss_mlp": 1.03756928, "epoch": 0.709634751240042, "flos": 21360600950400.0, "grad_norm": 2.106643618673099, "language_loss": 0.81267023, "learning_rate": 8.209977619374462e-07, "loss": 0.83414203, "num_input_tokens_seen": 254603465, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 11803, "time_per_iteration": 2.498255729675293 }, { "auxiliary_loss_clip": 0.01106087, "auxiliary_loss_mlp": 0.01031433, "balance_loss_clip": 1.01819491, "balance_loss_mlp": 1.034518, "epoch": 0.7096948744927101, "flos": 13917037438080.0, "grad_norm": 3.1580377689082435, "language_loss": 0.6835084, "learning_rate": 8.206831897812995e-07, "loss": 0.70488358, "num_input_tokens_seen": 254620500, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 11804, "time_per_iteration": 2.4878926277160645 }, { "auxiliary_loss_clip": 0.01098855, "auxiliary_loss_mlp": 0.01026132, "balance_loss_clip": 1.01502192, "balance_loss_mlp": 1.03380036, "epoch": 0.709754997745378, "flos": 30298335436800.0, "grad_norm": 1.931840207736425, "language_loss": 0.7813158, "learning_rate": 8.203686623449637e-07, "loss": 0.80256569, "num_input_tokens_seen": 254638565, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6484375, "step": 11805, "time_per_iteration": 2.555683135986328 }, { "auxiliary_loss_clip": 0.01105842, "auxiliary_loss_mlp": 0.01034235, "balance_loss_clip": 1.02094936, "balance_loss_mlp": 1.03610611, "epoch": 0.709815120998046, "flos": 18515147304960.0, "grad_norm": 2.742780834950103, "language_loss": 0.78708005, "learning_rate": 8.200541796403667e-07, "loss": 0.80848086, "num_input_tokens_seen": 254657505, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 11806, "time_per_iteration": 2.4854657649993896 }, { "auxiliary_loss_clip": 0.01104568, "auxiliary_loss_mlp": 0.01031449, "balance_loss_clip": 1.02006459, "balance_loss_mlp": 1.03608227, "epoch": 0.7098752442507139, "flos": 22272588288000.0, "grad_norm": 2.3004791086768295, "language_loss": 0.56378508, "learning_rate": 8.197397416794332e-07, "loss": 0.58514524, "num_input_tokens_seen": 254674730, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.68359375, "step": 11807, "time_per_iteration": 2.494856119155884 }, { "auxiliary_loss_clip": 0.01108512, "auxiliary_loss_mlp": 0.01039928, "balance_loss_clip": 1.02754235, "balance_loss_mlp": 1.03506005, "epoch": 0.7099353675033819, "flos": 19275447507840.0, "grad_norm": 2.5019956783529462, "language_loss": 0.68736637, "learning_rate": 8.194253484740882e-07, "loss": 0.70885074, "num_input_tokens_seen": 254691665, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.734375, "step": 11808, "time_per_iteration": 2.484936237335205 }, { "auxiliary_loss_clip": 0.01106547, "auxiliary_loss_mlp": 0.01031988, "balance_loss_clip": 1.02003121, "balance_loss_mlp": 1.03496671, "epoch": 0.70999549075605, "flos": 21908525990400.0, "grad_norm": 1.912930466133299, "language_loss": 0.71460688, "learning_rate": 8.191110000362513e-07, "loss": 0.73599219, "num_input_tokens_seen": 254711610, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.71484375, "step": 11809, "time_per_iteration": 2.488473892211914 }, { "auxiliary_loss_clip": 0.01030964, "auxiliary_loss_mlp": 0.01001464, "balance_loss_clip": 1.00037372, "balance_loss_mlp": 1.00805879, "epoch": 0.7100556140087179, "flos": 70456053456000.0, "grad_norm": 0.7704690531704036, "language_loss": 0.5949524, "learning_rate": 8.187966963778435e-07, "loss": 0.61527675, "num_input_tokens_seen": 254772615, "router_z_loss_clip": 0.01092529, "router_z_loss_mlp": 0.22949219, "step": 11810, "time_per_iteration": 3.2091004848480225 }, { "auxiliary_loss_clip": 0.01105782, "auxiliary_loss_mlp": 0.01036431, "balance_loss_clip": 1.02477813, "balance_loss_mlp": 1.03743482, "epoch": 0.7101157372613859, "flos": 23039568420480.0, "grad_norm": 1.7790887563248678, "language_loss": 0.74467224, "learning_rate": 8.18482437510784e-07, "loss": 0.76609433, "num_input_tokens_seen": 254791375, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.68359375, "step": 11811, "time_per_iteration": 2.5186047554016113 }, { "auxiliary_loss_clip": 0.01103539, "auxiliary_loss_mlp": 0.01030049, "balance_loss_clip": 1.01846159, "balance_loss_mlp": 1.03636873, "epoch": 0.7101758605140538, "flos": 23185329811200.0, "grad_norm": 1.7556753151296836, "language_loss": 0.83707827, "learning_rate": 8.181682234469882e-07, "loss": 0.85841417, "num_input_tokens_seen": 254809300, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 11812, "time_per_iteration": 2.481070041656494 }, { "auxiliary_loss_clip": 0.01107974, "auxiliary_loss_mlp": 0.01029937, "balance_loss_clip": 1.0168066, "balance_loss_mlp": 1.03758264, "epoch": 0.7102359837667218, "flos": 23696123166720.0, "grad_norm": 1.833590090344261, "language_loss": 0.69912529, "learning_rate": 8.178540541983716e-07, "loss": 0.7205044, "num_input_tokens_seen": 254829325, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 11813, "time_per_iteration": 2.5755269527435303 }, { "auxiliary_loss_clip": 0.01100702, "auxiliary_loss_mlp": 0.01029041, "balance_loss_clip": 1.01763248, "balance_loss_mlp": 1.03353167, "epoch": 0.7102961070193897, "flos": 19391116279680.0, "grad_norm": 1.84974685667919, "language_loss": 0.81875026, "learning_rate": 8.175399297768495e-07, "loss": 0.84004772, "num_input_tokens_seen": 254847690, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.671875, "step": 11814, "time_per_iteration": 2.4800026416778564 }, { "auxiliary_loss_clip": 0.01105187, "auxiliary_loss_mlp": 0.0103164, "balance_loss_clip": 1.01875925, "balance_loss_mlp": 1.03664005, "epoch": 0.7103562302720577, "flos": 21507511576320.0, "grad_norm": 2.0337879181266905, "language_loss": 0.75826293, "learning_rate": 8.172258501943301e-07, "loss": 0.77963126, "num_input_tokens_seen": 254865960, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.68359375, "step": 11815, "time_per_iteration": 2.5095324516296387 }, { "auxiliary_loss_clip": 0.01102578, "auxiliary_loss_mlp": 0.01031842, "balance_loss_clip": 1.01958752, "balance_loss_mlp": 1.03486991, "epoch": 0.7104163535247257, "flos": 14535059869440.0, "grad_norm": 2.0102946937002515, "language_loss": 0.78537214, "learning_rate": 8.16911815462725e-07, "loss": 0.80671626, "num_input_tokens_seen": 254882815, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.67578125, "step": 11816, "time_per_iteration": 2.4593887329101562 }, { "auxiliary_loss_clip": 0.01106375, "auxiliary_loss_mlp": 0.01035239, "balance_loss_clip": 1.0231626, "balance_loss_mlp": 1.03655577, "epoch": 0.7104764767773937, "flos": 11400310085760.0, "grad_norm": 1.799034196932315, "language_loss": 0.86742359, "learning_rate": 8.165978255939426e-07, "loss": 0.88883972, "num_input_tokens_seen": 254898705, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69921875, "step": 11817, "time_per_iteration": 2.47628116607666 }, { "auxiliary_loss_clip": 0.01103255, "auxiliary_loss_mlp": 0.01030387, "balance_loss_clip": 1.01894832, "balance_loss_mlp": 1.0353961, "epoch": 0.7105366000300616, "flos": 11690432236800.0, "grad_norm": 2.4085106902833515, "language_loss": 0.85036373, "learning_rate": 8.162838805998897e-07, "loss": 0.87170023, "num_input_tokens_seen": 254913665, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 11818, "time_per_iteration": 2.450176954269409 }, { "auxiliary_loss_clip": 0.01104521, "auxiliary_loss_mlp": 0.01032688, "balance_loss_clip": 1.020082, "balance_loss_mlp": 1.03423893, "epoch": 0.7105967232827296, "flos": 19354020508800.0, "grad_norm": 2.004480676123504, "language_loss": 0.75612283, "learning_rate": 8.159699804924709e-07, "loss": 0.77749491, "num_input_tokens_seen": 254932140, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 11819, "time_per_iteration": 2.478123664855957 }, { "auxiliary_loss_clip": 0.01106766, "auxiliary_loss_mlp": 0.01030971, "balance_loss_clip": 1.01615942, "balance_loss_mlp": 1.03651047, "epoch": 0.7106568465353975, "flos": 22930400010240.0, "grad_norm": 1.6294245224164041, "language_loss": 0.70887303, "learning_rate": 8.156561252835883e-07, "loss": 0.73025036, "num_input_tokens_seen": 254951580, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.703125, "step": 11820, "time_per_iteration": 2.4951436519622803 }, { "auxiliary_loss_clip": 0.01104758, "auxiliary_loss_mlp": 0.0102964, "balance_loss_clip": 1.01759374, "balance_loss_mlp": 1.03593922, "epoch": 0.7107169697880655, "flos": 19099665325440.0, "grad_norm": 2.490820701026512, "language_loss": 0.75508022, "learning_rate": 8.153423149851449e-07, "loss": 0.77642417, "num_input_tokens_seen": 254969425, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 11821, "time_per_iteration": 2.5073773860931396 }, { "auxiliary_loss_clip": 0.01031383, "auxiliary_loss_mlp": 0.01003051, "balance_loss_clip": 1.00193083, "balance_loss_mlp": 1.00840807, "epoch": 0.7107770930407336, "flos": 63638054231040.0, "grad_norm": 0.774153484961984, "language_loss": 0.55022311, "learning_rate": 8.150285496090388e-07, "loss": 0.57056749, "num_input_tokens_seen": 255032680, "router_z_loss_clip": 0.01123047, "router_z_loss_mlp": 0.23046875, "step": 11822, "time_per_iteration": 3.121394157409668 }, { "auxiliary_loss_clip": 0.01099608, "auxiliary_loss_mlp": 0.01030775, "balance_loss_clip": 1.01841295, "balance_loss_mlp": 1.03371584, "epoch": 0.7108372162934015, "flos": 22054466949120.0, "grad_norm": 1.8024987614572683, "language_loss": 0.60426927, "learning_rate": 8.147148291671688e-07, "loss": 0.62557304, "num_input_tokens_seen": 255054400, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.66015625, "step": 11823, "time_per_iteration": 2.547894239425659 }, { "auxiliary_loss_clip": 0.01105257, "auxiliary_loss_mlp": 0.01031387, "balance_loss_clip": 1.01963305, "balance_loss_mlp": 1.03656983, "epoch": 0.7108973395460695, "flos": 19135144984320.0, "grad_norm": 1.9238752820565252, "language_loss": 0.72027636, "learning_rate": 8.144011536714322e-07, "loss": 0.74164283, "num_input_tokens_seen": 255072785, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 11824, "time_per_iteration": 2.4927022457122803 }, { "auxiliary_loss_clip": 0.01099718, "auxiliary_loss_mlp": 0.0103123, "balance_loss_clip": 1.01998293, "balance_loss_mlp": 1.03442502, "epoch": 0.7109574627987374, "flos": 17894431353600.0, "grad_norm": 3.169428327558091, "language_loss": 0.72737712, "learning_rate": 8.140875231337223e-07, "loss": 0.74868655, "num_input_tokens_seen": 255091820, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.65234375, "step": 11825, "time_per_iteration": 2.5762734413146973 }, { "auxiliary_loss_clip": 0.01106248, "auxiliary_loss_mlp": 0.01034249, "balance_loss_clip": 1.02192903, "balance_loss_mlp": 1.03539038, "epoch": 0.7110175860514054, "flos": 28979623422720.0, "grad_norm": 1.8907312176842586, "language_loss": 0.79382157, "learning_rate": 8.137739375659321e-07, "loss": 0.81522655, "num_input_tokens_seen": 255111720, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 11826, "time_per_iteration": 2.612884044647217 }, { "auxiliary_loss_clip": 0.01103288, "auxiliary_loss_mlp": 0.01033261, "balance_loss_clip": 1.02196038, "balance_loss_mlp": 1.03516269, "epoch": 0.7110777093040733, "flos": 26173312623360.0, "grad_norm": 1.45622149618193, "language_loss": 0.83032, "learning_rate": 8.134603969799527e-07, "loss": 0.85168552, "num_input_tokens_seen": 255133495, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6796875, "step": 11827, "time_per_iteration": 2.588543176651001 }, { "auxiliary_loss_clip": 0.01106408, "auxiliary_loss_mlp": 0.01034861, "balance_loss_clip": 1.02189684, "balance_loss_mlp": 1.03600454, "epoch": 0.7111378325567413, "flos": 26869943969280.0, "grad_norm": 1.553909987850965, "language_loss": 0.62351334, "learning_rate": 8.131469013876748e-07, "loss": 0.64492607, "num_input_tokens_seen": 255156880, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 11828, "time_per_iteration": 2.6181445121765137 }, { "auxiliary_loss_clip": 0.01103953, "auxiliary_loss_mlp": 0.01032027, "balance_loss_clip": 1.01922429, "balance_loss_mlp": 1.03497803, "epoch": 0.7111979558094093, "flos": 27271820309760.0, "grad_norm": 1.652005653618756, "language_loss": 0.71708381, "learning_rate": 8.128334508009846e-07, "loss": 0.73844361, "num_input_tokens_seen": 255178920, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 11829, "time_per_iteration": 2.594761848449707 }, { "auxiliary_loss_clip": 0.01104011, "auxiliary_loss_mlp": 0.01030449, "balance_loss_clip": 1.01887929, "balance_loss_mlp": 1.03535044, "epoch": 0.7112580790620773, "flos": 25046938961280.0, "grad_norm": 1.7801334903204982, "language_loss": 0.80513453, "learning_rate": 8.125200452317697e-07, "loss": 0.82647914, "num_input_tokens_seen": 255198095, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6875, "step": 11830, "time_per_iteration": 2.5420961380004883 }, { "auxiliary_loss_clip": 0.01105677, "auxiliary_loss_mlp": 0.01035588, "balance_loss_clip": 1.02332735, "balance_loss_mlp": 1.03590083, "epoch": 0.7113182023147452, "flos": 21646628951040.0, "grad_norm": 6.280503628700084, "language_loss": 0.84120131, "learning_rate": 8.122066846919138e-07, "loss": 0.86261404, "num_input_tokens_seen": 255215860, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 11831, "time_per_iteration": 2.5430917739868164 }, { "auxiliary_loss_clip": 0.01105504, "auxiliary_loss_mlp": 0.01029536, "balance_loss_clip": 1.01722765, "balance_loss_mlp": 1.03556705, "epoch": 0.7113783255674132, "flos": 20996287257600.0, "grad_norm": 2.1838539108567496, "language_loss": 0.77057445, "learning_rate": 8.118933691932985e-07, "loss": 0.79192483, "num_input_tokens_seen": 255235425, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 11832, "time_per_iteration": 2.5290324687957764 }, { "auxiliary_loss_clip": 0.01031328, "auxiliary_loss_mlp": 0.00998407, "balance_loss_clip": 0.99728626, "balance_loss_mlp": 1.00849867, "epoch": 0.7114384488200811, "flos": 66771080161920.0, "grad_norm": 0.7453652651071682, "language_loss": 0.56596172, "learning_rate": 8.115800987478059e-07, "loss": 0.58625913, "num_input_tokens_seen": 255291680, "router_z_loss_clip": 0.01123047, "router_z_loss_mlp": 0.22851562, "step": 11833, "time_per_iteration": 3.047152280807495 }, { "auxiliary_loss_clip": 0.01102542, "auxiliary_loss_mlp": 0.0103508, "balance_loss_clip": 1.02333736, "balance_loss_mlp": 1.03439021, "epoch": 0.7114985720727491, "flos": 25010058672000.0, "grad_norm": 1.7499168880916014, "language_loss": 0.71150148, "learning_rate": 8.11266873367315e-07, "loss": 0.73287773, "num_input_tokens_seen": 255313880, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 11834, "time_per_iteration": 2.5408735275268555 }, { "auxiliary_loss_clip": 0.01108584, "auxiliary_loss_mlp": 0.01032803, "balance_loss_clip": 1.02013671, "balance_loss_mlp": 1.03751409, "epoch": 0.7115586953254172, "flos": 21470128496640.0, "grad_norm": 3.892514690959291, "language_loss": 0.79270804, "learning_rate": 8.10953693063704e-07, "loss": 0.81412184, "num_input_tokens_seen": 255332390, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 11835, "time_per_iteration": 2.5556252002716064 }, { "auxiliary_loss_clip": 0.01103002, "auxiliary_loss_mlp": 0.01026243, "balance_loss_clip": 1.01413691, "balance_loss_mlp": 1.0354383, "epoch": 0.7116188185780851, "flos": 28622600190720.0, "grad_norm": 1.487556072992542, "language_loss": 0.75988966, "learning_rate": 8.10640557848848e-07, "loss": 0.78118217, "num_input_tokens_seen": 255354025, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.67578125, "step": 11836, "time_per_iteration": 2.6980531215667725 }, { "auxiliary_loss_clip": 0.0110249, "auxiliary_loss_mlp": 0.01030025, "balance_loss_clip": 1.01716816, "balance_loss_mlp": 1.03446698, "epoch": 0.7116789418307531, "flos": 25293608634240.0, "grad_norm": 2.017148601591575, "language_loss": 0.70250458, "learning_rate": 8.103274677346208e-07, "loss": 0.72382969, "num_input_tokens_seen": 255371400, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6796875, "step": 11837, "time_per_iteration": 3.9298927783966064 }, { "auxiliary_loss_clip": 0.01108805, "auxiliary_loss_mlp": 0.01035058, "balance_loss_clip": 1.02115774, "balance_loss_mlp": 1.03709745, "epoch": 0.711739065083421, "flos": 25557301353600.0, "grad_norm": 2.3368039442537483, "language_loss": 0.61634421, "learning_rate": 8.100144227328958e-07, "loss": 0.63778281, "num_input_tokens_seen": 255390710, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 11838, "time_per_iteration": 4.0619940757751465 }, { "auxiliary_loss_clip": 0.01106868, "auxiliary_loss_mlp": 0.01029747, "balance_loss_clip": 1.01728344, "balance_loss_mlp": 1.03757703, "epoch": 0.711799188336089, "flos": 26140993361280.0, "grad_norm": 2.2309178214036596, "language_loss": 0.6769954, "learning_rate": 8.097014228555426e-07, "loss": 0.69836152, "num_input_tokens_seen": 255408790, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69140625, "step": 11839, "time_per_iteration": 2.54520320892334 }, { "auxiliary_loss_clip": 0.01107036, "auxiliary_loss_mlp": 0.01032549, "balance_loss_clip": 1.02007997, "balance_loss_mlp": 1.03739738, "epoch": 0.7118593115887569, "flos": 21140648017920.0, "grad_norm": 2.573770185387974, "language_loss": 0.83933729, "learning_rate": 8.093884681144305e-07, "loss": 0.86073315, "num_input_tokens_seen": 255426280, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 11840, "time_per_iteration": 3.8868443965911865 }, { "auxiliary_loss_clip": 0.0110828, "auxiliary_loss_mlp": 0.01030894, "balance_loss_clip": 1.0182817, "balance_loss_mlp": 1.03697133, "epoch": 0.711919434841425, "flos": 14975684006400.0, "grad_norm": 2.093889111351685, "language_loss": 0.76685214, "learning_rate": 8.090755585214277e-07, "loss": 0.78824389, "num_input_tokens_seen": 255442935, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 11841, "time_per_iteration": 3.963827610015869 }, { "auxiliary_loss_clip": 0.01106748, "auxiliary_loss_mlp": 0.0103104, "balance_loss_clip": 1.01796842, "balance_loss_mlp": 1.03621769, "epoch": 0.7119795580940929, "flos": 16508997826560.0, "grad_norm": 2.2102822759136527, "language_loss": 0.75477529, "learning_rate": 8.087626940883994e-07, "loss": 0.77615321, "num_input_tokens_seen": 255460925, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 11842, "time_per_iteration": 2.539604663848877 }, { "auxiliary_loss_clip": 0.01031961, "auxiliary_loss_mlp": 0.01000638, "balance_loss_clip": 0.99962467, "balance_loss_mlp": 1.00907302, "epoch": 0.7120396813467609, "flos": 66570736055040.0, "grad_norm": 0.9352856471244699, "language_loss": 0.61636412, "learning_rate": 8.084498748272082e-07, "loss": 0.63669014, "num_input_tokens_seen": 255521360, "router_z_loss_clip": 0.01013184, "router_z_loss_mlp": 0.22949219, "step": 11843, "time_per_iteration": 3.0946059226989746 }, { "auxiliary_loss_clip": 0.01104688, "auxiliary_loss_mlp": 0.01026325, "balance_loss_clip": 1.01423693, "balance_loss_mlp": 1.0361793, "epoch": 0.7120998045994288, "flos": 26432731624320.0, "grad_norm": 1.5564094512646915, "language_loss": 0.80068415, "learning_rate": 8.081371007497171e-07, "loss": 0.8219943, "num_input_tokens_seen": 255541435, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 11844, "time_per_iteration": 2.5631425380706787 }, { "auxiliary_loss_clip": 0.01104231, "auxiliary_loss_mlp": 0.01030767, "balance_loss_clip": 1.01780891, "balance_loss_mlp": 1.0344007, "epoch": 0.7121599278520968, "flos": 16427982700800.0, "grad_norm": 2.2706578794842676, "language_loss": 0.78983927, "learning_rate": 8.078243718677873e-07, "loss": 0.81118923, "num_input_tokens_seen": 255558505, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 11845, "time_per_iteration": 2.5044949054718018 }, { "auxiliary_loss_clip": 0.01104401, "auxiliary_loss_mlp": 0.01030611, "balance_loss_clip": 1.01792693, "balance_loss_mlp": 1.03709197, "epoch": 0.7122200511047647, "flos": 28949889939840.0, "grad_norm": 2.229652102806992, "language_loss": 0.77488565, "learning_rate": 8.075116881932762e-07, "loss": 0.79623574, "num_input_tokens_seen": 255577815, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.671875, "step": 11846, "time_per_iteration": 2.5833468437194824 }, { "auxiliary_loss_clip": 0.01108561, "auxiliary_loss_mlp": 0.01033256, "balance_loss_clip": 1.02059591, "balance_loss_mlp": 1.03785443, "epoch": 0.7122801743574327, "flos": 16471866142080.0, "grad_norm": 2.202832950808511, "language_loss": 0.58177507, "learning_rate": 8.071990497380421e-07, "loss": 0.60319328, "num_input_tokens_seen": 255595885, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 11847, "time_per_iteration": 2.5007967948913574 }, { "auxiliary_loss_clip": 0.01101841, "auxiliary_loss_mlp": 0.01030318, "balance_loss_clip": 1.01867104, "balance_loss_mlp": 1.03604126, "epoch": 0.7123402976101008, "flos": 20631039811200.0, "grad_norm": 1.5781775621645295, "language_loss": 0.71658486, "learning_rate": 8.068864565139395e-07, "loss": 0.7379064, "num_input_tokens_seen": 255616750, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.65625, "step": 11848, "time_per_iteration": 2.5454652309417725 }, { "auxiliary_loss_clip": 0.01031426, "auxiliary_loss_mlp": 0.01003309, "balance_loss_clip": 1.00225973, "balance_loss_mlp": 1.00868392, "epoch": 0.7124004208627687, "flos": 62325734837760.0, "grad_norm": 0.8185633349055825, "language_loss": 0.6294601, "learning_rate": 8.065739085328211e-07, "loss": 0.64980745, "num_input_tokens_seen": 255677900, "router_z_loss_clip": 0.01049805, "router_z_loss_mlp": 0.22753906, "step": 11849, "time_per_iteration": 3.0635125637054443 }, { "auxiliary_loss_clip": 0.01106196, "auxiliary_loss_mlp": 0.01035586, "balance_loss_clip": 1.02280688, "balance_loss_mlp": 1.03522432, "epoch": 0.7124605441154367, "flos": 39675975788160.0, "grad_norm": 1.7384662534199842, "language_loss": 0.64018905, "learning_rate": 8.0626140580654e-07, "loss": 0.66160691, "num_input_tokens_seen": 255699140, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 11850, "time_per_iteration": 2.6802761554718018 }, { "auxiliary_loss_clip": 0.01105635, "auxiliary_loss_mlp": 0.01030084, "balance_loss_clip": 1.01787734, "balance_loss_mlp": 1.03576493, "epoch": 0.7125206673681046, "flos": 28181868312960.0, "grad_norm": 1.513966917231333, "language_loss": 0.69770658, "learning_rate": 8.05948948346946e-07, "loss": 0.71906376, "num_input_tokens_seen": 255719640, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 11851, "time_per_iteration": 2.5913565158843994 }, { "auxiliary_loss_clip": 0.01105435, "auxiliary_loss_mlp": 0.01033889, "balance_loss_clip": 1.0221355, "balance_loss_mlp": 1.03720796, "epoch": 0.7125807906207726, "flos": 26176939896960.0, "grad_norm": 1.4663042249303075, "language_loss": 0.83304191, "learning_rate": 8.056365361658882e-07, "loss": 0.85443515, "num_input_tokens_seen": 255740450, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.68359375, "step": 11852, "time_per_iteration": 2.5644237995147705 }, { "auxiliary_loss_clip": 0.0110922, "auxiliary_loss_mlp": 0.01030808, "balance_loss_clip": 1.01746273, "balance_loss_mlp": 1.03732765, "epoch": 0.7126409138734405, "flos": 17157328358400.0, "grad_norm": 2.382637286465437, "language_loss": 0.73082447, "learning_rate": 8.053241692752126e-07, "loss": 0.7522248, "num_input_tokens_seen": 255758070, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 11853, "time_per_iteration": 2.4918322563171387 }, { "auxiliary_loss_clip": 0.01100454, "auxiliary_loss_mlp": 0.01028827, "balance_loss_clip": 1.01766276, "balance_loss_mlp": 1.03454399, "epoch": 0.7127010371261085, "flos": 18769933451520.0, "grad_norm": 2.1387135930327967, "language_loss": 0.92085016, "learning_rate": 8.050118476867635e-07, "loss": 0.9421429, "num_input_tokens_seen": 255775685, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.66015625, "step": 11854, "time_per_iteration": 2.490384578704834 }, { "auxiliary_loss_clip": 0.01103723, "auxiliary_loss_mlp": 0.01031739, "balance_loss_clip": 1.01984835, "balance_loss_mlp": 1.03615117, "epoch": 0.7127611603787765, "flos": 20376433232640.0, "grad_norm": 2.102222431460154, "language_loss": 0.79880273, "learning_rate": 8.046995714123856e-07, "loss": 0.82015741, "num_input_tokens_seen": 255794750, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.67578125, "step": 11855, "time_per_iteration": 2.5123891830444336 }, { "auxiliary_loss_clip": 0.01105989, "auxiliary_loss_mlp": 0.01034674, "balance_loss_clip": 1.02108383, "balance_loss_mlp": 1.03550494, "epoch": 0.7128212836314445, "flos": 20449008662400.0, "grad_norm": 2.0009997544010454, "language_loss": 0.72793472, "learning_rate": 8.043873404639192e-07, "loss": 0.74934131, "num_input_tokens_seen": 255813325, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 11856, "time_per_iteration": 2.510587692260742 }, { "auxiliary_loss_clip": 0.01108823, "auxiliary_loss_mlp": 0.01032692, "balance_loss_clip": 1.02004385, "balance_loss_mlp": 1.03759062, "epoch": 0.7128814068841124, "flos": 23440834229760.0, "grad_norm": 1.96567472278884, "language_loss": 0.70310962, "learning_rate": 8.040751548532046e-07, "loss": 0.7245248, "num_input_tokens_seen": 255832470, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 11857, "time_per_iteration": 2.55580735206604 }, { "auxiliary_loss_clip": 0.01104141, "auxiliary_loss_mlp": 0.01029111, "balance_loss_clip": 1.01661837, "balance_loss_mlp": 1.03621328, "epoch": 0.7129415301367804, "flos": 18222942165120.0, "grad_norm": 2.5156742739339464, "language_loss": 0.85176748, "learning_rate": 8.03763014592081e-07, "loss": 0.87309998, "num_input_tokens_seen": 255849740, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 11858, "time_per_iteration": 2.504694700241089 }, { "auxiliary_loss_clip": 0.01110467, "auxiliary_loss_mlp": 0.01031857, "balance_loss_clip": 1.01869011, "balance_loss_mlp": 1.03794146, "epoch": 0.7130016533894483, "flos": 15523896355200.0, "grad_norm": 1.758008644618441, "language_loss": 0.80341899, "learning_rate": 8.034509196923829e-07, "loss": 0.82484221, "num_input_tokens_seen": 255866975, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 11859, "time_per_iteration": 2.5077261924743652 }, { "auxiliary_loss_clip": 0.01102409, "auxiliary_loss_mlp": 0.01029075, "balance_loss_clip": 1.01729715, "balance_loss_mlp": 1.03457236, "epoch": 0.7130617766421163, "flos": 57115668960000.0, "grad_norm": 1.3046208974853293, "language_loss": 0.69057906, "learning_rate": 8.031388701659456e-07, "loss": 0.71189392, "num_input_tokens_seen": 255892915, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6796875, "step": 11860, "time_per_iteration": 2.8414103984832764 }, { "auxiliary_loss_clip": 0.01105534, "auxiliary_loss_mlp": 0.01030196, "balance_loss_clip": 1.01686275, "balance_loss_mlp": 1.03572774, "epoch": 0.7131218998947844, "flos": 19788252024960.0, "grad_norm": 1.8765065953384388, "language_loss": 0.64104426, "learning_rate": 8.028268660246023e-07, "loss": 0.66240156, "num_input_tokens_seen": 255911480, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.69921875, "step": 11861, "time_per_iteration": 2.528695583343506 }, { "auxiliary_loss_clip": 0.01110845, "auxiliary_loss_mlp": 0.01033114, "balance_loss_clip": 1.02001929, "balance_loss_mlp": 1.03882217, "epoch": 0.7131820231474523, "flos": 26651894457600.0, "grad_norm": 1.7699777218301522, "language_loss": 0.67132199, "learning_rate": 8.025149072801849e-07, "loss": 0.69276166, "num_input_tokens_seen": 255931140, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 11862, "time_per_iteration": 2.549145221710205 }, { "auxiliary_loss_clip": 0.01103721, "auxiliary_loss_mlp": 0.0103678, "balance_loss_clip": 1.025545, "balance_loss_mlp": 1.0360043, "epoch": 0.7132421464001203, "flos": 29205609840000.0, "grad_norm": 2.3441880549540683, "language_loss": 0.67108375, "learning_rate": 8.022029939445214e-07, "loss": 0.69248879, "num_input_tokens_seen": 255951665, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6796875, "step": 11863, "time_per_iteration": 2.5720040798187256 }, { "auxiliary_loss_clip": 0.01112833, "auxiliary_loss_mlp": 0.010352, "balance_loss_clip": 1.02186584, "balance_loss_mlp": 1.03883433, "epoch": 0.7133022696527882, "flos": 23073611535360.0, "grad_norm": 3.187373850126993, "language_loss": 0.65527916, "learning_rate": 8.018911260294414e-07, "loss": 0.67675942, "num_input_tokens_seen": 255970055, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 11864, "time_per_iteration": 2.586620807647705 }, { "auxiliary_loss_clip": 0.01109698, "auxiliary_loss_mlp": 0.01034888, "balance_loss_clip": 1.0222162, "balance_loss_mlp": 1.03794312, "epoch": 0.7133623929054562, "flos": 17457111267840.0, "grad_norm": 1.7925849953921282, "language_loss": 0.85797554, "learning_rate": 8.015793035467697e-07, "loss": 0.87942141, "num_input_tokens_seen": 255987720, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 11865, "time_per_iteration": 2.5376741886138916 }, { "auxiliary_loss_clip": 0.01105239, "auxiliary_loss_mlp": 0.01029113, "balance_loss_clip": 1.01596403, "balance_loss_mlp": 1.03531468, "epoch": 0.7134225161581241, "flos": 19536554448000.0, "grad_norm": 1.9256277245907698, "language_loss": 0.74962699, "learning_rate": 8.012675265083304e-07, "loss": 0.77097046, "num_input_tokens_seen": 256005490, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69921875, "step": 11866, "time_per_iteration": 2.5342366695404053 }, { "auxiliary_loss_clip": 0.01109717, "auxiliary_loss_mlp": 0.01038171, "balance_loss_clip": 1.02439022, "balance_loss_mlp": 1.03891766, "epoch": 0.7134826394107922, "flos": 26250089944320.0, "grad_norm": 2.4976199623239355, "language_loss": 0.70855844, "learning_rate": 8.009557949259464e-07, "loss": 0.73003733, "num_input_tokens_seen": 256026030, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.70703125, "step": 11867, "time_per_iteration": 2.5631184577941895 }, { "auxiliary_loss_clip": 0.01103286, "auxiliary_loss_mlp": 0.01026872, "balance_loss_clip": 1.01499236, "balance_loss_mlp": 1.03598392, "epoch": 0.7135427626634601, "flos": 15815311395840.0, "grad_norm": 2.226497317469795, "language_loss": 0.71394759, "learning_rate": 8.006441088114397e-07, "loss": 0.73524916, "num_input_tokens_seen": 256043680, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 11868, "time_per_iteration": 2.4839468002319336 }, { "auxiliary_loss_clip": 0.01110638, "auxiliary_loss_mlp": 0.0102947, "balance_loss_clip": 1.01596403, "balance_loss_mlp": 1.03898406, "epoch": 0.7136028859161281, "flos": 18223409041920.0, "grad_norm": 2.230768486800953, "language_loss": 0.66727388, "learning_rate": 8.003324681766286e-07, "loss": 0.68867499, "num_input_tokens_seen": 256059705, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 11869, "time_per_iteration": 2.54457950592041 }, { "auxiliary_loss_clip": 0.01104328, "auxiliary_loss_mlp": 0.0102546, "balance_loss_clip": 1.01348591, "balance_loss_mlp": 1.03463542, "epoch": 0.713663009168796, "flos": 24314827956480.0, "grad_norm": 1.8382009068602543, "language_loss": 0.7767365, "learning_rate": 8.000208730333298e-07, "loss": 0.79803437, "num_input_tokens_seen": 256079785, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 11870, "time_per_iteration": 2.52325439453125 }, { "auxiliary_loss_clip": 0.01105058, "auxiliary_loss_mlp": 0.01031434, "balance_loss_clip": 1.01879764, "balance_loss_mlp": 1.03667974, "epoch": 0.713723132421464, "flos": 26538488242560.0, "grad_norm": 1.8123449366487552, "language_loss": 0.80838174, "learning_rate": 7.997093233933597e-07, "loss": 0.8297466, "num_input_tokens_seen": 256099000, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 11871, "time_per_iteration": 2.6229381561279297 }, { "auxiliary_loss_clip": 0.01106147, "auxiliary_loss_mlp": 0.01035755, "balance_loss_clip": 1.02240396, "balance_loss_mlp": 1.0352999, "epoch": 0.7137832556741319, "flos": 19865675790720.0, "grad_norm": 1.786007075081864, "language_loss": 0.78903651, "learning_rate": 7.993978192685331e-07, "loss": 0.81045556, "num_input_tokens_seen": 256117985, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 11872, "time_per_iteration": 2.549783945083618 }, { "auxiliary_loss_clip": 0.01107345, "auxiliary_loss_mlp": 0.01032051, "balance_loss_clip": 1.01881874, "balance_loss_mlp": 1.03586912, "epoch": 0.7138433789267999, "flos": 21688932193920.0, "grad_norm": 2.338238212541034, "language_loss": 0.84038508, "learning_rate": 7.990863606706606e-07, "loss": 0.86177909, "num_input_tokens_seen": 256134350, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 11873, "time_per_iteration": 2.499675989151001 }, { "auxiliary_loss_clip": 0.01102556, "auxiliary_loss_mlp": 0.0103166, "balance_loss_clip": 1.0203414, "balance_loss_mlp": 1.03463125, "epoch": 0.713903502179468, "flos": 17602729004160.0, "grad_norm": 2.238189733670159, "language_loss": 0.85964322, "learning_rate": 7.987749476115539e-07, "loss": 0.88098538, "num_input_tokens_seen": 256150610, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6796875, "step": 11874, "time_per_iteration": 2.4612178802490234 }, { "auxiliary_loss_clip": 0.01105801, "auxiliary_loss_mlp": 0.0103074, "balance_loss_clip": 1.01868236, "balance_loss_mlp": 1.03576136, "epoch": 0.7139636254321359, "flos": 18040336398720.0, "grad_norm": 1.9257309572498698, "language_loss": 0.8346507, "learning_rate": 7.984635801030228e-07, "loss": 0.8560161, "num_input_tokens_seen": 256168620, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69921875, "step": 11875, "time_per_iteration": 2.4942705631256104 }, { "auxiliary_loss_clip": 0.01110978, "auxiliary_loss_mlp": 0.01033672, "balance_loss_clip": 1.01924741, "balance_loss_mlp": 1.03650475, "epoch": 0.7140237486848039, "flos": 23331127115520.0, "grad_norm": 1.891795732101845, "language_loss": 0.69918454, "learning_rate": 7.981522581568721e-07, "loss": 0.720631, "num_input_tokens_seen": 256186700, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.74609375, "step": 11876, "time_per_iteration": 2.5286309719085693 }, { "auxiliary_loss_clip": 0.01109582, "auxiliary_loss_mlp": 0.01035697, "balance_loss_clip": 1.02263737, "balance_loss_mlp": 1.03771091, "epoch": 0.7140838719374718, "flos": 16837077674880.0, "grad_norm": 1.9109754122328524, "language_loss": 0.78190529, "learning_rate": 7.978409817849079e-07, "loss": 0.80335808, "num_input_tokens_seen": 256205390, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 11877, "time_per_iteration": 2.483105182647705 }, { "auxiliary_loss_clip": 0.01104167, "auxiliary_loss_mlp": 0.01034809, "balance_loss_clip": 1.02296507, "balance_loss_mlp": 1.03555572, "epoch": 0.7141439951901398, "flos": 21142012734720.0, "grad_norm": 1.9985218593235625, "language_loss": 0.69579852, "learning_rate": 7.97529750998934e-07, "loss": 0.71718824, "num_input_tokens_seen": 256224575, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 11878, "time_per_iteration": 3.9166905879974365 }, { "auxiliary_loss_clip": 0.0110451, "auxiliary_loss_mlp": 0.01032685, "balance_loss_clip": 1.02166474, "balance_loss_mlp": 1.03644824, "epoch": 0.7142041184428077, "flos": 24717709877760.0, "grad_norm": 1.9021028236005195, "language_loss": 0.67865777, "learning_rate": 7.972185658107535e-07, "loss": 0.70002973, "num_input_tokens_seen": 256242130, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6796875, "step": 11879, "time_per_iteration": 2.520991563796997 }, { "auxiliary_loss_clip": 0.01104463, "auxiliary_loss_mlp": 0.01032257, "balance_loss_clip": 1.01935291, "balance_loss_mlp": 1.03564978, "epoch": 0.7142642416954758, "flos": 21908202768000.0, "grad_norm": 2.050039127958061, "language_loss": 0.69387937, "learning_rate": 7.969074262321646e-07, "loss": 0.71524656, "num_input_tokens_seen": 256261920, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 11880, "time_per_iteration": 4.081536293029785 }, { "auxiliary_loss_clip": 0.01107133, "auxiliary_loss_mlp": 0.01035307, "balance_loss_clip": 1.02253997, "balance_loss_mlp": 1.03539598, "epoch": 0.7143243649481437, "flos": 20805636844800.0, "grad_norm": 2.3310964573856494, "language_loss": 0.80525517, "learning_rate": 7.965963322749674e-07, "loss": 0.82667959, "num_input_tokens_seen": 256277970, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 11881, "time_per_iteration": 2.531096935272217 }, { "auxiliary_loss_clip": 0.01104861, "auxiliary_loss_mlp": 0.01031997, "balance_loss_clip": 1.02014208, "balance_loss_mlp": 1.03453064, "epoch": 0.7143844882008117, "flos": 27235011847680.0, "grad_norm": 1.697820670351862, "language_loss": 0.63935572, "learning_rate": 7.962852839509579e-07, "loss": 0.66072428, "num_input_tokens_seen": 256298205, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.703125, "step": 11882, "time_per_iteration": 3.9781477451324463 }, { "auxiliary_loss_clip": 0.01109355, "auxiliary_loss_mlp": 0.01032525, "balance_loss_clip": 1.02049136, "balance_loss_mlp": 1.03762269, "epoch": 0.7144446114534796, "flos": 17929623703680.0, "grad_norm": 1.7275926764819993, "language_loss": 0.68472314, "learning_rate": 7.959742812719304e-07, "loss": 0.70614195, "num_input_tokens_seen": 256316685, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.71875, "step": 11883, "time_per_iteration": 3.9664580821990967 }, { "auxiliary_loss_clip": 0.0110703, "auxiliary_loss_mlp": 0.01037855, "balance_loss_clip": 1.02446795, "balance_loss_mlp": 1.03869581, "epoch": 0.7145047347061476, "flos": 20740962407040.0, "grad_norm": 2.072571801916609, "language_loss": 0.77387112, "learning_rate": 7.956633242496788e-07, "loss": 0.79531991, "num_input_tokens_seen": 256334205, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.68359375, "step": 11884, "time_per_iteration": 2.5275461673736572 }, { "auxiliary_loss_clip": 0.01110445, "auxiliary_loss_mlp": 0.01031926, "balance_loss_clip": 1.01789534, "balance_loss_mlp": 1.03573418, "epoch": 0.7145648579588155, "flos": 21178605715200.0, "grad_norm": 1.9662609280863939, "language_loss": 0.73571229, "learning_rate": 7.953524128959954e-07, "loss": 0.75713599, "num_input_tokens_seen": 256353340, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.74609375, "step": 11885, "time_per_iteration": 2.572112798690796 }, { "auxiliary_loss_clip": 0.01030285, "auxiliary_loss_mlp": 0.01007938, "balance_loss_clip": 1.00688899, "balance_loss_mlp": 1.00747311, "epoch": 0.7146249812114835, "flos": 64784539509120.0, "grad_norm": 1.0467394371584844, "language_loss": 0.66309464, "learning_rate": 7.95041547222669e-07, "loss": 0.68347681, "num_input_tokens_seen": 256411550, "router_z_loss_clip": 0.01049805, "router_z_loss_mlp": 0.22851562, "step": 11886, "time_per_iteration": 3.1080565452575684 }, { "auxiliary_loss_clip": 0.01105033, "auxiliary_loss_mlp": 0.01031756, "balance_loss_clip": 1.01853573, "balance_loss_mlp": 1.03527832, "epoch": 0.7146851044641516, "flos": 18113881495680.0, "grad_norm": 2.498205085644137, "language_loss": 0.75077027, "learning_rate": 7.947307272414874e-07, "loss": 0.77213812, "num_input_tokens_seen": 256430360, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6953125, "step": 11887, "time_per_iteration": 2.5012331008911133 }, { "auxiliary_loss_clip": 0.01105706, "auxiliary_loss_mlp": 0.0102881, "balance_loss_clip": 1.01663828, "balance_loss_mlp": 1.03545642, "epoch": 0.7147452277168195, "flos": 19243846517760.0, "grad_norm": 1.7259226229089133, "language_loss": 0.71509755, "learning_rate": 7.944199529642372e-07, "loss": 0.73644269, "num_input_tokens_seen": 256449750, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.703125, "step": 11888, "time_per_iteration": 2.5023581981658936 }, { "auxiliary_loss_clip": 0.01108009, "auxiliary_loss_mlp": 0.01033902, "balance_loss_clip": 1.02104533, "balance_loss_mlp": 1.03543115, "epoch": 0.7148053509694875, "flos": 23764712186880.0, "grad_norm": 2.2063055932757907, "language_loss": 0.84169841, "learning_rate": 7.941092244027041e-07, "loss": 0.86311758, "num_input_tokens_seen": 256467330, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 11889, "time_per_iteration": 2.545315742492676 }, { "auxiliary_loss_clip": 0.01105766, "auxiliary_loss_mlp": 0.01029849, "balance_loss_clip": 1.01709366, "balance_loss_mlp": 1.03621745, "epoch": 0.7148654742221554, "flos": 22485322586880.0, "grad_norm": 2.2652926506913245, "language_loss": 0.75939047, "learning_rate": 7.937985415686695e-07, "loss": 0.78074664, "num_input_tokens_seen": 256485705, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 11890, "time_per_iteration": 2.5226805210113525 }, { "auxiliary_loss_clip": 0.01106415, "auxiliary_loss_mlp": 0.01032554, "balance_loss_clip": 1.02072859, "balance_loss_mlp": 1.03731894, "epoch": 0.7149255974748234, "flos": 24679213476480.0, "grad_norm": 1.6128760144719008, "language_loss": 0.74172217, "learning_rate": 7.934879044739147e-07, "loss": 0.76311189, "num_input_tokens_seen": 256504755, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69140625, "step": 11891, "time_per_iteration": 2.553053855895996 }, { "auxiliary_loss_clip": 0.01108679, "auxiliary_loss_mlp": 0.01040093, "balance_loss_clip": 1.02749276, "balance_loss_mlp": 1.03763974, "epoch": 0.7149857207274913, "flos": 18405583845120.0, "grad_norm": 2.160441645447684, "language_loss": 0.67851788, "learning_rate": 7.931773131302211e-07, "loss": 0.70000559, "num_input_tokens_seen": 256523670, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 11892, "time_per_iteration": 2.510132312774658 }, { "auxiliary_loss_clip": 0.01109915, "auxiliary_loss_mlp": 0.01031902, "balance_loss_clip": 1.0173229, "balance_loss_mlp": 1.03678274, "epoch": 0.7150458439801594, "flos": 24969515195520.0, "grad_norm": 1.7984648518604964, "language_loss": 0.74032593, "learning_rate": 7.928667675493632e-07, "loss": 0.76174408, "num_input_tokens_seen": 256542225, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.73046875, "step": 11893, "time_per_iteration": 2.545046091079712 }, { "auxiliary_loss_clip": 0.01110242, "auxiliary_loss_mlp": 0.01033303, "balance_loss_clip": 1.01981425, "balance_loss_mlp": 1.03729558, "epoch": 0.7151059672328273, "flos": 16690777580160.0, "grad_norm": 2.93168451147386, "language_loss": 0.65967107, "learning_rate": 7.925562677431185e-07, "loss": 0.68110645, "num_input_tokens_seen": 256560730, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 11894, "time_per_iteration": 2.478919267654419 }, { "auxiliary_loss_clip": 0.011082, "auxiliary_loss_mlp": 0.01031454, "balance_loss_clip": 1.01898444, "balance_loss_mlp": 1.03627288, "epoch": 0.7151660904854953, "flos": 27271820309760.0, "grad_norm": 2.0673684781520922, "language_loss": 0.77809054, "learning_rate": 7.922458137232613e-07, "loss": 0.79948711, "num_input_tokens_seen": 256580505, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 11895, "time_per_iteration": 2.5454864501953125 }, { "auxiliary_loss_clip": 0.011087, "auxiliary_loss_mlp": 0.0103559, "balance_loss_clip": 1.021487, "balance_loss_mlp": 1.03684688, "epoch": 0.7152262137381632, "flos": 18332254229760.0, "grad_norm": 2.616656452871948, "language_loss": 0.69824898, "learning_rate": 7.919354055015643e-07, "loss": 0.71969187, "num_input_tokens_seen": 256597330, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.71875, "step": 11896, "time_per_iteration": 2.4662442207336426 }, { "auxiliary_loss_clip": 0.01108125, "auxiliary_loss_mlp": 0.01045261, "balance_loss_clip": 1.03145099, "balance_loss_mlp": 1.0354445, "epoch": 0.7152863369908312, "flos": 21799285752960.0, "grad_norm": 2.143000368589466, "language_loss": 0.86459267, "learning_rate": 7.91625043089798e-07, "loss": 0.88612652, "num_input_tokens_seen": 256616030, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7265625, "step": 11897, "time_per_iteration": 2.504565477371216 }, { "auxiliary_loss_clip": 0.01104175, "auxiliary_loss_mlp": 0.01038766, "balance_loss_clip": 1.02595663, "balance_loss_mlp": 1.0358603, "epoch": 0.7153464602434991, "flos": 22158427887360.0, "grad_norm": 1.8256336896967846, "language_loss": 0.78216493, "learning_rate": 7.913147264997304e-07, "loss": 0.80359429, "num_input_tokens_seen": 256635570, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.68359375, "step": 11898, "time_per_iteration": 2.5108847618103027 }, { "auxiliary_loss_clip": 0.01110136, "auxiliary_loss_mlp": 0.01028618, "balance_loss_clip": 1.01499879, "balance_loss_mlp": 1.03704226, "epoch": 0.7154065834961671, "flos": 24716057852160.0, "grad_norm": 1.7154499738455509, "language_loss": 0.72989547, "learning_rate": 7.910044557431302e-07, "loss": 0.75128305, "num_input_tokens_seen": 256655290, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 11899, "time_per_iteration": 2.5376265048980713 }, { "auxiliary_loss_clip": 0.01104097, "auxiliary_loss_mlp": 0.01035658, "balance_loss_clip": 1.022277, "balance_loss_mlp": 1.03456008, "epoch": 0.7154667067488351, "flos": 22601494149120.0, "grad_norm": 2.6638721250696507, "language_loss": 0.75869811, "learning_rate": 7.906942308317614e-07, "loss": 0.7800957, "num_input_tokens_seen": 256671605, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6953125, "step": 11900, "time_per_iteration": 2.5322818756103516 }, { "auxiliary_loss_clip": 0.01107699, "auxiliary_loss_mlp": 0.01031548, "balance_loss_clip": 1.01930571, "balance_loss_mlp": 1.03687072, "epoch": 0.7155268300015031, "flos": 18771154513920.0, "grad_norm": 2.0136436969361875, "language_loss": 0.81333685, "learning_rate": 7.903840517773886e-07, "loss": 0.83472931, "num_input_tokens_seen": 256689680, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.70703125, "step": 11901, "time_per_iteration": 2.483293294906616 }, { "auxiliary_loss_clip": 0.01111144, "auxiliary_loss_mlp": 0.01035377, "balance_loss_clip": 1.02216232, "balance_loss_mlp": 1.03693712, "epoch": 0.7155869532541711, "flos": 18296343607680.0, "grad_norm": 3.1704999945412355, "language_loss": 0.81812465, "learning_rate": 7.900739185917744e-07, "loss": 0.83958983, "num_input_tokens_seen": 256707760, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7421875, "step": 11902, "time_per_iteration": 2.5006275177001953 }, { "auxiliary_loss_clip": 0.01106243, "auxiliary_loss_mlp": 0.01032425, "balance_loss_clip": 1.01993811, "balance_loss_mlp": 1.03531957, "epoch": 0.715647076506839, "flos": 11980805783040.0, "grad_norm": 1.7882690293626693, "language_loss": 0.67948771, "learning_rate": 7.897638312866785e-07, "loss": 0.70087439, "num_input_tokens_seen": 256724150, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 11903, "time_per_iteration": 2.4641454219818115 }, { "auxiliary_loss_clip": 0.01102795, "auxiliary_loss_mlp": 0.01034321, "balance_loss_clip": 1.02210844, "balance_loss_mlp": 1.03444028, "epoch": 0.715707199759507, "flos": 18951641377920.0, "grad_norm": 2.205421451082311, "language_loss": 0.75870883, "learning_rate": 7.894537898738589e-07, "loss": 0.78008002, "num_input_tokens_seen": 256742780, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 11904, "time_per_iteration": 2.535045862197876 }, { "auxiliary_loss_clip": 0.01107155, "auxiliary_loss_mlp": 0.0103796, "balance_loss_clip": 1.02463269, "balance_loss_mlp": 1.03717756, "epoch": 0.7157673230121749, "flos": 15304410299520.0, "grad_norm": 2.121681264134159, "language_loss": 0.72034144, "learning_rate": 7.891437943650727e-07, "loss": 0.74179256, "num_input_tokens_seen": 256761355, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69921875, "step": 11905, "time_per_iteration": 2.4983232021331787 }, { "auxiliary_loss_clip": 0.01105807, "auxiliary_loss_mlp": 0.01031569, "balance_loss_clip": 1.01905775, "balance_loss_mlp": 1.03578246, "epoch": 0.715827446264843, "flos": 23221850964480.0, "grad_norm": 1.5681857053456063, "language_loss": 0.78133684, "learning_rate": 7.88833844772076e-07, "loss": 0.80271059, "num_input_tokens_seen": 256781335, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 11906, "time_per_iteration": 2.5084354877471924 }, { "auxiliary_loss_clip": 0.01030518, "auxiliary_loss_mlp": 0.01004289, "balance_loss_clip": 1.00325143, "balance_loss_mlp": 1.00783026, "epoch": 0.7158875695175109, "flos": 60975421833600.0, "grad_norm": 0.736628156842786, "language_loss": 0.55356675, "learning_rate": 7.885239411066205e-07, "loss": 0.57391483, "num_input_tokens_seen": 256838890, "router_z_loss_clip": 0.01037598, "router_z_loss_mlp": 0.22753906, "step": 11907, "time_per_iteration": 3.0382473468780518 }, { "auxiliary_loss_clip": 0.01106101, "auxiliary_loss_mlp": 0.01032528, "balance_loss_clip": 1.0202142, "balance_loss_mlp": 1.03568089, "epoch": 0.7159476927701789, "flos": 17128780024320.0, "grad_norm": 1.870016048891879, "language_loss": 0.69597006, "learning_rate": 7.882140833804593e-07, "loss": 0.71735644, "num_input_tokens_seen": 256858145, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 11908, "time_per_iteration": 2.4903008937835693 }, { "auxiliary_loss_clip": 0.01108272, "auxiliary_loss_mlp": 0.01033213, "balance_loss_clip": 1.01991498, "balance_loss_mlp": 1.03733325, "epoch": 0.7160078160228468, "flos": 22490601886080.0, "grad_norm": 1.7911505074186944, "language_loss": 0.71510559, "learning_rate": 7.879042716053415e-07, "loss": 0.73652041, "num_input_tokens_seen": 256878545, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 11909, "time_per_iteration": 2.5094783306121826 }, { "auxiliary_loss_clip": 0.01108488, "auxiliary_loss_mlp": 0.01032987, "balance_loss_clip": 1.02073193, "balance_loss_mlp": 1.03720212, "epoch": 0.7160679392755148, "flos": 30590935626240.0, "grad_norm": 1.57756241977393, "language_loss": 0.7505601, "learning_rate": 7.875945057930144e-07, "loss": 0.77197486, "num_input_tokens_seen": 256899920, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.71484375, "step": 11910, "time_per_iteration": 2.5917248725891113 }, { "auxiliary_loss_clip": 0.01106383, "auxiliary_loss_mlp": 0.0103253, "balance_loss_clip": 1.02147293, "balance_loss_mlp": 1.03665304, "epoch": 0.7161280625281827, "flos": 21323648833920.0, "grad_norm": 2.522167621784145, "language_loss": 0.76874113, "learning_rate": 7.872847859552251e-07, "loss": 0.79013026, "num_input_tokens_seen": 256918460, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.69921875, "step": 11911, "time_per_iteration": 2.5046324729919434 }, { "auxiliary_loss_clip": 0.01107302, "auxiliary_loss_mlp": 0.01031585, "balance_loss_clip": 1.0183413, "balance_loss_mlp": 1.03656411, "epoch": 0.7161881857808508, "flos": 61860078921600.0, "grad_norm": 1.8595418627274254, "language_loss": 0.5905146, "learning_rate": 7.869751121037192e-07, "loss": 0.61190343, "num_input_tokens_seen": 256942015, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 11912, "time_per_iteration": 2.8824732303619385 }, { "auxiliary_loss_clip": 0.0110626, "auxiliary_loss_mlp": 0.01032627, "balance_loss_clip": 1.01962769, "balance_loss_mlp": 1.03723288, "epoch": 0.7162483090335187, "flos": 20812101292800.0, "grad_norm": 1.6706974288099714, "language_loss": 0.78110075, "learning_rate": 7.866654842502376e-07, "loss": 0.80248958, "num_input_tokens_seen": 256961065, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69140625, "step": 11913, "time_per_iteration": 2.498957872390747 }, { "auxiliary_loss_clip": 0.01103676, "auxiliary_loss_mlp": 0.01027565, "balance_loss_clip": 1.01629353, "balance_loss_mlp": 1.03501379, "epoch": 0.7163084322861867, "flos": 24097532630400.0, "grad_norm": 1.6993417050537822, "language_loss": 0.74556202, "learning_rate": 7.863559024065234e-07, "loss": 0.76687443, "num_input_tokens_seen": 256982165, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6875, "step": 11914, "time_per_iteration": 2.5301244258880615 }, { "auxiliary_loss_clip": 0.01102419, "auxiliary_loss_mlp": 0.01032642, "balance_loss_clip": 1.02045286, "balance_loss_mlp": 1.03562069, "epoch": 0.7163685555388547, "flos": 20080888128000.0, "grad_norm": 1.6629253013553649, "language_loss": 0.74112391, "learning_rate": 7.860463665843143e-07, "loss": 0.76247454, "num_input_tokens_seen": 256999825, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.66796875, "step": 11915, "time_per_iteration": 2.4905707836151123 }, { "auxiliary_loss_clip": 0.01105824, "auxiliary_loss_mlp": 0.0103253, "balance_loss_clip": 1.02006674, "balance_loss_mlp": 1.03454161, "epoch": 0.7164286787915226, "flos": 17456967613440.0, "grad_norm": 2.9734159192195135, "language_loss": 0.80982411, "learning_rate": 7.85736876795349e-07, "loss": 0.83120763, "num_input_tokens_seen": 257017450, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 11916, "time_per_iteration": 2.5003628730773926 }, { "auxiliary_loss_clip": 0.01106752, "auxiliary_loss_mlp": 0.01032735, "balance_loss_clip": 1.02075434, "balance_loss_mlp": 1.03629434, "epoch": 0.7164888020441906, "flos": 19718908819200.0, "grad_norm": 2.0441071306874234, "language_loss": 0.68570656, "learning_rate": 7.854274330513626e-07, "loss": 0.70710146, "num_input_tokens_seen": 257035465, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.703125, "step": 11917, "time_per_iteration": 2.46966814994812 }, { "auxiliary_loss_clip": 0.01105507, "auxiliary_loss_mlp": 0.01034649, "balance_loss_clip": 1.02120817, "balance_loss_mlp": 1.03561974, "epoch": 0.7165489252968585, "flos": 21470523546240.0, "grad_norm": 2.096421669741665, "language_loss": 0.76226348, "learning_rate": 7.851180353640896e-07, "loss": 0.78366506, "num_input_tokens_seen": 257053750, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6953125, "step": 11918, "time_per_iteration": 2.484184503555298 }, { "auxiliary_loss_clip": 0.01030458, "auxiliary_loss_mlp": 0.01000644, "balance_loss_clip": 0.99960738, "balance_loss_mlp": 1.00758862, "epoch": 0.7166090485495266, "flos": 69928060464000.0, "grad_norm": 0.6330847145187452, "language_loss": 0.53902048, "learning_rate": 7.848086837452639e-07, "loss": 0.55933148, "num_input_tokens_seen": 257121215, "router_z_loss_clip": 0.01037598, "router_z_loss_mlp": 0.22851562, "step": 11919, "time_per_iteration": 3.156749725341797 }, { "auxiliary_loss_clip": 0.01108751, "auxiliary_loss_mlp": 0.01031337, "balance_loss_clip": 1.01902914, "balance_loss_mlp": 1.03775001, "epoch": 0.7166691718021945, "flos": 27343892949120.0, "grad_norm": 2.016513021245901, "language_loss": 0.68852055, "learning_rate": 7.844993782066132e-07, "loss": 0.70992136, "num_input_tokens_seen": 257143370, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 11920, "time_per_iteration": 3.9484622478485107 }, { "auxiliary_loss_clip": 0.01105739, "auxiliary_loss_mlp": 0.01035105, "balance_loss_clip": 1.02203357, "balance_loss_mlp": 1.03531408, "epoch": 0.7167292950548625, "flos": 30408868563840.0, "grad_norm": 2.6481361327692126, "language_loss": 0.75037718, "learning_rate": 7.841901187598678e-07, "loss": 0.77178562, "num_input_tokens_seen": 257162160, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 11921, "time_per_iteration": 4.114599227905273 }, { "auxiliary_loss_clip": 0.01112144, "auxiliary_loss_mlp": 0.01031867, "balance_loss_clip": 1.01652527, "balance_loss_mlp": 1.03760695, "epoch": 0.7167894183075304, "flos": 14571257800320.0, "grad_norm": 4.0483006503808685, "language_loss": 0.75559938, "learning_rate": 7.83880905416755e-07, "loss": 0.77703953, "num_input_tokens_seen": 257179300, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.74609375, "step": 11922, "time_per_iteration": 2.502120018005371 }, { "auxiliary_loss_clip": 0.01030613, "auxiliary_loss_mlp": 0.0100513, "balance_loss_clip": 1.00406933, "balance_loss_mlp": 1.00771856, "epoch": 0.7168495415601984, "flos": 64110674407680.0, "grad_norm": 0.7680305165504547, "language_loss": 0.55101722, "learning_rate": 7.83571738189001e-07, "loss": 0.5713746, "num_input_tokens_seen": 257235470, "router_z_loss_clip": 0.01062012, "router_z_loss_mlp": 0.22851562, "step": 11923, "time_per_iteration": 2.9505105018615723 }, { "auxiliary_loss_clip": 0.01105959, "auxiliary_loss_mlp": 0.01035063, "balance_loss_clip": 1.02200961, "balance_loss_mlp": 1.03518593, "epoch": 0.7169096648128663, "flos": 24681440119680.0, "grad_norm": 1.7817421493060124, "language_loss": 0.77026248, "learning_rate": 7.832626170883279e-07, "loss": 0.79167271, "num_input_tokens_seen": 257255850, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 11924, "time_per_iteration": 3.9691672325134277 }, { "auxiliary_loss_clip": 0.01106426, "auxiliary_loss_mlp": 0.01031865, "balance_loss_clip": 1.02024817, "balance_loss_mlp": 1.03677607, "epoch": 0.7169697880655344, "flos": 20667525050880.0, "grad_norm": 1.7408050585106956, "language_loss": 0.68450403, "learning_rate": 7.829535421264588e-07, "loss": 0.70588696, "num_input_tokens_seen": 257275425, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 11925, "time_per_iteration": 3.9921092987060547 }, { "auxiliary_loss_clip": 0.01099715, "auxiliary_loss_mlp": 0.01027873, "balance_loss_clip": 1.01605916, "balance_loss_mlp": 1.03347707, "epoch": 0.7170299113182023, "flos": 21032700670080.0, "grad_norm": 1.5164045327994935, "language_loss": 0.77692592, "learning_rate": 7.826445133151133e-07, "loss": 0.7982018, "num_input_tokens_seen": 257295740, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6640625, "step": 11926, "time_per_iteration": 2.533940076828003 }, { "auxiliary_loss_clip": 0.01108933, "auxiliary_loss_mlp": 0.01036165, "balance_loss_clip": 1.02298045, "balance_loss_mlp": 1.03516257, "epoch": 0.7170900345708703, "flos": 22893304239360.0, "grad_norm": 2.0260138389637534, "language_loss": 0.77049088, "learning_rate": 7.823355306660093e-07, "loss": 0.79194188, "num_input_tokens_seen": 257315970, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.734375, "step": 11927, "time_per_iteration": 2.5173308849334717 }, { "auxiliary_loss_clip": 0.01105674, "auxiliary_loss_mlp": 0.01029449, "balance_loss_clip": 1.0165031, "balance_loss_mlp": 1.03787518, "epoch": 0.7171501578235383, "flos": 15518688883200.0, "grad_norm": 1.5253390487660416, "language_loss": 0.68981218, "learning_rate": 7.820265941908642e-07, "loss": 0.7111634, "num_input_tokens_seen": 257334230, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6796875, "step": 11928, "time_per_iteration": 2.466914653778076 }, { "auxiliary_loss_clip": 0.01102174, "auxiliary_loss_mlp": 0.01029593, "balance_loss_clip": 1.017851, "balance_loss_mlp": 1.03480005, "epoch": 0.7172102810762062, "flos": 26104292640000.0, "grad_norm": 1.9311107664218699, "language_loss": 0.65180629, "learning_rate": 7.817177039013931e-07, "loss": 0.67312396, "num_input_tokens_seen": 257352145, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 11929, "time_per_iteration": 2.5466179847717285 }, { "auxiliary_loss_clip": 0.01106643, "auxiliary_loss_mlp": 0.01029532, "balance_loss_clip": 1.01699734, "balance_loss_mlp": 1.03571248, "epoch": 0.7172704043288742, "flos": 21506649649920.0, "grad_norm": 2.1186419165329764, "language_loss": 0.69642556, "learning_rate": 7.81408859809308e-07, "loss": 0.71778738, "num_input_tokens_seen": 257371460, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 11930, "time_per_iteration": 2.4833695888519287 }, { "auxiliary_loss_clip": 0.01105873, "auxiliary_loss_mlp": 0.0103094, "balance_loss_clip": 1.01857817, "balance_loss_mlp": 1.03500259, "epoch": 0.7173305275815421, "flos": 18770939032320.0, "grad_norm": 1.9407804914433708, "language_loss": 0.80921739, "learning_rate": 7.811000619263219e-07, "loss": 0.83058548, "num_input_tokens_seen": 257390800, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7109375, "step": 11931, "time_per_iteration": 2.4937973022460938 }, { "auxiliary_loss_clip": 0.0110491, "auxiliary_loss_mlp": 0.01028239, "balance_loss_clip": 1.01641381, "balance_loss_mlp": 1.03617787, "epoch": 0.7173906508342102, "flos": 16179876483840.0, "grad_norm": 2.320976917527286, "language_loss": 0.78542817, "learning_rate": 7.80791310264143e-07, "loss": 0.80675966, "num_input_tokens_seen": 257407495, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 11932, "time_per_iteration": 2.44781231880188 }, { "auxiliary_loss_clip": 0.01102837, "auxiliary_loss_mlp": 0.01028902, "balance_loss_clip": 1.01703513, "balance_loss_mlp": 1.03421819, "epoch": 0.7174507740868781, "flos": 26613864933120.0, "grad_norm": 1.645347258361073, "language_loss": 0.75190532, "learning_rate": 7.804826048344803e-07, "loss": 0.77322268, "num_input_tokens_seen": 257429675, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 11933, "time_per_iteration": 2.523130178451538 }, { "auxiliary_loss_clip": 0.01111691, "auxiliary_loss_mlp": 0.01036138, "balance_loss_clip": 1.02092719, "balance_loss_mlp": 1.03772473, "epoch": 0.7175108973395461, "flos": 18432911116800.0, "grad_norm": 2.522840186102152, "language_loss": 0.69266474, "learning_rate": 7.801739456490388e-07, "loss": 0.71414304, "num_input_tokens_seen": 257442765, "router_z_loss_clip": 0.15234375, "router_z_loss_mlp": 0.73828125, "step": 11934, "time_per_iteration": 2.4697506427764893 }, { "auxiliary_loss_clip": 0.01105501, "auxiliary_loss_mlp": 0.01033966, "balance_loss_clip": 1.02088916, "balance_loss_mlp": 1.03534055, "epoch": 0.717571020592214, "flos": 23914962777600.0, "grad_norm": 2.258830321554335, "language_loss": 0.86561084, "learning_rate": 7.798653327195237e-07, "loss": 0.88700557, "num_input_tokens_seen": 257459310, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 11935, "time_per_iteration": 2.5140082836151123 }, { "auxiliary_loss_clip": 0.0110553, "auxiliary_loss_mlp": 0.01031724, "balance_loss_clip": 1.01862907, "balance_loss_mlp": 1.03549528, "epoch": 0.717631143844882, "flos": 38256930109440.0, "grad_norm": 2.6474273775195676, "language_loss": 0.73945642, "learning_rate": 7.795567660576388e-07, "loss": 0.76082897, "num_input_tokens_seen": 257484750, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 11936, "time_per_iteration": 2.6534085273742676 }, { "auxiliary_loss_clip": 0.01030101, "auxiliary_loss_mlp": 0.01001294, "balance_loss_clip": 1.00029302, "balance_loss_mlp": 1.00721633, "epoch": 0.7176912670975499, "flos": 65515896328320.0, "grad_norm": 0.7790654768444443, "language_loss": 0.55899829, "learning_rate": 7.79248245675082e-07, "loss": 0.57931221, "num_input_tokens_seen": 257543110, "router_z_loss_clip": 0.01000977, "router_z_loss_mlp": 0.22851562, "step": 11937, "time_per_iteration": 3.1316895484924316 }, { "auxiliary_loss_clip": 0.01110912, "auxiliary_loss_mlp": 0.01033688, "balance_loss_clip": 1.01959705, "balance_loss_mlp": 1.03833699, "epoch": 0.717751390350218, "flos": 31281066610560.0, "grad_norm": 1.9261164573657232, "language_loss": 0.54555446, "learning_rate": 7.789397715835542e-07, "loss": 0.56700051, "num_input_tokens_seen": 257567410, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7265625, "step": 11938, "time_per_iteration": 2.561933994293213 }, { "auxiliary_loss_clip": 0.01102039, "auxiliary_loss_mlp": 0.01026624, "balance_loss_clip": 1.01451874, "balance_loss_mlp": 1.03441262, "epoch": 0.7178115136028859, "flos": 19859031774720.0, "grad_norm": 1.598591471625242, "language_loss": 0.76404572, "learning_rate": 7.786313437947527e-07, "loss": 0.78533232, "num_input_tokens_seen": 257586270, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.67578125, "step": 11939, "time_per_iteration": 2.5357234477996826 }, { "auxiliary_loss_clip": 0.01030048, "auxiliary_loss_mlp": 0.0099968, "balance_loss_clip": 0.99857086, "balance_loss_mlp": 1.00715113, "epoch": 0.7178716368555539, "flos": 64348655967360.0, "grad_norm": 0.7583722006823345, "language_loss": 0.61431843, "learning_rate": 7.783229623203738e-07, "loss": 0.63461566, "num_input_tokens_seen": 257647415, "router_z_loss_clip": 0.0111084, "router_z_loss_mlp": 0.22851562, "step": 11940, "time_per_iteration": 3.0772197246551514 }, { "auxiliary_loss_clip": 0.01103482, "auxiliary_loss_mlp": 0.01029904, "balance_loss_clip": 1.01760173, "balance_loss_mlp": 1.03477299, "epoch": 0.7179317601082219, "flos": 26762607152640.0, "grad_norm": 1.4311335562862737, "language_loss": 0.58787036, "learning_rate": 7.780146271721097e-07, "loss": 0.60920423, "num_input_tokens_seen": 257669795, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 11941, "time_per_iteration": 2.540552854537964 }, { "auxiliary_loss_clip": 0.0110536, "auxiliary_loss_mlp": 0.01032015, "balance_loss_clip": 1.01982558, "balance_loss_mlp": 1.03663898, "epoch": 0.7179918833608898, "flos": 23513804709120.0, "grad_norm": 1.8012623012369076, "language_loss": 0.79444039, "learning_rate": 7.777063383616543e-07, "loss": 0.81581414, "num_input_tokens_seen": 257687415, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 11942, "time_per_iteration": 2.4943418502807617 }, { "auxiliary_loss_clip": 0.01107316, "auxiliary_loss_mlp": 0.01036806, "balance_loss_clip": 1.02399659, "balance_loss_mlp": 1.03683615, "epoch": 0.7180520066135578, "flos": 17165588486400.0, "grad_norm": 1.7611534467069805, "language_loss": 0.65677273, "learning_rate": 7.773980959006968e-07, "loss": 0.67821395, "num_input_tokens_seen": 257706215, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 11943, "time_per_iteration": 2.4769575595855713 }, { "auxiliary_loss_clip": 0.01102933, "auxiliary_loss_mlp": 0.01029651, "balance_loss_clip": 1.01680589, "balance_loss_mlp": 1.03497458, "epoch": 0.7181121298662257, "flos": 17566638814080.0, "grad_norm": 1.8016897024152947, "language_loss": 0.78852338, "learning_rate": 7.770898998009254e-07, "loss": 0.80984926, "num_input_tokens_seen": 257724740, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6796875, "step": 11944, "time_per_iteration": 2.47361159324646 }, { "auxiliary_loss_clip": 0.011094, "auxiliary_loss_mlp": 0.01036319, "balance_loss_clip": 1.02245533, "balance_loss_mlp": 1.03715348, "epoch": 0.7181722531188938, "flos": 11947660508160.0, "grad_norm": 2.316699033515052, "language_loss": 0.6297133, "learning_rate": 7.767817500740277e-07, "loss": 0.65117043, "num_input_tokens_seen": 257742060, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.72265625, "step": 11945, "time_per_iteration": 2.465425729751587 }, { "auxiliary_loss_clip": 0.01030563, "auxiliary_loss_mlp": 0.01000145, "balance_loss_clip": 0.99899513, "balance_loss_mlp": 1.0075736, "epoch": 0.7182323763715617, "flos": 65503649790720.0, "grad_norm": 0.7028988084213221, "language_loss": 0.51014233, "learning_rate": 7.76473646731689e-07, "loss": 0.53044939, "num_input_tokens_seen": 257802250, "router_z_loss_clip": 0.01147461, "router_z_loss_mlp": 0.23046875, "step": 11946, "time_per_iteration": 3.023697853088379 }, { "auxiliary_loss_clip": 0.01108519, "auxiliary_loss_mlp": 0.01033828, "balance_loss_clip": 1.0189805, "balance_loss_mlp": 1.03628588, "epoch": 0.7182924996242297, "flos": 20630932070400.0, "grad_norm": 2.374259168950999, "language_loss": 0.74572861, "learning_rate": 7.761655897855925e-07, "loss": 0.76715207, "num_input_tokens_seen": 257821155, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.72265625, "step": 11947, "time_per_iteration": 2.4990646839141846 }, { "auxiliary_loss_clip": 0.01102472, "auxiliary_loss_mlp": 0.01027761, "balance_loss_clip": 1.01502943, "balance_loss_mlp": 1.03345549, "epoch": 0.7183526228768976, "flos": 16216433550720.0, "grad_norm": 1.546947227897241, "language_loss": 0.72738916, "learning_rate": 7.758575792474187e-07, "loss": 0.7486915, "num_input_tokens_seen": 257839905, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 11948, "time_per_iteration": 2.47538161277771 }, { "auxiliary_loss_clip": 0.01107456, "auxiliary_loss_mlp": 0.0103952, "balance_loss_clip": 1.02572739, "balance_loss_mlp": 1.03634071, "epoch": 0.7184127461295656, "flos": 22232655342720.0, "grad_norm": 1.5100220100408657, "language_loss": 0.71390319, "learning_rate": 7.755496151288483e-07, "loss": 0.7353729, "num_input_tokens_seen": 257860055, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7109375, "step": 11949, "time_per_iteration": 2.510251998901367 }, { "auxiliary_loss_clip": 0.01104071, "auxiliary_loss_mlp": 0.01029488, "balance_loss_clip": 1.01812124, "balance_loss_mlp": 1.03612781, "epoch": 0.7184728693822335, "flos": 27344503480320.0, "grad_norm": 1.9641297460013059, "language_loss": 0.76294839, "learning_rate": 7.752416974415598e-07, "loss": 0.78428394, "num_input_tokens_seen": 257879315, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6796875, "step": 11950, "time_per_iteration": 2.557612895965576 }, { "auxiliary_loss_clip": 0.01109576, "auxiliary_loss_mlp": 0.01032787, "balance_loss_clip": 1.0188216, "balance_loss_mlp": 1.0385412, "epoch": 0.7185329926349016, "flos": 16508530949760.0, "grad_norm": 3.0244560066482165, "language_loss": 0.68098509, "learning_rate": 7.749338261972282e-07, "loss": 0.70240873, "num_input_tokens_seen": 257896570, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7109375, "step": 11951, "time_per_iteration": 2.4713869094848633 }, { "auxiliary_loss_clip": 0.01112867, "auxiliary_loss_mlp": 0.01030669, "balance_loss_clip": 1.0166378, "balance_loss_mlp": 1.04014063, "epoch": 0.7185931158875695, "flos": 23951052967680.0, "grad_norm": 1.8526160375990668, "language_loss": 0.78073561, "learning_rate": 7.746260014075286e-07, "loss": 0.80217099, "num_input_tokens_seen": 257916855, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7265625, "step": 11952, "time_per_iteration": 2.526660442352295 }, { "auxiliary_loss_clip": 0.01109418, "auxiliary_loss_mlp": 0.01029192, "balance_loss_clip": 1.01593566, "balance_loss_mlp": 1.0372895, "epoch": 0.7186532391402375, "flos": 26542007775360.0, "grad_norm": 2.0649052499614817, "language_loss": 0.74975407, "learning_rate": 7.743182230841352e-07, "loss": 0.77114022, "num_input_tokens_seen": 257937140, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 11953, "time_per_iteration": 2.5363376140594482 }, { "auxiliary_loss_clip": 0.011055, "auxiliary_loss_mlp": 0.01032174, "balance_loss_clip": 1.01914454, "balance_loss_mlp": 1.03550982, "epoch": 0.7187133623929055, "flos": 22383049587840.0, "grad_norm": 1.878882821453294, "language_loss": 0.72940838, "learning_rate": 7.740104912387164e-07, "loss": 0.75078511, "num_input_tokens_seen": 257956785, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 11954, "time_per_iteration": 2.5225019454956055 }, { "auxiliary_loss_clip": 0.01108957, "auxiliary_loss_mlp": 0.01036042, "balance_loss_clip": 1.02316713, "balance_loss_mlp": 1.03819251, "epoch": 0.7187734856455734, "flos": 15779580341760.0, "grad_norm": 1.8359318831211697, "language_loss": 0.74435598, "learning_rate": 7.737028058829425e-07, "loss": 0.76580596, "num_input_tokens_seen": 257975455, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 11955, "time_per_iteration": 2.479583501815796 }, { "auxiliary_loss_clip": 0.0110662, "auxiliary_loss_mlp": 0.01030616, "balance_loss_clip": 1.01765835, "balance_loss_mlp": 1.03595257, "epoch": 0.7188336088982414, "flos": 31759612531200.0, "grad_norm": 1.8446193006229192, "language_loss": 0.73530173, "learning_rate": 7.733951670284817e-07, "loss": 0.75667405, "num_input_tokens_seen": 257996850, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 11956, "time_per_iteration": 2.5807909965515137 }, { "auxiliary_loss_clip": 0.01106956, "auxiliary_loss_mlp": 0.01032027, "balance_loss_clip": 1.0184195, "balance_loss_mlp": 1.03508759, "epoch": 0.7188937321509093, "flos": 21465208333440.0, "grad_norm": 1.8218511772941148, "language_loss": 0.70788771, "learning_rate": 7.730875746869987e-07, "loss": 0.72927755, "num_input_tokens_seen": 258016145, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 11957, "time_per_iteration": 2.523005247116089 }, { "auxiliary_loss_clip": 0.0110923, "auxiliary_loss_mlp": 0.01041199, "balance_loss_clip": 1.02698302, "balance_loss_mlp": 1.03674889, "epoch": 0.7189538554035774, "flos": 27271497087360.0, "grad_norm": 13.578453526622184, "language_loss": 0.73613548, "learning_rate": 7.727800288701582e-07, "loss": 0.75763983, "num_input_tokens_seen": 258035420, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7265625, "step": 11958, "time_per_iteration": 2.598132610321045 }, { "auxiliary_loss_clip": 0.01105182, "auxiliary_loss_mlp": 0.01036277, "balance_loss_clip": 1.02302718, "balance_loss_mlp": 1.03653336, "epoch": 0.7190139786562453, "flos": 21580625710080.0, "grad_norm": 2.5086459776494725, "language_loss": 0.83979881, "learning_rate": 7.724725295896215e-07, "loss": 0.86121345, "num_input_tokens_seen": 258053520, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6875, "step": 11959, "time_per_iteration": 2.5478930473327637 }, { "auxiliary_loss_clip": 0.01111325, "auxiliary_loss_mlp": 0.01032439, "balance_loss_clip": 1.0184145, "balance_loss_mlp": 1.03885078, "epoch": 0.7190741019089133, "flos": 26721237663360.0, "grad_norm": 1.5916171988881087, "language_loss": 0.81661612, "learning_rate": 7.7216507685705e-07, "loss": 0.83805376, "num_input_tokens_seen": 258073020, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.72265625, "step": 11960, "time_per_iteration": 2.5322182178497314 }, { "auxiliary_loss_clip": 0.01105633, "auxiliary_loss_mlp": 0.0104061, "balance_loss_clip": 1.02624512, "balance_loss_mlp": 1.0368489, "epoch": 0.7191342251615812, "flos": 26104759516800.0, "grad_norm": 1.6781815801693452, "language_loss": 0.77672863, "learning_rate": 7.718576706841013e-07, "loss": 0.79819107, "num_input_tokens_seen": 258093155, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.6875, "step": 11961, "time_per_iteration": 2.567272901535034 }, { "auxiliary_loss_clip": 0.01103107, "auxiliary_loss_mlp": 0.0103184, "balance_loss_clip": 1.01986599, "balance_loss_mlp": 1.03623199, "epoch": 0.7191943484142492, "flos": 22967028904320.0, "grad_norm": 1.782541706511271, "language_loss": 0.75284147, "learning_rate": 7.715503110824326e-07, "loss": 0.77419096, "num_input_tokens_seen": 258113905, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.66796875, "step": 11962, "time_per_iteration": 3.9100754261016846 }, { "auxiliary_loss_clip": 0.0110727, "auxiliary_loss_mlp": 0.01029637, "balance_loss_clip": 1.0152899, "balance_loss_mlp": 1.03588724, "epoch": 0.7192544716669171, "flos": 22565332131840.0, "grad_norm": 2.1523830310113605, "language_loss": 0.74940312, "learning_rate": 7.712429980637001e-07, "loss": 0.77077216, "num_input_tokens_seen": 258132820, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.71484375, "step": 11963, "time_per_iteration": 3.991231679916382 }, { "auxiliary_loss_clip": 0.01110217, "auxiliary_loss_mlp": 0.01037628, "balance_loss_clip": 1.02304888, "balance_loss_mlp": 1.0371902, "epoch": 0.7193145949195852, "flos": 18982200873600.0, "grad_norm": 2.675066133590613, "language_loss": 0.8094418, "learning_rate": 7.709357316395564e-07, "loss": 0.83092022, "num_input_tokens_seen": 258148055, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.73046875, "step": 11964, "time_per_iteration": 2.466412305831909 }, { "auxiliary_loss_clip": 0.011053, "auxiliary_loss_mlp": 0.01032823, "balance_loss_clip": 1.01991224, "balance_loss_mlp": 1.03600144, "epoch": 0.7193747181722531, "flos": 18004246208640.0, "grad_norm": 2.2433676769466686, "language_loss": 0.75024307, "learning_rate": 7.70628511821652e-07, "loss": 0.77162433, "num_input_tokens_seen": 258165995, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 11965, "time_per_iteration": 3.890397548675537 }, { "auxiliary_loss_clip": 0.01107332, "auxiliary_loss_mlp": 0.01032721, "balance_loss_clip": 1.01901805, "balance_loss_mlp": 1.03584993, "epoch": 0.7194348414249211, "flos": 24389414547840.0, "grad_norm": 1.663143739431333, "language_loss": 0.77397025, "learning_rate": 7.703213386216377e-07, "loss": 0.79537076, "num_input_tokens_seen": 258186165, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71484375, "step": 11966, "time_per_iteration": 3.968190908432007 }, { "auxiliary_loss_clip": 0.0110724, "auxiliary_loss_mlp": 0.01034406, "balance_loss_clip": 1.02157903, "balance_loss_mlp": 1.03696585, "epoch": 0.7194949646775891, "flos": 22163455791360.0, "grad_norm": 2.176836826581815, "language_loss": 0.73099917, "learning_rate": 7.700142120511619e-07, "loss": 0.75241566, "num_input_tokens_seen": 258204595, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 11967, "time_per_iteration": 2.4697625637054443 }, { "auxiliary_loss_clip": 0.01102519, "auxiliary_loss_mlp": 0.01031679, "balance_loss_clip": 1.02061653, "balance_loss_mlp": 1.0376265, "epoch": 0.719555087930257, "flos": 20266366982400.0, "grad_norm": 1.8499302587721946, "language_loss": 0.81479788, "learning_rate": 7.6970713212187e-07, "loss": 0.8361398, "num_input_tokens_seen": 258223110, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.6484375, "step": 11968, "time_per_iteration": 2.4837894439697266 }, { "auxiliary_loss_clip": 0.01104146, "auxiliary_loss_mlp": 0.01025873, "balance_loss_clip": 1.01358867, "balance_loss_mlp": 1.03515506, "epoch": 0.719615211182925, "flos": 24716309247360.0, "grad_norm": 2.242525486938947, "language_loss": 0.76707858, "learning_rate": 7.69400098845407e-07, "loss": 0.78837872, "num_input_tokens_seen": 258242660, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 11969, "time_per_iteration": 2.5037965774536133 }, { "auxiliary_loss_clip": 0.011054, "auxiliary_loss_mlp": 0.01030601, "balance_loss_clip": 1.01690412, "balance_loss_mlp": 1.03442574, "epoch": 0.719675334435593, "flos": 20009641501440.0, "grad_norm": 1.4412986023317036, "language_loss": 0.70982814, "learning_rate": 7.69093112233417e-07, "loss": 0.73118818, "num_input_tokens_seen": 258261850, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7109375, "step": 11970, "time_per_iteration": 2.4931440353393555 }, { "auxiliary_loss_clip": 0.01030116, "auxiliary_loss_mlp": 0.01001946, "balance_loss_clip": 1.00087953, "balance_loss_mlp": 1.00734699, "epoch": 0.719735457688261, "flos": 44199861177600.0, "grad_norm": 0.9139179141386677, "language_loss": 0.60821629, "learning_rate": 7.68786172297538e-07, "loss": 0.62853694, "num_input_tokens_seen": 258312570, "router_z_loss_clip": 0.01068115, "router_z_loss_mlp": 0.22851562, "step": 11971, "time_per_iteration": 3.003838300704956 }, { "auxiliary_loss_clip": 0.01111967, "auxiliary_loss_mlp": 0.01033096, "balance_loss_clip": 1.01933312, "balance_loss_mlp": 1.03744197, "epoch": 0.7197955809409289, "flos": 16802890905600.0, "grad_norm": 1.8122981308272867, "language_loss": 0.80145472, "learning_rate": 7.684792790494105e-07, "loss": 0.8229053, "num_input_tokens_seen": 258331600, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 11972, "time_per_iteration": 2.49880051612854 }, { "auxiliary_loss_clip": 0.01110522, "auxiliary_loss_mlp": 0.01033205, "balance_loss_clip": 1.01932883, "balance_loss_mlp": 1.03873146, "epoch": 0.7198557041935969, "flos": 24535391420160.0, "grad_norm": 2.015711027412201, "language_loss": 0.76066566, "learning_rate": 7.681724325006733e-07, "loss": 0.78210294, "num_input_tokens_seen": 258351785, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 11973, "time_per_iteration": 2.5375914573669434 }, { "auxiliary_loss_clip": 0.01030113, "auxiliary_loss_mlp": 0.00998834, "balance_loss_clip": 0.99779683, "balance_loss_mlp": 1.0072217, "epoch": 0.7199158274462648, "flos": 70710839602560.0, "grad_norm": 0.8567206520965902, "language_loss": 0.57171416, "learning_rate": 7.6786563266296e-07, "loss": 0.5920037, "num_input_tokens_seen": 258404035, "router_z_loss_clip": 0.01037598, "router_z_loss_mlp": 0.22851562, "step": 11974, "time_per_iteration": 2.9880080223083496 }, { "auxiliary_loss_clip": 0.01106231, "auxiliary_loss_mlp": 0.01034046, "balance_loss_clip": 1.02062345, "balance_loss_mlp": 1.03407454, "epoch": 0.7199759506989328, "flos": 29347995352320.0, "grad_norm": 1.994806190016656, "language_loss": 0.61094695, "learning_rate": 7.675588795479062e-07, "loss": 0.63234973, "num_input_tokens_seen": 258424850, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.72265625, "step": 11975, "time_per_iteration": 2.569587469100952 }, { "auxiliary_loss_clip": 0.01104109, "auxiliary_loss_mlp": 0.01031574, "balance_loss_clip": 1.01893747, "balance_loss_mlp": 1.03388166, "epoch": 0.7200360739516007, "flos": 24640465680000.0, "grad_norm": 2.0054085348370934, "language_loss": 0.6769762, "learning_rate": 7.672521731671425e-07, "loss": 0.69833297, "num_input_tokens_seen": 258445485, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 11976, "time_per_iteration": 2.566577672958374 }, { "auxiliary_loss_clip": 0.01106829, "auxiliary_loss_mlp": 0.0103094, "balance_loss_clip": 1.01833379, "balance_loss_mlp": 1.03623772, "epoch": 0.7200961972042688, "flos": 20812855478400.0, "grad_norm": 1.9294144659730768, "language_loss": 0.67250699, "learning_rate": 7.669455135323004e-07, "loss": 0.69388473, "num_input_tokens_seen": 258464505, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 11977, "time_per_iteration": 2.5432534217834473 }, { "auxiliary_loss_clip": 0.01109378, "auxiliary_loss_mlp": 0.0103422, "balance_loss_clip": 1.02096379, "balance_loss_mlp": 1.03768241, "epoch": 0.7201563204569367, "flos": 31245910174080.0, "grad_norm": 1.816163524423317, "language_loss": 0.75848806, "learning_rate": 7.666389006550074e-07, "loss": 0.77992404, "num_input_tokens_seen": 258487190, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 11978, "time_per_iteration": 2.6284797191619873 }, { "auxiliary_loss_clip": 0.01103729, "auxiliary_loss_mlp": 0.01032594, "balance_loss_clip": 1.01940966, "balance_loss_mlp": 1.03487158, "epoch": 0.7202164437096047, "flos": 26651391667200.0, "grad_norm": 2.000484702888463, "language_loss": 0.79244423, "learning_rate": 7.663323345468908e-07, "loss": 0.81380749, "num_input_tokens_seen": 258503790, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6875, "step": 11979, "time_per_iteration": 2.550449848175049 }, { "auxiliary_loss_clip": 0.01107499, "auxiliary_loss_mlp": 0.01030454, "balance_loss_clip": 1.01733494, "balance_loss_mlp": 1.03719711, "epoch": 0.7202765669622727, "flos": 25959608657280.0, "grad_norm": 1.90027174089091, "language_loss": 0.64735031, "learning_rate": 7.660258152195767e-07, "loss": 0.66872984, "num_input_tokens_seen": 258527335, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 11980, "time_per_iteration": 2.5656015872955322 }, { "auxiliary_loss_clip": 0.01108823, "auxiliary_loss_mlp": 0.01036613, "balance_loss_clip": 1.02249837, "balance_loss_mlp": 1.03744507, "epoch": 0.7203366902149406, "flos": 28512354372480.0, "grad_norm": 2.000265419423584, "language_loss": 0.66981709, "learning_rate": 7.657193426846871e-07, "loss": 0.69127142, "num_input_tokens_seen": 258546690, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7109375, "step": 11981, "time_per_iteration": 2.5428075790405273 }, { "auxiliary_loss_clip": 0.01107581, "auxiliary_loss_mlp": 0.01034058, "balance_loss_clip": 1.02088594, "balance_loss_mlp": 1.03660762, "epoch": 0.7203968134676086, "flos": 21106030285440.0, "grad_norm": 3.0660780471009232, "language_loss": 0.73590559, "learning_rate": 7.65412916953843e-07, "loss": 0.75732195, "num_input_tokens_seen": 258566340, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 11982, "time_per_iteration": 2.479658365249634 }, { "auxiliary_loss_clip": 0.01105378, "auxiliary_loss_mlp": 0.01037095, "balance_loss_clip": 1.02493, "balance_loss_mlp": 1.03483891, "epoch": 0.7204569367202766, "flos": 18332146488960.0, "grad_norm": 2.4299086662507214, "language_loss": 0.65743458, "learning_rate": 7.65106538038665e-07, "loss": 0.67885935, "num_input_tokens_seen": 258584455, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.703125, "step": 11983, "time_per_iteration": 2.4649083614349365 }, { "auxiliary_loss_clip": 0.011073, "auxiliary_loss_mlp": 0.01031044, "balance_loss_clip": 1.01793075, "balance_loss_mlp": 1.03726792, "epoch": 0.7205170599729446, "flos": 23255103980160.0, "grad_norm": 1.5390559283575926, "language_loss": 0.66607416, "learning_rate": 7.648002059507715e-07, "loss": 0.68745756, "num_input_tokens_seen": 258604725, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 11984, "time_per_iteration": 2.501412868499756 }, { "auxiliary_loss_clip": 0.0111267, "auxiliary_loss_mlp": 0.01032754, "balance_loss_clip": 1.01876533, "balance_loss_mlp": 1.03931606, "epoch": 0.7205771832256125, "flos": 20120892900480.0, "grad_norm": 1.5776633318036175, "language_loss": 0.73568535, "learning_rate": 7.644939207017771e-07, "loss": 0.75713956, "num_input_tokens_seen": 258622885, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.734375, "step": 11985, "time_per_iteration": 2.48230242729187 }, { "auxiliary_loss_clip": 0.01106728, "auxiliary_loss_mlp": 0.01031812, "balance_loss_clip": 1.01933694, "balance_loss_mlp": 1.03795457, "epoch": 0.7206373064782805, "flos": 27703250565120.0, "grad_norm": 2.042647566107597, "language_loss": 0.62763321, "learning_rate": 7.641876823032977e-07, "loss": 0.64901853, "num_input_tokens_seen": 258644305, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 11986, "time_per_iteration": 2.519577741622925 }, { "auxiliary_loss_clip": 0.01109068, "auxiliary_loss_mlp": 0.01034811, "balance_loss_clip": 1.02070856, "balance_loss_mlp": 1.03831887, "epoch": 0.7206974297309484, "flos": 17968156018560.0, "grad_norm": 1.6346142056023387, "language_loss": 0.72703874, "learning_rate": 7.638814907669455e-07, "loss": 0.74847758, "num_input_tokens_seen": 258661775, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.70703125, "step": 11987, "time_per_iteration": 2.4985320568084717 }, { "auxiliary_loss_clip": 0.01109823, "auxiliary_loss_mlp": 0.01034454, "balance_loss_clip": 1.02078664, "balance_loss_mlp": 1.03763235, "epoch": 0.7207575529836164, "flos": 16983162288000.0, "grad_norm": 1.8890005447967564, "language_loss": 0.78606725, "learning_rate": 7.635753461043301e-07, "loss": 0.80751002, "num_input_tokens_seen": 258679830, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 11988, "time_per_iteration": 2.4576003551483154 }, { "auxiliary_loss_clip": 0.01105562, "auxiliary_loss_mlp": 0.01032361, "balance_loss_clip": 1.01978469, "balance_loss_mlp": 1.03590155, "epoch": 0.7208176762362843, "flos": 18727594295040.0, "grad_norm": 1.7376259200692559, "language_loss": 0.78680301, "learning_rate": 7.632692483270618e-07, "loss": 0.80818224, "num_input_tokens_seen": 258697415, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 11989, "time_per_iteration": 2.4987246990203857 }, { "auxiliary_loss_clip": 0.01104504, "auxiliary_loss_mlp": 0.01033457, "balance_loss_clip": 1.02064204, "balance_loss_mlp": 1.03620541, "epoch": 0.7208777994889524, "flos": 18734489706240.0, "grad_norm": 1.8106252774827893, "language_loss": 0.82576287, "learning_rate": 7.629631974467481e-07, "loss": 0.84714246, "num_input_tokens_seen": 258716755, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.68359375, "step": 11990, "time_per_iteration": 2.5017545223236084 }, { "auxiliary_loss_clip": 0.01106551, "auxiliary_loss_mlp": 0.01038692, "balance_loss_clip": 1.02628827, "balance_loss_mlp": 1.03705955, "epoch": 0.7209379227416203, "flos": 14793437376000.0, "grad_norm": 1.9267020458271094, "language_loss": 0.76536584, "learning_rate": 7.626571934749931e-07, "loss": 0.78681827, "num_input_tokens_seen": 258733270, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 11991, "time_per_iteration": 2.4639673233032227 }, { "auxiliary_loss_clip": 0.011034, "auxiliary_loss_mlp": 0.01028178, "balance_loss_clip": 1.01553595, "balance_loss_mlp": 1.03602529, "epoch": 0.7209980459942883, "flos": 29636860527360.0, "grad_norm": 1.501935495453617, "language_loss": 0.72771871, "learning_rate": 7.623512364234022e-07, "loss": 0.7490344, "num_input_tokens_seen": 258755270, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.67578125, "step": 11992, "time_per_iteration": 2.5490517616271973 }, { "auxiliary_loss_clip": 0.01106668, "auxiliary_loss_mlp": 0.01031551, "balance_loss_clip": 1.01873612, "balance_loss_mlp": 1.03580666, "epoch": 0.7210581692469563, "flos": 23477175815040.0, "grad_norm": 1.513260658608558, "language_loss": 0.66579986, "learning_rate": 7.620453263035755e-07, "loss": 0.68718207, "num_input_tokens_seen": 258775340, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 11993, "time_per_iteration": 2.525006055831909 }, { "auxiliary_loss_clip": 0.01104927, "auxiliary_loss_mlp": 0.01031723, "balance_loss_clip": 1.01950967, "balance_loss_mlp": 1.03488481, "epoch": 0.7211182924996242, "flos": 26099839353600.0, "grad_norm": 1.915479693486247, "language_loss": 0.65578282, "learning_rate": 7.61739463127115e-07, "loss": 0.6771493, "num_input_tokens_seen": 258794580, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 11994, "time_per_iteration": 2.528613805770874 }, { "auxiliary_loss_clip": 0.01108441, "auxiliary_loss_mlp": 0.01034681, "balance_loss_clip": 1.02083528, "balance_loss_mlp": 1.03770113, "epoch": 0.7211784157522922, "flos": 17712076982400.0, "grad_norm": 2.223386598309055, "language_loss": 0.67071903, "learning_rate": 7.614336469056172e-07, "loss": 0.69215029, "num_input_tokens_seen": 258812330, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.70703125, "step": 11995, "time_per_iteration": 2.543224811553955 }, { "auxiliary_loss_clip": 0.01105558, "auxiliary_loss_mlp": 0.01029601, "balance_loss_clip": 1.01602864, "balance_loss_mlp": 1.03769708, "epoch": 0.7212385390049602, "flos": 24423637230720.0, "grad_norm": 1.7725567421594304, "language_loss": 0.79314196, "learning_rate": 7.6112787765068e-07, "loss": 0.81449354, "num_input_tokens_seen": 258831770, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.6796875, "step": 11996, "time_per_iteration": 2.516842842102051 }, { "auxiliary_loss_clip": 0.01107843, "auxiliary_loss_mlp": 0.01033355, "balance_loss_clip": 1.02108848, "balance_loss_mlp": 1.0378623, "epoch": 0.7212986622576282, "flos": 28147250580480.0, "grad_norm": 2.400258394344971, "language_loss": 0.81615096, "learning_rate": 7.60822155373899e-07, "loss": 0.83756292, "num_input_tokens_seen": 258849090, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69921875, "step": 11997, "time_per_iteration": 2.5495426654815674 }, { "auxiliary_loss_clip": 0.01107864, "auxiliary_loss_mlp": 0.01035396, "balance_loss_clip": 1.02193153, "balance_loss_mlp": 1.03664875, "epoch": 0.7213587855102961, "flos": 21835770992640.0, "grad_norm": 2.6887661826210603, "language_loss": 0.66847825, "learning_rate": 7.605164800868646e-07, "loss": 0.68991089, "num_input_tokens_seen": 258868230, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 11998, "time_per_iteration": 2.500276803970337 }, { "auxiliary_loss_clip": 0.01108037, "auxiliary_loss_mlp": 0.01030136, "balance_loss_clip": 1.01894867, "balance_loss_mlp": 1.03840983, "epoch": 0.7214189087629641, "flos": 14611549881600.0, "grad_norm": 2.0100913601701706, "language_loss": 0.72338206, "learning_rate": 7.602108518011696e-07, "loss": 0.74476373, "num_input_tokens_seen": 258885525, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6953125, "step": 11999, "time_per_iteration": 2.4703285694122314 }, { "auxiliary_loss_clip": 0.01107947, "auxiliary_loss_mlp": 0.0102791, "balance_loss_clip": 1.01473713, "balance_loss_mlp": 1.03720331, "epoch": 0.721479032015632, "flos": 19390864884480.0, "grad_norm": 2.339098970477631, "language_loss": 0.82995951, "learning_rate": 7.599052705284039e-07, "loss": 0.85131806, "num_input_tokens_seen": 258903245, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 12000, "time_per_iteration": 2.453065872192383 }, { "auxiliary_loss_clip": 0.01108686, "auxiliary_loss_mlp": 0.01032945, "balance_loss_clip": 1.01994491, "balance_loss_mlp": 1.03823912, "epoch": 0.7215391552683, "flos": 18512884748160.0, "grad_norm": 2.053390539233443, "language_loss": 0.77144039, "learning_rate": 7.59599736280154e-07, "loss": 0.79285669, "num_input_tokens_seen": 258921245, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 12001, "time_per_iteration": 2.4737300872802734 }, { "auxiliary_loss_clip": 0.01107576, "auxiliary_loss_mlp": 0.01037611, "balance_loss_clip": 1.0249567, "balance_loss_mlp": 1.0393517, "epoch": 0.721599278520968, "flos": 23258731253760.0, "grad_norm": 1.8458298676524536, "language_loss": 0.81646276, "learning_rate": 7.592942490680066e-07, "loss": 0.83791471, "num_input_tokens_seen": 258939425, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 12002, "time_per_iteration": 2.487081289291382 }, { "auxiliary_loss_clip": 0.01109018, "auxiliary_loss_mlp": 0.01029138, "balance_loss_clip": 1.0159831, "balance_loss_mlp": 1.03784323, "epoch": 0.721659401773636, "flos": 39199045979520.0, "grad_norm": 2.486496068229724, "language_loss": 0.62448025, "learning_rate": 7.589888089035462e-07, "loss": 0.6458618, "num_input_tokens_seen": 258960710, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 12003, "time_per_iteration": 4.102598428726196 }, { "auxiliary_loss_clip": 0.0110588, "auxiliary_loss_mlp": 0.01030863, "balance_loss_clip": 1.01736283, "balance_loss_mlp": 1.03571129, "epoch": 0.7217195250263039, "flos": 14939917038720.0, "grad_norm": 2.998181191040353, "language_loss": 0.68895519, "learning_rate": 7.586834157983544e-07, "loss": 0.71032262, "num_input_tokens_seen": 258978475, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 12004, "time_per_iteration": 3.9504992961883545 }, { "auxiliary_loss_clip": 0.01030714, "auxiliary_loss_mlp": 0.01000134, "balance_loss_clip": 0.99902487, "balance_loss_mlp": 1.00786817, "epoch": 0.7217796482789719, "flos": 70869206666880.0, "grad_norm": 0.8619321816713523, "language_loss": 0.54154968, "learning_rate": 7.583780697640112e-07, "loss": 0.56185818, "num_input_tokens_seen": 259037520, "router_z_loss_clip": 0.0111084, "router_z_loss_mlp": 0.22851562, "step": 12005, "time_per_iteration": 3.0428075790405273 }, { "auxiliary_loss_clip": 0.01107049, "auxiliary_loss_mlp": 0.01031649, "balance_loss_clip": 1.01812506, "balance_loss_mlp": 1.03727818, "epoch": 0.7218397715316398, "flos": 37451525402880.0, "grad_norm": 1.5416377747643188, "language_loss": 0.63208592, "learning_rate": 7.580727708120962e-07, "loss": 0.6534729, "num_input_tokens_seen": 259061325, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.69921875, "step": 12006, "time_per_iteration": 2.666722536087036 }, { "auxiliary_loss_clip": 0.011062, "auxiliary_loss_mlp": 0.01033335, "balance_loss_clip": 1.02092528, "balance_loss_mlp": 1.03635311, "epoch": 0.7218998947843078, "flos": 22710662559360.0, "grad_norm": 2.681361972024586, "language_loss": 0.91546202, "learning_rate": 7.577675189541865e-07, "loss": 0.93685746, "num_input_tokens_seen": 259078135, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 12007, "time_per_iteration": 3.909451484680176 }, { "auxiliary_loss_clip": 0.01107196, "auxiliary_loss_mlp": 0.01028388, "balance_loss_clip": 1.01435125, "balance_loss_mlp": 1.03558874, "epoch": 0.7219600180369758, "flos": 12167182477440.0, "grad_norm": 2.1650800767253884, "language_loss": 0.63953954, "learning_rate": 7.574623142018568e-07, "loss": 0.66089541, "num_input_tokens_seen": 259095910, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.71484375, "step": 12008, "time_per_iteration": 3.9004766941070557 }, { "auxiliary_loss_clip": 0.01108435, "auxiliary_loss_mlp": 0.01032659, "balance_loss_clip": 1.01939654, "balance_loss_mlp": 1.03700018, "epoch": 0.7220201412896438, "flos": 22596573985920.0, "grad_norm": 2.09474100076495, "language_loss": 0.7893821, "learning_rate": 7.57157156566681e-07, "loss": 0.81079304, "num_input_tokens_seen": 259114225, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 12009, "time_per_iteration": 2.487116813659668 }, { "auxiliary_loss_clip": 0.01108398, "auxiliary_loss_mlp": 0.0103943, "balance_loss_clip": 1.02479696, "balance_loss_mlp": 1.03633499, "epoch": 0.7220802645423118, "flos": 26718651884160.0, "grad_norm": 4.405932986479901, "language_loss": 0.63882589, "learning_rate": 7.568520460602297e-07, "loss": 0.66030419, "num_input_tokens_seen": 259134660, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.71875, "step": 12010, "time_per_iteration": 2.542196035385132 }, { "auxiliary_loss_clip": 0.01107416, "auxiliary_loss_mlp": 0.01030407, "balance_loss_clip": 1.01769876, "balance_loss_mlp": 1.03678274, "epoch": 0.7221403877949797, "flos": 24420548661120.0, "grad_norm": 2.4755332370224656, "language_loss": 0.77425838, "learning_rate": 7.565469826940742e-07, "loss": 0.79563659, "num_input_tokens_seen": 259153300, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 12011, "time_per_iteration": 2.5101232528686523 }, { "auxiliary_loss_clip": 0.01106161, "auxiliary_loss_mlp": 0.01030525, "balance_loss_clip": 1.01839519, "balance_loss_mlp": 1.03743482, "epoch": 0.7222005110476477, "flos": 23514379326720.0, "grad_norm": 1.690273507569459, "language_loss": 0.79456604, "learning_rate": 7.56241966479781e-07, "loss": 0.81593287, "num_input_tokens_seen": 259172115, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 12012, "time_per_iteration": 2.5223910808563232 }, { "auxiliary_loss_clip": 0.01108675, "auxiliary_loss_mlp": 0.01029487, "balance_loss_clip": 1.01695228, "balance_loss_mlp": 1.03809428, "epoch": 0.7222606343003156, "flos": 23112538899840.0, "grad_norm": 2.4329207015542917, "language_loss": 0.75611585, "learning_rate": 7.559369974289171e-07, "loss": 0.77749747, "num_input_tokens_seen": 259191345, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 12013, "time_per_iteration": 2.5109095573425293 }, { "auxiliary_loss_clip": 0.01105772, "auxiliary_loss_mlp": 0.01025663, "balance_loss_clip": 1.01387358, "balance_loss_mlp": 1.03709459, "epoch": 0.7223207575529836, "flos": 24351169541760.0, "grad_norm": 1.6914495342386249, "language_loss": 0.75969058, "learning_rate": 7.556320755530484e-07, "loss": 0.78100491, "num_input_tokens_seen": 259211700, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 12014, "time_per_iteration": 2.5330398082733154 }, { "auxiliary_loss_clip": 0.01107557, "auxiliary_loss_mlp": 0.0103212, "balance_loss_clip": 1.01939988, "balance_loss_mlp": 1.03693521, "epoch": 0.7223808808056515, "flos": 28330179569280.0, "grad_norm": 1.5863172158381864, "language_loss": 0.86275727, "learning_rate": 7.553272008637346e-07, "loss": 0.88415396, "num_input_tokens_seen": 259233825, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 12015, "time_per_iteration": 2.5781500339508057 }, { "auxiliary_loss_clip": 0.01105944, "auxiliary_loss_mlp": 0.01031757, "balance_loss_clip": 1.01955569, "balance_loss_mlp": 1.03717434, "epoch": 0.7224410040583196, "flos": 21069437304960.0, "grad_norm": 2.0275055960003874, "language_loss": 0.7845993, "learning_rate": 7.55022373372538e-07, "loss": 0.80597627, "num_input_tokens_seen": 259253055, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 12016, "time_per_iteration": 2.5176243782043457 }, { "auxiliary_loss_clip": 0.01104614, "auxiliary_loss_mlp": 0.01031731, "balance_loss_clip": 1.01920259, "balance_loss_mlp": 1.03638613, "epoch": 0.7225011273109875, "flos": 26795429205120.0, "grad_norm": 1.4967369906994672, "language_loss": 0.77636409, "learning_rate": 7.547175930910186e-07, "loss": 0.79772758, "num_input_tokens_seen": 259273420, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 12017, "time_per_iteration": 2.573171615600586 }, { "auxiliary_loss_clip": 0.01101768, "auxiliary_loss_mlp": 0.01027133, "balance_loss_clip": 1.01576626, "balance_loss_mlp": 1.03453195, "epoch": 0.7225612505636555, "flos": 23583578878080.0, "grad_norm": 1.7996950982335713, "language_loss": 0.7387256, "learning_rate": 7.54412860030732e-07, "loss": 0.76001465, "num_input_tokens_seen": 259291000, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 12018, "time_per_iteration": 2.5430076122283936 }, { "auxiliary_loss_clip": 0.01105311, "auxiliary_loss_mlp": 0.01032197, "balance_loss_clip": 1.02092624, "balance_loss_mlp": 1.03895509, "epoch": 0.7226213738163234, "flos": 20777627214720.0, "grad_norm": 1.787424851671038, "language_loss": 0.77754688, "learning_rate": 7.541081742032347e-07, "loss": 0.79892194, "num_input_tokens_seen": 259312390, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6640625, "step": 12019, "time_per_iteration": 2.5292396545410156 }, { "auxiliary_loss_clip": 0.01105613, "auxiliary_loss_mlp": 0.01027229, "balance_loss_clip": 1.01482558, "balance_loss_mlp": 1.03639042, "epoch": 0.7226814970689914, "flos": 32635832901120.0, "grad_norm": 1.9441824772770915, "language_loss": 0.73767841, "learning_rate": 7.53803535620081e-07, "loss": 0.7590068, "num_input_tokens_seen": 259332645, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 12020, "time_per_iteration": 2.5887041091918945 }, { "auxiliary_loss_clip": 0.01107864, "auxiliary_loss_mlp": 0.01030122, "balance_loss_clip": 1.01807559, "balance_loss_mlp": 1.03562498, "epoch": 0.7227416203216595, "flos": 22454368041600.0, "grad_norm": 1.8539366113275357, "language_loss": 0.77388227, "learning_rate": 7.534989442928219e-07, "loss": 0.79526216, "num_input_tokens_seen": 259353810, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.72265625, "step": 12021, "time_per_iteration": 2.5063977241516113 }, { "auxiliary_loss_clip": 0.01105577, "auxiliary_loss_mlp": 0.01030361, "balance_loss_clip": 1.01776624, "balance_loss_mlp": 1.03610325, "epoch": 0.7228017435743274, "flos": 21652303299840.0, "grad_norm": 1.7723776999475227, "language_loss": 0.68526155, "learning_rate": 7.531944002330073e-07, "loss": 0.70662093, "num_input_tokens_seen": 259372460, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 12022, "time_per_iteration": 2.5039803981781006 }, { "auxiliary_loss_clip": 0.01105193, "auxiliary_loss_mlp": 0.01028872, "balance_loss_clip": 1.0161289, "balance_loss_mlp": 1.03494215, "epoch": 0.7228618668269954, "flos": 29533474206720.0, "grad_norm": 1.9416633313685188, "language_loss": 0.69775122, "learning_rate": 7.528899034521858e-07, "loss": 0.71909183, "num_input_tokens_seen": 259393275, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 12023, "time_per_iteration": 2.5524444580078125 }, { "auxiliary_loss_clip": 0.01104419, "auxiliary_loss_mlp": 0.01029295, "balance_loss_clip": 1.01669502, "balance_loss_mlp": 1.03500414, "epoch": 0.7229219900796633, "flos": 27453815544960.0, "grad_norm": 2.5393078462310963, "language_loss": 0.71110243, "learning_rate": 7.525854539619052e-07, "loss": 0.73243964, "num_input_tokens_seen": 259416205, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 12024, "time_per_iteration": 2.5522162914276123 }, { "auxiliary_loss_clip": 0.01106707, "auxiliary_loss_mlp": 0.01033334, "balance_loss_clip": 1.02137768, "balance_loss_mlp": 1.03698695, "epoch": 0.7229821133323313, "flos": 16289368116480.0, "grad_norm": 1.7278491362019373, "language_loss": 0.76073563, "learning_rate": 7.522810517737089e-07, "loss": 0.78213608, "num_input_tokens_seen": 259433115, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 12025, "time_per_iteration": 2.4579668045043945 }, { "auxiliary_loss_clip": 0.01104415, "auxiliary_loss_mlp": 0.01030068, "balance_loss_clip": 1.01829028, "balance_loss_mlp": 1.03679824, "epoch": 0.7230422365849992, "flos": 20412343854720.0, "grad_norm": 2.683680799187217, "language_loss": 0.76757824, "learning_rate": 7.519766968991395e-07, "loss": 0.78892303, "num_input_tokens_seen": 259450475, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.67578125, "step": 12026, "time_per_iteration": 2.517854690551758 }, { "auxiliary_loss_clip": 0.01106932, "auxiliary_loss_mlp": 0.0103492, "balance_loss_clip": 1.02288616, "balance_loss_mlp": 1.03629291, "epoch": 0.7231023598376672, "flos": 25593499284480.0, "grad_norm": 2.0498521872931335, "language_loss": 0.67507297, "learning_rate": 7.516723893497388e-07, "loss": 0.69649154, "num_input_tokens_seen": 259469355, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 12027, "time_per_iteration": 2.551147937774658 }, { "auxiliary_loss_clip": 0.01109898, "auxiliary_loss_mlp": 0.0102933, "balance_loss_clip": 1.01639605, "balance_loss_mlp": 1.03806996, "epoch": 0.7231624830903352, "flos": 25149607009920.0, "grad_norm": 2.2584009000411567, "language_loss": 0.7926209, "learning_rate": 7.513681291370469e-07, "loss": 0.81401312, "num_input_tokens_seen": 259486565, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 12028, "time_per_iteration": 2.54373836517334 }, { "auxiliary_loss_clip": 0.01103619, "auxiliary_loss_mlp": 0.01026501, "balance_loss_clip": 1.01326847, "balance_loss_mlp": 1.03411448, "epoch": 0.7232226063430032, "flos": 21725740656000.0, "grad_norm": 1.852284628629995, "language_loss": 0.81987536, "learning_rate": 7.510639162726e-07, "loss": 0.84117651, "num_input_tokens_seen": 259505070, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 12029, "time_per_iteration": 2.5528011322021484 }, { "auxiliary_loss_clip": 0.01030207, "auxiliary_loss_mlp": 0.01001927, "balance_loss_clip": 1.0009253, "balance_loss_mlp": 1.00756431, "epoch": 0.7232827295956711, "flos": 68436798491520.0, "grad_norm": 0.8094871218644042, "language_loss": 0.61811411, "learning_rate": 7.507597507679347e-07, "loss": 0.63843548, "num_input_tokens_seen": 259569135, "router_z_loss_clip": 0.01000977, "router_z_loss_mlp": 0.2265625, "step": 12030, "time_per_iteration": 3.193946361541748 }, { "auxiliary_loss_clip": 0.01103634, "auxiliary_loss_mlp": 0.01030613, "balance_loss_clip": 1.01823258, "balance_loss_mlp": 1.03533602, "epoch": 0.7233428528483391, "flos": 20192642317440.0, "grad_norm": 1.7946340523155278, "language_loss": 0.78072369, "learning_rate": 7.504556326345859e-07, "loss": 0.80206609, "num_input_tokens_seen": 259587035, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 12031, "time_per_iteration": 2.4929656982421875 }, { "auxiliary_loss_clip": 0.01108648, "auxiliary_loss_mlp": 0.01024191, "balance_loss_clip": 1.01184654, "balance_loss_mlp": 1.0369854, "epoch": 0.723402976101007, "flos": 23949472769280.0, "grad_norm": 1.7468387000931656, "language_loss": 0.81527793, "learning_rate": 7.501515618840834e-07, "loss": 0.83660632, "num_input_tokens_seen": 259606140, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71875, "step": 12032, "time_per_iteration": 2.5517313480377197 }, { "auxiliary_loss_clip": 0.01111206, "auxiliary_loss_mlp": 0.01031634, "balance_loss_clip": 1.01831222, "balance_loss_mlp": 1.03786898, "epoch": 0.723463099353675, "flos": 20813394182400.0, "grad_norm": 1.867951505454382, "language_loss": 0.75169182, "learning_rate": 7.498475385279592e-07, "loss": 0.77312028, "num_input_tokens_seen": 259624275, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.734375, "step": 12033, "time_per_iteration": 2.4963183403015137 }, { "auxiliary_loss_clip": 0.01103993, "auxiliary_loss_mlp": 0.01026177, "balance_loss_clip": 1.01456058, "balance_loss_mlp": 1.03563643, "epoch": 0.723523222606343, "flos": 19098013299840.0, "grad_norm": 1.597017954885318, "language_loss": 0.75428379, "learning_rate": 7.495435625777423e-07, "loss": 0.77558553, "num_input_tokens_seen": 259643465, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.68359375, "step": 12034, "time_per_iteration": 2.514477252960205 }, { "auxiliary_loss_clip": 0.01103264, "auxiliary_loss_mlp": 0.01033229, "balance_loss_clip": 1.02191007, "balance_loss_mlp": 1.03457499, "epoch": 0.723583345859011, "flos": 26506994993280.0, "grad_norm": 2.641019893724207, "language_loss": 0.80787343, "learning_rate": 7.492396340449578e-07, "loss": 0.8292383, "num_input_tokens_seen": 259662500, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6875, "step": 12035, "time_per_iteration": 2.5376102924346924 }, { "auxiliary_loss_clip": 0.0110653, "auxiliary_loss_mlp": 0.01029611, "balance_loss_clip": 1.01704085, "balance_loss_mlp": 1.03648019, "epoch": 0.723643469111679, "flos": 16033863697920.0, "grad_norm": 1.7685067494037938, "language_loss": 0.61406267, "learning_rate": 7.489357529411326e-07, "loss": 0.63542408, "num_input_tokens_seen": 259680140, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 12036, "time_per_iteration": 2.469299554824829 }, { "auxiliary_loss_clip": 0.01104081, "auxiliary_loss_mlp": 0.01034396, "balance_loss_clip": 1.0229162, "balance_loss_mlp": 1.03627586, "epoch": 0.7237035923643469, "flos": 21945549934080.0, "grad_norm": 2.685867733604317, "language_loss": 0.68092251, "learning_rate": 7.486319192777883e-07, "loss": 0.70230722, "num_input_tokens_seen": 259700160, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6796875, "step": 12037, "time_per_iteration": 2.4828617572784424 }, { "auxiliary_loss_clip": 0.01106171, "auxiliary_loss_mlp": 0.01035842, "balance_loss_clip": 1.02284205, "balance_loss_mlp": 1.03679669, "epoch": 0.7237637156170149, "flos": 23583112001280.0, "grad_norm": 3.3230907777386354, "language_loss": 0.72645497, "learning_rate": 7.483281330664479e-07, "loss": 0.74787503, "num_input_tokens_seen": 259720525, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 12038, "time_per_iteration": 2.5424866676330566 }, { "auxiliary_loss_clip": 0.0110613, "auxiliary_loss_mlp": 0.01033024, "balance_loss_clip": 1.0192734, "balance_loss_mlp": 1.03619576, "epoch": 0.7238238388696828, "flos": 20594698225920.0, "grad_norm": 1.7882774445228824, "language_loss": 0.72210944, "learning_rate": 7.480243943186293e-07, "loss": 0.74350095, "num_input_tokens_seen": 259738680, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.69921875, "step": 12039, "time_per_iteration": 2.4674713611602783 }, { "auxiliary_loss_clip": 0.0110855, "auxiliary_loss_mlp": 0.01031262, "balance_loss_clip": 1.01895416, "balance_loss_mlp": 1.03700542, "epoch": 0.7238839621223508, "flos": 24207024263040.0, "grad_norm": 1.7028706715586615, "language_loss": 0.76060688, "learning_rate": 7.477207030458513e-07, "loss": 0.78200507, "num_input_tokens_seen": 259758790, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71484375, "step": 12040, "time_per_iteration": 2.5036704540252686 }, { "auxiliary_loss_clip": 0.01105994, "auxiliary_loss_mlp": 0.01032696, "balance_loss_clip": 1.01963639, "balance_loss_mlp": 1.03543901, "epoch": 0.7239440853750188, "flos": 14209745368320.0, "grad_norm": 2.5678034976718815, "language_loss": 0.76332521, "learning_rate": 7.474170592596301e-07, "loss": 0.78471214, "num_input_tokens_seen": 259777370, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 12041, "time_per_iteration": 2.451019287109375 }, { "auxiliary_loss_clip": 0.01106058, "auxiliary_loss_mlp": 0.01031866, "balance_loss_clip": 1.01951027, "balance_loss_mlp": 1.03475916, "epoch": 0.7240042086276868, "flos": 21614812479360.0, "grad_norm": 2.6028770234637912, "language_loss": 0.63888025, "learning_rate": 7.471134629714797e-07, "loss": 0.66025949, "num_input_tokens_seen": 259794665, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 12042, "time_per_iteration": 2.4754483699798584 }, { "auxiliary_loss_clip": 0.01110638, "auxiliary_loss_mlp": 0.01030842, "balance_loss_clip": 1.0174669, "balance_loss_mlp": 1.03895497, "epoch": 0.7240643318803547, "flos": 23331450337920.0, "grad_norm": 3.217971809226368, "language_loss": 0.83232629, "learning_rate": 7.468099141929116e-07, "loss": 0.85374105, "num_input_tokens_seen": 259811110, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 12043, "time_per_iteration": 2.490687608718872 }, { "auxiliary_loss_clip": 0.01108103, "auxiliary_loss_mlp": 0.01030561, "balance_loss_clip": 1.01672697, "balance_loss_mlp": 1.03655934, "epoch": 0.7241244551330227, "flos": 24024849459840.0, "grad_norm": 2.20859242660417, "language_loss": 0.63950163, "learning_rate": 7.465064129354379e-07, "loss": 0.66088831, "num_input_tokens_seen": 259831080, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71484375, "step": 12044, "time_per_iteration": 2.5052459239959717 }, { "auxiliary_loss_clip": 0.01107956, "auxiliary_loss_mlp": 0.01036837, "balance_loss_clip": 1.02326477, "balance_loss_mlp": 1.03806829, "epoch": 0.7241845783856906, "flos": 18730323728640.0, "grad_norm": 2.0772427309419337, "language_loss": 0.81512403, "learning_rate": 7.462029592105658e-07, "loss": 0.83657199, "num_input_tokens_seen": 259850135, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.69921875, "step": 12045, "time_per_iteration": 3.9870800971984863 }, { "auxiliary_loss_clip": 0.01103675, "auxiliary_loss_mlp": 0.01031388, "balance_loss_clip": 1.0188235, "balance_loss_mlp": 1.03585279, "epoch": 0.7242447016383586, "flos": 19498668577920.0, "grad_norm": 3.2045163496413838, "language_loss": 0.71808499, "learning_rate": 7.458995530298034e-07, "loss": 0.73943567, "num_input_tokens_seen": 259868185, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 12046, "time_per_iteration": 3.950014114379883 }, { "auxiliary_loss_clip": 0.01106631, "auxiliary_loss_mlp": 0.0103394, "balance_loss_clip": 1.0201416, "balance_loss_mlp": 1.03579104, "epoch": 0.7243048248910267, "flos": 22163491704960.0, "grad_norm": 2.732110477022488, "language_loss": 0.71251988, "learning_rate": 7.455961944046553e-07, "loss": 0.73392558, "num_input_tokens_seen": 259887055, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.70703125, "step": 12047, "time_per_iteration": 2.46364164352417 }, { "auxiliary_loss_clip": 0.01112225, "auxiliary_loss_mlp": 0.0103366, "balance_loss_clip": 1.02043414, "balance_loss_mlp": 1.03908348, "epoch": 0.7243649481436946, "flos": 27672762896640.0, "grad_norm": 1.616177454221447, "language_loss": 0.70136619, "learning_rate": 7.45292883346627e-07, "loss": 0.72282499, "num_input_tokens_seen": 259908295, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 12048, "time_per_iteration": 2.5033323764801025 }, { "auxiliary_loss_clip": 0.01029788, "auxiliary_loss_mlp": 0.01003958, "balance_loss_clip": 1.00283754, "balance_loss_mlp": 1.00696731, "epoch": 0.7244250713963626, "flos": 63244545759360.0, "grad_norm": 0.8300883756243892, "language_loss": 0.53752607, "learning_rate": 7.449896198672168e-07, "loss": 0.55786353, "num_input_tokens_seen": 259968475, "router_z_loss_clip": 0.01123047, "router_z_loss_mlp": 0.22851562, "step": 12049, "time_per_iteration": 4.468938827514648 }, { "auxiliary_loss_clip": 0.01112961, "auxiliary_loss_mlp": 0.01031465, "balance_loss_clip": 1.01673663, "balance_loss_mlp": 1.03813696, "epoch": 0.7244851946490305, "flos": 17967114524160.0, "grad_norm": 2.9064531539823464, "language_loss": 0.60171568, "learning_rate": 7.446864039779258e-07, "loss": 0.62315995, "num_input_tokens_seen": 259984865, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.75, "step": 12050, "time_per_iteration": 3.8969526290893555 }, { "auxiliary_loss_clip": 0.01029661, "auxiliary_loss_mlp": 0.01000341, "balance_loss_clip": 0.99924463, "balance_loss_mlp": 1.00700045, "epoch": 0.7245453179016985, "flos": 70943649603840.0, "grad_norm": 0.7212909938406563, "language_loss": 0.53306758, "learning_rate": 7.443832356902528e-07, "loss": 0.55336756, "num_input_tokens_seen": 260046735, "router_z_loss_clip": 0.01098633, "router_z_loss_mlp": 0.2265625, "step": 12051, "time_per_iteration": 3.1193151473999023 }, { "auxiliary_loss_clip": 0.01105195, "auxiliary_loss_mlp": 0.01032885, "balance_loss_clip": 1.02100563, "balance_loss_mlp": 1.03615141, "epoch": 0.7246054411543664, "flos": 24568464867840.0, "grad_norm": 1.7605896038756401, "language_loss": 0.72359169, "learning_rate": 7.440801150156927e-07, "loss": 0.74497247, "num_input_tokens_seen": 260067950, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69140625, "step": 12052, "time_per_iteration": 2.535421371459961 }, { "auxiliary_loss_clip": 0.01106467, "auxiliary_loss_mlp": 0.01029986, "balance_loss_clip": 1.01670027, "balance_loss_mlp": 1.03679919, "epoch": 0.7246655644070344, "flos": 32338312548480.0, "grad_norm": 2.1118624063403173, "language_loss": 0.74597561, "learning_rate": 7.437770419657415e-07, "loss": 0.76734012, "num_input_tokens_seen": 260087730, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 12053, "time_per_iteration": 2.5687215328216553 }, { "auxiliary_loss_clip": 0.01107947, "auxiliary_loss_mlp": 0.01032895, "balance_loss_clip": 1.01976454, "balance_loss_mlp": 1.0368526, "epoch": 0.7247256876597024, "flos": 21872471713920.0, "grad_norm": 2.176892791652509, "language_loss": 0.7825321, "learning_rate": 7.434740165518898e-07, "loss": 0.80394053, "num_input_tokens_seen": 260107760, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 12054, "time_per_iteration": 2.504093647003174 }, { "auxiliary_loss_clip": 0.01105974, "auxiliary_loss_mlp": 0.01034188, "balance_loss_clip": 1.02089643, "balance_loss_mlp": 1.03648221, "epoch": 0.7247858109123704, "flos": 16213093585920.0, "grad_norm": 2.725585964813312, "language_loss": 0.68701541, "learning_rate": 7.431710387856301e-07, "loss": 0.70841706, "num_input_tokens_seen": 260123660, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 12055, "time_per_iteration": 2.4469802379608154 }, { "auxiliary_loss_clip": 0.01104415, "auxiliary_loss_mlp": 0.01033457, "balance_loss_clip": 1.02110088, "balance_loss_mlp": 1.03585243, "epoch": 0.7248459341650383, "flos": 20850705434880.0, "grad_norm": 1.8543414615109097, "language_loss": 0.7405051, "learning_rate": 7.428681086784496e-07, "loss": 0.7618838, "num_input_tokens_seen": 260142690, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 12056, "time_per_iteration": 2.4802088737487793 }, { "auxiliary_loss_clip": 0.0110297, "auxiliary_loss_mlp": 0.01027232, "balance_loss_clip": 1.01482821, "balance_loss_mlp": 1.03572083, "epoch": 0.7249060574177063, "flos": 25921794614400.0, "grad_norm": 1.9079511504881033, "language_loss": 0.70850897, "learning_rate": 7.425652262418368e-07, "loss": 0.72981095, "num_input_tokens_seen": 260162590, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.671875, "step": 12057, "time_per_iteration": 2.5198559761047363 }, { "auxiliary_loss_clip": 0.01109532, "auxiliary_loss_mlp": 0.01035904, "balance_loss_clip": 1.0224396, "balance_loss_mlp": 1.03806353, "epoch": 0.7249661806703742, "flos": 17345536646400.0, "grad_norm": 1.8579231906949618, "language_loss": 0.62650031, "learning_rate": 7.42262391487277e-07, "loss": 0.64795464, "num_input_tokens_seen": 260181065, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 12058, "time_per_iteration": 2.443040132522583 }, { "auxiliary_loss_clip": 0.01110655, "auxiliary_loss_mlp": 0.01028843, "balance_loss_clip": 1.01555753, "balance_loss_mlp": 1.03851664, "epoch": 0.7250263039230422, "flos": 19574153009280.0, "grad_norm": 2.229097292657213, "language_loss": 0.74714673, "learning_rate": 7.419596044262535e-07, "loss": 0.76854169, "num_input_tokens_seen": 260200330, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 12059, "time_per_iteration": 2.476848602294922 }, { "auxiliary_loss_clip": 0.01104377, "auxiliary_loss_mlp": 0.01034434, "balance_loss_clip": 1.02253711, "balance_loss_mlp": 1.03664899, "epoch": 0.7250864271757103, "flos": 21976648133760.0, "grad_norm": 1.6703997726780049, "language_loss": 0.78983331, "learning_rate": 7.416568650702472e-07, "loss": 0.81122148, "num_input_tokens_seen": 260219975, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 12060, "time_per_iteration": 2.471362829208374 }, { "auxiliary_loss_clip": 0.0110754, "auxiliary_loss_mlp": 0.01026929, "balance_loss_clip": 1.0133214, "balance_loss_mlp": 1.03679812, "epoch": 0.7251465504283782, "flos": 25012608537600.0, "grad_norm": 2.079502295977585, "language_loss": 0.76678717, "learning_rate": 7.413541734307393e-07, "loss": 0.78813189, "num_input_tokens_seen": 260242025, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.70703125, "step": 12061, "time_per_iteration": 2.535731554031372 }, { "auxiliary_loss_clip": 0.01103952, "auxiliary_loss_mlp": 0.01030537, "balance_loss_clip": 1.01809788, "balance_loss_mlp": 1.03621578, "epoch": 0.7252066736810462, "flos": 16690131135360.0, "grad_norm": 1.8739400304079694, "language_loss": 0.81204796, "learning_rate": 7.410515295192068e-07, "loss": 0.83339286, "num_input_tokens_seen": 260260015, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6796875, "step": 12062, "time_per_iteration": 2.434937000274658 }, { "auxiliary_loss_clip": 0.01113333, "auxiliary_loss_mlp": 0.01030721, "balance_loss_clip": 1.01661837, "balance_loss_mlp": 1.03974962, "epoch": 0.7252667969337141, "flos": 25703026830720.0, "grad_norm": 2.1067095249362033, "language_loss": 0.69260412, "learning_rate": 7.407489333471262e-07, "loss": 0.71404463, "num_input_tokens_seen": 260278635, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.734375, "step": 12063, "time_per_iteration": 2.5212767124176025 }, { "auxiliary_loss_clip": 0.01103873, "auxiliary_loss_mlp": 0.01030919, "balance_loss_clip": 1.01838398, "balance_loss_mlp": 1.03550589, "epoch": 0.7253269201863821, "flos": 18259930195200.0, "grad_norm": 1.5079990918568809, "language_loss": 0.699274, "learning_rate": 7.40446384925973e-07, "loss": 0.72062194, "num_input_tokens_seen": 260298510, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 12064, "time_per_iteration": 2.470219135284424 }, { "auxiliary_loss_clip": 0.01108897, "auxiliary_loss_mlp": 0.01032022, "balance_loss_clip": 1.01892042, "balance_loss_mlp": 1.03887415, "epoch": 0.72538704343905, "flos": 20411805150720.0, "grad_norm": 1.9163653569840735, "language_loss": 0.90673304, "learning_rate": 7.401438842672192e-07, "loss": 0.92814219, "num_input_tokens_seen": 260317405, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 12065, "time_per_iteration": 2.4896934032440186 }, { "auxiliary_loss_clip": 0.01029689, "auxiliary_loss_mlp": 0.00999999, "balance_loss_clip": 0.99888486, "balance_loss_mlp": 1.00701106, "epoch": 0.725447166691718, "flos": 70151209706880.0, "grad_norm": 0.6556851910265971, "language_loss": 0.56040776, "learning_rate": 7.398414313823349e-07, "loss": 0.58070457, "num_input_tokens_seen": 260388085, "router_z_loss_clip": 0.01116943, "router_z_loss_mlp": 0.2265625, "step": 12066, "time_per_iteration": 3.243098258972168 }, { "auxiliary_loss_clip": 0.01105865, "auxiliary_loss_mlp": 0.01032899, "balance_loss_clip": 1.02031672, "balance_loss_mlp": 1.03589976, "epoch": 0.725507289944386, "flos": 27052334254080.0, "grad_norm": 3.022905951277817, "language_loss": 0.76615405, "learning_rate": 7.395390262827897e-07, "loss": 0.78754169, "num_input_tokens_seen": 260406165, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 12067, "time_per_iteration": 2.5025229454040527 }, { "auxiliary_loss_clip": 0.01030068, "auxiliary_loss_mlp": 0.01000815, "balance_loss_clip": 0.99980813, "balance_loss_mlp": 1.00731349, "epoch": 0.725567413197054, "flos": 62921924778240.0, "grad_norm": 0.7373602672227728, "language_loss": 0.57094604, "learning_rate": 7.392366689800515e-07, "loss": 0.59125489, "num_input_tokens_seen": 260461365, "router_z_loss_clip": 0.0100708, "router_z_loss_mlp": 0.22851562, "step": 12068, "time_per_iteration": 3.017747640609741 }, { "auxiliary_loss_clip": 0.01029474, "auxiliary_loss_mlp": 0.01000732, "balance_loss_clip": 0.99973059, "balance_loss_mlp": 1.00682831, "epoch": 0.7256275364497219, "flos": 60295957188480.0, "grad_norm": 0.7102002358066243, "language_loss": 0.55580783, "learning_rate": 7.389343594855848e-07, "loss": 0.57610989, "num_input_tokens_seen": 260523795, "router_z_loss_clip": 0.01000977, "router_z_loss_mlp": 0.2265625, "step": 12069, "time_per_iteration": 3.111978769302368 }, { "auxiliary_loss_clip": 0.01103304, "auxiliary_loss_mlp": 0.0102847, "balance_loss_clip": 1.01715136, "balance_loss_mlp": 1.03651488, "epoch": 0.7256876597023899, "flos": 24498511130880.0, "grad_norm": 1.8075824942114058, "language_loss": 0.79717094, "learning_rate": 7.38632097810854e-07, "loss": 0.81848872, "num_input_tokens_seen": 260544765, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66796875, "step": 12070, "time_per_iteration": 2.510000228881836 }, { "auxiliary_loss_clip": 0.01103666, "auxiliary_loss_mlp": 0.01036673, "balance_loss_clip": 1.02482963, "balance_loss_mlp": 1.03751898, "epoch": 0.7257477829550578, "flos": 24352749740160.0, "grad_norm": 1.7649645330647536, "language_loss": 0.71838391, "learning_rate": 7.383298839673197e-07, "loss": 0.73978734, "num_input_tokens_seen": 260564340, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.66015625, "step": 12071, "time_per_iteration": 2.4940547943115234 }, { "auxiliary_loss_clip": 0.01106138, "auxiliary_loss_mlp": 0.01035872, "balance_loss_clip": 1.02387989, "balance_loss_mlp": 1.0375545, "epoch": 0.7258079062077258, "flos": 17202217380480.0, "grad_norm": 1.7854931133949175, "language_loss": 0.70041645, "learning_rate": 7.380277179664436e-07, "loss": 0.72183651, "num_input_tokens_seen": 260582565, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 12072, "time_per_iteration": 2.4634506702423096 }, { "auxiliary_loss_clip": 0.0110895, "auxiliary_loss_mlp": 0.01033998, "balance_loss_clip": 1.02068233, "balance_loss_mlp": 1.03649616, "epoch": 0.7258680294603939, "flos": 21580338401280.0, "grad_norm": 2.069374695954411, "language_loss": 0.78659731, "learning_rate": 7.377255998196821e-07, "loss": 0.80802679, "num_input_tokens_seen": 260601700, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 12073, "time_per_iteration": 2.452040672302246 }, { "auxiliary_loss_clip": 0.01105235, "auxiliary_loss_mlp": 0.01032841, "balance_loss_clip": 1.02024698, "balance_loss_mlp": 1.03700328, "epoch": 0.7259281527130618, "flos": 34855399036800.0, "grad_norm": 1.4000293510162423, "language_loss": 0.70318109, "learning_rate": 7.374235295384923e-07, "loss": 0.72456187, "num_input_tokens_seen": 260623040, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 12074, "time_per_iteration": 2.6042964458465576 }, { "auxiliary_loss_clip": 0.01108143, "auxiliary_loss_mlp": 0.01029705, "balance_loss_clip": 1.01649642, "balance_loss_mlp": 1.03708172, "epoch": 0.7259882759657298, "flos": 25404644551680.0, "grad_norm": 1.9264704266180763, "language_loss": 0.74012363, "learning_rate": 7.371215071343302e-07, "loss": 0.76150203, "num_input_tokens_seen": 260642735, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 12075, "time_per_iteration": 2.490316390991211 }, { "auxiliary_loss_clip": 0.01108074, "auxiliary_loss_mlp": 0.01034637, "balance_loss_clip": 1.02116096, "balance_loss_mlp": 1.03729367, "epoch": 0.7260483992183977, "flos": 62953630531200.0, "grad_norm": 1.6320046325290962, "language_loss": 0.63856483, "learning_rate": 7.368195326186458e-07, "loss": 0.65999198, "num_input_tokens_seen": 260669935, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.70703125, "step": 12076, "time_per_iteration": 2.853017807006836 }, { "auxiliary_loss_clip": 0.0110729, "auxiliary_loss_mlp": 0.0103051, "balance_loss_clip": 1.01745069, "balance_loss_mlp": 1.03639436, "epoch": 0.7261085224710657, "flos": 26467528924800.0, "grad_norm": 1.750485638330853, "language_loss": 0.78666061, "learning_rate": 7.365176060028912e-07, "loss": 0.80803859, "num_input_tokens_seen": 260689605, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 12077, "time_per_iteration": 2.4884724617004395 }, { "auxiliary_loss_clip": 0.01029389, "auxiliary_loss_mlp": 0.01002676, "balance_loss_clip": 1.00156772, "balance_loss_mlp": 1.00676024, "epoch": 0.7261686457237336, "flos": 66772732187520.0, "grad_norm": 0.8824241147097829, "language_loss": 0.64967549, "learning_rate": 7.362157272985163e-07, "loss": 0.66999614, "num_input_tokens_seen": 260748265, "router_z_loss_clip": 0.0111084, "router_z_loss_mlp": 0.2265625, "step": 12078, "time_per_iteration": 3.133920431137085 }, { "auxiliary_loss_clip": 0.01029146, "auxiliary_loss_mlp": 0.01000951, "balance_loss_clip": 0.99983644, "balance_loss_mlp": 1.00643086, "epoch": 0.7262287689764017, "flos": 69999594399360.0, "grad_norm": 0.7161849981329365, "language_loss": 0.59300625, "learning_rate": 7.359138965169671e-07, "loss": 0.61330724, "num_input_tokens_seen": 260816715, "router_z_loss_clip": 0.01116943, "router_z_loss_mlp": 0.22753906, "step": 12079, "time_per_iteration": 3.2026281356811523 }, { "auxiliary_loss_clip": 0.01105407, "auxiliary_loss_mlp": 0.01033412, "balance_loss_clip": 1.02007842, "balance_loss_mlp": 1.03591943, "epoch": 0.7262888922290696, "flos": 23805435231360.0, "grad_norm": 2.1294474720648098, "language_loss": 0.64656413, "learning_rate": 7.356121136696895e-07, "loss": 0.6679523, "num_input_tokens_seen": 260836765, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 12080, "time_per_iteration": 2.5078766345977783 }, { "auxiliary_loss_clip": 0.01106966, "auxiliary_loss_mlp": 0.01029383, "balance_loss_clip": 1.01569819, "balance_loss_mlp": 1.03577137, "epoch": 0.7263490154817376, "flos": 19500320603520.0, "grad_norm": 2.2526785378147514, "language_loss": 0.70621979, "learning_rate": 7.35310378768128e-07, "loss": 0.72758329, "num_input_tokens_seen": 260854610, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7109375, "step": 12081, "time_per_iteration": 2.4286508560180664 }, { "auxiliary_loss_clip": 0.01111596, "auxiliary_loss_mlp": 0.01028625, "balance_loss_clip": 1.01584542, "balance_loss_mlp": 1.03953195, "epoch": 0.7264091387344055, "flos": 16286243633280.0, "grad_norm": 1.8595164664703518, "language_loss": 0.81232744, "learning_rate": 7.350086918237237e-07, "loss": 0.83372962, "num_input_tokens_seen": 260871620, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 12082, "time_per_iteration": 2.4653892517089844 }, { "auxiliary_loss_clip": 0.01112217, "auxiliary_loss_mlp": 0.01037957, "balance_loss_clip": 1.02356791, "balance_loss_mlp": 1.03722727, "epoch": 0.7264692619870735, "flos": 24352031468160.0, "grad_norm": 2.058763907594417, "language_loss": 0.77569646, "learning_rate": 7.347070528479158e-07, "loss": 0.79719818, "num_input_tokens_seen": 260890490, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.75, "step": 12083, "time_per_iteration": 2.4795522689819336 }, { "auxiliary_loss_clip": 0.01111308, "auxiliary_loss_mlp": 0.01031324, "balance_loss_clip": 1.01805615, "balance_loss_mlp": 1.03977156, "epoch": 0.7265293852397414, "flos": 25119478477440.0, "grad_norm": 1.9689790343997977, "language_loss": 0.73035419, "learning_rate": 7.344054618521433e-07, "loss": 0.75178051, "num_input_tokens_seen": 260909700, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 12084, "time_per_iteration": 2.535390853881836 }, { "auxiliary_loss_clip": 0.01110402, "auxiliary_loss_mlp": 0.01033384, "balance_loss_clip": 1.02025306, "balance_loss_mlp": 1.03809154, "epoch": 0.7265895084924094, "flos": 22638230784000.0, "grad_norm": 1.787848325790159, "language_loss": 0.77828157, "learning_rate": 7.34103918847843e-07, "loss": 0.79971945, "num_input_tokens_seen": 260929090, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 12085, "time_per_iteration": 2.464879035949707 }, { "auxiliary_loss_clip": 0.01107596, "auxiliary_loss_mlp": 0.01035739, "balance_loss_clip": 1.02295399, "balance_loss_mlp": 1.03609729, "epoch": 0.7266496317450775, "flos": 23368222886400.0, "grad_norm": 1.675137738387422, "language_loss": 0.724545, "learning_rate": 7.338024238464493e-07, "loss": 0.74597842, "num_input_tokens_seen": 260946615, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 12086, "time_per_iteration": 2.4866726398468018 }, { "auxiliary_loss_clip": 0.01108213, "auxiliary_loss_mlp": 0.01037003, "balance_loss_clip": 1.02386069, "balance_loss_mlp": 1.03849459, "epoch": 0.7267097549977454, "flos": 28074603323520.0, "grad_norm": 2.0395907084515743, "language_loss": 0.69438756, "learning_rate": 7.335009768593938e-07, "loss": 0.71583974, "num_input_tokens_seen": 260968515, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 12087, "time_per_iteration": 3.927910804748535 }, { "auxiliary_loss_clip": 0.01110172, "auxiliary_loss_mlp": 0.01036509, "balance_loss_clip": 1.02282929, "balance_loss_mlp": 1.03776824, "epoch": 0.7267698782504134, "flos": 22195523658240.0, "grad_norm": 2.063125723479397, "language_loss": 0.79183096, "learning_rate": 7.331995778981088e-07, "loss": 0.81329781, "num_input_tokens_seen": 260986790, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 12088, "time_per_iteration": 3.9902901649475098 }, { "auxiliary_loss_clip": 0.0110766, "auxiliary_loss_mlp": 0.01037422, "balance_loss_clip": 1.0245235, "balance_loss_mlp": 1.03624845, "epoch": 0.7268300015030813, "flos": 18514859996160.0, "grad_norm": 1.884570814912518, "language_loss": 0.74232763, "learning_rate": 7.328982269740221e-07, "loss": 0.76377845, "num_input_tokens_seen": 261004925, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 12089, "time_per_iteration": 2.462573289871216 }, { "auxiliary_loss_clip": 0.01108381, "auxiliary_loss_mlp": 0.01033996, "balance_loss_clip": 1.02102637, "balance_loss_mlp": 1.03753746, "epoch": 0.7268901247557493, "flos": 23986029836160.0, "grad_norm": 1.9491407057372088, "language_loss": 0.71407521, "learning_rate": 7.325969240985616e-07, "loss": 0.73549896, "num_input_tokens_seen": 261023895, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 12090, "time_per_iteration": 3.939939498901367 }, { "auxiliary_loss_clip": 0.01109511, "auxiliary_loss_mlp": 0.01030795, "balance_loss_clip": 1.01702046, "balance_loss_mlp": 1.03743267, "epoch": 0.7269502480084172, "flos": 32088087429120.0, "grad_norm": 10.261431667681826, "language_loss": 0.77092057, "learning_rate": 7.322956692831528e-07, "loss": 0.79232365, "num_input_tokens_seen": 261045445, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71875, "step": 12091, "time_per_iteration": 4.0137412548065186 }, { "auxiliary_loss_clip": 0.01105504, "auxiliary_loss_mlp": 0.01032361, "balance_loss_clip": 1.01922405, "balance_loss_mlp": 1.03479981, "epoch": 0.7270103712610853, "flos": 19062785036160.0, "grad_norm": 1.7834885083426564, "language_loss": 0.71055186, "learning_rate": 7.319944625392205e-07, "loss": 0.73193055, "num_input_tokens_seen": 261064275, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 12092, "time_per_iteration": 2.441408634185791 }, { "auxiliary_loss_clip": 0.01107112, "auxiliary_loss_mlp": 0.01028445, "balance_loss_clip": 1.01529658, "balance_loss_mlp": 1.03764486, "epoch": 0.7270704945137532, "flos": 34532921710080.0, "grad_norm": 2.062413612877205, "language_loss": 0.61333442, "learning_rate": 7.31693303878184e-07, "loss": 0.63468999, "num_input_tokens_seen": 261083310, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 12093, "time_per_iteration": 2.5777032375335693 }, { "auxiliary_loss_clip": 0.01106458, "auxiliary_loss_mlp": 0.01034044, "balance_loss_clip": 1.0210861, "balance_loss_mlp": 1.03798378, "epoch": 0.7271306177664212, "flos": 21507583403520.0, "grad_norm": 1.6094954851117755, "language_loss": 0.75592637, "learning_rate": 7.313921933114644e-07, "loss": 0.77733141, "num_input_tokens_seen": 261103460, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.68359375, "step": 12094, "time_per_iteration": 2.4614510536193848 }, { "auxiliary_loss_clip": 0.01104374, "auxiliary_loss_mlp": 0.01031155, "balance_loss_clip": 1.01907873, "balance_loss_mlp": 1.03551579, "epoch": 0.7271907410190891, "flos": 22272444633600.0, "grad_norm": 1.9141082282201338, "language_loss": 0.85167193, "learning_rate": 7.310911308504808e-07, "loss": 0.87302727, "num_input_tokens_seen": 261121375, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 12095, "time_per_iteration": 2.456263542175293 }, { "auxiliary_loss_clip": 0.0110604, "auxiliary_loss_mlp": 0.01037082, "balance_loss_clip": 1.02318835, "balance_loss_mlp": 1.03580642, "epoch": 0.7272508642717571, "flos": 22893124671360.0, "grad_norm": 2.97771977359353, "language_loss": 0.77848196, "learning_rate": 7.307901165066479e-07, "loss": 0.79991317, "num_input_tokens_seen": 261141105, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.703125, "step": 12096, "time_per_iteration": 2.4588100910186768 }, { "auxiliary_loss_clip": 0.01108739, "auxiliary_loss_mlp": 0.01034186, "balance_loss_clip": 1.02167511, "balance_loss_mlp": 1.03932476, "epoch": 0.727310987524425, "flos": 11655886331520.0, "grad_norm": 1.906534978183989, "language_loss": 0.72259068, "learning_rate": 7.30489150291381e-07, "loss": 0.74401987, "num_input_tokens_seen": 261159255, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 12097, "time_per_iteration": 2.4614346027374268 }, { "auxiliary_loss_clip": 0.0110964, "auxiliary_loss_mlp": 0.01035566, "balance_loss_clip": 1.02180386, "balance_loss_mlp": 1.03867793, "epoch": 0.727371110777093, "flos": 24535319592960.0, "grad_norm": 1.7934336509857145, "language_loss": 0.77006865, "learning_rate": 7.301882322160935e-07, "loss": 0.79152071, "num_input_tokens_seen": 261177960, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7109375, "step": 12098, "time_per_iteration": 2.4896578788757324 }, { "auxiliary_loss_clip": 0.01110218, "auxiliary_loss_mlp": 0.01032296, "balance_loss_clip": 1.01861703, "balance_loss_mlp": 1.03676105, "epoch": 0.7274312340297611, "flos": 74739835405440.0, "grad_norm": 1.6472884965812375, "language_loss": 0.67621028, "learning_rate": 7.298873622921952e-07, "loss": 0.69763541, "num_input_tokens_seen": 261205660, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 12099, "time_per_iteration": 2.8768861293792725 }, { "auxiliary_loss_clip": 0.01111206, "auxiliary_loss_mlp": 0.01035139, "balance_loss_clip": 1.02084589, "balance_loss_mlp": 1.03640091, "epoch": 0.727491357282429, "flos": 22342865247360.0, "grad_norm": 1.5605037317689447, "language_loss": 0.72372562, "learning_rate": 7.29586540531095e-07, "loss": 0.74518907, "num_input_tokens_seen": 261225185, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75, "step": 12100, "time_per_iteration": 2.4554929733276367 }, { "auxiliary_loss_clip": 0.01107771, "auxiliary_loss_mlp": 0.01036217, "balance_loss_clip": 1.02376509, "balance_loss_mlp": 1.03780043, "epoch": 0.727551480535097, "flos": 23297550877440.0, "grad_norm": 1.5622961841430125, "language_loss": 0.7518599, "learning_rate": 7.292857669442005e-07, "loss": 0.77329975, "num_input_tokens_seen": 261247965, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 12101, "time_per_iteration": 2.542534589767456 }, { "auxiliary_loss_clip": 0.01107918, "auxiliary_loss_mlp": 0.01029144, "balance_loss_clip": 1.01733613, "balance_loss_mlp": 1.03826451, "epoch": 0.7276116037877649, "flos": 21470559459840.0, "grad_norm": 1.899236682068289, "language_loss": 0.82354367, "learning_rate": 7.289850415429177e-07, "loss": 0.84491432, "num_input_tokens_seen": 261267585, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 12102, "time_per_iteration": 2.4720077514648438 }, { "auxiliary_loss_clip": 0.01107022, "auxiliary_loss_mlp": 0.01035304, "balance_loss_clip": 1.02294815, "balance_loss_mlp": 1.03735566, "epoch": 0.7276717270404329, "flos": 21464059098240.0, "grad_norm": 2.1764820817101675, "language_loss": 0.81875241, "learning_rate": 7.286843643386495e-07, "loss": 0.84017563, "num_input_tokens_seen": 261285200, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 12103, "time_per_iteration": 2.486177444458008 }, { "auxiliary_loss_clip": 0.01106797, "auxiliary_loss_mlp": 0.01027529, "balance_loss_clip": 1.01378381, "balance_loss_mlp": 1.0364871, "epoch": 0.7277318502931008, "flos": 16837221329280.0, "grad_norm": 2.243779589756018, "language_loss": 0.66301906, "learning_rate": 7.283837353427968e-07, "loss": 0.68436235, "num_input_tokens_seen": 261303645, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.703125, "step": 12104, "time_per_iteration": 2.4497427940368652 }, { "auxiliary_loss_clip": 0.01104498, "auxiliary_loss_mlp": 0.01029587, "balance_loss_clip": 1.01692736, "balance_loss_mlp": 1.03708684, "epoch": 0.7277919735457689, "flos": 33400550476800.0, "grad_norm": 1.7943383046017416, "language_loss": 0.65961933, "learning_rate": 7.280831545667611e-07, "loss": 0.68096024, "num_input_tokens_seen": 261323265, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.67578125, "step": 12105, "time_per_iteration": 2.579526901245117 }, { "auxiliary_loss_clip": 0.01109171, "auxiliary_loss_mlp": 0.01037678, "balance_loss_clip": 1.02427304, "balance_loss_mlp": 1.03840613, "epoch": 0.7278520967984368, "flos": 19206499351680.0, "grad_norm": 4.087663465657412, "language_loss": 0.75448608, "learning_rate": 7.27782622021939e-07, "loss": 0.77595454, "num_input_tokens_seen": 261339745, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.70703125, "step": 12106, "time_per_iteration": 2.435762405395508 }, { "auxiliary_loss_clip": 0.01109128, "auxiliary_loss_mlp": 0.01033962, "balance_loss_clip": 1.02048528, "balance_loss_mlp": 1.03739548, "epoch": 0.7279122200511048, "flos": 34094667870720.0, "grad_norm": 1.968764194043956, "language_loss": 0.70087892, "learning_rate": 7.274821377197273e-07, "loss": 0.72230983, "num_input_tokens_seen": 261359310, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 12107, "time_per_iteration": 2.5814285278320312 }, { "auxiliary_loss_clip": 0.01104992, "auxiliary_loss_mlp": 0.01029759, "balance_loss_clip": 1.01714635, "balance_loss_mlp": 1.03566909, "epoch": 0.7279723433037727, "flos": 54599049348480.0, "grad_norm": 1.5223302258480507, "language_loss": 0.74984908, "learning_rate": 7.271817016715205e-07, "loss": 0.7711966, "num_input_tokens_seen": 261384640, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 12108, "time_per_iteration": 2.8107523918151855 }, { "auxiliary_loss_clip": 0.01106489, "auxiliary_loss_mlp": 0.01031036, "balance_loss_clip": 1.01808393, "balance_loss_mlp": 1.03610837, "epoch": 0.7280324665564407, "flos": 36137482156800.0, "grad_norm": 1.767052529232793, "language_loss": 0.67050242, "learning_rate": 7.268813138887124e-07, "loss": 0.69187772, "num_input_tokens_seen": 261405290, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 12109, "time_per_iteration": 2.5948827266693115 }, { "auxiliary_loss_clip": 0.01107155, "auxiliary_loss_mlp": 0.01034762, "balance_loss_clip": 1.02101171, "balance_loss_mlp": 1.03736734, "epoch": 0.7280925898091086, "flos": 11618539165440.0, "grad_norm": 1.9036403813017584, "language_loss": 0.63490224, "learning_rate": 7.265809743826912e-07, "loss": 0.65632141, "num_input_tokens_seen": 261419710, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.6953125, "step": 12110, "time_per_iteration": 2.4196412563323975 }, { "auxiliary_loss_clip": 0.01106612, "auxiliary_loss_mlp": 0.01027919, "balance_loss_clip": 1.01410818, "balance_loss_mlp": 1.03459716, "epoch": 0.7281527130617766, "flos": 34277094069120.0, "grad_norm": 1.7785616322944287, "language_loss": 0.58091521, "learning_rate": 7.26280683164847e-07, "loss": 0.60226059, "num_input_tokens_seen": 261442385, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 12111, "time_per_iteration": 2.611581802368164 }, { "auxiliary_loss_clip": 0.0110938, "auxiliary_loss_mlp": 0.01031136, "balance_loss_clip": 1.01781988, "balance_loss_mlp": 1.03851271, "epoch": 0.7282128363144446, "flos": 13918043018880.0, "grad_norm": 2.025161668861108, "language_loss": 0.73459435, "learning_rate": 7.259804402465677e-07, "loss": 0.75599951, "num_input_tokens_seen": 261459805, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 12112, "time_per_iteration": 2.4412474632263184 }, { "auxiliary_loss_clip": 0.01103923, "auxiliary_loss_mlp": 0.01031327, "balance_loss_clip": 1.01883423, "balance_loss_mlp": 1.03492391, "epoch": 0.7282729595671126, "flos": 20777627214720.0, "grad_norm": 2.313856337698173, "language_loss": 0.6699599, "learning_rate": 7.25680245639237e-07, "loss": 0.69131243, "num_input_tokens_seen": 261477175, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 12113, "time_per_iteration": 2.462876558303833 }, { "auxiliary_loss_clip": 0.01106661, "auxiliary_loss_mlp": 0.01030933, "balance_loss_clip": 1.01780772, "balance_loss_mlp": 1.03528774, "epoch": 0.7283330828197806, "flos": 16325422392960.0, "grad_norm": 2.2333492019802255, "language_loss": 0.73208237, "learning_rate": 7.253800993542399e-07, "loss": 0.75345826, "num_input_tokens_seen": 261494990, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 12114, "time_per_iteration": 2.448678493499756 }, { "auxiliary_loss_clip": 0.01104349, "auxiliary_loss_mlp": 0.01031009, "balance_loss_clip": 1.01766968, "balance_loss_mlp": 1.03489041, "epoch": 0.7283932060724485, "flos": 27490193043840.0, "grad_norm": 2.1569930068914265, "language_loss": 0.68365705, "learning_rate": 7.250800014029564e-07, "loss": 0.70501059, "num_input_tokens_seen": 261514445, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 12115, "time_per_iteration": 2.521806240081787 }, { "auxiliary_loss_clip": 0.01108253, "auxiliary_loss_mlp": 0.01033015, "balance_loss_clip": 1.02007449, "balance_loss_mlp": 1.0354681, "epoch": 0.7284533293251165, "flos": 18367877543040.0, "grad_norm": 1.8175027157882642, "language_loss": 0.59546798, "learning_rate": 7.247799517967674e-07, "loss": 0.61688066, "num_input_tokens_seen": 261533565, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 12116, "time_per_iteration": 2.454390048980713 }, { "auxiliary_loss_clip": 0.01104294, "auxiliary_loss_mlp": 0.01029855, "balance_loss_clip": 1.01706982, "balance_loss_mlp": 1.03561819, "epoch": 0.7285134525777844, "flos": 21725525174400.0, "grad_norm": 1.799886455108728, "language_loss": 0.72622663, "learning_rate": 7.2447995054705e-07, "loss": 0.74756813, "num_input_tokens_seen": 261553795, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 12117, "time_per_iteration": 2.494313955307007 }, { "auxiliary_loss_clip": 0.01105171, "auxiliary_loss_mlp": 0.01028116, "balance_loss_clip": 1.01543236, "balance_loss_mlp": 1.03515589, "epoch": 0.7285735758304525, "flos": 20741357456640.0, "grad_norm": 2.1106744345941504, "language_loss": 0.69467556, "learning_rate": 7.241799976651807e-07, "loss": 0.71600842, "num_input_tokens_seen": 261572565, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 12118, "time_per_iteration": 2.4788522720336914 }, { "auxiliary_loss_clip": 0.01101735, "auxiliary_loss_mlp": 0.01030562, "balance_loss_clip": 1.01819956, "balance_loss_mlp": 1.03486657, "epoch": 0.7286336990831204, "flos": 17310954827520.0, "grad_norm": 1.7991103647125275, "language_loss": 0.84424043, "learning_rate": 7.238800931625346e-07, "loss": 0.86556339, "num_input_tokens_seen": 261590910, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.66796875, "step": 12119, "time_per_iteration": 2.4517886638641357 }, { "auxiliary_loss_clip": 0.01106651, "auxiliary_loss_mlp": 0.0102931, "balance_loss_clip": 1.01694775, "balance_loss_mlp": 1.03591061, "epoch": 0.7286938223357884, "flos": 19787390098560.0, "grad_norm": 2.1269434482172476, "language_loss": 0.82075125, "learning_rate": 7.235802370504831e-07, "loss": 0.84211087, "num_input_tokens_seen": 261606005, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.70703125, "step": 12120, "time_per_iteration": 2.493675470352173 }, { "auxiliary_loss_clip": 0.01108643, "auxiliary_loss_mlp": 0.01039588, "balance_loss_clip": 1.0267489, "balance_loss_mlp": 1.03790462, "epoch": 0.7287539455884563, "flos": 15340859625600.0, "grad_norm": 11.519417493158086, "language_loss": 0.78590202, "learning_rate": 7.232804293403963e-07, "loss": 0.80738437, "num_input_tokens_seen": 261622305, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 12121, "time_per_iteration": 2.4506118297576904 }, { "auxiliary_loss_clip": 0.01107193, "auxiliary_loss_mlp": 0.01032378, "balance_loss_clip": 1.01916337, "balance_loss_mlp": 1.03441834, "epoch": 0.7288140688411243, "flos": 25192484870400.0, "grad_norm": 1.5823057053204146, "language_loss": 0.69176447, "learning_rate": 7.229806700436441e-07, "loss": 0.71316016, "num_input_tokens_seen": 261642465, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 12122, "time_per_iteration": 2.5114552974700928 }, { "auxiliary_loss_clip": 0.01102306, "auxiliary_loss_mlp": 0.01032379, "balance_loss_clip": 1.02043402, "balance_loss_mlp": 1.03425336, "epoch": 0.7288741920937922, "flos": 23984162328960.0, "grad_norm": 1.8682181750763291, "language_loss": 0.8681581, "learning_rate": 7.226809591715923e-07, "loss": 0.88950497, "num_input_tokens_seen": 261661420, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 12123, "time_per_iteration": 2.5069427490234375 }, { "auxiliary_loss_clip": 0.01105375, "auxiliary_loss_mlp": 0.01031439, "balance_loss_clip": 1.01922643, "balance_loss_mlp": 1.03651869, "epoch": 0.7289343153464602, "flos": 22744921155840.0, "grad_norm": 1.6750340657073894, "language_loss": 0.82890213, "learning_rate": 7.223812967356065e-07, "loss": 0.85027021, "num_input_tokens_seen": 261680865, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 12124, "time_per_iteration": 2.482590913772583 }, { "auxiliary_loss_clip": 0.01103977, "auxiliary_loss_mlp": 0.01028849, "balance_loss_clip": 1.01663613, "balance_loss_mlp": 1.03560805, "epoch": 0.7289944385991282, "flos": 24900028335360.0, "grad_norm": 2.0251499569073066, "language_loss": 0.66947913, "learning_rate": 7.220816827470499e-07, "loss": 0.69080734, "num_input_tokens_seen": 261701455, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.68359375, "step": 12125, "time_per_iteration": 2.497157096862793 }, { "auxiliary_loss_clip": 0.01111014, "auxiliary_loss_mlp": 0.01036081, "balance_loss_clip": 1.02202082, "balance_loss_mlp": 1.03750968, "epoch": 0.7290545618517962, "flos": 22967064817920.0, "grad_norm": 1.9624852513103301, "language_loss": 0.75427711, "learning_rate": 7.217821172172855e-07, "loss": 0.77574813, "num_input_tokens_seen": 261721260, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.734375, "step": 12126, "time_per_iteration": 2.4788637161254883 }, { "auxiliary_loss_clip": 0.01028591, "auxiliary_loss_mlp": 0.01001742, "balance_loss_clip": 1.00069857, "balance_loss_mlp": 1.00604069, "epoch": 0.7291146851044642, "flos": 61901523216000.0, "grad_norm": 0.8355164216939395, "language_loss": 0.58656275, "learning_rate": 7.2148260015767e-07, "loss": 0.60686606, "num_input_tokens_seen": 261779370, "router_z_loss_clip": 0.01043701, "router_z_loss_mlp": 0.22558594, "step": 12127, "time_per_iteration": 3.0333714485168457 }, { "auxiliary_loss_clip": 0.01105594, "auxiliary_loss_mlp": 0.01027708, "balance_loss_clip": 1.01622796, "balance_loss_mlp": 1.03766668, "epoch": 0.7291748083571321, "flos": 23330947547520.0, "grad_norm": 2.130387332610561, "language_loss": 0.68567288, "learning_rate": 7.21183131579562e-07, "loss": 0.70700586, "num_input_tokens_seen": 261798050, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6796875, "step": 12128, "time_per_iteration": 3.9118762016296387 }, { "auxiliary_loss_clip": 0.01107025, "auxiliary_loss_mlp": 0.01036538, "balance_loss_clip": 1.02331758, "balance_loss_mlp": 1.03674674, "epoch": 0.7292349316098001, "flos": 28330000001280.0, "grad_norm": 1.8885370746744459, "language_loss": 0.65426481, "learning_rate": 7.20883711494319e-07, "loss": 0.67570049, "num_input_tokens_seen": 261817660, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 12129, "time_per_iteration": 3.9884183406829834 }, { "auxiliary_loss_clip": 0.01103139, "auxiliary_loss_mlp": 0.01030137, "balance_loss_clip": 1.01700044, "balance_loss_mlp": 1.03585505, "epoch": 0.729295054862468, "flos": 24132222190080.0, "grad_norm": 2.0533195324813476, "language_loss": 0.74283063, "learning_rate": 7.205843399132927e-07, "loss": 0.76416349, "num_input_tokens_seen": 261837935, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.671875, "step": 12130, "time_per_iteration": 2.4936599731445312 }, { "auxiliary_loss_clip": 0.01104221, "auxiliary_loss_mlp": 0.01029586, "balance_loss_clip": 1.01676488, "balance_loss_mlp": 1.03495622, "epoch": 0.7293551781151361, "flos": 22816239609600.0, "grad_norm": 1.6100587603332308, "language_loss": 0.69816375, "learning_rate": 7.202850168478374e-07, "loss": 0.71950185, "num_input_tokens_seen": 261857575, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 12131, "time_per_iteration": 2.499392509460449 }, { "auxiliary_loss_clip": 0.0110381, "auxiliary_loss_mlp": 0.01031713, "balance_loss_clip": 1.0197438, "balance_loss_mlp": 1.03588188, "epoch": 0.729415301367804, "flos": 22126683242880.0, "grad_norm": 1.5874522117610539, "language_loss": 0.77026391, "learning_rate": 7.199857423093025e-07, "loss": 0.79161918, "num_input_tokens_seen": 261877265, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 12132, "time_per_iteration": 3.8759548664093018 }, { "auxiliary_loss_clip": 0.0110766, "auxiliary_loss_mlp": 0.01036743, "balance_loss_clip": 1.02447617, "balance_loss_mlp": 1.03740096, "epoch": 0.729475424620472, "flos": 12349608675840.0, "grad_norm": 2.354159021322446, "language_loss": 0.79370582, "learning_rate": 7.196865163090358e-07, "loss": 0.81514984, "num_input_tokens_seen": 261893695, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 12133, "time_per_iteration": 3.842071056365967 }, { "auxiliary_loss_clip": 0.01104779, "auxiliary_loss_mlp": 0.0102873, "balance_loss_clip": 1.01590276, "balance_loss_mlp": 1.03561974, "epoch": 0.7295355478731399, "flos": 22195308176640.0, "grad_norm": 2.2511765933566386, "language_loss": 0.72378838, "learning_rate": 7.193873388583846e-07, "loss": 0.74512351, "num_input_tokens_seen": 261911825, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 12134, "time_per_iteration": 2.4538609981536865 }, { "auxiliary_loss_clip": 0.01108163, "auxiliary_loss_mlp": 0.01037656, "balance_loss_clip": 1.02478147, "balance_loss_mlp": 1.03780818, "epoch": 0.7295956711258079, "flos": 23222030532480.0, "grad_norm": 2.835038066713943, "language_loss": 0.71590698, "learning_rate": 7.190882099686939e-07, "loss": 0.73736519, "num_input_tokens_seen": 261931190, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 12135, "time_per_iteration": 2.469972848892212 }, { "auxiliary_loss_clip": 0.01108672, "auxiliary_loss_mlp": 0.01035076, "balance_loss_clip": 1.02224874, "balance_loss_mlp": 1.03704619, "epoch": 0.7296557943784758, "flos": 31869104163840.0, "grad_norm": 1.940077060899598, "language_loss": 0.62417185, "learning_rate": 7.187891296513075e-07, "loss": 0.64560938, "num_input_tokens_seen": 261951240, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 12136, "time_per_iteration": 2.522357940673828 }, { "auxiliary_loss_clip": 0.01103719, "auxiliary_loss_mlp": 0.01037084, "balance_loss_clip": 1.02460265, "balance_loss_mlp": 1.0348016, "epoch": 0.7297159176311439, "flos": 26651714889600.0, "grad_norm": 1.979140989256989, "language_loss": 0.74611658, "learning_rate": 7.184900979175654e-07, "loss": 0.7675246, "num_input_tokens_seen": 261971605, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 12137, "time_per_iteration": 2.490976095199585 }, { "auxiliary_loss_clip": 0.01109012, "auxiliary_loss_mlp": 0.01034422, "balance_loss_clip": 1.02171469, "balance_loss_mlp": 1.0383445, "epoch": 0.7297760408838118, "flos": 24749562263040.0, "grad_norm": 1.8092545159160962, "language_loss": 0.74485588, "learning_rate": 7.181911147788069e-07, "loss": 0.76629019, "num_input_tokens_seen": 261990830, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 12138, "time_per_iteration": 2.4800992012023926 }, { "auxiliary_loss_clip": 0.01102958, "auxiliary_loss_mlp": 0.01028998, "balance_loss_clip": 1.01701784, "balance_loss_mlp": 1.03410149, "epoch": 0.7298361641364798, "flos": 18073768982400.0, "grad_norm": 2.18953697171195, "language_loss": 0.72152382, "learning_rate": 7.178921802463702e-07, "loss": 0.74284339, "num_input_tokens_seen": 262008190, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 12139, "time_per_iteration": 2.458080530166626 }, { "auxiliary_loss_clip": 0.01102796, "auxiliary_loss_mlp": 0.0102631, "balance_loss_clip": 1.01464534, "balance_loss_mlp": 1.03597665, "epoch": 0.7298962873891478, "flos": 29895597169920.0, "grad_norm": 1.9460396376327027, "language_loss": 0.73406154, "learning_rate": 7.175932943315898e-07, "loss": 0.75535262, "num_input_tokens_seen": 262030460, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.66796875, "step": 12140, "time_per_iteration": 2.539039134979248 }, { "auxiliary_loss_clip": 0.01106145, "auxiliary_loss_mlp": 0.01033348, "balance_loss_clip": 1.01957369, "balance_loss_mlp": 1.03566468, "epoch": 0.7299564106418157, "flos": 32266096254720.0, "grad_norm": 2.9029097386545835, "language_loss": 0.55412233, "learning_rate": 7.172944570458003e-07, "loss": 0.5755173, "num_input_tokens_seen": 262050830, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.703125, "step": 12141, "time_per_iteration": 2.5614733695983887 }, { "auxiliary_loss_clip": 0.01103813, "auxiliary_loss_mlp": 0.01026574, "balance_loss_clip": 1.01468873, "balance_loss_mlp": 1.03641737, "epoch": 0.7300165338944837, "flos": 22930292269440.0, "grad_norm": 1.524829208696074, "language_loss": 0.72766382, "learning_rate": 7.169956684003342e-07, "loss": 0.74896771, "num_input_tokens_seen": 262071245, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.67578125, "step": 12142, "time_per_iteration": 2.5120773315429688 }, { "auxiliary_loss_clip": 0.01102928, "auxiliary_loss_mlp": 0.01034057, "balance_loss_clip": 1.0221895, "balance_loss_mlp": 1.03449559, "epoch": 0.7300766571471516, "flos": 19828795501440.0, "grad_norm": 1.903291987912254, "language_loss": 0.73975939, "learning_rate": 7.16696928406521e-07, "loss": 0.76112926, "num_input_tokens_seen": 262087525, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.68359375, "step": 12143, "time_per_iteration": 2.4776298999786377 }, { "auxiliary_loss_clip": 0.01106311, "auxiliary_loss_mlp": 0.01031165, "balance_loss_clip": 1.01808202, "balance_loss_mlp": 1.03621674, "epoch": 0.7301367803998197, "flos": 24347829576960.0, "grad_norm": 2.8226579256547857, "language_loss": 0.67140746, "learning_rate": 7.163982370756882e-07, "loss": 0.69278222, "num_input_tokens_seen": 262107355, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 12144, "time_per_iteration": 2.4868104457855225 }, { "auxiliary_loss_clip": 0.01106478, "auxiliary_loss_mlp": 0.01032934, "balance_loss_clip": 1.01974893, "balance_loss_mlp": 1.03653431, "epoch": 0.7301969036524876, "flos": 15304518040320.0, "grad_norm": 1.922146937454107, "language_loss": 0.7924273, "learning_rate": 7.160995944191627e-07, "loss": 0.81382143, "num_input_tokens_seen": 262125645, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69921875, "step": 12145, "time_per_iteration": 2.4560017585754395 }, { "auxiliary_loss_clip": 0.01105908, "auxiliary_loss_mlp": 0.01032905, "balance_loss_clip": 1.02013803, "balance_loss_mlp": 1.03821647, "epoch": 0.7302570269051556, "flos": 23507268433920.0, "grad_norm": 2.039434457981697, "language_loss": 0.91186893, "learning_rate": 7.158010004482702e-07, "loss": 0.9332571, "num_input_tokens_seen": 262144075, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.67578125, "step": 12146, "time_per_iteration": 2.549649238586426 }, { "auxiliary_loss_clip": 0.01103603, "auxiliary_loss_mlp": 0.01026555, "balance_loss_clip": 1.01464605, "balance_loss_mlp": 1.03701496, "epoch": 0.7303171501578235, "flos": 20523056549760.0, "grad_norm": 1.9356060960589798, "language_loss": 0.62145048, "learning_rate": 7.155024551743316e-07, "loss": 0.64275205, "num_input_tokens_seen": 262165940, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6640625, "step": 12147, "time_per_iteration": 2.4959728717803955 }, { "auxiliary_loss_clip": 0.01109884, "auxiliary_loss_mlp": 0.01039509, "balance_loss_clip": 1.02658129, "balance_loss_mlp": 1.03931642, "epoch": 0.7303772734104915, "flos": 18332613365760.0, "grad_norm": 1.797984935334185, "language_loss": 0.75211471, "learning_rate": 7.152039586086693e-07, "loss": 0.77360868, "num_input_tokens_seen": 262184520, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 12148, "time_per_iteration": 2.430774211883545 }, { "auxiliary_loss_clip": 0.01029113, "auxiliary_loss_mlp": 0.01003677, "balance_loss_clip": 1.00237167, "balance_loss_mlp": 1.00658941, "epoch": 0.7304373966631594, "flos": 60654776100480.0, "grad_norm": 0.6891811241986827, "language_loss": 0.56758618, "learning_rate": 7.149055107626017e-07, "loss": 0.58791405, "num_input_tokens_seen": 262247070, "router_z_loss_clip": 0.01306152, "router_z_loss_mlp": 0.22558594, "step": 12149, "time_per_iteration": 3.0592703819274902 }, { "auxiliary_loss_clip": 0.01107387, "auxiliary_loss_mlp": 0.01033151, "balance_loss_clip": 1.02000809, "balance_loss_mlp": 1.03619087, "epoch": 0.7304975199158275, "flos": 19828077229440.0, "grad_norm": 1.8944433929309565, "language_loss": 0.74080682, "learning_rate": 7.146071116474451e-07, "loss": 0.76221216, "num_input_tokens_seen": 262266605, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 12150, "time_per_iteration": 2.4651107788085938 }, { "auxiliary_loss_clip": 0.0110756, "auxiliary_loss_mlp": 0.01031767, "balance_loss_clip": 1.01893425, "balance_loss_mlp": 1.03613329, "epoch": 0.7305576431684954, "flos": 13223997452160.0, "grad_norm": 2.191383340275586, "language_loss": 0.84205568, "learning_rate": 7.143087612745158e-07, "loss": 0.86344898, "num_input_tokens_seen": 262283880, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 12151, "time_per_iteration": 2.420279026031494 }, { "auxiliary_loss_clip": 0.01107726, "auxiliary_loss_mlp": 0.01032815, "balance_loss_clip": 1.01911807, "balance_loss_mlp": 1.03634906, "epoch": 0.7306177664211634, "flos": 24060472773120.0, "grad_norm": 2.408492137101503, "language_loss": 0.77715164, "learning_rate": 7.14010459655127e-07, "loss": 0.79855704, "num_input_tokens_seen": 262304155, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71484375, "step": 12152, "time_per_iteration": 2.472885847091675 }, { "auxiliary_loss_clip": 0.01108588, "auxiliary_loss_mlp": 0.01032337, "balance_loss_clip": 1.01930726, "balance_loss_mlp": 1.03791249, "epoch": 0.7306778896738314, "flos": 27089106802560.0, "grad_norm": 1.6147473671118904, "language_loss": 0.79429966, "learning_rate": 7.137122068005919e-07, "loss": 0.81570894, "num_input_tokens_seen": 262325660, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 12153, "time_per_iteration": 2.534651517868042 }, { "auxiliary_loss_clip": 0.01109028, "auxiliary_loss_mlp": 0.01037298, "balance_loss_clip": 1.02404761, "balance_loss_mlp": 1.03672886, "epoch": 0.7307380129264993, "flos": 16690669839360.0, "grad_norm": 2.215275067584535, "language_loss": 0.67499089, "learning_rate": 7.134140027222173e-07, "loss": 0.69645405, "num_input_tokens_seen": 262344075, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 12154, "time_per_iteration": 2.4450438022613525 }, { "auxiliary_loss_clip": 0.01106794, "auxiliary_loss_mlp": 0.01033308, "balance_loss_clip": 1.01995063, "balance_loss_mlp": 1.03562474, "epoch": 0.7307981361791673, "flos": 21725740656000.0, "grad_norm": 1.8584526117119982, "language_loss": 0.66007924, "learning_rate": 7.131158474313128e-07, "loss": 0.68148023, "num_input_tokens_seen": 262363305, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 12155, "time_per_iteration": 2.4693944454193115 }, { "auxiliary_loss_clip": 0.0110197, "auxiliary_loss_mlp": 0.01029598, "balance_loss_clip": 1.01707482, "balance_loss_mlp": 1.03365958, "epoch": 0.7308582594318352, "flos": 18040659621120.0, "grad_norm": 1.7119432279914915, "language_loss": 0.81649131, "learning_rate": 7.128177409391851e-07, "loss": 0.83780694, "num_input_tokens_seen": 262380730, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 12156, "time_per_iteration": 2.4311258792877197 }, { "auxiliary_loss_clip": 0.01104801, "auxiliary_loss_mlp": 0.01032015, "balance_loss_clip": 1.01970053, "balance_loss_mlp": 1.03670204, "epoch": 0.7309183826845033, "flos": 13844964798720.0, "grad_norm": 2.2499133508922, "language_loss": 0.75375485, "learning_rate": 7.125196832571367e-07, "loss": 0.775123, "num_input_tokens_seen": 262395480, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 12157, "time_per_iteration": 2.4381496906280518 }, { "auxiliary_loss_clip": 0.01101121, "auxiliary_loss_mlp": 0.01030345, "balance_loss_clip": 1.01903248, "balance_loss_mlp": 1.03465497, "epoch": 0.7309785059371712, "flos": 17019216564480.0, "grad_norm": 2.465776228792894, "language_loss": 0.73297787, "learning_rate": 7.122216743964713e-07, "loss": 0.75429255, "num_input_tokens_seen": 262413340, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6640625, "step": 12158, "time_per_iteration": 2.4319396018981934 }, { "auxiliary_loss_clip": 0.0110955, "auxiliary_loss_mlp": 0.01032857, "balance_loss_clip": 1.01969075, "balance_loss_mlp": 1.03849649, "epoch": 0.7310386291898392, "flos": 26502398052480.0, "grad_norm": 2.773568732663904, "language_loss": 0.85985118, "learning_rate": 7.119237143684896e-07, "loss": 0.88127524, "num_input_tokens_seen": 262433455, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 12159, "time_per_iteration": 2.508090019226074 }, { "auxiliary_loss_clip": 0.01109018, "auxiliary_loss_mlp": 0.01034543, "balance_loss_clip": 1.02103114, "balance_loss_mlp": 1.03640389, "epoch": 0.7310987524425071, "flos": 16945922862720.0, "grad_norm": 3.2988187930759847, "language_loss": 0.73538941, "learning_rate": 7.116258031844895e-07, "loss": 0.75682503, "num_input_tokens_seen": 262450335, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 12160, "time_per_iteration": 2.4204604625701904 }, { "auxiliary_loss_clip": 0.01109868, "auxiliary_loss_mlp": 0.01033463, "balance_loss_clip": 1.01944995, "balance_loss_mlp": 1.03730369, "epoch": 0.7311588756951751, "flos": 13845288021120.0, "grad_norm": 1.934700000232813, "language_loss": 0.72667348, "learning_rate": 7.113279408557675e-07, "loss": 0.74810684, "num_input_tokens_seen": 262468240, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7265625, "step": 12161, "time_per_iteration": 2.443373441696167 }, { "auxiliary_loss_clip": 0.0111251, "auxiliary_loss_mlp": 0.01032145, "balance_loss_clip": 1.01754212, "balance_loss_mlp": 1.0379082, "epoch": 0.731218998947843, "flos": 28767894704640.0, "grad_norm": 1.9536068650961953, "language_loss": 0.69722521, "learning_rate": 7.110301273936192e-07, "loss": 0.7186718, "num_input_tokens_seen": 262487045, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7421875, "step": 12162, "time_per_iteration": 2.5152978897094727 }, { "auxiliary_loss_clip": 0.01110444, "auxiliary_loss_mlp": 0.01033499, "balance_loss_clip": 1.02006984, "balance_loss_mlp": 1.03889334, "epoch": 0.7312791222005111, "flos": 27088783580160.0, "grad_norm": 1.946058306267554, "language_loss": 0.66902006, "learning_rate": 7.107323628093382e-07, "loss": 0.69045949, "num_input_tokens_seen": 262504855, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 12163, "time_per_iteration": 2.5021913051605225 }, { "auxiliary_loss_clip": 0.01105966, "auxiliary_loss_mlp": 0.01029034, "balance_loss_clip": 1.01573014, "balance_loss_mlp": 1.03545737, "epoch": 0.731339245453179, "flos": 20924035050240.0, "grad_norm": 1.8581347881674777, "language_loss": 0.68491834, "learning_rate": 7.104346471142153e-07, "loss": 0.70626831, "num_input_tokens_seen": 262524920, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 12164, "time_per_iteration": 2.4610159397125244 }, { "auxiliary_loss_clip": 0.01107396, "auxiliary_loss_mlp": 0.01035283, "balance_loss_clip": 1.02292097, "balance_loss_mlp": 1.03991234, "epoch": 0.731399368705847, "flos": 23075694524160.0, "grad_norm": 1.9421648867014978, "language_loss": 0.72917169, "learning_rate": 7.101369803195391e-07, "loss": 0.75059843, "num_input_tokens_seen": 262545725, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.67578125, "step": 12165, "time_per_iteration": 2.476106643676758 }, { "auxiliary_loss_clip": 0.01109516, "auxiliary_loss_mlp": 0.01035823, "balance_loss_clip": 1.0221616, "balance_loss_mlp": 1.03788972, "epoch": 0.731459491958515, "flos": 23582681038080.0, "grad_norm": 2.0812070909035194, "language_loss": 0.76509076, "learning_rate": 7.098393624365988e-07, "loss": 0.7865442, "num_input_tokens_seen": 262565480, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 12166, "time_per_iteration": 2.4971370697021484 }, { "auxiliary_loss_clip": 0.01106429, "auxiliary_loss_mlp": 0.01031843, "balance_loss_clip": 1.01904559, "balance_loss_mlp": 1.03774798, "epoch": 0.7315196152111829, "flos": 22379278659840.0, "grad_norm": 2.735612888786682, "language_loss": 0.79537451, "learning_rate": 7.095417934766781e-07, "loss": 0.8167572, "num_input_tokens_seen": 262584145, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 12167, "time_per_iteration": 2.535965919494629 }, { "auxiliary_loss_clip": 0.01103588, "auxiliary_loss_mlp": 0.01037117, "balance_loss_clip": 1.02451015, "balance_loss_mlp": 1.03542113, "epoch": 0.7315797384638509, "flos": 26177047637760.0, "grad_norm": 2.1978932670906928, "language_loss": 0.77295136, "learning_rate": 7.092442734510622e-07, "loss": 0.79435837, "num_input_tokens_seen": 262604045, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 12168, "time_per_iteration": 2.4828455448150635 }, { "auxiliary_loss_clip": 0.01107217, "auxiliary_loss_mlp": 0.01037251, "balance_loss_clip": 1.0231607, "balance_loss_mlp": 1.03588343, "epoch": 0.7316398617165188, "flos": 21506326427520.0, "grad_norm": 1.540096729187584, "language_loss": 0.82042253, "learning_rate": 7.089468023710326e-07, "loss": 0.84186721, "num_input_tokens_seen": 262624540, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7109375, "step": 12169, "time_per_iteration": 2.4651057720184326 }, { "auxiliary_loss_clip": 0.01108682, "auxiliary_loss_mlp": 0.01038039, "balance_loss_clip": 1.02452612, "balance_loss_mlp": 1.03736615, "epoch": 0.7316999849691869, "flos": 30482557315200.0, "grad_norm": 1.7382640137482406, "language_loss": 0.6977917, "learning_rate": 7.08649380247871e-07, "loss": 0.7192589, "num_input_tokens_seen": 262644545, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 12170, "time_per_iteration": 3.951624631881714 }, { "auxiliary_loss_clip": 0.01104657, "auxiliary_loss_mlp": 0.01032654, "balance_loss_clip": 1.01849174, "balance_loss_mlp": 1.03532577, "epoch": 0.7317601082218548, "flos": 21543781334400.0, "grad_norm": 2.2469920027970476, "language_loss": 0.69941902, "learning_rate": 7.083520070928533e-07, "loss": 0.72079217, "num_input_tokens_seen": 262662570, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.6953125, "step": 12171, "time_per_iteration": 3.9301230907440186 }, { "auxiliary_loss_clip": 0.01107872, "auxiliary_loss_mlp": 0.01038359, "balance_loss_clip": 1.02514505, "balance_loss_mlp": 1.03885233, "epoch": 0.7318202314745228, "flos": 33251592775680.0, "grad_norm": 1.6351366849031435, "language_loss": 0.65793407, "learning_rate": 7.080546829172564e-07, "loss": 0.67939639, "num_input_tokens_seen": 262683245, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6875, "step": 12172, "time_per_iteration": 2.5696468353271484 }, { "auxiliary_loss_clip": 0.01109384, "auxiliary_loss_mlp": 0.01028385, "balance_loss_clip": 1.01577866, "balance_loss_mlp": 1.03844166, "epoch": 0.7318803547271907, "flos": 20157054917760.0, "grad_norm": 2.4521145778534863, "language_loss": 0.60801971, "learning_rate": 7.077574077323564e-07, "loss": 0.62939733, "num_input_tokens_seen": 262701585, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 12173, "time_per_iteration": 2.435293674468994 }, { "auxiliary_loss_clip": 0.01106548, "auxiliary_loss_mlp": 0.01026355, "balance_loss_clip": 1.01376605, "balance_loss_mlp": 1.03695679, "epoch": 0.7319404779798587, "flos": 20558536208640.0, "grad_norm": 1.7850201143960283, "language_loss": 0.74097568, "learning_rate": 7.074601815494243e-07, "loss": 0.76230472, "num_input_tokens_seen": 262719295, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 12174, "time_per_iteration": 3.8575892448425293 }, { "auxiliary_loss_clip": 0.01103847, "auxiliary_loss_mlp": 0.0102719, "balance_loss_clip": 1.01485813, "balance_loss_mlp": 1.0359391, "epoch": 0.7320006012325266, "flos": 28695391102080.0, "grad_norm": 1.6583307292904383, "language_loss": 0.80851614, "learning_rate": 7.071630043797317e-07, "loss": 0.82982647, "num_input_tokens_seen": 262739995, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 12175, "time_per_iteration": 3.9682464599609375 }, { "auxiliary_loss_clip": 0.01106244, "auxiliary_loss_mlp": 0.0103038, "balance_loss_clip": 1.01785135, "balance_loss_mlp": 1.03618121, "epoch": 0.7320607244851947, "flos": 16362697731840.0, "grad_norm": 2.021206638867278, "language_loss": 0.76646268, "learning_rate": 7.068658762345488e-07, "loss": 0.78782892, "num_input_tokens_seen": 262757680, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 12176, "time_per_iteration": 2.4957501888275146 }, { "auxiliary_loss_clip": 0.01106024, "auxiliary_loss_mlp": 0.01031403, "balance_loss_clip": 1.01895201, "balance_loss_mlp": 1.03769577, "epoch": 0.7321208477378626, "flos": 20955097336320.0, "grad_norm": 1.5691827284173536, "language_loss": 0.7654494, "learning_rate": 7.065687971251399e-07, "loss": 0.78682369, "num_input_tokens_seen": 262776990, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.68359375, "step": 12177, "time_per_iteration": 2.4981377124786377 }, { "auxiliary_loss_clip": 0.01104297, "auxiliary_loss_mlp": 0.01034325, "balance_loss_clip": 1.02165341, "balance_loss_mlp": 1.03459466, "epoch": 0.7321809709905306, "flos": 13845072539520.0, "grad_norm": 2.275013838681831, "language_loss": 0.74714255, "learning_rate": 7.06271767062772e-07, "loss": 0.76852882, "num_input_tokens_seen": 262795440, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 12178, "time_per_iteration": 2.4650561809539795 }, { "auxiliary_loss_clip": 0.01107312, "auxiliary_loss_mlp": 0.01034656, "balance_loss_clip": 1.02211475, "balance_loss_mlp": 1.03567278, "epoch": 0.7322410942431986, "flos": 26979938392320.0, "grad_norm": 2.6483260781919196, "language_loss": 0.82808602, "learning_rate": 7.059747860587084e-07, "loss": 0.84950566, "num_input_tokens_seen": 262816385, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 12179, "time_per_iteration": 2.5112545490264893 }, { "auxiliary_loss_clip": 0.01102693, "auxiliary_loss_mlp": 0.01028956, "balance_loss_clip": 1.0169754, "balance_loss_mlp": 1.0368067, "epoch": 0.7323012174958665, "flos": 17639717034240.0, "grad_norm": 1.8152725243104069, "language_loss": 0.74606812, "learning_rate": 7.056778541242115e-07, "loss": 0.76738465, "num_input_tokens_seen": 262834955, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.66015625, "step": 12180, "time_per_iteration": 2.437408447265625 }, { "auxiliary_loss_clip": 0.01107845, "auxiliary_loss_mlp": 0.0103077, "balance_loss_clip": 1.01704907, "balance_loss_mlp": 1.03467143, "epoch": 0.7323613407485345, "flos": 32342765834880.0, "grad_norm": 2.278053920287249, "language_loss": 0.79473186, "learning_rate": 7.053809712705396e-07, "loss": 0.816118, "num_input_tokens_seen": 262853555, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 12181, "time_per_iteration": 2.5488367080688477 }, { "auxiliary_loss_clip": 0.01110266, "auxiliary_loss_mlp": 0.01030356, "balance_loss_clip": 1.01748085, "balance_loss_mlp": 1.03939223, "epoch": 0.7324214640012024, "flos": 18362777811840.0, "grad_norm": 2.1329637991658976, "language_loss": 0.72292972, "learning_rate": 7.050841375089506e-07, "loss": 0.74433595, "num_input_tokens_seen": 262870975, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 12182, "time_per_iteration": 2.462446928024292 }, { "auxiliary_loss_clip": 0.0110957, "auxiliary_loss_mlp": 0.01030959, "balance_loss_clip": 1.01866245, "balance_loss_mlp": 1.03853393, "epoch": 0.7324815872538705, "flos": 30812289189120.0, "grad_norm": 1.8690436927126712, "language_loss": 0.71033138, "learning_rate": 7.047873528507015e-07, "loss": 0.73173666, "num_input_tokens_seen": 262892635, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 12183, "time_per_iteration": 2.5486907958984375 }, { "auxiliary_loss_clip": 0.01110335, "auxiliary_loss_mlp": 0.0103811, "balance_loss_clip": 1.02415657, "balance_loss_mlp": 1.03790498, "epoch": 0.7325417105065384, "flos": 21505069451520.0, "grad_norm": 1.8238834995654416, "language_loss": 0.7317071, "learning_rate": 7.04490617307045e-07, "loss": 0.75319159, "num_input_tokens_seen": 262910725, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7265625, "step": 12184, "time_per_iteration": 2.4775550365448 }, { "auxiliary_loss_clip": 0.01028118, "auxiliary_loss_mlp": 0.01000315, "balance_loss_clip": 0.9991293, "balance_loss_mlp": 1.00554371, "epoch": 0.7326018337592064, "flos": 67257742556160.0, "grad_norm": 0.7576263931049577, "language_loss": 0.6519227, "learning_rate": 7.041939308892344e-07, "loss": 0.672207, "num_input_tokens_seen": 262974150, "router_z_loss_clip": 0.01184082, "router_z_loss_mlp": 0.2265625, "step": 12185, "time_per_iteration": 3.093275308609009 }, { "auxiliary_loss_clip": 0.01106397, "auxiliary_loss_mlp": 0.01026803, "balance_loss_clip": 1.01342213, "balance_loss_mlp": 1.03496754, "epoch": 0.7326619570118743, "flos": 22857070394880.0, "grad_norm": 2.033409380888399, "language_loss": 0.80495799, "learning_rate": 7.038972936085197e-07, "loss": 0.82629001, "num_input_tokens_seen": 262993370, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 12186, "time_per_iteration": 2.493058204650879 }, { "auxiliary_loss_clip": 0.01106077, "auxiliary_loss_mlp": 0.01030929, "balance_loss_clip": 1.01686287, "balance_loss_mlp": 1.03513646, "epoch": 0.7327220802645423, "flos": 23327499841920.0, "grad_norm": 2.0041584499856744, "language_loss": 0.73527026, "learning_rate": 7.036007054761508e-07, "loss": 0.75664032, "num_input_tokens_seen": 263012665, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7109375, "step": 12187, "time_per_iteration": 2.4607770442962646 }, { "auxiliary_loss_clip": 0.01109093, "auxiliary_loss_mlp": 0.0103386, "balance_loss_clip": 1.02106905, "balance_loss_mlp": 1.03802299, "epoch": 0.7327822035172102, "flos": 23180661043200.0, "grad_norm": 1.7995567264375787, "language_loss": 0.88898784, "learning_rate": 7.033041665033716e-07, "loss": 0.91041732, "num_input_tokens_seen": 263031475, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 12188, "time_per_iteration": 2.4850544929504395 }, { "auxiliary_loss_clip": 0.01109735, "auxiliary_loss_mlp": 0.01034619, "balance_loss_clip": 1.02103508, "balance_loss_mlp": 1.03653598, "epoch": 0.7328423267698783, "flos": 21066600130560.0, "grad_norm": 2.226365417487294, "language_loss": 0.74810201, "learning_rate": 7.030076767014284e-07, "loss": 0.76954556, "num_input_tokens_seen": 263051445, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 12189, "time_per_iteration": 2.4647605419158936 }, { "auxiliary_loss_clip": 0.01106856, "auxiliary_loss_mlp": 0.01030251, "balance_loss_clip": 1.01709652, "balance_loss_mlp": 1.03616726, "epoch": 0.7329024500225462, "flos": 21689578638720.0, "grad_norm": 1.6499474649905548, "language_loss": 0.81971025, "learning_rate": 7.027112360815648e-07, "loss": 0.84108138, "num_input_tokens_seen": 263070835, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 12190, "time_per_iteration": 2.483705520629883 }, { "auxiliary_loss_clip": 0.01108163, "auxiliary_loss_mlp": 0.0103352, "balance_loss_clip": 1.01972711, "balance_loss_mlp": 1.03700471, "epoch": 0.7329625732752142, "flos": 24164038661760.0, "grad_norm": 1.8053080641827233, "language_loss": 0.71790618, "learning_rate": 7.024148446550204e-07, "loss": 0.73932302, "num_input_tokens_seen": 263090070, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7109375, "step": 12191, "time_per_iteration": 2.511237144470215 }, { "auxiliary_loss_clip": 0.01108362, "auxiliary_loss_mlp": 0.01034677, "balance_loss_clip": 1.02130222, "balance_loss_mlp": 1.03704572, "epoch": 0.7330226965278822, "flos": 30077915627520.0, "grad_norm": 1.7797514872473068, "language_loss": 0.69318306, "learning_rate": 7.021185024330361e-07, "loss": 0.71461344, "num_input_tokens_seen": 263110030, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 12192, "time_per_iteration": 2.5394680500030518 }, { "auxiliary_loss_clip": 0.01103405, "auxiliary_loss_mlp": 0.01032403, "balance_loss_clip": 1.01962948, "balance_loss_mlp": 1.03478551, "epoch": 0.7330828197805501, "flos": 23368294713600.0, "grad_norm": 1.600692301588623, "language_loss": 0.73452848, "learning_rate": 7.01822209426848e-07, "loss": 0.75588655, "num_input_tokens_seen": 263129735, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 12193, "time_per_iteration": 2.464014768600464 }, { "auxiliary_loss_clip": 0.01107391, "auxiliary_loss_mlp": 0.01029392, "balance_loss_clip": 1.01608872, "balance_loss_mlp": 1.03556454, "epoch": 0.7331429430332181, "flos": 21032808410880.0, "grad_norm": 2.5046186125846934, "language_loss": 0.7716406, "learning_rate": 7.015259656476911e-07, "loss": 0.79300845, "num_input_tokens_seen": 263149100, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 12194, "time_per_iteration": 2.46860408782959 }, { "auxiliary_loss_clip": 0.01106, "auxiliary_loss_mlp": 0.01031036, "balance_loss_clip": 1.01741624, "balance_loss_mlp": 1.03704071, "epoch": 0.733203066285886, "flos": 14647891466880.0, "grad_norm": 1.8335492995961202, "language_loss": 0.70468807, "learning_rate": 7.012297711067998e-07, "loss": 0.72605848, "num_input_tokens_seen": 263166620, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.6875, "step": 12195, "time_per_iteration": 2.429324150085449 }, { "auxiliary_loss_clip": 0.01106139, "auxiliary_loss_mlp": 0.01037076, "balance_loss_clip": 1.02499437, "balance_loss_mlp": 1.03564835, "epoch": 0.7332631895385541, "flos": 17165301177600.0, "grad_norm": 1.9888463682913804, "language_loss": 0.72034371, "learning_rate": 7.009336258154057e-07, "loss": 0.74177587, "num_input_tokens_seen": 263184780, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.70703125, "step": 12196, "time_per_iteration": 2.4597885608673096 }, { "auxiliary_loss_clip": 0.01105967, "auxiliary_loss_mlp": 0.01032202, "balance_loss_clip": 1.01868975, "balance_loss_mlp": 1.03717852, "epoch": 0.733323312791222, "flos": 28658151676800.0, "grad_norm": 2.794505391405738, "language_loss": 0.71749806, "learning_rate": 7.006375297847394e-07, "loss": 0.73887974, "num_input_tokens_seen": 263204625, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6875, "step": 12197, "time_per_iteration": 2.5110135078430176 }, { "auxiliary_loss_clip": 0.01111203, "auxiliary_loss_mlp": 0.01038894, "balance_loss_clip": 1.02466655, "balance_loss_mlp": 1.03760767, "epoch": 0.73338343604389, "flos": 16618417632000.0, "grad_norm": 2.013187351095544, "language_loss": 0.78274667, "learning_rate": 7.003414830260282e-07, "loss": 0.80424774, "num_input_tokens_seen": 263221565, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.734375, "step": 12198, "time_per_iteration": 2.449575424194336 }, { "auxiliary_loss_clip": 0.01106768, "auxiliary_loss_mlp": 0.0103428, "balance_loss_clip": 1.02156055, "balance_loss_mlp": 1.03625047, "epoch": 0.7334435592965579, "flos": 21142084561920.0, "grad_norm": 2.097706500351887, "language_loss": 0.74786448, "learning_rate": 7.000454855504974e-07, "loss": 0.76927501, "num_input_tokens_seen": 263240620, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 12199, "time_per_iteration": 2.4604744911193848 }, { "auxiliary_loss_clip": 0.01110586, "auxiliary_loss_mlp": 0.01036447, "balance_loss_clip": 1.02265465, "balance_loss_mlp": 1.03805423, "epoch": 0.7335036825492259, "flos": 17125332318720.0, "grad_norm": 2.372651567898504, "language_loss": 0.77227783, "learning_rate": 6.997495373693729e-07, "loss": 0.79374826, "num_input_tokens_seen": 263254365, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7265625, "step": 12200, "time_per_iteration": 2.4308371543884277 }, { "auxiliary_loss_clip": 0.0110657, "auxiliary_loss_mlp": 0.01036051, "balance_loss_clip": 1.02346873, "balance_loss_mlp": 1.03652346, "epoch": 0.7335638058018938, "flos": 23731818307200.0, "grad_norm": 1.6200169151031254, "language_loss": 0.6143015, "learning_rate": 6.994536384938754e-07, "loss": 0.6357277, "num_input_tokens_seen": 263275880, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 12201, "time_per_iteration": 2.4948182106018066 }, { "auxiliary_loss_clip": 0.01105779, "auxiliary_loss_mlp": 0.01028053, "balance_loss_clip": 1.01551807, "balance_loss_mlp": 1.03667259, "epoch": 0.7336239290545619, "flos": 34933289679360.0, "grad_norm": 2.540065163053765, "language_loss": 0.5213263, "learning_rate": 6.991577889352264e-07, "loss": 0.54266459, "num_input_tokens_seen": 263298315, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 12202, "time_per_iteration": 2.599525213241577 }, { "auxiliary_loss_clip": 0.01106122, "auxiliary_loss_mlp": 0.01031632, "balance_loss_clip": 1.01876378, "balance_loss_mlp": 1.03680968, "epoch": 0.7336840523072298, "flos": 21103049456640.0, "grad_norm": 3.028002118236044, "language_loss": 0.68708354, "learning_rate": 6.98861988704645e-07, "loss": 0.70846105, "num_input_tokens_seen": 263318615, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 12203, "time_per_iteration": 2.6194112300872803 }, { "auxiliary_loss_clip": 0.01110116, "auxiliary_loss_mlp": 0.01035531, "balance_loss_clip": 1.02198255, "balance_loss_mlp": 1.03640032, "epoch": 0.7337441755598978, "flos": 24024418496640.0, "grad_norm": 2.3294475274982864, "language_loss": 0.66199183, "learning_rate": 6.985662378133474e-07, "loss": 0.68344831, "num_input_tokens_seen": 263336705, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73828125, "step": 12204, "time_per_iteration": 2.4996707439422607 }, { "auxiliary_loss_clip": 0.01105693, "auxiliary_loss_mlp": 0.01033395, "balance_loss_clip": 1.02090216, "balance_loss_mlp": 1.03703666, "epoch": 0.7338042988125658, "flos": 22711309004160.0, "grad_norm": 2.0402968070798377, "language_loss": 0.77461708, "learning_rate": 6.982705362725479e-07, "loss": 0.79600799, "num_input_tokens_seen": 263355065, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 12205, "time_per_iteration": 2.4813292026519775 }, { "auxiliary_loss_clip": 0.01105576, "auxiliary_loss_mlp": 0.01033092, "balance_loss_clip": 1.02099228, "balance_loss_mlp": 1.03769279, "epoch": 0.7338644220652337, "flos": 21360996000000.0, "grad_norm": 1.8178801362980415, "language_loss": 0.79786325, "learning_rate": 6.979748840934601e-07, "loss": 0.81924999, "num_input_tokens_seen": 263374460, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 12206, "time_per_iteration": 2.4874610900878906 }, { "auxiliary_loss_clip": 0.01107681, "auxiliary_loss_mlp": 0.01029085, "balance_loss_clip": 1.01602006, "balance_loss_mlp": 1.03639877, "epoch": 0.7339245453179017, "flos": 30920236536960.0, "grad_norm": 2.0210072737861253, "language_loss": 0.7153309, "learning_rate": 6.976792812872958e-07, "loss": 0.73669857, "num_input_tokens_seen": 263393610, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 12207, "time_per_iteration": 2.5334537029266357 }, { "auxiliary_loss_clip": 0.01028066, "auxiliary_loss_mlp": 0.01001903, "balance_loss_clip": 1.00080585, "balance_loss_mlp": 1.00561011, "epoch": 0.7339846685705697, "flos": 67899429072000.0, "grad_norm": 0.858515532787226, "language_loss": 0.54828346, "learning_rate": 6.97383727865263e-07, "loss": 0.56858313, "num_input_tokens_seen": 263450340, "router_z_loss_clip": 0.01098633, "router_z_loss_mlp": 0.22460938, "step": 12208, "time_per_iteration": 3.1564784049987793 }, { "auxiliary_loss_clip": 0.01106525, "auxiliary_loss_mlp": 0.01030752, "balance_loss_clip": 1.01918888, "balance_loss_mlp": 1.03678656, "epoch": 0.7340447918232377, "flos": 22236749493120.0, "grad_norm": 1.3663325136522417, "language_loss": 0.80605888, "learning_rate": 6.970882238385703e-07, "loss": 0.82743162, "num_input_tokens_seen": 263471735, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6953125, "step": 12209, "time_per_iteration": 2.4833245277404785 }, { "auxiliary_loss_clip": 0.01102589, "auxiliary_loss_mlp": 0.01033075, "balance_loss_clip": 1.02126169, "balance_loss_mlp": 1.03407431, "epoch": 0.7341049150759056, "flos": 23764784014080.0, "grad_norm": 1.5632918602113925, "language_loss": 0.79178107, "learning_rate": 6.96792769218423e-07, "loss": 0.81313765, "num_input_tokens_seen": 263493245, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 12210, "time_per_iteration": 2.497009754180908 }, { "auxiliary_loss_clip": 0.01105692, "auxiliary_loss_mlp": 0.01033641, "balance_loss_clip": 1.02083826, "balance_loss_mlp": 1.0373491, "epoch": 0.7341650383285736, "flos": 17236547804160.0, "grad_norm": 1.8735177605456728, "language_loss": 0.76267755, "learning_rate": 6.964973640160236e-07, "loss": 0.78407091, "num_input_tokens_seen": 263511660, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.68359375, "step": 12211, "time_per_iteration": 2.4443581104278564 }, { "auxiliary_loss_clip": 0.01108227, "auxiliary_loss_mlp": 0.01029976, "balance_loss_clip": 1.01676178, "balance_loss_mlp": 1.03708494, "epoch": 0.7342251615812415, "flos": 23403953940480.0, "grad_norm": 2.4397552918987864, "language_loss": 0.72158659, "learning_rate": 6.962020082425748e-07, "loss": 0.74296856, "num_input_tokens_seen": 263530875, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 12212, "time_per_iteration": 5.4507293701171875 }, { "auxiliary_loss_clip": 0.01108934, "auxiliary_loss_mlp": 0.01038242, "balance_loss_clip": 1.02520025, "balance_loss_mlp": 1.03898418, "epoch": 0.7342852848339095, "flos": 22747183712640.0, "grad_norm": 1.7031196606043122, "language_loss": 0.68866897, "learning_rate": 6.959067019092766e-07, "loss": 0.71014071, "num_input_tokens_seen": 263551585, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 12213, "time_per_iteration": 2.499232530593872 }, { "auxiliary_loss_clip": 0.01028487, "auxiliary_loss_mlp": 0.01004675, "balance_loss_clip": 1.00355995, "balance_loss_mlp": 1.0061307, "epoch": 0.7343454080865774, "flos": 53942353925760.0, "grad_norm": 0.7526634490569281, "language_loss": 0.54350364, "learning_rate": 6.956114450273276e-07, "loss": 0.56383514, "num_input_tokens_seen": 263609545, "router_z_loss_clip": 0.01116943, "router_z_loss_mlp": 0.22363281, "step": 12214, "time_per_iteration": 2.9759738445281982 }, { "auxiliary_loss_clip": 0.01108614, "auxiliary_loss_mlp": 0.01029592, "balance_loss_clip": 1.01706946, "balance_loss_mlp": 1.03621316, "epoch": 0.7344055313392455, "flos": 12166859255040.0, "grad_norm": 2.620394621896684, "language_loss": 0.70654643, "learning_rate": 6.953162376079233e-07, "loss": 0.72792852, "num_input_tokens_seen": 263627880, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 12215, "time_per_iteration": 3.896139621734619 }, { "auxiliary_loss_clip": 0.01104484, "auxiliary_loss_mlp": 0.01033361, "balance_loss_clip": 1.02173257, "balance_loss_mlp": 1.03687072, "epoch": 0.7344656545919134, "flos": 18550052346240.0, "grad_norm": 1.679428113853521, "language_loss": 0.72816885, "learning_rate": 6.950210796622573e-07, "loss": 0.74954724, "num_input_tokens_seen": 263645665, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 12216, "time_per_iteration": 3.8967838287353516 }, { "auxiliary_loss_clip": 0.01112924, "auxiliary_loss_mlp": 0.01041007, "balance_loss_clip": 1.02556968, "balance_loss_mlp": 1.03736484, "epoch": 0.7345257778445814, "flos": 23661649088640.0, "grad_norm": 1.8692408389912005, "language_loss": 0.78232431, "learning_rate": 6.947259712015236e-07, "loss": 0.80386364, "num_input_tokens_seen": 263668170, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.7578125, "step": 12217, "time_per_iteration": 2.5347187519073486 }, { "auxiliary_loss_clip": 0.01103334, "auxiliary_loss_mlp": 0.01029334, "balance_loss_clip": 1.01776493, "balance_loss_mlp": 1.03470898, "epoch": 0.7345859010972494, "flos": 13808659127040.0, "grad_norm": 2.1145893026458986, "language_loss": 0.7802698, "learning_rate": 6.94430912236911e-07, "loss": 0.80159646, "num_input_tokens_seen": 263684190, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6875, "step": 12218, "time_per_iteration": 2.424739360809326 }, { "auxiliary_loss_clip": 0.01104482, "auxiliary_loss_mlp": 0.01031675, "balance_loss_clip": 1.01893139, "balance_loss_mlp": 1.03625154, "epoch": 0.7346460243499173, "flos": 22272731942400.0, "grad_norm": 2.4234537743997433, "language_loss": 0.71976745, "learning_rate": 6.941359027796092e-07, "loss": 0.74112904, "num_input_tokens_seen": 263702095, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6796875, "step": 12219, "time_per_iteration": 2.4712324142456055 }, { "auxiliary_loss_clip": 0.01101994, "auxiliary_loss_mlp": 0.01034334, "balance_loss_clip": 1.02184641, "balance_loss_mlp": 1.03486538, "epoch": 0.7347061476025853, "flos": 23255247634560.0, "grad_norm": 1.7806106084778346, "language_loss": 0.74952251, "learning_rate": 6.938409428408061e-07, "loss": 0.77088583, "num_input_tokens_seen": 263721385, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.671875, "step": 12220, "time_per_iteration": 2.4866764545440674 }, { "auxiliary_loss_clip": 0.01108316, "auxiliary_loss_mlp": 0.01031698, "balance_loss_clip": 1.01891255, "balance_loss_mlp": 1.03696656, "epoch": 0.7347662708552533, "flos": 15267565923840.0, "grad_norm": 1.7160586683470196, "language_loss": 0.65972614, "learning_rate": 6.93546032431684e-07, "loss": 0.6811263, "num_input_tokens_seen": 263737835, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 12221, "time_per_iteration": 2.4537675380706787 }, { "auxiliary_loss_clip": 0.01106719, "auxiliary_loss_mlp": 0.01032046, "balance_loss_clip": 1.01932621, "balance_loss_mlp": 1.03662193, "epoch": 0.7348263941079213, "flos": 24859987649280.0, "grad_norm": 1.7611362727325937, "language_loss": 0.69044566, "learning_rate": 6.932511715634273e-07, "loss": 0.71183336, "num_input_tokens_seen": 263756480, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 12222, "time_per_iteration": 2.5062856674194336 }, { "auxiliary_loss_clip": 0.01104631, "auxiliary_loss_mlp": 0.01030005, "balance_loss_clip": 1.01876378, "balance_loss_mlp": 1.03571105, "epoch": 0.7348865173605892, "flos": 24352103295360.0, "grad_norm": 1.7181686448498326, "language_loss": 0.6589641, "learning_rate": 6.92956360247217e-07, "loss": 0.68031049, "num_input_tokens_seen": 263776440, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.69140625, "step": 12223, "time_per_iteration": 2.483398675918579 }, { "auxiliary_loss_clip": 0.01107463, "auxiliary_loss_mlp": 0.01028012, "balance_loss_clip": 1.0156498, "balance_loss_mlp": 1.03698802, "epoch": 0.7349466406132572, "flos": 20004613597440.0, "grad_norm": 2.382042676399798, "language_loss": 0.723894, "learning_rate": 6.926615984942332e-07, "loss": 0.74524879, "num_input_tokens_seen": 263793700, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 12224, "time_per_iteration": 2.452169895172119 }, { "auxiliary_loss_clip": 0.01108619, "auxiliary_loss_mlp": 0.01031416, "balance_loss_clip": 1.0186727, "balance_loss_mlp": 1.03840995, "epoch": 0.7350067638659251, "flos": 29825068815360.0, "grad_norm": 5.547483194341759, "language_loss": 0.72559083, "learning_rate": 6.92366886315652e-07, "loss": 0.74699116, "num_input_tokens_seen": 263814620, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 12225, "time_per_iteration": 2.509783983230591 }, { "auxiliary_loss_clip": 0.01110937, "auxiliary_loss_mlp": 0.01032356, "balance_loss_clip": 1.01832473, "balance_loss_mlp": 1.03762603, "epoch": 0.7350668871185931, "flos": 21866150920320.0, "grad_norm": 1.745622400507246, "language_loss": 0.76272857, "learning_rate": 6.920722237226501e-07, "loss": 0.78416151, "num_input_tokens_seen": 263832725, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.734375, "step": 12226, "time_per_iteration": 2.4657506942749023 }, { "auxiliary_loss_clip": 0.01105958, "auxiliary_loss_mlp": 0.01029489, "balance_loss_clip": 1.01644135, "balance_loss_mlp": 1.03613353, "epoch": 0.735127010371261, "flos": 22566122231040.0, "grad_norm": 1.6821295160008247, "language_loss": 0.67217338, "learning_rate": 6.917776107264008e-07, "loss": 0.69352788, "num_input_tokens_seen": 263853850, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 12227, "time_per_iteration": 2.4759042263031006 }, { "auxiliary_loss_clip": 0.01107729, "auxiliary_loss_mlp": 0.01033218, "balance_loss_clip": 1.02071345, "balance_loss_mlp": 1.03615093, "epoch": 0.7351871336239291, "flos": 25884339707520.0, "grad_norm": 1.5724051240056554, "language_loss": 0.63804561, "learning_rate": 6.914830473380749e-07, "loss": 0.65945506, "num_input_tokens_seen": 263874760, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71484375, "step": 12228, "time_per_iteration": 2.524731159210205 }, { "auxiliary_loss_clip": 0.01105424, "auxiliary_loss_mlp": 0.010389, "balance_loss_clip": 1.0269258, "balance_loss_mlp": 1.03597772, "epoch": 0.735247256876597, "flos": 17932173569280.0, "grad_norm": 1.5769079058346327, "language_loss": 0.63352263, "learning_rate": 6.911885335688427e-07, "loss": 0.65496588, "num_input_tokens_seen": 263893390, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 12229, "time_per_iteration": 2.4635565280914307 }, { "auxiliary_loss_clip": 0.01108163, "auxiliary_loss_mlp": 0.01038582, "balance_loss_clip": 1.02494454, "balance_loss_mlp": 1.03659844, "epoch": 0.735307380129265, "flos": 28875159694080.0, "grad_norm": 2.019116446274335, "language_loss": 0.73674691, "learning_rate": 6.908940694298726e-07, "loss": 0.75821435, "num_input_tokens_seen": 263911180, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 12230, "time_per_iteration": 2.523282766342163 }, { "auxiliary_loss_clip": 0.01109311, "auxiliary_loss_mlp": 0.0103357, "balance_loss_clip": 1.02055883, "balance_loss_mlp": 1.03856695, "epoch": 0.7353675033819329, "flos": 13625658311040.0, "grad_norm": 2.0903362809066275, "language_loss": 0.72286808, "learning_rate": 6.90599654932332e-07, "loss": 0.74429691, "num_input_tokens_seen": 263928975, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 12231, "time_per_iteration": 2.4416704177856445 }, { "auxiliary_loss_clip": 0.01109261, "auxiliary_loss_mlp": 0.01041062, "balance_loss_clip": 1.02653027, "balance_loss_mlp": 1.03807867, "epoch": 0.7354276266346009, "flos": 19463081178240.0, "grad_norm": 2.6629417609625183, "language_loss": 0.63932496, "learning_rate": 6.903052900873823e-07, "loss": 0.66082823, "num_input_tokens_seen": 263944495, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.7109375, "step": 12232, "time_per_iteration": 2.4471871852874756 }, { "auxiliary_loss_clip": 0.01107469, "auxiliary_loss_mlp": 0.01032137, "balance_loss_clip": 1.01908374, "balance_loss_mlp": 1.03613353, "epoch": 0.735487749887269, "flos": 15771858917760.0, "grad_norm": 2.2905249409451662, "language_loss": 0.75635099, "learning_rate": 6.900109749061874e-07, "loss": 0.77774704, "num_input_tokens_seen": 263961325, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 12233, "time_per_iteration": 2.4329793453216553 }, { "auxiliary_loss_clip": 0.01105481, "auxiliary_loss_mlp": 0.01029158, "balance_loss_clip": 1.01593733, "balance_loss_mlp": 1.03520072, "epoch": 0.7355478731399369, "flos": 18260648467200.0, "grad_norm": 1.6480024047920965, "language_loss": 0.73598778, "learning_rate": 6.897167093999079e-07, "loss": 0.75733411, "num_input_tokens_seen": 263980445, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 12234, "time_per_iteration": 2.4526331424713135 }, { "auxiliary_loss_clip": 0.01108824, "auxiliary_loss_mlp": 0.01033426, "balance_loss_clip": 1.0206883, "balance_loss_mlp": 1.03818071, "epoch": 0.7356079963926049, "flos": 26542043688960.0, "grad_norm": 4.804966500048253, "language_loss": 0.59984279, "learning_rate": 6.894224935797017e-07, "loss": 0.62126535, "num_input_tokens_seen": 263999330, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 12235, "time_per_iteration": 2.522738218307495 }, { "auxiliary_loss_clip": 0.01106371, "auxiliary_loss_mlp": 0.01029897, "balance_loss_clip": 1.01732051, "balance_loss_mlp": 1.03757238, "epoch": 0.7356681196452728, "flos": 10778624467200.0, "grad_norm": 2.9580275886394314, "language_loss": 0.85868037, "learning_rate": 6.891283274567259e-07, "loss": 0.88004309, "num_input_tokens_seen": 264014150, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 12236, "time_per_iteration": 2.458106517791748 }, { "auxiliary_loss_clip": 0.01106716, "auxiliary_loss_mlp": 0.01030227, "balance_loss_clip": 1.01714396, "balance_loss_mlp": 1.0358665, "epoch": 0.7357282428979408, "flos": 19718693337600.0, "grad_norm": 1.8339663430756743, "language_loss": 0.68998563, "learning_rate": 6.888342110421364e-07, "loss": 0.71135509, "num_input_tokens_seen": 264033140, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 12237, "time_per_iteration": 2.517176389694214 }, { "auxiliary_loss_clip": 0.01106447, "auxiliary_loss_mlp": 0.01029678, "balance_loss_clip": 1.01730371, "balance_loss_mlp": 1.03553939, "epoch": 0.7357883661506087, "flos": 19464014931840.0, "grad_norm": 1.691557519950956, "language_loss": 0.72181755, "learning_rate": 6.885401443470839e-07, "loss": 0.74317878, "num_input_tokens_seen": 264052105, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 12238, "time_per_iteration": 2.5045626163482666 }, { "auxiliary_loss_clip": 0.01110149, "auxiliary_loss_mlp": 0.01032366, "balance_loss_clip": 1.01901484, "balance_loss_mlp": 1.03560197, "epoch": 0.7358484894032767, "flos": 27123006263040.0, "grad_norm": 1.836844154623025, "language_loss": 0.72285664, "learning_rate": 6.882461273827205e-07, "loss": 0.74428177, "num_input_tokens_seen": 264070690, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 12239, "time_per_iteration": 2.5410091876983643 }, { "auxiliary_loss_clip": 0.01103131, "auxiliary_loss_mlp": 0.01029163, "balance_loss_clip": 1.01633048, "balance_loss_mlp": 1.03584313, "epoch": 0.7359086126559446, "flos": 24502282058880.0, "grad_norm": 1.5514003857068408, "language_loss": 0.79209435, "learning_rate": 6.879521601601954e-07, "loss": 0.81341732, "num_input_tokens_seen": 264094225, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.671875, "step": 12240, "time_per_iteration": 2.54789137840271 }, { "auxiliary_loss_clip": 0.01106899, "auxiliary_loss_mlp": 0.01037848, "balance_loss_clip": 1.02469277, "balance_loss_mlp": 1.03720498, "epoch": 0.7359687359086127, "flos": 23331270769920.0, "grad_norm": 1.958384156680753, "language_loss": 0.82925558, "learning_rate": 6.876582426906565e-07, "loss": 0.850703, "num_input_tokens_seen": 264113190, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6953125, "step": 12241, "time_per_iteration": 2.4614269733428955 }, { "auxiliary_loss_clip": 0.01104415, "auxiliary_loss_mlp": 0.01026529, "balance_loss_clip": 1.01378512, "balance_loss_mlp": 1.0353353, "epoch": 0.7360288591612806, "flos": 20193396503040.0, "grad_norm": 2.1145464284550957, "language_loss": 0.78505993, "learning_rate": 6.873643749852484e-07, "loss": 0.80636942, "num_input_tokens_seen": 264132050, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 12242, "time_per_iteration": 2.513878345489502 }, { "auxiliary_loss_clip": 0.0110675, "auxiliary_loss_mlp": 0.01031323, "balance_loss_clip": 1.01881194, "balance_loss_mlp": 1.03699076, "epoch": 0.7360889824139486, "flos": 24972783333120.0, "grad_norm": 1.9018300154106114, "language_loss": 0.79378581, "learning_rate": 6.870705570551145e-07, "loss": 0.81516647, "num_input_tokens_seen": 264152800, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 12243, "time_per_iteration": 2.5135393142700195 }, { "auxiliary_loss_clip": 0.01106789, "auxiliary_loss_mlp": 0.01035149, "balance_loss_clip": 1.02195215, "balance_loss_mlp": 1.03507781, "epoch": 0.7361491056666165, "flos": 15012312900480.0, "grad_norm": 2.0566846357358, "language_loss": 0.74348468, "learning_rate": 6.867767889113969e-07, "loss": 0.76490408, "num_input_tokens_seen": 264169650, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 12244, "time_per_iteration": 2.4383528232574463 }, { "auxiliary_loss_clip": 0.01106077, "auxiliary_loss_mlp": 0.01030975, "balance_loss_clip": 1.01785612, "balance_loss_mlp": 1.03508759, "epoch": 0.7362092289192845, "flos": 22930400010240.0, "grad_norm": 1.6849810335789353, "language_loss": 0.69620639, "learning_rate": 6.864830705652347e-07, "loss": 0.71757686, "num_input_tokens_seen": 264190530, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 12245, "time_per_iteration": 2.468510866165161 }, { "auxiliary_loss_clip": 0.0110253, "auxiliary_loss_mlp": 0.01033579, "balance_loss_clip": 1.0206331, "balance_loss_mlp": 1.03557253, "epoch": 0.7362693521719526, "flos": 20702681487360.0, "grad_norm": 1.5544753486080114, "language_loss": 0.73137128, "learning_rate": 6.861894020277658e-07, "loss": 0.7527324, "num_input_tokens_seen": 264210820, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.66796875, "step": 12246, "time_per_iteration": 2.4623239040374756 }, { "auxiliary_loss_clip": 0.01100967, "auxiliary_loss_mlp": 0.01025818, "balance_loss_clip": 1.01394439, "balance_loss_mlp": 1.03409183, "epoch": 0.7363294754246205, "flos": 13111381336320.0, "grad_norm": 2.1653276887110047, "language_loss": 0.73393804, "learning_rate": 6.858957833101266e-07, "loss": 0.75520587, "num_input_tokens_seen": 264227430, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.66796875, "step": 12247, "time_per_iteration": 2.398660182952881 }, { "auxiliary_loss_clip": 0.01106384, "auxiliary_loss_mlp": 0.01030871, "balance_loss_clip": 1.01885462, "balance_loss_mlp": 1.03852797, "epoch": 0.7363895986772885, "flos": 14027426910720.0, "grad_norm": 1.6411279533769991, "language_loss": 0.74249834, "learning_rate": 6.856022144234526e-07, "loss": 0.76387089, "num_input_tokens_seen": 264245230, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 12248, "time_per_iteration": 2.4394140243530273 }, { "auxiliary_loss_clip": 0.01105725, "auxiliary_loss_mlp": 0.01038245, "balance_loss_clip": 1.0249474, "balance_loss_mlp": 1.0353086, "epoch": 0.7364497219299564, "flos": 19719986227200.0, "grad_norm": 1.971024298590856, "language_loss": 0.72504652, "learning_rate": 6.853086953788727e-07, "loss": 0.74648625, "num_input_tokens_seen": 264263945, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 12249, "time_per_iteration": 2.441441059112549 }, { "auxiliary_loss_clip": 0.01106779, "auxiliary_loss_mlp": 0.01032103, "balance_loss_clip": 1.01919866, "balance_loss_mlp": 1.03676164, "epoch": 0.7365098451826244, "flos": 21361391049600.0, "grad_norm": 1.7379325792332563, "language_loss": 0.76942956, "learning_rate": 6.850152261875189e-07, "loss": 0.79081833, "num_input_tokens_seen": 264281500, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 12250, "time_per_iteration": 2.4594104290008545 }, { "auxiliary_loss_clip": 0.01108697, "auxiliary_loss_mlp": 0.01029544, "balance_loss_clip": 1.0161922, "balance_loss_mlp": 1.03687644, "epoch": 0.7365699684352923, "flos": 23368222886400.0, "grad_norm": 1.705524749907447, "language_loss": 0.71594918, "learning_rate": 6.8472180686052e-07, "loss": 0.73733163, "num_input_tokens_seen": 264301625, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 12251, "time_per_iteration": 2.4722402095794678 }, { "auxiliary_loss_clip": 0.01104375, "auxiliary_loss_mlp": 0.01033828, "balance_loss_clip": 1.02112639, "balance_loss_mlp": 1.03553367, "epoch": 0.7366300916879603, "flos": 59524879927680.0, "grad_norm": 1.6105010422228432, "language_loss": 0.65600491, "learning_rate": 6.844284374090015e-07, "loss": 0.67738694, "num_input_tokens_seen": 264323975, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 12252, "time_per_iteration": 2.7938132286071777 }, { "auxiliary_loss_clip": 0.0111141, "auxiliary_loss_mlp": 0.0103089, "balance_loss_clip": 1.01808119, "balance_loss_mlp": 1.03999424, "epoch": 0.7366902149406283, "flos": 20923137210240.0, "grad_norm": 2.4395542641065697, "language_loss": 0.79161209, "learning_rate": 6.841351178440884e-07, "loss": 0.81303513, "num_input_tokens_seen": 264343785, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 12253, "time_per_iteration": 3.914067268371582 }, { "auxiliary_loss_clip": 0.01100153, "auxiliary_loss_mlp": 0.01028944, "balance_loss_clip": 1.01674914, "balance_loss_mlp": 1.03366375, "epoch": 0.7367503381932963, "flos": 17348158339200.0, "grad_norm": 2.3436639962974386, "language_loss": 0.76165026, "learning_rate": 6.83841848176905e-07, "loss": 0.78294122, "num_input_tokens_seen": 264361130, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6640625, "step": 12254, "time_per_iteration": 3.872436046600342 }, { "auxiliary_loss_clip": 0.01105759, "auxiliary_loss_mlp": 0.01037145, "balance_loss_clip": 1.024086, "balance_loss_mlp": 1.03625941, "epoch": 0.7368104614459642, "flos": 17821317219840.0, "grad_norm": 2.942416735372118, "language_loss": 0.69822484, "learning_rate": 6.835486284185692e-07, "loss": 0.71965396, "num_input_tokens_seen": 264376965, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 12255, "time_per_iteration": 2.5044362545013428 }, { "auxiliary_loss_clip": 0.01107262, "auxiliary_loss_mlp": 0.0103068, "balance_loss_clip": 1.0172689, "balance_loss_mlp": 1.03671932, "epoch": 0.7368705846986322, "flos": 24606099342720.0, "grad_norm": 2.053028743074902, "language_loss": 0.75382823, "learning_rate": 6.832554585802012e-07, "loss": 0.77520764, "num_input_tokens_seen": 264396310, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.703125, "step": 12256, "time_per_iteration": 2.5353121757507324 }, { "auxiliary_loss_clip": 0.01106494, "auxiliary_loss_mlp": 0.01030829, "balance_loss_clip": 1.01769209, "balance_loss_mlp": 1.03641891, "epoch": 0.7369307079513001, "flos": 34970169968640.0, "grad_norm": 1.8265382971917976, "language_loss": 0.7344169, "learning_rate": 6.829623386729182e-07, "loss": 0.75579017, "num_input_tokens_seen": 264418085, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 12257, "time_per_iteration": 3.9772233963012695 }, { "auxiliary_loss_clip": 0.01104765, "auxiliary_loss_mlp": 0.01034677, "balance_loss_clip": 1.02221346, "balance_loss_mlp": 1.0353601, "epoch": 0.7369908312039681, "flos": 21214588164480.0, "grad_norm": 1.4940420166411428, "language_loss": 0.77860129, "learning_rate": 6.826692687078362e-07, "loss": 0.79999572, "num_input_tokens_seen": 264437595, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 12258, "time_per_iteration": 4.047146558761597 }, { "auxiliary_loss_clip": 0.01109474, "auxiliary_loss_mlp": 0.01034697, "balance_loss_clip": 1.02176309, "balance_loss_mlp": 1.03747165, "epoch": 0.7370509544566362, "flos": 23623655477760.0, "grad_norm": 1.4858893746070092, "language_loss": 0.66241944, "learning_rate": 6.823762486960674e-07, "loss": 0.68386114, "num_input_tokens_seen": 264457385, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 12259, "time_per_iteration": 2.5020015239715576 }, { "auxiliary_loss_clip": 0.0110655, "auxiliary_loss_mlp": 0.01033895, "balance_loss_clip": 1.02023315, "balance_loss_mlp": 1.03663993, "epoch": 0.7371110777093041, "flos": 24827704300800.0, "grad_norm": 1.8019438391364373, "language_loss": 0.7356174, "learning_rate": 6.820832786487225e-07, "loss": 0.75702178, "num_input_tokens_seen": 264477205, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.69921875, "step": 12260, "time_per_iteration": 2.5420687198638916 }, { "auxiliary_loss_clip": 0.01109373, "auxiliary_loss_mlp": 0.01029085, "balance_loss_clip": 1.01662755, "balance_loss_mlp": 1.03733635, "epoch": 0.7371712009619721, "flos": 23149491016320.0, "grad_norm": 1.6979401138497634, "language_loss": 0.73460329, "learning_rate": 6.817903585769125e-07, "loss": 0.75598788, "num_input_tokens_seen": 264497195, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.71875, "step": 12261, "time_per_iteration": 2.4634222984313965 }, { "auxiliary_loss_clip": 0.0110952, "auxiliary_loss_mlp": 0.01034897, "balance_loss_clip": 1.02084827, "balance_loss_mlp": 1.03738737, "epoch": 0.73723132421464, "flos": 23112898035840.0, "grad_norm": 2.4491391236556335, "language_loss": 0.67702746, "learning_rate": 6.814974884917438e-07, "loss": 0.69847167, "num_input_tokens_seen": 264516950, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.72265625, "step": 12262, "time_per_iteration": 2.4501874446868896 }, { "auxiliary_loss_clip": 0.01106241, "auxiliary_loss_mlp": 0.01033461, "balance_loss_clip": 1.01999664, "balance_loss_mlp": 1.03554082, "epoch": 0.737291447467308, "flos": 19273328605440.0, "grad_norm": 3.1224129829778926, "language_loss": 0.88724476, "learning_rate": 6.81204668404322e-07, "loss": 0.90864182, "num_input_tokens_seen": 264532675, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.70703125, "step": 12263, "time_per_iteration": 2.4262733459472656 }, { "auxiliary_loss_clip": 0.01101171, "auxiliary_loss_mlp": 0.01024586, "balance_loss_clip": 1.01340461, "balance_loss_mlp": 1.03574896, "epoch": 0.7373515707199759, "flos": 25118257415040.0, "grad_norm": 1.5844983488447475, "language_loss": 0.67210066, "learning_rate": 6.809118983257522e-07, "loss": 0.69335818, "num_input_tokens_seen": 264555635, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.65625, "step": 12264, "time_per_iteration": 2.5285656452178955 }, { "auxiliary_loss_clip": 0.01100673, "auxiliary_loss_mlp": 0.01028736, "balance_loss_clip": 1.01656461, "balance_loss_mlp": 1.03350568, "epoch": 0.737411693972644, "flos": 32408481767040.0, "grad_norm": 1.7396610109435346, "language_loss": 0.80267841, "learning_rate": 6.806191782671356e-07, "loss": 0.82397246, "num_input_tokens_seen": 264573140, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.671875, "step": 12265, "time_per_iteration": 2.547168254852295 }, { "auxiliary_loss_clip": 0.01110378, "auxiliary_loss_mlp": 0.01030693, "balance_loss_clip": 1.01761532, "balance_loss_mlp": 1.037045, "epoch": 0.7374718172253119, "flos": 24315797623680.0, "grad_norm": 1.7179025669006502, "language_loss": 0.74810296, "learning_rate": 6.803265082395711e-07, "loss": 0.76951367, "num_input_tokens_seen": 264591610, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 12266, "time_per_iteration": 2.4851667881011963 }, { "auxiliary_loss_clip": 0.01107425, "auxiliary_loss_mlp": 0.01037111, "balance_loss_clip": 1.0232172, "balance_loss_mlp": 1.03685236, "epoch": 0.7375319404779799, "flos": 27156115624320.0, "grad_norm": 5.0246324264479165, "language_loss": 0.73546958, "learning_rate": 6.800338882541576e-07, "loss": 0.75691497, "num_input_tokens_seen": 264611170, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.703125, "step": 12267, "time_per_iteration": 2.5037550926208496 }, { "auxiliary_loss_clip": 0.01103804, "auxiliary_loss_mlp": 0.01033823, "balance_loss_clip": 1.02149689, "balance_loss_mlp": 1.0352025, "epoch": 0.7375920637306478, "flos": 18879999701760.0, "grad_norm": 2.051017467743726, "language_loss": 0.83256173, "learning_rate": 6.797413183219923e-07, "loss": 0.85393798, "num_input_tokens_seen": 264629365, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 12268, "time_per_iteration": 2.455674171447754 }, { "auxiliary_loss_clip": 0.01104329, "auxiliary_loss_mlp": 0.0103965, "balance_loss_clip": 1.02705538, "balance_loss_mlp": 1.03602552, "epoch": 0.7376521869833158, "flos": 15669765486720.0, "grad_norm": 1.7993395562201806, "language_loss": 0.73433769, "learning_rate": 6.794487984541677e-07, "loss": 0.75577748, "num_input_tokens_seen": 264647915, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 12269, "time_per_iteration": 2.4376535415649414 }, { "auxiliary_loss_clip": 0.01108628, "auxiliary_loss_mlp": 0.01032946, "balance_loss_clip": 1.01972556, "balance_loss_mlp": 1.0366199, "epoch": 0.7377123102359837, "flos": 36971973901440.0, "grad_norm": 2.962181949315502, "language_loss": 0.70427144, "learning_rate": 6.791563286617776e-07, "loss": 0.72568715, "num_input_tokens_seen": 264669620, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 12270, "time_per_iteration": 2.5871057510375977 }, { "auxiliary_loss_clip": 0.01104688, "auxiliary_loss_mlp": 0.01030469, "balance_loss_clip": 1.01890528, "balance_loss_mlp": 1.0359776, "epoch": 0.7377724334886517, "flos": 24496284487680.0, "grad_norm": 1.7937761113701545, "language_loss": 0.69403195, "learning_rate": 6.788639089559119e-07, "loss": 0.71538353, "num_input_tokens_seen": 264689345, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6875, "step": 12271, "time_per_iteration": 2.4841175079345703 }, { "auxiliary_loss_clip": 0.01108269, "auxiliary_loss_mlp": 0.0103301, "balance_loss_clip": 1.01934254, "balance_loss_mlp": 1.03705072, "epoch": 0.7378325567413198, "flos": 24390025079040.0, "grad_norm": 2.2710229244133524, "language_loss": 0.67714137, "learning_rate": 6.785715393476586e-07, "loss": 0.69855416, "num_input_tokens_seen": 264707625, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7109375, "step": 12272, "time_per_iteration": 2.4889111518859863 }, { "auxiliary_loss_clip": 0.01102958, "auxiliary_loss_mlp": 0.01027755, "balance_loss_clip": 1.01520205, "balance_loss_mlp": 1.03541422, "epoch": 0.7378926799939877, "flos": 17416388223360.0, "grad_norm": 1.6751954951842671, "language_loss": 0.78259641, "learning_rate": 6.782792198481049e-07, "loss": 0.80390358, "num_input_tokens_seen": 264725575, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.67578125, "step": 12273, "time_per_iteration": 2.44964337348938 }, { "auxiliary_loss_clip": 0.01104319, "auxiliary_loss_mlp": 0.01032028, "balance_loss_clip": 1.01979673, "balance_loss_mlp": 1.03508031, "epoch": 0.7379528032466557, "flos": 18474208778880.0, "grad_norm": 2.081839840566918, "language_loss": 0.83643109, "learning_rate": 6.779869504683355e-07, "loss": 0.85779452, "num_input_tokens_seen": 264742855, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 12274, "time_per_iteration": 2.4770619869232178 }, { "auxiliary_loss_clip": 0.01113123, "auxiliary_loss_mlp": 0.01032211, "balance_loss_clip": 1.01811981, "balance_loss_mlp": 1.03865755, "epoch": 0.7380129264993236, "flos": 17821999578240.0, "grad_norm": 2.0036617350056756, "language_loss": 0.73636961, "learning_rate": 6.776947312194341e-07, "loss": 0.75782287, "num_input_tokens_seen": 264761155, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7421875, "step": 12275, "time_per_iteration": 2.4469728469848633 }, { "auxiliary_loss_clip": 0.01110002, "auxiliary_loss_mlp": 0.01039311, "balance_loss_clip": 1.02578056, "balance_loss_mlp": 1.03699231, "epoch": 0.7380730497519916, "flos": 22997372918400.0, "grad_norm": 1.7939975154265813, "language_loss": 0.73279268, "learning_rate": 6.774025621124813e-07, "loss": 0.75428581, "num_input_tokens_seen": 264780660, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 12276, "time_per_iteration": 2.480069637298584 }, { "auxiliary_loss_clip": 0.01107952, "auxiliary_loss_mlp": 0.01031946, "balance_loss_clip": 1.01884472, "balance_loss_mlp": 1.03660548, "epoch": 0.7381331730046595, "flos": 20266259241600.0, "grad_norm": 2.316023414722722, "language_loss": 0.77511227, "learning_rate": 6.771104431585551e-07, "loss": 0.79651129, "num_input_tokens_seen": 264798850, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 12277, "time_per_iteration": 2.4695308208465576 }, { "auxiliary_loss_clip": 0.01105109, "auxiliary_loss_mlp": 0.01041018, "balance_loss_clip": 1.0280478, "balance_loss_mlp": 1.03698969, "epoch": 0.7381932962573275, "flos": 19754532132480.0, "grad_norm": 2.0997677313865806, "language_loss": 0.78666162, "learning_rate": 6.768183743687338e-07, "loss": 0.80812293, "num_input_tokens_seen": 264816795, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6796875, "step": 12278, "time_per_iteration": 2.4501829147338867 }, { "auxiliary_loss_clip": 0.01106806, "auxiliary_loss_mlp": 0.01036439, "balance_loss_clip": 1.02326655, "balance_loss_mlp": 1.0354203, "epoch": 0.7382534195099955, "flos": 17305316392320.0, "grad_norm": 2.7009917973537534, "language_loss": 0.72025484, "learning_rate": 6.765263557540921e-07, "loss": 0.7416873, "num_input_tokens_seen": 264834105, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 12279, "time_per_iteration": 2.4309113025665283 }, { "auxiliary_loss_clip": 0.01106956, "auxiliary_loss_mlp": 0.01036706, "balance_loss_clip": 1.02308631, "balance_loss_mlp": 1.03433895, "epoch": 0.7383135427626635, "flos": 18697358021760.0, "grad_norm": 2.4253859227066608, "language_loss": 0.85580784, "learning_rate": 6.762343873257034e-07, "loss": 0.87724447, "num_input_tokens_seen": 264850895, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 12280, "time_per_iteration": 2.4714086055755615 }, { "auxiliary_loss_clip": 0.01106541, "auxiliary_loss_mlp": 0.01028708, "balance_loss_clip": 1.0158453, "balance_loss_mlp": 1.0359478, "epoch": 0.7383736660153314, "flos": 20881300844160.0, "grad_norm": 2.172561313323175, "language_loss": 0.72459185, "learning_rate": 6.759424690946408e-07, "loss": 0.74594426, "num_input_tokens_seen": 264869505, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 12281, "time_per_iteration": 2.4740755558013916 }, { "auxiliary_loss_clip": 0.01107556, "auxiliary_loss_mlp": 0.01034152, "balance_loss_clip": 1.02158165, "balance_loss_mlp": 1.03589153, "epoch": 0.7384337892679994, "flos": 20663215418880.0, "grad_norm": 1.7003033872060014, "language_loss": 0.6080668, "learning_rate": 6.756506010719711e-07, "loss": 0.62948388, "num_input_tokens_seen": 264886915, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 12282, "time_per_iteration": 2.472426652908325 }, { "auxiliary_loss_clip": 0.01107405, "auxiliary_loss_mlp": 0.01036452, "balance_loss_clip": 1.02255249, "balance_loss_mlp": 1.03583634, "epoch": 0.7384939125206673, "flos": 29169627390720.0, "grad_norm": 1.7676706978540009, "language_loss": 0.67751753, "learning_rate": 6.753587832687632e-07, "loss": 0.69895601, "num_input_tokens_seen": 264910350, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 12283, "time_per_iteration": 2.532217502593994 }, { "auxiliary_loss_clip": 0.01107627, "auxiliary_loss_mlp": 0.01037316, "balance_loss_clip": 1.02446544, "balance_loss_mlp": 1.03796446, "epoch": 0.7385540357733353, "flos": 36312833376000.0, "grad_norm": 1.7145697901635841, "language_loss": 0.76251733, "learning_rate": 6.750670156960832e-07, "loss": 0.78396678, "num_input_tokens_seen": 264930705, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 12284, "time_per_iteration": 2.584520101547241 }, { "auxiliary_loss_clip": 0.01106727, "auxiliary_loss_mlp": 0.01035917, "balance_loss_clip": 1.022017, "balance_loss_mlp": 1.03520358, "epoch": 0.7386141590260034, "flos": 20302600826880.0, "grad_norm": 1.746905801968447, "language_loss": 0.68942934, "learning_rate": 6.747752983649954e-07, "loss": 0.71085572, "num_input_tokens_seen": 264946975, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71484375, "step": 12285, "time_per_iteration": 2.4683279991149902 }, { "auxiliary_loss_clip": 0.01110746, "auxiliary_loss_mlp": 0.0103511, "balance_loss_clip": 1.02152586, "balance_loss_mlp": 1.03626728, "epoch": 0.7386742822786713, "flos": 25483792170240.0, "grad_norm": 2.0059152787672287, "language_loss": 0.79694462, "learning_rate": 6.744836312865602e-07, "loss": 0.81840312, "num_input_tokens_seen": 264967665, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 12286, "time_per_iteration": 2.5038764476776123 }, { "auxiliary_loss_clip": 0.01103991, "auxiliary_loss_mlp": 0.01030271, "balance_loss_clip": 1.01730084, "balance_loss_mlp": 1.03503764, "epoch": 0.7387344055313393, "flos": 13771958405760.0, "grad_norm": 1.974571678084382, "language_loss": 0.65940583, "learning_rate": 6.741920144718396e-07, "loss": 0.68074846, "num_input_tokens_seen": 264985480, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 12287, "time_per_iteration": 2.471245288848877 }, { "auxiliary_loss_clip": 0.01102085, "auxiliary_loss_mlp": 0.01032006, "balance_loss_clip": 1.02001309, "balance_loss_mlp": 1.03485918, "epoch": 0.7387945287840072, "flos": 27855189095040.0, "grad_norm": 1.941093738574985, "language_loss": 0.76543343, "learning_rate": 6.739004479318903e-07, "loss": 0.78677434, "num_input_tokens_seen": 265004790, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.671875, "step": 12288, "time_per_iteration": 2.506587266921997 }, { "auxiliary_loss_clip": 0.0111046, "auxiliary_loss_mlp": 0.01034382, "balance_loss_clip": 1.02081001, "balance_loss_mlp": 1.03747773, "epoch": 0.7388546520366752, "flos": 44233039388160.0, "grad_norm": 1.7742154295501549, "language_loss": 0.5832026, "learning_rate": 6.736089316777684e-07, "loss": 0.60465097, "num_input_tokens_seen": 265028790, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 12289, "time_per_iteration": 2.6531453132629395 }, { "auxiliary_loss_clip": 0.01030401, "auxiliary_loss_mlp": 0.01000981, "balance_loss_clip": 0.99987841, "balance_loss_mlp": 1.00789285, "epoch": 0.7389147752893431, "flos": 70680890638080.0, "grad_norm": 0.6428878032432056, "language_loss": 0.49229625, "learning_rate": 6.733174657205287e-07, "loss": 0.51261008, "num_input_tokens_seen": 265096660, "router_z_loss_clip": 0.01104736, "router_z_loss_mlp": 0.22460938, "step": 12290, "time_per_iteration": 3.1966311931610107 }, { "auxiliary_loss_clip": 0.01108444, "auxiliary_loss_mlp": 0.01032169, "balance_loss_clip": 1.01828682, "balance_loss_mlp": 1.03677499, "epoch": 0.7389748985420111, "flos": 25994980575360.0, "grad_norm": 1.8254870008578432, "language_loss": 0.67482191, "learning_rate": 6.730260500712237e-07, "loss": 0.69622803, "num_input_tokens_seen": 265116375, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 12291, "time_per_iteration": 2.5113906860351562 }, { "auxiliary_loss_clip": 0.01029897, "auxiliary_loss_mlp": 0.00998852, "balance_loss_clip": 0.99771959, "balance_loss_mlp": 1.00744891, "epoch": 0.7390350217946791, "flos": 54403661318400.0, "grad_norm": 1.0077499758726016, "language_loss": 0.6085639, "learning_rate": 6.727346847409052e-07, "loss": 0.62885141, "num_input_tokens_seen": 265161230, "router_z_loss_clip": 0.01135254, "router_z_loss_mlp": 0.22460938, "step": 12292, "time_per_iteration": 2.743579626083374 }, { "auxiliary_loss_clip": 0.01107076, "auxiliary_loss_mlp": 0.01033766, "balance_loss_clip": 1.02166629, "balance_loss_mlp": 1.03800166, "epoch": 0.7390951450473471, "flos": 32196968530560.0, "grad_norm": 1.7355128404140963, "language_loss": 0.6725235, "learning_rate": 6.724433697406191e-07, "loss": 0.69393194, "num_input_tokens_seen": 265182515, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 12293, "time_per_iteration": 2.5486252307891846 }, { "auxiliary_loss_clip": 0.01106413, "auxiliary_loss_mlp": 0.01031803, "balance_loss_clip": 1.0185647, "balance_loss_mlp": 1.03630221, "epoch": 0.739155268300015, "flos": 16684241304960.0, "grad_norm": 1.8732068955877574, "language_loss": 0.83518469, "learning_rate": 6.721521050814134e-07, "loss": 0.85656685, "num_input_tokens_seen": 265198160, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69921875, "step": 12294, "time_per_iteration": 2.47796630859375 }, { "auxiliary_loss_clip": 0.01105215, "auxiliary_loss_mlp": 0.01036026, "balance_loss_clip": 1.0228833, "balance_loss_mlp": 1.03661096, "epoch": 0.739215391552683, "flos": 31649761762560.0, "grad_norm": 1.5898059366584771, "language_loss": 0.73273408, "learning_rate": 6.718608907743337e-07, "loss": 0.75414652, "num_input_tokens_seen": 265218480, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 12295, "time_per_iteration": 5.4965574741363525 }, { "auxiliary_loss_clip": 0.0110411, "auxiliary_loss_mlp": 0.01035717, "balance_loss_clip": 1.0234201, "balance_loss_mlp": 1.0369575, "epoch": 0.7392755148053509, "flos": 29718522097920.0, "grad_norm": 1.7983294655783735, "language_loss": 0.78900874, "learning_rate": 6.715697268304215e-07, "loss": 0.81040698, "num_input_tokens_seen": 265240165, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 12296, "time_per_iteration": 2.54166841506958 }, { "auxiliary_loss_clip": 0.01106063, "auxiliary_loss_mlp": 0.01029162, "balance_loss_clip": 1.01544678, "balance_loss_mlp": 1.03618467, "epoch": 0.7393356380580189, "flos": 37050475075200.0, "grad_norm": 1.9581524563693378, "language_loss": 0.67086935, "learning_rate": 6.712786132607182e-07, "loss": 0.69222158, "num_input_tokens_seen": 265263295, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.69921875, "step": 12297, "time_per_iteration": 2.6115171909332275 }, { "auxiliary_loss_clip": 0.01107356, "auxiliary_loss_mlp": 0.01038125, "balance_loss_clip": 1.02454758, "balance_loss_mlp": 1.03673184, "epoch": 0.739395761310687, "flos": 19719627091200.0, "grad_norm": 4.47742914156208, "language_loss": 0.68997985, "learning_rate": 6.709875500762645e-07, "loss": 0.7114346, "num_input_tokens_seen": 265282740, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.70703125, "step": 12298, "time_per_iteration": 2.4955406188964844 }, { "auxiliary_loss_clip": 0.01105388, "auxiliary_loss_mlp": 0.01032113, "balance_loss_clip": 1.01919067, "balance_loss_mlp": 1.03537464, "epoch": 0.7394558845633549, "flos": 11801504067840.0, "grad_norm": 3.1332886783142295, "language_loss": 0.74533397, "learning_rate": 6.706965372880946e-07, "loss": 0.76670897, "num_input_tokens_seen": 265300175, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 12299, "time_per_iteration": 3.8103339672088623 }, { "auxiliary_loss_clip": 0.01029311, "auxiliary_loss_mlp": 0.00999373, "balance_loss_clip": 0.99823469, "balance_loss_mlp": 1.00661182, "epoch": 0.7395160078160229, "flos": 66195827850240.0, "grad_norm": 0.7245319927279943, "language_loss": 0.60860074, "learning_rate": 6.704055749072455e-07, "loss": 0.62888765, "num_input_tokens_seen": 265363275, "router_z_loss_clip": 0.01141357, "router_z_loss_mlp": 0.2265625, "step": 12300, "time_per_iteration": 4.591663360595703 }, { "auxiliary_loss_clip": 0.01105966, "auxiliary_loss_mlp": 0.01029446, "balance_loss_clip": 1.0162133, "balance_loss_mlp": 1.03710341, "epoch": 0.7395761310686908, "flos": 21249708687360.0, "grad_norm": 1.8225200537806945, "language_loss": 0.80247295, "learning_rate": 6.7011466294475e-07, "loss": 0.82382715, "num_input_tokens_seen": 265382935, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6875, "step": 12301, "time_per_iteration": 2.449774742126465 }, { "auxiliary_loss_clip": 0.01103778, "auxiliary_loss_mlp": 0.01029454, "balance_loss_clip": 1.0172466, "balance_loss_mlp": 1.03486133, "epoch": 0.7396362543213588, "flos": 25955299025280.0, "grad_norm": 1.5845971787789332, "language_loss": 0.7327075, "learning_rate": 6.698238014116406e-07, "loss": 0.75403976, "num_input_tokens_seen": 265403245, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 12302, "time_per_iteration": 2.5317776203155518 }, { "auxiliary_loss_clip": 0.01107056, "auxiliary_loss_mlp": 0.01037382, "balance_loss_clip": 1.02451372, "balance_loss_mlp": 1.03544569, "epoch": 0.7396963775740267, "flos": 27377936064000.0, "grad_norm": 3.6046073059412835, "language_loss": 0.73977637, "learning_rate": 6.695329903189451e-07, "loss": 0.76122081, "num_input_tokens_seen": 265423105, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 12303, "time_per_iteration": 2.497742176055908 }, { "auxiliary_loss_clip": 0.01102854, "auxiliary_loss_mlp": 0.01028136, "balance_loss_clip": 1.01575053, "balance_loss_mlp": 1.03438807, "epoch": 0.7397565008266948, "flos": 25520133755520.0, "grad_norm": 1.9349607088449075, "language_loss": 0.53890777, "learning_rate": 6.692422296776927e-07, "loss": 0.56021762, "num_input_tokens_seen": 265443445, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 12304, "time_per_iteration": 2.5575954914093018 }, { "auxiliary_loss_clip": 0.01108008, "auxiliary_loss_mlp": 0.0103303, "balance_loss_clip": 1.01982117, "balance_loss_mlp": 1.0376333, "epoch": 0.7398166240793627, "flos": 23727760070400.0, "grad_norm": 2.230068999197747, "language_loss": 0.84193468, "learning_rate": 6.689515194989084e-07, "loss": 0.86334515, "num_input_tokens_seen": 265462085, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 12305, "time_per_iteration": 2.5020058155059814 }, { "auxiliary_loss_clip": 0.01029248, "auxiliary_loss_mlp": 0.01001228, "balance_loss_clip": 1.00018489, "balance_loss_mlp": 1.00651407, "epoch": 0.7398767473320307, "flos": 67267582882560.0, "grad_norm": 0.9193937624111196, "language_loss": 0.57765782, "learning_rate": 6.68660859793615e-07, "loss": 0.5979625, "num_input_tokens_seen": 265521190, "router_z_loss_clip": 0.01043701, "router_z_loss_mlp": 0.22753906, "step": 12306, "time_per_iteration": 3.096468210220337 }, { "auxiliary_loss_clip": 0.01110054, "auxiliary_loss_mlp": 0.01029214, "balance_loss_clip": 1.01611257, "balance_loss_mlp": 1.03862882, "epoch": 0.7399368705846986, "flos": 22018699981440.0, "grad_norm": 2.010140239908887, "language_loss": 0.82039392, "learning_rate": 6.683702505728355e-07, "loss": 0.84178662, "num_input_tokens_seen": 265539705, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 12307, "time_per_iteration": 2.447561264038086 }, { "auxiliary_loss_clip": 0.01104185, "auxiliary_loss_mlp": 0.01029579, "balance_loss_clip": 1.01755691, "balance_loss_mlp": 1.03744698, "epoch": 0.7399969938373666, "flos": 14173870659840.0, "grad_norm": 2.029767279929055, "language_loss": 0.69842863, "learning_rate": 6.680796918475893e-07, "loss": 0.71976626, "num_input_tokens_seen": 265555855, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.66796875, "step": 12308, "time_per_iteration": 2.413346529006958 }, { "auxiliary_loss_clip": 0.01101332, "auxiliary_loss_mlp": 0.01029117, "balance_loss_clip": 1.01672506, "balance_loss_mlp": 1.03401661, "epoch": 0.7400571170900345, "flos": 25301473712640.0, "grad_norm": 1.7931304075726389, "language_loss": 0.8145256, "learning_rate": 6.67789183628896e-07, "loss": 0.83583009, "num_input_tokens_seen": 265575455, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.671875, "step": 12309, "time_per_iteration": 2.4712324142456055 }, { "auxiliary_loss_clip": 0.0110722, "auxiliary_loss_mlp": 0.01033495, "balance_loss_clip": 1.0195837, "balance_loss_mlp": 1.03595543, "epoch": 0.7401172403427025, "flos": 22711344917760.0, "grad_norm": 1.9937740754048614, "language_loss": 0.72895449, "learning_rate": 6.674987259277692e-07, "loss": 0.75036168, "num_input_tokens_seen": 265595250, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7109375, "step": 12310, "time_per_iteration": 2.4682626724243164 }, { "auxiliary_loss_clip": 0.01108571, "auxiliary_loss_mlp": 0.01039425, "balance_loss_clip": 1.02587068, "balance_loss_mlp": 1.03814614, "epoch": 0.7401773635953706, "flos": 18067448188800.0, "grad_norm": 2.4870857147854446, "language_loss": 0.88135278, "learning_rate": 6.672083187552239e-07, "loss": 0.90283275, "num_input_tokens_seen": 265606945, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.703125, "step": 12311, "time_per_iteration": 2.4309263229370117 }, { "auxiliary_loss_clip": 0.01103628, "auxiliary_loss_mlp": 0.01030868, "balance_loss_clip": 1.01859534, "balance_loss_mlp": 1.03434575, "epoch": 0.7402374868480385, "flos": 22712135016960.0, "grad_norm": 1.5353358651376643, "language_loss": 0.80359423, "learning_rate": 6.669179621222738e-07, "loss": 0.82493919, "num_input_tokens_seen": 265626115, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 12312, "time_per_iteration": 2.5071232318878174 }, { "auxiliary_loss_clip": 0.01106174, "auxiliary_loss_mlp": 0.0103215, "balance_loss_clip": 1.01960874, "balance_loss_mlp": 1.03751588, "epoch": 0.7402976101007065, "flos": 22856675345280.0, "grad_norm": 1.971664367260007, "language_loss": 0.78473926, "learning_rate": 6.666276560399273e-07, "loss": 0.80612248, "num_input_tokens_seen": 265646520, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 12313, "time_per_iteration": 2.460702896118164 }, { "auxiliary_loss_clip": 0.01106697, "auxiliary_loss_mlp": 0.01038681, "balance_loss_clip": 1.02505565, "balance_loss_mlp": 1.03470862, "epoch": 0.7403577333533744, "flos": 12345801834240.0, "grad_norm": 1.9632073483322356, "language_loss": 0.78462088, "learning_rate": 6.663374005191937e-07, "loss": 0.80607468, "num_input_tokens_seen": 265661875, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 12314, "time_per_iteration": 2.426163673400879 }, { "auxiliary_loss_clip": 0.01029128, "auxiliary_loss_mlp": 0.01000585, "balance_loss_clip": 0.99947661, "balance_loss_mlp": 1.00643778, "epoch": 0.7404178566060424, "flos": 60327270869760.0, "grad_norm": 0.8114947210188982, "language_loss": 0.55198199, "learning_rate": 6.660471955710809e-07, "loss": 0.5722791, "num_input_tokens_seen": 265721255, "router_z_loss_clip": 0.0111084, "router_z_loss_mlp": 0.2265625, "step": 12315, "time_per_iteration": 3.056976079940796 }, { "auxiliary_loss_clip": 0.01102407, "auxiliary_loss_mlp": 0.01032984, "balance_loss_clip": 1.02053833, "balance_loss_mlp": 1.03603053, "epoch": 0.7404779798587103, "flos": 32014650072960.0, "grad_norm": 1.490941861920794, "language_loss": 0.79990506, "learning_rate": 6.65757041206591e-07, "loss": 0.82125896, "num_input_tokens_seen": 265743970, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6640625, "step": 12316, "time_per_iteration": 2.599470376968384 }, { "auxiliary_loss_clip": 0.01104135, "auxiliary_loss_mlp": 0.0102916, "balance_loss_clip": 1.01654768, "balance_loss_mlp": 1.03451848, "epoch": 0.7405381031113784, "flos": 12889704551040.0, "grad_norm": 3.521481081093948, "language_loss": 0.74993861, "learning_rate": 6.654669374367275e-07, "loss": 0.77127159, "num_input_tokens_seen": 265760890, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 12317, "time_per_iteration": 2.431532859802246 }, { "auxiliary_loss_clip": 0.01100709, "auxiliary_loss_mlp": 0.01033861, "balance_loss_clip": 1.02195799, "balance_loss_mlp": 1.03486133, "epoch": 0.7405982263640463, "flos": 20229127557120.0, "grad_norm": 1.7404395898636533, "language_loss": 0.81924677, "learning_rate": 6.651768842724917e-07, "loss": 0.84059244, "num_input_tokens_seen": 265779600, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.65625, "step": 12318, "time_per_iteration": 2.456043004989624 }, { "auxiliary_loss_clip": 0.01106722, "auxiliary_loss_mlp": 0.01032233, "balance_loss_clip": 1.01969814, "balance_loss_mlp": 1.03576946, "epoch": 0.7406583496167143, "flos": 17567213431680.0, "grad_norm": 2.045671916038815, "language_loss": 0.76767504, "learning_rate": 6.648868817248827e-07, "loss": 0.78906459, "num_input_tokens_seen": 265797030, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 12319, "time_per_iteration": 2.417726993560791 }, { "auxiliary_loss_clip": 0.01104326, "auxiliary_loss_mlp": 0.01031396, "balance_loss_clip": 1.01960635, "balance_loss_mlp": 1.03601956, "epoch": 0.7407184728693822, "flos": 18295733076480.0, "grad_norm": 2.1398675753460585, "language_loss": 0.6397168, "learning_rate": 6.64596929804897e-07, "loss": 0.66107398, "num_input_tokens_seen": 265815055, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.68359375, "step": 12320, "time_per_iteration": 2.4358768463134766 }, { "auxiliary_loss_clip": 0.01108725, "auxiliary_loss_mlp": 0.01035815, "balance_loss_clip": 1.02313089, "balance_loss_mlp": 1.03709221, "epoch": 0.7407785961220502, "flos": 16690562098560.0, "grad_norm": 2.782346259428114, "language_loss": 0.82564235, "learning_rate": 6.643070285235288e-07, "loss": 0.84708774, "num_input_tokens_seen": 265828480, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 12321, "time_per_iteration": 2.4089014530181885 }, { "auxiliary_loss_clip": 0.01110201, "auxiliary_loss_mlp": 0.01043394, "balance_loss_clip": 1.02876687, "balance_loss_mlp": 1.03650355, "epoch": 0.7408387193747181, "flos": 22088330496000.0, "grad_norm": 1.9836634389814345, "language_loss": 0.72103703, "learning_rate": 6.640171778917727e-07, "loss": 0.74257296, "num_input_tokens_seen": 265845825, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.73828125, "step": 12322, "time_per_iteration": 2.46199893951416 }, { "auxiliary_loss_clip": 0.01105808, "auxiliary_loss_mlp": 0.01030181, "balance_loss_clip": 1.01753902, "balance_loss_mlp": 1.03644133, "epoch": 0.7408988426273861, "flos": 24236721832320.0, "grad_norm": 2.2480747305274047, "language_loss": 0.64047039, "learning_rate": 6.637273779206183e-07, "loss": 0.66183031, "num_input_tokens_seen": 265866335, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 12323, "time_per_iteration": 2.4701621532440186 }, { "auxiliary_loss_clip": 0.01106336, "auxiliary_loss_mlp": 0.01031202, "balance_loss_clip": 1.01807117, "balance_loss_mlp": 1.03527737, "epoch": 0.7409589658800542, "flos": 29023004073600.0, "grad_norm": 1.591536293192134, "language_loss": 0.76073229, "learning_rate": 6.634376286210559e-07, "loss": 0.78210759, "num_input_tokens_seen": 265888945, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 12324, "time_per_iteration": 2.537628412246704 }, { "auxiliary_loss_clip": 0.01104279, "auxiliary_loss_mlp": 0.01023901, "balance_loss_clip": 1.01155066, "balance_loss_mlp": 1.03428042, "epoch": 0.7410190891327221, "flos": 19351362902400.0, "grad_norm": 1.8246679679193687, "language_loss": 0.75038368, "learning_rate": 6.63147930004073e-07, "loss": 0.77166545, "num_input_tokens_seen": 265908030, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69921875, "step": 12325, "time_per_iteration": 2.428616523742676 }, { "auxiliary_loss_clip": 0.01109674, "auxiliary_loss_mlp": 0.01035275, "balance_loss_clip": 1.02197099, "balance_loss_mlp": 1.03610265, "epoch": 0.7410792123853901, "flos": 22747650589440.0, "grad_norm": 1.7574087564030145, "language_loss": 0.68565118, "learning_rate": 6.628582820806545e-07, "loss": 0.70710063, "num_input_tokens_seen": 265927030, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 12326, "time_per_iteration": 2.464796543121338 }, { "auxiliary_loss_clip": 0.0110625, "auxiliary_loss_mlp": 0.01027651, "balance_loss_clip": 1.01516962, "balance_loss_mlp": 1.03595614, "epoch": 0.741139335638058, "flos": 25372433030400.0, "grad_norm": 2.296137748373444, "language_loss": 0.89263356, "learning_rate": 6.625686848617835e-07, "loss": 0.91397262, "num_input_tokens_seen": 265945490, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 12327, "time_per_iteration": 2.477151870727539 }, { "auxiliary_loss_clip": 0.01106351, "auxiliary_loss_mlp": 0.01031124, "balance_loss_clip": 1.01846433, "balance_loss_mlp": 1.03693366, "epoch": 0.741199458890726, "flos": 18585639745920.0, "grad_norm": 1.8251957445067788, "language_loss": 0.85724646, "learning_rate": 6.62279138358442e-07, "loss": 0.87862122, "num_input_tokens_seen": 265963265, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 12328, "time_per_iteration": 2.4410560131073 }, { "auxiliary_loss_clip": 0.01104453, "auxiliary_loss_mlp": 0.01031441, "balance_loss_clip": 1.01800036, "balance_loss_mlp": 1.03591084, "epoch": 0.7412595821433939, "flos": 22127078292480.0, "grad_norm": 1.8322713030961342, "language_loss": 0.66975141, "learning_rate": 6.619896425816103e-07, "loss": 0.69111031, "num_input_tokens_seen": 265982270, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.68359375, "step": 12329, "time_per_iteration": 2.4522082805633545 }, { "auxiliary_loss_clip": 0.01112089, "auxiliary_loss_mlp": 0.0103147, "balance_loss_clip": 1.01847029, "balance_loss_mlp": 1.03798354, "epoch": 0.741319705396062, "flos": 29169699217920.0, "grad_norm": 1.7253495835388846, "language_loss": 0.66696477, "learning_rate": 6.617001975422647e-07, "loss": 0.68840039, "num_input_tokens_seen": 266003835, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7421875, "step": 12330, "time_per_iteration": 2.5360100269317627 }, { "auxiliary_loss_clip": 0.01112608, "auxiliary_loss_mlp": 0.01033529, "balance_loss_clip": 1.01813936, "balance_loss_mlp": 1.03819346, "epoch": 0.7413798286487299, "flos": 20667489137280.0, "grad_norm": 2.1950448411402546, "language_loss": 0.85961491, "learning_rate": 6.614108032513823e-07, "loss": 0.88107628, "num_input_tokens_seen": 266021595, "router_z_loss_clip": 0.15429688, "router_z_loss_mlp": 0.74609375, "step": 12331, "time_per_iteration": 2.4503421783447266 }, { "auxiliary_loss_clip": 0.01107122, "auxiliary_loss_mlp": 0.01028033, "balance_loss_clip": 1.01499116, "balance_loss_mlp": 1.03530955, "epoch": 0.7414399519013979, "flos": 16398895662720.0, "grad_norm": 1.9852519994513955, "language_loss": 0.69657159, "learning_rate": 6.611214597199364e-07, "loss": 0.71792316, "num_input_tokens_seen": 266039860, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 12332, "time_per_iteration": 2.4611737728118896 }, { "auxiliary_loss_clip": 0.01107506, "auxiliary_loss_mlp": 0.01036317, "balance_loss_clip": 1.02274513, "balance_loss_mlp": 1.03735912, "epoch": 0.7415000751540658, "flos": 25630235919360.0, "grad_norm": 2.1996760599649283, "language_loss": 0.63582706, "learning_rate": 6.608321669588984e-07, "loss": 0.65726531, "num_input_tokens_seen": 266058050, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.703125, "step": 12333, "time_per_iteration": 2.492173433303833 }, { "auxiliary_loss_clip": 0.01103235, "auxiliary_loss_mlp": 0.01033197, "balance_loss_clip": 1.02082324, "balance_loss_mlp": 1.03731537, "epoch": 0.7415601984067338, "flos": 24499732193280.0, "grad_norm": 1.7889814963508024, "language_loss": 0.71285105, "learning_rate": 6.605429249792387e-07, "loss": 0.73421538, "num_input_tokens_seen": 266078060, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.65625, "step": 12334, "time_per_iteration": 2.5061686038970947 }, { "auxiliary_loss_clip": 0.01104082, "auxiliary_loss_mlp": 0.01025757, "balance_loss_clip": 1.01344895, "balance_loss_mlp": 1.03545928, "epoch": 0.7416203216594017, "flos": 20887154760960.0, "grad_norm": 1.7574421261516866, "language_loss": 0.82118535, "learning_rate": 6.602537337919257e-07, "loss": 0.84248376, "num_input_tokens_seen": 266097110, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 12335, "time_per_iteration": 2.4784443378448486 }, { "auxiliary_loss_clip": 0.01106356, "auxiliary_loss_mlp": 0.0103064, "balance_loss_clip": 1.01686549, "balance_loss_mlp": 1.0360837, "epoch": 0.7416804449120697, "flos": 15624265933440.0, "grad_norm": 2.379909012655281, "language_loss": 0.74889314, "learning_rate": 6.599645934079259e-07, "loss": 0.77026314, "num_input_tokens_seen": 266110870, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.703125, "step": 12336, "time_per_iteration": 2.4056317806243896 }, { "auxiliary_loss_clip": 0.01110006, "auxiliary_loss_mlp": 0.01033819, "balance_loss_clip": 1.02086127, "balance_loss_mlp": 1.03796959, "epoch": 0.7417405681647377, "flos": 17120483982720.0, "grad_norm": 2.185670313771936, "language_loss": 0.73437119, "learning_rate": 6.596755038382029e-07, "loss": 0.75580943, "num_input_tokens_seen": 266127845, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 12337, "time_per_iteration": 5.316157102584839 }, { "auxiliary_loss_clip": 0.01104775, "auxiliary_loss_mlp": 0.01033811, "balance_loss_clip": 1.02186036, "balance_loss_mlp": 1.03785157, "epoch": 0.7418006914174057, "flos": 18880322924160.0, "grad_norm": 1.8712796918318473, "language_loss": 0.76937294, "learning_rate": 6.593864650937186e-07, "loss": 0.79075885, "num_input_tokens_seen": 266145400, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.66796875, "step": 12338, "time_per_iteration": 2.4305408000946045 }, { "auxiliary_loss_clip": 0.01102844, "auxiliary_loss_mlp": 0.01025528, "balance_loss_clip": 1.01436949, "balance_loss_mlp": 1.0350585, "epoch": 0.7418608146700737, "flos": 21580733450880.0, "grad_norm": 2.242059783297617, "language_loss": 0.73051357, "learning_rate": 6.590974771854345e-07, "loss": 0.75179726, "num_input_tokens_seen": 266164430, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6796875, "step": 12339, "time_per_iteration": 2.4667303562164307 }, { "auxiliary_loss_clip": 0.01105942, "auxiliary_loss_mlp": 0.01028145, "balance_loss_clip": 1.015342, "balance_loss_mlp": 1.0365802, "epoch": 0.7419209379227416, "flos": 22340459036160.0, "grad_norm": 1.7113148970638092, "language_loss": 0.79822409, "learning_rate": 6.588085401243077e-07, "loss": 0.81956494, "num_input_tokens_seen": 266183855, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 12340, "time_per_iteration": 3.850764274597168 }, { "auxiliary_loss_clip": 0.01104905, "auxiliary_loss_mlp": 0.01036482, "balance_loss_clip": 1.02345276, "balance_loss_mlp": 1.03479946, "epoch": 0.7419810611754096, "flos": 16762275601920.0, "grad_norm": 1.4815353850150708, "language_loss": 0.75600839, "learning_rate": 6.585196539212958e-07, "loss": 0.77742225, "num_input_tokens_seen": 266202085, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 12341, "time_per_iteration": 2.4387030601501465 }, { "auxiliary_loss_clip": 0.0109921, "auxiliary_loss_mlp": 0.01032858, "balance_loss_clip": 1.02050161, "balance_loss_mlp": 1.0354712, "epoch": 0.7420411844280775, "flos": 26212958259840.0, "grad_norm": 1.4741478161558672, "language_loss": 0.80000699, "learning_rate": 6.582308185873535e-07, "loss": 0.82132769, "num_input_tokens_seen": 266223445, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.63671875, "step": 12342, "time_per_iteration": 3.944929361343384 }, { "auxiliary_loss_clip": 0.01105463, "auxiliary_loss_mlp": 0.01028685, "balance_loss_clip": 1.01657367, "balance_loss_mlp": 1.03686476, "epoch": 0.7421013076807456, "flos": 68529371840640.0, "grad_norm": 1.6630861998475652, "language_loss": 0.77631611, "learning_rate": 6.57942034133433e-07, "loss": 0.79765761, "num_input_tokens_seen": 266246575, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 12343, "time_per_iteration": 2.8339767456054688 }, { "auxiliary_loss_clip": 0.01102721, "auxiliary_loss_mlp": 0.01030238, "balance_loss_clip": 1.01739955, "balance_loss_mlp": 1.03357697, "epoch": 0.7421614309334135, "flos": 24425325169920.0, "grad_norm": 1.4821145106150038, "language_loss": 0.67820466, "learning_rate": 6.576533005704843e-07, "loss": 0.6995343, "num_input_tokens_seen": 266266055, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 12344, "time_per_iteration": 2.4959208965301514 }, { "auxiliary_loss_clip": 0.01106906, "auxiliary_loss_mlp": 0.01035655, "balance_loss_clip": 1.02130175, "balance_loss_mlp": 1.0362165, "epoch": 0.7422215541860815, "flos": 12311076360960.0, "grad_norm": 2.8642141625438255, "language_loss": 0.81668627, "learning_rate": 6.573646179094572e-07, "loss": 0.83811182, "num_input_tokens_seen": 266282240, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.70703125, "step": 12345, "time_per_iteration": 2.398080587387085 }, { "auxiliary_loss_clip": 0.01106502, "auxiliary_loss_mlp": 0.01033686, "balance_loss_clip": 1.020859, "balance_loss_mlp": 1.03652287, "epoch": 0.7422816774387494, "flos": 19645579203840.0, "grad_norm": 2.047414991990824, "language_loss": 0.70704526, "learning_rate": 6.570759861612988e-07, "loss": 0.7284472, "num_input_tokens_seen": 266300980, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 12346, "time_per_iteration": 2.445521354675293 }, { "auxiliary_loss_clip": 0.01105961, "auxiliary_loss_mlp": 0.010283, "balance_loss_clip": 1.01593804, "balance_loss_mlp": 1.03674221, "epoch": 0.7423418006914174, "flos": 32015978876160.0, "grad_norm": 1.7934783755336126, "language_loss": 0.73899233, "learning_rate": 6.56787405336953e-07, "loss": 0.76033497, "num_input_tokens_seen": 266322215, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 12347, "time_per_iteration": 2.5419161319732666 }, { "auxiliary_loss_clip": 0.01108331, "auxiliary_loss_mlp": 0.01033378, "balance_loss_clip": 1.02067649, "balance_loss_mlp": 1.03562975, "epoch": 0.7424019239440853, "flos": 18916951818240.0, "grad_norm": 1.9601487854983475, "language_loss": 0.81081128, "learning_rate": 6.564988754473642e-07, "loss": 0.83222836, "num_input_tokens_seen": 266341600, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 12348, "time_per_iteration": 2.4662842750549316 }, { "auxiliary_loss_clip": 0.01105127, "auxiliary_loss_mlp": 0.01028468, "balance_loss_clip": 1.01595664, "balance_loss_mlp": 1.0358727, "epoch": 0.7424620471967533, "flos": 35876518871040.0, "grad_norm": 1.7631375712276993, "language_loss": 0.72507763, "learning_rate": 6.562103965034724e-07, "loss": 0.74641359, "num_input_tokens_seen": 266362895, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 12349, "time_per_iteration": 2.577768087387085 }, { "auxiliary_loss_clip": 0.01108203, "auxiliary_loss_mlp": 0.0103424, "balance_loss_clip": 1.019822, "balance_loss_mlp": 1.03462029, "epoch": 0.7425221704494213, "flos": 27016603200000.0, "grad_norm": 1.8950623629917565, "language_loss": 0.78569055, "learning_rate": 6.559219685162165e-07, "loss": 0.80711496, "num_input_tokens_seen": 266384015, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.734375, "step": 12350, "time_per_iteration": 2.573215961456299 }, { "auxiliary_loss_clip": 0.01104289, "auxiliary_loss_mlp": 0.01030641, "balance_loss_clip": 1.01841569, "balance_loss_mlp": 1.03493667, "epoch": 0.7425822937020893, "flos": 34167135559680.0, "grad_norm": 1.91242903015783, "language_loss": 0.75357658, "learning_rate": 6.556335914965343e-07, "loss": 0.77492589, "num_input_tokens_seen": 266405990, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 12351, "time_per_iteration": 2.5609614849090576 }, { "auxiliary_loss_clip": 0.01103591, "auxiliary_loss_mlp": 0.01027382, "balance_loss_clip": 1.01496625, "balance_loss_mlp": 1.03495789, "epoch": 0.7426424169547573, "flos": 21283572234240.0, "grad_norm": 2.5298882177682627, "language_loss": 0.81206477, "learning_rate": 6.553452654553611e-07, "loss": 0.8333745, "num_input_tokens_seen": 266424260, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 12352, "time_per_iteration": 2.4717352390289307 }, { "auxiliary_loss_clip": 0.01105531, "auxiliary_loss_mlp": 0.01038306, "balance_loss_clip": 1.02628946, "balance_loss_mlp": 1.03640699, "epoch": 0.7427025402074252, "flos": 22448442297600.0, "grad_norm": 1.788723776833909, "language_loss": 0.71882463, "learning_rate": 6.550569904036307e-07, "loss": 0.74026293, "num_input_tokens_seen": 266444580, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 12353, "time_per_iteration": 2.5221848487854004 }, { "auxiliary_loss_clip": 0.01105546, "auxiliary_loss_mlp": 0.0103382, "balance_loss_clip": 1.02231014, "balance_loss_mlp": 1.03674412, "epoch": 0.7427626634600932, "flos": 22524609087360.0, "grad_norm": 1.739324487833724, "language_loss": 0.72256863, "learning_rate": 6.547687663522739e-07, "loss": 0.74396223, "num_input_tokens_seen": 266465640, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 12354, "time_per_iteration": 2.5016064643859863 }, { "auxiliary_loss_clip": 0.01029843, "auxiliary_loss_mlp": 0.01003495, "balance_loss_clip": 1.00242257, "balance_loss_mlp": 1.00731134, "epoch": 0.7428227867127611, "flos": 67209477655680.0, "grad_norm": 0.6962843676189617, "language_loss": 0.59602821, "learning_rate": 6.544805933122199e-07, "loss": 0.61636162, "num_input_tokens_seen": 266531950, "router_z_loss_clip": 0.01074219, "router_z_loss_mlp": 0.22558594, "step": 12355, "time_per_iteration": 3.2250406742095947 }, { "auxiliary_loss_clip": 0.0110656, "auxiliary_loss_mlp": 0.01031253, "balance_loss_clip": 1.0184257, "balance_loss_mlp": 1.03619397, "epoch": 0.7428829099654292, "flos": 14721221082240.0, "grad_norm": 1.7125699816278925, "language_loss": 0.67508638, "learning_rate": 6.541924712943971e-07, "loss": 0.69646454, "num_input_tokens_seen": 266550665, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 12356, "time_per_iteration": 2.4525797367095947 }, { "auxiliary_loss_clip": 0.01106179, "auxiliary_loss_mlp": 0.01037437, "balance_loss_clip": 1.02480721, "balance_loss_mlp": 1.03403163, "epoch": 0.7429430332180971, "flos": 48646496413440.0, "grad_norm": 1.8188140859805735, "language_loss": 0.71696043, "learning_rate": 6.539044003097301e-07, "loss": 0.73839658, "num_input_tokens_seen": 266572455, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 12357, "time_per_iteration": 2.7221617698669434 }, { "auxiliary_loss_clip": 0.01103125, "auxiliary_loss_mlp": 0.0102992, "balance_loss_clip": 1.01848197, "balance_loss_mlp": 1.03723979, "epoch": 0.7430031564707651, "flos": 16764071281920.0, "grad_norm": 1.726374511756878, "language_loss": 0.6545977, "learning_rate": 6.53616380369143e-07, "loss": 0.67592812, "num_input_tokens_seen": 266590895, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66015625, "step": 12358, "time_per_iteration": 2.4667811393737793 }, { "auxiliary_loss_clip": 0.01107953, "auxiliary_loss_mlp": 0.01035207, "balance_loss_clip": 1.02061605, "balance_loss_mlp": 1.0367285, "epoch": 0.743063279723433, "flos": 23870576545920.0, "grad_norm": 1.981953315079153, "language_loss": 0.81084472, "learning_rate": 6.533284114835591e-07, "loss": 0.83227628, "num_input_tokens_seen": 266607660, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.7109375, "step": 12359, "time_per_iteration": 2.4717705249786377 }, { "auxiliary_loss_clip": 0.0110411, "auxiliary_loss_mlp": 0.01029042, "balance_loss_clip": 1.01654315, "balance_loss_mlp": 1.03422689, "epoch": 0.743123402976101, "flos": 14391704689920.0, "grad_norm": 2.008779408934573, "language_loss": 0.67827737, "learning_rate": 6.530404936638956e-07, "loss": 0.69960892, "num_input_tokens_seen": 266624260, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 12360, "time_per_iteration": 2.44645357131958 }, { "auxiliary_loss_clip": 0.0110238, "auxiliary_loss_mlp": 0.01033829, "balance_loss_clip": 1.02052534, "balance_loss_mlp": 1.03281069, "epoch": 0.7431835262287689, "flos": 27454318335360.0, "grad_norm": 1.7868965913758184, "language_loss": 0.72304136, "learning_rate": 6.527526269210715e-07, "loss": 0.74440342, "num_input_tokens_seen": 266644210, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 12361, "time_per_iteration": 2.505629777908325 }, { "auxiliary_loss_clip": 0.01108304, "auxiliary_loss_mlp": 0.01032672, "balance_loss_clip": 1.01984549, "balance_loss_mlp": 1.0369947, "epoch": 0.743243649481437, "flos": 20959514709120.0, "grad_norm": 1.953608328610787, "language_loss": 0.56205273, "learning_rate": 6.524648112660027e-07, "loss": 0.58346248, "num_input_tokens_seen": 266664230, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 12362, "time_per_iteration": 2.466695785522461 }, { "auxiliary_loss_clip": 0.01107847, "auxiliary_loss_mlp": 0.01031919, "balance_loss_clip": 1.01929498, "balance_loss_mlp": 1.03810382, "epoch": 0.7433037727341049, "flos": 22783166161920.0, "grad_norm": 1.801283175066252, "language_loss": 0.77633107, "learning_rate": 6.521770467096039e-07, "loss": 0.79772878, "num_input_tokens_seen": 266683270, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 12363, "time_per_iteration": 2.4570422172546387 }, { "auxiliary_loss_clip": 0.01105272, "auxiliary_loss_mlp": 0.01031243, "balance_loss_clip": 1.01922655, "balance_loss_mlp": 1.03607261, "epoch": 0.7433638959867729, "flos": 22196708807040.0, "grad_norm": 1.6551437097405357, "language_loss": 0.78210235, "learning_rate": 6.518893332627862e-07, "loss": 0.80346751, "num_input_tokens_seen": 266701235, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 12364, "time_per_iteration": 2.4814932346343994 }, { "auxiliary_loss_clip": 0.01104278, "auxiliary_loss_mlp": 0.01030318, "balance_loss_clip": 1.01783729, "balance_loss_mlp": 1.03437054, "epoch": 0.7434240192394409, "flos": 23296760778240.0, "grad_norm": 1.7752325667734137, "language_loss": 0.78595597, "learning_rate": 6.516016709364604e-07, "loss": 0.80730194, "num_input_tokens_seen": 266721495, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 12365, "time_per_iteration": 2.484367609024048 }, { "auxiliary_loss_clip": 0.01108391, "auxiliary_loss_mlp": 0.0103472, "balance_loss_clip": 1.02212548, "balance_loss_mlp": 1.03571773, "epoch": 0.7434841424921088, "flos": 54009575251200.0, "grad_norm": 1.6231662979272319, "language_loss": 0.77108175, "learning_rate": 6.513140597415346e-07, "loss": 0.79251289, "num_input_tokens_seen": 266747400, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 12366, "time_per_iteration": 2.7583823204040527 }, { "auxiliary_loss_clip": 0.01103586, "auxiliary_loss_mlp": 0.01027643, "balance_loss_clip": 1.01686656, "balance_loss_mlp": 1.03719115, "epoch": 0.7435442657447768, "flos": 21433966479360.0, "grad_norm": 2.5944312600017976, "language_loss": 0.71367931, "learning_rate": 6.510264996889141e-07, "loss": 0.73499161, "num_input_tokens_seen": 266767630, "router_z_loss_clip": 0.10791016, "router_z_loss_mlp": 0.6640625, "step": 12367, "time_per_iteration": 2.464901924133301 }, { "auxiliary_loss_clip": 0.01109733, "auxiliary_loss_mlp": 0.01035359, "balance_loss_clip": 1.02266908, "balance_loss_mlp": 1.03739262, "epoch": 0.7436043889974447, "flos": 24499408970880.0, "grad_norm": 1.9194588178340268, "language_loss": 0.74550641, "learning_rate": 6.507389907895038e-07, "loss": 0.7669574, "num_input_tokens_seen": 266788015, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 12368, "time_per_iteration": 2.5182199478149414 }, { "auxiliary_loss_clip": 0.01102859, "auxiliary_loss_mlp": 0.01035879, "balance_loss_clip": 1.02421427, "balance_loss_mlp": 1.03613615, "epoch": 0.7436645122501128, "flos": 40698388512000.0, "grad_norm": 1.6026465243981902, "language_loss": 0.6932689, "learning_rate": 6.50451533054207e-07, "loss": 0.71465623, "num_input_tokens_seen": 266809010, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.66796875, "step": 12369, "time_per_iteration": 2.6329379081726074 }, { "auxiliary_loss_clip": 0.01105274, "auxiliary_loss_mlp": 0.01029582, "balance_loss_clip": 1.01671898, "balance_loss_mlp": 1.0360477, "epoch": 0.7437246355027807, "flos": 18908835344640.0, "grad_norm": 2.0967128825206536, "language_loss": 0.75736076, "learning_rate": 6.501641264939233e-07, "loss": 0.77870929, "num_input_tokens_seen": 266825390, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 12370, "time_per_iteration": 2.4638121128082275 }, { "auxiliary_loss_clip": 0.01107212, "auxiliary_loss_mlp": 0.01032413, "balance_loss_clip": 1.01941311, "balance_loss_mlp": 1.03878784, "epoch": 0.7437847587554487, "flos": 21543817248000.0, "grad_norm": 11.120044588804245, "language_loss": 0.78506339, "learning_rate": 6.498767711195503e-07, "loss": 0.80645955, "num_input_tokens_seen": 266844675, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.68359375, "step": 12371, "time_per_iteration": 2.4561655521392822 }, { "auxiliary_loss_clip": 0.01105403, "auxiliary_loss_mlp": 0.01028487, "balance_loss_clip": 1.01536846, "balance_loss_mlp": 1.03558576, "epoch": 0.7438448820081166, "flos": 27782470010880.0, "grad_norm": 1.7228858480860951, "language_loss": 0.69390154, "learning_rate": 6.495894669419857e-07, "loss": 0.71524048, "num_input_tokens_seen": 266865160, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 12372, "time_per_iteration": 2.5361101627349854 }, { "auxiliary_loss_clip": 0.01104677, "auxiliary_loss_mlp": 0.01033183, "balance_loss_clip": 1.02061284, "balance_loss_mlp": 1.03598642, "epoch": 0.7439050052607846, "flos": 17967832796160.0, "grad_norm": 2.142783339624015, "language_loss": 0.7517103, "learning_rate": 6.493022139721245e-07, "loss": 0.77308893, "num_input_tokens_seen": 266883285, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 12373, "time_per_iteration": 2.464567184448242 }, { "auxiliary_loss_clip": 0.01107953, "auxiliary_loss_mlp": 0.01033294, "balance_loss_clip": 1.01937079, "balance_loss_mlp": 1.03583205, "epoch": 0.7439651285134525, "flos": 22958696949120.0, "grad_norm": 2.011845762119332, "language_loss": 0.77276671, "learning_rate": 6.49015012220858e-07, "loss": 0.7941792, "num_input_tokens_seen": 266900960, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71875, "step": 12374, "time_per_iteration": 2.529348134994507 }, { "auxiliary_loss_clip": 0.01106056, "auxiliary_loss_mlp": 0.01036495, "balance_loss_clip": 1.02357841, "balance_loss_mlp": 1.03594589, "epoch": 0.7440252517661206, "flos": 18806777827200.0, "grad_norm": 2.5351893527850526, "language_loss": 0.76763034, "learning_rate": 6.487278616990774e-07, "loss": 0.78905576, "num_input_tokens_seen": 266917710, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 12375, "time_per_iteration": 2.4562017917633057 }, { "auxiliary_loss_clip": 0.01103091, "auxiliary_loss_mlp": 0.01027319, "balance_loss_clip": 1.01579738, "balance_loss_mlp": 1.03550029, "epoch": 0.7440853750187885, "flos": 20266295155200.0, "grad_norm": 1.9136025452633945, "language_loss": 0.77377331, "learning_rate": 6.484407624176733e-07, "loss": 0.79507738, "num_input_tokens_seen": 266934220, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.67578125, "step": 12376, "time_per_iteration": 2.4686429500579834 }, { "auxiliary_loss_clip": 0.01107997, "auxiliary_loss_mlp": 0.01027727, "balance_loss_clip": 1.01469779, "balance_loss_mlp": 1.03619647, "epoch": 0.7441454982714565, "flos": 25337276593920.0, "grad_norm": 1.8231763430941916, "language_loss": 0.79757482, "learning_rate": 6.481537143875296e-07, "loss": 0.81893206, "num_input_tokens_seen": 266955210, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 12377, "time_per_iteration": 2.4900169372558594 }, { "auxiliary_loss_clip": 0.0110745, "auxiliary_loss_mlp": 0.0102871, "balance_loss_clip": 1.01556671, "balance_loss_mlp": 1.03619897, "epoch": 0.7442056215241245, "flos": 64480910866560.0, "grad_norm": 3.0891614905017444, "language_loss": 0.67041892, "learning_rate": 6.478667176195322e-07, "loss": 0.69178051, "num_input_tokens_seen": 266976555, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 12378, "time_per_iteration": 4.240155935287476 }, { "auxiliary_loss_clip": 0.01107745, "auxiliary_loss_mlp": 0.01038463, "balance_loss_clip": 1.02397299, "balance_loss_mlp": 1.03717208, "epoch": 0.7442657447767924, "flos": 31285376242560.0, "grad_norm": 1.8893298241727452, "language_loss": 0.71373057, "learning_rate": 6.475797721245648e-07, "loss": 0.73519266, "num_input_tokens_seen": 266997640, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.70703125, "step": 12379, "time_per_iteration": 4.004421949386597 }, { "auxiliary_loss_clip": 0.01105127, "auxiliary_loss_mlp": 0.0103699, "balance_loss_clip": 1.02391243, "balance_loss_mlp": 1.0353049, "epoch": 0.7443258680294604, "flos": 20807899401600.0, "grad_norm": 1.8767606271547597, "language_loss": 0.65339643, "learning_rate": 6.472928779135085e-07, "loss": 0.67481756, "num_input_tokens_seen": 267016165, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 12380, "time_per_iteration": 2.4602134227752686 }, { "auxiliary_loss_clip": 0.01106815, "auxiliary_loss_mlp": 0.01031059, "balance_loss_clip": 1.01837492, "balance_loss_mlp": 1.03627646, "epoch": 0.7443859912821283, "flos": 22199833290240.0, "grad_norm": 2.8644843665728916, "language_loss": 0.7838676, "learning_rate": 6.470060349972411e-07, "loss": 0.80524635, "num_input_tokens_seen": 267034075, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 12381, "time_per_iteration": 2.4705514907836914 }, { "auxiliary_loss_clip": 0.01110532, "auxiliary_loss_mlp": 0.01036306, "balance_loss_clip": 1.02262688, "balance_loss_mlp": 1.03840578, "epoch": 0.7444461145347964, "flos": 22017838055040.0, "grad_norm": 2.296604116183444, "language_loss": 0.7308569, "learning_rate": 6.467192433866411e-07, "loss": 0.7523253, "num_input_tokens_seen": 267053645, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 12382, "time_per_iteration": 3.9407670497894287 }, { "auxiliary_loss_clip": 0.01030589, "auxiliary_loss_mlp": 0.01002591, "balance_loss_clip": 1.00141048, "balance_loss_mlp": 1.0080651, "epoch": 0.7445062377874643, "flos": 70559047704960.0, "grad_norm": 0.6501796829013731, "language_loss": 0.54629439, "learning_rate": 6.464325030925831e-07, "loss": 0.56662619, "num_input_tokens_seen": 267121830, "router_z_loss_clip": 0.01177979, "router_z_loss_mlp": 0.22558594, "step": 12383, "time_per_iteration": 3.2434403896331787 }, { "auxiliary_loss_clip": 0.01104998, "auxiliary_loss_mlp": 0.01028453, "balance_loss_clip": 1.01589465, "balance_loss_mlp": 1.03482091, "epoch": 0.7445663610401323, "flos": 22164425458560.0, "grad_norm": 4.0085598491161, "language_loss": 0.76200652, "learning_rate": 6.461458141259395e-07, "loss": 0.78334099, "num_input_tokens_seen": 267141145, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 12384, "time_per_iteration": 3.9353713989257812 }, { "auxiliary_loss_clip": 0.01104336, "auxiliary_loss_mlp": 0.01029642, "balance_loss_clip": 1.01663685, "balance_loss_mlp": 1.03508067, "epoch": 0.7446264842928002, "flos": 24170251714560.0, "grad_norm": 1.798310260353374, "language_loss": 0.79111713, "learning_rate": 6.458591764975823e-07, "loss": 0.81245691, "num_input_tokens_seen": 267159280, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69140625, "step": 12385, "time_per_iteration": 2.4990663528442383 }, { "auxiliary_loss_clip": 0.01109338, "auxiliary_loss_mlp": 0.01035702, "balance_loss_clip": 1.0209384, "balance_loss_mlp": 1.0364449, "epoch": 0.7446866075454682, "flos": 24134556574080.0, "grad_norm": 1.6782412873465722, "language_loss": 0.81385434, "learning_rate": 6.455725902183813e-07, "loss": 0.83530474, "num_input_tokens_seen": 267179390, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7265625, "step": 12386, "time_per_iteration": 2.4964466094970703 }, { "auxiliary_loss_clip": 0.01104832, "auxiliary_loss_mlp": 0.01031741, "balance_loss_clip": 1.0191052, "balance_loss_mlp": 1.03718138, "epoch": 0.7447467307981361, "flos": 23548063305600.0, "grad_norm": 2.6377072315914867, "language_loss": 0.7098062, "learning_rate": 6.452860552992037e-07, "loss": 0.73117197, "num_input_tokens_seen": 267198165, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.67578125, "step": 12387, "time_per_iteration": 2.491828680038452 }, { "auxiliary_loss_clip": 0.01106447, "auxiliary_loss_mlp": 0.01029499, "balance_loss_clip": 1.0177207, "balance_loss_mlp": 1.03656268, "epoch": 0.7448068540508042, "flos": 19567832215680.0, "grad_norm": 2.7022142057672007, "language_loss": 0.70402706, "learning_rate": 6.449995717509138e-07, "loss": 0.7253865, "num_input_tokens_seen": 267214520, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.69921875, "step": 12388, "time_per_iteration": 2.4420480728149414 }, { "auxiliary_loss_clip": 0.01104932, "auxiliary_loss_mlp": 0.01030344, "balance_loss_clip": 1.01779103, "balance_loss_mlp": 1.03528893, "epoch": 0.7448669773034721, "flos": 21839721488640.0, "grad_norm": 1.5441652996682176, "language_loss": 0.85146534, "learning_rate": 6.447131395843761e-07, "loss": 0.87281805, "num_input_tokens_seen": 267236555, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 12389, "time_per_iteration": 2.511608839035034 }, { "auxiliary_loss_clip": 0.01105315, "auxiliary_loss_mlp": 0.01035823, "balance_loss_clip": 1.0231334, "balance_loss_mlp": 1.03517866, "epoch": 0.7449271005561401, "flos": 25155389099520.0, "grad_norm": 1.972023364637574, "language_loss": 0.79338098, "learning_rate": 6.444267588104526e-07, "loss": 0.81479239, "num_input_tokens_seen": 267254800, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 12390, "time_per_iteration": 2.491503953933716 }, { "auxiliary_loss_clip": 0.01108781, "auxiliary_loss_mlp": 0.01030664, "balance_loss_clip": 1.01725912, "balance_loss_mlp": 1.0373739, "epoch": 0.7449872238088081, "flos": 22273342473600.0, "grad_norm": 1.8507765794503461, "language_loss": 0.84925753, "learning_rate": 6.441404294400014e-07, "loss": 0.87065196, "num_input_tokens_seen": 267274610, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 12391, "time_per_iteration": 2.4830808639526367 }, { "auxiliary_loss_clip": 0.011053, "auxiliary_loss_mlp": 0.01028046, "balance_loss_clip": 1.01624441, "balance_loss_mlp": 1.03583932, "epoch": 0.745047347061476, "flos": 20594805966720.0, "grad_norm": 1.6603757628493572, "language_loss": 0.73610067, "learning_rate": 6.438541514838811e-07, "loss": 0.75743413, "num_input_tokens_seen": 267292600, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6953125, "step": 12392, "time_per_iteration": 2.473207712173462 }, { "auxiliary_loss_clip": 0.01102868, "auxiliary_loss_mlp": 0.01037492, "balance_loss_clip": 1.02543402, "balance_loss_mlp": 1.03533125, "epoch": 0.745107470314144, "flos": 22127545169280.0, "grad_norm": 1.8188672378566588, "language_loss": 0.76491779, "learning_rate": 6.435679249529487e-07, "loss": 0.7863214, "num_input_tokens_seen": 267311295, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.67578125, "step": 12393, "time_per_iteration": 2.4980814456939697 }, { "auxiliary_loss_clip": 0.01106297, "auxiliary_loss_mlp": 0.01035501, "balance_loss_clip": 1.02120161, "balance_loss_mlp": 1.03691745, "epoch": 0.745167593566812, "flos": 22236498097920.0, "grad_norm": 2.7477382260997705, "language_loss": 0.72621804, "learning_rate": 6.432817498580552e-07, "loss": 0.74763596, "num_input_tokens_seen": 267328390, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.6953125, "step": 12394, "time_per_iteration": 2.4707584381103516 }, { "auxiliary_loss_clip": 0.01108124, "auxiliary_loss_mlp": 0.01034916, "balance_loss_clip": 1.02183902, "balance_loss_mlp": 1.03696132, "epoch": 0.74522771681948, "flos": 20666232161280.0, "grad_norm": 1.88065156977328, "language_loss": 0.81952894, "learning_rate": 6.429956262100535e-07, "loss": 0.84095931, "num_input_tokens_seen": 267348185, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 12395, "time_per_iteration": 2.467298984527588 }, { "auxiliary_loss_clip": 0.01109637, "auxiliary_loss_mlp": 0.01035143, "balance_loss_clip": 1.02208328, "balance_loss_mlp": 1.03738892, "epoch": 0.7452878400721479, "flos": 21106999952640.0, "grad_norm": 3.564440620251828, "language_loss": 0.71582705, "learning_rate": 6.427095540197937e-07, "loss": 0.73727489, "num_input_tokens_seen": 267367010, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 12396, "time_per_iteration": 2.484802007675171 }, { "auxiliary_loss_clip": 0.01109829, "auxiliary_loss_mlp": 0.01033293, "balance_loss_clip": 1.0201441, "balance_loss_mlp": 1.03774476, "epoch": 0.7453479633248159, "flos": 26688056474880.0, "grad_norm": 1.730544800278088, "language_loss": 0.6857301, "learning_rate": 6.424235332981245e-07, "loss": 0.70716137, "num_input_tokens_seen": 267386605, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 12397, "time_per_iteration": 2.5067105293273926 }, { "auxiliary_loss_clip": 0.01105578, "auxiliary_loss_mlp": 0.01040766, "balance_loss_clip": 1.02799332, "balance_loss_mlp": 1.036991, "epoch": 0.7454080865774838, "flos": 17016056167680.0, "grad_norm": 1.8109644444638728, "language_loss": 0.76569855, "learning_rate": 6.421375640558908e-07, "loss": 0.78716201, "num_input_tokens_seen": 267404135, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 12398, "time_per_iteration": 2.4214792251586914 }, { "auxiliary_loss_clip": 0.0110284, "auxiliary_loss_mlp": 0.01027702, "balance_loss_clip": 1.01525044, "balance_loss_mlp": 1.03484833, "epoch": 0.7454682098301518, "flos": 21323900229120.0, "grad_norm": 1.6585190636802039, "language_loss": 0.77937305, "learning_rate": 6.418516463039363e-07, "loss": 0.80067849, "num_input_tokens_seen": 267423120, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6796875, "step": 12399, "time_per_iteration": 2.474376678466797 }, { "auxiliary_loss_clip": 0.01102115, "auxiliary_loss_mlp": 0.01037378, "balance_loss_clip": 1.02584493, "balance_loss_mlp": 1.0355736, "epoch": 0.7455283330828197, "flos": 17858341163520.0, "grad_norm": 2.061317947755832, "language_loss": 0.7408334, "learning_rate": 6.415657800531038e-07, "loss": 0.76222837, "num_input_tokens_seen": 267441250, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 12400, "time_per_iteration": 2.439640760421753 }, { "auxiliary_loss_clip": 0.01103024, "auxiliary_loss_mlp": 0.01031556, "balance_loss_clip": 1.01925969, "balance_loss_mlp": 1.03488278, "epoch": 0.7455884563354878, "flos": 30774259664640.0, "grad_norm": 1.8084713426943717, "language_loss": 0.82366204, "learning_rate": 6.412799653142327e-07, "loss": 0.8450079, "num_input_tokens_seen": 267462820, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 12401, "time_per_iteration": 2.5280442237854004 }, { "auxiliary_loss_clip": 0.01104054, "auxiliary_loss_mlp": 0.01032606, "balance_loss_clip": 1.02103662, "balance_loss_mlp": 1.03597283, "epoch": 0.7456485795881557, "flos": 23185545292800.0, "grad_norm": 2.275288315451602, "language_loss": 0.65182877, "learning_rate": 6.409942020981611e-07, "loss": 0.67319536, "num_input_tokens_seen": 267483065, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6796875, "step": 12402, "time_per_iteration": 2.4689011573791504 }, { "auxiliary_loss_clip": 0.01103498, "auxiliary_loss_mlp": 0.01034673, "balance_loss_clip": 1.02296662, "balance_loss_mlp": 1.03531957, "epoch": 0.7457087028408237, "flos": 38727144074880.0, "grad_norm": 1.8545939779939868, "language_loss": 0.73114276, "learning_rate": 6.407084904157265e-07, "loss": 0.7525245, "num_input_tokens_seen": 267504825, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 12403, "time_per_iteration": 2.612985849380493 }, { "auxiliary_loss_clip": 0.01030673, "auxiliary_loss_mlp": 0.01005198, "balance_loss_clip": 1.00412512, "balance_loss_mlp": 1.00804973, "epoch": 0.7457688260934917, "flos": 56043737337600.0, "grad_norm": 0.834302469037482, "language_loss": 0.58989787, "learning_rate": 6.404228302777621e-07, "loss": 0.61025655, "num_input_tokens_seen": 267559260, "router_z_loss_clip": 0.01074219, "router_z_loss_mlp": 0.2265625, "step": 12404, "time_per_iteration": 2.907151460647583 }, { "auxiliary_loss_clip": 0.0110307, "auxiliary_loss_mlp": 0.01033954, "balance_loss_clip": 1.0221765, "balance_loss_mlp": 1.03435934, "epoch": 0.7458289493461596, "flos": 20116152305280.0, "grad_norm": 1.706393496101383, "language_loss": 0.77724457, "learning_rate": 6.401372216950995e-07, "loss": 0.79861486, "num_input_tokens_seen": 267578720, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 12405, "time_per_iteration": 2.4553847312927246 }, { "auxiliary_loss_clip": 0.01103145, "auxiliary_loss_mlp": 0.01037316, "balance_loss_clip": 1.02506733, "balance_loss_mlp": 1.03566074, "epoch": 0.7458890725988276, "flos": 20193073280640.0, "grad_norm": 2.774767564289271, "language_loss": 0.69203413, "learning_rate": 6.398516646785698e-07, "loss": 0.71343875, "num_input_tokens_seen": 267598250, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.67578125, "step": 12406, "time_per_iteration": 2.458839178085327 }, { "auxiliary_loss_clip": 0.01109493, "auxiliary_loss_mlp": 0.01036201, "balance_loss_clip": 1.02248561, "balance_loss_mlp": 1.03637719, "epoch": 0.7459491958514956, "flos": 17018749687680.0, "grad_norm": 1.692361305205025, "language_loss": 0.65086639, "learning_rate": 6.39566159239002e-07, "loss": 0.67232335, "num_input_tokens_seen": 267615430, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.73046875, "step": 12407, "time_per_iteration": 2.4517667293548584 }, { "auxiliary_loss_clip": 0.01105997, "auxiliary_loss_mlp": 0.01034192, "balance_loss_clip": 1.02075124, "balance_loss_mlp": 1.0349102, "epoch": 0.7460093191041636, "flos": 25078719519360.0, "grad_norm": 1.9040749371686432, "language_loss": 0.7178632, "learning_rate": 6.392807053872212e-07, "loss": 0.73926508, "num_input_tokens_seen": 267635075, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 12408, "time_per_iteration": 2.4987223148345947 }, { "auxiliary_loss_clip": 0.01110406, "auxiliary_loss_mlp": 0.01032624, "balance_loss_clip": 1.01872969, "balance_loss_mlp": 1.03801775, "epoch": 0.7460694423568315, "flos": 21908525990400.0, "grad_norm": 1.8789310339796599, "language_loss": 0.72860098, "learning_rate": 6.38995303134053e-07, "loss": 0.75003129, "num_input_tokens_seen": 267654105, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.72265625, "step": 12409, "time_per_iteration": 2.481306552886963 }, { "auxiliary_loss_clip": 0.01101489, "auxiliary_loss_mlp": 0.01035869, "balance_loss_clip": 1.02426445, "balance_loss_mlp": 1.03517067, "epoch": 0.7461295656094995, "flos": 21215737399680.0, "grad_norm": 1.7361820299666175, "language_loss": 0.6602006, "learning_rate": 6.38709952490319e-07, "loss": 0.68157417, "num_input_tokens_seen": 267673090, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6640625, "step": 12410, "time_per_iteration": 2.4651875495910645 }, { "auxiliary_loss_clip": 0.01102091, "auxiliary_loss_mlp": 0.01031557, "balance_loss_clip": 1.01912355, "balance_loss_mlp": 1.03494716, "epoch": 0.7461896888621674, "flos": 22346851656960.0, "grad_norm": 2.3498806949113287, "language_loss": 0.84491456, "learning_rate": 6.384246534668396e-07, "loss": 0.86625105, "num_input_tokens_seen": 267690605, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.671875, "step": 12411, "time_per_iteration": 2.4727933406829834 }, { "auxiliary_loss_clip": 0.01106587, "auxiliary_loss_mlp": 0.01029081, "balance_loss_clip": 1.01637375, "balance_loss_mlp": 1.03563833, "epoch": 0.7462498121148354, "flos": 25482930243840.0, "grad_norm": 1.743535454864525, "language_loss": 0.78004688, "learning_rate": 6.381394060744339e-07, "loss": 0.80140352, "num_input_tokens_seen": 267710540, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 12412, "time_per_iteration": 2.52840518951416 }, { "auxiliary_loss_clip": 0.01106031, "auxiliary_loss_mlp": 0.01036131, "balance_loss_clip": 1.0237987, "balance_loss_mlp": 1.03552771, "epoch": 0.7463099353675033, "flos": 33947936812800.0, "grad_norm": 2.0049774586661435, "language_loss": 0.62464261, "learning_rate": 6.378542103239188e-07, "loss": 0.64606422, "num_input_tokens_seen": 267730780, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 12413, "time_per_iteration": 2.5681142807006836 }, { "auxiliary_loss_clip": 0.01029674, "auxiliary_loss_mlp": 0.01004833, "balance_loss_clip": 1.00369406, "balance_loss_mlp": 1.00705802, "epoch": 0.7463700586201714, "flos": 62767723691520.0, "grad_norm": 0.7277973439776914, "language_loss": 0.54989499, "learning_rate": 6.375690662261082e-07, "loss": 0.57024008, "num_input_tokens_seen": 267794240, "router_z_loss_clip": 0.01141357, "router_z_loss_mlp": 0.2265625, "step": 12414, "time_per_iteration": 3.115863561630249 }, { "auxiliary_loss_clip": 0.0110455, "auxiliary_loss_mlp": 0.01030268, "balance_loss_clip": 1.01756001, "balance_loss_mlp": 1.03435576, "epoch": 0.7464301818728393, "flos": 33432654257280.0, "grad_norm": 1.8861260276237668, "language_loss": 0.5508765, "learning_rate": 6.372839737918154e-07, "loss": 0.57222474, "num_input_tokens_seen": 267817190, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 12415, "time_per_iteration": 2.5794222354888916 }, { "auxiliary_loss_clip": 0.01104794, "auxiliary_loss_mlp": 0.01028639, "balance_loss_clip": 1.01548445, "balance_loss_mlp": 1.03615713, "epoch": 0.7464903051255073, "flos": 26869872142080.0, "grad_norm": 1.7731058597559555, "language_loss": 0.74364269, "learning_rate": 6.369989330318506e-07, "loss": 0.7649771, "num_input_tokens_seen": 267836245, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6875, "step": 12416, "time_per_iteration": 2.5172040462493896 }, { "auxiliary_loss_clip": 0.01103884, "auxiliary_loss_mlp": 0.0103387, "balance_loss_clip": 1.02142477, "balance_loss_mlp": 1.03534722, "epoch": 0.7465504283781753, "flos": 44086954775040.0, "grad_norm": 1.5773803162976392, "language_loss": 0.69472075, "learning_rate": 6.367139439570233e-07, "loss": 0.71609825, "num_input_tokens_seen": 267858310, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 12417, "time_per_iteration": 2.6554672718048096 }, { "auxiliary_loss_clip": 0.01108262, "auxiliary_loss_mlp": 0.01032701, "balance_loss_clip": 1.01912868, "balance_loss_mlp": 1.03748953, "epoch": 0.7466105516308432, "flos": 19676102785920.0, "grad_norm": 2.7051676833888427, "language_loss": 0.73904967, "learning_rate": 6.364290065781392e-07, "loss": 0.76045936, "num_input_tokens_seen": 267876345, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.70703125, "step": 12418, "time_per_iteration": 2.471285820007324 }, { "auxiliary_loss_clip": 0.0110495, "auxiliary_loss_mlp": 0.01030712, "balance_loss_clip": 1.01820064, "balance_loss_mlp": 1.03562427, "epoch": 0.7466706748835112, "flos": 20520722165760.0, "grad_norm": 1.968419121515378, "language_loss": 0.69247794, "learning_rate": 6.361441209060039e-07, "loss": 0.71383452, "num_input_tokens_seen": 267896740, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 12419, "time_per_iteration": 2.453641414642334 }, { "auxiliary_loss_clip": 0.01098859, "auxiliary_loss_mlp": 0.01034193, "balance_loss_clip": 1.02243257, "balance_loss_mlp": 1.03423178, "epoch": 0.7467307981361792, "flos": 21690260997120.0, "grad_norm": 2.043220052569175, "language_loss": 0.74211311, "learning_rate": 6.358592869514216e-07, "loss": 0.76344359, "num_input_tokens_seen": 267914765, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.64453125, "step": 12420, "time_per_iteration": 5.325446605682373 }, { "auxiliary_loss_clip": 0.01107989, "auxiliary_loss_mlp": 0.01031749, "balance_loss_clip": 1.01923227, "balance_loss_mlp": 1.03810012, "epoch": 0.7467909213888472, "flos": 19573686132480.0, "grad_norm": 2.3326357167274154, "language_loss": 0.66871113, "learning_rate": 6.355745047251904e-07, "loss": 0.69010854, "num_input_tokens_seen": 267934085, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 12421, "time_per_iteration": 2.457695722579956 }, { "auxiliary_loss_clip": 0.01107925, "auxiliary_loss_mlp": 0.01033348, "balance_loss_clip": 1.01938915, "balance_loss_mlp": 1.03626871, "epoch": 0.7468510446415151, "flos": 23695225326720.0, "grad_norm": 1.598704939260093, "language_loss": 0.72513902, "learning_rate": 6.352897742381107e-07, "loss": 0.74655175, "num_input_tokens_seen": 267955170, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71875, "step": 12422, "time_per_iteration": 2.478099822998047 }, { "auxiliary_loss_clip": 0.01104358, "auxiliary_loss_mlp": 0.01033603, "balance_loss_clip": 1.02094269, "balance_loss_mlp": 1.0361011, "epoch": 0.7469111678941831, "flos": 29315783831040.0, "grad_norm": 2.2546916192512065, "language_loss": 0.74901175, "learning_rate": 6.350050955009796e-07, "loss": 0.77039135, "num_input_tokens_seen": 267974980, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 12423, "time_per_iteration": 2.527698040008545 }, { "auxiliary_loss_clip": 0.01101947, "auxiliary_loss_mlp": 0.01025893, "balance_loss_clip": 1.01439488, "balance_loss_mlp": 1.03382993, "epoch": 0.746971291146851, "flos": 21798639308160.0, "grad_norm": 1.3439986741730674, "language_loss": 0.67570317, "learning_rate": 6.347204685245929e-07, "loss": 0.69698155, "num_input_tokens_seen": 267994985, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6796875, "step": 12424, "time_per_iteration": 3.8900578022003174 }, { "auxiliary_loss_clip": 0.01109285, "auxiliary_loss_mlp": 0.01033506, "balance_loss_clip": 1.02080393, "balance_loss_mlp": 1.03791344, "epoch": 0.747031414399519, "flos": 36245070368640.0, "grad_norm": 2.2778025956472177, "language_loss": 0.74970406, "learning_rate": 6.344358933197418e-07, "loss": 0.77113199, "num_input_tokens_seen": 268014985, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 12425, "time_per_iteration": 4.045286655426025 }, { "auxiliary_loss_clip": 0.01103741, "auxiliary_loss_mlp": 0.01030178, "balance_loss_clip": 1.01732707, "balance_loss_mlp": 1.03466403, "epoch": 0.7470915376521869, "flos": 19974916028160.0, "grad_norm": 1.8969108698865336, "language_loss": 0.69620365, "learning_rate": 6.341513698972194e-07, "loss": 0.71754283, "num_input_tokens_seen": 268034395, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 12426, "time_per_iteration": 2.4674439430236816 }, { "auxiliary_loss_clip": 0.01101726, "auxiliary_loss_mlp": 0.0103687, "balance_loss_clip": 1.02459145, "balance_loss_mlp": 1.0346173, "epoch": 0.747151660904855, "flos": 20084299920000.0, "grad_norm": 1.4391624352319339, "language_loss": 0.65453774, "learning_rate": 6.338668982678139e-07, "loss": 0.67592371, "num_input_tokens_seen": 268054485, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.671875, "step": 12427, "time_per_iteration": 2.470297336578369 }, { "auxiliary_loss_clip": 0.01104618, "auxiliary_loss_mlp": 0.0102741, "balance_loss_clip": 1.01447618, "balance_loss_mlp": 1.03556144, "epoch": 0.7472117841575229, "flos": 16290373697280.0, "grad_norm": 7.494483656883162, "language_loss": 0.74711847, "learning_rate": 6.335824784423118e-07, "loss": 0.76843882, "num_input_tokens_seen": 268072250, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 12428, "time_per_iteration": 2.4277772903442383 }, { "auxiliary_loss_clip": 0.0110796, "auxiliary_loss_mlp": 0.01031254, "balance_loss_clip": 1.01744318, "balance_loss_mlp": 1.03597093, "epoch": 0.7472719074101909, "flos": 21389939383680.0, "grad_norm": 1.9078067162434549, "language_loss": 0.58291882, "learning_rate": 6.33298110431499e-07, "loss": 0.60431099, "num_input_tokens_seen": 268089840, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71875, "step": 12429, "time_per_iteration": 2.4666061401367188 }, { "auxiliary_loss_clip": 0.01108092, "auxiliary_loss_mlp": 0.01033324, "balance_loss_clip": 1.02034187, "balance_loss_mlp": 1.03613651, "epoch": 0.7473320306628589, "flos": 29643289061760.0, "grad_norm": 2.8594812053284935, "language_loss": 0.60337305, "learning_rate": 6.330137942461595e-07, "loss": 0.62478727, "num_input_tokens_seen": 268109360, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 12430, "time_per_iteration": 2.565128803253174 }, { "auxiliary_loss_clip": 0.01103683, "auxiliary_loss_mlp": 0.01031101, "balance_loss_clip": 1.01901889, "balance_loss_mlp": 1.03577948, "epoch": 0.7473921539155268, "flos": 24136100858880.0, "grad_norm": 4.112680807124241, "language_loss": 0.75747842, "learning_rate": 6.327295298970734e-07, "loss": 0.77882624, "num_input_tokens_seen": 268131840, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 12431, "time_per_iteration": 2.5030899047851562 }, { "auxiliary_loss_clip": 0.01104482, "auxiliary_loss_mlp": 0.01029687, "balance_loss_clip": 1.0167048, "balance_loss_mlp": 1.03474092, "epoch": 0.7474522771681948, "flos": 17487958072320.0, "grad_norm": 1.8687108767521612, "language_loss": 0.75792605, "learning_rate": 6.32445317395021e-07, "loss": 0.77926767, "num_input_tokens_seen": 268148300, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 12432, "time_per_iteration": 2.447894811630249 }, { "auxiliary_loss_clip": 0.01108177, "auxiliary_loss_mlp": 0.01033353, "balance_loss_clip": 1.01914966, "balance_loss_mlp": 1.03529489, "epoch": 0.7475124004208628, "flos": 16727298733440.0, "grad_norm": 2.1782941625946077, "language_loss": 0.69924438, "learning_rate": 6.321611567507787e-07, "loss": 0.72065961, "num_input_tokens_seen": 268166450, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7265625, "step": 12433, "time_per_iteration": 2.428101062774658 }, { "auxiliary_loss_clip": 0.01105958, "auxiliary_loss_mlp": 0.01032729, "balance_loss_clip": 1.01911521, "balance_loss_mlp": 1.03541517, "epoch": 0.7475725236735308, "flos": 19720237622400.0, "grad_norm": 1.9743790759288737, "language_loss": 0.67325902, "learning_rate": 6.318770479751232e-07, "loss": 0.69464588, "num_input_tokens_seen": 268186165, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.70703125, "step": 12434, "time_per_iteration": 2.4726803302764893 }, { "auxiliary_loss_clip": 0.01100398, "auxiliary_loss_mlp": 0.01030225, "balance_loss_clip": 1.01890588, "balance_loss_mlp": 1.03590357, "epoch": 0.7476326469261987, "flos": 26286000566400.0, "grad_norm": 1.5305511774902538, "language_loss": 0.79426765, "learning_rate": 6.315929910788263e-07, "loss": 0.81557387, "num_input_tokens_seen": 268208145, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.64453125, "step": 12435, "time_per_iteration": 2.5056803226470947 }, { "auxiliary_loss_clip": 0.01107335, "auxiliary_loss_mlp": 0.01026054, "balance_loss_clip": 1.01381159, "balance_loss_mlp": 1.03612518, "epoch": 0.7476927701788667, "flos": 31831828824960.0, "grad_norm": 1.8366761167715857, "language_loss": 0.67784548, "learning_rate": 6.313089860726604e-07, "loss": 0.69917935, "num_input_tokens_seen": 268228345, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 12436, "time_per_iteration": 2.553083896636963 }, { "auxiliary_loss_clip": 0.01107405, "auxiliary_loss_mlp": 0.01031843, "balance_loss_clip": 1.01940346, "balance_loss_mlp": 1.03591049, "epoch": 0.7477528934315346, "flos": 31795487239680.0, "grad_norm": 1.6687315622289254, "language_loss": 0.71021247, "learning_rate": 6.31025032967396e-07, "loss": 0.73160493, "num_input_tokens_seen": 268250260, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71484375, "step": 12437, "time_per_iteration": 2.5320467948913574 }, { "auxiliary_loss_clip": 0.01099251, "auxiliary_loss_mlp": 0.01026396, "balance_loss_clip": 1.01513088, "balance_loss_mlp": 1.03396392, "epoch": 0.7478130166842026, "flos": 20371979946240.0, "grad_norm": 3.1875674990774105, "language_loss": 0.67038155, "learning_rate": 6.307411317737986e-07, "loss": 0.69163805, "num_input_tokens_seen": 268268440, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.65625, "step": 12438, "time_per_iteration": 2.480896472930908 }, { "auxiliary_loss_clip": 0.01105028, "auxiliary_loss_mlp": 0.01029625, "balance_loss_clip": 1.01719725, "balance_loss_mlp": 1.03581333, "epoch": 0.7478731399368705, "flos": 18148930191360.0, "grad_norm": 1.7968649363226405, "language_loss": 0.80532342, "learning_rate": 6.304572825026344e-07, "loss": 0.82666993, "num_input_tokens_seen": 268285765, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 12439, "time_per_iteration": 2.43450927734375 }, { "auxiliary_loss_clip": 0.01102296, "auxiliary_loss_mlp": 0.01033332, "balance_loss_clip": 1.02153063, "balance_loss_mlp": 1.03426075, "epoch": 0.7479332631895386, "flos": 15267889146240.0, "grad_norm": 3.101397761973738, "language_loss": 0.71010542, "learning_rate": 6.301734851646674e-07, "loss": 0.7314617, "num_input_tokens_seen": 268304015, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 12440, "time_per_iteration": 2.4477944374084473 }, { "auxiliary_loss_clip": 0.01102818, "auxiliary_loss_mlp": 0.01029607, "balance_loss_clip": 1.01794195, "balance_loss_mlp": 1.03619623, "epoch": 0.7479933864422065, "flos": 21142515525120.0, "grad_norm": 1.6599599695211027, "language_loss": 0.74387801, "learning_rate": 6.298897397706597e-07, "loss": 0.76520222, "num_input_tokens_seen": 268323290, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6640625, "step": 12441, "time_per_iteration": 2.495567798614502 }, { "auxiliary_loss_clip": 0.01106188, "auxiliary_loss_mlp": 0.01031396, "balance_loss_clip": 1.01830721, "balance_loss_mlp": 1.03592849, "epoch": 0.7480535096948745, "flos": 14392027912320.0, "grad_norm": 2.4194546775717436, "language_loss": 0.82611859, "learning_rate": 6.296060463313698e-07, "loss": 0.84749442, "num_input_tokens_seen": 268339490, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 12442, "time_per_iteration": 2.456336259841919 }, { "auxiliary_loss_clip": 0.01106814, "auxiliary_loss_mlp": 0.01030648, "balance_loss_clip": 1.01745784, "balance_loss_mlp": 1.03666008, "epoch": 0.7481136329475425, "flos": 27344683048320.0, "grad_norm": 1.7642466228505747, "language_loss": 0.62217927, "learning_rate": 6.293224048575565e-07, "loss": 0.64355385, "num_input_tokens_seen": 268359865, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 12443, "time_per_iteration": 2.554640054702759 }, { "auxiliary_loss_clip": 0.01102262, "auxiliary_loss_mlp": 0.01027962, "balance_loss_clip": 1.01616645, "balance_loss_mlp": 1.03526258, "epoch": 0.7481737562002104, "flos": 19531454716800.0, "grad_norm": 1.9112468246586167, "language_loss": 0.71441126, "learning_rate": 6.29038815359975e-07, "loss": 0.73571348, "num_input_tokens_seen": 268377065, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 12444, "time_per_iteration": 2.5362415313720703 }, { "auxiliary_loss_clip": 0.0110518, "auxiliary_loss_mlp": 0.01031539, "balance_loss_clip": 1.01931453, "balance_loss_mlp": 1.03629839, "epoch": 0.7482338794528784, "flos": 21760035166080.0, "grad_norm": 1.5043927565459172, "language_loss": 0.68986833, "learning_rate": 6.287552778493786e-07, "loss": 0.71123558, "num_input_tokens_seen": 268396935, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 12445, "time_per_iteration": 2.5250966548919678 }, { "auxiliary_loss_clip": 0.01101827, "auxiliary_loss_mlp": 0.01022256, "balance_loss_clip": 1.01052606, "balance_loss_mlp": 1.03447962, "epoch": 0.7482940027055464, "flos": 18697358021760.0, "grad_norm": 1.6514659630382529, "language_loss": 0.73973852, "learning_rate": 6.28471792336519e-07, "loss": 0.76097941, "num_input_tokens_seen": 268414460, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 12446, "time_per_iteration": 2.4574766159057617 }, { "auxiliary_loss_clip": 0.01109302, "auxiliary_loss_mlp": 0.01030678, "balance_loss_clip": 1.017416, "balance_loss_mlp": 1.0375669, "epoch": 0.7483541259582144, "flos": 15998024903040.0, "grad_norm": 2.1157702177051982, "language_loss": 0.73039305, "learning_rate": 6.281883588321475e-07, "loss": 0.75179285, "num_input_tokens_seen": 268432225, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 12447, "time_per_iteration": 2.4662880897521973 }, { "auxiliary_loss_clip": 0.01103076, "auxiliary_loss_mlp": 0.01031509, "balance_loss_clip": 1.02000499, "balance_loss_mlp": 1.03443253, "epoch": 0.7484142492108823, "flos": 25556295772800.0, "grad_norm": 3.5164444540482482, "language_loss": 0.71995795, "learning_rate": 6.279049773470109e-07, "loss": 0.7413038, "num_input_tokens_seen": 268449270, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 12448, "time_per_iteration": 2.506514072418213 }, { "auxiliary_loss_clip": 0.01105559, "auxiliary_loss_mlp": 0.01037161, "balance_loss_clip": 1.02494192, "balance_loss_mlp": 1.03588033, "epoch": 0.7484743724635503, "flos": 22887737631360.0, "grad_norm": 1.9473350987323619, "language_loss": 0.73839909, "learning_rate": 6.276216478918543e-07, "loss": 0.7598263, "num_input_tokens_seen": 268467250, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 12449, "time_per_iteration": 2.503542900085449 }, { "auxiliary_loss_clip": 0.01110866, "auxiliary_loss_mlp": 0.01034832, "balance_loss_clip": 1.02211237, "balance_loss_mlp": 1.03835106, "epoch": 0.7485344957162182, "flos": 25300288563840.0, "grad_norm": 2.2862266914187233, "language_loss": 0.61090964, "learning_rate": 6.273383704774225e-07, "loss": 0.6323666, "num_input_tokens_seen": 268487270, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 12450, "time_per_iteration": 2.5112478733062744 }, { "auxiliary_loss_clip": 0.01098588, "auxiliary_loss_mlp": 0.01024869, "balance_loss_clip": 1.01323438, "balance_loss_mlp": 1.03329694, "epoch": 0.7485946189688862, "flos": 27053016612480.0, "grad_norm": 1.6866926633032506, "language_loss": 0.7029534, "learning_rate": 6.270551451144577e-07, "loss": 0.72418797, "num_input_tokens_seen": 268508020, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.65234375, "step": 12451, "time_per_iteration": 2.4845597743988037 }, { "auxiliary_loss_clip": 0.01108025, "auxiliary_loss_mlp": 0.01029494, "balance_loss_clip": 1.01654243, "balance_loss_mlp": 1.03624535, "epoch": 0.7486547422215541, "flos": 26906752431360.0, "grad_norm": 2.641766692010702, "language_loss": 0.80939078, "learning_rate": 6.267719718136988e-07, "loss": 0.83076602, "num_input_tokens_seen": 268527375, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 12452, "time_per_iteration": 2.4974477291107178 }, { "auxiliary_loss_clip": 0.01111426, "auxiliary_loss_mlp": 0.01033618, "balance_loss_clip": 1.02048087, "balance_loss_mlp": 1.03897583, "epoch": 0.7487148654742222, "flos": 22346277039360.0, "grad_norm": 2.129175882648925, "language_loss": 0.71464479, "learning_rate": 6.264888505858843e-07, "loss": 0.73609519, "num_input_tokens_seen": 268544870, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 12453, "time_per_iteration": 2.4546282291412354 }, { "auxiliary_loss_clip": 0.01107481, "auxiliary_loss_mlp": 0.01036865, "balance_loss_clip": 1.02451539, "balance_loss_mlp": 1.03817368, "epoch": 0.7487749887268901, "flos": 23038814234880.0, "grad_norm": 1.5078380141827028, "language_loss": 0.7395559, "learning_rate": 6.262057814417517e-07, "loss": 0.76099944, "num_input_tokens_seen": 268564580, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69140625, "step": 12454, "time_per_iteration": 2.4578676223754883 }, { "auxiliary_loss_clip": 0.01030346, "auxiliary_loss_mlp": 0.01008752, "balance_loss_clip": 1.00766075, "balance_loss_mlp": 1.00745738, "epoch": 0.7488351119795581, "flos": 71525294536320.0, "grad_norm": 0.7449164703179778, "language_loss": 0.59403503, "learning_rate": 6.259227643920322e-07, "loss": 0.61442596, "num_input_tokens_seen": 268629550, "router_z_loss_clip": 0.01092529, "router_z_loss_mlp": 0.22851562, "step": 12455, "time_per_iteration": 3.2375986576080322 }, { "auxiliary_loss_clip": 0.01103765, "auxiliary_loss_mlp": 0.01030921, "balance_loss_clip": 1.01859522, "balance_loss_mlp": 1.03644156, "epoch": 0.748895235232226, "flos": 17196255722880.0, "grad_norm": 2.2991721041880036, "language_loss": 0.79691952, "learning_rate": 6.256397994474592e-07, "loss": 0.81826639, "num_input_tokens_seen": 268646645, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 12456, "time_per_iteration": 2.4310522079467773 }, { "auxiliary_loss_clip": 0.01029932, "auxiliary_loss_mlp": 0.01003972, "balance_loss_clip": 1.00285172, "balance_loss_mlp": 1.00722599, "epoch": 0.748955358484894, "flos": 58979256336000.0, "grad_norm": 0.838573642486511, "language_loss": 0.6140905, "learning_rate": 6.25356886618763e-07, "loss": 0.63442957, "num_input_tokens_seen": 268702275, "router_z_loss_clip": 0.01123047, "router_z_loss_mlp": 0.22753906, "step": 12457, "time_per_iteration": 3.0064332485198975 }, { "auxiliary_loss_clip": 0.01109508, "auxiliary_loss_mlp": 0.01030188, "balance_loss_clip": 1.01781964, "balance_loss_mlp": 1.03895831, "epoch": 0.749015481737562, "flos": 11360413054080.0, "grad_norm": 2.3819464183290875, "language_loss": 0.67722452, "learning_rate": 6.250740259166711e-07, "loss": 0.69862151, "num_input_tokens_seen": 268716265, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 12458, "time_per_iteration": 2.4432296752929688 }, { "auxiliary_loss_clip": 0.01103682, "auxiliary_loss_mlp": 0.01033175, "balance_loss_clip": 1.02114105, "balance_loss_mlp": 1.03554678, "epoch": 0.74907560499023, "flos": 21106497162240.0, "grad_norm": 1.7761746232328395, "language_loss": 0.80018234, "learning_rate": 6.247912173519106e-07, "loss": 0.82155097, "num_input_tokens_seen": 268734330, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 12459, "time_per_iteration": 2.4402873516082764 }, { "auxiliary_loss_clip": 0.0110342, "auxiliary_loss_mlp": 0.01035565, "balance_loss_clip": 1.02260709, "balance_loss_mlp": 1.03611934, "epoch": 0.749135728242898, "flos": 22268027260800.0, "grad_norm": 2.455727195191648, "language_loss": 0.80336845, "learning_rate": 6.245084609352043e-07, "loss": 0.82475835, "num_input_tokens_seen": 268753500, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.671875, "step": 12460, "time_per_iteration": 2.464339017868042 }, { "auxiliary_loss_clip": 0.01104707, "auxiliary_loss_mlp": 0.01031933, "balance_loss_clip": 1.01849771, "balance_loss_mlp": 1.03624642, "epoch": 0.7491958514955659, "flos": 24057527857920.0, "grad_norm": 1.7461703187852657, "language_loss": 0.85979062, "learning_rate": 6.242257566772755e-07, "loss": 0.88115704, "num_input_tokens_seen": 268772055, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6875, "step": 12461, "time_per_iteration": 2.4747848510742188 }, { "auxiliary_loss_clip": 0.01102008, "auxiliary_loss_mlp": 0.01030774, "balance_loss_clip": 1.01881146, "balance_loss_mlp": 1.03581929, "epoch": 0.7492559747482339, "flos": 24492118510080.0, "grad_norm": 6.7421519121163875, "language_loss": 0.69268483, "learning_rate": 6.239431045888435e-07, "loss": 0.71401262, "num_input_tokens_seen": 268792265, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.66015625, "step": 12462, "time_per_iteration": 5.304335832595825 }, { "auxiliary_loss_clip": 0.01103639, "auxiliary_loss_mlp": 0.01028715, "balance_loss_clip": 1.01599503, "balance_loss_mlp": 1.0352025, "epoch": 0.7493160980009018, "flos": 27745338326400.0, "grad_norm": 1.8383779840915202, "language_loss": 0.70258272, "learning_rate": 6.236605046806267e-07, "loss": 0.72390622, "num_input_tokens_seen": 268812735, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 12463, "time_per_iteration": 2.5039427280426025 }, { "auxiliary_loss_clip": 0.01105086, "auxiliary_loss_mlp": 0.0102977, "balance_loss_clip": 1.01840925, "balance_loss_mlp": 1.03628564, "epoch": 0.7493762212535698, "flos": 30226190970240.0, "grad_norm": 2.2659252463334467, "language_loss": 0.77129447, "learning_rate": 6.233779569633419e-07, "loss": 0.79264295, "num_input_tokens_seen": 268833090, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6875, "step": 12464, "time_per_iteration": 2.52469801902771 }, { "auxiliary_loss_clip": 0.01101707, "auxiliary_loss_mlp": 0.01027451, "balance_loss_clip": 1.01560736, "balance_loss_mlp": 1.03325462, "epoch": 0.7494363445062378, "flos": 21944472526080.0, "grad_norm": 2.3662948297378223, "language_loss": 0.78184825, "learning_rate": 6.230954614477034e-07, "loss": 0.80313981, "num_input_tokens_seen": 268851880, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 12465, "time_per_iteration": 3.878727674484253 }, { "auxiliary_loss_clip": 0.01112669, "auxiliary_loss_mlp": 0.01032279, "balance_loss_clip": 1.01817107, "balance_loss_mlp": 1.0381645, "epoch": 0.7494964677589058, "flos": 12490342162560.0, "grad_norm": 2.8298851852736426, "language_loss": 0.73895431, "learning_rate": 6.22813018144422e-07, "loss": 0.76040381, "num_input_tokens_seen": 268867910, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.74609375, "step": 12466, "time_per_iteration": 2.438267230987549 }, { "auxiliary_loss_clip": 0.01106238, "auxiliary_loss_mlp": 0.010318, "balance_loss_clip": 1.0193069, "balance_loss_mlp": 1.03660119, "epoch": 0.7495565910115737, "flos": 21653057485440.0, "grad_norm": 2.03602481908739, "language_loss": 0.66374964, "learning_rate": 6.22530627064209e-07, "loss": 0.68513006, "num_input_tokens_seen": 268887260, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 12467, "time_per_iteration": 3.916343927383423 }, { "auxiliary_loss_clip": 0.01104695, "auxiliary_loss_mlp": 0.01038177, "balance_loss_clip": 1.02519548, "balance_loss_mlp": 1.03548574, "epoch": 0.7496167142642417, "flos": 15268535591040.0, "grad_norm": 2.492312953258179, "language_loss": 0.76545185, "learning_rate": 6.222482882177735e-07, "loss": 0.78688061, "num_input_tokens_seen": 268902520, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69140625, "step": 12468, "time_per_iteration": 2.4295618534088135 }, { "auxiliary_loss_clip": 0.01103414, "auxiliary_loss_mlp": 0.01031484, "balance_loss_clip": 1.01838326, "balance_loss_mlp": 1.03574014, "epoch": 0.7496768375169096, "flos": 22054933825920.0, "grad_norm": 2.380790309181434, "language_loss": 0.69534373, "learning_rate": 6.219660016158201e-07, "loss": 0.71669275, "num_input_tokens_seen": 268920970, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.67578125, "step": 12469, "time_per_iteration": 2.4873526096343994 }, { "auxiliary_loss_clip": 0.01106402, "auxiliary_loss_mlp": 0.01032733, "balance_loss_clip": 1.01976252, "balance_loss_mlp": 1.03659415, "epoch": 0.7497369607695776, "flos": 19057038860160.0, "grad_norm": 2.1115981212043757, "language_loss": 0.69160241, "learning_rate": 6.216837672690543e-07, "loss": 0.71299374, "num_input_tokens_seen": 268936600, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 12470, "time_per_iteration": 2.4506800174713135 }, { "auxiliary_loss_clip": 0.01107789, "auxiliary_loss_mlp": 0.0103352, "balance_loss_clip": 1.0193882, "balance_loss_mlp": 1.03480768, "epoch": 0.7497970840222457, "flos": 21617434172160.0, "grad_norm": 1.8934682455047898, "language_loss": 0.75551498, "learning_rate": 6.214015851881793e-07, "loss": 0.77692813, "num_input_tokens_seen": 268956560, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7265625, "step": 12471, "time_per_iteration": 2.4729113578796387 }, { "auxiliary_loss_clip": 0.0110689, "auxiliary_loss_mlp": 0.01029449, "balance_loss_clip": 1.01669395, "balance_loss_mlp": 1.03668344, "epoch": 0.7498572072749136, "flos": 13735580906880.0, "grad_norm": 10.283764568537883, "language_loss": 0.76683807, "learning_rate": 6.211194553838929e-07, "loss": 0.78820145, "num_input_tokens_seen": 268973945, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 12472, "time_per_iteration": 2.453712224960327 }, { "auxiliary_loss_clip": 0.01102941, "auxiliary_loss_mlp": 0.01033366, "balance_loss_clip": 1.02112937, "balance_loss_mlp": 1.03467929, "epoch": 0.7499173305275816, "flos": 22966526113920.0, "grad_norm": 1.5303645888743285, "language_loss": 0.84521228, "learning_rate": 6.208373778668951e-07, "loss": 0.86657536, "num_input_tokens_seen": 268993245, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 12473, "time_per_iteration": 2.467562437057495 }, { "auxiliary_loss_clip": 0.01110311, "auxiliary_loss_mlp": 0.01033015, "balance_loss_clip": 1.01941299, "balance_loss_mlp": 1.03786969, "epoch": 0.7499774537802495, "flos": 22740467869440.0, "grad_norm": 2.828279411266656, "language_loss": 0.73949689, "learning_rate": 6.205553526478829e-07, "loss": 0.76093018, "num_input_tokens_seen": 269012125, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.72265625, "step": 12474, "time_per_iteration": 2.4526803493499756 }, { "auxiliary_loss_clip": 0.01109078, "auxiliary_loss_mlp": 0.01034218, "balance_loss_clip": 1.0212059, "balance_loss_mlp": 1.035725, "epoch": 0.7500375770329175, "flos": 18296559089280.0, "grad_norm": 1.8824600314485518, "language_loss": 0.74581158, "learning_rate": 6.202733797375492e-07, "loss": 0.76724452, "num_input_tokens_seen": 269030545, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 12475, "time_per_iteration": 2.432419538497925 }, { "auxiliary_loss_clip": 0.01111642, "auxiliary_loss_mlp": 0.01036316, "balance_loss_clip": 1.0230062, "balance_loss_mlp": 1.0378828, "epoch": 0.7500977002855854, "flos": 19169978198400.0, "grad_norm": 5.079888460540549, "language_loss": 0.80453539, "learning_rate": 6.199914591465878e-07, "loss": 0.826015, "num_input_tokens_seen": 269048180, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.734375, "step": 12476, "time_per_iteration": 2.453348159790039 }, { "auxiliary_loss_clip": 0.01104442, "auxiliary_loss_mlp": 0.01031807, "balance_loss_clip": 1.0194273, "balance_loss_mlp": 1.03549755, "epoch": 0.7501578235382534, "flos": 22163886754560.0, "grad_norm": 2.2229879292896118, "language_loss": 0.7783758, "learning_rate": 6.19709590885688e-07, "loss": 0.79973829, "num_input_tokens_seen": 269068600, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69140625, "step": 12477, "time_per_iteration": 2.4694766998291016 }, { "auxiliary_loss_clip": 0.01029673, "auxiliary_loss_mlp": 0.01004259, "balance_loss_clip": 1.00315607, "balance_loss_mlp": 1.00686216, "epoch": 0.7502179467909214, "flos": 64465040033280.0, "grad_norm": 0.809074576366427, "language_loss": 0.54459953, "learning_rate": 6.194277749655394e-07, "loss": 0.5649389, "num_input_tokens_seen": 269119045, "router_z_loss_clip": 0.01104736, "router_z_loss_mlp": 0.22851562, "step": 12478, "time_per_iteration": 3.064948797225952 }, { "auxiliary_loss_clip": 0.01102735, "auxiliary_loss_mlp": 0.01032328, "balance_loss_clip": 1.02017438, "balance_loss_mlp": 1.03591847, "epoch": 0.7502780700435894, "flos": 20478275268480.0, "grad_norm": 1.7292519895057896, "language_loss": 0.80087852, "learning_rate": 6.191460113968272e-07, "loss": 0.82222915, "num_input_tokens_seen": 269136755, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.66796875, "step": 12479, "time_per_iteration": 2.440683126449585 }, { "auxiliary_loss_clip": 0.01110138, "auxiliary_loss_mlp": 0.01037473, "balance_loss_clip": 1.02364516, "balance_loss_mlp": 1.03734708, "epoch": 0.7503381932962573, "flos": 20445273648000.0, "grad_norm": 2.415334891274884, "language_loss": 0.63218427, "learning_rate": 6.188643001902369e-07, "loss": 0.6536603, "num_input_tokens_seen": 269156120, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 12480, "time_per_iteration": 2.485990524291992 }, { "auxiliary_loss_clip": 0.01102991, "auxiliary_loss_mlp": 0.01037964, "balance_loss_clip": 1.02591765, "balance_loss_mlp": 1.03670478, "epoch": 0.7503983165489253, "flos": 22381936266240.0, "grad_norm": 2.0242031829717435, "language_loss": 0.77901596, "learning_rate": 6.185826413564512e-07, "loss": 0.80042553, "num_input_tokens_seen": 269175650, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6640625, "step": 12481, "time_per_iteration": 2.4530744552612305 }, { "auxiliary_loss_clip": 0.01105919, "auxiliary_loss_mlp": 0.01034313, "balance_loss_clip": 1.02091336, "balance_loss_mlp": 1.03484237, "epoch": 0.7504584398015932, "flos": 24899453717760.0, "grad_norm": 2.7191648292359027, "language_loss": 0.71593934, "learning_rate": 6.183010349061501e-07, "loss": 0.73734164, "num_input_tokens_seen": 269197080, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 12482, "time_per_iteration": 2.526646852493286 }, { "auxiliary_loss_clip": 0.01106337, "auxiliary_loss_mlp": 0.01037351, "balance_loss_clip": 1.02508426, "balance_loss_mlp": 1.03644037, "epoch": 0.7505185630542612, "flos": 25885237547520.0, "grad_norm": 1.6714330818316279, "language_loss": 0.70424503, "learning_rate": 6.180194808500118e-07, "loss": 0.7256819, "num_input_tokens_seen": 269218600, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69921875, "step": 12483, "time_per_iteration": 2.502502202987671 }, { "auxiliary_loss_clip": 0.01104609, "auxiliary_loss_mlp": 0.01026701, "balance_loss_clip": 1.01526308, "balance_loss_mlp": 1.03574347, "epoch": 0.7505786863069293, "flos": 23143852581120.0, "grad_norm": 2.0071095936663266, "language_loss": 0.74582374, "learning_rate": 6.177379791987131e-07, "loss": 0.76713681, "num_input_tokens_seen": 269239245, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6875, "step": 12484, "time_per_iteration": 2.4606332778930664 }, { "auxiliary_loss_clip": 0.01103462, "auxiliary_loss_mlp": 0.01031456, "balance_loss_clip": 1.0192008, "balance_loss_mlp": 1.03490543, "epoch": 0.7506388095595972, "flos": 16983377769600.0, "grad_norm": 1.9207587934705714, "language_loss": 0.84698141, "learning_rate": 6.174565299629295e-07, "loss": 0.8683306, "num_input_tokens_seen": 269258520, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 12485, "time_per_iteration": 2.4618537425994873 }, { "auxiliary_loss_clip": 0.01103597, "auxiliary_loss_mlp": 0.0103091, "balance_loss_clip": 1.01845849, "balance_loss_mlp": 1.03541636, "epoch": 0.7506989328122652, "flos": 22344984149760.0, "grad_norm": 1.4447116516694558, "language_loss": 0.78263986, "learning_rate": 6.171751331533323e-07, "loss": 0.80398494, "num_input_tokens_seen": 269278320, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6796875, "step": 12486, "time_per_iteration": 2.4792141914367676 }, { "auxiliary_loss_clip": 0.01108465, "auxiliary_loss_mlp": 0.01030216, "balance_loss_clip": 1.01703787, "balance_loss_mlp": 1.03734314, "epoch": 0.7507590560649331, "flos": 25776069137280.0, "grad_norm": 2.0340881341468555, "language_loss": 0.72483003, "learning_rate": 6.168937887805932e-07, "loss": 0.74621689, "num_input_tokens_seen": 269298025, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 12487, "time_per_iteration": 2.493753433227539 }, { "auxiliary_loss_clip": 0.01106003, "auxiliary_loss_mlp": 0.01027877, "balance_loss_clip": 1.01496077, "balance_loss_mlp": 1.03493273, "epoch": 0.7508191793176011, "flos": 24279420124800.0, "grad_norm": 1.9393227312759982, "language_loss": 0.67714238, "learning_rate": 6.166124968553801e-07, "loss": 0.6984812, "num_input_tokens_seen": 269316770, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 12488, "time_per_iteration": 2.4859812259674072 }, { "auxiliary_loss_clip": 0.01105852, "auxiliary_loss_mlp": 0.01027738, "balance_loss_clip": 1.01474404, "balance_loss_mlp": 1.03644657, "epoch": 0.750879302570269, "flos": 19899575251200.0, "grad_norm": 1.7700161258745617, "language_loss": 0.77236032, "learning_rate": 6.163312573883592e-07, "loss": 0.79369628, "num_input_tokens_seen": 269334755, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 12489, "time_per_iteration": 2.454756021499634 }, { "auxiliary_loss_clip": 0.01102604, "auxiliary_loss_mlp": 0.01031431, "balance_loss_clip": 1.01978421, "balance_loss_mlp": 1.0354836, "epoch": 0.750939425822937, "flos": 29205681667200.0, "grad_norm": 2.200850552961705, "language_loss": 0.75062418, "learning_rate": 6.160500703901956e-07, "loss": 0.77196449, "num_input_tokens_seen": 269353810, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 12490, "time_per_iteration": 2.5340120792388916 }, { "auxiliary_loss_clip": 0.01105248, "auxiliary_loss_mlp": 0.01030288, "balance_loss_clip": 1.01791382, "balance_loss_mlp": 1.03684306, "epoch": 0.750999549075605, "flos": 21142300043520.0, "grad_norm": 1.8261265996437857, "language_loss": 0.78145695, "learning_rate": 6.157689358715527e-07, "loss": 0.80281234, "num_input_tokens_seen": 269372910, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 12491, "time_per_iteration": 2.4477579593658447 }, { "auxiliary_loss_clip": 0.01101768, "auxiliary_loss_mlp": 0.0102822, "balance_loss_clip": 1.01690745, "balance_loss_mlp": 1.03465557, "epoch": 0.751059672328273, "flos": 23547740083200.0, "grad_norm": 1.6534674185774563, "language_loss": 0.76827449, "learning_rate": 6.154878538430899e-07, "loss": 0.78957438, "num_input_tokens_seen": 269391545, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 12492, "time_per_iteration": 2.4780828952789307 }, { "auxiliary_loss_clip": 0.01101736, "auxiliary_loss_mlp": 0.01029036, "balance_loss_clip": 1.01738882, "balance_loss_mlp": 1.03282833, "epoch": 0.7511197955809409, "flos": 18989742729600.0, "grad_norm": 1.9084137759232824, "language_loss": 0.7119292, "learning_rate": 6.152068243154671e-07, "loss": 0.73323691, "num_input_tokens_seen": 269408530, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6875, "step": 12493, "time_per_iteration": 2.4330592155456543 }, { "auxiliary_loss_clip": 0.01106998, "auxiliary_loss_mlp": 0.01027684, "balance_loss_clip": 1.01516068, "balance_loss_mlp": 1.03731465, "epoch": 0.7511799188336089, "flos": 22046961006720.0, "grad_norm": 1.6512753136790932, "language_loss": 0.80321199, "learning_rate": 6.149258472993395e-07, "loss": 0.82455885, "num_input_tokens_seen": 269425930, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 12494, "time_per_iteration": 2.4450571537017822 }, { "auxiliary_loss_clip": 0.01105881, "auxiliary_loss_mlp": 0.01028547, "balance_loss_clip": 1.01542795, "balance_loss_mlp": 1.03608704, "epoch": 0.7512400420862768, "flos": 16467125546880.0, "grad_norm": 1.786282544587762, "language_loss": 0.78442341, "learning_rate": 6.146449228053634e-07, "loss": 0.80576771, "num_input_tokens_seen": 269443945, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 12495, "time_per_iteration": 2.4377758502960205 }, { "auxiliary_loss_clip": 0.01105036, "auxiliary_loss_mlp": 0.01033311, "balance_loss_clip": 1.02094913, "balance_loss_mlp": 1.03609014, "epoch": 0.7513001653389448, "flos": 20448326304000.0, "grad_norm": 2.147305631692901, "language_loss": 0.71260905, "learning_rate": 6.143640508441898e-07, "loss": 0.73399258, "num_input_tokens_seen": 269463625, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 12496, "time_per_iteration": 2.4550182819366455 }, { "auxiliary_loss_clip": 0.01105919, "auxiliary_loss_mlp": 0.01033781, "balance_loss_clip": 1.02200294, "balance_loss_mlp": 1.03658605, "epoch": 0.7513602885916129, "flos": 23476816679040.0, "grad_norm": 2.504553785074358, "language_loss": 0.78409553, "learning_rate": 6.140832314264705e-07, "loss": 0.80549252, "num_input_tokens_seen": 269483415, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.69140625, "step": 12497, "time_per_iteration": 2.4715168476104736 }, { "auxiliary_loss_clip": 0.01106497, "auxiliary_loss_mlp": 0.01036027, "balance_loss_clip": 1.02271688, "balance_loss_mlp": 1.0355593, "epoch": 0.7514204118442808, "flos": 26797224885120.0, "grad_norm": 1.4909712225009764, "language_loss": 0.77048951, "learning_rate": 6.13802464562855e-07, "loss": 0.7919147, "num_input_tokens_seen": 269504635, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 12498, "time_per_iteration": 2.4953551292419434 }, { "auxiliary_loss_clip": 0.01103808, "auxiliary_loss_mlp": 0.01034898, "balance_loss_clip": 1.02334106, "balance_loss_mlp": 1.03775525, "epoch": 0.7514805350969488, "flos": 19865639877120.0, "grad_norm": 1.900870589700046, "language_loss": 0.74088854, "learning_rate": 6.135217502639878e-07, "loss": 0.76227558, "num_input_tokens_seen": 269523955, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.66015625, "step": 12499, "time_per_iteration": 2.492159366607666 }, { "auxiliary_loss_clip": 0.01102218, "auxiliary_loss_mlp": 0.01027775, "balance_loss_clip": 1.01616395, "balance_loss_mlp": 1.03410077, "epoch": 0.7515406583496167, "flos": 24571553437440.0, "grad_norm": 2.02972587866479, "language_loss": 0.79612982, "learning_rate": 6.132410885405148e-07, "loss": 0.81742978, "num_input_tokens_seen": 269544410, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 12500, "time_per_iteration": 2.547102451324463 }, { "auxiliary_loss_clip": 0.01113129, "auxiliary_loss_mlp": 0.0103434, "balance_loss_clip": 1.01943827, "balance_loss_mlp": 1.03792739, "epoch": 0.7516007816022847, "flos": 20120246455680.0, "grad_norm": 2.062970404030193, "language_loss": 0.73547065, "learning_rate": 6.129604794030794e-07, "loss": 0.75694531, "num_input_tokens_seen": 269563315, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.75390625, "step": 12501, "time_per_iteration": 2.4613680839538574 }, { "auxiliary_loss_clip": 0.01103785, "auxiliary_loss_mlp": 0.01026537, "balance_loss_clip": 1.01372218, "balance_loss_mlp": 1.03508341, "epoch": 0.7516609048549526, "flos": 22784638619520.0, "grad_norm": 1.79238671906216, "language_loss": 0.78517658, "learning_rate": 6.126799228623207e-07, "loss": 0.80647981, "num_input_tokens_seen": 269583950, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 12502, "time_per_iteration": 2.510645627975464 }, { "auxiliary_loss_clip": 0.01106143, "auxiliary_loss_mlp": 0.0103167, "balance_loss_clip": 1.01881325, "balance_loss_mlp": 1.03686595, "epoch": 0.7517210281076206, "flos": 10634012311680.0, "grad_norm": 2.593421978889565, "language_loss": 0.70498073, "learning_rate": 6.123994189288786e-07, "loss": 0.72635883, "num_input_tokens_seen": 269600120, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 12503, "time_per_iteration": 3.802367925643921 }, { "auxiliary_loss_clip": 0.0103028, "auxiliary_loss_mlp": 0.01003467, "balance_loss_clip": 1.00236416, "balance_loss_mlp": 1.00760627, "epoch": 0.7517811513602886, "flos": 66052221275520.0, "grad_norm": 1.0069688820235314, "language_loss": 0.63979805, "learning_rate": 6.121189676133903e-07, "loss": 0.66013551, "num_input_tokens_seen": 269659815, "router_z_loss_clip": 0.01104736, "router_z_loss_mlp": 0.2265625, "step": 12504, "time_per_iteration": 4.460524797439575 }, { "auxiliary_loss_clip": 0.01099456, "auxiliary_loss_mlp": 0.01031679, "balance_loss_clip": 1.02020502, "balance_loss_mlp": 1.03319597, "epoch": 0.7518412746129566, "flos": 37268345018880.0, "grad_norm": 1.4052204768616672, "language_loss": 0.68582666, "learning_rate": 6.118385689264896e-07, "loss": 0.70713806, "num_input_tokens_seen": 269684565, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6640625, "step": 12505, "time_per_iteration": 2.6329050064086914 }, { "auxiliary_loss_clip": 0.01030073, "auxiliary_loss_mlp": 0.01002529, "balance_loss_clip": 1.00137854, "balance_loss_mlp": 1.00737739, "epoch": 0.7519013978656245, "flos": 60518567727360.0, "grad_norm": 0.6589641413753962, "language_loss": 0.55111599, "learning_rate": 6.11558222878809e-07, "loss": 0.57144201, "num_input_tokens_seen": 269752325, "router_z_loss_clip": 0.01147461, "router_z_loss_mlp": 0.2265625, "step": 12506, "time_per_iteration": 3.1794533729553223 }, { "auxiliary_loss_clip": 0.01110667, "auxiliary_loss_mlp": 0.01037255, "balance_loss_clip": 1.02436209, "balance_loss_mlp": 1.03962612, "epoch": 0.7519615211182925, "flos": 18806885568000.0, "grad_norm": 3.201566151560251, "language_loss": 0.78584337, "learning_rate": 6.112779294809796e-07, "loss": 0.80732262, "num_input_tokens_seen": 269770630, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 12507, "time_per_iteration": 3.8418421745300293 }, { "auxiliary_loss_clip": 0.01104782, "auxiliary_loss_mlp": 0.01031347, "balance_loss_clip": 1.0192945, "balance_loss_mlp": 1.03819776, "epoch": 0.7520216443709604, "flos": 14575244209920.0, "grad_norm": 1.5949051891605726, "language_loss": 0.71423161, "learning_rate": 6.10997688743631e-07, "loss": 0.73559284, "num_input_tokens_seen": 269787280, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6640625, "step": 12508, "time_per_iteration": 3.8736138343811035 }, { "auxiliary_loss_clip": 0.01102511, "auxiliary_loss_mlp": 0.0103173, "balance_loss_clip": 1.01913524, "balance_loss_mlp": 1.03477669, "epoch": 0.7520817676236284, "flos": 17056599644160.0, "grad_norm": 1.7064032034950956, "language_loss": 0.72247195, "learning_rate": 6.107175006773885e-07, "loss": 0.74381435, "num_input_tokens_seen": 269805205, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 12509, "time_per_iteration": 2.4247896671295166 }, { "auxiliary_loss_clip": 0.01110896, "auxiliary_loss_mlp": 0.0103909, "balance_loss_clip": 1.02476692, "balance_loss_mlp": 1.03740537, "epoch": 0.7521418908762965, "flos": 25666397936640.0, "grad_norm": 1.6939986215531202, "language_loss": 0.62099469, "learning_rate": 6.104373652928785e-07, "loss": 0.64249456, "num_input_tokens_seen": 269824820, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.734375, "step": 12510, "time_per_iteration": 2.4933969974517822 }, { "auxiliary_loss_clip": 0.01102307, "auxiliary_loss_mlp": 0.01031354, "balance_loss_clip": 1.01876593, "balance_loss_mlp": 1.03611124, "epoch": 0.7522020141289644, "flos": 20886759711360.0, "grad_norm": 1.8871855635341934, "language_loss": 0.81888068, "learning_rate": 6.10157282600722e-07, "loss": 0.84021729, "num_input_tokens_seen": 269842825, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.66015625, "step": 12511, "time_per_iteration": 2.4704835414886475 }, { "auxiliary_loss_clip": 0.01108452, "auxiliary_loss_mlp": 0.01034749, "balance_loss_clip": 1.02137399, "balance_loss_mlp": 1.03649974, "epoch": 0.7522621373816324, "flos": 12640305444480.0, "grad_norm": 2.123211216084617, "language_loss": 0.75930983, "learning_rate": 6.098772526115412e-07, "loss": 0.78074181, "num_input_tokens_seen": 269859000, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 12512, "time_per_iteration": 2.4092886447906494 }, { "auxiliary_loss_clip": 0.01098372, "auxiliary_loss_mlp": 0.01028225, "balance_loss_clip": 1.01655471, "balance_loss_mlp": 1.03355265, "epoch": 0.7523222606343003, "flos": 25626141768960.0, "grad_norm": 1.6141050898591391, "language_loss": 0.82313931, "learning_rate": 6.095972753359537e-07, "loss": 0.84440523, "num_input_tokens_seen": 269878895, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6484375, "step": 12513, "time_per_iteration": 2.506065607070923 }, { "auxiliary_loss_clip": 0.01107454, "auxiliary_loss_mlp": 0.01032769, "balance_loss_clip": 1.01920319, "balance_loss_mlp": 1.03605866, "epoch": 0.7523823838869683, "flos": 20448900921600.0, "grad_norm": 2.0418392246370547, "language_loss": 0.74819791, "learning_rate": 6.093173507845771e-07, "loss": 0.76960015, "num_input_tokens_seen": 269897280, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71484375, "step": 12514, "time_per_iteration": 2.4507007598876953 }, { "auxiliary_loss_clip": 0.01099776, "auxiliary_loss_mlp": 0.01029626, "balance_loss_clip": 1.01861143, "balance_loss_mlp": 1.03416181, "epoch": 0.7524425071396362, "flos": 14720610551040.0, "grad_norm": 1.7989840148250091, "language_loss": 0.68721229, "learning_rate": 6.090374789680271e-07, "loss": 0.70850629, "num_input_tokens_seen": 269914640, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.65625, "step": 12515, "time_per_iteration": 2.4241957664489746 }, { "auxiliary_loss_clip": 0.0110629, "auxiliary_loss_mlp": 0.01030721, "balance_loss_clip": 1.0192709, "balance_loss_mlp": 1.03718209, "epoch": 0.7525026303923043, "flos": 30592048947840.0, "grad_norm": 2.1361779425919507, "language_loss": 0.70386606, "learning_rate": 6.087576598969137e-07, "loss": 0.72523624, "num_input_tokens_seen": 269934960, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.69140625, "step": 12516, "time_per_iteration": 2.5146446228027344 }, { "auxiliary_loss_clip": 0.01103733, "auxiliary_loss_mlp": 0.01034968, "balance_loss_clip": 1.02251041, "balance_loss_mlp": 1.03713369, "epoch": 0.7525627536449722, "flos": 24791757765120.0, "grad_norm": 1.5236835336306587, "language_loss": 0.89485633, "learning_rate": 6.084778935818495e-07, "loss": 0.91624337, "num_input_tokens_seen": 269956655, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6640625, "step": 12517, "time_per_iteration": 2.493391513824463 }, { "auxiliary_loss_clip": 0.01107465, "auxiliary_loss_mlp": 0.01033776, "balance_loss_clip": 1.02147353, "balance_loss_mlp": 1.03687525, "epoch": 0.7526228768976402, "flos": 20779782030720.0, "grad_norm": 1.6260032832910543, "language_loss": 0.74320972, "learning_rate": 6.081981800334437e-07, "loss": 0.76462209, "num_input_tokens_seen": 269976835, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 12518, "time_per_iteration": 2.464402437210083 }, { "auxiliary_loss_clip": 0.01031586, "auxiliary_loss_mlp": 0.01003835, "balance_loss_clip": 1.00267839, "balance_loss_mlp": 1.00893545, "epoch": 0.7526830001503081, "flos": 66559243703040.0, "grad_norm": 0.7048569744417615, "language_loss": 0.55621624, "learning_rate": 6.079185192623017e-07, "loss": 0.57657039, "num_input_tokens_seen": 270040630, "router_z_loss_clip": 0.01153564, "router_z_loss_mlp": 0.2265625, "step": 12519, "time_per_iteration": 3.152106285095215 }, { "auxiliary_loss_clip": 0.01103557, "auxiliary_loss_mlp": 0.0103514, "balance_loss_clip": 1.02385712, "balance_loss_mlp": 1.03547752, "epoch": 0.7527431234029761, "flos": 23477894087040.0, "grad_norm": 2.5329857949270878, "language_loss": 0.77772754, "learning_rate": 6.07638911279029e-07, "loss": 0.79911453, "num_input_tokens_seen": 270059695, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6796875, "step": 12520, "time_per_iteration": 2.4885380268096924 }, { "auxiliary_loss_clip": 0.01104462, "auxiliary_loss_mlp": 0.01038601, "balance_loss_clip": 1.02684712, "balance_loss_mlp": 1.0355525, "epoch": 0.752803246655644, "flos": 22049546785920.0, "grad_norm": 2.145373284262611, "language_loss": 0.74172628, "learning_rate": 6.07359356094229e-07, "loss": 0.76315689, "num_input_tokens_seen": 270078420, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 12521, "time_per_iteration": 2.530775547027588 }, { "auxiliary_loss_clip": 0.01108668, "auxiliary_loss_mlp": 0.01036687, "balance_loss_clip": 1.02318025, "balance_loss_mlp": 1.0367533, "epoch": 0.752863369908312, "flos": 30153795108480.0, "grad_norm": 1.9964309718628386, "language_loss": 0.66890615, "learning_rate": 6.070798537185016e-07, "loss": 0.69035971, "num_input_tokens_seen": 270097040, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 12522, "time_per_iteration": 2.556171178817749 }, { "auxiliary_loss_clip": 0.01108694, "auxiliary_loss_mlp": 0.01039509, "balance_loss_clip": 1.02711725, "balance_loss_mlp": 1.03703737, "epoch": 0.7529234931609801, "flos": 24567638855040.0, "grad_norm": 1.6089449996522667, "language_loss": 0.78599507, "learning_rate": 6.068004041624453e-07, "loss": 0.80747712, "num_input_tokens_seen": 270116365, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 12523, "time_per_iteration": 2.522108316421509 }, { "auxiliary_loss_clip": 0.0110305, "auxiliary_loss_mlp": 0.01032178, "balance_loss_clip": 1.01994765, "balance_loss_mlp": 1.03581071, "epoch": 0.752983616413648, "flos": 23112395245440.0, "grad_norm": 3.028813465976653, "language_loss": 0.80872047, "learning_rate": 6.065210074366571e-07, "loss": 0.83007276, "num_input_tokens_seen": 270135395, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 12524, "time_per_iteration": 2.475832462310791 }, { "auxiliary_loss_clip": 0.01104018, "auxiliary_loss_mlp": 0.01030155, "balance_loss_clip": 1.01857388, "balance_loss_mlp": 1.03602147, "epoch": 0.753043739666316, "flos": 24316946858880.0, "grad_norm": 2.179026315357015, "language_loss": 0.74086434, "learning_rate": 6.062416635517326e-07, "loss": 0.76220608, "num_input_tokens_seen": 270156425, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6796875, "step": 12525, "time_per_iteration": 2.4998879432678223 }, { "auxiliary_loss_clip": 0.01105344, "auxiliary_loss_mlp": 0.01035709, "balance_loss_clip": 1.02362752, "balance_loss_mlp": 1.03700924, "epoch": 0.7531038629189839, "flos": 24243294021120.0, "grad_norm": 2.5998405791604777, "language_loss": 0.72089529, "learning_rate": 6.059623725182641e-07, "loss": 0.74230582, "num_input_tokens_seen": 270176905, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 12526, "time_per_iteration": 2.473989486694336 }, { "auxiliary_loss_clip": 0.01103089, "auxiliary_loss_mlp": 0.01027315, "balance_loss_clip": 1.01591897, "balance_loss_mlp": 1.03527832, "epoch": 0.7531639861716519, "flos": 30188807890560.0, "grad_norm": 1.766897738972908, "language_loss": 0.72524863, "learning_rate": 6.056831343468414e-07, "loss": 0.74655271, "num_input_tokens_seen": 270196640, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6796875, "step": 12527, "time_per_iteration": 2.5368847846984863 }, { "auxiliary_loss_clip": 0.01106286, "auxiliary_loss_mlp": 0.01026996, "balance_loss_clip": 1.01532578, "balance_loss_mlp": 1.03809345, "epoch": 0.7532241094243198, "flos": 18223193560320.0, "grad_norm": 2.9923172694081424, "language_loss": 0.81097364, "learning_rate": 6.054039490480539e-07, "loss": 0.8323065, "num_input_tokens_seen": 270213905, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 12528, "time_per_iteration": 2.4255430698394775 }, { "auxiliary_loss_clip": 0.01106249, "auxiliary_loss_mlp": 0.01035825, "balance_loss_clip": 1.02213407, "balance_loss_mlp": 1.03628612, "epoch": 0.7532842326769879, "flos": 20881049448960.0, "grad_norm": 2.614463956480165, "language_loss": 0.85340548, "learning_rate": 6.051248166324892e-07, "loss": 0.87482619, "num_input_tokens_seen": 270231995, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.69921875, "step": 12529, "time_per_iteration": 2.4500014781951904 }, { "auxiliary_loss_clip": 0.01109823, "auxiliary_loss_mlp": 0.01034161, "balance_loss_clip": 1.02139425, "balance_loss_mlp": 1.03794694, "epoch": 0.7533443559296558, "flos": 18078689145600.0, "grad_norm": 2.2381949171873243, "language_loss": 0.74430406, "learning_rate": 6.048457371107303e-07, "loss": 0.76574397, "num_input_tokens_seen": 270251480, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 12530, "time_per_iteration": 2.444218873977661 }, { "auxiliary_loss_clip": 0.01030688, "auxiliary_loss_mlp": 0.0100434, "balance_loss_clip": 1.00330281, "balance_loss_mlp": 1.00811815, "epoch": 0.7534044791823238, "flos": 50254830766080.0, "grad_norm": 0.882264966280731, "language_loss": 0.63629073, "learning_rate": 6.045667104933612e-07, "loss": 0.65664101, "num_input_tokens_seen": 270306480, "router_z_loss_clip": 0.01037598, "router_z_loss_mlp": 0.22558594, "step": 12531, "time_per_iteration": 2.9489662647247314 }, { "auxiliary_loss_clip": 0.01108291, "auxiliary_loss_mlp": 0.0103095, "balance_loss_clip": 1.01783717, "balance_loss_mlp": 1.03704929, "epoch": 0.7534646024349917, "flos": 20850274471680.0, "grad_norm": 2.4145611663623083, "language_loss": 0.70581698, "learning_rate": 6.042877367909633e-07, "loss": 0.72720945, "num_input_tokens_seen": 270324595, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 12532, "time_per_iteration": 2.448045015335083 }, { "auxiliary_loss_clip": 0.01101995, "auxiliary_loss_mlp": 0.01030265, "balance_loss_clip": 1.0192678, "balance_loss_mlp": 1.03617501, "epoch": 0.7535247256876597, "flos": 23071779941760.0, "grad_norm": 1.5586101853128942, "language_loss": 0.77772379, "learning_rate": 6.040088160141132e-07, "loss": 0.7990464, "num_input_tokens_seen": 270344375, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.65625, "step": 12533, "time_per_iteration": 2.4597883224487305 }, { "auxiliary_loss_clip": 0.01030515, "auxiliary_loss_mlp": 0.0100007, "balance_loss_clip": 0.99891394, "balance_loss_mlp": 1.00806665, "epoch": 0.7535848489403276, "flos": 58623418252800.0, "grad_norm": 0.7993453120002351, "language_loss": 0.57330137, "learning_rate": 6.037299481733886e-07, "loss": 0.59360725, "num_input_tokens_seen": 270405235, "router_z_loss_clip": 0.01153564, "router_z_loss_mlp": 0.22460938, "step": 12534, "time_per_iteration": 3.141824960708618 }, { "auxiliary_loss_clip": 0.01103209, "auxiliary_loss_mlp": 0.01027292, "balance_loss_clip": 1.01500785, "balance_loss_mlp": 1.03463185, "epoch": 0.7536449721929956, "flos": 26577882483840.0, "grad_norm": 1.498112008601558, "language_loss": 0.71234494, "learning_rate": 6.03451133279365e-07, "loss": 0.73364997, "num_input_tokens_seen": 270425820, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 12535, "time_per_iteration": 2.529547929763794 }, { "auxiliary_loss_clip": 0.01106185, "auxiliary_loss_mlp": 0.0102949, "balance_loss_clip": 1.0164187, "balance_loss_mlp": 1.03492916, "epoch": 0.7537050954456637, "flos": 25735992537600.0, "grad_norm": 1.6776143580277454, "language_loss": 0.80571067, "learning_rate": 6.031723713426135e-07, "loss": 0.82706738, "num_input_tokens_seen": 270447120, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 12536, "time_per_iteration": 2.5016226768493652 }, { "auxiliary_loss_clip": 0.01099575, "auxiliary_loss_mlp": 0.01031336, "balance_loss_clip": 1.01911116, "balance_loss_mlp": 1.0332942, "epoch": 0.7537652186983316, "flos": 30224431203840.0, "grad_norm": 1.9297942994419588, "language_loss": 0.74484146, "learning_rate": 6.028936623737067e-07, "loss": 0.76615059, "num_input_tokens_seen": 270468680, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.66015625, "step": 12537, "time_per_iteration": 2.564844846725464 }, { "auxiliary_loss_clip": 0.01105048, "auxiliary_loss_mlp": 0.01037651, "balance_loss_clip": 1.02530742, "balance_loss_mlp": 1.03522694, "epoch": 0.7538253419509996, "flos": 12641239198080.0, "grad_norm": 1.6313425700610438, "language_loss": 0.73715973, "learning_rate": 6.026150063832111e-07, "loss": 0.75858676, "num_input_tokens_seen": 270486310, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 12538, "time_per_iteration": 2.4377238750457764 }, { "auxiliary_loss_clip": 0.01107462, "auxiliary_loss_mlp": 0.01033393, "balance_loss_clip": 1.02071559, "balance_loss_mlp": 1.03692794, "epoch": 0.7538854652036675, "flos": 23185976256000.0, "grad_norm": 1.6138954603050135, "language_loss": 0.67572528, "learning_rate": 6.023364033816956e-07, "loss": 0.69713384, "num_input_tokens_seen": 270507210, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 12539, "time_per_iteration": 2.524789571762085 }, { "auxiliary_loss_clip": 0.01102811, "auxiliary_loss_mlp": 0.01027836, "balance_loss_clip": 1.01533103, "balance_loss_mlp": 1.03516173, "epoch": 0.7539455884563355, "flos": 23186227651200.0, "grad_norm": 1.6981836581006176, "language_loss": 0.74978018, "learning_rate": 6.020578533797229e-07, "loss": 0.77108657, "num_input_tokens_seen": 270525250, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.67578125, "step": 12540, "time_per_iteration": 2.473191499710083 }, { "auxiliary_loss_clip": 0.01106994, "auxiliary_loss_mlp": 0.01030872, "balance_loss_clip": 1.01840913, "balance_loss_mlp": 1.03649378, "epoch": 0.7540057117090034, "flos": 13181155505280.0, "grad_norm": 2.475735809622442, "language_loss": 0.73003006, "learning_rate": 6.017793563878566e-07, "loss": 0.7514087, "num_input_tokens_seen": 270539295, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 12541, "time_per_iteration": 2.4293212890625 }, { "auxiliary_loss_clip": 0.01103429, "auxiliary_loss_mlp": 0.0102997, "balance_loss_clip": 1.01751256, "balance_loss_mlp": 1.03510678, "epoch": 0.7540658349616715, "flos": 45478134478080.0, "grad_norm": 1.7073674529850564, "language_loss": 0.72270209, "learning_rate": 6.015009124166576e-07, "loss": 0.74403608, "num_input_tokens_seen": 270562815, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.68359375, "step": 12542, "time_per_iteration": 2.6708920001983643 }, { "auxiliary_loss_clip": 0.01103269, "auxiliary_loss_mlp": 0.010282, "balance_loss_clip": 1.01607096, "balance_loss_mlp": 1.03511858, "epoch": 0.7541259582143394, "flos": 19930817105280.0, "grad_norm": 2.0538436960061777, "language_loss": 0.84447479, "learning_rate": 6.012225214766844e-07, "loss": 0.86578953, "num_input_tokens_seen": 270579055, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 12543, "time_per_iteration": 2.4408528804779053 }, { "auxiliary_loss_clip": 0.01107332, "auxiliary_loss_mlp": 0.01032537, "balance_loss_clip": 1.02035427, "balance_loss_mlp": 1.04005444, "epoch": 0.7541860814670074, "flos": 27198239299200.0, "grad_norm": 2.256814167198698, "language_loss": 0.73694646, "learning_rate": 6.009441835784927e-07, "loss": 0.75834519, "num_input_tokens_seen": 270599080, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 12544, "time_per_iteration": 2.4981839656829834 }, { "auxiliary_loss_clip": 0.01105271, "auxiliary_loss_mlp": 0.01036632, "balance_loss_clip": 1.02452624, "balance_loss_mlp": 1.03627157, "epoch": 0.7542462047196753, "flos": 21324151624320.0, "grad_norm": 2.364801197866936, "language_loss": 0.68552768, "learning_rate": 6.006658987326383e-07, "loss": 0.70694667, "num_input_tokens_seen": 270618715, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 12545, "time_per_iteration": 5.395116806030273 }, { "auxiliary_loss_clip": 0.01105433, "auxiliary_loss_mlp": 0.01029149, "balance_loss_clip": 1.01683462, "balance_loss_mlp": 1.03521204, "epoch": 0.7543063279723433, "flos": 11940944664960.0, "grad_norm": 2.101148453985527, "language_loss": 0.68894815, "learning_rate": 6.003876669496728e-07, "loss": 0.71029401, "num_input_tokens_seen": 270635695, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 12546, "time_per_iteration": 2.4188129901885986 }, { "auxiliary_loss_clip": 0.0110512, "auxiliary_loss_mlp": 0.01032748, "balance_loss_clip": 1.01993871, "balance_loss_mlp": 1.03568053, "epoch": 0.7543664512250112, "flos": 22819974624000.0, "grad_norm": 2.2721548844608144, "language_loss": 0.73597693, "learning_rate": 6.00109488240147e-07, "loss": 0.75735557, "num_input_tokens_seen": 270654325, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 12547, "time_per_iteration": 2.4756572246551514 }, { "auxiliary_loss_clip": 0.01104988, "auxiliary_loss_mlp": 0.01027836, "balance_loss_clip": 1.01480627, "balance_loss_mlp": 1.03584933, "epoch": 0.7544265744776792, "flos": 20923855482240.0, "grad_norm": 1.9273848572026435, "language_loss": 0.67651558, "learning_rate": 5.998313626146099e-07, "loss": 0.69784379, "num_input_tokens_seen": 270674260, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69140625, "step": 12548, "time_per_iteration": 3.8472788333892822 }, { "auxiliary_loss_clip": 0.01106732, "auxiliary_loss_mlp": 0.01033687, "balance_loss_clip": 1.02085471, "balance_loss_mlp": 1.03590608, "epoch": 0.7544866977303473, "flos": 15195493284480.0, "grad_norm": 2.1058613682950145, "language_loss": 0.8731938, "learning_rate": 5.995532900836088e-07, "loss": 0.89459801, "num_input_tokens_seen": 270692200, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 12549, "time_per_iteration": 2.4418623447418213 }, { "auxiliary_loss_clip": 0.01101779, "auxiliary_loss_mlp": 0.0103445, "balance_loss_clip": 1.02252269, "balance_loss_mlp": 1.03576493, "epoch": 0.7545468209830152, "flos": 27083683848960.0, "grad_norm": 1.717832286835587, "language_loss": 0.76997626, "learning_rate": 5.992752706576865e-07, "loss": 0.7913385, "num_input_tokens_seen": 270709675, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.66015625, "step": 12550, "time_per_iteration": 3.965660810470581 }, { "auxiliary_loss_clip": 0.01105003, "auxiliary_loss_mlp": 0.01027339, "balance_loss_clip": 1.01546597, "balance_loss_mlp": 1.0356245, "epoch": 0.7546069442356832, "flos": 26871703735680.0, "grad_norm": 1.7906539379185877, "language_loss": 0.69620144, "learning_rate": 5.98997304347386e-07, "loss": 0.71752483, "num_input_tokens_seen": 270733055, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6953125, "step": 12551, "time_per_iteration": 2.5256640911102295 }, { "auxiliary_loss_clip": 0.01106306, "auxiliary_loss_mlp": 0.01029014, "balance_loss_clip": 1.01643181, "balance_loss_mlp": 1.03827882, "epoch": 0.7546670674883511, "flos": 15743131015680.0, "grad_norm": 7.075627394531488, "language_loss": 0.8633846, "learning_rate": 5.987193911632487e-07, "loss": 0.88473779, "num_input_tokens_seen": 270749275, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 12552, "time_per_iteration": 2.4397330284118652 }, { "auxiliary_loss_clip": 0.01105548, "auxiliary_loss_mlp": 0.01031903, "balance_loss_clip": 1.02002418, "balance_loss_mlp": 1.03624737, "epoch": 0.7547271907410191, "flos": 23477714519040.0, "grad_norm": 1.7498412402782555, "language_loss": 0.78172362, "learning_rate": 5.98441531115812e-07, "loss": 0.8030982, "num_input_tokens_seen": 270768230, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69140625, "step": 12553, "time_per_iteration": 2.4751288890838623 }, { "auxiliary_loss_clip": 0.0110677, "auxiliary_loss_mlp": 0.01033989, "balance_loss_clip": 1.02168679, "balance_loss_mlp": 1.03748107, "epoch": 0.754787313993687, "flos": 31722804069120.0, "grad_norm": 2.1394790567125144, "language_loss": 0.62930763, "learning_rate": 5.981637242156135e-07, "loss": 0.65071523, "num_input_tokens_seen": 270786285, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 12554, "time_per_iteration": 2.5610244274139404 }, { "auxiliary_loss_clip": 0.01104418, "auxiliary_loss_mlp": 0.01034005, "balance_loss_clip": 1.02232242, "balance_loss_mlp": 1.03565979, "epoch": 0.7548474372463551, "flos": 27563055782400.0, "grad_norm": 1.6663773052857085, "language_loss": 0.73390305, "learning_rate": 5.978859704731864e-07, "loss": 0.75528729, "num_input_tokens_seen": 270805505, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 12555, "time_per_iteration": 2.5013327598571777 }, { "auxiliary_loss_clip": 0.01108377, "auxiliary_loss_mlp": 0.01029314, "balance_loss_clip": 1.01683271, "balance_loss_mlp": 1.037925, "epoch": 0.754907560499023, "flos": 19318576763520.0, "grad_norm": 1.8081608387265036, "language_loss": 0.78526759, "learning_rate": 5.976082698990645e-07, "loss": 0.80664456, "num_input_tokens_seen": 270824610, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 12556, "time_per_iteration": 2.4866392612457275 }, { "auxiliary_loss_clip": 0.01030677, "auxiliary_loss_mlp": 0.01002208, "balance_loss_clip": 1.00101626, "balance_loss_mlp": 1.00805831, "epoch": 0.754967683751691, "flos": 69744628684800.0, "grad_norm": 0.7037127758618285, "language_loss": 0.50464517, "learning_rate": 5.973306225037769e-07, "loss": 0.52497405, "num_input_tokens_seen": 270886155, "router_z_loss_clip": 0.01190186, "router_z_loss_mlp": 0.2265625, "step": 12557, "time_per_iteration": 3.0711894035339355 }, { "auxiliary_loss_clip": 0.0110712, "auxiliary_loss_mlp": 0.01032599, "balance_loss_clip": 1.0196588, "balance_loss_mlp": 1.03707111, "epoch": 0.7550278070043589, "flos": 24421913377920.0, "grad_norm": 2.082651298750522, "language_loss": 0.71358383, "learning_rate": 5.970530282978525e-07, "loss": 0.73498094, "num_input_tokens_seen": 270905325, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 12558, "time_per_iteration": 2.516134262084961 }, { "auxiliary_loss_clip": 0.01104175, "auxiliary_loss_mlp": 0.01030302, "balance_loss_clip": 1.0173738, "balance_loss_mlp": 1.03518844, "epoch": 0.7550879302570269, "flos": 32634611838720.0, "grad_norm": 2.15280610181205, "language_loss": 0.8002305, "learning_rate": 5.967754872918187e-07, "loss": 0.82157528, "num_input_tokens_seen": 270927535, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 12559, "time_per_iteration": 2.5723466873168945 }, { "auxiliary_loss_clip": 0.01107772, "auxiliary_loss_mlp": 0.0102838, "balance_loss_clip": 1.01557076, "balance_loss_mlp": 1.03692293, "epoch": 0.7551480535096948, "flos": 21795550738560.0, "grad_norm": 2.035763640830286, "language_loss": 0.78602934, "learning_rate": 5.96497999496199e-07, "loss": 0.80739093, "num_input_tokens_seen": 270946920, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 12560, "time_per_iteration": 2.522796154022217 }, { "auxiliary_loss_clip": 0.01103764, "auxiliary_loss_mlp": 0.01033193, "balance_loss_clip": 1.02136731, "balance_loss_mlp": 1.03673232, "epoch": 0.7552081767623628, "flos": 18515111391360.0, "grad_norm": 2.0970768535155053, "language_loss": 0.70645154, "learning_rate": 5.96220564921515e-07, "loss": 0.72782111, "num_input_tokens_seen": 270965705, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 12561, "time_per_iteration": 2.4377832412719727 }, { "auxiliary_loss_clip": 0.01103985, "auxiliary_loss_mlp": 0.0103149, "balance_loss_clip": 1.01857352, "balance_loss_mlp": 1.03478432, "epoch": 0.7552683000150308, "flos": 27634805199360.0, "grad_norm": 1.8041842082805035, "language_loss": 0.75370157, "learning_rate": 5.959431835782889e-07, "loss": 0.77505636, "num_input_tokens_seen": 270986550, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 12562, "time_per_iteration": 2.5285980701446533 }, { "auxiliary_loss_clip": 0.01105022, "auxiliary_loss_mlp": 0.01029073, "balance_loss_clip": 1.01625216, "balance_loss_mlp": 1.03658414, "epoch": 0.7553284232676988, "flos": 20302924049280.0, "grad_norm": 2.0495488923723872, "language_loss": 0.75974435, "learning_rate": 5.956658554770371e-07, "loss": 0.78108525, "num_input_tokens_seen": 271006250, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.68359375, "step": 12563, "time_per_iteration": 2.4474847316741943 }, { "auxiliary_loss_clip": 0.01112374, "auxiliary_loss_mlp": 0.01032106, "balance_loss_clip": 1.01706195, "balance_loss_mlp": 1.03753805, "epoch": 0.7553885465203668, "flos": 33255471444480.0, "grad_norm": 2.3494979004308, "language_loss": 0.67170155, "learning_rate": 5.953885806282768e-07, "loss": 0.69314635, "num_input_tokens_seen": 271025575, "router_z_loss_clip": 0.15039062, "router_z_loss_mlp": 0.74609375, "step": 12564, "time_per_iteration": 2.564026117324829 }, { "auxiliary_loss_clip": 0.01109062, "auxiliary_loss_mlp": 0.01035141, "balance_loss_clip": 1.02190256, "balance_loss_mlp": 1.03748345, "epoch": 0.7554486697730347, "flos": 21616249023360.0, "grad_norm": 2.448314193803085, "language_loss": 0.68991119, "learning_rate": 5.951113590425228e-07, "loss": 0.71135318, "num_input_tokens_seen": 271045805, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 12565, "time_per_iteration": 2.439739942550659 }, { "auxiliary_loss_clip": 0.0110803, "auxiliary_loss_mlp": 0.01030505, "balance_loss_clip": 1.01650965, "balance_loss_mlp": 1.03471231, "epoch": 0.7555087930257027, "flos": 27632973605760.0, "grad_norm": 1.5768989963800413, "language_loss": 0.75047874, "learning_rate": 5.94834190730287e-07, "loss": 0.77186406, "num_input_tokens_seen": 271066065, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.734375, "step": 12566, "time_per_iteration": 2.5000240802764893 }, { "auxiliary_loss_clip": 0.01109701, "auxiliary_loss_mlp": 0.01035693, "balance_loss_clip": 1.02181077, "balance_loss_mlp": 1.03791189, "epoch": 0.7555689162783706, "flos": 23621644316160.0, "grad_norm": 1.9102645968910272, "language_loss": 0.74198043, "learning_rate": 5.945570757020789e-07, "loss": 0.76343441, "num_input_tokens_seen": 271085870, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 12567, "time_per_iteration": 2.5086615085601807 }, { "auxiliary_loss_clip": 0.01104833, "auxiliary_loss_mlp": 0.01026579, "balance_loss_clip": 1.01479483, "balance_loss_mlp": 1.03553247, "epoch": 0.7556290395310387, "flos": 24863076218880.0, "grad_norm": 2.0953472201588057, "language_loss": 0.6331358, "learning_rate": 5.942800139684073e-07, "loss": 0.65444988, "num_input_tokens_seen": 271104260, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 12568, "time_per_iteration": 2.541855573654175 }, { "auxiliary_loss_clip": 0.01104743, "auxiliary_loss_mlp": 0.01031765, "balance_loss_clip": 1.01977241, "balance_loss_mlp": 1.03655541, "epoch": 0.7556891627837066, "flos": 43543770330240.0, "grad_norm": 2.1016316531920527, "language_loss": 0.66201246, "learning_rate": 5.940030055397789e-07, "loss": 0.68337762, "num_input_tokens_seen": 271125745, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 12569, "time_per_iteration": 2.6942763328552246 }, { "auxiliary_loss_clip": 0.01109816, "auxiliary_loss_mlp": 0.01035674, "balance_loss_clip": 1.02185798, "balance_loss_mlp": 1.03813684, "epoch": 0.7557492860363746, "flos": 26650924790400.0, "grad_norm": 1.7440722375333686, "language_loss": 0.67198288, "learning_rate": 5.93726050426697e-07, "loss": 0.69343781, "num_input_tokens_seen": 271147145, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 12570, "time_per_iteration": 2.541238784790039 }, { "auxiliary_loss_clip": 0.0110754, "auxiliary_loss_mlp": 0.0103143, "balance_loss_clip": 1.0185492, "balance_loss_mlp": 1.03709126, "epoch": 0.7558094092890425, "flos": 55182885010560.0, "grad_norm": 2.3161041556387123, "language_loss": 0.71635127, "learning_rate": 5.934491486396647e-07, "loss": 0.73774093, "num_input_tokens_seen": 271170865, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 12571, "time_per_iteration": 2.7758748531341553 }, { "auxiliary_loss_clip": 0.0110848, "auxiliary_loss_mlp": 0.01031793, "balance_loss_clip": 1.01882267, "balance_loss_mlp": 1.03695142, "epoch": 0.7558695325417105, "flos": 23988292392960.0, "grad_norm": 2.378259732641909, "language_loss": 0.73643446, "learning_rate": 5.931723001891811e-07, "loss": 0.75783724, "num_input_tokens_seen": 271191450, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 12572, "time_per_iteration": 2.5220508575439453 }, { "auxiliary_loss_clip": 0.01109127, "auxiliary_loss_mlp": 0.01030073, "balance_loss_clip": 1.01735306, "balance_loss_mlp": 1.03855205, "epoch": 0.7559296557943784, "flos": 14611262572800.0, "grad_norm": 3.3720696212813075, "language_loss": 0.76653719, "learning_rate": 5.928955050857456e-07, "loss": 0.78792918, "num_input_tokens_seen": 271207335, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 12573, "time_per_iteration": 2.4239659309387207 }, { "auxiliary_loss_clip": 0.01108409, "auxiliary_loss_mlp": 0.01032191, "balance_loss_clip": 1.01928675, "balance_loss_mlp": 1.0368638, "epoch": 0.7559897790470465, "flos": 18550483309440.0, "grad_norm": 1.5809542921601791, "language_loss": 0.68895566, "learning_rate": 5.926187633398527e-07, "loss": 0.7103616, "num_input_tokens_seen": 271226895, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 12574, "time_per_iteration": 2.4641833305358887 }, { "auxiliary_loss_clip": 0.011027, "auxiliary_loss_mlp": 0.01032698, "balance_loss_clip": 1.01944757, "balance_loss_mlp": 1.0345875, "epoch": 0.7560499022997144, "flos": 17967868709760.0, "grad_norm": 2.4871515197101086, "language_loss": 0.72191942, "learning_rate": 5.923420749619974e-07, "loss": 0.74327332, "num_input_tokens_seen": 271244375, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6796875, "step": 12575, "time_per_iteration": 2.4292378425598145 }, { "auxiliary_loss_clip": 0.01103011, "auxiliary_loss_mlp": 0.01032087, "balance_loss_clip": 1.02019012, "balance_loss_mlp": 1.03449893, "epoch": 0.7561100255523824, "flos": 15737815802880.0, "grad_norm": 2.1885275532036106, "language_loss": 0.71620917, "learning_rate": 5.92065439962673e-07, "loss": 0.73756021, "num_input_tokens_seen": 271259530, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 12576, "time_per_iteration": 2.434809923171997 }, { "auxiliary_loss_clip": 0.01104798, "auxiliary_loss_mlp": 0.01028701, "balance_loss_clip": 1.01599288, "balance_loss_mlp": 1.03682268, "epoch": 0.7561701488050504, "flos": 15888102307200.0, "grad_norm": 2.109355642363971, "language_loss": 0.67558163, "learning_rate": 5.917888583523669e-07, "loss": 0.69691664, "num_input_tokens_seen": 271276835, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 12577, "time_per_iteration": 2.4773926734924316 }, { "auxiliary_loss_clip": 0.0110398, "auxiliary_loss_mlp": 0.01033622, "balance_loss_clip": 1.02121258, "balance_loss_mlp": 1.03577459, "epoch": 0.7562302720577183, "flos": 20339157893760.0, "grad_norm": 1.9235468257329598, "language_loss": 0.78108203, "learning_rate": 5.915123301415685e-07, "loss": 0.80245805, "num_input_tokens_seen": 271296275, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 12578, "time_per_iteration": 2.5404605865478516 }, { "auxiliary_loss_clip": 0.01105488, "auxiliary_loss_mlp": 0.01028059, "balance_loss_clip": 1.01548815, "balance_loss_mlp": 1.03498578, "epoch": 0.7562903953103863, "flos": 20812209033600.0, "grad_norm": 1.743761957888241, "language_loss": 0.75555146, "learning_rate": 5.912358553407641e-07, "loss": 0.77688694, "num_input_tokens_seen": 271315685, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 12579, "time_per_iteration": 2.49855899810791 }, { "auxiliary_loss_clip": 0.01110668, "auxiliary_loss_mlp": 0.01033814, "balance_loss_clip": 1.01905608, "balance_loss_mlp": 1.03676736, "epoch": 0.7563505185630542, "flos": 37596999484800.0, "grad_norm": 2.749099858309253, "language_loss": 0.62672925, "learning_rate": 5.90959433960437e-07, "loss": 0.64817405, "num_input_tokens_seen": 271336790, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7421875, "step": 12580, "time_per_iteration": 2.609208583831787 }, { "auxiliary_loss_clip": 0.01106149, "auxiliary_loss_mlp": 0.01028798, "balance_loss_clip": 1.01667476, "balance_loss_mlp": 1.0373559, "epoch": 0.7564106418157223, "flos": 20230995064320.0, "grad_norm": 1.89936939222563, "language_loss": 0.75095439, "learning_rate": 5.906830660110691e-07, "loss": 0.77230388, "num_input_tokens_seen": 271355470, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 12581, "time_per_iteration": 2.443936347961426 }, { "auxiliary_loss_clip": 0.01106699, "auxiliary_loss_mlp": 0.01031769, "balance_loss_clip": 1.01901317, "balance_loss_mlp": 1.03550148, "epoch": 0.7564707650683902, "flos": 24754877475840.0, "grad_norm": 1.9429151800948732, "language_loss": 0.63373566, "learning_rate": 5.904067515031412e-07, "loss": 0.65512037, "num_input_tokens_seen": 271375810, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 12582, "time_per_iteration": 2.5108633041381836 }, { "auxiliary_loss_clip": 0.0102995, "auxiliary_loss_mlp": 0.01001511, "balance_loss_clip": 1.0002892, "balance_loss_mlp": 1.00744045, "epoch": 0.7565308883210582, "flos": 48530076433920.0, "grad_norm": 0.9641718895002018, "language_loss": 0.60716093, "learning_rate": 5.901304904471307e-07, "loss": 0.62747562, "num_input_tokens_seen": 271424775, "router_z_loss_clip": 0.01220703, "router_z_loss_mlp": 0.22558594, "step": 12583, "time_per_iteration": 2.851865530014038 }, { "auxiliary_loss_clip": 0.01107503, "auxiliary_loss_mlp": 0.01036953, "balance_loss_clip": 1.02385211, "balance_loss_mlp": 1.03715897, "epoch": 0.7565910115737261, "flos": 12495082757760.0, "grad_norm": 2.2669568234011703, "language_loss": 0.79014277, "learning_rate": 5.898542828535125e-07, "loss": 0.81158733, "num_input_tokens_seen": 271440500, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 12584, "time_per_iteration": 2.4416825771331787 }, { "auxiliary_loss_clip": 0.01104486, "auxiliary_loss_mlp": 0.01026268, "balance_loss_clip": 1.01408482, "balance_loss_mlp": 1.03712988, "epoch": 0.7566511348263941, "flos": 21173003193600.0, "grad_norm": 2.0659037633364976, "language_loss": 0.77983707, "learning_rate": 5.895781287327612e-07, "loss": 0.8011446, "num_input_tokens_seen": 271458180, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 12585, "time_per_iteration": 2.463528633117676 }, { "auxiliary_loss_clip": 0.01111599, "auxiliary_loss_mlp": 0.01034927, "balance_loss_clip": 1.02142644, "balance_loss_mlp": 1.03996789, "epoch": 0.756711258079062, "flos": 21754827694080.0, "grad_norm": 1.8915955371224642, "language_loss": 0.83165586, "learning_rate": 5.893020280953493e-07, "loss": 0.8531211, "num_input_tokens_seen": 271475730, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 12586, "time_per_iteration": 2.45369815826416 }, { "auxiliary_loss_clip": 0.01109757, "auxiliary_loss_mlp": 0.01030563, "balance_loss_clip": 1.01841605, "balance_loss_mlp": 1.0372268, "epoch": 0.75677138133173, "flos": 22382905933440.0, "grad_norm": 2.3346264803412122, "language_loss": 0.83228827, "learning_rate": 5.890259809517459e-07, "loss": 0.85369146, "num_input_tokens_seen": 271495030, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.7265625, "step": 12587, "time_per_iteration": 5.279499769210815 }, { "auxiliary_loss_clip": 0.01105449, "auxiliary_loss_mlp": 0.0103099, "balance_loss_clip": 1.01837122, "balance_loss_mlp": 1.03652787, "epoch": 0.756831504584398, "flos": 22708974620160.0, "grad_norm": 2.073982169765035, "language_loss": 0.71178323, "learning_rate": 5.88749987312418e-07, "loss": 0.73314762, "num_input_tokens_seen": 271515355, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 12588, "time_per_iteration": 2.4643094539642334 }, { "auxiliary_loss_clip": 0.01108614, "auxiliary_loss_mlp": 0.01029189, "balance_loss_clip": 1.01595676, "balance_loss_mlp": 1.03740907, "epoch": 0.756891627837066, "flos": 24098358643200.0, "grad_norm": 2.378574351385035, "language_loss": 0.68937027, "learning_rate": 5.884740471878327e-07, "loss": 0.71074831, "num_input_tokens_seen": 271535090, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 12589, "time_per_iteration": 2.487717628479004 }, { "auxiliary_loss_clip": 0.01105027, "auxiliary_loss_mlp": 0.01029741, "balance_loss_clip": 1.01711082, "balance_loss_mlp": 1.03556848, "epoch": 0.756951751089734, "flos": 19749001438080.0, "grad_norm": 1.8097300632236895, "language_loss": 0.92470938, "learning_rate": 5.881981605884522e-07, "loss": 0.94605708, "num_input_tokens_seen": 271551075, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 12590, "time_per_iteration": 3.8597967624664307 }, { "auxiliary_loss_clip": 0.01102165, "auxiliary_loss_mlp": 0.01032487, "balance_loss_clip": 1.02014863, "balance_loss_mlp": 1.03484917, "epoch": 0.7570118743424019, "flos": 35079266551680.0, "grad_norm": 1.840721164960708, "language_loss": 0.65355182, "learning_rate": 5.879223275247391e-07, "loss": 0.67489833, "num_input_tokens_seen": 271571035, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.671875, "step": 12591, "time_per_iteration": 2.560814380645752 }, { "auxiliary_loss_clip": 0.01106558, "auxiliary_loss_mlp": 0.010227, "balance_loss_clip": 1.01119077, "balance_loss_mlp": 1.0377562, "epoch": 0.7570719975950699, "flos": 25594540778880.0, "grad_norm": 1.5886453817666546, "language_loss": 0.73539734, "learning_rate": 5.876465480071528e-07, "loss": 0.75668991, "num_input_tokens_seen": 271592950, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 12592, "time_per_iteration": 3.9498720169067383 }, { "auxiliary_loss_clip": 0.01106743, "auxiliary_loss_mlp": 0.01037321, "balance_loss_clip": 1.02433872, "balance_loss_mlp": 1.0356524, "epoch": 0.7571321208477378, "flos": 10816223028480.0, "grad_norm": 2.400378429644222, "language_loss": 0.71963531, "learning_rate": 5.873708220461522e-07, "loss": 0.74107599, "num_input_tokens_seen": 271608835, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 12593, "time_per_iteration": 2.4286208152770996 }, { "auxiliary_loss_clip": 0.01106686, "auxiliary_loss_mlp": 0.01028226, "balance_loss_clip": 1.01573896, "balance_loss_mlp": 1.03599286, "epoch": 0.7571922441004059, "flos": 18260109763200.0, "grad_norm": 1.9390220640984341, "language_loss": 0.66425872, "learning_rate": 5.870951496521903e-07, "loss": 0.68560791, "num_input_tokens_seen": 271627730, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 12594, "time_per_iteration": 2.466498613357544 }, { "auxiliary_loss_clip": 0.0110854, "auxiliary_loss_mlp": 0.01032801, "balance_loss_clip": 1.01945591, "balance_loss_mlp": 1.03673267, "epoch": 0.7572523673530738, "flos": 22890502978560.0, "grad_norm": 1.7061672360319378, "language_loss": 0.80941272, "learning_rate": 5.86819530835722e-07, "loss": 0.83082616, "num_input_tokens_seen": 271646415, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 12595, "time_per_iteration": 2.4970998764038086 }, { "auxiliary_loss_clip": 0.01105529, "auxiliary_loss_mlp": 0.0102943, "balance_loss_clip": 1.0175035, "balance_loss_mlp": 1.03702879, "epoch": 0.7573124906057418, "flos": 20996323171200.0, "grad_norm": 1.9094614420948586, "language_loss": 0.71834028, "learning_rate": 5.865439656071993e-07, "loss": 0.73968989, "num_input_tokens_seen": 271666240, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 12596, "time_per_iteration": 2.5025405883789062 }, { "auxiliary_loss_clip": 0.01105329, "auxiliary_loss_mlp": 0.01029679, "balance_loss_clip": 1.01783562, "balance_loss_mlp": 1.03732586, "epoch": 0.7573726138584097, "flos": 20886292834560.0, "grad_norm": 1.6345827578792003, "language_loss": 0.80923486, "learning_rate": 5.862684539770706e-07, "loss": 0.83058488, "num_input_tokens_seen": 271686370, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 12597, "time_per_iteration": 2.4683310985565186 }, { "auxiliary_loss_clip": 0.01112608, "auxiliary_loss_mlp": 0.01028935, "balance_loss_clip": 1.01547599, "balance_loss_mlp": 1.04036462, "epoch": 0.7574327371110777, "flos": 24530507170560.0, "grad_norm": 1.7873756124779927, "language_loss": 0.83175725, "learning_rate": 5.859929959557835e-07, "loss": 0.85317266, "num_input_tokens_seen": 271705050, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 12598, "time_per_iteration": 2.482820749282837 }, { "auxiliary_loss_clip": 0.01104918, "auxiliary_loss_mlp": 0.01026062, "balance_loss_clip": 1.01420045, "balance_loss_mlp": 1.03713024, "epoch": 0.7574928603637456, "flos": 23364523785600.0, "grad_norm": 2.287931615550017, "language_loss": 0.62479705, "learning_rate": 5.857175915537845e-07, "loss": 0.64610684, "num_input_tokens_seen": 271724915, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 12599, "time_per_iteration": 2.450878381729126 }, { "auxiliary_loss_clip": 0.01110024, "auxiliary_loss_mlp": 0.01031845, "balance_loss_clip": 1.01714611, "balance_loss_mlp": 1.0376699, "epoch": 0.7575529836164137, "flos": 13516274419200.0, "grad_norm": 2.5610069653778496, "language_loss": 0.63417381, "learning_rate": 5.854422407815161e-07, "loss": 0.65559244, "num_input_tokens_seen": 271742410, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.72265625, "step": 12600, "time_per_iteration": 2.4139528274536133 }, { "auxiliary_loss_clip": 0.01106098, "auxiliary_loss_mlp": 0.01033288, "balance_loss_clip": 1.02066398, "balance_loss_mlp": 1.03755307, "epoch": 0.7576131068690816, "flos": 19646584784640.0, "grad_norm": 1.765087267456489, "language_loss": 0.6649524, "learning_rate": 5.851669436494191e-07, "loss": 0.68634623, "num_input_tokens_seen": 271761425, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 12601, "time_per_iteration": 2.436915874481201 }, { "auxiliary_loss_clip": 0.01103061, "auxiliary_loss_mlp": 0.010286, "balance_loss_clip": 1.01698327, "balance_loss_mlp": 1.03613627, "epoch": 0.7576732301217496, "flos": 20048245643520.0, "grad_norm": 1.7544039548454824, "language_loss": 0.67339617, "learning_rate": 5.848917001679335e-07, "loss": 0.69471276, "num_input_tokens_seen": 271780875, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66796875, "step": 12602, "time_per_iteration": 2.4488155841827393 }, { "auxiliary_loss_clip": 0.01107281, "auxiliary_loss_mlp": 0.01037222, "balance_loss_clip": 1.02382267, "balance_loss_mlp": 1.0372541, "epoch": 0.7577333533744176, "flos": 15377093470080.0, "grad_norm": 1.807616962396943, "language_loss": 0.67191982, "learning_rate": 5.846165103474967e-07, "loss": 0.69336486, "num_input_tokens_seen": 271799490, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.703125, "step": 12603, "time_per_iteration": 2.413311004638672 }, { "auxiliary_loss_clip": 0.01101978, "auxiliary_loss_mlp": 0.01030261, "balance_loss_clip": 1.01890588, "balance_loss_mlp": 1.03478384, "epoch": 0.7577934766270855, "flos": 17894862316800.0, "grad_norm": 3.0731036386783743, "language_loss": 0.61163092, "learning_rate": 5.843413741985439e-07, "loss": 0.63295329, "num_input_tokens_seen": 271817040, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 12604, "time_per_iteration": 2.4545669555664062 }, { "auxiliary_loss_clip": 0.01107218, "auxiliary_loss_mlp": 0.01036813, "balance_loss_clip": 1.02340198, "balance_loss_mlp": 1.03837061, "epoch": 0.7578535998797535, "flos": 21613770984960.0, "grad_norm": 1.883154274967166, "language_loss": 0.7994898, "learning_rate": 5.840662917315076e-07, "loss": 0.82093012, "num_input_tokens_seen": 271835480, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6875, "step": 12605, "time_per_iteration": 2.460799217224121 }, { "auxiliary_loss_clip": 0.01109124, "auxiliary_loss_mlp": 0.01030058, "balance_loss_clip": 1.0168674, "balance_loss_mlp": 1.03683519, "epoch": 0.7579137231324214, "flos": 18478374756480.0, "grad_norm": 4.608993295795506, "language_loss": 0.7964747, "learning_rate": 5.837912629568198e-07, "loss": 0.8178665, "num_input_tokens_seen": 271849835, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 12606, "time_per_iteration": 2.43711519241333 }, { "auxiliary_loss_clip": 0.01104514, "auxiliary_loss_mlp": 0.01028024, "balance_loss_clip": 1.01681876, "balance_loss_mlp": 1.0381763, "epoch": 0.7579738463850895, "flos": 23255032152960.0, "grad_norm": 1.412398599555667, "language_loss": 0.72931242, "learning_rate": 5.835162878849087e-07, "loss": 0.75063777, "num_input_tokens_seen": 271869560, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6640625, "step": 12607, "time_per_iteration": 2.4654791355133057 }, { "auxiliary_loss_clip": 0.01108565, "auxiliary_loss_mlp": 0.01029644, "balance_loss_clip": 1.01657283, "balance_loss_mlp": 1.03535223, "epoch": 0.7580339696377574, "flos": 14027031861120.0, "grad_norm": 1.9581513350830464, "language_loss": 0.74969959, "learning_rate": 5.83241366526202e-07, "loss": 0.77108169, "num_input_tokens_seen": 271887950, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 12608, "time_per_iteration": 2.4415743350982666 }, { "auxiliary_loss_clip": 0.01104772, "auxiliary_loss_mlp": 0.01029861, "balance_loss_clip": 1.01706409, "balance_loss_mlp": 1.03682935, "epoch": 0.7580940928904254, "flos": 25082777756160.0, "grad_norm": 1.5771994313658182, "language_loss": 0.71376038, "learning_rate": 5.829664988911245e-07, "loss": 0.73510671, "num_input_tokens_seen": 271907700, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6796875, "step": 12609, "time_per_iteration": 2.4878313541412354 }, { "auxiliary_loss_clip": 0.01106611, "auxiliary_loss_mlp": 0.01034163, "balance_loss_clip": 1.02018559, "balance_loss_mlp": 1.03515494, "epoch": 0.7581542161430933, "flos": 23836425690240.0, "grad_norm": 1.7529205716763097, "language_loss": 0.81828898, "learning_rate": 5.826916849901007e-07, "loss": 0.83969676, "num_input_tokens_seen": 271926840, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71484375, "step": 12610, "time_per_iteration": 2.4864799976348877 }, { "auxiliary_loss_clip": 0.01109837, "auxiliary_loss_mlp": 0.0103236, "balance_loss_clip": 1.0193541, "balance_loss_mlp": 1.03807986, "epoch": 0.7582143393957613, "flos": 22237000888320.0, "grad_norm": 3.6410604757211695, "language_loss": 0.70532209, "learning_rate": 5.824169248335488e-07, "loss": 0.72674406, "num_input_tokens_seen": 271946465, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 12611, "time_per_iteration": 2.4624831676483154 }, { "auxiliary_loss_clip": 0.01107624, "auxiliary_loss_mlp": 0.0103169, "balance_loss_clip": 1.0188514, "balance_loss_mlp": 1.03760445, "epoch": 0.7582744626484292, "flos": 21106389421440.0, "grad_norm": 7.0595742100258905, "language_loss": 0.71162605, "learning_rate": 5.821422184318893e-07, "loss": 0.73301923, "num_input_tokens_seen": 271967295, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 12612, "time_per_iteration": 2.4691519737243652 }, { "auxiliary_loss_clip": 0.01109601, "auxiliary_loss_mlp": 0.01039023, "balance_loss_clip": 1.02653623, "balance_loss_mlp": 1.03775966, "epoch": 0.7583345859010973, "flos": 24604770539520.0, "grad_norm": 1.6536104876288544, "language_loss": 0.59479564, "learning_rate": 5.818675657955397e-07, "loss": 0.61628187, "num_input_tokens_seen": 271987960, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 12613, "time_per_iteration": 2.479914665222168 }, { "auxiliary_loss_clip": 0.01105721, "auxiliary_loss_mlp": 0.0103454, "balance_loss_clip": 1.02151084, "balance_loss_mlp": 1.03632617, "epoch": 0.7583947091537652, "flos": 33546814657920.0, "grad_norm": 2.102248608381691, "language_loss": 0.6010564, "learning_rate": 5.815929669349135e-07, "loss": 0.62245899, "num_input_tokens_seen": 272011780, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 12614, "time_per_iteration": 2.5655102729797363 }, { "auxiliary_loss_clip": 0.01107539, "auxiliary_loss_mlp": 0.01028675, "balance_loss_clip": 1.01524031, "balance_loss_mlp": 1.03595471, "epoch": 0.7584548324064332, "flos": 20121000641280.0, "grad_norm": 1.8442722429695872, "language_loss": 0.73299098, "learning_rate": 5.813184218604246e-07, "loss": 0.75435317, "num_input_tokens_seen": 272030825, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 12615, "time_per_iteration": 2.4432547092437744 }, { "auxiliary_loss_clip": 0.01031632, "auxiliary_loss_mlp": 0.01000441, "balance_loss_clip": 0.99923658, "balance_loss_mlp": 1.00886726, "epoch": 0.7585149556591012, "flos": 70402584061440.0, "grad_norm": 0.8117311736563707, "language_loss": 0.67631841, "learning_rate": 5.810439305824828e-07, "loss": 0.69663912, "num_input_tokens_seen": 272095825, "router_z_loss_clip": 0.01202393, "router_z_loss_mlp": 0.22753906, "step": 12616, "time_per_iteration": 3.1313397884368896 }, { "auxiliary_loss_clip": 0.01109517, "auxiliary_loss_mlp": 0.01033836, "balance_loss_clip": 1.02090836, "balance_loss_mlp": 1.03761816, "epoch": 0.7585750789117691, "flos": 16143786293760.0, "grad_norm": 1.9680955330376582, "language_loss": 0.84613633, "learning_rate": 5.807694931114979e-07, "loss": 0.8675698, "num_input_tokens_seen": 272113950, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 12617, "time_per_iteration": 2.4625658988952637 }, { "auxiliary_loss_clip": 0.01109864, "auxiliary_loss_mlp": 0.01032706, "balance_loss_clip": 1.02078509, "balance_loss_mlp": 1.03911078, "epoch": 0.7586352021644371, "flos": 17493165544320.0, "grad_norm": 2.3779234872304, "language_loss": 0.7520864, "learning_rate": 5.804951094578757e-07, "loss": 0.77351201, "num_input_tokens_seen": 272130315, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.70703125, "step": 12618, "time_per_iteration": 2.4348249435424805 }, { "auxiliary_loss_clip": 0.01110949, "auxiliary_loss_mlp": 0.01033339, "balance_loss_clip": 1.01976109, "balance_loss_mlp": 1.03734863, "epoch": 0.758695325417105, "flos": 17275187859840.0, "grad_norm": 2.4483972402901557, "language_loss": 0.77174759, "learning_rate": 5.802207796320209e-07, "loss": 0.79319048, "num_input_tokens_seen": 272149080, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73828125, "step": 12619, "time_per_iteration": 2.4224693775177 }, { "auxiliary_loss_clip": 0.0110564, "auxiliary_loss_mlp": 0.0103408, "balance_loss_clip": 1.02066898, "balance_loss_mlp": 1.03636312, "epoch": 0.7587554486697731, "flos": 29495660163840.0, "grad_norm": 1.7632278319160872, "language_loss": 0.82397187, "learning_rate": 5.79946503644337e-07, "loss": 0.84536904, "num_input_tokens_seen": 272168285, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6953125, "step": 12620, "time_per_iteration": 2.510606050491333 }, { "auxiliary_loss_clip": 0.01109538, "auxiliary_loss_mlp": 0.01038439, "balance_loss_clip": 1.02411628, "balance_loss_mlp": 1.03737032, "epoch": 0.758815571922441, "flos": 16100800692480.0, "grad_norm": 2.616104291762656, "language_loss": 0.82593882, "learning_rate": 5.796722815052242e-07, "loss": 0.84741867, "num_input_tokens_seen": 272184585, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.72265625, "step": 12621, "time_per_iteration": 2.420410394668579 }, { "auxiliary_loss_clip": 0.01107521, "auxiliary_loss_mlp": 0.01036828, "balance_loss_clip": 1.02420998, "balance_loss_mlp": 1.03765881, "epoch": 0.758875695175109, "flos": 16143714466560.0, "grad_norm": 4.259086467108462, "language_loss": 0.73645902, "learning_rate": 5.7939811322508e-07, "loss": 0.7579025, "num_input_tokens_seen": 272200205, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 12622, "time_per_iteration": 2.435755491256714 }, { "auxiliary_loss_clip": 0.01032844, "auxiliary_loss_mlp": 0.01001096, "balance_loss_clip": 0.99989241, "balance_loss_mlp": 1.01024365, "epoch": 0.7589358184277769, "flos": 68462006860800.0, "grad_norm": 0.8489049439946352, "language_loss": 0.60865963, "learning_rate": 5.791239988143024e-07, "loss": 0.62899899, "num_input_tokens_seen": 272259670, "router_z_loss_clip": 0.01202393, "router_z_loss_mlp": 0.2265625, "step": 12623, "time_per_iteration": 3.116145610809326 }, { "auxiliary_loss_clip": 0.01105572, "auxiliary_loss_mlp": 0.0103367, "balance_loss_clip": 1.02164221, "balance_loss_mlp": 1.03798771, "epoch": 0.7589959416804449, "flos": 20047311889920.0, "grad_norm": 2.1221415222189424, "language_loss": 0.6723876, "learning_rate": 5.788499382832847e-07, "loss": 0.69378, "num_input_tokens_seen": 272277925, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.67578125, "step": 12624, "time_per_iteration": 2.444310426712036 }, { "auxiliary_loss_clip": 0.01103516, "auxiliary_loss_mlp": 0.01028398, "balance_loss_clip": 1.01592898, "balance_loss_mlp": 1.03622139, "epoch": 0.7590560649331128, "flos": 18771800958720.0, "grad_norm": 1.873547771611416, "language_loss": 0.76181132, "learning_rate": 5.785759316424196e-07, "loss": 0.78313047, "num_input_tokens_seen": 272296010, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.671875, "step": 12625, "time_per_iteration": 2.4347152709960938 }, { "auxiliary_loss_clip": 0.011071, "auxiliary_loss_mlp": 0.0103808, "balance_loss_clip": 1.02521777, "balance_loss_mlp": 1.03878689, "epoch": 0.7591161881857809, "flos": 29825284296960.0, "grad_norm": 1.9289638604757298, "language_loss": 0.62964725, "learning_rate": 5.783019789020977e-07, "loss": 0.65109903, "num_input_tokens_seen": 272318330, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.68359375, "step": 12626, "time_per_iteration": 2.5245578289031982 }, { "auxiliary_loss_clip": 0.01109831, "auxiliary_loss_mlp": 0.01042304, "balance_loss_clip": 1.0282433, "balance_loss_mlp": 1.03930306, "epoch": 0.7591763114384488, "flos": 20302708567680.0, "grad_norm": 2.0275163007956043, "language_loss": 0.74283266, "learning_rate": 5.780280800727084e-07, "loss": 0.76435399, "num_input_tokens_seen": 272335265, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.703125, "step": 12627, "time_per_iteration": 2.427344560623169 }, { "auxiliary_loss_clip": 0.01108741, "auxiliary_loss_mlp": 0.01031499, "balance_loss_clip": 1.01830268, "balance_loss_mlp": 1.03760839, "epoch": 0.7592364346911168, "flos": 20813609664000.0, "grad_norm": 2.094563196364682, "language_loss": 0.68634808, "learning_rate": 5.777542351646356e-07, "loss": 0.7077505, "num_input_tokens_seen": 272354795, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 12628, "time_per_iteration": 5.433999538421631 }, { "auxiliary_loss_clip": 0.0111636, "auxiliary_loss_mlp": 0.01036664, "balance_loss_clip": 1.02243042, "balance_loss_mlp": 1.04112768, "epoch": 0.7592965579437848, "flos": 21251504367360.0, "grad_norm": 4.181459777018976, "language_loss": 0.63075459, "learning_rate": 5.774804441882648e-07, "loss": 0.65228486, "num_input_tokens_seen": 272372875, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75390625, "step": 12629, "time_per_iteration": 2.460569143295288 }, { "auxiliary_loss_clip": 0.01101311, "auxiliary_loss_mlp": 0.01028073, "balance_loss_clip": 1.01568747, "balance_loss_mlp": 1.03479576, "epoch": 0.7593566811964527, "flos": 26213604704640.0, "grad_norm": 1.6873161330666433, "language_loss": 0.77871579, "learning_rate": 5.772067071539786e-07, "loss": 0.80000961, "num_input_tokens_seen": 272394715, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6640625, "step": 12630, "time_per_iteration": 2.5259387493133545 }, { "auxiliary_loss_clip": 0.01032609, "auxiliary_loss_mlp": 0.01003026, "balance_loss_clip": 1.00176275, "balance_loss_mlp": 1.01001978, "epoch": 0.7594168044491207, "flos": 71237255374080.0, "grad_norm": 0.824733781010292, "language_loss": 0.61467624, "learning_rate": 5.769330240721562e-07, "loss": 0.63503259, "num_input_tokens_seen": 272458775, "router_z_loss_clip": 0.01263428, "router_z_loss_mlp": 0.2265625, "step": 12631, "time_per_iteration": 3.125445604324341 }, { "auxiliary_loss_clip": 0.01114232, "auxiliary_loss_mlp": 0.01038144, "balance_loss_clip": 1.02340984, "balance_loss_mlp": 1.04016328, "epoch": 0.7594769277017887, "flos": 26613326229120.0, "grad_norm": 1.719658343845679, "language_loss": 0.73845321, "learning_rate": 5.766593949531767e-07, "loss": 0.75997698, "num_input_tokens_seen": 272479355, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.7421875, "step": 12632, "time_per_iteration": 3.8889195919036865 }, { "auxiliary_loss_clip": 0.01108087, "auxiliary_loss_mlp": 0.01031782, "balance_loss_clip": 1.01885974, "balance_loss_mlp": 1.03804696, "epoch": 0.7595370509544567, "flos": 17595941333760.0, "grad_norm": 2.904608432489095, "language_loss": 0.74970931, "learning_rate": 5.763858198074154e-07, "loss": 0.77110803, "num_input_tokens_seen": 272493555, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 12633, "time_per_iteration": 3.878209352493286 }, { "auxiliary_loss_clip": 0.01105509, "auxiliary_loss_mlp": 0.01030441, "balance_loss_clip": 1.01872253, "balance_loss_mlp": 1.03682554, "epoch": 0.7595971742071246, "flos": 18002953319040.0, "grad_norm": 1.787359156536213, "language_loss": 0.73515117, "learning_rate": 5.76112298645246e-07, "loss": 0.75651073, "num_input_tokens_seen": 272508925, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 12634, "time_per_iteration": 2.446394681930542 }, { "auxiliary_loss_clip": 0.01109417, "auxiliary_loss_mlp": 0.01031161, "balance_loss_clip": 1.01848364, "balance_loss_mlp": 1.03884912, "epoch": 0.7596572974597926, "flos": 28840326480000.0, "grad_norm": 1.7401079579313414, "language_loss": 0.64735931, "learning_rate": 5.758388314770408e-07, "loss": 0.66876507, "num_input_tokens_seen": 272528805, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 12635, "time_per_iteration": 2.5501792430877686 }, { "auxiliary_loss_clip": 0.01109537, "auxiliary_loss_mlp": 0.01032415, "balance_loss_clip": 1.0188489, "balance_loss_mlp": 1.0371325, "epoch": 0.7597174207124605, "flos": 14282823588480.0, "grad_norm": 1.7921772882860425, "language_loss": 0.68899375, "learning_rate": 5.7556541831317e-07, "loss": 0.71041322, "num_input_tokens_seen": 272546655, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.72265625, "step": 12636, "time_per_iteration": 2.4663116931915283 }, { "auxiliary_loss_clip": 0.01110743, "auxiliary_loss_mlp": 0.01033462, "balance_loss_clip": 1.02067041, "balance_loss_mlp": 1.03964436, "epoch": 0.7597775439651285, "flos": 21688932193920.0, "grad_norm": 1.9289748705979994, "language_loss": 0.81130028, "learning_rate": 5.752920591640018e-07, "loss": 0.83274233, "num_input_tokens_seen": 272564010, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 12637, "time_per_iteration": 2.4713380336761475 }, { "auxiliary_loss_clip": 0.01106984, "auxiliary_loss_mlp": 0.0103552, "balance_loss_clip": 1.02292562, "balance_loss_mlp": 1.03654587, "epoch": 0.7598376672177964, "flos": 36101248312320.0, "grad_norm": 2.894680912705174, "language_loss": 0.66699886, "learning_rate": 5.750187540399017e-07, "loss": 0.68842387, "num_input_tokens_seen": 272585840, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 12638, "time_per_iteration": 2.596257209777832 }, { "auxiliary_loss_clip": 0.01109405, "auxiliary_loss_mlp": 0.01034068, "balance_loss_clip": 1.01957214, "balance_loss_mlp": 1.0382427, "epoch": 0.7598977904704645, "flos": 18332326056960.0, "grad_norm": 2.143491040812216, "language_loss": 0.65337956, "learning_rate": 5.747455029512323e-07, "loss": 0.67481428, "num_input_tokens_seen": 272602300, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7109375, "step": 12639, "time_per_iteration": 2.4481842517852783 }, { "auxiliary_loss_clip": 0.01106343, "auxiliary_loss_mlp": 0.01033132, "balance_loss_clip": 1.01976275, "balance_loss_mlp": 1.03695846, "epoch": 0.7599579137231324, "flos": 20192642317440.0, "grad_norm": 2.008277811242643, "language_loss": 0.70253146, "learning_rate": 5.744723059083572e-07, "loss": 0.72392619, "num_input_tokens_seen": 272619595, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6953125, "step": 12640, "time_per_iteration": 2.4407575130462646 }, { "auxiliary_loss_clip": 0.01111079, "auxiliary_loss_mlp": 0.01034623, "balance_loss_clip": 1.02051473, "balance_loss_mlp": 1.03848624, "epoch": 0.7600180369758004, "flos": 24024849459840.0, "grad_norm": 1.7833746043906946, "language_loss": 0.67005682, "learning_rate": 5.741991629216343e-07, "loss": 0.6915139, "num_input_tokens_seen": 272638825, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7265625, "step": 12641, "time_per_iteration": 2.4934866428375244 }, { "auxiliary_loss_clip": 0.0110742, "auxiliary_loss_mlp": 0.01034516, "balance_loss_clip": 1.02046704, "balance_loss_mlp": 1.03559959, "epoch": 0.7600781602284684, "flos": 18989527248000.0, "grad_norm": 2.2316536979783113, "language_loss": 0.67168629, "learning_rate": 5.73926074001422e-07, "loss": 0.69310564, "num_input_tokens_seen": 272657240, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.71875, "step": 12642, "time_per_iteration": 2.4257051944732666 }, { "auxiliary_loss_clip": 0.01108294, "auxiliary_loss_mlp": 0.01034766, "balance_loss_clip": 1.02217185, "balance_loss_mlp": 1.04031515, "epoch": 0.7601382834811363, "flos": 26067520091520.0, "grad_norm": 1.9563672068889202, "language_loss": 0.75523335, "learning_rate": 5.736530391580765e-07, "loss": 0.7766639, "num_input_tokens_seen": 272677520, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 12643, "time_per_iteration": 2.493915557861328 }, { "auxiliary_loss_clip": 0.01109861, "auxiliary_loss_mlp": 0.0103857, "balance_loss_clip": 1.02446187, "balance_loss_mlp": 1.03847563, "epoch": 0.7601984067338043, "flos": 18844232734080.0, "grad_norm": 2.3389384375612843, "language_loss": 0.78850251, "learning_rate": 5.733800584019508e-07, "loss": 0.80998683, "num_input_tokens_seen": 272696770, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.71484375, "step": 12644, "time_per_iteration": 2.4314475059509277 }, { "auxiliary_loss_clip": 0.01109244, "auxiliary_loss_mlp": 0.01029461, "balance_loss_clip": 1.01718211, "balance_loss_mlp": 1.03884101, "epoch": 0.7602585299864723, "flos": 24646391424000.0, "grad_norm": 1.7381661655384624, "language_loss": 0.8040244, "learning_rate": 5.731071317433957e-07, "loss": 0.82541144, "num_input_tokens_seen": 272718340, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 12645, "time_per_iteration": 2.5072662830352783 }, { "auxiliary_loss_clip": 0.0110991, "auxiliary_loss_mlp": 0.01035024, "balance_loss_clip": 1.02147567, "balance_loss_mlp": 1.03758299, "epoch": 0.7603186532391403, "flos": 23842100039040.0, "grad_norm": 1.6394271238516833, "language_loss": 0.73066336, "learning_rate": 5.728342591927611e-07, "loss": 0.75211269, "num_input_tokens_seen": 272739575, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 12646, "time_per_iteration": 2.5029513835906982 }, { "auxiliary_loss_clip": 0.01105319, "auxiliary_loss_mlp": 0.01034995, "balance_loss_clip": 1.02262759, "balance_loss_mlp": 1.03713703, "epoch": 0.7603787764918082, "flos": 22199905117440.0, "grad_norm": 2.0344549193054826, "language_loss": 0.67241776, "learning_rate": 5.725614407603949e-07, "loss": 0.69382089, "num_input_tokens_seen": 272758710, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.68359375, "step": 12647, "time_per_iteration": 2.4598495960235596 }, { "auxiliary_loss_clip": 0.010329, "auxiliary_loss_mlp": 0.01000366, "balance_loss_clip": 0.9992277, "balance_loss_mlp": 1.01049542, "epoch": 0.7604388997444762, "flos": 54086894254080.0, "grad_norm": 0.6682018405626626, "language_loss": 0.48955822, "learning_rate": 5.722886764566415e-07, "loss": 0.50989085, "num_input_tokens_seen": 272814855, "router_z_loss_clip": 0.01141357, "router_z_loss_mlp": 0.22460938, "step": 12648, "time_per_iteration": 3.0402557849884033 }, { "auxiliary_loss_clip": 0.0110487, "auxiliary_loss_mlp": 0.01031596, "balance_loss_clip": 1.01897752, "balance_loss_mlp": 1.03696597, "epoch": 0.7604990229971441, "flos": 19681920789120.0, "grad_norm": 1.6668380526934394, "language_loss": 0.76622188, "learning_rate": 5.720159662918451e-07, "loss": 0.78758657, "num_input_tokens_seen": 272834400, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 12649, "time_per_iteration": 2.4814531803131104 }, { "auxiliary_loss_clip": 0.01105382, "auxiliary_loss_mlp": 0.01034599, "balance_loss_clip": 1.02131939, "balance_loss_mlp": 1.03628159, "epoch": 0.7605591462498121, "flos": 25228036356480.0, "grad_norm": 2.4680236059032303, "language_loss": 0.68736589, "learning_rate": 5.717433102763462e-07, "loss": 0.70876569, "num_input_tokens_seen": 272854760, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69140625, "step": 12650, "time_per_iteration": 2.522993326187134 }, { "auxiliary_loss_clip": 0.01032611, "auxiliary_loss_mlp": 0.01000606, "balance_loss_clip": 0.99944937, "balance_loss_mlp": 1.01014471, "epoch": 0.76061926950248, "flos": 66783757662720.0, "grad_norm": 0.7551273239985555, "language_loss": 0.62738717, "learning_rate": 5.714707084204838e-07, "loss": 0.64771938, "num_input_tokens_seen": 272919030, "router_z_loss_clip": 0.01153564, "router_z_loss_mlp": 0.22460938, "step": 12651, "time_per_iteration": 3.078788995742798 }, { "auxiliary_loss_clip": 0.01105794, "auxiliary_loss_mlp": 0.01032436, "balance_loss_clip": 1.02031207, "balance_loss_mlp": 1.03683078, "epoch": 0.7606793927551481, "flos": 25338354001920.0, "grad_norm": 1.6485724659037866, "language_loss": 0.71775776, "learning_rate": 5.711981607345951e-07, "loss": 0.73914003, "num_input_tokens_seen": 272938925, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 12652, "time_per_iteration": 2.496809482574463 }, { "auxiliary_loss_clip": 0.01108294, "auxiliary_loss_mlp": 0.0103758, "balance_loss_clip": 1.02427065, "balance_loss_mlp": 1.03719544, "epoch": 0.760739516007816, "flos": 18223624523520.0, "grad_norm": 1.8805980395184454, "language_loss": 0.80098557, "learning_rate": 5.709256672290152e-07, "loss": 0.82244432, "num_input_tokens_seen": 272954945, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 12653, "time_per_iteration": 2.446347951889038 }, { "auxiliary_loss_clip": 0.01111879, "auxiliary_loss_mlp": 0.01030447, "balance_loss_clip": 1.01797211, "balance_loss_mlp": 1.03866792, "epoch": 0.760799639260484, "flos": 22559119079040.0, "grad_norm": 1.691079313185154, "language_loss": 0.80468571, "learning_rate": 5.706532279140785e-07, "loss": 0.82610893, "num_input_tokens_seen": 272972855, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.734375, "step": 12654, "time_per_iteration": 2.4509174823760986 }, { "auxiliary_loss_clip": 0.0111031, "auxiliary_loss_mlp": 0.01038003, "balance_loss_clip": 1.02456784, "balance_loss_mlp": 1.0378418, "epoch": 0.760859762513152, "flos": 22309324922880.0, "grad_norm": 2.252257871340272, "language_loss": 0.79193723, "learning_rate": 5.703808428001136e-07, "loss": 0.81342036, "num_input_tokens_seen": 272989895, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 12655, "time_per_iteration": 2.453639268875122 }, { "auxiliary_loss_clip": 0.01102221, "auxiliary_loss_mlp": 0.01026154, "balance_loss_clip": 1.01512742, "balance_loss_mlp": 1.03566861, "epoch": 0.7609198857658199, "flos": 24863902231680.0, "grad_norm": 1.7208541742288803, "language_loss": 0.68413395, "learning_rate": 5.701085118974505e-07, "loss": 0.70541775, "num_input_tokens_seen": 273011695, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6640625, "step": 12656, "time_per_iteration": 2.53365159034729 }, { "auxiliary_loss_clip": 0.01108701, "auxiliary_loss_mlp": 0.01031883, "balance_loss_clip": 1.01829338, "balance_loss_mlp": 1.03551769, "epoch": 0.760980009018488, "flos": 16836790366080.0, "grad_norm": 2.29008629882702, "language_loss": 0.7342549, "learning_rate": 5.698362352164164e-07, "loss": 0.75566077, "num_input_tokens_seen": 273028815, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 12657, "time_per_iteration": 2.4290337562561035 }, { "auxiliary_loss_clip": 0.0103224, "auxiliary_loss_mlp": 0.01003039, "balance_loss_clip": 1.00178111, "balance_loss_mlp": 1.00984669, "epoch": 0.7610401322711559, "flos": 61230603029760.0, "grad_norm": 0.8727310628257313, "language_loss": 0.64910197, "learning_rate": 5.695640127673347e-07, "loss": 0.66945475, "num_input_tokens_seen": 273084080, "router_z_loss_clip": 0.01257324, "router_z_loss_mlp": 0.22460938, "step": 12658, "time_per_iteration": 3.0298523902893066 }, { "auxiliary_loss_clip": 0.01102383, "auxiliary_loss_mlp": 0.01036334, "balance_loss_clip": 1.02400732, "balance_loss_mlp": 1.03578186, "epoch": 0.7611002555238239, "flos": 19640730867840.0, "grad_norm": 1.8226427374306433, "language_loss": 0.79464531, "learning_rate": 5.692918445605293e-07, "loss": 0.81603253, "num_input_tokens_seen": 273102295, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6640625, "step": 12659, "time_per_iteration": 2.447282314300537 }, { "auxiliary_loss_clip": 0.01104471, "auxiliary_loss_mlp": 0.01027071, "balance_loss_clip": 1.01398826, "balance_loss_mlp": 1.03558493, "epoch": 0.7611603787764918, "flos": 26872206526080.0, "grad_norm": 1.6477963703121898, "language_loss": 0.68995833, "learning_rate": 5.690197306063209e-07, "loss": 0.71127379, "num_input_tokens_seen": 273123400, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69140625, "step": 12660, "time_per_iteration": 2.505329132080078 }, { "auxiliary_loss_clip": 0.01108154, "auxiliary_loss_mlp": 0.01031428, "balance_loss_clip": 1.01919127, "balance_loss_mlp": 1.03732085, "epoch": 0.7612205020291598, "flos": 27344252085120.0, "grad_norm": 1.8129697467555645, "language_loss": 0.70600712, "learning_rate": 5.687476709150281e-07, "loss": 0.72740293, "num_input_tokens_seen": 273145150, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.70703125, "step": 12661, "time_per_iteration": 2.517779588699341 }, { "auxiliary_loss_clip": 0.01105166, "auxiliary_loss_mlp": 0.01031159, "balance_loss_clip": 1.01837349, "balance_loss_mlp": 1.03511846, "epoch": 0.7612806252818277, "flos": 29314598682240.0, "grad_norm": 1.53892655716149, "language_loss": 0.83396298, "learning_rate": 5.68475665496966e-07, "loss": 0.85532618, "num_input_tokens_seen": 273165180, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 12662, "time_per_iteration": 2.5338494777679443 }, { "auxiliary_loss_clip": 0.01106672, "auxiliary_loss_mlp": 0.01040646, "balance_loss_clip": 1.02801597, "balance_loss_mlp": 1.03564382, "epoch": 0.7613407485344957, "flos": 19026048401280.0, "grad_norm": 1.7847928791320633, "language_loss": 0.68852645, "learning_rate": 5.682037143624505e-07, "loss": 0.70999962, "num_input_tokens_seen": 273184005, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 12663, "time_per_iteration": 2.453587770462036 }, { "auxiliary_loss_clip": 0.01106216, "auxiliary_loss_mlp": 0.01027109, "balance_loss_clip": 1.01492012, "balance_loss_mlp": 1.03824258, "epoch": 0.7614008717871636, "flos": 23256037733760.0, "grad_norm": 1.6330144165664433, "language_loss": 0.70161986, "learning_rate": 5.67931817521794e-07, "loss": 0.72295308, "num_input_tokens_seen": 273203565, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 12664, "time_per_iteration": 2.465803623199463 }, { "auxiliary_loss_clip": 0.01112239, "auxiliary_loss_mlp": 0.01038974, "balance_loss_clip": 1.02522874, "balance_loss_mlp": 1.03940785, "epoch": 0.7614609950398317, "flos": 21579907438080.0, "grad_norm": 1.8625880812279603, "language_loss": 0.79510415, "learning_rate": 5.676599749853066e-07, "loss": 0.81661624, "num_input_tokens_seen": 273221645, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7265625, "step": 12665, "time_per_iteration": 2.4849002361297607 }, { "auxiliary_loss_clip": 0.01107297, "auxiliary_loss_mlp": 0.0103541, "balance_loss_clip": 1.02319646, "balance_loss_mlp": 1.03977036, "epoch": 0.7615211182924996, "flos": 29277897960960.0, "grad_norm": 1.627173538284549, "language_loss": 0.87997019, "learning_rate": 5.673881867632959e-07, "loss": 0.90139729, "num_input_tokens_seen": 273242040, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 12666, "time_per_iteration": 2.513105630874634 }, { "auxiliary_loss_clip": 0.01107381, "auxiliary_loss_mlp": 0.01035304, "balance_loss_clip": 1.02219069, "balance_loss_mlp": 1.03691125, "epoch": 0.7615812415451676, "flos": 13261129136640.0, "grad_norm": 2.9414015363303814, "language_loss": 0.83887935, "learning_rate": 5.671164528660693e-07, "loss": 0.86030626, "num_input_tokens_seen": 273257365, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 12667, "time_per_iteration": 2.438340425491333 }, { "auxiliary_loss_clip": 0.0110731, "auxiliary_loss_mlp": 0.01038062, "balance_loss_clip": 1.02595079, "balance_loss_mlp": 1.03900278, "epoch": 0.7616413647978356, "flos": 18584741905920.0, "grad_norm": 3.490560706580034, "language_loss": 0.7845664, "learning_rate": 5.668447733039296e-07, "loss": 0.80602014, "num_input_tokens_seen": 273274710, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 12668, "time_per_iteration": 2.4098033905029297 }, { "auxiliary_loss_clip": 0.01105439, "auxiliary_loss_mlp": 0.01028496, "balance_loss_clip": 1.01627743, "balance_loss_mlp": 1.03662777, "epoch": 0.7617014880505035, "flos": 18516188799360.0, "grad_norm": 2.210262488363814, "language_loss": 0.63926882, "learning_rate": 5.6657314808718e-07, "loss": 0.66060817, "num_input_tokens_seen": 273292870, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 12669, "time_per_iteration": 2.4502365589141846 }, { "auxiliary_loss_clip": 0.01108028, "auxiliary_loss_mlp": 0.01037979, "balance_loss_clip": 1.02349544, "balance_loss_mlp": 1.03601694, "epoch": 0.7617616113031715, "flos": 24973178382720.0, "grad_norm": 1.8175728593838243, "language_loss": 0.66267991, "learning_rate": 5.663015772261202e-07, "loss": 0.68413997, "num_input_tokens_seen": 273312375, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.71875, "step": 12670, "time_per_iteration": 4.0548200607299805 }, { "auxiliary_loss_clip": 0.01108074, "auxiliary_loss_mlp": 0.01035716, "balance_loss_clip": 1.02303231, "balance_loss_mlp": 1.03588641, "epoch": 0.7618217345558395, "flos": 23295036925440.0, "grad_norm": 1.5252338882657874, "language_loss": 0.73459321, "learning_rate": 5.660300607310493e-07, "loss": 0.7560311, "num_input_tokens_seen": 273332590, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 12671, "time_per_iteration": 2.470381498336792 }, { "auxiliary_loss_clip": 0.01103677, "auxiliary_loss_mlp": 0.01034579, "balance_loss_clip": 1.02297401, "balance_loss_mlp": 1.03602219, "epoch": 0.7618818578085075, "flos": 25482894330240.0, "grad_norm": 1.6372999035518327, "language_loss": 0.73510957, "learning_rate": 5.657585986122613e-07, "loss": 0.75649214, "num_input_tokens_seen": 273352885, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 12672, "time_per_iteration": 2.5252161026000977 }, { "auxiliary_loss_clip": 0.01031955, "auxiliary_loss_mlp": 0.01001132, "balance_loss_clip": 0.99999905, "balance_loss_mlp": 1.00940549, "epoch": 0.7619419810611754, "flos": 61151994115200.0, "grad_norm": 0.7730035415145895, "language_loss": 0.56648958, "learning_rate": 5.654871908800506e-07, "loss": 0.58682048, "num_input_tokens_seen": 273411730, "router_z_loss_clip": 0.01135254, "router_z_loss_mlp": 0.22558594, "step": 12673, "time_per_iteration": 4.547141075134277 }, { "auxiliary_loss_clip": 0.01108437, "auxiliary_loss_mlp": 0.01030709, "balance_loss_clip": 1.01724458, "balance_loss_mlp": 1.0371623, "epoch": 0.7620021043138434, "flos": 23258659426560.0, "grad_norm": 1.8060281407091472, "language_loss": 0.74784243, "learning_rate": 5.652158375447102e-07, "loss": 0.76923382, "num_input_tokens_seen": 273430020, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 12674, "time_per_iteration": 2.5178749561309814 }, { "auxiliary_loss_clip": 0.01105342, "auxiliary_loss_mlp": 0.01027772, "balance_loss_clip": 1.0157795, "balance_loss_mlp": 1.03661823, "epoch": 0.7620622275665113, "flos": 25082490447360.0, "grad_norm": 2.1048014133783854, "language_loss": 0.72496843, "learning_rate": 5.649445386165286e-07, "loss": 0.74629956, "num_input_tokens_seen": 273448690, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 12675, "time_per_iteration": 3.905339479446411 }, { "auxiliary_loss_clip": 0.01103586, "auxiliary_loss_mlp": 0.01032781, "balance_loss_clip": 1.02061617, "balance_loss_mlp": 1.03670979, "epoch": 0.7621223508191793, "flos": 20155007842560.0, "grad_norm": 2.3414392075801853, "language_loss": 0.73001367, "learning_rate": 5.646732941057936e-07, "loss": 0.75137728, "num_input_tokens_seen": 273465190, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.66796875, "step": 12676, "time_per_iteration": 2.4213271141052246 }, { "auxiliary_loss_clip": 0.01114133, "auxiliary_loss_mlp": 0.0103289, "balance_loss_clip": 1.01972938, "balance_loss_mlp": 1.0393784, "epoch": 0.7621824740718472, "flos": 18000187971840.0, "grad_norm": 2.781273294807057, "language_loss": 0.53808075, "learning_rate": 5.644021040227927e-07, "loss": 0.559551, "num_input_tokens_seen": 273478620, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.74609375, "step": 12677, "time_per_iteration": 2.420238733291626 }, { "auxiliary_loss_clip": 0.01108017, "auxiliary_loss_mlp": 0.01033792, "balance_loss_clip": 1.02057207, "balance_loss_mlp": 1.03767681, "epoch": 0.7622425973245153, "flos": 21725668828800.0, "grad_norm": 3.318666508999385, "language_loss": 0.78483194, "learning_rate": 5.641309683778064e-07, "loss": 0.8062501, "num_input_tokens_seen": 273497635, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 12678, "time_per_iteration": 2.4401063919067383 }, { "auxiliary_loss_clip": 0.01107782, "auxiliary_loss_mlp": 0.01033039, "balance_loss_clip": 1.01994967, "balance_loss_mlp": 1.0373137, "epoch": 0.7623027205771832, "flos": 19718549683200.0, "grad_norm": 1.8427693663612668, "language_loss": 0.77407384, "learning_rate": 5.638598871811175e-07, "loss": 0.79548204, "num_input_tokens_seen": 273513955, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 12679, "time_per_iteration": 2.4513912200927734 }, { "auxiliary_loss_clip": 0.01107517, "auxiliary_loss_mlp": 0.01027913, "balance_loss_clip": 1.01533699, "balance_loss_mlp": 1.0376184, "epoch": 0.7623628438298512, "flos": 23988831096960.0, "grad_norm": 1.5556606883385753, "language_loss": 0.80200094, "learning_rate": 5.635888604430059e-07, "loss": 0.82335526, "num_input_tokens_seen": 273533970, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 12680, "time_per_iteration": 2.471130132675171 }, { "auxiliary_loss_clip": 0.01108248, "auxiliary_loss_mlp": 0.01030141, "balance_loss_clip": 1.01618207, "balance_loss_mlp": 1.03768313, "epoch": 0.7624229670825191, "flos": 22345702421760.0, "grad_norm": 1.6490066399351666, "language_loss": 0.62769687, "learning_rate": 5.633178881737493e-07, "loss": 0.64908075, "num_input_tokens_seen": 273553090, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.70703125, "step": 12681, "time_per_iteration": 2.4600377082824707 }, { "auxiliary_loss_clip": 0.01105192, "auxiliary_loss_mlp": 0.01032928, "balance_loss_clip": 1.02064943, "balance_loss_mlp": 1.03687787, "epoch": 0.7624830903351871, "flos": 22711775880960.0, "grad_norm": 1.9807992258644005, "language_loss": 0.76332521, "learning_rate": 5.63046970383622e-07, "loss": 0.78470635, "num_input_tokens_seen": 273572460, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 12682, "time_per_iteration": 2.4614298343658447 }, { "auxiliary_loss_clip": 0.01103544, "auxiliary_loss_mlp": 0.01033108, "balance_loss_clip": 1.02113295, "balance_loss_mlp": 1.03598607, "epoch": 0.7625432135878552, "flos": 25593714766080.0, "grad_norm": 1.5979576949016352, "language_loss": 0.68303549, "learning_rate": 5.627761070828974e-07, "loss": 0.70440197, "num_input_tokens_seen": 273592815, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.67578125, "step": 12683, "time_per_iteration": 2.4951159954071045 }, { "auxiliary_loss_clip": 0.0110662, "auxiliary_loss_mlp": 0.01029091, "balance_loss_clip": 1.01656234, "balance_loss_mlp": 1.03720915, "epoch": 0.7626033368405231, "flos": 23987645948160.0, "grad_norm": 2.0703996822628867, "language_loss": 0.83042467, "learning_rate": 5.625052982818472e-07, "loss": 0.85178185, "num_input_tokens_seen": 273611790, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 12684, "time_per_iteration": 2.4693899154663086 }, { "auxiliary_loss_clip": 0.01108788, "auxiliary_loss_mlp": 0.01038768, "balance_loss_clip": 1.02471304, "balance_loss_mlp": 1.03718531, "epoch": 0.7626634600931911, "flos": 12599115523200.0, "grad_norm": 1.959976534800086, "language_loss": 0.8283, "learning_rate": 5.622345439907396e-07, "loss": 0.84977567, "num_input_tokens_seen": 273628340, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.71484375, "step": 12685, "time_per_iteration": 2.433854818344116 }, { "auxiliary_loss_clip": 0.01108551, "auxiliary_loss_mlp": 0.01027393, "balance_loss_clip": 1.01453626, "balance_loss_mlp": 1.0376631, "epoch": 0.762723583345859, "flos": 26322593546880.0, "grad_norm": 2.158531482342991, "language_loss": 0.77566504, "learning_rate": 5.619638442198422e-07, "loss": 0.79702449, "num_input_tokens_seen": 273646585, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 12686, "time_per_iteration": 2.4889862537384033 }, { "auxiliary_loss_clip": 0.01110548, "auxiliary_loss_mlp": 0.01039978, "balance_loss_clip": 1.02563095, "balance_loss_mlp": 1.0382309, "epoch": 0.762783706598527, "flos": 21907053532800.0, "grad_norm": 1.9373233288458402, "language_loss": 0.72167492, "learning_rate": 5.616931989794198e-07, "loss": 0.74318022, "num_input_tokens_seen": 273665410, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.72265625, "step": 12687, "time_per_iteration": 2.4780328273773193 }, { "auxiliary_loss_clip": 0.01107014, "auxiliary_loss_mlp": 0.01035944, "balance_loss_clip": 1.02206826, "balance_loss_mlp": 1.03690672, "epoch": 0.7628438298511949, "flos": 15339782217600.0, "grad_norm": 2.0065938567192454, "language_loss": 0.64781153, "learning_rate": 5.614226082797369e-07, "loss": 0.66924113, "num_input_tokens_seen": 273683035, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.703125, "step": 12688, "time_per_iteration": 2.4120559692382812 }, { "auxiliary_loss_clip": 0.01106084, "auxiliary_loss_mlp": 0.0103003, "balance_loss_clip": 1.01807356, "balance_loss_mlp": 1.03865671, "epoch": 0.7629039531038629, "flos": 13006307076480.0, "grad_norm": 2.389644843984655, "language_loss": 0.70757127, "learning_rate": 5.611520721310515e-07, "loss": 0.72893238, "num_input_tokens_seen": 273700130, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.67578125, "step": 12689, "time_per_iteration": 2.4455182552337646 }, { "auxiliary_loss_clip": 0.01112887, "auxiliary_loss_mlp": 0.0104208, "balance_loss_clip": 1.0289793, "balance_loss_mlp": 1.03907466, "epoch": 0.7629640763565309, "flos": 26171660597760.0, "grad_norm": 1.9652168583966991, "language_loss": 0.69749904, "learning_rate": 5.608815905436238e-07, "loss": 0.71904868, "num_input_tokens_seen": 273720310, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 12690, "time_per_iteration": 2.4950404167175293 }, { "auxiliary_loss_clip": 0.01107192, "auxiliary_loss_mlp": 0.01028915, "balance_loss_clip": 1.01679754, "balance_loss_mlp": 1.03703761, "epoch": 0.7630241996091989, "flos": 36793713680640.0, "grad_norm": 3.253384067631571, "language_loss": 0.69521874, "learning_rate": 5.606111635277109e-07, "loss": 0.71657979, "num_input_tokens_seen": 273744475, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 12691, "time_per_iteration": 2.577000141143799 }, { "auxiliary_loss_clip": 0.01105307, "auxiliary_loss_mlp": 0.01033811, "balance_loss_clip": 1.02203298, "balance_loss_mlp": 1.03693771, "epoch": 0.7630843228618668, "flos": 21835160461440.0, "grad_norm": 1.8899814814698568, "language_loss": 0.81809127, "learning_rate": 5.603407910935662e-07, "loss": 0.83948243, "num_input_tokens_seen": 273764635, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.68359375, "step": 12692, "time_per_iteration": 2.4534926414489746 }, { "auxiliary_loss_clip": 0.01111093, "auxiliary_loss_mlp": 0.01029212, "balance_loss_clip": 1.01729131, "balance_loss_mlp": 1.03963149, "epoch": 0.7631444461145348, "flos": 12640520926080.0, "grad_norm": 2.369601501703622, "language_loss": 0.77191806, "learning_rate": 5.600704732514438e-07, "loss": 0.79332113, "num_input_tokens_seen": 273780115, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.71484375, "step": 12693, "time_per_iteration": 2.432563066482544 }, { "auxiliary_loss_clip": 0.01109243, "auxiliary_loss_mlp": 0.01031147, "balance_loss_clip": 1.01805186, "balance_loss_mlp": 1.03823495, "epoch": 0.7632045693672027, "flos": 16836610798080.0, "grad_norm": 2.122724176684194, "language_loss": 0.72849196, "learning_rate": 5.598002100115933e-07, "loss": 0.74989587, "num_input_tokens_seen": 273796605, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 12694, "time_per_iteration": 2.4453060626983643 }, { "auxiliary_loss_clip": 0.01104067, "auxiliary_loss_mlp": 0.01026598, "balance_loss_clip": 1.01408744, "balance_loss_mlp": 1.03595316, "epoch": 0.7632646926198707, "flos": 22017335264640.0, "grad_norm": 3.3447918747750465, "language_loss": 0.70866036, "learning_rate": 5.595300013842625e-07, "loss": 0.729967, "num_input_tokens_seen": 273816515, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 12695, "time_per_iteration": 2.503377676010132 }, { "auxiliary_loss_clip": 0.0110648, "auxiliary_loss_mlp": 0.01031948, "balance_loss_clip": 1.0200212, "balance_loss_mlp": 1.03691149, "epoch": 0.7633248158725388, "flos": 23114011357440.0, "grad_norm": 1.598093345458612, "language_loss": 0.72680116, "learning_rate": 5.592598473796985e-07, "loss": 0.7481854, "num_input_tokens_seen": 273837060, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 12696, "time_per_iteration": 2.518280029296875 }, { "auxiliary_loss_clip": 0.0110638, "auxiliary_loss_mlp": 0.01034645, "balance_loss_clip": 1.02190745, "balance_loss_mlp": 1.03633118, "epoch": 0.7633849391252067, "flos": 10889839952640.0, "grad_norm": 2.505322148567296, "language_loss": 0.71351039, "learning_rate": 5.589897480081453e-07, "loss": 0.73492056, "num_input_tokens_seen": 273853365, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 12697, "time_per_iteration": 2.4471824169158936 }, { "auxiliary_loss_clip": 0.01107082, "auxiliary_loss_mlp": 0.01031741, "balance_loss_clip": 1.01905727, "balance_loss_mlp": 1.03856647, "epoch": 0.7634450623778747, "flos": 20994168355200.0, "grad_norm": 2.239973054992421, "language_loss": 0.66793048, "learning_rate": 5.587197032798461e-07, "loss": 0.68931866, "num_input_tokens_seen": 273870750, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 12698, "time_per_iteration": 2.4336745738983154 }, { "auxiliary_loss_clip": 0.01107075, "auxiliary_loss_mlp": 0.01031707, "balance_loss_clip": 1.0190351, "balance_loss_mlp": 1.0368166, "epoch": 0.7635051856305426, "flos": 18882046776960.0, "grad_norm": 1.9987747700340734, "language_loss": 0.72351909, "learning_rate": 5.5844971320504e-07, "loss": 0.7449069, "num_input_tokens_seen": 273890890, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 12699, "time_per_iteration": 2.481543779373169 }, { "auxiliary_loss_clip": 0.01105131, "auxiliary_loss_mlp": 0.01035251, "balance_loss_clip": 1.02330565, "balance_loss_mlp": 1.03667021, "epoch": 0.7635653088832106, "flos": 34786989584640.0, "grad_norm": 3.8909199922778406, "language_loss": 0.73309481, "learning_rate": 5.581797777939648e-07, "loss": 0.7544986, "num_input_tokens_seen": 273914015, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.68359375, "step": 12700, "time_per_iteration": 2.592170238494873 }, { "auxiliary_loss_clip": 0.01105495, "auxiliary_loss_mlp": 0.01030204, "balance_loss_clip": 1.0184797, "balance_loss_mlp": 1.03539896, "epoch": 0.7636254321358785, "flos": 23178434400000.0, "grad_norm": 1.9768156816849798, "language_loss": 0.69024372, "learning_rate": 5.579098970568574e-07, "loss": 0.71160072, "num_input_tokens_seen": 273927415, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69921875, "step": 12701, "time_per_iteration": 2.448596954345703 }, { "auxiliary_loss_clip": 0.01107787, "auxiliary_loss_mlp": 0.01028659, "balance_loss_clip": 1.01661932, "balance_loss_mlp": 1.0379045, "epoch": 0.7636855553885465, "flos": 21325229032320.0, "grad_norm": 1.611995498071973, "language_loss": 0.64479846, "learning_rate": 5.576400710039508e-07, "loss": 0.66616297, "num_input_tokens_seen": 273946690, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69921875, "step": 12702, "time_per_iteration": 2.452971935272217 }, { "auxiliary_loss_clip": 0.01108344, "auxiliary_loss_mlp": 0.01032365, "balance_loss_clip": 1.01978254, "balance_loss_mlp": 1.03755295, "epoch": 0.7637456786412145, "flos": 28658079849600.0, "grad_norm": 2.117036591998041, "language_loss": 0.65644932, "learning_rate": 5.57370299645477e-07, "loss": 0.67785645, "num_input_tokens_seen": 273966870, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 12703, "time_per_iteration": 2.517857074737549 }, { "auxiliary_loss_clip": 0.01107011, "auxiliary_loss_mlp": 0.01027147, "balance_loss_clip": 1.01457024, "balance_loss_mlp": 1.03831649, "epoch": 0.7638058018938825, "flos": 21907269014400.0, "grad_norm": 2.3706758059960777, "language_loss": 0.83730668, "learning_rate": 5.571005829916668e-07, "loss": 0.8586483, "num_input_tokens_seen": 273986360, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 12704, "time_per_iteration": 2.4512906074523926 }, { "auxiliary_loss_clip": 0.01107884, "auxiliary_loss_mlp": 0.01032897, "balance_loss_clip": 1.02061307, "balance_loss_mlp": 1.0380435, "epoch": 0.7638659251465504, "flos": 29643899592960.0, "grad_norm": 1.802933711533832, "language_loss": 0.67858851, "learning_rate": 5.568309210527469e-07, "loss": 0.69999629, "num_input_tokens_seen": 274009745, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 12705, "time_per_iteration": 2.5307681560516357 }, { "auxiliary_loss_clip": 0.01104514, "auxiliary_loss_mlp": 0.01026333, "balance_loss_clip": 1.01383948, "balance_loss_mlp": 1.03636575, "epoch": 0.7639260483992184, "flos": 26141172929280.0, "grad_norm": 3.295556896540883, "language_loss": 0.74330628, "learning_rate": 5.565613138389427e-07, "loss": 0.76461482, "num_input_tokens_seen": 274028775, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 12706, "time_per_iteration": 2.4870026111602783 }, { "auxiliary_loss_clip": 0.01106341, "auxiliary_loss_mlp": 0.01031045, "balance_loss_clip": 1.01867115, "balance_loss_mlp": 1.03745067, "epoch": 0.7639861716518863, "flos": 20156695781760.0, "grad_norm": 1.8208776856369537, "language_loss": 0.78372014, "learning_rate": 5.562917613604781e-07, "loss": 0.805094, "num_input_tokens_seen": 274047520, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 12707, "time_per_iteration": 2.4545743465423584 }, { "auxiliary_loss_clip": 0.01104476, "auxiliary_loss_mlp": 0.01030331, "balance_loss_clip": 1.01772439, "balance_loss_mlp": 1.03540087, "epoch": 0.7640462949045543, "flos": 18583125793920.0, "grad_norm": 2.027591792330344, "language_loss": 0.79913867, "learning_rate": 5.560222636275751e-07, "loss": 0.82048672, "num_input_tokens_seen": 274065350, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 12708, "time_per_iteration": 2.4403462409973145 }, { "auxiliary_loss_clip": 0.01031552, "auxiliary_loss_mlp": 0.0100046, "balance_loss_clip": 0.999304, "balance_loss_mlp": 1.00878692, "epoch": 0.7641064181572224, "flos": 68321991646080.0, "grad_norm": 0.8311034819813584, "language_loss": 0.56416273, "learning_rate": 5.557528206504521e-07, "loss": 0.58448279, "num_input_tokens_seen": 274122315, "router_z_loss_clip": 0.01153564, "router_z_loss_mlp": 0.22851562, "step": 12709, "time_per_iteration": 3.1204833984375 }, { "auxiliary_loss_clip": 0.01109275, "auxiliary_loss_mlp": 0.01036652, "balance_loss_clip": 1.0229671, "balance_loss_mlp": 1.03685224, "epoch": 0.7641665414098903, "flos": 17968982031360.0, "grad_norm": 1.7382340697028256, "language_loss": 0.63512498, "learning_rate": 5.554834324393271e-07, "loss": 0.65658426, "num_input_tokens_seen": 274140555, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 12710, "time_per_iteration": 2.4245638847351074 }, { "auxiliary_loss_clip": 0.01109889, "auxiliary_loss_mlp": 0.01035798, "balance_loss_clip": 1.02225566, "balance_loss_mlp": 1.03825867, "epoch": 0.7642266646625583, "flos": 21252078984960.0, "grad_norm": 2.3237883058528985, "language_loss": 0.64687681, "learning_rate": 5.552140990044154e-07, "loss": 0.66833365, "num_input_tokens_seen": 274161125, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 12711, "time_per_iteration": 2.462616443634033 }, { "auxiliary_loss_clip": 0.01105186, "auxiliary_loss_mlp": 0.01032431, "balance_loss_clip": 1.0200212, "balance_loss_mlp": 1.03590727, "epoch": 0.7642867879152262, "flos": 22747794243840.0, "grad_norm": 1.5573205629715314, "language_loss": 0.72922105, "learning_rate": 5.549448203559293e-07, "loss": 0.75059718, "num_input_tokens_seen": 274180835, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 12712, "time_per_iteration": 3.9997010231018066 }, { "auxiliary_loss_clip": 0.01105322, "auxiliary_loss_mlp": 0.01030678, "balance_loss_clip": 1.01913238, "balance_loss_mlp": 1.03801012, "epoch": 0.7643469111678942, "flos": 23332132696320.0, "grad_norm": 1.522439515854961, "language_loss": 0.80299377, "learning_rate": 5.546755965040804e-07, "loss": 0.82435381, "num_input_tokens_seen": 274201190, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 12713, "time_per_iteration": 2.5185673236846924 }, { "auxiliary_loss_clip": 0.01109491, "auxiliary_loss_mlp": 0.0103015, "balance_loss_clip": 1.01633382, "balance_loss_mlp": 1.03852725, "epoch": 0.7644070344205621, "flos": 19857092440320.0, "grad_norm": 2.234466834773271, "language_loss": 0.83137143, "learning_rate": 5.544064274590776e-07, "loss": 0.85276783, "num_input_tokens_seen": 274217595, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.70703125, "step": 12714, "time_per_iteration": 2.4954848289489746 }, { "auxiliary_loss_clip": 0.0110927, "auxiliary_loss_mlp": 0.01037159, "balance_loss_clip": 1.02418339, "balance_loss_mlp": 1.03801227, "epoch": 0.7644671576732301, "flos": 22090628966400.0, "grad_norm": 1.5144973018451013, "language_loss": 0.72786415, "learning_rate": 5.541373132311287e-07, "loss": 0.74932849, "num_input_tokens_seen": 274237885, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 12715, "time_per_iteration": 3.8732709884643555 }, { "auxiliary_loss_clip": 0.01105405, "auxiliary_loss_mlp": 0.01024864, "balance_loss_clip": 1.012532, "balance_loss_mlp": 1.0357796, "epoch": 0.7645272809258981, "flos": 25481421872640.0, "grad_norm": 1.705857982836792, "language_loss": 0.63295078, "learning_rate": 5.538682538304376e-07, "loss": 0.65425354, "num_input_tokens_seen": 274258820, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 12716, "time_per_iteration": 2.5166897773742676 }, { "auxiliary_loss_clip": 0.0110942, "auxiliary_loss_mlp": 0.01034619, "balance_loss_clip": 1.02108312, "balance_loss_mlp": 1.03748727, "epoch": 0.7645874041785661, "flos": 21541877913600.0, "grad_norm": 1.5823261217675766, "language_loss": 0.79870665, "learning_rate": 5.535992492672068e-07, "loss": 0.82014704, "num_input_tokens_seen": 274278835, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 12717, "time_per_iteration": 3.8921802043914795 }, { "auxiliary_loss_clip": 0.01106581, "auxiliary_loss_mlp": 0.01032998, "balance_loss_clip": 1.02087474, "balance_loss_mlp": 1.03761983, "epoch": 0.764647527431234, "flos": 20630896156800.0, "grad_norm": 2.452786885778489, "language_loss": 0.6657899, "learning_rate": 5.53330299551638e-07, "loss": 0.6871857, "num_input_tokens_seen": 274297110, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 12718, "time_per_iteration": 2.459820508956909 }, { "auxiliary_loss_clip": 0.01102571, "auxiliary_loss_mlp": 0.01034554, "balance_loss_clip": 1.02316976, "balance_loss_mlp": 1.0355978, "epoch": 0.764707650683902, "flos": 21434074220160.0, "grad_norm": 2.3798925191185805, "language_loss": 0.77387083, "learning_rate": 5.530614046939286e-07, "loss": 0.79524213, "num_input_tokens_seen": 274315610, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.66796875, "step": 12719, "time_per_iteration": 2.465968608856201 }, { "auxiliary_loss_clip": 0.01106728, "auxiliary_loss_mlp": 0.01028092, "balance_loss_clip": 1.01484823, "balance_loss_mlp": 1.03643775, "epoch": 0.7647677739365699, "flos": 22711201263360.0, "grad_norm": 1.9785192370340274, "language_loss": 0.69781673, "learning_rate": 5.527925647042754e-07, "loss": 0.71916491, "num_input_tokens_seen": 274333975, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 12720, "time_per_iteration": 2.497296094894409 }, { "auxiliary_loss_clip": 0.01106714, "auxiliary_loss_mlp": 0.01034389, "balance_loss_clip": 1.02199101, "balance_loss_mlp": 1.03765988, "epoch": 0.7648278971892379, "flos": 21324115710720.0, "grad_norm": 2.068582588628615, "language_loss": 0.73959887, "learning_rate": 5.52523779592875e-07, "loss": 0.76100993, "num_input_tokens_seen": 274353695, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 12721, "time_per_iteration": 2.4713962078094482 }, { "auxiliary_loss_clip": 0.01107312, "auxiliary_loss_mlp": 0.01032191, "balance_loss_clip": 1.01917315, "balance_loss_mlp": 1.03738582, "epoch": 0.764888020441906, "flos": 20667345482880.0, "grad_norm": 2.0368558079605252, "language_loss": 0.73650533, "learning_rate": 5.522550493699163e-07, "loss": 0.75790036, "num_input_tokens_seen": 274371120, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 12722, "time_per_iteration": 2.48416805267334 }, { "auxiliary_loss_clip": 0.01107328, "auxiliary_loss_mlp": 0.0103656, "balance_loss_clip": 1.0239476, "balance_loss_mlp": 1.03786278, "epoch": 0.7649481436945739, "flos": 25082526360960.0, "grad_norm": 2.643342955906008, "language_loss": 0.74040306, "learning_rate": 5.519863740455912e-07, "loss": 0.76184189, "num_input_tokens_seen": 274389665, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 12723, "time_per_iteration": 2.488241195678711 }, { "auxiliary_loss_clip": 0.01106888, "auxiliary_loss_mlp": 0.01030885, "balance_loss_clip": 1.01761079, "balance_loss_mlp": 1.03525269, "epoch": 0.7650082669472419, "flos": 24900890261760.0, "grad_norm": 1.7925432967014128, "language_loss": 0.73203236, "learning_rate": 5.517177536300881e-07, "loss": 0.7534101, "num_input_tokens_seen": 274408750, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 12724, "time_per_iteration": 2.54494571685791 }, { "auxiliary_loss_clip": 0.01104719, "auxiliary_loss_mlp": 0.01026045, "balance_loss_clip": 1.01401126, "balance_loss_mlp": 1.03738463, "epoch": 0.7650683901999098, "flos": 14647388676480.0, "grad_norm": 2.5648298813501627, "language_loss": 0.84136415, "learning_rate": 5.514491881335935e-07, "loss": 0.86267179, "num_input_tokens_seen": 274424600, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 12725, "time_per_iteration": 2.4951138496398926 }, { "auxiliary_loss_clip": 0.01106488, "auxiliary_loss_mlp": 0.01034262, "balance_loss_clip": 1.02069616, "balance_loss_mlp": 1.03707266, "epoch": 0.7651285134525778, "flos": 26352434770560.0, "grad_norm": 1.8850238780573396, "language_loss": 0.77376258, "learning_rate": 5.511806775662901e-07, "loss": 0.79517019, "num_input_tokens_seen": 274443075, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.6953125, "step": 12726, "time_per_iteration": 2.5402755737304688 }, { "auxiliary_loss_clip": 0.01107521, "auxiliary_loss_mlp": 0.01032201, "balance_loss_clip": 1.01964283, "balance_loss_mlp": 1.03723407, "epoch": 0.7651886367052457, "flos": 26646866553600.0, "grad_norm": 1.6442487127968488, "language_loss": 0.70491624, "learning_rate": 5.509122219383615e-07, "loss": 0.72631347, "num_input_tokens_seen": 274463240, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 12727, "time_per_iteration": 2.517277240753174 }, { "auxiliary_loss_clip": 0.01101186, "auxiliary_loss_mlp": 0.01026772, "balance_loss_clip": 1.01467276, "balance_loss_mlp": 1.03478432, "epoch": 0.7652487599579137, "flos": 25702847262720.0, "grad_norm": 1.7228581439800865, "language_loss": 0.80043828, "learning_rate": 5.506438212599864e-07, "loss": 0.82171786, "num_input_tokens_seen": 274482750, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6640625, "step": 12728, "time_per_iteration": 2.492990493774414 }, { "auxiliary_loss_clip": 0.01107137, "auxiliary_loss_mlp": 0.01031418, "balance_loss_clip": 1.01839399, "balance_loss_mlp": 1.03682745, "epoch": 0.7653088832105817, "flos": 28585576247040.0, "grad_norm": 2.0196836561622504, "language_loss": 0.55371356, "learning_rate": 5.503754755413424e-07, "loss": 0.57509911, "num_input_tokens_seen": 274503545, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 12729, "time_per_iteration": 2.5285911560058594 }, { "auxiliary_loss_clip": 0.01105385, "auxiliary_loss_mlp": 0.01030058, "balance_loss_clip": 1.01703417, "balance_loss_mlp": 1.03618526, "epoch": 0.7653690064632497, "flos": 23366750428800.0, "grad_norm": 2.208580547421685, "language_loss": 0.77867883, "learning_rate": 5.501071847926055e-07, "loss": 0.80003327, "num_input_tokens_seen": 274523825, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69140625, "step": 12730, "time_per_iteration": 2.4752399921417236 }, { "auxiliary_loss_clip": 0.0111137, "auxiliary_loss_mlp": 0.01037323, "balance_loss_clip": 1.02436495, "balance_loss_mlp": 1.04030681, "epoch": 0.7654291297159176, "flos": 15773905992960.0, "grad_norm": 1.907675306482972, "language_loss": 0.69313943, "learning_rate": 5.498389490239495e-07, "loss": 0.71462631, "num_input_tokens_seen": 274541625, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 12731, "time_per_iteration": 2.4534800052642822 }, { "auxiliary_loss_clip": 0.01107424, "auxiliary_loss_mlp": 0.01031094, "balance_loss_clip": 1.01790929, "balance_loss_mlp": 1.03655863, "epoch": 0.7654892529685856, "flos": 18033800123520.0, "grad_norm": 5.412644916949564, "language_loss": 0.70531178, "learning_rate": 5.495707682455471e-07, "loss": 0.72669697, "num_input_tokens_seen": 274557580, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 12732, "time_per_iteration": 2.4365882873535156 }, { "auxiliary_loss_clip": 0.01110206, "auxiliary_loss_mlp": 0.01027957, "balance_loss_clip": 1.01464093, "balance_loss_mlp": 1.03823566, "epoch": 0.7655493762212535, "flos": 27236017428480.0, "grad_norm": 1.5419245677525286, "language_loss": 0.7829591, "learning_rate": 5.493026424675653e-07, "loss": 0.80434072, "num_input_tokens_seen": 274578135, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 12733, "time_per_iteration": 2.520181179046631 }, { "auxiliary_loss_clip": 0.0110548, "auxiliary_loss_mlp": 0.01030749, "balance_loss_clip": 1.0182085, "balance_loss_mlp": 1.03781486, "epoch": 0.7656094994739215, "flos": 20773964027520.0, "grad_norm": 6.205862076891944, "language_loss": 0.77629697, "learning_rate": 5.490345717001726e-07, "loss": 0.79765928, "num_input_tokens_seen": 274595655, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.67578125, "step": 12734, "time_per_iteration": 2.4403584003448486 }, { "auxiliary_loss_clip": 0.01109721, "auxiliary_loss_mlp": 0.01030916, "balance_loss_clip": 1.01698017, "balance_loss_mlp": 1.03744781, "epoch": 0.7656696227265896, "flos": 23039245198080.0, "grad_norm": 1.9303102746323313, "language_loss": 0.73239565, "learning_rate": 5.48766555953535e-07, "loss": 0.75380194, "num_input_tokens_seen": 274616305, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.72265625, "step": 12735, "time_per_iteration": 2.5034639835357666 }, { "auxiliary_loss_clip": 0.01107772, "auxiliary_loss_mlp": 0.01031194, "balance_loss_clip": 1.01843834, "balance_loss_mlp": 1.03687239, "epoch": 0.7657297459792575, "flos": 27525636789120.0, "grad_norm": 1.4849451350246863, "language_loss": 0.72569346, "learning_rate": 5.484985952378145e-07, "loss": 0.74708313, "num_input_tokens_seen": 274638110, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 12736, "time_per_iteration": 2.511068344116211 }, { "auxiliary_loss_clip": 0.01111282, "auxiliary_loss_mlp": 0.01035124, "balance_loss_clip": 1.02071166, "balance_loss_mlp": 1.03869569, "epoch": 0.7657898692319255, "flos": 17128456801920.0, "grad_norm": 1.9004359458727345, "language_loss": 0.77850211, "learning_rate": 5.482306895631728e-07, "loss": 0.79996622, "num_input_tokens_seen": 274656565, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7265625, "step": 12737, "time_per_iteration": 2.4391372203826904 }, { "auxiliary_loss_clip": 0.01106884, "auxiliary_loss_mlp": 0.0103503, "balance_loss_clip": 1.02204192, "balance_loss_mlp": 1.03714168, "epoch": 0.7658499924845934, "flos": 21465747037440.0, "grad_norm": 1.9630645657056573, "language_loss": 0.76870036, "learning_rate": 5.479628389397699e-07, "loss": 0.79011953, "num_input_tokens_seen": 274674215, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 12738, "time_per_iteration": 2.4360921382904053 }, { "auxiliary_loss_clip": 0.01109326, "auxiliary_loss_mlp": 0.01028239, "balance_loss_clip": 1.01491761, "balance_loss_mlp": 1.03747022, "epoch": 0.7659101157372614, "flos": 29496665744640.0, "grad_norm": 1.934170680053553, "language_loss": 0.62642586, "learning_rate": 5.476950433777603e-07, "loss": 0.64780152, "num_input_tokens_seen": 274693445, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 12739, "time_per_iteration": 2.524996042251587 }, { "auxiliary_loss_clip": 0.01108895, "auxiliary_loss_mlp": 0.01037166, "balance_loss_clip": 1.02334332, "balance_loss_mlp": 1.03880644, "epoch": 0.7659702389899293, "flos": 18551812112640.0, "grad_norm": 2.3452639549415517, "language_loss": 0.79038596, "learning_rate": 5.474273028873004e-07, "loss": 0.81184655, "num_input_tokens_seen": 274712815, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.703125, "step": 12740, "time_per_iteration": 2.427165985107422 }, { "auxiliary_loss_clip": 0.01107716, "auxiliary_loss_mlp": 0.01034684, "balance_loss_clip": 1.02116585, "balance_loss_mlp": 1.03744435, "epoch": 0.7660303622425974, "flos": 23549176627200.0, "grad_norm": 2.128125575753197, "language_loss": 0.65402287, "learning_rate": 5.471596174785429e-07, "loss": 0.67544687, "num_input_tokens_seen": 274732690, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 12741, "time_per_iteration": 2.497894048690796 }, { "auxiliary_loss_clip": 0.01106017, "auxiliary_loss_mlp": 0.01030583, "balance_loss_clip": 1.01783371, "balance_loss_mlp": 1.03755832, "epoch": 0.7660904854952653, "flos": 18916736336640.0, "grad_norm": 1.5315828242127913, "language_loss": 0.7610755, "learning_rate": 5.468919871616386e-07, "loss": 0.78244156, "num_input_tokens_seen": 274752460, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 12742, "time_per_iteration": 2.4482085704803467 }, { "auxiliary_loss_clip": 0.01105128, "auxiliary_loss_mlp": 0.01032353, "balance_loss_clip": 1.02046752, "balance_loss_mlp": 1.03718042, "epoch": 0.7661506087479333, "flos": 23147515768320.0, "grad_norm": 1.6331361890350427, "language_loss": 0.76853621, "learning_rate": 5.46624411946736e-07, "loss": 0.78991103, "num_input_tokens_seen": 274773070, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 12743, "time_per_iteration": 2.475857973098755 }, { "auxiliary_loss_clip": 0.01106524, "auxiliary_loss_mlp": 0.01032875, "balance_loss_clip": 1.02041149, "balance_loss_mlp": 1.03701174, "epoch": 0.7662107320006012, "flos": 17565776887680.0, "grad_norm": 1.9988026542076271, "language_loss": 0.74783915, "learning_rate": 5.463568918439805e-07, "loss": 0.76923317, "num_input_tokens_seen": 274790220, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 12744, "time_per_iteration": 2.435868501663208 }, { "auxiliary_loss_clip": 0.01108357, "auxiliary_loss_mlp": 0.01031852, "balance_loss_clip": 1.01892352, "balance_loss_mlp": 1.03786051, "epoch": 0.7662708552532692, "flos": 22303075956480.0, "grad_norm": 2.307285279575636, "language_loss": 0.71231937, "learning_rate": 5.460894268635181e-07, "loss": 0.73372144, "num_input_tokens_seen": 274805095, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 12745, "time_per_iteration": 2.4454615116119385 }, { "auxiliary_loss_clip": 0.0110701, "auxiliary_loss_mlp": 0.0103078, "balance_loss_clip": 1.01754737, "balance_loss_mlp": 1.03652716, "epoch": 0.7663309785059371, "flos": 15742053607680.0, "grad_norm": 2.2007639115186644, "language_loss": 0.76668894, "learning_rate": 5.458220170154896e-07, "loss": 0.78806686, "num_input_tokens_seen": 274821800, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 12746, "time_per_iteration": 2.430809259414673 }, { "auxiliary_loss_clip": 0.01031589, "auxiliary_loss_mlp": 0.01000158, "balance_loss_clip": 0.99906105, "balance_loss_mlp": 1.00884771, "epoch": 0.7663911017586051, "flos": 62163312514560.0, "grad_norm": 0.6682299937584485, "language_loss": 0.56811595, "learning_rate": 5.455546623100362e-07, "loss": 0.58843338, "num_input_tokens_seen": 274886970, "router_z_loss_clip": 0.01098633, "router_z_loss_mlp": 0.22753906, "step": 12747, "time_per_iteration": 3.1440093517303467 }, { "auxiliary_loss_clip": 0.01103986, "auxiliary_loss_mlp": 0.01034186, "balance_loss_clip": 1.02301645, "balance_loss_mlp": 1.0359304, "epoch": 0.7664512250112732, "flos": 26506025326080.0, "grad_norm": 1.6540192218945058, "language_loss": 0.7245993, "learning_rate": 5.452873627572956e-07, "loss": 0.74598104, "num_input_tokens_seen": 274907240, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6796875, "step": 12748, "time_per_iteration": 2.507643699645996 }, { "auxiliary_loss_clip": 0.01104377, "auxiliary_loss_mlp": 0.0103255, "balance_loss_clip": 1.01946628, "balance_loss_mlp": 1.03565383, "epoch": 0.7665113482639411, "flos": 16249542912000.0, "grad_norm": 1.7472776621135202, "language_loss": 0.6960929, "learning_rate": 5.450201183674052e-07, "loss": 0.71746218, "num_input_tokens_seen": 274924650, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 12749, "time_per_iteration": 2.429548740386963 }, { "auxiliary_loss_clip": 0.01107743, "auxiliary_loss_mlp": 0.01030038, "balance_loss_clip": 1.01661563, "balance_loss_mlp": 1.03731835, "epoch": 0.7665714715166091, "flos": 27197880163200.0, "grad_norm": 1.743883943789729, "language_loss": 0.73626888, "learning_rate": 5.447529291504967e-07, "loss": 0.75764668, "num_input_tokens_seen": 274944550, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.70703125, "step": 12750, "time_per_iteration": 2.501919746398926 }, { "auxiliary_loss_clip": 0.01102947, "auxiliary_loss_mlp": 0.01028614, "balance_loss_clip": 1.01641297, "balance_loss_mlp": 1.03554177, "epoch": 0.766631594769277, "flos": 21067785279360.0, "grad_norm": 2.629688574172689, "language_loss": 0.76015997, "learning_rate": 5.444857951167026e-07, "loss": 0.7814756, "num_input_tokens_seen": 274961330, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 12751, "time_per_iteration": 2.450488567352295 }, { "auxiliary_loss_clip": 0.01107057, "auxiliary_loss_mlp": 0.0103882, "balance_loss_clip": 1.02552247, "balance_loss_mlp": 1.03813994, "epoch": 0.766691718021945, "flos": 24097963593600.0, "grad_norm": 2.7324464063847667, "language_loss": 0.61384892, "learning_rate": 5.442187162761537e-07, "loss": 0.63530767, "num_input_tokens_seen": 274981655, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6875, "step": 12752, "time_per_iteration": 2.473870038986206 }, { "auxiliary_loss_clip": 0.01107546, "auxiliary_loss_mlp": 0.01030297, "balance_loss_clip": 1.01708245, "balance_loss_mlp": 1.03703034, "epoch": 0.7667518412746129, "flos": 23440654661760.0, "grad_norm": 2.516322480712095, "language_loss": 0.69367027, "learning_rate": 5.439516926389767e-07, "loss": 0.71504867, "num_input_tokens_seen": 274999970, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 12753, "time_per_iteration": 5.414042711257935 }, { "auxiliary_loss_clip": 0.01106264, "auxiliary_loss_mlp": 0.01038592, "balance_loss_clip": 1.02575898, "balance_loss_mlp": 1.03713489, "epoch": 0.766811964527281, "flos": 18148786536960.0, "grad_norm": 2.2148050641743393, "language_loss": 0.62708509, "learning_rate": 5.436847242152971e-07, "loss": 0.6485337, "num_input_tokens_seen": 275015805, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 12754, "time_per_iteration": 2.442399501800537 }, { "auxiliary_loss_clip": 0.01105957, "auxiliary_loss_mlp": 0.01030155, "balance_loss_clip": 1.01809645, "balance_loss_mlp": 1.03812957, "epoch": 0.7668720877799489, "flos": 19536051657600.0, "grad_norm": 3.4207847161518976, "language_loss": 0.8026011, "learning_rate": 5.434178110152401e-07, "loss": 0.82396221, "num_input_tokens_seen": 275031810, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 12755, "time_per_iteration": 2.428724765777588 }, { "auxiliary_loss_clip": 0.01105427, "auxiliary_loss_mlp": 0.01031037, "balance_loss_clip": 1.01875257, "balance_loss_mlp": 1.03688407, "epoch": 0.7669322110326169, "flos": 22674320974080.0, "grad_norm": 3.843403613884582, "language_loss": 0.70506072, "learning_rate": 5.431509530489242e-07, "loss": 0.72642541, "num_input_tokens_seen": 275049325, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 12756, "time_per_iteration": 3.8623111248016357 }, { "auxiliary_loss_clip": 0.01106599, "auxiliary_loss_mlp": 0.01035811, "balance_loss_clip": 1.02372909, "balance_loss_mlp": 1.03748608, "epoch": 0.7669923342852848, "flos": 26469396432000.0, "grad_norm": 1.6581712441816994, "language_loss": 0.70392448, "learning_rate": 5.428841503264706e-07, "loss": 0.72534859, "num_input_tokens_seen": 275070865, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69140625, "step": 12757, "time_per_iteration": 2.519449234008789 }, { "auxiliary_loss_clip": 0.01110004, "auxiliary_loss_mlp": 0.01039129, "balance_loss_clip": 1.0257417, "balance_loss_mlp": 1.03956831, "epoch": 0.7670524575379528, "flos": 22856136641280.0, "grad_norm": 2.051266587653075, "language_loss": 0.76093107, "learning_rate": 5.426174028579955e-07, "loss": 0.78242242, "num_input_tokens_seen": 275088015, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.703125, "step": 12758, "time_per_iteration": 3.8763937950134277 }, { "auxiliary_loss_clip": 0.01103957, "auxiliary_loss_mlp": 0.01039879, "balance_loss_clip": 1.02736759, "balance_loss_mlp": 1.0365169, "epoch": 0.7671125807906207, "flos": 22452141398400.0, "grad_norm": 1.6728925944650637, "language_loss": 0.76189613, "learning_rate": 5.423507106536156e-07, "loss": 0.78333449, "num_input_tokens_seen": 275106975, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.67578125, "step": 12759, "time_per_iteration": 2.480602741241455 }, { "auxiliary_loss_clip": 0.01103666, "auxiliary_loss_mlp": 0.01029493, "balance_loss_clip": 1.0172081, "balance_loss_mlp": 1.03377199, "epoch": 0.7671727040432887, "flos": 35371543518720.0, "grad_norm": 2.541535382327769, "language_loss": 0.68791234, "learning_rate": 5.420840737234425e-07, "loss": 0.70924401, "num_input_tokens_seen": 275129560, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 12760, "time_per_iteration": 2.5681989192962646 }, { "auxiliary_loss_clip": 0.01107515, "auxiliary_loss_mlp": 0.01033536, "balance_loss_clip": 1.02007759, "balance_loss_mlp": 1.03836739, "epoch": 0.7672328272959568, "flos": 22494947431680.0, "grad_norm": 1.7155750701621988, "language_loss": 0.79222524, "learning_rate": 5.418174920775871e-07, "loss": 0.81363571, "num_input_tokens_seen": 275151180, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.69140625, "step": 12761, "time_per_iteration": 2.495936155319214 }, { "auxiliary_loss_clip": 0.01102931, "auxiliary_loss_mlp": 0.01032486, "balance_loss_clip": 1.02029729, "balance_loss_mlp": 1.03587282, "epoch": 0.7672929505486247, "flos": 22815557251200.0, "grad_norm": 1.9956957834337712, "language_loss": 0.66217369, "learning_rate": 5.415509657261589e-07, "loss": 0.68352783, "num_input_tokens_seen": 275170605, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 12762, "time_per_iteration": 2.4504220485687256 }, { "auxiliary_loss_clip": 0.0110829, "auxiliary_loss_mlp": 0.01029277, "balance_loss_clip": 1.01631868, "balance_loss_mlp": 1.03709674, "epoch": 0.7673530738012927, "flos": 20338834671360.0, "grad_norm": 2.0686062771120874, "language_loss": 0.74072188, "learning_rate": 5.412844946792639e-07, "loss": 0.76209754, "num_input_tokens_seen": 275188750, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 12763, "time_per_iteration": 2.4663636684417725 }, { "auxiliary_loss_clip": 0.0110726, "auxiliary_loss_mlp": 0.01033346, "balance_loss_clip": 1.02093649, "balance_loss_mlp": 1.03886771, "epoch": 0.7674131970539606, "flos": 34933576988160.0, "grad_norm": 8.28158919857673, "language_loss": 0.70739615, "learning_rate": 5.410180789470067e-07, "loss": 0.7288022, "num_input_tokens_seen": 275211365, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 12764, "time_per_iteration": 2.5864672660827637 }, { "auxiliary_loss_clip": 0.01104907, "auxiliary_loss_mlp": 0.0102986, "balance_loss_clip": 1.01755166, "balance_loss_mlp": 1.03695476, "epoch": 0.7674733203066286, "flos": 28328850766080.0, "grad_norm": 1.710179196803219, "language_loss": 0.69568348, "learning_rate": 5.40751718539491e-07, "loss": 0.71703112, "num_input_tokens_seen": 275231670, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 12765, "time_per_iteration": 2.513639450073242 }, { "auxiliary_loss_clip": 0.01101556, "auxiliary_loss_mlp": 0.0102734, "balance_loss_clip": 1.01602745, "balance_loss_mlp": 1.03339744, "epoch": 0.7675334435592965, "flos": 16289727252480.0, "grad_norm": 2.0631431878769524, "language_loss": 0.60985291, "learning_rate": 5.404854134668162e-07, "loss": 0.6311419, "num_input_tokens_seen": 275249425, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6796875, "step": 12766, "time_per_iteration": 2.4449634552001953 }, { "auxiliary_loss_clip": 0.01031665, "auxiliary_loss_mlp": 0.00999962, "balance_loss_clip": 0.99886531, "balance_loss_mlp": 1.00906646, "epoch": 0.7675935668119646, "flos": 64826232220800.0, "grad_norm": 0.7387088039398868, "language_loss": 0.60764027, "learning_rate": 5.402191637390803e-07, "loss": 0.62795651, "num_input_tokens_seen": 275312485, "router_z_loss_clip": 0.01098633, "router_z_loss_mlp": 0.2265625, "step": 12767, "time_per_iteration": 3.2104086875915527 }, { "auxiliary_loss_clip": 0.0110439, "auxiliary_loss_mlp": 0.01026588, "balance_loss_clip": 1.01488137, "balance_loss_mlp": 1.03691685, "epoch": 0.7676536900646325, "flos": 22675398382080.0, "grad_norm": 1.8217125959887566, "language_loss": 0.69534463, "learning_rate": 5.399529693663801e-07, "loss": 0.71665442, "num_input_tokens_seen": 275331680, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.67578125, "step": 12768, "time_per_iteration": 2.4551925659179688 }, { "auxiliary_loss_clip": 0.01113914, "auxiliary_loss_mlp": 0.01034458, "balance_loss_clip": 1.02092767, "balance_loss_mlp": 1.04092407, "epoch": 0.7677138133173005, "flos": 26939682224640.0, "grad_norm": 2.2154791027852982, "language_loss": 0.70659924, "learning_rate": 5.3968683035881e-07, "loss": 0.72808295, "num_input_tokens_seen": 275351615, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 12769, "time_per_iteration": 2.5156867504119873 }, { "auxiliary_loss_clip": 0.01109342, "auxiliary_loss_mlp": 0.01029622, "balance_loss_clip": 1.01715255, "balance_loss_mlp": 1.03805101, "epoch": 0.7677739365699684, "flos": 23799545400960.0, "grad_norm": 1.9700777912185772, "language_loss": 0.80273271, "learning_rate": 5.394207467264611e-07, "loss": 0.82412237, "num_input_tokens_seen": 275368815, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 12770, "time_per_iteration": 2.4788782596588135 }, { "auxiliary_loss_clip": 0.01104169, "auxiliary_loss_mlp": 0.01031873, "balance_loss_clip": 1.01997042, "balance_loss_mlp": 1.03710699, "epoch": 0.7678340598226364, "flos": 34455497944320.0, "grad_norm": 1.766264786594443, "language_loss": 0.78722191, "learning_rate": 5.391547184794245e-07, "loss": 0.80858231, "num_input_tokens_seen": 275389345, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 12771, "time_per_iteration": 2.620211124420166 }, { "auxiliary_loss_clip": 0.01104778, "auxiliary_loss_mlp": 0.01031855, "balance_loss_clip": 1.01919496, "balance_loss_mlp": 1.03578639, "epoch": 0.7678941830753043, "flos": 23841740903040.0, "grad_norm": 1.3848264548330529, "language_loss": 0.68059623, "learning_rate": 5.388887456277876e-07, "loss": 0.70196259, "num_input_tokens_seen": 275411240, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 12772, "time_per_iteration": 2.552554130554199 }, { "auxiliary_loss_clip": 0.01104345, "auxiliary_loss_mlp": 0.01028199, "balance_loss_clip": 1.01657057, "balance_loss_mlp": 1.03798127, "epoch": 0.7679543063279723, "flos": 25410929431680.0, "grad_norm": 1.7538176871028768, "language_loss": 0.73673654, "learning_rate": 5.386228281816349e-07, "loss": 0.75806195, "num_input_tokens_seen": 275432010, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6640625, "step": 12773, "time_per_iteration": 2.497753381729126 }, { "auxiliary_loss_clip": 0.011031, "auxiliary_loss_mlp": 0.01027633, "balance_loss_clip": 1.01620078, "balance_loss_mlp": 1.03660214, "epoch": 0.7680144295806404, "flos": 27962382257280.0, "grad_norm": 1.6618766252116723, "language_loss": 0.81407893, "learning_rate": 5.383569661510512e-07, "loss": 0.83538628, "num_input_tokens_seen": 275453710, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6640625, "step": 12774, "time_per_iteration": 2.5758957862854004 }, { "auxiliary_loss_clip": 0.0110704, "auxiliary_loss_mlp": 0.01032919, "balance_loss_clip": 1.02027106, "balance_loss_mlp": 1.03904045, "epoch": 0.7680745528333083, "flos": 20412810731520.0, "grad_norm": 1.5830111640863527, "language_loss": 0.69923973, "learning_rate": 5.380911595461177e-07, "loss": 0.72063935, "num_input_tokens_seen": 275472915, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 12775, "time_per_iteration": 2.455981969833374 }, { "auxiliary_loss_clip": 0.0103191, "auxiliary_loss_mlp": 0.01000805, "balance_loss_clip": 0.99972594, "balance_loss_mlp": 1.00901175, "epoch": 0.7681346760859763, "flos": 68401103351040.0, "grad_norm": 0.7669077439930478, "language_loss": 0.56842476, "learning_rate": 5.378254083769147e-07, "loss": 0.58875191, "num_input_tokens_seen": 275534785, "router_z_loss_clip": 0.01080322, "router_z_loss_mlp": 0.22949219, "step": 12776, "time_per_iteration": 3.197373628616333 }, { "auxiliary_loss_clip": 0.01104493, "auxiliary_loss_mlp": 0.01034365, "balance_loss_clip": 1.02242661, "balance_loss_mlp": 1.03656387, "epoch": 0.7681947993386442, "flos": 21251468453760.0, "grad_norm": 3.508147369746401, "language_loss": 0.73878694, "learning_rate": 5.375597126535188e-07, "loss": 0.76017553, "num_input_tokens_seen": 275553205, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 12777, "time_per_iteration": 2.4960248470306396 }, { "auxiliary_loss_clip": 0.01106234, "auxiliary_loss_mlp": 0.0103453, "balance_loss_clip": 1.02257907, "balance_loss_mlp": 1.03798378, "epoch": 0.7682549225913122, "flos": 21397696721280.0, "grad_norm": 2.0939003097481956, "language_loss": 0.70406264, "learning_rate": 5.372940723860043e-07, "loss": 0.7254703, "num_input_tokens_seen": 275571490, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.68359375, "step": 12778, "time_per_iteration": 2.4934117794036865 }, { "auxiliary_loss_clip": 0.01105577, "auxiliary_loss_mlp": 0.01033049, "balance_loss_clip": 1.02120566, "balance_loss_mlp": 1.03784561, "epoch": 0.7683150458439801, "flos": 23038921975680.0, "grad_norm": 1.8899688171669904, "language_loss": 0.70586801, "learning_rate": 5.37028487584446e-07, "loss": 0.72725427, "num_input_tokens_seen": 275589665, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.67578125, "step": 12779, "time_per_iteration": 2.4998297691345215 }, { "auxiliary_loss_clip": 0.01107935, "auxiliary_loss_mlp": 0.01029135, "balance_loss_clip": 1.01668382, "balance_loss_mlp": 1.03930402, "epoch": 0.7683751690966482, "flos": 67332397996800.0, "grad_norm": 1.743102948750131, "language_loss": 0.58623815, "learning_rate": 5.367629582589133e-07, "loss": 0.60760891, "num_input_tokens_seen": 275615605, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6875, "step": 12780, "time_per_iteration": 2.8719587326049805 }, { "auxiliary_loss_clip": 0.01110604, "auxiliary_loss_mlp": 0.01040126, "balance_loss_clip": 1.02568352, "balance_loss_mlp": 1.03759742, "epoch": 0.7684352923493161, "flos": 21798890703360.0, "grad_norm": 4.62477500587267, "language_loss": 0.68485224, "learning_rate": 5.364974844194759e-07, "loss": 0.70635951, "num_input_tokens_seen": 275634965, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.7265625, "step": 12781, "time_per_iteration": 2.4461443424224854 }, { "auxiliary_loss_clip": 0.01103466, "auxiliary_loss_mlp": 0.01029235, "balance_loss_clip": 1.01747537, "balance_loss_mlp": 1.03493142, "epoch": 0.7684954156019841, "flos": 25847603072640.0, "grad_norm": 1.604087733522928, "language_loss": 0.79286945, "learning_rate": 5.362320660762016e-07, "loss": 0.81419647, "num_input_tokens_seen": 275655785, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 12782, "time_per_iteration": 2.5087337493896484 }, { "auxiliary_loss_clip": 0.01105866, "auxiliary_loss_mlp": 0.01028565, "balance_loss_clip": 1.01602435, "balance_loss_mlp": 1.0363518, "epoch": 0.768555538854652, "flos": 25447378757760.0, "grad_norm": 1.9381173431521739, "language_loss": 0.67178732, "learning_rate": 5.35966703239153e-07, "loss": 0.69313169, "num_input_tokens_seen": 275676160, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 12783, "time_per_iteration": 2.494457483291626 }, { "auxiliary_loss_clip": 0.01106705, "auxiliary_loss_mlp": 0.01034981, "balance_loss_clip": 1.02190399, "balance_loss_mlp": 1.03747618, "epoch": 0.76861566210732, "flos": 19646369303040.0, "grad_norm": 1.9880421870932252, "language_loss": 0.693515, "learning_rate": 5.357013959183938e-07, "loss": 0.71493185, "num_input_tokens_seen": 275695660, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69140625, "step": 12784, "time_per_iteration": 2.4975080490112305 }, { "auxiliary_loss_clip": 0.011048, "auxiliary_loss_mlp": 0.0102689, "balance_loss_clip": 1.0155952, "balance_loss_mlp": 1.03653824, "epoch": 0.7686757853599879, "flos": 22419032037120.0, "grad_norm": 1.8801638836801329, "language_loss": 0.80272591, "learning_rate": 5.354361441239843e-07, "loss": 0.8240428, "num_input_tokens_seen": 275714025, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.68359375, "step": 12785, "time_per_iteration": 2.4616446495056152 }, { "auxiliary_loss_clip": 0.01106228, "auxiliary_loss_mlp": 0.01031434, "balance_loss_clip": 1.01796889, "balance_loss_mlp": 1.03674114, "epoch": 0.768735908612656, "flos": 47774262453120.0, "grad_norm": 2.3010650396515184, "language_loss": 0.77328557, "learning_rate": 5.351709478659836e-07, "loss": 0.79466218, "num_input_tokens_seen": 275737300, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6953125, "step": 12786, "time_per_iteration": 2.7144341468811035 }, { "auxiliary_loss_clip": 0.01103637, "auxiliary_loss_mlp": 0.01032221, "balance_loss_clip": 1.02032971, "balance_loss_mlp": 1.0349946, "epoch": 0.7687960318653239, "flos": 30263179000320.0, "grad_norm": 2.22343687289191, "language_loss": 0.59049463, "learning_rate": 5.349058071544468e-07, "loss": 0.61185324, "num_input_tokens_seen": 275757895, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 12787, "time_per_iteration": 2.5293021202087402 }, { "auxiliary_loss_clip": 0.01102563, "auxiliary_loss_mlp": 0.01029565, "balance_loss_clip": 1.0170244, "balance_loss_mlp": 1.03486979, "epoch": 0.7688561551179919, "flos": 19573434737280.0, "grad_norm": 1.6683201290214418, "language_loss": 0.76128572, "learning_rate": 5.346407219994292e-07, "loss": 0.78260708, "num_input_tokens_seen": 275776745, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 12788, "time_per_iteration": 2.465127468109131 }, { "auxiliary_loss_clip": 0.01106499, "auxiliary_loss_mlp": 0.01038447, "balance_loss_clip": 1.02547121, "balance_loss_mlp": 1.03679442, "epoch": 0.7689162783706599, "flos": 22783776693120.0, "grad_norm": 2.026850311252214, "language_loss": 0.6658169, "learning_rate": 5.343756924109821e-07, "loss": 0.68726635, "num_input_tokens_seen": 275797205, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 12789, "time_per_iteration": 2.465712785720825 }, { "auxiliary_loss_clip": 0.0110828, "auxiliary_loss_mlp": 0.01036027, "balance_loss_clip": 1.02223396, "balance_loss_mlp": 1.03842878, "epoch": 0.7689764016233278, "flos": 34204195416960.0, "grad_norm": 1.6402268343911888, "language_loss": 0.69039357, "learning_rate": 5.341107183991553e-07, "loss": 0.7118367, "num_input_tokens_seen": 275817935, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.69921875, "step": 12790, "time_per_iteration": 2.6017909049987793 }, { "auxiliary_loss_clip": 0.01104418, "auxiliary_loss_mlp": 0.01032523, "balance_loss_clip": 1.01999462, "balance_loss_mlp": 1.0352385, "epoch": 0.7690365248759958, "flos": 17274469587840.0, "grad_norm": 2.1410138172500943, "language_loss": 0.68682313, "learning_rate": 5.338457999739969e-07, "loss": 0.70819253, "num_input_tokens_seen": 275837145, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 12791, "time_per_iteration": 2.4481565952301025 }, { "auxiliary_loss_clip": 0.01105023, "auxiliary_loss_mlp": 0.01033826, "balance_loss_clip": 1.02199459, "balance_loss_mlp": 1.03713191, "epoch": 0.7690966481286637, "flos": 18223157646720.0, "grad_norm": 1.7567573607149125, "language_loss": 0.79745686, "learning_rate": 5.335809371455526e-07, "loss": 0.81884527, "num_input_tokens_seen": 275855705, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 12792, "time_per_iteration": 2.4370648860931396 }, { "auxiliary_loss_clip": 0.01114474, "auxiliary_loss_mlp": 0.01034954, "balance_loss_clip": 1.02125108, "balance_loss_mlp": 1.04181361, "epoch": 0.7691567713813318, "flos": 21537568281600.0, "grad_norm": 5.1637716903722195, "language_loss": 0.7279129, "learning_rate": 5.333161299238673e-07, "loss": 0.74940717, "num_input_tokens_seen": 275873930, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 12793, "time_per_iteration": 2.436783790588379 }, { "auxiliary_loss_clip": 0.01107942, "auxiliary_loss_mlp": 0.01031891, "balance_loss_clip": 1.019279, "balance_loss_mlp": 1.03802276, "epoch": 0.7692168946339997, "flos": 39379999720320.0, "grad_norm": 1.8784688028372214, "language_loss": 0.63377452, "learning_rate": 5.330513783189803e-07, "loss": 0.65517282, "num_input_tokens_seen": 275895895, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 12794, "time_per_iteration": 2.6199474334716797 }, { "auxiliary_loss_clip": 0.01110972, "auxiliary_loss_mlp": 0.01033785, "balance_loss_clip": 1.02110136, "balance_loss_mlp": 1.03931069, "epoch": 0.7692770178866677, "flos": 25009950931200.0, "grad_norm": 2.768127306929991, "language_loss": 0.76499456, "learning_rate": 5.327866823409319e-07, "loss": 0.78644216, "num_input_tokens_seen": 275917825, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 12795, "time_per_iteration": 4.031525135040283 }, { "auxiliary_loss_clip": 0.01106979, "auxiliary_loss_mlp": 0.01029543, "balance_loss_clip": 1.0168649, "balance_loss_mlp": 1.03606105, "epoch": 0.7693371411393356, "flos": 24716273333760.0, "grad_norm": 1.8598113898784217, "language_loss": 0.72056389, "learning_rate": 5.325220419997601e-07, "loss": 0.74192911, "num_input_tokens_seen": 275937890, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 12796, "time_per_iteration": 2.4811668395996094 }, { "auxiliary_loss_clip": 0.0110567, "auxiliary_loss_mlp": 0.01028747, "balance_loss_clip": 1.01582479, "balance_loss_mlp": 1.03625023, "epoch": 0.7693972643920036, "flos": 15924803028480.0, "grad_norm": 1.9128370429852544, "language_loss": 0.64789379, "learning_rate": 5.32257457305499e-07, "loss": 0.66923791, "num_input_tokens_seen": 275954495, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 12797, "time_per_iteration": 2.4452390670776367 }, { "auxiliary_loss_clip": 0.01107978, "auxiliary_loss_mlp": 0.0103963, "balance_loss_clip": 1.02584898, "balance_loss_mlp": 1.03742266, "epoch": 0.7694573876446715, "flos": 25405901527680.0, "grad_norm": 1.920618026733893, "language_loss": 0.91639197, "learning_rate": 5.319929282681823e-07, "loss": 0.937868, "num_input_tokens_seen": 275972395, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.70703125, "step": 12798, "time_per_iteration": 3.8676106929779053 }, { "auxiliary_loss_clip": 0.01104782, "auxiliary_loss_mlp": 0.01022632, "balance_loss_clip": 1.01025248, "balance_loss_mlp": 1.03539252, "epoch": 0.7695175108973396, "flos": 16654220513280.0, "grad_norm": 2.267271776762225, "language_loss": 0.82673156, "learning_rate": 5.317284548978418e-07, "loss": 0.84800565, "num_input_tokens_seen": 275989020, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 12799, "time_per_iteration": 2.421858072280884 }, { "auxiliary_loss_clip": 0.01108523, "auxiliary_loss_mlp": 0.01029978, "balance_loss_clip": 1.01675129, "balance_loss_mlp": 1.03842437, "epoch": 0.7695776341500075, "flos": 13626520237440.0, "grad_norm": 3.217356862607766, "language_loss": 0.78073734, "learning_rate": 5.314640372045045e-07, "loss": 0.80212229, "num_input_tokens_seen": 276006525, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69921875, "step": 12800, "time_per_iteration": 3.884131669998169 }, { "auxiliary_loss_clip": 0.01111534, "auxiliary_loss_mlp": 0.01029119, "balance_loss_clip": 1.01502275, "balance_loss_mlp": 1.03718138, "epoch": 0.7696377574026755, "flos": 24276690691200.0, "grad_norm": 1.622616315108107, "language_loss": 0.83616638, "learning_rate": 5.31199675198198e-07, "loss": 0.85757297, "num_input_tokens_seen": 276027130, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7421875, "step": 12801, "time_per_iteration": 2.4757728576660156 }, { "auxiliary_loss_clip": 0.01107426, "auxiliary_loss_mlp": 0.01032077, "balance_loss_clip": 1.01923227, "balance_loss_mlp": 1.03876948, "epoch": 0.7696978806553435, "flos": 20923137210240.0, "grad_norm": 2.0927593654311836, "language_loss": 0.7189796, "learning_rate": 5.30935368888947e-07, "loss": 0.74037468, "num_input_tokens_seen": 276045715, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 12802, "time_per_iteration": 2.4854214191436768 }, { "auxiliary_loss_clip": 0.01105804, "auxiliary_loss_mlp": 0.01033704, "balance_loss_clip": 1.02147865, "balance_loss_mlp": 1.03720784, "epoch": 0.7697580039080114, "flos": 22929609911040.0, "grad_norm": 2.0919796944671303, "language_loss": 0.76172173, "learning_rate": 5.306711182867747e-07, "loss": 0.78311682, "num_input_tokens_seen": 276065375, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 12803, "time_per_iteration": 2.4776694774627686 }, { "auxiliary_loss_clip": 0.01032612, "auxiliary_loss_mlp": 0.01001782, "balance_loss_clip": 1.00070941, "balance_loss_mlp": 1.0097363, "epoch": 0.7698181271606794, "flos": 68717654933760.0, "grad_norm": 0.7545876062987276, "language_loss": 0.55833215, "learning_rate": 5.304069234017001e-07, "loss": 0.5786761, "num_input_tokens_seen": 276131405, "router_z_loss_clip": 0.01074219, "router_z_loss_mlp": 0.22851562, "step": 12804, "time_per_iteration": 3.104884386062622 }, { "auxiliary_loss_clip": 0.01032291, "auxiliary_loss_mlp": 0.01005289, "balance_loss_clip": 1.00428188, "balance_loss_mlp": 1.00954723, "epoch": 0.7698782504133473, "flos": 67409716999680.0, "grad_norm": 0.7584973243601382, "language_loss": 0.53988922, "learning_rate": 5.301427842437429e-07, "loss": 0.56026506, "num_input_tokens_seen": 276200755, "router_z_loss_clip": 0.0100708, "router_z_loss_mlp": 0.22753906, "step": 12805, "time_per_iteration": 3.2241125106811523 }, { "auxiliary_loss_clip": 0.0111152, "auxiliary_loss_mlp": 0.01036072, "balance_loss_clip": 1.02343535, "balance_loss_mlp": 1.04109859, "epoch": 0.7699383736660154, "flos": 22488842119680.0, "grad_norm": 1.896683131347672, "language_loss": 0.72772461, "learning_rate": 5.298787008229187e-07, "loss": 0.74920052, "num_input_tokens_seen": 276217880, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 12806, "time_per_iteration": 2.462291717529297 }, { "auxiliary_loss_clip": 0.01106698, "auxiliary_loss_mlp": 0.01038412, "balance_loss_clip": 1.02572763, "balance_loss_mlp": 1.03687811, "epoch": 0.7699984969186833, "flos": 21539723097600.0, "grad_norm": 1.8181537770052063, "language_loss": 0.75186378, "learning_rate": 5.296146731492408e-07, "loss": 0.77331483, "num_input_tokens_seen": 276234810, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 12807, "time_per_iteration": 2.4479377269744873 }, { "auxiliary_loss_clip": 0.01110984, "auxiliary_loss_mlp": 0.01035036, "balance_loss_clip": 1.02156508, "balance_loss_mlp": 1.03818679, "epoch": 0.7700586201713513, "flos": 21719096640000.0, "grad_norm": 2.189944175647968, "language_loss": 0.8016305, "learning_rate": 5.293507012327218e-07, "loss": 0.82309067, "num_input_tokens_seen": 276252850, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 12808, "time_per_iteration": 2.474379301071167 }, { "auxiliary_loss_clip": 0.01110956, "auxiliary_loss_mlp": 0.01034929, "balance_loss_clip": 1.02150035, "balance_loss_mlp": 1.03876507, "epoch": 0.7701187434240192, "flos": 27856015107840.0, "grad_norm": 2.61595131915576, "language_loss": 0.79318678, "learning_rate": 5.290867850833718e-07, "loss": 0.81464565, "num_input_tokens_seen": 276272525, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 12809, "time_per_iteration": 2.540055751800537 }, { "auxiliary_loss_clip": 0.01103194, "auxiliary_loss_mlp": 0.01023523, "balance_loss_clip": 1.01195407, "balance_loss_mlp": 1.03664446, "epoch": 0.7701788666766872, "flos": 28621307301120.0, "grad_norm": 1.468248478732233, "language_loss": 0.70498294, "learning_rate": 5.288229247111993e-07, "loss": 0.72625017, "num_input_tokens_seen": 276294210, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6640625, "step": 12810, "time_per_iteration": 2.5162084102630615 }, { "auxiliary_loss_clip": 0.01109951, "auxiliary_loss_mlp": 0.01033525, "balance_loss_clip": 1.01970863, "balance_loss_mlp": 1.03734016, "epoch": 0.7702389899293551, "flos": 14246446089600.0, "grad_norm": 10.222283493096267, "language_loss": 0.78542316, "learning_rate": 5.285591201262079e-07, "loss": 0.80685788, "num_input_tokens_seen": 276310290, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 12811, "time_per_iteration": 2.4036262035369873 }, { "auxiliary_loss_clip": 0.01032751, "auxiliary_loss_mlp": 0.01003729, "balance_loss_clip": 1.00263822, "balance_loss_mlp": 1.0100832, "epoch": 0.7702991131820232, "flos": 70574128439040.0, "grad_norm": 0.8061026238101046, "language_loss": 0.56706655, "learning_rate": 5.28295371338402e-07, "loss": 0.58743137, "num_input_tokens_seen": 276371715, "router_z_loss_clip": 0.01092529, "router_z_loss_mlp": 0.2265625, "step": 12812, "time_per_iteration": 3.142871856689453 }, { "auxiliary_loss_clip": 0.01109211, "auxiliary_loss_mlp": 0.01035072, "balance_loss_clip": 1.02232838, "balance_loss_mlp": 1.03765249, "epoch": 0.7703592364346911, "flos": 25480021242240.0, "grad_norm": 1.6785259811523843, "language_loss": 0.7205503, "learning_rate": 5.280316783577836e-07, "loss": 0.74199313, "num_input_tokens_seen": 276389895, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 12813, "time_per_iteration": 2.4868228435516357 }, { "auxiliary_loss_clip": 0.01107861, "auxiliary_loss_mlp": 0.01028155, "balance_loss_clip": 1.0152204, "balance_loss_mlp": 1.03719628, "epoch": 0.7704193596873591, "flos": 19280906375040.0, "grad_norm": 2.098394329806506, "language_loss": 0.66639996, "learning_rate": 5.27768041194351e-07, "loss": 0.68776011, "num_input_tokens_seen": 276408990, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 12814, "time_per_iteration": 2.460472345352173 }, { "auxiliary_loss_clip": 0.0110582, "auxiliary_loss_mlp": 0.0103673, "balance_loss_clip": 1.02440357, "balance_loss_mlp": 1.03682184, "epoch": 0.7704794829400271, "flos": 23658452778240.0, "grad_norm": 1.9494883765620872, "language_loss": 0.65772492, "learning_rate": 5.275044598581018e-07, "loss": 0.67915046, "num_input_tokens_seen": 276428190, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 12815, "time_per_iteration": 2.4693236351013184 }, { "auxiliary_loss_clip": 0.01107307, "auxiliary_loss_mlp": 0.01030081, "balance_loss_clip": 1.01699781, "balance_loss_mlp": 1.0373857, "epoch": 0.770539606192695, "flos": 18989311766400.0, "grad_norm": 3.7704882868742264, "language_loss": 0.65135586, "learning_rate": 5.272409343590322e-07, "loss": 0.67272973, "num_input_tokens_seen": 276446855, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 12816, "time_per_iteration": 2.449643850326538 }, { "auxiliary_loss_clip": 0.01108457, "auxiliary_loss_mlp": 0.01031975, "balance_loss_clip": 1.01940489, "balance_loss_mlp": 1.03853679, "epoch": 0.770599729445363, "flos": 11830160142720.0, "grad_norm": 2.0610325801339933, "language_loss": 0.71920514, "learning_rate": 5.26977464707133e-07, "loss": 0.74060947, "num_input_tokens_seen": 276462000, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 12817, "time_per_iteration": 2.4037439823150635 }, { "auxiliary_loss_clip": 0.01108362, "auxiliary_loss_mlp": 0.01031157, "balance_loss_clip": 1.01843131, "balance_loss_mlp": 1.03833818, "epoch": 0.770659852698031, "flos": 17822610109440.0, "grad_norm": 10.505696630052851, "language_loss": 0.60995954, "learning_rate": 5.267140509123957e-07, "loss": 0.63135469, "num_input_tokens_seen": 276481190, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 12818, "time_per_iteration": 2.4755516052246094 }, { "auxiliary_loss_clip": 0.0110607, "auxiliary_loss_mlp": 0.01028977, "balance_loss_clip": 1.01757407, "balance_loss_mlp": 1.03802717, "epoch": 0.770719975950699, "flos": 21871968923520.0, "grad_norm": 1.6470256383059045, "language_loss": 0.67096782, "learning_rate": 5.264506929848093e-07, "loss": 0.69231832, "num_input_tokens_seen": 276499520, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 12819, "time_per_iteration": 2.4549429416656494 }, { "auxiliary_loss_clip": 0.01107882, "auxiliary_loss_mlp": 0.01029917, "balance_loss_clip": 1.01763868, "balance_loss_mlp": 1.03728104, "epoch": 0.7707800992033669, "flos": 21325049464320.0, "grad_norm": 6.255326743491116, "language_loss": 0.57738113, "learning_rate": 5.261873909343608e-07, "loss": 0.59875917, "num_input_tokens_seen": 276519110, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 12820, "time_per_iteration": 2.5003788471221924 }, { "auxiliary_loss_clip": 0.01105903, "auxiliary_loss_mlp": 0.01032504, "balance_loss_clip": 1.01945686, "balance_loss_mlp": 1.03568709, "epoch": 0.7708402224560349, "flos": 28179426188160.0, "grad_norm": 1.6088716424036422, "language_loss": 0.80988514, "learning_rate": 5.259241447710343e-07, "loss": 0.83126926, "num_input_tokens_seen": 276538805, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 12821, "time_per_iteration": 2.508324146270752 }, { "auxiliary_loss_clip": 0.01107944, "auxiliary_loss_mlp": 0.01029966, "balance_loss_clip": 1.01735377, "balance_loss_mlp": 1.03802502, "epoch": 0.7709003457087028, "flos": 15377057556480.0, "grad_norm": 2.175687944602925, "language_loss": 0.68953812, "learning_rate": 5.256609545048114e-07, "loss": 0.71091717, "num_input_tokens_seen": 276554770, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 12822, "time_per_iteration": 2.4459853172302246 }, { "auxiliary_loss_clip": 0.01106542, "auxiliary_loss_mlp": 0.01034271, "balance_loss_clip": 1.02169394, "balance_loss_mlp": 1.03820086, "epoch": 0.7709604689613708, "flos": 30621854257920.0, "grad_norm": 1.6961972032480572, "language_loss": 0.71951985, "learning_rate": 5.253978201456733e-07, "loss": 0.74092793, "num_input_tokens_seen": 276574535, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 12823, "time_per_iteration": 2.5379440784454346 }, { "auxiliary_loss_clip": 0.01113636, "auxiliary_loss_mlp": 0.01039613, "balance_loss_clip": 1.02462852, "balance_loss_mlp": 1.0403862, "epoch": 0.7710205922140387, "flos": 20301272023680.0, "grad_norm": 1.8695444068223706, "language_loss": 0.76518416, "learning_rate": 5.251347417035969e-07, "loss": 0.7867167, "num_input_tokens_seen": 276592925, "router_z_loss_clip": 0.14941406, "router_z_loss_mlp": 0.734375, "step": 12824, "time_per_iteration": 2.4757027626037598 }, { "auxiliary_loss_clip": 0.01107496, "auxiliary_loss_mlp": 0.01029099, "balance_loss_clip": 1.01665378, "balance_loss_mlp": 1.03849423, "epoch": 0.7710807154667068, "flos": 19644214487040.0, "grad_norm": 1.9129841986912015, "language_loss": 0.72649562, "learning_rate": 5.248717191885592e-07, "loss": 0.74786156, "num_input_tokens_seen": 276610540, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6875, "step": 12825, "time_per_iteration": 2.4961912631988525 }, { "auxiliary_loss_clip": 0.01104831, "auxiliary_loss_mlp": 0.01033887, "balance_loss_clip": 1.02281892, "balance_loss_mlp": 1.03851664, "epoch": 0.7711408387193747, "flos": 20006337450240.0, "grad_norm": 1.3992876549664774, "language_loss": 0.73745704, "learning_rate": 5.246087526105343e-07, "loss": 0.7588442, "num_input_tokens_seen": 276629200, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.6640625, "step": 12826, "time_per_iteration": 2.4777188301086426 }, { "auxiliary_loss_clip": 0.01107364, "auxiliary_loss_mlp": 0.0103339, "balance_loss_clip": 1.02002668, "balance_loss_mlp": 1.03595734, "epoch": 0.7712009619720427, "flos": 24971131307520.0, "grad_norm": 2.95266327497039, "language_loss": 0.81080478, "learning_rate": 5.243458419794933e-07, "loss": 0.83221233, "num_input_tokens_seen": 276648655, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 12827, "time_per_iteration": 2.522982120513916 }, { "auxiliary_loss_clip": 0.01032771, "auxiliary_loss_mlp": 0.00999001, "balance_loss_clip": 0.99791002, "balance_loss_mlp": 1.01009429, "epoch": 0.7712610852247107, "flos": 63249681404160.0, "grad_norm": 0.8658057035098667, "language_loss": 0.55179501, "learning_rate": 5.240829873054051e-07, "loss": 0.5721128, "num_input_tokens_seen": 276716500, "router_z_loss_clip": 0.01092529, "router_z_loss_mlp": 0.2265625, "step": 12828, "time_per_iteration": 3.243905544281006 }, { "auxiliary_loss_clip": 0.01104092, "auxiliary_loss_mlp": 0.01029672, "balance_loss_clip": 1.01762581, "balance_loss_mlp": 1.0362817, "epoch": 0.7713212084773786, "flos": 18697860812160.0, "grad_norm": 1.9962816943853503, "language_loss": 0.70097494, "learning_rate": 5.23820188598238e-07, "loss": 0.72231257, "num_input_tokens_seen": 276733535, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 12829, "time_per_iteration": 2.4739363193511963 }, { "auxiliary_loss_clip": 0.01112054, "auxiliary_loss_mlp": 0.01036394, "balance_loss_clip": 1.02273297, "balance_loss_mlp": 1.03921688, "epoch": 0.7713813317300466, "flos": 14173367869440.0, "grad_norm": 7.917840807734604, "language_loss": 0.80041343, "learning_rate": 5.235574458679579e-07, "loss": 0.82189792, "num_input_tokens_seen": 276749575, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 12830, "time_per_iteration": 2.408022403717041 }, { "auxiliary_loss_clip": 0.01110207, "auxiliary_loss_mlp": 0.01031373, "balance_loss_clip": 1.01747322, "balance_loss_mlp": 1.03735781, "epoch": 0.7714414549827145, "flos": 25703960584320.0, "grad_norm": 2.0916102312810407, "language_loss": 0.78254402, "learning_rate": 5.232947591245269e-07, "loss": 0.80395979, "num_input_tokens_seen": 276769460, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.73046875, "step": 12831, "time_per_iteration": 2.510796546936035 }, { "auxiliary_loss_clip": 0.0110625, "auxiliary_loss_mlp": 0.01028147, "balance_loss_clip": 1.01580906, "balance_loss_mlp": 1.03655553, "epoch": 0.7715015782353826, "flos": 30555312312960.0, "grad_norm": 1.3987532308602908, "language_loss": 0.61326391, "learning_rate": 5.230321283779071e-07, "loss": 0.63460791, "num_input_tokens_seen": 276790820, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 12832, "time_per_iteration": 2.5317091941833496 }, { "auxiliary_loss_clip": 0.01109111, "auxiliary_loss_mlp": 0.0103769, "balance_loss_clip": 1.0243566, "balance_loss_mlp": 1.0372138, "epoch": 0.7715617014880505, "flos": 20229343038720.0, "grad_norm": 1.6863222152967992, "language_loss": 0.7927438, "learning_rate": 5.227695536380572e-07, "loss": 0.81421185, "num_input_tokens_seen": 276811345, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 12833, "time_per_iteration": 2.485126256942749 }, { "auxiliary_loss_clip": 0.01032992, "auxiliary_loss_mlp": 0.00999925, "balance_loss_clip": 0.99892962, "balance_loss_mlp": 1.01039863, "epoch": 0.7716218247407185, "flos": 63664770971520.0, "grad_norm": 0.8488856775576819, "language_loss": 0.55342382, "learning_rate": 5.22507034914933e-07, "loss": 0.57375306, "num_input_tokens_seen": 276870950, "router_z_loss_clip": 0.00994873, "router_z_loss_mlp": 0.2265625, "step": 12834, "time_per_iteration": 3.081653594970703 }, { "auxiliary_loss_clip": 0.01108998, "auxiliary_loss_mlp": 0.01033045, "balance_loss_clip": 1.01992559, "balance_loss_mlp": 1.03813422, "epoch": 0.7716819479933864, "flos": 19791807471360.0, "grad_norm": 2.5618150411359575, "language_loss": 0.72946835, "learning_rate": 5.222445722184903e-07, "loss": 0.75088882, "num_input_tokens_seen": 276890760, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 12835, "time_per_iteration": 2.4640934467315674 }, { "auxiliary_loss_clip": 0.01105812, "auxiliary_loss_mlp": 0.01036382, "balance_loss_clip": 1.02298284, "balance_loss_mlp": 1.03491306, "epoch": 0.7717420712460544, "flos": 18442176825600.0, "grad_norm": 1.7522664461173605, "language_loss": 0.70277369, "learning_rate": 5.219821655586814e-07, "loss": 0.72419566, "num_input_tokens_seen": 276909625, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 12836, "time_per_iteration": 3.904630661010742 }, { "auxiliary_loss_clip": 0.01104412, "auxiliary_loss_mlp": 0.01031943, "balance_loss_clip": 1.01960468, "balance_loss_mlp": 1.03662252, "epoch": 0.7718021944987223, "flos": 35189476456320.0, "grad_norm": 1.803112579437266, "language_loss": 0.59604979, "learning_rate": 5.217198149454575e-07, "loss": 0.61741334, "num_input_tokens_seen": 276930760, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6796875, "step": 12837, "time_per_iteration": 3.9322328567504883 }, { "auxiliary_loss_clip": 0.01032689, "auxiliary_loss_mlp": 0.01002851, "balance_loss_clip": 1.00176001, "balance_loss_mlp": 1.01004815, "epoch": 0.7718623177513904, "flos": 67923167961600.0, "grad_norm": 0.8728748220928637, "language_loss": 0.55800825, "learning_rate": 5.214575203887666e-07, "loss": 0.57836366, "num_input_tokens_seen": 276989580, "router_z_loss_clip": 0.01092529, "router_z_loss_mlp": 0.2265625, "step": 12838, "time_per_iteration": 3.0721945762634277 }, { "auxiliary_loss_clip": 0.01105593, "auxiliary_loss_mlp": 0.01030787, "balance_loss_clip": 1.01862764, "balance_loss_mlp": 1.03691876, "epoch": 0.7719224410040583, "flos": 18581401941120.0, "grad_norm": 2.3092761303201748, "language_loss": 0.69028598, "learning_rate": 5.211952818985538e-07, "loss": 0.71164984, "num_input_tokens_seen": 277005450, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 12839, "time_per_iteration": 2.400721549987793 }, { "auxiliary_loss_clip": 0.01105868, "auxiliary_loss_mlp": 0.01026195, "balance_loss_clip": 1.01389849, "balance_loss_mlp": 1.03752756, "epoch": 0.7719825642567263, "flos": 23075802264960.0, "grad_norm": 2.7735494122952513, "language_loss": 0.8009119, "learning_rate": 5.209330994847647e-07, "loss": 0.82223248, "num_input_tokens_seen": 277023055, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 12840, "time_per_iteration": 3.8717777729034424 }, { "auxiliary_loss_clip": 0.01105853, "auxiliary_loss_mlp": 0.01031637, "balance_loss_clip": 1.01888192, "balance_loss_mlp": 1.0360924, "epoch": 0.7720426875093943, "flos": 20339086066560.0, "grad_norm": 1.7393458917624918, "language_loss": 0.79759169, "learning_rate": 5.206709731573402e-07, "loss": 0.81896657, "num_input_tokens_seen": 277041150, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 12841, "time_per_iteration": 2.489996910095215 }, { "auxiliary_loss_clip": 0.01108367, "auxiliary_loss_mlp": 0.01029134, "balance_loss_clip": 1.01659894, "balance_loss_mlp": 1.03832102, "epoch": 0.7721028107620622, "flos": 23880704181120.0, "grad_norm": 1.4852350199828133, "language_loss": 0.76916993, "learning_rate": 5.204089029262208e-07, "loss": 0.79054499, "num_input_tokens_seen": 277063895, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 12842, "time_per_iteration": 3.9148762226104736 }, { "auxiliary_loss_clip": 0.01110459, "auxiliary_loss_mlp": 0.01034888, "balance_loss_clip": 1.02212119, "balance_loss_mlp": 1.03901005, "epoch": 0.7721629340147302, "flos": 26651571235200.0, "grad_norm": 1.5666672193696283, "language_loss": 0.6905542, "learning_rate": 5.201468888013445e-07, "loss": 0.71200764, "num_input_tokens_seen": 277084045, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 12843, "time_per_iteration": 2.5210962295532227 }, { "auxiliary_loss_clip": 0.0110749, "auxiliary_loss_mlp": 0.0102864, "balance_loss_clip": 1.0163734, "balance_loss_mlp": 1.03564453, "epoch": 0.7722230572673981, "flos": 21178857110400.0, "grad_norm": 8.980675428256111, "language_loss": 0.74034262, "learning_rate": 5.198849307926465e-07, "loss": 0.76170385, "num_input_tokens_seen": 277102625, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71875, "step": 12844, "time_per_iteration": 2.47176456451416 }, { "auxiliary_loss_clip": 0.01105831, "auxiliary_loss_mlp": 0.01032428, "balance_loss_clip": 1.02019095, "balance_loss_mlp": 1.03644013, "epoch": 0.7722831805200662, "flos": 27964644814080.0, "grad_norm": 1.645352729608007, "language_loss": 0.71545172, "learning_rate": 5.196230289100596e-07, "loss": 0.73683429, "num_input_tokens_seen": 277123210, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 12845, "time_per_iteration": 2.5148611068725586 }, { "auxiliary_loss_clip": 0.01103247, "auxiliary_loss_mlp": 0.01031185, "balance_loss_clip": 1.01959789, "balance_loss_mlp": 1.03568864, "epoch": 0.7723433037727341, "flos": 33875576864640.0, "grad_norm": 1.9641270187576758, "language_loss": 0.64631659, "learning_rate": 5.193611831635159e-07, "loss": 0.66766095, "num_input_tokens_seen": 277144895, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 12846, "time_per_iteration": 2.584545850753784 }, { "auxiliary_loss_clip": 0.01032824, "auxiliary_loss_mlp": 0.01001121, "balance_loss_clip": 1.00005972, "balance_loss_mlp": 1.01016486, "epoch": 0.7724034270254021, "flos": 62848271940480.0, "grad_norm": 0.8056878820998024, "language_loss": 0.61651653, "learning_rate": 5.19099393562945e-07, "loss": 0.63685596, "num_input_tokens_seen": 277205160, "router_z_loss_clip": 0.01062012, "router_z_loss_mlp": 0.2265625, "step": 12847, "time_per_iteration": 3.0210864543914795 }, { "auxiliary_loss_clip": 0.01105496, "auxiliary_loss_mlp": 0.01028353, "balance_loss_clip": 1.01565683, "balance_loss_mlp": 1.03527737, "epoch": 0.77246355027807, "flos": 23295467888640.0, "grad_norm": 1.6798504754017887, "language_loss": 0.7890166, "learning_rate": 5.188376601182732e-07, "loss": 0.81035507, "num_input_tokens_seen": 277223005, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 12848, "time_per_iteration": 2.4641661643981934 }, { "auxiliary_loss_clip": 0.011098, "auxiliary_loss_mlp": 0.01032867, "balance_loss_clip": 1.01976573, "balance_loss_mlp": 1.03807163, "epoch": 0.772523673530738, "flos": 20121287950080.0, "grad_norm": 1.9199543846304588, "language_loss": 0.7306478, "learning_rate": 5.185759828394261e-07, "loss": 0.75207448, "num_input_tokens_seen": 277241785, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 12849, "time_per_iteration": 2.462329626083374 }, { "auxiliary_loss_clip": 0.01104935, "auxiliary_loss_mlp": 0.01030148, "balance_loss_clip": 1.01775062, "balance_loss_mlp": 1.03606367, "epoch": 0.7725837967834059, "flos": 17820096157440.0, "grad_norm": 2.067492363788785, "language_loss": 0.7811563, "learning_rate": 5.183143617363261e-07, "loss": 0.80250716, "num_input_tokens_seen": 277259050, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 12850, "time_per_iteration": 2.415752649307251 }, { "auxiliary_loss_clip": 0.0110487, "auxiliary_loss_mlp": 0.01033971, "balance_loss_clip": 1.02119195, "balance_loss_mlp": 1.03363931, "epoch": 0.772643920036074, "flos": 27198921657600.0, "grad_norm": 1.5617434277770323, "language_loss": 0.80172074, "learning_rate": 5.180527968188935e-07, "loss": 0.82310915, "num_input_tokens_seen": 277278235, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 12851, "time_per_iteration": 2.5116710662841797 }, { "auxiliary_loss_clip": 0.01107206, "auxiliary_loss_mlp": 0.01029777, "balance_loss_clip": 1.01643765, "balance_loss_mlp": 1.03823662, "epoch": 0.7727040432887419, "flos": 21579512388480.0, "grad_norm": 1.7959255024864864, "language_loss": 0.73951524, "learning_rate": 5.177912880970474e-07, "loss": 0.76088506, "num_input_tokens_seen": 277298355, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.69140625, "step": 12852, "time_per_iteration": 2.4892680644989014 }, { "auxiliary_loss_clip": 0.0110472, "auxiliary_loss_mlp": 0.01038116, "balance_loss_clip": 1.02547956, "balance_loss_mlp": 1.03556931, "epoch": 0.7727641665414099, "flos": 22236641752320.0, "grad_norm": 1.7706740056286498, "language_loss": 0.82115757, "learning_rate": 5.17529835580704e-07, "loss": 0.84258592, "num_input_tokens_seen": 277316095, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 12853, "time_per_iteration": 2.4826650619506836 }, { "auxiliary_loss_clip": 0.01032812, "auxiliary_loss_mlp": 0.01002124, "balance_loss_clip": 1.00102091, "balance_loss_mlp": 1.01026738, "epoch": 0.7728242897940779, "flos": 54832221463680.0, "grad_norm": 0.8618535043758159, "language_loss": 0.54489052, "learning_rate": 5.172684392797786e-07, "loss": 0.56523991, "num_input_tokens_seen": 277380130, "router_z_loss_clip": 0.01104736, "router_z_loss_mlp": 0.22558594, "step": 12854, "time_per_iteration": 3.166571855545044 }, { "auxiliary_loss_clip": 0.01109285, "auxiliary_loss_mlp": 0.01031191, "balance_loss_clip": 1.0172441, "balance_loss_mlp": 1.03737521, "epoch": 0.7728844130467458, "flos": 34461962392320.0, "grad_norm": 1.678666140554348, "language_loss": 0.7209745, "learning_rate": 5.170070992041826e-07, "loss": 0.74237931, "num_input_tokens_seen": 277404015, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71875, "step": 12855, "time_per_iteration": 2.569481372833252 }, { "auxiliary_loss_clip": 0.01105841, "auxiliary_loss_mlp": 0.01029402, "balance_loss_clip": 1.01633048, "balance_loss_mlp": 1.03617907, "epoch": 0.7729445362994138, "flos": 18916341287040.0, "grad_norm": 1.7072517862752024, "language_loss": 0.6777389, "learning_rate": 5.167458153638254e-07, "loss": 0.69909132, "num_input_tokens_seen": 277421375, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 12856, "time_per_iteration": 2.456747055053711 }, { "auxiliary_loss_clip": 0.01107962, "auxiliary_loss_mlp": 0.0103043, "balance_loss_clip": 1.01821065, "balance_loss_mlp": 1.03754425, "epoch": 0.7730046595520818, "flos": 22200048771840.0, "grad_norm": 1.887532924346079, "language_loss": 0.79257655, "learning_rate": 5.164845877686162e-07, "loss": 0.81396049, "num_input_tokens_seen": 277440170, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 12857, "time_per_iteration": 2.466485023498535 }, { "auxiliary_loss_clip": 0.01105354, "auxiliary_loss_mlp": 0.01030194, "balance_loss_clip": 1.01748085, "balance_loss_mlp": 1.03735602, "epoch": 0.7730647828047498, "flos": 13552328695680.0, "grad_norm": 2.0461749180220536, "language_loss": 0.7823689, "learning_rate": 5.162234164284591e-07, "loss": 0.80372435, "num_input_tokens_seen": 277456880, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 12858, "time_per_iteration": 2.4423017501831055 }, { "auxiliary_loss_clip": 0.01105969, "auxiliary_loss_mlp": 0.01030156, "balance_loss_clip": 1.01779366, "balance_loss_mlp": 1.0354445, "epoch": 0.7731249060574177, "flos": 21976037602560.0, "grad_norm": 2.1242375040686317, "language_loss": 0.77252561, "learning_rate": 5.159623013532591e-07, "loss": 0.79388678, "num_input_tokens_seen": 277475365, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 12859, "time_per_iteration": 2.4471640586853027 }, { "auxiliary_loss_clip": 0.01103543, "auxiliary_loss_mlp": 0.01030479, "balance_loss_clip": 1.01946402, "balance_loss_mlp": 1.03722751, "epoch": 0.7731850293100857, "flos": 22601817371520.0, "grad_norm": 1.6345833002763253, "language_loss": 0.67787975, "learning_rate": 5.157012425529186e-07, "loss": 0.69921994, "num_input_tokens_seen": 277494975, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6640625, "step": 12860, "time_per_iteration": 2.4764246940612793 }, { "auxiliary_loss_clip": 0.0110794, "auxiliary_loss_mlp": 0.01035995, "balance_loss_clip": 1.02293599, "balance_loss_mlp": 1.03592157, "epoch": 0.7732451525627536, "flos": 14098422142080.0, "grad_norm": 2.4475879346126503, "language_loss": 0.75038463, "learning_rate": 5.154402400373343e-07, "loss": 0.771824, "num_input_tokens_seen": 277510520, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 12861, "time_per_iteration": 2.4179441928863525 }, { "auxiliary_loss_clip": 0.01110737, "auxiliary_loss_mlp": 0.01030861, "balance_loss_clip": 1.01781332, "balance_loss_mlp": 1.03884673, "epoch": 0.7733052758154216, "flos": 21470020755840.0, "grad_norm": 1.889988748592469, "language_loss": 0.7498672, "learning_rate": 5.15179293816405e-07, "loss": 0.77128315, "num_input_tokens_seen": 277530505, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 12862, "time_per_iteration": 2.5087392330169678 }, { "auxiliary_loss_clip": 0.01104917, "auxiliary_loss_mlp": 0.01033012, "balance_loss_clip": 1.02107346, "balance_loss_mlp": 1.03651762, "epoch": 0.7733653990680895, "flos": 21394284929280.0, "grad_norm": 1.5188958155393133, "language_loss": 0.83152169, "learning_rate": 5.149184039000256e-07, "loss": 0.85290098, "num_input_tokens_seen": 277550810, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.68359375, "step": 12863, "time_per_iteration": 2.4572360515594482 }, { "auxiliary_loss_clip": 0.0110488, "auxiliary_loss_mlp": 0.01033148, "balance_loss_clip": 1.02072644, "balance_loss_mlp": 1.03585887, "epoch": 0.7734255223207576, "flos": 17676058619520.0, "grad_norm": 1.6195010657955509, "language_loss": 0.72982132, "learning_rate": 5.146575702980898e-07, "loss": 0.75120163, "num_input_tokens_seen": 277567680, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 12864, "time_per_iteration": 2.428421974182129 }, { "auxiliary_loss_clip": 0.01105567, "auxiliary_loss_mlp": 0.01029572, "balance_loss_clip": 1.01780057, "balance_loss_mlp": 1.03546023, "epoch": 0.7734856455734255, "flos": 25230837617280.0, "grad_norm": 1.7129886972221078, "language_loss": 0.82456714, "learning_rate": 5.143967930204871e-07, "loss": 0.84591854, "num_input_tokens_seen": 277588970, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.703125, "step": 12865, "time_per_iteration": 2.523104429244995 }, { "auxiliary_loss_clip": 0.0111164, "auxiliary_loss_mlp": 0.0103261, "balance_loss_clip": 1.0189842, "balance_loss_mlp": 1.03900599, "epoch": 0.7735457688260935, "flos": 23433112805760.0, "grad_norm": 2.1602148381545123, "language_loss": 0.71866286, "learning_rate": 5.141360720771077e-07, "loss": 0.74010527, "num_input_tokens_seen": 277605450, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 12866, "time_per_iteration": 2.5068209171295166 }, { "auxiliary_loss_clip": 0.01109258, "auxiliary_loss_mlp": 0.01027826, "balance_loss_clip": 1.01455224, "balance_loss_mlp": 1.03922892, "epoch": 0.7736058920787615, "flos": 18729246320640.0, "grad_norm": 2.6244488576114096, "language_loss": 0.65349829, "learning_rate": 5.138754074778371e-07, "loss": 0.67486912, "num_input_tokens_seen": 277622530, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 12867, "time_per_iteration": 2.4352684020996094 }, { "auxiliary_loss_clip": 0.01103513, "auxiliary_loss_mlp": 0.01033067, "balance_loss_clip": 1.02098489, "balance_loss_mlp": 1.03560185, "epoch": 0.7736660153314294, "flos": 22893304239360.0, "grad_norm": 1.5851714506531158, "language_loss": 0.71268606, "learning_rate": 5.136147992325595e-07, "loss": 0.73405188, "num_input_tokens_seen": 277642700, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 12868, "time_per_iteration": 2.4732213020324707 }, { "auxiliary_loss_clip": 0.01108317, "auxiliary_loss_mlp": 0.01030187, "balance_loss_clip": 1.01794446, "balance_loss_mlp": 1.03828049, "epoch": 0.7737261385840974, "flos": 13800901789440.0, "grad_norm": 2.1071651871154837, "language_loss": 0.77955103, "learning_rate": 5.133542473511578e-07, "loss": 0.8009361, "num_input_tokens_seen": 277660005, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69921875, "step": 12869, "time_per_iteration": 2.420264720916748 }, { "auxiliary_loss_clip": 0.01103533, "auxiliary_loss_mlp": 0.01029177, "balance_loss_clip": 1.01654696, "balance_loss_mlp": 1.0360018, "epoch": 0.7737862618367654, "flos": 28730727106560.0, "grad_norm": 1.735226482902316, "language_loss": 0.73838806, "learning_rate": 5.130937518435124e-07, "loss": 0.7597152, "num_input_tokens_seen": 277682890, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.67578125, "step": 12870, "time_per_iteration": 2.518554449081421 }, { "auxiliary_loss_clip": 0.01107375, "auxiliary_loss_mlp": 0.01030406, "balance_loss_clip": 1.01767433, "balance_loss_mlp": 1.03742969, "epoch": 0.7738463850894334, "flos": 17018570119680.0, "grad_norm": 2.2264146892961305, "language_loss": 0.7567513, "learning_rate": 5.12833312719501e-07, "loss": 0.77812916, "num_input_tokens_seen": 277699330, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 12871, "time_per_iteration": 2.4292352199554443 }, { "auxiliary_loss_clip": 0.01104778, "auxiliary_loss_mlp": 0.01030302, "balance_loss_clip": 1.01869714, "balance_loss_mlp": 1.03619576, "epoch": 0.7739065083421013, "flos": 20704010290560.0, "grad_norm": 1.7848870863584176, "language_loss": 0.68643916, "learning_rate": 5.12572929988999e-07, "loss": 0.70778996, "num_input_tokens_seen": 277718750, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6875, "step": 12872, "time_per_iteration": 2.4495155811309814 }, { "auxiliary_loss_clip": 0.01107915, "auxiliary_loss_mlp": 0.01032173, "balance_loss_clip": 1.01852357, "balance_loss_mlp": 1.03744996, "epoch": 0.7739666315947693, "flos": 20697222620160.0, "grad_norm": 2.0158053093336727, "language_loss": 0.84906203, "learning_rate": 5.123126036618804e-07, "loss": 0.87046289, "num_input_tokens_seen": 277734645, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 12873, "time_per_iteration": 2.4469380378723145 }, { "auxiliary_loss_clip": 0.01109372, "auxiliary_loss_mlp": 0.01033261, "balance_loss_clip": 1.02102435, "balance_loss_mlp": 1.0388093, "epoch": 0.7740267548474372, "flos": 29570677718400.0, "grad_norm": 2.3953438716077184, "language_loss": 0.65685701, "learning_rate": 5.120523337480174e-07, "loss": 0.67828333, "num_input_tokens_seen": 277755535, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.70703125, "step": 12874, "time_per_iteration": 2.515207529067993 }, { "auxiliary_loss_clip": 0.01106996, "auxiliary_loss_mlp": 0.01031909, "balance_loss_clip": 1.01871812, "balance_loss_mlp": 1.03782392, "epoch": 0.7740868781001052, "flos": 23659099223040.0, "grad_norm": 1.5716492804304023, "language_loss": 0.62661004, "learning_rate": 5.117921202572785e-07, "loss": 0.64799905, "num_input_tokens_seen": 277775585, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69140625, "step": 12875, "time_per_iteration": 2.491361379623413 }, { "auxiliary_loss_clip": 0.01106676, "auxiliary_loss_mlp": 0.01030076, "balance_loss_clip": 1.01733232, "balance_loss_mlp": 1.03633761, "epoch": 0.7741470013527731, "flos": 24717314828160.0, "grad_norm": 5.312586756975756, "language_loss": 0.64987206, "learning_rate": 5.115319631995318e-07, "loss": 0.67123955, "num_input_tokens_seen": 277794795, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 12876, "time_per_iteration": 2.483902931213379 }, { "auxiliary_loss_clip": 0.01103876, "auxiliary_loss_mlp": 0.01029878, "balance_loss_clip": 1.01773107, "balance_loss_mlp": 1.03596652, "epoch": 0.7742071246054412, "flos": 21871645701120.0, "grad_norm": 2.3027178944002777, "language_loss": 0.71332449, "learning_rate": 5.112718625846433e-07, "loss": 0.73466206, "num_input_tokens_seen": 277813235, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6796875, "step": 12877, "time_per_iteration": 2.48078989982605 }, { "auxiliary_loss_clip": 0.01109755, "auxiliary_loss_mlp": 0.01033365, "balance_loss_clip": 1.01973319, "balance_loss_mlp": 1.03679538, "epoch": 0.7742672478581091, "flos": 22674249146880.0, "grad_norm": 2.154712532416501, "language_loss": 0.82748616, "learning_rate": 5.110118184224736e-07, "loss": 0.84891737, "num_input_tokens_seen": 277832560, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73046875, "step": 12878, "time_per_iteration": 5.40735125541687 }, { "auxiliary_loss_clip": 0.01108765, "auxiliary_loss_mlp": 0.01034903, "balance_loss_clip": 1.02131271, "balance_loss_mlp": 1.03813744, "epoch": 0.7743273711107771, "flos": 18840892769280.0, "grad_norm": 1.8437574654353357, "language_loss": 0.73591971, "learning_rate": 5.10751830722885e-07, "loss": 0.75735641, "num_input_tokens_seen": 277850120, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.703125, "step": 12879, "time_per_iteration": 2.4986908435821533 }, { "auxiliary_loss_clip": 0.0110177, "auxiliary_loss_mlp": 0.01028088, "balance_loss_clip": 1.01585102, "balance_loss_mlp": 1.03547585, "epoch": 0.7743874943634451, "flos": 28729326476160.0, "grad_norm": 7.596874972374934, "language_loss": 0.79752737, "learning_rate": 5.104918994957364e-07, "loss": 0.81882596, "num_input_tokens_seen": 277871020, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6640625, "step": 12880, "time_per_iteration": 2.540764808654785 }, { "auxiliary_loss_clip": 0.01105428, "auxiliary_loss_mlp": 0.01033339, "balance_loss_clip": 1.0209471, "balance_loss_mlp": 1.03783464, "epoch": 0.774447617616113, "flos": 21909639312000.0, "grad_norm": 1.6298258164089061, "language_loss": 0.70238721, "learning_rate": 5.102320247508847e-07, "loss": 0.72377491, "num_input_tokens_seen": 277891525, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.67578125, "step": 12881, "time_per_iteration": 3.894373655319214 }, { "auxiliary_loss_clip": 0.0111026, "auxiliary_loss_mlp": 0.01038492, "balance_loss_clip": 1.02470553, "balance_loss_mlp": 1.03742933, "epoch": 0.774507740868781, "flos": 19500643825920.0, "grad_norm": 1.8783279305081566, "language_loss": 0.84708929, "learning_rate": 5.099722064981832e-07, "loss": 0.86857682, "num_input_tokens_seen": 277910425, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7265625, "step": 12882, "time_per_iteration": 2.4514541625976562 }, { "auxiliary_loss_clip": 0.01031745, "auxiliary_loss_mlp": 0.01000107, "balance_loss_clip": 0.99900454, "balance_loss_mlp": 1.00929987, "epoch": 0.774567864121449, "flos": 59426560402560.0, "grad_norm": 1.0380357151418789, "language_loss": 0.60490417, "learning_rate": 5.097124447474858e-07, "loss": 0.62522268, "num_input_tokens_seen": 277972795, "router_z_loss_clip": 0.01104736, "router_z_loss_mlp": 0.22460938, "step": 12883, "time_per_iteration": 4.5264129638671875 }, { "auxiliary_loss_clip": 0.01108854, "auxiliary_loss_mlp": 0.01034848, "balance_loss_clip": 1.02122831, "balance_loss_mlp": 1.03765678, "epoch": 0.774627987374117, "flos": 13225326255360.0, "grad_norm": 2.214802590143771, "language_loss": 0.72639489, "learning_rate": 5.094527395086416e-07, "loss": 0.74783182, "num_input_tokens_seen": 277990675, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 12884, "time_per_iteration": 2.4671308994293213 }, { "auxiliary_loss_clip": 0.01105953, "auxiliary_loss_mlp": 0.01036159, "balance_loss_clip": 1.02467299, "balance_loss_mlp": 1.03769732, "epoch": 0.7746881106267849, "flos": 21394033534080.0, "grad_norm": 2.0782544423267404, "language_loss": 0.81093204, "learning_rate": 5.091930907914986e-07, "loss": 0.83235312, "num_input_tokens_seen": 278010050, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.68359375, "step": 12885, "time_per_iteration": 2.4462132453918457 }, { "auxiliary_loss_clip": 0.01103565, "auxiliary_loss_mlp": 0.01030827, "balance_loss_clip": 1.01891184, "balance_loss_mlp": 1.03555059, "epoch": 0.7747482338794529, "flos": 25629338079360.0, "grad_norm": 1.8320914625583387, "language_loss": 0.63833129, "learning_rate": 5.089334986059029e-07, "loss": 0.65967524, "num_input_tokens_seen": 278030660, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 12886, "time_per_iteration": 2.5007808208465576 }, { "auxiliary_loss_clip": 0.01105839, "auxiliary_loss_mlp": 0.01031149, "balance_loss_clip": 1.01991999, "balance_loss_mlp": 1.03503811, "epoch": 0.7748083571321208, "flos": 11546933402880.0, "grad_norm": 2.0253818573698004, "language_loss": 0.69533277, "learning_rate": 5.086739629616987e-07, "loss": 0.7167027, "num_input_tokens_seen": 278047645, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.70703125, "step": 12887, "time_per_iteration": 2.418226957321167 }, { "auxiliary_loss_clip": 0.01102971, "auxiliary_loss_mlp": 0.01027688, "balance_loss_clip": 1.01640534, "balance_loss_mlp": 1.03512073, "epoch": 0.7748684803847888, "flos": 19062425900160.0, "grad_norm": 1.8654931625726057, "language_loss": 0.70482296, "learning_rate": 5.084144838687275e-07, "loss": 0.72612953, "num_input_tokens_seen": 278066170, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6796875, "step": 12888, "time_per_iteration": 2.442225456237793 }, { "auxiliary_loss_clip": 0.01105883, "auxiliary_loss_mlp": 0.01031021, "balance_loss_clip": 1.0183965, "balance_loss_mlp": 1.03515542, "epoch": 0.7749286036374567, "flos": 22273162905600.0, "grad_norm": 1.7171099317445009, "language_loss": 0.81696039, "learning_rate": 5.081550613368279e-07, "loss": 0.83832937, "num_input_tokens_seen": 278085545, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 12889, "time_per_iteration": 2.4696500301361084 }, { "auxiliary_loss_clip": 0.01105456, "auxiliary_loss_mlp": 0.01029519, "balance_loss_clip": 1.01761627, "balance_loss_mlp": 1.03690767, "epoch": 0.7749887268901248, "flos": 20192462749440.0, "grad_norm": 2.763744519810429, "language_loss": 0.79849803, "learning_rate": 5.07895695375838e-07, "loss": 0.81984782, "num_input_tokens_seen": 278102995, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 12890, "time_per_iteration": 2.4412009716033936 }, { "auxiliary_loss_clip": 0.01109378, "auxiliary_loss_mlp": 0.01032073, "balance_loss_clip": 1.01929343, "balance_loss_mlp": 1.03927112, "epoch": 0.7750488501427927, "flos": 20337541781760.0, "grad_norm": 7.86322755451957, "language_loss": 0.6675033, "learning_rate": 5.076363859955932e-07, "loss": 0.68891776, "num_input_tokens_seen": 278121460, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 12891, "time_per_iteration": 2.44814133644104 }, { "auxiliary_loss_clip": 0.01105886, "auxiliary_loss_mlp": 0.01035891, "balance_loss_clip": 1.02340984, "balance_loss_mlp": 1.03641593, "epoch": 0.7751089733954607, "flos": 28364043116160.0, "grad_norm": 1.7124609257133174, "language_loss": 0.78703743, "learning_rate": 5.073771332059257e-07, "loss": 0.80845517, "num_input_tokens_seen": 278143905, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 12892, "time_per_iteration": 2.5541186332702637 }, { "auxiliary_loss_clip": 0.0110922, "auxiliary_loss_mlp": 0.01030297, "balance_loss_clip": 1.01737487, "balance_loss_mlp": 1.03832483, "epoch": 0.7751690966481286, "flos": 16943803960320.0, "grad_norm": 2.2004113261060207, "language_loss": 0.66921198, "learning_rate": 5.071179370166669e-07, "loss": 0.69060719, "num_input_tokens_seen": 278160850, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 12893, "time_per_iteration": 2.4779281616210938 }, { "auxiliary_loss_clip": 0.01031932, "auxiliary_loss_mlp": 0.009994, "balance_loss_clip": 0.99829715, "balance_loss_mlp": 1.00932491, "epoch": 0.7752292199007966, "flos": 65668050339840.0, "grad_norm": 0.8241164684326692, "language_loss": 0.5853315, "learning_rate": 5.068587974376468e-07, "loss": 0.60564476, "num_input_tokens_seen": 278219950, "router_z_loss_clip": 0.01104736, "router_z_loss_mlp": 0.2265625, "step": 12894, "time_per_iteration": 3.169438362121582 }, { "auxiliary_loss_clip": 0.0110887, "auxiliary_loss_mlp": 0.01034478, "balance_loss_clip": 1.02138925, "balance_loss_mlp": 1.03783584, "epoch": 0.7752893431534646, "flos": 20594662312320.0, "grad_norm": 2.164375293503111, "language_loss": 0.78205657, "learning_rate": 5.065997144786895e-07, "loss": 0.80349004, "num_input_tokens_seen": 278237805, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 12895, "time_per_iteration": 2.4474194049835205 }, { "auxiliary_loss_clip": 0.01106834, "auxiliary_loss_mlp": 0.01029114, "balance_loss_clip": 1.01605499, "balance_loss_mlp": 1.03808844, "epoch": 0.7753494664061326, "flos": 20485350247680.0, "grad_norm": 2.5227206936782314, "language_loss": 0.67493916, "learning_rate": 5.063406881496209e-07, "loss": 0.69629872, "num_input_tokens_seen": 278257660, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 12896, "time_per_iteration": 2.502948760986328 }, { "auxiliary_loss_clip": 0.01104527, "auxiliary_loss_mlp": 0.01037084, "balance_loss_clip": 1.02572346, "balance_loss_mlp": 1.03595555, "epoch": 0.7754095896588006, "flos": 20265900105600.0, "grad_norm": 2.263381714945602, "language_loss": 0.6877681, "learning_rate": 5.060817184602629e-07, "loss": 0.70918417, "num_input_tokens_seen": 278275110, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6875, "step": 12897, "time_per_iteration": 2.4647066593170166 }, { "auxiliary_loss_clip": 0.01108125, "auxiliary_loss_mlp": 0.0103965, "balance_loss_clip": 1.02631009, "balance_loss_mlp": 1.03838563, "epoch": 0.7754697129114685, "flos": 23331091201920.0, "grad_norm": 1.7111368743671538, "language_loss": 0.75206721, "learning_rate": 5.058228054204364e-07, "loss": 0.77354491, "num_input_tokens_seen": 278293035, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 12898, "time_per_iteration": 2.5057671070098877 }, { "auxiliary_loss_clip": 0.0110638, "auxiliary_loss_mlp": 0.01030325, "balance_loss_clip": 1.0167532, "balance_loss_mlp": 1.03605652, "epoch": 0.7755298361641365, "flos": 17347619635200.0, "grad_norm": 1.848848272594348, "language_loss": 0.7012558, "learning_rate": 5.055639490399588e-07, "loss": 0.72262287, "num_input_tokens_seen": 278311010, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.703125, "step": 12899, "time_per_iteration": 2.438851833343506 }, { "auxiliary_loss_clip": 0.01106655, "auxiliary_loss_mlp": 0.01033469, "balance_loss_clip": 1.02064824, "balance_loss_mlp": 1.03784728, "epoch": 0.7755899594168044, "flos": 19645866512640.0, "grad_norm": 2.06564365397935, "language_loss": 0.7545644, "learning_rate": 5.053051493286453e-07, "loss": 0.77596569, "num_input_tokens_seen": 278329900, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 12900, "time_per_iteration": 2.4608986377716064 }, { "auxiliary_loss_clip": 0.01102056, "auxiliary_loss_mlp": 0.01037954, "balance_loss_clip": 1.0264678, "balance_loss_mlp": 1.03457808, "epoch": 0.7756500826694724, "flos": 27414457217280.0, "grad_norm": 2.2435776175963187, "language_loss": 0.7773968, "learning_rate": 5.050464062963113e-07, "loss": 0.79879683, "num_input_tokens_seen": 278349980, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.671875, "step": 12901, "time_per_iteration": 2.508267641067505 }, { "auxiliary_loss_clip": 0.01108603, "auxiliary_loss_mlp": 0.01028937, "balance_loss_clip": 1.01614583, "balance_loss_mlp": 1.0399425, "epoch": 0.7757102059221404, "flos": 28730511624960.0, "grad_norm": 4.282308188286213, "language_loss": 0.77227163, "learning_rate": 5.047877199527666e-07, "loss": 0.79364705, "num_input_tokens_seen": 278372485, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 12902, "time_per_iteration": 2.5343639850616455 }, { "auxiliary_loss_clip": 0.01104849, "auxiliary_loss_mlp": 0.01029863, "balance_loss_clip": 1.01815081, "balance_loss_mlp": 1.03593993, "epoch": 0.7757703291748084, "flos": 22486795044480.0, "grad_norm": 1.880669940317233, "language_loss": 0.73015612, "learning_rate": 5.045290903078215e-07, "loss": 0.75150323, "num_input_tokens_seen": 278391660, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 12903, "time_per_iteration": 2.4694676399230957 }, { "auxiliary_loss_clip": 0.01106459, "auxiliary_loss_mlp": 0.01024826, "balance_loss_clip": 1.01223755, "balance_loss_mlp": 1.03778851, "epoch": 0.7758304524274763, "flos": 21430159637760.0, "grad_norm": 2.795008404018697, "language_loss": 0.76188433, "learning_rate": 5.042705173712835e-07, "loss": 0.78319716, "num_input_tokens_seen": 278409125, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 12904, "time_per_iteration": 2.497140884399414 }, { "auxiliary_loss_clip": 0.01102936, "auxiliary_loss_mlp": 0.01026017, "balance_loss_clip": 1.01417959, "balance_loss_mlp": 1.03682125, "epoch": 0.7758905756801443, "flos": 23659242877440.0, "grad_norm": 2.7017752446372647, "language_loss": 0.68418086, "learning_rate": 5.040120011529576e-07, "loss": 0.70547032, "num_input_tokens_seen": 278429450, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6640625, "step": 12905, "time_per_iteration": 2.4914793968200684 }, { "auxiliary_loss_clip": 0.01103485, "auxiliary_loss_mlp": 0.01029991, "balance_loss_clip": 1.01764083, "balance_loss_mlp": 1.03745675, "epoch": 0.7759506989328122, "flos": 28365479660160.0, "grad_norm": 1.6667142072084327, "language_loss": 0.67251337, "learning_rate": 5.037535416626459e-07, "loss": 0.69384813, "num_input_tokens_seen": 278449925, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.66015625, "step": 12906, "time_per_iteration": 2.530207395553589 }, { "auxiliary_loss_clip": 0.01105456, "auxiliary_loss_mlp": 0.01028366, "balance_loss_clip": 1.0164628, "balance_loss_mlp": 1.03714585, "epoch": 0.7760108221854802, "flos": 14902785354240.0, "grad_norm": 1.8912292457875661, "language_loss": 0.81799287, "learning_rate": 5.034951389101498e-07, "loss": 0.83933109, "num_input_tokens_seen": 278467255, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 12907, "time_per_iteration": 2.4272260665893555 }, { "auxiliary_loss_clip": 0.01104591, "auxiliary_loss_mlp": 0.01030516, "balance_loss_clip": 1.01903617, "balance_loss_mlp": 1.03841686, "epoch": 0.7760709454381483, "flos": 14792503622400.0, "grad_norm": 2.7484875578223584, "language_loss": 0.67230195, "learning_rate": 5.032367929052685e-07, "loss": 0.69365299, "num_input_tokens_seen": 278484250, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.66015625, "step": 12908, "time_per_iteration": 2.430203676223755 }, { "auxiliary_loss_clip": 0.01109465, "auxiliary_loss_mlp": 0.01036501, "balance_loss_clip": 1.02410269, "balance_loss_mlp": 1.03889859, "epoch": 0.7761310686908162, "flos": 17379831156480.0, "grad_norm": 1.709067861479245, "language_loss": 0.70769656, "learning_rate": 5.029785036577976e-07, "loss": 0.7291562, "num_input_tokens_seen": 278502740, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 12909, "time_per_iteration": 2.4307515621185303 }, { "auxiliary_loss_clip": 0.01103772, "auxiliary_loss_mlp": 0.01034224, "balance_loss_clip": 1.02247608, "balance_loss_mlp": 1.0360018, "epoch": 0.7761911919434842, "flos": 25556547168000.0, "grad_norm": 2.1668355347178867, "language_loss": 0.67614043, "learning_rate": 5.027202711775324e-07, "loss": 0.69752038, "num_input_tokens_seen": 278523890, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6796875, "step": 12910, "time_per_iteration": 2.5062811374664307 }, { "auxiliary_loss_clip": 0.01108087, "auxiliary_loss_mlp": 0.01033848, "balance_loss_clip": 1.02265406, "balance_loss_mlp": 1.03904331, "epoch": 0.7762513151961521, "flos": 23179763203200.0, "grad_norm": 2.0512421522962456, "language_loss": 0.71729201, "learning_rate": 5.024620954742646e-07, "loss": 0.73871136, "num_input_tokens_seen": 278543185, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.69140625, "step": 12911, "time_per_iteration": 2.468538522720337 }, { "auxiliary_loss_clip": 0.0110997, "auxiliary_loss_mlp": 0.01033954, "balance_loss_clip": 1.02095985, "balance_loss_mlp": 1.03986681, "epoch": 0.7763114384488201, "flos": 21689614552320.0, "grad_norm": 3.1498648009264523, "language_loss": 0.63320017, "learning_rate": 5.022039765577836e-07, "loss": 0.65463942, "num_input_tokens_seen": 278559220, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 12912, "time_per_iteration": 2.462287425994873 }, { "auxiliary_loss_clip": 0.0103207, "auxiliary_loss_mlp": 0.01000704, "balance_loss_clip": 0.9995777, "balance_loss_mlp": 1.00955129, "epoch": 0.776371561701488, "flos": 69025554316800.0, "grad_norm": 0.772218259394208, "language_loss": 0.53252369, "learning_rate": 5.019459144378779e-07, "loss": 0.55285144, "num_input_tokens_seen": 278618185, "router_z_loss_clip": 0.0112915, "router_z_loss_mlp": 0.22558594, "step": 12913, "time_per_iteration": 3.1689982414245605 }, { "auxiliary_loss_clip": 0.011092, "auxiliary_loss_mlp": 0.01032587, "balance_loss_clip": 1.01984394, "balance_loss_mlp": 1.03930175, "epoch": 0.776431684954156, "flos": 22893914770560.0, "grad_norm": 1.7051551724656404, "language_loss": 0.62412512, "learning_rate": 5.016879091243338e-07, "loss": 0.64554298, "num_input_tokens_seen": 278636210, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 12914, "time_per_iteration": 2.4783499240875244 }, { "auxiliary_loss_clip": 0.01103643, "auxiliary_loss_mlp": 0.01035094, "balance_loss_clip": 1.0223918, "balance_loss_mlp": 1.03508735, "epoch": 0.776491808206824, "flos": 20261554560000.0, "grad_norm": 1.8047417657977576, "language_loss": 0.8221066, "learning_rate": 5.014299606269339e-07, "loss": 0.84349394, "num_input_tokens_seen": 278653305, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 12915, "time_per_iteration": 2.4263603687286377 }, { "auxiliary_loss_clip": 0.01107692, "auxiliary_loss_mlp": 0.01034847, "balance_loss_clip": 1.02151322, "balance_loss_mlp": 1.03569591, "epoch": 0.776551931459492, "flos": 26759051706240.0, "grad_norm": 1.765343634550467, "language_loss": 0.74604529, "learning_rate": 5.011720689554603e-07, "loss": 0.76747066, "num_input_tokens_seen": 278671850, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 12916, "time_per_iteration": 2.505028009414673 }, { "auxiliary_loss_clip": 0.01105683, "auxiliary_loss_mlp": 0.01031912, "balance_loss_clip": 1.01929927, "balance_loss_mlp": 1.03588855, "epoch": 0.7766120547121599, "flos": 52665080250240.0, "grad_norm": 1.7283405284858788, "language_loss": 0.65646505, "learning_rate": 5.009142341196919e-07, "loss": 0.67784095, "num_input_tokens_seen": 278697860, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 12917, "time_per_iteration": 2.7761003971099854 }, { "auxiliary_loss_clip": 0.01104274, "auxiliary_loss_mlp": 0.01031091, "balance_loss_clip": 1.01894331, "balance_loss_mlp": 1.03479779, "epoch": 0.7766721779648279, "flos": 25156215112320.0, "grad_norm": 1.5220201660832278, "language_loss": 0.64477813, "learning_rate": 5.006564561294065e-07, "loss": 0.66613173, "num_input_tokens_seen": 278720655, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6953125, "step": 12918, "time_per_iteration": 2.5128378868103027 }, { "auxiliary_loss_clip": 0.01104249, "auxiliary_loss_mlp": 0.01030926, "balance_loss_clip": 1.01876116, "balance_loss_mlp": 1.03621268, "epoch": 0.7767323012174958, "flos": 23760761690880.0, "grad_norm": 2.223637220585549, "language_loss": 0.73501003, "learning_rate": 5.003987349943777e-07, "loss": 0.75636172, "num_input_tokens_seen": 278737375, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6796875, "step": 12919, "time_per_iteration": 2.4491031169891357 }, { "auxiliary_loss_clip": 0.0110841, "auxiliary_loss_mlp": 0.01030966, "balance_loss_clip": 1.0180732, "balance_loss_mlp": 1.03855455, "epoch": 0.7767924244701638, "flos": 22086642556800.0, "grad_norm": 2.3868987547925706, "language_loss": 0.78949761, "learning_rate": 5.001410707243792e-07, "loss": 0.81089139, "num_input_tokens_seen": 278756510, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 12920, "time_per_iteration": 5.42138671875 }, { "auxiliary_loss_clip": 0.01107614, "auxiliary_loss_mlp": 0.01030347, "balance_loss_clip": 1.01775241, "balance_loss_mlp": 1.03797293, "epoch": 0.7768525477228319, "flos": 21981640124160.0, "grad_norm": 1.653258939921959, "language_loss": 0.70830965, "learning_rate": 4.998834633291829e-07, "loss": 0.72968918, "num_input_tokens_seen": 278775410, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 12921, "time_per_iteration": 2.501038074493408 }, { "auxiliary_loss_clip": 0.0110953, "auxiliary_loss_mlp": 0.01031863, "balance_loss_clip": 1.01814163, "balance_loss_mlp": 1.03742862, "epoch": 0.7769126709754998, "flos": 21794581071360.0, "grad_norm": 2.5717814668806662, "language_loss": 0.76003563, "learning_rate": 4.996259128185547e-07, "loss": 0.7814495, "num_input_tokens_seen": 278794260, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 12922, "time_per_iteration": 2.4693472385406494 }, { "auxiliary_loss_clip": 0.01108098, "auxiliary_loss_mlp": 0.01034847, "balance_loss_clip": 1.02202046, "balance_loss_mlp": 1.03846526, "epoch": 0.7769727942281678, "flos": 20047994248320.0, "grad_norm": 1.8389011171580087, "language_loss": 0.80536366, "learning_rate": 4.993684192022625e-07, "loss": 0.82679313, "num_input_tokens_seen": 278813290, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 12923, "time_per_iteration": 3.876081705093384 }, { "auxiliary_loss_clip": 0.01107876, "auxiliary_loss_mlp": 0.01034717, "balance_loss_clip": 1.02280164, "balance_loss_mlp": 1.03829455, "epoch": 0.7770329174808357, "flos": 21686777377920.0, "grad_norm": 1.9439326862629256, "language_loss": 0.92367142, "learning_rate": 4.991109824900699e-07, "loss": 0.94509733, "num_input_tokens_seen": 278830610, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 12924, "time_per_iteration": 2.493846893310547 }, { "auxiliary_loss_clip": 0.01104665, "auxiliary_loss_mlp": 0.01029816, "balance_loss_clip": 1.01692927, "balance_loss_mlp": 1.03510916, "epoch": 0.7770930407335037, "flos": 25849255098240.0, "grad_norm": 3.145914565709483, "language_loss": 0.66290587, "learning_rate": 4.988536026917401e-07, "loss": 0.68425071, "num_input_tokens_seen": 278849530, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 12925, "time_per_iteration": 3.9543933868408203 }, { "auxiliary_loss_clip": 0.01108014, "auxiliary_loss_mlp": 0.01037766, "balance_loss_clip": 1.02509427, "balance_loss_mlp": 1.03715539, "epoch": 0.7771531639861716, "flos": 24347865490560.0, "grad_norm": 2.607173196989499, "language_loss": 0.71635675, "learning_rate": 4.985962798170314e-07, "loss": 0.73781461, "num_input_tokens_seen": 278869005, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 12926, "time_per_iteration": 2.46482515335083 }, { "auxiliary_loss_clip": 0.01108742, "auxiliary_loss_mlp": 0.01028918, "balance_loss_clip": 1.01541805, "balance_loss_mlp": 1.03770113, "epoch": 0.7772132872388396, "flos": 25629948610560.0, "grad_norm": 3.0185386583536173, "language_loss": 0.65697116, "learning_rate": 4.983390138757027e-07, "loss": 0.67834777, "num_input_tokens_seen": 278888790, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 12927, "time_per_iteration": 2.5083489418029785 }, { "auxiliary_loss_clip": 0.0110886, "auxiliary_loss_mlp": 0.01035945, "balance_loss_clip": 1.02239096, "balance_loss_mlp": 1.03863835, "epoch": 0.7772734104915076, "flos": 26067412350720.0, "grad_norm": 1.8670359654307918, "language_loss": 0.72238809, "learning_rate": 4.980818048775093e-07, "loss": 0.74383616, "num_input_tokens_seen": 278908150, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.703125, "step": 12928, "time_per_iteration": 2.4809491634368896 }, { "auxiliary_loss_clip": 0.0110393, "auxiliary_loss_mlp": 0.01032321, "balance_loss_clip": 1.01926184, "balance_loss_mlp": 1.03556585, "epoch": 0.7773335337441756, "flos": 22925048883840.0, "grad_norm": 2.0832010997074435, "language_loss": 0.74259806, "learning_rate": 4.978246528322036e-07, "loss": 0.7639606, "num_input_tokens_seen": 278927425, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.68359375, "step": 12929, "time_per_iteration": 2.473421573638916 }, { "auxiliary_loss_clip": 0.01107004, "auxiliary_loss_mlp": 0.01032291, "balance_loss_clip": 1.01941657, "balance_loss_mlp": 1.0366205, "epoch": 0.7773936569968435, "flos": 20776765288320.0, "grad_norm": 1.9619411599225474, "language_loss": 0.7770623, "learning_rate": 4.975675577495377e-07, "loss": 0.79845524, "num_input_tokens_seen": 278946475, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 12930, "time_per_iteration": 2.4392592906951904 }, { "auxiliary_loss_clip": 0.01108788, "auxiliary_loss_mlp": 0.01032609, "balance_loss_clip": 1.01977658, "balance_loss_mlp": 1.03943241, "epoch": 0.7774537802495115, "flos": 20372267255040.0, "grad_norm": 2.5974274989833406, "language_loss": 0.79601693, "learning_rate": 4.973105196392613e-07, "loss": 0.81743085, "num_input_tokens_seen": 278964345, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 12931, "time_per_iteration": 2.429342269897461 }, { "auxiliary_loss_clip": 0.0103237, "auxiliary_loss_mlp": 0.01003287, "balance_loss_clip": 1.00220263, "balance_loss_mlp": 1.00975204, "epoch": 0.7775139035021794, "flos": 53912081738880.0, "grad_norm": 0.8076332472201005, "language_loss": 0.5976131, "learning_rate": 4.970535385111199e-07, "loss": 0.61796969, "num_input_tokens_seen": 279022380, "router_z_loss_clip": 0.01086426, "router_z_loss_mlp": 0.2265625, "step": 12932, "time_per_iteration": 3.0545976161956787 }, { "auxiliary_loss_clip": 0.01107816, "auxiliary_loss_mlp": 0.01032294, "balance_loss_clip": 1.01989627, "balance_loss_mlp": 1.03745556, "epoch": 0.7775740267548474, "flos": 28842481296000.0, "grad_norm": 1.570676812078601, "language_loss": 0.75894856, "learning_rate": 4.967966143748595e-07, "loss": 0.78034967, "num_input_tokens_seen": 279044275, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 12933, "time_per_iteration": 2.545008897781372 }, { "auxiliary_loss_clip": 0.01108184, "auxiliary_loss_mlp": 0.01034645, "balance_loss_clip": 1.02135301, "balance_loss_mlp": 1.03792691, "epoch": 0.7776341500075155, "flos": 21872471713920.0, "grad_norm": 3.0433713138649803, "language_loss": 0.73230946, "learning_rate": 4.965397472402215e-07, "loss": 0.75373775, "num_input_tokens_seen": 279063375, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 12934, "time_per_iteration": 2.462127685546875 }, { "auxiliary_loss_clip": 0.01108005, "auxiliary_loss_mlp": 0.01027889, "balance_loss_clip": 1.01487732, "balance_loss_mlp": 1.03797436, "epoch": 0.7776942732601834, "flos": 20229845829120.0, "grad_norm": 2.001382108548064, "language_loss": 0.69889766, "learning_rate": 4.962829371169475e-07, "loss": 0.72025663, "num_input_tokens_seen": 279082680, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 12935, "time_per_iteration": 2.4583587646484375 }, { "auxiliary_loss_clip": 0.01109088, "auxiliary_loss_mlp": 0.01038702, "balance_loss_clip": 1.02541566, "balance_loss_mlp": 1.03827369, "epoch": 0.7777543965128514, "flos": 22231829329920.0, "grad_norm": 1.723143437457413, "language_loss": 0.8362931, "learning_rate": 4.960261840147746e-07, "loss": 0.85777092, "num_input_tokens_seen": 279099805, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 12936, "time_per_iteration": 2.454603672027588 }, { "auxiliary_loss_clip": 0.01110742, "auxiliary_loss_mlp": 0.01029812, "balance_loss_clip": 1.01761723, "balance_loss_mlp": 1.03800011, "epoch": 0.7778145197655193, "flos": 14501950508160.0, "grad_norm": 2.4634623389226653, "language_loss": 0.67662382, "learning_rate": 4.957694879434397e-07, "loss": 0.69802928, "num_input_tokens_seen": 279117975, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7265625, "step": 12937, "time_per_iteration": 2.4774985313415527 }, { "auxiliary_loss_clip": 0.01107465, "auxiliary_loss_mlp": 0.01030456, "balance_loss_clip": 1.01762891, "balance_loss_mlp": 1.03674173, "epoch": 0.7778746430181873, "flos": 21140288881920.0, "grad_norm": 1.6359601911805894, "language_loss": 0.87085676, "learning_rate": 4.955128489126777e-07, "loss": 0.89223599, "num_input_tokens_seen": 279137255, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 12938, "time_per_iteration": 2.4597384929656982 }, { "auxiliary_loss_clip": 0.01108446, "auxiliary_loss_mlp": 0.01029266, "balance_loss_clip": 1.01608801, "balance_loss_mlp": 1.03815949, "epoch": 0.7779347662708552, "flos": 20266366982400.0, "grad_norm": 2.3928306169329914, "language_loss": 0.85078686, "learning_rate": 4.95256266932218e-07, "loss": 0.87216401, "num_input_tokens_seen": 279154500, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 12939, "time_per_iteration": 2.4761769771575928 }, { "auxiliary_loss_clip": 0.0110337, "auxiliary_loss_mlp": 0.01034595, "balance_loss_clip": 1.02228034, "balance_loss_mlp": 1.03654253, "epoch": 0.7779948895235232, "flos": 19209013303680.0, "grad_norm": 2.0432581979725404, "language_loss": 0.69078362, "learning_rate": 4.949997420117915e-07, "loss": 0.71216321, "num_input_tokens_seen": 279173635, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.66796875, "step": 12940, "time_per_iteration": 2.451003074645996 }, { "auxiliary_loss_clip": 0.01107124, "auxiliary_loss_mlp": 0.01029667, "balance_loss_clip": 1.01819897, "balance_loss_mlp": 1.0365696, "epoch": 0.7780550127761912, "flos": 23914711382400.0, "grad_norm": 1.7304143407571317, "language_loss": 0.7808314, "learning_rate": 4.947432741611255e-07, "loss": 0.80219936, "num_input_tokens_seen": 279194430, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.70703125, "step": 12941, "time_per_iteration": 2.509615898132324 }, { "auxiliary_loss_clip": 0.0111162, "auxiliary_loss_mlp": 0.01036972, "balance_loss_clip": 1.02308369, "balance_loss_mlp": 1.0380044, "epoch": 0.7781151360288592, "flos": 32415951795840.0, "grad_norm": 3.1677163853921395, "language_loss": 0.73228127, "learning_rate": 4.944868633899462e-07, "loss": 0.75376713, "num_input_tokens_seen": 279212920, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 12942, "time_per_iteration": 2.582688808441162 }, { "auxiliary_loss_clip": 0.01103698, "auxiliary_loss_mlp": 0.01033804, "balance_loss_clip": 1.02144206, "balance_loss_mlp": 1.03584039, "epoch": 0.7781752592815271, "flos": 22346384780160.0, "grad_norm": 2.7949595235113533, "language_loss": 0.68046862, "learning_rate": 4.942305097079751e-07, "loss": 0.70184368, "num_input_tokens_seen": 279232310, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6796875, "step": 12943, "time_per_iteration": 2.4961092472076416 }, { "auxiliary_loss_clip": 0.01031711, "auxiliary_loss_mlp": 0.01000755, "balance_loss_clip": 0.99961704, "balance_loss_mlp": 1.00925314, "epoch": 0.7782353825341951, "flos": 70460183520000.0, "grad_norm": 0.7782206880513943, "language_loss": 0.58499575, "learning_rate": 4.939742131249347e-07, "loss": 0.60532039, "num_input_tokens_seen": 279295375, "router_z_loss_clip": 0.01141357, "router_z_loss_mlp": 0.22460938, "step": 12944, "time_per_iteration": 3.2450551986694336 }, { "auxiliary_loss_clip": 0.01108929, "auxiliary_loss_mlp": 0.01035958, "balance_loss_clip": 1.02201653, "balance_loss_mlp": 1.03722, "epoch": 0.778295505786863, "flos": 19062569554560.0, "grad_norm": 1.878999795124873, "language_loss": 0.67538691, "learning_rate": 4.937179736505428e-07, "loss": 0.69683576, "num_input_tokens_seen": 279313660, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71875, "step": 12945, "time_per_iteration": 2.441951036453247 }, { "auxiliary_loss_clip": 0.01108686, "auxiliary_loss_mlp": 0.01036499, "balance_loss_clip": 1.02357721, "balance_loss_mlp": 1.03873014, "epoch": 0.778355629039531, "flos": 20999734963200.0, "grad_norm": 1.9459794049645183, "language_loss": 0.69278598, "learning_rate": 4.93461791294516e-07, "loss": 0.71423781, "num_input_tokens_seen": 279334495, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 12946, "time_per_iteration": 2.4817440509796143 }, { "auxiliary_loss_clip": 0.01107315, "auxiliary_loss_mlp": 0.01031267, "balance_loss_clip": 1.01795745, "balance_loss_mlp": 1.03756189, "epoch": 0.7784157522921991, "flos": 21398091770880.0, "grad_norm": 1.7375356196501293, "language_loss": 0.65342277, "learning_rate": 4.932056660665689e-07, "loss": 0.67480862, "num_input_tokens_seen": 279352985, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69921875, "step": 12947, "time_per_iteration": 2.4725828170776367 }, { "auxiliary_loss_clip": 0.0110849, "auxiliary_loss_mlp": 0.01035873, "balance_loss_clip": 1.02275395, "balance_loss_mlp": 1.03828144, "epoch": 0.778475875544867, "flos": 20813861059200.0, "grad_norm": 1.907796720428581, "language_loss": 0.65569818, "learning_rate": 4.929495979764147e-07, "loss": 0.67714179, "num_input_tokens_seen": 279371360, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 12948, "time_per_iteration": 2.4644761085510254 }, { "auxiliary_loss_clip": 0.01108014, "auxiliary_loss_mlp": 0.01032185, "balance_loss_clip": 1.01964426, "balance_loss_mlp": 1.03817391, "epoch": 0.778535998797535, "flos": 14355363104640.0, "grad_norm": 1.838199690886016, "language_loss": 0.75083148, "learning_rate": 4.926935870337625e-07, "loss": 0.77223349, "num_input_tokens_seen": 279389400, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 12949, "time_per_iteration": 2.441122055053711 }, { "auxiliary_loss_clip": 0.01111678, "auxiliary_loss_mlp": 0.0103199, "balance_loss_clip": 1.01880515, "balance_loss_mlp": 1.0393815, "epoch": 0.7785961220502029, "flos": 19209552007680.0, "grad_norm": 1.436046441272537, "language_loss": 0.69174874, "learning_rate": 4.924376332483202e-07, "loss": 0.71318537, "num_input_tokens_seen": 279409715, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 12950, "time_per_iteration": 2.4404382705688477 }, { "auxiliary_loss_clip": 0.01107763, "auxiliary_loss_mlp": 0.01028295, "balance_loss_clip": 1.01533675, "balance_loss_mlp": 1.03694129, "epoch": 0.7786562453028709, "flos": 25738757884800.0, "grad_norm": 2.2255389586247167, "language_loss": 0.71816683, "learning_rate": 4.921817366297938e-07, "loss": 0.73952734, "num_input_tokens_seen": 279427705, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 12951, "time_per_iteration": 2.503326654434204 }, { "auxiliary_loss_clip": 0.01105013, "auxiliary_loss_mlp": 0.01031072, "balance_loss_clip": 1.01879895, "balance_loss_mlp": 1.03772449, "epoch": 0.7787163685555388, "flos": 25739440243200.0, "grad_norm": 1.7119015239603694, "language_loss": 0.65521896, "learning_rate": 4.919258971878877e-07, "loss": 0.67657977, "num_input_tokens_seen": 279448215, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.671875, "step": 12952, "time_per_iteration": 2.4862122535705566 }, { "auxiliary_loss_clip": 0.01100372, "auxiliary_loss_mlp": 0.01027986, "balance_loss_clip": 1.01592183, "balance_loss_mlp": 1.03484726, "epoch": 0.7787764918082068, "flos": 22747722416640.0, "grad_norm": 1.634486188256196, "language_loss": 0.81672925, "learning_rate": 4.916701149323022e-07, "loss": 0.83801287, "num_input_tokens_seen": 279466260, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.65625, "step": 12953, "time_per_iteration": 2.4793057441711426 }, { "auxiliary_loss_clip": 0.01112084, "auxiliary_loss_mlp": 0.0103411, "balance_loss_clip": 1.02103853, "balance_loss_mlp": 1.0400213, "epoch": 0.7788366150608748, "flos": 15190860430080.0, "grad_norm": 2.1946829011849602, "language_loss": 0.76846957, "learning_rate": 4.91414389872737e-07, "loss": 0.78993148, "num_input_tokens_seen": 279484520, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 12954, "time_per_iteration": 2.4363760948181152 }, { "auxiliary_loss_clip": 0.01108955, "auxiliary_loss_mlp": 0.01027917, "balance_loss_clip": 1.01575208, "balance_loss_mlp": 1.03744602, "epoch": 0.7788967383135428, "flos": 21210242618880.0, "grad_norm": 1.5113429767542284, "language_loss": 0.72846121, "learning_rate": 4.911587220188905e-07, "loss": 0.74982995, "num_input_tokens_seen": 279503130, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.71484375, "step": 12955, "time_per_iteration": 2.468728542327881 }, { "auxiliary_loss_clip": 0.01107068, "auxiliary_loss_mlp": 0.0103294, "balance_loss_clip": 1.02023804, "balance_loss_mlp": 1.03594899, "epoch": 0.7789568615662107, "flos": 21682970536320.0, "grad_norm": 1.4722922912171021, "language_loss": 0.68908751, "learning_rate": 4.909031113804551e-07, "loss": 0.7104876, "num_input_tokens_seen": 279521930, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 12956, "time_per_iteration": 2.505831480026245 }, { "auxiliary_loss_clip": 0.01106935, "auxiliary_loss_mlp": 0.01031224, "balance_loss_clip": 1.01890349, "balance_loss_mlp": 1.03760147, "epoch": 0.7790169848188787, "flos": 26360371676160.0, "grad_norm": 1.693283865143, "language_loss": 0.76021218, "learning_rate": 4.906475579671252e-07, "loss": 0.78159374, "num_input_tokens_seen": 279542375, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 12957, "time_per_iteration": 2.5087125301361084 }, { "auxiliary_loss_clip": 0.01105962, "auxiliary_loss_mlp": 0.01030326, "balance_loss_clip": 1.01747561, "balance_loss_mlp": 1.0365144, "epoch": 0.7790771080715466, "flos": 25516183259520.0, "grad_norm": 1.734637531973742, "language_loss": 0.77398717, "learning_rate": 4.903920617885917e-07, "loss": 0.79535002, "num_input_tokens_seen": 279561885, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 12958, "time_per_iteration": 2.4800214767456055 }, { "auxiliary_loss_clip": 0.01108827, "auxiliary_loss_mlp": 0.01037105, "balance_loss_clip": 1.02405119, "balance_loss_mlp": 1.03790951, "epoch": 0.7791372313242146, "flos": 16034186920320.0, "grad_norm": 2.2215642227507546, "language_loss": 0.71410739, "learning_rate": 4.901366228545418e-07, "loss": 0.73556674, "num_input_tokens_seen": 279579965, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 12959, "time_per_iteration": 2.4367809295654297 }, { "auxiliary_loss_clip": 0.01106063, "auxiliary_loss_mlp": 0.01036955, "balance_loss_clip": 1.02433646, "balance_loss_mlp": 1.03649747, "epoch": 0.7791973545768827, "flos": 23842207779840.0, "grad_norm": 1.7547584842936284, "language_loss": 0.77910888, "learning_rate": 4.898812411746632e-07, "loss": 0.80053908, "num_input_tokens_seen": 279599030, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 12960, "time_per_iteration": 2.5141537189483643 }, { "auxiliary_loss_clip": 0.01108626, "auxiliary_loss_mlp": 0.01038045, "balance_loss_clip": 1.02484202, "balance_loss_mlp": 1.03783166, "epoch": 0.7792574778295506, "flos": 24168384207360.0, "grad_norm": 2.831652886734351, "language_loss": 0.7539261, "learning_rate": 4.896259167586385e-07, "loss": 0.77539283, "num_input_tokens_seen": 279614400, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 12961, "time_per_iteration": 3.9282522201538086 }, { "auxiliary_loss_clip": 0.01104405, "auxiliary_loss_mlp": 0.01037157, "balance_loss_clip": 1.02496171, "balance_loss_mlp": 1.03874946, "epoch": 0.7793176010822186, "flos": 21464921024640.0, "grad_norm": 1.7299918280643942, "language_loss": 0.73745024, "learning_rate": 4.893706496161511e-07, "loss": 0.75886583, "num_input_tokens_seen": 279633745, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.65625, "step": 12962, "time_per_iteration": 3.840322494506836 }, { "auxiliary_loss_clip": 0.01106526, "auxiliary_loss_mlp": 0.01027958, "balance_loss_clip": 1.01571488, "balance_loss_mlp": 1.03811991, "epoch": 0.7793777243348865, "flos": 20666699038080.0, "grad_norm": 1.8691489658914495, "language_loss": 0.69719553, "learning_rate": 4.891154397568795e-07, "loss": 0.71854031, "num_input_tokens_seen": 279651165, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.68359375, "step": 12963, "time_per_iteration": 2.4420113563537598 }, { "auxiliary_loss_clip": 0.01107283, "auxiliary_loss_mlp": 0.01033325, "balance_loss_clip": 1.02080822, "balance_loss_mlp": 1.03910661, "epoch": 0.7794378475875545, "flos": 27125771610240.0, "grad_norm": 1.7980516919616765, "language_loss": 0.63639522, "learning_rate": 4.888602871905019e-07, "loss": 0.65780127, "num_input_tokens_seen": 279671175, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 12964, "time_per_iteration": 3.894805431365967 }, { "auxiliary_loss_clip": 0.01107628, "auxiliary_loss_mlp": 0.01031995, "balance_loss_clip": 1.019979, "balance_loss_mlp": 1.03670502, "epoch": 0.7794979708402224, "flos": 28074136446720.0, "grad_norm": 1.9084152962332692, "language_loss": 0.76462102, "learning_rate": 4.88605191926694e-07, "loss": 0.78601724, "num_input_tokens_seen": 279688675, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 12965, "time_per_iteration": 2.4923806190490723 }, { "auxiliary_loss_clip": 0.01099419, "auxiliary_loss_mlp": 0.01029945, "balance_loss_clip": 1.01833403, "balance_loss_mlp": 1.03481817, "epoch": 0.7795580940928905, "flos": 26869548919680.0, "grad_norm": 1.8910379673694928, "language_loss": 0.72964579, "learning_rate": 4.883501539751289e-07, "loss": 0.75093937, "num_input_tokens_seen": 279710245, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6484375, "step": 12966, "time_per_iteration": 2.532689094543457 }, { "auxiliary_loss_clip": 0.01105851, "auxiliary_loss_mlp": 0.01026988, "balance_loss_clip": 1.01599669, "balance_loss_mlp": 1.03896773, "epoch": 0.7796182173455584, "flos": 23835384195840.0, "grad_norm": 2.67108400458196, "language_loss": 0.74371588, "learning_rate": 4.880951733454768e-07, "loss": 0.76504427, "num_input_tokens_seen": 279729045, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.66796875, "step": 12967, "time_per_iteration": 3.91160249710083 }, { "auxiliary_loss_clip": 0.01108523, "auxiliary_loss_mlp": 0.01029233, "balance_loss_clip": 1.01590586, "balance_loss_mlp": 1.03817368, "epoch": 0.7796783405982264, "flos": 19792238434560.0, "grad_norm": 12.776596330399714, "language_loss": 0.72491705, "learning_rate": 4.878402500474073e-07, "loss": 0.74629462, "num_input_tokens_seen": 279748350, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 12968, "time_per_iteration": 2.471898317337036 }, { "auxiliary_loss_clip": 0.01106708, "auxiliary_loss_mlp": 0.01035348, "balance_loss_clip": 1.02223492, "balance_loss_mlp": 1.03800726, "epoch": 0.7797384638508943, "flos": 15450207603840.0, "grad_norm": 2.3283817497148496, "language_loss": 0.61058211, "learning_rate": 4.875853840905874e-07, "loss": 0.63200259, "num_input_tokens_seen": 279765620, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 12969, "time_per_iteration": 2.4443349838256836 }, { "auxiliary_loss_clip": 0.0110149, "auxiliary_loss_mlp": 0.01032817, "balance_loss_clip": 1.02087784, "balance_loss_mlp": 1.0356245, "epoch": 0.7797985871035623, "flos": 20922742160640.0, "grad_norm": 1.6434099417923538, "language_loss": 0.70254147, "learning_rate": 4.873305754846811e-07, "loss": 0.72388458, "num_input_tokens_seen": 279782485, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.65625, "step": 12970, "time_per_iteration": 2.467750310897827 }, { "auxiliary_loss_clip": 0.01109104, "auxiliary_loss_mlp": 0.01031531, "balance_loss_clip": 1.0181973, "balance_loss_mlp": 1.03932524, "epoch": 0.7798587103562302, "flos": 36937212514560.0, "grad_norm": 1.5757129236960223, "language_loss": 0.72306114, "learning_rate": 4.870758242393507e-07, "loss": 0.7444675, "num_input_tokens_seen": 279804170, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6953125, "step": 12971, "time_per_iteration": 2.6176164150238037 }, { "auxiliary_loss_clip": 0.01110899, "auxiliary_loss_mlp": 0.01028928, "balance_loss_clip": 1.01618993, "balance_loss_mlp": 1.03776193, "epoch": 0.7799188336088982, "flos": 22419283432320.0, "grad_norm": 2.2689332100948256, "language_loss": 0.74430966, "learning_rate": 4.868211303642578e-07, "loss": 0.76570797, "num_input_tokens_seen": 279823730, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73046875, "step": 12972, "time_per_iteration": 2.482743501663208 }, { "auxiliary_loss_clip": 0.01106029, "auxiliary_loss_mlp": 0.01025785, "balance_loss_clip": 1.0125227, "balance_loss_mlp": 1.03620136, "epoch": 0.7799789568615663, "flos": 18880466578560.0, "grad_norm": 4.99071960146454, "language_loss": 0.7169565, "learning_rate": 4.865664938690584e-07, "loss": 0.73827463, "num_input_tokens_seen": 279843035, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69921875, "step": 12973, "time_per_iteration": 2.47808575630188 }, { "auxiliary_loss_clip": 0.01104984, "auxiliary_loss_mlp": 0.01033092, "balance_loss_clip": 1.02120113, "balance_loss_mlp": 1.03679562, "epoch": 0.7800390801142342, "flos": 20262272832000.0, "grad_norm": 1.7694811263243728, "language_loss": 0.77562535, "learning_rate": 4.863119147634089e-07, "loss": 0.79700613, "num_input_tokens_seen": 279861450, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 12974, "time_per_iteration": 2.478102207183838 }, { "auxiliary_loss_clip": 0.01104413, "auxiliary_loss_mlp": 0.01029745, "balance_loss_clip": 1.01712108, "balance_loss_mlp": 1.03669715, "epoch": 0.7800992033669022, "flos": 16690310703360.0, "grad_norm": 1.6028618206436367, "language_loss": 0.69232601, "learning_rate": 4.86057393056964e-07, "loss": 0.71366763, "num_input_tokens_seen": 279878660, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 12975, "time_per_iteration": 2.447016954421997 }, { "auxiliary_loss_clip": 0.01104105, "auxiliary_loss_mlp": 0.01031775, "balance_loss_clip": 1.01965761, "balance_loss_mlp": 1.0368768, "epoch": 0.7801593266195701, "flos": 18585208782720.0, "grad_norm": 2.1098297360827405, "language_loss": 0.82050252, "learning_rate": 4.858029287593739e-07, "loss": 0.84186125, "num_input_tokens_seen": 279895685, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 12976, "time_per_iteration": 2.453131675720215 }, { "auxiliary_loss_clip": 0.01108287, "auxiliary_loss_mlp": 0.01027735, "balance_loss_clip": 1.0147593, "balance_loss_mlp": 1.03688908, "epoch": 0.7802194498722381, "flos": 25484941405440.0, "grad_norm": 1.4859366814409698, "language_loss": 0.65421772, "learning_rate": 4.85548521880289e-07, "loss": 0.675578, "num_input_tokens_seen": 279917240, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 12977, "time_per_iteration": 2.5096347332000732 }, { "auxiliary_loss_clip": 0.01104815, "auxiliary_loss_mlp": 0.01028126, "balance_loss_clip": 1.01637793, "balance_loss_mlp": 1.03709543, "epoch": 0.780279573124906, "flos": 31176315573120.0, "grad_norm": 1.53603376993084, "language_loss": 0.74786007, "learning_rate": 4.852941724293554e-07, "loss": 0.76918948, "num_input_tokens_seen": 279938665, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.67578125, "step": 12978, "time_per_iteration": 2.620788335800171 }, { "auxiliary_loss_clip": 0.0110806, "auxiliary_loss_mlp": 0.01032803, "balance_loss_clip": 1.01972032, "balance_loss_mlp": 1.03646481, "epoch": 0.780339696377574, "flos": 26944027770240.0, "grad_norm": 3.3865996208638296, "language_loss": 0.62139535, "learning_rate": 4.85039880416219e-07, "loss": 0.64280397, "num_input_tokens_seen": 279957965, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 12979, "time_per_iteration": 2.5079784393310547 }, { "auxiliary_loss_clip": 0.01106142, "auxiliary_loss_mlp": 0.01027757, "balance_loss_clip": 1.01497126, "balance_loss_mlp": 1.03735936, "epoch": 0.780399819630242, "flos": 27957426180480.0, "grad_norm": 2.0354427635306567, "language_loss": 0.77434075, "learning_rate": 4.847856458505217e-07, "loss": 0.79567969, "num_input_tokens_seen": 279977490, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 12980, "time_per_iteration": 2.522041082382202 }, { "auxiliary_loss_clip": 0.01108739, "auxiliary_loss_mlp": 0.01030414, "balance_loss_clip": 1.01821256, "balance_loss_mlp": 1.0377028, "epoch": 0.78045994288291, "flos": 22486795044480.0, "grad_norm": 2.3772444963969037, "language_loss": 0.77667499, "learning_rate": 4.845314687419046e-07, "loss": 0.7980665, "num_input_tokens_seen": 279994220, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 12981, "time_per_iteration": 2.4626898765563965 }, { "auxiliary_loss_clip": 0.01110472, "auxiliary_loss_mlp": 0.01034971, "balance_loss_clip": 1.02259064, "balance_loss_mlp": 1.03953934, "epoch": 0.7805200661355779, "flos": 20850849089280.0, "grad_norm": 1.7535525944700596, "language_loss": 0.72552872, "learning_rate": 4.842773491000067e-07, "loss": 0.74698317, "num_input_tokens_seen": 280012590, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 12982, "time_per_iteration": 2.516242504119873 }, { "auxiliary_loss_clip": 0.01103509, "auxiliary_loss_mlp": 0.01030219, "balance_loss_clip": 1.01841724, "balance_loss_mlp": 1.0336411, "epoch": 0.7805801893882459, "flos": 25665966973440.0, "grad_norm": 1.4596076068095716, "language_loss": 0.73456109, "learning_rate": 4.840232869344636e-07, "loss": 0.75589836, "num_input_tokens_seen": 280033700, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69921875, "step": 12983, "time_per_iteration": 2.4985692501068115 }, { "auxiliary_loss_clip": 0.01106879, "auxiliary_loss_mlp": 0.01030156, "balance_loss_clip": 1.01798499, "balance_loss_mlp": 1.03804088, "epoch": 0.7806403126409138, "flos": 11327806483200.0, "grad_norm": 1.8034510964308925, "language_loss": 0.75038356, "learning_rate": 4.837692822549086e-07, "loss": 0.77175391, "num_input_tokens_seen": 280052215, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 12984, "time_per_iteration": 2.4482431411743164 }, { "auxiliary_loss_clip": 0.0110473, "auxiliary_loss_mlp": 0.01030923, "balance_loss_clip": 1.01962161, "balance_loss_mlp": 1.0365032, "epoch": 0.7807004358935818, "flos": 19573362910080.0, "grad_norm": 2.1440155232409843, "language_loss": 0.81228399, "learning_rate": 4.835153350709746e-07, "loss": 0.83364058, "num_input_tokens_seen": 280070525, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6796875, "step": 12985, "time_per_iteration": 2.4547362327575684 }, { "auxiliary_loss_clip": 0.01106518, "auxiliary_loss_mlp": 0.01030968, "balance_loss_clip": 1.01857626, "balance_loss_mlp": 1.03801441, "epoch": 0.7807605591462499, "flos": 19135827342720.0, "grad_norm": 1.7481081108081715, "language_loss": 0.76931673, "learning_rate": 4.832614453922915e-07, "loss": 0.79069161, "num_input_tokens_seen": 280089855, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 12986, "time_per_iteration": 2.480186700820923 }, { "auxiliary_loss_clip": 0.01107157, "auxiliary_loss_mlp": 0.01034123, "balance_loss_clip": 1.02114081, "balance_loss_mlp": 1.03793108, "epoch": 0.7808206823989178, "flos": 32374654133760.0, "grad_norm": 1.8653032258246436, "language_loss": 0.74121225, "learning_rate": 4.830076132284859e-07, "loss": 0.76262504, "num_input_tokens_seen": 280109960, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69140625, "step": 12987, "time_per_iteration": 2.5697526931762695 }, { "auxiliary_loss_clip": 0.01031397, "auxiliary_loss_mlp": 0.01004466, "balance_loss_clip": 1.0034411, "balance_loss_mlp": 1.00877094, "epoch": 0.7808808056515858, "flos": 55050235061760.0, "grad_norm": 0.7552380533565973, "language_loss": 0.55052817, "learning_rate": 4.82753838589184e-07, "loss": 0.57088673, "num_input_tokens_seen": 280169805, "router_z_loss_clip": 0.01025391, "router_z_loss_mlp": 0.2265625, "step": 12988, "time_per_iteration": 3.109443187713623 }, { "auxiliary_loss_clip": 0.01103915, "auxiliary_loss_mlp": 0.01034943, "balance_loss_clip": 1.02308202, "balance_loss_mlp": 1.0376811, "epoch": 0.7809409289042537, "flos": 12859468277760.0, "grad_norm": 2.3959130055006566, "language_loss": 0.80543625, "learning_rate": 4.82500121484009e-07, "loss": 0.82682484, "num_input_tokens_seen": 280184630, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6640625, "step": 12989, "time_per_iteration": 2.4216556549072266 }, { "auxiliary_loss_clip": 0.01102096, "auxiliary_loss_mlp": 0.01028557, "balance_loss_clip": 1.01636147, "balance_loss_mlp": 1.0349915, "epoch": 0.7810010521569217, "flos": 21687244254720.0, "grad_norm": 1.5962626346930595, "language_loss": 0.70542908, "learning_rate": 4.822464619225806e-07, "loss": 0.72673559, "num_input_tokens_seen": 280203880, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 12990, "time_per_iteration": 2.484811305999756 }, { "auxiliary_loss_clip": 0.01106807, "auxiliary_loss_mlp": 0.01029831, "balance_loss_clip": 1.01653314, "balance_loss_mlp": 1.03731155, "epoch": 0.7810611754095896, "flos": 16757068129920.0, "grad_norm": 2.0148098108134938, "language_loss": 0.77562726, "learning_rate": 4.819928599145184e-07, "loss": 0.79699361, "num_input_tokens_seen": 280220460, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 12991, "time_per_iteration": 2.4357728958129883 }, { "auxiliary_loss_clip": 0.01105639, "auxiliary_loss_mlp": 0.01038126, "balance_loss_clip": 1.02514362, "balance_loss_mlp": 1.03636408, "epoch": 0.7811212986622577, "flos": 43507464658560.0, "grad_norm": 3.440449388717315, "language_loss": 0.65902376, "learning_rate": 4.817393154694398e-07, "loss": 0.68046147, "num_input_tokens_seen": 280242680, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 12992, "time_per_iteration": 2.709676504135132 }, { "auxiliary_loss_clip": 0.01106868, "auxiliary_loss_mlp": 0.01027788, "balance_loss_clip": 1.01562905, "balance_loss_mlp": 1.03701162, "epoch": 0.7811814219149256, "flos": 21757700782080.0, "grad_norm": 1.9541202944091038, "language_loss": 0.61962622, "learning_rate": 4.814858285969578e-07, "loss": 0.64097285, "num_input_tokens_seen": 280260655, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69921875, "step": 12993, "time_per_iteration": 2.4925429821014404 }, { "auxiliary_loss_clip": 0.01105877, "auxiliary_loss_mlp": 0.01030713, "balance_loss_clip": 1.01814795, "balance_loss_mlp": 1.03740454, "epoch": 0.7812415451675936, "flos": 24061514267520.0, "grad_norm": 1.4821606909948801, "language_loss": 0.6859628, "learning_rate": 4.812323993066862e-07, "loss": 0.70732868, "num_input_tokens_seen": 280281185, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 12994, "time_per_iteration": 2.516221284866333 }, { "auxiliary_loss_clip": 0.01103584, "auxiliary_loss_mlp": 0.01026409, "balance_loss_clip": 1.01466727, "balance_loss_mlp": 1.03592777, "epoch": 0.7813016684202615, "flos": 18989706816000.0, "grad_norm": 2.002780619800685, "language_loss": 0.69195986, "learning_rate": 4.809790276082335e-07, "loss": 0.71325982, "num_input_tokens_seen": 280298255, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.67578125, "step": 12995, "time_per_iteration": 2.445079803466797 }, { "auxiliary_loss_clip": 0.01103062, "auxiliary_loss_mlp": 0.01025445, "balance_loss_clip": 1.01392365, "balance_loss_mlp": 1.03623116, "epoch": 0.7813617916729295, "flos": 25260786581760.0, "grad_norm": 1.7365782128068201, "language_loss": 0.75222915, "learning_rate": 4.807257135112088e-07, "loss": 0.77351427, "num_input_tokens_seen": 280319000, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 12996, "time_per_iteration": 2.528482675552368 }, { "auxiliary_loss_clip": 0.01109336, "auxiliary_loss_mlp": 0.01030868, "balance_loss_clip": 1.01830947, "balance_loss_mlp": 1.03730297, "epoch": 0.7814219149255974, "flos": 17966037116160.0, "grad_norm": 2.5901444642804834, "language_loss": 0.67666185, "learning_rate": 4.804724570252167e-07, "loss": 0.69806385, "num_input_tokens_seen": 280336375, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 12997, "time_per_iteration": 2.436062812805176 }, { "auxiliary_loss_clip": 0.01108665, "auxiliary_loss_mlp": 0.01032625, "balance_loss_clip": 1.01942873, "balance_loss_mlp": 1.03662896, "epoch": 0.7814820381782654, "flos": 25776176878080.0, "grad_norm": 1.8458786060066423, "language_loss": 0.82266045, "learning_rate": 4.802192581598614e-07, "loss": 0.84407341, "num_input_tokens_seen": 280358760, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 12998, "time_per_iteration": 2.5724599361419678 }, { "auxiliary_loss_clip": 0.01105313, "auxiliary_loss_mlp": 0.01033603, "balance_loss_clip": 1.02086544, "balance_loss_mlp": 1.03561985, "epoch": 0.7815421614309335, "flos": 20519572930560.0, "grad_norm": 6.007347241096432, "language_loss": 0.74767339, "learning_rate": 4.799661169247453e-07, "loss": 0.76906252, "num_input_tokens_seen": 280377085, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 12999, "time_per_iteration": 2.4679834842681885 }, { "auxiliary_loss_clip": 0.01107968, "auxiliary_loss_mlp": 0.01037067, "balance_loss_clip": 1.02434182, "balance_loss_mlp": 1.03761578, "epoch": 0.7816022846836014, "flos": 21287666384640.0, "grad_norm": 1.532971434901334, "language_loss": 0.84622258, "learning_rate": 4.797130333294652e-07, "loss": 0.86767292, "num_input_tokens_seen": 280395465, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 13000, "time_per_iteration": 2.464944839477539 }, { "auxiliary_loss_clip": 0.01108556, "auxiliary_loss_mlp": 0.01032007, "balance_loss_clip": 1.01931763, "balance_loss_mlp": 1.03813672, "epoch": 0.7816624079362694, "flos": 19208402772480.0, "grad_norm": 1.998712589489018, "language_loss": 0.66140997, "learning_rate": 4.794600073836192e-07, "loss": 0.68281561, "num_input_tokens_seen": 280412775, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 13001, "time_per_iteration": 2.4689159393310547 }, { "auxiliary_loss_clip": 0.01106612, "auxiliary_loss_mlp": 0.01031604, "balance_loss_clip": 1.0196774, "balance_loss_mlp": 1.03690171, "epoch": 0.7817225311889373, "flos": 26104687689600.0, "grad_norm": 1.5107584747750091, "language_loss": 0.6702224, "learning_rate": 4.792070390968027e-07, "loss": 0.69160455, "num_input_tokens_seen": 280432905, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 13002, "time_per_iteration": 2.5197951793670654 }, { "auxiliary_loss_clip": 0.01111341, "auxiliary_loss_mlp": 0.01033819, "balance_loss_clip": 1.0207243, "balance_loss_mlp": 1.04073095, "epoch": 0.7817826544416053, "flos": 21250929749760.0, "grad_norm": 2.2245060343797793, "language_loss": 0.73348451, "learning_rate": 4.78954128478607e-07, "loss": 0.7549361, "num_input_tokens_seen": 280450785, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 13003, "time_per_iteration": 5.305740118026733 }, { "auxiliary_loss_clip": 0.01109304, "auxiliary_loss_mlp": 0.01033533, "balance_loss_clip": 1.02147508, "balance_loss_mlp": 1.03955734, "epoch": 0.7818427776942732, "flos": 19932181822080.0, "grad_norm": 2.0601221324655064, "language_loss": 0.62043941, "learning_rate": 4.787012755386233e-07, "loss": 0.64186776, "num_input_tokens_seen": 280468400, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6953125, "step": 13004, "time_per_iteration": 2.4736831188201904 }, { "auxiliary_loss_clip": 0.01101271, "auxiliary_loss_mlp": 0.01028004, "balance_loss_clip": 1.01660132, "balance_loss_mlp": 1.03603423, "epoch": 0.7819029009469413, "flos": 11363753018880.0, "grad_norm": 1.901231742481442, "language_loss": 0.8304075, "learning_rate": 4.784484802864403e-07, "loss": 0.85170019, "num_input_tokens_seen": 280483930, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.65234375, "step": 13005, "time_per_iteration": 2.442146062850952 }, { "auxiliary_loss_clip": 0.01103933, "auxiliary_loss_mlp": 0.01030376, "balance_loss_clip": 1.01786518, "balance_loss_mlp": 1.03589702, "epoch": 0.7819630241996092, "flos": 24279276470400.0, "grad_norm": 1.9587200844950619, "language_loss": 0.72838509, "learning_rate": 4.781957427316432e-07, "loss": 0.7497282, "num_input_tokens_seen": 280503465, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 13006, "time_per_iteration": 3.9252119064331055 }, { "auxiliary_loss_clip": 0.01108674, "auxiliary_loss_mlp": 0.01033818, "balance_loss_clip": 1.0211575, "balance_loss_mlp": 1.0379076, "epoch": 0.7820231474522772, "flos": 22708902792960.0, "grad_norm": 1.7554410604447825, "language_loss": 0.71924579, "learning_rate": 4.779430628838157e-07, "loss": 0.74067068, "num_input_tokens_seen": 280523375, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 13007, "time_per_iteration": 2.473324775695801 }, { "auxiliary_loss_clip": 0.01106925, "auxiliary_loss_mlp": 0.01031221, "balance_loss_clip": 1.01833415, "balance_loss_mlp": 1.03534245, "epoch": 0.7820832707049451, "flos": 20047419630720.0, "grad_norm": 4.872013756886014, "language_loss": 0.69064289, "learning_rate": 4.776904407525397e-07, "loss": 0.71202433, "num_input_tokens_seen": 280542920, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 13008, "time_per_iteration": 3.959998607635498 }, { "auxiliary_loss_clip": 0.0110406, "auxiliary_loss_mlp": 0.01028984, "balance_loss_clip": 1.01583529, "balance_loss_mlp": 1.03583252, "epoch": 0.7821433939576131, "flos": 27162795553920.0, "grad_norm": 2.2653965369112354, "language_loss": 0.69703847, "learning_rate": 4.774378763473954e-07, "loss": 0.71836889, "num_input_tokens_seen": 280561700, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.68359375, "step": 13009, "time_per_iteration": 2.5244908332824707 }, { "auxiliary_loss_clip": 0.01103927, "auxiliary_loss_mlp": 0.01026804, "balance_loss_clip": 1.01423931, "balance_loss_mlp": 1.03542519, "epoch": 0.782203517210281, "flos": 22602068766720.0, "grad_norm": 1.7465746175787191, "language_loss": 0.81493628, "learning_rate": 4.771853696779586e-07, "loss": 0.83624351, "num_input_tokens_seen": 280580605, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 13010, "time_per_iteration": 2.48160719871521 }, { "auxiliary_loss_clip": 0.0110355, "auxiliary_loss_mlp": 0.01031798, "balance_loss_clip": 1.0199244, "balance_loss_mlp": 1.03647327, "epoch": 0.782263640462949, "flos": 29059812535680.0, "grad_norm": 1.5970897134616988, "language_loss": 0.62116402, "learning_rate": 4.76932920753806e-07, "loss": 0.64251745, "num_input_tokens_seen": 280601495, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 13011, "time_per_iteration": 2.5519096851348877 }, { "auxiliary_loss_clip": 0.01106711, "auxiliary_loss_mlp": 0.01030752, "balance_loss_clip": 1.01973081, "balance_loss_mlp": 1.03846073, "epoch": 0.782323763715617, "flos": 25299498464640.0, "grad_norm": 1.6494709501643499, "language_loss": 0.70232975, "learning_rate": 4.7668052958450913e-07, "loss": 0.7237044, "num_input_tokens_seen": 280622760, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.68359375, "step": 13012, "time_per_iteration": 2.5279593467712402 }, { "auxiliary_loss_clip": 0.01032555, "auxiliary_loss_mlp": 0.01000938, "balance_loss_clip": 0.99988288, "balance_loss_mlp": 1.00998557, "epoch": 0.782383886968285, "flos": 65194388668800.0, "grad_norm": 0.7077160499390426, "language_loss": 0.55073166, "learning_rate": 4.764281961796395e-07, "loss": 0.57106662, "num_input_tokens_seen": 280687115, "router_z_loss_clip": 0.01055908, "router_z_loss_mlp": 0.2265625, "step": 13013, "time_per_iteration": 3.1746039390563965 }, { "auxiliary_loss_clip": 0.01111588, "auxiliary_loss_mlp": 0.010351, "balance_loss_clip": 1.0227735, "balance_loss_mlp": 1.04038775, "epoch": 0.782444010220953, "flos": 18405440190720.0, "grad_norm": 2.3450512943553687, "language_loss": 0.6563307, "learning_rate": 4.76175920548765e-07, "loss": 0.67779756, "num_input_tokens_seen": 280705000, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 13014, "time_per_iteration": 2.450124740600586 }, { "auxiliary_loss_clip": 0.01032358, "auxiliary_loss_mlp": 0.01003115, "balance_loss_clip": 1.00207222, "balance_loss_mlp": 1.00993073, "epoch": 0.7825041334736209, "flos": 63955003841280.0, "grad_norm": 0.723332375702561, "language_loss": 0.58433056, "learning_rate": 4.759237027014524e-07, "loss": 0.60468531, "num_input_tokens_seen": 280773525, "router_z_loss_clip": 0.01043701, "router_z_loss_mlp": 0.22460938, "step": 13015, "time_per_iteration": 3.186937093734741 }, { "auxiliary_loss_clip": 0.01104892, "auxiliary_loss_mlp": 0.01030791, "balance_loss_clip": 1.01903129, "balance_loss_mlp": 1.03681183, "epoch": 0.7825642567262889, "flos": 20339373375360.0, "grad_norm": 1.6310530306489612, "language_loss": 0.74658006, "learning_rate": 4.756715426472666e-07, "loss": 0.76793689, "num_input_tokens_seen": 280791915, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6796875, "step": 13016, "time_per_iteration": 2.468383550643921 }, { "auxiliary_loss_clip": 0.01107626, "auxiliary_loss_mlp": 0.01031411, "balance_loss_clip": 1.01764858, "balance_loss_mlp": 1.03768623, "epoch": 0.7826243799789568, "flos": 20262955190400.0, "grad_norm": 1.8329134385232675, "language_loss": 0.75284016, "learning_rate": 4.7541944039576766e-07, "loss": 0.77423048, "num_input_tokens_seen": 280811460, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.69921875, "step": 13017, "time_per_iteration": 2.450911283493042 }, { "auxiliary_loss_clip": 0.01106508, "auxiliary_loss_mlp": 0.01031763, "balance_loss_clip": 1.01826298, "balance_loss_mlp": 1.03568912, "epoch": 0.7826845032316249, "flos": 21132926593920.0, "grad_norm": 3.524708663150885, "language_loss": 0.75300008, "learning_rate": 4.7516739595651636e-07, "loss": 0.77438277, "num_input_tokens_seen": 280825415, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.70703125, "step": 13018, "time_per_iteration": 2.459402084350586 }, { "auxiliary_loss_clip": 0.01105197, "auxiliary_loss_mlp": 0.01026846, "balance_loss_clip": 1.01409078, "balance_loss_mlp": 1.03619814, "epoch": 0.7827446264842928, "flos": 22492253911680.0, "grad_norm": 1.4823534884754674, "language_loss": 0.77120328, "learning_rate": 4.749154093390708e-07, "loss": 0.79252374, "num_input_tokens_seen": 280845335, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 13019, "time_per_iteration": 2.462345600128174 }, { "auxiliary_loss_clip": 0.01104786, "auxiliary_loss_mlp": 0.01029743, "balance_loss_clip": 1.01759577, "balance_loss_mlp": 1.03643167, "epoch": 0.7828047497369608, "flos": 28840649702400.0, "grad_norm": 1.4390214387794675, "language_loss": 0.67635369, "learning_rate": 4.746634805529852e-07, "loss": 0.69769895, "num_input_tokens_seen": 280867145, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 13020, "time_per_iteration": 2.5310654640197754 }, { "auxiliary_loss_clip": 0.01107192, "auxiliary_loss_mlp": 0.01030413, "balance_loss_clip": 1.01772332, "balance_loss_mlp": 1.03868055, "epoch": 0.7828648729896287, "flos": 23257689759360.0, "grad_norm": 3.602576747629042, "language_loss": 0.62979633, "learning_rate": 4.7441160960781325e-07, "loss": 0.6511724, "num_input_tokens_seen": 280886185, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 13021, "time_per_iteration": 2.4621260166168213 }, { "auxiliary_loss_clip": 0.01102751, "auxiliary_loss_mlp": 0.01032843, "balance_loss_clip": 1.02101755, "balance_loss_mlp": 1.03567314, "epoch": 0.7829249962422967, "flos": 25265670831360.0, "grad_norm": 1.6180127460028129, "language_loss": 0.69366115, "learning_rate": 4.7415979651310636e-07, "loss": 0.71501708, "num_input_tokens_seen": 280907665, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 13022, "time_per_iteration": 2.499826431274414 }, { "auxiliary_loss_clip": 0.0103295, "auxiliary_loss_mlp": 0.01001776, "balance_loss_clip": 1.0006789, "balance_loss_mlp": 1.0102874, "epoch": 0.7829851194949646, "flos": 70722044645760.0, "grad_norm": 0.642057621199192, "language_loss": 0.56230128, "learning_rate": 4.739080412784131e-07, "loss": 0.58264852, "num_input_tokens_seen": 280971405, "router_z_loss_clip": 0.01098633, "router_z_loss_mlp": 0.2265625, "step": 13023, "time_per_iteration": 3.221086025238037 }, { "auxiliary_loss_clip": 0.01100754, "auxiliary_loss_mlp": 0.01029288, "balance_loss_clip": 1.01801634, "balance_loss_mlp": 1.03492999, "epoch": 0.7830452427476327, "flos": 25660795415040.0, "grad_norm": 1.7115170963991588, "language_loss": 0.67496949, "learning_rate": 4.736563439132792e-07, "loss": 0.69626987, "num_input_tokens_seen": 280989615, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.65625, "step": 13024, "time_per_iteration": 2.5134353637695312 }, { "auxiliary_loss_clip": 0.01109623, "auxiliary_loss_mlp": 0.01028418, "balance_loss_clip": 1.01553178, "balance_loss_mlp": 1.03854823, "epoch": 0.7831053660003006, "flos": 22784315397120.0, "grad_norm": 1.9875561554940624, "language_loss": 0.78012812, "learning_rate": 4.734047044272498e-07, "loss": 0.80150855, "num_input_tokens_seen": 281009450, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 13025, "time_per_iteration": 2.532707929611206 }, { "auxiliary_loss_clip": 0.01106116, "auxiliary_loss_mlp": 0.01032692, "balance_loss_clip": 1.02081907, "balance_loss_mlp": 1.03805757, "epoch": 0.7831654892529686, "flos": 25812267068160.0, "grad_norm": 1.7864807367204198, "language_loss": 0.78296089, "learning_rate": 4.731531228298673e-07, "loss": 0.80434901, "num_input_tokens_seen": 281028120, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 13026, "time_per_iteration": 2.5020945072174072 }, { "auxiliary_loss_clip": 0.0110598, "auxiliary_loss_mlp": 0.01026532, "balance_loss_clip": 1.01467657, "balance_loss_mlp": 1.03825724, "epoch": 0.7832256125056366, "flos": 20771557816320.0, "grad_norm": 2.105517394810048, "language_loss": 0.75349373, "learning_rate": 4.729015991306715e-07, "loss": 0.77481884, "num_input_tokens_seen": 281042130, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 13027, "time_per_iteration": 2.4898059368133545 }, { "auxiliary_loss_clip": 0.01107476, "auxiliary_loss_mlp": 0.01030208, "balance_loss_clip": 1.01846004, "balance_loss_mlp": 1.03929937, "epoch": 0.7832857357583045, "flos": 21506541909120.0, "grad_norm": 1.8122093685142966, "language_loss": 0.71018684, "learning_rate": 4.726501333391997e-07, "loss": 0.73156375, "num_input_tokens_seen": 281060945, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 13028, "time_per_iteration": 2.4647653102874756 }, { "auxiliary_loss_clip": 0.01109551, "auxiliary_loss_mlp": 0.0103834, "balance_loss_clip": 1.02543569, "balance_loss_mlp": 1.03718698, "epoch": 0.7833458590109725, "flos": 18077791305600.0, "grad_norm": 2.502152190337041, "language_loss": 0.69266164, "learning_rate": 4.7239872546498774e-07, "loss": 0.71414053, "num_input_tokens_seen": 281079270, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 13029, "time_per_iteration": 2.4273455142974854 }, { "auxiliary_loss_clip": 0.0110901, "auxiliary_loss_mlp": 0.01030752, "balance_loss_clip": 1.01715636, "balance_loss_mlp": 1.03794694, "epoch": 0.7834059822636404, "flos": 28288738252800.0, "grad_norm": 1.7908391145255511, "language_loss": 0.80776477, "learning_rate": 4.721473755175698e-07, "loss": 0.82916242, "num_input_tokens_seen": 281099500, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 13030, "time_per_iteration": 2.524630308151245 }, { "auxiliary_loss_clip": 0.01108235, "auxiliary_loss_mlp": 0.01030029, "balance_loss_clip": 1.01761913, "balance_loss_mlp": 1.03648067, "epoch": 0.7834661055163085, "flos": 31686211088640.0, "grad_norm": 2.27072087990327, "language_loss": 0.70596844, "learning_rate": 4.71896083506476e-07, "loss": 0.72735113, "num_input_tokens_seen": 281121250, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 13031, "time_per_iteration": 2.5449230670928955 }, { "auxiliary_loss_clip": 0.01106318, "auxiliary_loss_mlp": 0.01028268, "balance_loss_clip": 1.01627564, "balance_loss_mlp": 1.03577924, "epoch": 0.7835262287689764, "flos": 12933192942720.0, "grad_norm": 3.2686794775604535, "language_loss": 0.7874555, "learning_rate": 4.7164484944123574e-07, "loss": 0.80880135, "num_input_tokens_seen": 281138760, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 13032, "time_per_iteration": 2.44405198097229 }, { "auxiliary_loss_clip": 0.01111718, "auxiliary_loss_mlp": 0.01035825, "balance_loss_clip": 1.02306986, "balance_loss_mlp": 1.039868, "epoch": 0.7835863520216444, "flos": 16143211676160.0, "grad_norm": 2.44405423047018, "language_loss": 0.62783843, "learning_rate": 4.7139367333137726e-07, "loss": 0.64931387, "num_input_tokens_seen": 281157420, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 13033, "time_per_iteration": 2.4313745498657227 }, { "auxiliary_loss_clip": 0.01106389, "auxiliary_loss_mlp": 0.0103145, "balance_loss_clip": 1.01850367, "balance_loss_mlp": 1.03729844, "epoch": 0.7836464752743123, "flos": 11509909459200.0, "grad_norm": 1.9665460460925903, "language_loss": 0.72130537, "learning_rate": 4.7114255518642255e-07, "loss": 0.74268377, "num_input_tokens_seen": 281174620, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 13034, "time_per_iteration": 2.4713947772979736 }, { "auxiliary_loss_clip": 0.01109951, "auxiliary_loss_mlp": 0.01028987, "balance_loss_clip": 1.01643419, "balance_loss_mlp": 1.03894567, "epoch": 0.7837065985269803, "flos": 18223696350720.0, "grad_norm": 11.76744381412745, "language_loss": 0.71978039, "learning_rate": 4.7089149501589555e-07, "loss": 0.74116981, "num_input_tokens_seen": 281193865, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 13035, "time_per_iteration": 2.4482603073120117 }, { "auxiliary_loss_clip": 0.01108087, "auxiliary_loss_mlp": 0.01035986, "balance_loss_clip": 1.02248573, "balance_loss_mlp": 1.03817654, "epoch": 0.7837667217796482, "flos": 24754410599040.0, "grad_norm": 2.4317853183799114, "language_loss": 0.66178101, "learning_rate": 4.7064049282931664e-07, "loss": 0.6832217, "num_input_tokens_seen": 281212250, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.69921875, "step": 13036, "time_per_iteration": 2.4870731830596924 }, { "auxiliary_loss_clip": 0.01112837, "auxiliary_loss_mlp": 0.01034496, "balance_loss_clip": 1.02090573, "balance_loss_mlp": 1.03896093, "epoch": 0.7838268450323163, "flos": 22383121415040.0, "grad_norm": 2.163403776725154, "language_loss": 0.72671878, "learning_rate": 4.703895486362031e-07, "loss": 0.74819213, "num_input_tokens_seen": 281230850, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7421875, "step": 13037, "time_per_iteration": 2.463547945022583 }, { "auxiliary_loss_clip": 0.01105446, "auxiliary_loss_mlp": 0.01031266, "balance_loss_clip": 1.01841521, "balance_loss_mlp": 1.036291, "epoch": 0.7838869682849842, "flos": 19500284689920.0, "grad_norm": 31.59281670229262, "language_loss": 0.60266042, "learning_rate": 4.701386624460717e-07, "loss": 0.62402761, "num_input_tokens_seen": 281249810, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 13038, "time_per_iteration": 2.442124128341675 }, { "auxiliary_loss_clip": 0.01104524, "auxiliary_loss_mlp": 0.01030592, "balance_loss_clip": 1.01893914, "balance_loss_mlp": 1.0365665, "epoch": 0.7839470915376522, "flos": 32892845690880.0, "grad_norm": 1.560877997888263, "language_loss": 0.6798408, "learning_rate": 4.698878342684349e-07, "loss": 0.7011919, "num_input_tokens_seen": 281273730, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 13039, "time_per_iteration": 2.5659632682800293 }, { "auxiliary_loss_clip": 0.01101311, "auxiliary_loss_mlp": 0.0102554, "balance_loss_clip": 1.01466203, "balance_loss_mlp": 1.03397131, "epoch": 0.7840072147903202, "flos": 29676003373440.0, "grad_norm": 1.9355856098557465, "language_loss": 0.69291258, "learning_rate": 4.6963706411280537e-07, "loss": 0.71418113, "num_input_tokens_seen": 281293670, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.671875, "step": 13040, "time_per_iteration": 2.5051276683807373 }, { "auxiliary_loss_clip": 0.01109048, "auxiliary_loss_mlp": 0.01034169, "balance_loss_clip": 1.02148521, "balance_loss_mlp": 1.03836012, "epoch": 0.7840673380429881, "flos": 18186744234240.0, "grad_norm": 1.603219443437996, "language_loss": 0.67406785, "learning_rate": 4.6938635198869116e-07, "loss": 0.69550008, "num_input_tokens_seen": 281313070, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 13041, "time_per_iteration": 2.4532346725463867 }, { "auxiliary_loss_clip": 0.01032416, "auxiliary_loss_mlp": 0.01001301, "balance_loss_clip": 1.00030601, "balance_loss_mlp": 1.00981951, "epoch": 0.7841274612956561, "flos": 66346006613760.0, "grad_norm": 0.666338301029628, "language_loss": 0.57392454, "learning_rate": 4.691356979055998e-07, "loss": 0.59426177, "num_input_tokens_seen": 281374880, "router_z_loss_clip": 0.00994873, "router_z_loss_mlp": 0.2265625, "step": 13042, "time_per_iteration": 3.0633139610290527 }, { "auxiliary_loss_clip": 0.01108569, "auxiliary_loss_mlp": 0.01031195, "balance_loss_clip": 1.01846337, "balance_loss_mlp": 1.03813386, "epoch": 0.784187584548324, "flos": 26648482665600.0, "grad_norm": 2.0175448731590855, "language_loss": 0.83840692, "learning_rate": 4.688851018730369e-07, "loss": 0.85980451, "num_input_tokens_seen": 281392620, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 13043, "time_per_iteration": 2.4928576946258545 }, { "auxiliary_loss_clip": 0.01104007, "auxiliary_loss_mlp": 0.01027279, "balance_loss_clip": 1.01506042, "balance_loss_mlp": 1.03632891, "epoch": 0.7842477078009921, "flos": 25740158515200.0, "grad_norm": 1.5803845207073643, "language_loss": 0.88229656, "learning_rate": 4.6863456390050425e-07, "loss": 0.9036094, "num_input_tokens_seen": 281413140, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 13044, "time_per_iteration": 3.9811787605285645 }, { "auxiliary_loss_clip": 0.0111206, "auxiliary_loss_mlp": 0.01030248, "balance_loss_clip": 1.01764727, "balance_loss_mlp": 1.03894711, "epoch": 0.78430783105366, "flos": 21980957765760.0, "grad_norm": 3.649578215851881, "language_loss": 0.79275918, "learning_rate": 4.6838408399750195e-07, "loss": 0.81418228, "num_input_tokens_seen": 281430860, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.73046875, "step": 13045, "time_per_iteration": 3.8170969486236572 }, { "auxiliary_loss_clip": 0.01104167, "auxiliary_loss_mlp": 0.01032122, "balance_loss_clip": 1.01995647, "balance_loss_mlp": 1.03585005, "epoch": 0.784367954306328, "flos": 23842279607040.0, "grad_norm": 1.4077111382063463, "language_loss": 0.7251364, "learning_rate": 4.6813366217352925e-07, "loss": 0.74649936, "num_input_tokens_seen": 281451385, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.68359375, "step": 13046, "time_per_iteration": 2.4859230518341064 }, { "auxiliary_loss_clip": 0.01105765, "auxiliary_loss_mlp": 0.01032302, "balance_loss_clip": 1.01967144, "balance_loss_mlp": 1.03800988, "epoch": 0.7844280775589959, "flos": 24826662806400.0, "grad_norm": 1.5561204802778867, "language_loss": 0.63315129, "learning_rate": 4.678832984380809e-07, "loss": 0.65453196, "num_input_tokens_seen": 281472255, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 13047, "time_per_iteration": 2.5187132358551025 }, { "auxiliary_loss_clip": 0.01107211, "auxiliary_loss_mlp": 0.0102573, "balance_loss_clip": 1.01383853, "balance_loss_mlp": 1.03906333, "epoch": 0.7844882008116639, "flos": 22455660931200.0, "grad_norm": 1.4993077178957699, "language_loss": 0.73333931, "learning_rate": 4.676329928006515e-07, "loss": 0.75466871, "num_input_tokens_seen": 281492860, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 13048, "time_per_iteration": 3.8926877975463867 }, { "auxiliary_loss_clip": 0.0111046, "auxiliary_loss_mlp": 0.01033318, "balance_loss_clip": 1.02064645, "balance_loss_mlp": 1.03928602, "epoch": 0.7845483240643318, "flos": 26104041244800.0, "grad_norm": 6.044465866203175, "language_loss": 0.74831688, "learning_rate": 4.6738274527073243e-07, "loss": 0.76975465, "num_input_tokens_seen": 281511815, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 13049, "time_per_iteration": 2.5160231590270996 }, { "auxiliary_loss_clip": 0.011098, "auxiliary_loss_mlp": 0.01034408, "balance_loss_clip": 1.02059197, "balance_loss_mlp": 1.03652203, "epoch": 0.7846084473169999, "flos": 19354307817600.0, "grad_norm": 1.921680958420067, "language_loss": 0.7270261, "learning_rate": 4.6713255585781454e-07, "loss": 0.74846816, "num_input_tokens_seen": 281530090, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 13050, "time_per_iteration": 3.938054084777832 }, { "auxiliary_loss_clip": 0.01105511, "auxiliary_loss_mlp": 0.01033529, "balance_loss_clip": 1.02064836, "balance_loss_mlp": 1.0369302, "epoch": 0.7846685705696678, "flos": 23325811902720.0, "grad_norm": 2.162923949511628, "language_loss": 0.73595381, "learning_rate": 4.668824245713825e-07, "loss": 0.75734425, "num_input_tokens_seen": 281547075, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 13051, "time_per_iteration": 2.497502565383911 }, { "auxiliary_loss_clip": 0.0110861, "auxiliary_loss_mlp": 0.01033611, "balance_loss_clip": 1.02039623, "balance_loss_mlp": 1.0383352, "epoch": 0.7847286938223358, "flos": 35809545962880.0, "grad_norm": 2.1746440257791293, "language_loss": 0.72785199, "learning_rate": 4.666323514209227e-07, "loss": 0.74927419, "num_input_tokens_seen": 281568080, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 13052, "time_per_iteration": 2.602654457092285 }, { "auxiliary_loss_clip": 0.01102797, "auxiliary_loss_mlp": 0.01031664, "balance_loss_clip": 1.01995802, "balance_loss_mlp": 1.03707242, "epoch": 0.7847888170750038, "flos": 18478159274880.0, "grad_norm": 4.849536966693198, "language_loss": 0.68834209, "learning_rate": 4.663823364159183e-07, "loss": 0.70968676, "num_input_tokens_seen": 281586925, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.65625, "step": 13053, "time_per_iteration": 2.4782373905181885 }, { "auxiliary_loss_clip": 0.01106479, "auxiliary_loss_mlp": 0.01027116, "balance_loss_clip": 1.01517701, "balance_loss_mlp": 1.03742433, "epoch": 0.7848489403276717, "flos": 25119155255040.0, "grad_norm": 2.628564066647487, "language_loss": 0.69922304, "learning_rate": 4.6613237956584893e-07, "loss": 0.720559, "num_input_tokens_seen": 281603915, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 13054, "time_per_iteration": 2.5141093730926514 }, { "auxiliary_loss_clip": 0.01109774, "auxiliary_loss_mlp": 0.01033472, "balance_loss_clip": 1.02060914, "balance_loss_mlp": 1.03807366, "epoch": 0.7849090635803397, "flos": 26502433966080.0, "grad_norm": 1.7837367737393843, "language_loss": 0.75953579, "learning_rate": 4.658824808801938e-07, "loss": 0.78096825, "num_input_tokens_seen": 281624220, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 13055, "time_per_iteration": 2.510911703109741 }, { "auxiliary_loss_clip": 0.01112766, "auxiliary_loss_mlp": 0.01032699, "balance_loss_clip": 1.01958561, "balance_loss_mlp": 1.03988445, "epoch": 0.7849691868330076, "flos": 20959658363520.0, "grad_norm": 1.869776836969857, "language_loss": 0.74911714, "learning_rate": 4.656326403684283e-07, "loss": 0.77057183, "num_input_tokens_seen": 281642325, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 13056, "time_per_iteration": 2.488337278366089 }, { "auxiliary_loss_clip": 0.01108764, "auxiliary_loss_mlp": 0.01028331, "balance_loss_clip": 1.01599264, "balance_loss_mlp": 1.03947568, "epoch": 0.7850293100856757, "flos": 26067484177920.0, "grad_norm": 2.1492944302928354, "language_loss": 0.70419991, "learning_rate": 4.6538285804002744e-07, "loss": 0.72557086, "num_input_tokens_seen": 281663065, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 13057, "time_per_iteration": 2.5062415599823 }, { "auxiliary_loss_clip": 0.0110804, "auxiliary_loss_mlp": 0.01032021, "balance_loss_clip": 1.01928937, "balance_loss_mlp": 1.03727651, "epoch": 0.7850894333383436, "flos": 22491894775680.0, "grad_norm": 1.889250062564896, "language_loss": 0.76853025, "learning_rate": 4.6513313390446175e-07, "loss": 0.78993082, "num_input_tokens_seen": 281681005, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 13058, "time_per_iteration": 2.471440315246582 }, { "auxiliary_loss_clip": 0.01108751, "auxiliary_loss_mlp": 0.01030649, "balance_loss_clip": 1.01831102, "balance_loss_mlp": 1.03965354, "epoch": 0.7851495565910116, "flos": 20558643949440.0, "grad_norm": 1.9076284262592211, "language_loss": 0.70848632, "learning_rate": 4.6488346797120146e-07, "loss": 0.72988033, "num_input_tokens_seen": 281697965, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69140625, "step": 13059, "time_per_iteration": 2.4639854431152344 }, { "auxiliary_loss_clip": 0.01111314, "auxiliary_loss_mlp": 0.01038951, "balance_loss_clip": 1.02530742, "balance_loss_mlp": 1.03761041, "epoch": 0.7852096798436795, "flos": 15924838942080.0, "grad_norm": 1.9775408131193173, "language_loss": 0.76635492, "learning_rate": 4.646338602497144e-07, "loss": 0.78785759, "num_input_tokens_seen": 281716035, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.73828125, "step": 13060, "time_per_iteration": 2.4399220943450928 }, { "auxiliary_loss_clip": 0.01107971, "auxiliary_loss_mlp": 0.01030875, "balance_loss_clip": 1.01786327, "balance_loss_mlp": 1.03859591, "epoch": 0.7852698030963475, "flos": 19062282245760.0, "grad_norm": 3.259016984560905, "language_loss": 0.77178466, "learning_rate": 4.643843107494654e-07, "loss": 0.79317313, "num_input_tokens_seen": 281732815, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 13061, "time_per_iteration": 2.4522504806518555 }, { "auxiliary_loss_clip": 0.01107114, "auxiliary_loss_mlp": 0.01032845, "balance_loss_clip": 1.01972651, "balance_loss_mlp": 1.03700674, "epoch": 0.7853299263490154, "flos": 24644380262400.0, "grad_norm": 1.9912177351080544, "language_loss": 0.74166948, "learning_rate": 4.641348194799164e-07, "loss": 0.76306909, "num_input_tokens_seen": 281751980, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 13062, "time_per_iteration": 2.5019540786743164 }, { "auxiliary_loss_clip": 0.01103963, "auxiliary_loss_mlp": 0.0103153, "balance_loss_clip": 1.01959693, "balance_loss_mlp": 1.03620887, "epoch": 0.7853900496016835, "flos": 22017981709440.0, "grad_norm": 1.4255797786631006, "language_loss": 0.68627816, "learning_rate": 4.638853864505297e-07, "loss": 0.70763308, "num_input_tokens_seen": 281772670, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 13063, "time_per_iteration": 2.4784271717071533 }, { "auxiliary_loss_clip": 0.01109139, "auxiliary_loss_mlp": 0.01034983, "balance_loss_clip": 1.02311528, "balance_loss_mlp": 1.04190159, "epoch": 0.7854501728543514, "flos": 30227412032640.0, "grad_norm": 1.9191312441383486, "language_loss": 0.73363304, "learning_rate": 4.636360116707625e-07, "loss": 0.75507426, "num_input_tokens_seen": 281792930, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 13064, "time_per_iteration": 2.5760695934295654 }, { "auxiliary_loss_clip": 0.01108241, "auxiliary_loss_mlp": 0.01034449, "balance_loss_clip": 1.02180672, "balance_loss_mlp": 1.03715229, "epoch": 0.7855102961070194, "flos": 18843694030080.0, "grad_norm": 2.0054794064790187, "language_loss": 0.6784023, "learning_rate": 4.633866951500718e-07, "loss": 0.69982922, "num_input_tokens_seen": 281811805, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 13065, "time_per_iteration": 2.4729840755462646 }, { "auxiliary_loss_clip": 0.01108505, "auxiliary_loss_mlp": 0.01033629, "balance_loss_clip": 1.0213449, "balance_loss_mlp": 1.03903031, "epoch": 0.7855704193596874, "flos": 22309971367680.0, "grad_norm": 1.95985027343333, "language_loss": 0.76343238, "learning_rate": 4.6313743689791196e-07, "loss": 0.7848537, "num_input_tokens_seen": 281831885, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 13066, "time_per_iteration": 2.540503978729248 }, { "auxiliary_loss_clip": 0.01032902, "auxiliary_loss_mlp": 0.00999525, "balance_loss_clip": 0.99849337, "balance_loss_mlp": 1.01030684, "epoch": 0.7856305426123553, "flos": 60004434407040.0, "grad_norm": 0.8612641198520602, "language_loss": 0.53403664, "learning_rate": 4.628882369237346e-07, "loss": 0.55436087, "num_input_tokens_seen": 281900310, "router_z_loss_clip": 0.01031494, "router_z_loss_mlp": 0.2265625, "step": 13067, "time_per_iteration": 3.1769700050354004 }, { "auxiliary_loss_clip": 0.01105876, "auxiliary_loss_mlp": 0.01032588, "balance_loss_clip": 1.01923108, "balance_loss_mlp": 1.03543663, "epoch": 0.7856906658650233, "flos": 21868593045120.0, "grad_norm": 1.6680258220632354, "language_loss": 0.67531133, "learning_rate": 4.62639095236989e-07, "loss": 0.69669592, "num_input_tokens_seen": 281918870, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.703125, "step": 13068, "time_per_iteration": 2.518704414367676 }, { "auxiliary_loss_clip": 0.01105581, "auxiliary_loss_mlp": 0.01029781, "balance_loss_clip": 1.01755631, "balance_loss_mlp": 1.03751922, "epoch": 0.7857507891176913, "flos": 23622937205760.0, "grad_norm": 2.457434024431031, "language_loss": 0.67887211, "learning_rate": 4.6239001184712267e-07, "loss": 0.70022571, "num_input_tokens_seen": 281936905, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 13069, "time_per_iteration": 2.496127128601074 }, { "auxiliary_loss_clip": 0.01109734, "auxiliary_loss_mlp": 0.01034386, "balance_loss_clip": 1.0218333, "balance_loss_mlp": 1.03950047, "epoch": 0.7858109123703593, "flos": 25520061928320.0, "grad_norm": 1.6251875410217778, "language_loss": 0.76893252, "learning_rate": 4.6214098676358195e-07, "loss": 0.79037368, "num_input_tokens_seen": 281955625, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 13070, "time_per_iteration": 2.5253119468688965 }, { "auxiliary_loss_clip": 0.01106653, "auxiliary_loss_mlp": 0.01034973, "balance_loss_clip": 1.0228852, "balance_loss_mlp": 1.03766584, "epoch": 0.7858710356230272, "flos": 17457398576640.0, "grad_norm": 1.664811658286748, "language_loss": 0.66116762, "learning_rate": 4.618920199958083e-07, "loss": 0.68258393, "num_input_tokens_seen": 281973285, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 13071, "time_per_iteration": 2.465252637863159 }, { "auxiliary_loss_clip": 0.01107651, "auxiliary_loss_mlp": 0.01030342, "balance_loss_clip": 1.01856422, "balance_loss_mlp": 1.03677642, "epoch": 0.7859311588756952, "flos": 24679680353280.0, "grad_norm": 1.8399427098419838, "language_loss": 0.73978782, "learning_rate": 4.616431115532442e-07, "loss": 0.76116771, "num_input_tokens_seen": 281991410, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.7109375, "step": 13072, "time_per_iteration": 2.485231399536133 }, { "auxiliary_loss_clip": 0.01109484, "auxiliary_loss_mlp": 0.01028358, "balance_loss_clip": 1.01569152, "balance_loss_mlp": 1.03875279, "epoch": 0.7859912821283631, "flos": 21799142098560.0, "grad_norm": 1.941680274677029, "language_loss": 0.71273196, "learning_rate": 4.613942614453268e-07, "loss": 0.73411036, "num_input_tokens_seen": 282010845, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 13073, "time_per_iteration": 2.464820146560669 }, { "auxiliary_loss_clip": 0.01106681, "auxiliary_loss_mlp": 0.01035015, "balance_loss_clip": 1.02206922, "balance_loss_mlp": 1.0371964, "epoch": 0.7860514053810311, "flos": 20847293642880.0, "grad_norm": 1.739283403310443, "language_loss": 0.76625216, "learning_rate": 4.611454696814938e-07, "loss": 0.78766906, "num_input_tokens_seen": 282029635, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 13074, "time_per_iteration": 2.443267583847046 }, { "auxiliary_loss_clip": 0.01104692, "auxiliary_loss_mlp": 0.01034287, "balance_loss_clip": 1.02213955, "balance_loss_mlp": 1.0377593, "epoch": 0.786111528633699, "flos": 24315689882880.0, "grad_norm": 1.6567533814159656, "language_loss": 0.75220263, "learning_rate": 4.608967362711782e-07, "loss": 0.77359235, "num_input_tokens_seen": 282050285, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.671875, "step": 13075, "time_per_iteration": 2.4750399589538574 }, { "auxiliary_loss_clip": 0.01105823, "auxiliary_loss_mlp": 0.01023993, "balance_loss_clip": 1.01235271, "balance_loss_mlp": 1.03652883, "epoch": 0.7861716518863671, "flos": 24353180703360.0, "grad_norm": 1.6770919524706447, "language_loss": 0.68799418, "learning_rate": 4.6064806122381283e-07, "loss": 0.70929235, "num_input_tokens_seen": 282071040, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 13076, "time_per_iteration": 2.479478120803833 }, { "auxiliary_loss_clip": 0.01106478, "auxiliary_loss_mlp": 0.01030972, "balance_loss_clip": 1.01882493, "balance_loss_mlp": 1.03884637, "epoch": 0.786231775139035, "flos": 14022399006720.0, "grad_norm": 2.830363961018188, "language_loss": 0.80034691, "learning_rate": 4.603994445488282e-07, "loss": 0.82172143, "num_input_tokens_seen": 282086610, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.67578125, "step": 13077, "time_per_iteration": 2.423968553543091 }, { "auxiliary_loss_clip": 0.01107697, "auxiliary_loss_mlp": 0.01032622, "balance_loss_clip": 1.02029002, "balance_loss_mlp": 1.03861749, "epoch": 0.786291898391703, "flos": 33724248865920.0, "grad_norm": 1.6798929309886712, "language_loss": 0.70631552, "learning_rate": 4.6015088625564956e-07, "loss": 0.72771871, "num_input_tokens_seen": 282107440, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69140625, "step": 13078, "time_per_iteration": 2.5739309787750244 }, { "auxiliary_loss_clip": 0.01107432, "auxiliary_loss_mlp": 0.01035124, "balance_loss_clip": 1.02350116, "balance_loss_mlp": 1.03931594, "epoch": 0.786352021644371, "flos": 25811476968960.0, "grad_norm": 1.5313125308572353, "language_loss": 0.81324172, "learning_rate": 4.599023863537039e-07, "loss": 0.83466733, "num_input_tokens_seen": 282127290, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 13079, "time_per_iteration": 2.505228281021118 }, { "auxiliary_loss_clip": 0.0110353, "auxiliary_loss_mlp": 0.01030081, "balance_loss_clip": 1.01818967, "balance_loss_mlp": 1.03730917, "epoch": 0.7864121448970389, "flos": 28910818920960.0, "grad_norm": 1.5225646217338067, "language_loss": 0.68444872, "learning_rate": 4.596539448524146e-07, "loss": 0.7057848, "num_input_tokens_seen": 282147505, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.66015625, "step": 13080, "time_per_iteration": 2.5141820907592773 }, { "auxiliary_loss_clip": 0.0110687, "auxiliary_loss_mlp": 0.01035745, "balance_loss_clip": 1.02296019, "balance_loss_mlp": 1.03761542, "epoch": 0.7864722681497069, "flos": 19208833735680.0, "grad_norm": 1.7029868911672312, "language_loss": 0.70138526, "learning_rate": 4.594055617612016e-07, "loss": 0.7228114, "num_input_tokens_seen": 282166450, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 13081, "time_per_iteration": 2.460071325302124 }, { "auxiliary_loss_clip": 0.01106536, "auxiliary_loss_mlp": 0.01038885, "balance_loss_clip": 1.02674997, "balance_loss_mlp": 1.03595972, "epoch": 0.7865323914023749, "flos": 21871573873920.0, "grad_norm": 1.7158342701524558, "language_loss": 0.68737066, "learning_rate": 4.591572370894838e-07, "loss": 0.70882487, "num_input_tokens_seen": 282186465, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.70703125, "step": 13082, "time_per_iteration": 2.485567092895508 }, { "auxiliary_loss_clip": 0.01105957, "auxiliary_loss_mlp": 0.01034134, "balance_loss_clip": 1.02202868, "balance_loss_mlp": 1.03758883, "epoch": 0.7865925146550429, "flos": 25520313323520.0, "grad_norm": 1.6220387754049759, "language_loss": 0.66618156, "learning_rate": 4.589089708466789e-07, "loss": 0.68758249, "num_input_tokens_seen": 282207180, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 13083, "time_per_iteration": 2.500866413116455 }, { "auxiliary_loss_clip": 0.01111229, "auxiliary_loss_mlp": 0.01029996, "balance_loss_clip": 1.01640058, "balance_loss_mlp": 1.0383116, "epoch": 0.7866526379077108, "flos": 19097366855040.0, "grad_norm": 2.1858517670755053, "language_loss": 0.74733776, "learning_rate": 4.5866076304220015e-07, "loss": 0.76875007, "num_input_tokens_seen": 282225865, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 13084, "time_per_iteration": 2.4568912982940674 }, { "auxiliary_loss_clip": 0.01106897, "auxiliary_loss_mlp": 0.01035828, "balance_loss_clip": 1.0241456, "balance_loss_mlp": 1.03878808, "epoch": 0.7867127611603788, "flos": 16173771171840.0, "grad_norm": 1.8995164335240817, "language_loss": 0.70658088, "learning_rate": 4.584126136854591e-07, "loss": 0.72800815, "num_input_tokens_seen": 282242895, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 13085, "time_per_iteration": 2.444458246231079 }, { "auxiliary_loss_clip": 0.01108925, "auxiliary_loss_mlp": 0.01032229, "balance_loss_clip": 1.01959908, "balance_loss_mlp": 1.03614819, "epoch": 0.7867728844130467, "flos": 20773640805120.0, "grad_norm": 10.120327798678431, "language_loss": 0.72287202, "learning_rate": 4.5816452278586617e-07, "loss": 0.74428356, "num_input_tokens_seen": 282260425, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 13086, "time_per_iteration": 5.324958562850952 }, { "auxiliary_loss_clip": 0.01105777, "auxiliary_loss_mlp": 0.01031992, "balance_loss_clip": 1.02020788, "balance_loss_mlp": 1.03636336, "epoch": 0.7868330076657147, "flos": 21760106993280.0, "grad_norm": 2.0756199260917043, "language_loss": 0.74747485, "learning_rate": 4.5791649035282965e-07, "loss": 0.76885259, "num_input_tokens_seen": 282279335, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6953125, "step": 13087, "time_per_iteration": 2.476872682571411 }, { "auxiliary_loss_clip": 0.01104803, "auxiliary_loss_mlp": 0.01035765, "balance_loss_clip": 1.02412462, "balance_loss_mlp": 1.03669643, "epoch": 0.7868931309183826, "flos": 25700692446720.0, "grad_norm": 3.389558565195175, "language_loss": 0.7126165, "learning_rate": 4.5766851639575456e-07, "loss": 0.7340222, "num_input_tokens_seen": 282299905, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 13088, "time_per_iteration": 2.5011157989501953 }, { "auxiliary_loss_clip": 0.01032519, "auxiliary_loss_mlp": 0.0100541, "balance_loss_clip": 1.00434291, "balance_loss_mlp": 1.00987864, "epoch": 0.7869532541710507, "flos": 64644883430400.0, "grad_norm": 0.7060491280524873, "language_loss": 0.55443799, "learning_rate": 4.574206009240431e-07, "loss": 0.5748173, "num_input_tokens_seen": 282367620, "router_z_loss_clip": 0.01068115, "router_z_loss_mlp": 0.2265625, "step": 13089, "time_per_iteration": 4.590238332748413 }, { "auxiliary_loss_clip": 0.01031679, "auxiliary_loss_mlp": 0.01001856, "balance_loss_clip": 1.00074124, "balance_loss_mlp": 1.00915992, "epoch": 0.7870133774237186, "flos": 67453600440960.0, "grad_norm": 0.7235444095057808, "language_loss": 0.50002736, "learning_rate": 4.571727439470976e-07, "loss": 0.52036273, "num_input_tokens_seen": 282435695, "router_z_loss_clip": 0.01116943, "router_z_loss_mlp": 0.22558594, "step": 13090, "time_per_iteration": 3.190478563308716 }, { "auxiliary_loss_clip": 0.01105793, "auxiliary_loss_mlp": 0.01029945, "balance_loss_clip": 1.01856053, "balance_loss_mlp": 1.03796327, "epoch": 0.7870735006763866, "flos": 26068310190720.0, "grad_norm": 2.8605167219783456, "language_loss": 0.83544624, "learning_rate": 4.5692494547431583e-07, "loss": 0.8568036, "num_input_tokens_seen": 282456025, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6796875, "step": 13091, "time_per_iteration": 2.5261788368225098 }, { "auxiliary_loss_clip": 0.01032896, "auxiliary_loss_mlp": 0.01002478, "balance_loss_clip": 1.00141072, "balance_loss_mlp": 1.01023984, "epoch": 0.7871336239290546, "flos": 70289572896000.0, "grad_norm": 0.7092109198452552, "language_loss": 0.63963294, "learning_rate": 4.566772055150947e-07, "loss": 0.65998662, "num_input_tokens_seen": 282520995, "router_z_loss_clip": 0.01068115, "router_z_loss_mlp": 0.2265625, "step": 13092, "time_per_iteration": 4.557527542114258 }, { "auxiliary_loss_clip": 0.0110998, "auxiliary_loss_mlp": 0.01037276, "balance_loss_clip": 1.02427578, "balance_loss_mlp": 1.03916669, "epoch": 0.7871937471817225, "flos": 15778574760960.0, "grad_norm": 2.0173979306115934, "language_loss": 0.79314053, "learning_rate": 4.564295240788285e-07, "loss": 0.81461304, "num_input_tokens_seen": 282539355, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 13093, "time_per_iteration": 2.4412360191345215 }, { "auxiliary_loss_clip": 0.01104751, "auxiliary_loss_mlp": 0.01026822, "balance_loss_clip": 1.01494932, "balance_loss_mlp": 1.03704405, "epoch": 0.7872538704343905, "flos": 20485242506880.0, "grad_norm": 1.982627641676233, "language_loss": 0.75458449, "learning_rate": 4.561819011749106e-07, "loss": 0.77590024, "num_input_tokens_seen": 282555735, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 13094, "time_per_iteration": 2.470186233520508 }, { "auxiliary_loss_clip": 0.01109279, "auxiliary_loss_mlp": 0.01038441, "balance_loss_clip": 1.02646661, "balance_loss_mlp": 1.0392071, "epoch": 0.7873139936870585, "flos": 25082670015360.0, "grad_norm": 1.6763348644572549, "language_loss": 0.79928237, "learning_rate": 4.5593433681272884e-07, "loss": 0.82075953, "num_input_tokens_seen": 282574550, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.69921875, "step": 13095, "time_per_iteration": 2.5117921829223633 }, { "auxiliary_loss_clip": 0.01108349, "auxiliary_loss_mlp": 0.01030184, "balance_loss_clip": 1.01784003, "balance_loss_mlp": 1.03746367, "epoch": 0.7873741169397265, "flos": 30883176679680.0, "grad_norm": 1.6906861273912055, "language_loss": 0.68272805, "learning_rate": 4.556868310016715e-07, "loss": 0.70411336, "num_input_tokens_seen": 282596520, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7109375, "step": 13096, "time_per_iteration": 2.5445282459259033 }, { "auxiliary_loss_clip": 0.01099135, "auxiliary_loss_mlp": 0.01024324, "balance_loss_clip": 1.01402974, "balance_loss_mlp": 1.0336411, "epoch": 0.7874342401923944, "flos": 46791962242560.0, "grad_norm": 1.4740896014023663, "language_loss": 0.7049005, "learning_rate": 4.55439383751125e-07, "loss": 0.72613508, "num_input_tokens_seen": 282620560, "router_z_loss_clip": 0.10302734, "router_z_loss_mlp": 0.65625, "step": 13097, "time_per_iteration": 2.6915130615234375 }, { "auxiliary_loss_clip": 0.01109627, "auxiliary_loss_mlp": 0.01033747, "balance_loss_clip": 1.02086627, "balance_loss_mlp": 1.03894722, "epoch": 0.7874943634450624, "flos": 23584548545280.0, "grad_norm": 1.9697998627802542, "language_loss": 0.80900395, "learning_rate": 4.5519199507047126e-07, "loss": 0.83043766, "num_input_tokens_seen": 282639830, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 13098, "time_per_iteration": 2.4965128898620605 }, { "auxiliary_loss_clip": 0.01105151, "auxiliary_loss_mlp": 0.01028633, "balance_loss_clip": 1.01727796, "balance_loss_mlp": 1.03704977, "epoch": 0.7875544866977303, "flos": 20191169859840.0, "grad_norm": 2.022674335881578, "language_loss": 0.74290299, "learning_rate": 4.5494466496909177e-07, "loss": 0.7642408, "num_input_tokens_seen": 282660130, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6796875, "step": 13099, "time_per_iteration": 2.483397960662842 }, { "auxiliary_loss_clip": 0.01106613, "auxiliary_loss_mlp": 0.01026829, "balance_loss_clip": 1.01397824, "balance_loss_mlp": 1.03775561, "epoch": 0.7876146099503983, "flos": 22602571557120.0, "grad_norm": 1.6608553602517133, "language_loss": 0.78236175, "learning_rate": 4.5469739345636603e-07, "loss": 0.80369616, "num_input_tokens_seen": 282681125, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 13100, "time_per_iteration": 2.5188939571380615 }, { "auxiliary_loss_clip": 0.01112192, "auxiliary_loss_mlp": 0.01032873, "balance_loss_clip": 1.01946223, "balance_loss_mlp": 1.03817856, "epoch": 0.7876747332030662, "flos": 10705833555840.0, "grad_norm": 2.4878641314460204, "language_loss": 0.66570145, "learning_rate": 4.5445018054167007e-07, "loss": 0.68715209, "num_input_tokens_seen": 282696690, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 13101, "time_per_iteration": 2.451176404953003 }, { "auxiliary_loss_clip": 0.01105064, "auxiliary_loss_mlp": 0.01030971, "balance_loss_clip": 1.01888919, "balance_loss_mlp": 1.03588176, "epoch": 0.7877348564557343, "flos": 38399315621760.0, "grad_norm": 2.002084823508511, "language_loss": 0.77754599, "learning_rate": 4.5420302623437745e-07, "loss": 0.79890633, "num_input_tokens_seen": 282721210, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 13102, "time_per_iteration": 2.6876444816589355 }, { "auxiliary_loss_clip": 0.01106466, "auxiliary_loss_mlp": 0.01039933, "balance_loss_clip": 1.02820253, "balance_loss_mlp": 1.03733778, "epoch": 0.7877949797084022, "flos": 18329524796160.0, "grad_norm": 1.9491917921134994, "language_loss": 0.82480747, "learning_rate": 4.5395593054386093e-07, "loss": 0.84627146, "num_input_tokens_seen": 282738505, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.69140625, "step": 13103, "time_per_iteration": 2.4734690189361572 }, { "auxiliary_loss_clip": 0.01110481, "auxiliary_loss_mlp": 0.01034949, "balance_loss_clip": 1.02218771, "balance_loss_mlp": 1.0388602, "epoch": 0.7878551029610702, "flos": 25806736373760.0, "grad_norm": 2.051643475006295, "language_loss": 0.80412781, "learning_rate": 4.537088934794913e-07, "loss": 0.82558215, "num_input_tokens_seen": 282756895, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 13104, "time_per_iteration": 2.5328242778778076 }, { "auxiliary_loss_clip": 0.01107633, "auxiliary_loss_mlp": 0.01035434, "balance_loss_clip": 1.02300024, "balance_loss_mlp": 1.03753543, "epoch": 0.7879152262137382, "flos": 22342685679360.0, "grad_norm": 1.7529699328798731, "language_loss": 0.74239558, "learning_rate": 4.5346191505063515e-07, "loss": 0.76382625, "num_input_tokens_seen": 282774955, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 13105, "time_per_iteration": 2.46669340133667 }, { "auxiliary_loss_clip": 0.01108148, "auxiliary_loss_mlp": 0.01035064, "balance_loss_clip": 1.02276123, "balance_loss_mlp": 1.03635216, "epoch": 0.7879753494664061, "flos": 24785329230720.0, "grad_norm": 1.8117111420286687, "language_loss": 0.75340557, "learning_rate": 4.5321499526665776e-07, "loss": 0.77483773, "num_input_tokens_seen": 282793165, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71875, "step": 13106, "time_per_iteration": 2.5418550968170166 }, { "auxiliary_loss_clip": 0.01108967, "auxiliary_loss_mlp": 0.01035205, "balance_loss_clip": 1.02298617, "balance_loss_mlp": 1.03853226, "epoch": 0.7880354727190741, "flos": 16909078487040.0, "grad_norm": 2.954793757440104, "language_loss": 0.73226202, "learning_rate": 4.5296813413692337e-07, "loss": 0.75370371, "num_input_tokens_seen": 282809820, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 13107, "time_per_iteration": 2.4467051029205322 }, { "auxiliary_loss_clip": 0.01105579, "auxiliary_loss_mlp": 0.01033559, "balance_loss_clip": 1.02127481, "balance_loss_mlp": 1.03694499, "epoch": 0.7880955959717421, "flos": 22230500526720.0, "grad_norm": 3.741427056263779, "language_loss": 0.7330482, "learning_rate": 4.5272133167079165e-07, "loss": 0.75443965, "num_input_tokens_seen": 282828600, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 13108, "time_per_iteration": 2.4919328689575195 }, { "auxiliary_loss_clip": 0.01031293, "auxiliary_loss_mlp": 0.01002375, "balance_loss_clip": 1.00131416, "balance_loss_mlp": 1.00882769, "epoch": 0.7881557192244101, "flos": 69183200131200.0, "grad_norm": 0.8860050212763799, "language_loss": 0.60363305, "learning_rate": 4.5247458787762216e-07, "loss": 0.62396967, "num_input_tokens_seen": 282882775, "router_z_loss_clip": 0.01062012, "router_z_loss_mlp": 0.22460938, "step": 13109, "time_per_iteration": 3.0494301319122314 }, { "auxiliary_loss_clip": 0.01103365, "auxiliary_loss_mlp": 0.01032041, "balance_loss_clip": 1.0203824, "balance_loss_mlp": 1.03688776, "epoch": 0.788215842477078, "flos": 24935436167040.0, "grad_norm": 1.981304813966953, "language_loss": 0.71870565, "learning_rate": 4.5222790276677126e-07, "loss": 0.74005973, "num_input_tokens_seen": 282902680, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6640625, "step": 13110, "time_per_iteration": 2.508129596710205 }, { "auxiliary_loss_clip": 0.01103793, "auxiliary_loss_mlp": 0.01029039, "balance_loss_clip": 1.01775575, "balance_loss_mlp": 1.03749347, "epoch": 0.788275965729746, "flos": 26106483369600.0, "grad_norm": 1.9670324004327049, "language_loss": 0.7508651, "learning_rate": 4.5198127634759455e-07, "loss": 0.77219343, "num_input_tokens_seen": 282923625, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6640625, "step": 13111, "time_per_iteration": 2.5036637783050537 }, { "auxiliary_loss_clip": 0.01106645, "auxiliary_loss_mlp": 0.01034521, "balance_loss_clip": 1.02182579, "balance_loss_mlp": 1.03771937, "epoch": 0.7883360889824139, "flos": 21214803646080.0, "grad_norm": 1.921101393859097, "language_loss": 0.61141837, "learning_rate": 4.5173470862944206e-07, "loss": 0.63283002, "num_input_tokens_seen": 282941955, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 13112, "time_per_iteration": 2.499373197555542 }, { "auxiliary_loss_clip": 0.01106081, "auxiliary_loss_mlp": 0.01029258, "balance_loss_clip": 1.01665735, "balance_loss_mlp": 1.03673232, "epoch": 0.7883962122350819, "flos": 21142551438720.0, "grad_norm": 1.844555784364165, "language_loss": 0.67555034, "learning_rate": 4.514881996216644e-07, "loss": 0.69690377, "num_input_tokens_seen": 282961280, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 13113, "time_per_iteration": 2.4557533264160156 }, { "auxiliary_loss_clip": 0.01105654, "auxiliary_loss_mlp": 0.01026944, "balance_loss_clip": 1.01542795, "balance_loss_mlp": 1.03704, "epoch": 0.7884563354877498, "flos": 15302901928320.0, "grad_norm": 2.251005687709405, "language_loss": 0.57927084, "learning_rate": 4.5124174933361e-07, "loss": 0.60059685, "num_input_tokens_seen": 282978210, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 13114, "time_per_iteration": 2.4633028507232666 }, { "auxiliary_loss_clip": 0.01108541, "auxiliary_loss_mlp": 0.01029633, "balance_loss_clip": 1.01693082, "balance_loss_mlp": 1.03804398, "epoch": 0.7885164587404179, "flos": 24388301226240.0, "grad_norm": 1.5272428259451545, "language_loss": 0.66605145, "learning_rate": 4.5099535777462306e-07, "loss": 0.68743312, "num_input_tokens_seen": 282998845, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 13115, "time_per_iteration": 2.4942822456359863 }, { "auxiliary_loss_clip": 0.01106508, "auxiliary_loss_mlp": 0.01027818, "balance_loss_clip": 1.015903, "balance_loss_mlp": 1.03745437, "epoch": 0.7885765819930858, "flos": 14385886686720.0, "grad_norm": 2.0126967775254343, "language_loss": 0.8849386, "learning_rate": 4.50749024954048e-07, "loss": 0.90628189, "num_input_tokens_seen": 283015200, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 13116, "time_per_iteration": 2.462224245071411 }, { "auxiliary_loss_clip": 0.01111349, "auxiliary_loss_mlp": 0.01032845, "balance_loss_clip": 1.01905227, "balance_loss_mlp": 1.03668642, "epoch": 0.7886367052457538, "flos": 18259930195200.0, "grad_norm": 1.7937342308643769, "language_loss": 0.72781694, "learning_rate": 4.505027508812245e-07, "loss": 0.74925888, "num_input_tokens_seen": 283033680, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.74609375, "step": 13117, "time_per_iteration": 2.4402220249176025 }, { "auxiliary_loss_clip": 0.01103391, "auxiliary_loss_mlp": 0.01024884, "balance_loss_clip": 1.01383924, "balance_loss_mlp": 1.03683269, "epoch": 0.7886968284984217, "flos": 15305092657920.0, "grad_norm": 1.516952370208628, "language_loss": 0.80212241, "learning_rate": 4.502565355654926e-07, "loss": 0.82340515, "num_input_tokens_seen": 283050620, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.66796875, "step": 13118, "time_per_iteration": 2.461578845977783 }, { "auxiliary_loss_clip": 0.01105762, "auxiliary_loss_mlp": 0.01025499, "balance_loss_clip": 1.01378083, "balance_loss_mlp": 1.03793156, "epoch": 0.7887569517510897, "flos": 21215450090880.0, "grad_norm": 1.9305031418640273, "language_loss": 0.73310637, "learning_rate": 4.500103790161878e-07, "loss": 0.75441897, "num_input_tokens_seen": 283070215, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 13119, "time_per_iteration": 2.451732873916626 }, { "auxiliary_loss_clip": 0.01106684, "auxiliary_loss_mlp": 0.01024034, "balance_loss_clip": 1.01119542, "balance_loss_mlp": 1.03659105, "epoch": 0.7888170750037578, "flos": 22711237176960.0, "grad_norm": 1.496604781451919, "language_loss": 0.72006416, "learning_rate": 4.4976428124264454e-07, "loss": 0.74137133, "num_input_tokens_seen": 283091485, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 13120, "time_per_iteration": 2.482203722000122 }, { "auxiliary_loss_clip": 0.01106675, "auxiliary_loss_mlp": 0.01032294, "balance_loss_clip": 1.01955616, "balance_loss_mlp": 1.03767097, "epoch": 0.7888771982564257, "flos": 36429148592640.0, "grad_norm": 1.4721643627830585, "language_loss": 0.78810346, "learning_rate": 4.4951824225419564e-07, "loss": 0.80949312, "num_input_tokens_seen": 283115040, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 13121, "time_per_iteration": 2.5816473960876465 }, { "auxiliary_loss_clip": 0.01103826, "auxiliary_loss_mlp": 0.01030017, "balance_loss_clip": 1.01738667, "balance_loss_mlp": 1.03641796, "epoch": 0.7889373215090937, "flos": 27309993488640.0, "grad_norm": 1.4416235371172699, "language_loss": 0.80274951, "learning_rate": 4.4927226206017057e-07, "loss": 0.82408792, "num_input_tokens_seen": 283136925, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.671875, "step": 13122, "time_per_iteration": 2.533569812774658 }, { "auxiliary_loss_clip": 0.01104781, "auxiliary_loss_mlp": 0.01026402, "balance_loss_clip": 1.01459467, "balance_loss_mlp": 1.03536201, "epoch": 0.7889974447617616, "flos": 19829010983040.0, "grad_norm": 3.907946932533015, "language_loss": 0.78090745, "learning_rate": 4.4902634066989597e-07, "loss": 0.80221927, "num_input_tokens_seen": 283155725, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 13123, "time_per_iteration": 2.442227363586426 }, { "auxiliary_loss_clip": 0.01109575, "auxiliary_loss_mlp": 0.01031787, "balance_loss_clip": 1.01920414, "balance_loss_mlp": 1.0379833, "epoch": 0.7890575680144296, "flos": 17271201450240.0, "grad_norm": 1.955013804141376, "language_loss": 0.67574298, "learning_rate": 4.487804780926985e-07, "loss": 0.69715661, "num_input_tokens_seen": 283173845, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 13124, "time_per_iteration": 2.4581551551818848 }, { "auxiliary_loss_clip": 0.01109084, "auxiliary_loss_mlp": 0.01025472, "balance_loss_clip": 1.01276433, "balance_loss_mlp": 1.03803194, "epoch": 0.7891176912670975, "flos": 27600151553280.0, "grad_norm": 2.7147990350753424, "language_loss": 0.73249841, "learning_rate": 4.4853467433790036e-07, "loss": 0.7538439, "num_input_tokens_seen": 283191985, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 13125, "time_per_iteration": 2.508878469467163 }, { "auxiliary_loss_clip": 0.01105581, "auxiliary_loss_mlp": 0.01025975, "balance_loss_clip": 1.01295722, "balance_loss_mlp": 1.03431559, "epoch": 0.7891778145197655, "flos": 22711668140160.0, "grad_norm": 2.7908223953446916, "language_loss": 0.72728854, "learning_rate": 4.4828892941482267e-07, "loss": 0.74860412, "num_input_tokens_seen": 283210855, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 13126, "time_per_iteration": 2.4761924743652344 }, { "auxiliary_loss_clip": 0.01107855, "auxiliary_loss_mlp": 0.01029519, "balance_loss_clip": 1.01699638, "balance_loss_mlp": 1.03734159, "epoch": 0.7892379377724335, "flos": 17310775259520.0, "grad_norm": 1.8664175560413587, "language_loss": 0.76942688, "learning_rate": 4.480432433327845e-07, "loss": 0.79080069, "num_input_tokens_seen": 283229665, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 13127, "time_per_iteration": 2.449960231781006 }, { "auxiliary_loss_clip": 0.01103909, "auxiliary_loss_mlp": 0.01027925, "balance_loss_clip": 1.01620698, "balance_loss_mlp": 1.0380671, "epoch": 0.7892980610251015, "flos": 25775674087680.0, "grad_norm": 1.9116870195020848, "language_loss": 0.85706896, "learning_rate": 4.47797616101103e-07, "loss": 0.87838733, "num_input_tokens_seen": 283248615, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.66015625, "step": 13128, "time_per_iteration": 5.432777643203735 }, { "auxiliary_loss_clip": 0.01106611, "auxiliary_loss_mlp": 0.01030864, "balance_loss_clip": 1.01940775, "balance_loss_mlp": 1.03847265, "epoch": 0.7893581842777694, "flos": 21579943351680.0, "grad_norm": 2.2324075423371936, "language_loss": 0.69161928, "learning_rate": 4.475520477290904e-07, "loss": 0.7129941, "num_input_tokens_seen": 283267135, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.68359375, "step": 13129, "time_per_iteration": 2.460630178451538 }, { "auxiliary_loss_clip": 0.01031218, "auxiliary_loss_mlp": 0.01003266, "balance_loss_clip": 1.00212765, "balance_loss_mlp": 1.00863028, "epoch": 0.7894183075304374, "flos": 69016468176000.0, "grad_norm": 0.7240229945463469, "language_loss": 0.61626929, "learning_rate": 4.473065382260597e-07, "loss": 0.63661414, "num_input_tokens_seen": 283328940, "router_z_loss_clip": 0.01141357, "router_z_loss_mlp": 0.2265625, "step": 13130, "time_per_iteration": 3.0938363075256348 }, { "auxiliary_loss_clip": 0.01107902, "auxiliary_loss_mlp": 0.01030692, "balance_loss_clip": 1.01881909, "balance_loss_mlp": 1.03823757, "epoch": 0.7894784307831053, "flos": 24243258107520.0, "grad_norm": 1.7571727125037513, "language_loss": 0.73687732, "learning_rate": 4.4706108760132124e-07, "loss": 0.75826335, "num_input_tokens_seen": 283350000, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6953125, "step": 13131, "time_per_iteration": 3.9044721126556396 }, { "auxiliary_loss_clip": 0.01114731, "auxiliary_loss_mlp": 0.01028029, "balance_loss_clip": 1.01376617, "balance_loss_mlp": 1.03802252, "epoch": 0.7895385540357733, "flos": 20266546550400.0, "grad_norm": 4.557886649501787, "language_loss": 0.69169939, "learning_rate": 4.4681569586418153e-07, "loss": 0.71312696, "num_input_tokens_seen": 283368020, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.765625, "step": 13132, "time_per_iteration": 2.4532999992370605 }, { "auxiliary_loss_clip": 0.01109435, "auxiliary_loss_mlp": 0.01035033, "balance_loss_clip": 1.02204514, "balance_loss_mlp": 1.03878188, "epoch": 0.7895986772884414, "flos": 20996574566400.0, "grad_norm": 2.1890591591952178, "language_loss": 0.62425423, "learning_rate": 4.465703630239468e-07, "loss": 0.6456989, "num_input_tokens_seen": 283387030, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 13133, "time_per_iteration": 3.8841326236724854 }, { "auxiliary_loss_clip": 0.01110765, "auxiliary_loss_mlp": 0.01033352, "balance_loss_clip": 1.01979768, "balance_loss_mlp": 1.03890562, "epoch": 0.7896588005411093, "flos": 18657999694080.0, "grad_norm": 3.0729523349491954, "language_loss": 0.79997486, "learning_rate": 4.463250890899195e-07, "loss": 0.82141608, "num_input_tokens_seen": 283402090, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 13134, "time_per_iteration": 2.4290759563446045 }, { "auxiliary_loss_clip": 0.01105644, "auxiliary_loss_mlp": 0.01028627, "balance_loss_clip": 1.01601434, "balance_loss_mlp": 1.03585649, "epoch": 0.7897189237937773, "flos": 18405907067520.0, "grad_norm": 2.1213500816693793, "language_loss": 0.80673969, "learning_rate": 4.460798740713998e-07, "loss": 0.82808244, "num_input_tokens_seen": 283421035, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 13135, "time_per_iteration": 2.4529197216033936 }, { "auxiliary_loss_clip": 0.0110384, "auxiliary_loss_mlp": 0.01031122, "balance_loss_clip": 1.01813412, "balance_loss_mlp": 1.03537619, "epoch": 0.7897790470464452, "flos": 23731602825600.0, "grad_norm": 19.70137636836105, "language_loss": 0.72293806, "learning_rate": 4.4583471797768733e-07, "loss": 0.74428773, "num_input_tokens_seen": 283441830, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.68359375, "step": 13136, "time_per_iteration": 2.4682929515838623 }, { "auxiliary_loss_clip": 0.0111148, "auxiliary_loss_mlp": 0.0103948, "balance_loss_clip": 1.0260272, "balance_loss_mlp": 1.03736877, "epoch": 0.7898391702991132, "flos": 15918949111680.0, "grad_norm": 2.5134492552032657, "language_loss": 0.71021396, "learning_rate": 4.455896208180778e-07, "loss": 0.73172355, "num_input_tokens_seen": 283459540, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 13137, "time_per_iteration": 2.4444549083709717 }, { "auxiliary_loss_clip": 0.01104576, "auxiliary_loss_mlp": 0.01031261, "balance_loss_clip": 1.01820183, "balance_loss_mlp": 1.03711307, "epoch": 0.7898992935517811, "flos": 19829046896640.0, "grad_norm": 1.736593848730339, "language_loss": 0.73695469, "learning_rate": 4.4534458260186645e-07, "loss": 0.75831306, "num_input_tokens_seen": 283478790, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.67578125, "step": 13138, "time_per_iteration": 2.474485158920288 }, { "auxiliary_loss_clip": 0.01105791, "auxiliary_loss_mlp": 0.0103381, "balance_loss_clip": 1.02145398, "balance_loss_mlp": 1.03694141, "epoch": 0.7899594168044491, "flos": 16216253982720.0, "grad_norm": 1.9744788717565476, "language_loss": 0.68273509, "learning_rate": 4.4509960333834426e-07, "loss": 0.70413113, "num_input_tokens_seen": 283495720, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 13139, "time_per_iteration": 2.4338760375976562 }, { "auxiliary_loss_clip": 0.01030092, "auxiliary_loss_mlp": 0.01001607, "balance_loss_clip": 1.00051582, "balance_loss_mlp": 1.00778365, "epoch": 0.790019540057117, "flos": 68331005959680.0, "grad_norm": 0.8776688531210075, "language_loss": 0.60224056, "learning_rate": 4.448546830368003e-07, "loss": 0.62255758, "num_input_tokens_seen": 283558795, "router_z_loss_clip": 0.01092529, "router_z_loss_mlp": 0.22265625, "step": 13140, "time_per_iteration": 3.1674513816833496 }, { "auxiliary_loss_clip": 0.01107406, "auxiliary_loss_mlp": 0.01035475, "balance_loss_clip": 1.02274323, "balance_loss_mlp": 1.03781116, "epoch": 0.7900796633097851, "flos": 30332773601280.0, "grad_norm": 1.6371986316662268, "language_loss": 0.76204479, "learning_rate": 4.4460982170652304e-07, "loss": 0.78347361, "num_input_tokens_seen": 283579305, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 13141, "time_per_iteration": 2.5477473735809326 }, { "auxiliary_loss_clip": 0.01108737, "auxiliary_loss_mlp": 0.01034154, "balance_loss_clip": 1.0215174, "balance_loss_mlp": 1.03726256, "epoch": 0.790139786562453, "flos": 22126790983680.0, "grad_norm": 2.0025973381631146, "language_loss": 0.68685973, "learning_rate": 4.4436501935679694e-07, "loss": 0.70828867, "num_input_tokens_seen": 283597840, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 13142, "time_per_iteration": 2.460904598236084 }, { "auxiliary_loss_clip": 0.01029625, "auxiliary_loss_mlp": 0.00999923, "balance_loss_clip": 0.99880832, "balance_loss_mlp": 1.0072937, "epoch": 0.790199909815121, "flos": 58207284213120.0, "grad_norm": 0.9016499931047519, "language_loss": 0.6008482, "learning_rate": 4.441202759969049e-07, "loss": 0.62114358, "num_input_tokens_seen": 283647950, "router_z_loss_clip": 0.01116943, "router_z_loss_mlp": 0.22265625, "step": 13143, "time_per_iteration": 2.8953137397766113 }, { "auxiliary_loss_clip": 0.01109693, "auxiliary_loss_mlp": 0.01031932, "balance_loss_clip": 1.01887274, "balance_loss_mlp": 1.03845358, "epoch": 0.7902600330677889, "flos": 34533316759680.0, "grad_norm": 2.1346239564020757, "language_loss": 0.74647033, "learning_rate": 4.4387559163612875e-07, "loss": 0.76788664, "num_input_tokens_seen": 283670645, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 13144, "time_per_iteration": 2.5857324600219727 }, { "auxiliary_loss_clip": 0.01110271, "auxiliary_loss_mlp": 0.01033435, "balance_loss_clip": 1.02065563, "balance_loss_mlp": 1.03838742, "epoch": 0.7903201563204569, "flos": 22346384780160.0, "grad_norm": 2.7953053060763042, "language_loss": 0.83410853, "learning_rate": 4.4363096628374605e-07, "loss": 0.85554564, "num_input_tokens_seen": 283688830, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 13145, "time_per_iteration": 2.4914438724517822 }, { "auxiliary_loss_clip": 0.01100033, "auxiliary_loss_mlp": 0.01031238, "balance_loss_clip": 1.01975822, "balance_loss_mlp": 1.03382754, "epoch": 0.790380279573125, "flos": 22053533195520.0, "grad_norm": 1.8566802061290149, "language_loss": 0.72565067, "learning_rate": 4.4338639994903235e-07, "loss": 0.74696338, "num_input_tokens_seen": 283708625, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.66015625, "step": 13146, "time_per_iteration": 2.4969429969787598 }, { "auxiliary_loss_clip": 0.01106987, "auxiliary_loss_mlp": 0.01033764, "balance_loss_clip": 1.02112198, "balance_loss_mlp": 1.03521752, "epoch": 0.7904404028257929, "flos": 20302600826880.0, "grad_norm": 1.9531105583507988, "language_loss": 0.75717598, "learning_rate": 4.4314189264126246e-07, "loss": 0.77858347, "num_input_tokens_seen": 283725710, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 13147, "time_per_iteration": 2.464404344558716 }, { "auxiliary_loss_clip": 0.01105471, "auxiliary_loss_mlp": 0.01035871, "balance_loss_clip": 1.02264464, "balance_loss_mlp": 1.0361644, "epoch": 0.7905005260784609, "flos": 20008923229440.0, "grad_norm": 2.7209067254886707, "language_loss": 0.72354841, "learning_rate": 4.428974443697087e-07, "loss": 0.74496186, "num_input_tokens_seen": 283744150, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 13148, "time_per_iteration": 2.4499809741973877 }, { "auxiliary_loss_clip": 0.01106333, "auxiliary_loss_mlp": 0.01029636, "balance_loss_clip": 1.01688635, "balance_loss_mlp": 1.03593576, "epoch": 0.7905606493311288, "flos": 26905926418560.0, "grad_norm": 1.7684027111733114, "language_loss": 0.71707058, "learning_rate": 4.4265305514363913e-07, "loss": 0.73843026, "num_input_tokens_seen": 283764170, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 13149, "time_per_iteration": 2.5160601139068604 }, { "auxiliary_loss_clip": 0.01109645, "auxiliary_loss_mlp": 0.01031135, "balance_loss_clip": 1.0174495, "balance_loss_mlp": 1.03829479, "epoch": 0.7906207725837968, "flos": 23696230907520.0, "grad_norm": 2.696362636280078, "language_loss": 0.65217447, "learning_rate": 4.424087249723225e-07, "loss": 0.67358226, "num_input_tokens_seen": 283784305, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7109375, "step": 13150, "time_per_iteration": 2.4642064571380615 }, { "auxiliary_loss_clip": 0.01104858, "auxiliary_loss_mlp": 0.01028351, "balance_loss_clip": 1.01644778, "balance_loss_mlp": 1.03578281, "epoch": 0.7906808958364647, "flos": 20848837927680.0, "grad_norm": 1.7758885996709755, "language_loss": 0.69882756, "learning_rate": 4.421644538650231e-07, "loss": 0.72015971, "num_input_tokens_seen": 283804040, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 13151, "time_per_iteration": 2.4642693996429443 }, { "auxiliary_loss_clip": 0.01108336, "auxiliary_loss_mlp": 0.01035441, "balance_loss_clip": 1.02230358, "balance_loss_mlp": 1.03737426, "epoch": 0.7907410190891327, "flos": 40735196974080.0, "grad_norm": 1.5541382314818453, "language_loss": 0.70071828, "learning_rate": 4.4192024183100306e-07, "loss": 0.72215605, "num_input_tokens_seen": 283827120, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 13152, "time_per_iteration": 2.6255886554718018 }, { "auxiliary_loss_clip": 0.01105074, "auxiliary_loss_mlp": 0.01029244, "balance_loss_clip": 1.01703072, "balance_loss_mlp": 1.03635383, "epoch": 0.7908011423418007, "flos": 13261165050240.0, "grad_norm": 1.8964788266535586, "language_loss": 0.72860235, "learning_rate": 4.4167608887952367e-07, "loss": 0.74994552, "num_input_tokens_seen": 283844820, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 13153, "time_per_iteration": 2.4343628883361816 }, { "auxiliary_loss_clip": 0.01104614, "auxiliary_loss_mlp": 0.01028288, "balance_loss_clip": 1.01626551, "balance_loss_mlp": 1.03492737, "epoch": 0.7908612655944687, "flos": 19754747614080.0, "grad_norm": 1.6240239878843745, "language_loss": 0.7874617, "learning_rate": 4.4143199501984306e-07, "loss": 0.80879068, "num_input_tokens_seen": 283862870, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 13154, "time_per_iteration": 2.444807291030884 }, { "auxiliary_loss_clip": 0.01111467, "auxiliary_loss_mlp": 0.01029606, "balance_loss_clip": 1.01559329, "balance_loss_mlp": 1.03692889, "epoch": 0.7909213888471366, "flos": 21287738211840.0, "grad_norm": 2.2703634794179393, "language_loss": 0.70837957, "learning_rate": 4.411879602612185e-07, "loss": 0.72979033, "num_input_tokens_seen": 283882405, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.74609375, "step": 13155, "time_per_iteration": 2.4582056999206543 }, { "auxiliary_loss_clip": 0.01106981, "auxiliary_loss_mlp": 0.01028641, "balance_loss_clip": 1.01596284, "balance_loss_mlp": 1.03701532, "epoch": 0.7909815120998046, "flos": 22528882805760.0, "grad_norm": 1.6808099234998934, "language_loss": 0.77132416, "learning_rate": 4.4094398461290174e-07, "loss": 0.79268038, "num_input_tokens_seen": 283902070, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 13156, "time_per_iteration": 2.457559823989868 }, { "auxiliary_loss_clip": 0.0110336, "auxiliary_loss_mlp": 0.01028827, "balance_loss_clip": 1.01627398, "balance_loss_mlp": 1.0346272, "epoch": 0.7910416353524725, "flos": 26727702111360.0, "grad_norm": 1.8784153309485185, "language_loss": 0.65523279, "learning_rate": 4.4070006808414526e-07, "loss": 0.67655456, "num_input_tokens_seen": 283924100, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 13157, "time_per_iteration": 2.5305044651031494 }, { "auxiliary_loss_clip": 0.01106774, "auxiliary_loss_mlp": 0.01036968, "balance_loss_clip": 1.02365196, "balance_loss_mlp": 1.03625107, "epoch": 0.7911017586051405, "flos": 24644847139200.0, "grad_norm": 2.541279438589983, "language_loss": 0.74369049, "learning_rate": 4.4045621068419894e-07, "loss": 0.76512796, "num_input_tokens_seen": 283944955, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 13158, "time_per_iteration": 2.4870245456695557 }, { "auxiliary_loss_clip": 0.01102459, "auxiliary_loss_mlp": 0.01034473, "balance_loss_clip": 1.02298748, "balance_loss_mlp": 1.0351156, "epoch": 0.7911618818578086, "flos": 17565489578880.0, "grad_norm": 3.240001417558976, "language_loss": 0.67136347, "learning_rate": 4.40212412422309e-07, "loss": 0.69273281, "num_input_tokens_seen": 283963125, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 13159, "time_per_iteration": 2.4340438842773438 }, { "auxiliary_loss_clip": 0.01104746, "auxiliary_loss_mlp": 0.01036586, "balance_loss_clip": 1.02400374, "balance_loss_mlp": 1.03611207, "epoch": 0.7912220051104765, "flos": 16721660298240.0, "grad_norm": 2.134090325350584, "language_loss": 0.66975927, "learning_rate": 4.399686733077206e-07, "loss": 0.6911726, "num_input_tokens_seen": 283982850, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 13160, "time_per_iteration": 2.460296869277954 }, { "auxiliary_loss_clip": 0.01099896, "auxiliary_loss_mlp": 0.01027483, "balance_loss_clip": 1.01659322, "balance_loss_mlp": 1.0343684, "epoch": 0.7912821283631445, "flos": 13698736531200.0, "grad_norm": 5.7549688835400445, "language_loss": 0.72952896, "learning_rate": 4.3972499334967694e-07, "loss": 0.7508027, "num_input_tokens_seen": 283998275, "router_z_loss_clip": 0.10888672, "router_z_loss_mlp": 0.65625, "step": 13161, "time_per_iteration": 2.434065103530884 }, { "auxiliary_loss_clip": 0.01103251, "auxiliary_loss_mlp": 0.01030146, "balance_loss_clip": 1.01783144, "balance_loss_mlp": 1.03611016, "epoch": 0.7913422516158124, "flos": 23769021818880.0, "grad_norm": 1.7310402346431324, "language_loss": 0.73012388, "learning_rate": 4.39481372557418e-07, "loss": 0.75145781, "num_input_tokens_seen": 284018750, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 13162, "time_per_iteration": 2.472249746322632 }, { "auxiliary_loss_clip": 0.01107569, "auxiliary_loss_mlp": 0.01029121, "balance_loss_clip": 1.01708126, "balance_loss_mlp": 1.03663385, "epoch": 0.7914023748684804, "flos": 19938251220480.0, "grad_norm": 1.7500464695081352, "language_loss": 0.71982205, "learning_rate": 4.392378109401811e-07, "loss": 0.74118888, "num_input_tokens_seen": 284037850, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.7109375, "step": 13163, "time_per_iteration": 2.476642370223999 }, { "auxiliary_loss_clip": 0.01107012, "auxiliary_loss_mlp": 0.01033079, "balance_loss_clip": 1.02005541, "balance_loss_mlp": 1.03795242, "epoch": 0.7914624981211483, "flos": 20594805966720.0, "grad_norm": 2.011717890175843, "language_loss": 0.69567037, "learning_rate": 4.3899430850720296e-07, "loss": 0.71707129, "num_input_tokens_seen": 284056380, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69140625, "step": 13164, "time_per_iteration": 2.4437644481658936 }, { "auxiliary_loss_clip": 0.01103421, "auxiliary_loss_mlp": 0.01032304, "balance_loss_clip": 1.02007318, "balance_loss_mlp": 1.03469443, "epoch": 0.7915226213738163, "flos": 21799465320960.0, "grad_norm": 2.164876209564862, "language_loss": 0.6597482, "learning_rate": 4.387508652677177e-07, "loss": 0.68110549, "num_input_tokens_seen": 284074945, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 13165, "time_per_iteration": 2.466641664505005 }, { "auxiliary_loss_clip": 0.01099645, "auxiliary_loss_mlp": 0.01029291, "balance_loss_clip": 1.01776934, "balance_loss_mlp": 1.03330791, "epoch": 0.7915827446264843, "flos": 16288362535680.0, "grad_norm": 2.0468058066102275, "language_loss": 0.72304392, "learning_rate": 4.385074812309557e-07, "loss": 0.74433327, "num_input_tokens_seen": 284092070, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 13166, "time_per_iteration": 2.4060981273651123 }, { "auxiliary_loss_clip": 0.01104918, "auxiliary_loss_mlp": 0.01033348, "balance_loss_clip": 1.02036583, "balance_loss_mlp": 1.03580499, "epoch": 0.7916428678791523, "flos": 25702595867520.0, "grad_norm": 2.1878990951498, "language_loss": 0.77139223, "learning_rate": 4.382641564061462e-07, "loss": 0.79277486, "num_input_tokens_seen": 284112255, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69140625, "step": 13167, "time_per_iteration": 2.4926607608795166 }, { "auxiliary_loss_clip": 0.01105435, "auxiliary_loss_mlp": 0.01032309, "balance_loss_clip": 1.02081728, "balance_loss_mlp": 1.03697169, "epoch": 0.7917029911318202, "flos": 23878513451520.0, "grad_norm": 2.201930403924288, "language_loss": 0.84087151, "learning_rate": 4.3802089080251713e-07, "loss": 0.8622489, "num_input_tokens_seen": 284132330, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.68359375, "step": 13168, "time_per_iteration": 2.4634249210357666 }, { "auxiliary_loss_clip": 0.01106378, "auxiliary_loss_mlp": 0.01029053, "balance_loss_clip": 1.01683426, "balance_loss_mlp": 1.03630662, "epoch": 0.7917631143844882, "flos": 21646593037440.0, "grad_norm": 1.7200491042664354, "language_loss": 0.72529268, "learning_rate": 4.3777768442929155e-07, "loss": 0.746647, "num_input_tokens_seen": 284150640, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 13169, "time_per_iteration": 3.9400081634521484 }, { "auxiliary_loss_clip": 0.0110626, "auxiliary_loss_mlp": 0.01032936, "balance_loss_clip": 1.02018654, "balance_loss_mlp": 1.03516102, "epoch": 0.7918232376371561, "flos": 38874198355200.0, "grad_norm": 2.02929140193914, "language_loss": 0.67123306, "learning_rate": 4.3753453729569287e-07, "loss": 0.69262505, "num_input_tokens_seen": 284171910, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 13170, "time_per_iteration": 4.044754981994629 }, { "auxiliary_loss_clip": 0.01105425, "auxiliary_loss_mlp": 0.01023799, "balance_loss_clip": 1.01226521, "balance_loss_mlp": 1.0359354, "epoch": 0.7918833608898241, "flos": 20775544225920.0, "grad_norm": 1.8387449482559843, "language_loss": 0.70789909, "learning_rate": 4.372914494109412e-07, "loss": 0.7291913, "num_input_tokens_seen": 284191340, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6953125, "step": 13171, "time_per_iteration": 2.4436023235321045 }, { "auxiliary_loss_clip": 0.0110439, "auxiliary_loss_mlp": 0.01031106, "balance_loss_clip": 1.01840425, "balance_loss_mlp": 1.03548646, "epoch": 0.7919434841424922, "flos": 33910122769920.0, "grad_norm": 1.7117446313260296, "language_loss": 0.66812968, "learning_rate": 4.370484207842553e-07, "loss": 0.68948472, "num_input_tokens_seen": 284212495, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 13172, "time_per_iteration": 3.979045867919922 }, { "auxiliary_loss_clip": 0.01105579, "auxiliary_loss_mlp": 0.01033767, "balance_loss_clip": 1.02105355, "balance_loss_mlp": 1.03608739, "epoch": 0.7920036073951601, "flos": 21064660796160.0, "grad_norm": 1.6753714906177617, "language_loss": 0.79603064, "learning_rate": 4.3680545142484893e-07, "loss": 0.81742412, "num_input_tokens_seen": 284230825, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 13173, "time_per_iteration": 2.4827628135681152 }, { "auxiliary_loss_clip": 0.01104627, "auxiliary_loss_mlp": 0.01030291, "balance_loss_clip": 1.01914454, "balance_loss_mlp": 1.03560996, "epoch": 0.7920637306478281, "flos": 23655974739840.0, "grad_norm": 2.4711198134514123, "language_loss": 0.76944667, "learning_rate": 4.365625413419365e-07, "loss": 0.7907958, "num_input_tokens_seen": 284250365, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.69140625, "step": 13174, "time_per_iteration": 2.458256721496582 }, { "auxiliary_loss_clip": 0.011032, "auxiliary_loss_mlp": 0.01029729, "balance_loss_clip": 1.01852345, "balance_loss_mlp": 1.0357604, "epoch": 0.792123853900496, "flos": 27195438038400.0, "grad_norm": 1.6049029581329302, "language_loss": 0.71676898, "learning_rate": 4.363196905447297e-07, "loss": 0.73809826, "num_input_tokens_seen": 284269635, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.67578125, "step": 13175, "time_per_iteration": 3.960660696029663 }, { "auxiliary_loss_clip": 0.01106239, "auxiliary_loss_mlp": 0.01030726, "balance_loss_clip": 1.01830482, "balance_loss_mlp": 1.03692961, "epoch": 0.792183977153164, "flos": 19098659744640.0, "grad_norm": 2.806666310296064, "language_loss": 0.5934785, "learning_rate": 4.360768990424364e-07, "loss": 0.61484814, "num_input_tokens_seen": 284288380, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 13176, "time_per_iteration": 2.4286141395568848 }, { "auxiliary_loss_clip": 0.01108063, "auxiliary_loss_mlp": 0.01031276, "balance_loss_clip": 1.01911664, "balance_loss_mlp": 1.03945148, "epoch": 0.7922441004058319, "flos": 17128851851520.0, "grad_norm": 1.9666919959880416, "language_loss": 0.73510909, "learning_rate": 4.3583416684426376e-07, "loss": 0.75650245, "num_input_tokens_seen": 284306920, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 13177, "time_per_iteration": 2.453016996383667 }, { "auxiliary_loss_clip": 0.01103937, "auxiliary_loss_mlp": 0.01030505, "balance_loss_clip": 1.018435, "balance_loss_mlp": 1.03664303, "epoch": 0.7923042236585, "flos": 17821640442240.0, "grad_norm": 2.034979073642218, "language_loss": 0.64047813, "learning_rate": 4.355914939594174e-07, "loss": 0.66182256, "num_input_tokens_seen": 284324700, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.671875, "step": 13178, "time_per_iteration": 2.423652410507202 }, { "auxiliary_loss_clip": 0.01103134, "auxiliary_loss_mlp": 0.01028861, "balance_loss_clip": 1.01751804, "balance_loss_mlp": 1.0344286, "epoch": 0.7923643469111679, "flos": 29935206892800.0, "grad_norm": 2.533948767574349, "language_loss": 0.68881202, "learning_rate": 4.3534888039709726e-07, "loss": 0.71013194, "num_input_tokens_seen": 284345985, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6875, "step": 13179, "time_per_iteration": 2.5238876342773438 }, { "auxiliary_loss_clip": 0.01104934, "auxiliary_loss_mlp": 0.01030451, "balance_loss_clip": 1.01782119, "balance_loss_mlp": 1.03619528, "epoch": 0.7924244701638359, "flos": 22674716023680.0, "grad_norm": 2.2717565521494834, "language_loss": 0.74355823, "learning_rate": 4.3510632616650444e-07, "loss": 0.76491207, "num_input_tokens_seen": 284364475, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 13180, "time_per_iteration": 2.4780805110931396 }, { "auxiliary_loss_clip": 0.01111056, "auxiliary_loss_mlp": 0.01036797, "balance_loss_clip": 1.023404, "balance_loss_mlp": 1.0389421, "epoch": 0.7924845934165038, "flos": 17968156018560.0, "grad_norm": 2.0718198446835077, "language_loss": 0.81629354, "learning_rate": 4.3486383127683646e-07, "loss": 0.83777207, "num_input_tokens_seen": 284382125, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 13181, "time_per_iteration": 2.4333837032318115 }, { "auxiliary_loss_clip": 0.01103986, "auxiliary_loss_mlp": 0.01034185, "balance_loss_clip": 1.02101254, "balance_loss_mlp": 1.03620899, "epoch": 0.7925447166691718, "flos": 23476960333440.0, "grad_norm": 2.0966836723843896, "language_loss": 0.77698076, "learning_rate": 4.346213957372895e-07, "loss": 0.79836249, "num_input_tokens_seen": 284401585, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6796875, "step": 13182, "time_per_iteration": 2.488970994949341 }, { "auxiliary_loss_clip": 0.01109887, "auxiliary_loss_mlp": 0.01035957, "balance_loss_clip": 1.02187848, "balance_loss_mlp": 1.03737438, "epoch": 0.7926048399218397, "flos": 20447572118400.0, "grad_norm": 2.186864457673083, "language_loss": 0.74060535, "learning_rate": 4.34379019557056e-07, "loss": 0.7620638, "num_input_tokens_seen": 284419125, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7265625, "step": 13183, "time_per_iteration": 2.491177797317505 }, { "auxiliary_loss_clip": 0.01104924, "auxiliary_loss_mlp": 0.0102925, "balance_loss_clip": 1.01708424, "balance_loss_mlp": 1.03630614, "epoch": 0.7926649631745077, "flos": 37160038535040.0, "grad_norm": 2.1574127794927214, "language_loss": 0.68297243, "learning_rate": 4.341367027453264e-07, "loss": 0.70431417, "num_input_tokens_seen": 284440445, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 13184, "time_per_iteration": 2.5836682319641113 }, { "auxiliary_loss_clip": 0.01107734, "auxiliary_loss_mlp": 0.01032579, "balance_loss_clip": 1.02006793, "balance_loss_mlp": 1.03666604, "epoch": 0.7927250864271758, "flos": 17018606033280.0, "grad_norm": 1.895207080027671, "language_loss": 0.70725501, "learning_rate": 4.338944453112907e-07, "loss": 0.72865808, "num_input_tokens_seen": 284459370, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 13185, "time_per_iteration": 2.4372715950012207 }, { "auxiliary_loss_clip": 0.01107666, "auxiliary_loss_mlp": 0.01028225, "balance_loss_clip": 1.01540446, "balance_loss_mlp": 1.03615296, "epoch": 0.7927852096798437, "flos": 17749208666880.0, "grad_norm": 1.9777937752707666, "language_loss": 0.65148133, "learning_rate": 4.3365224726413375e-07, "loss": 0.67284024, "num_input_tokens_seen": 284477525, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 13186, "time_per_iteration": 2.4151697158813477 }, { "auxiliary_loss_clip": 0.01103959, "auxiliary_loss_mlp": 0.01030246, "balance_loss_clip": 1.01855159, "balance_loss_mlp": 1.0358026, "epoch": 0.7928453329325117, "flos": 23838436851840.0, "grad_norm": 1.5655726993742813, "language_loss": 0.76793134, "learning_rate": 4.334101086130408e-07, "loss": 0.78927338, "num_input_tokens_seen": 284496590, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 13187, "time_per_iteration": 2.4679574966430664 }, { "auxiliary_loss_clip": 0.01104638, "auxiliary_loss_mlp": 0.01027646, "balance_loss_clip": 1.01570117, "balance_loss_mlp": 1.03650343, "epoch": 0.7929054561851796, "flos": 17454920538240.0, "grad_norm": 2.9000247018216174, "language_loss": 0.7312606, "learning_rate": 4.3316802936719334e-07, "loss": 0.75258344, "num_input_tokens_seen": 284511470, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 13188, "time_per_iteration": 2.4132375717163086 }, { "auxiliary_loss_clip": 0.01107116, "auxiliary_loss_mlp": 0.01039682, "balance_loss_clip": 1.0264082, "balance_loss_mlp": 1.03567386, "epoch": 0.7929655794378476, "flos": 21981280988160.0, "grad_norm": 2.824006064489063, "language_loss": 0.63711935, "learning_rate": 4.329260095357725e-07, "loss": 0.65858734, "num_input_tokens_seen": 284531125, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 13189, "time_per_iteration": 2.46547794342041 }, { "auxiliary_loss_clip": 0.0110428, "auxiliary_loss_mlp": 0.01031222, "balance_loss_clip": 1.01922917, "balance_loss_mlp": 1.03525686, "epoch": 0.7930257026905155, "flos": 17273930883840.0, "grad_norm": 1.7979832356612544, "language_loss": 0.7237395, "learning_rate": 4.3268404912795307e-07, "loss": 0.74509454, "num_input_tokens_seen": 284549340, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 13190, "time_per_iteration": 2.444997549057007 }, { "auxiliary_loss_clip": 0.01101856, "auxiliary_loss_mlp": 0.01029117, "balance_loss_clip": 1.01842964, "balance_loss_mlp": 1.03679538, "epoch": 0.7930858259431836, "flos": 27300584125440.0, "grad_norm": 2.315354742871074, "language_loss": 0.73568904, "learning_rate": 4.3244214815291166e-07, "loss": 0.75699878, "num_input_tokens_seen": 284567060, "router_z_loss_clip": 0.10693359, "router_z_loss_mlp": 0.6484375, "step": 13191, "time_per_iteration": 2.5132896900177 }, { "auxiliary_loss_clip": 0.01103603, "auxiliary_loss_mlp": 0.01037443, "balance_loss_clip": 1.02468204, "balance_loss_mlp": 1.0347836, "epoch": 0.7931459491958515, "flos": 19863736456320.0, "grad_norm": 2.383757305087791, "language_loss": 0.69130999, "learning_rate": 4.322003066198219e-07, "loss": 0.71272051, "num_input_tokens_seen": 284586600, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 13192, "time_per_iteration": 2.486558675765991 }, { "auxiliary_loss_clip": 0.01104907, "auxiliary_loss_mlp": 0.01032424, "balance_loss_clip": 1.02057445, "balance_loss_mlp": 1.03502631, "epoch": 0.7932060724485195, "flos": 23147120718720.0, "grad_norm": 3.8435057309334653, "language_loss": 0.7514407, "learning_rate": 4.3195852453785274e-07, "loss": 0.77281404, "num_input_tokens_seen": 284605715, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6953125, "step": 13193, "time_per_iteration": 2.485114812850952 }, { "auxiliary_loss_clip": 0.01105745, "auxiliary_loss_mlp": 0.01033299, "balance_loss_clip": 1.01954269, "balance_loss_mlp": 1.0363214, "epoch": 0.7932661957011874, "flos": 29934847756800.0, "grad_norm": 1.549993544605444, "language_loss": 0.72118044, "learning_rate": 4.317168019161741e-07, "loss": 0.74257088, "num_input_tokens_seen": 284628540, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.6953125, "step": 13194, "time_per_iteration": 2.546985387802124 }, { "auxiliary_loss_clip": 0.01108016, "auxiliary_loss_mlp": 0.01033929, "balance_loss_clip": 1.0208931, "balance_loss_mlp": 1.03627396, "epoch": 0.7933263189538554, "flos": 22559119079040.0, "grad_norm": 4.251210045833859, "language_loss": 0.70442975, "learning_rate": 4.314751387639517e-07, "loss": 0.72584915, "num_input_tokens_seen": 284646040, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 13195, "time_per_iteration": 2.4708292484283447 }, { "auxiliary_loss_clip": 0.01104572, "auxiliary_loss_mlp": 0.0102974, "balance_loss_clip": 1.01690102, "balance_loss_mlp": 1.0354929, "epoch": 0.7933864422065233, "flos": 25479051575040.0, "grad_norm": 1.7616549280726161, "language_loss": 0.77547204, "learning_rate": 4.3123353509034844e-07, "loss": 0.79681516, "num_input_tokens_seen": 284665110, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 13196, "time_per_iteration": 2.5149168968200684 }, { "auxiliary_loss_clip": 0.01108943, "auxiliary_loss_mlp": 0.0103371, "balance_loss_clip": 1.02105546, "balance_loss_mlp": 1.03777409, "epoch": 0.7934465654591913, "flos": 33583156243200.0, "grad_norm": 1.6101612865733395, "language_loss": 0.68503797, "learning_rate": 4.309919909045268e-07, "loss": 0.70646453, "num_input_tokens_seen": 284686515, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 13197, "time_per_iteration": 2.5773162841796875 }, { "auxiliary_loss_clip": 0.01104323, "auxiliary_loss_mlp": 0.01027278, "balance_loss_clip": 1.01507699, "balance_loss_mlp": 1.03584957, "epoch": 0.7935066887118594, "flos": 31432538263680.0, "grad_norm": 2.0070551483231416, "language_loss": 0.64929521, "learning_rate": 4.30750506215646e-07, "loss": 0.67061126, "num_input_tokens_seen": 284707300, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 13198, "time_per_iteration": 2.536618232727051 }, { "auxiliary_loss_clip": 0.01109394, "auxiliary_loss_mlp": 0.01035653, "balance_loss_clip": 1.02215862, "balance_loss_mlp": 1.03756118, "epoch": 0.7935668119645273, "flos": 14682616940160.0, "grad_norm": 2.0370435654406474, "language_loss": 0.72458577, "learning_rate": 4.30509081032864e-07, "loss": 0.74603629, "num_input_tokens_seen": 284723545, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 13199, "time_per_iteration": 2.4304659366607666 }, { "auxiliary_loss_clip": 0.01106519, "auxiliary_loss_mlp": 0.01029472, "balance_loss_clip": 1.01712787, "balance_loss_mlp": 1.03701162, "epoch": 0.7936269352171953, "flos": 18004246208640.0, "grad_norm": 2.0656816191948253, "language_loss": 0.81058639, "learning_rate": 4.302677153653349e-07, "loss": 0.83194637, "num_input_tokens_seen": 284742650, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 13200, "time_per_iteration": 2.4472904205322266 }, { "auxiliary_loss_clip": 0.01103925, "auxiliary_loss_mlp": 0.01031691, "balance_loss_clip": 1.01980555, "balance_loss_mlp": 1.03703213, "epoch": 0.7936870584698632, "flos": 18880215183360.0, "grad_norm": 2.20194817152268, "language_loss": 0.77840114, "learning_rate": 4.3002640922221077e-07, "loss": 0.79975724, "num_input_tokens_seen": 284760955, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.66796875, "step": 13201, "time_per_iteration": 2.4517877101898193 }, { "auxiliary_loss_clip": 0.01103943, "auxiliary_loss_mlp": 0.01033457, "balance_loss_clip": 1.02132118, "balance_loss_mlp": 1.03580916, "epoch": 0.7937471817225312, "flos": 23367001824000.0, "grad_norm": 1.6749110044474773, "language_loss": 0.67218459, "learning_rate": 4.2978516261264296e-07, "loss": 0.69355857, "num_input_tokens_seen": 284780745, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 13202, "time_per_iteration": 2.5120627880096436 }, { "auxiliary_loss_clip": 0.01106614, "auxiliary_loss_mlp": 0.01032369, "balance_loss_clip": 1.01928604, "balance_loss_mlp": 1.03677893, "epoch": 0.7938073049751991, "flos": 22674428714880.0, "grad_norm": 2.2590754196194065, "language_loss": 0.7501806, "learning_rate": 4.2954397554577884e-07, "loss": 0.77157038, "num_input_tokens_seen": 284799000, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 13203, "time_per_iteration": 2.473208427429199 }, { "auxiliary_loss_clip": 0.01104487, "auxiliary_loss_mlp": 0.0103386, "balance_loss_clip": 1.02220118, "balance_loss_mlp": 1.03534269, "epoch": 0.7938674282278672, "flos": 22851431959680.0, "grad_norm": 1.7952439700279124, "language_loss": 0.66112691, "learning_rate": 4.293028480307643e-07, "loss": 0.68251038, "num_input_tokens_seen": 284817450, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.69140625, "step": 13204, "time_per_iteration": 2.4492177963256836 }, { "auxiliary_loss_clip": 0.01102765, "auxiliary_loss_mlp": 0.01030546, "balance_loss_clip": 1.0180943, "balance_loss_mlp": 1.03437233, "epoch": 0.7939275514805351, "flos": 27012509049600.0, "grad_norm": 1.7943587283344453, "language_loss": 0.79503894, "learning_rate": 4.290617800767438e-07, "loss": 0.8163721, "num_input_tokens_seen": 284838865, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.68359375, "step": 13205, "time_per_iteration": 2.51476788520813 }, { "auxiliary_loss_clip": 0.01102383, "auxiliary_loss_mlp": 0.0102858, "balance_loss_clip": 1.01648033, "balance_loss_mlp": 1.03465307, "epoch": 0.7939876747332031, "flos": 21142838747520.0, "grad_norm": 3.326696442557899, "language_loss": 0.77935296, "learning_rate": 4.28820771692858e-07, "loss": 0.80066258, "num_input_tokens_seen": 284857975, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 13206, "time_per_iteration": 2.460319757461548 }, { "auxiliary_loss_clip": 0.01109012, "auxiliary_loss_mlp": 0.01031628, "balance_loss_clip": 1.0180496, "balance_loss_mlp": 1.0371995, "epoch": 0.794047797985871, "flos": 23289075267840.0, "grad_norm": 1.9270889967482054, "language_loss": 0.7852208, "learning_rate": 4.285798228882456e-07, "loss": 0.80662715, "num_input_tokens_seen": 284877145, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 13207, "time_per_iteration": 2.4707326889038086 }, { "auxiliary_loss_clip": 0.01105795, "auxiliary_loss_mlp": 0.01034999, "balance_loss_clip": 1.02255929, "balance_loss_mlp": 1.03705907, "epoch": 0.794107921238539, "flos": 24608074590720.0, "grad_norm": 1.9186427396821386, "language_loss": 0.84241456, "learning_rate": 4.2833893367204375e-07, "loss": 0.86382246, "num_input_tokens_seen": 284895560, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6875, "step": 13208, "time_per_iteration": 2.4845263957977295 }, { "auxiliary_loss_clip": 0.01030392, "auxiliary_loss_mlp": 0.01005174, "balance_loss_clip": 1.00423849, "balance_loss_mlp": 1.0075599, "epoch": 0.7941680444912069, "flos": 64093690252800.0, "grad_norm": 0.7894275313682845, "language_loss": 0.58325815, "learning_rate": 4.280981040533875e-07, "loss": 0.60361373, "num_input_tokens_seen": 284963135, "router_z_loss_clip": 0.00933838, "router_z_loss_mlp": 0.22851562, "step": 13209, "time_per_iteration": 3.185857057571411 }, { "auxiliary_loss_clip": 0.01110612, "auxiliary_loss_mlp": 0.01028382, "balance_loss_clip": 1.01542401, "balance_loss_mlp": 1.03812087, "epoch": 0.794228167743875, "flos": 24388839930240.0, "grad_norm": 2.7827607810566883, "language_loss": 0.6414113, "learning_rate": 4.2785733404140825e-07, "loss": 0.66280121, "num_input_tokens_seen": 284981755, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 13210, "time_per_iteration": 2.4776999950408936 }, { "auxiliary_loss_clip": 0.01105252, "auxiliary_loss_mlp": 0.01031839, "balance_loss_clip": 1.01985872, "balance_loss_mlp": 1.0356524, "epoch": 0.794288290996543, "flos": 28512498026880.0, "grad_norm": 1.620304113496028, "language_loss": 0.69601452, "learning_rate": 4.2761662364523676e-07, "loss": 0.71738547, "num_input_tokens_seen": 285003060, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 13211, "time_per_iteration": 5.482569932937622 }, { "auxiliary_loss_clip": 0.0110648, "auxiliary_loss_mlp": 0.01036005, "balance_loss_clip": 1.02266502, "balance_loss_mlp": 1.03513098, "epoch": 0.7943484142492109, "flos": 25922117836800.0, "grad_norm": 1.5153905959420206, "language_loss": 0.72137284, "learning_rate": 4.2737597287400074e-07, "loss": 0.74279767, "num_input_tokens_seen": 285021640, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 13212, "time_per_iteration": 2.492255449295044 }, { "auxiliary_loss_clip": 0.01103663, "auxiliary_loss_mlp": 0.0103169, "balance_loss_clip": 1.02021623, "balance_loss_mlp": 1.03651619, "epoch": 0.7944085375018789, "flos": 23915286000000.0, "grad_norm": 1.5729663052376957, "language_loss": 0.80476469, "learning_rate": 4.271353817368246e-07, "loss": 0.82611823, "num_input_tokens_seen": 285040490, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.671875, "step": 13213, "time_per_iteration": 2.460684061050415 }, { "auxiliary_loss_clip": 0.01110532, "auxiliary_loss_mlp": 0.01034803, "balance_loss_clip": 1.02165413, "balance_loss_mlp": 1.03826034, "epoch": 0.7944686607545468, "flos": 20229953569920.0, "grad_norm": 2.722925836226407, "language_loss": 0.67949772, "learning_rate": 4.268948502428327e-07, "loss": 0.70095098, "num_input_tokens_seen": 285059270, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 13214, "time_per_iteration": 3.82637619972229 }, { "auxiliary_loss_clip": 0.01102755, "auxiliary_loss_mlp": 0.01029601, "balance_loss_clip": 1.01791859, "balance_loss_mlp": 1.03570139, "epoch": 0.7945287840072148, "flos": 21980993679360.0, "grad_norm": 2.1221685978652967, "language_loss": 0.72585255, "learning_rate": 4.2665437840114535e-07, "loss": 0.74717611, "num_input_tokens_seen": 285075390, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 13215, "time_per_iteration": 2.4700369834899902 }, { "auxiliary_loss_clip": 0.01107598, "auxiliary_loss_mlp": 0.01029712, "balance_loss_clip": 1.01782084, "balance_loss_mlp": 1.03904605, "epoch": 0.7945889072598827, "flos": 26397718842240.0, "grad_norm": 3.018440812933166, "language_loss": 0.78776664, "learning_rate": 4.2641396622088253e-07, "loss": 0.80913979, "num_input_tokens_seen": 285096290, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 13216, "time_per_iteration": 2.538908004760742 }, { "auxiliary_loss_clip": 0.01106737, "auxiliary_loss_mlp": 0.01032832, "balance_loss_clip": 1.02085757, "balance_loss_mlp": 1.03697073, "epoch": 0.7946490305125508, "flos": 25810255906560.0, "grad_norm": 1.6699064462568476, "language_loss": 0.73815334, "learning_rate": 4.261736137111598e-07, "loss": 0.75954902, "num_input_tokens_seen": 285116020, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.69921875, "step": 13217, "time_per_iteration": 3.910142183303833 }, { "auxiliary_loss_clip": 0.01103721, "auxiliary_loss_mlp": 0.01032786, "balance_loss_clip": 1.01993561, "balance_loss_mlp": 1.03648853, "epoch": 0.7947091537652187, "flos": 15960965045760.0, "grad_norm": 3.440400174809167, "language_loss": 0.74114728, "learning_rate": 4.259333208810907e-07, "loss": 0.76251239, "num_input_tokens_seen": 285133510, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.671875, "step": 13218, "time_per_iteration": 2.446303606033325 }, { "auxiliary_loss_clip": 0.01106726, "auxiliary_loss_mlp": 0.01037936, "balance_loss_clip": 1.02434659, "balance_loss_mlp": 1.03488696, "epoch": 0.7947692770178867, "flos": 18587866389120.0, "grad_norm": 2.564853148302145, "language_loss": 0.83374476, "learning_rate": 4.2569308773978817e-07, "loss": 0.85519135, "num_input_tokens_seen": 285151690, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 13219, "time_per_iteration": 2.4287216663360596 }, { "auxiliary_loss_clip": 0.01110601, "auxiliary_loss_mlp": 0.01034694, "balance_loss_clip": 1.02130699, "balance_loss_mlp": 1.0377295, "epoch": 0.7948294002705546, "flos": 20442220992000.0, "grad_norm": 11.12972106293741, "language_loss": 0.75244832, "learning_rate": 4.2545291429636123e-07, "loss": 0.77390122, "num_input_tokens_seen": 285170485, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 13220, "time_per_iteration": 2.488867998123169 }, { "auxiliary_loss_clip": 0.01108718, "auxiliary_loss_mlp": 0.01037617, "balance_loss_clip": 1.02480197, "balance_loss_mlp": 1.03708827, "epoch": 0.7948895235232226, "flos": 38181194282880.0, "grad_norm": 1.76565695165487, "language_loss": 0.72402263, "learning_rate": 4.252128005599176e-07, "loss": 0.74548602, "num_input_tokens_seen": 285191050, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 13221, "time_per_iteration": 2.603907346725464 }, { "auxiliary_loss_clip": 0.01103073, "auxiliary_loss_mlp": 0.01028222, "balance_loss_clip": 1.01650977, "balance_loss_mlp": 1.03598523, "epoch": 0.7949496467758905, "flos": 15559806977280.0, "grad_norm": 2.223919633644355, "language_loss": 0.75043952, "learning_rate": 4.249727465395634e-07, "loss": 0.77175248, "num_input_tokens_seen": 285208750, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 13222, "time_per_iteration": 2.4354302883148193 }, { "auxiliary_loss_clip": 0.01030015, "auxiliary_loss_mlp": 0.010017, "balance_loss_clip": 1.00071609, "balance_loss_mlp": 1.00729024, "epoch": 0.7950097700285585, "flos": 70897036728960.0, "grad_norm": 0.7738811548077, "language_loss": 0.67042863, "learning_rate": 4.247327522443993e-07, "loss": 0.69074571, "num_input_tokens_seen": 285264605, "router_z_loss_clip": 0.00982666, "router_z_loss_mlp": 0.22753906, "step": 13223, "time_per_iteration": 2.9506518840789795 }, { "auxiliary_loss_clip": 0.01104556, "auxiliary_loss_mlp": 0.01033169, "balance_loss_clip": 1.02033579, "balance_loss_mlp": 1.03450966, "epoch": 0.7950698932812266, "flos": 23951627585280.0, "grad_norm": 2.036008498498927, "language_loss": 0.7092011, "learning_rate": 4.2449281768352717e-07, "loss": 0.7305783, "num_input_tokens_seen": 285283940, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 13224, "time_per_iteration": 2.5108206272125244 }, { "auxiliary_loss_clip": 0.01030817, "auxiliary_loss_mlp": 0.01001121, "balance_loss_clip": 1.00010765, "balance_loss_mlp": 1.00788736, "epoch": 0.7951300165338945, "flos": 60282561415680.0, "grad_norm": 0.6851600745562187, "language_loss": 0.54976177, "learning_rate": 4.2425294286604527e-07, "loss": 0.57008111, "num_input_tokens_seen": 285349525, "router_z_loss_clip": 0.01013184, "router_z_loss_mlp": 0.22949219, "step": 13225, "time_per_iteration": 3.142537832260132 }, { "auxiliary_loss_clip": 0.01101796, "auxiliary_loss_mlp": 0.01024012, "balance_loss_clip": 1.01240134, "balance_loss_mlp": 1.03413916, "epoch": 0.7951901397865625, "flos": 22819004956800.0, "grad_norm": 1.9352636637668437, "language_loss": 0.65063906, "learning_rate": 4.2401312780105034e-07, "loss": 0.67189717, "num_input_tokens_seen": 285367355, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 13226, "time_per_iteration": 2.4838154315948486 }, { "auxiliary_loss_clip": 0.01107439, "auxiliary_loss_mlp": 0.01040762, "balance_loss_clip": 1.02813745, "balance_loss_mlp": 1.03698826, "epoch": 0.7952502630392304, "flos": 35695672871040.0, "grad_norm": 2.4066535790742294, "language_loss": 0.7035495, "learning_rate": 4.237733724976349e-07, "loss": 0.7250315, "num_input_tokens_seen": 285386190, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 13227, "time_per_iteration": 2.586549758911133 }, { "auxiliary_loss_clip": 0.01102448, "auxiliary_loss_mlp": 0.0102945, "balance_loss_clip": 1.01839888, "balance_loss_mlp": 1.03524888, "epoch": 0.7953103862918984, "flos": 25629840869760.0, "grad_norm": 2.5276025141661096, "language_loss": 0.69479346, "learning_rate": 4.2353367696489184e-07, "loss": 0.71611249, "num_input_tokens_seen": 285406150, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.671875, "step": 13228, "time_per_iteration": 2.5308523178100586 }, { "auxiliary_loss_clip": 0.01106675, "auxiliary_loss_mlp": 0.01036234, "balance_loss_clip": 1.02353787, "balance_loss_mlp": 1.03612924, "epoch": 0.7953705095445663, "flos": 40551980676480.0, "grad_norm": 2.3864177013146812, "language_loss": 0.70820785, "learning_rate": 4.232940412119095e-07, "loss": 0.72963691, "num_input_tokens_seen": 285429900, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 13229, "time_per_iteration": 2.62778377532959 }, { "auxiliary_loss_clip": 0.01112218, "auxiliary_loss_mlp": 0.01032038, "balance_loss_clip": 1.0197413, "balance_loss_mlp": 1.03974831, "epoch": 0.7954306327972344, "flos": 27636672706560.0, "grad_norm": 1.967642642468879, "language_loss": 0.71884781, "learning_rate": 4.2305446524777457e-07, "loss": 0.74029034, "num_input_tokens_seen": 285452555, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7265625, "step": 13230, "time_per_iteration": 2.535308599472046 }, { "auxiliary_loss_clip": 0.0103021, "auxiliary_loss_mlp": 0.01001588, "balance_loss_clip": 1.00054514, "balance_loss_mlp": 1.00756538, "epoch": 0.7954907560499023, "flos": 59504055995520.0, "grad_norm": 0.9003585744217645, "language_loss": 0.63537312, "learning_rate": 4.2281494908157247e-07, "loss": 0.65569115, "num_input_tokens_seen": 285515700, "router_z_loss_clip": 0.01043701, "router_z_loss_mlp": 0.2265625, "step": 13231, "time_per_iteration": 3.096871852874756 }, { "auxiliary_loss_clip": 0.01105451, "auxiliary_loss_mlp": 0.01028009, "balance_loss_clip": 1.01601088, "balance_loss_mlp": 1.03646755, "epoch": 0.7955508793025703, "flos": 20120533764480.0, "grad_norm": 1.727468524963267, "language_loss": 0.70050061, "learning_rate": 4.2257549272238566e-07, "loss": 0.7218352, "num_input_tokens_seen": 285533910, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 13232, "time_per_iteration": 2.482184648513794 }, { "auxiliary_loss_clip": 0.01105237, "auxiliary_loss_mlp": 0.01026692, "balance_loss_clip": 1.01433563, "balance_loss_mlp": 1.03623199, "epoch": 0.7956110025552382, "flos": 26505378881280.0, "grad_norm": 1.534793636449539, "language_loss": 0.78206146, "learning_rate": 4.223360961792952e-07, "loss": 0.80338073, "num_input_tokens_seen": 285554080, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69140625, "step": 13233, "time_per_iteration": 2.506441593170166 }, { "auxiliary_loss_clip": 0.01106294, "auxiliary_loss_mlp": 0.01030956, "balance_loss_clip": 1.01866591, "balance_loss_mlp": 1.03564894, "epoch": 0.7956711258079062, "flos": 22565475786240.0, "grad_norm": 2.2979163900980484, "language_loss": 0.79156291, "learning_rate": 4.220967594613769e-07, "loss": 0.81293535, "num_input_tokens_seen": 285572325, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 13234, "time_per_iteration": 2.5348801612854004 }, { "auxiliary_loss_clip": 0.01103788, "auxiliary_loss_mlp": 0.01029217, "balance_loss_clip": 1.01766574, "balance_loss_mlp": 1.03556752, "epoch": 0.7957312490605741, "flos": 17379005143680.0, "grad_norm": 1.8780297513776967, "language_loss": 0.70364785, "learning_rate": 4.218574825777077e-07, "loss": 0.72497791, "num_input_tokens_seen": 285589770, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.68359375, "step": 13235, "time_per_iteration": 2.425487518310547 }, { "auxiliary_loss_clip": 0.01105782, "auxiliary_loss_mlp": 0.01031367, "balance_loss_clip": 1.01833749, "balance_loss_mlp": 1.03605247, "epoch": 0.7957913723132422, "flos": 22491427898880.0, "grad_norm": 1.5818987191877616, "language_loss": 0.67987925, "learning_rate": 4.2161826553736145e-07, "loss": 0.70125079, "num_input_tokens_seen": 285610065, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 13236, "time_per_iteration": 2.506474733352661 }, { "auxiliary_loss_clip": 0.01104347, "auxiliary_loss_mlp": 0.01027391, "balance_loss_clip": 1.01530337, "balance_loss_mlp": 1.03610373, "epoch": 0.7958514955659101, "flos": 22638087129600.0, "grad_norm": 1.9543906658662884, "language_loss": 0.75124758, "learning_rate": 4.2137910834940826e-07, "loss": 0.77256501, "num_input_tokens_seen": 285628480, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 13237, "time_per_iteration": 2.4712541103363037 }, { "auxiliary_loss_clip": 0.01108344, "auxiliary_loss_mlp": 0.01034737, "balance_loss_clip": 1.0217905, "balance_loss_mlp": 1.0385263, "epoch": 0.7959116188185781, "flos": 20704225772160.0, "grad_norm": 1.7907089733820944, "language_loss": 0.7121278, "learning_rate": 4.211400110229175e-07, "loss": 0.73355865, "num_input_tokens_seen": 285647805, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 13238, "time_per_iteration": 2.4867427349090576 }, { "auxiliary_loss_clip": 0.01104094, "auxiliary_loss_mlp": 0.01026604, "balance_loss_clip": 1.01434946, "balance_loss_mlp": 1.03452492, "epoch": 0.7959717420712461, "flos": 19024683684480.0, "grad_norm": 3.841091654518357, "language_loss": 0.73904169, "learning_rate": 4.2090097356695684e-07, "loss": 0.76034862, "num_input_tokens_seen": 285665505, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 13239, "time_per_iteration": 2.4317476749420166 }, { "auxiliary_loss_clip": 0.01108282, "auxiliary_loss_mlp": 0.01032752, "balance_loss_clip": 1.02005017, "balance_loss_mlp": 1.03686213, "epoch": 0.796031865323914, "flos": 26356636661760.0, "grad_norm": 1.9760614718581215, "language_loss": 0.69073296, "learning_rate": 4.2066199599058814e-07, "loss": 0.7121433, "num_input_tokens_seen": 285685855, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 13240, "time_per_iteration": 2.56084942817688 }, { "auxiliary_loss_clip": 0.01029995, "auxiliary_loss_mlp": 0.01003795, "balance_loss_clip": 1.002859, "balance_loss_mlp": 1.00721443, "epoch": 0.796091988576582, "flos": 62069440320000.0, "grad_norm": 0.8987653320070665, "language_loss": 0.58675426, "learning_rate": 4.2042307830287526e-07, "loss": 0.60709214, "num_input_tokens_seen": 285735710, "router_z_loss_clip": 0.00933838, "router_z_loss_mlp": 0.22851562, "step": 13241, "time_per_iteration": 2.875171184539795 }, { "auxiliary_loss_clip": 0.01106647, "auxiliary_loss_mlp": 0.0102867, "balance_loss_clip": 1.01735139, "balance_loss_mlp": 1.03754926, "epoch": 0.7961521118292499, "flos": 39020103400320.0, "grad_norm": 2.1489693368743077, "language_loss": 0.6498338, "learning_rate": 4.201842205128772e-07, "loss": 0.67118692, "num_input_tokens_seen": 285757045, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.69140625, "step": 13242, "time_per_iteration": 2.600825309753418 }, { "auxiliary_loss_clip": 0.01106644, "auxiliary_loss_mlp": 0.010316, "balance_loss_clip": 1.01871324, "balance_loss_mlp": 1.03607082, "epoch": 0.796212235081918, "flos": 21762836426880.0, "grad_norm": 1.8150581503100154, "language_loss": 0.75940788, "learning_rate": 4.199454226296526e-07, "loss": 0.78079033, "num_input_tokens_seen": 285776050, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 13243, "time_per_iteration": 2.4555962085723877 }, { "auxiliary_loss_clip": 0.01106129, "auxiliary_loss_mlp": 0.01031498, "balance_loss_clip": 1.01883829, "balance_loss_mlp": 1.0358243, "epoch": 0.7962723583345859, "flos": 21178857110400.0, "grad_norm": 1.7451154078866868, "language_loss": 0.79690015, "learning_rate": 4.1970668466225565e-07, "loss": 0.81827641, "num_input_tokens_seen": 285796830, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 13244, "time_per_iteration": 2.474069595336914 }, { "auxiliary_loss_clip": 0.01108347, "auxiliary_loss_mlp": 0.01030883, "balance_loss_clip": 1.01783526, "balance_loss_mlp": 1.03617215, "epoch": 0.7963324815872539, "flos": 17128636369920.0, "grad_norm": 2.1987360327291245, "language_loss": 0.68263525, "learning_rate": 4.1946800661973934e-07, "loss": 0.70402753, "num_input_tokens_seen": 285814755, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 13245, "time_per_iteration": 2.419079542160034 }, { "auxiliary_loss_clip": 0.01105575, "auxiliary_loss_mlp": 0.01034641, "balance_loss_clip": 1.02184367, "balance_loss_mlp": 1.03566957, "epoch": 0.7963926048399218, "flos": 21397481239680.0, "grad_norm": 1.98002804792033, "language_loss": 0.79125202, "learning_rate": 4.192293885111549e-07, "loss": 0.81265414, "num_input_tokens_seen": 285834255, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 13246, "time_per_iteration": 2.470197916030884 }, { "auxiliary_loss_clip": 0.01107748, "auxiliary_loss_mlp": 0.01029736, "balance_loss_clip": 1.01668811, "balance_loss_mlp": 1.03559017, "epoch": 0.7964527280925898, "flos": 25184188828800.0, "grad_norm": 2.3216155227572672, "language_loss": 0.66016984, "learning_rate": 4.1899083034555007e-07, "loss": 0.68154472, "num_input_tokens_seen": 285853540, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 13247, "time_per_iteration": 2.486424207687378 }, { "auxiliary_loss_clip": 0.01102023, "auxiliary_loss_mlp": 0.01029057, "balance_loss_clip": 1.01738048, "balance_loss_mlp": 1.03475809, "epoch": 0.7965128513452577, "flos": 27015884928000.0, "grad_norm": 1.7522622043244367, "language_loss": 0.71445632, "learning_rate": 4.1875233213197123e-07, "loss": 0.73576713, "num_input_tokens_seen": 285872705, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 13248, "time_per_iteration": 2.4804015159606934 }, { "auxiliary_loss_clip": 0.01107433, "auxiliary_loss_mlp": 0.01029217, "balance_loss_clip": 1.01659846, "balance_loss_mlp": 1.03605032, "epoch": 0.7965729745979258, "flos": 24419578993920.0, "grad_norm": 2.138186702504696, "language_loss": 0.76340836, "learning_rate": 4.1851389387946255e-07, "loss": 0.7847749, "num_input_tokens_seen": 285890290, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 13249, "time_per_iteration": 2.454777717590332 }, { "auxiliary_loss_clip": 0.01104544, "auxiliary_loss_mlp": 0.01031352, "balance_loss_clip": 1.0191865, "balance_loss_mlp": 1.03663969, "epoch": 0.7966330978505937, "flos": 18840389978880.0, "grad_norm": 2.3218789431283917, "language_loss": 0.62142432, "learning_rate": 4.1827551559706674e-07, "loss": 0.64278334, "num_input_tokens_seen": 285909190, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 13250, "time_per_iteration": 2.4366557598114014 }, { "auxiliary_loss_clip": 0.01105754, "auxiliary_loss_mlp": 0.01026995, "balance_loss_clip": 1.01502681, "balance_loss_mlp": 1.03668857, "epoch": 0.7966932211032617, "flos": 13152319862400.0, "grad_norm": 2.249279375514545, "language_loss": 0.71645492, "learning_rate": 4.180371972938206e-07, "loss": 0.73778248, "num_input_tokens_seen": 285927570, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.69140625, "step": 13251, "time_per_iteration": 2.428439140319824 }, { "auxiliary_loss_clip": 0.01110977, "auxiliary_loss_mlp": 0.01034177, "balance_loss_clip": 1.02041459, "balance_loss_mlp": 1.03831148, "epoch": 0.7967533443559297, "flos": 23949760078080.0, "grad_norm": 1.7840326792225623, "language_loss": 0.72744912, "learning_rate": 4.177989389787624e-07, "loss": 0.74890065, "num_input_tokens_seen": 285945810, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7265625, "step": 13252, "time_per_iteration": 4.041464328765869 }, { "auxiliary_loss_clip": 0.0110251, "auxiliary_loss_mlp": 0.0102685, "balance_loss_clip": 1.01461291, "balance_loss_mlp": 1.0359199, "epoch": 0.7968134676085976, "flos": 30368791964160.0, "grad_norm": 1.8501167148321378, "language_loss": 0.66042143, "learning_rate": 4.175607406609278e-07, "loss": 0.68171501, "num_input_tokens_seen": 285964235, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.66796875, "step": 13253, "time_per_iteration": 3.9143595695495605 }, { "auxiliary_loss_clip": 0.01109728, "auxiliary_loss_mlp": 0.0103592, "balance_loss_clip": 1.02314687, "balance_loss_mlp": 1.0384419, "epoch": 0.7968735908612656, "flos": 23075048079360.0, "grad_norm": 2.067008953413039, "language_loss": 0.67718035, "learning_rate": 4.1732260234934767e-07, "loss": 0.69863689, "num_input_tokens_seen": 285983710, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 13254, "time_per_iteration": 2.4600162506103516 }, { "auxiliary_loss_clip": 0.01103014, "auxiliary_loss_mlp": 0.0103735, "balance_loss_clip": 1.02474999, "balance_loss_mlp": 1.03460932, "epoch": 0.7969337141139335, "flos": 23582250074880.0, "grad_norm": 1.963507480042055, "language_loss": 0.6948446, "learning_rate": 4.1708452405305314e-07, "loss": 0.71624821, "num_input_tokens_seen": 286003425, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 13255, "time_per_iteration": 2.4770185947418213 }, { "auxiliary_loss_clip": 0.01101903, "auxiliary_loss_mlp": 0.01031032, "balance_loss_clip": 1.01965308, "balance_loss_mlp": 1.03411198, "epoch": 0.7969938373666016, "flos": 19755860935680.0, "grad_norm": 2.1290187400749083, "language_loss": 0.79289126, "learning_rate": 4.168465057810733e-07, "loss": 0.81422067, "num_input_tokens_seen": 286020130, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6796875, "step": 13256, "time_per_iteration": 3.849377393722534 }, { "auxiliary_loss_clip": 0.01107417, "auxiliary_loss_mlp": 0.01027545, "balance_loss_clip": 1.015028, "balance_loss_mlp": 1.03732967, "epoch": 0.7970539606192695, "flos": 24134089697280.0, "grad_norm": 1.827128505587668, "language_loss": 0.6590091, "learning_rate": 4.166085475424315e-07, "loss": 0.68035877, "num_input_tokens_seen": 286040230, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 13257, "time_per_iteration": 2.472301721572876 }, { "auxiliary_loss_clip": 0.01110463, "auxiliary_loss_mlp": 0.01033356, "balance_loss_clip": 1.02098775, "balance_loss_mlp": 1.03756046, "epoch": 0.7971140838719375, "flos": 17968622895360.0, "grad_norm": 2.1074371661072653, "language_loss": 0.71981227, "learning_rate": 4.163706493461523e-07, "loss": 0.74125051, "num_input_tokens_seen": 286059475, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.73046875, "step": 13258, "time_per_iteration": 3.8924667835235596 }, { "auxiliary_loss_clip": 0.01106637, "auxiliary_loss_mlp": 0.01030041, "balance_loss_clip": 1.01694036, "balance_loss_mlp": 1.03636551, "epoch": 0.7971742071246054, "flos": 19169547235200.0, "grad_norm": 2.245762600935303, "language_loss": 0.68780482, "learning_rate": 4.1613281120125655e-07, "loss": 0.70917159, "num_input_tokens_seen": 286077820, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 13259, "time_per_iteration": 2.441143035888672 }, { "auxiliary_loss_clip": 0.01102926, "auxiliary_loss_mlp": 0.0103027, "balance_loss_clip": 1.01890922, "balance_loss_mlp": 1.03612995, "epoch": 0.7972343303772734, "flos": 27125951178240.0, "grad_norm": 2.34570099223686, "language_loss": 0.73742491, "learning_rate": 4.158950331167641e-07, "loss": 0.75875688, "num_input_tokens_seen": 286097285, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.66796875, "step": 13260, "time_per_iteration": 2.5123090744018555 }, { "auxiliary_loss_clip": 0.01103037, "auxiliary_loss_mlp": 0.01031133, "balance_loss_clip": 1.01878333, "balance_loss_mlp": 1.03492069, "epoch": 0.7972944536299413, "flos": 20996646393600.0, "grad_norm": 1.9013726861507652, "language_loss": 0.78528798, "learning_rate": 4.1565731510169065e-07, "loss": 0.80662972, "num_input_tokens_seen": 286116000, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 13261, "time_per_iteration": 2.4594004154205322 }, { "auxiliary_loss_clip": 0.01099924, "auxiliary_loss_mlp": 0.01029852, "balance_loss_clip": 1.01926053, "balance_loss_mlp": 1.03538966, "epoch": 0.7973545768826094, "flos": 21580015178880.0, "grad_norm": 1.4761475536007562, "language_loss": 0.76084036, "learning_rate": 4.154196571650501e-07, "loss": 0.78213811, "num_input_tokens_seen": 286135110, "router_z_loss_clip": 0.10595703, "router_z_loss_mlp": 0.6484375, "step": 13262, "time_per_iteration": 2.4652199745178223 }, { "auxiliary_loss_clip": 0.01110776, "auxiliary_loss_mlp": 0.01035612, "balance_loss_clip": 1.02199793, "balance_loss_mlp": 1.03822088, "epoch": 0.7974147001352773, "flos": 20558536208640.0, "grad_norm": 7.966768985989486, "language_loss": 0.70463347, "learning_rate": 4.1518205931585524e-07, "loss": 0.72609735, "num_input_tokens_seen": 286152835, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 13263, "time_per_iteration": 2.4315617084503174 }, { "auxiliary_loss_clip": 0.01110977, "auxiliary_loss_mlp": 0.01035745, "balance_loss_clip": 1.02192259, "balance_loss_mlp": 1.03716421, "epoch": 0.7974748233879453, "flos": 20996790048000.0, "grad_norm": 2.1021555814598254, "language_loss": 0.71138006, "learning_rate": 4.149445215631153e-07, "loss": 0.73284727, "num_input_tokens_seen": 286171785, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.73828125, "step": 13264, "time_per_iteration": 2.4718048572540283 }, { "auxiliary_loss_clip": 0.01103407, "auxiliary_loss_mlp": 0.01031892, "balance_loss_clip": 1.01978004, "balance_loss_mlp": 1.03635132, "epoch": 0.7975349466406133, "flos": 22565188477440.0, "grad_norm": 1.7645851032125237, "language_loss": 0.77366, "learning_rate": 4.1470704391583776e-07, "loss": 0.79501295, "num_input_tokens_seen": 286190420, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 13265, "time_per_iteration": 2.446436643600464 }, { "auxiliary_loss_clip": 0.01105615, "auxiliary_loss_mlp": 0.01028842, "balance_loss_clip": 1.01701057, "balance_loss_mlp": 1.03562856, "epoch": 0.7975950698932812, "flos": 21689542725120.0, "grad_norm": 2.1375365440166147, "language_loss": 0.75550449, "learning_rate": 4.144696263830285e-07, "loss": 0.77684903, "num_input_tokens_seen": 286210105, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69921875, "step": 13266, "time_per_iteration": 2.454625129699707 }, { "auxiliary_loss_clip": 0.01102881, "auxiliary_loss_mlp": 0.01028497, "balance_loss_clip": 1.01665902, "balance_loss_mlp": 1.03423429, "epoch": 0.7976551931459492, "flos": 19604568850560.0, "grad_norm": 10.798293592121766, "language_loss": 0.83861631, "learning_rate": 4.1423226897369015e-07, "loss": 0.85993004, "num_input_tokens_seen": 286228180, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 13267, "time_per_iteration": 2.435434341430664 }, { "auxiliary_loss_clip": 0.01104245, "auxiliary_loss_mlp": 0.01029383, "balance_loss_clip": 1.01665127, "balance_loss_mlp": 1.03560901, "epoch": 0.7977153163986171, "flos": 21687603390720.0, "grad_norm": 2.071548083533168, "language_loss": 0.75846124, "learning_rate": 4.139949716968223e-07, "loss": 0.77979749, "num_input_tokens_seen": 286247305, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 13268, "time_per_iteration": 2.454526662826538 }, { "auxiliary_loss_clip": 0.01105942, "auxiliary_loss_mlp": 0.01030283, "balance_loss_clip": 1.01832676, "balance_loss_mlp": 1.03703249, "epoch": 0.7977754396512852, "flos": 23476780765440.0, "grad_norm": 1.641820674419086, "language_loss": 0.77733052, "learning_rate": 4.1375773456142403e-07, "loss": 0.79869282, "num_input_tokens_seen": 286268145, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 13269, "time_per_iteration": 2.475938081741333 }, { "auxiliary_loss_clip": 0.01100502, "auxiliary_loss_mlp": 0.0103257, "balance_loss_clip": 1.02080393, "balance_loss_mlp": 1.03413844, "epoch": 0.7978355629039531, "flos": 22382223575040.0, "grad_norm": 1.7373516000587172, "language_loss": 0.82177913, "learning_rate": 4.135205575764922e-07, "loss": 0.84310985, "num_input_tokens_seen": 286286775, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 13270, "time_per_iteration": 2.451359987258911 }, { "auxiliary_loss_clip": 0.01105332, "auxiliary_loss_mlp": 0.01033609, "balance_loss_clip": 1.02142549, "balance_loss_mlp": 1.03691244, "epoch": 0.7978956861566211, "flos": 20266331068800.0, "grad_norm": 1.769758542138606, "language_loss": 0.59333903, "learning_rate": 4.1328344075101905e-07, "loss": 0.61472845, "num_input_tokens_seen": 286305590, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 13271, "time_per_iteration": 2.4521894454956055 }, { "auxiliary_loss_clip": 0.01108712, "auxiliary_loss_mlp": 0.01033538, "balance_loss_clip": 1.02106237, "balance_loss_mlp": 1.03712511, "epoch": 0.797955809409289, "flos": 28112417366400.0, "grad_norm": 1.48055117101435, "language_loss": 0.7320618, "learning_rate": 4.130463840939975e-07, "loss": 0.75348431, "num_input_tokens_seen": 286328050, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.71484375, "step": 13272, "time_per_iteration": 2.523158073425293 }, { "auxiliary_loss_clip": 0.01105093, "auxiliary_loss_mlp": 0.01027134, "balance_loss_clip": 1.01469433, "balance_loss_mlp": 1.03690362, "epoch": 0.798015932661957, "flos": 15559591495680.0, "grad_norm": 1.8600330629096677, "language_loss": 0.71688569, "learning_rate": 4.128093876144161e-07, "loss": 0.73820794, "num_input_tokens_seen": 286345265, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6796875, "step": 13273, "time_per_iteration": 2.431699752807617 }, { "auxiliary_loss_clip": 0.01108765, "auxiliary_loss_mlp": 0.01034104, "balance_loss_clip": 1.02125287, "balance_loss_mlp": 1.03703189, "epoch": 0.7980760559146249, "flos": 23951196622080.0, "grad_norm": 1.8842950146016981, "language_loss": 0.75518417, "learning_rate": 4.1257245132126117e-07, "loss": 0.77661282, "num_input_tokens_seen": 286364465, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 13274, "time_per_iteration": 2.477172613143921 }, { "auxiliary_loss_clip": 0.01098289, "auxiliary_loss_mlp": 0.01028676, "balance_loss_clip": 1.01728535, "balance_loss_mlp": 1.03398561, "epoch": 0.798136179167293, "flos": 28038082170240.0, "grad_norm": 3.344545504150177, "language_loss": 0.78078341, "learning_rate": 4.12335575223518e-07, "loss": 0.80205309, "num_input_tokens_seen": 286385565, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.64453125, "step": 13275, "time_per_iteration": 2.5097570419311523 }, { "auxiliary_loss_clip": 0.01107186, "auxiliary_loss_mlp": 0.01036792, "balance_loss_clip": 1.02369666, "balance_loss_mlp": 1.03538013, "epoch": 0.7981963024199609, "flos": 35984538046080.0, "grad_norm": 2.6473819260698357, "language_loss": 0.64086944, "learning_rate": 4.1209875933016877e-07, "loss": 0.66230917, "num_input_tokens_seen": 286403950, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 13276, "time_per_iteration": 2.5391108989715576 }, { "auxiliary_loss_clip": 0.01102361, "auxiliary_loss_mlp": 0.010285, "balance_loss_clip": 1.01698422, "balance_loss_mlp": 1.03541458, "epoch": 0.7982564256726289, "flos": 25884914325120.0, "grad_norm": 1.746599841378362, "language_loss": 0.60937977, "learning_rate": 4.118620036501945e-07, "loss": 0.63068831, "num_input_tokens_seen": 286426160, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 13277, "time_per_iteration": 2.496051549911499 }, { "auxiliary_loss_clip": 0.01109287, "auxiliary_loss_mlp": 0.01029764, "balance_loss_clip": 1.01752758, "balance_loss_mlp": 1.03807867, "epoch": 0.7983165489252969, "flos": 25739152934400.0, "grad_norm": 2.045617285355093, "language_loss": 0.79365289, "learning_rate": 4.1162530819257227e-07, "loss": 0.81504345, "num_input_tokens_seen": 286446610, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 13278, "time_per_iteration": 2.5103566646575928 }, { "auxiliary_loss_clip": 0.01107068, "auxiliary_loss_mlp": 0.01035706, "balance_loss_clip": 1.02237296, "balance_loss_mlp": 1.0357672, "epoch": 0.7983766721779648, "flos": 21908202768000.0, "grad_norm": 1.8969875249733945, "language_loss": 0.63193011, "learning_rate": 4.113886729662768e-07, "loss": 0.65335786, "num_input_tokens_seen": 286465460, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 13279, "time_per_iteration": 2.4337990283966064 }, { "auxiliary_loss_clip": 0.01099678, "auxiliary_loss_mlp": 0.01025903, "balance_loss_clip": 1.01444101, "balance_loss_mlp": 1.03459525, "epoch": 0.7984367954306328, "flos": 29347420734720.0, "grad_norm": 2.117798770750621, "language_loss": 0.70803308, "learning_rate": 4.111520979802825e-07, "loss": 0.72928882, "num_input_tokens_seen": 286485720, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.65234375, "step": 13280, "time_per_iteration": 2.510469675064087 }, { "auxiliary_loss_clip": 0.01108485, "auxiliary_loss_mlp": 0.01031736, "balance_loss_clip": 1.01832461, "balance_loss_mlp": 1.03757262, "epoch": 0.7984969186833007, "flos": 31357772104320.0, "grad_norm": 2.971014161355844, "language_loss": 0.63162315, "learning_rate": 4.1091558324355955e-07, "loss": 0.65302539, "num_input_tokens_seen": 286507465, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 13281, "time_per_iteration": 2.527292490005493 }, { "auxiliary_loss_clip": 0.01109238, "auxiliary_loss_mlp": 0.01029286, "balance_loss_clip": 1.01702499, "balance_loss_mlp": 1.03650641, "epoch": 0.7985570419359688, "flos": 24312924535680.0, "grad_norm": 2.7654857590219137, "language_loss": 0.80414653, "learning_rate": 4.1067912876507683e-07, "loss": 0.82553172, "num_input_tokens_seen": 286526345, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7265625, "step": 13282, "time_per_iteration": 2.4816367626190186 }, { "auxiliary_loss_clip": 0.01106025, "auxiliary_loss_mlp": 0.01031489, "balance_loss_clip": 1.01812553, "balance_loss_mlp": 1.03501332, "epoch": 0.7986171651886367, "flos": 15742233175680.0, "grad_norm": 2.10061720634633, "language_loss": 0.71498573, "learning_rate": 4.10442734553802e-07, "loss": 0.73636091, "num_input_tokens_seen": 286544095, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 13283, "time_per_iteration": 2.4142305850982666 }, { "auxiliary_loss_clip": 0.01101787, "auxiliary_loss_mlp": 0.01024486, "balance_loss_clip": 1.01316166, "balance_loss_mlp": 1.03280604, "epoch": 0.7986772884413047, "flos": 11619401091840.0, "grad_norm": 2.446555968987006, "language_loss": 0.73358452, "learning_rate": 4.102064006186967e-07, "loss": 0.75484729, "num_input_tokens_seen": 286560960, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.69140625, "step": 13284, "time_per_iteration": 2.431309938430786 }, { "auxiliary_loss_clip": 0.01104911, "auxiliary_loss_mlp": 0.01032241, "balance_loss_clip": 1.02120805, "balance_loss_mlp": 1.03700948, "epoch": 0.7987374116939726, "flos": 22091059929600.0, "grad_norm": 1.593504887466339, "language_loss": 0.70335245, "learning_rate": 4.0997012696872415e-07, "loss": 0.72472394, "num_input_tokens_seen": 286579865, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6796875, "step": 13285, "time_per_iteration": 2.45925235748291 }, { "auxiliary_loss_clip": 0.01103976, "auxiliary_loss_mlp": 0.01029374, "balance_loss_clip": 1.01757812, "balance_loss_mlp": 1.03543007, "epoch": 0.7987975349466406, "flos": 17890696339200.0, "grad_norm": 1.80175536552908, "language_loss": 0.73647976, "learning_rate": 4.097339136128437e-07, "loss": 0.75781327, "num_input_tokens_seen": 286597295, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 13286, "time_per_iteration": 2.4563660621643066 }, { "auxiliary_loss_clip": 0.01105674, "auxiliary_loss_mlp": 0.01030114, "balance_loss_clip": 1.0178895, "balance_loss_mlp": 1.03639257, "epoch": 0.7988576581993085, "flos": 19719232041600.0, "grad_norm": 1.8205802824655182, "language_loss": 0.75153744, "learning_rate": 4.0949776056001296e-07, "loss": 0.77289534, "num_input_tokens_seen": 286616270, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 13287, "time_per_iteration": 2.4417295455932617 }, { "auxiliary_loss_clip": 0.01104114, "auxiliary_loss_mlp": 0.01028222, "balance_loss_clip": 1.015872, "balance_loss_mlp": 1.03667259, "epoch": 0.7989177814519766, "flos": 28036358317440.0, "grad_norm": 3.031433414905548, "language_loss": 0.61980796, "learning_rate": 4.092616678191863e-07, "loss": 0.64113128, "num_input_tokens_seen": 286638315, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.67578125, "step": 13288, "time_per_iteration": 2.548694372177124 }, { "auxiliary_loss_clip": 0.01105118, "auxiliary_loss_mlp": 0.01030457, "balance_loss_clip": 1.01882792, "balance_loss_mlp": 1.03787971, "epoch": 0.7989779047046445, "flos": 28871029630080.0, "grad_norm": 2.422943089875367, "language_loss": 0.70063269, "learning_rate": 4.090256353993169e-07, "loss": 0.72198844, "num_input_tokens_seen": 286658630, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 13289, "time_per_iteration": 2.522427558898926 }, { "auxiliary_loss_clip": 0.01103285, "auxiliary_loss_mlp": 0.01031424, "balance_loss_clip": 1.01876414, "balance_loss_mlp": 1.03642488, "epoch": 0.7990380279573125, "flos": 18186887888640.0, "grad_norm": 2.043980093782131, "language_loss": 0.62111664, "learning_rate": 4.0878966330935506e-07, "loss": 0.64246374, "num_input_tokens_seen": 286676870, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.66796875, "step": 13290, "time_per_iteration": 2.4443087577819824 }, { "auxiliary_loss_clip": 0.01108089, "auxiliary_loss_mlp": 0.01033437, "balance_loss_clip": 1.02048469, "balance_loss_mlp": 1.03804708, "epoch": 0.7990981512099805, "flos": 20879936127360.0, "grad_norm": 2.09983012179625, "language_loss": 0.71717751, "learning_rate": 4.08553751558248e-07, "loss": 0.73859274, "num_input_tokens_seen": 286694300, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 13291, "time_per_iteration": 2.4798741340637207 }, { "auxiliary_loss_clip": 0.01103057, "auxiliary_loss_mlp": 0.01028988, "balance_loss_clip": 1.0174129, "balance_loss_mlp": 1.03543115, "epoch": 0.7991582744626484, "flos": 26099911180800.0, "grad_norm": 1.6608611989469269, "language_loss": 0.63521588, "learning_rate": 4.083179001549422e-07, "loss": 0.65653634, "num_input_tokens_seen": 286714545, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.67578125, "step": 13292, "time_per_iteration": 2.49776291847229 }, { "auxiliary_loss_clip": 0.01104367, "auxiliary_loss_mlp": 0.01031957, "balance_loss_clip": 1.02007222, "balance_loss_mlp": 1.03626275, "epoch": 0.7992183977153164, "flos": 35295843605760.0, "grad_norm": 1.6999271676933427, "language_loss": 0.56336945, "learning_rate": 4.0808210910838105e-07, "loss": 0.58473277, "num_input_tokens_seen": 286734525, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6796875, "step": 13293, "time_per_iteration": 2.569596529006958 }, { "auxiliary_loss_clip": 0.01105994, "auxiliary_loss_mlp": 0.01033951, "balance_loss_clip": 1.02161908, "balance_loss_mlp": 1.03700757, "epoch": 0.7992785209679844, "flos": 51853426577280.0, "grad_norm": 3.316668846636199, "language_loss": 0.71351731, "learning_rate": 4.0784637842750704e-07, "loss": 0.73491675, "num_input_tokens_seen": 286753430, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 13294, "time_per_iteration": 4.243142366409302 }, { "auxiliary_loss_clip": 0.01105855, "auxiliary_loss_mlp": 0.01030742, "balance_loss_clip": 1.01873207, "balance_loss_mlp": 1.03708506, "epoch": 0.7993386442206524, "flos": 22565116650240.0, "grad_norm": 2.166991256819185, "language_loss": 0.72539675, "learning_rate": 4.0761070812125675e-07, "loss": 0.74676275, "num_input_tokens_seen": 286771915, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 13295, "time_per_iteration": 3.8568074703216553 }, { "auxiliary_loss_clip": 0.01102447, "auxiliary_loss_mlp": 0.01034708, "balance_loss_clip": 1.02325773, "balance_loss_mlp": 1.03588057, "epoch": 0.7993987674733203, "flos": 18800277465600.0, "grad_norm": 1.913959759928944, "language_loss": 0.76678348, "learning_rate": 4.0737509819856797e-07, "loss": 0.78815502, "num_input_tokens_seen": 286789835, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6640625, "step": 13296, "time_per_iteration": 2.4371395111083984 }, { "auxiliary_loss_clip": 0.01029616, "auxiliary_loss_mlp": 0.01004285, "balance_loss_clip": 1.0032481, "balance_loss_mlp": 1.00714469, "epoch": 0.7994588907259883, "flos": 69421720394880.0, "grad_norm": 0.6888659175090407, "language_loss": 0.60807621, "learning_rate": 4.0713954866837573e-07, "loss": 0.62841523, "num_input_tokens_seen": 286855580, "router_z_loss_clip": 0.01037598, "router_z_loss_mlp": 0.22460938, "step": 13297, "time_per_iteration": 4.586021184921265 }, { "auxiliary_loss_clip": 0.01103656, "auxiliary_loss_mlp": 0.01031787, "balance_loss_clip": 1.01975286, "balance_loss_mlp": 1.03523016, "epoch": 0.7995190139786562, "flos": 13480327883520.0, "grad_norm": 2.4438089098757327, "language_loss": 0.7076003, "learning_rate": 4.0690405953961073e-07, "loss": 0.72895467, "num_input_tokens_seen": 286874360, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 13298, "time_per_iteration": 2.442315101623535 }, { "auxiliary_loss_clip": 0.01110142, "auxiliary_loss_mlp": 0.01033978, "balance_loss_clip": 1.02029896, "balance_loss_mlp": 1.03751099, "epoch": 0.7995791372313242, "flos": 21652842003840.0, "grad_norm": 2.299377308374009, "language_loss": 0.75353515, "learning_rate": 4.066686308212037e-07, "loss": 0.77497631, "num_input_tokens_seen": 286891950, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 13299, "time_per_iteration": 2.4576351642608643 }, { "auxiliary_loss_clip": 0.01100718, "auxiliary_loss_mlp": 0.01030324, "balance_loss_clip": 1.01895761, "balance_loss_mlp": 1.03456748, "epoch": 0.7996392604839921, "flos": 26068130622720.0, "grad_norm": 1.775786024297132, "language_loss": 0.7730422, "learning_rate": 4.064332625220828e-07, "loss": 0.79435265, "num_input_tokens_seen": 286911725, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6640625, "step": 13300, "time_per_iteration": 3.9473133087158203 }, { "auxiliary_loss_clip": 0.01106144, "auxiliary_loss_mlp": 0.0102719, "balance_loss_clip": 1.01458335, "balance_loss_mlp": 1.03541684, "epoch": 0.7996993837366602, "flos": 24606889441920.0, "grad_norm": 1.899243194919052, "language_loss": 0.63838315, "learning_rate": 4.0619795465117115e-07, "loss": 0.65971649, "num_input_tokens_seen": 286931400, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 13301, "time_per_iteration": 2.484062433242798 }, { "auxiliary_loss_clip": 0.01101916, "auxiliary_loss_mlp": 0.01034421, "balance_loss_clip": 1.02214813, "balance_loss_mlp": 1.03593493, "epoch": 0.7997595069893281, "flos": 20992049452800.0, "grad_norm": 1.7682505856326625, "language_loss": 0.71980947, "learning_rate": 4.059627072173928e-07, "loss": 0.74117285, "num_input_tokens_seen": 286949795, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.66015625, "step": 13302, "time_per_iteration": 2.447108268737793 }, { "auxiliary_loss_clip": 0.01107161, "auxiliary_loss_mlp": 0.01030133, "balance_loss_clip": 1.01734817, "balance_loss_mlp": 1.03620338, "epoch": 0.7998196302419961, "flos": 24426510318720.0, "grad_norm": 1.9632603629635794, "language_loss": 0.8362062, "learning_rate": 4.057275202296684e-07, "loss": 0.85757911, "num_input_tokens_seen": 286968805, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 13303, "time_per_iteration": 2.4901182651519775 }, { "auxiliary_loss_clip": 0.01102325, "auxiliary_loss_mlp": 0.01030268, "balance_loss_clip": 1.01916981, "balance_loss_mlp": 1.0357703, "epoch": 0.7998797534946641, "flos": 30264651457920.0, "grad_norm": 1.9956250830162567, "language_loss": 0.5902276, "learning_rate": 4.054923936969166e-07, "loss": 0.61155355, "num_input_tokens_seen": 286990235, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.6640625, "step": 13304, "time_per_iteration": 2.5389041900634766 }, { "auxiliary_loss_clip": 0.01105281, "auxiliary_loss_mlp": 0.0102965, "balance_loss_clip": 1.01703238, "balance_loss_mlp": 1.03407311, "epoch": 0.799939876747332, "flos": 23513984277120.0, "grad_norm": 1.5911157816680845, "language_loss": 0.69464284, "learning_rate": 4.0525732762805265e-07, "loss": 0.71599221, "num_input_tokens_seen": 287011060, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 13305, "time_per_iteration": 2.4865942001342773 }, { "auxiliary_loss_clip": 0.0110226, "auxiliary_loss_mlp": 0.01028168, "balance_loss_clip": 1.01711178, "balance_loss_mlp": 1.03546607, "epoch": 0.8, "flos": 19318109886720.0, "grad_norm": 2.2302081429988196, "language_loss": 0.69109499, "learning_rate": 4.0502232203199107e-07, "loss": 0.7123993, "num_input_tokens_seen": 287029215, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.66796875, "step": 13306, "time_per_iteration": 2.4281396865844727 }, { "auxiliary_loss_clip": 0.0110707, "auxiliary_loss_mlp": 0.01034518, "balance_loss_clip": 1.02234066, "balance_loss_mlp": 1.03807342, "epoch": 0.800060123252668, "flos": 32412432263040.0, "grad_norm": 1.7325326868656574, "language_loss": 0.69498742, "learning_rate": 4.0478737691764286e-07, "loss": 0.71640325, "num_input_tokens_seen": 287050855, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 13307, "time_per_iteration": 2.570976734161377 }, { "auxiliary_loss_clip": 0.01105067, "auxiliary_loss_mlp": 0.0103068, "balance_loss_clip": 1.01889014, "balance_loss_mlp": 1.03594697, "epoch": 0.800120246505336, "flos": 20010611168640.0, "grad_norm": 3.5854831249650716, "language_loss": 0.76606733, "learning_rate": 4.0455249229391677e-07, "loss": 0.7874248, "num_input_tokens_seen": 287069915, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.69140625, "step": 13308, "time_per_iteration": 2.451770782470703 }, { "auxiliary_loss_clip": 0.01109436, "auxiliary_loss_mlp": 0.01031807, "balance_loss_clip": 1.01788378, "balance_loss_mlp": 1.03647077, "epoch": 0.8001803697580039, "flos": 31868278151040.0, "grad_norm": 1.4496975486936727, "language_loss": 0.78762054, "learning_rate": 4.0431766816972e-07, "loss": 0.80903298, "num_input_tokens_seen": 287091450, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.73046875, "step": 13309, "time_per_iteration": 2.538909912109375 }, { "auxiliary_loss_clip": 0.01029406, "auxiliary_loss_mlp": 0.01001667, "balance_loss_clip": 1.00070179, "balance_loss_mlp": 1.00692844, "epoch": 0.8002404930106719, "flos": 63392066916480.0, "grad_norm": 0.9879144207482666, "language_loss": 0.64695305, "learning_rate": 4.040829045539571e-07, "loss": 0.66726381, "num_input_tokens_seen": 287148365, "router_z_loss_clip": 0.00964355, "router_z_loss_mlp": 0.22460938, "step": 13310, "time_per_iteration": 3.022045850753784 }, { "auxiliary_loss_clip": 0.01104273, "auxiliary_loss_mlp": 0.01032492, "balance_loss_clip": 1.02069592, "balance_loss_mlp": 1.0360862, "epoch": 0.8003006162633398, "flos": 27855476403840.0, "grad_norm": 1.9415913243965137, "language_loss": 0.82900333, "learning_rate": 4.0384820145553156e-07, "loss": 0.850371, "num_input_tokens_seen": 287168280, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6796875, "step": 13311, "time_per_iteration": 2.5026566982269287 }, { "auxiliary_loss_clip": 0.01104449, "auxiliary_loss_mlp": 0.01032128, "balance_loss_clip": 1.01944995, "balance_loss_mlp": 1.03584993, "epoch": 0.8003607395160078, "flos": 18223337214720.0, "grad_norm": 2.3580859308526962, "language_loss": 0.65909392, "learning_rate": 4.0361355888334116e-07, "loss": 0.68045962, "num_input_tokens_seen": 287185980, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 13312, "time_per_iteration": 2.432856798171997 }, { "auxiliary_loss_clip": 0.01109966, "auxiliary_loss_mlp": 0.01034681, "balance_loss_clip": 1.0216279, "balance_loss_mlp": 1.03847003, "epoch": 0.8004208627686757, "flos": 20886975192960.0, "grad_norm": 1.7076925137459327, "language_loss": 0.7508027, "learning_rate": 4.033789768462843e-07, "loss": 0.77224922, "num_input_tokens_seen": 287203875, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 13313, "time_per_iteration": 2.5299839973449707 }, { "auxiliary_loss_clip": 0.01104202, "auxiliary_loss_mlp": 0.01029186, "balance_loss_clip": 1.01700902, "balance_loss_mlp": 1.03495955, "epoch": 0.8004809860213438, "flos": 26436143416320.0, "grad_norm": 1.416449380776648, "language_loss": 0.75841677, "learning_rate": 4.031444553532575e-07, "loss": 0.7797507, "num_input_tokens_seen": 287226445, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 13314, "time_per_iteration": 2.578986644744873 }, { "auxiliary_loss_clip": 0.01029295, "auxiliary_loss_mlp": 0.01001495, "balance_loss_clip": 1.00054741, "balance_loss_mlp": 1.00685072, "epoch": 0.8005411092740117, "flos": 63648612829440.0, "grad_norm": 0.8236429204814939, "language_loss": 0.53782982, "learning_rate": 4.029099944131522e-07, "loss": 0.55813771, "num_input_tokens_seen": 287286240, "router_z_loss_clip": 0.00946045, "router_z_loss_mlp": 0.22460938, "step": 13315, "time_per_iteration": 3.0264270305633545 }, { "auxiliary_loss_clip": 0.0110517, "auxiliary_loss_mlp": 0.01033718, "balance_loss_clip": 1.02121282, "balance_loss_mlp": 1.03643119, "epoch": 0.8006012325266797, "flos": 36138056774400.0, "grad_norm": 1.7544119521667387, "language_loss": 0.71235633, "learning_rate": 4.026755940348603e-07, "loss": 0.73374522, "num_input_tokens_seen": 287310265, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 13316, "time_per_iteration": 2.6035242080688477 }, { "auxiliary_loss_clip": 0.0110995, "auxiliary_loss_mlp": 0.0103383, "balance_loss_clip": 1.02134871, "balance_loss_mlp": 1.0375129, "epoch": 0.8006613557793477, "flos": 33838947970560.0, "grad_norm": 1.9526254777933902, "language_loss": 0.64867783, "learning_rate": 4.024412542272706e-07, "loss": 0.67011565, "num_input_tokens_seen": 287331610, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 13317, "time_per_iteration": 2.581631898880005 }, { "auxiliary_loss_clip": 0.01029336, "auxiliary_loss_mlp": 0.01000889, "balance_loss_clip": 0.99989414, "balance_loss_mlp": 1.00684309, "epoch": 0.8007214790320156, "flos": 67348310699520.0, "grad_norm": 0.7624510403104905, "language_loss": 0.58958977, "learning_rate": 4.0220697499926783e-07, "loss": 0.60989201, "num_input_tokens_seen": 287394795, "router_z_loss_clip": 0.00994873, "router_z_loss_mlp": 0.22460938, "step": 13318, "time_per_iteration": 3.176081418991089 }, { "auxiliary_loss_clip": 0.01102294, "auxiliary_loss_mlp": 0.01024952, "balance_loss_clip": 1.01244068, "balance_loss_mlp": 1.03434825, "epoch": 0.8007816022846836, "flos": 23185653033600.0, "grad_norm": 1.7102274884389281, "language_loss": 0.66427159, "learning_rate": 4.019727563597366e-07, "loss": 0.68554401, "num_input_tokens_seen": 287414595, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 13319, "time_per_iteration": 2.4862964153289795 }, { "auxiliary_loss_clip": 0.01107381, "auxiliary_loss_mlp": 0.01038544, "balance_loss_clip": 1.0254246, "balance_loss_mlp": 1.0361793, "epoch": 0.8008417255373516, "flos": 21981388728960.0, "grad_norm": 4.549189454645845, "language_loss": 0.74221301, "learning_rate": 4.0173859831755873e-07, "loss": 0.76367235, "num_input_tokens_seen": 287434395, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 13320, "time_per_iteration": 2.5070812702178955 }, { "auxiliary_loss_clip": 0.0110743, "auxiliary_loss_mlp": 0.01027638, "balance_loss_clip": 1.01488864, "balance_loss_mlp": 1.03684044, "epoch": 0.8009018487900196, "flos": 16727334647040.0, "grad_norm": 2.220572203338567, "language_loss": 0.80414808, "learning_rate": 4.015045008816138e-07, "loss": 0.8254987, "num_input_tokens_seen": 287450590, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 13321, "time_per_iteration": 2.4560787677764893 }, { "auxiliary_loss_clip": 0.01099249, "auxiliary_loss_mlp": 0.01031912, "balance_loss_clip": 1.02022362, "balance_loss_mlp": 1.03256893, "epoch": 0.8009619720426875, "flos": 20813609664000.0, "grad_norm": 1.793277128150247, "language_loss": 0.65986627, "learning_rate": 4.0127046406077825e-07, "loss": 0.68117785, "num_input_tokens_seen": 287468455, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.66796875, "step": 13322, "time_per_iteration": 2.449986696243286 }, { "auxiliary_loss_clip": 0.01103687, "auxiliary_loss_mlp": 0.01029252, "balance_loss_clip": 1.01697946, "balance_loss_mlp": 1.03478742, "epoch": 0.8010220952953555, "flos": 17931096161280.0, "grad_norm": 2.0591293595185727, "language_loss": 0.77861851, "learning_rate": 4.010364878639265e-07, "loss": 0.79994792, "num_input_tokens_seen": 287486485, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 13323, "time_per_iteration": 2.432574510574341 }, { "auxiliary_loss_clip": 0.01107756, "auxiliary_loss_mlp": 0.01031793, "balance_loss_clip": 1.0187571, "balance_loss_mlp": 1.03643894, "epoch": 0.8010822185480234, "flos": 24572235795840.0, "grad_norm": 2.4260064998683593, "language_loss": 0.7117852, "learning_rate": 4.00802572299932e-07, "loss": 0.7331807, "num_input_tokens_seen": 287503940, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 13324, "time_per_iteration": 2.4828102588653564 }, { "auxiliary_loss_clip": 0.01105825, "auxiliary_loss_mlp": 0.01034335, "balance_loss_clip": 1.02172244, "balance_loss_mlp": 1.03430367, "epoch": 0.8011423418006914, "flos": 21829988903040.0, "grad_norm": 1.6961508165443349, "language_loss": 0.76420295, "learning_rate": 4.005687173776635e-07, "loss": 0.7856046, "num_input_tokens_seen": 287521660, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 13325, "time_per_iteration": 2.4683849811553955 }, { "auxiliary_loss_clip": 0.01097424, "auxiliary_loss_mlp": 0.01025986, "balance_loss_clip": 1.01510191, "balance_loss_mlp": 1.03265154, "epoch": 0.8012024650533593, "flos": 23915178259200.0, "grad_norm": 1.6164050884725312, "language_loss": 0.79684699, "learning_rate": 4.003349231059898e-07, "loss": 0.81808114, "num_input_tokens_seen": 287541505, "router_z_loss_clip": 0.10888672, "router_z_loss_mlp": 0.6484375, "step": 13326, "time_per_iteration": 2.475494861602783 }, { "auxiliary_loss_clip": 0.01101999, "auxiliary_loss_mlp": 0.01032765, "balance_loss_clip": 1.02131474, "balance_loss_mlp": 1.03549469, "epoch": 0.8012625883060274, "flos": 23587062497280.0, "grad_norm": 2.386362807501643, "language_loss": 0.6621592, "learning_rate": 4.001011894937765e-07, "loss": 0.68350685, "num_input_tokens_seen": 287560015, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6640625, "step": 13327, "time_per_iteration": 2.484137773513794 }, { "auxiliary_loss_clip": 0.01101693, "auxiliary_loss_mlp": 0.01029171, "balance_loss_clip": 1.01773882, "balance_loss_mlp": 1.03575265, "epoch": 0.8013227115586953, "flos": 20813932886400.0, "grad_norm": 2.1540241152316693, "language_loss": 0.7372092, "learning_rate": 3.9986751654988636e-07, "loss": 0.75851786, "num_input_tokens_seen": 287579150, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66015625, "step": 13328, "time_per_iteration": 2.4420106410980225 }, { "auxiliary_loss_clip": 0.01106595, "auxiliary_loss_mlp": 0.01032383, "balance_loss_clip": 1.01885283, "balance_loss_mlp": 1.03575027, "epoch": 0.8013828348113633, "flos": 15888317788800.0, "grad_norm": 3.2065854295919336, "language_loss": 0.7381016, "learning_rate": 3.996339042831798e-07, "loss": 0.75949144, "num_input_tokens_seen": 287597420, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 13329, "time_per_iteration": 2.4395577907562256 }, { "auxiliary_loss_clip": 0.01029272, "auxiliary_loss_mlp": 0.01002736, "balance_loss_clip": 1.00172269, "balance_loss_mlp": 1.00680327, "epoch": 0.8014429580640313, "flos": 71062981562880.0, "grad_norm": 0.701576215428519, "language_loss": 0.52963758, "learning_rate": 3.9940035270251605e-07, "loss": 0.54995763, "num_input_tokens_seen": 287667280, "router_z_loss_clip": 0.01013184, "router_z_loss_mlp": 0.22460938, "step": 13330, "time_per_iteration": 3.179321527481079 }, { "auxiliary_loss_clip": 0.01107906, "auxiliary_loss_mlp": 0.01036145, "balance_loss_clip": 1.02228129, "balance_loss_mlp": 1.03599, "epoch": 0.8015030813166992, "flos": 23076340968960.0, "grad_norm": 3.5099906518204507, "language_loss": 0.72900462, "learning_rate": 3.991668618167519e-07, "loss": 0.75044513, "num_input_tokens_seen": 287687375, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71875, "step": 13331, "time_per_iteration": 2.4899420738220215 }, { "auxiliary_loss_clip": 0.01103337, "auxiliary_loss_mlp": 0.01028415, "balance_loss_clip": 1.01692295, "balance_loss_mlp": 1.03472781, "epoch": 0.8015632045693672, "flos": 21872328059520.0, "grad_norm": 2.1173474920908912, "language_loss": 0.77609277, "learning_rate": 3.989334316347401e-07, "loss": 0.79741025, "num_input_tokens_seen": 287707895, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6875, "step": 13332, "time_per_iteration": 2.5127642154693604 }, { "auxiliary_loss_clip": 0.01106546, "auxiliary_loss_mlp": 0.01029771, "balance_loss_clip": 1.01728988, "balance_loss_mlp": 1.0372293, "epoch": 0.8016233278220352, "flos": 23656728925440.0, "grad_norm": 2.3046838164078634, "language_loss": 0.83215964, "learning_rate": 3.987000621653338e-07, "loss": 0.85352278, "num_input_tokens_seen": 287723990, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 13333, "time_per_iteration": 2.469849109649658 }, { "auxiliary_loss_clip": 0.01103981, "auxiliary_loss_mlp": 0.0102851, "balance_loss_clip": 1.01601648, "balance_loss_mlp": 1.03401482, "epoch": 0.8016834510747032, "flos": 16253170185600.0, "grad_norm": 1.5824744330093565, "language_loss": 0.73593795, "learning_rate": 3.9846675341738133e-07, "loss": 0.75726283, "num_input_tokens_seen": 287742380, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 13334, "time_per_iteration": 2.4363489151000977 }, { "auxiliary_loss_clip": 0.01103467, "auxiliary_loss_mlp": 0.0102845, "balance_loss_clip": 1.01630902, "balance_loss_mlp": 1.03679919, "epoch": 0.8017435743273711, "flos": 12276027665280.0, "grad_norm": 1.8940270619592037, "language_loss": 0.74480158, "learning_rate": 3.9823350539972967e-07, "loss": 0.76612079, "num_input_tokens_seen": 287760130, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6640625, "step": 13335, "time_per_iteration": 3.991332530975342 }, { "auxiliary_loss_clip": 0.01101989, "auxiliary_loss_mlp": 0.01027763, "balance_loss_clip": 1.0146203, "balance_loss_mlp": 1.03316724, "epoch": 0.8018036975800391, "flos": 17196112068480.0, "grad_norm": 2.0675444202308415, "language_loss": 0.75741935, "learning_rate": 3.9800031812122416e-07, "loss": 0.77871686, "num_input_tokens_seen": 287777565, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 13336, "time_per_iteration": 3.9001657962799072 }, { "auxiliary_loss_clip": 0.01111629, "auxiliary_loss_mlp": 0.01033409, "balance_loss_clip": 1.01997447, "balance_loss_mlp": 1.03837264, "epoch": 0.801863820832707, "flos": 20631865824000.0, "grad_norm": 2.767574353336712, "language_loss": 0.74839973, "learning_rate": 3.977671915907068e-07, "loss": 0.76985008, "num_input_tokens_seen": 287796310, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 13337, "time_per_iteration": 2.4638051986694336 }, { "auxiliary_loss_clip": 0.01107999, "auxiliary_loss_mlp": 0.01034011, "balance_loss_clip": 1.02099347, "balance_loss_mlp": 1.03684855, "epoch": 0.801923944085375, "flos": 30445569285120.0, "grad_norm": 1.8999285121354788, "language_loss": 0.79930294, "learning_rate": 3.9753412581701883e-07, "loss": 0.82072306, "num_input_tokens_seen": 287817330, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 13338, "time_per_iteration": 2.561711311340332 }, { "auxiliary_loss_clip": 0.01105663, "auxiliary_loss_mlp": 0.01028912, "balance_loss_clip": 1.01600766, "balance_loss_mlp": 1.03461885, "epoch": 0.801984067338043, "flos": 20010575255040.0, "grad_norm": 2.654681626751789, "language_loss": 0.74554664, "learning_rate": 3.9730112080899733e-07, "loss": 0.76689243, "num_input_tokens_seen": 287835095, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 13339, "time_per_iteration": 3.8460068702697754 }, { "auxiliary_loss_clip": 0.0110126, "auxiliary_loss_mlp": 0.01028321, "balance_loss_clip": 1.01690125, "balance_loss_mlp": 1.03495789, "epoch": 0.802044190590711, "flos": 22784028088320.0, "grad_norm": 2.3190523944919508, "language_loss": 0.78664041, "learning_rate": 3.970681765754775e-07, "loss": 0.80793619, "num_input_tokens_seen": 287854595, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6640625, "step": 13340, "time_per_iteration": 2.475390672683716 }, { "auxiliary_loss_clip": 0.01104176, "auxiliary_loss_mlp": 0.01030169, "balance_loss_clip": 1.01832581, "balance_loss_mlp": 1.03466451, "epoch": 0.8021043138433789, "flos": 27600115639680.0, "grad_norm": 2.32467565500403, "language_loss": 0.68017864, "learning_rate": 3.968352931252936e-07, "loss": 0.70152205, "num_input_tokens_seen": 287876960, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 13341, "time_per_iteration": 3.9713425636291504 }, { "auxiliary_loss_clip": 0.01028542, "auxiliary_loss_mlp": 0.01000228, "balance_loss_clip": 0.99920863, "balance_loss_mlp": 1.00613809, "epoch": 0.8021644370960469, "flos": 62063730057600.0, "grad_norm": 0.8107781041626634, "language_loss": 0.61604989, "learning_rate": 3.9660247046727547e-07, "loss": 0.63633752, "num_input_tokens_seen": 287936530, "router_z_loss_clip": 0.01019287, "router_z_loss_mlp": 0.22460938, "step": 13342, "time_per_iteration": 3.039100170135498 }, { "auxiliary_loss_clip": 0.0110786, "auxiliary_loss_mlp": 0.01035654, "balance_loss_clip": 1.02237391, "balance_loss_mlp": 1.03807247, "epoch": 0.8022245603487148, "flos": 23361794352000.0, "grad_norm": 1.836886105666957, "language_loss": 0.63668084, "learning_rate": 3.963697086102522e-07, "loss": 0.65811598, "num_input_tokens_seen": 287954285, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69921875, "step": 13343, "time_per_iteration": 2.445613145828247 }, { "auxiliary_loss_clip": 0.0110114, "auxiliary_loss_mlp": 0.01026596, "balance_loss_clip": 1.01510453, "balance_loss_mlp": 1.03490853, "epoch": 0.8022846836013828, "flos": 10853354712960.0, "grad_norm": 2.0172012185161923, "language_loss": 0.68511295, "learning_rate": 3.96137007563051e-07, "loss": 0.70639032, "num_input_tokens_seen": 287971595, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 13344, "time_per_iteration": 2.4349260330200195 }, { "auxiliary_loss_clip": 0.01106055, "auxiliary_loss_mlp": 0.01028639, "balance_loss_clip": 1.01588976, "balance_loss_mlp": 1.03711534, "epoch": 0.8023448068540509, "flos": 29240443054080.0, "grad_norm": 1.5996572342640756, "language_loss": 0.69937307, "learning_rate": 3.9590436733449506e-07, "loss": 0.72072005, "num_input_tokens_seen": 287992540, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 13345, "time_per_iteration": 2.535369396209717 }, { "auxiliary_loss_clip": 0.01028935, "auxiliary_loss_mlp": 0.01000385, "balance_loss_clip": 0.99940157, "balance_loss_mlp": 1.00643528, "epoch": 0.8024049301067188, "flos": 64153588181760.0, "grad_norm": 0.8684056159058915, "language_loss": 0.62939274, "learning_rate": 3.956717879334059e-07, "loss": 0.64968598, "num_input_tokens_seen": 288052810, "router_z_loss_clip": 0.00982666, "router_z_loss_mlp": 0.22460938, "step": 13346, "time_per_iteration": 3.137935161590576 }, { "auxiliary_loss_clip": 0.01104242, "auxiliary_loss_mlp": 0.01030058, "balance_loss_clip": 1.0179162, "balance_loss_mlp": 1.03699541, "epoch": 0.8024650533593868, "flos": 28585360765440.0, "grad_norm": 1.6220408116170832, "language_loss": 0.72397661, "learning_rate": 3.9543926936860327e-07, "loss": 0.7453196, "num_input_tokens_seen": 288073045, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.671875, "step": 13347, "time_per_iteration": 2.5149545669555664 }, { "auxiliary_loss_clip": 0.01105362, "auxiliary_loss_mlp": 0.01030341, "balance_loss_clip": 1.01723409, "balance_loss_mlp": 1.03488755, "epoch": 0.8025251766120547, "flos": 16982264448000.0, "grad_norm": 1.8926045932228273, "language_loss": 0.72451961, "learning_rate": 3.9520681164890493e-07, "loss": 0.74587661, "num_input_tokens_seen": 288091165, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 13348, "time_per_iteration": 2.4778871536254883 }, { "auxiliary_loss_clip": 0.01106166, "auxiliary_loss_mlp": 0.01032849, "balance_loss_clip": 1.02026093, "balance_loss_mlp": 1.03708458, "epoch": 0.8025852998647227, "flos": 22163671272960.0, "grad_norm": 2.273200263953461, "language_loss": 0.76151836, "learning_rate": 3.9497441478312444e-07, "loss": 0.7829085, "num_input_tokens_seen": 288110595, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 13349, "time_per_iteration": 2.485084295272827 }, { "auxiliary_loss_clip": 0.01106243, "auxiliary_loss_mlp": 0.01029874, "balance_loss_clip": 1.01855528, "balance_loss_mlp": 1.03789306, "epoch": 0.8026454231173906, "flos": 22017012042240.0, "grad_norm": 2.179432172710088, "language_loss": 0.83151782, "learning_rate": 3.947420787800755e-07, "loss": 0.85287905, "num_input_tokens_seen": 288128995, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6796875, "step": 13350, "time_per_iteration": 2.463564872741699 }, { "auxiliary_loss_clip": 0.01107863, "auxiliary_loss_mlp": 0.01033944, "balance_loss_clip": 1.02181447, "balance_loss_mlp": 1.03873336, "epoch": 0.8027055463700586, "flos": 22491320158080.0, "grad_norm": 1.8132077273687623, "language_loss": 0.71463358, "learning_rate": 3.945098036485679e-07, "loss": 0.73605162, "num_input_tokens_seen": 288149265, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69140625, "step": 13351, "time_per_iteration": 2.4633917808532715 }, { "auxiliary_loss_clip": 0.01105015, "auxiliary_loss_mlp": 0.01028815, "balance_loss_clip": 1.01641703, "balance_loss_mlp": 1.03778934, "epoch": 0.8027656696227266, "flos": 28912901909760.0, "grad_norm": 1.7509689672647366, "language_loss": 0.61705279, "learning_rate": 3.9427758939740885e-07, "loss": 0.63839102, "num_input_tokens_seen": 288170745, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.671875, "step": 13352, "time_per_iteration": 2.522090435028076 }, { "auxiliary_loss_clip": 0.01105829, "auxiliary_loss_mlp": 0.0103836, "balance_loss_clip": 1.02589035, "balance_loss_mlp": 1.03783953, "epoch": 0.8028257928753946, "flos": 18589374760320.0, "grad_norm": 1.7509313608091892, "language_loss": 0.76706225, "learning_rate": 3.940454360354046e-07, "loss": 0.78850412, "num_input_tokens_seen": 288189415, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6796875, "step": 13353, "time_per_iteration": 2.4398179054260254 }, { "auxiliary_loss_clip": 0.01112521, "auxiliary_loss_mlp": 0.01028642, "balance_loss_clip": 1.01471829, "balance_loss_mlp": 1.03724098, "epoch": 0.8028859161280625, "flos": 19130009339520.0, "grad_norm": 2.7149677816190834, "language_loss": 0.73141438, "learning_rate": 3.938133435713582e-07, "loss": 0.75282598, "num_input_tokens_seen": 288206900, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.75390625, "step": 13354, "time_per_iteration": 2.4380698204040527 }, { "auxiliary_loss_clip": 0.01106577, "auxiliary_loss_mlp": 0.01034326, "balance_loss_clip": 1.02221465, "balance_loss_mlp": 1.036273, "epoch": 0.8029460393807305, "flos": 20229881742720.0, "grad_norm": 1.9340854170366748, "language_loss": 0.66133136, "learning_rate": 3.935813120140714e-07, "loss": 0.68274045, "num_input_tokens_seen": 288224800, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 13355, "time_per_iteration": 2.439054489135742 }, { "auxiliary_loss_clip": 0.01108048, "auxiliary_loss_mlp": 0.01032974, "balance_loss_clip": 1.01978314, "balance_loss_mlp": 1.03596652, "epoch": 0.8030061626333984, "flos": 49783320933120.0, "grad_norm": 2.037633929029371, "language_loss": 0.68526793, "learning_rate": 3.9334934137234235e-07, "loss": 0.70667815, "num_input_tokens_seen": 288249400, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 13356, "time_per_iteration": 2.7260687351226807 }, { "auxiliary_loss_clip": 0.01105296, "auxiliary_loss_mlp": 0.01030638, "balance_loss_clip": 1.0182879, "balance_loss_mlp": 1.03647339, "epoch": 0.8030662858860664, "flos": 21615243442560.0, "grad_norm": 1.5623970705585493, "language_loss": 0.77361864, "learning_rate": 3.931174316549666e-07, "loss": 0.79497796, "num_input_tokens_seen": 288268780, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 13357, "time_per_iteration": 2.469369649887085 }, { "auxiliary_loss_clip": 0.01106236, "auxiliary_loss_mlp": 0.01029989, "balance_loss_clip": 1.01698291, "balance_loss_mlp": 1.03402972, "epoch": 0.8031264091387345, "flos": 25630056351360.0, "grad_norm": 1.5760215631751004, "language_loss": 0.77152908, "learning_rate": 3.9288558287073937e-07, "loss": 0.79289138, "num_input_tokens_seen": 288290830, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 13358, "time_per_iteration": 2.5206968784332275 }, { "auxiliary_loss_clip": 0.01102289, "auxiliary_loss_mlp": 0.01032929, "balance_loss_clip": 1.02091277, "balance_loss_mlp": 1.03438163, "epoch": 0.8031865323914024, "flos": 19646225648640.0, "grad_norm": 1.7574736076776902, "language_loss": 0.85098058, "learning_rate": 3.9265379502845143e-07, "loss": 0.87233281, "num_input_tokens_seen": 288308865, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 13359, "time_per_iteration": 2.4373295307159424 }, { "auxiliary_loss_clip": 0.01104155, "auxiliary_loss_mlp": 0.01026292, "balance_loss_clip": 1.01468718, "balance_loss_mlp": 1.03645658, "epoch": 0.8032466556440704, "flos": 26169110732160.0, "grad_norm": 2.313278282288738, "language_loss": 0.73307031, "learning_rate": 3.924220681368928e-07, "loss": 0.75437474, "num_input_tokens_seen": 288327325, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 13360, "time_per_iteration": 2.5019545555114746 }, { "auxiliary_loss_clip": 0.0110532, "auxiliary_loss_mlp": 0.01027183, "balance_loss_clip": 1.01550603, "balance_loss_mlp": 1.03539526, "epoch": 0.8033067788967383, "flos": 25520026014720.0, "grad_norm": 2.9344289184909975, "language_loss": 0.69557947, "learning_rate": 3.921904022048512e-07, "loss": 0.71690452, "num_input_tokens_seen": 288347285, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.69921875, "step": 13361, "time_per_iteration": 2.5010650157928467 }, { "auxiliary_loss_clip": 0.0110858, "auxiliary_loss_mlp": 0.01035245, "balance_loss_clip": 1.02209008, "balance_loss_mlp": 1.03650677, "epoch": 0.8033669021494063, "flos": 24024274842240.0, "grad_norm": 1.9794765547697282, "language_loss": 0.70249832, "learning_rate": 3.919587972411098e-07, "loss": 0.72393656, "num_input_tokens_seen": 288367785, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 13362, "time_per_iteration": 2.511122703552246 }, { "auxiliary_loss_clip": 0.01112883, "auxiliary_loss_mlp": 0.0103554, "balance_loss_clip": 1.02094913, "balance_loss_mlp": 1.03825235, "epoch": 0.8034270254020742, "flos": 13588059749760.0, "grad_norm": 10.401979599330813, "language_loss": 0.78809333, "learning_rate": 3.91727253254452e-07, "loss": 0.80957752, "num_input_tokens_seen": 288384135, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.74609375, "step": 13363, "time_per_iteration": 2.4270310401916504 }, { "auxiliary_loss_clip": 0.01105756, "auxiliary_loss_mlp": 0.01031143, "balance_loss_clip": 1.01794076, "balance_loss_mlp": 1.03568602, "epoch": 0.8034871486547422, "flos": 27412661537280.0, "grad_norm": 2.344801762883309, "language_loss": 0.75017583, "learning_rate": 3.9149577025365787e-07, "loss": 0.77154481, "num_input_tokens_seen": 288403805, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69921875, "step": 13364, "time_per_iteration": 2.5534486770629883 }, { "auxiliary_loss_clip": 0.01108223, "auxiliary_loss_mlp": 0.01033452, "balance_loss_clip": 1.02155495, "balance_loss_mlp": 1.03932512, "epoch": 0.8035472719074102, "flos": 32598593475840.0, "grad_norm": 3.3575218054569596, "language_loss": 0.60876966, "learning_rate": 3.9126434824750596e-07, "loss": 0.63018638, "num_input_tokens_seen": 288424895, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 13365, "time_per_iteration": 2.5503780841827393 }, { "auxiliary_loss_clip": 0.01106395, "auxiliary_loss_mlp": 0.01033936, "balance_loss_clip": 1.02123439, "balance_loss_mlp": 1.03627443, "epoch": 0.8036073951600782, "flos": 21287989607040.0, "grad_norm": 7.217133701087321, "language_loss": 0.66260147, "learning_rate": 3.910329872447706e-07, "loss": 0.68400472, "num_input_tokens_seen": 288443865, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 13366, "time_per_iteration": 2.4824202060699463 }, { "auxiliary_loss_clip": 0.0110388, "auxiliary_loss_mlp": 0.01033581, "balance_loss_clip": 1.02170157, "balance_loss_mlp": 1.03670406, "epoch": 0.8036675184127461, "flos": 18113845582080.0, "grad_norm": 2.6773809952028613, "language_loss": 0.74954462, "learning_rate": 3.908016872542259e-07, "loss": 0.77091926, "num_input_tokens_seen": 288461065, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 13367, "time_per_iteration": 2.4175455570220947 }, { "auxiliary_loss_clip": 0.01104304, "auxiliary_loss_mlp": 0.01027565, "balance_loss_clip": 1.01564395, "balance_loss_mlp": 1.03593767, "epoch": 0.8037276416654141, "flos": 26030280666240.0, "grad_norm": 4.336748545766294, "language_loss": 0.74019039, "learning_rate": 3.905704482846428e-07, "loss": 0.76150906, "num_input_tokens_seen": 288481865, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 13368, "time_per_iteration": 2.5123069286346436 }, { "auxiliary_loss_clip": 0.0110712, "auxiliary_loss_mlp": 0.01032305, "balance_loss_clip": 1.02009225, "balance_loss_mlp": 1.03597581, "epoch": 0.803787764918082, "flos": 18802180886400.0, "grad_norm": 2.316156918580419, "language_loss": 0.70246798, "learning_rate": 3.90339270344789e-07, "loss": 0.72386223, "num_input_tokens_seen": 288499345, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 13369, "time_per_iteration": 2.422959566116333 }, { "auxiliary_loss_clip": 0.0110222, "auxiliary_loss_mlp": 0.01032155, "balance_loss_clip": 1.02071106, "balance_loss_mlp": 1.03467155, "epoch": 0.80384788817075, "flos": 20225787592320.0, "grad_norm": 2.159688445041415, "language_loss": 0.7377063, "learning_rate": 3.901081534434312e-07, "loss": 0.75905001, "num_input_tokens_seen": 288517660, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.67578125, "step": 13370, "time_per_iteration": 2.465912342071533 }, { "auxiliary_loss_clip": 0.01108599, "auxiliary_loss_mlp": 0.01034538, "balance_loss_clip": 1.02122247, "balance_loss_mlp": 1.03645992, "epoch": 0.8039080114234181, "flos": 18515290959360.0, "grad_norm": 3.030318702166131, "language_loss": 0.86875087, "learning_rate": 3.898770975893342e-07, "loss": 0.89018226, "num_input_tokens_seen": 288534180, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 13371, "time_per_iteration": 2.42242169380188 }, { "auxiliary_loss_clip": 0.011089, "auxiliary_loss_mlp": 0.01031624, "balance_loss_clip": 1.01833189, "balance_loss_mlp": 1.03589201, "epoch": 0.803968134676086, "flos": 22382510883840.0, "grad_norm": 1.8775343445100268, "language_loss": 0.74967474, "learning_rate": 3.89646102791259e-07, "loss": 0.77107996, "num_input_tokens_seen": 288553350, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 13372, "time_per_iteration": 2.472196102142334 }, { "auxiliary_loss_clip": 0.01103736, "auxiliary_loss_mlp": 0.01028484, "balance_loss_clip": 1.01562166, "balance_loss_mlp": 1.0349946, "epoch": 0.804028257928754, "flos": 23842566915840.0, "grad_norm": 2.0913899642879414, "language_loss": 0.79324377, "learning_rate": 3.894151690579646e-07, "loss": 0.81456596, "num_input_tokens_seen": 288571325, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 13373, "time_per_iteration": 2.4662914276123047 }, { "auxiliary_loss_clip": 0.01101872, "auxiliary_loss_mlp": 0.01033376, "balance_loss_clip": 1.0217768, "balance_loss_mlp": 1.03493285, "epoch": 0.8040883811814219, "flos": 23550720912000.0, "grad_norm": 1.5202636513564238, "language_loss": 0.74404514, "learning_rate": 3.8918429639820815e-07, "loss": 0.76539755, "num_input_tokens_seen": 288592100, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 13374, "time_per_iteration": 2.5016496181488037 }, { "auxiliary_loss_clip": 0.01106328, "auxiliary_loss_mlp": 0.01031867, "balance_loss_clip": 1.01887882, "balance_loss_mlp": 1.03506243, "epoch": 0.8041485044340899, "flos": 19026263882880.0, "grad_norm": 3.69239952529215, "language_loss": 0.68804777, "learning_rate": 3.889534848207452e-07, "loss": 0.70942974, "num_input_tokens_seen": 288612305, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 13375, "time_per_iteration": 2.4371135234832764 }, { "auxiliary_loss_clip": 0.01029063, "auxiliary_loss_mlp": 0.01002041, "balance_loss_clip": 1.00103366, "balance_loss_mlp": 1.00643325, "epoch": 0.8042086276867578, "flos": 70005663797760.0, "grad_norm": 0.7317136796395681, "language_loss": 0.55633432, "learning_rate": 3.887227343343271e-07, "loss": 0.57664537, "num_input_tokens_seen": 288676015, "router_z_loss_clip": 0.0100708, "router_z_loss_mlp": 0.2265625, "step": 13376, "time_per_iteration": 3.1780331134796143 }, { "auxiliary_loss_clip": 0.01106676, "auxiliary_loss_mlp": 0.01028014, "balance_loss_clip": 1.01592064, "balance_loss_mlp": 1.0355953, "epoch": 0.8042687509394258, "flos": 21872435800320.0, "grad_norm": 2.4210206819750053, "language_loss": 0.73110044, "learning_rate": 3.8849204494770425e-07, "loss": 0.75244737, "num_input_tokens_seen": 288696455, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.7109375, "step": 13377, "time_per_iteration": 3.968829393386841 }, { "auxiliary_loss_clip": 0.01104155, "auxiliary_loss_mlp": 0.01029473, "balance_loss_clip": 1.01665187, "balance_loss_mlp": 1.03350854, "epoch": 0.8043288741920938, "flos": 26614870513920.0, "grad_norm": 3.409689840123828, "language_loss": 0.70042598, "learning_rate": 3.8826141666962567e-07, "loss": 0.72176224, "num_input_tokens_seen": 288715560, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 13378, "time_per_iteration": 3.862109899520874 }, { "auxiliary_loss_clip": 0.01107609, "auxiliary_loss_mlp": 0.01030425, "balance_loss_clip": 1.01756847, "balance_loss_mlp": 1.03715289, "epoch": 0.8043889974447618, "flos": 33403387651200.0, "grad_norm": 1.4584538633666484, "language_loss": 0.69303137, "learning_rate": 3.880308495088347e-07, "loss": 0.71441174, "num_input_tokens_seen": 288739485, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 13379, "time_per_iteration": 2.577378511428833 }, { "auxiliary_loss_clip": 0.01110301, "auxiliary_loss_mlp": 0.01032024, "balance_loss_clip": 1.01811171, "balance_loss_mlp": 1.03796601, "epoch": 0.8044491206974297, "flos": 20375966355840.0, "grad_norm": 2.0047616879985064, "language_loss": 0.75956786, "learning_rate": 3.8780034347407533e-07, "loss": 0.78099114, "num_input_tokens_seen": 288757420, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.72265625, "step": 13380, "time_per_iteration": 3.8248019218444824 }, { "auxiliary_loss_clip": 0.01102949, "auxiliary_loss_mlp": 0.01027475, "balance_loss_clip": 1.01600158, "balance_loss_mlp": 1.0341512, "epoch": 0.8045092439500977, "flos": 23403810286080.0, "grad_norm": 2.290778718340793, "language_loss": 0.69101417, "learning_rate": 3.875698985740887e-07, "loss": 0.71231842, "num_input_tokens_seen": 288775535, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6875, "step": 13381, "time_per_iteration": 2.4789559841156006 }, { "auxiliary_loss_clip": 0.0110875, "auxiliary_loss_mlp": 0.01031251, "balance_loss_clip": 1.01858521, "balance_loss_mlp": 1.0385983, "epoch": 0.8045693672027656, "flos": 24097245321600.0, "grad_norm": 3.3306595562865513, "language_loss": 0.64086282, "learning_rate": 3.873395148176135e-07, "loss": 0.6622628, "num_input_tokens_seen": 288795035, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 13382, "time_per_iteration": 3.9337050914764404 }, { "auxiliary_loss_clip": 0.01104041, "auxiliary_loss_mlp": 0.01035749, "balance_loss_clip": 1.02425122, "balance_loss_mlp": 1.03601432, "epoch": 0.8046294904554336, "flos": 27707165147520.0, "grad_norm": 2.0109561509442075, "language_loss": 0.76445138, "learning_rate": 3.8710919221338487e-07, "loss": 0.78584933, "num_input_tokens_seen": 288816270, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 13383, "time_per_iteration": 2.5108933448791504 }, { "auxiliary_loss_clip": 0.01105813, "auxiliary_loss_mlp": 0.01036425, "balance_loss_clip": 1.02427721, "balance_loss_mlp": 1.03657234, "epoch": 0.8046896137081017, "flos": 24972998814720.0, "grad_norm": 1.9419885104444519, "language_loss": 0.69952166, "learning_rate": 3.868789307701381e-07, "loss": 0.72094405, "num_input_tokens_seen": 288836050, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69140625, "step": 13384, "time_per_iteration": 2.49831223487854 }, { "auxiliary_loss_clip": 0.01106409, "auxiliary_loss_mlp": 0.01034771, "balance_loss_clip": 1.02090085, "balance_loss_mlp": 1.03421664, "epoch": 0.8047497369607696, "flos": 17675484001920.0, "grad_norm": 3.131703262350833, "language_loss": 0.80113435, "learning_rate": 3.8664873049660375e-07, "loss": 0.82254618, "num_input_tokens_seen": 288852900, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.72265625, "step": 13385, "time_per_iteration": 2.4376559257507324 }, { "auxiliary_loss_clip": 0.01104642, "auxiliary_loss_mlp": 0.01030606, "balance_loss_clip": 1.01785696, "balance_loss_mlp": 1.03474152, "epoch": 0.8048098602134376, "flos": 22382079920640.0, "grad_norm": 1.5928391676710725, "language_loss": 0.7195406, "learning_rate": 3.864185914015108e-07, "loss": 0.74089313, "num_input_tokens_seen": 288872625, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 13386, "time_per_iteration": 2.4736592769622803 }, { "auxiliary_loss_clip": 0.01028868, "auxiliary_loss_mlp": 0.01002169, "balance_loss_clip": 1.00109637, "balance_loss_mlp": 1.00636756, "epoch": 0.8048699834661055, "flos": 71200949702400.0, "grad_norm": 0.6739313314865844, "language_loss": 0.51257145, "learning_rate": 3.861885134935865e-07, "loss": 0.53288186, "num_input_tokens_seen": 288939180, "router_z_loss_clip": 0.01074219, "router_z_loss_mlp": 0.22460938, "step": 13387, "time_per_iteration": 3.139638900756836 }, { "auxiliary_loss_clip": 0.01106397, "auxiliary_loss_mlp": 0.01031567, "balance_loss_clip": 1.01754785, "balance_loss_mlp": 1.03579128, "epoch": 0.8049301067187735, "flos": 23660320285440.0, "grad_norm": 1.6794897193746716, "language_loss": 0.74078667, "learning_rate": 3.859584967815559e-07, "loss": 0.76216638, "num_input_tokens_seen": 288958925, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.70703125, "step": 13388, "time_per_iteration": 2.4902777671813965 }, { "auxiliary_loss_clip": 0.01103804, "auxiliary_loss_mlp": 0.01027022, "balance_loss_clip": 1.0151968, "balance_loss_mlp": 1.0365572, "epoch": 0.8049902299714414, "flos": 24426330750720.0, "grad_norm": 1.616824313493933, "language_loss": 0.71732599, "learning_rate": 3.857285412741411e-07, "loss": 0.73863429, "num_input_tokens_seen": 288980935, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 13389, "time_per_iteration": 2.5141115188598633 }, { "auxiliary_loss_clip": 0.01105789, "auxiliary_loss_mlp": 0.01030001, "balance_loss_clip": 1.01775861, "balance_loss_mlp": 1.037377, "epoch": 0.8050503532241094, "flos": 17492626840320.0, "grad_norm": 2.0867470129711325, "language_loss": 0.82690579, "learning_rate": 3.8549864698006097e-07, "loss": 0.84826374, "num_input_tokens_seen": 288996780, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.68359375, "step": 13390, "time_per_iteration": 2.4326515197753906 }, { "auxiliary_loss_clip": 0.01028604, "auxiliary_loss_mlp": 0.01003033, "balance_loss_clip": 1.00204325, "balance_loss_mlp": 1.00621152, "epoch": 0.8051104764767774, "flos": 57658030369920.0, "grad_norm": 0.7882013492983773, "language_loss": 0.55545616, "learning_rate": 3.8526881390803424e-07, "loss": 0.57577252, "num_input_tokens_seen": 289057590, "router_z_loss_clip": 0.0098877, "router_z_loss_mlp": 0.22460938, "step": 13391, "time_per_iteration": 3.0582263469696045 }, { "auxiliary_loss_clip": 0.01103075, "auxiliary_loss_mlp": 0.01031665, "balance_loss_clip": 1.01978624, "balance_loss_mlp": 1.03575397, "epoch": 0.8051705997294454, "flos": 18003456109440.0, "grad_norm": 2.6369637077012844, "language_loss": 0.84824252, "learning_rate": 3.850390420667762e-07, "loss": 0.86958992, "num_input_tokens_seen": 289076285, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 13392, "time_per_iteration": 2.453360080718994 }, { "auxiliary_loss_clip": 0.01103666, "auxiliary_loss_mlp": 0.01028372, "balance_loss_clip": 1.01645148, "balance_loss_mlp": 1.03440237, "epoch": 0.8052307229821133, "flos": 26397754755840.0, "grad_norm": 2.354490946538654, "language_loss": 0.70184982, "learning_rate": 3.8480933146499914e-07, "loss": 0.72317022, "num_input_tokens_seen": 289097585, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 13393, "time_per_iteration": 2.5359065532684326 }, { "auxiliary_loss_clip": 0.01106076, "auxiliary_loss_mlp": 0.01032699, "balance_loss_clip": 1.01964569, "balance_loss_mlp": 1.03629529, "epoch": 0.8052908462347813, "flos": 21757018423680.0, "grad_norm": 2.1219646653649584, "language_loss": 0.76180828, "learning_rate": 3.84579682111414e-07, "loss": 0.78319597, "num_input_tokens_seen": 289116890, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 13394, "time_per_iteration": 2.4983675479888916 }, { "auxiliary_loss_clip": 0.01106606, "auxiliary_loss_mlp": 0.01027636, "balance_loss_clip": 1.0156256, "balance_loss_mlp": 1.03751755, "epoch": 0.8053509694874492, "flos": 25442279026560.0, "grad_norm": 1.6993065359269204, "language_loss": 0.65020657, "learning_rate": 3.843500940147304e-07, "loss": 0.67154896, "num_input_tokens_seen": 289136670, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 13395, "time_per_iteration": 2.5031650066375732 }, { "auxiliary_loss_clip": 0.01028856, "auxiliary_loss_mlp": 0.01001297, "balance_loss_clip": 1.00031912, "balance_loss_mlp": 1.00637341, "epoch": 0.8054110927401172, "flos": 57668122091520.0, "grad_norm": 0.7663052793531178, "language_loss": 0.57360893, "learning_rate": 3.8412056718365206e-07, "loss": 0.59391046, "num_input_tokens_seen": 289200150, "router_z_loss_clip": 0.00976562, "router_z_loss_mlp": 0.22460938, "step": 13396, "time_per_iteration": 3.2108750343322754 }, { "auxiliary_loss_clip": 0.01105937, "auxiliary_loss_mlp": 0.01034367, "balance_loss_clip": 1.02083707, "balance_loss_mlp": 1.03575778, "epoch": 0.8054712159927853, "flos": 19276201693440.0, "grad_norm": 1.693918995479455, "language_loss": 0.77439106, "learning_rate": 3.8389110162688353e-07, "loss": 0.79579413, "num_input_tokens_seen": 289218125, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 13397, "time_per_iteration": 2.479515314102173 }, { "auxiliary_loss_clip": 0.01106481, "auxiliary_loss_mlp": 0.01025322, "balance_loss_clip": 1.0130856, "balance_loss_mlp": 1.03818369, "epoch": 0.8055313392454532, "flos": 17967617314560.0, "grad_norm": 1.73403557343391, "language_loss": 0.70362675, "learning_rate": 3.836616973531266e-07, "loss": 0.72494483, "num_input_tokens_seen": 289237115, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.68359375, "step": 13398, "time_per_iteration": 2.4488844871520996 }, { "auxiliary_loss_clip": 0.01104801, "auxiliary_loss_mlp": 0.01028441, "balance_loss_clip": 1.01655626, "balance_loss_mlp": 1.03597724, "epoch": 0.8055914624981212, "flos": 13478352635520.0, "grad_norm": 2.935253250608294, "language_loss": 0.69012356, "learning_rate": 3.834323543710805e-07, "loss": 0.71145594, "num_input_tokens_seen": 289253635, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 13399, "time_per_iteration": 2.4422290325164795 }, { "auxiliary_loss_clip": 0.01105767, "auxiliary_loss_mlp": 0.01031163, "balance_loss_clip": 1.01988626, "balance_loss_mlp": 1.03677058, "epoch": 0.8056515857507891, "flos": 13224787551360.0, "grad_norm": 3.3237031634011074, "language_loss": 0.72628164, "learning_rate": 3.8320307268944153e-07, "loss": 0.74765098, "num_input_tokens_seen": 289270085, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6875, "step": 13400, "time_per_iteration": 2.4620556831359863 }, { "auxiliary_loss_clip": 0.01102941, "auxiliary_loss_mlp": 0.01029432, "balance_loss_clip": 1.01718915, "balance_loss_mlp": 1.0345217, "epoch": 0.8057117090034571, "flos": 23878190229120.0, "grad_norm": 1.7938314464518819, "language_loss": 0.6412288, "learning_rate": 3.829738523169037e-07, "loss": 0.66255248, "num_input_tokens_seen": 289289645, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.68359375, "step": 13401, "time_per_iteration": 2.4943761825561523 }, { "auxiliary_loss_clip": 0.01106418, "auxiliary_loss_mlp": 0.01028665, "balance_loss_clip": 1.01683378, "balance_loss_mlp": 1.03595614, "epoch": 0.805771832256125, "flos": 21214300855680.0, "grad_norm": 2.164814675723121, "language_loss": 0.8409614, "learning_rate": 3.8274469326215985e-07, "loss": 0.8623122, "num_input_tokens_seen": 289306630, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.703125, "step": 13402, "time_per_iteration": 2.4971961975097656 }, { "auxiliary_loss_clip": 0.01108686, "auxiliary_loss_mlp": 0.01031517, "balance_loss_clip": 1.01919079, "balance_loss_mlp": 1.03814483, "epoch": 0.805831955508793, "flos": 17566818382080.0, "grad_norm": 1.9068610493886855, "language_loss": 0.6748454, "learning_rate": 3.8251559553389876e-07, "loss": 0.69624746, "num_input_tokens_seen": 289324960, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.70703125, "step": 13403, "time_per_iteration": 2.4408435821533203 }, { "auxiliary_loss_clip": 0.01104779, "auxiliary_loss_mlp": 0.01035968, "balance_loss_clip": 1.02427959, "balance_loss_mlp": 1.03805208, "epoch": 0.805892078761461, "flos": 26907542530560.0, "grad_norm": 1.7452317487682565, "language_loss": 0.84736192, "learning_rate": 3.822865591408084e-07, "loss": 0.86876941, "num_input_tokens_seen": 289344980, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6640625, "step": 13404, "time_per_iteration": 2.5272161960601807 }, { "auxiliary_loss_clip": 0.01101146, "auxiliary_loss_mlp": 0.0103188, "balance_loss_clip": 1.02040637, "balance_loss_mlp": 1.03451109, "epoch": 0.805952202014129, "flos": 31506442496640.0, "grad_norm": 1.900221993648362, "language_loss": 0.70632565, "learning_rate": 3.820575840915743e-07, "loss": 0.72765595, "num_input_tokens_seen": 289367500, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.66796875, "step": 13405, "time_per_iteration": 2.6342756748199463 }, { "auxiliary_loss_clip": 0.01102968, "auxiliary_loss_mlp": 0.01024736, "balance_loss_clip": 1.0133872, "balance_loss_mlp": 1.03522873, "epoch": 0.8060123252667969, "flos": 24389953251840.0, "grad_norm": 2.292664665931262, "language_loss": 0.75606269, "learning_rate": 3.818286703948788e-07, "loss": 0.7773397, "num_input_tokens_seen": 289385930, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.67578125, "step": 13406, "time_per_iteration": 2.5607500076293945 }, { "auxiliary_loss_clip": 0.01106582, "auxiliary_loss_mlp": 0.01031265, "balance_loss_clip": 1.01877832, "balance_loss_mlp": 1.03714252, "epoch": 0.8060724485194649, "flos": 23479941162240.0, "grad_norm": 1.5241558455320587, "language_loss": 0.76361644, "learning_rate": 3.815998180594018e-07, "loss": 0.78499496, "num_input_tokens_seen": 289408025, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 13407, "time_per_iteration": 2.5262339115142822 }, { "auxiliary_loss_clip": 0.01104626, "auxiliary_loss_mlp": 0.01032188, "balance_loss_clip": 1.01987326, "balance_loss_mlp": 1.03596246, "epoch": 0.8061325717721328, "flos": 18624495283200.0, "grad_norm": 1.856184448041962, "language_loss": 0.74150431, "learning_rate": 3.81371027093822e-07, "loss": 0.76287246, "num_input_tokens_seen": 289426575, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 13408, "time_per_iteration": 2.4693121910095215 }, { "auxiliary_loss_clip": 0.01104612, "auxiliary_loss_mlp": 0.0103077, "balance_loss_clip": 1.01809192, "balance_loss_mlp": 1.03578806, "epoch": 0.8061926950248008, "flos": 23582752865280.0, "grad_norm": 2.0779695574136285, "language_loss": 0.70762694, "learning_rate": 3.8114229750681523e-07, "loss": 0.72898072, "num_input_tokens_seen": 289447760, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 13409, "time_per_iteration": 2.497394323348999 }, { "auxiliary_loss_clip": 0.01105371, "auxiliary_loss_mlp": 0.01028139, "balance_loss_clip": 1.01605773, "balance_loss_mlp": 1.03572989, "epoch": 0.8062528182774689, "flos": 11143333209600.0, "grad_norm": 2.280638415929891, "language_loss": 0.76959586, "learning_rate": 3.809136293070545e-07, "loss": 0.79093099, "num_input_tokens_seen": 289463920, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 13410, "time_per_iteration": 2.4323997497558594 }, { "auxiliary_loss_clip": 0.01103935, "auxiliary_loss_mlp": 0.01034046, "balance_loss_clip": 1.02163649, "balance_loss_mlp": 1.03662848, "epoch": 0.8063129415301368, "flos": 22346815743360.0, "grad_norm": 1.9379645703940913, "language_loss": 0.68472064, "learning_rate": 3.806850225032117e-07, "loss": 0.70610052, "num_input_tokens_seen": 289482635, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.671875, "step": 13411, "time_per_iteration": 2.4858481884002686 }, { "auxiliary_loss_clip": 0.01102829, "auxiliary_loss_mlp": 0.01028123, "balance_loss_clip": 1.01649427, "balance_loss_mlp": 1.03532946, "epoch": 0.8063730647828048, "flos": 23988400133760.0, "grad_norm": 1.9876719763439676, "language_loss": 0.68515384, "learning_rate": 3.804564771039551e-07, "loss": 0.7064634, "num_input_tokens_seen": 289502040, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 13412, "time_per_iteration": 2.5142030715942383 }, { "auxiliary_loss_clip": 0.01109429, "auxiliary_loss_mlp": 0.01033436, "balance_loss_clip": 1.01978064, "balance_loss_mlp": 1.03785372, "epoch": 0.8064331880354727, "flos": 21321494017920.0, "grad_norm": 1.8568372516759801, "language_loss": 0.8155508, "learning_rate": 3.8022799311795064e-07, "loss": 0.83697945, "num_input_tokens_seen": 289520740, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71484375, "step": 13413, "time_per_iteration": 2.4682393074035645 }, { "auxiliary_loss_clip": 0.0110397, "auxiliary_loss_mlp": 0.01033317, "balance_loss_clip": 1.02093673, "balance_loss_mlp": 1.03617895, "epoch": 0.8064933112881407, "flos": 19682890456320.0, "grad_norm": 2.3050794189958244, "language_loss": 0.85336745, "learning_rate": 3.7999957055386303e-07, "loss": 0.87474024, "num_input_tokens_seen": 289535840, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6796875, "step": 13414, "time_per_iteration": 2.4787261486053467 }, { "auxiliary_loss_clip": 0.01102454, "auxiliary_loss_mlp": 0.01030357, "balance_loss_clip": 1.01886475, "balance_loss_mlp": 1.03431654, "epoch": 0.8065534345408086, "flos": 19279721226240.0, "grad_norm": 2.0710663132085982, "language_loss": 0.67192543, "learning_rate": 3.7977120942035467e-07, "loss": 0.69325352, "num_input_tokens_seen": 289555205, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 13415, "time_per_iteration": 2.53715443611145 }, { "auxiliary_loss_clip": 0.01101976, "auxiliary_loss_mlp": 0.01025445, "balance_loss_clip": 1.01423311, "balance_loss_mlp": 1.03564048, "epoch": 0.8066135577934767, "flos": 19677718897920.0, "grad_norm": 2.0823657994941107, "language_loss": 0.76382446, "learning_rate": 3.7954290972608383e-07, "loss": 0.78509867, "num_input_tokens_seen": 289573000, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6640625, "step": 13416, "time_per_iteration": 2.4904375076293945 }, { "auxiliary_loss_clip": 0.01107494, "auxiliary_loss_mlp": 0.01032047, "balance_loss_clip": 1.01998866, "balance_loss_mlp": 1.03522134, "epoch": 0.8066736810461446, "flos": 21143592933120.0, "grad_norm": 2.352092202188334, "language_loss": 0.65342784, "learning_rate": 3.793146714797086e-07, "loss": 0.67482328, "num_input_tokens_seen": 289592625, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.72265625, "step": 13417, "time_per_iteration": 2.474867820739746 }, { "auxiliary_loss_clip": 0.01108641, "auxiliary_loss_mlp": 0.01036173, "balance_loss_clip": 1.02409756, "balance_loss_mlp": 1.03731906, "epoch": 0.8067338042988126, "flos": 22598261925120.0, "grad_norm": 1.9223532074148535, "language_loss": 0.80815351, "learning_rate": 3.7908649468988306e-07, "loss": 0.82960171, "num_input_tokens_seen": 289610780, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.7109375, "step": 13418, "time_per_iteration": 2.47135329246521 }, { "auxiliary_loss_clip": 0.01107389, "auxiliary_loss_mlp": 0.0102944, "balance_loss_clip": 1.01659513, "balance_loss_mlp": 1.03662825, "epoch": 0.8067939275514805, "flos": 16508423208960.0, "grad_norm": 3.4192456925522463, "language_loss": 0.8484543, "learning_rate": 3.7885837936526066e-07, "loss": 0.86982262, "num_input_tokens_seen": 289628890, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 13419, "time_per_iteration": 5.329720973968506 }, { "auxiliary_loss_clip": 0.01105486, "auxiliary_loss_mlp": 0.01029274, "balance_loss_clip": 1.0171032, "balance_loss_mlp": 1.0350914, "epoch": 0.8068540508041485, "flos": 28541836460160.0, "grad_norm": 1.7353342628454405, "language_loss": 0.7570312, "learning_rate": 3.7863032551449047e-07, "loss": 0.77837873, "num_input_tokens_seen": 289647220, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 13420, "time_per_iteration": 2.5236895084381104 }, { "auxiliary_loss_clip": 0.01101148, "auxiliary_loss_mlp": 0.01025119, "balance_loss_clip": 1.01460505, "balance_loss_mlp": 1.03407502, "epoch": 0.8069141740568164, "flos": 21652482867840.0, "grad_norm": 1.8402187621400925, "language_loss": 0.78793436, "learning_rate": 3.784023331462207e-07, "loss": 0.80919707, "num_input_tokens_seen": 289665800, "router_z_loss_clip": 0.10546875, "router_z_loss_mlp": 0.66796875, "step": 13421, "time_per_iteration": 2.4517910480499268 }, { "auxiliary_loss_clip": 0.01106297, "auxiliary_loss_mlp": 0.01023393, "balance_loss_clip": 1.01109087, "balance_loss_mlp": 1.03703594, "epoch": 0.8069742973094844, "flos": 17529327561600.0, "grad_norm": 2.003130490600954, "language_loss": 0.79569888, "learning_rate": 3.78174402269098e-07, "loss": 0.8169958, "num_input_tokens_seen": 289682705, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 13422, "time_per_iteration": 3.835552215576172 }, { "auxiliary_loss_clip": 0.01102944, "auxiliary_loss_mlp": 0.01030973, "balance_loss_clip": 1.01924253, "balance_loss_mlp": 1.03438914, "epoch": 0.8070344205621525, "flos": 23367037737600.0, "grad_norm": 1.9009281301527503, "language_loss": 0.67931241, "learning_rate": 3.7794653289176347e-07, "loss": 0.70065159, "num_input_tokens_seen": 289702920, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 13423, "time_per_iteration": 2.5043275356292725 }, { "auxiliary_loss_clip": 0.01104721, "auxiliary_loss_mlp": 0.01036776, "balance_loss_clip": 1.02404475, "balance_loss_mlp": 1.03448486, "epoch": 0.8070945438148204, "flos": 22930184528640.0, "grad_norm": 2.010288693444004, "language_loss": 0.80218494, "learning_rate": 3.7771872502285904e-07, "loss": 0.82359993, "num_input_tokens_seen": 289723280, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 13424, "time_per_iteration": 3.9221158027648926 }, { "auxiliary_loss_clip": 0.01106016, "auxiliary_loss_mlp": 0.01026571, "balance_loss_clip": 1.01442313, "balance_loss_mlp": 1.03489995, "epoch": 0.8071546670674884, "flos": 25300683613440.0, "grad_norm": 1.437297148607897, "language_loss": 0.78921223, "learning_rate": 3.774909786710232e-07, "loss": 0.81053805, "num_input_tokens_seen": 289743475, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.7109375, "step": 13425, "time_per_iteration": 2.512454032897949 }, { "auxiliary_loss_clip": 0.01103757, "auxiliary_loss_mlp": 0.010295, "balance_loss_clip": 1.01781118, "balance_loss_mlp": 1.03541195, "epoch": 0.8072147903201563, "flos": 18113701927680.0, "grad_norm": 2.873691225976873, "language_loss": 0.75691247, "learning_rate": 3.772632938448923e-07, "loss": 0.77824509, "num_input_tokens_seen": 289761400, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.68359375, "step": 13426, "time_per_iteration": 2.4340927600860596 }, { "auxiliary_loss_clip": 0.01104092, "auxiliary_loss_mlp": 0.01027146, "balance_loss_clip": 1.01508832, "balance_loss_mlp": 1.03510451, "epoch": 0.8072749135728243, "flos": 26688164215680.0, "grad_norm": 1.8784258030546976, "language_loss": 0.73432946, "learning_rate": 3.770356705530997e-07, "loss": 0.75564188, "num_input_tokens_seen": 289781025, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 13427, "time_per_iteration": 2.524013042449951 }, { "auxiliary_loss_clip": 0.011057, "auxiliary_loss_mlp": 0.01035707, "balance_loss_clip": 1.02238536, "balance_loss_mlp": 1.03659534, "epoch": 0.8073350368254922, "flos": 19240291071360.0, "grad_norm": 1.71087423229255, "language_loss": 0.70229876, "learning_rate": 3.768081088042774e-07, "loss": 0.7237128, "num_input_tokens_seen": 289798380, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69140625, "step": 13428, "time_per_iteration": 2.4434685707092285 }, { "auxiliary_loss_clip": 0.01104518, "auxiliary_loss_mlp": 0.01027305, "balance_loss_clip": 1.01573527, "balance_loss_mlp": 1.03513515, "epoch": 0.8073951600781603, "flos": 13334530579200.0, "grad_norm": 1.883469295842064, "language_loss": 0.74698585, "learning_rate": 3.765806086070544e-07, "loss": 0.76830411, "num_input_tokens_seen": 289814515, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6953125, "step": 13429, "time_per_iteration": 2.418994665145874 }, { "auxiliary_loss_clip": 0.01101207, "auxiliary_loss_mlp": 0.01028347, "balance_loss_clip": 1.01662314, "balance_loss_mlp": 1.03508437, "epoch": 0.8074552833308282, "flos": 22853191726080.0, "grad_norm": 4.905082015439168, "language_loss": 0.67003429, "learning_rate": 3.763531699700568e-07, "loss": 0.69132984, "num_input_tokens_seen": 289834315, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.66015625, "step": 13430, "time_per_iteration": 2.4576807022094727 }, { "auxiliary_loss_clip": 0.01104682, "auxiliary_loss_mlp": 0.01030424, "balance_loss_clip": 1.018646, "balance_loss_mlp": 1.03575563, "epoch": 0.8075154065834962, "flos": 20339409288960.0, "grad_norm": 1.7300361425396344, "language_loss": 0.80204195, "learning_rate": 3.7612579290190994e-07, "loss": 0.82339299, "num_input_tokens_seen": 289853770, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 13431, "time_per_iteration": 2.4584600925445557 }, { "auxiliary_loss_clip": 0.0110234, "auxiliary_loss_mlp": 0.01029272, "balance_loss_clip": 1.01658821, "balance_loss_mlp": 1.03490686, "epoch": 0.8075755298361641, "flos": 21908059113600.0, "grad_norm": 2.2300287493663475, "language_loss": 0.79958868, "learning_rate": 3.7589847741123593e-07, "loss": 0.82090479, "num_input_tokens_seen": 289870480, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.671875, "step": 13432, "time_per_iteration": 2.450885057449341 }, { "auxiliary_loss_clip": 0.01111068, "auxiliary_loss_mlp": 0.01035022, "balance_loss_clip": 1.02249336, "balance_loss_mlp": 1.03895354, "epoch": 0.8076356530888321, "flos": 15669298609920.0, "grad_norm": 2.2361811807771184, "language_loss": 0.70427692, "learning_rate": 3.7567122350665415e-07, "loss": 0.72573781, "num_input_tokens_seen": 289888275, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 13433, "time_per_iteration": 2.4233269691467285 }, { "auxiliary_loss_clip": 0.01104241, "auxiliary_loss_mlp": 0.01028597, "balance_loss_clip": 1.01712275, "balance_loss_mlp": 1.03571343, "epoch": 0.8076957763415, "flos": 37777414521600.0, "grad_norm": 1.8862779099165259, "language_loss": 0.72653735, "learning_rate": 3.754440311967828e-07, "loss": 0.74786568, "num_input_tokens_seen": 289911495, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6875, "step": 13434, "time_per_iteration": 2.605269193649292 }, { "auxiliary_loss_clip": 0.01105526, "auxiliary_loss_mlp": 0.01025355, "balance_loss_clip": 1.0138036, "balance_loss_mlp": 1.03809202, "epoch": 0.807755899594168, "flos": 19610781903360.0, "grad_norm": 2.252828729901502, "language_loss": 0.68095624, "learning_rate": 3.752169004902361e-07, "loss": 0.70226502, "num_input_tokens_seen": 289930045, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.67578125, "step": 13435, "time_per_iteration": 2.4490067958831787 }, { "auxiliary_loss_clip": 0.01107718, "auxiliary_loss_mlp": 0.01035682, "balance_loss_clip": 1.02213955, "balance_loss_mlp": 1.03705323, "epoch": 0.8078160228468361, "flos": 23294893271040.0, "grad_norm": 1.7657057501793925, "language_loss": 0.75138021, "learning_rate": 3.749898313956279e-07, "loss": 0.77281421, "num_input_tokens_seen": 289950815, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.70703125, "step": 13436, "time_per_iteration": 2.481212615966797 }, { "auxiliary_loss_clip": 0.01099024, "auxiliary_loss_mlp": 0.01029521, "balance_loss_clip": 1.01727271, "balance_loss_mlp": 1.03250837, "epoch": 0.807876146099504, "flos": 27162651899520.0, "grad_norm": 1.940110344315083, "language_loss": 0.70370996, "learning_rate": 3.747628239215674e-07, "loss": 0.72499537, "num_input_tokens_seen": 289971730, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6640625, "step": 13437, "time_per_iteration": 2.51067852973938 }, { "auxiliary_loss_clip": 0.01105201, "auxiliary_loss_mlp": 0.01031559, "balance_loss_clip": 1.01990044, "balance_loss_mlp": 1.03807271, "epoch": 0.807936269352172, "flos": 27160030206720.0, "grad_norm": 1.8454923737495206, "language_loss": 0.72611082, "learning_rate": 3.745358780766636e-07, "loss": 0.74747849, "num_input_tokens_seen": 289992995, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 13438, "time_per_iteration": 2.5139083862304688 }, { "auxiliary_loss_clip": 0.01102536, "auxiliary_loss_mlp": 0.01030772, "balance_loss_clip": 1.0192802, "balance_loss_mlp": 1.03564453, "epoch": 0.8079963926048399, "flos": 20740423703040.0, "grad_norm": 3.750023462635659, "language_loss": 0.77438176, "learning_rate": 3.7430899386952344e-07, "loss": 0.79571486, "num_input_tokens_seen": 290009405, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 13439, "time_per_iteration": 2.4456074237823486 }, { "auxiliary_loss_clip": 0.01104047, "auxiliary_loss_mlp": 0.01036036, "balance_loss_clip": 1.02412748, "balance_loss_mlp": 1.03610241, "epoch": 0.8080565158575079, "flos": 25009663622400.0, "grad_norm": 1.5130481084749887, "language_loss": 0.78692663, "learning_rate": 3.7408217130874786e-07, "loss": 0.80832744, "num_input_tokens_seen": 290031085, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 13440, "time_per_iteration": 2.5046045780181885 }, { "auxiliary_loss_clip": 0.01105809, "auxiliary_loss_mlp": 0.01028337, "balance_loss_clip": 1.01552844, "balance_loss_mlp": 1.03544664, "epoch": 0.8081166391101758, "flos": 18698076293760.0, "grad_norm": 1.7213570611386928, "language_loss": 0.59325099, "learning_rate": 3.7385541040293946e-07, "loss": 0.61459243, "num_input_tokens_seen": 290048670, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 13441, "time_per_iteration": 2.438399314880371 }, { "auxiliary_loss_clip": 0.01103403, "auxiliary_loss_mlp": 0.01031554, "balance_loss_clip": 1.01863742, "balance_loss_mlp": 1.03558004, "epoch": 0.8081767623628439, "flos": 19828651847040.0, "grad_norm": 2.338705604944314, "language_loss": 0.76518416, "learning_rate": 3.7362871116069684e-07, "loss": 0.78653371, "num_input_tokens_seen": 290064085, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6796875, "step": 13442, "time_per_iteration": 2.4353675842285156 }, { "auxiliary_loss_clip": 0.01105273, "auxiliary_loss_mlp": 0.01030983, "balance_loss_clip": 1.01896691, "balance_loss_mlp": 1.0365665, "epoch": 0.8082368856155118, "flos": 35772952982400.0, "grad_norm": 1.485081903781087, "language_loss": 0.70438814, "learning_rate": 3.734020735906169e-07, "loss": 0.72575068, "num_input_tokens_seen": 290086255, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 13443, "time_per_iteration": 2.6032917499542236 }, { "auxiliary_loss_clip": 0.01104593, "auxiliary_loss_mlp": 0.01035409, "balance_loss_clip": 1.02352405, "balance_loss_mlp": 1.03738856, "epoch": 0.8082970088681798, "flos": 17198015489280.0, "grad_norm": 4.894677427087939, "language_loss": 0.82535535, "learning_rate": 3.7317549770129286e-07, "loss": 0.84675539, "num_input_tokens_seen": 290103995, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 13444, "time_per_iteration": 2.4382729530334473 }, { "auxiliary_loss_clip": 0.01029262, "auxiliary_loss_mlp": 0.01001593, "balance_loss_clip": 1.0005976, "balance_loss_mlp": 1.00689769, "epoch": 0.8083571321208477, "flos": 63555207511680.0, "grad_norm": 0.899159838251158, "language_loss": 0.53642952, "learning_rate": 3.7294898350131754e-07, "loss": 0.55673808, "num_input_tokens_seen": 290157245, "router_z_loss_clip": 0.00994873, "router_z_loss_mlp": 0.22460938, "step": 13445, "time_per_iteration": 2.9438343048095703 }, { "auxiliary_loss_clip": 0.01105103, "auxiliary_loss_mlp": 0.01033453, "balance_loss_clip": 1.020239, "balance_loss_mlp": 1.03664064, "epoch": 0.8084172553735157, "flos": 17930701111680.0, "grad_norm": 2.153217420971385, "language_loss": 0.7295059, "learning_rate": 3.7272253099927964e-07, "loss": 0.75089145, "num_input_tokens_seen": 290174970, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.68359375, "step": 13446, "time_per_iteration": 2.4706907272338867 }, { "auxiliary_loss_clip": 0.01109398, "auxiliary_loss_mlp": 0.01032898, "balance_loss_clip": 1.0198741, "balance_loss_mlp": 1.038095, "epoch": 0.8084773786261836, "flos": 24097999507200.0, "grad_norm": 2.003054702991815, "language_loss": 0.71279943, "learning_rate": 3.7249614020376606e-07, "loss": 0.73422241, "num_input_tokens_seen": 290194395, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 13447, "time_per_iteration": 2.533597230911255 }, { "auxiliary_loss_clip": 0.01106393, "auxiliary_loss_mlp": 0.0103261, "balance_loss_clip": 1.0190556, "balance_loss_mlp": 1.03564405, "epoch": 0.8085375018788516, "flos": 15588211656960.0, "grad_norm": 2.07833889213013, "language_loss": 0.74900281, "learning_rate": 3.7226981112336197e-07, "loss": 0.77039278, "num_input_tokens_seen": 290209200, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.70703125, "step": 13448, "time_per_iteration": 2.413623094558716 }, { "auxiliary_loss_clip": 0.01029637, "auxiliary_loss_mlp": 0.01001956, "balance_loss_clip": 1.00097263, "balance_loss_mlp": 1.00727713, "epoch": 0.8085976251315197, "flos": 67561296393600.0, "grad_norm": 0.741211319138431, "language_loss": 0.6384573, "learning_rate": 3.7204354376665024e-07, "loss": 0.65877324, "num_input_tokens_seen": 290274565, "router_z_loss_clip": 0.00982666, "router_z_loss_mlp": 0.22363281, "step": 13449, "time_per_iteration": 3.1445486545562744 }, { "auxiliary_loss_clip": 0.01105317, "auxiliary_loss_mlp": 0.0102666, "balance_loss_clip": 1.01387477, "balance_loss_mlp": 1.03661251, "epoch": 0.8086577483841876, "flos": 22561453463040.0, "grad_norm": 1.6980713884248577, "language_loss": 0.73946142, "learning_rate": 3.718173381422105e-07, "loss": 0.76078123, "num_input_tokens_seen": 290293630, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 13450, "time_per_iteration": 2.4871768951416016 }, { "auxiliary_loss_clip": 0.01101922, "auxiliary_loss_mlp": 0.01030099, "balance_loss_clip": 1.01793993, "balance_loss_mlp": 1.03306365, "epoch": 0.8087178716368556, "flos": 17968084191360.0, "grad_norm": 3.7579364770804964, "language_loss": 0.74060237, "learning_rate": 3.7159119425861986e-07, "loss": 0.7619226, "num_input_tokens_seen": 290311450, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 13451, "time_per_iteration": 2.4465513229370117 }, { "auxiliary_loss_clip": 0.01107047, "auxiliary_loss_mlp": 0.01031427, "balance_loss_clip": 1.01811182, "balance_loss_mlp": 1.03495741, "epoch": 0.8087779948895235, "flos": 21719527603200.0, "grad_norm": 1.7558175336431365, "language_loss": 0.7986331, "learning_rate": 3.713651121244543e-07, "loss": 0.82001787, "num_input_tokens_seen": 290330165, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 13452, "time_per_iteration": 2.531665325164795 }, { "auxiliary_loss_clip": 0.01105849, "auxiliary_loss_mlp": 0.01033761, "balance_loss_clip": 1.02136874, "balance_loss_mlp": 1.0361352, "epoch": 0.8088381181421915, "flos": 29092885983360.0, "grad_norm": 1.6262729771377789, "language_loss": 0.78399771, "learning_rate": 3.711390917482875e-07, "loss": 0.80539382, "num_input_tokens_seen": 290350815, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 13453, "time_per_iteration": 2.5501632690429688 }, { "auxiliary_loss_clip": 0.01103325, "auxiliary_loss_mlp": 0.01032156, "balance_loss_clip": 1.0194304, "balance_loss_mlp": 1.03408122, "epoch": 0.8088982413948594, "flos": 22198432659840.0, "grad_norm": 2.3221161516912563, "language_loss": 0.77348435, "learning_rate": 3.709131331386892e-07, "loss": 0.79483914, "num_input_tokens_seen": 290367380, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 13454, "time_per_iteration": 2.4757285118103027 }, { "auxiliary_loss_clip": 0.01103284, "auxiliary_loss_mlp": 0.01035036, "balance_loss_clip": 1.02197027, "balance_loss_mlp": 1.0350914, "epoch": 0.8089583646475275, "flos": 28036717453440.0, "grad_norm": 1.8991050340100524, "language_loss": 0.76865554, "learning_rate": 3.7068723630422795e-07, "loss": 0.7900387, "num_input_tokens_seen": 290387965, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6796875, "step": 13455, "time_per_iteration": 2.5067968368530273 }, { "auxiliary_loss_clip": 0.01104315, "auxiliary_loss_mlp": 0.01031292, "balance_loss_clip": 1.01981187, "balance_loss_mlp": 1.03547072, "epoch": 0.8090184879001954, "flos": 16617735273600.0, "grad_norm": 4.301788073981849, "language_loss": 0.78959203, "learning_rate": 3.70461401253471e-07, "loss": 0.81094813, "num_input_tokens_seen": 290404150, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6875, "step": 13456, "time_per_iteration": 2.4365031719207764 }, { "auxiliary_loss_clip": 0.01105031, "auxiliary_loss_mlp": 0.01034343, "balance_loss_clip": 1.02276754, "balance_loss_mlp": 1.03801036, "epoch": 0.8090786111528634, "flos": 27340804379520.0, "grad_norm": 2.111286759136874, "language_loss": 0.71867096, "learning_rate": 3.702356279949801e-07, "loss": 0.74006468, "num_input_tokens_seen": 290422370, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 13457, "time_per_iteration": 2.5238540172576904 }, { "auxiliary_loss_clip": 0.01103503, "auxiliary_loss_mlp": 0.01028924, "balance_loss_clip": 1.0177598, "balance_loss_mlp": 1.03570795, "epoch": 0.8091387344055313, "flos": 21105742976640.0, "grad_norm": 2.156458536229543, "language_loss": 0.72794473, "learning_rate": 3.700099165373176e-07, "loss": 0.74926889, "num_input_tokens_seen": 290442645, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6796875, "step": 13458, "time_per_iteration": 2.483157157897949 }, { "auxiliary_loss_clip": 0.01105256, "auxiliary_loss_mlp": 0.01034993, "balance_loss_clip": 1.0227201, "balance_loss_mlp": 1.03602135, "epoch": 0.8091988576581993, "flos": 11655060318720.0, "grad_norm": 4.11079615028485, "language_loss": 0.79219329, "learning_rate": 3.6978426688904275e-07, "loss": 0.81359577, "num_input_tokens_seen": 290458520, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 13459, "time_per_iteration": 2.413087844848633 }, { "auxiliary_loss_clip": 0.01105409, "auxiliary_loss_mlp": 0.01029106, "balance_loss_clip": 1.0162251, "balance_loss_mlp": 1.03517842, "epoch": 0.8092589809108672, "flos": 22963329803520.0, "grad_norm": 2.0305223138717152, "language_loss": 0.80234933, "learning_rate": 3.695586790587113e-07, "loss": 0.82369447, "num_input_tokens_seen": 290474465, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 13460, "time_per_iteration": 3.8045949935913086 }, { "auxiliary_loss_clip": 0.01104713, "auxiliary_loss_mlp": 0.01034383, "balance_loss_clip": 1.0213418, "balance_loss_mlp": 1.03448188, "epoch": 0.8093191041635353, "flos": 13260985482240.0, "grad_norm": 1.8486650766895467, "language_loss": 0.84635186, "learning_rate": 3.693331530548789e-07, "loss": 0.86774278, "num_input_tokens_seen": 290492060, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 13461, "time_per_iteration": 3.8944809436798096 }, { "auxiliary_loss_clip": 0.01106706, "auxiliary_loss_mlp": 0.01037608, "balance_loss_clip": 1.02519846, "balance_loss_mlp": 1.03607917, "epoch": 0.8093792274162032, "flos": 25516003691520.0, "grad_norm": 1.9379874405399338, "language_loss": 0.76614708, "learning_rate": 3.69107688886096e-07, "loss": 0.78759027, "num_input_tokens_seen": 290511510, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 13462, "time_per_iteration": 2.4969544410705566 }, { "auxiliary_loss_clip": 0.0110754, "auxiliary_loss_mlp": 0.01034889, "balance_loss_clip": 1.02166891, "balance_loss_mlp": 1.03740072, "epoch": 0.8094393506688712, "flos": 23546483107200.0, "grad_norm": 3.026678018384309, "language_loss": 0.833121, "learning_rate": 3.6888228656091357e-07, "loss": 0.8545453, "num_input_tokens_seen": 290530035, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 13463, "time_per_iteration": 2.4905688762664795 }, { "auxiliary_loss_clip": 0.01103157, "auxiliary_loss_mlp": 0.01035547, "balance_loss_clip": 1.02447259, "balance_loss_mlp": 1.03575575, "epoch": 0.8094994739215392, "flos": 17055917285760.0, "grad_norm": 1.8258902201661478, "language_loss": 0.62325913, "learning_rate": 3.686569460878779e-07, "loss": 0.64464617, "num_input_tokens_seen": 290548245, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.671875, "step": 13464, "time_per_iteration": 3.844848394393921 }, { "auxiliary_loss_clip": 0.0110245, "auxiliary_loss_mlp": 0.01025893, "balance_loss_clip": 1.01505041, "balance_loss_mlp": 1.03538704, "epoch": 0.8095595971742071, "flos": 23551223702400.0, "grad_norm": 1.484030881665689, "language_loss": 0.61841536, "learning_rate": 3.684316674755341e-07, "loss": 0.63969874, "num_input_tokens_seen": 290568625, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.671875, "step": 13465, "time_per_iteration": 3.9473397731781006 }, { "auxiliary_loss_clip": 0.01104866, "auxiliary_loss_mlp": 0.010349, "balance_loss_clip": 1.02297354, "balance_loss_mlp": 1.03743482, "epoch": 0.8096197204268751, "flos": 20373201008640.0, "grad_norm": 1.786564954227268, "language_loss": 0.81937397, "learning_rate": 3.682064507324256e-07, "loss": 0.84077156, "num_input_tokens_seen": 290586575, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 13466, "time_per_iteration": 2.4653398990631104 }, { "auxiliary_loss_clip": 0.01108307, "auxiliary_loss_mlp": 0.01035911, "balance_loss_clip": 1.02351892, "balance_loss_mlp": 1.03745461, "epoch": 0.809679843679543, "flos": 27818775682560.0, "grad_norm": 1.7509674600907787, "language_loss": 0.75604224, "learning_rate": 3.6798129586709204e-07, "loss": 0.77748442, "num_input_tokens_seen": 290606790, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 13467, "time_per_iteration": 2.529080867767334 }, { "auxiliary_loss_clip": 0.011013, "auxiliary_loss_mlp": 0.01027909, "balance_loss_clip": 1.01580977, "balance_loss_mlp": 1.03242683, "epoch": 0.8097399669322111, "flos": 22014103040640.0, "grad_norm": 1.892601082777338, "language_loss": 0.79237771, "learning_rate": 3.6775620288807073e-07, "loss": 0.8136698, "num_input_tokens_seen": 290625525, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 13468, "time_per_iteration": 2.466383695602417 }, { "auxiliary_loss_clip": 0.01099569, "auxiliary_loss_mlp": 0.01030497, "balance_loss_clip": 1.01885068, "balance_loss_mlp": 1.03377891, "epoch": 0.809800090184879, "flos": 18988988544000.0, "grad_norm": 2.2789399786204223, "language_loss": 0.68283582, "learning_rate": 3.675311718038978e-07, "loss": 0.70413649, "num_input_tokens_seen": 290644935, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.65625, "step": 13469, "time_per_iteration": 2.443779706954956 }, { "auxiliary_loss_clip": 0.01029074, "auxiliary_loss_mlp": 0.0100232, "balance_loss_clip": 1.00132501, "balance_loss_mlp": 1.0069654, "epoch": 0.809860213437547, "flos": 66099516508800.0, "grad_norm": 0.7102866318259579, "language_loss": 0.54638433, "learning_rate": 3.6730620262310683e-07, "loss": 0.56669825, "num_input_tokens_seen": 290710735, "router_z_loss_clip": 0.00994873, "router_z_loss_mlp": 0.22070312, "step": 13470, "time_per_iteration": 3.182476043701172 }, { "auxiliary_loss_clip": 0.01103118, "auxiliary_loss_mlp": 0.01028853, "balance_loss_clip": 1.01735556, "balance_loss_mlp": 1.03468382, "epoch": 0.8099203366902149, "flos": 20882485992960.0, "grad_norm": 2.2441375869741123, "language_loss": 0.6980747, "learning_rate": 3.670812953542279e-07, "loss": 0.71939445, "num_input_tokens_seen": 290729565, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.68359375, "step": 13471, "time_per_iteration": 2.4499411582946777 }, { "auxiliary_loss_clip": 0.01104593, "auxiliary_loss_mlp": 0.01027876, "balance_loss_clip": 1.01647341, "balance_loss_mlp": 1.03631985, "epoch": 0.8099804599428829, "flos": 26030927111040.0, "grad_norm": 1.7926176542996257, "language_loss": 0.7929486, "learning_rate": 3.6685645000579003e-07, "loss": 0.8142733, "num_input_tokens_seen": 290749360, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 13472, "time_per_iteration": 2.504561185836792 }, { "auxiliary_loss_clip": 0.01028926, "auxiliary_loss_mlp": 0.01002936, "balance_loss_clip": 1.00194669, "balance_loss_mlp": 1.00656557, "epoch": 0.8100405831955508, "flos": 69303573584640.0, "grad_norm": 0.7476886332518377, "language_loss": 0.5794214, "learning_rate": 3.666316665863201e-07, "loss": 0.59974003, "num_input_tokens_seen": 290812145, "router_z_loss_clip": 0.0098877, "router_z_loss_mlp": 0.22363281, "step": 13473, "time_per_iteration": 3.0525925159454346 }, { "auxiliary_loss_clip": 0.01106996, "auxiliary_loss_mlp": 0.01031914, "balance_loss_clip": 1.01953983, "balance_loss_mlp": 1.03702307, "epoch": 0.8101007064482189, "flos": 15012492468480.0, "grad_norm": 2.4264299382540693, "language_loss": 0.73887324, "learning_rate": 3.664069451043399e-07, "loss": 0.76026237, "num_input_tokens_seen": 290829845, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69921875, "step": 13474, "time_per_iteration": 2.4809677600860596 }, { "auxiliary_loss_clip": 0.01108272, "auxiliary_loss_mlp": 0.01032165, "balance_loss_clip": 1.02015519, "balance_loss_mlp": 1.03732407, "epoch": 0.8101608297008868, "flos": 21067210661760.0, "grad_norm": 1.7113701369827972, "language_loss": 0.78698409, "learning_rate": 3.661822855683723e-07, "loss": 0.80838847, "num_input_tokens_seen": 290848815, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 13475, "time_per_iteration": 2.4557416439056396 }, { "auxiliary_loss_clip": 0.01103044, "auxiliary_loss_mlp": 0.0103728, "balance_loss_clip": 1.0254606, "balance_loss_mlp": 1.03525209, "epoch": 0.8102209529535548, "flos": 23731279603200.0, "grad_norm": 1.7805653560452672, "language_loss": 0.75483984, "learning_rate": 3.659576879869364e-07, "loss": 0.77624309, "num_input_tokens_seen": 290868580, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 13476, "time_per_iteration": 2.510242462158203 }, { "auxiliary_loss_clip": 0.01108999, "auxiliary_loss_mlp": 0.01035409, "balance_loss_clip": 1.02209306, "balance_loss_mlp": 1.03751743, "epoch": 0.8102810762062228, "flos": 10955879107200.0, "grad_norm": 2.4986853447410655, "language_loss": 0.73940533, "learning_rate": 3.657331523685485e-07, "loss": 0.76084942, "num_input_tokens_seen": 290883540, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 13477, "time_per_iteration": 2.4457194805145264 }, { "auxiliary_loss_clip": 0.01103074, "auxiliary_loss_mlp": 0.01032494, "balance_loss_clip": 1.0206387, "balance_loss_mlp": 1.03445482, "epoch": 0.8103411994588907, "flos": 14648825220480.0, "grad_norm": 2.829603740187628, "language_loss": 0.69582385, "learning_rate": 3.6550867872172365e-07, "loss": 0.71717954, "num_input_tokens_seen": 290901560, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 13478, "time_per_iteration": 2.4609086513519287 }, { "auxiliary_loss_clip": 0.0102915, "auxiliary_loss_mlp": 0.01004149, "balance_loss_clip": 1.00314808, "balance_loss_mlp": 1.00685906, "epoch": 0.8104013227115587, "flos": 59153314665600.0, "grad_norm": 0.6906782211742926, "language_loss": 0.52192003, "learning_rate": 3.6528426705497293e-07, "loss": 0.54225302, "num_input_tokens_seen": 290959185, "router_z_loss_clip": 0.01000977, "router_z_loss_mlp": 0.22265625, "step": 13479, "time_per_iteration": 3.022282838821411 }, { "auxiliary_loss_clip": 0.01104928, "auxiliary_loss_mlp": 0.01030809, "balance_loss_clip": 1.01880407, "balance_loss_mlp": 1.03692222, "epoch": 0.8104614459642266, "flos": 19828687760640.0, "grad_norm": 4.114030698572091, "language_loss": 0.71508974, "learning_rate": 3.650599173768072e-07, "loss": 0.7364471, "num_input_tokens_seen": 290979585, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 13480, "time_per_iteration": 2.5118777751922607 }, { "auxiliary_loss_clip": 0.01104108, "auxiliary_loss_mlp": 0.01028359, "balance_loss_clip": 1.01704621, "balance_loss_mlp": 1.03501523, "epoch": 0.8105215692168947, "flos": 25374264624000.0, "grad_norm": 2.8632750792866313, "language_loss": 0.79714173, "learning_rate": 3.648356296957327e-07, "loss": 0.81846637, "num_input_tokens_seen": 291000865, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.69140625, "step": 13481, "time_per_iteration": 2.4905056953430176 }, { "auxiliary_loss_clip": 0.01103431, "auxiliary_loss_mlp": 0.01032889, "balance_loss_clip": 1.02102184, "balance_loss_mlp": 1.03474545, "epoch": 0.8105816924695626, "flos": 20481722974080.0, "grad_norm": 1.8672124586892909, "language_loss": 0.72162282, "learning_rate": 3.646114040202548e-07, "loss": 0.74298608, "num_input_tokens_seen": 291018285, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 13482, "time_per_iteration": 2.4790871143341064 }, { "auxiliary_loss_clip": 0.01103778, "auxiliary_loss_mlp": 0.01024495, "balance_loss_clip": 1.01203144, "balance_loss_mlp": 1.03333521, "epoch": 0.8106418157222306, "flos": 14538687143040.0, "grad_norm": 2.1004814953292676, "language_loss": 0.65310609, "learning_rate": 3.6438724035887705e-07, "loss": 0.67438883, "num_input_tokens_seen": 291035745, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 13483, "time_per_iteration": 2.443171501159668 }, { "auxiliary_loss_clip": 0.01103821, "auxiliary_loss_mlp": 0.01027122, "balance_loss_clip": 1.01501, "balance_loss_mlp": 1.03504229, "epoch": 0.8107019389748985, "flos": 22564470205440.0, "grad_norm": 1.7838683145337912, "language_loss": 0.76153642, "learning_rate": 3.641631387200992e-07, "loss": 0.78284585, "num_input_tokens_seen": 291053280, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 13484, "time_per_iteration": 2.491943120956421 }, { "auxiliary_loss_clip": 0.01110035, "auxiliary_loss_mlp": 0.01034152, "balance_loss_clip": 1.02096105, "balance_loss_mlp": 1.03777885, "epoch": 0.8107620622275665, "flos": 19609560840960.0, "grad_norm": 1.7491743444886685, "language_loss": 0.72169399, "learning_rate": 3.639390991124183e-07, "loss": 0.74313581, "num_input_tokens_seen": 291072855, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 13485, "time_per_iteration": 2.4877216815948486 }, { "auxiliary_loss_clip": 0.01103474, "auxiliary_loss_mlp": 0.01025858, "balance_loss_clip": 1.01469445, "balance_loss_mlp": 1.03686905, "epoch": 0.8108221854802344, "flos": 16143498984960.0, "grad_norm": 1.9640203741744497, "language_loss": 0.76041001, "learning_rate": 3.637151215443308e-07, "loss": 0.78170335, "num_input_tokens_seen": 291090285, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6640625, "step": 13486, "time_per_iteration": 2.4511983394622803 }, { "auxiliary_loss_clip": 0.0110847, "auxiliary_loss_mlp": 0.01032245, "balance_loss_clip": 1.0200851, "balance_loss_mlp": 1.03697824, "epoch": 0.8108823087329025, "flos": 21106209853440.0, "grad_norm": 3.026468818489939, "language_loss": 0.724738, "learning_rate": 3.6349120602433045e-07, "loss": 0.74614513, "num_input_tokens_seen": 291107675, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.71484375, "step": 13487, "time_per_iteration": 2.4696414470672607 }, { "auxiliary_loss_clip": 0.01101709, "auxiliary_loss_mlp": 0.01033324, "balance_loss_clip": 1.02185559, "balance_loss_mlp": 1.03629875, "epoch": 0.8109424319855704, "flos": 29199648182400.0, "grad_norm": 2.0257740813932044, "language_loss": 0.84401596, "learning_rate": 3.6326735256090715e-07, "loss": 0.86536634, "num_input_tokens_seen": 291126900, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.65625, "step": 13488, "time_per_iteration": 2.556658983230591 }, { "auxiliary_loss_clip": 0.01107601, "auxiliary_loss_mlp": 0.01029976, "balance_loss_clip": 1.01717925, "balance_loss_mlp": 1.03722692, "epoch": 0.8110025552382384, "flos": 23111856541440.0, "grad_norm": 2.229367415068896, "language_loss": 0.74005067, "learning_rate": 3.630435611625502e-07, "loss": 0.76142639, "num_input_tokens_seen": 291145285, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 13489, "time_per_iteration": 2.473179340362549 }, { "auxiliary_loss_clip": 0.01101989, "auxiliary_loss_mlp": 0.01031272, "balance_loss_clip": 1.01948857, "balance_loss_mlp": 1.0351553, "epoch": 0.8110626784909064, "flos": 22379961018240.0, "grad_norm": 1.7883475011707282, "language_loss": 0.7201885, "learning_rate": 3.628198318377453e-07, "loss": 0.74152112, "num_input_tokens_seen": 291163485, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.66796875, "step": 13490, "time_per_iteration": 2.511880874633789 }, { "auxiliary_loss_clip": 0.01107574, "auxiliary_loss_mlp": 0.01035399, "balance_loss_clip": 1.02225637, "balance_loss_mlp": 1.03717828, "epoch": 0.8111228017435743, "flos": 23368043318400.0, "grad_norm": 2.013797658915237, "language_loss": 0.71899205, "learning_rate": 3.625961645949762e-07, "loss": 0.74042177, "num_input_tokens_seen": 291182215, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 13491, "time_per_iteration": 2.4865670204162598 }, { "auxiliary_loss_clip": 0.01103576, "auxiliary_loss_mlp": 0.01028408, "balance_loss_clip": 1.01666605, "balance_loss_mlp": 1.0346365, "epoch": 0.8111829249962423, "flos": 21286553063040.0, "grad_norm": 1.4577313683148092, "language_loss": 0.67900437, "learning_rate": 3.623725594427245e-07, "loss": 0.70032424, "num_input_tokens_seen": 291203145, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 13492, "time_per_iteration": 2.5198850631713867 }, { "auxiliary_loss_clip": 0.01107245, "auxiliary_loss_mlp": 0.01031262, "balance_loss_clip": 1.01835167, "balance_loss_mlp": 1.03690755, "epoch": 0.8112430482489102, "flos": 22345558767360.0, "grad_norm": 1.7415225824539116, "language_loss": 0.72141159, "learning_rate": 3.6214901638947006e-07, "loss": 0.74279666, "num_input_tokens_seen": 291220600, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 13493, "time_per_iteration": 2.4444713592529297 }, { "auxiliary_loss_clip": 0.0110434, "auxiliary_loss_mlp": 0.01037171, "balance_loss_clip": 1.02421856, "balance_loss_mlp": 1.03470099, "epoch": 0.8113031715015783, "flos": 31138321962240.0, "grad_norm": 1.6466347360895537, "language_loss": 0.70755619, "learning_rate": 3.619255354436885e-07, "loss": 0.72897124, "num_input_tokens_seen": 291241195, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 13494, "time_per_iteration": 2.555241346359253 }, { "auxiliary_loss_clip": 0.01106707, "auxiliary_loss_mlp": 0.01030246, "balance_loss_clip": 1.01650691, "balance_loss_mlp": 1.0354141, "epoch": 0.8113632947542462, "flos": 25335445000320.0, "grad_norm": 2.205931550901497, "language_loss": 0.765113, "learning_rate": 3.6170211661385543e-07, "loss": 0.78648251, "num_input_tokens_seen": 291258715, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71484375, "step": 13495, "time_per_iteration": 2.479438304901123 }, { "auxiliary_loss_clip": 0.01104377, "auxiliary_loss_mlp": 0.01036659, "balance_loss_clip": 1.02412403, "balance_loss_mlp": 1.03539348, "epoch": 0.8114234180069142, "flos": 28439168411520.0, "grad_norm": 2.0148369682855916, "language_loss": 0.80039078, "learning_rate": 3.614787599084417e-07, "loss": 0.82180119, "num_input_tokens_seen": 291278030, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 13496, "time_per_iteration": 2.5134437084198 }, { "auxiliary_loss_clip": 0.01104241, "auxiliary_loss_mlp": 0.01032276, "balance_loss_clip": 1.01885915, "balance_loss_mlp": 1.03550839, "epoch": 0.8114835412595821, "flos": 20338870584960.0, "grad_norm": 2.042607094319217, "language_loss": 0.71683884, "learning_rate": 3.6125546533591787e-07, "loss": 0.738204, "num_input_tokens_seen": 291296740, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6875, "step": 13497, "time_per_iteration": 2.4565720558166504 }, { "auxiliary_loss_clip": 0.0110589, "auxiliary_loss_mlp": 0.01029156, "balance_loss_clip": 1.01749742, "balance_loss_mlp": 1.03615141, "epoch": 0.8115436645122501, "flos": 22490889194880.0, "grad_norm": 1.8047850145219608, "language_loss": 0.76737475, "learning_rate": 3.610322329047508e-07, "loss": 0.78872526, "num_input_tokens_seen": 291318730, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6953125, "step": 13498, "time_per_iteration": 2.5294387340545654 }, { "auxiliary_loss_clip": 0.01104535, "auxiliary_loss_mlp": 0.01035511, "balance_loss_clip": 1.02307773, "balance_loss_mlp": 1.03527904, "epoch": 0.811603787764918, "flos": 13845288021120.0, "grad_norm": 1.8576938500273412, "language_loss": 0.83886689, "learning_rate": 3.608090626234055e-07, "loss": 0.8602674, "num_input_tokens_seen": 291336755, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 13499, "time_per_iteration": 2.469597101211548 }, { "auxiliary_loss_clip": 0.01104053, "auxiliary_loss_mlp": 0.01030847, "balance_loss_clip": 1.01744795, "balance_loss_mlp": 1.03551137, "epoch": 0.8116639110175861, "flos": 21614632911360.0, "grad_norm": 1.5455971351955058, "language_loss": 0.76013488, "learning_rate": 3.6058595450034603e-07, "loss": 0.78148389, "num_input_tokens_seen": 291356795, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.68359375, "step": 13500, "time_per_iteration": 2.493959426879883 }, { "auxiliary_loss_clip": 0.01028682, "auxiliary_loss_mlp": 0.01002371, "balance_loss_clip": 1.00136936, "balance_loss_mlp": 1.00637126, "epoch": 0.811724034270254, "flos": 64459799625600.0, "grad_norm": 1.0274413981734443, "language_loss": 0.5992949, "learning_rate": 3.603629085440303e-07, "loss": 0.61960542, "num_input_tokens_seen": 291416005, "router_z_loss_clip": 0.01000977, "router_z_loss_mlp": 0.22265625, "step": 13501, "time_per_iteration": 3.148620843887329 }, { "auxiliary_loss_clip": 0.01100701, "auxiliary_loss_mlp": 0.01027097, "balance_loss_clip": 1.01535511, "balance_loss_mlp": 1.0351733, "epoch": 0.811784157522922, "flos": 24754123290240.0, "grad_norm": 1.9148816145867087, "language_loss": 0.78847998, "learning_rate": 3.6013992476291753e-07, "loss": 0.80975795, "num_input_tokens_seen": 291434870, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.65625, "step": 13502, "time_per_iteration": 3.9256913661956787 }, { "auxiliary_loss_clip": 0.01104724, "auxiliary_loss_mlp": 0.01032816, "balance_loss_clip": 1.0213902, "balance_loss_mlp": 1.03605592, "epoch": 0.81184428077559, "flos": 12167146563840.0, "grad_norm": 2.4038892768963493, "language_loss": 0.7139945, "learning_rate": 3.599170031654635e-07, "loss": 0.73536992, "num_input_tokens_seen": 291452230, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6875, "step": 13503, "time_per_iteration": 3.843647003173828 }, { "auxiliary_loss_clip": 0.01105245, "auxiliary_loss_mlp": 0.01030566, "balance_loss_clip": 1.01719666, "balance_loss_mlp": 1.03643775, "epoch": 0.8119044040282579, "flos": 44422037775360.0, "grad_norm": 1.8047038811608045, "language_loss": 0.67966461, "learning_rate": 3.5969414376012065e-07, "loss": 0.70102274, "num_input_tokens_seen": 291477425, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6875, "step": 13504, "time_per_iteration": 2.6825790405273438 }, { "auxiliary_loss_clip": 0.01104778, "auxiliary_loss_mlp": 0.01031787, "balance_loss_clip": 1.01871598, "balance_loss_mlp": 1.03404927, "epoch": 0.8119645272809259, "flos": 52155507957120.0, "grad_norm": 2.0761090176594457, "language_loss": 0.74370974, "learning_rate": 3.594713465553403e-07, "loss": 0.76507533, "num_input_tokens_seen": 291501070, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 13505, "time_per_iteration": 4.155048370361328 }, { "auxiliary_loss_clip": 0.01104809, "auxiliary_loss_mlp": 0.01029391, "balance_loss_clip": 1.01611733, "balance_loss_mlp": 1.03562677, "epoch": 0.8120246505335939, "flos": 30232978640640.0, "grad_norm": 2.00271880790217, "language_loss": 0.73105896, "learning_rate": 3.5924861155957123e-07, "loss": 0.75240099, "num_input_tokens_seen": 291524945, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69140625, "step": 13506, "time_per_iteration": 2.591801166534424 }, { "auxiliary_loss_clip": 0.01109083, "auxiliary_loss_mlp": 0.01028857, "balance_loss_clip": 1.01622081, "balance_loss_mlp": 1.0358119, "epoch": 0.8120847737862619, "flos": 22127652910080.0, "grad_norm": 2.3866444776958686, "language_loss": 0.76012534, "learning_rate": 3.590259387812593e-07, "loss": 0.78150463, "num_input_tokens_seen": 291544605, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.734375, "step": 13507, "time_per_iteration": 3.9366064071655273 }, { "auxiliary_loss_clip": 0.01104827, "auxiliary_loss_mlp": 0.01026079, "balance_loss_clip": 1.01376486, "balance_loss_mlp": 1.03356123, "epoch": 0.8121448970389298, "flos": 23295180579840.0, "grad_norm": 1.576296690188842, "language_loss": 0.70071423, "learning_rate": 3.5880332822884783e-07, "loss": 0.72202331, "num_input_tokens_seen": 291563850, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 13508, "time_per_iteration": 2.4563379287719727 }, { "auxiliary_loss_clip": 0.01104125, "auxiliary_loss_mlp": 0.01030933, "balance_loss_clip": 1.01911902, "balance_loss_mlp": 1.03633928, "epoch": 0.8122050202915978, "flos": 22164138149760.0, "grad_norm": 1.9260151351433275, "language_loss": 0.76132673, "learning_rate": 3.585807799107785e-07, "loss": 0.78267729, "num_input_tokens_seen": 291581730, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 13509, "time_per_iteration": 2.5917749404907227 }, { "auxiliary_loss_clip": 0.01104444, "auxiliary_loss_mlp": 0.01034528, "balance_loss_clip": 1.02221966, "balance_loss_mlp": 1.03421712, "epoch": 0.8122651435442657, "flos": 23258946735360.0, "grad_norm": 2.0122147096314897, "language_loss": 0.76952934, "learning_rate": 3.58358293835491e-07, "loss": 0.79091907, "num_input_tokens_seen": 291601225, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 13510, "time_per_iteration": 2.6279382705688477 }, { "auxiliary_loss_clip": 0.01107187, "auxiliary_loss_mlp": 0.01031796, "balance_loss_clip": 1.01914191, "balance_loss_mlp": 1.03622115, "epoch": 0.8123252667969337, "flos": 16140015365760.0, "grad_norm": 1.7750882243984893, "language_loss": 0.70114732, "learning_rate": 3.581358700114212e-07, "loss": 0.72253716, "num_input_tokens_seen": 291616995, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 13511, "time_per_iteration": 2.4125430583953857 }, { "auxiliary_loss_clip": 0.01107091, "auxiliary_loss_mlp": 0.01036449, "balance_loss_clip": 1.02406955, "balance_loss_mlp": 1.03675544, "epoch": 0.8123853900496016, "flos": 21245399055360.0, "grad_norm": 2.4504426470463505, "language_loss": 0.79417866, "learning_rate": 3.57913508447004e-07, "loss": 0.8156141, "num_input_tokens_seen": 291636145, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 13512, "time_per_iteration": 2.45479679107666 }, { "auxiliary_loss_clip": 0.01103621, "auxiliary_loss_mlp": 0.01029465, "balance_loss_clip": 1.01741314, "balance_loss_mlp": 1.03509116, "epoch": 0.8124455133022697, "flos": 64377596373120.0, "grad_norm": 1.6489219584927082, "language_loss": 0.63627517, "learning_rate": 3.5769120915067076e-07, "loss": 0.65760601, "num_input_tokens_seen": 291662440, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 13513, "time_per_iteration": 2.853013515472412 }, { "auxiliary_loss_clip": 0.01107157, "auxiliary_loss_mlp": 0.0103249, "balance_loss_clip": 1.02004421, "balance_loss_mlp": 1.03587198, "epoch": 0.8125056365549376, "flos": 23842207779840.0, "grad_norm": 2.5345107782818865, "language_loss": 0.71684778, "learning_rate": 3.5746897213085194e-07, "loss": 0.7382443, "num_input_tokens_seen": 291680950, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7109375, "step": 13514, "time_per_iteration": 2.467944383621216 }, { "auxiliary_loss_clip": 0.01101461, "auxiliary_loss_mlp": 0.01029379, "balance_loss_clip": 1.01692748, "balance_loss_mlp": 1.03406072, "epoch": 0.8125657598076056, "flos": 23550325862400.0, "grad_norm": 19.20399618004666, "language_loss": 0.63366461, "learning_rate": 3.5724679739597364e-07, "loss": 0.65497303, "num_input_tokens_seen": 291702395, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.671875, "step": 13515, "time_per_iteration": 2.49556565284729 }, { "auxiliary_loss_clip": 0.01098363, "auxiliary_loss_mlp": 0.01025652, "balance_loss_clip": 1.01374328, "balance_loss_mlp": 1.0338068, "epoch": 0.8126258830602736, "flos": 20704225772160.0, "grad_norm": 1.5507221480644475, "language_loss": 0.75257468, "learning_rate": 3.570246849544616e-07, "loss": 0.77381486, "num_input_tokens_seen": 291721135, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.64453125, "step": 13516, "time_per_iteration": 2.4442806243896484 }, { "auxiliary_loss_clip": 0.01104853, "auxiliary_loss_mlp": 0.01030761, "balance_loss_clip": 1.01860762, "balance_loss_mlp": 1.0349189, "epoch": 0.8126860063129415, "flos": 23618160696960.0, "grad_norm": 1.534903077876235, "language_loss": 0.91323179, "learning_rate": 3.5680263481473907e-07, "loss": 0.93458796, "num_input_tokens_seen": 291741235, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69921875, "step": 13517, "time_per_iteration": 2.4725852012634277 }, { "auxiliary_loss_clip": 0.01107717, "auxiliary_loss_mlp": 0.01029338, "balance_loss_clip": 1.01752996, "balance_loss_mlp": 1.03820229, "epoch": 0.8127461295656095, "flos": 25007149670400.0, "grad_norm": 1.4815537288623326, "language_loss": 0.78460461, "learning_rate": 3.565806469852244e-07, "loss": 0.8059752, "num_input_tokens_seen": 291761430, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 13518, "time_per_iteration": 2.481757879257202 }, { "auxiliary_loss_clip": 0.01105264, "auxiliary_loss_mlp": 0.01030461, "balance_loss_clip": 1.01950002, "balance_loss_mlp": 1.03734195, "epoch": 0.8128062528182775, "flos": 27342169096320.0, "grad_norm": 1.5982005007804017, "language_loss": 0.79321969, "learning_rate": 3.56358721474336e-07, "loss": 0.81457698, "num_input_tokens_seen": 291781755, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.6796875, "step": 13519, "time_per_iteration": 2.51033091545105 }, { "auxiliary_loss_clip": 0.01104775, "auxiliary_loss_mlp": 0.01037146, "balance_loss_clip": 1.02526093, "balance_loss_mlp": 1.03471875, "epoch": 0.8128663760709455, "flos": 26506312634880.0, "grad_norm": 1.9439820473010632, "language_loss": 0.70579964, "learning_rate": 3.561368582904905e-07, "loss": 0.72721887, "num_input_tokens_seen": 291804410, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.703125, "step": 13520, "time_per_iteration": 2.504328966140747 }, { "auxiliary_loss_clip": 0.01105482, "auxiliary_loss_mlp": 0.01028447, "balance_loss_clip": 1.01657367, "balance_loss_mlp": 1.03614628, "epoch": 0.8129264993236134, "flos": 17931239815680.0, "grad_norm": 1.5352201772962955, "language_loss": 0.72850025, "learning_rate": 3.5591505744209925e-07, "loss": 0.74983954, "num_input_tokens_seen": 291823285, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6953125, "step": 13521, "time_per_iteration": 2.4553301334381104 }, { "auxiliary_loss_clip": 0.01105599, "auxiliary_loss_mlp": 0.01032537, "balance_loss_clip": 1.0202589, "balance_loss_mlp": 1.03508151, "epoch": 0.8129866225762814, "flos": 26177694082560.0, "grad_norm": 2.3398703643829575, "language_loss": 0.70218104, "learning_rate": 3.5569331893757394e-07, "loss": 0.72356236, "num_input_tokens_seen": 291845305, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 13522, "time_per_iteration": 2.5508947372436523 }, { "auxiliary_loss_clip": 0.01101122, "auxiliary_loss_mlp": 0.01034852, "balance_loss_clip": 1.02350891, "balance_loss_mlp": 1.03507948, "epoch": 0.8130467458289493, "flos": 21032197879680.0, "grad_norm": 1.5535478197473551, "language_loss": 0.70342821, "learning_rate": 3.554716427853233e-07, "loss": 0.72478795, "num_input_tokens_seen": 291863715, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66015625, "step": 13523, "time_per_iteration": 2.5035529136657715 }, { "auxiliary_loss_clip": 0.01103267, "auxiliary_loss_mlp": 0.0102942, "balance_loss_clip": 1.01646781, "balance_loss_mlp": 1.0345614, "epoch": 0.8131068690816173, "flos": 15487051979520.0, "grad_norm": 2.278210187086723, "language_loss": 0.70884734, "learning_rate": 3.5525002899375256e-07, "loss": 0.73017418, "num_input_tokens_seen": 291880735, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 13524, "time_per_iteration": 2.4405272006988525 }, { "auxiliary_loss_clip": 0.01103591, "auxiliary_loss_mlp": 0.01030086, "balance_loss_clip": 1.01857078, "balance_loss_mlp": 1.03487575, "epoch": 0.8131669923342852, "flos": 29351227576320.0, "grad_norm": 1.8590318889243385, "language_loss": 0.62414223, "learning_rate": 3.550284775712653e-07, "loss": 0.64547896, "num_input_tokens_seen": 291900535, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 13525, "time_per_iteration": 2.5524425506591797 }, { "auxiliary_loss_clip": 0.01104519, "auxiliary_loss_mlp": 0.01032192, "balance_loss_clip": 1.02058077, "balance_loss_mlp": 1.03646135, "epoch": 0.8132271155869533, "flos": 35256162055680.0, "grad_norm": 1.5890343043362458, "language_loss": 0.65594214, "learning_rate": 3.548069885262628e-07, "loss": 0.67730922, "num_input_tokens_seen": 291919760, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 13526, "time_per_iteration": 2.598017692565918 }, { "auxiliary_loss_clip": 0.01102062, "auxiliary_loss_mlp": 0.01028389, "balance_loss_clip": 1.01712394, "balance_loss_mlp": 1.03415179, "epoch": 0.8132872388396212, "flos": 27781895393280.0, "grad_norm": 1.67360617631448, "language_loss": 0.75405312, "learning_rate": 3.5458556186714473e-07, "loss": 0.7753576, "num_input_tokens_seen": 291938915, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6796875, "step": 13527, "time_per_iteration": 2.54052734375 }, { "auxiliary_loss_clip": 0.01103141, "auxiliary_loss_mlp": 0.01025732, "balance_loss_clip": 1.01428223, "balance_loss_mlp": 1.03477597, "epoch": 0.8133473620922892, "flos": 27819601695360.0, "grad_norm": 1.684999657254135, "language_loss": 0.70658612, "learning_rate": 3.5436419760230706e-07, "loss": 0.72787482, "num_input_tokens_seen": 291958145, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.68359375, "step": 13528, "time_per_iteration": 2.521352529525757 }, { "auxiliary_loss_clip": 0.01104365, "auxiliary_loss_mlp": 0.01031291, "balance_loss_clip": 1.01929212, "balance_loss_mlp": 1.03501546, "epoch": 0.8134074853449572, "flos": 18989527248000.0, "grad_norm": 2.1446986736793625, "language_loss": 0.68630981, "learning_rate": 3.5414289574014357e-07, "loss": 0.70766628, "num_input_tokens_seen": 291976860, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 13529, "time_per_iteration": 2.447638511657715 }, { "auxiliary_loss_clip": 0.01100922, "auxiliary_loss_mlp": 0.01030642, "balance_loss_clip": 1.01869726, "balance_loss_mlp": 1.03367376, "epoch": 0.8134676085976251, "flos": 24242863057920.0, "grad_norm": 1.5851037159854164, "language_loss": 0.77366805, "learning_rate": 3.5392165628904635e-07, "loss": 0.79498363, "num_input_tokens_seen": 291998085, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.671875, "step": 13530, "time_per_iteration": 2.4774985313415527 }, { "auxiliary_loss_clip": 0.01104861, "auxiliary_loss_mlp": 0.01031006, "balance_loss_clip": 1.01826203, "balance_loss_mlp": 1.03687501, "epoch": 0.8135277318502931, "flos": 19062389986560.0, "grad_norm": 1.7945426292772544, "language_loss": 0.8202728, "learning_rate": 3.537004792574052e-07, "loss": 0.84163147, "num_input_tokens_seen": 292016585, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6796875, "step": 13531, "time_per_iteration": 2.440565586090088 }, { "auxiliary_loss_clip": 0.01105943, "auxiliary_loss_mlp": 0.01029493, "balance_loss_clip": 1.01635003, "balance_loss_mlp": 1.03526151, "epoch": 0.813587855102961, "flos": 17269728992640.0, "grad_norm": 1.977536925350266, "language_loss": 0.72125006, "learning_rate": 3.534793646536065e-07, "loss": 0.74260437, "num_input_tokens_seen": 292033255, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 13532, "time_per_iteration": 2.3921291828155518 }, { "auxiliary_loss_clip": 0.0110367, "auxiliary_loss_mlp": 0.01028832, "balance_loss_clip": 1.01687527, "balance_loss_mlp": 1.03530836, "epoch": 0.8136479783556291, "flos": 20157593621760.0, "grad_norm": 2.0055471670144773, "language_loss": 0.76720285, "learning_rate": 3.5325831248603533e-07, "loss": 0.78852785, "num_input_tokens_seen": 292051800, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.68359375, "step": 13533, "time_per_iteration": 2.454617738723755 }, { "auxiliary_loss_clip": 0.01109365, "auxiliary_loss_mlp": 0.01040118, "balance_loss_clip": 1.02703428, "balance_loss_mlp": 1.03626502, "epoch": 0.813708101608297, "flos": 22052348046720.0, "grad_norm": 1.8028171220613558, "language_loss": 0.76153815, "learning_rate": 3.5303732276307495e-07, "loss": 0.78303301, "num_input_tokens_seen": 292072215, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 13534, "time_per_iteration": 2.4474096298217773 }, { "auxiliary_loss_clip": 0.01103578, "auxiliary_loss_mlp": 0.01025286, "balance_loss_clip": 1.01412845, "balance_loss_mlp": 1.03558457, "epoch": 0.813768224860965, "flos": 16173412035840.0, "grad_norm": 2.1922827708759938, "language_loss": 0.93045771, "learning_rate": 3.5281639549310336e-07, "loss": 0.95174634, "num_input_tokens_seen": 292088830, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6796875, "step": 13535, "time_per_iteration": 2.4550273418426514 }, { "auxiliary_loss_clip": 0.01103219, "auxiliary_loss_mlp": 0.01028827, "balance_loss_clip": 1.01753235, "balance_loss_mlp": 1.03670442, "epoch": 0.8138283481136329, "flos": 24352318776960.0, "grad_norm": 1.9067273993139333, "language_loss": 0.70560706, "learning_rate": 3.52595530684499e-07, "loss": 0.72692752, "num_input_tokens_seen": 292109225, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6640625, "step": 13536, "time_per_iteration": 2.482156991958618 }, { "auxiliary_loss_clip": 0.01102713, "auxiliary_loss_mlp": 0.0103255, "balance_loss_clip": 1.02011681, "balance_loss_mlp": 1.03448462, "epoch": 0.8138884713663009, "flos": 25516362827520.0, "grad_norm": 1.6064379702907723, "language_loss": 0.7568047, "learning_rate": 3.5237472834563775e-07, "loss": 0.77815735, "num_input_tokens_seen": 292129660, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 13537, "time_per_iteration": 2.500136137008667 }, { "auxiliary_loss_clip": 0.01102254, "auxiliary_loss_mlp": 0.01030161, "balance_loss_clip": 1.01852608, "balance_loss_mlp": 1.0352124, "epoch": 0.8139485946189688, "flos": 22454368041600.0, "grad_norm": 1.580670101892615, "language_loss": 0.76343846, "learning_rate": 3.5215398848489163e-07, "loss": 0.78476262, "num_input_tokens_seen": 292149090, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 13538, "time_per_iteration": 2.46039080619812 }, { "auxiliary_loss_clip": 0.01102771, "auxiliary_loss_mlp": 0.01029177, "balance_loss_clip": 1.01730394, "balance_loss_mlp": 1.03373861, "epoch": 0.8140087178716369, "flos": 21250391045760.0, "grad_norm": 1.7404936093587944, "language_loss": 0.77757704, "learning_rate": 3.5193331111063176e-07, "loss": 0.79889649, "num_input_tokens_seen": 292169260, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 13539, "time_per_iteration": 2.4831809997558594 }, { "auxiliary_loss_clip": 0.01103528, "auxiliary_loss_mlp": 0.01029003, "balance_loss_clip": 1.01774335, "balance_loss_mlp": 1.03658605, "epoch": 0.8140688411243048, "flos": 39415730774400.0, "grad_norm": 2.109634931376382, "language_loss": 0.65943384, "learning_rate": 3.5171269623122533e-07, "loss": 0.68075919, "num_input_tokens_seen": 292188145, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.66796875, "step": 13540, "time_per_iteration": 2.586064338684082 }, { "auxiliary_loss_clip": 0.01106227, "auxiliary_loss_mlp": 0.01032559, "balance_loss_clip": 1.02146661, "balance_loss_mlp": 1.03771973, "epoch": 0.8141289643769728, "flos": 25415885508480.0, "grad_norm": 2.6271427856846823, "language_loss": 0.67560035, "learning_rate": 3.5149214385503913e-07, "loss": 0.69698817, "num_input_tokens_seen": 292212135, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.6875, "step": 13541, "time_per_iteration": 2.5420773029327393 }, { "auxiliary_loss_clip": 0.01102818, "auxiliary_loss_mlp": 0.01033308, "balance_loss_clip": 1.02102399, "balance_loss_mlp": 1.03463495, "epoch": 0.8141890876296408, "flos": 12568053237120.0, "grad_norm": 1.8574046693193091, "language_loss": 0.69283563, "learning_rate": 3.512716539904355e-07, "loss": 0.71419692, "num_input_tokens_seen": 292230645, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 13542, "time_per_iteration": 2.4188802242279053 }, { "auxiliary_loss_clip": 0.0110875, "auxiliary_loss_mlp": 0.01033221, "balance_loss_clip": 1.02033424, "balance_loss_mlp": 1.03589582, "epoch": 0.8142492108823087, "flos": 14967172483200.0, "grad_norm": 3.2251516910958196, "language_loss": 0.7935558, "learning_rate": 3.5105122664577613e-07, "loss": 0.81497544, "num_input_tokens_seen": 292243540, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73046875, "step": 13543, "time_per_iteration": 2.4056501388549805 }, { "auxiliary_loss_clip": 0.01109119, "auxiliary_loss_mlp": 0.01038978, "balance_loss_clip": 1.02575779, "balance_loss_mlp": 1.0366677, "epoch": 0.8143093341349767, "flos": 12422004537600.0, "grad_norm": 2.80689958431459, "language_loss": 0.7786777, "learning_rate": 3.5083086182942003e-07, "loss": 0.80015874, "num_input_tokens_seen": 292261715, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 13544, "time_per_iteration": 5.308887004852295 }, { "auxiliary_loss_clip": 0.01111728, "auxiliary_loss_mlp": 0.01034498, "balance_loss_clip": 1.02045548, "balance_loss_mlp": 1.03772712, "epoch": 0.8143694573876447, "flos": 11910564737280.0, "grad_norm": 3.132414924304698, "language_loss": 0.73554748, "learning_rate": 3.5061055954972264e-07, "loss": 0.75700974, "num_input_tokens_seen": 292275080, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73828125, "step": 13545, "time_per_iteration": 2.4034459590911865 }, { "auxiliary_loss_clip": 0.01101224, "auxiliary_loss_mlp": 0.01028757, "balance_loss_clip": 1.01691973, "balance_loss_mlp": 1.03416562, "epoch": 0.8144295806403127, "flos": 21212900225280.0, "grad_norm": 1.7693431048135881, "language_loss": 0.76792073, "learning_rate": 3.5039031981503776e-07, "loss": 0.78922057, "num_input_tokens_seen": 292294635, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 13546, "time_per_iteration": 2.4641709327697754 }, { "auxiliary_loss_clip": 0.01107803, "auxiliary_loss_mlp": 0.01027815, "balance_loss_clip": 1.01628113, "balance_loss_mlp": 1.03763998, "epoch": 0.8144897038929806, "flos": 19865280741120.0, "grad_norm": 2.548370559388035, "language_loss": 0.69982833, "learning_rate": 3.501701426337178e-07, "loss": 0.72118455, "num_input_tokens_seen": 292312695, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.703125, "step": 13547, "time_per_iteration": 3.818298816680908 }, { "auxiliary_loss_clip": 0.0110773, "auxiliary_loss_mlp": 0.01034257, "balance_loss_clip": 1.02117968, "balance_loss_mlp": 1.03629184, "epoch": 0.8145498271456486, "flos": 24571733005440.0, "grad_norm": 3.4736748257981014, "language_loss": 0.70855677, "learning_rate": 3.49950028014111e-07, "loss": 0.72997659, "num_input_tokens_seen": 292332005, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 13548, "time_per_iteration": 3.8846333026885986 }, { "auxiliary_loss_clip": 0.01108469, "auxiliary_loss_mlp": 0.01035408, "balance_loss_clip": 1.02222395, "balance_loss_mlp": 1.03756332, "epoch": 0.8146099503983165, "flos": 20193037367040.0, "grad_norm": 2.226441097167319, "language_loss": 0.76958025, "learning_rate": 3.4972997596456444e-07, "loss": 0.79101908, "num_input_tokens_seen": 292348365, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 13549, "time_per_iteration": 2.45166277885437 }, { "auxiliary_loss_clip": 0.01106679, "auxiliary_loss_mlp": 0.01028696, "balance_loss_clip": 1.01632798, "balance_loss_mlp": 1.03718185, "epoch": 0.8146700736509845, "flos": 19536949497600.0, "grad_norm": 2.1340444143182076, "language_loss": 0.70957661, "learning_rate": 3.4950998649342233e-07, "loss": 0.73093033, "num_input_tokens_seen": 292368050, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 13550, "time_per_iteration": 2.43449330329895 }, { "auxiliary_loss_clip": 0.01100107, "auxiliary_loss_mlp": 0.01025762, "balance_loss_clip": 1.01410341, "balance_loss_mlp": 1.03445685, "epoch": 0.8147301969036524, "flos": 18041341979520.0, "grad_norm": 2.536960777798989, "language_loss": 0.72031641, "learning_rate": 3.4929005960902826e-07, "loss": 0.741575, "num_input_tokens_seen": 292385315, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.65625, "step": 13551, "time_per_iteration": 2.4223525524139404 }, { "auxiliary_loss_clip": 0.01110359, "auxiliary_loss_mlp": 0.01031794, "balance_loss_clip": 1.0186398, "balance_loss_mlp": 1.03764141, "epoch": 0.8147903201563205, "flos": 18004713085440.0, "grad_norm": 1.960184515109224, "language_loss": 0.68683195, "learning_rate": 3.4907019531971926e-07, "loss": 0.7082535, "num_input_tokens_seen": 292403375, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 13552, "time_per_iteration": 2.4344022274017334 }, { "auxiliary_loss_clip": 0.01103056, "auxiliary_loss_mlp": 0.01041981, "balance_loss_clip": 1.02976239, "balance_loss_mlp": 1.03436065, "epoch": 0.8148504434089884, "flos": 20259327916800.0, "grad_norm": 2.1539787366991234, "language_loss": 0.81970775, "learning_rate": 3.4885039363383407e-07, "loss": 0.84115815, "num_input_tokens_seen": 292419260, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 13553, "time_per_iteration": 2.4355552196502686 }, { "auxiliary_loss_clip": 0.0110292, "auxiliary_loss_mlp": 0.01026303, "balance_loss_clip": 1.01425648, "balance_loss_mlp": 1.03463769, "epoch": 0.8149105666616564, "flos": 12494723621760.0, "grad_norm": 2.112933055663947, "language_loss": 0.6791029, "learning_rate": 3.4863065455970795e-07, "loss": 0.70039511, "num_input_tokens_seen": 292436095, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.68359375, "step": 13554, "time_per_iteration": 2.400108575820923 }, { "auxiliary_loss_clip": 0.01107643, "auxiliary_loss_mlp": 0.01031868, "balance_loss_clip": 1.01886249, "balance_loss_mlp": 1.03778911, "epoch": 0.8149706899143244, "flos": 32523683662080.0, "grad_norm": 2.665634603694982, "language_loss": 0.66293144, "learning_rate": 3.484109781056723e-07, "loss": 0.68432659, "num_input_tokens_seen": 292457190, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 13555, "time_per_iteration": 2.5372884273529053 }, { "auxiliary_loss_clip": 0.01106112, "auxiliary_loss_mlp": 0.01034316, "balance_loss_clip": 1.02147079, "balance_loss_mlp": 1.03458905, "epoch": 0.8150308131669923, "flos": 19386088375680.0, "grad_norm": 2.253822721810139, "language_loss": 0.73580635, "learning_rate": 3.4819136428005844e-07, "loss": 0.75721061, "num_input_tokens_seen": 292474300, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 13556, "time_per_iteration": 2.422680616378784 }, { "auxiliary_loss_clip": 0.0110687, "auxiliary_loss_mlp": 0.010261, "balance_loss_clip": 1.01505589, "balance_loss_mlp": 1.0389297, "epoch": 0.8150909364196604, "flos": 17421380213760.0, "grad_norm": 1.66881676502297, "language_loss": 0.80550128, "learning_rate": 3.4797181309119307e-07, "loss": 0.82683098, "num_input_tokens_seen": 292492420, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6796875, "step": 13557, "time_per_iteration": 2.4502575397491455 }, { "auxiliary_loss_clip": 0.01110414, "auxiliary_loss_mlp": 0.01032339, "balance_loss_clip": 1.01979852, "balance_loss_mlp": 1.03787398, "epoch": 0.8151510596723283, "flos": 27162795553920.0, "grad_norm": 1.7326559315174381, "language_loss": 0.65612644, "learning_rate": 3.4775232454740255e-07, "loss": 0.67755401, "num_input_tokens_seen": 292512895, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 13558, "time_per_iteration": 2.5021286010742188 }, { "auxiliary_loss_clip": 0.01029437, "auxiliary_loss_mlp": 0.01001277, "balance_loss_clip": 1.0003289, "balance_loss_mlp": 1.00712359, "epoch": 0.8152111829249963, "flos": 64219052718720.0, "grad_norm": 0.7998332616651237, "language_loss": 0.56984711, "learning_rate": 3.4753289865700896e-07, "loss": 0.59015423, "num_input_tokens_seen": 292566580, "router_z_loss_clip": 0.00946045, "router_z_loss_mlp": 0.22265625, "step": 13559, "time_per_iteration": 3.0367910861968994 }, { "auxiliary_loss_clip": 0.01029402, "auxiliary_loss_mlp": 0.01000646, "balance_loss_clip": 0.99967998, "balance_loss_mlp": 1.00705206, "epoch": 0.8152713061776642, "flos": 67072012306560.0, "grad_norm": 0.675823922159606, "language_loss": 0.55302835, "learning_rate": 3.473135354283334e-07, "loss": 0.57332873, "num_input_tokens_seen": 292621490, "router_z_loss_clip": 0.00964355, "router_z_loss_mlp": 0.22363281, "step": 13560, "time_per_iteration": 3.011162757873535 }, { "auxiliary_loss_clip": 0.0110183, "auxiliary_loss_mlp": 0.01029582, "balance_loss_clip": 1.0182215, "balance_loss_mlp": 1.03469312, "epoch": 0.8153314294303322, "flos": 14391130072320.0, "grad_norm": 1.8477006660800606, "language_loss": 0.67324507, "learning_rate": 3.470942348696948e-07, "loss": 0.69455922, "num_input_tokens_seen": 292638660, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.671875, "step": 13561, "time_per_iteration": 2.4582812786102295 }, { "auxiliary_loss_clip": 0.01107999, "auxiliary_loss_mlp": 0.01030933, "balance_loss_clip": 1.0188036, "balance_loss_mlp": 1.03567946, "epoch": 0.8153915526830001, "flos": 25623520076160.0, "grad_norm": 2.0728200369783334, "language_loss": 0.81683916, "learning_rate": 3.468749969894085e-07, "loss": 0.83822846, "num_input_tokens_seen": 292658545, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.72265625, "step": 13562, "time_per_iteration": 2.4852547645568848 }, { "auxiliary_loss_clip": 0.01105484, "auxiliary_loss_mlp": 0.01031471, "balance_loss_clip": 1.01934159, "balance_loss_mlp": 1.03610229, "epoch": 0.8154516759356681, "flos": 23369156640000.0, "grad_norm": 1.6776534945341257, "language_loss": 0.72177142, "learning_rate": 3.4665582179578734e-07, "loss": 0.743141, "num_input_tokens_seen": 292678460, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 13563, "time_per_iteration": 2.4851036071777344 }, { "auxiliary_loss_clip": 0.01104652, "auxiliary_loss_mlp": 0.01029531, "balance_loss_clip": 1.01654315, "balance_loss_mlp": 1.03436267, "epoch": 0.815511799188336, "flos": 28149189914880.0, "grad_norm": 1.5643907263946266, "language_loss": 0.70266497, "learning_rate": 3.4643670929714387e-07, "loss": 0.72400677, "num_input_tokens_seen": 292699815, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 13564, "time_per_iteration": 2.5151479244232178 }, { "auxiliary_loss_clip": 0.01103985, "auxiliary_loss_mlp": 0.01026607, "balance_loss_clip": 1.01434052, "balance_loss_mlp": 1.03500819, "epoch": 0.8155719224410041, "flos": 16983413683200.0, "grad_norm": 2.145425231156195, "language_loss": 0.70544308, "learning_rate": 3.462176595017854e-07, "loss": 0.726749, "num_input_tokens_seen": 292717370, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 13565, "time_per_iteration": 2.435861110687256 }, { "auxiliary_loss_clip": 0.01102807, "auxiliary_loss_mlp": 0.01033098, "balance_loss_clip": 1.02153444, "balance_loss_mlp": 1.03502321, "epoch": 0.815632045693672, "flos": 24681727428480.0, "grad_norm": 4.498355795298741, "language_loss": 0.78656811, "learning_rate": 3.459986724180188e-07, "loss": 0.80792713, "num_input_tokens_seen": 292737110, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6796875, "step": 13566, "time_per_iteration": 2.4900684356689453 }, { "auxiliary_loss_clip": 0.01102989, "auxiliary_loss_mlp": 0.0102838, "balance_loss_clip": 1.01737726, "balance_loss_mlp": 1.03641045, "epoch": 0.81569216894634, "flos": 19938323047680.0, "grad_norm": 3.7875034802424485, "language_loss": 0.8231163, "learning_rate": 3.457797480541491e-07, "loss": 0.84442997, "num_input_tokens_seen": 292756510, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6640625, "step": 13567, "time_per_iteration": 2.442397356033325 }, { "auxiliary_loss_clip": 0.01101413, "auxiliary_loss_mlp": 0.01025064, "balance_loss_clip": 1.01441836, "balance_loss_mlp": 1.03479028, "epoch": 0.8157522921990079, "flos": 21799393493760.0, "grad_norm": 2.0005418461396975, "language_loss": 0.80015153, "learning_rate": 3.455608864184771e-07, "loss": 0.82141632, "num_input_tokens_seen": 292776710, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.6640625, "step": 13568, "time_per_iteration": 2.4413535594940186 }, { "auxiliary_loss_clip": 0.01099637, "auxiliary_loss_mlp": 0.01031207, "balance_loss_clip": 1.01957214, "balance_loss_mlp": 1.03348482, "epoch": 0.8158124154516759, "flos": 18508323720960.0, "grad_norm": 2.3352311697637194, "language_loss": 0.77412534, "learning_rate": 3.453420875193016e-07, "loss": 0.79543382, "num_input_tokens_seen": 292794350, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66015625, "step": 13569, "time_per_iteration": 2.4420225620269775 }, { "auxiliary_loss_clip": 0.01102987, "auxiliary_loss_mlp": 0.01035014, "balance_loss_clip": 1.02386236, "balance_loss_mlp": 1.03568089, "epoch": 0.815872538704344, "flos": 26830801123200.0, "grad_norm": 3.8719377062241147, "language_loss": 0.58114016, "learning_rate": 3.451233513649199e-07, "loss": 0.60252017, "num_input_tokens_seen": 292814005, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.671875, "step": 13570, "time_per_iteration": 2.4869883060455322 }, { "auxiliary_loss_clip": 0.01107888, "auxiliary_loss_mlp": 0.01033722, "balance_loss_clip": 1.02124643, "balance_loss_mlp": 1.03631341, "epoch": 0.8159326619570119, "flos": 21725704742400.0, "grad_norm": 2.1101739617377087, "language_loss": 0.82285738, "learning_rate": 3.4490467796362687e-07, "loss": 0.84427357, "num_input_tokens_seen": 292833485, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 13571, "time_per_iteration": 2.444072961807251 }, { "auxiliary_loss_clip": 0.01104535, "auxiliary_loss_mlp": 0.01039798, "balance_loss_clip": 1.02740073, "balance_loss_mlp": 1.03611183, "epoch": 0.8159927852096799, "flos": 13840726993920.0, "grad_norm": 2.4412195258390286, "language_loss": 0.7887131, "learning_rate": 3.446860673237142e-07, "loss": 0.81015646, "num_input_tokens_seen": 292848045, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 13572, "time_per_iteration": 2.3945059776306152 }, { "auxiliary_loss_clip": 0.01104782, "auxiliary_loss_mlp": 0.01030096, "balance_loss_clip": 1.01843119, "balance_loss_mlp": 1.03467941, "epoch": 0.8160529084623478, "flos": 24499516711680.0, "grad_norm": 3.7763315856135833, "language_loss": 0.65084583, "learning_rate": 3.4446751945347186e-07, "loss": 0.6721946, "num_input_tokens_seen": 292869965, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.69921875, "step": 13573, "time_per_iteration": 2.5426902770996094 }, { "auxiliary_loss_clip": 0.01102577, "auxiliary_loss_mlp": 0.01031284, "balance_loss_clip": 1.01999474, "balance_loss_mlp": 1.03474963, "epoch": 0.8161130317150158, "flos": 24826339584000.0, "grad_norm": 1.680880019967668, "language_loss": 0.75418633, "learning_rate": 3.442490343611868e-07, "loss": 0.77552497, "num_input_tokens_seen": 292889680, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6796875, "step": 13574, "time_per_iteration": 2.475379705429077 }, { "auxiliary_loss_clip": 0.0110561, "auxiliary_loss_mlp": 0.01035063, "balance_loss_clip": 1.02258778, "balance_loss_mlp": 1.03581166, "epoch": 0.8161731549676837, "flos": 30956542208640.0, "grad_norm": 1.6842161951605623, "language_loss": 0.59849334, "learning_rate": 3.4403061205514485e-07, "loss": 0.61990011, "num_input_tokens_seen": 292912360, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 13575, "time_per_iteration": 2.523028612136841 }, { "auxiliary_loss_clip": 0.01103215, "auxiliary_loss_mlp": 0.01031155, "balance_loss_clip": 1.01913285, "balance_loss_mlp": 1.03527272, "epoch": 0.8162332782203517, "flos": 18551991680640.0, "grad_norm": 2.295978755278728, "language_loss": 0.74635506, "learning_rate": 3.4381225254362736e-07, "loss": 0.76769876, "num_input_tokens_seen": 292928325, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 13576, "time_per_iteration": 2.430826187133789 }, { "auxiliary_loss_clip": 0.01029396, "auxiliary_loss_mlp": 0.00999889, "balance_loss_clip": 0.9989773, "balance_loss_mlp": 1.00694323, "epoch": 0.8162934014730197, "flos": 70386853904640.0, "grad_norm": 0.8292453902791955, "language_loss": 0.5862757, "learning_rate": 3.435939558349155e-07, "loss": 0.60656857, "num_input_tokens_seen": 292992795, "router_z_loss_clip": 0.00909424, "router_z_loss_mlp": 0.22460938, "step": 13577, "time_per_iteration": 3.079862117767334 }, { "auxiliary_loss_clip": 0.01101083, "auxiliary_loss_mlp": 0.01028251, "balance_loss_clip": 1.01668143, "balance_loss_mlp": 1.0354135, "epoch": 0.8163535247256877, "flos": 21214839559680.0, "grad_norm": 1.6481911872483044, "language_loss": 0.71382892, "learning_rate": 3.4337572193728747e-07, "loss": 0.73512232, "num_input_tokens_seen": 293011950, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.65625, "step": 13578, "time_per_iteration": 2.441063642501831 }, { "auxiliary_loss_clip": 0.01103164, "auxiliary_loss_mlp": 0.01030342, "balance_loss_clip": 1.01886797, "balance_loss_mlp": 1.0350641, "epoch": 0.8164136479783556, "flos": 21098847565440.0, "grad_norm": 1.758043415007058, "language_loss": 0.73219049, "learning_rate": 3.431575508590172e-07, "loss": 0.7535255, "num_input_tokens_seen": 293030175, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6796875, "step": 13579, "time_per_iteration": 2.458564519882202 }, { "auxiliary_loss_clip": 0.01106612, "auxiliary_loss_mlp": 0.0102509, "balance_loss_clip": 1.01303208, "balance_loss_mlp": 1.03547645, "epoch": 0.8164737712310236, "flos": 21720640924800.0, "grad_norm": 2.08010004086213, "language_loss": 0.79413795, "learning_rate": 3.4293944260837873e-07, "loss": 0.81545496, "num_input_tokens_seen": 293047980, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.7109375, "step": 13580, "time_per_iteration": 2.450669050216675 }, { "auxiliary_loss_clip": 0.01103608, "auxiliary_loss_mlp": 0.01032706, "balance_loss_clip": 1.02041531, "balance_loss_mlp": 1.03650224, "epoch": 0.8165338944836915, "flos": 19536805843200.0, "grad_norm": 1.8103142817859683, "language_loss": 0.69131792, "learning_rate": 3.4272139719364314e-07, "loss": 0.71268106, "num_input_tokens_seen": 293067030, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 13581, "time_per_iteration": 2.459520101547241 }, { "auxiliary_loss_clip": 0.01103837, "auxiliary_loss_mlp": 0.01027124, "balance_loss_clip": 1.01557839, "balance_loss_mlp": 1.03510559, "epoch": 0.8165940177363595, "flos": 22928568416640.0, "grad_norm": 1.859136288941115, "language_loss": 0.59431601, "learning_rate": 3.4250341462307786e-07, "loss": 0.61562562, "num_input_tokens_seen": 293085575, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 13582, "time_per_iteration": 2.4676625728607178 }, { "auxiliary_loss_clip": 0.0109941, "auxiliary_loss_mlp": 0.01027504, "balance_loss_clip": 1.01636386, "balance_loss_mlp": 1.03497982, "epoch": 0.8166541409890276, "flos": 23370377702400.0, "grad_norm": 1.4940485028005925, "language_loss": 0.82306689, "learning_rate": 3.4228549490494897e-07, "loss": 0.84433603, "num_input_tokens_seen": 293108200, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.64453125, "step": 13583, "time_per_iteration": 2.5586295127868652 }, { "auxiliary_loss_clip": 0.01103621, "auxiliary_loss_mlp": 0.0102754, "balance_loss_clip": 1.01634002, "balance_loss_mlp": 1.03521907, "epoch": 0.8167142642416955, "flos": 18441997257600.0, "grad_norm": 1.7905269176290814, "language_loss": 0.7466265, "learning_rate": 3.4206763804752093e-07, "loss": 0.76793814, "num_input_tokens_seen": 293126020, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.68359375, "step": 13584, "time_per_iteration": 2.425926923751831 }, { "auxiliary_loss_clip": 0.01106691, "auxiliary_loss_mlp": 0.01027858, "balance_loss_clip": 1.01549602, "balance_loss_mlp": 1.03763247, "epoch": 0.8167743874943635, "flos": 21214983214080.0, "grad_norm": 2.6524098272468826, "language_loss": 0.74589103, "learning_rate": 3.4184984405905405e-07, "loss": 0.76723647, "num_input_tokens_seen": 293144620, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 13585, "time_per_iteration": 3.8727409839630127 }, { "auxiliary_loss_clip": 0.01104075, "auxiliary_loss_mlp": 0.01031958, "balance_loss_clip": 1.01973927, "balance_loss_mlp": 1.03522241, "epoch": 0.8168345107470314, "flos": 18697681244160.0, "grad_norm": 1.8090022036467497, "language_loss": 0.69631124, "learning_rate": 3.416321129478068e-07, "loss": 0.71767163, "num_input_tokens_seen": 293162850, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 13586, "time_per_iteration": 3.850475311279297 }, { "auxiliary_loss_clip": 0.0110417, "auxiliary_loss_mlp": 0.01032309, "balance_loss_clip": 1.02132392, "balance_loss_mlp": 1.03600478, "epoch": 0.8168946339996994, "flos": 16253098358400.0, "grad_norm": 1.5995525316797605, "language_loss": 0.60831332, "learning_rate": 3.4141444472203594e-07, "loss": 0.62967813, "num_input_tokens_seen": 293181620, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.6796875, "step": 13587, "time_per_iteration": 2.4157297611236572 }, { "auxiliary_loss_clip": 0.01107095, "auxiliary_loss_mlp": 0.01030031, "balance_loss_clip": 1.01795554, "balance_loss_mlp": 1.03563881, "epoch": 0.8169547572523673, "flos": 26941585645440.0, "grad_norm": 11.951119287863083, "language_loss": 0.70036721, "learning_rate": 3.4119683938999624e-07, "loss": 0.72173846, "num_input_tokens_seen": 293200270, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.71484375, "step": 13588, "time_per_iteration": 2.4961421489715576 }, { "auxiliary_loss_clip": 0.01106778, "auxiliary_loss_mlp": 0.01034827, "balance_loss_clip": 1.02117729, "balance_loss_mlp": 1.03654659, "epoch": 0.8170148805050353, "flos": 18952323736320.0, "grad_norm": 1.8232986860237435, "language_loss": 0.73101151, "learning_rate": 3.4097929695993854e-07, "loss": 0.75242758, "num_input_tokens_seen": 293218960, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 13589, "time_per_iteration": 3.8500585556030273 }, { "auxiliary_loss_clip": 0.01102968, "auxiliary_loss_mlp": 0.01033797, "balance_loss_clip": 1.02188206, "balance_loss_mlp": 1.0355823, "epoch": 0.8170750037577033, "flos": 21834909066240.0, "grad_norm": 1.864613481447253, "language_loss": 0.73388422, "learning_rate": 3.4076181744011166e-07, "loss": 0.75525188, "num_input_tokens_seen": 293236450, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.67578125, "step": 13590, "time_per_iteration": 3.881326913833618 }, { "auxiliary_loss_clip": 0.0110823, "auxiliary_loss_mlp": 0.01031914, "balance_loss_clip": 1.01846743, "balance_loss_mlp": 1.0359261, "epoch": 0.8171351270103713, "flos": 33507169021440.0, "grad_norm": 2.01551522540486, "language_loss": 0.65229237, "learning_rate": 3.4054440083876345e-07, "loss": 0.67369384, "num_input_tokens_seen": 293256480, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.72265625, "step": 13591, "time_per_iteration": 2.5432040691375732 }, { "auxiliary_loss_clip": 0.01106237, "auxiliary_loss_mlp": 0.01033007, "balance_loss_clip": 1.02065074, "balance_loss_mlp": 1.03470707, "epoch": 0.8171952502630392, "flos": 22708184520960.0, "grad_norm": 2.0628756068040697, "language_loss": 0.67819965, "learning_rate": 3.403270471641373e-07, "loss": 0.69959211, "num_input_tokens_seen": 293274960, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71484375, "step": 13592, "time_per_iteration": 2.4551877975463867 }, { "auxiliary_loss_clip": 0.01105601, "auxiliary_loss_mlp": 0.01029088, "balance_loss_clip": 1.01648784, "balance_loss_mlp": 1.0353595, "epoch": 0.8172553735157072, "flos": 26723715701760.0, "grad_norm": 1.7209355451954549, "language_loss": 0.66185862, "learning_rate": 3.401097564244759e-07, "loss": 0.68320549, "num_input_tokens_seen": 293295945, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 13593, "time_per_iteration": 2.499786615371704 }, { "auxiliary_loss_clip": 0.01102946, "auxiliary_loss_mlp": 0.01031117, "balance_loss_clip": 1.01919639, "balance_loss_mlp": 1.03422773, "epoch": 0.8173154967683751, "flos": 15961072786560.0, "grad_norm": 2.0279092214848173, "language_loss": 0.69577813, "learning_rate": 3.398925286280188e-07, "loss": 0.71711886, "num_input_tokens_seen": 293313300, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 13594, "time_per_iteration": 2.4665071964263916 }, { "auxiliary_loss_clip": 0.01106555, "auxiliary_loss_mlp": 0.01032941, "balance_loss_clip": 1.02115738, "balance_loss_mlp": 1.03563857, "epoch": 0.8173756200210431, "flos": 25986720447360.0, "grad_norm": 1.947210299140125, "language_loss": 0.65920281, "learning_rate": 3.3967536378300456e-07, "loss": 0.68059778, "num_input_tokens_seen": 293333085, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.70703125, "step": 13595, "time_per_iteration": 2.5830276012420654 }, { "auxiliary_loss_clip": 0.01106188, "auxiliary_loss_mlp": 0.0102896, "balance_loss_clip": 1.01643133, "balance_loss_mlp": 1.03472376, "epoch": 0.8174357432737112, "flos": 25664422688640.0, "grad_norm": 2.757965258758585, "language_loss": 0.78758496, "learning_rate": 3.394582618976658e-07, "loss": 0.80893648, "num_input_tokens_seen": 293351895, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71484375, "step": 13596, "time_per_iteration": 2.5236012935638428 }, { "auxiliary_loss_clip": 0.0110179, "auxiliary_loss_mlp": 0.0102647, "balance_loss_clip": 1.01401901, "balance_loss_mlp": 1.03388298, "epoch": 0.8174958665263791, "flos": 21835088634240.0, "grad_norm": 5.162194509177315, "language_loss": 0.5881564, "learning_rate": 3.392412229802362e-07, "loss": 0.60943902, "num_input_tokens_seen": 293371165, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6796875, "step": 13597, "time_per_iteration": 2.4824557304382324 }, { "auxiliary_loss_clip": 0.0109994, "auxiliary_loss_mlp": 0.01029705, "balance_loss_clip": 1.01783216, "balance_loss_mlp": 1.03293037, "epoch": 0.8175559897790471, "flos": 22455517276800.0, "grad_norm": 1.7290794338338489, "language_loss": 0.82552886, "learning_rate": 3.390242470389462e-07, "loss": 0.84682536, "num_input_tokens_seen": 293391150, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 13598, "time_per_iteration": 2.484541893005371 }, { "auxiliary_loss_clip": 0.01106465, "auxiliary_loss_mlp": 0.01030129, "balance_loss_clip": 1.0181725, "balance_loss_mlp": 1.03601384, "epoch": 0.817616113031715, "flos": 23615790399360.0, "grad_norm": 1.9233396716536706, "language_loss": 0.82977605, "learning_rate": 3.3880733408202277e-07, "loss": 0.85114199, "num_input_tokens_seen": 293409440, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.703125, "step": 13599, "time_per_iteration": 2.4626834392547607 }, { "auxiliary_loss_clip": 0.01104645, "auxiliary_loss_mlp": 0.01027941, "balance_loss_clip": 1.01609206, "balance_loss_mlp": 1.03605223, "epoch": 0.817676236284383, "flos": 27672260106240.0, "grad_norm": 1.7428508703716492, "language_loss": 0.84000194, "learning_rate": 3.3859048411769186e-07, "loss": 0.86132783, "num_input_tokens_seen": 293428995, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 13600, "time_per_iteration": 2.522289276123047 }, { "auxiliary_loss_clip": 0.01105137, "auxiliary_loss_mlp": 0.01032132, "balance_loss_clip": 1.0197463, "balance_loss_mlp": 1.03514957, "epoch": 0.8177363595370509, "flos": 24681009156480.0, "grad_norm": 1.7813107705330398, "language_loss": 0.73938155, "learning_rate": 3.383736971541766e-07, "loss": 0.76075429, "num_input_tokens_seen": 293449155, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 13601, "time_per_iteration": 2.476794958114624 }, { "auxiliary_loss_clip": 0.01108273, "auxiliary_loss_mlp": 0.01030772, "balance_loss_clip": 1.01814163, "balance_loss_mlp": 1.03582883, "epoch": 0.817796482789719, "flos": 17346326745600.0, "grad_norm": 2.1112739009783903, "language_loss": 0.68420571, "learning_rate": 3.3815697319969737e-07, "loss": 0.70559615, "num_input_tokens_seen": 293466125, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 13602, "time_per_iteration": 2.448198080062866 }, { "auxiliary_loss_clip": 0.01101607, "auxiliary_loss_mlp": 0.01030693, "balance_loss_clip": 1.01907659, "balance_loss_mlp": 1.03443885, "epoch": 0.8178566060423869, "flos": 17778475272960.0, "grad_norm": 2.231335743330526, "language_loss": 0.83552301, "learning_rate": 3.379403122624718e-07, "loss": 0.85684603, "num_input_tokens_seen": 293481345, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 13603, "time_per_iteration": 2.3956973552703857 }, { "auxiliary_loss_clip": 0.01105124, "auxiliary_loss_mlp": 0.01027885, "balance_loss_clip": 1.01592803, "balance_loss_mlp": 1.03576648, "epoch": 0.8179167292950549, "flos": 24973250209920.0, "grad_norm": 2.1340656242035494, "language_loss": 0.69175422, "learning_rate": 3.377237143507159e-07, "loss": 0.71308428, "num_input_tokens_seen": 293502330, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 13604, "time_per_iteration": 2.4732065200805664 }, { "auxiliary_loss_clip": 0.01105913, "auxiliary_loss_mlp": 0.0103565, "balance_loss_clip": 1.02305496, "balance_loss_mlp": 1.03735828, "epoch": 0.8179768525477228, "flos": 22856783086080.0, "grad_norm": 1.6681027865789453, "language_loss": 0.73975444, "learning_rate": 3.3750717947264406e-07, "loss": 0.76117009, "num_input_tokens_seen": 293521415, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 13605, "time_per_iteration": 2.5164124965667725 }, { "auxiliary_loss_clip": 0.0110392, "auxiliary_loss_mlp": 0.01041517, "balance_loss_clip": 1.0287317, "balance_loss_mlp": 1.03689075, "epoch": 0.8180369758003908, "flos": 18515147304960.0, "grad_norm": 1.9840652733795052, "language_loss": 0.74222791, "learning_rate": 3.372907076364666e-07, "loss": 0.76368231, "num_input_tokens_seen": 293539245, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.671875, "step": 13606, "time_per_iteration": 2.456331968307495 }, { "auxiliary_loss_clip": 0.01102808, "auxiliary_loss_mlp": 0.01026471, "balance_loss_clip": 1.01469326, "balance_loss_mlp": 1.03537083, "epoch": 0.8180970990530587, "flos": 33182105915520.0, "grad_norm": 2.130271819359328, "language_loss": 0.65336424, "learning_rate": 3.370742988503916e-07, "loss": 0.67465699, "num_input_tokens_seen": 293560640, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.671875, "step": 13607, "time_per_iteration": 2.597027063369751 }, { "auxiliary_loss_clip": 0.01105012, "auxiliary_loss_mlp": 0.01031139, "balance_loss_clip": 1.01885402, "balance_loss_mlp": 1.03560448, "epoch": 0.8181572223057267, "flos": 25010022758400.0, "grad_norm": 1.8311242197779065, "language_loss": 0.7038517, "learning_rate": 3.3685795312262634e-07, "loss": 0.72521323, "num_input_tokens_seen": 293579465, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 13608, "time_per_iteration": 2.5463080406188965 }, { "auxiliary_loss_clip": 0.01102679, "auxiliary_loss_mlp": 0.01034756, "balance_loss_clip": 1.02274525, "balance_loss_mlp": 1.03445101, "epoch": 0.8182173455583948, "flos": 28548731871360.0, "grad_norm": 2.965308970653148, "language_loss": 0.79912668, "learning_rate": 3.366416704613735e-07, "loss": 0.82050109, "num_input_tokens_seen": 293600540, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 13609, "time_per_iteration": 2.539053440093994 }, { "auxiliary_loss_clip": 0.01028692, "auxiliary_loss_mlp": 0.01003515, "balance_loss_clip": 1.00256109, "balance_loss_mlp": 1.00636089, "epoch": 0.8182774688110627, "flos": 72028043245440.0, "grad_norm": 0.7397880575996743, "language_loss": 0.55879045, "learning_rate": 3.3642545087483544e-07, "loss": 0.57911253, "num_input_tokens_seen": 293665160, "router_z_loss_clip": 0.00952148, "router_z_loss_mlp": 0.22363281, "step": 13610, "time_per_iteration": 3.1728310585021973 }, { "auxiliary_loss_clip": 0.01099906, "auxiliary_loss_mlp": 0.0102859, "balance_loss_clip": 1.01723576, "balance_loss_mlp": 1.03370488, "epoch": 0.8183375920637307, "flos": 19755358145280.0, "grad_norm": 2.2229847517246335, "language_loss": 0.77830291, "learning_rate": 3.362092943712107e-07, "loss": 0.79958785, "num_input_tokens_seen": 293683995, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6640625, "step": 13611, "time_per_iteration": 2.475464344024658 }, { "auxiliary_loss_clip": 0.0110749, "auxiliary_loss_mlp": 0.01034153, "balance_loss_clip": 1.02045631, "balance_loss_mlp": 1.03413248, "epoch": 0.8183977153163986, "flos": 22341895580160.0, "grad_norm": 2.6557949520640856, "language_loss": 0.77363884, "learning_rate": 3.3599320095869745e-07, "loss": 0.79505527, "num_input_tokens_seen": 293704115, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 13612, "time_per_iteration": 2.507406234741211 }, { "auxiliary_loss_clip": 0.01098888, "auxiliary_loss_mlp": 0.01023735, "balance_loss_clip": 1.01211226, "balance_loss_mlp": 1.03317904, "epoch": 0.8184578385690666, "flos": 17712472032000.0, "grad_norm": 2.0295736821827233, "language_loss": 0.86475599, "learning_rate": 3.3577717064548793e-07, "loss": 0.88598222, "num_input_tokens_seen": 293722225, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.65625, "step": 13613, "time_per_iteration": 2.4416399002075195 }, { "auxiliary_loss_clip": 0.01104649, "auxiliary_loss_mlp": 0.01039273, "balance_loss_clip": 1.02785301, "balance_loss_mlp": 1.03670311, "epoch": 0.8185179618217345, "flos": 25701159323520.0, "grad_norm": 1.619439685197243, "language_loss": 0.72505283, "learning_rate": 3.355612034397746e-07, "loss": 0.74649203, "num_input_tokens_seen": 293743995, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 13614, "time_per_iteration": 2.495727777481079 }, { "auxiliary_loss_clip": 0.01106073, "auxiliary_loss_mlp": 0.01035652, "balance_loss_clip": 1.02328992, "balance_loss_mlp": 1.03635967, "epoch": 0.8185780850744026, "flos": 25960326929280.0, "grad_norm": 1.857467142904582, "language_loss": 0.80954123, "learning_rate": 3.353452993497479e-07, "loss": 0.83095855, "num_input_tokens_seen": 293764935, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 13615, "time_per_iteration": 2.4926788806915283 }, { "auxiliary_loss_clip": 0.01102699, "auxiliary_loss_mlp": 0.0103078, "balance_loss_clip": 1.01838851, "balance_loss_mlp": 1.03439271, "epoch": 0.8186382083270705, "flos": 25228431406080.0, "grad_norm": 1.9460595418781208, "language_loss": 0.75714517, "learning_rate": 3.3512945838359375e-07, "loss": 0.77848005, "num_input_tokens_seen": 293784035, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 13616, "time_per_iteration": 2.501102924346924 }, { "auxiliary_loss_clip": 0.0110028, "auxiliary_loss_mlp": 0.01029606, "balance_loss_clip": 1.01724994, "balance_loss_mlp": 1.03344452, "epoch": 0.8186983315797385, "flos": 22415009713920.0, "grad_norm": 1.9931253774577804, "language_loss": 0.752038, "learning_rate": 3.349136805494979e-07, "loss": 0.77333683, "num_input_tokens_seen": 293803360, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.66796875, "step": 13617, "time_per_iteration": 2.498430013656616 }, { "auxiliary_loss_clip": 0.01098733, "auxiliary_loss_mlp": 0.01027789, "balance_loss_clip": 1.01678574, "balance_loss_mlp": 1.03189945, "epoch": 0.8187584548324064, "flos": 22018017623040.0, "grad_norm": 1.9603992106466535, "language_loss": 0.68529201, "learning_rate": 3.346979658556415e-07, "loss": 0.70655715, "num_input_tokens_seen": 293821325, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.66796875, "step": 13618, "time_per_iteration": 2.476137161254883 }, { "auxiliary_loss_clip": 0.01108524, "auxiliary_loss_mlp": 0.01034094, "balance_loss_clip": 1.02105212, "balance_loss_mlp": 1.03651392, "epoch": 0.8188185780850744, "flos": 29241664116480.0, "grad_norm": 2.3055028575322414, "language_loss": 0.69835269, "learning_rate": 3.344823143102058e-07, "loss": 0.7197789, "num_input_tokens_seen": 293840315, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 13619, "time_per_iteration": 2.54341459274292 }, { "auxiliary_loss_clip": 0.01107506, "auxiliary_loss_mlp": 0.01028198, "balance_loss_clip": 1.01565111, "balance_loss_mlp": 1.03753519, "epoch": 0.8188787013377423, "flos": 20696504348160.0, "grad_norm": 1.949394306141181, "language_loss": 0.74002111, "learning_rate": 3.3426672592136694e-07, "loss": 0.76137817, "num_input_tokens_seen": 293855685, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 13620, "time_per_iteration": 2.4696834087371826 }, { "auxiliary_loss_clip": 0.01100947, "auxiliary_loss_mlp": 0.01027241, "balance_loss_clip": 1.01548135, "balance_loss_mlp": 1.03378081, "epoch": 0.8189388245904103, "flos": 23732967542400.0, "grad_norm": 1.683971248415393, "language_loss": 0.76420522, "learning_rate": 3.340512006973011e-07, "loss": 0.78548706, "num_input_tokens_seen": 293875540, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.671875, "step": 13621, "time_per_iteration": 2.473801612854004 }, { "auxiliary_loss_clip": 0.01102678, "auxiliary_loss_mlp": 0.01030011, "balance_loss_clip": 1.01744628, "balance_loss_mlp": 1.0349474, "epoch": 0.8189989478430784, "flos": 28255090187520.0, "grad_norm": 2.1397827114914247, "language_loss": 0.65630215, "learning_rate": 3.3383573864618076e-07, "loss": 0.67762905, "num_input_tokens_seen": 293896570, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.67578125, "step": 13622, "time_per_iteration": 2.560603141784668 }, { "auxiliary_loss_clip": 0.01106708, "auxiliary_loss_mlp": 0.01028621, "balance_loss_clip": 1.01558542, "balance_loss_mlp": 1.03701711, "epoch": 0.8190590710957463, "flos": 21397696721280.0, "grad_norm": 1.9895974954122164, "language_loss": 0.75309724, "learning_rate": 3.3362033977617653e-07, "loss": 0.77445054, "num_input_tokens_seen": 293914680, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 13623, "time_per_iteration": 2.5415141582489014 }, { "auxiliary_loss_clip": 0.01105608, "auxiliary_loss_mlp": 0.0103464, "balance_loss_clip": 1.02185464, "balance_loss_mlp": 1.03596687, "epoch": 0.8191191943484143, "flos": 38796451367040.0, "grad_norm": 2.194743463963587, "language_loss": 0.63405281, "learning_rate": 3.3340500409545527e-07, "loss": 0.65545529, "num_input_tokens_seen": 293936480, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 13624, "time_per_iteration": 2.6122372150421143 }, { "auxiliary_loss_clip": 0.01101715, "auxiliary_loss_mlp": 0.01033605, "balance_loss_clip": 1.02176738, "balance_loss_mlp": 1.03424203, "epoch": 0.8191793176010822, "flos": 25446516831360.0, "grad_norm": 2.326490779166869, "language_loss": 0.78185654, "learning_rate": 3.3318973161218386e-07, "loss": 0.80320972, "num_input_tokens_seen": 293957815, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.67578125, "step": 13625, "time_per_iteration": 2.487999439239502 }, { "auxiliary_loss_clip": 0.01109279, "auxiliary_loss_mlp": 0.01031276, "balance_loss_clip": 1.01847875, "balance_loss_mlp": 1.03561699, "epoch": 0.8192394408537502, "flos": 25083029151360.0, "grad_norm": 2.888320739458254, "language_loss": 0.76294577, "learning_rate": 3.329745223345244e-07, "loss": 0.78435135, "num_input_tokens_seen": 293975440, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.73828125, "step": 13626, "time_per_iteration": 2.5052685737609863 }, { "auxiliary_loss_clip": 0.01102947, "auxiliary_loss_mlp": 0.01033694, "balance_loss_clip": 1.02254236, "balance_loss_mlp": 1.03577602, "epoch": 0.8192995641064181, "flos": 27673732563840.0, "grad_norm": 1.7152769114399884, "language_loss": 0.73594481, "learning_rate": 3.3275937627063823e-07, "loss": 0.75731122, "num_input_tokens_seen": 293997540, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.671875, "step": 13627, "time_per_iteration": 3.853837013244629 }, { "auxiliary_loss_clip": 0.01105603, "auxiliary_loss_mlp": 0.01031885, "balance_loss_clip": 1.01946986, "balance_loss_mlp": 1.03577125, "epoch": 0.8193596873590862, "flos": 21288492397440.0, "grad_norm": 1.8566386161583057, "language_loss": 0.68982708, "learning_rate": 3.3254429342868353e-07, "loss": 0.71120203, "num_input_tokens_seen": 294017030, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 13628, "time_per_iteration": 3.8135111331939697 }, { "auxiliary_loss_clip": 0.01109784, "auxiliary_loss_mlp": 0.01032794, "balance_loss_clip": 1.02015781, "balance_loss_mlp": 1.03786397, "epoch": 0.8194198106117541, "flos": 17492626840320.0, "grad_norm": 2.738310837257061, "language_loss": 0.85476589, "learning_rate": 3.323292738168171e-07, "loss": 0.87619174, "num_input_tokens_seen": 294035700, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 13629, "time_per_iteration": 2.4435737133026123 }, { "auxiliary_loss_clip": 0.01103036, "auxiliary_loss_mlp": 0.01024969, "balance_loss_clip": 1.01327419, "balance_loss_mlp": 1.03527069, "epoch": 0.8194799338644221, "flos": 15267925059840.0, "grad_norm": 2.2633933107788455, "language_loss": 0.74097806, "learning_rate": 3.3211431744319084e-07, "loss": 0.76225817, "num_input_tokens_seen": 294049730, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.67578125, "step": 13630, "time_per_iteration": 3.7784833908081055 }, { "auxiliary_loss_clip": 0.01106227, "auxiliary_loss_mlp": 0.01030205, "balance_loss_clip": 1.01829565, "balance_loss_mlp": 1.03631699, "epoch": 0.81954005711709, "flos": 14718814871040.0, "grad_norm": 2.3054556682295315, "language_loss": 0.72465366, "learning_rate": 3.31899424315957e-07, "loss": 0.74601793, "num_input_tokens_seen": 294066545, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69921875, "step": 13631, "time_per_iteration": 2.4328808784484863 }, { "auxiliary_loss_clip": 0.01103004, "auxiliary_loss_mlp": 0.01033415, "balance_loss_clip": 1.02168489, "balance_loss_mlp": 1.03347909, "epoch": 0.819600180369758, "flos": 23074042498560.0, "grad_norm": 2.1088971653103554, "language_loss": 0.76454377, "learning_rate": 3.3168459444326447e-07, "loss": 0.78590792, "num_input_tokens_seen": 294087455, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6953125, "step": 13632, "time_per_iteration": 3.90183687210083 }, { "auxiliary_loss_clip": 0.01101322, "auxiliary_loss_mlp": 0.01029481, "balance_loss_clip": 1.01878774, "balance_loss_mlp": 1.03348505, "epoch": 0.8196603036224259, "flos": 27599792417280.0, "grad_norm": 1.794431730348963, "language_loss": 0.65574598, "learning_rate": 3.314698278332588e-07, "loss": 0.67705399, "num_input_tokens_seen": 294107480, "router_z_loss_clip": 0.10693359, "router_z_loss_mlp": 0.6796875, "step": 13633, "time_per_iteration": 2.5700316429138184 }, { "auxiliary_loss_clip": 0.01101072, "auxiliary_loss_mlp": 0.0103581, "balance_loss_clip": 1.02469945, "balance_loss_mlp": 1.03506231, "epoch": 0.8197204268750939, "flos": 28582020800640.0, "grad_norm": 1.4865434551951975, "language_loss": 0.75626212, "learning_rate": 3.3125512449408513e-07, "loss": 0.77763093, "num_input_tokens_seen": 294130115, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.66015625, "step": 13634, "time_per_iteration": 2.5477499961853027 }, { "auxiliary_loss_clip": 0.01101474, "auxiliary_loss_mlp": 0.01029475, "balance_loss_clip": 1.01825106, "balance_loss_mlp": 1.03455114, "epoch": 0.819780550127762, "flos": 23258300290560.0, "grad_norm": 2.037704462142037, "language_loss": 0.82211494, "learning_rate": 3.310404844338841e-07, "loss": 0.84342444, "num_input_tokens_seen": 294148495, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.671875, "step": 13635, "time_per_iteration": 2.474597692489624 }, { "auxiliary_loss_clip": 0.01103354, "auxiliary_loss_mlp": 0.01031888, "balance_loss_clip": 1.01917481, "balance_loss_mlp": 1.03402579, "epoch": 0.8198406733804299, "flos": 26685255214080.0, "grad_norm": 1.8591825743650159, "language_loss": 0.75749993, "learning_rate": 3.308259076607949e-07, "loss": 0.77885234, "num_input_tokens_seen": 294169595, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 13636, "time_per_iteration": 2.5067644119262695 }, { "auxiliary_loss_clip": 0.01100866, "auxiliary_loss_mlp": 0.01035756, "balance_loss_clip": 1.02323937, "balance_loss_mlp": 1.03359973, "epoch": 0.8199007966330979, "flos": 20084084438400.0, "grad_norm": 2.1143158802775517, "language_loss": 0.81499529, "learning_rate": 3.3061139418295445e-07, "loss": 0.83636147, "num_input_tokens_seen": 294183885, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.671875, "step": 13637, "time_per_iteration": 2.443565607070923 }, { "auxiliary_loss_clip": 0.01104718, "auxiliary_loss_mlp": 0.01033586, "balance_loss_clip": 1.02161169, "balance_loss_mlp": 1.03681993, "epoch": 0.8199609198857658, "flos": 31902788142720.0, "grad_norm": 2.5957325988624196, "language_loss": 0.71527511, "learning_rate": 3.3039694400849725e-07, "loss": 0.73665816, "num_input_tokens_seen": 294200150, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 13638, "time_per_iteration": 2.5372848510742188 }, { "auxiliary_loss_clip": 0.0110819, "auxiliary_loss_mlp": 0.01033817, "balance_loss_clip": 1.01984572, "balance_loss_mlp": 1.03600872, "epoch": 0.8200210431384338, "flos": 26470150617600.0, "grad_norm": 1.9316069008736747, "language_loss": 0.7951417, "learning_rate": 3.3018255714555564e-07, "loss": 0.81656182, "num_input_tokens_seen": 294220385, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.72265625, "step": 13639, "time_per_iteration": 2.509073495864868 }, { "auxiliary_loss_clip": 0.01102567, "auxiliary_loss_mlp": 0.01028878, "balance_loss_clip": 1.01744008, "balance_loss_mlp": 1.0346632, "epoch": 0.8200811663911017, "flos": 22091454979200.0, "grad_norm": 1.8999823753057348, "language_loss": 0.7929002, "learning_rate": 3.299682336022589e-07, "loss": 0.81421471, "num_input_tokens_seen": 294239355, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 13640, "time_per_iteration": 2.518491744995117 }, { "auxiliary_loss_clip": 0.01108827, "auxiliary_loss_mlp": 0.01032851, "balance_loss_clip": 1.02043521, "balance_loss_mlp": 1.03603554, "epoch": 0.8201412896437698, "flos": 37593659520000.0, "grad_norm": 1.8275208493348871, "language_loss": 0.63417876, "learning_rate": 3.297539733867336e-07, "loss": 0.65559554, "num_input_tokens_seen": 294259395, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7265625, "step": 13641, "time_per_iteration": 2.6045448780059814 }, { "auxiliary_loss_clip": 0.01104876, "auxiliary_loss_mlp": 0.01031578, "balance_loss_clip": 1.01941836, "balance_loss_mlp": 1.03632987, "epoch": 0.8202014128964377, "flos": 19646333389440.0, "grad_norm": 1.9139311685189002, "language_loss": 0.7327106, "learning_rate": 3.295397765071055e-07, "loss": 0.75407517, "num_input_tokens_seen": 294277365, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.68359375, "step": 13642, "time_per_iteration": 2.4755799770355225 }, { "auxiliary_loss_clip": 0.01105168, "auxiliary_loss_mlp": 0.0103161, "balance_loss_clip": 1.01990342, "balance_loss_mlp": 1.03718567, "epoch": 0.8202615361491057, "flos": 31467335564160.0, "grad_norm": 1.7584604797563874, "language_loss": 0.70188373, "learning_rate": 3.2932564297149615e-07, "loss": 0.72325152, "num_input_tokens_seen": 294297555, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 13643, "time_per_iteration": 2.522822380065918 }, { "auxiliary_loss_clip": 0.01103127, "auxiliary_loss_mlp": 0.01031657, "balance_loss_clip": 1.01977241, "balance_loss_mlp": 1.03563547, "epoch": 0.8203216594017736, "flos": 24715555061760.0, "grad_norm": 1.9411900975505108, "language_loss": 0.65729165, "learning_rate": 3.291115727880256e-07, "loss": 0.67863947, "num_input_tokens_seen": 294317600, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.67578125, "step": 13644, "time_per_iteration": 2.498379945755005 }, { "auxiliary_loss_clip": 0.01105535, "auxiliary_loss_mlp": 0.01037036, "balance_loss_clip": 1.02516246, "balance_loss_mlp": 1.03541398, "epoch": 0.8203817826544416, "flos": 26031824951040.0, "grad_norm": 1.4931600016304856, "language_loss": 0.70704097, "learning_rate": 3.2889756596481234e-07, "loss": 0.72846663, "num_input_tokens_seen": 294340215, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.703125, "step": 13645, "time_per_iteration": 2.4952383041381836 }, { "auxiliary_loss_clip": 0.01102791, "auxiliary_loss_mlp": 0.01029884, "balance_loss_clip": 1.01802921, "balance_loss_mlp": 1.03565741, "epoch": 0.8204419059071095, "flos": 25954544839680.0, "grad_norm": 1.9268513644553213, "language_loss": 0.7121526, "learning_rate": 3.286836225099707e-07, "loss": 0.73347926, "num_input_tokens_seen": 294358590, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 13646, "time_per_iteration": 2.5279195308685303 }, { "auxiliary_loss_clip": 0.0110616, "auxiliary_loss_mlp": 0.01029645, "balance_loss_clip": 1.01742005, "balance_loss_mlp": 1.03660786, "epoch": 0.8205020291597775, "flos": 23580059345280.0, "grad_norm": 8.367652180096943, "language_loss": 0.78401381, "learning_rate": 3.284697424316132e-07, "loss": 0.80537188, "num_input_tokens_seen": 294375825, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 13647, "time_per_iteration": 2.442756175994873 }, { "auxiliary_loss_clip": 0.01100549, "auxiliary_loss_mlp": 0.01033041, "balance_loss_clip": 1.02165079, "balance_loss_mlp": 1.03551936, "epoch": 0.8205621524124456, "flos": 26799164219520.0, "grad_norm": 1.4150587284286222, "language_loss": 0.676301, "learning_rate": 3.2825592573785034e-07, "loss": 0.6976369, "num_input_tokens_seen": 294398500, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6484375, "step": 13648, "time_per_iteration": 2.4965288639068604 }, { "auxiliary_loss_clip": 0.01103132, "auxiliary_loss_mlp": 0.01028252, "balance_loss_clip": 1.01602161, "balance_loss_mlp": 1.03420615, "epoch": 0.8206222756651135, "flos": 27527863432320.0, "grad_norm": 2.831770998944804, "language_loss": 0.79859489, "learning_rate": 3.28042172436791e-07, "loss": 0.81990874, "num_input_tokens_seen": 294418840, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 13649, "time_per_iteration": 2.50939679145813 }, { "auxiliary_loss_clip": 0.01107128, "auxiliary_loss_mlp": 0.0103236, "balance_loss_clip": 1.01893663, "balance_loss_mlp": 1.03784537, "epoch": 0.8206823989177815, "flos": 21178605715200.0, "grad_norm": 1.7059262099171724, "language_loss": 0.69141585, "learning_rate": 3.278284825365396e-07, "loss": 0.71281075, "num_input_tokens_seen": 294438215, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6953125, "step": 13650, "time_per_iteration": 2.4978151321411133 }, { "auxiliary_loss_clip": 0.01107157, "auxiliary_loss_mlp": 0.01033464, "balance_loss_clip": 1.02059555, "balance_loss_mlp": 1.0367924, "epoch": 0.8207425221704494, "flos": 11509622150400.0, "grad_norm": 2.276933898264626, "language_loss": 0.60481399, "learning_rate": 3.276148560452001e-07, "loss": 0.62622023, "num_input_tokens_seen": 294455260, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 13651, "time_per_iteration": 2.4260194301605225 }, { "auxiliary_loss_clip": 0.01108657, "auxiliary_loss_mlp": 0.01034571, "balance_loss_clip": 1.02183938, "balance_loss_mlp": 1.03740859, "epoch": 0.8208026454231174, "flos": 19791987039360.0, "grad_norm": 2.2588199612306967, "language_loss": 0.72369623, "learning_rate": 3.2740129297087293e-07, "loss": 0.74512857, "num_input_tokens_seen": 294473205, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 13652, "time_per_iteration": 2.4534902572631836 }, { "auxiliary_loss_clip": 0.01099649, "auxiliary_loss_mlp": 0.01027519, "balance_loss_clip": 1.01658177, "balance_loss_mlp": 1.03490353, "epoch": 0.8208627686757853, "flos": 15667538843520.0, "grad_norm": 4.268445718452963, "language_loss": 0.73145735, "learning_rate": 3.271877933216558e-07, "loss": 0.75272906, "num_input_tokens_seen": 294490645, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6484375, "step": 13653, "time_per_iteration": 2.405451774597168 }, { "auxiliary_loss_clip": 0.01110613, "auxiliary_loss_mlp": 0.01034728, "balance_loss_clip": 1.02131724, "balance_loss_mlp": 1.03762221, "epoch": 0.8209228919284534, "flos": 37482659516160.0, "grad_norm": 2.1845025053839326, "language_loss": 0.63258493, "learning_rate": 3.269743571056451e-07, "loss": 0.65403831, "num_input_tokens_seen": 294513500, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 13654, "time_per_iteration": 2.6002578735351562 }, { "auxiliary_loss_clip": 0.01104819, "auxiliary_loss_mlp": 0.01031801, "balance_loss_clip": 1.01934958, "balance_loss_mlp": 1.03391135, "epoch": 0.8209830151811213, "flos": 23112969863040.0, "grad_norm": 1.6378921312172936, "language_loss": 0.69858366, "learning_rate": 3.2676098433093447e-07, "loss": 0.71994984, "num_input_tokens_seen": 294535710, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7109375, "step": 13655, "time_per_iteration": 2.5025455951690674 }, { "auxiliary_loss_clip": 0.01103424, "auxiliary_loss_mlp": 0.01034274, "balance_loss_clip": 1.02263319, "balance_loss_mlp": 1.0359627, "epoch": 0.8210431384337893, "flos": 21288169175040.0, "grad_norm": 2.050204818819904, "language_loss": 0.82764566, "learning_rate": 3.265476750056162e-07, "loss": 0.84902263, "num_input_tokens_seen": 294554055, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 13656, "time_per_iteration": 2.47928786277771 }, { "auxiliary_loss_clip": 0.01101351, "auxiliary_loss_mlp": 0.01034825, "balance_loss_clip": 1.02274323, "balance_loss_mlp": 1.03601551, "epoch": 0.8211032616864572, "flos": 11502403516800.0, "grad_norm": 2.1967538685042225, "language_loss": 0.73562074, "learning_rate": 3.2633442913777654e-07, "loss": 0.75698251, "num_input_tokens_seen": 294570390, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.65234375, "step": 13657, "time_per_iteration": 2.4268572330474854 }, { "auxiliary_loss_clip": 0.01102717, "auxiliary_loss_mlp": 0.0103171, "balance_loss_clip": 1.01974118, "balance_loss_mlp": 1.03407836, "epoch": 0.8211633849391252, "flos": 29821477455360.0, "grad_norm": 3.2794314669314897, "language_loss": 0.55367988, "learning_rate": 3.2612124673550325e-07, "loss": 0.57502413, "num_input_tokens_seen": 294593050, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 13658, "time_per_iteration": 2.568758487701416 }, { "auxiliary_loss_clip": 0.01104709, "auxiliary_loss_mlp": 0.01034532, "balance_loss_clip": 1.02253401, "balance_loss_mlp": 1.03481936, "epoch": 0.8212235081917931, "flos": 13115439573120.0, "grad_norm": 2.2791713772767586, "language_loss": 0.78770864, "learning_rate": 3.259081278068805e-07, "loss": 0.80910105, "num_input_tokens_seen": 294608550, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 13659, "time_per_iteration": 2.426684617996216 }, { "auxiliary_loss_clip": 0.01098355, "auxiliary_loss_mlp": 0.01027984, "balance_loss_clip": 1.01705885, "balance_loss_mlp": 1.03328514, "epoch": 0.8212836314444611, "flos": 40515351782400.0, "grad_norm": 2.229749118162526, "language_loss": 0.59897703, "learning_rate": 3.256950723599887e-07, "loss": 0.62024045, "num_input_tokens_seen": 294630380, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6484375, "step": 13660, "time_per_iteration": 2.6388537883758545 }, { "auxiliary_loss_clip": 0.01106412, "auxiliary_loss_mlp": 0.0103033, "balance_loss_clip": 1.01728868, "balance_loss_mlp": 1.03604698, "epoch": 0.8213437546971292, "flos": 18770543982720.0, "grad_norm": 1.7695169047239834, "language_loss": 0.72867572, "learning_rate": 3.254820804029075e-07, "loss": 0.75004315, "num_input_tokens_seen": 294648655, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 13661, "time_per_iteration": 2.4279186725616455 }, { "auxiliary_loss_clip": 0.0110475, "auxiliary_loss_mlp": 0.0103056, "balance_loss_clip": 1.01810884, "balance_loss_mlp": 1.03347731, "epoch": 0.8214038779497971, "flos": 19682279925120.0, "grad_norm": 2.3217683941649407, "language_loss": 0.7502861, "learning_rate": 3.252691519437143e-07, "loss": 0.77163923, "num_input_tokens_seen": 294666915, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7109375, "step": 13662, "time_per_iteration": 2.4419479370117188 }, { "auxiliary_loss_clip": 0.01030183, "auxiliary_loss_mlp": 0.0100373, "balance_loss_clip": 1.0027709, "balance_loss_mlp": 1.00782752, "epoch": 0.8214640012024651, "flos": 71602969697280.0, "grad_norm": 0.7441536976996355, "language_loss": 0.5397948, "learning_rate": 3.250562869904825e-07, "loss": 0.56013393, "num_input_tokens_seen": 294731545, "router_z_loss_clip": 0.00958252, "router_z_loss_mlp": 0.22460938, "step": 13663, "time_per_iteration": 3.2191226482391357 }, { "auxiliary_loss_clip": 0.01102838, "auxiliary_loss_mlp": 0.01031288, "balance_loss_clip": 1.01935518, "balance_loss_mlp": 1.03382945, "epoch": 0.821524124455133, "flos": 14757203531520.0, "grad_norm": 2.1023524905100945, "language_loss": 0.65960395, "learning_rate": 3.248434855512838e-07, "loss": 0.68094516, "num_input_tokens_seen": 294748745, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 13664, "time_per_iteration": 2.434910297393799 }, { "auxiliary_loss_clip": 0.01102227, "auxiliary_loss_mlp": 0.0103023, "balance_loss_clip": 1.01907206, "balance_loss_mlp": 1.03599977, "epoch": 0.821584247707801, "flos": 25082274965760.0, "grad_norm": 1.57364506109805, "language_loss": 0.75262463, "learning_rate": 3.246307476341881e-07, "loss": 0.77394927, "num_input_tokens_seen": 294768955, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6640625, "step": 13665, "time_per_iteration": 2.4698269367218018 }, { "auxiliary_loss_clip": 0.0110464, "auxiliary_loss_mlp": 0.0103257, "balance_loss_clip": 1.02076185, "balance_loss_mlp": 1.03536987, "epoch": 0.8216443709604689, "flos": 36830701710720.0, "grad_norm": 3.0425795071510513, "language_loss": 0.65486991, "learning_rate": 3.2441807324726256e-07, "loss": 0.67624205, "num_input_tokens_seen": 294789250, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69140625, "step": 13666, "time_per_iteration": 2.563619375228882 }, { "auxiliary_loss_clip": 0.01103052, "auxiliary_loss_mlp": 0.01031524, "balance_loss_clip": 1.02008557, "balance_loss_mlp": 1.03575885, "epoch": 0.821704494213137, "flos": 25081808088960.0, "grad_norm": 1.6673517904471316, "language_loss": 0.7689662, "learning_rate": 3.2420546239857174e-07, "loss": 0.79031193, "num_input_tokens_seen": 294809760, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.671875, "step": 13667, "time_per_iteration": 2.4686267375946045 }, { "auxiliary_loss_clip": 0.01106664, "auxiliary_loss_mlp": 0.01036508, "balance_loss_clip": 1.02403235, "balance_loss_mlp": 1.03616226, "epoch": 0.8217646174658049, "flos": 14356117290240.0, "grad_norm": 1.937014434976987, "language_loss": 0.77417797, "learning_rate": 3.239929150961773e-07, "loss": 0.79560965, "num_input_tokens_seen": 294826495, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 13668, "time_per_iteration": 3.8594818115234375 }, { "auxiliary_loss_clip": 0.01103121, "auxiliary_loss_mlp": 0.0103175, "balance_loss_clip": 1.01974583, "balance_loss_mlp": 1.0349158, "epoch": 0.8218247407184729, "flos": 22090557139200.0, "grad_norm": 2.053408235730785, "language_loss": 0.73669344, "learning_rate": 3.2378043134813984e-07, "loss": 0.7580421, "num_input_tokens_seen": 294845370, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 13669, "time_per_iteration": 2.517902374267578 }, { "auxiliary_loss_clip": 0.01104044, "auxiliary_loss_mlp": 0.01025698, "balance_loss_clip": 1.01420069, "balance_loss_mlp": 1.03579175, "epoch": 0.8218848639711408, "flos": 16764035368320.0, "grad_norm": 1.8456389389469372, "language_loss": 0.78775132, "learning_rate": 3.235680111625161e-07, "loss": 0.80904871, "num_input_tokens_seen": 294863740, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 13670, "time_per_iteration": 3.7828872203826904 }, { "auxiliary_loss_clip": 0.01107853, "auxiliary_loss_mlp": 0.01038394, "balance_loss_clip": 1.0256567, "balance_loss_mlp": 1.03669488, "epoch": 0.8219449872238088, "flos": 25994801007360.0, "grad_norm": 1.7671011567179895, "language_loss": 0.74610263, "learning_rate": 3.2335565454736123e-07, "loss": 0.76756507, "num_input_tokens_seen": 294882815, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 13671, "time_per_iteration": 2.488459587097168 }, { "auxiliary_loss_clip": 0.01110278, "auxiliary_loss_mlp": 0.01029969, "balance_loss_clip": 1.01721358, "balance_loss_mlp": 1.03660345, "epoch": 0.8220051104764767, "flos": 20778094091520.0, "grad_norm": 1.941910659872597, "language_loss": 0.76732808, "learning_rate": 3.23143361510728e-07, "loss": 0.78873056, "num_input_tokens_seen": 294901985, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73828125, "step": 13672, "time_per_iteration": 3.8621184825897217 }, { "auxiliary_loss_clip": 0.01104003, "auxiliary_loss_mlp": 0.01035937, "balance_loss_clip": 1.02253807, "balance_loss_mlp": 1.03528214, "epoch": 0.8220652337291448, "flos": 14574849160320.0, "grad_norm": 2.2428128887777623, "language_loss": 0.74684501, "learning_rate": 3.2293113206066733e-07, "loss": 0.76824445, "num_input_tokens_seen": 294919705, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6875, "step": 13673, "time_per_iteration": 3.8562464714050293 }, { "auxiliary_loss_clip": 0.01106899, "auxiliary_loss_mlp": 0.01031768, "balance_loss_clip": 1.01923263, "balance_loss_mlp": 1.03610408, "epoch": 0.8221253569818128, "flos": 23805866194560.0, "grad_norm": 2.807267615192286, "language_loss": 0.7961933, "learning_rate": 3.227189662052254e-07, "loss": 0.81757998, "num_input_tokens_seen": 294939900, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 13674, "time_per_iteration": 2.5281057357788086 }, { "auxiliary_loss_clip": 0.01103564, "auxiliary_loss_mlp": 0.01034584, "balance_loss_clip": 1.02247798, "balance_loss_mlp": 1.0349462, "epoch": 0.8221854802344807, "flos": 21288241002240.0, "grad_norm": 1.951752388365855, "language_loss": 0.70290136, "learning_rate": 3.225068639524484e-07, "loss": 0.7242828, "num_input_tokens_seen": 294959110, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 13675, "time_per_iteration": 2.442126512527466 }, { "auxiliary_loss_clip": 0.01102833, "auxiliary_loss_mlp": 0.01036523, "balance_loss_clip": 1.02455449, "balance_loss_mlp": 1.03589106, "epoch": 0.8222456034871487, "flos": 20956785275520.0, "grad_norm": 1.956650503096692, "language_loss": 0.74303192, "learning_rate": 3.2229482531037965e-07, "loss": 0.76442552, "num_input_tokens_seen": 294978660, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.671875, "step": 13676, "time_per_iteration": 2.4671645164489746 }, { "auxiliary_loss_clip": 0.01104043, "auxiliary_loss_mlp": 0.01030445, "balance_loss_clip": 1.01912594, "balance_loss_mlp": 1.03526413, "epoch": 0.8223057267398166, "flos": 21397517153280.0, "grad_norm": 1.910453261958105, "language_loss": 0.80469501, "learning_rate": 3.2208285028705893e-07, "loss": 0.82603985, "num_input_tokens_seen": 294998075, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6875, "step": 13677, "time_per_iteration": 2.446277141571045 }, { "auxiliary_loss_clip": 0.01106416, "auxiliary_loss_mlp": 0.01033197, "balance_loss_clip": 1.02099609, "balance_loss_mlp": 1.03620934, "epoch": 0.8223658499924846, "flos": 15268212368640.0, "grad_norm": 1.8028212360357359, "language_loss": 0.70179498, "learning_rate": 3.218709388905245e-07, "loss": 0.72319108, "num_input_tokens_seen": 295015950, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 13678, "time_per_iteration": 2.4406111240386963 }, { "auxiliary_loss_clip": 0.01101029, "auxiliary_loss_mlp": 0.01034595, "balance_loss_clip": 1.02223861, "balance_loss_mlp": 1.03345859, "epoch": 0.8224259732451525, "flos": 31249537447680.0, "grad_norm": 1.6859383098480545, "language_loss": 0.71549696, "learning_rate": 3.216590911288133e-07, "loss": 0.73685318, "num_input_tokens_seen": 295036800, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.67578125, "step": 13679, "time_per_iteration": 2.543623447418213 }, { "auxiliary_loss_clip": 0.01101994, "auxiliary_loss_mlp": 0.01026384, "balance_loss_clip": 1.01440346, "balance_loss_mlp": 1.0339129, "epoch": 0.8224860964978206, "flos": 21574628138880.0, "grad_norm": 2.360941568989708, "language_loss": 0.69990039, "learning_rate": 3.214473070099564e-07, "loss": 0.72118413, "num_input_tokens_seen": 295055300, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 13680, "time_per_iteration": 2.4986460208892822 }, { "auxiliary_loss_clip": 0.01104758, "auxiliary_loss_mlp": 0.01030248, "balance_loss_clip": 1.01857138, "balance_loss_mlp": 1.03643799, "epoch": 0.8225462197504885, "flos": 25483217552640.0, "grad_norm": 1.7342000616644218, "language_loss": 0.59518725, "learning_rate": 3.21235586541986e-07, "loss": 0.61653733, "num_input_tokens_seen": 295076420, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.68359375, "step": 13681, "time_per_iteration": 2.485290288925171 }, { "auxiliary_loss_clip": 0.01106476, "auxiliary_loss_mlp": 0.0103522, "balance_loss_clip": 1.02270341, "balance_loss_mlp": 1.0343256, "epoch": 0.8226063430031565, "flos": 39385458587520.0, "grad_norm": 1.815761766284584, "language_loss": 0.69448555, "learning_rate": 3.2102392973293047e-07, "loss": 0.71590257, "num_input_tokens_seen": 295100540, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 13682, "time_per_iteration": 2.6130735874176025 }, { "auxiliary_loss_clip": 0.01105415, "auxiliary_loss_mlp": 0.01033033, "balance_loss_clip": 1.01976502, "balance_loss_mlp": 1.03525841, "epoch": 0.8226664662558244, "flos": 22815269942400.0, "grad_norm": 1.9157589826455106, "language_loss": 0.79696286, "learning_rate": 3.20812336590816e-07, "loss": 0.81834733, "num_input_tokens_seen": 295120180, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69921875, "step": 13683, "time_per_iteration": 2.4847044944763184 }, { "auxiliary_loss_clip": 0.01100377, "auxiliary_loss_mlp": 0.01027734, "balance_loss_clip": 1.0167191, "balance_loss_mlp": 1.03477812, "epoch": 0.8227265895084924, "flos": 25665607837440.0, "grad_norm": 2.14193397953022, "language_loss": 0.86811495, "learning_rate": 3.206008071236661e-07, "loss": 0.88939607, "num_input_tokens_seen": 295138530, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.65625, "step": 13684, "time_per_iteration": 2.48044490814209 }, { "auxiliary_loss_clip": 0.01101314, "auxiliary_loss_mlp": 0.01024788, "balance_loss_clip": 1.01317716, "balance_loss_mlp": 1.03514767, "epoch": 0.8227867127611603, "flos": 26179274280960.0, "grad_norm": 1.5532578307769451, "language_loss": 0.79971218, "learning_rate": 3.2038934133950157e-07, "loss": 0.82097328, "num_input_tokens_seen": 295160260, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6640625, "step": 13685, "time_per_iteration": 2.493671178817749 }, { "auxiliary_loss_clip": 0.0110333, "auxiliary_loss_mlp": 0.01030342, "balance_loss_clip": 1.01822424, "balance_loss_mlp": 1.03529465, "epoch": 0.8228468360138284, "flos": 22018053536640.0, "grad_norm": 2.4169844215307217, "language_loss": 0.68703699, "learning_rate": 3.2017793924634194e-07, "loss": 0.70837367, "num_input_tokens_seen": 295177055, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 13686, "time_per_iteration": 2.4591410160064697 }, { "auxiliary_loss_clip": 0.01104103, "auxiliary_loss_mlp": 0.01034251, "balance_loss_clip": 1.02173376, "balance_loss_mlp": 1.03361881, "epoch": 0.8229069592664963, "flos": 14903359971840.0, "grad_norm": 2.83545913966764, "language_loss": 0.78007758, "learning_rate": 3.1996660085220263e-07, "loss": 0.8014611, "num_input_tokens_seen": 295193870, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 13687, "time_per_iteration": 2.4191901683807373 }, { "auxiliary_loss_clip": 0.01103817, "auxiliary_loss_mlp": 0.0103039, "balance_loss_clip": 1.01761079, "balance_loss_mlp": 1.03458834, "epoch": 0.8229670825191643, "flos": 15669478177920.0, "grad_norm": 2.5965538893108633, "language_loss": 0.72696114, "learning_rate": 3.1975532616509825e-07, "loss": 0.74830323, "num_input_tokens_seen": 295211040, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 13688, "time_per_iteration": 2.4642465114593506 }, { "auxiliary_loss_clip": 0.01105099, "auxiliary_loss_mlp": 0.01032097, "balance_loss_clip": 1.01972318, "balance_loss_mlp": 1.03596044, "epoch": 0.8230272057718323, "flos": 23183498217600.0, "grad_norm": 1.6352957489518738, "language_loss": 0.7329306, "learning_rate": 3.1954411519304025e-07, "loss": 0.75430256, "num_input_tokens_seen": 295231300, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69140625, "step": 13689, "time_per_iteration": 2.4785783290863037 }, { "auxiliary_loss_clip": 0.01104754, "auxiliary_loss_mlp": 0.01033617, "balance_loss_clip": 1.02103424, "balance_loss_mlp": 1.03440607, "epoch": 0.8230873290245002, "flos": 21032413361280.0, "grad_norm": 1.9801732751505654, "language_loss": 0.69376004, "learning_rate": 3.1933296794403887e-07, "loss": 0.7151438, "num_input_tokens_seen": 295251045, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 13690, "time_per_iteration": 2.500805377960205 }, { "auxiliary_loss_clip": 0.01104806, "auxiliary_loss_mlp": 0.01033292, "balance_loss_clip": 1.02094817, "balance_loss_mlp": 1.03534532, "epoch": 0.8231474522771682, "flos": 21250139650560.0, "grad_norm": 2.179235547445205, "language_loss": 0.84958994, "learning_rate": 3.191218844260988e-07, "loss": 0.87097096, "num_input_tokens_seen": 295270225, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 13691, "time_per_iteration": 2.4497568607330322 }, { "auxiliary_loss_clip": 0.01104923, "auxiliary_loss_mlp": 0.01030224, "balance_loss_clip": 1.01876783, "balance_loss_mlp": 1.03556597, "epoch": 0.8232075755298361, "flos": 23842028211840.0, "grad_norm": 3.5675218378300664, "language_loss": 0.76986945, "learning_rate": 3.189108646472252e-07, "loss": 0.7912209, "num_input_tokens_seen": 295288950, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6953125, "step": 13692, "time_per_iteration": 2.539475679397583 }, { "auxiliary_loss_clip": 0.01102395, "auxiliary_loss_mlp": 0.01027053, "balance_loss_clip": 1.01485205, "balance_loss_mlp": 1.03468919, "epoch": 0.8232676987825042, "flos": 21653955325440.0, "grad_norm": 1.5555483322900596, "language_loss": 0.71593976, "learning_rate": 3.186999086154205e-07, "loss": 0.73723418, "num_input_tokens_seen": 295309405, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 13693, "time_per_iteration": 2.5097453594207764 }, { "auxiliary_loss_clip": 0.01098699, "auxiliary_loss_mlp": 0.01033171, "balance_loss_clip": 1.02258551, "balance_loss_mlp": 1.03327656, "epoch": 0.8233278220351721, "flos": 26322701287680.0, "grad_norm": 1.8777346812369191, "language_loss": 0.83750123, "learning_rate": 3.1848901633868355e-07, "loss": 0.8588199, "num_input_tokens_seen": 295331115, "router_z_loss_clip": 0.10595703, "router_z_loss_mlp": 0.65625, "step": 13694, "time_per_iteration": 2.515815019607544 }, { "auxiliary_loss_clip": 0.01104963, "auxiliary_loss_mlp": 0.01032845, "balance_loss_clip": 1.0199703, "balance_loss_mlp": 1.03516495, "epoch": 0.8233879452878401, "flos": 21725812483200.0, "grad_norm": 1.8298790485466907, "language_loss": 0.77114224, "learning_rate": 3.182781878250118e-07, "loss": 0.79252028, "num_input_tokens_seen": 295350495, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 13695, "time_per_iteration": 2.4440674781799316 }, { "auxiliary_loss_clip": 0.0110494, "auxiliary_loss_mlp": 0.0103507, "balance_loss_clip": 1.02335179, "balance_loss_mlp": 1.03657591, "epoch": 0.823448068540508, "flos": 20557746109440.0, "grad_norm": 2.1469423517630353, "language_loss": 0.80860186, "learning_rate": 3.1806742308239985e-07, "loss": 0.83000195, "num_input_tokens_seen": 295368225, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 13696, "time_per_iteration": 2.460850954055786 }, { "auxiliary_loss_clip": 0.01030326, "auxiliary_loss_mlp": 0.01001537, "balance_loss_clip": 1.00057125, "balance_loss_mlp": 1.00813341, "epoch": 0.823508191793176, "flos": 67273688194560.0, "grad_norm": 0.7668490585136811, "language_loss": 0.63859707, "learning_rate": 3.178567221188393e-07, "loss": 0.6589157, "num_input_tokens_seen": 295430035, "router_z_loss_clip": 0.00964355, "router_z_loss_mlp": 0.22265625, "step": 13697, "time_per_iteration": 3.1494882106781006 }, { "auxiliary_loss_clip": 0.01099646, "auxiliary_loss_mlp": 0.01026316, "balance_loss_clip": 1.01544392, "balance_loss_mlp": 1.03440595, "epoch": 0.8235683150458439, "flos": 17928402641280.0, "grad_norm": 1.6401031839521074, "language_loss": 0.72834402, "learning_rate": 3.1764608494232037e-07, "loss": 0.74960363, "num_input_tokens_seen": 295447765, "router_z_loss_clip": 0.10888672, "router_z_loss_mlp": 0.65234375, "step": 13698, "time_per_iteration": 2.4412057399749756 }, { "auxiliary_loss_clip": 0.01104508, "auxiliary_loss_mlp": 0.01028942, "balance_loss_clip": 1.01618075, "balance_loss_mlp": 1.03529716, "epoch": 0.823628438298512, "flos": 18916089891840.0, "grad_norm": 6.399860271364832, "language_loss": 0.71875244, "learning_rate": 3.174355115608305e-07, "loss": 0.74008691, "num_input_tokens_seen": 295464810, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 13699, "time_per_iteration": 2.427841901779175 }, { "auxiliary_loss_clip": 0.01103438, "auxiliary_loss_mlp": 0.01029875, "balance_loss_clip": 1.01801991, "balance_loss_mlp": 1.03577709, "epoch": 0.8236885615511799, "flos": 18696460181760.0, "grad_norm": 6.384540489829939, "language_loss": 0.82120419, "learning_rate": 3.1722500198235526e-07, "loss": 0.84253734, "num_input_tokens_seen": 295482605, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.67578125, "step": 13700, "time_per_iteration": 2.425589084625244 }, { "auxiliary_loss_clip": 0.01103704, "auxiliary_loss_mlp": 0.01036277, "balance_loss_clip": 1.02441549, "balance_loss_mlp": 1.03437471, "epoch": 0.8237486848038479, "flos": 23695009845120.0, "grad_norm": 2.645489255682321, "language_loss": 0.73030633, "learning_rate": 3.170145562148763e-07, "loss": 0.75170612, "num_input_tokens_seen": 295503780, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69140625, "step": 13701, "time_per_iteration": 2.461803913116455 }, { "auxiliary_loss_clip": 0.01104815, "auxiliary_loss_mlp": 0.01034758, "balance_loss_clip": 1.02210951, "balance_loss_mlp": 1.03424144, "epoch": 0.8238088080565159, "flos": 23441301106560.0, "grad_norm": 1.866666338573546, "language_loss": 0.69346595, "learning_rate": 3.1680417426637384e-07, "loss": 0.71486163, "num_input_tokens_seen": 295522035, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 13702, "time_per_iteration": 2.466301441192627 }, { "auxiliary_loss_clip": 0.01104332, "auxiliary_loss_mlp": 0.01032402, "balance_loss_clip": 1.02023625, "balance_loss_mlp": 1.03547549, "epoch": 0.8238689313091838, "flos": 22746537267840.0, "grad_norm": 2.262121825872804, "language_loss": 0.75069249, "learning_rate": 3.1659385614482603e-07, "loss": 0.7720598, "num_input_tokens_seen": 295541190, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 13703, "time_per_iteration": 2.444380044937134 }, { "auxiliary_loss_clip": 0.01109294, "auxiliary_loss_mlp": 0.01038859, "balance_loss_clip": 1.0251261, "balance_loss_mlp": 1.03586125, "epoch": 0.8239290545618518, "flos": 25630092264960.0, "grad_norm": 2.8819933052269104, "language_loss": 0.70050937, "learning_rate": 3.1638360185820755e-07, "loss": 0.72199094, "num_input_tokens_seen": 295558860, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 13704, "time_per_iteration": 2.476077079772949 }, { "auxiliary_loss_clip": 0.01100781, "auxiliary_loss_mlp": 0.01027416, "balance_loss_clip": 1.01559091, "balance_loss_mlp": 1.03335881, "epoch": 0.8239891778145197, "flos": 26026473824640.0, "grad_norm": 1.8234905117096187, "language_loss": 0.64062476, "learning_rate": 3.161734114144916e-07, "loss": 0.66190678, "num_input_tokens_seen": 295578155, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 13705, "time_per_iteration": 2.4830636978149414 }, { "auxiliary_loss_clip": 0.01105782, "auxiliary_loss_mlp": 0.01029435, "balance_loss_clip": 1.01665008, "balance_loss_mlp": 1.03501952, "epoch": 0.8240493010671878, "flos": 21833257040640.0, "grad_norm": 2.064205397319768, "language_loss": 0.69281501, "learning_rate": 3.1596328482164915e-07, "loss": 0.71416724, "num_input_tokens_seen": 295599170, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 13706, "time_per_iteration": 2.4612879753112793 }, { "auxiliary_loss_clip": 0.01106836, "auxiliary_loss_mlp": 0.01033689, "balance_loss_clip": 1.02074862, "balance_loss_mlp": 1.03736711, "epoch": 0.8241094243198557, "flos": 18551919853440.0, "grad_norm": 1.6956673775098967, "language_loss": 0.69744051, "learning_rate": 3.157532220876475e-07, "loss": 0.71884573, "num_input_tokens_seen": 295617465, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 13707, "time_per_iteration": 2.4271433353424072 }, { "auxiliary_loss_clip": 0.01104635, "auxiliary_loss_mlp": 0.01028924, "balance_loss_clip": 1.01675236, "balance_loss_mlp": 1.03476453, "epoch": 0.8241695475725237, "flos": 25447163276160.0, "grad_norm": 1.9071015656922912, "language_loss": 0.79142392, "learning_rate": 3.1554322322045226e-07, "loss": 0.81275946, "num_input_tokens_seen": 295634960, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 13708, "time_per_iteration": 2.494185447692871 }, { "auxiliary_loss_clip": 0.0110492, "auxiliary_loss_mlp": 0.01028921, "balance_loss_clip": 1.01633251, "balance_loss_mlp": 1.03457582, "epoch": 0.8242296708251916, "flos": 18989670902400.0, "grad_norm": 3.501108586057642, "language_loss": 0.68585503, "learning_rate": 3.1533328822802664e-07, "loss": 0.70719337, "num_input_tokens_seen": 295652725, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 13709, "time_per_iteration": 2.421225070953369 }, { "auxiliary_loss_clip": 0.01104396, "auxiliary_loss_mlp": 0.01033431, "balance_loss_clip": 1.02168322, "balance_loss_mlp": 1.0347296, "epoch": 0.8242897940778596, "flos": 22600883617920.0, "grad_norm": 2.047779104030527, "language_loss": 0.82596242, "learning_rate": 3.151234171183319e-07, "loss": 0.8473407, "num_input_tokens_seen": 295671195, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6953125, "step": 13710, "time_per_iteration": 3.81247878074646 }, { "auxiliary_loss_clip": 0.01103288, "auxiliary_loss_mlp": 0.01031969, "balance_loss_clip": 1.01935065, "balance_loss_mlp": 1.0338757, "epoch": 0.8243499173305275, "flos": 21468153248640.0, "grad_norm": 2.409639153584569, "language_loss": 0.78142655, "learning_rate": 3.149136098993257e-07, "loss": 0.8027792, "num_input_tokens_seen": 295689130, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 13711, "time_per_iteration": 3.843968152999878 }, { "auxiliary_loss_clip": 0.01102246, "auxiliary_loss_mlp": 0.01029106, "balance_loss_clip": 1.01713705, "balance_loss_mlp": 1.03454244, "epoch": 0.8244100405831956, "flos": 20010359773440.0, "grad_norm": 4.255303140552095, "language_loss": 0.65964377, "learning_rate": 3.1470386657896473e-07, "loss": 0.68095726, "num_input_tokens_seen": 295706385, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 13712, "time_per_iteration": 2.4368808269500732 }, { "auxiliary_loss_clip": 0.01103024, "auxiliary_loss_mlp": 0.01027475, "balance_loss_clip": 1.01578081, "balance_loss_mlp": 1.03465581, "epoch": 0.8244701638358635, "flos": 26430684549120.0, "grad_norm": 1.8775203008543668, "language_loss": 0.73975497, "learning_rate": 3.14494187165202e-07, "loss": 0.76106, "num_input_tokens_seen": 295727925, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 13713, "time_per_iteration": 3.8932719230651855 }, { "auxiliary_loss_clip": 0.01104072, "auxiliary_loss_mlp": 0.01025776, "balance_loss_clip": 1.01369977, "balance_loss_mlp": 1.03419542, "epoch": 0.8245302870885315, "flos": 17640004343040.0, "grad_norm": 2.189639969607681, "language_loss": 0.81238228, "learning_rate": 3.1428457166598833e-07, "loss": 0.83368075, "num_input_tokens_seen": 295744420, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 13714, "time_per_iteration": 2.438232421875 }, { "auxiliary_loss_clip": 0.01106427, "auxiliary_loss_mlp": 0.01038014, "balance_loss_clip": 1.02488327, "balance_loss_mlp": 1.03843689, "epoch": 0.8245904103411995, "flos": 26209510554240.0, "grad_norm": 2.1385501812399292, "language_loss": 0.6631633, "learning_rate": 3.1407502008927235e-07, "loss": 0.68460774, "num_input_tokens_seen": 295765105, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6796875, "step": 13715, "time_per_iteration": 3.958561658859253 }, { "auxiliary_loss_clip": 0.01105307, "auxiliary_loss_mlp": 0.01030267, "balance_loss_clip": 1.01748204, "balance_loss_mlp": 1.03557646, "epoch": 0.8246505335938674, "flos": 24205084928640.0, "grad_norm": 6.037102437945159, "language_loss": 0.75152344, "learning_rate": 3.1386553244300086e-07, "loss": 0.77287912, "num_input_tokens_seen": 295784200, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 13716, "time_per_iteration": 2.4747092723846436 }, { "auxiliary_loss_clip": 0.01029809, "auxiliary_loss_mlp": 0.00999731, "balance_loss_clip": 0.99876577, "balance_loss_mlp": 1.00763822, "epoch": 0.8247106568465354, "flos": 67092195749760.0, "grad_norm": 0.7142849491588643, "language_loss": 0.58958125, "learning_rate": 3.136561087351175e-07, "loss": 0.60987663, "num_input_tokens_seen": 295846555, "router_z_loss_clip": 0.00964355, "router_z_loss_mlp": 0.22265625, "step": 13717, "time_per_iteration": 3.2078018188476562 }, { "auxiliary_loss_clip": 0.01104237, "auxiliary_loss_mlp": 0.01027444, "balance_loss_clip": 1.01654232, "balance_loss_mlp": 1.03663492, "epoch": 0.8247707800992033, "flos": 12568232805120.0, "grad_norm": 2.447970666520577, "language_loss": 0.79481417, "learning_rate": 3.1344674897356373e-07, "loss": 0.816131, "num_input_tokens_seen": 295863425, "router_z_loss_clip": 0.10888672, "router_z_loss_mlp": 0.67578125, "step": 13718, "time_per_iteration": 2.4248270988464355 }, { "auxiliary_loss_clip": 0.01099923, "auxiliary_loss_mlp": 0.01031951, "balance_loss_clip": 1.02016091, "balance_loss_mlp": 1.03337181, "epoch": 0.8248309033518714, "flos": 15923617879680.0, "grad_norm": 1.8056802042194477, "language_loss": 0.68926597, "learning_rate": 3.132374531662778e-07, "loss": 0.71058476, "num_input_tokens_seen": 295880925, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6640625, "step": 13719, "time_per_iteration": 2.4276630878448486 }, { "auxiliary_loss_clip": 0.01104865, "auxiliary_loss_mlp": 0.01030062, "balance_loss_clip": 1.01710975, "balance_loss_mlp": 1.03426957, "epoch": 0.8248910266045393, "flos": 17564735393280.0, "grad_norm": 2.3671426724462457, "language_loss": 0.69958961, "learning_rate": 3.13028221321197e-07, "loss": 0.7209388, "num_input_tokens_seen": 295898205, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 13720, "time_per_iteration": 2.407928705215454 }, { "auxiliary_loss_clip": 0.01105308, "auxiliary_loss_mlp": 0.01027224, "balance_loss_clip": 1.01465893, "balance_loss_mlp": 1.03508389, "epoch": 0.8249511498572073, "flos": 28619655275520.0, "grad_norm": 1.7346854725261485, "language_loss": 0.76195765, "learning_rate": 3.1281905344625467e-07, "loss": 0.783283, "num_input_tokens_seen": 295918130, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 13721, "time_per_iteration": 2.5070724487304688 }, { "auxiliary_loss_clip": 0.01101825, "auxiliary_loss_mlp": 0.01026676, "balance_loss_clip": 1.01533949, "balance_loss_mlp": 1.03410697, "epoch": 0.8250112731098752, "flos": 25556583081600.0, "grad_norm": 2.5182510316260553, "language_loss": 0.77793705, "learning_rate": 3.1260994954938305e-07, "loss": 0.79922211, "num_input_tokens_seen": 295937760, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6796875, "step": 13722, "time_per_iteration": 2.467036485671997 }, { "auxiliary_loss_clip": 0.01102448, "auxiliary_loss_mlp": 0.01033062, "balance_loss_clip": 1.02134407, "balance_loss_mlp": 1.03553116, "epoch": 0.8250713963625432, "flos": 27746164339200.0, "grad_norm": 1.8220498259356976, "language_loss": 0.62673187, "learning_rate": 3.1240090963851205e-07, "loss": 0.64808691, "num_input_tokens_seen": 295957585, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 13723, "time_per_iteration": 2.5111823081970215 }, { "auxiliary_loss_clip": 0.01104839, "auxiliary_loss_mlp": 0.01032332, "balance_loss_clip": 1.01997018, "balance_loss_mlp": 1.035344, "epoch": 0.8251315196152111, "flos": 21610610588160.0, "grad_norm": 1.5192009575924161, "language_loss": 0.74070048, "learning_rate": 3.121919337215666e-07, "loss": 0.76207221, "num_input_tokens_seen": 295977135, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 13724, "time_per_iteration": 2.436901330947876 }, { "auxiliary_loss_clip": 0.01104964, "auxiliary_loss_mlp": 0.01034363, "balance_loss_clip": 1.02133322, "balance_loss_mlp": 1.03612566, "epoch": 0.8251916428678792, "flos": 28579363194240.0, "grad_norm": 2.0070407796203065, "language_loss": 0.63897419, "learning_rate": 3.1198302180647253e-07, "loss": 0.66036749, "num_input_tokens_seen": 295996265, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6875, "step": 13725, "time_per_iteration": 2.5243611335754395 }, { "auxiliary_loss_clip": 0.01101809, "auxiliary_loss_mlp": 0.01030992, "balance_loss_clip": 1.0189935, "balance_loss_mlp": 1.03437924, "epoch": 0.8252517661205471, "flos": 23075191733760.0, "grad_norm": 1.6872008375608636, "language_loss": 0.82150662, "learning_rate": 3.1177417390115125e-07, "loss": 0.84283459, "num_input_tokens_seen": 296014745, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 13726, "time_per_iteration": 2.4493401050567627 }, { "auxiliary_loss_clip": 0.01096863, "auxiliary_loss_mlp": 0.01034091, "balance_loss_clip": 1.02292705, "balance_loss_mlp": 1.03188634, "epoch": 0.8253118893732151, "flos": 31759576617600.0, "grad_norm": 1.6408208932953252, "language_loss": 0.70516127, "learning_rate": 3.1156539001352286e-07, "loss": 0.72647083, "num_input_tokens_seen": 296036960, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6484375, "step": 13727, "time_per_iteration": 2.547698974609375 }, { "auxiliary_loss_clip": 0.01106944, "auxiliary_loss_mlp": 0.010329, "balance_loss_clip": 1.02009678, "balance_loss_mlp": 1.03738499, "epoch": 0.8253720126258831, "flos": 18296415434880.0, "grad_norm": 2.2970081163414604, "language_loss": 0.6268366, "learning_rate": 3.113566701515036e-07, "loss": 0.64823508, "num_input_tokens_seen": 296056540, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 13728, "time_per_iteration": 2.450721263885498 }, { "auxiliary_loss_clip": 0.011099, "auxiliary_loss_mlp": 0.01029311, "balance_loss_clip": 1.01626396, "balance_loss_mlp": 1.03776217, "epoch": 0.825432135878551, "flos": 26797332625920.0, "grad_norm": 1.8391278184031212, "language_loss": 0.71127129, "learning_rate": 3.111480143230092e-07, "loss": 0.73266345, "num_input_tokens_seen": 296077950, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 13729, "time_per_iteration": 2.509615421295166 }, { "auxiliary_loss_clip": 0.01029147, "auxiliary_loss_mlp": 0.01000603, "balance_loss_clip": 0.99954778, "balance_loss_mlp": 1.00708008, "epoch": 0.825492259131219, "flos": 54219116217600.0, "grad_norm": 0.8708760874214418, "language_loss": 0.62677026, "learning_rate": 3.109394225359514e-07, "loss": 0.64706779, "num_input_tokens_seen": 296127060, "router_z_loss_clip": 0.01055908, "router_z_loss_mlp": 0.22070312, "step": 13730, "time_per_iteration": 2.9130842685699463 }, { "auxiliary_loss_clip": 0.01103844, "auxiliary_loss_mlp": 0.01031738, "balance_loss_clip": 1.01976979, "balance_loss_mlp": 1.03527462, "epoch": 0.825552382383887, "flos": 43756145493120.0, "grad_norm": 1.95772607463791, "language_loss": 0.63283473, "learning_rate": 3.1073089479823945e-07, "loss": 0.65419054, "num_input_tokens_seen": 296147775, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 13731, "time_per_iteration": 2.678445816040039 }, { "auxiliary_loss_clip": 0.01106425, "auxiliary_loss_mlp": 0.01030426, "balance_loss_clip": 1.01810575, "balance_loss_mlp": 1.03432846, "epoch": 0.825612505636555, "flos": 12602814624000.0, "grad_norm": 2.8566864656307347, "language_loss": 0.69755661, "learning_rate": 3.105224311177812e-07, "loss": 0.71892512, "num_input_tokens_seen": 296163560, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71875, "step": 13732, "time_per_iteration": 2.430274724960327 }, { "auxiliary_loss_clip": 0.0110576, "auxiliary_loss_mlp": 0.0103501, "balance_loss_clip": 1.02230787, "balance_loss_mlp": 1.03428841, "epoch": 0.8256726288892229, "flos": 17595618111360.0, "grad_norm": 2.281425729759395, "language_loss": 0.71136057, "learning_rate": 3.103140315024817e-07, "loss": 0.7327683, "num_input_tokens_seen": 296178730, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 13733, "time_per_iteration": 2.422308921813965 }, { "auxiliary_loss_clip": 0.0110099, "auxiliary_loss_mlp": 0.01029679, "balance_loss_clip": 1.01689339, "balance_loss_mlp": 1.03350294, "epoch": 0.8257327521418909, "flos": 23805794367360.0, "grad_norm": 1.4712552319106835, "language_loss": 0.82189214, "learning_rate": 3.1010569596024437e-07, "loss": 0.84319884, "num_input_tokens_seen": 296200175, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.67578125, "step": 13734, "time_per_iteration": 2.490027904510498 }, { "auxiliary_loss_clip": 0.01100087, "auxiliary_loss_mlp": 0.01030476, "balance_loss_clip": 1.0182513, "balance_loss_mlp": 1.03341126, "epoch": 0.8257928753945588, "flos": 19281121856640.0, "grad_norm": 1.9354173405669175, "language_loss": 0.83061826, "learning_rate": 3.098974244989676e-07, "loss": 0.85192382, "num_input_tokens_seen": 296219305, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6640625, "step": 13735, "time_per_iteration": 2.4479763507843018 }, { "auxiliary_loss_clip": 0.01106774, "auxiliary_loss_mlp": 0.01028046, "balance_loss_clip": 1.01694727, "balance_loss_mlp": 1.03765428, "epoch": 0.8258529986472268, "flos": 18478841633280.0, "grad_norm": 2.455713944684296, "language_loss": 0.70479321, "learning_rate": 3.096892171265497e-07, "loss": 0.72614145, "num_input_tokens_seen": 296236945, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.69140625, "step": 13736, "time_per_iteration": 2.4300169944763184 }, { "auxiliary_loss_clip": 0.01029585, "auxiliary_loss_mlp": 0.01001742, "balance_loss_clip": 1.00072885, "balance_loss_mlp": 1.00742221, "epoch": 0.8259131218998947, "flos": 62137957512960.0, "grad_norm": 0.8771564640286923, "language_loss": 0.6794166, "learning_rate": 3.0948107385088665e-07, "loss": 0.69972986, "num_input_tokens_seen": 296294685, "router_z_loss_clip": 0.01013184, "router_z_loss_mlp": 0.22265625, "step": 13737, "time_per_iteration": 3.0651426315307617 }, { "auxiliary_loss_clip": 0.01103951, "auxiliary_loss_mlp": 0.01032591, "balance_loss_clip": 1.02090287, "balance_loss_mlp": 1.0346036, "epoch": 0.8259732451525628, "flos": 22159038418560.0, "grad_norm": 1.9580831334743096, "language_loss": 0.69670171, "learning_rate": 3.0927299467987e-07, "loss": 0.71806717, "num_input_tokens_seen": 296314790, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6953125, "step": 13738, "time_per_iteration": 2.4468324184417725 }, { "auxiliary_loss_clip": 0.0110858, "auxiliary_loss_mlp": 0.01033094, "balance_loss_clip": 1.01880634, "balance_loss_mlp": 1.03829312, "epoch": 0.8260333684052307, "flos": 38361645233280.0, "grad_norm": 2.314522525365119, "language_loss": 0.63257003, "learning_rate": 3.090649796213911e-07, "loss": 0.65398669, "num_input_tokens_seen": 296335355, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.703125, "step": 13739, "time_per_iteration": 2.600259780883789 }, { "auxiliary_loss_clip": 0.01029431, "auxiliary_loss_mlp": 0.01000139, "balance_loss_clip": 0.99910825, "balance_loss_mlp": 1.00715554, "epoch": 0.8260934916578987, "flos": 62185611882240.0, "grad_norm": 0.8171200371304371, "language_loss": 0.5926345, "learning_rate": 3.0885702868333853e-07, "loss": 0.61293024, "num_input_tokens_seen": 296399885, "router_z_loss_clip": 0.01031494, "router_z_loss_mlp": 0.22265625, "step": 13740, "time_per_iteration": 3.1198692321777344 }, { "auxiliary_loss_clip": 0.01109211, "auxiliary_loss_mlp": 0.01029795, "balance_loss_clip": 1.0166049, "balance_loss_mlp": 1.03668642, "epoch": 0.8261536149105667, "flos": 22565475786240.0, "grad_norm": 2.827563026070721, "language_loss": 0.75379735, "learning_rate": 3.086491418735959e-07, "loss": 0.77518743, "num_input_tokens_seen": 296417660, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 13741, "time_per_iteration": 2.463665723800659 }, { "auxiliary_loss_clip": 0.01102736, "auxiliary_loss_mlp": 0.01032244, "balance_loss_clip": 1.01992965, "balance_loss_mlp": 1.03427315, "epoch": 0.8262137381632346, "flos": 32525479342080.0, "grad_norm": 2.4216898736391115, "language_loss": 0.62396359, "learning_rate": 3.0844131920004726e-07, "loss": 0.64531338, "num_input_tokens_seen": 296438255, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 13742, "time_per_iteration": 2.520179271697998 }, { "auxiliary_loss_clip": 0.01109019, "auxiliary_loss_mlp": 0.01035397, "balance_loss_clip": 1.02131212, "balance_loss_mlp": 1.03589833, "epoch": 0.8262738614159026, "flos": 14136451666560.0, "grad_norm": 6.939085825974587, "language_loss": 0.65977895, "learning_rate": 3.0823356067057327e-07, "loss": 0.68122309, "num_input_tokens_seen": 296454485, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73046875, "step": 13743, "time_per_iteration": 2.4346776008605957 }, { "auxiliary_loss_clip": 0.01105148, "auxiliary_loss_mlp": 0.01033952, "balance_loss_clip": 1.02207232, "balance_loss_mlp": 1.03602314, "epoch": 0.8263339846685706, "flos": 19825347795840.0, "grad_norm": 2.02537714882329, "language_loss": 0.66822278, "learning_rate": 3.0802586629305283e-07, "loss": 0.68961376, "num_input_tokens_seen": 296473740, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 13744, "time_per_iteration": 2.4460325241088867 }, { "auxiliary_loss_clip": 0.01105044, "auxiliary_loss_mlp": 0.01031578, "balance_loss_clip": 1.01972842, "balance_loss_mlp": 1.03606188, "epoch": 0.8263941079212386, "flos": 22745962650240.0, "grad_norm": 2.186749744973891, "language_loss": 0.75321788, "learning_rate": 3.078182360753612e-07, "loss": 0.77458411, "num_input_tokens_seen": 296493355, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 13745, "time_per_iteration": 2.4638612270355225 }, { "auxiliary_loss_clip": 0.01101023, "auxiliary_loss_mlp": 0.01034246, "balance_loss_clip": 1.02339792, "balance_loss_mlp": 1.03419304, "epoch": 0.8264542311739065, "flos": 20120641505280.0, "grad_norm": 2.386314247771619, "language_loss": 0.7885896, "learning_rate": 3.076106700253709e-07, "loss": 0.80994225, "num_input_tokens_seen": 296510520, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.66796875, "step": 13746, "time_per_iteration": 2.428439140319824 }, { "auxiliary_loss_clip": 0.0110896, "auxiliary_loss_mlp": 0.01035719, "balance_loss_clip": 1.02261829, "balance_loss_mlp": 1.03764606, "epoch": 0.8265143544265745, "flos": 16837149502080.0, "grad_norm": 2.99332138255549, "language_loss": 0.68054843, "learning_rate": 3.0740316815095415e-07, "loss": 0.70199525, "num_input_tokens_seen": 296528265, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 13747, "time_per_iteration": 2.4298412799835205 }, { "auxiliary_loss_clip": 0.01106326, "auxiliary_loss_mlp": 0.01030969, "balance_loss_clip": 1.01821983, "balance_loss_mlp": 1.03573823, "epoch": 0.8265744776792424, "flos": 22018592240640.0, "grad_norm": 2.205348831552659, "language_loss": 0.75660729, "learning_rate": 3.0719573045997835e-07, "loss": 0.77798033, "num_input_tokens_seen": 296547810, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 13748, "time_per_iteration": 2.447053909301758 }, { "auxiliary_loss_clip": 0.0110069, "auxiliary_loss_mlp": 0.01032795, "balance_loss_clip": 1.0223819, "balance_loss_mlp": 1.03523612, "epoch": 0.8266346009319104, "flos": 19244852098560.0, "grad_norm": 1.7683179471503327, "language_loss": 0.63932776, "learning_rate": 3.069883569603102e-07, "loss": 0.66066259, "num_input_tokens_seen": 296565940, "router_z_loss_clip": 0.10400391, "router_z_loss_mlp": 0.65625, "step": 13749, "time_per_iteration": 2.460003137588501 }, { "auxiliary_loss_clip": 0.01101273, "auxiliary_loss_mlp": 0.01029661, "balance_loss_clip": 1.01806223, "balance_loss_mlp": 1.03365934, "epoch": 0.8266947241845783, "flos": 24166768095360.0, "grad_norm": 2.6691791539372476, "language_loss": 0.73667681, "learning_rate": 3.067810476598132e-07, "loss": 0.75798619, "num_input_tokens_seen": 296585090, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 13750, "time_per_iteration": 2.457892894744873 }, { "auxiliary_loss_clip": 0.0110634, "auxiliary_loss_mlp": 0.0103538, "balance_loss_clip": 1.0228039, "balance_loss_mlp": 1.03655255, "epoch": 0.8267548474372464, "flos": 21105814803840.0, "grad_norm": 2.2622404840423145, "language_loss": 0.659464, "learning_rate": 3.065738025663496e-07, "loss": 0.68088126, "num_input_tokens_seen": 296604950, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 13751, "time_per_iteration": 4.028950929641724 }, { "auxiliary_loss_clip": 0.01100835, "auxiliary_loss_mlp": 0.01032045, "balance_loss_clip": 1.02042174, "balance_loss_mlp": 1.03387475, "epoch": 0.8268149706899143, "flos": 39968288668800.0, "grad_norm": 2.186901757713523, "language_loss": 0.60447323, "learning_rate": 3.0636662168777607e-07, "loss": 0.62580198, "num_input_tokens_seen": 296627780, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66796875, "step": 13752, "time_per_iteration": 2.646176815032959 }, { "auxiliary_loss_clip": 0.01030644, "auxiliary_loss_mlp": 0.01000379, "balance_loss_clip": 0.99928778, "balance_loss_mlp": 1.00845623, "epoch": 0.8268750939425823, "flos": 65782423244160.0, "grad_norm": 0.7803343396948219, "language_loss": 0.57510543, "learning_rate": 3.0615950503194986e-07, "loss": 0.59541565, "num_input_tokens_seen": 296683850, "router_z_loss_clip": 0.01092529, "router_z_loss_mlp": 0.22265625, "step": 13753, "time_per_iteration": 4.518921852111816 }, { "auxiliary_loss_clip": 0.01029535, "auxiliary_loss_mlp": 0.01003416, "balance_loss_clip": 1.00243223, "balance_loss_mlp": 1.00733149, "epoch": 0.8269352171952503, "flos": 52981455242880.0, "grad_norm": 0.7022869205003053, "language_loss": 0.54918164, "learning_rate": 3.0595245260672563e-07, "loss": 0.56951118, "num_input_tokens_seen": 296741420, "router_z_loss_clip": 0.00982666, "router_z_loss_mlp": 0.22265625, "step": 13754, "time_per_iteration": 3.1712448596954346 }, { "auxiliary_loss_clip": 0.01099401, "auxiliary_loss_mlp": 0.01032252, "balance_loss_clip": 1.02173758, "balance_loss_mlp": 1.0329237, "epoch": 0.8269953404479182, "flos": 23076125487360.0, "grad_norm": 2.0946222186029497, "language_loss": 0.69704747, "learning_rate": 3.0574546441995354e-07, "loss": 0.718364, "num_input_tokens_seen": 296759620, "router_z_loss_clip": 0.10546875, "router_z_loss_mlp": 0.6640625, "step": 13755, "time_per_iteration": 3.840341091156006 }, { "auxiliary_loss_clip": 0.01102236, "auxiliary_loss_mlp": 0.01030907, "balance_loss_clip": 1.01982045, "balance_loss_mlp": 1.0350709, "epoch": 0.8270554637005862, "flos": 14209996763520.0, "grad_norm": 2.1475850432438834, "language_loss": 0.69819176, "learning_rate": 3.0553854047948324e-07, "loss": 0.71952319, "num_input_tokens_seen": 296777275, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.671875, "step": 13756, "time_per_iteration": 3.8864028453826904 }, { "auxiliary_loss_clip": 0.01105876, "auxiliary_loss_mlp": 0.01035883, "balance_loss_clip": 1.02371764, "balance_loss_mlp": 1.03741395, "epoch": 0.8271155869532542, "flos": 21762046327680.0, "grad_norm": 2.051775650093407, "language_loss": 0.72423291, "learning_rate": 3.053316807931623e-07, "loss": 0.74565053, "num_input_tokens_seen": 296796655, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.68359375, "step": 13757, "time_per_iteration": 2.4413788318634033 }, { "auxiliary_loss_clip": 0.01107378, "auxiliary_loss_mlp": 0.01032312, "balance_loss_clip": 1.01847231, "balance_loss_mlp": 1.03572989, "epoch": 0.8271757102059222, "flos": 15120475729920.0, "grad_norm": 2.482164233977125, "language_loss": 0.69502014, "learning_rate": 3.0512488536883283e-07, "loss": 0.71641707, "num_input_tokens_seen": 296813705, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71484375, "step": 13758, "time_per_iteration": 2.424882173538208 }, { "auxiliary_loss_clip": 0.0109863, "auxiliary_loss_mlp": 0.01028263, "balance_loss_clip": 1.01689076, "balance_loss_mlp": 1.03293359, "epoch": 0.8272358334585901, "flos": 24133730561280.0, "grad_norm": 2.248543330970045, "language_loss": 0.69591141, "learning_rate": 3.0491815421433775e-07, "loss": 0.71718037, "num_input_tokens_seen": 296833985, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.65625, "step": 13759, "time_per_iteration": 2.5016114711761475 }, { "auxiliary_loss_clip": 0.01103474, "auxiliary_loss_mlp": 0.01030922, "balance_loss_clip": 1.01850045, "balance_loss_mlp": 1.03559422, "epoch": 0.8272959567112581, "flos": 18990712396800.0, "grad_norm": 2.011145625925427, "language_loss": 0.70744157, "learning_rate": 3.047114873375161e-07, "loss": 0.72878551, "num_input_tokens_seen": 296850150, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 13760, "time_per_iteration": 2.501227617263794 }, { "auxiliary_loss_clip": 0.01102109, "auxiliary_loss_mlp": 0.01028575, "balance_loss_clip": 1.0177989, "balance_loss_mlp": 1.03581512, "epoch": 0.827356079963926, "flos": 20631614428800.0, "grad_norm": 1.8017465957113277, "language_loss": 0.7773065, "learning_rate": 3.0450488474620505e-07, "loss": 0.79861331, "num_input_tokens_seen": 296869585, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.6640625, "step": 13761, "time_per_iteration": 2.4897024631500244 }, { "auxiliary_loss_clip": 0.01099123, "auxiliary_loss_mlp": 0.01034184, "balance_loss_clip": 1.02315748, "balance_loss_mlp": 1.03389537, "epoch": 0.827416203216594, "flos": 22416625825920.0, "grad_norm": 1.8695121292834813, "language_loss": 0.69794381, "learning_rate": 3.042983464482387e-07, "loss": 0.71927691, "num_input_tokens_seen": 296887710, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.65234375, "step": 13762, "time_per_iteration": 2.45511531829834 }, { "auxiliary_loss_clip": 0.01102283, "auxiliary_loss_mlp": 0.01024239, "balance_loss_clip": 1.01290774, "balance_loss_mlp": 1.03395915, "epoch": 0.827476326469262, "flos": 19026192055680.0, "grad_norm": 1.8509570263345803, "language_loss": 0.70215076, "learning_rate": 3.0409187245144853e-07, "loss": 0.72341597, "num_input_tokens_seen": 296906265, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.68359375, "step": 13763, "time_per_iteration": 2.413870096206665 }, { "auxiliary_loss_clip": 0.01029535, "auxiliary_loss_mlp": 0.00999552, "balance_loss_clip": 0.99855691, "balance_loss_mlp": 1.00730515, "epoch": 0.82753644972193, "flos": 68500575089280.0, "grad_norm": 0.8423775220225249, "language_loss": 0.65136862, "learning_rate": 3.038854627636651e-07, "loss": 0.67165947, "num_input_tokens_seen": 296971290, "router_z_loss_clip": 0.00994873, "router_z_loss_mlp": 0.22265625, "step": 13764, "time_per_iteration": 3.1517083644866943 }, { "auxiliary_loss_clip": 0.0110636, "auxiliary_loss_mlp": 0.0103343, "balance_loss_clip": 1.0207994, "balance_loss_mlp": 1.0375545, "epoch": 0.8275965729745979, "flos": 18405404277120.0, "grad_norm": 2.967408616398587, "language_loss": 0.77988207, "learning_rate": 3.0367911739271423e-07, "loss": 0.8012799, "num_input_tokens_seen": 296989060, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 13765, "time_per_iteration": 2.4308407306671143 }, { "auxiliary_loss_clip": 0.0110614, "auxiliary_loss_mlp": 0.01031925, "balance_loss_clip": 1.01934862, "balance_loss_mlp": 1.03536355, "epoch": 0.8276566962272659, "flos": 28512067063680.0, "grad_norm": 1.7561192561351913, "language_loss": 0.6270014, "learning_rate": 3.034728363464214e-07, "loss": 0.64838213, "num_input_tokens_seen": 297011300, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 13766, "time_per_iteration": 2.5264883041381836 }, { "auxiliary_loss_clip": 0.01104353, "auxiliary_loss_mlp": 0.01029961, "balance_loss_clip": 1.01731873, "balance_loss_mlp": 1.03539848, "epoch": 0.8277168194799339, "flos": 20230240878720.0, "grad_norm": 1.9857683697179136, "language_loss": 0.82635134, "learning_rate": 3.03266619632609e-07, "loss": 0.84769452, "num_input_tokens_seen": 297030350, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 13767, "time_per_iteration": 2.449388265609741 }, { "auxiliary_loss_clip": 0.01107689, "auxiliary_loss_mlp": 0.01029222, "balance_loss_clip": 1.01730657, "balance_loss_mlp": 1.03754175, "epoch": 0.8277769427326018, "flos": 28476623318400.0, "grad_norm": 1.7522072033620175, "language_loss": 0.69076687, "learning_rate": 3.030604672590964e-07, "loss": 0.71213597, "num_input_tokens_seen": 297049710, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.703125, "step": 13768, "time_per_iteration": 2.53145170211792 }, { "auxiliary_loss_clip": 0.01100002, "auxiliary_loss_mlp": 0.01029812, "balance_loss_clip": 1.01833844, "balance_loss_mlp": 1.03337073, "epoch": 0.8278370659852698, "flos": 27197628768000.0, "grad_norm": 1.7133515785919273, "language_loss": 0.7451666, "learning_rate": 3.028543792337006e-07, "loss": 0.76646477, "num_input_tokens_seen": 297070510, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 13769, "time_per_iteration": 2.513270139694214 }, { "auxiliary_loss_clip": 0.01102843, "auxiliary_loss_mlp": 0.01028586, "balance_loss_clip": 1.01643252, "balance_loss_mlp": 1.03388572, "epoch": 0.8278971892379378, "flos": 37816126404480.0, "grad_norm": 2.52739488169651, "language_loss": 0.74303418, "learning_rate": 3.0264835556423675e-07, "loss": 0.76434851, "num_input_tokens_seen": 297092585, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 13770, "time_per_iteration": 2.6046156883239746 }, { "auxiliary_loss_clip": 0.01104517, "auxiliary_loss_mlp": 0.01033207, "balance_loss_clip": 1.02033865, "balance_loss_mlp": 1.03493047, "epoch": 0.8279573124906058, "flos": 22560160573440.0, "grad_norm": 1.8599389081084168, "language_loss": 0.75745022, "learning_rate": 3.0244239625851785e-07, "loss": 0.77882743, "num_input_tokens_seen": 297110055, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 13771, "time_per_iteration": 2.4474709033966064 }, { "auxiliary_loss_clip": 0.01103778, "auxiliary_loss_mlp": 0.01028927, "balance_loss_clip": 1.01718473, "balance_loss_mlp": 1.03492689, "epoch": 0.8280174357432737, "flos": 36064619418240.0, "grad_norm": 1.6593402439116778, "language_loss": 0.72491539, "learning_rate": 3.0223650132435284e-07, "loss": 0.74624246, "num_input_tokens_seen": 297132170, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 13772, "time_per_iteration": 2.620728015899658 }, { "auxiliary_loss_clip": 0.01101658, "auxiliary_loss_mlp": 0.01029, "balance_loss_clip": 1.01624489, "balance_loss_mlp": 1.03434801, "epoch": 0.8280775589959417, "flos": 22961067246720.0, "grad_norm": 2.966439554454227, "language_loss": 0.75090462, "learning_rate": 3.0203067076955035e-07, "loss": 0.77221119, "num_input_tokens_seen": 297149515, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.671875, "step": 13773, "time_per_iteration": 2.5228047370910645 }, { "auxiliary_loss_clip": 0.01103371, "auxiliary_loss_mlp": 0.01034718, "balance_loss_clip": 1.02305937, "balance_loss_mlp": 1.03634572, "epoch": 0.8281376822486096, "flos": 26063282286720.0, "grad_norm": 2.1276230132978724, "language_loss": 0.75727117, "learning_rate": 3.01824904601915e-07, "loss": 0.77865195, "num_input_tokens_seen": 297170320, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 13774, "time_per_iteration": 2.505303144454956 }, { "auxiliary_loss_clip": 0.01108405, "auxiliary_loss_mlp": 0.01026689, "balance_loss_clip": 1.01481557, "balance_loss_mlp": 1.03718448, "epoch": 0.8281978055012776, "flos": 20667776446080.0, "grad_norm": 2.0086107378028064, "language_loss": 0.74796087, "learning_rate": 3.01619202829249e-07, "loss": 0.76931179, "num_input_tokens_seen": 297189935, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.7109375, "step": 13775, "time_per_iteration": 2.429697275161743 }, { "auxiliary_loss_clip": 0.01107344, "auxiliary_loss_mlp": 0.01030407, "balance_loss_clip": 1.01725256, "balance_loss_mlp": 1.03482008, "epoch": 0.8282579287539455, "flos": 29315281040640.0, "grad_norm": 2.1502284592737793, "language_loss": 0.73490608, "learning_rate": 3.01413565459353e-07, "loss": 0.75628352, "num_input_tokens_seen": 297210885, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 13776, "time_per_iteration": 2.5305850505828857 }, { "auxiliary_loss_clip": 0.01103484, "auxiliary_loss_mlp": 0.01025189, "balance_loss_clip": 1.01324439, "balance_loss_mlp": 1.03388, "epoch": 0.8283180520066136, "flos": 15706178899200.0, "grad_norm": 2.356481921936337, "language_loss": 0.77557558, "learning_rate": 3.0120799250002483e-07, "loss": 0.7968623, "num_input_tokens_seen": 297228500, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 13777, "time_per_iteration": 2.4157631397247314 }, { "auxiliary_loss_clip": 0.0110137, "auxiliary_loss_mlp": 0.01027966, "balance_loss_clip": 1.01676035, "balance_loss_mlp": 1.03537846, "epoch": 0.8283781752592815, "flos": 24791470456320.0, "grad_norm": 1.7036667099891236, "language_loss": 0.82514608, "learning_rate": 3.010024839590604e-07, "loss": 0.84643948, "num_input_tokens_seen": 297249470, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.66015625, "step": 13778, "time_per_iteration": 2.4889910221099854 }, { "auxiliary_loss_clip": 0.01100216, "auxiliary_loss_mlp": 0.01023512, "balance_loss_clip": 1.01159704, "balance_loss_mlp": 1.03436518, "epoch": 0.8284382985119495, "flos": 18982811404800.0, "grad_norm": 1.8984143910236715, "language_loss": 0.74525559, "learning_rate": 3.0079703984425187e-07, "loss": 0.76649284, "num_input_tokens_seen": 297265970, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.65625, "step": 13779, "time_per_iteration": 2.434109687805176 }, { "auxiliary_loss_clip": 0.01029518, "auxiliary_loss_mlp": 0.01002809, "balance_loss_clip": 1.0017246, "balance_loss_mlp": 1.00733387, "epoch": 0.8284984217646175, "flos": 61034460814080.0, "grad_norm": 1.1123569367248578, "language_loss": 0.5669837, "learning_rate": 3.0059166016338954e-07, "loss": 0.58730704, "num_input_tokens_seen": 297325525, "router_z_loss_clip": 0.01086426, "router_z_loss_mlp": 0.22265625, "step": 13780, "time_per_iteration": 3.123883008956909 }, { "auxiliary_loss_clip": 0.01102922, "auxiliary_loss_mlp": 0.01028355, "balance_loss_clip": 1.01616561, "balance_loss_mlp": 1.03413534, "epoch": 0.8285585450172854, "flos": 19714635100800.0, "grad_norm": 1.905494094913047, "language_loss": 0.7992937, "learning_rate": 3.0038634492426205e-07, "loss": 0.82060647, "num_input_tokens_seen": 297345025, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 13781, "time_per_iteration": 2.448671579360962 }, { "auxiliary_loss_clip": 0.01106893, "auxiliary_loss_mlp": 0.01028513, "balance_loss_clip": 1.01507258, "balance_loss_mlp": 1.03729677, "epoch": 0.8286186682699535, "flos": 21688896280320.0, "grad_norm": 2.471847516469185, "language_loss": 0.75860226, "learning_rate": 3.001810941346543e-07, "loss": 0.77995634, "num_input_tokens_seen": 297363570, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6953125, "step": 13782, "time_per_iteration": 2.493960380554199 }, { "auxiliary_loss_clip": 0.01101839, "auxiliary_loss_mlp": 0.01028902, "balance_loss_clip": 1.01661801, "balance_loss_mlp": 1.03288698, "epoch": 0.8286787915226214, "flos": 25775566346880.0, "grad_norm": 1.5741210510691388, "language_loss": 0.76306474, "learning_rate": 2.9997590780234983e-07, "loss": 0.78437215, "num_input_tokens_seen": 297385385, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 13783, "time_per_iteration": 2.470109701156616 }, { "auxiliary_loss_clip": 0.01103661, "auxiliary_loss_mlp": 0.01028471, "balance_loss_clip": 1.01629364, "balance_loss_mlp": 1.03428268, "epoch": 0.8287389147752894, "flos": 21288348743040.0, "grad_norm": 2.0577826310389127, "language_loss": 0.73464012, "learning_rate": 2.997707859351304e-07, "loss": 0.75596142, "num_input_tokens_seen": 297403950, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 13784, "time_per_iteration": 2.503114700317383 }, { "auxiliary_loss_clip": 0.01106797, "auxiliary_loss_mlp": 0.01036508, "balance_loss_clip": 1.02284658, "balance_loss_mlp": 1.03463626, "epoch": 0.8287990380279573, "flos": 33544875323520.0, "grad_norm": 2.513375992436038, "language_loss": 0.69596612, "learning_rate": 2.99565728540772e-07, "loss": 0.71739912, "num_input_tokens_seen": 297424565, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 13785, "time_per_iteration": 2.57613468170166 }, { "auxiliary_loss_clip": 0.01104973, "auxiliary_loss_mlp": 0.01033335, "balance_loss_clip": 1.02101481, "balance_loss_mlp": 1.03610778, "epoch": 0.8288591612806253, "flos": 22966346545920.0, "grad_norm": 1.5672599196444157, "language_loss": 0.68532109, "learning_rate": 2.993607356270516e-07, "loss": 0.70670414, "num_input_tokens_seen": 297445180, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 13786, "time_per_iteration": 2.500309705734253 }, { "auxiliary_loss_clip": 0.01107147, "auxiliary_loss_mlp": 0.01034462, "balance_loss_clip": 1.02146244, "balance_loss_mlp": 1.03511488, "epoch": 0.8289192845332932, "flos": 18588979710720.0, "grad_norm": 2.0830024886459477, "language_loss": 0.77594578, "learning_rate": 2.991558072017426e-07, "loss": 0.79736185, "num_input_tokens_seen": 297463790, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 13787, "time_per_iteration": 2.420041084289551 }, { "auxiliary_loss_clip": 0.01102658, "auxiliary_loss_mlp": 0.01030864, "balance_loss_clip": 1.01946735, "balance_loss_mlp": 1.03486276, "epoch": 0.8289794077859612, "flos": 15450423085440.0, "grad_norm": 2.1399243238195482, "language_loss": 0.80464804, "learning_rate": 2.989509432726163e-07, "loss": 0.82598329, "num_input_tokens_seen": 297480100, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6796875, "step": 13788, "time_per_iteration": 2.455033540725708 }, { "auxiliary_loss_clip": 0.0110267, "auxiliary_loss_mlp": 0.01031965, "balance_loss_clip": 1.02027071, "balance_loss_mlp": 1.03479362, "epoch": 0.8290395310386292, "flos": 28877853214080.0, "grad_norm": 1.6575769549058983, "language_loss": 0.71381581, "learning_rate": 2.9874614384744014e-07, "loss": 0.7351622, "num_input_tokens_seen": 297499890, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 13789, "time_per_iteration": 2.5093917846679688 }, { "auxiliary_loss_clip": 0.01103905, "auxiliary_loss_mlp": 0.01029218, "balance_loss_clip": 1.01671886, "balance_loss_mlp": 1.03312385, "epoch": 0.8290996542912972, "flos": 36576274700160.0, "grad_norm": 1.833161005604502, "language_loss": 0.68005848, "learning_rate": 2.985414089339813e-07, "loss": 0.70138973, "num_input_tokens_seen": 297521440, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 13790, "time_per_iteration": 2.596641778945923 }, { "auxiliary_loss_clip": 0.01104652, "auxiliary_loss_mlp": 0.01029824, "balance_loss_clip": 1.0160737, "balance_loss_mlp": 1.03448927, "epoch": 0.8291597775439651, "flos": 23623009032960.0, "grad_norm": 1.8288076117367649, "language_loss": 0.77585101, "learning_rate": 2.9833673854000265e-07, "loss": 0.79719573, "num_input_tokens_seen": 297539920, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.703125, "step": 13791, "time_per_iteration": 2.526870012283325 }, { "auxiliary_loss_clip": 0.01103152, "auxiliary_loss_mlp": 0.01028253, "balance_loss_clip": 1.01552105, "balance_loss_mlp": 1.03642607, "epoch": 0.8292199007966331, "flos": 21397481239680.0, "grad_norm": 1.5376095285080222, "language_loss": 0.69934022, "learning_rate": 2.981321326732651e-07, "loss": 0.72065425, "num_input_tokens_seen": 297560000, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.66796875, "step": 13792, "time_per_iteration": 3.9703938961029053 }, { "auxiliary_loss_clip": 0.01103584, "auxiliary_loss_mlp": 0.01032458, "balance_loss_clip": 1.019804, "balance_loss_mlp": 1.03346646, "epoch": 0.829280024049301, "flos": 28767607395840.0, "grad_norm": 2.879084031520084, "language_loss": 0.64930379, "learning_rate": 2.9792759134152736e-07, "loss": 0.67066419, "num_input_tokens_seen": 297579300, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 13793, "time_per_iteration": 2.517561912536621 }, { "auxiliary_loss_clip": 0.01105434, "auxiliary_loss_mlp": 0.01030185, "balance_loss_clip": 1.01724458, "balance_loss_mlp": 1.03401923, "epoch": 0.829340147301969, "flos": 19938071652480.0, "grad_norm": 2.169052252122558, "language_loss": 0.66579747, "learning_rate": 2.977231145525461e-07, "loss": 0.68715364, "num_input_tokens_seen": 297598095, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 13794, "time_per_iteration": 3.8543472290039062 }, { "auxiliary_loss_clip": 0.01101808, "auxiliary_loss_mlp": 0.01034983, "balance_loss_clip": 1.02218008, "balance_loss_mlp": 1.03286779, "epoch": 0.829400270554637, "flos": 25228575060480.0, "grad_norm": 2.7371769808055517, "language_loss": 0.66346169, "learning_rate": 2.975187023140757e-07, "loss": 0.68482959, "num_input_tokens_seen": 297615955, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 13795, "time_per_iteration": 2.4948832988739014 }, { "auxiliary_loss_clip": 0.01099458, "auxiliary_loss_mlp": 0.01032833, "balance_loss_clip": 1.02120399, "balance_loss_mlp": 1.03492165, "epoch": 0.829460393807305, "flos": 24463570176000.0, "grad_norm": 1.9442369023783541, "language_loss": 0.66339552, "learning_rate": 2.973143546338661e-07, "loss": 0.68471837, "num_input_tokens_seen": 297636285, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.64453125, "step": 13796, "time_per_iteration": 2.498128652572632 }, { "auxiliary_loss_clip": 0.01102397, "auxiliary_loss_mlp": 0.01031178, "balance_loss_clip": 1.019418, "balance_loss_mlp": 1.03531504, "epoch": 0.829520517059973, "flos": 15122486891520.0, "grad_norm": 1.6259346211112702, "language_loss": 0.72027612, "learning_rate": 2.971100715196666e-07, "loss": 0.7416119, "num_input_tokens_seen": 297653315, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 13797, "time_per_iteration": 3.8175528049468994 }, { "auxiliary_loss_clip": 0.0110609, "auxiliary_loss_mlp": 0.01031017, "balance_loss_clip": 1.01949596, "balance_loss_mlp": 1.03593385, "epoch": 0.8295806403126409, "flos": 21579979265280.0, "grad_norm": 1.9792690698596593, "language_loss": 0.7220031, "learning_rate": 2.969058529792243e-07, "loss": 0.74337417, "num_input_tokens_seen": 297673480, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.703125, "step": 13798, "time_per_iteration": 3.892876148223877 }, { "auxiliary_loss_clip": 0.010988, "auxiliary_loss_mlp": 0.01031642, "balance_loss_clip": 1.02003717, "balance_loss_mlp": 1.0336926, "epoch": 0.8296407635653089, "flos": 21726566668800.0, "grad_norm": 1.9044605687123535, "language_loss": 0.76327574, "learning_rate": 2.967016990202822e-07, "loss": 0.78458011, "num_input_tokens_seen": 297693250, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6484375, "step": 13799, "time_per_iteration": 2.466947078704834 }, { "auxiliary_loss_clip": 0.01103419, "auxiliary_loss_mlp": 0.01032771, "balance_loss_clip": 1.02089167, "balance_loss_mlp": 1.03565574, "epoch": 0.8297008868179768, "flos": 11181147252480.0, "grad_norm": 1.837662517295678, "language_loss": 0.67852855, "learning_rate": 2.9649760965058245e-07, "loss": 0.69989049, "num_input_tokens_seen": 297710975, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 13800, "time_per_iteration": 2.409745216369629 }, { "auxiliary_loss_clip": 0.01110373, "auxiliary_loss_mlp": 0.01033304, "balance_loss_clip": 1.01957726, "balance_loss_mlp": 1.03860235, "epoch": 0.8297610100706448, "flos": 20664041431680.0, "grad_norm": 2.2050013832677626, "language_loss": 0.7423982, "learning_rate": 2.9629358487786515e-07, "loss": 0.76383501, "num_input_tokens_seen": 297730860, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71875, "step": 13801, "time_per_iteration": 2.461538076400757 }, { "auxiliary_loss_clip": 0.01105407, "auxiliary_loss_mlp": 0.01030349, "balance_loss_clip": 1.01915538, "balance_loss_mlp": 1.0356909, "epoch": 0.8298211333233128, "flos": 20376325491840.0, "grad_norm": 1.7843308606507617, "language_loss": 0.73519951, "learning_rate": 2.9608962470986476e-07, "loss": 0.75655711, "num_input_tokens_seen": 297749765, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6953125, "step": 13802, "time_per_iteration": 2.4461686611175537 }, { "auxiliary_loss_clip": 0.01104521, "auxiliary_loss_mlp": 0.01029582, "balance_loss_clip": 1.01758337, "balance_loss_mlp": 1.03464937, "epoch": 0.8298812565759808, "flos": 21508696725120.0, "grad_norm": 1.6369320904162556, "language_loss": 0.74748194, "learning_rate": 2.9588572915431644e-07, "loss": 0.76882297, "num_input_tokens_seen": 297770380, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69921875, "step": 13803, "time_per_iteration": 2.468809127807617 }, { "auxiliary_loss_clip": 0.01104951, "auxiliary_loss_mlp": 0.01034189, "balance_loss_clip": 1.02236903, "balance_loss_mlp": 1.03581798, "epoch": 0.8299413798286487, "flos": 22818681734400.0, "grad_norm": 1.6164820838563532, "language_loss": 0.79036039, "learning_rate": 2.9568189821895215e-07, "loss": 0.81175184, "num_input_tokens_seen": 297789440, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69140625, "step": 13804, "time_per_iteration": 2.497516632080078 }, { "auxiliary_loss_clip": 0.01102709, "auxiliary_loss_mlp": 0.01030243, "balance_loss_clip": 1.01872134, "balance_loss_mlp": 1.03471911, "epoch": 0.8300015030813167, "flos": 29679199683840.0, "grad_norm": 1.7016578678746965, "language_loss": 0.72798288, "learning_rate": 2.954781319115016e-07, "loss": 0.7493124, "num_input_tokens_seen": 297810425, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 13805, "time_per_iteration": 2.5133566856384277 }, { "auxiliary_loss_clip": 0.01106472, "auxiliary_loss_mlp": 0.01029537, "balance_loss_clip": 1.01735353, "balance_loss_mlp": 1.0359726, "epoch": 0.8300616263339846, "flos": 19719483436800.0, "grad_norm": 2.24964934106526, "language_loss": 0.77900934, "learning_rate": 2.952744302396906e-07, "loss": 0.80036938, "num_input_tokens_seen": 297827680, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.70703125, "step": 13806, "time_per_iteration": 2.453101873397827 }, { "auxiliary_loss_clip": 0.01108279, "auxiliary_loss_mlp": 0.01033067, "balance_loss_clip": 1.02025235, "balance_loss_mlp": 1.03676498, "epoch": 0.8301217495866526, "flos": 19901945548800.0, "grad_norm": 2.271308650258576, "language_loss": 0.63702899, "learning_rate": 2.950707932112444e-07, "loss": 0.65844244, "num_input_tokens_seen": 297848005, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 13807, "time_per_iteration": 2.4844448566436768 }, { "auxiliary_loss_clip": 0.01105415, "auxiliary_loss_mlp": 0.01028587, "balance_loss_clip": 1.01637411, "balance_loss_mlp": 1.03692842, "epoch": 0.8301818728393207, "flos": 19715784336000.0, "grad_norm": 1.6930384701761314, "language_loss": 0.72714794, "learning_rate": 2.948672208338847e-07, "loss": 0.74848801, "num_input_tokens_seen": 297866730, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 13808, "time_per_iteration": 2.44893217086792 }, { "auxiliary_loss_clip": 0.01113308, "auxiliary_loss_mlp": 0.01036543, "balance_loss_clip": 1.02296531, "balance_loss_mlp": 1.03988969, "epoch": 0.8302419960919886, "flos": 28293658416000.0, "grad_norm": 1.802439468342804, "language_loss": 0.66585439, "learning_rate": 2.9466371311533046e-07, "loss": 0.6873529, "num_input_tokens_seen": 297886390, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 13809, "time_per_iteration": 2.538355588912964 }, { "auxiliary_loss_clip": 0.01104688, "auxiliary_loss_mlp": 0.01026458, "balance_loss_clip": 1.01459026, "balance_loss_mlp": 1.03482413, "epoch": 0.8303021193446566, "flos": 18223444955520.0, "grad_norm": 3.71713102781265, "language_loss": 0.74321365, "learning_rate": 2.9446027006329896e-07, "loss": 0.76452506, "num_input_tokens_seen": 297905110, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69921875, "step": 13810, "time_per_iteration": 2.4519107341766357 }, { "auxiliary_loss_clip": 0.0110204, "auxiliary_loss_mlp": 0.01035538, "balance_loss_clip": 1.02405834, "balance_loss_mlp": 1.03562427, "epoch": 0.8303622425973245, "flos": 23111425578240.0, "grad_norm": 1.8419834683588914, "language_loss": 0.81017512, "learning_rate": 2.94256891685505e-07, "loss": 0.8315509, "num_input_tokens_seen": 297925460, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6640625, "step": 13811, "time_per_iteration": 2.472790241241455 }, { "auxiliary_loss_clip": 0.01107261, "auxiliary_loss_mlp": 0.01039537, "balance_loss_clip": 1.02725875, "balance_loss_mlp": 1.03694868, "epoch": 0.8304223658499925, "flos": 19572860119680.0, "grad_norm": 2.9124976768778907, "language_loss": 0.73377347, "learning_rate": 2.9405357798966156e-07, "loss": 0.75524151, "num_input_tokens_seen": 297941760, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 13812, "time_per_iteration": 2.4268219470977783 }, { "auxiliary_loss_clip": 0.01103515, "auxiliary_loss_mlp": 0.01030461, "balance_loss_clip": 1.01828945, "balance_loss_mlp": 1.03678596, "epoch": 0.8304824891026604, "flos": 24426115269120.0, "grad_norm": 1.6478231034737465, "language_loss": 0.78158367, "learning_rate": 2.9385032898347664e-07, "loss": 0.80292338, "num_input_tokens_seen": 297959745, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6640625, "step": 13813, "time_per_iteration": 2.4697790145874023 }, { "auxiliary_loss_clip": 0.01105472, "auxiliary_loss_mlp": 0.01025646, "balance_loss_clip": 1.01318288, "balance_loss_mlp": 1.03433049, "epoch": 0.8305426123553284, "flos": 22381792611840.0, "grad_norm": 2.437025197821283, "language_loss": 0.70725113, "learning_rate": 2.93647144674658e-07, "loss": 0.72856236, "num_input_tokens_seen": 297977665, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7109375, "step": 13814, "time_per_iteration": 2.457576274871826 }, { "auxiliary_loss_clip": 0.01111548, "auxiliary_loss_mlp": 0.01042539, "balance_loss_clip": 1.02834082, "balance_loss_mlp": 1.03604794, "epoch": 0.8306027356079964, "flos": 14903575453440.0, "grad_norm": 2.092501817683436, "language_loss": 0.68072069, "learning_rate": 2.9344402507091116e-07, "loss": 0.70226157, "num_input_tokens_seen": 297993525, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.75390625, "step": 13815, "time_per_iteration": 2.3991456031799316 }, { "auxiliary_loss_clip": 0.01105972, "auxiliary_loss_mlp": 0.01032286, "balance_loss_clip": 1.02016211, "balance_loss_mlp": 1.03700805, "epoch": 0.8306628588606644, "flos": 19644573623040.0, "grad_norm": 1.9720382226640136, "language_loss": 0.76013052, "learning_rate": 2.9324097017993745e-07, "loss": 0.78151309, "num_input_tokens_seen": 298012920, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 13816, "time_per_iteration": 2.4379355907440186 }, { "auxiliary_loss_clip": 0.01104156, "auxiliary_loss_mlp": 0.01032651, "balance_loss_clip": 1.02142131, "balance_loss_mlp": 1.03570151, "epoch": 0.8307229821133323, "flos": 24389737770240.0, "grad_norm": 1.71124778588649, "language_loss": 0.81478679, "learning_rate": 2.930379800094371e-07, "loss": 0.83615482, "num_input_tokens_seen": 298033310, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.68359375, "step": 13817, "time_per_iteration": 2.469588041305542 }, { "auxiliary_loss_clip": 0.01107834, "auxiliary_loss_mlp": 0.01035911, "balance_loss_clip": 1.02255964, "balance_loss_mlp": 1.03649628, "epoch": 0.8307831053660003, "flos": 20996933702400.0, "grad_norm": 1.5820000394659701, "language_loss": 0.78074837, "learning_rate": 2.9283505456710875e-07, "loss": 0.80218577, "num_input_tokens_seen": 298053530, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 13818, "time_per_iteration": 2.46014404296875 }, { "auxiliary_loss_clip": 0.01107484, "auxiliary_loss_mlp": 0.01037011, "balance_loss_clip": 1.02442265, "balance_loss_mlp": 1.03813517, "epoch": 0.8308432286186682, "flos": 21397301671680.0, "grad_norm": 2.2863566903800585, "language_loss": 0.8208698, "learning_rate": 2.926321938606453e-07, "loss": 0.84231478, "num_input_tokens_seen": 298069305, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 13819, "time_per_iteration": 2.4535274505615234 }, { "auxiliary_loss_clip": 0.01029498, "auxiliary_loss_mlp": 0.01001081, "balance_loss_clip": 0.99998468, "balance_loss_mlp": 1.00725853, "epoch": 0.8309033518713362, "flos": 62533656714240.0, "grad_norm": 0.7579226095652131, "language_loss": 0.5623166, "learning_rate": 2.924293978977399e-07, "loss": 0.58262241, "num_input_tokens_seen": 298125830, "router_z_loss_clip": 0.01098633, "router_z_loss_mlp": 0.22265625, "step": 13820, "time_per_iteration": 3.0884432792663574 }, { "auxiliary_loss_clip": 0.01099062, "auxiliary_loss_mlp": 0.0102652, "balance_loss_clip": 1.01478362, "balance_loss_mlp": 1.03343701, "epoch": 0.8309634751240043, "flos": 16979104051200.0, "grad_norm": 2.1117411294990176, "language_loss": 0.68234909, "learning_rate": 2.922266666860831e-07, "loss": 0.70360488, "num_input_tokens_seen": 298142320, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.65625, "step": 13821, "time_per_iteration": 2.412390947341919 }, { "auxiliary_loss_clip": 0.01107266, "auxiliary_loss_mlp": 0.01032425, "balance_loss_clip": 1.01994979, "balance_loss_mlp": 1.03513026, "epoch": 0.8310235983766722, "flos": 22674464628480.0, "grad_norm": 3.124179923448393, "language_loss": 0.68990767, "learning_rate": 2.920240002333625e-07, "loss": 0.71130455, "num_input_tokens_seen": 298161845, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 13822, "time_per_iteration": 2.4843902587890625 }, { "auxiliary_loss_clip": 0.01102136, "auxiliary_loss_mlp": 0.01032113, "balance_loss_clip": 1.02043617, "balance_loss_mlp": 1.03558373, "epoch": 0.8310837216293402, "flos": 30811463176320.0, "grad_norm": 2.2499434437570716, "language_loss": 0.61896086, "learning_rate": 2.918213985472631e-07, "loss": 0.64030337, "num_input_tokens_seen": 298184165, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6640625, "step": 13823, "time_per_iteration": 2.5306594371795654 }, { "auxiliary_loss_clip": 0.01029653, "auxiliary_loss_mlp": 0.01001204, "balance_loss_clip": 1.00014865, "balance_loss_mlp": 1.00748551, "epoch": 0.8311438448820081, "flos": 71276074997760.0, "grad_norm": 0.8720091439816209, "language_loss": 0.61902642, "learning_rate": 2.916188616354669e-07, "loss": 0.63933498, "num_input_tokens_seen": 298251720, "router_z_loss_clip": 0.01055908, "router_z_loss_mlp": 0.22167969, "step": 13824, "time_per_iteration": 3.1804721355438232 }, { "auxiliary_loss_clip": 0.01102978, "auxiliary_loss_mlp": 0.01029468, "balance_loss_clip": 1.01752949, "balance_loss_mlp": 1.03443766, "epoch": 0.8312039681346761, "flos": 20887082933760.0, "grad_norm": 1.6329171328541654, "language_loss": 0.74430144, "learning_rate": 2.914163895056552e-07, "loss": 0.76562595, "num_input_tokens_seen": 298271910, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 13825, "time_per_iteration": 2.478454351425171 }, { "auxiliary_loss_clip": 0.01104908, "auxiliary_loss_mlp": 0.0103261, "balance_loss_clip": 1.02007508, "balance_loss_mlp": 1.03427672, "epoch": 0.831264091387344, "flos": 17017528625280.0, "grad_norm": 2.0126402578537212, "language_loss": 0.79975837, "learning_rate": 2.9121398216550486e-07, "loss": 0.82113355, "num_input_tokens_seen": 298288105, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 13826, "time_per_iteration": 2.410717725753784 }, { "auxiliary_loss_clip": 0.01103682, "auxiliary_loss_mlp": 0.01030886, "balance_loss_clip": 1.01824403, "balance_loss_mlp": 1.03431988, "epoch": 0.831324214640012, "flos": 24419578993920.0, "grad_norm": 1.7758227559377364, "language_loss": 0.68082702, "learning_rate": 2.910116396226914e-07, "loss": 0.70217264, "num_input_tokens_seen": 298307600, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 13827, "time_per_iteration": 2.5036721229553223 }, { "auxiliary_loss_clip": 0.01102676, "auxiliary_loss_mlp": 0.01030914, "balance_loss_clip": 1.01967251, "balance_loss_mlp": 1.03403878, "epoch": 0.83138433789268, "flos": 13545576938880.0, "grad_norm": 1.9047020883153836, "language_loss": 0.74420166, "learning_rate": 2.9080936188488834e-07, "loss": 0.7655375, "num_input_tokens_seen": 298323055, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6875, "step": 13828, "time_per_iteration": 2.4100730419158936 }, { "auxiliary_loss_clip": 0.01103294, "auxiliary_loss_mlp": 0.01031923, "balance_loss_clip": 1.01968598, "balance_loss_mlp": 1.03315926, "epoch": 0.831444461145348, "flos": 44492386561920.0, "grad_norm": 1.5179316228183606, "language_loss": 0.66754389, "learning_rate": 2.906071489597657e-07, "loss": 0.68889612, "num_input_tokens_seen": 298346950, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 13829, "time_per_iteration": 2.6603968143463135 }, { "auxiliary_loss_clip": 0.01106664, "auxiliary_loss_mlp": 0.01029206, "balance_loss_clip": 1.01656389, "balance_loss_mlp": 1.03508759, "epoch": 0.8315045843980159, "flos": 22705024124160.0, "grad_norm": 4.944369779311727, "language_loss": 0.82660818, "learning_rate": 2.9040500085499054e-07, "loss": 0.84796679, "num_input_tokens_seen": 298366315, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 13830, "time_per_iteration": 2.4774630069732666 }, { "auxiliary_loss_clip": 0.01104349, "auxiliary_loss_mlp": 0.01031943, "balance_loss_clip": 1.01977801, "balance_loss_mlp": 1.03551078, "epoch": 0.8315647076506839, "flos": 16873491087360.0, "grad_norm": 2.152552103628287, "language_loss": 0.74270701, "learning_rate": 2.9020291757822925e-07, "loss": 0.76406991, "num_input_tokens_seen": 298385185, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 13831, "time_per_iteration": 2.4784576892852783 }, { "auxiliary_loss_clip": 0.01105638, "auxiliary_loss_mlp": 0.01033464, "balance_loss_clip": 1.02058339, "balance_loss_mlp": 1.03616214, "epoch": 0.8316248309033518, "flos": 13808730954240.0, "grad_norm": 1.8577827369008437, "language_loss": 0.713866, "learning_rate": 2.9000089913714523e-07, "loss": 0.73525697, "num_input_tokens_seen": 298402335, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 13832, "time_per_iteration": 2.4226553440093994 }, { "auxiliary_loss_clip": 0.0110261, "auxiliary_loss_mlp": 0.01030616, "balance_loss_clip": 1.01833773, "balance_loss_mlp": 1.03387868, "epoch": 0.8316849541560198, "flos": 23512511819520.0, "grad_norm": 1.8509293506079316, "language_loss": 0.84471929, "learning_rate": 2.897989455393979e-07, "loss": 0.86605161, "num_input_tokens_seen": 298423370, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 13833, "time_per_iteration": 2.509355306625366 }, { "auxiliary_loss_clip": 0.01105613, "auxiliary_loss_mlp": 0.01035914, "balance_loss_clip": 1.02311647, "balance_loss_mlp": 1.03528953, "epoch": 0.8317450774086879, "flos": 23771356202880.0, "grad_norm": 1.861563518724991, "language_loss": 0.76205581, "learning_rate": 2.8959705679264625e-07, "loss": 0.78347111, "num_input_tokens_seen": 298444835, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 13834, "time_per_iteration": 4.059255599975586 }, { "auxiliary_loss_clip": 0.0110013, "auxiliary_loss_mlp": 0.01030051, "balance_loss_clip": 1.01834476, "balance_loss_mlp": 1.03356242, "epoch": 0.8318052006613558, "flos": 16215535710720.0, "grad_norm": 2.0311450208215893, "language_loss": 0.79874855, "learning_rate": 2.893952329045459e-07, "loss": 0.82005036, "num_input_tokens_seen": 298461845, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.66796875, "step": 13835, "time_per_iteration": 2.427506923675537 }, { "auxiliary_loss_clip": 0.01108985, "auxiliary_loss_mlp": 0.01036072, "balance_loss_clip": 1.0220654, "balance_loss_mlp": 1.03705466, "epoch": 0.8318653239140238, "flos": 19974556892160.0, "grad_norm": 2.0137337515229725, "language_loss": 0.80971789, "learning_rate": 2.8919347388274905e-07, "loss": 0.83116841, "num_input_tokens_seen": 298479095, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71875, "step": 13836, "time_per_iteration": 3.919186592102051 }, { "auxiliary_loss_clip": 0.01101855, "auxiliary_loss_mlp": 0.01028933, "balance_loss_clip": 1.01720905, "balance_loss_mlp": 1.03453827, "epoch": 0.8319254471666917, "flos": 17704714694400.0, "grad_norm": 2.2234521190186913, "language_loss": 0.77919066, "learning_rate": 2.8899177973490727e-07, "loss": 0.80049855, "num_input_tokens_seen": 298494475, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 13837, "time_per_iteration": 2.3894429206848145 }, { "auxiliary_loss_clip": 0.01107812, "auxiliary_loss_mlp": 0.01030842, "balance_loss_clip": 1.01748466, "balance_loss_mlp": 1.03498268, "epoch": 0.8319855704193597, "flos": 19536554448000.0, "grad_norm": 2.100571168791017, "language_loss": 0.83456135, "learning_rate": 2.887901504686685e-07, "loss": 0.85594791, "num_input_tokens_seen": 298513185, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 13838, "time_per_iteration": 3.8177425861358643 }, { "auxiliary_loss_clip": 0.0110234, "auxiliary_loss_mlp": 0.0103279, "balance_loss_clip": 1.02009976, "balance_loss_mlp": 1.03503466, "epoch": 0.8320456936720276, "flos": 21178067011200.0, "grad_norm": 2.3833710123901835, "language_loss": 0.74552643, "learning_rate": 2.885885860916795e-07, "loss": 0.76687777, "num_input_tokens_seen": 298531885, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.671875, "step": 13839, "time_per_iteration": 3.933342933654785 }, { "auxiliary_loss_clip": 0.0110542, "auxiliary_loss_mlp": 0.01027689, "balance_loss_clip": 1.01572609, "balance_loss_mlp": 1.03666461, "epoch": 0.8321058169246957, "flos": 33250874503680.0, "grad_norm": 1.6419638355395247, "language_loss": 0.67695725, "learning_rate": 2.8838708661158253e-07, "loss": 0.69828832, "num_input_tokens_seen": 298554905, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 13840, "time_per_iteration": 2.537156820297241 }, { "auxiliary_loss_clip": 0.01102158, "auxiliary_loss_mlp": 0.01030768, "balance_loss_clip": 1.01860309, "balance_loss_mlp": 1.03300607, "epoch": 0.8321659401773636, "flos": 14208129256320.0, "grad_norm": 2.065554999072171, "language_loss": 0.79205108, "learning_rate": 2.8818565203601843e-07, "loss": 0.81338036, "num_input_tokens_seen": 298571185, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69140625, "step": 13841, "time_per_iteration": 2.423800468444824 }, { "auxiliary_loss_clip": 0.01102974, "auxiliary_loss_mlp": 0.01029942, "balance_loss_clip": 1.0176928, "balance_loss_mlp": 1.03541934, "epoch": 0.8322260634300316, "flos": 15158253859200.0, "grad_norm": 1.832054061295971, "language_loss": 0.68359816, "learning_rate": 2.879842823726262e-07, "loss": 0.70492733, "num_input_tokens_seen": 298588505, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.67578125, "step": 13842, "time_per_iteration": 2.411827564239502 }, { "auxiliary_loss_clip": 0.01104656, "auxiliary_loss_mlp": 0.01028009, "balance_loss_clip": 1.01507497, "balance_loss_mlp": 1.03641844, "epoch": 0.8322861866826995, "flos": 25300827267840.0, "grad_norm": 1.6245470722312596, "language_loss": 0.73196018, "learning_rate": 2.8778297762904124e-07, "loss": 0.75328684, "num_input_tokens_seen": 298609295, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.68359375, "step": 13843, "time_per_iteration": 2.5166258811950684 }, { "auxiliary_loss_clip": 0.01104732, "auxiliary_loss_mlp": 0.0103159, "balance_loss_clip": 1.01916862, "balance_loss_mlp": 1.03722143, "epoch": 0.8323463099353675, "flos": 17019360218880.0, "grad_norm": 1.8665170109894136, "language_loss": 0.77495784, "learning_rate": 2.875817378128975e-07, "loss": 0.79632103, "num_input_tokens_seen": 298625765, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.67578125, "step": 13844, "time_per_iteration": 2.4647934436798096 }, { "auxiliary_loss_clip": 0.01029071, "auxiliary_loss_mlp": 0.01003984, "balance_loss_clip": 1.00291753, "balance_loss_mlp": 1.00693321, "epoch": 0.8324064331880354, "flos": 55607889709440.0, "grad_norm": 0.7767813376766495, "language_loss": 0.55261254, "learning_rate": 2.8738056293182624e-07, "loss": 0.57294309, "num_input_tokens_seen": 298683005, "router_z_loss_clip": 0.01068115, "router_z_loss_mlp": 0.22070312, "step": 13845, "time_per_iteration": 3.032747983932495 }, { "auxiliary_loss_clip": 0.01107882, "auxiliary_loss_mlp": 0.01044592, "balance_loss_clip": 1.0314132, "balance_loss_mlp": 1.03691888, "epoch": 0.8324665564407034, "flos": 26138623063680.0, "grad_norm": 1.8472839583404186, "language_loss": 0.7539627, "learning_rate": 2.871794529934555e-07, "loss": 0.77548742, "num_input_tokens_seen": 298703060, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 13846, "time_per_iteration": 2.5059943199157715 }, { "auxiliary_loss_clip": 0.01104792, "auxiliary_loss_mlp": 0.01027908, "balance_loss_clip": 1.01458621, "balance_loss_mlp": 1.03337121, "epoch": 0.8325266796933715, "flos": 22049187649920.0, "grad_norm": 1.725765299769976, "language_loss": 0.78605568, "learning_rate": 2.8697840800541115e-07, "loss": 0.8073827, "num_input_tokens_seen": 298721765, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 13847, "time_per_iteration": 2.502948760986328 }, { "auxiliary_loss_clip": 0.01101066, "auxiliary_loss_mlp": 0.01029017, "balance_loss_clip": 1.01739979, "balance_loss_mlp": 1.03389585, "epoch": 0.8325868029460394, "flos": 22816634659200.0, "grad_norm": 1.8068624910281075, "language_loss": 0.74454314, "learning_rate": 2.867774279753175e-07, "loss": 0.76584399, "num_input_tokens_seen": 298740825, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 13848, "time_per_iteration": 2.5232672691345215 }, { "auxiliary_loss_clip": 0.01104642, "auxiliary_loss_mlp": 0.01028176, "balance_loss_clip": 1.01612961, "balance_loss_mlp": 1.0362941, "epoch": 0.8326469261987074, "flos": 14757454926720.0, "grad_norm": 8.006426198983222, "language_loss": 0.63772923, "learning_rate": 2.8657651291079554e-07, "loss": 0.65905744, "num_input_tokens_seen": 298758515, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 13849, "time_per_iteration": 2.4398374557495117 }, { "auxiliary_loss_clip": 0.01105441, "auxiliary_loss_mlp": 0.01030534, "balance_loss_clip": 1.0186367, "balance_loss_mlp": 1.03483891, "epoch": 0.8327070494513753, "flos": 22926126291840.0, "grad_norm": 2.5397532571346013, "language_loss": 0.79486054, "learning_rate": 2.863756628194638e-07, "loss": 0.81622028, "num_input_tokens_seen": 298776375, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.70703125, "step": 13850, "time_per_iteration": 2.442744493484497 }, { "auxiliary_loss_clip": 0.01100273, "auxiliary_loss_mlp": 0.01033243, "balance_loss_clip": 1.0221858, "balance_loss_mlp": 1.03427708, "epoch": 0.8327671727040433, "flos": 20665334321280.0, "grad_norm": 1.6328338765745896, "language_loss": 0.78216881, "learning_rate": 2.8617487770893877e-07, "loss": 0.80350399, "num_input_tokens_seen": 298795135, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.65625, "step": 13851, "time_per_iteration": 2.45609188079834 }, { "auxiliary_loss_clip": 0.01029148, "auxiliary_loss_mlp": 0.01003501, "balance_loss_clip": 1.00250518, "balance_loss_mlp": 1.00715041, "epoch": 0.8328272959567112, "flos": 56060760384000.0, "grad_norm": 0.7707929358020051, "language_loss": 0.55787921, "learning_rate": 2.859741575868344e-07, "loss": 0.57820559, "num_input_tokens_seen": 298855475, "router_z_loss_clip": 0.00994873, "router_z_loss_mlp": 0.21972656, "step": 13852, "time_per_iteration": 3.059391736984253 }, { "auxiliary_loss_clip": 0.01101567, "auxiliary_loss_mlp": 0.01027593, "balance_loss_clip": 1.01553464, "balance_loss_mlp": 1.03476882, "epoch": 0.8328874192093793, "flos": 32303084284800.0, "grad_norm": 2.3221835700723923, "language_loss": 0.67323601, "learning_rate": 2.8577350246076125e-07, "loss": 0.69452763, "num_input_tokens_seen": 298875875, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.66796875, "step": 13853, "time_per_iteration": 2.548790693283081 }, { "auxiliary_loss_clip": 0.01105208, "auxiliary_loss_mlp": 0.01031704, "balance_loss_clip": 1.01992035, "balance_loss_mlp": 1.03683865, "epoch": 0.8329475424620472, "flos": 23512691387520.0, "grad_norm": 1.6282100425435357, "language_loss": 0.77897543, "learning_rate": 2.855729123383286e-07, "loss": 0.80034459, "num_input_tokens_seen": 298895950, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 13854, "time_per_iteration": 2.461348295211792 }, { "auxiliary_loss_clip": 0.01029316, "auxiliary_loss_mlp": 0.01001921, "balance_loss_clip": 1.0009253, "balance_loss_mlp": 1.00711429, "epoch": 0.8330076657147152, "flos": 67840680378240.0, "grad_norm": 0.7772179563309609, "language_loss": 0.58637881, "learning_rate": 2.8537238722714295e-07, "loss": 0.60669112, "num_input_tokens_seen": 298955770, "router_z_loss_clip": 0.00994873, "router_z_loss_mlp": 0.22265625, "step": 13855, "time_per_iteration": 2.97377610206604 }, { "auxiliary_loss_clip": 0.01102791, "auxiliary_loss_mlp": 0.01027647, "balance_loss_clip": 1.01542234, "balance_loss_mlp": 1.03525853, "epoch": 0.8330677889673831, "flos": 22892801448960.0, "grad_norm": 4.102388468517432, "language_loss": 0.71809578, "learning_rate": 2.8517192713480853e-07, "loss": 0.73940021, "num_input_tokens_seen": 298976545, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 13856, "time_per_iteration": 2.4689505100250244 }, { "auxiliary_loss_clip": 0.01104097, "auxiliary_loss_mlp": 0.01027901, "balance_loss_clip": 1.01569963, "balance_loss_mlp": 1.03569627, "epoch": 0.8331279122200511, "flos": 27345042184320.0, "grad_norm": 1.536032213991273, "language_loss": 0.75657642, "learning_rate": 2.8497153206892677e-07, "loss": 0.7778964, "num_input_tokens_seen": 298996750, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 13857, "time_per_iteration": 2.5148186683654785 }, { "auxiliary_loss_clip": 0.01100552, "auxiliary_loss_mlp": 0.01025666, "balance_loss_clip": 1.01463938, "balance_loss_mlp": 1.0357852, "epoch": 0.833188035472719, "flos": 19938179393280.0, "grad_norm": 3.8479098128887492, "language_loss": 0.73173386, "learning_rate": 2.847712020370958e-07, "loss": 0.75299603, "num_input_tokens_seen": 299014895, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6484375, "step": 13858, "time_per_iteration": 2.4476699829101562 }, { "auxiliary_loss_clip": 0.01106642, "auxiliary_loss_mlp": 0.01033344, "balance_loss_clip": 1.02038574, "balance_loss_mlp": 1.03413844, "epoch": 0.833248158725387, "flos": 15232624968960.0, "grad_norm": 1.9193736408095647, "language_loss": 0.73197424, "learning_rate": 2.8457093704691316e-07, "loss": 0.75337404, "num_input_tokens_seen": 299032855, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 13859, "time_per_iteration": 2.5089786052703857 }, { "auxiliary_loss_clip": 0.01098204, "auxiliary_loss_mlp": 0.01025219, "balance_loss_clip": 1.01412678, "balance_loss_mlp": 1.03333449, "epoch": 0.8333082819780551, "flos": 24535535074560.0, "grad_norm": 3.0760412023651273, "language_loss": 0.79293931, "learning_rate": 2.8437073710597205e-07, "loss": 0.81417358, "num_input_tokens_seen": 299052055, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.6484375, "step": 13860, "time_per_iteration": 2.4628522396087646 }, { "auxiliary_loss_clip": 0.01100814, "auxiliary_loss_mlp": 0.01028802, "balance_loss_clip": 1.01663029, "balance_loss_mlp": 1.03421164, "epoch": 0.833368405230723, "flos": 31467407391360.0, "grad_norm": 1.5021730308876353, "language_loss": 0.82175642, "learning_rate": 2.841706022218644e-07, "loss": 0.84305263, "num_input_tokens_seen": 299075285, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.66796875, "step": 13861, "time_per_iteration": 2.5337677001953125 }, { "auxiliary_loss_clip": 0.01106928, "auxiliary_loss_mlp": 0.01030657, "balance_loss_clip": 1.01815236, "balance_loss_mlp": 1.03781009, "epoch": 0.833428528483391, "flos": 14902713527040.0, "grad_norm": 2.1719149580858113, "language_loss": 0.78962517, "learning_rate": 2.839705324021806e-07, "loss": 0.811001, "num_input_tokens_seen": 299092520, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 13862, "time_per_iteration": 2.414680242538452 }, { "auxiliary_loss_clip": 0.01103729, "auxiliary_loss_mlp": 0.01031877, "balance_loss_clip": 1.01950955, "balance_loss_mlp": 1.03392029, "epoch": 0.8334886517360589, "flos": 22199833290240.0, "grad_norm": 2.3816553905895423, "language_loss": 0.74646914, "learning_rate": 2.83770527654505e-07, "loss": 0.76782513, "num_input_tokens_seen": 299109450, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 13863, "time_per_iteration": 2.4514825344085693 }, { "auxiliary_loss_clip": 0.01101504, "auxiliary_loss_mlp": 0.01028722, "balance_loss_clip": 1.01733756, "balance_loss_mlp": 1.03528309, "epoch": 0.8335487749887269, "flos": 30372562892160.0, "grad_norm": 2.1407090518079466, "language_loss": 0.75081891, "learning_rate": 2.835705879864232e-07, "loss": 0.77212113, "num_input_tokens_seen": 299129540, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6640625, "step": 13864, "time_per_iteration": 2.5048258304595947 }, { "auxiliary_loss_clip": 0.01103616, "auxiliary_loss_mlp": 0.01033427, "balance_loss_clip": 1.02037382, "balance_loss_mlp": 1.03420162, "epoch": 0.8336088982413948, "flos": 24681152810880.0, "grad_norm": 2.3652636404666287, "language_loss": 0.69034111, "learning_rate": 2.833707134055168e-07, "loss": 0.71171153, "num_input_tokens_seen": 299148670, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 13865, "time_per_iteration": 2.5039570331573486 }, { "auxiliary_loss_clip": 0.0110475, "auxiliary_loss_mlp": 0.01033539, "balance_loss_clip": 1.02120674, "balance_loss_mlp": 1.03590035, "epoch": 0.8336690214940629, "flos": 38177207873280.0, "grad_norm": 1.7368499686235492, "language_loss": 0.75467789, "learning_rate": 2.831709039193653e-07, "loss": 0.77606082, "num_input_tokens_seen": 299169330, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 13866, "time_per_iteration": 2.5829029083251953 }, { "auxiliary_loss_clip": 0.0102931, "auxiliary_loss_mlp": 0.00999273, "balance_loss_clip": 0.99811709, "balance_loss_mlp": 1.00693464, "epoch": 0.8337291447467308, "flos": 55565119589760.0, "grad_norm": 0.8769456642489802, "language_loss": 0.63054937, "learning_rate": 2.8297115953554465e-07, "loss": 0.65083522, "num_input_tokens_seen": 299220980, "router_z_loss_clip": 0.01153564, "router_z_loss_mlp": 0.22460938, "step": 13867, "time_per_iteration": 3.017237901687622 }, { "auxiliary_loss_clip": 0.01100364, "auxiliary_loss_mlp": 0.01030677, "balance_loss_clip": 1.01920283, "balance_loss_mlp": 1.03398061, "epoch": 0.8337892679993988, "flos": 24133550993280.0, "grad_norm": 1.6606505821446564, "language_loss": 0.71941376, "learning_rate": 2.827714802616301e-07, "loss": 0.74072415, "num_input_tokens_seen": 299240130, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6640625, "step": 13868, "time_per_iteration": 2.5466315746307373 }, { "auxiliary_loss_clip": 0.01105214, "auxiliary_loss_mlp": 0.01031352, "balance_loss_clip": 1.01854277, "balance_loss_mlp": 1.03727686, "epoch": 0.8338493912520667, "flos": 28183915388160.0, "grad_norm": 1.52359275529393, "language_loss": 0.80253768, "learning_rate": 2.8257186610519325e-07, "loss": 0.82390344, "num_input_tokens_seen": 299260705, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6796875, "step": 13869, "time_per_iteration": 2.494863510131836 }, { "auxiliary_loss_clip": 0.01106223, "auxiliary_loss_mlp": 0.0103239, "balance_loss_clip": 1.0200696, "balance_loss_mlp": 1.0371449, "epoch": 0.8339095145047347, "flos": 22158356060160.0, "grad_norm": 1.5783910975100452, "language_loss": 0.82571894, "learning_rate": 2.823723170738028e-07, "loss": 0.84710503, "num_input_tokens_seen": 299278925, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 13870, "time_per_iteration": 2.5163416862487793 }, { "auxiliary_loss_clip": 0.01104408, "auxiliary_loss_mlp": 0.01025937, "balance_loss_clip": 1.01291966, "balance_loss_mlp": 1.03338814, "epoch": 0.8339696377574026, "flos": 17307112072320.0, "grad_norm": 2.6817247080801554, "language_loss": 0.70384008, "learning_rate": 2.821728331750264e-07, "loss": 0.72514355, "num_input_tokens_seen": 299291580, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 13871, "time_per_iteration": 2.4317362308502197 }, { "auxiliary_loss_clip": 0.01102709, "auxiliary_loss_mlp": 0.01031052, "balance_loss_clip": 1.01957846, "balance_loss_mlp": 1.03594637, "epoch": 0.8340297610100706, "flos": 20668351063680.0, "grad_norm": 1.9266874065994535, "language_loss": 0.69301623, "learning_rate": 2.8197341441642853e-07, "loss": 0.71435386, "num_input_tokens_seen": 299310385, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.66796875, "step": 13872, "time_per_iteration": 2.467156410217285 }, { "auxiliary_loss_clip": 0.01103676, "auxiliary_loss_mlp": 0.010269, "balance_loss_clip": 1.01509869, "balance_loss_mlp": 1.03481364, "epoch": 0.8340898842627387, "flos": 20515442866560.0, "grad_norm": 2.259499413040007, "language_loss": 0.73507744, "learning_rate": 2.817740608055712e-07, "loss": 0.75638318, "num_input_tokens_seen": 299327660, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 13873, "time_per_iteration": 2.42974853515625 }, { "auxiliary_loss_clip": 0.01106112, "auxiliary_loss_mlp": 0.01032244, "balance_loss_clip": 1.01796269, "balance_loss_mlp": 1.03503239, "epoch": 0.8341500075154066, "flos": 21425850005760.0, "grad_norm": 2.2297756722632656, "language_loss": 0.75154603, "learning_rate": 2.81574772350013e-07, "loss": 0.77292955, "num_input_tokens_seen": 299343685, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.7109375, "step": 13874, "time_per_iteration": 2.4393019676208496 }, { "auxiliary_loss_clip": 0.01102048, "auxiliary_loss_mlp": 0.01024397, "balance_loss_clip": 1.01286376, "balance_loss_mlp": 1.03499126, "epoch": 0.8342101307680746, "flos": 22090988102400.0, "grad_norm": 2.032871210720099, "language_loss": 0.66580027, "learning_rate": 2.813755490573118e-07, "loss": 0.68706471, "num_input_tokens_seen": 299363305, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 13875, "time_per_iteration": 2.45859432220459 }, { "auxiliary_loss_clip": 0.01104593, "auxiliary_loss_mlp": 0.01033716, "balance_loss_clip": 1.02173507, "balance_loss_mlp": 1.03689849, "epoch": 0.8342702540207425, "flos": 21871466133120.0, "grad_norm": 1.722386032967114, "language_loss": 0.79643142, "learning_rate": 2.8117639093502243e-07, "loss": 0.81781459, "num_input_tokens_seen": 299382630, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.67578125, "step": 13876, "time_per_iteration": 3.918842315673828 }, { "auxiliary_loss_clip": 0.01102425, "auxiliary_loss_mlp": 0.01029747, "balance_loss_clip": 1.01743853, "balance_loss_mlp": 1.03501928, "epoch": 0.8343303772734105, "flos": 22528487756160.0, "grad_norm": 2.7624764457657327, "language_loss": 0.87275571, "learning_rate": 2.8097729799069615e-07, "loss": 0.89407742, "num_input_tokens_seen": 299402385, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 13877, "time_per_iteration": 2.454244613647461 }, { "auxiliary_loss_clip": 0.01102188, "auxiliary_loss_mlp": 0.01028371, "balance_loss_clip": 1.01748729, "balance_loss_mlp": 1.03419042, "epoch": 0.8343905005260784, "flos": 14939773384320.0, "grad_norm": 1.8320895004056854, "language_loss": 0.69237119, "learning_rate": 2.807782702318828e-07, "loss": 0.71367681, "num_input_tokens_seen": 299419820, "router_z_loss_clip": 0.10888672, "router_z_loss_mlp": 0.6796875, "step": 13878, "time_per_iteration": 3.790964365005493 }, { "auxiliary_loss_clip": 0.01101595, "auxiliary_loss_mlp": 0.01027126, "balance_loss_clip": 1.01562834, "balance_loss_mlp": 1.03434372, "epoch": 0.8344506237787465, "flos": 15012456554880.0, "grad_norm": 1.7586631533034394, "language_loss": 0.79763532, "learning_rate": 2.805793076661309e-07, "loss": 0.81892246, "num_input_tokens_seen": 299436265, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 13879, "time_per_iteration": 2.441715717315674 }, { "auxiliary_loss_clip": 0.0110377, "auxiliary_loss_mlp": 0.01027253, "balance_loss_clip": 1.01622581, "balance_loss_mlp": 1.03577912, "epoch": 0.8345107470314144, "flos": 17560389847680.0, "grad_norm": 2.182401250053671, "language_loss": 0.83238089, "learning_rate": 2.803804103009828e-07, "loss": 0.8536911, "num_input_tokens_seen": 299451660, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6796875, "step": 13880, "time_per_iteration": 3.8367695808410645 }, { "auxiliary_loss_clip": 0.0110701, "auxiliary_loss_mlp": 0.01030224, "balance_loss_clip": 1.0183568, "balance_loss_mlp": 1.03632474, "epoch": 0.8345708702840824, "flos": 25187277398400.0, "grad_norm": 1.6195307077134173, "language_loss": 0.78288954, "learning_rate": 2.80181578143982e-07, "loss": 0.80426192, "num_input_tokens_seen": 299472070, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.70703125, "step": 13881, "time_per_iteration": 3.8905863761901855 }, { "auxiliary_loss_clip": 0.01098244, "auxiliary_loss_mlp": 0.01022983, "balance_loss_clip": 1.01254654, "balance_loss_mlp": 1.03449225, "epoch": 0.8346309935367503, "flos": 15083559527040.0, "grad_norm": 11.91748559747255, "language_loss": 0.78404224, "learning_rate": 2.7998281120266807e-07, "loss": 0.80525458, "num_input_tokens_seen": 299486725, "router_z_loss_clip": 0.10449219, "router_z_loss_mlp": 0.63671875, "step": 13882, "time_per_iteration": 2.4242656230926514 }, { "auxiliary_loss_clip": 0.01105189, "auxiliary_loss_mlp": 0.01034837, "balance_loss_clip": 1.02257049, "balance_loss_mlp": 1.03671908, "epoch": 0.8346911167894183, "flos": 22930615491840.0, "grad_norm": 5.141937330022154, "language_loss": 0.8053807, "learning_rate": 2.79784109484579e-07, "loss": 0.82678097, "num_input_tokens_seen": 299505435, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 13883, "time_per_iteration": 2.4626455307006836 }, { "auxiliary_loss_clip": 0.01102429, "auxiliary_loss_mlp": 0.01031338, "balance_loss_clip": 1.01898217, "balance_loss_mlp": 1.03279805, "epoch": 0.8347512400420862, "flos": 20193037367040.0, "grad_norm": 2.0474595863383005, "language_loss": 0.74424767, "learning_rate": 2.795854729972482e-07, "loss": 0.7655853, "num_input_tokens_seen": 299523555, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 13884, "time_per_iteration": 2.444204807281494 }, { "auxiliary_loss_clip": 0.01113359, "auxiliary_loss_mlp": 0.01037113, "balance_loss_clip": 1.02299285, "balance_loss_mlp": 1.03831804, "epoch": 0.8348113632947542, "flos": 25954832148480.0, "grad_norm": 1.9134937019063103, "language_loss": 0.70632803, "learning_rate": 2.7938690174820913e-07, "loss": 0.72783273, "num_input_tokens_seen": 299541660, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.75, "step": 13885, "time_per_iteration": 2.476804494857788 }, { "auxiliary_loss_clip": 0.01104722, "auxiliary_loss_mlp": 0.01030296, "balance_loss_clip": 1.01790452, "balance_loss_mlp": 1.03515208, "epoch": 0.8348714865474223, "flos": 34204554552960.0, "grad_norm": 1.725238302612825, "language_loss": 0.69916701, "learning_rate": 2.791883957449912e-07, "loss": 0.72051716, "num_input_tokens_seen": 299562465, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 13886, "time_per_iteration": 2.5721027851104736 }, { "auxiliary_loss_clip": 0.01103069, "auxiliary_loss_mlp": 0.01027785, "balance_loss_clip": 1.01552474, "balance_loss_mlp": 1.03498888, "epoch": 0.8349316098000902, "flos": 24390132819840.0, "grad_norm": 1.519903926097654, "language_loss": 0.79132068, "learning_rate": 2.7898995499512134e-07, "loss": 0.81262922, "num_input_tokens_seen": 299582700, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6796875, "step": 13887, "time_per_iteration": 2.4773528575897217 }, { "auxiliary_loss_clip": 0.01109582, "auxiliary_loss_mlp": 0.01030393, "balance_loss_clip": 1.01744092, "balance_loss_mlp": 1.03688908, "epoch": 0.8349917330527582, "flos": 23032744836480.0, "grad_norm": 3.234696024047825, "language_loss": 0.63780463, "learning_rate": 2.7879157950612467e-07, "loss": 0.65920436, "num_input_tokens_seen": 299600310, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 13888, "time_per_iteration": 2.4434754848480225 }, { "auxiliary_loss_clip": 0.01105702, "auxiliary_loss_mlp": 0.01027326, "balance_loss_clip": 1.01541066, "balance_loss_mlp": 1.03462589, "epoch": 0.8350518563054261, "flos": 13625873792640.0, "grad_norm": 2.238150264428845, "language_loss": 0.66427541, "learning_rate": 2.785932692855244e-07, "loss": 0.6856057, "num_input_tokens_seen": 299617025, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.7109375, "step": 13889, "time_per_iteration": 2.4064671993255615 }, { "auxiliary_loss_clip": 0.01101553, "auxiliary_loss_mlp": 0.01025471, "balance_loss_clip": 1.01410413, "balance_loss_mlp": 1.03358126, "epoch": 0.8351119795580941, "flos": 21579799697280.0, "grad_norm": 2.0637408452235175, "language_loss": 0.68602788, "learning_rate": 2.783950243408399e-07, "loss": 0.70729816, "num_input_tokens_seen": 299633050, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6796875, "step": 13890, "time_per_iteration": 2.4377644062042236 }, { "auxiliary_loss_clip": 0.0110585, "auxiliary_loss_mlp": 0.01034295, "balance_loss_clip": 1.02141476, "balance_loss_mlp": 1.03610468, "epoch": 0.835172102810762, "flos": 20038297576320.0, "grad_norm": 3.100328484835324, "language_loss": 0.59361112, "learning_rate": 2.7819684467958817e-07, "loss": 0.61501259, "num_input_tokens_seen": 299646445, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 13891, "time_per_iteration": 2.3966550827026367 }, { "auxiliary_loss_clip": 0.01104491, "auxiliary_loss_mlp": 0.01028982, "balance_loss_clip": 1.01721585, "balance_loss_mlp": 1.03504968, "epoch": 0.8352322260634301, "flos": 25111577485440.0, "grad_norm": 1.7010422859041927, "language_loss": 0.71742976, "learning_rate": 2.779987303092846e-07, "loss": 0.73876452, "num_input_tokens_seen": 299662665, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6953125, "step": 13892, "time_per_iteration": 2.490736722946167 }, { "auxiliary_loss_clip": 0.011004, "auxiliary_loss_mlp": 0.01029906, "balance_loss_clip": 1.01750839, "balance_loss_mlp": 1.03374338, "epoch": 0.835292349316098, "flos": 24863758577280.0, "grad_norm": 1.5749994554748665, "language_loss": 0.65758228, "learning_rate": 2.7780068123744207e-07, "loss": 0.6788854, "num_input_tokens_seen": 299683585, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6640625, "step": 13893, "time_per_iteration": 2.518242835998535 }, { "auxiliary_loss_clip": 0.01100955, "auxiliary_loss_mlp": 0.01026945, "balance_loss_clip": 1.01488686, "balance_loss_mlp": 1.03216326, "epoch": 0.835352472568766, "flos": 19865568049920.0, "grad_norm": 2.0987877519898728, "language_loss": 0.78367847, "learning_rate": 2.7760269747156996e-07, "loss": 0.80495745, "num_input_tokens_seen": 299702680, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 13894, "time_per_iteration": 2.435206174850464 }, { "auxiliary_loss_clip": 0.0110073, "auxiliary_loss_mlp": 0.01027195, "balance_loss_clip": 1.01485121, "balance_loss_mlp": 1.0356214, "epoch": 0.8354125958214339, "flos": 22054754257920.0, "grad_norm": 1.8011682496566392, "language_loss": 0.72790194, "learning_rate": 2.7740477901917625e-07, "loss": 0.74918127, "num_input_tokens_seen": 299721050, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.65234375, "step": 13895, "time_per_iteration": 2.485454797744751 }, { "auxiliary_loss_clip": 0.01104794, "auxiliary_loss_mlp": 0.01039257, "balance_loss_clip": 1.02540493, "balance_loss_mlp": 1.03436804, "epoch": 0.8354727190741019, "flos": 21397804462080.0, "grad_norm": 2.1517991048167397, "language_loss": 0.71998775, "learning_rate": 2.772069258877667e-07, "loss": 0.74142826, "num_input_tokens_seen": 299738255, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.703125, "step": 13896, "time_per_iteration": 2.443349599838257 }, { "auxiliary_loss_clip": 0.01100404, "auxiliary_loss_mlp": 0.01024086, "balance_loss_clip": 1.01218319, "balance_loss_mlp": 1.03356051, "epoch": 0.8355328423267698, "flos": 50840997834240.0, "grad_norm": 3.6640860919595646, "language_loss": 0.59288883, "learning_rate": 2.770091380848423e-07, "loss": 0.61413372, "num_input_tokens_seen": 299761315, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.66796875, "step": 13897, "time_per_iteration": 2.7226474285125732 }, { "auxiliary_loss_clip": 0.01029258, "auxiliary_loss_mlp": 0.01001923, "balance_loss_clip": 1.00090337, "balance_loss_mlp": 1.00684285, "epoch": 0.8355929655794379, "flos": 65551052764800.0, "grad_norm": 0.706015911635541, "language_loss": 0.57652122, "learning_rate": 2.7681141561790423e-07, "loss": 0.59683305, "num_input_tokens_seen": 299828735, "router_z_loss_clip": 0.01019287, "router_z_loss_mlp": 0.22460938, "step": 13898, "time_per_iteration": 3.1195971965789795 }, { "auxiliary_loss_clip": 0.01105334, "auxiliary_loss_mlp": 0.01035797, "balance_loss_clip": 1.02239227, "balance_loss_mlp": 1.03476143, "epoch": 0.8356530888321058, "flos": 19170516902400.0, "grad_norm": 1.8474104335902055, "language_loss": 0.80139798, "learning_rate": 2.7661375849444967e-07, "loss": 0.82280928, "num_input_tokens_seen": 299848395, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.70703125, "step": 13899, "time_per_iteration": 2.4376986026763916 }, { "auxiliary_loss_clip": 0.01103085, "auxiliary_loss_mlp": 0.01030481, "balance_loss_clip": 1.01904869, "balance_loss_mlp": 1.03403926, "epoch": 0.8357132120847738, "flos": 44126672238720.0, "grad_norm": 2.9521415656577252, "language_loss": 0.69220102, "learning_rate": 2.764161667219749e-07, "loss": 0.71353662, "num_input_tokens_seen": 299871665, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.69140625, "step": 13900, "time_per_iteration": 2.6525380611419678 }, { "auxiliary_loss_clip": 0.01104678, "auxiliary_loss_mlp": 0.01028688, "balance_loss_clip": 1.01659417, "balance_loss_mlp": 1.03657603, "epoch": 0.8357733353374418, "flos": 24389701856640.0, "grad_norm": 1.4696072228370685, "language_loss": 0.71147269, "learning_rate": 2.762186403079716e-07, "loss": 0.73280632, "num_input_tokens_seen": 299891960, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 13901, "time_per_iteration": 2.4740703105926514 }, { "auxiliary_loss_clip": 0.01108112, "auxiliary_loss_mlp": 0.01035217, "balance_loss_clip": 1.02244401, "balance_loss_mlp": 1.03629446, "epoch": 0.8358334585901097, "flos": 20916313626240.0, "grad_norm": 2.250275062860868, "language_loss": 0.80423701, "learning_rate": 2.7602117925992963e-07, "loss": 0.8256703, "num_input_tokens_seen": 299905070, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 13902, "time_per_iteration": 2.4344539642333984 }, { "auxiliary_loss_clip": 0.01101892, "auxiliary_loss_mlp": 0.01031493, "balance_loss_clip": 1.01963806, "balance_loss_mlp": 1.03568316, "epoch": 0.8358935818427777, "flos": 19244169740160.0, "grad_norm": 1.5398627094678694, "language_loss": 0.62554204, "learning_rate": 2.758237835853379e-07, "loss": 0.64687586, "num_input_tokens_seen": 299925130, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.66015625, "step": 13903, "time_per_iteration": 2.433626413345337 }, { "auxiliary_loss_clip": 0.01105414, "auxiliary_loss_mlp": 0.01032, "balance_loss_clip": 1.01997805, "balance_loss_mlp": 1.03639662, "epoch": 0.8359537050954456, "flos": 24134053783680.0, "grad_norm": 1.7499385721049343, "language_loss": 0.74289525, "learning_rate": 2.7562645329168054e-07, "loss": 0.76426947, "num_input_tokens_seen": 299943845, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 13904, "time_per_iteration": 2.4942078590393066 }, { "auxiliary_loss_clip": 0.01100245, "auxiliary_loss_mlp": 0.01030427, "balance_loss_clip": 1.01826143, "balance_loss_mlp": 1.03357184, "epoch": 0.8360138283481137, "flos": 16180415187840.0, "grad_norm": 1.7714612280024433, "language_loss": 0.72675967, "learning_rate": 2.7542918838644104e-07, "loss": 0.74806643, "num_input_tokens_seen": 299961620, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.66796875, "step": 13905, "time_per_iteration": 2.422384023666382 }, { "auxiliary_loss_clip": 0.01103495, "auxiliary_loss_mlp": 0.01033351, "balance_loss_clip": 1.02238417, "balance_loss_mlp": 1.03707039, "epoch": 0.8360739516007816, "flos": 22198899536640.0, "grad_norm": 2.762054337121708, "language_loss": 0.66546756, "learning_rate": 2.752319888771e-07, "loss": 0.686836, "num_input_tokens_seen": 299982170, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6640625, "step": 13906, "time_per_iteration": 2.4751477241516113 }, { "auxiliary_loss_clip": 0.0110279, "auxiliary_loss_mlp": 0.01027912, "balance_loss_clip": 1.01565766, "balance_loss_mlp": 1.03450668, "epoch": 0.8361340748534496, "flos": 20923137210240.0, "grad_norm": 1.7738742815958661, "language_loss": 0.74232763, "learning_rate": 2.7503485477113475e-07, "loss": 0.76363468, "num_input_tokens_seen": 300001330, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6796875, "step": 13907, "time_per_iteration": 2.4599289894104004 }, { "auxiliary_loss_clip": 0.01105104, "auxiliary_loss_mlp": 0.01034973, "balance_loss_clip": 1.02249753, "balance_loss_mlp": 1.03394985, "epoch": 0.8361941981061175, "flos": 26173599932160.0, "grad_norm": 1.8505093306151705, "language_loss": 0.75172824, "learning_rate": 2.7483778607602005e-07, "loss": 0.77312905, "num_input_tokens_seen": 300020645, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 13908, "time_per_iteration": 2.497190475463867 }, { "auxiliary_loss_clip": 0.0110548, "auxiliary_loss_mlp": 0.01029721, "balance_loss_clip": 1.01655388, "balance_loss_mlp": 1.03535092, "epoch": 0.8362543213587855, "flos": 24419363512320.0, "grad_norm": 2.205892392091471, "language_loss": 0.71508896, "learning_rate": 2.7464078279922964e-07, "loss": 0.73644102, "num_input_tokens_seen": 300039945, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 13909, "time_per_iteration": 2.497354507446289 }, { "auxiliary_loss_clip": 0.01106561, "auxiliary_loss_mlp": 0.0103487, "balance_loss_clip": 1.02234685, "balance_loss_mlp": 1.03499508, "epoch": 0.8363144446114534, "flos": 17202396948480.0, "grad_norm": 4.597978599220477, "language_loss": 0.73548609, "learning_rate": 2.744438449482338e-07, "loss": 0.75690037, "num_input_tokens_seen": 300058260, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 13910, "time_per_iteration": 2.436758279800415 }, { "auxiliary_loss_clip": 0.01104686, "auxiliary_loss_mlp": 0.01029693, "balance_loss_clip": 1.01814771, "balance_loss_mlp": 1.03568816, "epoch": 0.8363745678641215, "flos": 19279398003840.0, "grad_norm": 2.0011537007870817, "language_loss": 0.7363801, "learning_rate": 2.742469725305001e-07, "loss": 0.75772393, "num_input_tokens_seen": 300076720, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 13911, "time_per_iteration": 2.4175655841827393 }, { "auxiliary_loss_clip": 0.01106136, "auxiliary_loss_mlp": 0.01035769, "balance_loss_clip": 1.02344847, "balance_loss_mlp": 1.03556085, "epoch": 0.8364346911167894, "flos": 11874869596800.0, "grad_norm": 1.996988515763797, "language_loss": 0.79152966, "learning_rate": 2.740501655534946e-07, "loss": 0.8129487, "num_input_tokens_seen": 300092950, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 13912, "time_per_iteration": 2.432030200958252 }, { "auxiliary_loss_clip": 0.01105337, "auxiliary_loss_mlp": 0.0103279, "balance_loss_clip": 1.02122033, "balance_loss_mlp": 1.03659081, "epoch": 0.8364948143694574, "flos": 20225212974720.0, "grad_norm": 1.993772586832138, "language_loss": 0.78850174, "learning_rate": 2.738534240246797e-07, "loss": 0.80988306, "num_input_tokens_seen": 300110950, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6875, "step": 13913, "time_per_iteration": 2.427156448364258 }, { "auxiliary_loss_clip": 0.01102572, "auxiliary_loss_mlp": 0.01029716, "balance_loss_clip": 1.01724064, "balance_loss_mlp": 1.03329682, "epoch": 0.8365549376221254, "flos": 21612909058560.0, "grad_norm": 6.643716420615778, "language_loss": 0.73387587, "learning_rate": 2.736567479515153e-07, "loss": 0.75519878, "num_input_tokens_seen": 300128705, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 13914, "time_per_iteration": 2.468883514404297 }, { "auxiliary_loss_clip": 0.01104204, "auxiliary_loss_mlp": 0.01031175, "balance_loss_clip": 1.01853251, "balance_loss_mlp": 1.03551006, "epoch": 0.8366150608747933, "flos": 23294210912640.0, "grad_norm": 1.6863954680291917, "language_loss": 0.71171457, "learning_rate": 2.7346013734146025e-07, "loss": 0.73306835, "num_input_tokens_seen": 300148635, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 13915, "time_per_iteration": 2.4458680152893066 }, { "auxiliary_loss_clip": 0.01104404, "auxiliary_loss_mlp": 0.01031243, "balance_loss_clip": 1.0196501, "balance_loss_mlp": 1.03511906, "epoch": 0.8366751841274613, "flos": 15267673664640.0, "grad_norm": 1.9636918883370782, "language_loss": 0.72135699, "learning_rate": 2.7326359220197035e-07, "loss": 0.74271345, "num_input_tokens_seen": 300165490, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 13916, "time_per_iteration": 2.418048143386841 }, { "auxiliary_loss_clip": 0.01104139, "auxiliary_loss_mlp": 0.01026294, "balance_loss_clip": 1.01396739, "balance_loss_mlp": 1.03529346, "epoch": 0.8367353073801292, "flos": 13224931205760.0, "grad_norm": 1.865062870726077, "language_loss": 0.7491256, "learning_rate": 2.7306711254049755e-07, "loss": 0.77042991, "num_input_tokens_seen": 300182130, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 13917, "time_per_iteration": 3.6929290294647217 }, { "auxiliary_loss_clip": 0.01101463, "auxiliary_loss_mlp": 0.01032478, "balance_loss_clip": 1.02131402, "balance_loss_mlp": 1.03678107, "epoch": 0.8367954306327973, "flos": 24205084928640.0, "grad_norm": 1.8441315492987025, "language_loss": 0.79103351, "learning_rate": 2.728706983644933e-07, "loss": 0.81237292, "num_input_tokens_seen": 300203050, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6484375, "step": 13918, "time_per_iteration": 2.4903194904327393 }, { "auxiliary_loss_clip": 0.01106314, "auxiliary_loss_mlp": 0.01034894, "balance_loss_clip": 1.02240729, "balance_loss_mlp": 1.03728509, "epoch": 0.8368555538854652, "flos": 24534744975360.0, "grad_norm": 1.7198077120315647, "language_loss": 0.67973769, "learning_rate": 2.7267434968140457e-07, "loss": 0.70114982, "num_input_tokens_seen": 300224380, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 13919, "time_per_iteration": 2.47749662399292 }, { "auxiliary_loss_clip": 0.01101618, "auxiliary_loss_mlp": 0.01026108, "balance_loss_clip": 1.01417542, "balance_loss_mlp": 1.03369987, "epoch": 0.8369156771381332, "flos": 20259363830400.0, "grad_norm": 1.7740871251948445, "language_loss": 0.7337966, "learning_rate": 2.7247806649867835e-07, "loss": 0.75507385, "num_input_tokens_seen": 300242915, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 13920, "time_per_iteration": 3.8632328510284424 }, { "auxiliary_loss_clip": 0.01103834, "auxiliary_loss_mlp": 0.01030781, "balance_loss_clip": 1.01849008, "balance_loss_mlp": 1.03503692, "epoch": 0.8369758003908011, "flos": 21835555511040.0, "grad_norm": 1.6352774027047114, "language_loss": 0.68596673, "learning_rate": 2.722818488237566e-07, "loss": 0.70731282, "num_input_tokens_seen": 300261905, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 13921, "time_per_iteration": 3.834608316421509 }, { "auxiliary_loss_clip": 0.01108226, "auxiliary_loss_mlp": 0.01031611, "balance_loss_clip": 1.01953483, "balance_loss_mlp": 1.03730226, "epoch": 0.8370359236434691, "flos": 21719312121600.0, "grad_norm": 2.0693562500056504, "language_loss": 0.85468358, "learning_rate": 2.720856966640801e-07, "loss": 0.87608194, "num_input_tokens_seen": 300281145, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.70703125, "step": 13922, "time_per_iteration": 3.918208360671997 }, { "auxiliary_loss_clip": 0.01098492, "auxiliary_loss_mlp": 0.01031649, "balance_loss_clip": 1.02041304, "balance_loss_mlp": 1.03226721, "epoch": 0.837096046896137, "flos": 23148880485120.0, "grad_norm": 1.5946136888959965, "language_loss": 0.71762234, "learning_rate": 2.71889610027088e-07, "loss": 0.73892373, "num_input_tokens_seen": 300301610, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6640625, "step": 13923, "time_per_iteration": 2.4572219848632812 }, { "auxiliary_loss_clip": 0.01103019, "auxiliary_loss_mlp": 0.01031053, "balance_loss_clip": 1.01808953, "balance_loss_mlp": 1.03601933, "epoch": 0.8371561701488051, "flos": 24492872695680.0, "grad_norm": 1.9668234338888644, "language_loss": 0.76263213, "learning_rate": 2.7169358892021433e-07, "loss": 0.78397286, "num_input_tokens_seen": 300319420, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.671875, "step": 13924, "time_per_iteration": 2.4941866397857666 }, { "auxiliary_loss_clip": 0.01102832, "auxiliary_loss_mlp": 0.01028966, "balance_loss_clip": 1.01710439, "balance_loss_mlp": 1.03444552, "epoch": 0.837216293401473, "flos": 29206723161600.0, "grad_norm": 1.4974870533750082, "language_loss": 0.64694262, "learning_rate": 2.7149763335089293e-07, "loss": 0.66826057, "num_input_tokens_seen": 300341325, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.68359375, "step": 13925, "time_per_iteration": 2.5358667373657227 }, { "auxiliary_loss_clip": 0.01106682, "auxiliary_loss_mlp": 0.01030307, "balance_loss_clip": 1.01771271, "balance_loss_mlp": 1.03653169, "epoch": 0.837276416654141, "flos": 25265275781760.0, "grad_norm": 1.6454086928084444, "language_loss": 0.74518138, "learning_rate": 2.713017433265543e-07, "loss": 0.76655126, "num_input_tokens_seen": 300361620, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 13926, "time_per_iteration": 2.5499179363250732 }, { "auxiliary_loss_clip": 0.011047, "auxiliary_loss_mlp": 0.01034537, "balance_loss_clip": 1.02182937, "balance_loss_mlp": 1.03638375, "epoch": 0.837336539906809, "flos": 13882024656000.0, "grad_norm": 1.8827540173781678, "language_loss": 0.71612823, "learning_rate": 2.711059188546274e-07, "loss": 0.73752058, "num_input_tokens_seen": 300378675, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 13927, "time_per_iteration": 2.4130125045776367 }, { "auxiliary_loss_clip": 0.01029643, "auxiliary_loss_mlp": 0.01000847, "balance_loss_clip": 0.99975628, "balance_loss_mlp": 1.00727654, "epoch": 0.8373966631594769, "flos": 68870599044480.0, "grad_norm": 0.7273067850470796, "language_loss": 0.5875355, "learning_rate": 2.7091015994253695e-07, "loss": 0.60784042, "num_input_tokens_seen": 300449740, "router_z_loss_clip": 0.01092529, "router_z_loss_mlp": 0.22363281, "step": 13928, "time_per_iteration": 3.2072887420654297 }, { "auxiliary_loss_clip": 0.01107759, "auxiliary_loss_mlp": 0.01035521, "balance_loss_clip": 1.02298021, "balance_loss_mlp": 1.03850341, "epoch": 0.8374567864121449, "flos": 20448972748800.0, "grad_norm": 2.1119089323178075, "language_loss": 0.69908398, "learning_rate": 2.707144665977068e-07, "loss": 0.7205168, "num_input_tokens_seen": 300470000, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 13929, "time_per_iteration": 2.449728012084961 }, { "auxiliary_loss_clip": 0.01107177, "auxiliary_loss_mlp": 0.01027692, "balance_loss_clip": 1.01469803, "balance_loss_mlp": 1.03613365, "epoch": 0.8375169096648128, "flos": 41904197101440.0, "grad_norm": 1.7228550347907565, "language_loss": 0.6716935, "learning_rate": 2.705188388275574e-07, "loss": 0.69304216, "num_input_tokens_seen": 300494975, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 13930, "time_per_iteration": 2.6559948921203613 }, { "auxiliary_loss_clip": 0.01105187, "auxiliary_loss_mlp": 0.01028883, "balance_loss_clip": 1.0160923, "balance_loss_mlp": 1.03796363, "epoch": 0.8375770329174809, "flos": 20009354192640.0, "grad_norm": 1.5956429905271514, "language_loss": 0.71508658, "learning_rate": 2.703232766395067e-07, "loss": 0.73642731, "num_input_tokens_seen": 300513175, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.671875, "step": 13931, "time_per_iteration": 2.429687023162842 }, { "auxiliary_loss_clip": 0.01102731, "auxiliary_loss_mlp": 0.01032484, "balance_loss_clip": 1.02031863, "balance_loss_mlp": 1.03526568, "epoch": 0.8376371561701488, "flos": 22783597125120.0, "grad_norm": 2.2540127285497737, "language_loss": 0.71981382, "learning_rate": 2.701277800409705e-07, "loss": 0.741166, "num_input_tokens_seen": 300533770, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 13932, "time_per_iteration": 2.4647061824798584 }, { "auxiliary_loss_clip": 0.01101167, "auxiliary_loss_mlp": 0.010339, "balance_loss_clip": 1.02275372, "balance_loss_mlp": 1.03306055, "epoch": 0.8376972794228168, "flos": 23914459987200.0, "grad_norm": 2.244760482181671, "language_loss": 0.67215466, "learning_rate": 2.699323490393628e-07, "loss": 0.69350529, "num_input_tokens_seen": 300552995, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.68359375, "step": 13933, "time_per_iteration": 2.4588396549224854 }, { "auxiliary_loss_clip": 0.01101872, "auxiliary_loss_mlp": 0.01036116, "balance_loss_clip": 1.02426708, "balance_loss_mlp": 1.0359211, "epoch": 0.8377574026754847, "flos": 13734718980480.0, "grad_norm": 2.0743198673484025, "language_loss": 0.76621979, "learning_rate": 2.697369836420933e-07, "loss": 0.78759968, "num_input_tokens_seen": 300570275, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.66015625, "step": 13934, "time_per_iteration": 2.4590373039245605 }, { "auxiliary_loss_clip": 0.01105986, "auxiliary_loss_mlp": 0.01028101, "balance_loss_clip": 1.01621008, "balance_loss_mlp": 1.03885484, "epoch": 0.8378175259281527, "flos": 21651333632640.0, "grad_norm": 1.7271134199162974, "language_loss": 0.77525496, "learning_rate": 2.6954168385657115e-07, "loss": 0.79659581, "num_input_tokens_seen": 300590875, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 13935, "time_per_iteration": 2.4735937118530273 }, { "auxiliary_loss_clip": 0.01103577, "auxiliary_loss_mlp": 0.01032558, "balance_loss_clip": 1.02007639, "balance_loss_mlp": 1.03456318, "epoch": 0.8378776491808206, "flos": 15448806973440.0, "grad_norm": 4.8303099470087485, "language_loss": 0.56134051, "learning_rate": 2.6934644969020135e-07, "loss": 0.58270186, "num_input_tokens_seen": 300607490, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 13936, "time_per_iteration": 2.415208578109741 }, { "auxiliary_loss_clip": 0.01101423, "auxiliary_loss_mlp": 0.01025696, "balance_loss_clip": 1.01476431, "balance_loss_mlp": 1.03391802, "epoch": 0.8379377724334887, "flos": 14720395069440.0, "grad_norm": 2.0823275489368256, "language_loss": 0.89399362, "learning_rate": 2.691512811503882e-07, "loss": 0.91526484, "num_input_tokens_seen": 300623635, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.67578125, "step": 13937, "time_per_iteration": 2.3925013542175293 }, { "auxiliary_loss_clip": 0.01105047, "auxiliary_loss_mlp": 0.01031639, "balance_loss_clip": 1.01945603, "balance_loss_mlp": 1.03618431, "epoch": 0.8379978956861566, "flos": 24535247765760.0, "grad_norm": 2.4073286129887053, "language_loss": 0.81734234, "learning_rate": 2.689561782445313e-07, "loss": 0.83870924, "num_input_tokens_seen": 300643835, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 13938, "time_per_iteration": 2.508782386779785 }, { "auxiliary_loss_clip": 0.01106762, "auxiliary_loss_mlp": 0.0103198, "balance_loss_clip": 1.01919508, "balance_loss_mlp": 1.03659868, "epoch": 0.8380580189388246, "flos": 18952611045120.0, "grad_norm": 1.890279475348413, "language_loss": 0.70626289, "learning_rate": 2.6876114098002965e-07, "loss": 0.72765028, "num_input_tokens_seen": 300662500, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 13939, "time_per_iteration": 2.4315619468688965 }, { "auxiliary_loss_clip": 0.01108349, "auxiliary_loss_mlp": 0.01035456, "balance_loss_clip": 1.02261066, "balance_loss_mlp": 1.03762853, "epoch": 0.8381181421914926, "flos": 26540283922560.0, "grad_norm": 1.5726091552172623, "language_loss": 0.76300502, "learning_rate": 2.6856616936428e-07, "loss": 0.78444314, "num_input_tokens_seen": 300681480, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 13940, "time_per_iteration": 2.5019705295562744 }, { "auxiliary_loss_clip": 0.011023, "auxiliary_loss_mlp": 0.01032927, "balance_loss_clip": 1.02104783, "balance_loss_mlp": 1.03556204, "epoch": 0.8381782654441605, "flos": 23291481479040.0, "grad_norm": 6.339483830757961, "language_loss": 0.76784915, "learning_rate": 2.6837126340467374e-07, "loss": 0.78920144, "num_input_tokens_seen": 300699165, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.66796875, "step": 13941, "time_per_iteration": 2.455355644226074 }, { "auxiliary_loss_clip": 0.01106625, "auxiliary_loss_mlp": 0.01028969, "balance_loss_clip": 1.01592183, "balance_loss_mlp": 1.03516924, "epoch": 0.8382383886968285, "flos": 26758800311040.0, "grad_norm": 2.373850258083325, "language_loss": 0.73733127, "learning_rate": 2.6817642310860276e-07, "loss": 0.75868714, "num_input_tokens_seen": 300714615, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 13942, "time_per_iteration": 2.496647357940674 }, { "auxiliary_loss_clip": 0.01110792, "auxiliary_loss_mlp": 0.01037542, "balance_loss_clip": 1.02445865, "balance_loss_mlp": 1.03652608, "epoch": 0.8382985119494964, "flos": 26104544035200.0, "grad_norm": 1.6259264628271892, "language_loss": 0.79574156, "learning_rate": 2.679816484834554e-07, "loss": 0.81722486, "num_input_tokens_seen": 300734860, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 13943, "time_per_iteration": 2.4921493530273438 }, { "auxiliary_loss_clip": 0.01101261, "auxiliary_loss_mlp": 0.01027468, "balance_loss_clip": 1.01583958, "balance_loss_mlp": 1.0337882, "epoch": 0.8383586352021645, "flos": 16435129507200.0, "grad_norm": 2.117663893068034, "language_loss": 0.85161632, "learning_rate": 2.6778693953661766e-07, "loss": 0.87290359, "num_input_tokens_seen": 300752735, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 13944, "time_per_iteration": 2.4544432163238525 }, { "auxiliary_loss_clip": 0.01029058, "auxiliary_loss_mlp": 0.01002368, "balance_loss_clip": 1.00139093, "balance_loss_mlp": 1.00677502, "epoch": 0.8384187584548324, "flos": 64195532288640.0, "grad_norm": 0.7506167320924908, "language_loss": 0.50283396, "learning_rate": 2.6759229627547263e-07, "loss": 0.52314818, "num_input_tokens_seen": 300820760, "router_z_loss_clip": 0.00976562, "router_z_loss_mlp": 0.22265625, "step": 13945, "time_per_iteration": 3.21403431892395 }, { "auxiliary_loss_clip": 0.011013, "auxiliary_loss_mlp": 0.01028377, "balance_loss_clip": 1.01702273, "balance_loss_mlp": 1.03471291, "epoch": 0.8384788817075004, "flos": 22382905933440.0, "grad_norm": 1.6990026979408197, "language_loss": 0.64983499, "learning_rate": 2.673977187074017e-07, "loss": 0.67113179, "num_input_tokens_seen": 300840025, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6640625, "step": 13946, "time_per_iteration": 2.486147165298462 }, { "auxiliary_loss_clip": 0.0110155, "auxiliary_loss_mlp": 0.01028987, "balance_loss_clip": 1.01642871, "balance_loss_mlp": 1.03297687, "epoch": 0.8385390049601683, "flos": 29496845312640.0, "grad_norm": 1.7683365415954126, "language_loss": 0.67615759, "learning_rate": 2.672032068397829e-07, "loss": 0.69746298, "num_input_tokens_seen": 300860380, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 13947, "time_per_iteration": 2.496147632598877 }, { "auxiliary_loss_clip": 0.01107695, "auxiliary_loss_mlp": 0.01029268, "balance_loss_clip": 1.01642275, "balance_loss_mlp": 1.0375731, "epoch": 0.8385991282128363, "flos": 32707797799680.0, "grad_norm": 1.5007257829060163, "language_loss": 0.69962108, "learning_rate": 2.6700876067999176e-07, "loss": 0.72099066, "num_input_tokens_seen": 300881895, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 13948, "time_per_iteration": 2.5691967010498047 }, { "auxiliary_loss_clip": 0.01100193, "auxiliary_loss_mlp": 0.01031285, "balance_loss_clip": 1.02031207, "balance_loss_mlp": 1.03423309, "epoch": 0.8386592514655042, "flos": 25441022050560.0, "grad_norm": 1.7752073591356037, "language_loss": 0.85030001, "learning_rate": 2.6681438023540194e-07, "loss": 0.87161475, "num_input_tokens_seen": 300901575, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.66015625, "step": 13949, "time_per_iteration": 2.4655253887176514 }, { "auxiliary_loss_clip": 0.0110225, "auxiliary_loss_mlp": 0.01028156, "balance_loss_clip": 1.01634252, "balance_loss_mlp": 1.03525543, "epoch": 0.8387193747181723, "flos": 22015898720640.0, "grad_norm": 1.9644049367914873, "language_loss": 0.70847487, "learning_rate": 2.66620065513385e-07, "loss": 0.72977889, "num_input_tokens_seen": 300919735, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 13950, "time_per_iteration": 2.459205150604248 }, { "auxiliary_loss_clip": 0.01102489, "auxiliary_loss_mlp": 0.0102996, "balance_loss_clip": 1.01779509, "balance_loss_mlp": 1.03474224, "epoch": 0.8387794979708402, "flos": 18150223080960.0, "grad_norm": 1.7978017371572268, "language_loss": 0.64668179, "learning_rate": 2.6642581652130913e-07, "loss": 0.6680063, "num_input_tokens_seen": 300939150, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.67578125, "step": 13951, "time_per_iteration": 2.414628505706787 }, { "auxiliary_loss_clip": 0.01105196, "auxiliary_loss_mlp": 0.01030292, "balance_loss_clip": 1.01874042, "balance_loss_mlp": 1.03641748, "epoch": 0.8388396212235082, "flos": 25411216740480.0, "grad_norm": 1.5063752300279116, "language_loss": 0.70033842, "learning_rate": 2.662316332665393e-07, "loss": 0.72169328, "num_input_tokens_seen": 300959730, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 13952, "time_per_iteration": 2.495837926864624 }, { "auxiliary_loss_clip": 0.01101921, "auxiliary_loss_mlp": 0.01027637, "balance_loss_clip": 1.01594293, "balance_loss_mlp": 1.03509939, "epoch": 0.8388997444761762, "flos": 22273055164800.0, "grad_norm": 1.8400260772927108, "language_loss": 0.72526872, "learning_rate": 2.6603751575643987e-07, "loss": 0.74656433, "num_input_tokens_seen": 300976120, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.66796875, "step": 13953, "time_per_iteration": 2.4361324310302734 }, { "auxiliary_loss_clip": 0.01101108, "auxiliary_loss_mlp": 0.01027876, "balance_loss_clip": 1.01555014, "balance_loss_mlp": 1.03404975, "epoch": 0.8389598677288441, "flos": 19573219255680.0, "grad_norm": 1.861649632375196, "language_loss": 0.6829977, "learning_rate": 2.6584346399837176e-07, "loss": 0.70428753, "num_input_tokens_seen": 300995080, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 13954, "time_per_iteration": 2.4541449546813965 }, { "auxiliary_loss_clip": 0.01105356, "auxiliary_loss_mlp": 0.01031677, "balance_loss_clip": 1.02057219, "balance_loss_mlp": 1.03692496, "epoch": 0.8390199909815121, "flos": 17384715406080.0, "grad_norm": 1.7045154559871805, "language_loss": 0.73243368, "learning_rate": 2.656494779996932e-07, "loss": 0.75380397, "num_input_tokens_seen": 301012920, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.68359375, "step": 13955, "time_per_iteration": 2.4106366634368896 }, { "auxiliary_loss_clip": 0.01104309, "auxiliary_loss_mlp": 0.01026981, "balance_loss_clip": 1.01465499, "balance_loss_mlp": 1.03551459, "epoch": 0.83908011423418, "flos": 24639639667200.0, "grad_norm": 2.861247227571618, "language_loss": 0.66600668, "learning_rate": 2.6545555776775995e-07, "loss": 0.68731958, "num_input_tokens_seen": 301028875, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 13956, "time_per_iteration": 2.4660146236419678 }, { "auxiliary_loss_clip": 0.01106125, "auxiliary_loss_mlp": 0.01032392, "balance_loss_clip": 1.01934433, "balance_loss_mlp": 1.03509378, "epoch": 0.8391402374868481, "flos": 24718356322560.0, "grad_norm": 2.0738196213287288, "language_loss": 0.79879886, "learning_rate": 2.6526170330992667e-07, "loss": 0.82018405, "num_input_tokens_seen": 301050115, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 13957, "time_per_iteration": 2.4660122394561768 }, { "auxiliary_loss_clip": 0.01029066, "auxiliary_loss_mlp": 0.01001266, "balance_loss_clip": 1.00041997, "balance_loss_mlp": 1.00698042, "epoch": 0.839200360739516, "flos": 56871695784960.0, "grad_norm": 0.7537055344330055, "language_loss": 0.5332641, "learning_rate": 2.6506791463354283e-07, "loss": 0.55356741, "num_input_tokens_seen": 301114155, "router_z_loss_clip": 0.00848389, "router_z_loss_mlp": 0.22070312, "step": 13958, "time_per_iteration": 4.608596324920654 }, { "auxiliary_loss_clip": 0.0110229, "auxiliary_loss_mlp": 0.01030096, "balance_loss_clip": 1.01679242, "balance_loss_mlp": 1.03412557, "epoch": 0.839260483992184, "flos": 18332792933760.0, "grad_norm": 1.9250778492800975, "language_loss": 0.73841059, "learning_rate": 2.648741917459574e-07, "loss": 0.75973445, "num_input_tokens_seen": 301133150, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.68359375, "step": 13959, "time_per_iteration": 2.43911075592041 }, { "auxiliary_loss_clip": 0.01100778, "auxiliary_loss_mlp": 0.01025977, "balance_loss_clip": 1.0141933, "balance_loss_mlp": 1.03534532, "epoch": 0.8393206072448519, "flos": 27087921653760.0, "grad_norm": 1.9924862997318757, "language_loss": 0.55516487, "learning_rate": 2.646805346545169e-07, "loss": 0.57643247, "num_input_tokens_seen": 301153600, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.65625, "step": 13960, "time_per_iteration": 2.4828343391418457 }, { "auxiliary_loss_clip": 0.01029247, "auxiliary_loss_mlp": 0.01001868, "balance_loss_clip": 1.00084913, "balance_loss_mlp": 1.00702763, "epoch": 0.8393807304975199, "flos": 61521192057600.0, "grad_norm": 0.7723836614772481, "language_loss": 0.60755074, "learning_rate": 2.6448694336656397e-07, "loss": 0.62786192, "num_input_tokens_seen": 301214335, "router_z_loss_clip": 0.01019287, "router_z_loss_mlp": 0.22265625, "step": 13961, "time_per_iteration": 4.519460439682007 }, { "auxiliary_loss_clip": 0.0110147, "auxiliary_loss_mlp": 0.01032454, "balance_loss_clip": 1.02060413, "balance_loss_mlp": 1.03266382, "epoch": 0.8394408537501878, "flos": 14894848448640.0, "grad_norm": 2.321324356877151, "language_loss": 0.68358624, "learning_rate": 2.642934178894405e-07, "loss": 0.70492542, "num_input_tokens_seen": 301228960, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 13962, "time_per_iteration": 2.409647226333618 }, { "auxiliary_loss_clip": 0.01104204, "auxiliary_loss_mlp": 0.01028817, "balance_loss_clip": 1.01634765, "balance_loss_mlp": 1.03352189, "epoch": 0.8395009770028559, "flos": 17412186332160.0, "grad_norm": 2.5534087509105867, "language_loss": 0.73423219, "learning_rate": 2.640999582304841e-07, "loss": 0.75556237, "num_input_tokens_seen": 301245875, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 13963, "time_per_iteration": 3.8100690841674805 }, { "auxiliary_loss_clip": 0.01104035, "auxiliary_loss_mlp": 0.01032473, "balance_loss_clip": 1.02046311, "balance_loss_mlp": 1.03495908, "epoch": 0.8395611002555238, "flos": 27924747782400.0, "grad_norm": 1.716485409397957, "language_loss": 0.76614839, "learning_rate": 2.6390656439703173e-07, "loss": 0.78751349, "num_input_tokens_seen": 301265550, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 13964, "time_per_iteration": 3.9478936195373535 }, { "auxiliary_loss_clip": 0.01106142, "auxiliary_loss_mlp": 0.01033955, "balance_loss_clip": 1.02065682, "balance_loss_mlp": 1.03541517, "epoch": 0.8396212235081918, "flos": 11100922225920.0, "grad_norm": 2.4152222952700395, "language_loss": 0.78602237, "learning_rate": 2.637132363964161e-07, "loss": 0.80742329, "num_input_tokens_seen": 301282035, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 13965, "time_per_iteration": 2.4258432388305664 }, { "auxiliary_loss_clip": 0.01101619, "auxiliary_loss_mlp": 0.01030378, "balance_loss_clip": 1.01914287, "balance_loss_mlp": 1.03419411, "epoch": 0.8396813467608598, "flos": 35735641729920.0, "grad_norm": 1.6248915339322096, "language_loss": 0.6603685, "learning_rate": 2.635199742359684e-07, "loss": 0.68168849, "num_input_tokens_seen": 301305210, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.671875, "step": 13966, "time_per_iteration": 2.562502861022949 }, { "auxiliary_loss_clip": 0.0110259, "auxiliary_loss_mlp": 0.01031774, "balance_loss_clip": 1.0199188, "balance_loss_mlp": 1.03517795, "epoch": 0.8397414700135277, "flos": 26176724415360.0, "grad_norm": 1.641076316789953, "language_loss": 0.74576807, "learning_rate": 2.633267779230177e-07, "loss": 0.76711172, "num_input_tokens_seen": 301324885, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.67578125, "step": 13967, "time_per_iteration": 2.501511335372925 }, { "auxiliary_loss_clip": 0.01104271, "auxiliary_loss_mlp": 0.01033277, "balance_loss_clip": 1.02117705, "balance_loss_mlp": 1.03637111, "epoch": 0.8398015932661957, "flos": 18333116156160.0, "grad_norm": 1.8471396905357587, "language_loss": 0.8277837, "learning_rate": 2.6313364746488974e-07, "loss": 0.84915918, "num_input_tokens_seen": 301343070, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 13968, "time_per_iteration": 2.408269166946411 }, { "auxiliary_loss_clip": 0.01105701, "auxiliary_loss_mlp": 0.0103205, "balance_loss_clip": 1.01972377, "balance_loss_mlp": 1.03617859, "epoch": 0.8398617165188637, "flos": 17379507934080.0, "grad_norm": 2.9781897642889406, "language_loss": 0.7786938, "learning_rate": 2.629405828689075e-07, "loss": 0.8000713, "num_input_tokens_seen": 301359280, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 13969, "time_per_iteration": 2.4355227947235107 }, { "auxiliary_loss_clip": 0.01106429, "auxiliary_loss_mlp": 0.01028822, "balance_loss_clip": 1.01611471, "balance_loss_mlp": 1.03503728, "epoch": 0.8399218397715317, "flos": 22929681738240.0, "grad_norm": 2.964769630858267, "language_loss": 0.77620924, "learning_rate": 2.627475841423923e-07, "loss": 0.79756176, "num_input_tokens_seen": 301376465, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 13970, "time_per_iteration": 2.45786714553833 }, { "auxiliary_loss_clip": 0.01104513, "auxiliary_loss_mlp": 0.01034952, "balance_loss_clip": 1.02284074, "balance_loss_mlp": 1.03549874, "epoch": 0.8399819630241996, "flos": 23149562843520.0, "grad_norm": 2.197394643284934, "language_loss": 0.71991539, "learning_rate": 2.625546512926633e-07, "loss": 0.74131, "num_input_tokens_seen": 301396000, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 13971, "time_per_iteration": 2.4504330158233643 }, { "auxiliary_loss_clip": 0.01102576, "auxiliary_loss_mlp": 0.01030427, "balance_loss_clip": 1.01769567, "balance_loss_mlp": 1.03379416, "epoch": 0.8400420862768676, "flos": 16397423205120.0, "grad_norm": 1.8227531804808352, "language_loss": 0.77511704, "learning_rate": 2.623617843270358e-07, "loss": 0.79644704, "num_input_tokens_seen": 301413160, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 13972, "time_per_iteration": 2.462132692337036 }, { "auxiliary_loss_clip": 0.01102178, "auxiliary_loss_mlp": 0.01030168, "balance_loss_clip": 1.01861084, "balance_loss_mlp": 1.03523099, "epoch": 0.8401022095295355, "flos": 21287486816640.0, "grad_norm": 1.4037124852383132, "language_loss": 0.68386865, "learning_rate": 2.6216898325282333e-07, "loss": 0.70519209, "num_input_tokens_seen": 301433325, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.66796875, "step": 13973, "time_per_iteration": 2.4618582725524902 }, { "auxiliary_loss_clip": 0.0110661, "auxiliary_loss_mlp": 0.01027478, "balance_loss_clip": 1.01479399, "balance_loss_mlp": 1.03640354, "epoch": 0.8401623327822035, "flos": 17311313963520.0, "grad_norm": 2.0860519212013804, "language_loss": 0.7774241, "learning_rate": 2.619762480773382e-07, "loss": 0.79876494, "num_input_tokens_seen": 301450265, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 13974, "time_per_iteration": 2.433945417404175 }, { "auxiliary_loss_clip": 0.0110292, "auxiliary_loss_mlp": 0.01029912, "balance_loss_clip": 1.01790154, "balance_loss_mlp": 1.03385985, "epoch": 0.8402224560348714, "flos": 22236677665920.0, "grad_norm": 1.7330297235538394, "language_loss": 0.73022842, "learning_rate": 2.617835788078868e-07, "loss": 0.75155675, "num_input_tokens_seen": 301470760, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 13975, "time_per_iteration": 2.463472604751587 }, { "auxiliary_loss_clip": 0.01103357, "auxiliary_loss_mlp": 0.01026786, "balance_loss_clip": 1.01453102, "balance_loss_mlp": 1.03522706, "epoch": 0.8402825792875395, "flos": 20229953569920.0, "grad_norm": 1.625911563945149, "language_loss": 0.72398317, "learning_rate": 2.6159097545177645e-07, "loss": 0.74528462, "num_input_tokens_seen": 301489425, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6796875, "step": 13976, "time_per_iteration": 2.450261354446411 }, { "auxiliary_loss_clip": 0.01102282, "auxiliary_loss_mlp": 0.01027645, "balance_loss_clip": 1.01626611, "balance_loss_mlp": 1.0344826, "epoch": 0.8403427025402074, "flos": 23289973107840.0, "grad_norm": 1.8138313670985948, "language_loss": 0.71798354, "learning_rate": 2.61398438016311e-07, "loss": 0.73928285, "num_input_tokens_seen": 301508885, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6796875, "step": 13977, "time_per_iteration": 2.456744432449341 }, { "auxiliary_loss_clip": 0.01100952, "auxiliary_loss_mlp": 0.01030786, "balance_loss_clip": 1.0187223, "balance_loss_mlp": 1.03218627, "epoch": 0.8404028257928754, "flos": 32675586278400.0, "grad_norm": 13.178635410194937, "language_loss": 0.68542898, "learning_rate": 2.6120596650879043e-07, "loss": 0.70674646, "num_input_tokens_seen": 301533780, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 13978, "time_per_iteration": 2.572828769683838 }, { "auxiliary_loss_clip": 0.01100518, "auxiliary_loss_mlp": 0.01029239, "balance_loss_clip": 1.01725256, "balance_loss_mlp": 1.03509259, "epoch": 0.8404629490455434, "flos": 16180522928640.0, "grad_norm": 1.8788630643910966, "language_loss": 0.77932382, "learning_rate": 2.610135609365145e-07, "loss": 0.80062139, "num_input_tokens_seen": 301551775, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.65234375, "step": 13979, "time_per_iteration": 2.417201280593872 }, { "auxiliary_loss_clip": 0.01106546, "auxiliary_loss_mlp": 0.01029181, "balance_loss_clip": 1.01696777, "balance_loss_mlp": 1.03735709, "epoch": 0.8405230722982113, "flos": 15194451790080.0, "grad_norm": 1.7958357651934, "language_loss": 0.7805962, "learning_rate": 2.60821221306778e-07, "loss": 0.80195349, "num_input_tokens_seen": 301570495, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 13980, "time_per_iteration": 2.4464619159698486 }, { "auxiliary_loss_clip": 0.01104767, "auxiliary_loss_mlp": 0.01026537, "balance_loss_clip": 1.01527143, "balance_loss_mlp": 1.03772902, "epoch": 0.8405831955508793, "flos": 27812418975360.0, "grad_norm": 2.0074892861397102, "language_loss": 0.8668918, "learning_rate": 2.606289476268757e-07, "loss": 0.88820481, "num_input_tokens_seen": 301591705, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.671875, "step": 13981, "time_per_iteration": 2.50201153755188 }, { "auxiliary_loss_clip": 0.01104461, "auxiliary_loss_mlp": 0.01032239, "balance_loss_clip": 1.02019274, "balance_loss_mlp": 1.03669858, "epoch": 0.8406433188035473, "flos": 23769452782080.0, "grad_norm": 2.2155301653019754, "language_loss": 0.67662054, "learning_rate": 2.6043673990409745e-07, "loss": 0.69798756, "num_input_tokens_seen": 301611670, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 13982, "time_per_iteration": 2.48649001121521 }, { "auxiliary_loss_clip": 0.011049, "auxiliary_loss_mlp": 0.0103625, "balance_loss_clip": 1.02222514, "balance_loss_mlp": 1.03610861, "epoch": 0.8407034420562153, "flos": 29205681667200.0, "grad_norm": 1.674901792362358, "language_loss": 0.68540722, "learning_rate": 2.602445981457324e-07, "loss": 0.7068187, "num_input_tokens_seen": 301632540, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.6875, "step": 13983, "time_per_iteration": 2.5029473304748535 }, { "auxiliary_loss_clip": 0.01103406, "auxiliary_loss_mlp": 0.01030319, "balance_loss_clip": 1.01731896, "balance_loss_mlp": 1.03310478, "epoch": 0.8407635653088832, "flos": 26360084367360.0, "grad_norm": 1.7203118840510268, "language_loss": 0.78845316, "learning_rate": 2.6005252235906684e-07, "loss": 0.80979043, "num_input_tokens_seen": 301651480, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 13984, "time_per_iteration": 2.5057268142700195 }, { "auxiliary_loss_clip": 0.01100144, "auxiliary_loss_mlp": 0.01031334, "balance_loss_clip": 1.01912057, "balance_loss_mlp": 1.0319649, "epoch": 0.8408236885615512, "flos": 21468799693440.0, "grad_norm": 2.042306021999221, "language_loss": 0.60639167, "learning_rate": 2.598605125513842e-07, "loss": 0.62770647, "num_input_tokens_seen": 301670010, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 13985, "time_per_iteration": 2.458138942718506 }, { "auxiliary_loss_clip": 0.01105333, "auxiliary_loss_mlp": 0.01031019, "balance_loss_clip": 1.01855576, "balance_loss_mlp": 1.0350703, "epoch": 0.8408838118142191, "flos": 22963724853120.0, "grad_norm": 1.6103676216232485, "language_loss": 0.82002711, "learning_rate": 2.5966856872996467e-07, "loss": 0.84139061, "num_input_tokens_seen": 301689785, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 13986, "time_per_iteration": 2.459667921066284 }, { "auxiliary_loss_clip": 0.01106927, "auxiliary_loss_mlp": 0.01026011, "balance_loss_clip": 1.01432228, "balance_loss_mlp": 1.03807592, "epoch": 0.8409439350668871, "flos": 26800026145920.0, "grad_norm": 1.6028749417141817, "language_loss": 0.65948606, "learning_rate": 2.5947669090208755e-07, "loss": 0.68081546, "num_input_tokens_seen": 301712225, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 13987, "time_per_iteration": 2.4976210594177246 }, { "auxiliary_loss_clip": 0.0110272, "auxiliary_loss_mlp": 0.01034351, "balance_loss_clip": 1.02234626, "balance_loss_mlp": 1.03482819, "epoch": 0.841004058319555, "flos": 26578672583040.0, "grad_norm": 1.9235461264834697, "language_loss": 0.67527473, "learning_rate": 2.5928487907502906e-07, "loss": 0.6966455, "num_input_tokens_seen": 301730955, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 13988, "time_per_iteration": 2.476428270339966 }, { "auxiliary_loss_clip": 0.01109784, "auxiliary_loss_mlp": 0.01037486, "balance_loss_clip": 1.02431989, "balance_loss_mlp": 1.03822577, "epoch": 0.8410641815722231, "flos": 14501878680960.0, "grad_norm": 2.100666522994462, "language_loss": 0.80933368, "learning_rate": 2.590931332560622e-07, "loss": 0.83080637, "num_input_tokens_seen": 301746930, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 13989, "time_per_iteration": 2.438427448272705 }, { "auxiliary_loss_clip": 0.01105068, "auxiliary_loss_mlp": 0.01028255, "balance_loss_clip": 1.01604807, "balance_loss_mlp": 1.03547072, "epoch": 0.841124304824891, "flos": 29166682475520.0, "grad_norm": 1.7725256726546654, "language_loss": 0.75280178, "learning_rate": 2.5890145345245826e-07, "loss": 0.77413499, "num_input_tokens_seen": 301766945, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 13990, "time_per_iteration": 2.4979536533355713 }, { "auxiliary_loss_clip": 0.01098337, "auxiliary_loss_mlp": 0.01031531, "balance_loss_clip": 1.01968217, "balance_loss_mlp": 1.03285646, "epoch": 0.841184428077559, "flos": 22412028885120.0, "grad_norm": 1.578556765748192, "language_loss": 0.8062067, "learning_rate": 2.5870983967148597e-07, "loss": 0.82750547, "num_input_tokens_seen": 301785460, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.65625, "step": 13991, "time_per_iteration": 2.4386394023895264 }, { "auxiliary_loss_clip": 0.01101525, "auxiliary_loss_mlp": 0.01032132, "balance_loss_clip": 1.02074742, "balance_loss_mlp": 1.03418815, "epoch": 0.841244551330227, "flos": 22962791099520.0, "grad_norm": 2.0433355514930693, "language_loss": 0.7069903, "learning_rate": 2.585182919204105e-07, "loss": 0.72832686, "num_input_tokens_seen": 301804180, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.671875, "step": 13992, "time_per_iteration": 2.4545040130615234 }, { "auxiliary_loss_clip": 0.01105378, "auxiliary_loss_mlp": 0.01028174, "balance_loss_clip": 1.01610994, "balance_loss_mlp": 1.0356797, "epoch": 0.8413046745828949, "flos": 21032736583680.0, "grad_norm": 1.7156775471454604, "language_loss": 0.76219177, "learning_rate": 2.583268102064959e-07, "loss": 0.78352726, "num_input_tokens_seen": 301823670, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 13993, "time_per_iteration": 2.4648306369781494 }, { "auxiliary_loss_clip": 0.01109747, "auxiliary_loss_mlp": 0.01035159, "balance_loss_clip": 1.02085948, "balance_loss_mlp": 1.0349654, "epoch": 0.841364797835563, "flos": 27052082858880.0, "grad_norm": 2.8783113825146707, "language_loss": 0.7417208, "learning_rate": 2.5813539453700393e-07, "loss": 0.76316988, "num_input_tokens_seen": 301845890, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.74609375, "step": 13994, "time_per_iteration": 2.49786639213562 }, { "auxiliary_loss_clip": 0.01102307, "auxiliary_loss_mlp": 0.01028593, "balance_loss_clip": 1.01736927, "balance_loss_mlp": 1.03615558, "epoch": 0.8414249210882309, "flos": 17895688329600.0, "grad_norm": 2.536603489067221, "language_loss": 0.59240901, "learning_rate": 2.5794404491919163e-07, "loss": 0.61371803, "num_input_tokens_seen": 301863985, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.66015625, "step": 13995, "time_per_iteration": 2.416752338409424 }, { "auxiliary_loss_clip": 0.01103851, "auxiliary_loss_mlp": 0.01032134, "balance_loss_clip": 1.01918805, "balance_loss_mlp": 1.03562009, "epoch": 0.8414850443408989, "flos": 25441201618560.0, "grad_norm": 2.8407150001407144, "language_loss": 0.71872967, "learning_rate": 2.577527613603163e-07, "loss": 0.74008954, "num_input_tokens_seen": 301882765, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.68359375, "step": 13996, "time_per_iteration": 2.490018606185913 }, { "auxiliary_loss_clip": 0.01103975, "auxiliary_loss_mlp": 0.01030854, "balance_loss_clip": 1.01931477, "balance_loss_mlp": 1.0352149, "epoch": 0.8415451675935668, "flos": 23220055284480.0, "grad_norm": 2.085557036208591, "language_loss": 0.64290446, "learning_rate": 2.5756154386763017e-07, "loss": 0.66425276, "num_input_tokens_seen": 301902720, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 13997, "time_per_iteration": 2.439528226852417 }, { "auxiliary_loss_clip": 0.01108345, "auxiliary_loss_mlp": 0.01033121, "balance_loss_clip": 1.01900089, "balance_loss_mlp": 1.03622603, "epoch": 0.8416052908462348, "flos": 18546496899840.0, "grad_norm": 2.884118984708291, "language_loss": 0.81990325, "learning_rate": 2.5737039244838565e-07, "loss": 0.84131789, "num_input_tokens_seen": 301921245, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.71875, "step": 13998, "time_per_iteration": 2.4667582511901855 }, { "auxiliary_loss_clip": 0.01105035, "auxiliary_loss_mlp": 0.01036542, "balance_loss_clip": 1.02355433, "balance_loss_mlp": 1.03603339, "epoch": 0.8416654140989027, "flos": 26105190480000.0, "grad_norm": 2.1566297592003916, "language_loss": 0.80340672, "learning_rate": 2.5717930710982984e-07, "loss": 0.82482243, "num_input_tokens_seen": 301942320, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6875, "step": 13999, "time_per_iteration": 2.494720697402954 }, { "auxiliary_loss_clip": 0.01107977, "auxiliary_loss_mlp": 0.01032439, "balance_loss_clip": 1.01933241, "balance_loss_mlp": 1.03702533, "epoch": 0.8417255373515707, "flos": 26433270328320.0, "grad_norm": 2.21434654815121, "language_loss": 0.66793752, "learning_rate": 2.569882878592096e-07, "loss": 0.68934166, "num_input_tokens_seen": 301963110, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 14000, "time_per_iteration": 3.8678395748138428 }, { "auxiliary_loss_clip": 0.01108714, "auxiliary_loss_mlp": 0.01030519, "balance_loss_clip": 1.01800251, "balance_loss_mlp": 1.03674972, "epoch": 0.8417856606042387, "flos": 24717745791360.0, "grad_norm": 3.1409583672196746, "language_loss": 0.79545176, "learning_rate": 2.5679733470376885e-07, "loss": 0.81684411, "num_input_tokens_seen": 301984915, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 14001, "time_per_iteration": 2.4944701194763184 }, { "auxiliary_loss_clip": 0.01103086, "auxiliary_loss_mlp": 0.01030299, "balance_loss_clip": 1.01884842, "balance_loss_mlp": 1.03422475, "epoch": 0.8418457838569067, "flos": 20850849089280.0, "grad_norm": 1.762826350664737, "language_loss": 0.7868017, "learning_rate": 2.5660644765074703e-07, "loss": 0.80813545, "num_input_tokens_seen": 302004095, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6875, "step": 14002, "time_per_iteration": 3.922102212905884 }, { "auxiliary_loss_clip": 0.011031, "auxiliary_loss_mlp": 0.01028084, "balance_loss_clip": 1.01559114, "balance_loss_mlp": 1.0345763, "epoch": 0.8419059071095746, "flos": 28660629715200.0, "grad_norm": 1.554335196249262, "language_loss": 0.78228188, "learning_rate": 2.5641562670738334e-07, "loss": 0.80359375, "num_input_tokens_seen": 302027250, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 14003, "time_per_iteration": 2.5502431392669678 }, { "auxiliary_loss_clip": 0.0110482, "auxiliary_loss_mlp": 0.0102904, "balance_loss_clip": 1.01690447, "balance_loss_mlp": 1.03626001, "epoch": 0.8419660303622426, "flos": 21653596189440.0, "grad_norm": 1.8829060893542944, "language_loss": 0.65495193, "learning_rate": 2.5622487188091436e-07, "loss": 0.67629057, "num_input_tokens_seen": 302046950, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 14004, "time_per_iteration": 3.837932586669922 }, { "auxiliary_loss_clip": 0.01107472, "auxiliary_loss_mlp": 0.01031774, "balance_loss_clip": 1.01888108, "balance_loss_mlp": 1.03681862, "epoch": 0.8420261536149106, "flos": 25301114576640.0, "grad_norm": 2.0891235904661487, "language_loss": 0.76041585, "learning_rate": 2.560341831785724e-07, "loss": 0.78180838, "num_input_tokens_seen": 302065470, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 14005, "time_per_iteration": 3.9811787605285645 }, { "auxiliary_loss_clip": 0.01103726, "auxiliary_loss_mlp": 0.01031415, "balance_loss_clip": 1.01835513, "balance_loss_mlp": 1.0338881, "epoch": 0.8420862768675785, "flos": 18763397176320.0, "grad_norm": 1.9714623250666103, "language_loss": 0.7775929, "learning_rate": 2.5584356060758906e-07, "loss": 0.79894429, "num_input_tokens_seen": 302083190, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 14006, "time_per_iteration": 2.433041572570801 }, { "auxiliary_loss_clip": 0.01104596, "auxiliary_loss_mlp": 0.01033723, "balance_loss_clip": 1.0216291, "balance_loss_mlp": 1.0361408, "epoch": 0.8421464001202466, "flos": 18328052338560.0, "grad_norm": 1.852802320235433, "language_loss": 0.76933885, "learning_rate": 2.556530041751932e-07, "loss": 0.79072201, "num_input_tokens_seen": 302098820, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 14007, "time_per_iteration": 2.420314311981201 }, { "auxiliary_loss_clip": 0.01104509, "auxiliary_loss_mlp": 0.01034245, "balance_loss_clip": 1.02110767, "balance_loss_mlp": 1.0338887, "epoch": 0.8422065233729145, "flos": 31537181560320.0, "grad_norm": 1.9998514827608047, "language_loss": 0.66042292, "learning_rate": 2.554625138886102e-07, "loss": 0.68181044, "num_input_tokens_seen": 302117075, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 14008, "time_per_iteration": 2.5183074474334717 }, { "auxiliary_loss_clip": 0.01030267, "auxiliary_loss_mlp": 0.01000931, "balance_loss_clip": 0.99992353, "balance_loss_mlp": 1.00798619, "epoch": 0.8422666466255825, "flos": 64298128510080.0, "grad_norm": 0.7234306210460493, "language_loss": 0.56926131, "learning_rate": 2.552720897550631e-07, "loss": 0.58957326, "num_input_tokens_seen": 302179735, "router_z_loss_clip": 0.0100708, "router_z_loss_mlp": 0.22265625, "step": 14009, "time_per_iteration": 3.1351284980773926 }, { "auxiliary_loss_clip": 0.01100168, "auxiliary_loss_mlp": 0.01029314, "balance_loss_clip": 1.01776886, "balance_loss_mlp": 1.03332663, "epoch": 0.8423267698782504, "flos": 24316731377280.0, "grad_norm": 1.296184469820101, "language_loss": 0.77989084, "learning_rate": 2.5508173178177304e-07, "loss": 0.80118567, "num_input_tokens_seen": 302202055, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 14010, "time_per_iteration": 2.492870569229126 }, { "auxiliary_loss_clip": 0.01108377, "auxiliary_loss_mlp": 0.01035301, "balance_loss_clip": 1.02194977, "balance_loss_mlp": 1.03731167, "epoch": 0.8423868931309184, "flos": 18296092212480.0, "grad_norm": 1.947234311920953, "language_loss": 0.72424912, "learning_rate": 2.548914399759592e-07, "loss": 0.74568594, "num_input_tokens_seen": 302221360, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 14011, "time_per_iteration": 2.457505464553833 }, { "auxiliary_loss_clip": 0.01102656, "auxiliary_loss_mlp": 0.0103549, "balance_loss_clip": 1.02358043, "balance_loss_mlp": 1.03385663, "epoch": 0.8424470163835863, "flos": 23550218121600.0, "grad_norm": 1.8391530204167943, "language_loss": 0.84286726, "learning_rate": 2.5470121434483636e-07, "loss": 0.86424863, "num_input_tokens_seen": 302240715, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 14012, "time_per_iteration": 2.456751585006714 }, { "auxiliary_loss_clip": 0.01095542, "auxiliary_loss_mlp": 0.01028375, "balance_loss_clip": 1.01811099, "balance_loss_mlp": 1.03268659, "epoch": 0.8425071396362543, "flos": 23769488695680.0, "grad_norm": 2.6358913615658954, "language_loss": 0.67914093, "learning_rate": 2.5451105489561884e-07, "loss": 0.70038009, "num_input_tokens_seen": 302260950, "router_z_loss_clip": 0.10253906, "router_z_loss_mlp": 0.62890625, "step": 14013, "time_per_iteration": 2.4791152477264404 }, { "auxiliary_loss_clip": 0.01108549, "auxiliary_loss_mlp": 0.01031169, "balance_loss_clip": 1.01865244, "balance_loss_mlp": 1.03577459, "epoch": 0.8425672628889223, "flos": 16178906816640.0, "grad_norm": 2.280948241505521, "language_loss": 0.78968525, "learning_rate": 2.5432096163551644e-07, "loss": 0.81108248, "num_input_tokens_seen": 302277500, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 14014, "time_per_iteration": 2.4162285327911377 }, { "auxiliary_loss_clip": 0.01104099, "auxiliary_loss_mlp": 0.01028132, "balance_loss_clip": 1.01578164, "balance_loss_mlp": 1.03516436, "epoch": 0.8426273861415903, "flos": 23149131880320.0, "grad_norm": 2.248307552644928, "language_loss": 0.67927444, "learning_rate": 2.5413093457173884e-07, "loss": 0.70059681, "num_input_tokens_seen": 302297930, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 14015, "time_per_iteration": 2.4817702770233154 }, { "auxiliary_loss_clip": 0.01103942, "auxiliary_loss_mlp": 0.01031122, "balance_loss_clip": 1.01848555, "balance_loss_mlp": 1.03562737, "epoch": 0.8426875093942582, "flos": 17457757712640.0, "grad_norm": 4.19981509008067, "language_loss": 0.75956851, "learning_rate": 2.5394097371149036e-07, "loss": 0.78091913, "num_input_tokens_seen": 302315735, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 14016, "time_per_iteration": 2.4382102489471436 }, { "auxiliary_loss_clip": 0.01105806, "auxiliary_loss_mlp": 0.01031368, "balance_loss_clip": 1.01833844, "balance_loss_mlp": 1.03681552, "epoch": 0.8427476326469262, "flos": 19640551299840.0, "grad_norm": 1.8837162580745752, "language_loss": 0.79311645, "learning_rate": 2.5375107906197544e-07, "loss": 0.81448817, "num_input_tokens_seen": 302332790, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6875, "step": 14017, "time_per_iteration": 2.426103115081787 }, { "auxiliary_loss_clip": 0.01104197, "auxiliary_loss_mlp": 0.01032121, "balance_loss_clip": 1.02039719, "balance_loss_mlp": 1.03546166, "epoch": 0.8428077558995941, "flos": 11941160146560.0, "grad_norm": 2.0371325557533284, "language_loss": 0.62699068, "learning_rate": 2.5356125063039525e-07, "loss": 0.64835387, "num_input_tokens_seen": 302346490, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 14018, "time_per_iteration": 2.3978288173675537 }, { "auxiliary_loss_clip": 0.01104665, "auxiliary_loss_mlp": 0.01029352, "balance_loss_clip": 1.01766384, "balance_loss_mlp": 1.03462708, "epoch": 0.8428678791522621, "flos": 10451729767680.0, "grad_norm": 2.0070012768460868, "language_loss": 0.79207671, "learning_rate": 2.5337148842394687e-07, "loss": 0.81341684, "num_input_tokens_seen": 302363235, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.703125, "step": 14019, "time_per_iteration": 2.4028193950653076 }, { "auxiliary_loss_clip": 0.0110555, "auxiliary_loss_mlp": 0.01031566, "balance_loss_clip": 1.01886392, "balance_loss_mlp": 1.03526342, "epoch": 0.8429280024049302, "flos": 28767248259840.0, "grad_norm": 13.758507212354527, "language_loss": 0.78276026, "learning_rate": 2.531817924498265e-07, "loss": 0.80413139, "num_input_tokens_seen": 302383270, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 14020, "time_per_iteration": 2.4989755153656006 }, { "auxiliary_loss_clip": 0.01104027, "auxiliary_loss_mlp": 0.01025828, "balance_loss_clip": 1.01363254, "balance_loss_mlp": 1.03544843, "epoch": 0.8429881256575981, "flos": 19537093152000.0, "grad_norm": 1.6345215623306195, "language_loss": 0.7111488, "learning_rate": 2.5299216271522805e-07, "loss": 0.73244733, "num_input_tokens_seen": 302401355, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 14021, "time_per_iteration": 2.4223973751068115 }, { "auxiliary_loss_clip": 0.0110517, "auxiliary_loss_mlp": 0.01040231, "balance_loss_clip": 1.02761865, "balance_loss_mlp": 1.0348922, "epoch": 0.8430482489102661, "flos": 24790931752320.0, "grad_norm": 1.9674656957207384, "language_loss": 0.69638193, "learning_rate": 2.5280259922734125e-07, "loss": 0.7178359, "num_input_tokens_seen": 302419515, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 14022, "time_per_iteration": 2.484703540802002 }, { "auxiliary_loss_clip": 0.01108572, "auxiliary_loss_mlp": 0.01033828, "balance_loss_clip": 1.02078092, "balance_loss_mlp": 1.03752732, "epoch": 0.843108372162934, "flos": 21544248211200.0, "grad_norm": 1.9070871300255943, "language_loss": 0.72268462, "learning_rate": 2.526131019933553e-07, "loss": 0.74410862, "num_input_tokens_seen": 302438280, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 14023, "time_per_iteration": 2.438582181930542 }, { "auxiliary_loss_clip": 0.01104221, "auxiliary_loss_mlp": 0.01033089, "balance_loss_clip": 1.02050018, "balance_loss_mlp": 1.03564441, "epoch": 0.843168495415602, "flos": 24608792862720.0, "grad_norm": 1.4052512547328715, "language_loss": 0.66955847, "learning_rate": 2.524236710204559e-07, "loss": 0.69093156, "num_input_tokens_seen": 302460860, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 14024, "time_per_iteration": 2.5093982219696045 }, { "auxiliary_loss_clip": 0.01102775, "auxiliary_loss_mlp": 0.01030908, "balance_loss_clip": 1.01857567, "balance_loss_mlp": 1.03493202, "epoch": 0.8432286186682699, "flos": 15122738286720.0, "grad_norm": 2.16844982046896, "language_loss": 0.80969793, "learning_rate": 2.522343063158261e-07, "loss": 0.83103472, "num_input_tokens_seen": 302476980, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6796875, "step": 14025, "time_per_iteration": 2.3932833671569824 }, { "auxiliary_loss_clip": 0.01100278, "auxiliary_loss_mlp": 0.0102791, "balance_loss_clip": 1.01743138, "balance_loss_mlp": 1.03388274, "epoch": 0.843288741920938, "flos": 20301882554880.0, "grad_norm": 3.07771407347299, "language_loss": 0.77843535, "learning_rate": 2.5204500788664606e-07, "loss": 0.79971731, "num_input_tokens_seen": 302496380, "router_z_loss_clip": 0.10449219, "router_z_loss_mlp": 0.6640625, "step": 14026, "time_per_iteration": 2.4486069679260254 }, { "auxiliary_loss_clip": 0.01105166, "auxiliary_loss_mlp": 0.01035379, "balance_loss_clip": 1.02311194, "balance_loss_mlp": 1.03685892, "epoch": 0.8433488651736059, "flos": 23332096782720.0, "grad_norm": 1.5351969763871973, "language_loss": 0.82719064, "learning_rate": 2.518557757400945e-07, "loss": 0.8485961, "num_input_tokens_seen": 302516845, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 14027, "time_per_iteration": 2.4579789638519287 }, { "auxiliary_loss_clip": 0.01102991, "auxiliary_loss_mlp": 0.01031121, "balance_loss_clip": 1.01952839, "balance_loss_mlp": 1.03489399, "epoch": 0.8434089884262739, "flos": 39458105844480.0, "grad_norm": 1.6192688927811558, "language_loss": 0.56454682, "learning_rate": 2.5166660988334754e-07, "loss": 0.58588797, "num_input_tokens_seen": 302538865, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 14028, "time_per_iteration": 2.6165056228637695 }, { "auxiliary_loss_clip": 0.0110267, "auxiliary_loss_mlp": 0.01025228, "balance_loss_clip": 1.0138849, "balance_loss_mlp": 1.03451359, "epoch": 0.8434691116789418, "flos": 23768842250880.0, "grad_norm": 3.855740244763374, "language_loss": 0.63681698, "learning_rate": 2.51477510323578e-07, "loss": 0.65809602, "num_input_tokens_seen": 302557970, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6796875, "step": 14029, "time_per_iteration": 2.4578168392181396 }, { "auxiliary_loss_clip": 0.01099833, "auxiliary_loss_mlp": 0.0103041, "balance_loss_clip": 1.01934147, "balance_loss_mlp": 1.03456509, "epoch": 0.8435292349316098, "flos": 22671411972480.0, "grad_norm": 1.8851766622628183, "language_loss": 0.75384736, "learning_rate": 2.51288477067956e-07, "loss": 0.77514982, "num_input_tokens_seen": 302578915, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.65234375, "step": 14030, "time_per_iteration": 2.45270037651062 }, { "auxiliary_loss_clip": 0.01103809, "auxiliary_loss_mlp": 0.01031276, "balance_loss_clip": 1.01922405, "balance_loss_mlp": 1.03624582, "epoch": 0.8435893581842777, "flos": 18843622202880.0, "grad_norm": 2.0414180496821297, "language_loss": 0.83788735, "learning_rate": 2.510995101236502e-07, "loss": 0.85923815, "num_input_tokens_seen": 302596300, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.67578125, "step": 14031, "time_per_iteration": 2.420182943344116 }, { "auxiliary_loss_clip": 0.0110088, "auxiliary_loss_mlp": 0.01029799, "balance_loss_clip": 1.01834285, "balance_loss_mlp": 1.03400028, "epoch": 0.8436494814369457, "flos": 20704225772160.0, "grad_norm": 2.2829348607429965, "language_loss": 0.80221635, "learning_rate": 2.509106094978266e-07, "loss": 0.82352316, "num_input_tokens_seen": 302614975, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.66796875, "step": 14032, "time_per_iteration": 2.4466521739959717 }, { "auxiliary_loss_clip": 0.01104054, "auxiliary_loss_mlp": 0.0103292, "balance_loss_clip": 1.01884103, "balance_loss_mlp": 1.03405058, "epoch": 0.8437096046896138, "flos": 22674177319680.0, "grad_norm": 1.4778740519513707, "language_loss": 0.75719661, "learning_rate": 2.507217751976478e-07, "loss": 0.77856636, "num_input_tokens_seen": 302636415, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.69921875, "step": 14033, "time_per_iteration": 2.452997922897339 }, { "auxiliary_loss_clip": 0.01103028, "auxiliary_loss_mlp": 0.01032234, "balance_loss_clip": 1.0207895, "balance_loss_mlp": 1.03409743, "epoch": 0.8437697279422817, "flos": 16180127879040.0, "grad_norm": 3.09599659190312, "language_loss": 0.83094811, "learning_rate": 2.505330072302743e-07, "loss": 0.85230076, "num_input_tokens_seen": 302653605, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6875, "step": 14034, "time_per_iteration": 2.420149326324463 }, { "auxiliary_loss_clip": 0.01106087, "auxiliary_loss_mlp": 0.01028058, "balance_loss_clip": 1.01514721, "balance_loss_mlp": 1.03709984, "epoch": 0.8438298511949497, "flos": 28765847629440.0, "grad_norm": 1.4537331388891896, "language_loss": 0.78490639, "learning_rate": 2.503443056028656e-07, "loss": 0.80624783, "num_input_tokens_seen": 302673965, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 14035, "time_per_iteration": 2.508141040802002 }, { "auxiliary_loss_clip": 0.01104376, "auxiliary_loss_mlp": 0.01029548, "balance_loss_clip": 1.01779437, "balance_loss_mlp": 1.03571856, "epoch": 0.8438899744476176, "flos": 33724284779520.0, "grad_norm": 1.3068017165259438, "language_loss": 0.72109538, "learning_rate": 2.501556703225751e-07, "loss": 0.74243468, "num_input_tokens_seen": 302695560, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 14036, "time_per_iteration": 2.54067063331604 }, { "auxiliary_loss_clip": 0.01098555, "auxiliary_loss_mlp": 0.01026289, "balance_loss_clip": 1.01590633, "balance_loss_mlp": 1.03422034, "epoch": 0.8439500977002856, "flos": 25110787386240.0, "grad_norm": 1.8037546423831319, "language_loss": 0.69454879, "learning_rate": 2.49967101396557e-07, "loss": 0.71579731, "num_input_tokens_seen": 302713480, "router_z_loss_clip": 0.10351562, "router_z_loss_mlp": 0.64453125, "step": 14037, "time_per_iteration": 2.4548964500427246 }, { "auxiliary_loss_clip": 0.01101832, "auxiliary_loss_mlp": 0.0102496, "balance_loss_clip": 1.01325357, "balance_loss_mlp": 1.03439116, "epoch": 0.8440102209529535, "flos": 32850362880000.0, "grad_norm": 1.9848964901302963, "language_loss": 0.68778861, "learning_rate": 2.4977859883196227e-07, "loss": 0.7090565, "num_input_tokens_seen": 302736860, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.67578125, "step": 14038, "time_per_iteration": 2.558917760848999 }, { "auxiliary_loss_clip": 0.01105993, "auxiliary_loss_mlp": 0.01036848, "balance_loss_clip": 1.02384233, "balance_loss_mlp": 1.0364269, "epoch": 0.8440703442056215, "flos": 23730202195200.0, "grad_norm": 1.5150284240150476, "language_loss": 0.76338136, "learning_rate": 2.49590162635938e-07, "loss": 0.78480983, "num_input_tokens_seen": 302757745, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 14039, "time_per_iteration": 2.4879868030548096 }, { "auxiliary_loss_clip": 0.01108963, "auxiliary_loss_mlp": 0.01027825, "balance_loss_clip": 1.01563025, "balance_loss_mlp": 1.03659391, "epoch": 0.8441304674582895, "flos": 20193719725440.0, "grad_norm": 2.6027205180599897, "language_loss": 0.79556996, "learning_rate": 2.4940179281563046e-07, "loss": 0.8169378, "num_input_tokens_seen": 302774885, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7265625, "step": 14040, "time_per_iteration": 2.475111722946167 }, { "auxiliary_loss_clip": 0.01106513, "auxiliary_loss_mlp": 0.0103337, "balance_loss_clip": 1.02067387, "balance_loss_mlp": 1.03746223, "epoch": 0.8441905907109575, "flos": 20219897761920.0, "grad_norm": 2.19849105785227, "language_loss": 0.69447136, "learning_rate": 2.492134893781821e-07, "loss": 0.71587014, "num_input_tokens_seen": 302791035, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 14041, "time_per_iteration": 3.8838589191436768 }, { "auxiliary_loss_clip": 0.01106384, "auxiliary_loss_mlp": 0.01034169, "balance_loss_clip": 1.02226555, "balance_loss_mlp": 1.03589702, "epoch": 0.8442507139636254, "flos": 13516453987200.0, "grad_norm": 1.9139624960349744, "language_loss": 0.6894455, "learning_rate": 2.490252523307341e-07, "loss": 0.71085107, "num_input_tokens_seen": 302808650, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.703125, "step": 14042, "time_per_iteration": 2.430307626724243 }, { "auxiliary_loss_clip": 0.01101269, "auxiliary_loss_mlp": 0.0102889, "balance_loss_clip": 1.01748776, "balance_loss_mlp": 1.03453231, "epoch": 0.8443108372162934, "flos": 18220212731520.0, "grad_norm": 1.7388851327422234, "language_loss": 0.74714684, "learning_rate": 2.4883708168042373e-07, "loss": 0.76844847, "num_input_tokens_seen": 302824605, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66796875, "step": 14043, "time_per_iteration": 2.4264578819274902 }, { "auxiliary_loss_clip": 0.01103488, "auxiliary_loss_mlp": 0.01027458, "balance_loss_clip": 1.01560259, "balance_loss_mlp": 1.03585827, "epoch": 0.8443709604689613, "flos": 16105110324480.0, "grad_norm": 2.5321877719240766, "language_loss": 0.72266364, "learning_rate": 2.486489774343865e-07, "loss": 0.74397314, "num_input_tokens_seen": 302840170, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6796875, "step": 14044, "time_per_iteration": 3.8598735332489014 }, { "auxiliary_loss_clip": 0.01100138, "auxiliary_loss_mlp": 0.01026578, "balance_loss_clip": 1.01398969, "balance_loss_mlp": 1.03344667, "epoch": 0.8444310837216293, "flos": 18512130562560.0, "grad_norm": 1.4741521121321344, "language_loss": 0.74726552, "learning_rate": 2.484609395997559e-07, "loss": 0.76853269, "num_input_tokens_seen": 302858320, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6640625, "step": 14045, "time_per_iteration": 2.449972629547119 }, { "auxiliary_loss_clip": 0.01102736, "auxiliary_loss_mlp": 0.01029963, "balance_loss_clip": 1.01812553, "balance_loss_mlp": 1.03399444, "epoch": 0.8444912069742974, "flos": 14939845211520.0, "grad_norm": 1.6791288987193813, "language_loss": 0.78665549, "learning_rate": 2.4827296818366216e-07, "loss": 0.80798244, "num_input_tokens_seen": 302875255, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 14046, "time_per_iteration": 3.7881689071655273 }, { "auxiliary_loss_clip": 0.01106577, "auxiliary_loss_mlp": 0.01030532, "balance_loss_clip": 1.01767576, "balance_loss_mlp": 1.03610849, "epoch": 0.8445513302269653, "flos": 20120318282880.0, "grad_norm": 2.1181787870463755, "language_loss": 0.7800048, "learning_rate": 2.4808506319323255e-07, "loss": 0.80137587, "num_input_tokens_seen": 302894690, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 14047, "time_per_iteration": 3.896054267883301 }, { "auxiliary_loss_clip": 0.0110408, "auxiliary_loss_mlp": 0.01027901, "balance_loss_clip": 1.01636112, "balance_loss_mlp": 1.03710318, "epoch": 0.8446114534796333, "flos": 31170928533120.0, "grad_norm": 1.9949667805901672, "language_loss": 0.72067553, "learning_rate": 2.478972246355935e-07, "loss": 0.74199533, "num_input_tokens_seen": 302912405, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 14048, "time_per_iteration": 2.5511138439178467 }, { "auxiliary_loss_clip": 0.01104648, "auxiliary_loss_mlp": 0.01035837, "balance_loss_clip": 1.02357078, "balance_loss_mlp": 1.03648794, "epoch": 0.8446715767323012, "flos": 23948323534080.0, "grad_norm": 5.733742387161512, "language_loss": 0.733778, "learning_rate": 2.477094525178667e-07, "loss": 0.75518286, "num_input_tokens_seen": 302932525, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 14049, "time_per_iteration": 2.5027642250061035 }, { "auxiliary_loss_clip": 0.01029572, "auxiliary_loss_mlp": 0.01001624, "balance_loss_clip": 1.00059915, "balance_loss_mlp": 1.00725639, "epoch": 0.8447316999849692, "flos": 67984897484160.0, "grad_norm": 0.801782468723947, "language_loss": 0.6066004, "learning_rate": 2.475217468471729e-07, "loss": 0.62691236, "num_input_tokens_seen": 302991285, "router_z_loss_clip": 0.01025391, "router_z_loss_mlp": 0.22363281, "step": 14050, "time_per_iteration": 3.0517477989196777 }, { "auxiliary_loss_clip": 0.01102853, "auxiliary_loss_mlp": 0.01028916, "balance_loss_clip": 1.01580894, "balance_loss_mlp": 1.03407693, "epoch": 0.8447918232376371, "flos": 22418924296320.0, "grad_norm": 4.204138899412241, "language_loss": 0.72131073, "learning_rate": 2.473341076306303e-07, "loss": 0.74262834, "num_input_tokens_seen": 303009515, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 14051, "time_per_iteration": 2.507084369659424 }, { "auxiliary_loss_clip": 0.01101175, "auxiliary_loss_mlp": 0.01029138, "balance_loss_clip": 1.01752734, "balance_loss_mlp": 1.0346508, "epoch": 0.8448519464903052, "flos": 23694147918720.0, "grad_norm": 1.8670458982619198, "language_loss": 0.74447143, "learning_rate": 2.471465348753547e-07, "loss": 0.76577449, "num_input_tokens_seen": 303026905, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6640625, "step": 14052, "time_per_iteration": 2.488487720489502 }, { "auxiliary_loss_clip": 0.01098681, "auxiliary_loss_mlp": 0.01026187, "balance_loss_clip": 1.01541686, "balance_loss_mlp": 1.03504694, "epoch": 0.8449120697429731, "flos": 13735904129280.0, "grad_norm": 1.9883584186451946, "language_loss": 0.73721766, "learning_rate": 2.469590285884575e-07, "loss": 0.75846636, "num_input_tokens_seen": 303045245, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.63671875, "step": 14053, "time_per_iteration": 2.4847710132598877 }, { "auxiliary_loss_clip": 0.01102686, "auxiliary_loss_mlp": 0.01024895, "balance_loss_clip": 1.01334405, "balance_loss_mlp": 1.03529882, "epoch": 0.8449721929956411, "flos": 20886795624960.0, "grad_norm": 1.7472648030442597, "language_loss": 0.74372566, "learning_rate": 2.467715887770494e-07, "loss": 0.76500148, "num_input_tokens_seen": 303065205, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 14054, "time_per_iteration": 2.477738857269287 }, { "auxiliary_loss_clip": 0.01108695, "auxiliary_loss_mlp": 0.01029429, "balance_loss_clip": 1.01735878, "balance_loss_mlp": 1.03736496, "epoch": 0.845032316248309, "flos": 33216939129600.0, "grad_norm": 1.5964183725795418, "language_loss": 0.78634298, "learning_rate": 2.4658421544823895e-07, "loss": 0.80772418, "num_input_tokens_seen": 303088250, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.71484375, "step": 14055, "time_per_iteration": 2.64707350730896 }, { "auxiliary_loss_clip": 0.01102353, "auxiliary_loss_mlp": 0.01030257, "balance_loss_clip": 1.01858068, "balance_loss_mlp": 1.03539157, "epoch": 0.845092439500977, "flos": 23585230903680.0, "grad_norm": 1.808375578450867, "language_loss": 0.73026496, "learning_rate": 2.463969086091302e-07, "loss": 0.75159109, "num_input_tokens_seen": 303109280, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 14056, "time_per_iteration": 2.507768154144287 }, { "auxiliary_loss_clip": 0.01110525, "auxiliary_loss_mlp": 0.01036128, "balance_loss_clip": 1.02315211, "balance_loss_mlp": 1.03817391, "epoch": 0.8451525627536449, "flos": 13333920048000.0, "grad_norm": 2.3018472978546725, "language_loss": 0.67383707, "learning_rate": 2.4620966826682686e-07, "loss": 0.69530368, "num_input_tokens_seen": 303126075, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 14057, "time_per_iteration": 2.471860408782959 }, { "auxiliary_loss_clip": 0.01106462, "auxiliary_loss_mlp": 0.01028125, "balance_loss_clip": 1.01618004, "balance_loss_mlp": 1.03643107, "epoch": 0.8452126860063129, "flos": 27817985583360.0, "grad_norm": 1.7753080070788223, "language_loss": 0.77428424, "learning_rate": 2.460224944284284e-07, "loss": 0.7956301, "num_input_tokens_seen": 303146920, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.703125, "step": 14058, "time_per_iteration": 2.5149948596954346 }, { "auxiliary_loss_clip": 0.01105589, "auxiliary_loss_mlp": 0.01033069, "balance_loss_clip": 1.02108812, "balance_loss_mlp": 1.03614831, "epoch": 0.845272809258981, "flos": 27124694202240.0, "grad_norm": 3.1598039613829516, "language_loss": 0.69849706, "learning_rate": 2.45835387101033e-07, "loss": 0.71988368, "num_input_tokens_seen": 303167885, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 14059, "time_per_iteration": 2.5032758712768555 }, { "auxiliary_loss_clip": 0.01107468, "auxiliary_loss_mlp": 0.01034243, "balance_loss_clip": 1.02120113, "balance_loss_mlp": 1.0356133, "epoch": 0.8453329325116489, "flos": 18332577452160.0, "grad_norm": 3.3659086326208185, "language_loss": 0.57499897, "learning_rate": 2.4564834629173516e-07, "loss": 0.596416, "num_input_tokens_seen": 303185000, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 14060, "time_per_iteration": 2.4671683311462402 }, { "auxiliary_loss_clip": 0.01106998, "auxiliary_loss_mlp": 0.01034122, "balance_loss_clip": 1.02059209, "balance_loss_mlp": 1.03501797, "epoch": 0.8453930557643169, "flos": 22675254727680.0, "grad_norm": 3.485342885483422, "language_loss": 0.75728583, "learning_rate": 2.454613720076277e-07, "loss": 0.77869713, "num_input_tokens_seen": 303205210, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 14061, "time_per_iteration": 2.4841911792755127 }, { "auxiliary_loss_clip": 0.01106333, "auxiliary_loss_mlp": 0.01028764, "balance_loss_clip": 1.01605022, "balance_loss_mlp": 1.0354569, "epoch": 0.8454531790169848, "flos": 22487261921280.0, "grad_norm": 2.121176197347182, "language_loss": 0.70704025, "learning_rate": 2.452744642558013e-07, "loss": 0.72839123, "num_input_tokens_seen": 303224655, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 14062, "time_per_iteration": 2.4989094734191895 }, { "auxiliary_loss_clip": 0.01029474, "auxiliary_loss_mlp": 0.01001157, "balance_loss_clip": 1.00003612, "balance_loss_mlp": 1.00713861, "epoch": 0.8455133022696528, "flos": 58277848481280.0, "grad_norm": 0.6344894996292454, "language_loss": 0.52696484, "learning_rate": 2.450876230433432e-07, "loss": 0.54727113, "num_input_tokens_seen": 303289645, "router_z_loss_clip": 0.01123047, "router_z_loss_mlp": 0.22363281, "step": 14063, "time_per_iteration": 3.1919803619384766 }, { "auxiliary_loss_clip": 0.0110103, "auxiliary_loss_mlp": 0.01025738, "balance_loss_clip": 1.01505637, "balance_loss_mlp": 1.03621984, "epoch": 0.8455734255223207, "flos": 21361283308800.0, "grad_norm": 1.8884542719837267, "language_loss": 0.82340616, "learning_rate": 2.449008483773378e-07, "loss": 0.84467387, "num_input_tokens_seen": 303308350, "router_z_loss_clip": 0.10693359, "router_z_loss_mlp": 0.6484375, "step": 14064, "time_per_iteration": 2.4614455699920654 }, { "auxiliary_loss_clip": 0.01108674, "auxiliary_loss_mlp": 0.01032431, "balance_loss_clip": 1.01921713, "balance_loss_mlp": 1.03780293, "epoch": 0.8456335487749888, "flos": 20449260057600.0, "grad_norm": 2.059283487125177, "language_loss": 0.72753811, "learning_rate": 2.447141402648685e-07, "loss": 0.74894911, "num_input_tokens_seen": 303325230, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 14065, "time_per_iteration": 2.4764437675476074 }, { "auxiliary_loss_clip": 0.01102356, "auxiliary_loss_mlp": 0.01029017, "balance_loss_clip": 1.01765585, "balance_loss_mlp": 1.03561711, "epoch": 0.8456936720276567, "flos": 28840901097600.0, "grad_norm": 1.5580711126121112, "language_loss": 0.77494079, "learning_rate": 2.445274987130146e-07, "loss": 0.79625452, "num_input_tokens_seen": 303345810, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6640625, "step": 14066, "time_per_iteration": 2.5422987937927246 }, { "auxiliary_loss_clip": 0.01105708, "auxiliary_loss_mlp": 0.0102995, "balance_loss_clip": 1.01771307, "balance_loss_mlp": 1.03783071, "epoch": 0.8457537952803247, "flos": 22672884430080.0, "grad_norm": 1.5836353963314829, "language_loss": 0.69899058, "learning_rate": 2.4434092372885363e-07, "loss": 0.72034717, "num_input_tokens_seen": 303365140, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6796875, "step": 14067, "time_per_iteration": 2.4800851345062256 }, { "auxiliary_loss_clip": 0.01099922, "auxiliary_loss_mlp": 0.01027412, "balance_loss_clip": 1.01575375, "balance_loss_mlp": 1.032323, "epoch": 0.8458139185329926, "flos": 33802929607680.0, "grad_norm": 2.6054612489334676, "language_loss": 0.70722771, "learning_rate": 2.4415441531946144e-07, "loss": 0.72850102, "num_input_tokens_seen": 303386150, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.67578125, "step": 14068, "time_per_iteration": 2.5598297119140625 }, { "auxiliary_loss_clip": 0.01029515, "auxiliary_loss_mlp": 0.00999258, "balance_loss_clip": 0.99813712, "balance_loss_mlp": 1.00724924, "epoch": 0.8458740417856606, "flos": 70295929603200.0, "grad_norm": 0.7366614438577591, "language_loss": 0.60446388, "learning_rate": 2.4396797349190976e-07, "loss": 0.62475163, "num_input_tokens_seen": 303453770, "router_z_loss_clip": 0.01123047, "router_z_loss_mlp": 0.22265625, "step": 14069, "time_per_iteration": 3.198593854904175 }, { "auxiliary_loss_clip": 0.01105129, "auxiliary_loss_mlp": 0.01028314, "balance_loss_clip": 1.01703656, "balance_loss_mlp": 1.03555429, "epoch": 0.8459341650383285, "flos": 24170862245760.0, "grad_norm": 1.5960270836558048, "language_loss": 0.74471283, "learning_rate": 2.4378159825326804e-07, "loss": 0.76604724, "num_input_tokens_seen": 303474520, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6953125, "step": 14070, "time_per_iteration": 2.498126268386841 }, { "auxiliary_loss_clip": 0.01103981, "auxiliary_loss_mlp": 0.01032607, "balance_loss_clip": 1.01996469, "balance_loss_mlp": 1.0357672, "epoch": 0.8459942882909965, "flos": 38181158369280.0, "grad_norm": 1.808119452543842, "language_loss": 0.66975677, "learning_rate": 2.435952896106039e-07, "loss": 0.69112265, "num_input_tokens_seen": 303497345, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 14071, "time_per_iteration": 2.6121561527252197 }, { "auxiliary_loss_clip": 0.01029301, "auxiliary_loss_mlp": 0.01001098, "balance_loss_clip": 1.00003147, "balance_loss_mlp": 1.00692463, "epoch": 0.8460544115436646, "flos": 64118252177280.0, "grad_norm": 0.7295907211902207, "language_loss": 0.60984659, "learning_rate": 2.4340904757098313e-07, "loss": 0.63015056, "num_input_tokens_seen": 303554890, "router_z_loss_clip": 0.01068115, "router_z_loss_mlp": 0.22460938, "step": 14072, "time_per_iteration": 2.9830613136291504 }, { "auxiliary_loss_clip": 0.01106265, "auxiliary_loss_mlp": 0.01032337, "balance_loss_clip": 1.01864552, "balance_loss_mlp": 1.03572762, "epoch": 0.8461145347963325, "flos": 24170826332160.0, "grad_norm": 1.8153348544627281, "language_loss": 0.72388935, "learning_rate": 2.4322287214146664e-07, "loss": 0.74527538, "num_input_tokens_seen": 303574380, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 14073, "time_per_iteration": 2.5334243774414062 }, { "auxiliary_loss_clip": 0.01110577, "auxiliary_loss_mlp": 0.01033897, "balance_loss_clip": 1.02079618, "balance_loss_mlp": 1.03675103, "epoch": 0.8461746580490005, "flos": 34893787697280.0, "grad_norm": 1.7311591824944383, "language_loss": 0.77831525, "learning_rate": 2.430367633291155e-07, "loss": 0.79975998, "num_input_tokens_seen": 303594910, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 14074, "time_per_iteration": 2.6696271896362305 }, { "auxiliary_loss_clip": 0.01104701, "auxiliary_loss_mlp": 0.01029647, "balance_loss_clip": 1.01709449, "balance_loss_mlp": 1.03576636, "epoch": 0.8462347813016684, "flos": 25557014044800.0, "grad_norm": 2.1323541735383005, "language_loss": 0.75353134, "learning_rate": 2.4285072114098583e-07, "loss": 0.77487481, "num_input_tokens_seen": 303613520, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 14075, "time_per_iteration": 2.5414071083068848 }, { "auxiliary_loss_clip": 0.01102985, "auxiliary_loss_mlp": 0.01030206, "balance_loss_clip": 1.01730752, "balance_loss_mlp": 1.03509212, "epoch": 0.8462949045543364, "flos": 21325336773120.0, "grad_norm": 2.1794960605185554, "language_loss": 0.73500717, "learning_rate": 2.4266474558413355e-07, "loss": 0.75633907, "num_input_tokens_seen": 303631225, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6796875, "step": 14076, "time_per_iteration": 2.4891514778137207 }, { "auxiliary_loss_clip": 0.01106203, "auxiliary_loss_mlp": 0.01034151, "balance_loss_clip": 1.02200389, "balance_loss_mlp": 1.03534579, "epoch": 0.8463550278070043, "flos": 22637440684800.0, "grad_norm": 2.0661133275592203, "language_loss": 0.7763958, "learning_rate": 2.4247883666560945e-07, "loss": 0.79779929, "num_input_tokens_seen": 303649175, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.70703125, "step": 14077, "time_per_iteration": 2.4843361377716064 }, { "auxiliary_loss_clip": 0.01108912, "auxiliary_loss_mlp": 0.01033912, "balance_loss_clip": 1.02180028, "balance_loss_mlp": 1.03767991, "epoch": 0.8464151510596724, "flos": 13005588804480.0, "grad_norm": 2.37388092901161, "language_loss": 0.75036824, "learning_rate": 2.422929943924643e-07, "loss": 0.77179646, "num_input_tokens_seen": 303665915, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.7109375, "step": 14078, "time_per_iteration": 2.4371566772460938 }, { "auxiliary_loss_clip": 0.01102, "auxiliary_loss_mlp": 0.01029041, "balance_loss_clip": 1.01646447, "balance_loss_mlp": 1.03453767, "epoch": 0.8464752743123403, "flos": 15704921923200.0, "grad_norm": 2.384837431117847, "language_loss": 0.8536616, "learning_rate": 2.4210721877174565e-07, "loss": 0.87497199, "num_input_tokens_seen": 303679985, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.671875, "step": 14079, "time_per_iteration": 2.427987575531006 }, { "auxiliary_loss_clip": 0.01112637, "auxiliary_loss_mlp": 0.01037701, "balance_loss_clip": 1.0243082, "balance_loss_mlp": 1.03807068, "epoch": 0.8465353975650083, "flos": 21653955325440.0, "grad_norm": 2.0183576518305317, "language_loss": 0.58888412, "learning_rate": 2.419215098104965e-07, "loss": 0.61038756, "num_input_tokens_seen": 303698470, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.74609375, "step": 14080, "time_per_iteration": 2.4884049892425537 }, { "auxiliary_loss_clip": 0.01106479, "auxiliary_loss_mlp": 0.01030316, "balance_loss_clip": 1.01791227, "balance_loss_mlp": 1.03544366, "epoch": 0.8465955208176762, "flos": 18515650095360.0, "grad_norm": 2.211820702892896, "language_loss": 0.66133094, "learning_rate": 2.4173586751576014e-07, "loss": 0.68269885, "num_input_tokens_seen": 303716415, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 14081, "time_per_iteration": 2.4384000301361084 }, { "auxiliary_loss_clip": 0.01105571, "auxiliary_loss_mlp": 0.01027463, "balance_loss_clip": 1.01598346, "balance_loss_mlp": 1.03529191, "epoch": 0.8466556440703442, "flos": 24200559815040.0, "grad_norm": 1.770022012550579, "language_loss": 0.72922444, "learning_rate": 2.41550291894576e-07, "loss": 0.7505548, "num_input_tokens_seen": 303734490, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.703125, "step": 14082, "time_per_iteration": 2.513683795928955 }, { "auxiliary_loss_clip": 0.01104106, "auxiliary_loss_mlp": 0.0102738, "balance_loss_clip": 1.01591849, "balance_loss_mlp": 1.03434825, "epoch": 0.8467157673230121, "flos": 20375894528640.0, "grad_norm": 2.4425168904101984, "language_loss": 0.75775957, "learning_rate": 2.413647829539809e-07, "loss": 0.77907443, "num_input_tokens_seen": 303752310, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6953125, "step": 14083, "time_per_iteration": 3.9560327529907227 }, { "auxiliary_loss_clip": 0.01109657, "auxiliary_loss_mlp": 0.01035143, "balance_loss_clip": 1.02159452, "balance_loss_mlp": 1.03671539, "epoch": 0.8467758905756801, "flos": 28473642489600.0, "grad_norm": 1.9601095383379383, "language_loss": 0.65600717, "learning_rate": 2.411793407010092e-07, "loss": 0.67745519, "num_input_tokens_seen": 303776065, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 14084, "time_per_iteration": 2.562562942504883 }, { "auxiliary_loss_clip": 0.01106294, "auxiliary_loss_mlp": 0.01031992, "balance_loss_clip": 1.01998174, "balance_loss_mlp": 1.03766811, "epoch": 0.8468360138283482, "flos": 11692551139200.0, "grad_norm": 24.881701092179956, "language_loss": 0.7000463, "learning_rate": 2.409939651426938e-07, "loss": 0.72142911, "num_input_tokens_seen": 303793500, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 14085, "time_per_iteration": 3.8990302085876465 }, { "auxiliary_loss_clip": 0.01104227, "auxiliary_loss_mlp": 0.01028774, "balance_loss_clip": 1.01691318, "balance_loss_mlp": 1.03481603, "epoch": 0.8468961370810161, "flos": 24607859109120.0, "grad_norm": 1.535918537293271, "language_loss": 0.71061456, "learning_rate": 2.408086562860634e-07, "loss": 0.73194456, "num_input_tokens_seen": 303814835, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69140625, "step": 14086, "time_per_iteration": 2.509033203125 }, { "auxiliary_loss_clip": 0.01103092, "auxiliary_loss_mlp": 0.01031679, "balance_loss_clip": 1.02016962, "balance_loss_mlp": 1.03526378, "epoch": 0.8469562603336841, "flos": 19609812236160.0, "grad_norm": 1.6583112877305428, "language_loss": 0.74612409, "learning_rate": 2.4062341413814445e-07, "loss": 0.76747185, "num_input_tokens_seen": 303834505, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 14087, "time_per_iteration": 2.466146945953369 }, { "auxiliary_loss_clip": 0.0110419, "auxiliary_loss_mlp": 0.01027272, "balance_loss_clip": 1.01532698, "balance_loss_mlp": 1.03657055, "epoch": 0.847016383586352, "flos": 22638949056000.0, "grad_norm": 1.4102075283623787, "language_loss": 0.7398113, "learning_rate": 2.4043823870596227e-07, "loss": 0.76112592, "num_input_tokens_seen": 303855050, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.67578125, "step": 14088, "time_per_iteration": 3.8430697917938232 }, { "auxiliary_loss_clip": 0.01105426, "auxiliary_loss_mlp": 0.01032255, "balance_loss_clip": 1.02018476, "balance_loss_mlp": 1.03517962, "epoch": 0.84707650683902, "flos": 20960161153920.0, "grad_norm": 2.2734442921624205, "language_loss": 0.72358894, "learning_rate": 2.402531299965387e-07, "loss": 0.74496573, "num_input_tokens_seen": 303875635, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.703125, "step": 14089, "time_per_iteration": 3.889759063720703 }, { "auxiliary_loss_clip": 0.01102619, "auxiliary_loss_mlp": 0.01026664, "balance_loss_clip": 1.01523161, "balance_loss_mlp": 1.03633022, "epoch": 0.8471366300916879, "flos": 24093007516800.0, "grad_norm": 3.0006861436981476, "language_loss": 0.79415292, "learning_rate": 2.400680880168928e-07, "loss": 0.81544578, "num_input_tokens_seen": 303896750, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6640625, "step": 14090, "time_per_iteration": 2.521514415740967 }, { "auxiliary_loss_clip": 0.01106926, "auxiliary_loss_mlp": 0.01039401, "balance_loss_clip": 1.02593684, "balance_loss_mlp": 1.03597689, "epoch": 0.847196753344356, "flos": 18332900674560.0, "grad_norm": 3.683495762996887, "language_loss": 0.76640916, "learning_rate": 2.3988311277404085e-07, "loss": 0.78787243, "num_input_tokens_seen": 303915435, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 14091, "time_per_iteration": 2.4271202087402344 }, { "auxiliary_loss_clip": 0.01029276, "auxiliary_loss_mlp": 0.01002697, "balance_loss_clip": 1.00163043, "balance_loss_mlp": 1.0070281, "epoch": 0.8472568765970239, "flos": 49567536956160.0, "grad_norm": 0.8480296891023302, "language_loss": 0.59462857, "learning_rate": 2.396982042749982e-07, "loss": 0.61494827, "num_input_tokens_seen": 303977245, "router_z_loss_clip": 0.01068115, "router_z_loss_mlp": 0.22265625, "step": 14092, "time_per_iteration": 3.1880531311035156 }, { "auxiliary_loss_clip": 0.01102912, "auxiliary_loss_mlp": 0.01034066, "balance_loss_clip": 1.02085137, "balance_loss_mlp": 1.03364635, "epoch": 0.8473169998496919, "flos": 19279074781440.0, "grad_norm": 2.2005916375035333, "language_loss": 0.70284617, "learning_rate": 2.395133625267756e-07, "loss": 0.72421592, "num_input_tokens_seen": 303996055, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69140625, "step": 14093, "time_per_iteration": 2.4596238136291504 }, { "auxiliary_loss_clip": 0.01100399, "auxiliary_loss_mlp": 0.0102663, "balance_loss_clip": 1.0154717, "balance_loss_mlp": 1.0336256, "epoch": 0.8473771231023598, "flos": 17675555829120.0, "grad_norm": 1.914758911514629, "language_loss": 0.83397627, "learning_rate": 2.3932858753638263e-07, "loss": 0.85524654, "num_input_tokens_seen": 304012205, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6640625, "step": 14094, "time_per_iteration": 2.4330015182495117 }, { "auxiliary_loss_clip": 0.01100107, "auxiliary_loss_mlp": 0.01032375, "balance_loss_clip": 1.02048421, "balance_loss_mlp": 1.03467488, "epoch": 0.8474372463550278, "flos": 26359761144960.0, "grad_norm": 6.645817308489762, "language_loss": 0.71034753, "learning_rate": 2.3914387931082626e-07, "loss": 0.73167241, "num_input_tokens_seen": 304033475, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.65625, "step": 14095, "time_per_iteration": 2.5167508125305176 }, { "auxiliary_loss_clip": 0.01101743, "auxiliary_loss_mlp": 0.01034183, "balance_loss_clip": 1.02258992, "balance_loss_mlp": 1.0351305, "epoch": 0.8474973696076957, "flos": 23402050519680.0, "grad_norm": 1.8854521352713867, "language_loss": 0.80658489, "learning_rate": 2.3895923785711105e-07, "loss": 0.82794416, "num_input_tokens_seen": 304051845, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66796875, "step": 14096, "time_per_iteration": 2.513422727584839 }, { "auxiliary_loss_clip": 0.01105912, "auxiliary_loss_mlp": 0.01031146, "balance_loss_clip": 1.01828933, "balance_loss_mlp": 1.03467572, "epoch": 0.8475574928603637, "flos": 25075666863360.0, "grad_norm": 2.245490514212322, "language_loss": 0.77357686, "learning_rate": 2.387746631822374e-07, "loss": 0.79494745, "num_input_tokens_seen": 304069965, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 14097, "time_per_iteration": 2.4792888164520264 }, { "auxiliary_loss_clip": 0.01104124, "auxiliary_loss_mlp": 0.01029072, "balance_loss_clip": 1.01738918, "balance_loss_mlp": 1.03711772, "epoch": 0.8476176161130318, "flos": 19966691813760.0, "grad_norm": 2.2459731061761925, "language_loss": 0.79972529, "learning_rate": 2.385901552932048e-07, "loss": 0.82105726, "num_input_tokens_seen": 304086805, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 14098, "time_per_iteration": 2.4774279594421387 }, { "auxiliary_loss_clip": 0.01104554, "auxiliary_loss_mlp": 0.01031485, "balance_loss_clip": 1.01927781, "balance_loss_mlp": 1.03716826, "epoch": 0.8476777393656997, "flos": 21285834791040.0, "grad_norm": 2.4559185306156412, "language_loss": 0.71810544, "learning_rate": 2.3840571419701062e-07, "loss": 0.73946583, "num_input_tokens_seen": 304105865, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 14099, "time_per_iteration": 2.4418599605560303 }, { "auxiliary_loss_clip": 0.01103101, "auxiliary_loss_mlp": 0.01032162, "balance_loss_clip": 1.01881015, "balance_loss_mlp": 1.03486323, "epoch": 0.8477378626183677, "flos": 29971476650880.0, "grad_norm": 2.287763463597233, "language_loss": 0.63773328, "learning_rate": 2.3822133990064787e-07, "loss": 0.65908593, "num_input_tokens_seen": 304128300, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.68359375, "step": 14100, "time_per_iteration": 2.5383992195129395 }, { "auxiliary_loss_clip": 0.01105898, "auxiliary_loss_mlp": 0.01030328, "balance_loss_clip": 1.01744199, "balance_loss_mlp": 1.03547776, "epoch": 0.8477979858710356, "flos": 24237727413120.0, "grad_norm": 2.210623344740743, "language_loss": 0.74185771, "learning_rate": 2.380370324111085e-07, "loss": 0.76321995, "num_input_tokens_seen": 304143695, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 14101, "time_per_iteration": 2.445971965789795 }, { "auxiliary_loss_clip": 0.01103074, "auxiliary_loss_mlp": 0.01028555, "balance_loss_clip": 1.01726556, "balance_loss_mlp": 1.03390503, "epoch": 0.8478581091237036, "flos": 25593678852480.0, "grad_norm": 1.506705845072467, "language_loss": 0.71355963, "learning_rate": 2.3785279173538163e-07, "loss": 0.73487592, "num_input_tokens_seen": 304165800, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.69140625, "step": 14102, "time_per_iteration": 2.499617576599121 }, { "auxiliary_loss_clip": 0.01106275, "auxiliary_loss_mlp": 0.01029701, "balance_loss_clip": 1.01687384, "balance_loss_mlp": 1.03492308, "epoch": 0.8479182323763715, "flos": 12057116227200.0, "grad_norm": 2.1427676382254575, "language_loss": 0.81977499, "learning_rate": 2.3766861788045366e-07, "loss": 0.84113473, "num_input_tokens_seen": 304182910, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 14103, "time_per_iteration": 2.4127197265625 }, { "auxiliary_loss_clip": 0.01103907, "auxiliary_loss_mlp": 0.01028445, "balance_loss_clip": 1.01658416, "balance_loss_mlp": 1.03631854, "epoch": 0.8479783556290396, "flos": 21433391861760.0, "grad_norm": 2.004683783018455, "language_loss": 0.7814675, "learning_rate": 2.374845108533079e-07, "loss": 0.802791, "num_input_tokens_seen": 304200175, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.67578125, "step": 14104, "time_per_iteration": 2.456799268722534 }, { "auxiliary_loss_clip": 0.01107111, "auxiliary_loss_mlp": 0.01030917, "balance_loss_clip": 1.01808977, "balance_loss_mlp": 1.03715622, "epoch": 0.8480384788817075, "flos": 19642634288640.0, "grad_norm": 1.9648041428901524, "language_loss": 0.791448, "learning_rate": 2.3730047066092607e-07, "loss": 0.81282824, "num_input_tokens_seen": 304217775, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 14105, "time_per_iteration": 2.4247376918792725 }, { "auxiliary_loss_clip": 0.01110131, "auxiliary_loss_mlp": 0.01032605, "balance_loss_clip": 1.01947451, "balance_loss_mlp": 1.03707767, "epoch": 0.8480986021343755, "flos": 22489201255680.0, "grad_norm": 2.60259378457724, "language_loss": 0.50664556, "learning_rate": 2.3711649731028749e-07, "loss": 0.52807289, "num_input_tokens_seen": 304235760, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73046875, "step": 14106, "time_per_iteration": 2.4849355220794678 }, { "auxiliary_loss_clip": 0.01103413, "auxiliary_loss_mlp": 0.01034801, "balance_loss_clip": 1.02330303, "balance_loss_mlp": 1.03566277, "epoch": 0.8481587253870434, "flos": 22090557139200.0, "grad_norm": 1.9730222702203817, "language_loss": 0.76030898, "learning_rate": 2.3693259080836792e-07, "loss": 0.78169107, "num_input_tokens_seen": 304253985, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.67578125, "step": 14107, "time_per_iteration": 2.453690767288208 }, { "auxiliary_loss_clip": 0.01103399, "auxiliary_loss_mlp": 0.01027272, "balance_loss_clip": 1.01554203, "balance_loss_mlp": 1.03527009, "epoch": 0.8482188486397114, "flos": 33582689366400.0, "grad_norm": 1.777867127686351, "language_loss": 0.73650271, "learning_rate": 2.3674875116214087e-07, "loss": 0.75780946, "num_input_tokens_seen": 304276785, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 14108, "time_per_iteration": 2.5764753818511963 }, { "auxiliary_loss_clip": 0.01100658, "auxiliary_loss_mlp": 0.01025037, "balance_loss_clip": 1.01169169, "balance_loss_mlp": 1.0345943, "epoch": 0.8482789718923793, "flos": 20919402195840.0, "grad_norm": 2.044002749166272, "language_loss": 0.7281397, "learning_rate": 2.3656497837857836e-07, "loss": 0.74939668, "num_input_tokens_seen": 304296310, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.66015625, "step": 14109, "time_per_iteration": 2.4513697624206543 }, { "auxiliary_loss_clip": 0.01100892, "auxiliary_loss_mlp": 0.01032227, "balance_loss_clip": 1.01994836, "balance_loss_mlp": 1.03426802, "epoch": 0.8483390951450474, "flos": 12896204912640.0, "grad_norm": 15.366388402131989, "language_loss": 0.74252689, "learning_rate": 2.3638127246464811e-07, "loss": 0.76385802, "num_input_tokens_seen": 304311715, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6640625, "step": 14110, "time_per_iteration": 2.440793991088867 }, { "auxiliary_loss_clip": 0.01105929, "auxiliary_loss_mlp": 0.01033885, "balance_loss_clip": 1.022012, "balance_loss_mlp": 1.03721237, "epoch": 0.8483992183977154, "flos": 25081628520960.0, "grad_norm": 1.7147229567858926, "language_loss": 0.75745404, "learning_rate": 2.3619763342731658e-07, "loss": 0.77885222, "num_input_tokens_seen": 304331910, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 14111, "time_per_iteration": 2.4698703289031982 }, { "auxiliary_loss_clip": 0.0110392, "auxiliary_loss_mlp": 0.01028361, "balance_loss_clip": 1.01699424, "balance_loss_mlp": 1.03703833, "epoch": 0.8484593416503833, "flos": 25557445008000.0, "grad_norm": 1.642358660155697, "language_loss": 0.67803836, "learning_rate": 2.3601406127354772e-07, "loss": 0.69936109, "num_input_tokens_seen": 304351405, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.66796875, "step": 14112, "time_per_iteration": 2.4991660118103027 }, { "auxiliary_loss_clip": 0.0110402, "auxiliary_loss_mlp": 0.01030064, "balance_loss_clip": 1.01834607, "balance_loss_mlp": 1.03473783, "epoch": 0.8485194649030513, "flos": 27198454780800.0, "grad_norm": 1.5719597966804624, "language_loss": 0.73719275, "learning_rate": 2.3583055601030312e-07, "loss": 0.7585336, "num_input_tokens_seen": 304372935, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69140625, "step": 14113, "time_per_iteration": 2.50880765914917 }, { "auxiliary_loss_clip": 0.01103992, "auxiliary_loss_mlp": 0.01028425, "balance_loss_clip": 1.01648045, "balance_loss_mlp": 1.03579307, "epoch": 0.8485795881557192, "flos": 24205910941440.0, "grad_norm": 2.095366998555587, "language_loss": 0.66917956, "learning_rate": 2.3564711764454003e-07, "loss": 0.69050372, "num_input_tokens_seen": 304393070, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 14114, "time_per_iteration": 2.4855589866638184 }, { "auxiliary_loss_clip": 0.01105791, "auxiliary_loss_mlp": 0.01035928, "balance_loss_clip": 1.02287483, "balance_loss_mlp": 1.03590512, "epoch": 0.8486397114083872, "flos": 21141653598720.0, "grad_norm": 1.6963389190055176, "language_loss": 0.78941673, "learning_rate": 2.3546374618321495e-07, "loss": 0.81083393, "num_input_tokens_seen": 304411195, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 14115, "time_per_iteration": 2.4489214420318604 }, { "auxiliary_loss_clip": 0.01103397, "auxiliary_loss_mlp": 0.01031662, "balance_loss_clip": 1.02013469, "balance_loss_mlp": 1.03504694, "epoch": 0.8486998346610551, "flos": 19974772373760.0, "grad_norm": 2.398647492910261, "language_loss": 0.78881168, "learning_rate": 2.3528044163328187e-07, "loss": 0.81016231, "num_input_tokens_seen": 304429425, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.68359375, "step": 14116, "time_per_iteration": 2.45086407661438 }, { "auxiliary_loss_clip": 0.01105726, "auxiliary_loss_mlp": 0.01030026, "balance_loss_clip": 1.01735377, "balance_loss_mlp": 1.03470325, "epoch": 0.8487599579137232, "flos": 19792310261760.0, "grad_norm": 2.570418231160519, "language_loss": 0.68393743, "learning_rate": 2.3509720400169076e-07, "loss": 0.70529491, "num_input_tokens_seen": 304447460, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 14117, "time_per_iteration": 2.445023536682129 }, { "auxiliary_loss_clip": 0.01104662, "auxiliary_loss_mlp": 0.01028526, "balance_loss_clip": 1.0160985, "balance_loss_mlp": 1.03367448, "epoch": 0.8488200811663911, "flos": 26396030903040.0, "grad_norm": 2.2906243437658698, "language_loss": 0.65042847, "learning_rate": 2.3491403329539096e-07, "loss": 0.67176032, "num_input_tokens_seen": 304468230, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 14118, "time_per_iteration": 2.513298511505127 }, { "auxiliary_loss_clip": 0.01102464, "auxiliary_loss_mlp": 0.0102941, "balance_loss_clip": 1.01803184, "balance_loss_mlp": 1.03470755, "epoch": 0.8488802044190591, "flos": 16359285939840.0, "grad_norm": 2.034481180683133, "language_loss": 0.73323745, "learning_rate": 2.3473092952132757e-07, "loss": 0.75455618, "num_input_tokens_seen": 304484860, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.67578125, "step": 14119, "time_per_iteration": 2.4100940227508545 }, { "auxiliary_loss_clip": 0.01105098, "auxiliary_loss_mlp": 0.01031294, "balance_loss_clip": 1.01770985, "balance_loss_mlp": 1.03545594, "epoch": 0.848940327671727, "flos": 19208869649280.0, "grad_norm": 2.713374567326, "language_loss": 0.77694523, "learning_rate": 2.345478926864446e-07, "loss": 0.79830921, "num_input_tokens_seen": 304503575, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.6953125, "step": 14120, "time_per_iteration": 2.441472291946411 }, { "auxiliary_loss_clip": 0.01105533, "auxiliary_loss_mlp": 0.01029464, "balance_loss_clip": 1.01705408, "balance_loss_mlp": 1.03592563, "epoch": 0.849000450924395, "flos": 21871178824320.0, "grad_norm": 6.72296142755312, "language_loss": 0.75600958, "learning_rate": 2.3436492279768227e-07, "loss": 0.7773596, "num_input_tokens_seen": 304525005, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 14121, "time_per_iteration": 2.4770314693450928 }, { "auxiliary_loss_clip": 0.01029184, "auxiliary_loss_mlp": 0.00999524, "balance_loss_clip": 0.99846935, "balance_loss_mlp": 1.00709152, "epoch": 0.8490605741770629, "flos": 71166475624320.0, "grad_norm": 0.823931215803947, "language_loss": 0.6016221, "learning_rate": 2.3418201986197883e-07, "loss": 0.62190914, "num_input_tokens_seen": 304585220, "router_z_loss_clip": 0.01055908, "router_z_loss_mlp": 0.22070312, "step": 14122, "time_per_iteration": 3.080110549926758 }, { "auxiliary_loss_clip": 0.01105669, "auxiliary_loss_mlp": 0.0102923, "balance_loss_clip": 1.01752305, "balance_loss_mlp": 1.03666711, "epoch": 0.849120697429731, "flos": 24973357950720.0, "grad_norm": 1.9065523043176373, "language_loss": 0.79847133, "learning_rate": 2.3399918388627048e-07, "loss": 0.81982034, "num_input_tokens_seen": 304604665, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69140625, "step": 14123, "time_per_iteration": 2.4734981060028076 }, { "auxiliary_loss_clip": 0.01101894, "auxiliary_loss_mlp": 0.01027807, "balance_loss_clip": 1.01615477, "balance_loss_mlp": 1.03626585, "epoch": 0.8491808206823989, "flos": 23032277959680.0, "grad_norm": 2.3958245205012516, "language_loss": 0.83123261, "learning_rate": 2.3381641487749016e-07, "loss": 0.85252964, "num_input_tokens_seen": 304620600, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.65625, "step": 14124, "time_per_iteration": 3.932361602783203 }, { "auxiliary_loss_clip": 0.01107281, "auxiliary_loss_mlp": 0.01031911, "balance_loss_clip": 1.01900673, "balance_loss_mlp": 1.03815699, "epoch": 0.8492409439350669, "flos": 23878549365120.0, "grad_norm": 1.955957867269213, "language_loss": 0.71725655, "learning_rate": 2.3363371284256805e-07, "loss": 0.73864847, "num_input_tokens_seen": 304639540, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 14125, "time_per_iteration": 2.4562623500823975 }, { "auxiliary_loss_clip": 0.0110824, "auxiliary_loss_mlp": 0.01035422, "balance_loss_clip": 1.02160573, "balance_loss_mlp": 1.03619432, "epoch": 0.8493010671877349, "flos": 22419893963520.0, "grad_norm": 1.5635021180270205, "language_loss": 0.73862129, "learning_rate": 2.3345107778843288e-07, "loss": 0.76005793, "num_input_tokens_seen": 304660595, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.72265625, "step": 14126, "time_per_iteration": 2.474271297454834 }, { "auxiliary_loss_clip": 0.01101677, "auxiliary_loss_mlp": 0.01029597, "balance_loss_clip": 1.01783717, "balance_loss_mlp": 1.03404295, "epoch": 0.8493611904404028, "flos": 17529435302400.0, "grad_norm": 1.7946492065102635, "language_loss": 0.67562425, "learning_rate": 2.3326850972200928e-07, "loss": 0.69693696, "num_input_tokens_seen": 304679580, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.67578125, "step": 14127, "time_per_iteration": 3.941413164138794 }, { "auxiliary_loss_clip": 0.01106662, "auxiliary_loss_mlp": 0.01027788, "balance_loss_clip": 1.0154562, "balance_loss_mlp": 1.03607583, "epoch": 0.8494213136930708, "flos": 19462937523840.0, "grad_norm": 2.2887485152094214, "language_loss": 0.69254887, "learning_rate": 2.330860086502211e-07, "loss": 0.71389341, "num_input_tokens_seen": 304698385, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.70703125, "step": 14128, "time_per_iteration": 2.4182512760162354 }, { "auxiliary_loss_clip": 0.01103734, "auxiliary_loss_mlp": 0.01031687, "balance_loss_clip": 1.01958156, "balance_loss_mlp": 1.03627956, "epoch": 0.8494814369457387, "flos": 18770292587520.0, "grad_norm": 1.8217711402757741, "language_loss": 0.78010201, "learning_rate": 2.3290357457998855e-07, "loss": 0.80145627, "num_input_tokens_seen": 304715430, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.67578125, "step": 14129, "time_per_iteration": 3.795470952987671 }, { "auxiliary_loss_clip": 0.01104831, "auxiliary_loss_mlp": 0.01028417, "balance_loss_clip": 1.01635265, "balance_loss_mlp": 1.03602862, "epoch": 0.8495415601984068, "flos": 23331486251520.0, "grad_norm": 1.9011393902927607, "language_loss": 0.68375844, "learning_rate": 2.3272120751823031e-07, "loss": 0.70509088, "num_input_tokens_seen": 304734345, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 14130, "time_per_iteration": 2.461280107498169 }, { "auxiliary_loss_clip": 0.0110311, "auxiliary_loss_mlp": 0.01029336, "balance_loss_clip": 1.01706302, "balance_loss_mlp": 1.0338738, "epoch": 0.8496016834510747, "flos": 26612859352320.0, "grad_norm": 3.1087441054041562, "language_loss": 0.71073747, "learning_rate": 2.3253890747186e-07, "loss": 0.73206192, "num_input_tokens_seen": 304755030, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69140625, "step": 14131, "time_per_iteration": 3.924522638320923 }, { "auxiliary_loss_clip": 0.01101987, "auxiliary_loss_mlp": 0.01028594, "balance_loss_clip": 1.01648831, "balance_loss_mlp": 1.03243208, "epoch": 0.8496618067037427, "flos": 25480380378240.0, "grad_norm": 1.7837400299937516, "language_loss": 0.68331873, "learning_rate": 2.3235667444779162e-07, "loss": 0.70462453, "num_input_tokens_seen": 304774320, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 14132, "time_per_iteration": 2.4990055561065674 }, { "auxiliary_loss_clip": 0.01099938, "auxiliary_loss_mlp": 0.01031471, "balance_loss_clip": 1.02051604, "balance_loss_mlp": 1.03324938, "epoch": 0.8497219299564106, "flos": 25374587846400.0, "grad_norm": 2.1943805720457883, "language_loss": 0.70186377, "learning_rate": 2.3217450845293564e-07, "loss": 0.72317785, "num_input_tokens_seen": 304795355, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6640625, "step": 14133, "time_per_iteration": 2.4667763710021973 }, { "auxiliary_loss_clip": 0.01028719, "auxiliary_loss_mlp": 0.010011, "balance_loss_clip": 1.0000031, "balance_loss_mlp": 1.00664377, "epoch": 0.8497820532090786, "flos": 67780279658880.0, "grad_norm": 0.7371007686057846, "language_loss": 0.57639706, "learning_rate": 2.3199240949419918e-07, "loss": 0.5966953, "num_input_tokens_seen": 304863915, "router_z_loss_clip": 0.01098633, "router_z_loss_mlp": 0.22070312, "step": 14134, "time_per_iteration": 3.22103214263916 }, { "auxiliary_loss_clip": 0.01105957, "auxiliary_loss_mlp": 0.01030976, "balance_loss_clip": 1.01839304, "balance_loss_mlp": 1.03563309, "epoch": 0.8498421764617465, "flos": 23440546920960.0, "grad_norm": 2.5835525578234786, "language_loss": 0.79060388, "learning_rate": 2.3181037757848787e-07, "loss": 0.81197315, "num_input_tokens_seen": 304881555, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 14135, "time_per_iteration": 2.45634126663208 }, { "auxiliary_loss_clip": 0.01105893, "auxiliary_loss_mlp": 0.01028255, "balance_loss_clip": 1.01534474, "balance_loss_mlp": 1.03541923, "epoch": 0.8499022997144146, "flos": 17712615686400.0, "grad_norm": 1.679749320979946, "language_loss": 0.627101, "learning_rate": 2.316284127127044e-07, "loss": 0.64844251, "num_input_tokens_seen": 304898760, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 14136, "time_per_iteration": 2.44699764251709 }, { "auxiliary_loss_clip": 0.01107189, "auxiliary_loss_mlp": 0.01031446, "balance_loss_clip": 1.01815462, "balance_loss_mlp": 1.03645468, "epoch": 0.8499624229670825, "flos": 18588512833920.0, "grad_norm": 1.9662272066991735, "language_loss": 0.84280956, "learning_rate": 2.3144651490374835e-07, "loss": 0.86419588, "num_input_tokens_seen": 304915465, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 14137, "time_per_iteration": 2.4279465675354004 }, { "auxiliary_loss_clip": 0.01101493, "auxiliary_loss_mlp": 0.01027991, "balance_loss_clip": 1.01694036, "balance_loss_mlp": 1.03509641, "epoch": 0.8500225462197505, "flos": 24345854328960.0, "grad_norm": 3.1038142401758195, "language_loss": 0.78957939, "learning_rate": 2.3126468415851773e-07, "loss": 0.81087422, "num_input_tokens_seen": 304933190, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6640625, "step": 14138, "time_per_iteration": 2.5029380321502686 }, { "auxiliary_loss_clip": 0.01104981, "auxiliary_loss_mlp": 0.01028499, "balance_loss_clip": 1.01670933, "balance_loss_mlp": 1.03649545, "epoch": 0.8500826694724185, "flos": 16545518979840.0, "grad_norm": 1.9608446949879623, "language_loss": 0.64380324, "learning_rate": 2.310829204839073e-07, "loss": 0.66513807, "num_input_tokens_seen": 304951110, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 14139, "time_per_iteration": 2.4195075035095215 }, { "auxiliary_loss_clip": 0.01102527, "auxiliary_loss_mlp": 0.01028272, "balance_loss_clip": 1.01718605, "balance_loss_mlp": 1.03506458, "epoch": 0.8501427927250864, "flos": 16289404030080.0, "grad_norm": 1.6091808070802733, "language_loss": 0.70701933, "learning_rate": 2.3090122388681043e-07, "loss": 0.72832727, "num_input_tokens_seen": 304969095, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.671875, "step": 14140, "time_per_iteration": 2.4378271102905273 }, { "auxiliary_loss_clip": 0.01106252, "auxiliary_loss_mlp": 0.01031813, "balance_loss_clip": 1.01952291, "balance_loss_mlp": 1.03484118, "epoch": 0.8502029159777544, "flos": 26687912820480.0, "grad_norm": 1.9704409810621424, "language_loss": 0.64269042, "learning_rate": 2.3071959437411648e-07, "loss": 0.66407114, "num_input_tokens_seen": 304989315, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 14141, "time_per_iteration": 2.4704861640930176 }, { "auxiliary_loss_clip": 0.01106202, "auxiliary_loss_mlp": 0.01033101, "balance_loss_clip": 1.02122784, "balance_loss_mlp": 1.0374248, "epoch": 0.8502630392304223, "flos": 35590778179200.0, "grad_norm": 2.6886481922608443, "language_loss": 0.70842803, "learning_rate": 2.3053803195271214e-07, "loss": 0.72982103, "num_input_tokens_seen": 305011020, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 14142, "time_per_iteration": 2.588141679763794 }, { "auxiliary_loss_clip": 0.01102426, "auxiliary_loss_mlp": 0.01026089, "balance_loss_clip": 1.01452005, "balance_loss_mlp": 1.0334233, "epoch": 0.8503231624830904, "flos": 21649466125440.0, "grad_norm": 1.5175125595447683, "language_loss": 0.65333116, "learning_rate": 2.3035653662948375e-07, "loss": 0.67461628, "num_input_tokens_seen": 305033550, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.69140625, "step": 14143, "time_per_iteration": 2.5486536026000977 }, { "auxiliary_loss_clip": 0.01107765, "auxiliary_loss_mlp": 0.01032817, "balance_loss_clip": 1.02033532, "balance_loss_mlp": 1.03536677, "epoch": 0.8503832857357583, "flos": 22417451838720.0, "grad_norm": 2.441899185913551, "language_loss": 0.67818129, "learning_rate": 2.3017510841131216e-07, "loss": 0.69958711, "num_input_tokens_seen": 305052885, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 14144, "time_per_iteration": 2.4792871475219727 }, { "auxiliary_loss_clip": 0.01100503, "auxiliary_loss_mlp": 0.01027854, "balance_loss_clip": 1.01561129, "balance_loss_mlp": 1.03406668, "epoch": 0.8504434089884263, "flos": 18697968552960.0, "grad_norm": 2.0151189284181608, "language_loss": 0.65274912, "learning_rate": 2.299937473050777e-07, "loss": 0.67403269, "num_input_tokens_seen": 305071995, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6640625, "step": 14145, "time_per_iteration": 2.417328357696533 }, { "auxiliary_loss_clip": 0.0110239, "auxiliary_loss_mlp": 0.01030859, "balance_loss_clip": 1.01824665, "balance_loss_mlp": 1.03470683, "epoch": 0.8505035322410942, "flos": 20007989475840.0, "grad_norm": 2.5649133249366156, "language_loss": 0.85699266, "learning_rate": 2.2981245331765842e-07, "loss": 0.87832516, "num_input_tokens_seen": 305090190, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.67578125, "step": 14146, "time_per_iteration": 2.4765806198120117 }, { "auxiliary_loss_clip": 0.01100803, "auxiliary_loss_mlp": 0.01026414, "balance_loss_clip": 1.01476109, "balance_loss_mlp": 1.03295636, "epoch": 0.8505636554937622, "flos": 20812173120000.0, "grad_norm": 6.152016805648826, "language_loss": 0.83667207, "learning_rate": 2.2963122645592814e-07, "loss": 0.85794425, "num_input_tokens_seen": 305109355, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 14147, "time_per_iteration": 2.4401369094848633 }, { "auxiliary_loss_clip": 0.01108688, "auxiliary_loss_mlp": 0.01030846, "balance_loss_clip": 1.01811504, "balance_loss_mlp": 1.03652668, "epoch": 0.8506237787464301, "flos": 14174445277440.0, "grad_norm": 3.03586465885903, "language_loss": 0.85266769, "learning_rate": 2.2945006672675894e-07, "loss": 0.87406301, "num_input_tokens_seen": 305124165, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 14148, "time_per_iteration": 2.4024665355682373 }, { "auxiliary_loss_clip": 0.01104236, "auxiliary_loss_mlp": 0.01029356, "balance_loss_clip": 1.01705945, "balance_loss_mlp": 1.0362289, "epoch": 0.8506839019990982, "flos": 23258372117760.0, "grad_norm": 1.680843261512899, "language_loss": 0.71874052, "learning_rate": 2.292689741370204e-07, "loss": 0.74007642, "num_input_tokens_seen": 305143940, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 14149, "time_per_iteration": 2.445404291152954 }, { "auxiliary_loss_clip": 0.01104308, "auxiliary_loss_mlp": 0.01028591, "balance_loss_clip": 1.01658058, "balance_loss_mlp": 1.03521729, "epoch": 0.8507440252517661, "flos": 23659206963840.0, "grad_norm": 1.9431087869950197, "language_loss": 0.76215041, "learning_rate": 2.290879486935804e-07, "loss": 0.78347933, "num_input_tokens_seen": 305163505, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 14150, "time_per_iteration": 2.487301826477051 }, { "auxiliary_loss_clip": 0.01104567, "auxiliary_loss_mlp": 0.0103131, "balance_loss_clip": 1.01953244, "balance_loss_mlp": 1.03762054, "epoch": 0.8508041485044341, "flos": 18661339658880.0, "grad_norm": 2.5439392999372634, "language_loss": 0.72130609, "learning_rate": 2.2890699040330231e-07, "loss": 0.74266493, "num_input_tokens_seen": 305182325, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.66796875, "step": 14151, "time_per_iteration": 2.4421226978302 }, { "auxiliary_loss_clip": 0.01028848, "auxiliary_loss_mlp": 0.01001999, "balance_loss_clip": 1.00097358, "balance_loss_mlp": 1.00670791, "epoch": 0.8508642717571021, "flos": 52510918055040.0, "grad_norm": 0.8828876684454757, "language_loss": 0.5962944, "learning_rate": 2.2872609927304909e-07, "loss": 0.61660284, "num_input_tokens_seen": 305230775, "router_z_loss_clip": 0.01025391, "router_z_loss_mlp": 0.22167969, "step": 14152, "time_per_iteration": 2.8737640380859375 }, { "auxiliary_loss_clip": 0.01029047, "auxiliary_loss_mlp": 0.00999938, "balance_loss_clip": 0.99887675, "balance_loss_mlp": 1.00698137, "epoch": 0.85092439500977, "flos": 69297145050240.0, "grad_norm": 0.6908549138810088, "language_loss": 0.61250007, "learning_rate": 2.285452753096797e-07, "loss": 0.63278985, "num_input_tokens_seen": 305296000, "router_z_loss_clip": 0.01062012, "router_z_loss_mlp": 0.22070312, "step": 14153, "time_per_iteration": 3.125802516937256 }, { "auxiliary_loss_clip": 0.01105553, "auxiliary_loss_mlp": 0.01030228, "balance_loss_clip": 1.01759815, "balance_loss_mlp": 1.03679645, "epoch": 0.850984518262438, "flos": 24389737770240.0, "grad_norm": 3.6537407389026075, "language_loss": 0.80938345, "learning_rate": 2.2836451852005067e-07, "loss": 0.83074129, "num_input_tokens_seen": 305314705, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 14154, "time_per_iteration": 2.5027012825012207 }, { "auxiliary_loss_clip": 0.01098676, "auxiliary_loss_mlp": 0.01029541, "balance_loss_clip": 1.01829398, "balance_loss_mlp": 1.03361797, "epoch": 0.851044641515106, "flos": 23294821443840.0, "grad_norm": 1.7820550585418369, "language_loss": 0.7969985, "learning_rate": 2.281838289110165e-07, "loss": 0.81828076, "num_input_tokens_seen": 305333870, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6484375, "step": 14155, "time_per_iteration": 2.504718065261841 }, { "auxiliary_loss_clip": 0.01104758, "auxiliary_loss_mlp": 0.01030448, "balance_loss_clip": 1.01801407, "balance_loss_mlp": 1.03393698, "epoch": 0.851104764767774, "flos": 22050085489920.0, "grad_norm": 4.125766907615922, "language_loss": 0.70878327, "learning_rate": 2.2800320648942904e-07, "loss": 0.73013532, "num_input_tokens_seen": 305352780, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 14156, "time_per_iteration": 2.4616692066192627 }, { "auxiliary_loss_clip": 0.0110054, "auxiliary_loss_mlp": 0.01030034, "balance_loss_clip": 1.01863194, "balance_loss_mlp": 1.03448975, "epoch": 0.8511648880204419, "flos": 20704728562560.0, "grad_norm": 2.0483663748705734, "language_loss": 0.73725724, "learning_rate": 2.278226512621386e-07, "loss": 0.75856304, "num_input_tokens_seen": 305371370, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.66015625, "step": 14157, "time_per_iteration": 2.491818428039551 }, { "auxiliary_loss_clip": 0.01098827, "auxiliary_loss_mlp": 0.01025299, "balance_loss_clip": 1.01454079, "balance_loss_mlp": 1.0335058, "epoch": 0.8512250112731099, "flos": 24024669891840.0, "grad_norm": 3.257789251716398, "language_loss": 0.79428583, "learning_rate": 2.2764216323598995e-07, "loss": 0.81552708, "num_input_tokens_seen": 305387955, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.65234375, "step": 14158, "time_per_iteration": 2.4693586826324463 }, { "auxiliary_loss_clip": 0.01103713, "auxiliary_loss_mlp": 0.01033211, "balance_loss_clip": 1.02030039, "balance_loss_mlp": 1.03525853, "epoch": 0.8512851345257778, "flos": 22015467757440.0, "grad_norm": 2.2220192231104163, "language_loss": 0.79022855, "learning_rate": 2.27461742417828e-07, "loss": 0.81159782, "num_input_tokens_seen": 305406285, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.68359375, "step": 14159, "time_per_iteration": 2.4563026428222656 }, { "auxiliary_loss_clip": 0.01106004, "auxiliary_loss_mlp": 0.01036316, "balance_loss_clip": 1.02431786, "balance_loss_mlp": 1.03687024, "epoch": 0.8513452577784458, "flos": 14830209924480.0, "grad_norm": 2.075352282032284, "language_loss": 0.70932794, "learning_rate": 2.2728138881449488e-07, "loss": 0.7307511, "num_input_tokens_seen": 305424500, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 14160, "time_per_iteration": 2.436479330062866 }, { "auxiliary_loss_clip": 0.01110405, "auxiliary_loss_mlp": 0.01035261, "balance_loss_clip": 1.02187419, "balance_loss_mlp": 1.03783166, "epoch": 0.8514053810311137, "flos": 33035662166400.0, "grad_norm": 2.0309107037863163, "language_loss": 0.70678949, "learning_rate": 2.2710110243282866e-07, "loss": 0.72824621, "num_input_tokens_seen": 305442990, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 14161, "time_per_iteration": 2.5661067962646484 }, { "auxiliary_loss_clip": 0.01103903, "auxiliary_loss_mlp": 0.01029893, "balance_loss_clip": 1.0180316, "balance_loss_mlp": 1.03272605, "epoch": 0.8514655042837818, "flos": 27564456412800.0, "grad_norm": 3.0511915738514928, "language_loss": 0.78177208, "learning_rate": 2.2692088327966653e-07, "loss": 0.80311, "num_input_tokens_seen": 305463065, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.7109375, "step": 14162, "time_per_iteration": 2.5128841400146484 }, { "auxiliary_loss_clip": 0.01105896, "auxiliary_loss_mlp": 0.01038388, "balance_loss_clip": 1.02572227, "balance_loss_mlp": 1.03678322, "epoch": 0.8515256275364497, "flos": 35556052705920.0, "grad_norm": 1.7707429066659992, "language_loss": 0.76621509, "learning_rate": 2.2674073136184235e-07, "loss": 0.78765792, "num_input_tokens_seen": 305489070, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 14163, "time_per_iteration": 2.598308563232422 }, { "auxiliary_loss_clip": 0.01028354, "auxiliary_loss_mlp": 0.00999181, "balance_loss_clip": 0.99821514, "balance_loss_mlp": 1.0064851, "epoch": 0.8515857507891177, "flos": 70207372621440.0, "grad_norm": 0.7049721793680261, "language_loss": 0.5499751, "learning_rate": 2.2656064668618735e-07, "loss": 0.57025039, "num_input_tokens_seen": 305551490, "router_z_loss_clip": 0.00964355, "router_z_loss_mlp": 0.21875, "step": 14164, "time_per_iteration": 3.1301491260528564 }, { "auxiliary_loss_clip": 0.01104371, "auxiliary_loss_mlp": 0.01035022, "balance_loss_clip": 1.02229643, "balance_loss_mlp": 1.03580523, "epoch": 0.8516458740417857, "flos": 22675290641280.0, "grad_norm": 1.8012575960248198, "language_loss": 0.73088956, "learning_rate": 2.2638062925953005e-07, "loss": 0.75228351, "num_input_tokens_seen": 305570535, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 14165, "time_per_iteration": 2.4959800243377686 }, { "auxiliary_loss_clip": 0.01101564, "auxiliary_loss_mlp": 0.01031047, "balance_loss_clip": 1.01848269, "balance_loss_mlp": 1.03483629, "epoch": 0.8517059972944536, "flos": 22747435107840.0, "grad_norm": 1.6603570649796089, "language_loss": 0.67195159, "learning_rate": 2.26200679088697e-07, "loss": 0.69327772, "num_input_tokens_seen": 305590800, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.66796875, "step": 14166, "time_per_iteration": 3.92834210395813 }, { "auxiliary_loss_clip": 0.01101332, "auxiliary_loss_mlp": 0.0103176, "balance_loss_clip": 1.02015519, "balance_loss_mlp": 1.03301263, "epoch": 0.8517661205471216, "flos": 21689147675520.0, "grad_norm": 1.7991280706897195, "language_loss": 0.73592669, "learning_rate": 2.260207961805125e-07, "loss": 0.7572577, "num_input_tokens_seen": 305609495, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.68359375, "step": 14167, "time_per_iteration": 2.4434871673583984 }, { "auxiliary_loss_clip": 0.01102739, "auxiliary_loss_mlp": 0.01029048, "balance_loss_clip": 1.01738906, "balance_loss_mlp": 1.03440857, "epoch": 0.8518262437997896, "flos": 25374839241600.0, "grad_norm": 3.621695079771097, "language_loss": 0.80413967, "learning_rate": 2.258409805417969e-07, "loss": 0.82545745, "num_input_tokens_seen": 305629420, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.68359375, "step": 14168, "time_per_iteration": 3.889540910720825 }, { "auxiliary_loss_clip": 0.01102548, "auxiliary_loss_mlp": 0.01025542, "balance_loss_clip": 1.01382411, "balance_loss_mlp": 1.03411555, "epoch": 0.8518863670524576, "flos": 27235406897280.0, "grad_norm": 1.9759259953388522, "language_loss": 0.76193047, "learning_rate": 2.2566123217936893e-07, "loss": 0.78321135, "num_input_tokens_seen": 305649835, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 14169, "time_per_iteration": 2.5040361881256104 }, { "auxiliary_loss_clip": 0.01106676, "auxiliary_loss_mlp": 0.01033018, "balance_loss_clip": 1.02048922, "balance_loss_mlp": 1.03631699, "epoch": 0.8519464903051255, "flos": 20959514709120.0, "grad_norm": 1.6716474159579278, "language_loss": 0.63955986, "learning_rate": 2.254815511000452e-07, "loss": 0.66095674, "num_input_tokens_seen": 305668840, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 14170, "time_per_iteration": 2.47432279586792 }, { "auxiliary_loss_clip": 0.01100702, "auxiliary_loss_mlp": 0.01027921, "balance_loss_clip": 1.01608384, "balance_loss_mlp": 1.03252292, "epoch": 0.8520066135577935, "flos": 18441745862400.0, "grad_norm": 2.531803117120962, "language_loss": 0.86470222, "learning_rate": 2.253019373106384e-07, "loss": 0.88598847, "num_input_tokens_seen": 305686955, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 14171, "time_per_iteration": 3.794816017150879 }, { "auxiliary_loss_clip": 0.01105767, "auxiliary_loss_mlp": 0.01037723, "balance_loss_clip": 1.02619565, "balance_loss_mlp": 1.03638244, "epoch": 0.8520667368104614, "flos": 29130233149440.0, "grad_norm": 1.9971128620210057, "language_loss": 0.55252337, "learning_rate": 2.2512239081796003e-07, "loss": 0.57395828, "num_input_tokens_seen": 305706290, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.69140625, "step": 14172, "time_per_iteration": 3.9754881858825684 }, { "auxiliary_loss_clip": 0.01101527, "auxiliary_loss_mlp": 0.01025615, "balance_loss_clip": 1.01517177, "balance_loss_mlp": 1.03505397, "epoch": 0.8521268600631294, "flos": 16034366488320.0, "grad_norm": 3.495542251203398, "language_loss": 0.69794798, "learning_rate": 2.2494291162881862e-07, "loss": 0.71921939, "num_input_tokens_seen": 305723835, "router_z_loss_clip": 0.10449219, "router_z_loss_mlp": 0.6640625, "step": 14173, "time_per_iteration": 2.445312976837158 }, { "auxiliary_loss_clip": 0.01104724, "auxiliary_loss_mlp": 0.01030737, "balance_loss_clip": 1.01814222, "balance_loss_mlp": 1.03501129, "epoch": 0.8521869833157973, "flos": 22454870832000.0, "grad_norm": 2.6417502338498506, "language_loss": 0.76650298, "learning_rate": 2.247634997500205e-07, "loss": 0.78785753, "num_input_tokens_seen": 305741655, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 14174, "time_per_iteration": 2.458592176437378 }, { "auxiliary_loss_clip": 0.0110711, "auxiliary_loss_mlp": 0.01028631, "balance_loss_clip": 1.01703167, "balance_loss_mlp": 1.03698242, "epoch": 0.8522471065684654, "flos": 24972029147520.0, "grad_norm": 1.614288407763807, "language_loss": 0.82203186, "learning_rate": 2.245841551883676e-07, "loss": 0.84338921, "num_input_tokens_seen": 305761890, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.69921875, "step": 14175, "time_per_iteration": 2.5063297748565674 }, { "auxiliary_loss_clip": 0.01107896, "auxiliary_loss_mlp": 0.01031316, "balance_loss_clip": 1.01890659, "balance_loss_mlp": 1.03696251, "epoch": 0.8523072298211333, "flos": 17710604524800.0, "grad_norm": 3.733398279099352, "language_loss": 0.66089922, "learning_rate": 2.2440487795066153e-07, "loss": 0.68229133, "num_input_tokens_seen": 305779190, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 14176, "time_per_iteration": 2.434922695159912 }, { "auxiliary_loss_clip": 0.01102954, "auxiliary_loss_mlp": 0.0102949, "balance_loss_clip": 1.0167948, "balance_loss_mlp": 1.03526592, "epoch": 0.8523673530738013, "flos": 25446193608960.0, "grad_norm": 2.978632195755782, "language_loss": 0.78530216, "learning_rate": 2.2422566804370068e-07, "loss": 0.80662656, "num_input_tokens_seen": 305799870, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.67578125, "step": 14177, "time_per_iteration": 2.524279832839966 }, { "auxiliary_loss_clip": 0.01104753, "auxiliary_loss_mlp": 0.01029087, "balance_loss_clip": 1.01644468, "balance_loss_mlp": 1.03534961, "epoch": 0.8524274763264693, "flos": 31429593348480.0, "grad_norm": 2.1141611555498816, "language_loss": 0.73198754, "learning_rate": 2.2404652547428026e-07, "loss": 0.75332594, "num_input_tokens_seen": 305819695, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 14178, "time_per_iteration": 2.51548433303833 }, { "auxiliary_loss_clip": 0.01105783, "auxiliary_loss_mlp": 0.0103627, "balance_loss_clip": 1.024194, "balance_loss_mlp": 1.03614509, "epoch": 0.8524875995791372, "flos": 17712651600000.0, "grad_norm": 1.9469871250955078, "language_loss": 0.75539124, "learning_rate": 2.238674502491935e-07, "loss": 0.77681178, "num_input_tokens_seen": 305837270, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 14179, "time_per_iteration": 2.4358878135681152 }, { "auxiliary_loss_clip": 0.01102193, "auxiliary_loss_mlp": 0.01026974, "balance_loss_clip": 1.01533341, "balance_loss_mlp": 1.03513837, "epoch": 0.8525477228318052, "flos": 21687316081920.0, "grad_norm": 2.102446459671496, "language_loss": 0.81823051, "learning_rate": 2.2368844237523165e-07, "loss": 0.83952218, "num_input_tokens_seen": 305855250, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 14180, "time_per_iteration": 2.4449222087860107 }, { "auxiliary_loss_clip": 0.01104256, "auxiliary_loss_mlp": 0.01032637, "balance_loss_clip": 1.02096629, "balance_loss_mlp": 1.03481996, "epoch": 0.8526078460844732, "flos": 24827057856000.0, "grad_norm": 3.4054035172965147, "language_loss": 0.61343801, "learning_rate": 2.235095018591815e-07, "loss": 0.63480699, "num_input_tokens_seen": 305875660, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 14181, "time_per_iteration": 2.482665777206421 }, { "auxiliary_loss_clip": 0.01103214, "auxiliary_loss_mlp": 0.01028888, "balance_loss_clip": 1.01784897, "balance_loss_mlp": 1.03624177, "epoch": 0.8526679693371412, "flos": 13516418073600.0, "grad_norm": 2.8486540400825735, "language_loss": 0.72296733, "learning_rate": 2.2333062870782894e-07, "loss": 0.74428838, "num_input_tokens_seen": 305892415, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.66796875, "step": 14182, "time_per_iteration": 2.411341428756714 }, { "auxiliary_loss_clip": 0.01103472, "auxiliary_loss_mlp": 0.01030909, "balance_loss_clip": 1.01923251, "balance_loss_mlp": 1.03653145, "epoch": 0.8527280925898091, "flos": 23514092017920.0, "grad_norm": 1.5418233679512505, "language_loss": 0.70902169, "learning_rate": 2.2315182292795697e-07, "loss": 0.73036551, "num_input_tokens_seen": 305912665, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.66796875, "step": 14183, "time_per_iteration": 2.5294382572174072 }, { "auxiliary_loss_clip": 0.01102923, "auxiliary_loss_mlp": 0.01032669, "balance_loss_clip": 1.02172565, "balance_loss_mlp": 1.0366559, "epoch": 0.8527882158424771, "flos": 20303031790080.0, "grad_norm": 1.8735517131393804, "language_loss": 0.73050487, "learning_rate": 2.2297308452634644e-07, "loss": 0.75186086, "num_input_tokens_seen": 305931515, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6640625, "step": 14184, "time_per_iteration": 2.430614471435547 }, { "auxiliary_loss_clip": 0.01103953, "auxiliary_loss_mlp": 0.01033181, "balance_loss_clip": 1.02111745, "balance_loss_mlp": 1.03547478, "epoch": 0.852848339095145, "flos": 17202504689280.0, "grad_norm": 1.7260115495000754, "language_loss": 0.76860565, "learning_rate": 2.2279441350977457e-07, "loss": 0.78997695, "num_input_tokens_seen": 305949965, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.68359375, "step": 14185, "time_per_iteration": 2.4532430171966553 }, { "auxiliary_loss_clip": 0.01104554, "auxiliary_loss_mlp": 0.01026562, "balance_loss_clip": 1.01404512, "balance_loss_mlp": 1.03475189, "epoch": 0.852908462347813, "flos": 18368990864640.0, "grad_norm": 1.9547240386860616, "language_loss": 0.79512966, "learning_rate": 2.2261580988501637e-07, "loss": 0.81644082, "num_input_tokens_seen": 305967820, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 14186, "time_per_iteration": 2.4083287715911865 }, { "auxiliary_loss_clip": 0.01102662, "auxiliary_loss_mlp": 0.01028202, "balance_loss_clip": 1.01550007, "balance_loss_mlp": 1.0330168, "epoch": 0.8529685856004809, "flos": 18624890332800.0, "grad_norm": 3.2085267167210274, "language_loss": 0.62775129, "learning_rate": 2.224372736588449e-07, "loss": 0.64905995, "num_input_tokens_seen": 305985505, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 14187, "time_per_iteration": 2.4425575733184814 }, { "auxiliary_loss_clip": 0.01106596, "auxiliary_loss_mlp": 0.0103074, "balance_loss_clip": 1.01766896, "balance_loss_mlp": 1.03428316, "epoch": 0.853028708853149, "flos": 29607665748480.0, "grad_norm": 1.606613821463739, "language_loss": 0.76707894, "learning_rate": 2.2225880483803005e-07, "loss": 0.78845227, "num_input_tokens_seen": 306005220, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 14188, "time_per_iteration": 2.5203006267547607 }, { "auxiliary_loss_clip": 0.01107664, "auxiliary_loss_mlp": 0.01033814, "balance_loss_clip": 1.02093911, "balance_loss_mlp": 1.03652287, "epoch": 0.8530888321058169, "flos": 26353153042560.0, "grad_norm": 1.5872471266864667, "language_loss": 0.78385007, "learning_rate": 2.2208040342933932e-07, "loss": 0.80526489, "num_input_tokens_seen": 306023785, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 14189, "time_per_iteration": 2.5044617652893066 }, { "auxiliary_loss_clip": 0.01105011, "auxiliary_loss_mlp": 0.01029907, "balance_loss_clip": 1.01730037, "balance_loss_mlp": 1.03495145, "epoch": 0.8531489553584849, "flos": 20521979141760.0, "grad_norm": 2.074032859371818, "language_loss": 0.7960127, "learning_rate": 2.2190206943953793e-07, "loss": 0.81736195, "num_input_tokens_seen": 306041600, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 14190, "time_per_iteration": 2.458950996398926 }, { "auxiliary_loss_clip": 0.01103969, "auxiliary_loss_mlp": 0.0103082, "balance_loss_clip": 1.01826179, "balance_loss_mlp": 1.03583646, "epoch": 0.8532090786111529, "flos": 20704297599360.0, "grad_norm": 2.03961006448349, "language_loss": 0.75894487, "learning_rate": 2.2172380287538894e-07, "loss": 0.78029281, "num_input_tokens_seen": 306060345, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 14191, "time_per_iteration": 2.447667360305786 }, { "auxiliary_loss_clip": 0.01104281, "auxiliary_loss_mlp": 0.010305, "balance_loss_clip": 1.01808476, "balance_loss_mlp": 1.03656411, "epoch": 0.8532692018638208, "flos": 19828903242240.0, "grad_norm": 2.3679933525744574, "language_loss": 0.69395256, "learning_rate": 2.2154560374365073e-07, "loss": 0.71530044, "num_input_tokens_seen": 306078285, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.67578125, "step": 14192, "time_per_iteration": 2.4336178302764893 }, { "auxiliary_loss_clip": 0.01110047, "auxiliary_loss_mlp": 0.01036449, "balance_loss_clip": 1.02299666, "balance_loss_mlp": 1.03580797, "epoch": 0.8533293251164888, "flos": 20996790048000.0, "grad_norm": 6.6977926195996185, "language_loss": 0.63348347, "learning_rate": 2.2136747205108164e-07, "loss": 0.65494847, "num_input_tokens_seen": 306093760, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 14193, "time_per_iteration": 2.4672398567199707 }, { "auxiliary_loss_clip": 0.01105397, "auxiliary_loss_mlp": 0.01032369, "balance_loss_clip": 1.02075183, "balance_loss_mlp": 1.03638577, "epoch": 0.8533894483691568, "flos": 22419606654720.0, "grad_norm": 1.7448449450436228, "language_loss": 0.76659983, "learning_rate": 2.211894078044365e-07, "loss": 0.78797746, "num_input_tokens_seen": 306112595, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.69140625, "step": 14194, "time_per_iteration": 2.454002618789673 }, { "auxiliary_loss_clip": 0.01104513, "auxiliary_loss_mlp": 0.01028315, "balance_loss_clip": 1.01638842, "balance_loss_mlp": 1.03528953, "epoch": 0.8534495716218248, "flos": 21616536332160.0, "grad_norm": 1.7810479742223444, "language_loss": 0.69675678, "learning_rate": 2.2101141101046705e-07, "loss": 0.71808505, "num_input_tokens_seen": 306131800, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 14195, "time_per_iteration": 2.494328737258911 }, { "auxiliary_loss_clip": 0.01104474, "auxiliary_loss_mlp": 0.0103243, "balance_loss_clip": 1.01960325, "balance_loss_mlp": 1.0342598, "epoch": 0.8535096948744927, "flos": 22346277039360.0, "grad_norm": 2.5895482725305587, "language_loss": 0.85537964, "learning_rate": 2.2083348167592343e-07, "loss": 0.87674868, "num_input_tokens_seen": 306150590, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 14196, "time_per_iteration": 2.4331741333007812 }, { "auxiliary_loss_clip": 0.01029365, "auxiliary_loss_mlp": 0.01001612, "balance_loss_clip": 1.00060511, "balance_loss_mlp": 1.0072329, "epoch": 0.8535698181271607, "flos": 52762507891200.0, "grad_norm": 0.7667009498773184, "language_loss": 0.55133742, "learning_rate": 2.2065561980755243e-07, "loss": 0.57164723, "num_input_tokens_seen": 306205850, "router_z_loss_clip": 0.0100708, "router_z_loss_mlp": 0.22070312, "step": 14197, "time_per_iteration": 3.050687789916992 }, { "auxiliary_loss_clip": 0.01102145, "auxiliary_loss_mlp": 0.01030838, "balance_loss_clip": 1.01910162, "balance_loss_mlp": 1.03473616, "epoch": 0.8536299413798286, "flos": 19062892776960.0, "grad_norm": 1.8062329161225352, "language_loss": 0.81482261, "learning_rate": 2.2047782541209826e-07, "loss": 0.83615243, "num_input_tokens_seen": 306225220, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.67578125, "step": 14198, "time_per_iteration": 2.446580648422241 }, { "auxiliary_loss_clip": 0.01103418, "auxiliary_loss_mlp": 0.01028128, "balance_loss_clip": 1.01732719, "balance_loss_mlp": 1.0351907, "epoch": 0.8536900646324966, "flos": 49344743871360.0, "grad_norm": 1.407945406174214, "language_loss": 0.68441594, "learning_rate": 2.203000984963035e-07, "loss": 0.70573145, "num_input_tokens_seen": 306249865, "router_z_loss_clip": 0.10791016, "router_z_loss_mlp": 0.68359375, "step": 14199, "time_per_iteration": 2.720280885696411 }, { "auxiliary_loss_clip": 0.01097997, "auxiliary_loss_mlp": 0.01027841, "balance_loss_clip": 1.01673722, "balance_loss_mlp": 1.03258681, "epoch": 0.8537501878851645, "flos": 21762333636480.0, "grad_norm": 1.7974876454403446, "language_loss": 0.86373663, "learning_rate": 2.201224390669072e-07, "loss": 0.88499498, "num_input_tokens_seen": 306270215, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.65625, "step": 14200, "time_per_iteration": 2.4368972778320312 }, { "auxiliary_loss_clip": 0.0110272, "auxiliary_loss_mlp": 0.01029506, "balance_loss_clip": 1.01738811, "balance_loss_mlp": 1.03319383, "epoch": 0.8538103111378326, "flos": 22269176496000.0, "grad_norm": 1.849481456395279, "language_loss": 0.78151149, "learning_rate": 2.1994484713064666e-07, "loss": 0.80283374, "num_input_tokens_seen": 306288960, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 14201, "time_per_iteration": 2.484867572784424 }, { "auxiliary_loss_clip": 0.01103287, "auxiliary_loss_mlp": 0.01030103, "balance_loss_clip": 1.01853347, "balance_loss_mlp": 1.03609073, "epoch": 0.8538704343905005, "flos": 20303929630080.0, "grad_norm": 1.7428213952582057, "language_loss": 0.68564785, "learning_rate": 2.19767322694256e-07, "loss": 0.70698178, "num_input_tokens_seen": 306308735, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 14202, "time_per_iteration": 2.4453907012939453 }, { "auxiliary_loss_clip": 0.01103398, "auxiliary_loss_mlp": 0.01034148, "balance_loss_clip": 1.02260303, "balance_loss_mlp": 1.03464317, "epoch": 0.8539305576431685, "flos": 24755164784640.0, "grad_norm": 2.8975016545901418, "language_loss": 0.80119705, "learning_rate": 2.195898657644666e-07, "loss": 0.82257253, "num_input_tokens_seen": 306329015, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 14203, "time_per_iteration": 2.514035701751709 }, { "auxiliary_loss_clip": 0.011075, "auxiliary_loss_mlp": 0.01032296, "balance_loss_clip": 1.01963019, "balance_loss_mlp": 1.03656578, "epoch": 0.8539906808958365, "flos": 26687625511680.0, "grad_norm": 2.4223302233505506, "language_loss": 0.66134042, "learning_rate": 2.1941247634800808e-07, "loss": 0.68273842, "num_input_tokens_seen": 306349085, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 14204, "time_per_iteration": 2.491572141647339 }, { "auxiliary_loss_clip": 0.01106508, "auxiliary_loss_mlp": 0.01033496, "balance_loss_clip": 1.02086592, "balance_loss_mlp": 1.03628206, "epoch": 0.8540508041485044, "flos": 13365521038080.0, "grad_norm": 2.526515628257478, "language_loss": 0.59775925, "learning_rate": 2.1923515445160667e-07, "loss": 0.61915928, "num_input_tokens_seen": 306365385, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 14205, "time_per_iteration": 2.442063570022583 }, { "auxiliary_loss_clip": 0.01103165, "auxiliary_loss_mlp": 0.01026209, "balance_loss_clip": 1.01384068, "balance_loss_mlp": 1.03527021, "epoch": 0.8541109274011724, "flos": 32780876019840.0, "grad_norm": 3.909763598720466, "language_loss": 0.72024292, "learning_rate": 2.1905790008198655e-07, "loss": 0.74153662, "num_input_tokens_seen": 306384585, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6796875, "step": 14206, "time_per_iteration": 2.537158727645874 }, { "auxiliary_loss_clip": 0.01105559, "auxiliary_loss_mlp": 0.01027915, "balance_loss_clip": 1.01576209, "balance_loss_mlp": 1.03574967, "epoch": 0.8541710506538404, "flos": 17639286071040.0, "grad_norm": 3.671134413530587, "language_loss": 0.76436388, "learning_rate": 2.1888071324586987e-07, "loss": 0.78569865, "num_input_tokens_seen": 306401565, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69921875, "step": 14207, "time_per_iteration": 3.9151575565338135 }, { "auxiliary_loss_clip": 0.01105556, "auxiliary_loss_mlp": 0.01032354, "balance_loss_clip": 1.0190922, "balance_loss_mlp": 1.03566432, "epoch": 0.8542311739065084, "flos": 20263062931200.0, "grad_norm": 1.831535476411863, "language_loss": 0.84800702, "learning_rate": 2.1870359394997485e-07, "loss": 0.86938614, "num_input_tokens_seen": 306419995, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69921875, "step": 14208, "time_per_iteration": 2.4890456199645996 }, { "auxiliary_loss_clip": 0.01104612, "auxiliary_loss_mlp": 0.01031016, "balance_loss_clip": 1.01927412, "balance_loss_mlp": 1.0358789, "epoch": 0.8542912971591763, "flos": 17785657992960.0, "grad_norm": 1.5743827736240195, "language_loss": 0.66248637, "learning_rate": 2.1852654220101785e-07, "loss": 0.68384266, "num_input_tokens_seen": 306439240, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 14209, "time_per_iteration": 2.448228120803833 }, { "auxiliary_loss_clip": 0.01101235, "auxiliary_loss_mlp": 0.01028692, "balance_loss_clip": 1.01692009, "balance_loss_mlp": 1.03417563, "epoch": 0.8543514204118443, "flos": 26979507429120.0, "grad_norm": 2.230595502212182, "language_loss": 0.70286268, "learning_rate": 2.1834955800571287e-07, "loss": 0.72416198, "num_input_tokens_seen": 306458425, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 14210, "time_per_iteration": 3.9413187503814697 }, { "auxiliary_loss_clip": 0.01102499, "auxiliary_loss_mlp": 0.01029009, "balance_loss_clip": 1.0172962, "balance_loss_mlp": 1.03475213, "epoch": 0.8544115436645122, "flos": 24024598064640.0, "grad_norm": 1.4858506153782558, "language_loss": 0.70313483, "learning_rate": 2.1817264137077141e-07, "loss": 0.72444987, "num_input_tokens_seen": 306477210, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 14211, "time_per_iteration": 2.4740536212921143 }, { "auxiliary_loss_clip": 0.01104174, "auxiliary_loss_mlp": 0.01031686, "balance_loss_clip": 1.01936007, "balance_loss_mlp": 1.03430009, "epoch": 0.8544716669171802, "flos": 16617986668800.0, "grad_norm": 2.282806379734756, "language_loss": 0.81226254, "learning_rate": 2.1799579230290166e-07, "loss": 0.83362114, "num_input_tokens_seen": 306495820, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 14212, "time_per_iteration": 3.951425313949585 }, { "auxiliary_loss_clip": 0.01105669, "auxiliary_loss_mlp": 0.01035286, "balance_loss_clip": 1.02189279, "balance_loss_mlp": 1.0360353, "epoch": 0.8545317901698481, "flos": 40005779489280.0, "grad_norm": 2.2557750615796426, "language_loss": 0.66970575, "learning_rate": 2.178190108088105e-07, "loss": 0.69111532, "num_input_tokens_seen": 306516420, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6953125, "step": 14213, "time_per_iteration": 2.642624616622925 }, { "auxiliary_loss_clip": 0.01101533, "auxiliary_loss_mlp": 0.01025806, "balance_loss_clip": 1.01408803, "balance_loss_mlp": 1.03402972, "epoch": 0.8545919134225162, "flos": 19902520166400.0, "grad_norm": 1.7014636895543018, "language_loss": 0.78131199, "learning_rate": 2.1764229689520098e-07, "loss": 0.80258536, "num_input_tokens_seen": 306534785, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 14214, "time_per_iteration": 3.913161516189575 }, { "auxiliary_loss_clip": 0.01106185, "auxiliary_loss_mlp": 0.01029509, "balance_loss_clip": 1.01615798, "balance_loss_mlp": 1.03448892, "epoch": 0.8546520366751841, "flos": 18952970181120.0, "grad_norm": 2.8192340664882622, "language_loss": 0.66514206, "learning_rate": 2.1746565056877397e-07, "loss": 0.686499, "num_input_tokens_seen": 306552440, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 14215, "time_per_iteration": 2.444065809249878 }, { "auxiliary_loss_clip": 0.01104391, "auxiliary_loss_mlp": 0.0102783, "balance_loss_clip": 1.01566482, "balance_loss_mlp": 1.03615022, "epoch": 0.8547121599278521, "flos": 35621445415680.0, "grad_norm": 1.8885341943325569, "language_loss": 0.62391096, "learning_rate": 2.172890718362279e-07, "loss": 0.64523321, "num_input_tokens_seen": 306573600, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6796875, "step": 14216, "time_per_iteration": 2.582826614379883 }, { "auxiliary_loss_clip": 0.01104677, "auxiliary_loss_mlp": 0.01030925, "balance_loss_clip": 1.01874781, "balance_loss_mlp": 1.03454423, "epoch": 0.8547722831805201, "flos": 16910048154240.0, "grad_norm": 1.7154143147125693, "language_loss": 0.65464675, "learning_rate": 2.17112560704259e-07, "loss": 0.67600274, "num_input_tokens_seen": 306592840, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 14217, "time_per_iteration": 2.457092046737671 }, { "auxiliary_loss_clip": 0.01101904, "auxiliary_loss_mlp": 0.01030432, "balance_loss_clip": 1.01882744, "balance_loss_mlp": 1.03595364, "epoch": 0.854832406433188, "flos": 23002616304000.0, "grad_norm": 1.7395309363019027, "language_loss": 0.64921415, "learning_rate": 2.1693611717956072e-07, "loss": 0.67053753, "num_input_tokens_seen": 306613210, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.66015625, "step": 14218, "time_per_iteration": 2.475581169128418 }, { "auxiliary_loss_clip": 0.01104948, "auxiliary_loss_mlp": 0.01028663, "balance_loss_clip": 1.01709938, "balance_loss_mlp": 1.03407884, "epoch": 0.854892529685856, "flos": 20412595249920.0, "grad_norm": 2.0589664222817126, "language_loss": 0.69906425, "learning_rate": 2.167597412688238e-07, "loss": 0.72040033, "num_input_tokens_seen": 306631620, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.7109375, "step": 14219, "time_per_iteration": 2.4912936687469482 }, { "auxiliary_loss_clip": 0.01106706, "auxiliary_loss_mlp": 0.01031272, "balance_loss_clip": 1.01894605, "balance_loss_mlp": 1.0344063, "epoch": 0.854952652938524, "flos": 16398716094720.0, "grad_norm": 4.642188685883497, "language_loss": 0.66947579, "learning_rate": 2.1658343297873549e-07, "loss": 0.6908555, "num_input_tokens_seen": 306646695, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7265625, "step": 14220, "time_per_iteration": 2.420097827911377 }, { "auxiliary_loss_clip": 0.01101207, "auxiliary_loss_mlp": 0.01028293, "balance_loss_clip": 1.01655662, "balance_loss_mlp": 1.03495657, "epoch": 0.855012776191192, "flos": 21178677542400.0, "grad_norm": 2.2975125818500417, "language_loss": 0.71648562, "learning_rate": 2.164071923159827e-07, "loss": 0.73778063, "num_input_tokens_seen": 306665465, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 14221, "time_per_iteration": 2.47415828704834 }, { "auxiliary_loss_clip": 0.01105366, "auxiliary_loss_mlp": 0.01036439, "balance_loss_clip": 1.02398729, "balance_loss_mlp": 1.03548121, "epoch": 0.8550728994438599, "flos": 26140993361280.0, "grad_norm": 1.8552536044267138, "language_loss": 0.60185623, "learning_rate": 2.1623101928724763e-07, "loss": 0.62327433, "num_input_tokens_seen": 306685950, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69921875, "step": 14222, "time_per_iteration": 2.4852824211120605 }, { "auxiliary_loss_clip": 0.01103497, "auxiliary_loss_mlp": 0.01033603, "balance_loss_clip": 1.02156317, "balance_loss_mlp": 1.03572237, "epoch": 0.8551330226965279, "flos": 22786793435520.0, "grad_norm": 1.6637435663932363, "language_loss": 0.8419944, "learning_rate": 2.1605491389921093e-07, "loss": 0.86336535, "num_input_tokens_seen": 306705740, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 14223, "time_per_iteration": 2.5033836364746094 }, { "auxiliary_loss_clip": 0.01103087, "auxiliary_loss_mlp": 0.01033633, "balance_loss_clip": 1.02153289, "balance_loss_mlp": 1.03612542, "epoch": 0.8551931459491958, "flos": 22419032037120.0, "grad_norm": 1.6409581416772612, "language_loss": 0.73956728, "learning_rate": 2.158788761585515e-07, "loss": 0.76093447, "num_input_tokens_seen": 306725065, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.671875, "step": 14224, "time_per_iteration": 2.456188678741455 }, { "auxiliary_loss_clip": 0.01102861, "auxiliary_loss_mlp": 0.01027345, "balance_loss_clip": 1.0152756, "balance_loss_mlp": 1.03455782, "epoch": 0.8552532692018638, "flos": 19573183342080.0, "grad_norm": 1.8320841465873063, "language_loss": 0.75341487, "learning_rate": 2.1570290607194307e-07, "loss": 0.77471691, "num_input_tokens_seen": 306743630, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 14225, "time_per_iteration": 2.4529988765716553 }, { "auxiliary_loss_clip": 0.01103963, "auxiliary_loss_mlp": 0.01033765, "balance_loss_clip": 1.02258956, "balance_loss_mlp": 1.03621817, "epoch": 0.8553133924545318, "flos": 26432767537920.0, "grad_norm": 1.9625382904612785, "language_loss": 0.77241659, "learning_rate": 2.1552700364605925e-07, "loss": 0.79379392, "num_input_tokens_seen": 306763105, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.67578125, "step": 14226, "time_per_iteration": 2.511202335357666 }, { "auxiliary_loss_clip": 0.01107464, "auxiliary_loss_mlp": 0.01034072, "balance_loss_clip": 1.0214417, "balance_loss_mlp": 1.03615749, "epoch": 0.8553735157071998, "flos": 16362446336640.0, "grad_norm": 2.667124399742693, "language_loss": 0.54844958, "learning_rate": 2.153511688875702e-07, "loss": 0.56986499, "num_input_tokens_seen": 306779875, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 14227, "time_per_iteration": 2.458561420440674 }, { "auxiliary_loss_clip": 0.01104548, "auxiliary_loss_mlp": 0.0103095, "balance_loss_clip": 1.01932657, "balance_loss_mlp": 1.03687954, "epoch": 0.8554336389598677, "flos": 20887334328960.0, "grad_norm": 2.001433840408072, "language_loss": 0.65634561, "learning_rate": 2.151754018031442e-07, "loss": 0.67770064, "num_input_tokens_seen": 306800015, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 14228, "time_per_iteration": 2.4584476947784424 }, { "auxiliary_loss_clip": 0.01106784, "auxiliary_loss_mlp": 0.01036871, "balance_loss_clip": 1.02427101, "balance_loss_mlp": 1.03657436, "epoch": 0.8554937622125357, "flos": 21284721469440.0, "grad_norm": 4.068720509680687, "language_loss": 0.74446511, "learning_rate": 2.1499970239944542e-07, "loss": 0.76590168, "num_input_tokens_seen": 306814160, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 14229, "time_per_iteration": 2.4571378231048584 }, { "auxiliary_loss_clip": 0.01102758, "auxiliary_loss_mlp": 0.01029251, "balance_loss_clip": 1.01800323, "balance_loss_mlp": 1.03491831, "epoch": 0.8555538854652037, "flos": 22413178120320.0, "grad_norm": 1.7171125528480173, "language_loss": 0.72465509, "learning_rate": 2.1482407068313724e-07, "loss": 0.74597514, "num_input_tokens_seen": 306833310, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6796875, "step": 14230, "time_per_iteration": 2.4822802543640137 }, { "auxiliary_loss_clip": 0.01104362, "auxiliary_loss_mlp": 0.01028445, "balance_loss_clip": 1.01641107, "balance_loss_mlp": 1.03606963, "epoch": 0.8556140087178716, "flos": 20193719725440.0, "grad_norm": 1.94364129569631, "language_loss": 0.82323754, "learning_rate": 2.1464850666087897e-07, "loss": 0.84456563, "num_input_tokens_seen": 306851345, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 14231, "time_per_iteration": 2.455137014389038 }, { "auxiliary_loss_clip": 0.01105618, "auxiliary_loss_mlp": 0.01034693, "balance_loss_clip": 1.02162218, "balance_loss_mlp": 1.03628993, "epoch": 0.8556741319705397, "flos": 22638123043200.0, "grad_norm": 2.283548286224324, "language_loss": 0.67931449, "learning_rate": 2.1447301033932796e-07, "loss": 0.70071769, "num_input_tokens_seen": 306871040, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 14232, "time_per_iteration": 2.4564247131347656 }, { "auxiliary_loss_clip": 0.01105647, "auxiliary_loss_mlp": 0.01032628, "balance_loss_clip": 1.0200578, "balance_loss_mlp": 1.03537512, "epoch": 0.8557342552232076, "flos": 23549320281600.0, "grad_norm": 1.620246490811879, "language_loss": 0.66954291, "learning_rate": 2.1429758172513955e-07, "loss": 0.69092566, "num_input_tokens_seen": 306891625, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 14233, "time_per_iteration": 2.5179624557495117 }, { "auxiliary_loss_clip": 0.01101124, "auxiliary_loss_mlp": 0.01031842, "balance_loss_clip": 1.02003443, "balance_loss_mlp": 1.03350592, "epoch": 0.8557943784758756, "flos": 19609884063360.0, "grad_norm": 1.8348297889056844, "language_loss": 0.76960516, "learning_rate": 2.1412222082496556e-07, "loss": 0.79093486, "num_input_tokens_seen": 306910020, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.67578125, "step": 14234, "time_per_iteration": 2.4537951946258545 }, { "auxiliary_loss_clip": 0.01029902, "auxiliary_loss_mlp": 0.01005067, "balance_loss_clip": 1.00409567, "balance_loss_mlp": 1.00779486, "epoch": 0.8558545017285435, "flos": 70641891446400.0, "grad_norm": 0.7536477474696343, "language_loss": 0.58015007, "learning_rate": 2.1394692764545684e-07, "loss": 0.60049975, "num_input_tokens_seen": 306969505, "router_z_loss_clip": 0.00970459, "router_z_loss_mlp": 0.22167969, "step": 14235, "time_per_iteration": 3.064103126525879 }, { "auxiliary_loss_clip": 0.01029092, "auxiliary_loss_mlp": 0.01003014, "balance_loss_clip": 1.00207806, "balance_loss_mlp": 1.00707269, "epoch": 0.8559146249812115, "flos": 56649983086080.0, "grad_norm": 0.7977886107570387, "language_loss": 0.56663799, "learning_rate": 2.1377170219325858e-07, "loss": 0.58695906, "num_input_tokens_seen": 307027710, "router_z_loss_clip": 0.00933838, "router_z_loss_mlp": 0.22070312, "step": 14236, "time_per_iteration": 2.9935545921325684 }, { "auxiliary_loss_clip": 0.0110488, "auxiliary_loss_mlp": 0.01034115, "balance_loss_clip": 1.02168703, "balance_loss_mlp": 1.0352962, "epoch": 0.8559747482338794, "flos": 22888240421760.0, "grad_norm": 2.048086474014754, "language_loss": 0.69935316, "learning_rate": 2.1359654447501673e-07, "loss": 0.72074306, "num_input_tokens_seen": 307045515, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 14237, "time_per_iteration": 2.464505195617676 }, { "auxiliary_loss_clip": 0.01102011, "auxiliary_loss_mlp": 0.01029354, "balance_loss_clip": 1.01742709, "balance_loss_mlp": 1.03353643, "epoch": 0.8560348714865474, "flos": 22601925112320.0, "grad_norm": 2.932494628460188, "language_loss": 0.63630247, "learning_rate": 2.1342145449737314e-07, "loss": 0.65761614, "num_input_tokens_seen": 307064470, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 14238, "time_per_iteration": 2.467806816101074 }, { "auxiliary_loss_clip": 0.01097939, "auxiliary_loss_mlp": 0.01031426, "balance_loss_clip": 1.02121544, "balance_loss_mlp": 1.03257322, "epoch": 0.8560949947392154, "flos": 17931455297280.0, "grad_norm": 1.6312782223457145, "language_loss": 0.69587189, "learning_rate": 2.1324643226696648e-07, "loss": 0.71716559, "num_input_tokens_seen": 307083900, "router_z_loss_clip": 0.10205078, "router_z_loss_mlp": 0.65625, "step": 14239, "time_per_iteration": 2.4544317722320557 }, { "auxiliary_loss_clip": 0.01106116, "auxiliary_loss_mlp": 0.01032574, "balance_loss_clip": 1.01995015, "balance_loss_mlp": 1.03486705, "epoch": 0.8561551179918834, "flos": 31026208636800.0, "grad_norm": 2.0264462946440056, "language_loss": 0.66937011, "learning_rate": 2.1307147779043455e-07, "loss": 0.69075704, "num_input_tokens_seen": 307104590, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 14240, "time_per_iteration": 2.5202839374542236 }, { "auxiliary_loss_clip": 0.01105716, "auxiliary_loss_mlp": 0.01030145, "balance_loss_clip": 1.01684773, "balance_loss_mlp": 1.03512502, "epoch": 0.8562152412445513, "flos": 30665198995200.0, "grad_norm": 1.5815960556584212, "language_loss": 0.61872619, "learning_rate": 2.1289659107441182e-07, "loss": 0.64008486, "num_input_tokens_seen": 307125580, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 14241, "time_per_iteration": 2.5380022525787354 }, { "auxiliary_loss_clip": 0.01108457, "auxiliary_loss_mlp": 0.01034788, "balance_loss_clip": 1.02167511, "balance_loss_mlp": 1.03600311, "epoch": 0.8562753644972193, "flos": 31576144838400.0, "grad_norm": 1.6522332914346232, "language_loss": 0.74512851, "learning_rate": 2.1272177212552855e-07, "loss": 0.76656091, "num_input_tokens_seen": 307147625, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 14242, "time_per_iteration": 2.5639169216156006 }, { "auxiliary_loss_clip": 0.01108654, "auxiliary_loss_mlp": 0.01041189, "balance_loss_clip": 1.02885675, "balance_loss_mlp": 1.03681052, "epoch": 0.8563354877498872, "flos": 26213640618240.0, "grad_norm": 2.0948645234956884, "language_loss": 0.76231247, "learning_rate": 2.1254702095041498e-07, "loss": 0.78381091, "num_input_tokens_seen": 307164665, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71875, "step": 14243, "time_per_iteration": 2.5043559074401855 }, { "auxiliary_loss_clip": 0.01104558, "auxiliary_loss_mlp": 0.01032792, "balance_loss_clip": 1.02079344, "balance_loss_mlp": 1.03469706, "epoch": 0.8563956110025552, "flos": 24134341092480.0, "grad_norm": 1.8420277113213703, "language_loss": 0.67869115, "learning_rate": 2.123723375556974e-07, "loss": 0.7000646, "num_input_tokens_seen": 307182530, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69921875, "step": 14244, "time_per_iteration": 2.4561164379119873 }, { "auxiliary_loss_clip": 0.01029503, "auxiliary_loss_mlp": 0.01002863, "balance_loss_clip": 1.00178993, "balance_loss_mlp": 1.00738394, "epoch": 0.8564557342552233, "flos": 56271986311680.0, "grad_norm": 0.7609103084472321, "language_loss": 0.58457416, "learning_rate": 2.1219772194800046e-07, "loss": 0.6048978, "num_input_tokens_seen": 307241240, "router_z_loss_clip": 0.01074219, "router_z_loss_mlp": 0.22070312, "step": 14245, "time_per_iteration": 3.0079548358917236 }, { "auxiliary_loss_clip": 0.01107683, "auxiliary_loss_mlp": 0.01030735, "balance_loss_clip": 1.01795626, "balance_loss_mlp": 1.03569651, "epoch": 0.8565158575078912, "flos": 23440618748160.0, "grad_norm": 1.9478895538178262, "language_loss": 0.77416879, "learning_rate": 2.1202317413394488e-07, "loss": 0.79555297, "num_input_tokens_seen": 307261485, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 14246, "time_per_iteration": 2.530839204788208 }, { "auxiliary_loss_clip": 0.01101706, "auxiliary_loss_mlp": 0.01029917, "balance_loss_clip": 1.01796663, "balance_loss_mlp": 1.03282499, "epoch": 0.8565759807605592, "flos": 20375930442240.0, "grad_norm": 3.0279838547913456, "language_loss": 0.81369722, "learning_rate": 2.1184869412014938e-07, "loss": 0.83501351, "num_input_tokens_seen": 307279160, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 14247, "time_per_iteration": 2.4614334106445312 }, { "auxiliary_loss_clip": 0.01105068, "auxiliary_loss_mlp": 0.01030977, "balance_loss_clip": 1.01830494, "balance_loss_mlp": 1.03591037, "epoch": 0.8566361040132271, "flos": 18807101049600.0, "grad_norm": 2.737575874217788, "language_loss": 0.77736086, "learning_rate": 2.1167428191323112e-07, "loss": 0.79872137, "num_input_tokens_seen": 307297920, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 14248, "time_per_iteration": 2.4648869037628174 }, { "auxiliary_loss_clip": 0.01104487, "auxiliary_loss_mlp": 0.01030258, "balance_loss_clip": 1.01738977, "balance_loss_mlp": 1.03467631, "epoch": 0.8566962272658951, "flos": 24535355506560.0, "grad_norm": 2.953944843207342, "language_loss": 0.77975112, "learning_rate": 2.1149993751980278e-07, "loss": 0.80109859, "num_input_tokens_seen": 307318320, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 14249, "time_per_iteration": 3.945405960083008 }, { "auxiliary_loss_clip": 0.01101974, "auxiliary_loss_mlp": 0.01034707, "balance_loss_clip": 1.02263713, "balance_loss_mlp": 1.03492129, "epoch": 0.856756350518563, "flos": 23178506227200.0, "grad_norm": 1.780165528669951, "language_loss": 0.78103071, "learning_rate": 2.1132566094647597e-07, "loss": 0.80239749, "num_input_tokens_seen": 307336720, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.66796875, "step": 14250, "time_per_iteration": 2.509218454360962 }, { "auxiliary_loss_clip": 0.01100327, "auxiliary_loss_mlp": 0.01031414, "balance_loss_clip": 1.02033353, "balance_loss_mlp": 1.03463054, "epoch": 0.856816473771231, "flos": 20808581760000.0, "grad_norm": 1.931721628805795, "language_loss": 0.79746109, "learning_rate": 2.1115145219985942e-07, "loss": 0.81877851, "num_input_tokens_seen": 307354120, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.65625, "step": 14251, "time_per_iteration": 2.4511773586273193 }, { "auxiliary_loss_clip": 0.01101049, "auxiliary_loss_mlp": 0.01030191, "balance_loss_clip": 1.01840138, "balance_loss_mlp": 1.03378081, "epoch": 0.856876597023899, "flos": 20228157889920.0, "grad_norm": 2.094833656398388, "language_loss": 0.61425, "learning_rate": 2.1097731128656005e-07, "loss": 0.63556242, "num_input_tokens_seen": 307373165, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 14252, "time_per_iteration": 3.8268277645111084 }, { "auxiliary_loss_clip": 0.01108946, "auxiliary_loss_mlp": 0.01031468, "balance_loss_clip": 1.01894546, "balance_loss_mlp": 1.03812599, "epoch": 0.856936720276567, "flos": 18296128126080.0, "grad_norm": 5.637568376026235, "language_loss": 0.69900227, "learning_rate": 2.1080323821317924e-07, "loss": 0.72040635, "num_input_tokens_seen": 307391000, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 14253, "time_per_iteration": 2.4641196727752686 }, { "auxiliary_loss_clip": 0.01029044, "auxiliary_loss_mlp": 0.01000687, "balance_loss_clip": 0.99967974, "balance_loss_mlp": 1.00699043, "epoch": 0.8569968435292349, "flos": 69878394933120.0, "grad_norm": 0.7947353780088887, "language_loss": 0.59173334, "learning_rate": 2.1062923298631907e-07, "loss": 0.61203063, "num_input_tokens_seen": 307452865, "router_z_loss_clip": 0.0100708, "router_z_loss_mlp": 0.22070312, "step": 14254, "time_per_iteration": 4.534661293029785 }, { "auxiliary_loss_clip": 0.01102182, "auxiliary_loss_mlp": 0.01034472, "balance_loss_clip": 1.02121043, "balance_loss_mlp": 1.03414881, "epoch": 0.8570569667819029, "flos": 25848572739840.0, "grad_norm": 1.7310220499985312, "language_loss": 0.81316894, "learning_rate": 2.1045529561257825e-07, "loss": 0.83453548, "num_input_tokens_seen": 307471940, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6796875, "step": 14255, "time_per_iteration": 3.9558396339416504 }, { "auxiliary_loss_clip": 0.0110132, "auxiliary_loss_mlp": 0.01026926, "balance_loss_clip": 1.0150702, "balance_loss_mlp": 1.03450787, "epoch": 0.8571170900345708, "flos": 23257115141760.0, "grad_norm": 2.89108086719859, "language_loss": 0.67590916, "learning_rate": 2.1028142609855126e-07, "loss": 0.6971916, "num_input_tokens_seen": 307488745, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.66796875, "step": 14256, "time_per_iteration": 2.4822745323181152 }, { "auxiliary_loss_clip": 0.01105374, "auxiliary_loss_mlp": 0.01030678, "balance_loss_clip": 1.01869178, "balance_loss_mlp": 1.03607142, "epoch": 0.8571772132872388, "flos": 18917670090240.0, "grad_norm": 1.7358319060442862, "language_loss": 0.69964659, "learning_rate": 2.1010762445083218e-07, "loss": 0.72100711, "num_input_tokens_seen": 307506855, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 14257, "time_per_iteration": 2.4701573848724365 }, { "auxiliary_loss_clip": 0.0110297, "auxiliary_loss_mlp": 0.01032838, "balance_loss_clip": 1.02048838, "balance_loss_mlp": 1.03483379, "epoch": 0.8572373365399069, "flos": 33250120318080.0, "grad_norm": 2.387364521810078, "language_loss": 0.76809609, "learning_rate": 2.0993389067601197e-07, "loss": 0.78945416, "num_input_tokens_seen": 307526115, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6796875, "step": 14258, "time_per_iteration": 2.5701663494110107 }, { "auxiliary_loss_clip": 0.01103158, "auxiliary_loss_mlp": 0.01032312, "balance_loss_clip": 1.02002788, "balance_loss_mlp": 1.0360198, "epoch": 0.8572974597925748, "flos": 23327535755520.0, "grad_norm": 1.5930848934947572, "language_loss": 0.67869556, "learning_rate": 2.0976022478067735e-07, "loss": 0.70005023, "num_input_tokens_seen": 307545230, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 14259, "time_per_iteration": 2.471585512161255 }, { "auxiliary_loss_clip": 0.01104265, "auxiliary_loss_mlp": 0.01032919, "balance_loss_clip": 1.0202471, "balance_loss_mlp": 1.03448737, "epoch": 0.8573575830452428, "flos": 24535858296960.0, "grad_norm": 1.7423722037472296, "language_loss": 0.77040005, "learning_rate": 2.0958662677141437e-07, "loss": 0.79177189, "num_input_tokens_seen": 307564900, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 14260, "time_per_iteration": 2.5378363132476807 }, { "auxiliary_loss_clip": 0.01103899, "auxiliary_loss_mlp": 0.01029212, "balance_loss_clip": 1.01637292, "balance_loss_mlp": 1.03417969, "epoch": 0.8574177062979107, "flos": 24165403378560.0, "grad_norm": 1.7704232920146272, "language_loss": 0.74404323, "learning_rate": 2.09413096654806e-07, "loss": 0.76537436, "num_input_tokens_seen": 307583500, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 14261, "time_per_iteration": 2.4615423679351807 }, { "auxiliary_loss_clip": 0.01108173, "auxiliary_loss_mlp": 0.01033208, "balance_loss_clip": 1.02012515, "balance_loss_mlp": 1.03616691, "epoch": 0.8574778295505787, "flos": 17930737025280.0, "grad_norm": 2.251967790330257, "language_loss": 0.78684866, "learning_rate": 2.0923963443743276e-07, "loss": 0.80826247, "num_input_tokens_seen": 307601430, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 14262, "time_per_iteration": 2.4592225551605225 }, { "auxiliary_loss_clip": 0.01103081, "auxiliary_loss_mlp": 0.01030435, "balance_loss_clip": 1.01941395, "balance_loss_mlp": 1.03695655, "epoch": 0.8575379528032466, "flos": 21580697537280.0, "grad_norm": 1.5407301815686056, "language_loss": 0.68037307, "learning_rate": 2.0906624012587203e-07, "loss": 0.70170826, "num_input_tokens_seen": 307621495, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.66015625, "step": 14263, "time_per_iteration": 2.4571352005004883 }, { "auxiliary_loss_clip": 0.01103615, "auxiliary_loss_mlp": 0.01031378, "balance_loss_clip": 1.01868868, "balance_loss_mlp": 1.03426158, "epoch": 0.8575980760559146, "flos": 21761579450880.0, "grad_norm": 1.6818782034068542, "language_loss": 0.79722083, "learning_rate": 2.088929137266986e-07, "loss": 0.81857073, "num_input_tokens_seen": 307640840, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 14264, "time_per_iteration": 2.4779043197631836 }, { "auxiliary_loss_clip": 0.0110502, "auxiliary_loss_mlp": 0.01031258, "balance_loss_clip": 1.01963508, "balance_loss_mlp": 1.03610134, "epoch": 0.8576581993085826, "flos": 34386442047360.0, "grad_norm": 1.5531438015330288, "language_loss": 0.69635838, "learning_rate": 2.0871965524648582e-07, "loss": 0.71772116, "num_input_tokens_seen": 307663820, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6875, "step": 14265, "time_per_iteration": 2.5787832736968994 }, { "auxiliary_loss_clip": 0.0110047, "auxiliary_loss_mlp": 0.0102344, "balance_loss_clip": 1.01242554, "balance_loss_mlp": 1.03488398, "epoch": 0.8577183225612506, "flos": 23222497409280.0, "grad_norm": 1.813420660856701, "language_loss": 0.66241777, "learning_rate": 2.085464646918027e-07, "loss": 0.68365693, "num_input_tokens_seen": 307682385, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.65625, "step": 14266, "time_per_iteration": 2.491445302963257 }, { "auxiliary_loss_clip": 0.01103911, "auxiliary_loss_mlp": 0.01032333, "balance_loss_clip": 1.02028656, "balance_loss_mlp": 1.03639472, "epoch": 0.8577784458139185, "flos": 28804164462720.0, "grad_norm": 1.7131665957664923, "language_loss": 0.75585949, "learning_rate": 2.0837334206921731e-07, "loss": 0.77722192, "num_input_tokens_seen": 307704680, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.67578125, "step": 14267, "time_per_iteration": 2.514122247695923 }, { "auxiliary_loss_clip": 0.01102825, "auxiliary_loss_mlp": 0.01028917, "balance_loss_clip": 1.01696014, "balance_loss_mlp": 1.0354836, "epoch": 0.8578385690665865, "flos": 19755573626880.0, "grad_norm": 3.131771600231767, "language_loss": 0.87512583, "learning_rate": 2.082002873852946e-07, "loss": 0.89644325, "num_input_tokens_seen": 307723245, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.671875, "step": 14268, "time_per_iteration": 2.468358278274536 }, { "auxiliary_loss_clip": 0.01105439, "auxiliary_loss_mlp": 0.01035322, "balance_loss_clip": 1.02281702, "balance_loss_mlp": 1.03532958, "epoch": 0.8578986923192544, "flos": 20704082117760.0, "grad_norm": 1.8604486592246563, "language_loss": 0.7304585, "learning_rate": 2.0802730064659667e-07, "loss": 0.7518661, "num_input_tokens_seen": 307742510, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 14269, "time_per_iteration": 2.4724833965301514 }, { "auxiliary_loss_clip": 0.01105016, "auxiliary_loss_mlp": 0.01032065, "balance_loss_clip": 1.01958966, "balance_loss_mlp": 1.03553009, "epoch": 0.8579588155719224, "flos": 36101715189120.0, "grad_norm": 1.5302414713647754, "language_loss": 0.66332233, "learning_rate": 2.0785438185968252e-07, "loss": 0.68469316, "num_input_tokens_seen": 307766030, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 14270, "time_per_iteration": 2.6267945766448975 }, { "auxiliary_loss_clip": 0.01101398, "auxiliary_loss_mlp": 0.01026019, "balance_loss_clip": 1.01394892, "balance_loss_mlp": 1.03367186, "epoch": 0.8580189388245905, "flos": 22853479034880.0, "grad_norm": 2.146504049136726, "language_loss": 0.73978639, "learning_rate": 2.0768153103110997e-07, "loss": 0.76106054, "num_input_tokens_seen": 307785800, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 14271, "time_per_iteration": 2.44161319732666 }, { "auxiliary_loss_clip": 0.01029312, "auxiliary_loss_mlp": 0.01002593, "balance_loss_clip": 1.0015502, "balance_loss_mlp": 1.00728965, "epoch": 0.8580790620772584, "flos": 69642104290560.0, "grad_norm": 0.8186288423101017, "language_loss": 0.59441638, "learning_rate": 2.0750874816743358e-07, "loss": 0.61473542, "num_input_tokens_seen": 307850995, "router_z_loss_clip": 0.01043701, "router_z_loss_mlp": 0.22070312, "step": 14272, "time_per_iteration": 3.145490884780884 }, { "auxiliary_loss_clip": 0.01108208, "auxiliary_loss_mlp": 0.01032707, "balance_loss_clip": 1.01968884, "balance_loss_mlp": 1.03600013, "epoch": 0.8581391853299264, "flos": 13334243270400.0, "grad_norm": 1.8192444044039133, "language_loss": 0.75474435, "learning_rate": 2.0733603327520499e-07, "loss": 0.77615345, "num_input_tokens_seen": 307868585, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 14273, "time_per_iteration": 2.4176292419433594 }, { "auxiliary_loss_clip": 0.01103664, "auxiliary_loss_mlp": 0.01032948, "balance_loss_clip": 1.02056813, "balance_loss_mlp": 1.03444076, "epoch": 0.8581993085825943, "flos": 19645651031040.0, "grad_norm": 1.8602666435480184, "language_loss": 0.82092035, "learning_rate": 2.0716338636097385e-07, "loss": 0.84228647, "num_input_tokens_seen": 307886820, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 14274, "time_per_iteration": 2.481391191482544 }, { "auxiliary_loss_clip": 0.01028843, "auxiliary_loss_mlp": 0.01000948, "balance_loss_clip": 0.99997646, "balance_loss_mlp": 1.00678205, "epoch": 0.8582594318352623, "flos": 55825077294720.0, "grad_norm": 0.7892957754864843, "language_loss": 0.60814118, "learning_rate": 2.0699080743128672e-07, "loss": 0.62843913, "num_input_tokens_seen": 307944020, "router_z_loss_clip": 0.00970459, "router_z_loss_mlp": 0.22070312, "step": 14275, "time_per_iteration": 3.1702747344970703 }, { "auxiliary_loss_clip": 0.01105953, "auxiliary_loss_mlp": 0.01028017, "balance_loss_clip": 1.01498699, "balance_loss_mlp": 1.03567266, "epoch": 0.8583195550879302, "flos": 24279563779200.0, "grad_norm": 2.0961943611962885, "language_loss": 0.59243196, "learning_rate": 2.0681829649268768e-07, "loss": 0.61377168, "num_input_tokens_seen": 307961055, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 14276, "time_per_iteration": 2.4742677211761475 }, { "auxiliary_loss_clip": 0.01105306, "auxiliary_loss_mlp": 0.01030395, "balance_loss_clip": 1.01820564, "balance_loss_mlp": 1.03582454, "epoch": 0.8583796783405983, "flos": 13444129952640.0, "grad_norm": 5.280311023971254, "language_loss": 0.76315594, "learning_rate": 2.0664585355171838e-07, "loss": 0.78451288, "num_input_tokens_seen": 307978690, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 14277, "time_per_iteration": 2.42974853515625 }, { "auxiliary_loss_clip": 0.01104537, "auxiliary_loss_mlp": 0.01030626, "balance_loss_clip": 1.01774526, "balance_loss_mlp": 1.03525519, "epoch": 0.8584398015932662, "flos": 16180271533440.0, "grad_norm": 2.0653104385295475, "language_loss": 0.83804119, "learning_rate": 2.0647347861491803e-07, "loss": 0.85939288, "num_input_tokens_seen": 307995870, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 14278, "time_per_iteration": 2.4785404205322266 }, { "auxiliary_loss_clip": 0.0110786, "auxiliary_loss_mlp": 0.01030401, "balance_loss_clip": 1.01737702, "balance_loss_mlp": 1.03583622, "epoch": 0.8584999248459342, "flos": 17450431338240.0, "grad_norm": 2.499378335730628, "language_loss": 0.74364841, "learning_rate": 2.0630117168882366e-07, "loss": 0.76503098, "num_input_tokens_seen": 308013645, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 14279, "time_per_iteration": 2.415811061859131 }, { "auxiliary_loss_clip": 0.01103856, "auxiliary_loss_mlp": 0.01032851, "balance_loss_clip": 1.02069199, "balance_loss_mlp": 1.03545213, "epoch": 0.8585600480986021, "flos": 23441013797760.0, "grad_norm": 2.399180150570005, "language_loss": 0.6624009, "learning_rate": 2.0612893277996845e-07, "loss": 0.68376791, "num_input_tokens_seen": 308032490, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.68359375, "step": 14280, "time_per_iteration": 2.481335401535034 }, { "auxiliary_loss_clip": 0.01101412, "auxiliary_loss_mlp": 0.01027751, "balance_loss_clip": 1.01591372, "balance_loss_mlp": 1.03414595, "epoch": 0.8586201713512701, "flos": 19937927998080.0, "grad_norm": 2.790292583806478, "language_loss": 0.62851334, "learning_rate": 2.0595676189488343e-07, "loss": 0.64980495, "num_input_tokens_seen": 308052110, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 14281, "time_per_iteration": 2.47189998626709 }, { "auxiliary_loss_clip": 0.01103371, "auxiliary_loss_mlp": 0.0103233, "balance_loss_clip": 1.01944375, "balance_loss_mlp": 1.03464985, "epoch": 0.858680294603938, "flos": 15304769435520.0, "grad_norm": 10.124153644868226, "language_loss": 0.73274714, "learning_rate": 2.0578465904009845e-07, "loss": 0.75410414, "num_input_tokens_seen": 308070660, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 14282, "time_per_iteration": 2.4561636447906494 }, { "auxiliary_loss_clip": 0.01099827, "auxiliary_loss_mlp": 0.01026855, "balance_loss_clip": 1.01546454, "balance_loss_mlp": 1.03187096, "epoch": 0.858740417856606, "flos": 22711237176960.0, "grad_norm": 1.7555894285811735, "language_loss": 0.75559026, "learning_rate": 2.0561262422213832e-07, "loss": 0.77685708, "num_input_tokens_seen": 308089520, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 14283, "time_per_iteration": 2.476895570755005 }, { "auxiliary_loss_clip": 0.01103121, "auxiliary_loss_mlp": 0.01028732, "balance_loss_clip": 1.01652503, "balance_loss_mlp": 1.03371704, "epoch": 0.8588005411092741, "flos": 34054303962240.0, "grad_norm": 4.949918998257687, "language_loss": 0.60219789, "learning_rate": 2.0544065744752736e-07, "loss": 0.62351638, "num_input_tokens_seen": 308111545, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 14284, "time_per_iteration": 2.584514856338501 }, { "auxiliary_loss_clip": 0.01101467, "auxiliary_loss_mlp": 0.01033121, "balance_loss_clip": 1.02100968, "balance_loss_mlp": 1.03501856, "epoch": 0.858860664361942, "flos": 28913584268160.0, "grad_norm": 1.8549979429279648, "language_loss": 0.75657129, "learning_rate": 2.0526875872278749e-07, "loss": 0.77791715, "num_input_tokens_seen": 308129690, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6640625, "step": 14285, "time_per_iteration": 2.5093958377838135 }, { "auxiliary_loss_clip": 0.01108906, "auxiliary_loss_mlp": 0.01034616, "balance_loss_clip": 1.02147317, "balance_loss_mlp": 1.03851104, "epoch": 0.85892078761461, "flos": 19792525743360.0, "grad_norm": 1.9484068134015131, "language_loss": 0.74567646, "learning_rate": 2.0509692805443524e-07, "loss": 0.76711166, "num_input_tokens_seen": 308147410, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 14286, "time_per_iteration": 2.4713032245635986 }, { "auxiliary_loss_clip": 0.01029572, "auxiliary_loss_mlp": 0.01001781, "balance_loss_clip": 1.0007441, "balance_loss_mlp": 1.00746059, "epoch": 0.8589809108672779, "flos": 67106630039040.0, "grad_norm": 0.7910270708800075, "language_loss": 0.49498022, "learning_rate": 2.0492516544898718e-07, "loss": 0.51529378, "num_input_tokens_seen": 308204875, "router_z_loss_clip": 0.01037598, "router_z_loss_mlp": 0.22070312, "step": 14287, "time_per_iteration": 3.0468406677246094 }, { "auxiliary_loss_clip": 0.01107811, "auxiliary_loss_mlp": 0.01030449, "balance_loss_clip": 1.01849842, "balance_loss_mlp": 1.03817928, "epoch": 0.8590410341199459, "flos": 29716259541120.0, "grad_norm": 1.9282730521003617, "language_loss": 0.79244757, "learning_rate": 2.0475347091295704e-07, "loss": 0.81383014, "num_input_tokens_seen": 308225690, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 14288, "time_per_iteration": 2.506094217300415 }, { "auxiliary_loss_clip": 0.0110606, "auxiliary_loss_mlp": 0.01034193, "balance_loss_clip": 1.02049589, "balance_loss_mlp": 1.03560984, "epoch": 0.8591011573726138, "flos": 23987430466560.0, "grad_norm": 1.9679498911750701, "language_loss": 0.80966377, "learning_rate": 2.045818444528553e-07, "loss": 0.83106631, "num_input_tokens_seen": 308245255, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 14289, "time_per_iteration": 2.4906744956970215 }, { "auxiliary_loss_clip": 0.01106948, "auxiliary_loss_mlp": 0.01028006, "balance_loss_clip": 1.01603746, "balance_loss_mlp": 1.03817451, "epoch": 0.8591612806252819, "flos": 14428656806400.0, "grad_norm": 2.526368890123752, "language_loss": 0.65256053, "learning_rate": 2.0441028607518973e-07, "loss": 0.67391008, "num_input_tokens_seen": 308261755, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 14290, "time_per_iteration": 3.8714938163757324 }, { "auxiliary_loss_clip": 0.01107555, "auxiliary_loss_mlp": 0.01030767, "balance_loss_clip": 1.01755834, "balance_loss_mlp": 1.03681266, "epoch": 0.8592214038779498, "flos": 31577150419200.0, "grad_norm": 2.089483776301685, "language_loss": 0.55442691, "learning_rate": 2.0423879578646642e-07, "loss": 0.57581013, "num_input_tokens_seen": 308285145, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 14291, "time_per_iteration": 2.5602757930755615 }, { "auxiliary_loss_clip": 0.01106643, "auxiliary_loss_mlp": 0.01029893, "balance_loss_clip": 1.01780486, "balance_loss_mlp": 1.03716266, "epoch": 0.8592815271306178, "flos": 17457290835840.0, "grad_norm": 2.305160890205681, "language_loss": 0.71447641, "learning_rate": 2.0406737359318792e-07, "loss": 0.73584175, "num_input_tokens_seen": 308304130, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 14292, "time_per_iteration": 2.416653633117676 }, { "auxiliary_loss_clip": 0.01102913, "auxiliary_loss_mlp": 0.01031816, "balance_loss_clip": 1.01957941, "balance_loss_mlp": 1.03394639, "epoch": 0.8593416503832857, "flos": 25411360394880.0, "grad_norm": 2.0915638557589413, "language_loss": 0.7088142, "learning_rate": 2.038960195018542e-07, "loss": 0.73016143, "num_input_tokens_seen": 308324670, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 14293, "time_per_iteration": 2.5167596340179443 }, { "auxiliary_loss_clip": 0.01101106, "auxiliary_loss_mlp": 0.01032722, "balance_loss_clip": 1.02105761, "balance_loss_mlp": 1.03416693, "epoch": 0.8594017736359537, "flos": 20996646393600.0, "grad_norm": 1.803423910406107, "language_loss": 0.68531269, "learning_rate": 2.0372473351896358e-07, "loss": 0.70665097, "num_input_tokens_seen": 308344215, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.66796875, "step": 14294, "time_per_iteration": 3.9160590171813965 }, { "auxiliary_loss_clip": 0.01100665, "auxiliary_loss_mlp": 0.01032445, "balance_loss_clip": 1.02056026, "balance_loss_mlp": 1.03379476, "epoch": 0.8594618968886216, "flos": 22091059929600.0, "grad_norm": 2.4780548441515693, "language_loss": 0.78243959, "learning_rate": 2.0355351565101087e-07, "loss": 0.80377072, "num_input_tokens_seen": 308360520, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.66796875, "step": 14295, "time_per_iteration": 2.4508743286132812 }, { "auxiliary_loss_clip": 0.01108968, "auxiliary_loss_mlp": 0.01039443, "balance_loss_clip": 1.02537036, "balance_loss_mlp": 1.03579879, "epoch": 0.8595220201412896, "flos": 11656245467520.0, "grad_norm": 3.187724477878639, "language_loss": 0.68575883, "learning_rate": 2.0338236590448975e-07, "loss": 0.70724291, "num_input_tokens_seen": 308376865, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73046875, "step": 14296, "time_per_iteration": 5.301978588104248 }, { "auxiliary_loss_clip": 0.01104561, "auxiliary_loss_mlp": 0.01031216, "balance_loss_clip": 1.01861548, "balance_loss_mlp": 1.0354712, "epoch": 0.8595821433939577, "flos": 25040366772480.0, "grad_norm": 2.5625115515457884, "language_loss": 0.78531861, "learning_rate": 2.0321128428588842e-07, "loss": 0.80667639, "num_input_tokens_seen": 308395870, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 14297, "time_per_iteration": 2.490086555480957 }, { "auxiliary_loss_clip": 0.01100716, "auxiliary_loss_mlp": 0.0102883, "balance_loss_clip": 1.01820266, "balance_loss_mlp": 1.03421926, "epoch": 0.8596422666466256, "flos": 28511528359680.0, "grad_norm": 2.0198460899514337, "language_loss": 0.68234992, "learning_rate": 2.030402708016954e-07, "loss": 0.70364541, "num_input_tokens_seen": 308417250, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.6640625, "step": 14298, "time_per_iteration": 2.4987986087799072 }, { "auxiliary_loss_clip": 0.01103399, "auxiliary_loss_mlp": 0.01034132, "balance_loss_clip": 1.02242577, "balance_loss_mlp": 1.03659868, "epoch": 0.8597023898992936, "flos": 13589137157760.0, "grad_norm": 2.151545441473857, "language_loss": 0.68388116, "learning_rate": 2.0286932545839576e-07, "loss": 0.70525646, "num_input_tokens_seen": 308434565, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.66796875, "step": 14299, "time_per_iteration": 2.4443891048431396 }, { "auxiliary_loss_clip": 0.01106373, "auxiliary_loss_mlp": 0.01034298, "balance_loss_clip": 1.02237701, "balance_loss_mlp": 1.03554523, "epoch": 0.8597625131519615, "flos": 32300821728000.0, "grad_norm": 28.05676155835702, "language_loss": 0.71684241, "learning_rate": 2.0269844826247096e-07, "loss": 0.73824918, "num_input_tokens_seen": 308450040, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.7109375, "step": 14300, "time_per_iteration": 2.5179519653320312 }, { "auxiliary_loss_clip": 0.0110131, "auxiliary_loss_mlp": 0.01031362, "balance_loss_clip": 1.01978719, "balance_loss_mlp": 1.03302789, "epoch": 0.8598226364046295, "flos": 28730367970560.0, "grad_norm": 1.7602541563393008, "language_loss": 0.68858176, "learning_rate": 2.0252763922040116e-07, "loss": 0.70990849, "num_input_tokens_seen": 308470545, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.68359375, "step": 14301, "time_per_iteration": 2.5128211975097656 }, { "auxiliary_loss_clip": 0.01104559, "auxiliary_loss_mlp": 0.01029685, "balance_loss_clip": 1.01784182, "balance_loss_mlp": 1.03602111, "epoch": 0.8598827596572974, "flos": 21871825269120.0, "grad_norm": 2.8030893720415846, "language_loss": 0.74088925, "learning_rate": 2.023568983386641e-07, "loss": 0.76223171, "num_input_tokens_seen": 308490020, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 14302, "time_per_iteration": 2.4440836906433105 }, { "auxiliary_loss_clip": 0.01099079, "auxiliary_loss_mlp": 0.01030312, "balance_loss_clip": 1.01900458, "balance_loss_mlp": 1.03341222, "epoch": 0.8599428829099655, "flos": 23767297966080.0, "grad_norm": 2.005152301173475, "language_loss": 0.83998048, "learning_rate": 2.02186225623733e-07, "loss": 0.86127436, "num_input_tokens_seen": 308509065, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.65625, "step": 14303, "time_per_iteration": 2.50598406791687 }, { "auxiliary_loss_clip": 0.01105322, "auxiliary_loss_mlp": 0.01038497, "balance_loss_clip": 1.02537811, "balance_loss_mlp": 1.03516984, "epoch": 0.8600030061626334, "flos": 16212770363520.0, "grad_norm": 2.2350904884682206, "language_loss": 0.77555835, "learning_rate": 2.0201562108208025e-07, "loss": 0.79699653, "num_input_tokens_seen": 308524725, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 14304, "time_per_iteration": 2.453653573989868 }, { "auxiliary_loss_clip": 0.01106148, "auxiliary_loss_mlp": 0.01033038, "balance_loss_clip": 1.02016389, "balance_loss_mlp": 1.03665388, "epoch": 0.8600631294153014, "flos": 15669370437120.0, "grad_norm": 2.2275839637144923, "language_loss": 0.54013574, "learning_rate": 2.0184508472017537e-07, "loss": 0.56152755, "num_input_tokens_seen": 308543525, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 14305, "time_per_iteration": 2.4243204593658447 }, { "auxiliary_loss_clip": 0.01104577, "auxiliary_loss_mlp": 0.01027821, "balance_loss_clip": 1.01505959, "balance_loss_mlp": 1.03610349, "epoch": 0.8601232526679693, "flos": 17493093717120.0, "grad_norm": 2.578021546084559, "language_loss": 0.83713675, "learning_rate": 2.0167461654448558e-07, "loss": 0.85846072, "num_input_tokens_seen": 308557995, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 14306, "time_per_iteration": 2.431809425354004 }, { "auxiliary_loss_clip": 0.01100013, "auxiliary_loss_mlp": 0.01025531, "balance_loss_clip": 1.01454544, "balance_loss_mlp": 1.0343312, "epoch": 0.8601833759206373, "flos": 26985935963520.0, "grad_norm": 1.6996042396828936, "language_loss": 0.71640527, "learning_rate": 2.01504216561474e-07, "loss": 0.73766065, "num_input_tokens_seen": 308582750, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.65625, "step": 14307, "time_per_iteration": 2.5645837783813477 }, { "auxiliary_loss_clip": 0.01106941, "auxiliary_loss_mlp": 0.01042228, "balance_loss_clip": 1.0288949, "balance_loss_mlp": 1.03566611, "epoch": 0.8602434991733052, "flos": 25229760209280.0, "grad_norm": 39.987799946952485, "language_loss": 0.64016306, "learning_rate": 2.0133388477760316e-07, "loss": 0.66165471, "num_input_tokens_seen": 308603770, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 14308, "time_per_iteration": 2.5024831295013428 }, { "auxiliary_loss_clip": 0.01028474, "auxiliary_loss_mlp": 0.01001431, "balance_loss_clip": 1.00039434, "balance_loss_mlp": 1.00634062, "epoch": 0.8603036224259732, "flos": 71015363107200.0, "grad_norm": 0.6259902455692863, "language_loss": 0.48432261, "learning_rate": 2.0116362119933172e-07, "loss": 0.50462162, "num_input_tokens_seen": 308667735, "router_z_loss_clip": 0.01037598, "router_z_loss_mlp": 0.22070312, "step": 14309, "time_per_iteration": 3.1736340522766113 }, { "auxiliary_loss_clip": 0.01106063, "auxiliary_loss_mlp": 0.01036307, "balance_loss_clip": 1.02280641, "balance_loss_mlp": 1.03594112, "epoch": 0.8603637456786413, "flos": 20300625578880.0, "grad_norm": 1.866375094219907, "language_loss": 0.67068946, "learning_rate": 2.0099342583311563e-07, "loss": 0.69211316, "num_input_tokens_seen": 308686300, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 14310, "time_per_iteration": 2.4450089931488037 }, { "auxiliary_loss_clip": 0.01105662, "auxiliary_loss_mlp": 0.01028366, "balance_loss_clip": 1.01683891, "balance_loss_mlp": 1.03515625, "epoch": 0.8604238689313092, "flos": 21835842819840.0, "grad_norm": 1.7450803447883587, "language_loss": 0.78112066, "learning_rate": 2.0082329868540905e-07, "loss": 0.80246091, "num_input_tokens_seen": 308705825, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.703125, "step": 14311, "time_per_iteration": 2.490154981613159 }, { "auxiliary_loss_clip": 0.01101392, "auxiliary_loss_mlp": 0.01029511, "balance_loss_clip": 1.01748836, "balance_loss_mlp": 1.03397584, "epoch": 0.8604839921839772, "flos": 18004210295040.0, "grad_norm": 2.0468257759284403, "language_loss": 0.71953452, "learning_rate": 2.006532397626639e-07, "loss": 0.74084365, "num_input_tokens_seen": 308723340, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 14312, "time_per_iteration": 2.4173574447631836 }, { "auxiliary_loss_clip": 0.01102724, "auxiliary_loss_mlp": 0.01028056, "balance_loss_clip": 1.01630783, "balance_loss_mlp": 1.03380477, "epoch": 0.8605441154366451, "flos": 16252164604800.0, "grad_norm": 2.045904314713146, "language_loss": 0.77853918, "learning_rate": 2.0048324907132797e-07, "loss": 0.79984689, "num_input_tokens_seen": 308741280, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6875, "step": 14313, "time_per_iteration": 2.4366536140441895 }, { "auxiliary_loss_clip": 0.01103722, "auxiliary_loss_mlp": 0.01031296, "balance_loss_clip": 1.01818335, "balance_loss_mlp": 1.03709936, "epoch": 0.8606042386893131, "flos": 32267065921920.0, "grad_norm": 1.5494866035242056, "language_loss": 0.72944379, "learning_rate": 2.003133266178474e-07, "loss": 0.75079399, "num_input_tokens_seen": 308762875, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.66796875, "step": 14314, "time_per_iteration": 2.5344245433807373 }, { "auxiliary_loss_clip": 0.01101639, "auxiliary_loss_mlp": 0.01029632, "balance_loss_clip": 1.01737702, "balance_loss_mlp": 1.03304696, "epoch": 0.860664361941981, "flos": 20229774001920.0, "grad_norm": 2.6452102795892243, "language_loss": 0.69289893, "learning_rate": 2.001434724086657e-07, "loss": 0.71421164, "num_input_tokens_seen": 308780315, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 14315, "time_per_iteration": 2.4488935470581055 }, { "auxiliary_loss_clip": 0.01103359, "auxiliary_loss_mlp": 0.01037013, "balance_loss_clip": 1.02491951, "balance_loss_mlp": 1.03538048, "epoch": 0.8607244851946491, "flos": 25191622944000.0, "grad_norm": 1.7279515430505894, "language_loss": 0.72018015, "learning_rate": 1.9997368645022418e-07, "loss": 0.74158382, "num_input_tokens_seen": 308799435, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 14316, "time_per_iteration": 2.486017942428589 }, { "auxiliary_loss_clip": 0.01107943, "auxiliary_loss_mlp": 0.01027432, "balance_loss_clip": 1.0153439, "balance_loss_mlp": 1.03817689, "epoch": 0.860784608447317, "flos": 20482082110080.0, "grad_norm": 2.210082488361667, "language_loss": 0.83284128, "learning_rate": 1.9980396874896056e-07, "loss": 0.854195, "num_input_tokens_seen": 308817730, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 14317, "time_per_iteration": 2.4756920337677 }, { "auxiliary_loss_clip": 0.01103212, "auxiliary_loss_mlp": 0.01032363, "balance_loss_clip": 1.02014446, "balance_loss_mlp": 1.03640187, "epoch": 0.860844731699985, "flos": 50476037696640.0, "grad_norm": 1.6510734408413335, "language_loss": 0.66907847, "learning_rate": 1.996343193113108e-07, "loss": 0.69043422, "num_input_tokens_seen": 308841735, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.66796875, "step": 14318, "time_per_iteration": 2.6983845233917236 }, { "auxiliary_loss_clip": 0.01100773, "auxiliary_loss_mlp": 0.01029212, "balance_loss_clip": 1.01778579, "balance_loss_mlp": 1.03415775, "epoch": 0.8609048549526529, "flos": 41172768455040.0, "grad_norm": 2.3700657836403103, "language_loss": 0.71228576, "learning_rate": 1.9946473814370911e-07, "loss": 0.7335856, "num_input_tokens_seen": 308865050, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6640625, "step": 14319, "time_per_iteration": 2.657284736633301 }, { "auxiliary_loss_clip": 0.01108904, "auxiliary_loss_mlp": 0.01032896, "balance_loss_clip": 1.02006888, "balance_loss_mlp": 1.03828311, "epoch": 0.8609649782053209, "flos": 23951196622080.0, "grad_norm": 1.6497003272092163, "language_loss": 0.6740967, "learning_rate": 1.992952252525839e-07, "loss": 0.69551468, "num_input_tokens_seen": 308885375, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 14320, "time_per_iteration": 2.481672763824463 }, { "auxiliary_loss_clip": 0.01105962, "auxiliary_loss_mlp": 0.01033025, "balance_loss_clip": 1.02044821, "balance_loss_mlp": 1.0344739, "epoch": 0.8610251014579888, "flos": 23112574813440.0, "grad_norm": 2.0672495500569537, "language_loss": 0.80387026, "learning_rate": 1.9912578064436446e-07, "loss": 0.8252601, "num_input_tokens_seen": 308904700, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 14321, "time_per_iteration": 2.4796221256256104 }, { "auxiliary_loss_clip": 0.01101846, "auxiliary_loss_mlp": 0.01031676, "balance_loss_clip": 1.01811647, "balance_loss_mlp": 1.03523481, "epoch": 0.8610852247106568, "flos": 19426811420160.0, "grad_norm": 1.9982690435977877, "language_loss": 0.71096218, "learning_rate": 1.9895640432547567e-07, "loss": 0.73229742, "num_input_tokens_seen": 308922985, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.6640625, "step": 14322, "time_per_iteration": 2.431753396987915 }, { "auxiliary_loss_clip": 0.0110872, "auxiliary_loss_mlp": 0.01034607, "balance_loss_clip": 1.0211544, "balance_loss_mlp": 1.03621793, "epoch": 0.8611453479633249, "flos": 19312076401920.0, "grad_norm": 1.9278294521796655, "language_loss": 0.56236726, "learning_rate": 1.9878709630234102e-07, "loss": 0.58380049, "num_input_tokens_seen": 308940765, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 14323, "time_per_iteration": 2.472001791000366 }, { "auxiliary_loss_clip": 0.01102394, "auxiliary_loss_mlp": 0.01025307, "balance_loss_clip": 1.01336825, "balance_loss_mlp": 1.03452122, "epoch": 0.8612054712159928, "flos": 23253667436160.0, "grad_norm": 1.9094747932675802, "language_loss": 0.75905108, "learning_rate": 1.986178565813801e-07, "loss": 0.78032804, "num_input_tokens_seen": 308960110, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 14324, "time_per_iteration": 2.473640203475952 }, { "auxiliary_loss_clip": 0.01104952, "auxiliary_loss_mlp": 0.01032724, "balance_loss_clip": 1.01990867, "balance_loss_mlp": 1.03628707, "epoch": 0.8612655944686608, "flos": 16028440744320.0, "grad_norm": 2.495032129316789, "language_loss": 0.66630459, "learning_rate": 1.9844868516901036e-07, "loss": 0.68768132, "num_input_tokens_seen": 308976665, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 14325, "time_per_iteration": 2.4321625232696533 }, { "auxiliary_loss_clip": 0.01106237, "auxiliary_loss_mlp": 0.01033494, "balance_loss_clip": 1.02105415, "balance_loss_mlp": 1.03593373, "epoch": 0.8613257177213287, "flos": 22492720788480.0, "grad_norm": 1.7119248977865218, "language_loss": 0.64766264, "learning_rate": 1.982795820716472e-07, "loss": 0.66905993, "num_input_tokens_seen": 308997015, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 14326, "time_per_iteration": 2.4669063091278076 }, { "auxiliary_loss_clip": 0.01105521, "auxiliary_loss_mlp": 0.01031203, "balance_loss_clip": 1.01831055, "balance_loss_mlp": 1.03612638, "epoch": 0.8613858409739967, "flos": 17238056175360.0, "grad_norm": 2.0384921852149103, "language_loss": 0.84120977, "learning_rate": 1.9811054729570253e-07, "loss": 0.86257696, "num_input_tokens_seen": 309015250, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 14327, "time_per_iteration": 2.4936978816986084 }, { "auxiliary_loss_clip": 0.0110356, "auxiliary_loss_mlp": 0.01031798, "balance_loss_clip": 1.0194118, "balance_loss_mlp": 1.03466916, "epoch": 0.8614459642266646, "flos": 22821123859200.0, "grad_norm": 2.0797335052888037, "language_loss": 0.75616622, "learning_rate": 1.9794158084758661e-07, "loss": 0.77751982, "num_input_tokens_seen": 309034140, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 14328, "time_per_iteration": 2.4703567028045654 }, { "auxiliary_loss_clip": 0.01102908, "auxiliary_loss_mlp": 0.01026003, "balance_loss_clip": 1.01368284, "balance_loss_mlp": 1.03580618, "epoch": 0.8615060874793327, "flos": 26504301473280.0, "grad_norm": 13.008635934640477, "language_loss": 0.8017208, "learning_rate": 1.9777268273370673e-07, "loss": 0.82300991, "num_input_tokens_seen": 309055075, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.671875, "step": 14329, "time_per_iteration": 2.5045413970947266 }, { "auxiliary_loss_clip": 0.01102987, "auxiliary_loss_mlp": 0.01030803, "balance_loss_clip": 1.01848888, "balance_loss_mlp": 1.03447986, "epoch": 0.8615662107320006, "flos": 24061011477120.0, "grad_norm": 3.3289786670117993, "language_loss": 0.7704016, "learning_rate": 1.9760385296046757e-07, "loss": 0.79173946, "num_input_tokens_seen": 309074650, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 14330, "time_per_iteration": 2.4670917987823486 }, { "auxiliary_loss_clip": 0.0110252, "auxiliary_loss_mlp": 0.01026204, "balance_loss_clip": 1.01453328, "balance_loss_mlp": 1.0347563, "epoch": 0.8616263339846686, "flos": 24165044242560.0, "grad_norm": 2.4511659177217595, "language_loss": 0.65400487, "learning_rate": 1.974350915342702e-07, "loss": 0.67529213, "num_input_tokens_seen": 309094385, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 14331, "time_per_iteration": 2.481637716293335 }, { "auxiliary_loss_clip": 0.01102187, "auxiliary_loss_mlp": 0.01032986, "balance_loss_clip": 1.02143455, "balance_loss_mlp": 1.03505635, "epoch": 0.8616864572373365, "flos": 21724340025600.0, "grad_norm": 1.8064393255456315, "language_loss": 0.76133609, "learning_rate": 1.9726639846151506e-07, "loss": 0.78268784, "num_input_tokens_seen": 309111815, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 14332, "time_per_iteration": 3.972249984741211 }, { "auxiliary_loss_clip": 0.01106921, "auxiliary_loss_mlp": 0.01028896, "balance_loss_clip": 1.01534235, "balance_loss_mlp": 1.0350126, "epoch": 0.8617465804900045, "flos": 23766651521280.0, "grad_norm": 2.2866808040553086, "language_loss": 0.67151463, "learning_rate": 1.9709777374859904e-07, "loss": 0.69287276, "num_input_tokens_seen": 309131385, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 14333, "time_per_iteration": 2.4627177715301514 }, { "auxiliary_loss_clip": 0.01109374, "auxiliary_loss_mlp": 0.01036637, "balance_loss_clip": 1.02245152, "balance_loss_mlp": 1.036219, "epoch": 0.8618067037426724, "flos": 37703941251840.0, "grad_norm": 1.9798761491447046, "language_loss": 0.62200534, "learning_rate": 1.969292174019157e-07, "loss": 0.64346552, "num_input_tokens_seen": 309155020, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.73046875, "step": 14334, "time_per_iteration": 2.60182785987854 }, { "auxiliary_loss_clip": 0.01109307, "auxiliary_loss_mlp": 0.01037836, "balance_loss_clip": 1.02528286, "balance_loss_mlp": 1.03819406, "epoch": 0.8618668269953405, "flos": 21471026336640.0, "grad_norm": 3.450687596180473, "language_loss": 0.69416034, "learning_rate": 1.967607294278577e-07, "loss": 0.71563178, "num_input_tokens_seen": 309172865, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 14335, "time_per_iteration": 3.8102517127990723 }, { "auxiliary_loss_clip": 0.01106466, "auxiliary_loss_mlp": 0.01032847, "balance_loss_clip": 1.02053225, "balance_loss_mlp": 1.03648424, "epoch": 0.8619269502480085, "flos": 22232691256320.0, "grad_norm": 1.4701840087652143, "language_loss": 0.82882124, "learning_rate": 1.965923098328135e-07, "loss": 0.85021436, "num_input_tokens_seen": 309193575, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 14336, "time_per_iteration": 2.5080976486206055 }, { "auxiliary_loss_clip": 0.01108349, "auxiliary_loss_mlp": 0.01029587, "balance_loss_clip": 1.0170939, "balance_loss_mlp": 1.03607154, "epoch": 0.8619870735006764, "flos": 22710626645760.0, "grad_norm": 1.8495512198156465, "language_loss": 0.67878932, "learning_rate": 1.9642395862316907e-07, "loss": 0.70016867, "num_input_tokens_seen": 309212680, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.72265625, "step": 14337, "time_per_iteration": 3.820533514022827 }, { "auxiliary_loss_clip": 0.01102003, "auxiliary_loss_mlp": 0.01028111, "balance_loss_clip": 1.01621342, "balance_loss_mlp": 1.03374374, "epoch": 0.8620471967533444, "flos": 37520293991040.0, "grad_norm": 2.7411725605744364, "language_loss": 0.67078102, "learning_rate": 1.962556758053089e-07, "loss": 0.69208217, "num_input_tokens_seen": 309234485, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 14338, "time_per_iteration": 4.017763614654541 }, { "auxiliary_loss_clip": 0.01105754, "auxiliary_loss_mlp": 0.01031791, "balance_loss_clip": 1.02027535, "balance_loss_mlp": 1.03642392, "epoch": 0.8621073200060123, "flos": 19682459493120.0, "grad_norm": 3.0697070503959574, "language_loss": 0.61760521, "learning_rate": 1.9608746138561448e-07, "loss": 0.63898063, "num_input_tokens_seen": 309253630, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6953125, "step": 14339, "time_per_iteration": 2.4775195121765137 }, { "auxiliary_loss_clip": 0.01101992, "auxiliary_loss_mlp": 0.01031635, "balance_loss_clip": 1.01934421, "balance_loss_mlp": 1.03347135, "epoch": 0.8621674432586803, "flos": 14536855549440.0, "grad_norm": 1.9921215270207504, "language_loss": 0.62751508, "learning_rate": 1.9591931537046458e-07, "loss": 0.64885128, "num_input_tokens_seen": 309270950, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 14340, "time_per_iteration": 2.4141716957092285 }, { "auxiliary_loss_clip": 0.01098977, "auxiliary_loss_mlp": 0.0102345, "balance_loss_clip": 1.01231003, "balance_loss_mlp": 1.03482223, "epoch": 0.8622275665113482, "flos": 20740100480640.0, "grad_norm": 2.12313868424224, "language_loss": 0.80066061, "learning_rate": 1.9575123776623493e-07, "loss": 0.82188493, "num_input_tokens_seen": 309288780, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.640625, "step": 14341, "time_per_iteration": 2.4678778648376465 }, { "auxiliary_loss_clip": 0.01102233, "auxiliary_loss_mlp": 0.01032299, "balance_loss_clip": 1.02047944, "balance_loss_mlp": 1.03488278, "epoch": 0.8622876897640163, "flos": 24715914197760.0, "grad_norm": 2.5151316685397913, "language_loss": 0.74204898, "learning_rate": 1.9558322857929887e-07, "loss": 0.76339424, "num_input_tokens_seen": 309310875, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 14342, "time_per_iteration": 2.4924771785736084 }, { "auxiliary_loss_clip": 0.01106244, "auxiliary_loss_mlp": 0.01026727, "balance_loss_clip": 1.01398993, "balance_loss_mlp": 1.03643107, "epoch": 0.8623478130166842, "flos": 17457362663040.0, "grad_norm": 2.5889731328030696, "language_loss": 0.68731683, "learning_rate": 1.95415287816028e-07, "loss": 0.70864654, "num_input_tokens_seen": 309329900, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 14343, "time_per_iteration": 2.4617488384246826 }, { "auxiliary_loss_clip": 0.01104348, "auxiliary_loss_mlp": 0.01042034, "balance_loss_clip": 1.02868283, "balance_loss_mlp": 1.03509188, "epoch": 0.8624079362693522, "flos": 18109176814080.0, "grad_norm": 1.986222266282696, "language_loss": 0.68356532, "learning_rate": 1.9524741548278967e-07, "loss": 0.70502913, "num_input_tokens_seen": 309347870, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.69140625, "step": 14344, "time_per_iteration": 2.4415948390960693 }, { "auxiliary_loss_clip": 0.01106237, "auxiliary_loss_mlp": 0.0103415, "balance_loss_clip": 1.02171612, "balance_loss_mlp": 1.03551483, "epoch": 0.8624680595220201, "flos": 30666455971200.0, "grad_norm": 1.780515107226022, "language_loss": 0.81540704, "learning_rate": 1.9507961158595054e-07, "loss": 0.83681095, "num_input_tokens_seen": 309371695, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.70703125, "step": 14345, "time_per_iteration": 2.5577573776245117 }, { "auxiliary_loss_clip": 0.01107597, "auxiliary_loss_mlp": 0.01029324, "balance_loss_clip": 1.01646733, "balance_loss_mlp": 1.03672194, "epoch": 0.8625281827746881, "flos": 37998588516480.0, "grad_norm": 2.3658831506339175, "language_loss": 0.50324655, "learning_rate": 1.9491187613187355e-07, "loss": 0.52461576, "num_input_tokens_seen": 309394645, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 14346, "time_per_iteration": 2.607738494873047 }, { "auxiliary_loss_clip": 0.01102878, "auxiliary_loss_mlp": 0.010323, "balance_loss_clip": 1.0198245, "balance_loss_mlp": 1.03417838, "epoch": 0.862588306027356, "flos": 26249730808320.0, "grad_norm": 1.7719447701844124, "language_loss": 0.75223345, "learning_rate": 1.9474420912691913e-07, "loss": 0.7735852, "num_input_tokens_seen": 309413170, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 14347, "time_per_iteration": 2.487339973449707 }, { "auxiliary_loss_clip": 0.01105756, "auxiliary_loss_mlp": 0.01028871, "balance_loss_clip": 1.01613975, "balance_loss_mlp": 1.03654706, "epoch": 0.862648429280024, "flos": 25878809013120.0, "grad_norm": 2.390948204806939, "language_loss": 0.81047481, "learning_rate": 1.945766105774449e-07, "loss": 0.83182108, "num_input_tokens_seen": 309431315, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 14348, "time_per_iteration": 2.4812309741973877 }, { "auxiliary_loss_clip": 0.01099519, "auxiliary_loss_mlp": 0.01026502, "balance_loss_clip": 1.01503992, "balance_loss_mlp": 1.03294873, "epoch": 0.862708552532692, "flos": 37816413713280.0, "grad_norm": 1.9291131792616938, "language_loss": 0.6632778, "learning_rate": 1.9440908048980665e-07, "loss": 0.68453801, "num_input_tokens_seen": 309453020, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6640625, "step": 14349, "time_per_iteration": 2.598930597305298 }, { "auxiliary_loss_clip": 0.01105193, "auxiliary_loss_mlp": 0.01032947, "balance_loss_clip": 1.02038193, "balance_loss_mlp": 1.03624344, "epoch": 0.86276867578536, "flos": 19091800247040.0, "grad_norm": 2.287417657808177, "language_loss": 0.70007652, "learning_rate": 1.942416188703573e-07, "loss": 0.7214579, "num_input_tokens_seen": 309469780, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 14350, "time_per_iteration": 2.414499521255493 }, { "auxiliary_loss_clip": 0.01102484, "auxiliary_loss_mlp": 0.01032933, "balance_loss_clip": 1.0206486, "balance_loss_mlp": 1.03408551, "epoch": 0.862828799038028, "flos": 22164281804160.0, "grad_norm": 1.832080113430194, "language_loss": 0.77084076, "learning_rate": 1.9407422572544618e-07, "loss": 0.79219496, "num_input_tokens_seen": 309489610, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 14351, "time_per_iteration": 2.4763174057006836 }, { "auxiliary_loss_clip": 0.01103435, "auxiliary_loss_mlp": 0.01025006, "balance_loss_clip": 1.01358557, "balance_loss_mlp": 1.03455532, "epoch": 0.8628889222906959, "flos": 23145576433920.0, "grad_norm": 2.15038685176232, "language_loss": 0.85262609, "learning_rate": 1.9390690106142204e-07, "loss": 0.87391055, "num_input_tokens_seen": 309508295, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6875, "step": 14352, "time_per_iteration": 2.4574272632598877 }, { "auxiliary_loss_clip": 0.01028706, "auxiliary_loss_mlp": 0.00999223, "balance_loss_clip": 0.99825191, "balance_loss_mlp": 1.00671864, "epoch": 0.8629490455433639, "flos": 57817762151040.0, "grad_norm": 0.8034054370574442, "language_loss": 0.61913395, "learning_rate": 1.9373964488462913e-07, "loss": 0.63941324, "num_input_tokens_seen": 309567960, "router_z_loss_clip": 0.00970459, "router_z_loss_mlp": 0.22070312, "step": 14353, "time_per_iteration": 3.1072211265563965 }, { "auxiliary_loss_clip": 0.01103031, "auxiliary_loss_mlp": 0.01030386, "balance_loss_clip": 1.01898944, "balance_loss_mlp": 1.03613281, "epoch": 0.8630091687960318, "flos": 15919667383680.0, "grad_norm": 3.975287690162803, "language_loss": 0.81854582, "learning_rate": 1.9357245720140948e-07, "loss": 0.83987999, "num_input_tokens_seen": 309586050, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.66796875, "step": 14354, "time_per_iteration": 2.464494228363037 }, { "auxiliary_loss_clip": 0.01103207, "auxiliary_loss_mlp": 0.01029355, "balance_loss_clip": 1.01656961, "balance_loss_mlp": 1.03445339, "epoch": 0.8630692920486999, "flos": 17961691570560.0, "grad_norm": 2.115567863606184, "language_loss": 0.85751426, "learning_rate": 1.934053380181031e-07, "loss": 0.87883991, "num_input_tokens_seen": 309602910, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 14355, "time_per_iteration": 2.4471800327301025 }, { "auxiliary_loss_clip": 0.01104745, "auxiliary_loss_mlp": 0.0103047, "balance_loss_clip": 1.01754761, "balance_loss_mlp": 1.03447998, "epoch": 0.8631294153013678, "flos": 22455158140800.0, "grad_norm": 2.8766841591117984, "language_loss": 0.58875024, "learning_rate": 1.9323828734104763e-07, "loss": 0.61010242, "num_input_tokens_seen": 309621175, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 14356, "time_per_iteration": 2.4629781246185303 }, { "auxiliary_loss_clip": 0.01106956, "auxiliary_loss_mlp": 0.01034636, "balance_loss_clip": 1.02140975, "balance_loss_mlp": 1.03555632, "epoch": 0.8631895385540358, "flos": 16837005847680.0, "grad_norm": 1.762828020720963, "language_loss": 0.77147466, "learning_rate": 1.9307130517657756e-07, "loss": 0.79289055, "num_input_tokens_seen": 309639395, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 14357, "time_per_iteration": 2.426063060760498 }, { "auxiliary_loss_clip": 0.01106449, "auxiliary_loss_mlp": 0.01031683, "balance_loss_clip": 1.01949954, "balance_loss_mlp": 1.03700209, "epoch": 0.8632496618067037, "flos": 18697214367360.0, "grad_norm": 2.8165527307172504, "language_loss": 0.7791186, "learning_rate": 1.9290439153102468e-07, "loss": 0.80049992, "num_input_tokens_seen": 309657265, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6953125, "step": 14358, "time_per_iteration": 2.429208755493164 }, { "auxiliary_loss_clip": 0.01103878, "auxiliary_loss_mlp": 0.01026242, "balance_loss_clip": 1.01403475, "balance_loss_mlp": 1.03429627, "epoch": 0.8633097850593717, "flos": 24279922915200.0, "grad_norm": 2.9961794257254986, "language_loss": 0.74960387, "learning_rate": 1.9273754641071816e-07, "loss": 0.77090508, "num_input_tokens_seen": 309678610, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 14359, "time_per_iteration": 2.503187656402588 }, { "auxiliary_loss_clip": 0.0109868, "auxiliary_loss_mlp": 0.0102595, "balance_loss_clip": 1.01375508, "balance_loss_mlp": 1.03290963, "epoch": 0.8633699083120396, "flos": 21178569801600.0, "grad_norm": 1.8322482520283927, "language_loss": 0.70611417, "learning_rate": 1.9257076982198517e-07, "loss": 0.72736043, "num_input_tokens_seen": 309697710, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.65625, "step": 14360, "time_per_iteration": 2.452207565307617 }, { "auxiliary_loss_clip": 0.01109319, "auxiliary_loss_mlp": 0.01033331, "balance_loss_clip": 1.01956224, "balance_loss_mlp": 1.03822649, "epoch": 0.8634300315647077, "flos": 19244888012160.0, "grad_norm": 1.9380048687205198, "language_loss": 0.76306438, "learning_rate": 1.9240406177114953e-07, "loss": 0.78449082, "num_input_tokens_seen": 309715985, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7109375, "step": 14361, "time_per_iteration": 2.4613006114959717 }, { "auxiliary_loss_clip": 0.01028815, "auxiliary_loss_mlp": 0.01002304, "balance_loss_clip": 1.00141013, "balance_loss_mlp": 1.00659645, "epoch": 0.8634901548173756, "flos": 66195648282240.0, "grad_norm": 0.9616432520899296, "language_loss": 0.5877108, "learning_rate": 1.922374222645329e-07, "loss": 0.60802203, "num_input_tokens_seen": 309779930, "router_z_loss_clip": 0.00891113, "router_z_loss_mlp": 0.22265625, "step": 14362, "time_per_iteration": 3.1202926635742188 }, { "auxiliary_loss_clip": 0.01108474, "auxiliary_loss_mlp": 0.01035228, "balance_loss_clip": 1.02135158, "balance_loss_mlp": 1.03687072, "epoch": 0.8635502780700436, "flos": 24789531121920.0, "grad_norm": 1.607381719464038, "language_loss": 0.80619907, "learning_rate": 1.9207085130845524e-07, "loss": 0.82763612, "num_input_tokens_seen": 309800580, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.71484375, "step": 14363, "time_per_iteration": 2.505315065383911 }, { "auxiliary_loss_clip": 0.01106292, "auxiliary_loss_mlp": 0.01035207, "balance_loss_clip": 1.02177238, "balance_loss_mlp": 1.03506494, "epoch": 0.8636104013227116, "flos": 25189970918400.0, "grad_norm": 2.5666715652946275, "language_loss": 0.72715235, "learning_rate": 1.9190434890923112e-07, "loss": 0.7485674, "num_input_tokens_seen": 309821725, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 14364, "time_per_iteration": 2.5069785118103027 }, { "auxiliary_loss_clip": 0.01106489, "auxiliary_loss_mlp": 0.01032863, "balance_loss_clip": 1.02082324, "balance_loss_mlp": 1.03523982, "epoch": 0.8636705245753795, "flos": 23878441624320.0, "grad_norm": 1.6662345665470915, "language_loss": 0.71901858, "learning_rate": 1.917379150731755e-07, "loss": 0.74041206, "num_input_tokens_seen": 309841565, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 14365, "time_per_iteration": 2.49751877784729 }, { "auxiliary_loss_clip": 0.01108816, "auxiliary_loss_mlp": 0.01036024, "balance_loss_clip": 1.02280319, "balance_loss_mlp": 1.03725541, "epoch": 0.8637306478280475, "flos": 23110455911040.0, "grad_norm": 2.4201454483464877, "language_loss": 0.70885527, "learning_rate": 1.915715498065993e-07, "loss": 0.73030365, "num_input_tokens_seen": 309858635, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 14366, "time_per_iteration": 2.4485979080200195 }, { "auxiliary_loss_clip": 0.01103327, "auxiliary_loss_mlp": 0.01025986, "balance_loss_clip": 1.01467299, "balance_loss_mlp": 1.03631306, "epoch": 0.8637907710807154, "flos": 21906802137600.0, "grad_norm": 1.988684811712209, "language_loss": 0.81447959, "learning_rate": 1.9140525311581146e-07, "loss": 0.83577275, "num_input_tokens_seen": 309877885, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.671875, "step": 14367, "time_per_iteration": 2.469235420227051 }, { "auxiliary_loss_clip": 0.01106438, "auxiliary_loss_mlp": 0.01028823, "balance_loss_clip": 1.01590657, "balance_loss_mlp": 1.03639174, "epoch": 0.8638508943333835, "flos": 23580526222080.0, "grad_norm": 3.4042786043419837, "language_loss": 0.61956519, "learning_rate": 1.9123902500711743e-07, "loss": 0.64091778, "num_input_tokens_seen": 309893140, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 14368, "time_per_iteration": 2.477280855178833 }, { "auxiliary_loss_clip": 0.01106287, "auxiliary_loss_mlp": 0.01030795, "balance_loss_clip": 1.0185107, "balance_loss_mlp": 1.03715444, "epoch": 0.8639110175860514, "flos": 25775853655680.0, "grad_norm": 2.297335372549922, "language_loss": 0.76510596, "learning_rate": 1.91072865486821e-07, "loss": 0.78647679, "num_input_tokens_seen": 309914175, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 14369, "time_per_iteration": 2.5044782161712646 }, { "auxiliary_loss_clip": 0.01106545, "auxiliary_loss_mlp": 0.01033476, "balance_loss_clip": 1.02044678, "balance_loss_mlp": 1.03529918, "epoch": 0.8639711408387194, "flos": 23369443948800.0, "grad_norm": 2.1750669614073868, "language_loss": 0.64912486, "learning_rate": 1.9090677456122294e-07, "loss": 0.67052507, "num_input_tokens_seen": 309932395, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 14370, "time_per_iteration": 2.482401132583618 }, { "auxiliary_loss_clip": 0.01106314, "auxiliary_loss_mlp": 0.01033504, "balance_loss_clip": 1.021083, "balance_loss_mlp": 1.03729069, "epoch": 0.8640312640913873, "flos": 22127221946880.0, "grad_norm": 2.080057025817762, "language_loss": 0.66137552, "learning_rate": 1.907407522366209e-07, "loss": 0.68277371, "num_input_tokens_seen": 309951720, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69140625, "step": 14371, "time_per_iteration": 2.455251693725586 }, { "auxiliary_loss_clip": 0.01028553, "auxiliary_loss_mlp": 0.01002269, "balance_loss_clip": 1.00127971, "balance_loss_mlp": 1.0065527, "epoch": 0.8640913873440553, "flos": 57571735944960.0, "grad_norm": 0.8688474451507724, "language_loss": 0.56994963, "learning_rate": 1.905747985193107e-07, "loss": 0.59025788, "num_input_tokens_seen": 310006120, "router_z_loss_clip": 0.0098877, "router_z_loss_mlp": 0.22070312, "step": 14372, "time_per_iteration": 2.9498074054718018 }, { "auxiliary_loss_clip": 0.01104572, "auxiliary_loss_mlp": 0.01027741, "balance_loss_clip": 1.01528943, "balance_loss_mlp": 1.03773379, "epoch": 0.8641515105967232, "flos": 23987430466560.0, "grad_norm": 2.14998632282447, "language_loss": 0.79607475, "learning_rate": 1.9040891341558597e-07, "loss": 0.81739783, "num_input_tokens_seen": 310026740, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.66796875, "step": 14373, "time_per_iteration": 3.9218316078186035 }, { "auxiliary_loss_clip": 0.01105704, "auxiliary_loss_mlp": 0.01028971, "balance_loss_clip": 1.01657343, "balance_loss_mlp": 1.03571892, "epoch": 0.8642116338493913, "flos": 19062749122560.0, "grad_norm": 2.3506375415670666, "language_loss": 0.63723493, "learning_rate": 1.9024309693173656e-07, "loss": 0.65858167, "num_input_tokens_seen": 310044135, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 14374, "time_per_iteration": 2.475480556488037 }, { "auxiliary_loss_clip": 0.01102513, "auxiliary_loss_mlp": 0.01033292, "balance_loss_clip": 1.02119243, "balance_loss_mlp": 1.03554535, "epoch": 0.8642717571020592, "flos": 18254148105600.0, "grad_norm": 2.101055163841989, "language_loss": 0.77273464, "learning_rate": 1.9007734907404993e-07, "loss": 0.79409266, "num_input_tokens_seen": 310061560, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.66796875, "step": 14375, "time_per_iteration": 2.429931402206421 }, { "auxiliary_loss_clip": 0.01104323, "auxiliary_loss_mlp": 0.0103225, "balance_loss_clip": 1.01974511, "balance_loss_mlp": 1.03522432, "epoch": 0.8643318803547272, "flos": 57663270777600.0, "grad_norm": 3.274449010149904, "language_loss": 0.60872161, "learning_rate": 1.899116698488117e-07, "loss": 0.63008738, "num_input_tokens_seen": 310087310, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 14376, "time_per_iteration": 4.184709548950195 }, { "auxiliary_loss_clip": 0.01102597, "auxiliary_loss_mlp": 0.0103509, "balance_loss_clip": 1.02284741, "balance_loss_mlp": 1.03417718, "epoch": 0.8643920036073952, "flos": 19609524927360.0, "grad_norm": 1.5472541322050837, "language_loss": 0.6628899, "learning_rate": 1.8974605926230457e-07, "loss": 0.68426681, "num_input_tokens_seen": 310106260, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 14377, "time_per_iteration": 2.461953639984131 }, { "auxiliary_loss_clip": 0.01105575, "auxiliary_loss_mlp": 0.01034562, "balance_loss_clip": 1.02174735, "balance_loss_mlp": 1.03486991, "epoch": 0.8644521268600631, "flos": 20850346298880.0, "grad_norm": 1.6448667690528647, "language_loss": 0.70348775, "learning_rate": 1.8958051732080804e-07, "loss": 0.7248891, "num_input_tokens_seen": 310125305, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 14378, "time_per_iteration": 2.4660725593566895 }, { "auxiliary_loss_clip": 0.01028616, "auxiliary_loss_mlp": 0.01000422, "balance_loss_clip": 0.99946833, "balance_loss_mlp": 1.00657988, "epoch": 0.8645122501127311, "flos": 66719550101760.0, "grad_norm": 0.8050226864463904, "language_loss": 0.60284859, "learning_rate": 1.894150440305995e-07, "loss": 0.62313896, "num_input_tokens_seen": 310189270, "router_z_loss_clip": 0.00952148, "router_z_loss_mlp": 0.22070312, "step": 14379, "time_per_iteration": 4.49169921875 }, { "auxiliary_loss_clip": 0.01101912, "auxiliary_loss_mlp": 0.01027887, "balance_loss_clip": 1.01609683, "balance_loss_mlp": 1.03384662, "epoch": 0.864572373365399, "flos": 21690009601920.0, "grad_norm": 1.6194133873390035, "language_loss": 0.74600983, "learning_rate": 1.8924963939795478e-07, "loss": 0.76730782, "num_input_tokens_seen": 310208395, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 14380, "time_per_iteration": 3.934675931930542 }, { "auxiliary_loss_clip": 0.01106054, "auxiliary_loss_mlp": 0.01028252, "balance_loss_clip": 1.01596189, "balance_loss_mlp": 1.03451931, "epoch": 0.8646324966180671, "flos": 20266402896000.0, "grad_norm": 2.3697330863388144, "language_loss": 0.75099564, "learning_rate": 1.8908430342914473e-07, "loss": 0.77233875, "num_input_tokens_seen": 310227415, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71484375, "step": 14381, "time_per_iteration": 2.4420042037963867 }, { "auxiliary_loss_clip": 0.01102744, "auxiliary_loss_mlp": 0.01031674, "balance_loss_clip": 1.02005732, "balance_loss_mlp": 1.03518033, "epoch": 0.864692619870735, "flos": 11946188050560.0, "grad_norm": 2.58951793206826, "language_loss": 0.84719336, "learning_rate": 1.8891903613043892e-07, "loss": 0.86853749, "num_input_tokens_seen": 310242625, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 14382, "time_per_iteration": 2.4311392307281494 }, { "auxiliary_loss_clip": 0.01105922, "auxiliary_loss_mlp": 0.01034512, "balance_loss_clip": 1.02125597, "balance_loss_mlp": 1.03619325, "epoch": 0.864752743123403, "flos": 21470703114240.0, "grad_norm": 3.526673668961888, "language_loss": 0.76143157, "learning_rate": 1.8875383750810504e-07, "loss": 0.7828359, "num_input_tokens_seen": 310260585, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69921875, "step": 14383, "time_per_iteration": 2.445222854614258 }, { "auxiliary_loss_clip": 0.01104629, "auxiliary_loss_mlp": 0.01029933, "balance_loss_clip": 1.01774395, "balance_loss_mlp": 1.03751004, "epoch": 0.8648128663760709, "flos": 19530018172800.0, "grad_norm": 1.8824055349719089, "language_loss": 0.85295296, "learning_rate": 1.8858870756840738e-07, "loss": 0.87429857, "num_input_tokens_seen": 310277210, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 14384, "time_per_iteration": 2.472635269165039 }, { "auxiliary_loss_clip": 0.01100847, "auxiliary_loss_mlp": 0.01028101, "balance_loss_clip": 1.01612055, "balance_loss_mlp": 1.03285289, "epoch": 0.8648729896287389, "flos": 21287953693440.0, "grad_norm": 1.8127567538924803, "language_loss": 0.81100869, "learning_rate": 1.884236463176072e-07, "loss": 0.83229816, "num_input_tokens_seen": 310296610, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 14385, "time_per_iteration": 2.4547805786132812 }, { "auxiliary_loss_clip": 0.01110324, "auxiliary_loss_mlp": 0.01030627, "balance_loss_clip": 1.01768708, "balance_loss_mlp": 1.03871143, "epoch": 0.8649331128814068, "flos": 24604483230720.0, "grad_norm": 2.4426316369288656, "language_loss": 0.72735393, "learning_rate": 1.8825865376196437e-07, "loss": 0.74876344, "num_input_tokens_seen": 310316830, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 14386, "time_per_iteration": 2.522347927093506 }, { "auxiliary_loss_clip": 0.01105042, "auxiliary_loss_mlp": 0.01031344, "balance_loss_clip": 1.019858, "balance_loss_mlp": 1.03625715, "epoch": 0.8649932361340749, "flos": 15377811742080.0, "grad_norm": 1.9143309584590813, "language_loss": 0.82211334, "learning_rate": 1.8809372990773476e-07, "loss": 0.84347725, "num_input_tokens_seen": 310334355, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 14387, "time_per_iteration": 2.416262626647949 }, { "auxiliary_loss_clip": 0.01102163, "auxiliary_loss_mlp": 0.01026389, "balance_loss_clip": 1.01460493, "balance_loss_mlp": 1.03578699, "epoch": 0.8650533593867428, "flos": 19901227276800.0, "grad_norm": 1.897096997382461, "language_loss": 0.68835402, "learning_rate": 1.8792887476117224e-07, "loss": 0.70963955, "num_input_tokens_seen": 310352900, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6640625, "step": 14388, "time_per_iteration": 2.445418357849121 }, { "auxiliary_loss_clip": 0.01102179, "auxiliary_loss_mlp": 0.01033906, "balance_loss_clip": 1.02268863, "balance_loss_mlp": 1.03642774, "epoch": 0.8651134826394108, "flos": 25626931868160.0, "grad_norm": 1.6756284624283042, "language_loss": 0.90591741, "learning_rate": 1.877640883285283e-07, "loss": 0.92727828, "num_input_tokens_seen": 310372855, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.65625, "step": 14389, "time_per_iteration": 2.5010979175567627 }, { "auxiliary_loss_clip": 0.01102637, "auxiliary_loss_mlp": 0.01031248, "balance_loss_clip": 1.01964951, "balance_loss_mlp": 1.03570414, "epoch": 0.8651736058920788, "flos": 18734525619840.0, "grad_norm": 1.7225982387216234, "language_loss": 0.71293235, "learning_rate": 1.8759937061605212e-07, "loss": 0.73427129, "num_input_tokens_seen": 310391595, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66796875, "step": 14390, "time_per_iteration": 2.4623730182647705 }, { "auxiliary_loss_clip": 0.01106108, "auxiliary_loss_mlp": 0.0103617, "balance_loss_clip": 1.02338529, "balance_loss_mlp": 1.03588343, "epoch": 0.8652337291447467, "flos": 20776765288320.0, "grad_norm": 1.7804637899446338, "language_loss": 0.82320744, "learning_rate": 1.8743472162998941e-07, "loss": 0.84463024, "num_input_tokens_seen": 310410090, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 14391, "time_per_iteration": 2.4571731090545654 }, { "auxiliary_loss_clip": 0.01029049, "auxiliary_loss_mlp": 0.00999491, "balance_loss_clip": 0.99850792, "balance_loss_mlp": 1.0070684, "epoch": 0.8652938523974147, "flos": 64227887464320.0, "grad_norm": 0.8076211606689742, "language_loss": 0.67960525, "learning_rate": 1.8727014137658337e-07, "loss": 0.69989073, "num_input_tokens_seen": 310470055, "router_z_loss_clip": 0.00982666, "router_z_loss_mlp": 0.21972656, "step": 14392, "time_per_iteration": 2.9919233322143555 }, { "auxiliary_loss_clip": 0.01107792, "auxiliary_loss_mlp": 0.01030375, "balance_loss_clip": 1.01720178, "balance_loss_mlp": 1.03545117, "epoch": 0.8653539756500827, "flos": 18040587793920.0, "grad_norm": 5.2427920301828586, "language_loss": 0.75717378, "learning_rate": 1.8710562986207523e-07, "loss": 0.77855539, "num_input_tokens_seen": 310487665, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 14393, "time_per_iteration": 2.4296715259552 }, { "auxiliary_loss_clip": 0.01104565, "auxiliary_loss_mlp": 0.01032446, "balance_loss_clip": 1.02012575, "balance_loss_mlp": 1.03403461, "epoch": 0.8654140989027507, "flos": 17382416935680.0, "grad_norm": 2.062608464879802, "language_loss": 0.73737431, "learning_rate": 1.8694118709270357e-07, "loss": 0.75874442, "num_input_tokens_seen": 310506130, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 14394, "time_per_iteration": 2.482370138168335 }, { "auxiliary_loss_clip": 0.01107395, "auxiliary_loss_mlp": 0.01029574, "balance_loss_clip": 1.01665175, "balance_loss_mlp": 1.03658903, "epoch": 0.8654742221554186, "flos": 53284862448000.0, "grad_norm": 2.0256005315050527, "language_loss": 0.65433651, "learning_rate": 1.867768130747036e-07, "loss": 0.67570615, "num_input_tokens_seen": 310532445, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 14395, "time_per_iteration": 2.8143155574798584 }, { "auxiliary_loss_clip": 0.01103559, "auxiliary_loss_mlp": 0.01032773, "balance_loss_clip": 1.02042294, "balance_loss_mlp": 1.03541112, "epoch": 0.8655343454080866, "flos": 23914711382400.0, "grad_norm": 1.7606082184086695, "language_loss": 0.67952037, "learning_rate": 1.8661250781430838e-07, "loss": 0.70088375, "num_input_tokens_seen": 310552300, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.68359375, "step": 14396, "time_per_iteration": 2.518637180328369 }, { "auxiliary_loss_clip": 0.01109236, "auxiliary_loss_mlp": 0.0103382, "balance_loss_clip": 1.02093351, "balance_loss_mlp": 1.03806543, "epoch": 0.8655944686607545, "flos": 24097209408000.0, "grad_norm": 2.628351828915589, "language_loss": 0.69123375, "learning_rate": 1.8644827131774954e-07, "loss": 0.71266437, "num_input_tokens_seen": 310572710, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 14397, "time_per_iteration": 2.5130021572113037 }, { "auxiliary_loss_clip": 0.0110269, "auxiliary_loss_mlp": 0.01030103, "balance_loss_clip": 1.01855183, "balance_loss_mlp": 1.03382444, "epoch": 0.8656545919134225, "flos": 23112718467840.0, "grad_norm": 2.3964214116005613, "language_loss": 0.63343513, "learning_rate": 1.86284103591253e-07, "loss": 0.65476304, "num_input_tokens_seen": 310592460, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 14398, "time_per_iteration": 2.4726145267486572 }, { "auxiliary_loss_clip": 0.01106074, "auxiliary_loss_mlp": 0.01030673, "balance_loss_clip": 1.01857948, "balance_loss_mlp": 1.03702354, "epoch": 0.8657147151660904, "flos": 21141761339520.0, "grad_norm": 1.988857771541082, "language_loss": 0.76128179, "learning_rate": 1.8612000464104517e-07, "loss": 0.78264928, "num_input_tokens_seen": 310609375, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 14399, "time_per_iteration": 2.4424972534179688 }, { "auxiliary_loss_clip": 0.01103065, "auxiliary_loss_mlp": 0.01027831, "balance_loss_clip": 1.01622009, "balance_loss_mlp": 1.03501582, "epoch": 0.8657748384187585, "flos": 16289439943680.0, "grad_norm": 2.187722888750652, "language_loss": 0.93167162, "learning_rate": 1.8595597447334855e-07, "loss": 0.95298058, "num_input_tokens_seen": 310627405, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 14400, "time_per_iteration": 2.4396872520446777 }, { "auxiliary_loss_clip": 0.01106394, "auxiliary_loss_mlp": 0.01034534, "balance_loss_clip": 1.02245235, "balance_loss_mlp": 1.0363034, "epoch": 0.8658349616714264, "flos": 30843890179200.0, "grad_norm": 3.008402071332485, "language_loss": 0.67553926, "learning_rate": 1.8579201309438353e-07, "loss": 0.69694859, "num_input_tokens_seen": 310649945, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69921875, "step": 14401, "time_per_iteration": 2.5114943981170654 }, { "auxiliary_loss_clip": 0.01105861, "auxiliary_loss_mlp": 0.01028051, "balance_loss_clip": 1.015558, "balance_loss_mlp": 1.03464699, "epoch": 0.8658950849240944, "flos": 18952862440320.0, "grad_norm": 2.311624473376312, "language_loss": 0.74401909, "learning_rate": 1.8562812051036714e-07, "loss": 0.76535821, "num_input_tokens_seen": 310668285, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 14402, "time_per_iteration": 2.4479823112487793 }, { "auxiliary_loss_clip": 0.011029, "auxiliary_loss_mlp": 0.01031309, "balance_loss_clip": 1.01959658, "balance_loss_mlp": 1.03557682, "epoch": 0.8659552081767624, "flos": 23364344217600.0, "grad_norm": 1.8200991321727782, "language_loss": 0.74924994, "learning_rate": 1.8546429672751397e-07, "loss": 0.77059197, "num_input_tokens_seen": 310687015, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 14403, "time_per_iteration": 2.485586643218994 }, { "auxiliary_loss_clip": 0.01106951, "auxiliary_loss_mlp": 0.0103298, "balance_loss_clip": 1.02004576, "balance_loss_mlp": 1.03691149, "epoch": 0.8660153314294303, "flos": 23841992298240.0, "grad_norm": 1.828547780782638, "language_loss": 0.73273551, "learning_rate": 1.853005417520368e-07, "loss": 0.75413477, "num_input_tokens_seen": 310707580, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 14404, "time_per_iteration": 2.482525110244751 }, { "auxiliary_loss_clip": 0.0110308, "auxiliary_loss_mlp": 0.01034688, "balance_loss_clip": 1.02235568, "balance_loss_mlp": 1.03597653, "epoch": 0.8660754546820983, "flos": 23112467072640.0, "grad_norm": 2.1932823795388523, "language_loss": 0.71070659, "learning_rate": 1.851368555901447e-07, "loss": 0.73208427, "num_input_tokens_seen": 310727300, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 14405, "time_per_iteration": 2.4650673866271973 }, { "auxiliary_loss_clip": 0.01106385, "auxiliary_loss_mlp": 0.0103275, "balance_loss_clip": 1.02034068, "balance_loss_mlp": 1.03516483, "epoch": 0.8661355779347663, "flos": 14391991998720.0, "grad_norm": 1.739102152379136, "language_loss": 0.6611743, "learning_rate": 1.8497323824804467e-07, "loss": 0.68256563, "num_input_tokens_seen": 310744935, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 14406, "time_per_iteration": 2.4627437591552734 }, { "auxiliary_loss_clip": 0.01103154, "auxiliary_loss_mlp": 0.01025613, "balance_loss_clip": 1.01414514, "balance_loss_mlp": 1.0345124, "epoch": 0.8661957011874343, "flos": 21870137329920.0, "grad_norm": 1.7537379237827775, "language_loss": 0.82781112, "learning_rate": 1.8480968973194177e-07, "loss": 0.84909874, "num_input_tokens_seen": 310765085, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6875, "step": 14407, "time_per_iteration": 2.439423084259033 }, { "auxiliary_loss_clip": 0.01103851, "auxiliary_loss_mlp": 0.01036623, "balance_loss_clip": 1.02475584, "balance_loss_mlp": 1.03610313, "epoch": 0.8662558244401022, "flos": 21835160461440.0, "grad_norm": 1.7772895468091126, "language_loss": 0.70409358, "learning_rate": 1.8464621004803748e-07, "loss": 0.72549832, "num_input_tokens_seen": 310783260, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.67578125, "step": 14408, "time_per_iteration": 2.465693712234497 }, { "auxiliary_loss_clip": 0.01098667, "auxiliary_loss_mlp": 0.0103038, "balance_loss_clip": 1.01909685, "balance_loss_mlp": 1.03349853, "epoch": 0.8663159476927702, "flos": 17384104874880.0, "grad_norm": 2.1201419526223537, "language_loss": 0.77424705, "learning_rate": 1.844827992025304e-07, "loss": 0.79553747, "num_input_tokens_seen": 310801970, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.65234375, "step": 14409, "time_per_iteration": 2.4217662811279297 }, { "auxiliary_loss_clip": 0.01109187, "auxiliary_loss_mlp": 0.01031617, "balance_loss_clip": 1.01809883, "balance_loss_mlp": 1.03850031, "epoch": 0.8663760709454381, "flos": 22747722416640.0, "grad_norm": 1.7823611656697793, "language_loss": 0.76962399, "learning_rate": 1.8431945720161757e-07, "loss": 0.79103208, "num_input_tokens_seen": 310822070, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.70703125, "step": 14410, "time_per_iteration": 2.491224527359009 }, { "auxiliary_loss_clip": 0.01104383, "auxiliary_loss_mlp": 0.01029976, "balance_loss_clip": 1.01729822, "balance_loss_mlp": 1.03495574, "epoch": 0.8664361941981061, "flos": 17376850327680.0, "grad_norm": 2.451850192099789, "language_loss": 0.77493942, "learning_rate": 1.8415618405149315e-07, "loss": 0.79628301, "num_input_tokens_seen": 310838355, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 14411, "time_per_iteration": 2.4413952827453613 }, { "auxiliary_loss_clip": 0.01100058, "auxiliary_loss_mlp": 0.01034746, "balance_loss_clip": 1.02327824, "balance_loss_mlp": 1.0320344, "epoch": 0.866496317450774, "flos": 16034438315520.0, "grad_norm": 1.8123026497200319, "language_loss": 0.73907995, "learning_rate": 1.8399297975834794e-07, "loss": 0.76042795, "num_input_tokens_seen": 310856055, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6796875, "step": 14412, "time_per_iteration": 2.4249157905578613 }, { "auxiliary_loss_clip": 0.01099671, "auxiliary_loss_mlp": 0.01026311, "balance_loss_clip": 1.01536751, "balance_loss_mlp": 1.03367674, "epoch": 0.8665564407034421, "flos": 20814830726400.0, "grad_norm": 2.10402134395029, "language_loss": 0.69479525, "learning_rate": 1.83829844328371e-07, "loss": 0.71605504, "num_input_tokens_seen": 310876695, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6640625, "step": 14413, "time_per_iteration": 2.4724907875061035 }, { "auxiliary_loss_clip": 0.0110413, "auxiliary_loss_mlp": 0.01031675, "balance_loss_clip": 1.01906228, "balance_loss_mlp": 1.03558826, "epoch": 0.86661656395611, "flos": 15815167741440.0, "grad_norm": 3.682233672773843, "language_loss": 0.63104022, "learning_rate": 1.8366677776774874e-07, "loss": 0.65239823, "num_input_tokens_seen": 310893880, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 14414, "time_per_iteration": 2.4298253059387207 }, { "auxiliary_loss_clip": 0.01105229, "auxiliary_loss_mlp": 0.01032679, "balance_loss_clip": 1.0205555, "balance_loss_mlp": 1.03659844, "epoch": 0.866676687208778, "flos": 23036910814080.0, "grad_norm": 1.6616847428497001, "language_loss": 0.63959789, "learning_rate": 1.8350378008266377e-07, "loss": 0.66097701, "num_input_tokens_seen": 310914145, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 14415, "time_per_iteration": 3.972308397293091 }, { "auxiliary_loss_clip": 0.01028973, "auxiliary_loss_mlp": 0.00999686, "balance_loss_clip": 0.99872607, "balance_loss_mlp": 1.00698805, "epoch": 0.866736810461446, "flos": 63802275212160.0, "grad_norm": 0.8016000967065423, "language_loss": 0.60420024, "learning_rate": 1.8334085127929754e-07, "loss": 0.6244868, "num_input_tokens_seen": 310972825, "router_z_loss_clip": 0.00958252, "router_z_loss_mlp": 0.22070312, "step": 14416, "time_per_iteration": 3.1418285369873047 }, { "auxiliary_loss_clip": 0.01107153, "auxiliary_loss_mlp": 0.01033076, "balance_loss_clip": 1.0200702, "balance_loss_mlp": 1.03504825, "epoch": 0.8667969337141139, "flos": 20449367798400.0, "grad_norm": 3.587998277566523, "language_loss": 0.74431276, "learning_rate": 1.831779913638285e-07, "loss": 0.76571506, "num_input_tokens_seen": 310992050, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 14417, "time_per_iteration": 2.4510326385498047 }, { "auxiliary_loss_clip": 0.01102938, "auxiliary_loss_mlp": 0.01036153, "balance_loss_clip": 1.02397001, "balance_loss_mlp": 1.03460503, "epoch": 0.866857056966782, "flos": 21653703930240.0, "grad_norm": 1.6830482476815825, "language_loss": 0.75288594, "learning_rate": 1.830152003424319e-07, "loss": 0.77427685, "num_input_tokens_seen": 311011105, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.68359375, "step": 14418, "time_per_iteration": 3.8366401195526123 }, { "auxiliary_loss_clip": 0.01104442, "auxiliary_loss_mlp": 0.01032398, "balance_loss_clip": 1.02024412, "balance_loss_mlp": 1.03561509, "epoch": 0.8669171802194499, "flos": 22852832590080.0, "grad_norm": 1.5188614781121768, "language_loss": 0.67968869, "learning_rate": 1.8285247822128126e-07, "loss": 0.70105708, "num_input_tokens_seen": 311032080, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 14419, "time_per_iteration": 2.5057992935180664 }, { "auxiliary_loss_clip": 0.01105205, "auxiliary_loss_mlp": 0.01031103, "balance_loss_clip": 1.01967692, "balance_loss_mlp": 1.03549588, "epoch": 0.8669773034721179, "flos": 18734166483840.0, "grad_norm": 3.530010626419939, "language_loss": 0.78733325, "learning_rate": 1.826898250065465e-07, "loss": 0.80869633, "num_input_tokens_seen": 311049735, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6953125, "step": 14420, "time_per_iteration": 3.849677801132202 }, { "auxiliary_loss_clip": 0.01103986, "auxiliary_loss_mlp": 0.01028631, "balance_loss_clip": 1.01682949, "balance_loss_mlp": 1.03603959, "epoch": 0.8670374267247858, "flos": 18916018064640.0, "grad_norm": 1.7340857100532583, "language_loss": 0.8372373, "learning_rate": 1.8252724070439586e-07, "loss": 0.85856342, "num_input_tokens_seen": 311067675, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 14421, "time_per_iteration": 2.43774676322937 }, { "auxiliary_loss_clip": 0.01028821, "auxiliary_loss_mlp": 0.01003098, "balance_loss_clip": 1.00206685, "balance_loss_mlp": 1.00662804, "epoch": 0.8670975499774538, "flos": 48814527214080.0, "grad_norm": 0.7117700902083346, "language_loss": 0.49061948, "learning_rate": 1.823647253209941e-07, "loss": 0.51093864, "num_input_tokens_seen": 311126605, "router_z_loss_clip": 0.01031494, "router_z_loss_mlp": 0.22265625, "step": 14422, "time_per_iteration": 4.548937559127808 }, { "auxiliary_loss_clip": 0.01103895, "auxiliary_loss_mlp": 0.01028772, "balance_loss_clip": 1.01698267, "balance_loss_mlp": 1.03573632, "epoch": 0.8671576732301217, "flos": 26136145025280.0, "grad_norm": 1.7309079112652677, "language_loss": 0.73801482, "learning_rate": 1.8220227886250417e-07, "loss": 0.75934148, "num_input_tokens_seen": 311147325, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 14423, "time_per_iteration": 2.508481979370117 }, { "auxiliary_loss_clip": 0.01098466, "auxiliary_loss_mlp": 0.01025332, "balance_loss_clip": 1.01419795, "balance_loss_mlp": 1.03306508, "epoch": 0.8672177964827897, "flos": 18367446579840.0, "grad_norm": 2.20104060155229, "language_loss": 0.77172148, "learning_rate": 1.8203990133508684e-07, "loss": 0.79295951, "num_input_tokens_seen": 311165385, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.65625, "step": 14424, "time_per_iteration": 2.4191079139709473 }, { "auxiliary_loss_clip": 0.01099919, "auxiliary_loss_mlp": 0.01032226, "balance_loss_clip": 1.0211637, "balance_loss_mlp": 1.03517878, "epoch": 0.8672779197354576, "flos": 28545355992960.0, "grad_norm": 3.0493616002178463, "language_loss": 0.71445906, "learning_rate": 1.8187759274489767e-07, "loss": 0.73578048, "num_input_tokens_seen": 311185860, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.6484375, "step": 14425, "time_per_iteration": 2.5450334548950195 }, { "auxiliary_loss_clip": 0.01107226, "auxiliary_loss_mlp": 0.01031862, "balance_loss_clip": 1.01898706, "balance_loss_mlp": 1.03666222, "epoch": 0.8673380429881257, "flos": 22382474970240.0, "grad_norm": 1.6851357327551808, "language_loss": 0.68082237, "learning_rate": 1.817153530980926e-07, "loss": 0.70221329, "num_input_tokens_seen": 311205810, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 14426, "time_per_iteration": 2.4561331272125244 }, { "auxiliary_loss_clip": 0.01105574, "auxiliary_loss_mlp": 0.01029675, "balance_loss_clip": 1.01693106, "balance_loss_mlp": 1.03589368, "epoch": 0.8673981662407936, "flos": 20996430912000.0, "grad_norm": 1.8750378235800933, "language_loss": 0.70991528, "learning_rate": 1.815531824008234e-07, "loss": 0.73126781, "num_input_tokens_seen": 311226080, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 14427, "time_per_iteration": 2.504112720489502 }, { "auxiliary_loss_clip": 0.01105263, "auxiliary_loss_mlp": 0.01028554, "balance_loss_clip": 1.01661563, "balance_loss_mlp": 1.03643692, "epoch": 0.8674582894934616, "flos": 24426797627520.0, "grad_norm": 1.7333982988668641, "language_loss": 0.68215942, "learning_rate": 1.8139108065924004e-07, "loss": 0.70349753, "num_input_tokens_seen": 311246380, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 14428, "time_per_iteration": 2.5061421394348145 }, { "auxiliary_loss_clip": 0.01103242, "auxiliary_loss_mlp": 0.01030082, "balance_loss_clip": 1.01853621, "balance_loss_mlp": 1.03498793, "epoch": 0.8675184127461296, "flos": 20737514701440.0, "grad_norm": 1.9576799149461384, "language_loss": 0.71044624, "learning_rate": 1.812290478794889e-07, "loss": 0.73177946, "num_input_tokens_seen": 311266465, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.68359375, "step": 14429, "time_per_iteration": 2.489757776260376 }, { "auxiliary_loss_clip": 0.01104608, "auxiliary_loss_mlp": 0.01027456, "balance_loss_clip": 1.01529706, "balance_loss_mlp": 1.03588128, "epoch": 0.8675785359987975, "flos": 19135647774720.0, "grad_norm": 1.8648177742610514, "language_loss": 0.6668638, "learning_rate": 1.810670840677151e-07, "loss": 0.68818438, "num_input_tokens_seen": 311285075, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 14430, "time_per_iteration": 2.435558795928955 }, { "auxiliary_loss_clip": 0.01107161, "auxiliary_loss_mlp": 0.01035011, "balance_loss_clip": 1.02180862, "balance_loss_mlp": 1.03623927, "epoch": 0.8676386592514655, "flos": 22710662559360.0, "grad_norm": 2.2328342807553483, "language_loss": 0.6967864, "learning_rate": 1.8090518923005948e-07, "loss": 0.71820813, "num_input_tokens_seen": 311303230, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 14431, "time_per_iteration": 2.4926114082336426 }, { "auxiliary_loss_clip": 0.01105909, "auxiliary_loss_mlp": 0.01038379, "balance_loss_clip": 1.02577281, "balance_loss_mlp": 1.03662109, "epoch": 0.8676987825041335, "flos": 14209853109120.0, "grad_norm": 2.6083262368361626, "language_loss": 0.6406672, "learning_rate": 1.8074336337266116e-07, "loss": 0.66211009, "num_input_tokens_seen": 311318070, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 14432, "time_per_iteration": 2.428551435470581 }, { "auxiliary_loss_clip": 0.01104704, "auxiliary_loss_mlp": 0.01036431, "balance_loss_clip": 1.02520108, "balance_loss_mlp": 1.03600502, "epoch": 0.8677589057568015, "flos": 13589927256960.0, "grad_norm": 13.93020003564486, "language_loss": 0.78541791, "learning_rate": 1.8058160650165656e-07, "loss": 0.80682921, "num_input_tokens_seen": 311334885, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6875, "step": 14433, "time_per_iteration": 2.4549665451049805 }, { "auxiliary_loss_clip": 0.01028784, "auxiliary_loss_mlp": 0.01001035, "balance_loss_clip": 1.0000037, "balance_loss_mlp": 1.00678301, "epoch": 0.8678190290094694, "flos": 68933657370240.0, "grad_norm": 0.7203471842956939, "language_loss": 0.58434844, "learning_rate": 1.804199186231805e-07, "loss": 0.60464668, "num_input_tokens_seen": 311399780, "router_z_loss_clip": 0.01031494, "router_z_loss_mlp": 0.22070312, "step": 14434, "time_per_iteration": 3.1690890789031982 }, { "auxiliary_loss_clip": 0.01101946, "auxiliary_loss_mlp": 0.01033986, "balance_loss_clip": 1.02275026, "balance_loss_mlp": 1.0353086, "epoch": 0.8678791522621374, "flos": 32557726776960.0, "grad_norm": 1.9437151731959796, "language_loss": 0.79955477, "learning_rate": 1.802582997433628e-07, "loss": 0.82091403, "num_input_tokens_seen": 311419610, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6640625, "step": 14435, "time_per_iteration": 2.5497140884399414 }, { "auxiliary_loss_clip": 0.01104597, "auxiliary_loss_mlp": 0.0102946, "balance_loss_clip": 1.01728892, "balance_loss_mlp": 1.03370476, "epoch": 0.8679392755148053, "flos": 35042637657600.0, "grad_norm": 2.5346385275430006, "language_loss": 0.62453759, "learning_rate": 1.8009674986833322e-07, "loss": 0.6458782, "num_input_tokens_seen": 311440045, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.7109375, "step": 14436, "time_per_iteration": 2.5591182708740234 }, { "auxiliary_loss_clip": 0.01106023, "auxiliary_loss_mlp": 0.01027848, "balance_loss_clip": 1.01481843, "balance_loss_mlp": 1.03690004, "epoch": 0.8679993987674733, "flos": 18552494471040.0, "grad_norm": 2.180297895454664, "language_loss": 0.70662075, "learning_rate": 1.7993526900421706e-07, "loss": 0.72795939, "num_input_tokens_seen": 311456660, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69140625, "step": 14437, "time_per_iteration": 2.483703851699829 }, { "auxiliary_loss_clip": 0.01104389, "auxiliary_loss_mlp": 0.01029417, "balance_loss_clip": 1.01682258, "balance_loss_mlp": 1.03612673, "epoch": 0.8680595220201412, "flos": 27454390162560.0, "grad_norm": 2.0193750658712317, "language_loss": 0.80643022, "learning_rate": 1.797738571571381e-07, "loss": 0.82776833, "num_input_tokens_seen": 311475460, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 14438, "time_per_iteration": 2.5133285522460938 }, { "auxiliary_loss_clip": 0.01099295, "auxiliary_loss_mlp": 0.01025143, "balance_loss_clip": 1.01335311, "balance_loss_mlp": 1.03306913, "epoch": 0.8681196452728093, "flos": 19208797822080.0, "grad_norm": 1.8600708953787917, "language_loss": 0.67383432, "learning_rate": 1.7961251433321656e-07, "loss": 0.69507873, "num_input_tokens_seen": 311494575, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.66015625, "step": 14439, "time_per_iteration": 2.5198822021484375 }, { "auxiliary_loss_clip": 0.01102928, "auxiliary_loss_mlp": 0.01033312, "balance_loss_clip": 1.02180243, "balance_loss_mlp": 1.03450251, "epoch": 0.8681797685254772, "flos": 37560442417920.0, "grad_norm": 1.6248210232165805, "language_loss": 0.64114755, "learning_rate": 1.7945124053857085e-07, "loss": 0.66250998, "num_input_tokens_seen": 311515805, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.68359375, "step": 14440, "time_per_iteration": 2.582707405090332 }, { "auxiliary_loss_clip": 0.01102203, "auxiliary_loss_mlp": 0.01033718, "balance_loss_clip": 1.02115321, "balance_loss_mlp": 1.03602719, "epoch": 0.8682398917781452, "flos": 23289937194240.0, "grad_norm": 1.7168217009510642, "language_loss": 0.657166, "learning_rate": 1.7929003577931722e-07, "loss": 0.67852527, "num_input_tokens_seen": 311536000, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6640625, "step": 14441, "time_per_iteration": 2.472355842590332 }, { "auxiliary_loss_clip": 0.01102823, "auxiliary_loss_mlp": 0.01025807, "balance_loss_clip": 1.01482749, "balance_loss_mlp": 1.03650951, "epoch": 0.8683000150308132, "flos": 21872794936320.0, "grad_norm": 1.8533985380058118, "language_loss": 0.66397566, "learning_rate": 1.7912890006156722e-07, "loss": 0.68526196, "num_input_tokens_seen": 311556220, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.6640625, "step": 14442, "time_per_iteration": 2.4663689136505127 }, { "auxiliary_loss_clip": 0.0110762, "auxiliary_loss_mlp": 0.010305, "balance_loss_clip": 1.0172143, "balance_loss_mlp": 1.03632283, "epoch": 0.8683601382834811, "flos": 14647209108480.0, "grad_norm": 1.9545488662297619, "language_loss": 0.72138518, "learning_rate": 1.7896783339143195e-07, "loss": 0.74276638, "num_input_tokens_seen": 311572530, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 14443, "time_per_iteration": 2.468099355697632 }, { "auxiliary_loss_clip": 0.01105145, "auxiliary_loss_mlp": 0.0102963, "balance_loss_clip": 1.01730943, "balance_loss_mlp": 1.03576684, "epoch": 0.8684202615361492, "flos": 26359904799360.0, "grad_norm": 2.7959691035239356, "language_loss": 0.83444548, "learning_rate": 1.7880683577501877e-07, "loss": 0.85579324, "num_input_tokens_seen": 311591105, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 14444, "time_per_iteration": 2.4862542152404785 }, { "auxiliary_loss_clip": 0.01105339, "auxiliary_loss_mlp": 0.0103049, "balance_loss_clip": 1.01824701, "balance_loss_mlp": 1.03579307, "epoch": 0.8684803847888171, "flos": 20704010290560.0, "grad_norm": 2.341596574212275, "language_loss": 0.77459157, "learning_rate": 1.7864590721843342e-07, "loss": 0.79594994, "num_input_tokens_seen": 311608350, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 14445, "time_per_iteration": 2.451247453689575 }, { "auxiliary_loss_clip": 0.01106741, "auxiliary_loss_mlp": 0.01029321, "balance_loss_clip": 1.01678598, "balance_loss_mlp": 1.03691256, "epoch": 0.8685405080414851, "flos": 22638123043200.0, "grad_norm": 2.057019075039856, "language_loss": 0.67641717, "learning_rate": 1.7848504772777728e-07, "loss": 0.69777775, "num_input_tokens_seen": 311626380, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 14446, "time_per_iteration": 2.4514055252075195 }, { "auxiliary_loss_clip": 0.01104532, "auxiliary_loss_mlp": 0.01036101, "balance_loss_clip": 1.0235486, "balance_loss_mlp": 1.03604794, "epoch": 0.868600631294153, "flos": 24822065865600.0, "grad_norm": 1.738243708805368, "language_loss": 0.82906461, "learning_rate": 1.7832425730915102e-07, "loss": 0.8504709, "num_input_tokens_seen": 311644345, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 14447, "time_per_iteration": 2.4984309673309326 }, { "auxiliary_loss_clip": 0.01103622, "auxiliary_loss_mlp": 0.01029872, "balance_loss_clip": 1.01829088, "balance_loss_mlp": 1.03532052, "epoch": 0.868660754546821, "flos": 25113983696640.0, "grad_norm": 2.4898283573264606, "language_loss": 0.7421909, "learning_rate": 1.781635359686515e-07, "loss": 0.76352584, "num_input_tokens_seen": 311663340, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.68359375, "step": 14448, "time_per_iteration": 2.4777913093566895 }, { "auxiliary_loss_clip": 0.01104449, "auxiliary_loss_mlp": 0.01030362, "balance_loss_clip": 1.0177083, "balance_loss_mlp": 1.03520215, "epoch": 0.8687208777994889, "flos": 12677832178560.0, "grad_norm": 2.2399168102806404, "language_loss": 0.80580145, "learning_rate": 1.7800288371237303e-07, "loss": 0.82714957, "num_input_tokens_seen": 311679860, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 14449, "time_per_iteration": 2.4371166229248047 }, { "auxiliary_loss_clip": 0.01028585, "auxiliary_loss_mlp": 0.01002217, "balance_loss_clip": 1.0011735, "balance_loss_mlp": 1.00642967, "epoch": 0.8687810010521569, "flos": 65617235573760.0, "grad_norm": 0.8442256120854295, "language_loss": 0.6057266, "learning_rate": 1.7784230054640758e-07, "loss": 0.62603462, "num_input_tokens_seen": 311738135, "router_z_loss_clip": 0.01043701, "router_z_loss_mlp": 0.22167969, "step": 14450, "time_per_iteration": 2.998913526535034 }, { "auxiliary_loss_clip": 0.01105784, "auxiliary_loss_mlp": 0.01028575, "balance_loss_clip": 1.01636815, "balance_loss_mlp": 1.03566921, "epoch": 0.8688411243048249, "flos": 24244012293120.0, "grad_norm": 1.5902014819408063, "language_loss": 0.76359707, "learning_rate": 1.7768178647684517e-07, "loss": 0.78494072, "num_input_tokens_seen": 311756975, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 14451, "time_per_iteration": 2.511709213256836 }, { "auxiliary_loss_clip": 0.01102076, "auxiliary_loss_mlp": 0.01027241, "balance_loss_clip": 1.01537395, "balance_loss_mlp": 1.03446341, "epoch": 0.8689012475574929, "flos": 18221828843520.0, "grad_norm": 2.545208644711291, "language_loss": 0.72473222, "learning_rate": 1.7752134150977205e-07, "loss": 0.74602532, "num_input_tokens_seen": 311771830, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.67578125, "step": 14452, "time_per_iteration": 2.4076459407806396 }, { "auxiliary_loss_clip": 0.01107032, "auxiliary_loss_mlp": 0.01033284, "balance_loss_clip": 1.02029574, "balance_loss_mlp": 1.03670883, "epoch": 0.8689613708101608, "flos": 19646728439040.0, "grad_norm": 2.1602154059231844, "language_loss": 0.72463483, "learning_rate": 1.7736096565127201e-07, "loss": 0.74603796, "num_input_tokens_seen": 311790130, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 14453, "time_per_iteration": 2.4952516555786133 }, { "auxiliary_loss_clip": 0.01105059, "auxiliary_loss_mlp": 0.01036026, "balance_loss_clip": 1.02358079, "balance_loss_mlp": 1.0373857, "epoch": 0.8690214940628288, "flos": 11728749070080.0, "grad_norm": 2.1347029845111063, "language_loss": 0.73893428, "learning_rate": 1.7720065890742664e-07, "loss": 0.7603451, "num_input_tokens_seen": 311808360, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6796875, "step": 14454, "time_per_iteration": 2.422600507736206 }, { "auxiliary_loss_clip": 0.01105116, "auxiliary_loss_mlp": 0.01030276, "balance_loss_clip": 1.01805115, "balance_loss_mlp": 1.03720558, "epoch": 0.8690816173154968, "flos": 34936450076160.0, "grad_norm": 2.7653011720416463, "language_loss": 0.59572083, "learning_rate": 1.7704042128431552e-07, "loss": 0.61707473, "num_input_tokens_seen": 311831325, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6796875, "step": 14455, "time_per_iteration": 2.630646228790283 }, { "auxiliary_loss_clip": 0.01105289, "auxiliary_loss_mlp": 0.01029406, "balance_loss_clip": 1.0172224, "balance_loss_mlp": 1.03470731, "epoch": 0.8691417405681647, "flos": 11614804151040.0, "grad_norm": 2.012658943993169, "language_loss": 0.8047024, "learning_rate": 1.7688025278801378e-07, "loss": 0.82604939, "num_input_tokens_seen": 311848090, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.70703125, "step": 14456, "time_per_iteration": 2.4233081340789795 }, { "auxiliary_loss_clip": 0.01109892, "auxiliary_loss_mlp": 0.01033255, "balance_loss_clip": 1.01980877, "balance_loss_mlp": 1.03781104, "epoch": 0.8692018638208328, "flos": 24608038677120.0, "grad_norm": 2.2165588231495876, "language_loss": 0.74481517, "learning_rate": 1.7672015342459568e-07, "loss": 0.76624668, "num_input_tokens_seen": 311867855, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 14457, "time_per_iteration": 3.8859262466430664 }, { "auxiliary_loss_clip": 0.01102272, "auxiliary_loss_mlp": 0.01028591, "balance_loss_clip": 1.01711071, "balance_loss_mlp": 1.03572214, "epoch": 0.8692619870735007, "flos": 25995124229760.0, "grad_norm": 1.5583177308548293, "language_loss": 0.78692853, "learning_rate": 1.765601232001328e-07, "loss": 0.80823714, "num_input_tokens_seen": 311888675, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.66796875, "step": 14458, "time_per_iteration": 2.498461961746216 }, { "auxiliary_loss_clip": 0.011034, "auxiliary_loss_mlp": 0.0103381, "balance_loss_clip": 1.02107859, "balance_loss_mlp": 1.03520131, "epoch": 0.8693221103261687, "flos": 18041808856320.0, "grad_norm": 1.9637276209155043, "language_loss": 0.71103531, "learning_rate": 1.7640016212069187e-07, "loss": 0.73240745, "num_input_tokens_seen": 311907310, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 14459, "time_per_iteration": 2.439218044281006 }, { "auxiliary_loss_clip": 0.01098911, "auxiliary_loss_mlp": 0.01030152, "balance_loss_clip": 1.01911891, "balance_loss_mlp": 1.03437591, "epoch": 0.8693822335788366, "flos": 27492347859840.0, "grad_norm": 1.5012418506400007, "language_loss": 0.73651874, "learning_rate": 1.762402701923398e-07, "loss": 0.75780934, "num_input_tokens_seen": 311929635, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.64453125, "step": 14460, "time_per_iteration": 3.895051956176758 }, { "auxiliary_loss_clip": 0.01108376, "auxiliary_loss_mlp": 0.0103254, "balance_loss_clip": 1.0199399, "balance_loss_mlp": 1.0368681, "epoch": 0.8694423568315046, "flos": 24097712198400.0, "grad_norm": 1.8834494107707387, "language_loss": 0.64702207, "learning_rate": 1.7608044742113947e-07, "loss": 0.66843122, "num_input_tokens_seen": 311948800, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 14461, "time_per_iteration": 2.47914719581604 }, { "auxiliary_loss_clip": 0.01102351, "auxiliary_loss_mlp": 0.01034265, "balance_loss_clip": 1.02127731, "balance_loss_mlp": 1.03327894, "epoch": 0.8695024800841725, "flos": 18362131367040.0, "grad_norm": 2.1743296675928594, "language_loss": 0.82702273, "learning_rate": 1.7592069381315123e-07, "loss": 0.84838891, "num_input_tokens_seen": 311964090, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6875, "step": 14462, "time_per_iteration": 3.798651933670044 }, { "auxiliary_loss_clip": 0.01104672, "auxiliary_loss_mlp": 0.01031283, "balance_loss_clip": 1.01849818, "balance_loss_mlp": 1.0354594, "epoch": 0.8695626033368405, "flos": 14027750133120.0, "grad_norm": 1.9874776940355954, "language_loss": 0.65420997, "learning_rate": 1.757610093744335e-07, "loss": 0.67556947, "num_input_tokens_seen": 311981460, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 14463, "time_per_iteration": 2.4453823566436768 }, { "auxiliary_loss_clip": 0.01109425, "auxiliary_loss_mlp": 0.01035989, "balance_loss_clip": 1.02288175, "balance_loss_mlp": 1.03785634, "epoch": 0.8696227265895085, "flos": 16836862193280.0, "grad_norm": 2.3667208829442106, "language_loss": 0.66943121, "learning_rate": 1.7560139411104058e-07, "loss": 0.69088531, "num_input_tokens_seen": 312000115, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 14464, "time_per_iteration": 3.870720624923706 }, { "auxiliary_loss_clip": 0.01108234, "auxiliary_loss_mlp": 0.01033175, "balance_loss_clip": 1.02011538, "balance_loss_mlp": 1.03588033, "epoch": 0.8696828498421765, "flos": 21799070271360.0, "grad_norm": 3.647160295640387, "language_loss": 0.62426645, "learning_rate": 1.7544184802902607e-07, "loss": 0.64568055, "num_input_tokens_seen": 312020770, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.72265625, "step": 14465, "time_per_iteration": 2.5086209774017334 }, { "auxiliary_loss_clip": 0.01098446, "auxiliary_loss_mlp": 0.01037702, "balance_loss_clip": 1.02680039, "balance_loss_mlp": 1.03338003, "epoch": 0.8697429730948444, "flos": 22894812610560.0, "grad_norm": 1.570182330394847, "language_loss": 0.8487106, "learning_rate": 1.7528237113443934e-07, "loss": 0.87007207, "num_input_tokens_seen": 312041870, "router_z_loss_clip": 0.10888672, "router_z_loss_mlp": 0.6484375, "step": 14466, "time_per_iteration": 2.499664545059204 }, { "auxiliary_loss_clip": 0.01109446, "auxiliary_loss_mlp": 0.01041272, "balance_loss_clip": 1.02729511, "balance_loss_mlp": 1.037328, "epoch": 0.8698030963475124, "flos": 24717458482560.0, "grad_norm": 3.0843279313055745, "language_loss": 0.6244415, "learning_rate": 1.7512296343332779e-07, "loss": 0.64594871, "num_input_tokens_seen": 312058210, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71875, "step": 14467, "time_per_iteration": 2.48140025138855 }, { "auxiliary_loss_clip": 0.01099697, "auxiliary_loss_mlp": 0.01028637, "balance_loss_clip": 1.01737201, "balance_loss_mlp": 1.03429079, "epoch": 0.8698632196001803, "flos": 28442221067520.0, "grad_norm": 1.3300936198649993, "language_loss": 0.68779361, "learning_rate": 1.7496362493173655e-07, "loss": 0.70907688, "num_input_tokens_seen": 312082665, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.65625, "step": 14468, "time_per_iteration": 2.59614896774292 }, { "auxiliary_loss_clip": 0.01100796, "auxiliary_loss_mlp": 0.01029273, "balance_loss_clip": 1.01750731, "balance_loss_mlp": 1.03353655, "epoch": 0.8699233428528483, "flos": 27636457224960.0, "grad_norm": 1.5866321164351984, "language_loss": 0.70990956, "learning_rate": 1.7480435563570773e-07, "loss": 0.73121035, "num_input_tokens_seen": 312101960, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.671875, "step": 14469, "time_per_iteration": 2.5074126720428467 }, { "auxiliary_loss_clip": 0.01099173, "auxiliary_loss_mlp": 0.010278, "balance_loss_clip": 1.01655293, "balance_loss_mlp": 1.03462338, "epoch": 0.8699834661055164, "flos": 20045659864320.0, "grad_norm": 2.414962312547574, "language_loss": 0.84318846, "learning_rate": 1.7464515555128024e-07, "loss": 0.8644582, "num_input_tokens_seen": 312117125, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.64453125, "step": 14470, "time_per_iteration": 2.424078941345215 }, { "auxiliary_loss_clip": 0.01105291, "auxiliary_loss_mlp": 0.01030324, "balance_loss_clip": 1.01838493, "balance_loss_mlp": 1.03682005, "epoch": 0.8700435893581843, "flos": 23732787974400.0, "grad_norm": 1.7362491705878362, "language_loss": 0.73072577, "learning_rate": 1.7448602468449148e-07, "loss": 0.75208187, "num_input_tokens_seen": 312135775, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 14471, "time_per_iteration": 2.4652726650238037 }, { "auxiliary_loss_clip": 0.01102307, "auxiliary_loss_mlp": 0.01025852, "balance_loss_clip": 1.01480174, "balance_loss_mlp": 1.03491998, "epoch": 0.8701037126108523, "flos": 23548422441600.0, "grad_norm": 1.6336440134272259, "language_loss": 0.78900146, "learning_rate": 1.7432696304137573e-07, "loss": 0.81028306, "num_input_tokens_seen": 312156070, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.671875, "step": 14472, "time_per_iteration": 2.4649879932403564 }, { "auxiliary_loss_clip": 0.01103261, "auxiliary_loss_mlp": 0.01029005, "balance_loss_clip": 1.0163449, "balance_loss_mlp": 1.03477716, "epoch": 0.8701638358635202, "flos": 18843442634880.0, "grad_norm": 1.936179858666365, "language_loss": 0.72586262, "learning_rate": 1.741679706279644e-07, "loss": 0.74718523, "num_input_tokens_seen": 312174380, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 14473, "time_per_iteration": 2.444687843322754 }, { "auxiliary_loss_clip": 0.01106986, "auxiliary_loss_mlp": 0.01031492, "balance_loss_clip": 1.01903462, "balance_loss_mlp": 1.03644478, "epoch": 0.8702239591161882, "flos": 27928339142400.0, "grad_norm": 1.560611398830136, "language_loss": 0.7258929, "learning_rate": 1.7400904745028644e-07, "loss": 0.74727768, "num_input_tokens_seen": 312195130, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 14474, "time_per_iteration": 2.498167037963867 }, { "auxiliary_loss_clip": 0.01103215, "auxiliary_loss_mlp": 0.01039797, "balance_loss_clip": 1.02615917, "balance_loss_mlp": 1.03416467, "epoch": 0.8702840823688561, "flos": 17233997938560.0, "grad_norm": 3.32569410302679, "language_loss": 0.67343432, "learning_rate": 1.7385019351436925e-07, "loss": 0.69486451, "num_input_tokens_seen": 312212300, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.69140625, "step": 14475, "time_per_iteration": 2.4307451248168945 }, { "auxiliary_loss_clip": 0.01104094, "auxiliary_loss_mlp": 0.01025656, "balance_loss_clip": 1.01313317, "balance_loss_mlp": 1.03402615, "epoch": 0.8703442056215241, "flos": 19427565605760.0, "grad_norm": 1.6649307501641482, "language_loss": 0.78102756, "learning_rate": 1.736914088262349e-07, "loss": 0.80232501, "num_input_tokens_seen": 312231735, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 14476, "time_per_iteration": 2.464731454849243 }, { "auxiliary_loss_clip": 0.01100779, "auxiliary_loss_mlp": 0.01027016, "balance_loss_clip": 1.01528013, "balance_loss_mlp": 1.03454185, "epoch": 0.8704043288741921, "flos": 22273845264000.0, "grad_norm": 1.6273537200808696, "language_loss": 0.72132683, "learning_rate": 1.7353269339190525e-07, "loss": 0.74260473, "num_input_tokens_seen": 312253060, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 14477, "time_per_iteration": 2.492701768875122 }, { "auxiliary_loss_clip": 0.01106123, "auxiliary_loss_mlp": 0.01025028, "balance_loss_clip": 1.01318491, "balance_loss_mlp": 1.03627658, "epoch": 0.8704644521268601, "flos": 16648725732480.0, "grad_norm": 1.8417267967687174, "language_loss": 0.59738708, "learning_rate": 1.7337404721739946e-07, "loss": 0.6186986, "num_input_tokens_seen": 312269460, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69921875, "step": 14478, "time_per_iteration": 2.453341007232666 }, { "auxiliary_loss_clip": 0.01103967, "auxiliary_loss_mlp": 0.01025887, "balance_loss_clip": 1.0152775, "balance_loss_mlp": 1.0384295, "epoch": 0.870524575379528, "flos": 24280210224000.0, "grad_norm": 1.5300996789361072, "language_loss": 0.71780729, "learning_rate": 1.732154703087323e-07, "loss": 0.73910582, "num_input_tokens_seen": 312289830, "router_z_loss_clip": 0.10595703, "router_z_loss_mlp": 0.65625, "step": 14479, "time_per_iteration": 2.4987268447875977 }, { "auxiliary_loss_clip": 0.0110317, "auxiliary_loss_mlp": 0.01032994, "balance_loss_clip": 1.02032208, "balance_loss_mlp": 1.03506398, "epoch": 0.870584698632196, "flos": 28768684803840.0, "grad_norm": 1.6945021939631248, "language_loss": 0.71006858, "learning_rate": 1.7305696267191805e-07, "loss": 0.73143029, "num_input_tokens_seen": 312311320, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 14480, "time_per_iteration": 2.508025884628296 }, { "auxiliary_loss_clip": 0.01105225, "auxiliary_loss_mlp": 0.01030408, "balance_loss_clip": 1.01918483, "balance_loss_mlp": 1.03504014, "epoch": 0.8706448218848639, "flos": 32449635774720.0, "grad_norm": 2.4331971469910205, "language_loss": 0.70051062, "learning_rate": 1.728985243129666e-07, "loss": 0.72186697, "num_input_tokens_seen": 312332095, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.69921875, "step": 14481, "time_per_iteration": 2.5326364040374756 }, { "auxiliary_loss_clip": 0.01102406, "auxiliary_loss_mlp": 0.01030437, "balance_loss_clip": 1.01856971, "balance_loss_mlp": 1.03441668, "epoch": 0.8707049451375319, "flos": 22748009725440.0, "grad_norm": 1.7496457654823063, "language_loss": 0.77065903, "learning_rate": 1.7274015523788643e-07, "loss": 0.79198748, "num_input_tokens_seen": 312351225, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6796875, "step": 14482, "time_per_iteration": 2.465012311935425 }, { "auxiliary_loss_clip": 0.01102999, "auxiliary_loss_mlp": 0.01030229, "balance_loss_clip": 1.01802218, "balance_loss_mlp": 1.0354594, "epoch": 0.8707650683902, "flos": 15851976203520.0, "grad_norm": 1.7531553834830287, "language_loss": 0.76818693, "learning_rate": 1.7258185545268234e-07, "loss": 0.78951919, "num_input_tokens_seen": 312369730, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 14483, "time_per_iteration": 2.4298059940338135 }, { "auxiliary_loss_clip": 0.01110258, "auxiliary_loss_mlp": 0.01037497, "balance_loss_clip": 1.02393126, "balance_loss_mlp": 1.03740227, "epoch": 0.8708251916428679, "flos": 16468131127680.0, "grad_norm": 3.014230494312093, "language_loss": 0.62031484, "learning_rate": 1.7242362496335749e-07, "loss": 0.64179242, "num_input_tokens_seen": 312386780, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 14484, "time_per_iteration": 2.436162233352661 }, { "auxiliary_loss_clip": 0.0110497, "auxiliary_loss_mlp": 0.01030589, "balance_loss_clip": 1.01847684, "balance_loss_mlp": 1.03735602, "epoch": 0.8708853148955359, "flos": 15377847655680.0, "grad_norm": 2.3968364204005415, "language_loss": 0.6805476, "learning_rate": 1.7226546377591222e-07, "loss": 0.70190316, "num_input_tokens_seen": 312404875, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.67578125, "step": 14485, "time_per_iteration": 2.440796136856079 }, { "auxiliary_loss_clip": 0.0110239, "auxiliary_loss_mlp": 0.01030746, "balance_loss_clip": 1.01787114, "balance_loss_mlp": 1.0344857, "epoch": 0.8709454381482038, "flos": 30551325903360.0, "grad_norm": 2.3193768147326734, "language_loss": 0.63413697, "learning_rate": 1.7210737189634373e-07, "loss": 0.65546834, "num_input_tokens_seen": 312425280, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6796875, "step": 14486, "time_per_iteration": 2.5105347633361816 }, { "auxiliary_loss_clip": 0.01107348, "auxiliary_loss_mlp": 0.01031725, "balance_loss_clip": 1.01826024, "balance_loss_mlp": 1.03555322, "epoch": 0.8710055614008718, "flos": 22601422321920.0, "grad_norm": 2.4307115932660097, "language_loss": 0.61711061, "learning_rate": 1.7194934933064653e-07, "loss": 0.63850129, "num_input_tokens_seen": 312443835, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 14487, "time_per_iteration": 2.4684948921203613 }, { "auxiliary_loss_clip": 0.01101266, "auxiliary_loss_mlp": 0.0102507, "balance_loss_clip": 1.01457906, "balance_loss_mlp": 1.03380358, "epoch": 0.8710656846535397, "flos": 18443146492800.0, "grad_norm": 2.021510474921445, "language_loss": 0.67891711, "learning_rate": 1.7179139608481318e-07, "loss": 0.70018041, "num_input_tokens_seen": 312460830, "router_z_loss_clip": 0.10498047, "router_z_loss_mlp": 0.67578125, "step": 14488, "time_per_iteration": 2.415896415710449 }, { "auxiliary_loss_clip": 0.01107094, "auxiliary_loss_mlp": 0.01030935, "balance_loss_clip": 1.01815534, "balance_loss_mlp": 1.03699994, "epoch": 0.8711258079062077, "flos": 16503862181760.0, "grad_norm": 1.8873126843153023, "language_loss": 0.85908228, "learning_rate": 1.716335121648338e-07, "loss": 0.88046265, "num_input_tokens_seen": 312477575, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 14489, "time_per_iteration": 2.457120418548584 }, { "auxiliary_loss_clip": 0.01110015, "auxiliary_loss_mlp": 0.01029691, "balance_loss_clip": 1.01638699, "balance_loss_mlp": 1.03693652, "epoch": 0.8711859311588757, "flos": 15663336952320.0, "grad_norm": 2.5020265066297673, "language_loss": 0.7567535, "learning_rate": 1.7147569757669445e-07, "loss": 0.77815056, "num_input_tokens_seen": 312492140, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 14490, "time_per_iteration": 2.4073216915130615 }, { "auxiliary_loss_clip": 0.01107105, "auxiliary_loss_mlp": 0.01029879, "balance_loss_clip": 1.0169034, "balance_loss_mlp": 1.03631496, "epoch": 0.8712460544115437, "flos": 15557544420480.0, "grad_norm": 2.395541660184382, "language_loss": 0.76368606, "learning_rate": 1.7131795232638012e-07, "loss": 0.78505594, "num_input_tokens_seen": 312508400, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 14491, "time_per_iteration": 2.4223387241363525 }, { "auxiliary_loss_clip": 0.01105538, "auxiliary_loss_mlp": 0.01024896, "balance_loss_clip": 1.01293921, "balance_loss_mlp": 1.03783798, "epoch": 0.8713061776642116, "flos": 16763568491520.0, "grad_norm": 4.546547513503777, "language_loss": 0.66767776, "learning_rate": 1.711602764198723e-07, "loss": 0.68898207, "num_input_tokens_seen": 312525915, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.67578125, "step": 14492, "time_per_iteration": 2.4149556159973145 }, { "auxiliary_loss_clip": 0.01101296, "auxiliary_loss_mlp": 0.01025788, "balance_loss_clip": 1.01438618, "balance_loss_mlp": 1.03455949, "epoch": 0.8713663009168796, "flos": 24279887001600.0, "grad_norm": 2.389442758251332, "language_loss": 0.70036238, "learning_rate": 1.7100266986314992e-07, "loss": 0.7216332, "num_input_tokens_seen": 312544735, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66796875, "step": 14493, "time_per_iteration": 2.4939281940460205 }, { "auxiliary_loss_clip": 0.01107628, "auxiliary_loss_mlp": 0.01035507, "balance_loss_clip": 1.02230465, "balance_loss_mlp": 1.03764403, "epoch": 0.8714264241695475, "flos": 23795594904960.0, "grad_norm": 3.1890304971478995, "language_loss": 0.89167106, "learning_rate": 1.7084513266218936e-07, "loss": 0.91310239, "num_input_tokens_seen": 312557910, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 14494, "time_per_iteration": 2.4315109252929688 }, { "auxiliary_loss_clip": 0.01105629, "auxiliary_loss_mlp": 0.01031001, "balance_loss_clip": 1.01943791, "balance_loss_mlp": 1.03822041, "epoch": 0.8714865474222155, "flos": 37997942071680.0, "grad_norm": 1.7620134297948777, "language_loss": 0.59350032, "learning_rate": 1.7068766482296514e-07, "loss": 0.61486661, "num_input_tokens_seen": 312580360, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.67578125, "step": 14495, "time_per_iteration": 2.6093106269836426 }, { "auxiliary_loss_clip": 0.01104441, "auxiliary_loss_mlp": 0.01030677, "balance_loss_clip": 1.01842844, "balance_loss_mlp": 1.03453135, "epoch": 0.8715466706748836, "flos": 22455696844800.0, "grad_norm": 2.0553816514423233, "language_loss": 0.80535907, "learning_rate": 1.7053026635144762e-07, "loss": 0.82671022, "num_input_tokens_seen": 312597550, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69921875, "step": 14496, "time_per_iteration": 2.441498279571533 }, { "auxiliary_loss_clip": 0.01105302, "auxiliary_loss_mlp": 0.01036584, "balance_loss_clip": 1.02298784, "balance_loss_mlp": 1.03573859, "epoch": 0.8716067939275515, "flos": 21215126868480.0, "grad_norm": 2.3429268170140793, "language_loss": 0.79075897, "learning_rate": 1.7037293725360624e-07, "loss": 0.81217784, "num_input_tokens_seen": 312616435, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.6953125, "step": 14497, "time_per_iteration": 2.451307773590088 }, { "auxiliary_loss_clip": 0.01105787, "auxiliary_loss_mlp": 0.01032556, "balance_loss_clip": 1.01968765, "balance_loss_mlp": 1.03565955, "epoch": 0.8716669171802195, "flos": 22997732054400.0, "grad_norm": 2.119328035268288, "language_loss": 0.67263114, "learning_rate": 1.70215677535406e-07, "loss": 0.69401455, "num_input_tokens_seen": 312632770, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 14498, "time_per_iteration": 3.9056849479675293 }, { "auxiliary_loss_clip": 0.01101805, "auxiliary_loss_mlp": 0.01028507, "balance_loss_clip": 1.01647329, "balance_loss_mlp": 1.03309739, "epoch": 0.8717270404328874, "flos": 29784058462080.0, "grad_norm": 1.661471550847371, "language_loss": 0.57443589, "learning_rate": 1.700584872028108e-07, "loss": 0.59573901, "num_input_tokens_seen": 312651900, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 14499, "time_per_iteration": 2.524298906326294 }, { "auxiliary_loss_clip": 0.01105325, "auxiliary_loss_mlp": 0.01031497, "balance_loss_clip": 1.0188849, "balance_loss_mlp": 1.03467083, "epoch": 0.8717871636855554, "flos": 22018125363840.0, "grad_norm": 2.8634564579760005, "language_loss": 0.80062586, "learning_rate": 1.6990136626178097e-07, "loss": 0.82199413, "num_input_tokens_seen": 312671380, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 14500, "time_per_iteration": 2.454205274581909 }, { "auxiliary_loss_clip": 0.01103839, "auxiliary_loss_mlp": 0.01029284, "balance_loss_clip": 1.01707113, "balance_loss_mlp": 1.03578019, "epoch": 0.8718472869382233, "flos": 16654256426880.0, "grad_norm": 2.3711060369720363, "language_loss": 0.72438323, "learning_rate": 1.6974431471827466e-07, "loss": 0.74571443, "num_input_tokens_seen": 312689215, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 14501, "time_per_iteration": 2.470831871032715 }, { "auxiliary_loss_clip": 0.01110066, "auxiliary_loss_mlp": 0.01032887, "balance_loss_clip": 1.01969635, "balance_loss_mlp": 1.03848624, "epoch": 0.8719074101908914, "flos": 19495328613120.0, "grad_norm": 7.718795265293689, "language_loss": 0.6439116, "learning_rate": 1.695873325782482e-07, "loss": 0.66534108, "num_input_tokens_seen": 312706400, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 14502, "time_per_iteration": 3.9083564281463623 }, { "auxiliary_loss_clip": 0.01105833, "auxiliary_loss_mlp": 0.01035337, "balance_loss_clip": 1.02273619, "balance_loss_mlp": 1.03561807, "epoch": 0.8719675334435593, "flos": 33070890430080.0, "grad_norm": 8.888138034817995, "language_loss": 0.68765187, "learning_rate": 1.6943041984765262e-07, "loss": 0.70906353, "num_input_tokens_seen": 312727985, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 14503, "time_per_iteration": 2.5583882331848145 }, { "auxiliary_loss_clip": 0.01104127, "auxiliary_loss_mlp": 0.01029031, "balance_loss_clip": 1.01665676, "balance_loss_mlp": 1.0350523, "epoch": 0.8720276566962273, "flos": 13626268842240.0, "grad_norm": 2.517343596427101, "language_loss": 0.69702721, "learning_rate": 1.6927357653243912e-07, "loss": 0.71835876, "num_input_tokens_seen": 312745025, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 14504, "time_per_iteration": 3.809757947921753 }, { "auxiliary_loss_clip": 0.01104645, "auxiliary_loss_mlp": 0.0102439, "balance_loss_clip": 1.01236224, "balance_loss_mlp": 1.03526711, "epoch": 0.8720877799488952, "flos": 23514163845120.0, "grad_norm": 4.324830889561415, "language_loss": 0.70349866, "learning_rate": 1.691168026385552e-07, "loss": 0.72478902, "num_input_tokens_seen": 312764170, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 14505, "time_per_iteration": 3.961258888244629 }, { "auxiliary_loss_clip": 0.01103463, "auxiliary_loss_mlp": 0.01030143, "balance_loss_clip": 1.01837683, "balance_loss_mlp": 1.03554034, "epoch": 0.8721479032015632, "flos": 20814148368000.0, "grad_norm": 4.051384854944239, "language_loss": 0.78303432, "learning_rate": 1.6896009817194545e-07, "loss": 0.8043704, "num_input_tokens_seen": 312783830, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6796875, "step": 14506, "time_per_iteration": 2.441920042037964 }, { "auxiliary_loss_clip": 0.01106253, "auxiliary_loss_mlp": 0.01029924, "balance_loss_clip": 1.01774716, "balance_loss_mlp": 1.03535724, "epoch": 0.8722080264542311, "flos": 19463655795840.0, "grad_norm": 2.59644388595873, "language_loss": 0.74319339, "learning_rate": 1.6880346313855221e-07, "loss": 0.7645551, "num_input_tokens_seen": 312802015, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 14507, "time_per_iteration": 2.462505578994751 }, { "auxiliary_loss_clip": 0.0110823, "auxiliary_loss_mlp": 0.01029335, "balance_loss_clip": 1.01639485, "balance_loss_mlp": 1.03637314, "epoch": 0.8722681497068991, "flos": 21761866759680.0, "grad_norm": 3.3382531834034386, "language_loss": 0.72382796, "learning_rate": 1.686468975443156e-07, "loss": 0.74520361, "num_input_tokens_seen": 312820650, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 14508, "time_per_iteration": 2.4397387504577637 }, { "auxiliary_loss_clip": 0.01110206, "auxiliary_loss_mlp": 0.01034093, "balance_loss_clip": 1.02139735, "balance_loss_mlp": 1.03751445, "epoch": 0.8723282729595672, "flos": 28877134942080.0, "grad_norm": 1.679977085217619, "language_loss": 0.68732721, "learning_rate": 1.6849040139517202e-07, "loss": 0.70877016, "num_input_tokens_seen": 312841310, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 14509, "time_per_iteration": 2.539095401763916 }, { "auxiliary_loss_clip": 0.01105133, "auxiliary_loss_mlp": 0.010312, "balance_loss_clip": 1.01955342, "balance_loss_mlp": 1.03546405, "epoch": 0.8723883962122351, "flos": 26469145036800.0, "grad_norm": 2.543094602740153, "language_loss": 0.58334821, "learning_rate": 1.683339746970558e-07, "loss": 0.60471153, "num_input_tokens_seen": 312862100, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6953125, "step": 14510, "time_per_iteration": 2.480529308319092 }, { "auxiliary_loss_clip": 0.01111059, "auxiliary_loss_mlp": 0.01030856, "balance_loss_clip": 1.01712954, "balance_loss_mlp": 1.03686011, "epoch": 0.8724485194649031, "flos": 20521476351360.0, "grad_norm": 4.225180559825017, "language_loss": 0.67530602, "learning_rate": 1.6817761745589865e-07, "loss": 0.69672519, "num_input_tokens_seen": 312880220, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7421875, "step": 14511, "time_per_iteration": 2.455864667892456 }, { "auxiliary_loss_clip": 0.01106743, "auxiliary_loss_mlp": 0.01034531, "balance_loss_clip": 1.02143025, "balance_loss_mlp": 1.03472221, "epoch": 0.872508642717571, "flos": 24353360271360.0, "grad_norm": 1.7879028126191399, "language_loss": 0.81997859, "learning_rate": 1.6802132967763027e-07, "loss": 0.84139132, "num_input_tokens_seen": 312900765, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 14512, "time_per_iteration": 2.46928334236145 }, { "auxiliary_loss_clip": 0.01029311, "auxiliary_loss_mlp": 0.01000238, "balance_loss_clip": 0.99920654, "balance_loss_mlp": 1.00720692, "epoch": 0.872568765970239, "flos": 61410012485760.0, "grad_norm": 0.7948711787013794, "language_loss": 0.58611155, "learning_rate": 1.6786511136817617e-07, "loss": 0.60640705, "num_input_tokens_seen": 312955840, "router_z_loss_clip": 0.01031494, "router_z_loss_mlp": 0.22070312, "step": 14513, "time_per_iteration": 2.977532386779785 }, { "auxiliary_loss_clip": 0.01104598, "auxiliary_loss_mlp": 0.01029011, "balance_loss_clip": 1.01640451, "balance_loss_mlp": 1.03590083, "epoch": 0.8726288892229069, "flos": 22598046443520.0, "grad_norm": 2.0638080815736957, "language_loss": 0.7669149, "learning_rate": 1.6770896253346112e-07, "loss": 0.78825098, "num_input_tokens_seen": 312973565, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 14514, "time_per_iteration": 2.440068483352661 }, { "auxiliary_loss_clip": 0.01108165, "auxiliary_loss_mlp": 0.01028362, "balance_loss_clip": 1.0163815, "balance_loss_mlp": 1.03691673, "epoch": 0.872689012475575, "flos": 25885201633920.0, "grad_norm": 2.2938464259750866, "language_loss": 0.65412903, "learning_rate": 1.675528831794055e-07, "loss": 0.67549431, "num_input_tokens_seen": 312994660, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.71484375, "step": 14515, "time_per_iteration": 2.4874651432037354 }, { "auxiliary_loss_clip": 0.01106484, "auxiliary_loss_mlp": 0.01034917, "balance_loss_clip": 1.02188134, "balance_loss_mlp": 1.03625226, "epoch": 0.8727491357282429, "flos": 21506721477120.0, "grad_norm": 3.637721505464176, "language_loss": 0.79151082, "learning_rate": 1.6739687331192842e-07, "loss": 0.8129248, "num_input_tokens_seen": 313009860, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 14516, "time_per_iteration": 2.4627959728240967 }, { "auxiliary_loss_clip": 0.01107543, "auxiliary_loss_mlp": 0.01030273, "balance_loss_clip": 1.01757169, "balance_loss_mlp": 1.03639555, "epoch": 0.8728092589809109, "flos": 19207504932480.0, "grad_norm": 3.268455592009663, "language_loss": 0.72370541, "learning_rate": 1.672409329369453e-07, "loss": 0.74508351, "num_input_tokens_seen": 313027025, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 14517, "time_per_iteration": 2.4433906078338623 }, { "auxiliary_loss_clip": 0.01099198, "auxiliary_loss_mlp": 0.01022553, "balance_loss_clip": 1.01135933, "balance_loss_mlp": 1.03288376, "epoch": 0.8728693822335788, "flos": 20595308757120.0, "grad_norm": 1.9326885249608754, "language_loss": 0.72432351, "learning_rate": 1.6708506206036966e-07, "loss": 0.74554104, "num_input_tokens_seen": 313046830, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6640625, "step": 14518, "time_per_iteration": 2.435575246810913 }, { "auxiliary_loss_clip": 0.01101689, "auxiliary_loss_mlp": 0.01032402, "balance_loss_clip": 1.02043331, "balance_loss_mlp": 1.03485096, "epoch": 0.8729295054862468, "flos": 21728613744000.0, "grad_norm": 1.8491172893067123, "language_loss": 0.74420142, "learning_rate": 1.6692926068811275e-07, "loss": 0.76554239, "num_input_tokens_seen": 313067715, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.66796875, "step": 14519, "time_per_iteration": 2.477978467941284 }, { "auxiliary_loss_clip": 0.01108234, "auxiliary_loss_mlp": 0.01030682, "balance_loss_clip": 1.01709187, "balance_loss_mlp": 1.03575051, "epoch": 0.8729896287389147, "flos": 17673436926720.0, "grad_norm": 2.775371846100625, "language_loss": 0.76792479, "learning_rate": 1.6677352882608142e-07, "loss": 0.78931391, "num_input_tokens_seen": 313082305, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.72265625, "step": 14520, "time_per_iteration": 2.4314608573913574 }, { "auxiliary_loss_clip": 0.01106672, "auxiliary_loss_mlp": 0.0103589, "balance_loss_clip": 1.022753, "balance_loss_mlp": 1.03612936, "epoch": 0.8730497519915827, "flos": 24571804832640.0, "grad_norm": 1.640572744724642, "language_loss": 0.82562387, "learning_rate": 1.666178664801816e-07, "loss": 0.84704947, "num_input_tokens_seen": 313101190, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 14521, "time_per_iteration": 2.5029923915863037 }, { "auxiliary_loss_clip": 0.01107246, "auxiliary_loss_mlp": 0.01036693, "balance_loss_clip": 1.02326429, "balance_loss_mlp": 1.03661013, "epoch": 0.8731098752442508, "flos": 13443734903040.0, "grad_norm": 2.515832848218154, "language_loss": 0.76448673, "learning_rate": 1.6646227365631616e-07, "loss": 0.7859261, "num_input_tokens_seen": 313118965, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.70703125, "step": 14522, "time_per_iteration": 2.4047162532806396 }, { "auxiliary_loss_clip": 0.0110143, "auxiliary_loss_mlp": 0.01026463, "balance_loss_clip": 1.01516771, "balance_loss_mlp": 1.03418446, "epoch": 0.8731699984969187, "flos": 23474446381440.0, "grad_norm": 2.1579687686000377, "language_loss": 0.76002324, "learning_rate": 1.66306750360385e-07, "loss": 0.78130221, "num_input_tokens_seen": 313139280, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.671875, "step": 14523, "time_per_iteration": 2.4765632152557373 }, { "auxiliary_loss_clip": 0.01101164, "auxiliary_loss_mlp": 0.01028777, "balance_loss_clip": 1.01679683, "balance_loss_mlp": 1.03366303, "epoch": 0.8732301217495867, "flos": 17712651600000.0, "grad_norm": 2.144984527235053, "language_loss": 0.78289658, "learning_rate": 1.6615129659828542e-07, "loss": 0.804196, "num_input_tokens_seen": 313156655, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.67578125, "step": 14524, "time_per_iteration": 2.4202418327331543 }, { "auxiliary_loss_clip": 0.0110055, "auxiliary_loss_mlp": 0.0103082, "balance_loss_clip": 1.01973403, "balance_loss_mlp": 1.03399611, "epoch": 0.8732902450022546, "flos": 22054359208320.0, "grad_norm": 4.485917829218257, "language_loss": 0.776169, "learning_rate": 1.6599591237591272e-07, "loss": 0.79748273, "num_input_tokens_seen": 313174050, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.6640625, "step": 14525, "time_per_iteration": 2.4663338661193848 }, { "auxiliary_loss_clip": 0.011061, "auxiliary_loss_mlp": 0.01033717, "balance_loss_clip": 1.02146792, "balance_loss_mlp": 1.03548193, "epoch": 0.8733503682549226, "flos": 22272983337600.0, "grad_norm": 2.4210253299545648, "language_loss": 0.69444215, "learning_rate": 1.6584059769915902e-07, "loss": 0.71584034, "num_input_tokens_seen": 313192765, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.70703125, "step": 14526, "time_per_iteration": 2.4522745609283447 }, { "auxiliary_loss_clip": 0.01110014, "auxiliary_loss_mlp": 0.01036342, "balance_loss_clip": 1.02299023, "balance_loss_mlp": 1.03727674, "epoch": 0.8734104915075905, "flos": 23364344217600.0, "grad_norm": 2.0556738982135374, "language_loss": 0.61126691, "learning_rate": 1.6568535257391326e-07, "loss": 0.63273048, "num_input_tokens_seen": 313210925, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 14527, "time_per_iteration": 2.4628543853759766 }, { "auxiliary_loss_clip": 0.011123, "auxiliary_loss_mlp": 0.01032276, "balance_loss_clip": 1.01804209, "balance_loss_mlp": 1.03801918, "epoch": 0.8734706147602586, "flos": 17712292464000.0, "grad_norm": 2.0958568650810734, "language_loss": 0.65853727, "learning_rate": 1.6553017700606265e-07, "loss": 0.67998302, "num_input_tokens_seen": 313228250, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7421875, "step": 14528, "time_per_iteration": 2.4045956134796143 }, { "auxiliary_loss_clip": 0.01103381, "auxiliary_loss_mlp": 0.01030433, "balance_loss_clip": 1.01842248, "balance_loss_mlp": 1.0364511, "epoch": 0.8735307380129265, "flos": 22049367217920.0, "grad_norm": 1.9559397576202298, "language_loss": 0.89672643, "learning_rate": 1.6537507100149205e-07, "loss": 0.91806453, "num_input_tokens_seen": 313247880, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 14529, "time_per_iteration": 2.464369773864746 }, { "auxiliary_loss_clip": 0.01102669, "auxiliary_loss_mlp": 0.010277, "balance_loss_clip": 1.01561189, "balance_loss_mlp": 1.03550434, "epoch": 0.8735908612655945, "flos": 25338425829120.0, "grad_norm": 1.8294819622280785, "language_loss": 0.85053468, "learning_rate": 1.6522003456608258e-07, "loss": 0.87183833, "num_input_tokens_seen": 313266790, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 14530, "time_per_iteration": 2.503098964691162 }, { "auxiliary_loss_clip": 0.01104829, "auxiliary_loss_mlp": 0.01032817, "balance_loss_clip": 1.02138448, "balance_loss_mlp": 1.03547168, "epoch": 0.8736509845182624, "flos": 21540908246400.0, "grad_norm": 2.276128825120457, "language_loss": 0.74405247, "learning_rate": 1.650650677057128e-07, "loss": 0.76542896, "num_input_tokens_seen": 313286805, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6953125, "step": 14531, "time_per_iteration": 2.474973201751709 }, { "auxiliary_loss_clip": 0.01100113, "auxiliary_loss_mlp": 0.0102974, "balance_loss_clip": 1.01783109, "balance_loss_mlp": 1.03356731, "epoch": 0.8737111077709304, "flos": 22017227523840.0, "grad_norm": 1.80138728734971, "language_loss": 0.61550522, "learning_rate": 1.6491017042625966e-07, "loss": 0.63680375, "num_input_tokens_seen": 313305415, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6640625, "step": 14532, "time_per_iteration": 2.5062344074249268 }, { "auxiliary_loss_clip": 0.01028654, "auxiliary_loss_mlp": 0.01000157, "balance_loss_clip": 0.99913806, "balance_loss_mlp": 1.00661802, "epoch": 0.8737712310235983, "flos": 70066315912320.0, "grad_norm": 0.8838119937442446, "language_loss": 0.58721644, "learning_rate": 1.6475534273359704e-07, "loss": 0.60750461, "num_input_tokens_seen": 313369940, "router_z_loss_clip": 0.01019287, "router_z_loss_mlp": 0.22070312, "step": 14533, "time_per_iteration": 3.2293012142181396 }, { "auxiliary_loss_clip": 0.01101802, "auxiliary_loss_mlp": 0.0103112, "balance_loss_clip": 1.01925278, "balance_loss_mlp": 1.03439999, "epoch": 0.8738313542762663, "flos": 28658331244800.0, "grad_norm": 1.5547178838003297, "language_loss": 0.76882929, "learning_rate": 1.646005846335954e-07, "loss": 0.79015851, "num_input_tokens_seen": 313390965, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 14534, "time_per_iteration": 2.510362148284912 }, { "auxiliary_loss_clip": 0.01104244, "auxiliary_loss_mlp": 0.01029587, "balance_loss_clip": 1.01725447, "balance_loss_mlp": 1.03505683, "epoch": 0.8738914775289344, "flos": 22346384780160.0, "grad_norm": 1.9023957711517558, "language_loss": 0.75158876, "learning_rate": 1.6444589613212357e-07, "loss": 0.77292705, "num_input_tokens_seen": 313409680, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 14535, "time_per_iteration": 2.4556281566619873 }, { "auxiliary_loss_clip": 0.01103551, "auxiliary_loss_mlp": 0.01028453, "balance_loss_clip": 1.01581669, "balance_loss_mlp": 1.03387761, "epoch": 0.8739516007816023, "flos": 31759648444800.0, "grad_norm": 9.405322553879179, "language_loss": 0.7481339, "learning_rate": 1.64291277235048e-07, "loss": 0.76945388, "num_input_tokens_seen": 313431335, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 14536, "time_per_iteration": 2.5599825382232666 }, { "auxiliary_loss_clip": 0.01102713, "auxiliary_loss_mlp": 0.01031894, "balance_loss_clip": 1.02023518, "balance_loss_mlp": 1.03368998, "epoch": 0.8740117240342703, "flos": 21211715076480.0, "grad_norm": 2.495233521715002, "language_loss": 0.64096916, "learning_rate": 1.641367279482304e-07, "loss": 0.66231525, "num_input_tokens_seen": 313449225, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6875, "step": 14537, "time_per_iteration": 2.4426259994506836 }, { "auxiliary_loss_clip": 0.01102188, "auxiliary_loss_mlp": 0.01026765, "balance_loss_clip": 1.01415849, "balance_loss_mlp": 1.0341692, "epoch": 0.8740718472869382, "flos": 25186666867200.0, "grad_norm": 1.8358057143834008, "language_loss": 0.58422703, "learning_rate": 1.6398224827753216e-07, "loss": 0.60551655, "num_input_tokens_seen": 313467715, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 14538, "time_per_iteration": 2.481525421142578 }, { "auxiliary_loss_clip": 0.01103748, "auxiliary_loss_mlp": 0.01028047, "balance_loss_clip": 1.01620376, "balance_loss_mlp": 1.03819203, "epoch": 0.8741319705396062, "flos": 19500931134720.0, "grad_norm": 1.9148904916354428, "language_loss": 0.68357527, "learning_rate": 1.6382783822881142e-07, "loss": 0.70489317, "num_input_tokens_seen": 313486805, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.65625, "step": 14539, "time_per_iteration": 2.4834916591644287 }, { "auxiliary_loss_clip": 0.01106244, "auxiliary_loss_mlp": 0.01031701, "balance_loss_clip": 1.01916075, "balance_loss_mlp": 1.03393579, "epoch": 0.8741920937922741, "flos": 14100900180480.0, "grad_norm": 2.990688384535169, "language_loss": 0.74438339, "learning_rate": 1.6367349780792262e-07, "loss": 0.76576293, "num_input_tokens_seen": 313504880, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 14540, "time_per_iteration": 3.9973037242889404 }, { "auxiliary_loss_clip": 0.01104527, "auxiliary_loss_mlp": 0.01035731, "balance_loss_clip": 1.0234648, "balance_loss_mlp": 1.03525639, "epoch": 0.8742522170449422, "flos": 27709858667520.0, "grad_norm": 1.9216780866360188, "language_loss": 0.7919777, "learning_rate": 1.635192270207193e-07, "loss": 0.81338024, "num_input_tokens_seen": 313524995, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 14541, "time_per_iteration": 2.502455234527588 }, { "auxiliary_loss_clip": 0.01109439, "auxiliary_loss_mlp": 0.01034486, "balance_loss_clip": 1.02034199, "balance_loss_mlp": 1.03670073, "epoch": 0.8743123402976101, "flos": 21142587352320.0, "grad_norm": 2.310691951796789, "language_loss": 0.66696131, "learning_rate": 1.6336502587305035e-07, "loss": 0.68840051, "num_input_tokens_seen": 313541740, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7265625, "step": 14542, "time_per_iteration": 2.4952781200408936 }, { "auxiliary_loss_clip": 0.01028479, "auxiliary_loss_mlp": 0.01001449, "balance_loss_clip": 1.00039959, "balance_loss_mlp": 1.00628614, "epoch": 0.8743724635502781, "flos": 60870024351360.0, "grad_norm": 0.785261649998395, "language_loss": 0.54496956, "learning_rate": 1.632108943707642e-07, "loss": 0.56526881, "num_input_tokens_seen": 313593445, "router_z_loss_clip": 0.01049805, "router_z_loss_mlp": 0.22265625, "step": 14543, "time_per_iteration": 4.329946279525757 }, { "auxiliary_loss_clip": 0.01107004, "auxiliary_loss_mlp": 0.01035346, "balance_loss_clip": 1.0223881, "balance_loss_mlp": 1.03640127, "epoch": 0.874432586802946, "flos": 28109292883200.0, "grad_norm": 2.268382151162741, "language_loss": 0.69453382, "learning_rate": 1.6305683251970458e-07, "loss": 0.71595728, "num_input_tokens_seen": 313615640, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 14544, "time_per_iteration": 2.511199712753296 }, { "auxiliary_loss_clip": 0.01099883, "auxiliary_loss_mlp": 0.01029194, "balance_loss_clip": 1.01804161, "balance_loss_mlp": 1.03423345, "epoch": 0.874492710055614, "flos": 23550289948800.0, "grad_norm": 1.4475728527834808, "language_loss": 0.7561692, "learning_rate": 1.62902840325714e-07, "loss": 0.77745998, "num_input_tokens_seen": 313635550, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.65625, "step": 14545, "time_per_iteration": 3.875189781188965 }, { "auxiliary_loss_clip": 0.01105166, "auxiliary_loss_mlp": 0.01038335, "balance_loss_clip": 1.02357066, "balance_loss_mlp": 1.0356499, "epoch": 0.8745528333082819, "flos": 40915647924480.0, "grad_norm": 1.856906014904783, "language_loss": 0.66056192, "learning_rate": 1.6274891779463217e-07, "loss": 0.68199694, "num_input_tokens_seen": 313659275, "router_z_loss_clip": 0.14746094, "router_z_loss_mlp": 0.6953125, "step": 14546, "time_per_iteration": 4.1067633628845215 }, { "auxiliary_loss_clip": 0.01103816, "auxiliary_loss_mlp": 0.01032052, "balance_loss_clip": 1.01985145, "balance_loss_mlp": 1.03489792, "epoch": 0.87461295656095, "flos": 23622901292160.0, "grad_norm": 1.8459991959982727, "language_loss": 0.7315411, "learning_rate": 1.6259506493229536e-07, "loss": 0.75289977, "num_input_tokens_seen": 313680595, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 14547, "time_per_iteration": 2.479008197784424 }, { "auxiliary_loss_clip": 0.01111364, "auxiliary_loss_mlp": 0.01038745, "balance_loss_clip": 1.025352, "balance_loss_mlp": 1.03650534, "epoch": 0.874673079813618, "flos": 38794116983040.0, "grad_norm": 2.246188709616702, "language_loss": 0.69488835, "learning_rate": 1.6244128174453752e-07, "loss": 0.71638942, "num_input_tokens_seen": 313699730, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.75, "step": 14548, "time_per_iteration": 2.597466468811035 }, { "auxiliary_loss_clip": 0.01107497, "auxiliary_loss_mlp": 0.01032742, "balance_loss_clip": 1.02012944, "balance_loss_mlp": 1.03500962, "epoch": 0.8747332030662859, "flos": 23696159080320.0, "grad_norm": 2.103146973994653, "language_loss": 0.70767808, "learning_rate": 1.6228756823719093e-07, "loss": 0.72908044, "num_input_tokens_seen": 313720090, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 14549, "time_per_iteration": 2.4707705974578857 }, { "auxiliary_loss_clip": 0.01108172, "auxiliary_loss_mlp": 0.01036397, "balance_loss_clip": 1.02254462, "balance_loss_mlp": 1.03535986, "epoch": 0.8747933263189539, "flos": 24462456854400.0, "grad_norm": 2.4288065294204237, "language_loss": 0.83928829, "learning_rate": 1.6213392441608352e-07, "loss": 0.86073399, "num_input_tokens_seen": 313736795, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 14550, "time_per_iteration": 2.4773049354553223 }, { "auxiliary_loss_clip": 0.01107593, "auxiliary_loss_mlp": 0.01042593, "balance_loss_clip": 1.03007007, "balance_loss_mlp": 1.03618848, "epoch": 0.8748534495716218, "flos": 13809161917440.0, "grad_norm": 1.5795525830099064, "language_loss": 0.71794897, "learning_rate": 1.6198035028704183e-07, "loss": 0.73945081, "num_input_tokens_seen": 313754820, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71484375, "step": 14551, "time_per_iteration": 2.460813522338867 }, { "auxiliary_loss_clip": 0.01103763, "auxiliary_loss_mlp": 0.01033509, "balance_loss_clip": 1.02101552, "balance_loss_mlp": 1.03607488, "epoch": 0.8749135728242898, "flos": 29862092759040.0, "grad_norm": 1.8883590709677243, "language_loss": 0.64367807, "learning_rate": 1.6182684585588934e-07, "loss": 0.66505075, "num_input_tokens_seen": 313775830, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.67578125, "step": 14552, "time_per_iteration": 2.5247743129730225 }, { "auxiliary_loss_clip": 0.01107666, "auxiliary_loss_mlp": 0.01028139, "balance_loss_clip": 1.01458454, "balance_loss_mlp": 1.03620386, "epoch": 0.8749736960769577, "flos": 24133479166080.0, "grad_norm": 2.0713619830168803, "language_loss": 0.79444575, "learning_rate": 1.616734111284479e-07, "loss": 0.81580377, "num_input_tokens_seen": 313795745, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71484375, "step": 14553, "time_per_iteration": 2.4674415588378906 }, { "auxiliary_loss_clip": 0.01104779, "auxiliary_loss_mlp": 0.01030724, "balance_loss_clip": 1.01880896, "balance_loss_mlp": 1.03383136, "epoch": 0.8750338193296258, "flos": 17202540602880.0, "grad_norm": 5.704514120230434, "language_loss": 0.70136672, "learning_rate": 1.6152004611053416e-07, "loss": 0.72272176, "num_input_tokens_seen": 313813895, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.7109375, "step": 14554, "time_per_iteration": 2.4434986114501953 }, { "auxiliary_loss_clip": 0.01106343, "auxiliary_loss_mlp": 0.01026759, "balance_loss_clip": 1.01445651, "balance_loss_mlp": 1.03710926, "epoch": 0.8750939425822937, "flos": 23733218937600.0, "grad_norm": 1.7559056447179826, "language_loss": 0.83233583, "learning_rate": 1.6136675080796457e-07, "loss": 0.85366678, "num_input_tokens_seen": 313834225, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 14555, "time_per_iteration": 2.460181474685669 }, { "auxiliary_loss_clip": 0.01103398, "auxiliary_loss_mlp": 0.0103171, "balance_loss_clip": 1.01931262, "balance_loss_mlp": 1.03412306, "epoch": 0.8751540658349617, "flos": 26541684552960.0, "grad_norm": 3.2491252495928844, "language_loss": 0.71011138, "learning_rate": 1.6121352522655252e-07, "loss": 0.73146248, "num_input_tokens_seen": 313854430, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 14556, "time_per_iteration": 2.497525691986084 }, { "auxiliary_loss_clip": 0.01106413, "auxiliary_loss_mlp": 0.01033106, "balance_loss_clip": 1.01939738, "balance_loss_mlp": 1.03436959, "epoch": 0.8752141890876296, "flos": 19386806647680.0, "grad_norm": 2.5995412624980907, "language_loss": 0.76882446, "learning_rate": 1.6106036937210732e-07, "loss": 0.7902196, "num_input_tokens_seen": 313871600, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 14557, "time_per_iteration": 2.4331610202789307 }, { "auxiliary_loss_clip": 0.0110671, "auxiliary_loss_mlp": 0.01037306, "balance_loss_clip": 1.02433026, "balance_loss_mlp": 1.03681207, "epoch": 0.8752743123402976, "flos": 25374408278400.0, "grad_norm": 1.9253742215161715, "language_loss": 0.82964242, "learning_rate": 1.6090728325043767e-07, "loss": 0.85108256, "num_input_tokens_seen": 313891570, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 14558, "time_per_iteration": 2.497232437133789 }, { "auxiliary_loss_clip": 0.01028594, "auxiliary_loss_mlp": 0.01001892, "balance_loss_clip": 1.00089693, "balance_loss_mlp": 1.00653553, "epoch": 0.8753344355929655, "flos": 59952398578560.0, "grad_norm": 0.9964246977143036, "language_loss": 0.56050515, "learning_rate": 1.6075426686734784e-07, "loss": 0.58081001, "num_input_tokens_seen": 313951290, "router_z_loss_clip": 0.00994873, "router_z_loss_mlp": 0.22070312, "step": 14559, "time_per_iteration": 3.0768401622772217 }, { "auxiliary_loss_clip": 0.01102837, "auxiliary_loss_mlp": 0.01031054, "balance_loss_clip": 1.0189662, "balance_loss_mlp": 1.0348103, "epoch": 0.8753945588456336, "flos": 17894646835200.0, "grad_norm": 1.8133506353991062, "language_loss": 0.65915096, "learning_rate": 1.606013202286407e-07, "loss": 0.6804899, "num_input_tokens_seen": 313968645, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 14560, "time_per_iteration": 2.4164328575134277 }, { "auxiliary_loss_clip": 0.01102544, "auxiliary_loss_mlp": 0.01028624, "balance_loss_clip": 1.01689959, "balance_loss_mlp": 1.03419507, "epoch": 0.8754546820983016, "flos": 30914885410560.0, "grad_norm": 2.6272510466502466, "language_loss": 0.79524416, "learning_rate": 1.6044844334011541e-07, "loss": 0.81655586, "num_input_tokens_seen": 313987580, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 14561, "time_per_iteration": 2.505171060562134 }, { "auxiliary_loss_clip": 0.01105355, "auxiliary_loss_mlp": 0.01036093, "balance_loss_clip": 1.02243721, "balance_loss_mlp": 1.03387582, "epoch": 0.8755148053509695, "flos": 20631075724800.0, "grad_norm": 2.3950598295660686, "language_loss": 0.77875471, "learning_rate": 1.6029563620756982e-07, "loss": 0.80016923, "num_input_tokens_seen": 314004460, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71484375, "step": 14562, "time_per_iteration": 2.438887357711792 }, { "auxiliary_loss_clip": 0.01098561, "auxiliary_loss_mlp": 0.01028248, "balance_loss_clip": 1.01723886, "balance_loss_mlp": 1.03356111, "epoch": 0.8755749286036375, "flos": 34969739005440.0, "grad_norm": 1.4355825175016281, "language_loss": 0.71574616, "learning_rate": 1.601428988367981e-07, "loss": 0.73701423, "num_input_tokens_seen": 314026855, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6484375, "step": 14563, "time_per_iteration": 2.5562360286712646 }, { "auxiliary_loss_clip": 0.01108664, "auxiliary_loss_mlp": 0.01032298, "balance_loss_clip": 1.01963234, "balance_loss_mlp": 1.03740466, "epoch": 0.8756350518563054, "flos": 18186456925440.0, "grad_norm": 2.2241115040330826, "language_loss": 0.65465009, "learning_rate": 1.5999023123359235e-07, "loss": 0.67605972, "num_input_tokens_seen": 314042830, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 14564, "time_per_iteration": 2.4324610233306885 }, { "auxiliary_loss_clip": 0.01102391, "auxiliary_loss_mlp": 0.01036238, "balance_loss_clip": 1.02425718, "balance_loss_mlp": 1.03350747, "epoch": 0.8756951751089734, "flos": 20084012611200.0, "grad_norm": 2.3895900388018148, "language_loss": 0.70452309, "learning_rate": 1.598376334037408e-07, "loss": 0.72590941, "num_input_tokens_seen": 314062225, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 14565, "time_per_iteration": 2.4301841259002686 }, { "auxiliary_loss_clip": 0.0111151, "auxiliary_loss_mlp": 0.01034789, "balance_loss_clip": 1.02138948, "balance_loss_mlp": 1.03739071, "epoch": 0.8757552983616413, "flos": 27525241739520.0, "grad_norm": 1.6090082134801047, "language_loss": 0.77652705, "learning_rate": 1.5968510535303102e-07, "loss": 0.79799008, "num_input_tokens_seen": 314082325, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7421875, "step": 14566, "time_per_iteration": 2.5104174613952637 }, { "auxiliary_loss_clip": 0.01107628, "auxiliary_loss_mlp": 0.01032571, "balance_loss_clip": 1.02034068, "balance_loss_mlp": 1.03867745, "epoch": 0.8758154216143094, "flos": 18073014796800.0, "grad_norm": 2.2542454576442954, "language_loss": 0.71319145, "learning_rate": 1.5953264708724624e-07, "loss": 0.73459351, "num_input_tokens_seen": 314100310, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 14567, "time_per_iteration": 2.440394878387451 }, { "auxiliary_loss_clip": 0.01103876, "auxiliary_loss_mlp": 0.01033791, "balance_loss_clip": 1.02092242, "balance_loss_mlp": 1.03519177, "epoch": 0.8758755448669773, "flos": 25045681985280.0, "grad_norm": 1.7504969060355247, "language_loss": 0.74368882, "learning_rate": 1.5938025861216776e-07, "loss": 0.76506543, "num_input_tokens_seen": 314121330, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 14568, "time_per_iteration": 2.492316246032715 }, { "auxiliary_loss_clip": 0.01102052, "auxiliary_loss_mlp": 0.0102785, "balance_loss_clip": 1.01647758, "balance_loss_mlp": 1.0342629, "epoch": 0.8759356681196453, "flos": 22856818999680.0, "grad_norm": 2.0954070256093145, "language_loss": 0.86797875, "learning_rate": 1.5922793993357475e-07, "loss": 0.88927776, "num_input_tokens_seen": 314139875, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6796875, "step": 14569, "time_per_iteration": 2.4754977226257324 }, { "auxiliary_loss_clip": 0.01104112, "auxiliary_loss_mlp": 0.01027585, "balance_loss_clip": 1.01582503, "balance_loss_mlp": 1.03394222, "epoch": 0.8759957913723132, "flos": 21032521102080.0, "grad_norm": 4.167306380527241, "language_loss": 0.73871177, "learning_rate": 1.5907569105724284e-07, "loss": 0.76002878, "num_input_tokens_seen": 314157850, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.703125, "step": 14570, "time_per_iteration": 2.4538979530334473 }, { "auxiliary_loss_clip": 0.01107547, "auxiliary_loss_mlp": 0.01033199, "balance_loss_clip": 1.02008021, "balance_loss_mlp": 1.03600931, "epoch": 0.8760559146249812, "flos": 20010467514240.0, "grad_norm": 1.6705838815997531, "language_loss": 0.6757772, "learning_rate": 1.5892351198894472e-07, "loss": 0.69718468, "num_input_tokens_seen": 314176720, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 14571, "time_per_iteration": 2.466662645339966 }, { "auxiliary_loss_clip": 0.01101521, "auxiliary_loss_mlp": 0.01029536, "balance_loss_clip": 1.01769221, "balance_loss_mlp": 1.03395128, "epoch": 0.8761160378776491, "flos": 19974161842560.0, "grad_norm": 2.626993564517021, "language_loss": 0.62773383, "learning_rate": 1.5877140273445156e-07, "loss": 0.64904439, "num_input_tokens_seen": 314196645, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.67578125, "step": 14572, "time_per_iteration": 2.462125062942505 }, { "auxiliary_loss_clip": 0.01100821, "auxiliary_loss_mlp": 0.01030052, "balance_loss_clip": 1.01888824, "balance_loss_mlp": 1.03432024, "epoch": 0.8761761611303172, "flos": 28804415857920.0, "grad_norm": 4.992471799102764, "language_loss": 0.73538411, "learning_rate": 1.5861936329953162e-07, "loss": 0.75669289, "num_input_tokens_seen": 314217430, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6640625, "step": 14573, "time_per_iteration": 2.5048301219940186 }, { "auxiliary_loss_clip": 0.01100855, "auxiliary_loss_mlp": 0.01031917, "balance_loss_clip": 1.02118182, "balance_loss_mlp": 1.0348227, "epoch": 0.8762362843829851, "flos": 18332505624960.0, "grad_norm": 2.1997572988292764, "language_loss": 0.72925234, "learning_rate": 1.5846739368994966e-07, "loss": 0.75058007, "num_input_tokens_seen": 314235310, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.66015625, "step": 14574, "time_per_iteration": 2.4637744426727295 }, { "auxiliary_loss_clip": 0.01101875, "auxiliary_loss_mlp": 0.01033924, "balance_loss_clip": 1.02172852, "balance_loss_mlp": 1.03381932, "epoch": 0.8762964076356531, "flos": 15779149378560.0, "grad_norm": 2.5249653229221094, "language_loss": 0.75983983, "learning_rate": 1.5831549391146903e-07, "loss": 0.78119779, "num_input_tokens_seen": 314252355, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 14575, "time_per_iteration": 2.4467828273773193 }, { "auxiliary_loss_clip": 0.01104101, "auxiliary_loss_mlp": 0.01036183, "balance_loss_clip": 1.02434587, "balance_loss_mlp": 1.03676999, "epoch": 0.8763565308883211, "flos": 33176754789120.0, "grad_norm": 3.0686603456255552, "language_loss": 0.66876578, "learning_rate": 1.5816366396984916e-07, "loss": 0.69016856, "num_input_tokens_seen": 314272755, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.67578125, "step": 14576, "time_per_iteration": 2.5522401332855225 }, { "auxiliary_loss_clip": 0.01102029, "auxiliary_loss_mlp": 0.01029631, "balance_loss_clip": 1.01821721, "balance_loss_mlp": 1.03388226, "epoch": 0.876416654140989, "flos": 15888102307200.0, "grad_norm": 33.94911732422002, "language_loss": 0.67215192, "learning_rate": 1.5801190387084806e-07, "loss": 0.69346857, "num_input_tokens_seen": 314291365, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 14577, "time_per_iteration": 2.4136459827423096 }, { "auxiliary_loss_clip": 0.01106167, "auxiliary_loss_mlp": 0.01032367, "balance_loss_clip": 1.01930809, "balance_loss_mlp": 1.03663027, "epoch": 0.876476777393657, "flos": 25885237547520.0, "grad_norm": 2.377513043107719, "language_loss": 0.71079546, "learning_rate": 1.5786021362021962e-07, "loss": 0.73218083, "num_input_tokens_seen": 314310075, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 14578, "time_per_iteration": 2.487355947494507 }, { "auxiliary_loss_clip": 0.01105606, "auxiliary_loss_mlp": 0.01031865, "balance_loss_clip": 1.0195744, "balance_loss_mlp": 1.03488302, "epoch": 0.876536900646325, "flos": 13589675861760.0, "grad_norm": 2.0418010420126356, "language_loss": 0.71873462, "learning_rate": 1.5770859322371676e-07, "loss": 0.74010932, "num_input_tokens_seen": 314325695, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 14579, "time_per_iteration": 2.398510456085205 }, { "auxiliary_loss_clip": 0.01099404, "auxiliary_loss_mlp": 0.01028966, "balance_loss_clip": 1.01746273, "balance_loss_mlp": 1.03401709, "epoch": 0.876597023898993, "flos": 12203344494720.0, "grad_norm": 1.7740121061798602, "language_loss": 0.7024197, "learning_rate": 1.5755704268708912e-07, "loss": 0.72370338, "num_input_tokens_seen": 314343605, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.65625, "step": 14580, "time_per_iteration": 2.4414989948272705 }, { "auxiliary_loss_clip": 0.01102688, "auxiliary_loss_mlp": 0.01028132, "balance_loss_clip": 1.01656246, "balance_loss_mlp": 1.03635943, "epoch": 0.8766571471516609, "flos": 25336773803520.0, "grad_norm": 1.8941073413352798, "language_loss": 0.65791178, "learning_rate": 1.5740556201608256e-07, "loss": 0.67921996, "num_input_tokens_seen": 314364275, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6640625, "step": 14581, "time_per_iteration": 2.4784748554229736 }, { "auxiliary_loss_clip": 0.01101251, "auxiliary_loss_mlp": 0.01028193, "balance_loss_clip": 1.01634932, "balance_loss_mlp": 1.03462648, "epoch": 0.8767172704043289, "flos": 30113287545600.0, "grad_norm": 1.6685233559373682, "language_loss": 0.73617101, "learning_rate": 1.572541512164416e-07, "loss": 0.75746536, "num_input_tokens_seen": 314385140, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6640625, "step": 14582, "time_per_iteration": 3.953508138656616 }, { "auxiliary_loss_clip": 0.01103286, "auxiliary_loss_mlp": 0.01031272, "balance_loss_clip": 1.01870728, "balance_loss_mlp": 1.03450894, "epoch": 0.8767773936569968, "flos": 19281157770240.0, "grad_norm": 2.293136049485445, "language_loss": 0.6683321, "learning_rate": 1.5710281029390826e-07, "loss": 0.6896776, "num_input_tokens_seen": 314403715, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 14583, "time_per_iteration": 2.4406092166900635 }, { "auxiliary_loss_clip": 0.01105717, "auxiliary_loss_mlp": 0.0102741, "balance_loss_clip": 1.01520872, "balance_loss_mlp": 1.03490829, "epoch": 0.8768375169096648, "flos": 21247230648960.0, "grad_norm": 2.3022970872251505, "language_loss": 0.79452729, "learning_rate": 1.5695153925422067e-07, "loss": 0.8158586, "num_input_tokens_seen": 314421880, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.70703125, "step": 14584, "time_per_iteration": 2.4699480533599854 }, { "auxiliary_loss_clip": 0.01104794, "auxiliary_loss_mlp": 0.01028258, "balance_loss_clip": 1.01584864, "balance_loss_mlp": 1.03455496, "epoch": 0.8768976401623327, "flos": 23295539715840.0, "grad_norm": 1.6382919291517368, "language_loss": 0.72359461, "learning_rate": 1.5680033810311555e-07, "loss": 0.74492514, "num_input_tokens_seen": 314441585, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 14585, "time_per_iteration": 3.9488420486450195 }, { "auxiliary_loss_clip": 0.01106021, "auxiliary_loss_mlp": 0.01031367, "balance_loss_clip": 1.01859975, "balance_loss_mlp": 1.03656602, "epoch": 0.8769577634150008, "flos": 21361247395200.0, "grad_norm": 2.4972761341421026, "language_loss": 0.74361783, "learning_rate": 1.5664920684632654e-07, "loss": 0.7649917, "num_input_tokens_seen": 314459020, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 14586, "time_per_iteration": 2.4378247261047363 }, { "auxiliary_loss_clip": 0.01101051, "auxiliary_loss_mlp": 0.01031401, "balance_loss_clip": 1.01839495, "balance_loss_mlp": 1.03281116, "epoch": 0.8770178866676687, "flos": 23514056104320.0, "grad_norm": 1.9049888026006165, "language_loss": 0.78739715, "learning_rate": 1.564981454895844e-07, "loss": 0.80872166, "num_input_tokens_seen": 314478935, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.68359375, "step": 14587, "time_per_iteration": 3.853058099746704 }, { "auxiliary_loss_clip": 0.01104657, "auxiliary_loss_mlp": 0.0103069, "balance_loss_clip": 1.01702261, "balance_loss_mlp": 1.03560352, "epoch": 0.8770780099203367, "flos": 19719052473600.0, "grad_norm": 1.7390494545269972, "language_loss": 0.74153644, "learning_rate": 1.5634715403861697e-07, "loss": 0.76288986, "num_input_tokens_seen": 314497635, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.69140625, "step": 14588, "time_per_iteration": 3.8903417587280273 }, { "auxiliary_loss_clip": 0.01101611, "auxiliary_loss_mlp": 0.01032016, "balance_loss_clip": 1.02019048, "balance_loss_mlp": 1.03387594, "epoch": 0.8771381331730047, "flos": 21395901041280.0, "grad_norm": 1.9330377245670123, "language_loss": 0.66913354, "learning_rate": 1.5619623249915016e-07, "loss": 0.6904698, "num_input_tokens_seen": 314515445, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.67578125, "step": 14589, "time_per_iteration": 2.444108009338379 }, { "auxiliary_loss_clip": 0.01105295, "auxiliary_loss_mlp": 0.01031451, "balance_loss_clip": 1.01966751, "balance_loss_mlp": 1.03603077, "epoch": 0.8771982564256726, "flos": 20261770041600.0, "grad_norm": 2.2644340703231896, "language_loss": 0.70786375, "learning_rate": 1.5604538087690732e-07, "loss": 0.72923124, "num_input_tokens_seen": 314533040, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 14590, "time_per_iteration": 2.4572465419769287 }, { "auxiliary_loss_clip": 0.01110923, "auxiliary_loss_mlp": 0.01036132, "balance_loss_clip": 1.02264965, "balance_loss_mlp": 1.03676748, "epoch": 0.8772583796783406, "flos": 12489372495360.0, "grad_norm": 2.0789220762061618, "language_loss": 0.74479926, "learning_rate": 1.558945991776086e-07, "loss": 0.76626986, "num_input_tokens_seen": 314548280, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7421875, "step": 14591, "time_per_iteration": 2.4777920246124268 }, { "auxiliary_loss_clip": 0.01098815, "auxiliary_loss_mlp": 0.01027661, "balance_loss_clip": 1.01622856, "balance_loss_mlp": 1.03408337, "epoch": 0.8773185029310085, "flos": 15921103927680.0, "grad_norm": 1.6361838164449993, "language_loss": 0.79825032, "learning_rate": 1.5574388740697096e-07, "loss": 0.81951505, "num_input_tokens_seen": 314565345, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6484375, "step": 14592, "time_per_iteration": 2.4297561645507812 }, { "auxiliary_loss_clip": 0.0110094, "auxiliary_loss_mlp": 0.01030601, "balance_loss_clip": 1.01922202, "balance_loss_mlp": 1.03473401, "epoch": 0.8773786261836766, "flos": 21504530747520.0, "grad_norm": 1.6602393607173176, "language_loss": 0.82679105, "learning_rate": 1.5559324557071052e-07, "loss": 0.84810644, "num_input_tokens_seen": 314584190, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6640625, "step": 14593, "time_per_iteration": 2.4801437854766846 }, { "auxiliary_loss_clip": 0.01101733, "auxiliary_loss_mlp": 0.01026167, "balance_loss_clip": 1.0143292, "balance_loss_mlp": 1.03486323, "epoch": 0.8774387494363445, "flos": 26761493831040.0, "grad_norm": 1.6557184957163635, "language_loss": 0.76236355, "learning_rate": 1.5544267367453845e-07, "loss": 0.78364253, "num_input_tokens_seen": 314605625, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.66796875, "step": 14594, "time_per_iteration": 2.501987934112549 }, { "auxiliary_loss_clip": 0.01105736, "auxiliary_loss_mlp": 0.01032663, "balance_loss_clip": 1.02014005, "balance_loss_mlp": 1.03497982, "epoch": 0.8774988726890125, "flos": 18478841633280.0, "grad_norm": 2.315184242927002, "language_loss": 0.77519441, "learning_rate": 1.552921717241651e-07, "loss": 0.79657841, "num_input_tokens_seen": 314622630, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 14595, "time_per_iteration": 2.423994541168213 }, { "auxiliary_loss_clip": 0.0110447, "auxiliary_loss_mlp": 0.01037975, "balance_loss_clip": 1.02516007, "balance_loss_mlp": 1.03615165, "epoch": 0.8775589959416804, "flos": 24426366664320.0, "grad_norm": 1.7841999752211908, "language_loss": 0.70743287, "learning_rate": 1.5514173972529743e-07, "loss": 0.72885728, "num_input_tokens_seen": 314642460, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.68359375, "step": 14596, "time_per_iteration": 2.5191667079925537 }, { "auxiliary_loss_clip": 0.01104861, "auxiliary_loss_mlp": 0.01023444, "balance_loss_clip": 1.01178575, "balance_loss_mlp": 1.03610516, "epoch": 0.8776191191943484, "flos": 23440151871360.0, "grad_norm": 2.171469657596724, "language_loss": 0.85892898, "learning_rate": 1.5499137768364067e-07, "loss": 0.88021207, "num_input_tokens_seen": 314659875, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6875, "step": 14597, "time_per_iteration": 2.4630844593048096 }, { "auxiliary_loss_clip": 0.01103911, "auxiliary_loss_mlp": 0.0102837, "balance_loss_clip": 1.01672864, "balance_loss_mlp": 1.03515887, "epoch": 0.8776792424470163, "flos": 26830872950400.0, "grad_norm": 1.7616205245565502, "language_loss": 0.72904807, "learning_rate": 1.5484108560489494e-07, "loss": 0.75037086, "num_input_tokens_seen": 314680260, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6875, "step": 14598, "time_per_iteration": 2.5342366695404053 }, { "auxiliary_loss_clip": 0.0110645, "auxiliary_loss_mlp": 0.01032092, "balance_loss_clip": 1.02005756, "balance_loss_mlp": 1.03711677, "epoch": 0.8777393656996844, "flos": 15626169354240.0, "grad_norm": 2.2031700733775303, "language_loss": 0.77718848, "learning_rate": 1.5469086349476036e-07, "loss": 0.79857397, "num_input_tokens_seen": 314696260, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6953125, "step": 14599, "time_per_iteration": 2.395477056503296 }, { "auxiliary_loss_clip": 0.01104204, "auxiliary_loss_mlp": 0.01028457, "balance_loss_clip": 1.01700091, "balance_loss_mlp": 1.0350728, "epoch": 0.8777994889523523, "flos": 18879999701760.0, "grad_norm": 2.0516702564674, "language_loss": 0.67915368, "learning_rate": 1.545407113589332e-07, "loss": 0.70048028, "num_input_tokens_seen": 314714215, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.69140625, "step": 14600, "time_per_iteration": 2.473447322845459 }, { "auxiliary_loss_clip": 0.01105229, "auxiliary_loss_mlp": 0.01036468, "balance_loss_clip": 1.02377868, "balance_loss_mlp": 1.03516603, "epoch": 0.8778596122050203, "flos": 48826516400640.0, "grad_norm": 1.8961927137049253, "language_loss": 0.69716632, "learning_rate": 1.543906292031072e-07, "loss": 0.71858329, "num_input_tokens_seen": 314735700, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 14601, "time_per_iteration": 2.6704020500183105 }, { "auxiliary_loss_clip": 0.01107726, "auxiliary_loss_mlp": 0.01032272, "balance_loss_clip": 1.01979077, "balance_loss_mlp": 1.03573227, "epoch": 0.8779197354576883, "flos": 25660184883840.0, "grad_norm": 2.753748601273318, "language_loss": 0.73172426, "learning_rate": 1.542406170329733e-07, "loss": 0.75312424, "num_input_tokens_seen": 314753335, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.71875, "step": 14602, "time_per_iteration": 2.5194976329803467 }, { "auxiliary_loss_clip": 0.0110127, "auxiliary_loss_mlp": 0.01030774, "balance_loss_clip": 1.01924062, "balance_loss_mlp": 1.03383267, "epoch": 0.8779798587103562, "flos": 18843227153280.0, "grad_norm": 1.9398702027880574, "language_loss": 0.70954835, "learning_rate": 1.5409067485422056e-07, "loss": 0.73086882, "num_input_tokens_seen": 314770800, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.67578125, "step": 14603, "time_per_iteration": 2.4502625465393066 }, { "auxiliary_loss_clip": 0.01028237, "auxiliary_loss_mlp": 0.01003598, "balance_loss_clip": 1.0026145, "balance_loss_mlp": 1.00600052, "epoch": 0.8780399819630242, "flos": 68613119377920.0, "grad_norm": 0.7462697971314336, "language_loss": 0.54153621, "learning_rate": 1.539408026725344e-07, "loss": 0.56185448, "num_input_tokens_seen": 314837275, "router_z_loss_clip": 0.00982666, "router_z_loss_mlp": 0.22265625, "step": 14604, "time_per_iteration": 3.089872360229492 }, { "auxiliary_loss_clip": 0.01028438, "auxiliary_loss_mlp": 0.01002698, "balance_loss_clip": 1.00170231, "balance_loss_mlp": 1.00623786, "epoch": 0.8781001052156922, "flos": 65734807766400.0, "grad_norm": 0.7069207810544398, "language_loss": 0.59212232, "learning_rate": 1.537910004935976e-07, "loss": 0.61243367, "num_input_tokens_seen": 314902220, "router_z_loss_clip": 0.00994873, "router_z_loss_mlp": 0.22265625, "step": 14605, "time_per_iteration": 3.0834360122680664 }, { "auxiliary_loss_clip": 0.01106102, "auxiliary_loss_mlp": 0.01032608, "balance_loss_clip": 1.02011514, "balance_loss_mlp": 1.03586805, "epoch": 0.8781602284683602, "flos": 22049654526720.0, "grad_norm": 1.9809639344271688, "language_loss": 0.85210788, "learning_rate": 1.536412683230912e-07, "loss": 0.87349498, "num_input_tokens_seen": 314921645, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 14606, "time_per_iteration": 2.4803988933563232 }, { "auxiliary_loss_clip": 0.01107281, "auxiliary_loss_mlp": 0.01028946, "balance_loss_clip": 1.01586258, "balance_loss_mlp": 1.03663433, "epoch": 0.8782203517210281, "flos": 17562939713280.0, "grad_norm": 2.0365287668634102, "language_loss": 0.70584488, "learning_rate": 1.534916061666931e-07, "loss": 0.72720712, "num_input_tokens_seen": 314939390, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 14607, "time_per_iteration": 2.4153027534484863 }, { "auxiliary_loss_clip": 0.01102338, "auxiliary_loss_mlp": 0.01037539, "balance_loss_clip": 1.0264287, "balance_loss_mlp": 1.0352037, "epoch": 0.8782804749736961, "flos": 25520421064320.0, "grad_norm": 1.9132057975108887, "language_loss": 0.72325146, "learning_rate": 1.533420140300785e-07, "loss": 0.74465024, "num_input_tokens_seen": 314959205, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.671875, "step": 14608, "time_per_iteration": 2.5094289779663086 }, { "auxiliary_loss_clip": 0.01109097, "auxiliary_loss_mlp": 0.01033917, "balance_loss_clip": 1.02153695, "balance_loss_mlp": 1.03630769, "epoch": 0.878340598226364, "flos": 21798747048960.0, "grad_norm": 2.4428893361438098, "language_loss": 0.87658739, "learning_rate": 1.5319249191891936e-07, "loss": 0.89801753, "num_input_tokens_seen": 314977485, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7265625, "step": 14609, "time_per_iteration": 2.453545331954956 }, { "auxiliary_loss_clip": 0.01105649, "auxiliary_loss_mlp": 0.0103019, "balance_loss_clip": 1.01809025, "balance_loss_mlp": 1.03683829, "epoch": 0.878400721479032, "flos": 21102403011840.0, "grad_norm": 1.8777271428929987, "language_loss": 0.70397067, "learning_rate": 1.5304303983888643e-07, "loss": 0.7253291, "num_input_tokens_seen": 314997830, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 14610, "time_per_iteration": 2.467329978942871 }, { "auxiliary_loss_clip": 0.01102204, "auxiliary_loss_mlp": 0.01030935, "balance_loss_clip": 1.01928234, "balance_loss_mlp": 1.03569138, "epoch": 0.8784608447316999, "flos": 20923532259840.0, "grad_norm": 2.5033516362986803, "language_loss": 0.80704349, "learning_rate": 1.5289365779564612e-07, "loss": 0.82837486, "num_input_tokens_seen": 315016480, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6640625, "step": 14611, "time_per_iteration": 2.4437291622161865 }, { "auxiliary_loss_clip": 0.0110524, "auxiliary_loss_mlp": 0.0103396, "balance_loss_clip": 1.02150309, "balance_loss_mlp": 1.03587985, "epoch": 0.878520967984368, "flos": 23330660238720.0, "grad_norm": 1.8950582689567776, "language_loss": 0.7679038, "learning_rate": 1.5274434579486338e-07, "loss": 0.78929579, "num_input_tokens_seen": 315036135, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69140625, "step": 14612, "time_per_iteration": 2.469264507293701 }, { "auxiliary_loss_clip": 0.0110271, "auxiliary_loss_mlp": 0.01033844, "balance_loss_clip": 1.02185154, "balance_loss_mlp": 1.03516507, "epoch": 0.8785810912370359, "flos": 25518984520320.0, "grad_norm": 1.5938976297860674, "language_loss": 0.72517145, "learning_rate": 1.525951038422002e-07, "loss": 0.74653697, "num_input_tokens_seen": 315057995, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.67578125, "step": 14613, "time_per_iteration": 2.4783194065093994 }, { "auxiliary_loss_clip": 0.01028536, "auxiliary_loss_mlp": 0.0100141, "balance_loss_clip": 1.00037336, "balance_loss_mlp": 1.00617862, "epoch": 0.8786412144897039, "flos": 61841047691520.0, "grad_norm": 1.028884502344467, "language_loss": 0.64590305, "learning_rate": 1.5244593194331667e-07, "loss": 0.66620255, "num_input_tokens_seen": 315104010, "router_z_loss_clip": 0.01037598, "router_z_loss_mlp": 0.22363281, "step": 14614, "time_per_iteration": 2.852465867996216 }, { "auxiliary_loss_clip": 0.01028865, "auxiliary_loss_mlp": 0.0100326, "balance_loss_clip": 1.00218725, "balance_loss_mlp": 1.00668573, "epoch": 0.8787013377423719, "flos": 70989364638720.0, "grad_norm": 0.6783522316646887, "language_loss": 0.58534002, "learning_rate": 1.5229683010386762e-07, "loss": 0.60566127, "num_input_tokens_seen": 315174550, "router_z_loss_clip": 0.01074219, "router_z_loss_mlp": 0.22167969, "step": 14615, "time_per_iteration": 3.172259569168091 }, { "auxiliary_loss_clip": 0.01103106, "auxiliary_loss_mlp": 0.01027971, "balance_loss_clip": 1.01635981, "balance_loss_mlp": 1.03378248, "epoch": 0.8787614609950398, "flos": 17347404153600.0, "grad_norm": 2.0061875026854006, "language_loss": 0.73069584, "learning_rate": 1.5214779832950807e-07, "loss": 0.75200653, "num_input_tokens_seen": 315191825, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 14616, "time_per_iteration": 2.425454616546631 }, { "auxiliary_loss_clip": 0.01028826, "auxiliary_loss_mlp": 0.01001078, "balance_loss_clip": 0.99999923, "balance_loss_mlp": 1.00635517, "epoch": 0.8788215842477078, "flos": 72511401588480.0, "grad_norm": 0.8253241483145186, "language_loss": 0.57977289, "learning_rate": 1.5199883662588953e-07, "loss": 0.60007191, "num_input_tokens_seen": 315255075, "router_z_loss_clip": 0.01080322, "router_z_loss_mlp": 0.22460938, "step": 14617, "time_per_iteration": 3.1914803981781006 }, { "auxiliary_loss_clip": 0.01101436, "auxiliary_loss_mlp": 0.01029116, "balance_loss_clip": 1.01735008, "balance_loss_mlp": 1.03469813, "epoch": 0.8788817075003758, "flos": 24827452905600.0, "grad_norm": 2.3207772881743747, "language_loss": 0.83648109, "learning_rate": 1.5184994499865987e-07, "loss": 0.85778666, "num_input_tokens_seen": 315273995, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.66796875, "step": 14618, "time_per_iteration": 2.478407144546509 }, { "auxiliary_loss_clip": 0.01099655, "auxiliary_loss_mlp": 0.01027014, "balance_loss_clip": 1.01563549, "balance_loss_mlp": 1.03513503, "epoch": 0.8789418307530438, "flos": 22638769488000.0, "grad_norm": 1.784401496008984, "language_loss": 0.69323111, "learning_rate": 1.5170112345346598e-07, "loss": 0.7144978, "num_input_tokens_seen": 315294485, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.64453125, "step": 14619, "time_per_iteration": 2.4773902893066406 }, { "auxiliary_loss_clip": 0.01107028, "auxiliary_loss_mlp": 0.01034027, "balance_loss_clip": 1.022017, "balance_loss_mlp": 1.03605413, "epoch": 0.8790019540057117, "flos": 19785738072960.0, "grad_norm": 1.993061482213491, "language_loss": 0.77346009, "learning_rate": 1.5155237199595016e-07, "loss": 0.79487062, "num_input_tokens_seen": 315310420, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 14620, "time_per_iteration": 2.425504684448242 }, { "auxiliary_loss_clip": 0.01106525, "auxiliary_loss_mlp": 0.01027198, "balance_loss_clip": 1.01381707, "balance_loss_mlp": 1.03650093, "epoch": 0.8790620772583797, "flos": 20229774001920.0, "grad_norm": 1.9408701550334264, "language_loss": 0.79376078, "learning_rate": 1.514036906317542e-07, "loss": 0.81509805, "num_input_tokens_seen": 315330110, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.69921875, "step": 14621, "time_per_iteration": 2.460341453552246 }, { "auxiliary_loss_clip": 0.01106645, "auxiliary_loss_mlp": 0.01035514, "balance_loss_clip": 1.02293777, "balance_loss_mlp": 1.03509545, "epoch": 0.8791222005110476, "flos": 24130785646080.0, "grad_norm": 2.0168816383920984, "language_loss": 0.66377091, "learning_rate": 1.5125507936651506e-07, "loss": 0.68519253, "num_input_tokens_seen": 315350080, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 14622, "time_per_iteration": 2.4738945960998535 }, { "auxiliary_loss_clip": 0.01102782, "auxiliary_loss_mlp": 0.01036757, "balance_loss_clip": 1.025015, "balance_loss_mlp": 1.03473556, "epoch": 0.8791823237637156, "flos": 21614201948160.0, "grad_norm": 8.40170003736847, "language_loss": 0.73227406, "learning_rate": 1.511065382058687e-07, "loss": 0.7536695, "num_input_tokens_seen": 315366360, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 14623, "time_per_iteration": 3.9300074577331543 }, { "auxiliary_loss_clip": 0.01098308, "auxiliary_loss_mlp": 0.01031547, "balance_loss_clip": 1.01919055, "balance_loss_mlp": 1.03067899, "epoch": 0.8792424470163835, "flos": 24243401761920.0, "grad_norm": 1.6974083226920629, "language_loss": 0.784738, "learning_rate": 1.5095806715544801e-07, "loss": 0.80603659, "num_input_tokens_seen": 315385890, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.67578125, "step": 14624, "time_per_iteration": 2.4584827423095703 }, { "auxiliary_loss_clip": 0.01105939, "auxiliary_loss_mlp": 0.01036118, "balance_loss_clip": 1.02292728, "balance_loss_mlp": 1.03549361, "epoch": 0.8793025702690516, "flos": 24893204751360.0, "grad_norm": 1.9280725351103616, "language_loss": 0.79631746, "learning_rate": 1.5080966622088265e-07, "loss": 0.817738, "num_input_tokens_seen": 315403400, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 14625, "time_per_iteration": 2.480031728744507 }, { "auxiliary_loss_clip": 0.01102631, "auxiliary_loss_mlp": 0.01036904, "balance_loss_clip": 1.02516747, "balance_loss_mlp": 1.03593802, "epoch": 0.8793626935217195, "flos": 25373115388800.0, "grad_norm": 1.6715093359109192, "language_loss": 0.74270326, "learning_rate": 1.5066133540779967e-07, "loss": 0.76409864, "num_input_tokens_seen": 315423670, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 14626, "time_per_iteration": 2.479998826980591 }, { "auxiliary_loss_clip": 0.01105107, "auxiliary_loss_mlp": 0.01031714, "balance_loss_clip": 1.01940608, "balance_loss_mlp": 1.03379023, "epoch": 0.8794228167743875, "flos": 34678000742400.0, "grad_norm": 1.7787805471922746, "language_loss": 0.71046984, "learning_rate": 1.505130747218246e-07, "loss": 0.73183799, "num_input_tokens_seen": 315446265, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 14627, "time_per_iteration": 3.920769691467285 }, { "auxiliary_loss_clip": 0.01103581, "auxiliary_loss_mlp": 0.01030754, "balance_loss_clip": 1.01783204, "balance_loss_mlp": 1.03430796, "epoch": 0.8794829400270555, "flos": 19464014931840.0, "grad_norm": 1.8569890632724542, "language_loss": 0.72347915, "learning_rate": 1.5036488416857873e-07, "loss": 0.74482244, "num_input_tokens_seen": 315464655, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 14628, "time_per_iteration": 3.8317694664001465 }, { "auxiliary_loss_clip": 0.0110499, "auxiliary_loss_mlp": 0.01034675, "balance_loss_clip": 1.02205634, "balance_loss_mlp": 1.03584504, "epoch": 0.8795430632797234, "flos": 15231403906560.0, "grad_norm": 2.758656348355426, "language_loss": 0.68809402, "learning_rate": 1.5021676375368175e-07, "loss": 0.70949066, "num_input_tokens_seen": 315481090, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 14629, "time_per_iteration": 3.891347646713257 }, { "auxiliary_loss_clip": 0.01099821, "auxiliary_loss_mlp": 0.0103257, "balance_loss_clip": 1.02111948, "balance_loss_mlp": 1.03283489, "epoch": 0.8796031865323914, "flos": 27744727795200.0, "grad_norm": 1.800692209584024, "language_loss": 0.68353713, "learning_rate": 1.5006871348275053e-07, "loss": 0.70486104, "num_input_tokens_seen": 315502010, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.66796875, "step": 14630, "time_per_iteration": 2.522947072982788 }, { "auxiliary_loss_clip": 0.01100285, "auxiliary_loss_mlp": 0.01035912, "balance_loss_clip": 1.02327013, "balance_loss_mlp": 1.03444862, "epoch": 0.8796633097850594, "flos": 31285412156160.0, "grad_norm": 1.5812075059908732, "language_loss": 0.74397653, "learning_rate": 1.499207333613999e-07, "loss": 0.76533854, "num_input_tokens_seen": 315523040, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.65625, "step": 14631, "time_per_iteration": 2.5485622882843018 }, { "auxiliary_loss_clip": 0.01100193, "auxiliary_loss_mlp": 0.01032498, "balance_loss_clip": 1.02071452, "balance_loss_mlp": 1.03471243, "epoch": 0.8797234330377274, "flos": 24243150366720.0, "grad_norm": 2.858440953004644, "language_loss": 0.69558597, "learning_rate": 1.4977282339523954e-07, "loss": 0.71691287, "num_input_tokens_seen": 315541865, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.65625, "step": 14632, "time_per_iteration": 2.4642350673675537 }, { "auxiliary_loss_clip": 0.01105707, "auxiliary_loss_mlp": 0.01032409, "balance_loss_clip": 1.02096486, "balance_loss_mlp": 1.03678334, "epoch": 0.8797835562903953, "flos": 24167414540160.0, "grad_norm": 3.1268187955505065, "language_loss": 0.64979088, "learning_rate": 1.4962498358987929e-07, "loss": 0.67117208, "num_input_tokens_seen": 315561470, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6875, "step": 14633, "time_per_iteration": 2.477360486984253 }, { "auxiliary_loss_clip": 0.01103668, "auxiliary_loss_mlp": 0.01036475, "balance_loss_clip": 1.02425015, "balance_loss_mlp": 1.0354445, "epoch": 0.8798436795430633, "flos": 19284677303040.0, "grad_norm": 1.6274818501871213, "language_loss": 0.84276736, "learning_rate": 1.4947721395092528e-07, "loss": 0.86416876, "num_input_tokens_seen": 315583140, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 14634, "time_per_iteration": 2.488762378692627 }, { "auxiliary_loss_clip": 0.01103269, "auxiliary_loss_mlp": 0.01036786, "balance_loss_clip": 1.02410769, "balance_loss_mlp": 1.03437233, "epoch": 0.8799038027957312, "flos": 28179390274560.0, "grad_norm": 1.6851706785591138, "language_loss": 0.79881573, "learning_rate": 1.4932951448398056e-07, "loss": 0.82021624, "num_input_tokens_seen": 315601935, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 14635, "time_per_iteration": 2.5097596645355225 }, { "auxiliary_loss_clip": 0.01104656, "auxiliary_loss_mlp": 0.01024611, "balance_loss_clip": 1.01285696, "balance_loss_mlp": 1.03558373, "epoch": 0.8799639260483992, "flos": 24644703484800.0, "grad_norm": 2.2035330235330455, "language_loss": 0.65366733, "learning_rate": 1.4918188519464648e-07, "loss": 0.67496002, "num_input_tokens_seen": 315619995, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.69140625, "step": 14636, "time_per_iteration": 2.4959120750427246 }, { "auxiliary_loss_clip": 0.0110533, "auxiliary_loss_mlp": 0.01034829, "balance_loss_clip": 1.02216291, "balance_loss_mlp": 1.03589404, "epoch": 0.8800240493010671, "flos": 22200479735040.0, "grad_norm": 1.5118885477298476, "language_loss": 0.7054882, "learning_rate": 1.4903432608852074e-07, "loss": 0.72688985, "num_input_tokens_seen": 315637895, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 14637, "time_per_iteration": 2.462924003601074 }, { "auxiliary_loss_clip": 0.01107352, "auxiliary_loss_mlp": 0.01030169, "balance_loss_clip": 1.01827765, "balance_loss_mlp": 1.03884661, "epoch": 0.8800841725537352, "flos": 14246086953600.0, "grad_norm": 2.013693528971756, "language_loss": 0.66120678, "learning_rate": 1.4888683717119843e-07, "loss": 0.68258202, "num_input_tokens_seen": 315655520, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 14638, "time_per_iteration": 2.3996145725250244 }, { "auxiliary_loss_clip": 0.0110492, "auxiliary_loss_mlp": 0.01029285, "balance_loss_clip": 1.01714325, "balance_loss_mlp": 1.03571153, "epoch": 0.8801442958064031, "flos": 37415794348800.0, "grad_norm": 1.6283147286810493, "language_loss": 0.58298445, "learning_rate": 1.4873941844827286e-07, "loss": 0.60432649, "num_input_tokens_seen": 315678955, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6953125, "step": 14639, "time_per_iteration": 2.600264072418213 }, { "auxiliary_loss_clip": 0.0110499, "auxiliary_loss_mlp": 0.01034717, "balance_loss_clip": 1.02193809, "balance_loss_mlp": 1.03509212, "epoch": 0.8802044190590711, "flos": 25047334010880.0, "grad_norm": 1.4974668211272888, "language_loss": 0.74550176, "learning_rate": 1.4859206992533402e-07, "loss": 0.76689881, "num_input_tokens_seen": 315700360, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 14640, "time_per_iteration": 2.508209466934204 }, { "auxiliary_loss_clip": 0.01104336, "auxiliary_loss_mlp": 0.01035456, "balance_loss_clip": 1.02311814, "balance_loss_mlp": 1.03502679, "epoch": 0.8802645423117391, "flos": 24133874215680.0, "grad_norm": 2.936916073996092, "language_loss": 0.69553488, "learning_rate": 1.4844479160796985e-07, "loss": 0.71693277, "num_input_tokens_seen": 315719270, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 14641, "time_per_iteration": 2.465754985809326 }, { "auxiliary_loss_clip": 0.01104904, "auxiliary_loss_mlp": 0.01028067, "balance_loss_clip": 1.01494193, "balance_loss_mlp": 1.03453457, "epoch": 0.880324665564407, "flos": 17931203902080.0, "grad_norm": 4.95755655071794, "language_loss": 0.85251951, "learning_rate": 1.4829758350176457e-07, "loss": 0.87384921, "num_input_tokens_seen": 315737425, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 14642, "time_per_iteration": 2.418720006942749 }, { "auxiliary_loss_clip": 0.01103864, "auxiliary_loss_mlp": 0.01034773, "balance_loss_clip": 1.02144575, "balance_loss_mlp": 1.03604293, "epoch": 0.880384788817075, "flos": 21287630471040.0, "grad_norm": 2.256757072251481, "language_loss": 0.78859055, "learning_rate": 1.4815044561230038e-07, "loss": 0.80997694, "num_input_tokens_seen": 315755725, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6796875, "step": 14643, "time_per_iteration": 2.454533576965332 }, { "auxiliary_loss_clip": 0.01097371, "auxiliary_loss_mlp": 0.01025774, "balance_loss_clip": 1.01454425, "balance_loss_mlp": 1.0321101, "epoch": 0.880444912069743, "flos": 12458489777280.0, "grad_norm": 1.5519891514222053, "language_loss": 0.7329402, "learning_rate": 1.4800337794515705e-07, "loss": 0.75417167, "num_input_tokens_seen": 315773835, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.65234375, "step": 14644, "time_per_iteration": 2.4110584259033203 }, { "auxiliary_loss_clip": 0.01106991, "auxiliary_loss_mlp": 0.01033901, "balance_loss_clip": 1.02119946, "balance_loss_mlp": 1.03545594, "epoch": 0.880505035322411, "flos": 13625945619840.0, "grad_norm": 2.687411824272091, "language_loss": 0.79293525, "learning_rate": 1.47856380505911e-07, "loss": 0.81434417, "num_input_tokens_seen": 315790615, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 14645, "time_per_iteration": 2.4326767921447754 }, { "auxiliary_loss_clip": 0.0110096, "auxiliary_loss_mlp": 0.01036312, "balance_loss_clip": 1.02392054, "balance_loss_mlp": 1.03416789, "epoch": 0.8805651585750789, "flos": 23183067254400.0, "grad_norm": 2.1932538933308776, "language_loss": 0.64017379, "learning_rate": 1.477094533001364e-07, "loss": 0.66154647, "num_input_tokens_seen": 315811010, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.66796875, "step": 14646, "time_per_iteration": 2.4594345092773438 }, { "auxiliary_loss_clip": 0.01107263, "auxiliary_loss_mlp": 0.01029357, "balance_loss_clip": 1.0165298, "balance_loss_mlp": 1.03548837, "epoch": 0.8806252818277469, "flos": 14903000835840.0, "grad_norm": 2.146699199198226, "language_loss": 0.7689392, "learning_rate": 1.475625963334055e-07, "loss": 0.79030538, "num_input_tokens_seen": 315828130, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 14647, "time_per_iteration": 2.4421749114990234 }, { "auxiliary_loss_clip": 0.01101048, "auxiliary_loss_mlp": 0.01031293, "balance_loss_clip": 1.01992643, "balance_loss_mlp": 1.03470278, "epoch": 0.8806854050804148, "flos": 17639178330240.0, "grad_norm": 1.988211357835444, "language_loss": 0.74985278, "learning_rate": 1.4741580961128652e-07, "loss": 0.77117616, "num_input_tokens_seen": 315844900, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6640625, "step": 14648, "time_per_iteration": 2.4193522930145264 }, { "auxiliary_loss_clip": 0.01103922, "auxiliary_loss_mlp": 0.01027052, "balance_loss_clip": 1.01495278, "balance_loss_mlp": 1.03364313, "epoch": 0.8807455283330828, "flos": 25332392344320.0, "grad_norm": 2.0088254678180983, "language_loss": 0.65538359, "learning_rate": 1.4726909313934522e-07, "loss": 0.67669332, "num_input_tokens_seen": 315863745, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 14649, "time_per_iteration": 2.5790653228759766 }, { "auxiliary_loss_clip": 0.01104157, "auxiliary_loss_mlp": 0.01032922, "balance_loss_clip": 1.02021456, "balance_loss_mlp": 1.0359633, "epoch": 0.8808056515857507, "flos": 25265168040960.0, "grad_norm": 2.2618504437935405, "language_loss": 0.62376964, "learning_rate": 1.4712244692314578e-07, "loss": 0.64514041, "num_input_tokens_seen": 315885765, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 14650, "time_per_iteration": 2.580829620361328 }, { "auxiliary_loss_clip": 0.01102431, "auxiliary_loss_mlp": 0.01029707, "balance_loss_clip": 1.01780438, "balance_loss_mlp": 1.03522038, "epoch": 0.8808657748384188, "flos": 26578852151040.0, "grad_norm": 1.4594881387826584, "language_loss": 0.72872889, "learning_rate": 1.4697587096824914e-07, "loss": 0.75005031, "num_input_tokens_seen": 315907340, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 14651, "time_per_iteration": 2.528909206390381 }, { "auxiliary_loss_clip": 0.01106428, "auxiliary_loss_mlp": 0.01030848, "balance_loss_clip": 1.01756239, "balance_loss_mlp": 1.0359292, "epoch": 0.8809258980910867, "flos": 18661231918080.0, "grad_norm": 1.8844211989980701, "language_loss": 0.72474635, "learning_rate": 1.4682936528021284e-07, "loss": 0.74611914, "num_input_tokens_seen": 315924935, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 14652, "time_per_iteration": 2.4259085655212402 }, { "auxiliary_loss_clip": 0.01101119, "auxiliary_loss_mlp": 0.01030857, "balance_loss_clip": 1.01884019, "balance_loss_mlp": 1.03351617, "epoch": 0.8809860213437547, "flos": 19792274348160.0, "grad_norm": 2.4012537376264618, "language_loss": 0.74815243, "learning_rate": 1.4668292986459286e-07, "loss": 0.76947224, "num_input_tokens_seen": 315943165, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.67578125, "step": 14653, "time_per_iteration": 2.471930503845215 }, { "auxiliary_loss_clip": 0.0110666, "auxiliary_loss_mlp": 0.01035319, "balance_loss_clip": 1.02228963, "balance_loss_mlp": 1.03479302, "epoch": 0.8810461445964227, "flos": 17894467267200.0, "grad_norm": 1.8054268879890865, "language_loss": 0.71113044, "learning_rate": 1.465365647269421e-07, "loss": 0.7325502, "num_input_tokens_seen": 315961340, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 14654, "time_per_iteration": 2.4139273166656494 }, { "auxiliary_loss_clip": 0.01105731, "auxiliary_loss_mlp": 0.0103583, "balance_loss_clip": 1.02261019, "balance_loss_mlp": 1.03600156, "epoch": 0.8811062678490906, "flos": 29163917128320.0, "grad_norm": 1.5977707731059403, "language_loss": 0.71257955, "learning_rate": 1.4639026987281012e-07, "loss": 0.7339952, "num_input_tokens_seen": 315981335, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 14655, "time_per_iteration": 2.522433280944824 }, { "auxiliary_loss_clip": 0.01102456, "auxiliary_loss_mlp": 0.01030003, "balance_loss_clip": 1.01823711, "balance_loss_mlp": 1.03445041, "epoch": 0.8811663911017587, "flos": 20338834671360.0, "grad_norm": 1.6930034858152387, "language_loss": 0.81030124, "learning_rate": 1.462440453077449e-07, "loss": 0.83162582, "num_input_tokens_seen": 316001325, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 14656, "time_per_iteration": 2.446840763092041 }, { "auxiliary_loss_clip": 0.01104123, "auxiliary_loss_mlp": 0.01030596, "balance_loss_clip": 1.01919961, "balance_loss_mlp": 1.03529572, "epoch": 0.8812265143544266, "flos": 25885704424320.0, "grad_norm": 1.8688319628691274, "language_loss": 0.68685639, "learning_rate": 1.460978910372914e-07, "loss": 0.70820355, "num_input_tokens_seen": 316022540, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6875, "step": 14657, "time_per_iteration": 2.490939140319824 }, { "auxiliary_loss_clip": 0.01105238, "auxiliary_loss_mlp": 0.01028541, "balance_loss_clip": 1.01632774, "balance_loss_mlp": 1.03591299, "epoch": 0.8812866376070946, "flos": 27195509865600.0, "grad_norm": 20.918422014529998, "language_loss": 0.8347404, "learning_rate": 1.4595180706699207e-07, "loss": 0.85607821, "num_input_tokens_seen": 316037735, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69140625, "step": 14658, "time_per_iteration": 2.4841010570526123 }, { "auxiliary_loss_clip": 0.01110948, "auxiliary_loss_mlp": 0.01035786, "balance_loss_clip": 1.02229786, "balance_loss_mlp": 1.03775382, "epoch": 0.8813467608597625, "flos": 23807194997760.0, "grad_norm": 2.0023072162254345, "language_loss": 0.77372825, "learning_rate": 1.4580579340238554e-07, "loss": 0.79519558, "num_input_tokens_seen": 316058105, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.73046875, "step": 14659, "time_per_iteration": 2.4891607761383057 }, { "auxiliary_loss_clip": 0.01103477, "auxiliary_loss_mlp": 0.01030949, "balance_loss_clip": 1.01856303, "balance_loss_mlp": 1.03496575, "epoch": 0.8814068841124305, "flos": 21105455667840.0, "grad_norm": 2.077588976836419, "language_loss": 0.6037305, "learning_rate": 1.4565985004900894e-07, "loss": 0.62507474, "num_input_tokens_seen": 316074415, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 14660, "time_per_iteration": 2.458721160888672 }, { "auxiliary_loss_clip": 0.01104137, "auxiliary_loss_mlp": 0.01037242, "balance_loss_clip": 1.023664, "balance_loss_mlp": 1.03500688, "epoch": 0.8814670073650984, "flos": 24716991605760.0, "grad_norm": 3.536257592709624, "language_loss": 0.77718508, "learning_rate": 1.455139770123972e-07, "loss": 0.79859889, "num_input_tokens_seen": 316094405, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.69140625, "step": 14661, "time_per_iteration": 2.495140790939331 }, { "auxiliary_loss_clip": 0.01106928, "auxiliary_loss_mlp": 0.01040133, "balance_loss_clip": 1.02729416, "balance_loss_mlp": 1.03731632, "epoch": 0.8815271306177664, "flos": 22966274718720.0, "grad_norm": 1.923645879622976, "language_loss": 0.76888514, "learning_rate": 1.45368174298081e-07, "loss": 0.7903558, "num_input_tokens_seen": 316113390, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 14662, "time_per_iteration": 2.452951431274414 }, { "auxiliary_loss_clip": 0.01101727, "auxiliary_loss_mlp": 0.01026826, "balance_loss_clip": 1.01534009, "balance_loss_mlp": 1.03464901, "epoch": 0.8815872538704344, "flos": 19460064435840.0, "grad_norm": 2.346470596210554, "language_loss": 0.73801112, "learning_rate": 1.4522244191158929e-07, "loss": 0.75929666, "num_input_tokens_seen": 316131085, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.671875, "step": 14663, "time_per_iteration": 2.441070795059204 }, { "auxiliary_loss_clip": 0.01104136, "auxiliary_loss_mlp": 0.01032139, "balance_loss_clip": 1.02005076, "balance_loss_mlp": 1.03591394, "epoch": 0.8816473771231024, "flos": 32156604622080.0, "grad_norm": 1.538643935195323, "language_loss": 0.69625008, "learning_rate": 1.450767798584489e-07, "loss": 0.71761274, "num_input_tokens_seen": 316151440, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 14664, "time_per_iteration": 4.02431583404541 }, { "auxiliary_loss_clip": 0.01101249, "auxiliary_loss_mlp": 0.0103253, "balance_loss_clip": 1.02171206, "balance_loss_mlp": 1.03450561, "epoch": 0.8817075003757703, "flos": 19682279925120.0, "grad_norm": 1.4590454033936124, "language_loss": 0.81176293, "learning_rate": 1.449311881441828e-07, "loss": 0.83310068, "num_input_tokens_seen": 316170750, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.66796875, "step": 14665, "time_per_iteration": 2.4434382915496826 }, { "auxiliary_loss_clip": 0.011065, "auxiliary_loss_mlp": 0.01033058, "balance_loss_clip": 1.02132201, "balance_loss_mlp": 1.03693688, "epoch": 0.8817676236284383, "flos": 15668616251520.0, "grad_norm": 2.394001772543077, "language_loss": 0.58258086, "learning_rate": 1.447856667743117e-07, "loss": 0.60397649, "num_input_tokens_seen": 316187265, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6953125, "step": 14666, "time_per_iteration": 2.414483070373535 }, { "auxiliary_loss_clip": 0.01106743, "auxiliary_loss_mlp": 0.01031125, "balance_loss_clip": 1.01768422, "balance_loss_mlp": 1.03729248, "epoch": 0.8818277468811063, "flos": 17895185539200.0, "grad_norm": 1.885253251200787, "language_loss": 0.83682179, "learning_rate": 1.4464021575435403e-07, "loss": 0.85820043, "num_input_tokens_seen": 316206555, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6953125, "step": 14667, "time_per_iteration": 2.4376585483551025 }, { "auxiliary_loss_clip": 0.01104191, "auxiliary_loss_mlp": 0.01034016, "balance_loss_clip": 1.02091527, "balance_loss_mlp": 1.03530085, "epoch": 0.8818878701337742, "flos": 18770508069120.0, "grad_norm": 2.0203761713933153, "language_loss": 0.62569535, "learning_rate": 1.4449483508982563e-07, "loss": 0.64707744, "num_input_tokens_seen": 316225210, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 14668, "time_per_iteration": 3.8921072483062744 }, { "auxiliary_loss_clip": 0.01100997, "auxiliary_loss_mlp": 0.01026336, "balance_loss_clip": 1.0156374, "balance_loss_mlp": 1.0347923, "epoch": 0.8819479933864423, "flos": 17712292464000.0, "grad_norm": 3.0474473227109877, "language_loss": 0.56838119, "learning_rate": 1.4434952478623918e-07, "loss": 0.58965451, "num_input_tokens_seen": 316242685, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.6640625, "step": 14669, "time_per_iteration": 2.400689125061035 }, { "auxiliary_loss_clip": 0.01102567, "auxiliary_loss_mlp": 0.01033307, "balance_loss_clip": 1.0214932, "balance_loss_mlp": 1.03407979, "epoch": 0.8820081166391102, "flos": 11728749070080.0, "grad_norm": 1.925534628283795, "language_loss": 0.71269703, "learning_rate": 1.442042848491043e-07, "loss": 0.73405576, "num_input_tokens_seen": 316260935, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 14670, "time_per_iteration": 5.05807900428772 }, { "auxiliary_loss_clip": 0.01102122, "auxiliary_loss_mlp": 0.01030474, "balance_loss_clip": 1.01855338, "balance_loss_mlp": 1.03353179, "epoch": 0.8820682398917782, "flos": 27490372611840.0, "grad_norm": 2.4596618698918262, "language_loss": 0.7374599, "learning_rate": 1.44059115283929e-07, "loss": 0.75878584, "num_input_tokens_seen": 316281190, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 14671, "time_per_iteration": 2.502042293548584 }, { "auxiliary_loss_clip": 0.0110571, "auxiliary_loss_mlp": 0.01029524, "balance_loss_clip": 1.01616645, "balance_loss_mlp": 1.03432536, "epoch": 0.8821283631444461, "flos": 16873850223360.0, "grad_norm": 2.0696643157991264, "language_loss": 0.84721744, "learning_rate": 1.43914016096218e-07, "loss": 0.86856979, "num_input_tokens_seen": 316297115, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 14672, "time_per_iteration": 2.421268939971924 }, { "auxiliary_loss_clip": 0.01102446, "auxiliary_loss_mlp": 0.0103209, "balance_loss_clip": 1.01979911, "balance_loss_mlp": 1.03551471, "epoch": 0.8821884863971141, "flos": 24280964409600.0, "grad_norm": 1.7364157365812123, "language_loss": 0.72435915, "learning_rate": 1.4376898729147336e-07, "loss": 0.74570453, "num_input_tokens_seen": 316318235, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.66796875, "step": 14673, "time_per_iteration": 2.4885094165802 }, { "auxiliary_loss_clip": 0.01029265, "auxiliary_loss_mlp": 0.0100189, "balance_loss_clip": 1.00084722, "balance_loss_mlp": 1.0070405, "epoch": 0.882248609649782, "flos": 59432342492160.0, "grad_norm": 0.8132188119506971, "language_loss": 0.49334124, "learning_rate": 1.4362402887519487e-07, "loss": 0.5136528, "num_input_tokens_seen": 316384705, "router_z_loss_clip": 0.01043701, "router_z_loss_mlp": 0.22265625, "step": 14674, "time_per_iteration": 3.1748147010803223 }, { "auxiliary_loss_clip": 0.01103624, "auxiliary_loss_mlp": 0.01030688, "balance_loss_clip": 1.01811719, "balance_loss_mlp": 1.03341699, "epoch": 0.88230873290245, "flos": 19937784343680.0, "grad_norm": 1.875209022270809, "language_loss": 0.76721764, "learning_rate": 1.4347914085287971e-07, "loss": 0.78856069, "num_input_tokens_seen": 316401165, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 14675, "time_per_iteration": 2.4451472759246826 }, { "auxiliary_loss_clip": 0.01101287, "auxiliary_loss_mlp": 0.01030553, "balance_loss_clip": 1.0186739, "balance_loss_mlp": 1.03455019, "epoch": 0.882368856155118, "flos": 16362769559040.0, "grad_norm": 1.8003358814767711, "language_loss": 0.79476899, "learning_rate": 1.4333432323002105e-07, "loss": 0.81608742, "num_input_tokens_seen": 316418780, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.66796875, "step": 14676, "time_per_iteration": 2.4137775897979736 }, { "auxiliary_loss_clip": 0.01028975, "auxiliary_loss_mlp": 0.01000923, "balance_loss_clip": 0.99986821, "balance_loss_mlp": 1.00653529, "epoch": 0.882428979407786, "flos": 70594563277440.0, "grad_norm": 0.6875658958119766, "language_loss": 0.54697287, "learning_rate": 1.431895760121109e-07, "loss": 0.56727183, "num_input_tokens_seen": 316482030, "router_z_loss_clip": 0.01055908, "router_z_loss_mlp": 0.22460938, "step": 14677, "time_per_iteration": 3.198146343231201 }, { "auxiliary_loss_clip": 0.0110104, "auxiliary_loss_mlp": 0.01026954, "balance_loss_clip": 1.0150156, "balance_loss_mlp": 1.03403234, "epoch": 0.8824891026604539, "flos": 18150294908160.0, "grad_norm": 2.312357185496014, "language_loss": 0.65269339, "learning_rate": 1.4304489920463847e-07, "loss": 0.67397338, "num_input_tokens_seen": 316499175, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 14678, "time_per_iteration": 2.4013359546661377 }, { "auxiliary_loss_clip": 0.01107272, "auxiliary_loss_mlp": 0.01031293, "balance_loss_clip": 1.01909161, "balance_loss_mlp": 1.03606093, "epoch": 0.8825492259131219, "flos": 27232713377280.0, "grad_norm": 1.9123123242636257, "language_loss": 0.71499324, "learning_rate": 1.4290029281308936e-07, "loss": 0.73637891, "num_input_tokens_seen": 316519495, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.7109375, "step": 14679, "time_per_iteration": 2.503934144973755 }, { "auxiliary_loss_clip": 0.01100911, "auxiliary_loss_mlp": 0.01031317, "balance_loss_clip": 1.02074289, "balance_loss_mlp": 1.03407955, "epoch": 0.8826093491657898, "flos": 22274419881600.0, "grad_norm": 1.7865588634585612, "language_loss": 0.64105546, "learning_rate": 1.4275575684294694e-07, "loss": 0.66237772, "num_input_tokens_seen": 316538180, "router_z_loss_clip": 0.10546875, "router_z_loss_mlp": 0.66796875, "step": 14680, "time_per_iteration": 2.449589490890503 }, { "auxiliary_loss_clip": 0.01103036, "auxiliary_loss_mlp": 0.01034195, "balance_loss_clip": 1.02190471, "balance_loss_mlp": 1.03534293, "epoch": 0.8826694724184578, "flos": 14204753377920.0, "grad_norm": 2.279400969789895, "language_loss": 0.77020919, "learning_rate": 1.4261129129969328e-07, "loss": 0.79158151, "num_input_tokens_seen": 316551750, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.67578125, "step": 14681, "time_per_iteration": 2.412834405899048 }, { "auxiliary_loss_clip": 0.01106849, "auxiliary_loss_mlp": 0.01031108, "balance_loss_clip": 1.01818562, "balance_loss_mlp": 1.03593731, "epoch": 0.8827295956711259, "flos": 20631686256000.0, "grad_norm": 1.8218441010230082, "language_loss": 0.7290647, "learning_rate": 1.424668961888047e-07, "loss": 0.75044429, "num_input_tokens_seen": 316570680, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 14682, "time_per_iteration": 2.463205099105835 }, { "auxiliary_loss_clip": 0.01108886, "auxiliary_loss_mlp": 0.01028853, "balance_loss_clip": 1.01464903, "balance_loss_mlp": 1.03665423, "epoch": 0.8827897189237938, "flos": 18513064316160.0, "grad_norm": 1.8684743840038394, "language_loss": 0.74749422, "learning_rate": 1.4232257151575765e-07, "loss": 0.76887167, "num_input_tokens_seen": 316588635, "router_z_loss_clip": 0.14257812, "router_z_loss_mlp": 0.72265625, "step": 14683, "time_per_iteration": 2.446641445159912 }, { "auxiliary_loss_clip": 0.01105339, "auxiliary_loss_mlp": 0.01029284, "balance_loss_clip": 1.01676714, "balance_loss_mlp": 1.03653765, "epoch": 0.8828498421764618, "flos": 22747399194240.0, "grad_norm": 3.2393437277595396, "language_loss": 0.65506548, "learning_rate": 1.4217831728602492e-07, "loss": 0.67641169, "num_input_tokens_seen": 316607550, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 14684, "time_per_iteration": 2.459256887435913 }, { "auxiliary_loss_clip": 0.01101553, "auxiliary_loss_mlp": 0.01025645, "balance_loss_clip": 1.0144577, "balance_loss_mlp": 1.0340836, "epoch": 0.8829099654291297, "flos": 15012384727680.0, "grad_norm": 1.9877302625232744, "language_loss": 0.69394529, "learning_rate": 1.4203413350507677e-07, "loss": 0.71521735, "num_input_tokens_seen": 316624460, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.67578125, "step": 14685, "time_per_iteration": 2.443905830383301 }, { "auxiliary_loss_clip": 0.01108137, "auxiliary_loss_mlp": 0.01029458, "balance_loss_clip": 1.01633275, "balance_loss_mlp": 1.03710508, "epoch": 0.8829700886817977, "flos": 16720546976640.0, "grad_norm": 2.199891023369171, "language_loss": 0.74185038, "learning_rate": 1.418900201783806e-07, "loss": 0.76322639, "num_input_tokens_seen": 316640765, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 14686, "time_per_iteration": 2.4458439350128174 }, { "auxiliary_loss_clip": 0.01102688, "auxiliary_loss_mlp": 0.01025986, "balance_loss_clip": 1.01407075, "balance_loss_mlp": 1.0351367, "epoch": 0.8830302119344656, "flos": 15263256291840.0, "grad_norm": 1.7866780650509293, "language_loss": 0.62863135, "learning_rate": 1.417459773114007e-07, "loss": 0.64991814, "num_input_tokens_seen": 316656120, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 14687, "time_per_iteration": 2.443192720413208 }, { "auxiliary_loss_clip": 0.01107035, "auxiliary_loss_mlp": 0.01034881, "balance_loss_clip": 1.02235198, "balance_loss_mlp": 1.0364325, "epoch": 0.8830903351871336, "flos": 28617751854720.0, "grad_norm": 2.1920201652004434, "language_loss": 0.691728, "learning_rate": 1.4160200490959984e-07, "loss": 0.71314722, "num_input_tokens_seen": 316676095, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 14688, "time_per_iteration": 2.4854776859283447 }, { "auxiliary_loss_clip": 0.01101427, "auxiliary_loss_mlp": 0.0102622, "balance_loss_clip": 1.01425719, "balance_loss_mlp": 1.0352248, "epoch": 0.8831504584398016, "flos": 28001632844160.0, "grad_norm": 2.1532884617490704, "language_loss": 0.66914058, "learning_rate": 1.4145810297843697e-07, "loss": 0.69041699, "num_input_tokens_seen": 316696235, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.66015625, "step": 14689, "time_per_iteration": 2.5367748737335205 }, { "auxiliary_loss_clip": 0.01104579, "auxiliary_loss_mlp": 0.01029673, "balance_loss_clip": 1.01797903, "balance_loss_mlp": 1.0373069, "epoch": 0.8832105816924696, "flos": 26579642250240.0, "grad_norm": 1.3618861181013713, "language_loss": 0.74606252, "learning_rate": 1.4131427152336905e-07, "loss": 0.76740497, "num_input_tokens_seen": 316719680, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 14690, "time_per_iteration": 2.5220136642456055 }, { "auxiliary_loss_clip": 0.01104712, "auxiliary_loss_mlp": 0.0103316, "balance_loss_clip": 1.02007723, "balance_loss_mlp": 1.03530312, "epoch": 0.8832707049451375, "flos": 24898771359360.0, "grad_norm": 2.017720093336724, "language_loss": 0.72725284, "learning_rate": 1.4117051054985018e-07, "loss": 0.7486316, "num_input_tokens_seen": 316739830, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 14691, "time_per_iteration": 2.5084381103515625 }, { "auxiliary_loss_clip": 0.01109072, "auxiliary_loss_mlp": 0.01029729, "balance_loss_clip": 1.01706898, "balance_loss_mlp": 1.03671741, "epoch": 0.8833308281978055, "flos": 15451141357440.0, "grad_norm": 2.2707034967545114, "language_loss": 0.52084756, "learning_rate": 1.4102682006333243e-07, "loss": 0.54223555, "num_input_tokens_seen": 316758105, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 14692, "time_per_iteration": 2.414259672164917 }, { "auxiliary_loss_clip": 0.01105611, "auxiliary_loss_mlp": 0.01029328, "balance_loss_clip": 1.0171504, "balance_loss_mlp": 1.03601909, "epoch": 0.8833909514504734, "flos": 20301523418880.0, "grad_norm": 2.38602137058011, "language_loss": 0.60630417, "learning_rate": 1.4088320006926346e-07, "loss": 0.62765354, "num_input_tokens_seen": 316777455, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 14693, "time_per_iteration": 2.4384429454803467 }, { "auxiliary_loss_clip": 0.01101383, "auxiliary_loss_mlp": 0.01026099, "balance_loss_clip": 1.01474404, "balance_loss_mlp": 1.03618419, "epoch": 0.8834510747031414, "flos": 20374027021440.0, "grad_norm": 1.782306976990541, "language_loss": 0.75509876, "learning_rate": 1.407396505730898e-07, "loss": 0.77637357, "num_input_tokens_seen": 316796300, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.65234375, "step": 14694, "time_per_iteration": 2.4383275508880615 }, { "auxiliary_loss_clip": 0.01104726, "auxiliary_loss_mlp": 0.01028351, "balance_loss_clip": 1.01693046, "balance_loss_mlp": 1.03331673, "epoch": 0.8835111979558095, "flos": 29752026508800.0, "grad_norm": 2.1157270643475217, "language_loss": 0.72985506, "learning_rate": 1.4059617158025527e-07, "loss": 0.75118577, "num_input_tokens_seen": 316819090, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.71484375, "step": 14695, "time_per_iteration": 2.516897201538086 }, { "auxiliary_loss_clip": 0.01099752, "auxiliary_loss_mlp": 0.0102881, "balance_loss_clip": 1.01706243, "balance_loss_mlp": 1.03468299, "epoch": 0.8835713212084774, "flos": 24134556574080.0, "grad_norm": 1.5524773810122507, "language_loss": 0.79836559, "learning_rate": 1.404527630961998e-07, "loss": 0.81965125, "num_input_tokens_seen": 316839250, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6484375, "step": 14696, "time_per_iteration": 2.456092357635498 }, { "auxiliary_loss_clip": 0.01105005, "auxiliary_loss_mlp": 0.01030603, "balance_loss_clip": 1.01908159, "balance_loss_mlp": 1.03575587, "epoch": 0.8836314444611454, "flos": 27672331933440.0, "grad_norm": 1.599550148464573, "language_loss": 0.74816787, "learning_rate": 1.4030942512636236e-07, "loss": 0.76952398, "num_input_tokens_seen": 316861315, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.69140625, "step": 14697, "time_per_iteration": 2.500638246536255 }, { "auxiliary_loss_clip": 0.01104761, "auxiliary_loss_mlp": 0.01033477, "balance_loss_clip": 1.0215317, "balance_loss_mlp": 1.03589439, "epoch": 0.8836915677138133, "flos": 16836969934080.0, "grad_norm": 3.1574043912198357, "language_loss": 0.7204867, "learning_rate": 1.401661576761779e-07, "loss": 0.74186909, "num_input_tokens_seen": 316879325, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 14698, "time_per_iteration": 2.4146692752838135 }, { "auxiliary_loss_clip": 0.01028421, "auxiliary_loss_mlp": 0.01000507, "balance_loss_clip": 0.99942797, "balance_loss_mlp": 1.00615931, "epoch": 0.8837516909664813, "flos": 69310540823040.0, "grad_norm": 0.869780597823081, "language_loss": 0.53728175, "learning_rate": 1.4002296075107856e-07, "loss": 0.55757105, "num_input_tokens_seen": 316936425, "router_z_loss_clip": 0.01080322, "router_z_loss_mlp": 0.22265625, "step": 14699, "time_per_iteration": 3.108506441116333 }, { "auxiliary_loss_clip": 0.01108074, "auxiliary_loss_mlp": 0.01028305, "balance_loss_clip": 1.01574087, "balance_loss_mlp": 1.03603697, "epoch": 0.8838118142191492, "flos": 21324726241920.0, "grad_norm": 1.8615160142742313, "language_loss": 0.77311373, "learning_rate": 1.3987983435649508e-07, "loss": 0.79447746, "num_input_tokens_seen": 316956360, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 14700, "time_per_iteration": 2.456644296646118 }, { "auxiliary_loss_clip": 0.01102678, "auxiliary_loss_mlp": 0.01030879, "balance_loss_clip": 1.01870227, "balance_loss_mlp": 1.03608298, "epoch": 0.8838719374718172, "flos": 21470559459840.0, "grad_norm": 1.7278278894427461, "language_loss": 0.73425102, "learning_rate": 1.3973677849785494e-07, "loss": 0.75558656, "num_input_tokens_seen": 316975295, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6640625, "step": 14701, "time_per_iteration": 2.4818012714385986 }, { "auxiliary_loss_clip": 0.011074, "auxiliary_loss_mlp": 0.01033062, "balance_loss_clip": 1.02033067, "balance_loss_mlp": 1.03667498, "epoch": 0.8839320607244852, "flos": 26468929555200.0, "grad_norm": 2.2424432722236265, "language_loss": 0.71112287, "learning_rate": 1.3959379318058262e-07, "loss": 0.73252749, "num_input_tokens_seen": 316994520, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 14702, "time_per_iteration": 2.4741299152374268 }, { "auxiliary_loss_clip": 0.01106667, "auxiliary_loss_mlp": 0.01035938, "balance_loss_clip": 1.02324259, "balance_loss_mlp": 1.03756011, "epoch": 0.8839921839771532, "flos": 45222270923520.0, "grad_norm": 2.164383171777954, "language_loss": 0.71430004, "learning_rate": 1.3945087841010006e-07, "loss": 0.73572606, "num_input_tokens_seen": 317018095, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 14703, "time_per_iteration": 2.6682047843933105 }, { "auxiliary_loss_clip": 0.01102148, "auxiliary_loss_mlp": 0.01028929, "balance_loss_clip": 1.01746142, "balance_loss_mlp": 1.03498185, "epoch": 0.8840523072298211, "flos": 20006876154240.0, "grad_norm": 1.9064521724643262, "language_loss": 0.66985589, "learning_rate": 1.3930803419182645e-07, "loss": 0.69116664, "num_input_tokens_seen": 317035755, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.671875, "step": 14704, "time_per_iteration": 2.4304862022399902 }, { "auxiliary_loss_clip": 0.01099651, "auxiliary_loss_mlp": 0.01026072, "balance_loss_clip": 1.01497412, "balance_loss_mlp": 1.03331137, "epoch": 0.8841124304824891, "flos": 24426007528320.0, "grad_norm": 1.7548658858976691, "language_loss": 0.70288593, "learning_rate": 1.3916526053117905e-07, "loss": 0.72414315, "num_input_tokens_seen": 317055765, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6640625, "step": 14705, "time_per_iteration": 2.4862353801727295 }, { "auxiliary_loss_clip": 0.01103398, "auxiliary_loss_mlp": 0.01033619, "balance_loss_clip": 1.02311707, "balance_loss_mlp": 1.03589785, "epoch": 0.884172553735157, "flos": 31284622056960.0, "grad_norm": 1.8665110315530635, "language_loss": 0.7095716, "learning_rate": 1.3902255743357104e-07, "loss": 0.73094177, "num_input_tokens_seen": 317077955, "router_z_loss_clip": 0.10546875, "router_z_loss_mlp": 0.67578125, "step": 14706, "time_per_iteration": 3.876613140106201 }, { "auxiliary_loss_clip": 0.01101887, "auxiliary_loss_mlp": 0.01029669, "balance_loss_clip": 1.01817083, "balance_loss_mlp": 1.03370333, "epoch": 0.884232676987825, "flos": 21391160446080.0, "grad_norm": 1.6656588792452338, "language_loss": 0.74412429, "learning_rate": 1.3887992490441413e-07, "loss": 0.76543987, "num_input_tokens_seen": 317095825, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 14707, "time_per_iteration": 2.4304144382476807 }, { "auxiliary_loss_clip": 0.01028834, "auxiliary_loss_mlp": 0.01000741, "balance_loss_clip": 0.99968606, "balance_loss_mlp": 1.00653541, "epoch": 0.8842928002404931, "flos": 57911451799680.0, "grad_norm": 0.8303976771839836, "language_loss": 0.60365605, "learning_rate": 1.387373629491173e-07, "loss": 0.62395179, "num_input_tokens_seen": 317152875, "router_z_loss_clip": 0.01055908, "router_z_loss_mlp": 0.22265625, "step": 14708, "time_per_iteration": 2.927062749862671 }, { "auxiliary_loss_clip": 0.01098212, "auxiliary_loss_mlp": 0.0102866, "balance_loss_clip": 1.01787782, "balance_loss_mlp": 1.03378153, "epoch": 0.884352923493161, "flos": 41463896186880.0, "grad_norm": 1.643767131967632, "language_loss": 0.67049956, "learning_rate": 1.3859487157308625e-07, "loss": 0.69176829, "num_input_tokens_seen": 317176725, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.64453125, "step": 14709, "time_per_iteration": 2.6200063228607178 }, { "auxiliary_loss_clip": 0.01108701, "auxiliary_loss_mlp": 0.01035591, "balance_loss_clip": 1.02172661, "balance_loss_mlp": 1.03551912, "epoch": 0.884413046745829, "flos": 46541234332800.0, "grad_norm": 1.8029641248136092, "language_loss": 0.62408042, "learning_rate": 1.3845245078172373e-07, "loss": 0.64552337, "num_input_tokens_seen": 317206880, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 14710, "time_per_iteration": 4.0781590938568115 }, { "auxiliary_loss_clip": 0.01101037, "auxiliary_loss_mlp": 0.010266, "balance_loss_clip": 1.01569867, "balance_loss_mlp": 1.03450227, "epoch": 0.8844731699984969, "flos": 19135324552320.0, "grad_norm": 24.58743689148598, "language_loss": 0.6404767, "learning_rate": 1.38310100580431e-07, "loss": 0.66175306, "num_input_tokens_seen": 317224135, "router_z_loss_clip": 0.10888672, "router_z_loss_mlp": 0.6640625, "step": 14711, "time_per_iteration": 2.4226107597351074 }, { "auxiliary_loss_clip": 0.01105326, "auxiliary_loss_mlp": 0.01035154, "balance_loss_clip": 1.02280974, "balance_loss_mlp": 1.0337497, "epoch": 0.8845332932511649, "flos": 23260634674560.0, "grad_norm": 2.161933690470374, "language_loss": 0.75926471, "learning_rate": 1.38167820974606e-07, "loss": 0.78066951, "num_input_tokens_seen": 317244505, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71484375, "step": 14712, "time_per_iteration": 5.361508131027222 }, { "auxiliary_loss_clip": 0.01102602, "auxiliary_loss_mlp": 0.01027139, "balance_loss_clip": 1.01522982, "balance_loss_mlp": 1.03355312, "epoch": 0.8845934165038328, "flos": 17564591738880.0, "grad_norm": 2.5560445388600637, "language_loss": 0.81026709, "learning_rate": 1.3802561196964368e-07, "loss": 0.83156455, "num_input_tokens_seen": 317257830, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 14713, "time_per_iteration": 2.3962442874908447 }, { "auxiliary_loss_clip": 0.01102542, "auxiliary_loss_mlp": 0.01027217, "balance_loss_clip": 1.01501036, "balance_loss_mlp": 1.033988, "epoch": 0.8846535397565009, "flos": 27485739757440.0, "grad_norm": 1.4140139248992274, "language_loss": 0.55350012, "learning_rate": 1.3788347357093688e-07, "loss": 0.57479775, "num_input_tokens_seen": 317278430, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 14714, "time_per_iteration": 2.512218475341797 }, { "auxiliary_loss_clip": 0.01102515, "auxiliary_loss_mlp": 0.01032726, "balance_loss_clip": 1.02082253, "balance_loss_mlp": 1.03420603, "epoch": 0.8847136630091688, "flos": 28761430256640.0, "grad_norm": 1.8579305981930356, "language_loss": 0.74155521, "learning_rate": 1.377414057838755e-07, "loss": 0.76290762, "num_input_tokens_seen": 317295970, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 14715, "time_per_iteration": 2.483020544052124 }, { "auxiliary_loss_clip": 0.0110455, "auxiliary_loss_mlp": 0.01028533, "balance_loss_clip": 1.01648676, "balance_loss_mlp": 1.03571355, "epoch": 0.8847737862618368, "flos": 23476924419840.0, "grad_norm": 2.3580523693578677, "language_loss": 0.75265837, "learning_rate": 1.375994086138461e-07, "loss": 0.7739892, "num_input_tokens_seen": 317316185, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 14716, "time_per_iteration": 2.4828927516937256 }, { "auxiliary_loss_clip": 0.01103609, "auxiliary_loss_mlp": 0.01037281, "balance_loss_clip": 1.02577758, "balance_loss_mlp": 1.03639781, "epoch": 0.8848339095145047, "flos": 18660872782080.0, "grad_norm": 2.4029340848884115, "language_loss": 0.71316922, "learning_rate": 1.3745748206623397e-07, "loss": 0.73457813, "num_input_tokens_seen": 317333275, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 14717, "time_per_iteration": 2.426909923553467 }, { "auxiliary_loss_clip": 0.01099723, "auxiliary_loss_mlp": 0.01031702, "balance_loss_clip": 1.02006769, "balance_loss_mlp": 1.03487253, "epoch": 0.8848940327671727, "flos": 32270298145920.0, "grad_norm": 3.175868325613425, "language_loss": 0.73843241, "learning_rate": 1.373156261464208e-07, "loss": 0.75974667, "num_input_tokens_seen": 317351245, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6484375, "step": 14718, "time_per_iteration": 2.549016237258911 }, { "auxiliary_loss_clip": 0.01106211, "auxiliary_loss_mlp": 0.01030166, "balance_loss_clip": 1.01767254, "balance_loss_mlp": 1.03559017, "epoch": 0.8849541560198406, "flos": 24021832717440.0, "grad_norm": 2.682511153392628, "language_loss": 0.78496802, "learning_rate": 1.3717384085978602e-07, "loss": 0.80633181, "num_input_tokens_seen": 317370740, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 14719, "time_per_iteration": 2.4852054119110107 }, { "auxiliary_loss_clip": 0.01104312, "auxiliary_loss_mlp": 0.01026913, "balance_loss_clip": 1.01449132, "balance_loss_mlp": 1.03500557, "epoch": 0.8850142792725086, "flos": 16873060124160.0, "grad_norm": 1.8053632205399122, "language_loss": 0.71968544, "learning_rate": 1.3703212621170579e-07, "loss": 0.74099767, "num_input_tokens_seen": 317388370, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 14720, "time_per_iteration": 2.4417808055877686 }, { "auxiliary_loss_clip": 0.01107, "auxiliary_loss_mlp": 0.01031922, "balance_loss_clip": 1.01959014, "balance_loss_mlp": 1.03536236, "epoch": 0.8850744025251767, "flos": 24024059360640.0, "grad_norm": 3.8502011576927506, "language_loss": 0.82492393, "learning_rate": 1.3689048220755383e-07, "loss": 0.84631312, "num_input_tokens_seen": 317407390, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.71875, "step": 14721, "time_per_iteration": 2.4613778591156006 }, { "auxiliary_loss_clip": 0.0110321, "auxiliary_loss_mlp": 0.01033195, "balance_loss_clip": 1.02002871, "balance_loss_mlp": 1.03360891, "epoch": 0.8851345257778446, "flos": 47955575329920.0, "grad_norm": 3.1289966688715034, "language_loss": 0.62581915, "learning_rate": 1.3674890885270186e-07, "loss": 0.64718318, "num_input_tokens_seen": 317430825, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69921875, "step": 14722, "time_per_iteration": 2.6972076892852783 }, { "auxiliary_loss_clip": 0.0110552, "auxiliary_loss_mlp": 0.01030849, "balance_loss_clip": 1.01835549, "balance_loss_mlp": 1.03499937, "epoch": 0.8851946490305126, "flos": 36611000173440.0, "grad_norm": 2.123588794963692, "language_loss": 0.68651497, "learning_rate": 1.3660740615251754e-07, "loss": 0.70787871, "num_input_tokens_seen": 317451905, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 14723, "time_per_iteration": 2.5711019039154053 }, { "auxiliary_loss_clip": 0.01104378, "auxiliary_loss_mlp": 0.01032983, "balance_loss_clip": 1.02050734, "balance_loss_mlp": 1.03605247, "epoch": 0.8852547722831805, "flos": 21544248211200.0, "grad_norm": 1.6691572059547164, "language_loss": 0.78041458, "learning_rate": 1.3646597411236703e-07, "loss": 0.80178821, "num_input_tokens_seen": 317470030, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 14724, "time_per_iteration": 2.481297731399536 }, { "auxiliary_loss_clip": 0.01029098, "auxiliary_loss_mlp": 0.01001491, "balance_loss_clip": 1.00038207, "balance_loss_mlp": 1.00698805, "epoch": 0.8853148955358485, "flos": 63059246472960.0, "grad_norm": 0.9071100576600751, "language_loss": 0.58980322, "learning_rate": 1.363246127376143e-07, "loss": 0.61010909, "num_input_tokens_seen": 317527460, "router_z_loss_clip": 0.0111084, "router_z_loss_mlp": 0.22070312, "step": 14725, "time_per_iteration": 2.947593927383423 }, { "auxiliary_loss_clip": 0.01107017, "auxiliary_loss_mlp": 0.01038466, "balance_loss_clip": 1.02581787, "balance_loss_mlp": 1.03403533, "epoch": 0.8853750187885164, "flos": 18149828031360.0, "grad_norm": 5.454724078203395, "language_loss": 0.69014025, "learning_rate": 1.3618332203361837e-07, "loss": 0.71159512, "num_input_tokens_seen": 317544070, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73046875, "step": 14726, "time_per_iteration": 2.43559193611145 }, { "auxiliary_loss_clip": 0.01103713, "auxiliary_loss_mlp": 0.01030821, "balance_loss_clip": 1.01861429, "balance_loss_mlp": 1.03659272, "epoch": 0.8854351420411845, "flos": 39570542392320.0, "grad_norm": 1.3520939892828345, "language_loss": 0.6974014, "learning_rate": 1.3604210200573785e-07, "loss": 0.71874672, "num_input_tokens_seen": 317570275, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 14727, "time_per_iteration": 2.6483139991760254 }, { "auxiliary_loss_clip": 0.01107043, "auxiliary_loss_mlp": 0.01036607, "balance_loss_clip": 1.02432203, "balance_loss_mlp": 1.0379045, "epoch": 0.8854952652938524, "flos": 23769309127680.0, "grad_norm": 1.6297704208022978, "language_loss": 0.69808483, "learning_rate": 1.3590095265932733e-07, "loss": 0.71952128, "num_input_tokens_seen": 317590160, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 14728, "time_per_iteration": 2.493060827255249 }, { "auxiliary_loss_clip": 0.01103667, "auxiliary_loss_mlp": 0.01030252, "balance_loss_clip": 1.01850402, "balance_loss_mlp": 1.03464758, "epoch": 0.8855553885465204, "flos": 18290310122880.0, "grad_norm": 2.0138115017859155, "language_loss": 0.66384602, "learning_rate": 1.3575987399973987e-07, "loss": 0.68518519, "num_input_tokens_seen": 317608340, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.69140625, "step": 14729, "time_per_iteration": 2.408761501312256 }, { "auxiliary_loss_clip": 0.01105192, "auxiliary_loss_mlp": 0.01032654, "balance_loss_clip": 1.02115619, "balance_loss_mlp": 1.03748035, "epoch": 0.8856155117991883, "flos": 36867402432000.0, "grad_norm": 1.9712595267259572, "language_loss": 0.63105977, "learning_rate": 1.3561886603232453e-07, "loss": 0.65243828, "num_input_tokens_seen": 317629910, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6796875, "step": 14730, "time_per_iteration": 2.5903806686401367 }, { "auxiliary_loss_clip": 0.0110031, "auxiliary_loss_mlp": 0.01033966, "balance_loss_clip": 1.02208042, "balance_loss_mlp": 1.03318691, "epoch": 0.8856756350518563, "flos": 22163886754560.0, "grad_norm": 1.524398052072806, "language_loss": 0.79323566, "learning_rate": 1.3547792876242904e-07, "loss": 0.81457841, "num_input_tokens_seen": 317650265, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 14731, "time_per_iteration": 2.443748950958252 }, { "auxiliary_loss_clip": 0.01104379, "auxiliary_loss_mlp": 0.01033896, "balance_loss_clip": 1.02161109, "balance_loss_mlp": 1.03382349, "epoch": 0.8857357583045242, "flos": 20740962407040.0, "grad_norm": 4.868181518303036, "language_loss": 0.82981026, "learning_rate": 1.3533706219539708e-07, "loss": 0.85119301, "num_input_tokens_seen": 317669045, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.70703125, "step": 14732, "time_per_iteration": 2.450533390045166 }, { "auxiliary_loss_clip": 0.01029352, "auxiliary_loss_mlp": 0.0100159, "balance_loss_clip": 1.00046337, "balance_loss_mlp": 1.00714958, "epoch": 0.8857958815571922, "flos": 69892329409920.0, "grad_norm": 0.9111031113487661, "language_loss": 0.59974182, "learning_rate": 1.3519626633657045e-07, "loss": 0.62005126, "num_input_tokens_seen": 317728065, "router_z_loss_clip": 0.0112915, "router_z_loss_mlp": 0.22265625, "step": 14733, "time_per_iteration": 3.0716052055358887 }, { "auxiliary_loss_clip": 0.01106993, "auxiliary_loss_mlp": 0.01030555, "balance_loss_clip": 1.01822305, "balance_loss_mlp": 1.03761935, "epoch": 0.8858560048098603, "flos": 15121948187520.0, "grad_norm": 2.589137987188821, "language_loss": 0.6662699, "learning_rate": 1.3505554119128838e-07, "loss": 0.68764538, "num_input_tokens_seen": 317746120, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 14734, "time_per_iteration": 2.4318947792053223 }, { "auxiliary_loss_clip": 0.01105122, "auxiliary_loss_mlp": 0.01035719, "balance_loss_clip": 1.02414942, "balance_loss_mlp": 1.03759456, "epoch": 0.8859161280625282, "flos": 16611019430400.0, "grad_norm": 1.932066216287695, "language_loss": 0.75535554, "learning_rate": 1.3491488676488682e-07, "loss": 0.77676392, "num_input_tokens_seen": 317762280, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 14735, "time_per_iteration": 2.4114651679992676 }, { "auxiliary_loss_clip": 0.01103805, "auxiliary_loss_mlp": 0.01030403, "balance_loss_clip": 1.01821947, "balance_loss_mlp": 1.03439641, "epoch": 0.8859762513151962, "flos": 18694484933760.0, "grad_norm": 1.7163833296956152, "language_loss": 0.7056002, "learning_rate": 1.3477430306270066e-07, "loss": 0.7269423, "num_input_tokens_seen": 317780615, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 14736, "time_per_iteration": 2.4911272525787354 }, { "auxiliary_loss_clip": 0.01105446, "auxiliary_loss_mlp": 0.01029985, "balance_loss_clip": 1.01811767, "balance_loss_mlp": 1.03558552, "epoch": 0.8860363745678641, "flos": 19536877670400.0, "grad_norm": 2.3479065834505213, "language_loss": 0.84882236, "learning_rate": 1.3463379009005892e-07, "loss": 0.87017667, "num_input_tokens_seen": 317798830, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69921875, "step": 14737, "time_per_iteration": 2.4322752952575684 }, { "auxiliary_loss_clip": 0.01110473, "auxiliary_loss_mlp": 0.01035767, "balance_loss_clip": 1.02263045, "balance_loss_mlp": 1.03695965, "epoch": 0.8860964978205321, "flos": 35954912304000.0, "grad_norm": 2.289419530167471, "language_loss": 0.68164986, "learning_rate": 1.3449334785229093e-07, "loss": 0.70311224, "num_input_tokens_seen": 317819235, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 14738, "time_per_iteration": 2.5835888385772705 }, { "auxiliary_loss_clip": 0.011084, "auxiliary_loss_mlp": 0.01029945, "balance_loss_clip": 1.01701069, "balance_loss_mlp": 1.03468108, "epoch": 0.8861566210732, "flos": 21212577002880.0, "grad_norm": 1.6843672285740108, "language_loss": 0.75360823, "learning_rate": 1.343529763547222e-07, "loss": 0.77499163, "num_input_tokens_seen": 317836785, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73828125, "step": 14739, "time_per_iteration": 2.4341158866882324 }, { "auxiliary_loss_clip": 0.01103358, "auxiliary_loss_mlp": 0.01029532, "balance_loss_clip": 1.01803398, "balance_loss_mlp": 1.03577518, "epoch": 0.886216744325868, "flos": 14609071843200.0, "grad_norm": 2.4736973967654468, "language_loss": 0.87335956, "learning_rate": 1.3421267560267559e-07, "loss": 0.89468843, "num_input_tokens_seen": 317854225, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.67578125, "step": 14740, "time_per_iteration": 2.4297924041748047 }, { "auxiliary_loss_clip": 0.0110351, "auxiliary_loss_mlp": 0.01030372, "balance_loss_clip": 1.01820076, "balance_loss_mlp": 1.03591132, "epoch": 0.886276867578536, "flos": 26651643062400.0, "grad_norm": 2.6580003285977614, "language_loss": 0.63326883, "learning_rate": 1.34072445601471e-07, "loss": 0.65460765, "num_input_tokens_seen": 317874865, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 14741, "time_per_iteration": 2.4930214881896973 }, { "auxiliary_loss_clip": 0.01104062, "auxiliary_loss_mlp": 0.01029122, "balance_loss_clip": 1.0171777, "balance_loss_mlp": 1.03559613, "epoch": 0.886336990831204, "flos": 16764071281920.0, "grad_norm": 2.0122354836105547, "language_loss": 0.72691822, "learning_rate": 1.3393228635642717e-07, "loss": 0.74825007, "num_input_tokens_seen": 317892830, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 14742, "time_per_iteration": 2.440479278564453 }, { "auxiliary_loss_clip": 0.01104015, "auxiliary_loss_mlp": 0.01031817, "balance_loss_clip": 1.01933575, "balance_loss_mlp": 1.03578281, "epoch": 0.8863971140838719, "flos": 25265275781760.0, "grad_norm": 2.0246070135183523, "language_loss": 0.59630024, "learning_rate": 1.3379219787285733e-07, "loss": 0.61765856, "num_input_tokens_seen": 317911780, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 14743, "time_per_iteration": 2.464714765548706 }, { "auxiliary_loss_clip": 0.01106579, "auxiliary_loss_mlp": 0.01030819, "balance_loss_clip": 1.01707983, "balance_loss_mlp": 1.03507471, "epoch": 0.8864572373365399, "flos": 23404313076480.0, "grad_norm": 2.10223512243065, "language_loss": 0.60347724, "learning_rate": 1.3365218015607437e-07, "loss": 0.62485123, "num_input_tokens_seen": 317932855, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71484375, "step": 14744, "time_per_iteration": 2.4928135871887207 }, { "auxiliary_loss_clip": 0.01105656, "auxiliary_loss_mlp": 0.01035764, "balance_loss_clip": 1.0227406, "balance_loss_mlp": 1.03599453, "epoch": 0.8865173605892078, "flos": 18548759456640.0, "grad_norm": 1.7099732957801523, "language_loss": 0.76672363, "learning_rate": 1.3351223321138762e-07, "loss": 0.78813785, "num_input_tokens_seen": 317952090, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 14745, "time_per_iteration": 2.447831392288208 }, { "auxiliary_loss_clip": 0.0110444, "auxiliary_loss_mlp": 0.01029886, "balance_loss_clip": 1.01797771, "balance_loss_mlp": 1.03638101, "epoch": 0.8865774838418758, "flos": 19025868833280.0, "grad_norm": 1.7777976985732948, "language_loss": 0.77405012, "learning_rate": 1.3337235704410454e-07, "loss": 0.79539335, "num_input_tokens_seen": 317970370, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 14746, "time_per_iteration": 2.4838054180145264 }, { "auxiliary_loss_clip": 0.01109635, "auxiliary_loss_mlp": 0.01032252, "balance_loss_clip": 1.01935387, "balance_loss_mlp": 1.0378828, "epoch": 0.8866376070945439, "flos": 22163168482560.0, "grad_norm": 2.357911272246834, "language_loss": 0.76424897, "learning_rate": 1.3323255165952873e-07, "loss": 0.7856679, "num_input_tokens_seen": 317989125, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 14747, "time_per_iteration": 3.9341981410980225 }, { "auxiliary_loss_clip": 0.01100083, "auxiliary_loss_mlp": 0.01028843, "balance_loss_clip": 1.01682663, "balance_loss_mlp": 1.03317571, "epoch": 0.8866977303472118, "flos": 20704261685760.0, "grad_norm": 1.6913179097667619, "language_loss": 0.82577324, "learning_rate": 1.3309281706296127e-07, "loss": 0.84706253, "num_input_tokens_seen": 318007820, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.66796875, "step": 14748, "time_per_iteration": 2.4550602436065674 }, { "auxiliary_loss_clip": 0.01105178, "auxiliary_loss_mlp": 0.01032025, "balance_loss_clip": 1.01931691, "balance_loss_mlp": 1.03571784, "epoch": 0.8867578535998798, "flos": 48794448533760.0, "grad_norm": 1.8072061827629056, "language_loss": 0.7741909, "learning_rate": 1.3295315325970148e-07, "loss": 0.79556298, "num_input_tokens_seen": 318030435, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 14749, "time_per_iteration": 2.692556858062744 }, { "auxiliary_loss_clip": 0.01106489, "auxiliary_loss_mlp": 0.01034287, "balance_loss_clip": 1.02084661, "balance_loss_mlp": 1.03437448, "epoch": 0.8868179768525477, "flos": 21105312013440.0, "grad_norm": 2.2527659439294783, "language_loss": 0.69839305, "learning_rate": 1.328135602550451e-07, "loss": 0.71980083, "num_input_tokens_seen": 318049465, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 14750, "time_per_iteration": 2.482541799545288 }, { "auxiliary_loss_clip": 0.01102968, "auxiliary_loss_mlp": 0.0103184, "balance_loss_clip": 1.01999664, "balance_loss_mlp": 1.0347997, "epoch": 0.8868781001052157, "flos": 21830922656640.0, "grad_norm": 1.9679140907410027, "language_loss": 0.59947306, "learning_rate": 1.3267403805428546e-07, "loss": 0.62082112, "num_input_tokens_seen": 318067760, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.68359375, "step": 14751, "time_per_iteration": 3.940575361251831 }, { "auxiliary_loss_clip": 0.01103916, "auxiliary_loss_mlp": 0.01032395, "balance_loss_clip": 1.01949048, "balance_loss_mlp": 1.03571212, "epoch": 0.8869382233578836, "flos": 13516418073600.0, "grad_norm": 2.1721400824269925, "language_loss": 0.81358939, "learning_rate": 1.3253458666271344e-07, "loss": 0.83495247, "num_input_tokens_seen": 318082785, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.68359375, "step": 14752, "time_per_iteration": 2.4194133281707764 }, { "auxiliary_loss_clip": 0.01109692, "auxiliary_loss_mlp": 0.01032109, "balance_loss_clip": 1.01915097, "balance_loss_mlp": 1.03733194, "epoch": 0.8869983466105517, "flos": 22704988210560.0, "grad_norm": 2.0529283865980115, "language_loss": 0.80419672, "learning_rate": 1.3239520608561793e-07, "loss": 0.82561469, "num_input_tokens_seen": 318101925, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 14753, "time_per_iteration": 3.837738275527954 }, { "auxiliary_loss_clip": 0.01102136, "auxiliary_loss_mlp": 0.01031655, "balance_loss_clip": 1.02005041, "balance_loss_mlp": 1.03393722, "epoch": 0.8870584698632196, "flos": 15340751884800.0, "grad_norm": 1.7970865147598658, "language_loss": 0.65357149, "learning_rate": 1.3225589632828248e-07, "loss": 0.67490941, "num_input_tokens_seen": 318119945, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 14754, "time_per_iteration": 3.853039026260376 }, { "auxiliary_loss_clip": 0.01106419, "auxiliary_loss_mlp": 0.01031448, "balance_loss_clip": 1.01874018, "balance_loss_mlp": 1.03648984, "epoch": 0.8871185931158876, "flos": 26615624699520.0, "grad_norm": 1.8778796159016353, "language_loss": 0.74728203, "learning_rate": 1.3211665739599065e-07, "loss": 0.76866066, "num_input_tokens_seen": 318139685, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 14755, "time_per_iteration": 2.4927070140838623 }, { "auxiliary_loss_clip": 0.01104674, "auxiliary_loss_mlp": 0.01029294, "balance_loss_clip": 1.0160563, "balance_loss_mlp": 1.03435755, "epoch": 0.8871787163685555, "flos": 21799034357760.0, "grad_norm": 1.6707839407608889, "language_loss": 0.779055, "learning_rate": 1.3197748929402262e-07, "loss": 0.80039465, "num_input_tokens_seen": 318160375, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 14756, "time_per_iteration": 2.4739770889282227 }, { "auxiliary_loss_clip": 0.01105841, "auxiliary_loss_mlp": 0.01033149, "balance_loss_clip": 1.0204885, "balance_loss_mlp": 1.03604698, "epoch": 0.8872388396212235, "flos": 14902964922240.0, "grad_norm": 3.2415973793304755, "language_loss": 0.7663635, "learning_rate": 1.3183839202765535e-07, "loss": 0.7877534, "num_input_tokens_seen": 318177995, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 14757, "time_per_iteration": 2.411433458328247 }, { "auxiliary_loss_clip": 0.01100217, "auxiliary_loss_mlp": 0.01033784, "balance_loss_clip": 1.02202368, "balance_loss_mlp": 1.03396749, "epoch": 0.8872989628738914, "flos": 26432157006720.0, "grad_norm": 1.9677449965996825, "language_loss": 0.68295068, "learning_rate": 1.316993656021632e-07, "loss": 0.70429075, "num_input_tokens_seen": 318197030, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6640625, "step": 14758, "time_per_iteration": 2.4997622966766357 }, { "auxiliary_loss_clip": 0.01104257, "auxiliary_loss_mlp": 0.01037868, "balance_loss_clip": 1.02460551, "balance_loss_mlp": 1.03575695, "epoch": 0.8873590861265594, "flos": 48142562555520.0, "grad_norm": 7.6115649161726555, "language_loss": 0.68939763, "learning_rate": 1.3156041002281915e-07, "loss": 0.71081889, "num_input_tokens_seen": 318221780, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6875, "step": 14759, "time_per_iteration": 2.6901230812072754 }, { "auxiliary_loss_clip": 0.01101189, "auxiliary_loss_mlp": 0.01030643, "balance_loss_clip": 1.01813805, "balance_loss_mlp": 1.03330708, "epoch": 0.8874192093792275, "flos": 18332972501760.0, "grad_norm": 1.8252870903220733, "language_loss": 0.74601769, "learning_rate": 1.3142152529489092e-07, "loss": 0.76733607, "num_input_tokens_seen": 318239710, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 14760, "time_per_iteration": 2.429196834564209 }, { "auxiliary_loss_clip": 0.01107703, "auxiliary_loss_mlp": 0.01034267, "balance_loss_clip": 1.02148747, "balance_loss_mlp": 1.0364027, "epoch": 0.8874793326318954, "flos": 17894215872000.0, "grad_norm": 2.296176376332965, "language_loss": 0.76541245, "learning_rate": 1.3128271142364565e-07, "loss": 0.78683215, "num_input_tokens_seen": 318257425, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 14761, "time_per_iteration": 2.4243829250335693 }, { "auxiliary_loss_clip": 0.01104643, "auxiliary_loss_mlp": 0.01034301, "balance_loss_clip": 1.0222491, "balance_loss_mlp": 1.0347203, "epoch": 0.8875394558845634, "flos": 31102231772160.0, "grad_norm": 1.797418638971787, "language_loss": 0.61470544, "learning_rate": 1.3114396841434717e-07, "loss": 0.63609487, "num_input_tokens_seen": 318278485, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69921875, "step": 14762, "time_per_iteration": 2.5354225635528564 }, { "auxiliary_loss_clip": 0.01104369, "auxiliary_loss_mlp": 0.01028862, "balance_loss_clip": 1.01600516, "balance_loss_mlp": 1.03485107, "epoch": 0.8875995791372313, "flos": 21142048648320.0, "grad_norm": 2.2129630221390726, "language_loss": 0.64334428, "learning_rate": 1.3100529627225697e-07, "loss": 0.66467661, "num_input_tokens_seen": 318297560, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 14763, "time_per_iteration": 2.4848172664642334 }, { "auxiliary_loss_clip": 0.01105274, "auxiliary_loss_mlp": 0.01031433, "balance_loss_clip": 1.01820683, "balance_loss_mlp": 1.03578162, "epoch": 0.8876597023898993, "flos": 17455136019840.0, "grad_norm": 3.013921515739404, "language_loss": 0.71165431, "learning_rate": 1.3086669500263335e-07, "loss": 0.73302138, "num_input_tokens_seen": 318313060, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 14764, "time_per_iteration": 2.46407151222229 }, { "auxiliary_loss_clip": 0.01108268, "auxiliary_loss_mlp": 0.01034861, "balance_loss_clip": 1.02251744, "balance_loss_mlp": 1.03533304, "epoch": 0.8877198256425672, "flos": 22707933125760.0, "grad_norm": 2.1498159614386405, "language_loss": 0.66270661, "learning_rate": 1.3072816461073166e-07, "loss": 0.68413788, "num_input_tokens_seen": 318332030, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.7265625, "step": 14765, "time_per_iteration": 2.4495015144348145 }, { "auxiliary_loss_clip": 0.01100284, "auxiliary_loss_mlp": 0.01027376, "balance_loss_clip": 1.01650429, "balance_loss_mlp": 1.03469324, "epoch": 0.8877799488952353, "flos": 24535104111360.0, "grad_norm": 2.1960620691636983, "language_loss": 0.76505423, "learning_rate": 1.3058970510180568e-07, "loss": 0.78633082, "num_input_tokens_seen": 318351090, "router_z_loss_clip": 0.10888672, "router_z_loss_mlp": 0.65625, "step": 14766, "time_per_iteration": 2.496011734008789 }, { "auxiliary_loss_clip": 0.0110052, "auxiliary_loss_mlp": 0.01029251, "balance_loss_clip": 1.01747298, "balance_loss_mlp": 1.03411007, "epoch": 0.8878400721479032, "flos": 20959191486720.0, "grad_norm": 4.522962366156192, "language_loss": 0.73632979, "learning_rate": 1.3045131648110496e-07, "loss": 0.75762749, "num_input_tokens_seen": 318372000, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6640625, "step": 14767, "time_per_iteration": 2.4431653022766113 }, { "auxiliary_loss_clip": 0.01100399, "auxiliary_loss_mlp": 0.01026696, "balance_loss_clip": 1.01516294, "balance_loss_mlp": 1.03464627, "epoch": 0.8879001954005712, "flos": 25295260659840.0, "grad_norm": 1.8258635211167107, "language_loss": 0.7104528, "learning_rate": 1.303129987538778e-07, "loss": 0.73172379, "num_input_tokens_seen": 318391530, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.65625, "step": 14768, "time_per_iteration": 2.5149219036102295 }, { "auxiliary_loss_clip": 0.0110328, "auxiliary_loss_mlp": 0.01030873, "balance_loss_clip": 1.0191071, "balance_loss_mlp": 1.03524339, "epoch": 0.8879603186532391, "flos": 23185329811200.0, "grad_norm": 2.1518222478634, "language_loss": 0.70185673, "learning_rate": 1.3017475192536932e-07, "loss": 0.72319829, "num_input_tokens_seen": 318410690, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6796875, "step": 14769, "time_per_iteration": 2.4615399837493896 }, { "auxiliary_loss_clip": 0.01102773, "auxiliary_loss_mlp": 0.01033826, "balance_loss_clip": 1.02230453, "balance_loss_mlp": 1.03575552, "epoch": 0.8880204419059071, "flos": 13655427707520.0, "grad_norm": 4.336503704222136, "language_loss": 0.67203861, "learning_rate": 1.3003657600082174e-07, "loss": 0.69340461, "num_input_tokens_seen": 318427380, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 14770, "time_per_iteration": 2.432985305786133 }, { "auxiliary_loss_clip": 0.01099631, "auxiliary_loss_mlp": 0.01031598, "balance_loss_clip": 1.01946211, "balance_loss_mlp": 1.03413284, "epoch": 0.888080565158575, "flos": 20631865824000.0, "grad_norm": 2.4643495986374053, "language_loss": 0.65596384, "learning_rate": 1.2989847098547424e-07, "loss": 0.67727613, "num_input_tokens_seen": 318448530, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.65625, "step": 14771, "time_per_iteration": 2.4741287231445312 }, { "auxiliary_loss_clip": 0.0110255, "auxiliary_loss_mlp": 0.01025245, "balance_loss_clip": 1.01349711, "balance_loss_mlp": 1.03426147, "epoch": 0.888140688411243, "flos": 28620014411520.0, "grad_norm": 1.5767258016421357, "language_loss": 0.8270089, "learning_rate": 1.2976043688456396e-07, "loss": 0.84828687, "num_input_tokens_seen": 318468655, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.68359375, "step": 14772, "time_per_iteration": 2.5397448539733887 }, { "auxiliary_loss_clip": 0.01098925, "auxiliary_loss_mlp": 0.0102331, "balance_loss_clip": 1.01203847, "balance_loss_mlp": 1.03376639, "epoch": 0.8882008116639111, "flos": 25520241496320.0, "grad_norm": 1.8704733130442186, "language_loss": 0.76400012, "learning_rate": 1.296224737033258e-07, "loss": 0.78522247, "num_input_tokens_seen": 318488740, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.65234375, "step": 14773, "time_per_iteration": 2.481797218322754 }, { "auxiliary_loss_clip": 0.01102173, "auxiliary_loss_mlp": 0.01028434, "balance_loss_clip": 1.01703739, "balance_loss_mlp": 1.03617597, "epoch": 0.888260934916579, "flos": 27673696650240.0, "grad_norm": 1.8267276140564481, "language_loss": 0.75115681, "learning_rate": 1.294845814469907e-07, "loss": 0.7724629, "num_input_tokens_seen": 318508810, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66015625, "step": 14774, "time_per_iteration": 2.5120327472686768 }, { "auxiliary_loss_clip": 0.01105636, "auxiliary_loss_mlp": 0.01032844, "balance_loss_clip": 1.0199399, "balance_loss_mlp": 1.03552985, "epoch": 0.888321058169247, "flos": 21611077464960.0, "grad_norm": 2.349276861336024, "language_loss": 0.71815383, "learning_rate": 1.2934676012078783e-07, "loss": 0.73953855, "num_input_tokens_seen": 318526860, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 14775, "time_per_iteration": 2.4433298110961914 }, { "auxiliary_loss_clip": 0.01101463, "auxiliary_loss_mlp": 0.01028602, "balance_loss_clip": 1.01717567, "balance_loss_mlp": 1.03382075, "epoch": 0.8883811814219149, "flos": 18149109759360.0, "grad_norm": 1.6208518910323968, "language_loss": 0.80100167, "learning_rate": 1.292090097299432e-07, "loss": 0.82230234, "num_input_tokens_seen": 318545180, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.67578125, "step": 14776, "time_per_iteration": 2.4274282455444336 }, { "auxiliary_loss_clip": 0.01106404, "auxiliary_loss_mlp": 0.01034043, "balance_loss_clip": 1.02122784, "balance_loss_mlp": 1.03393972, "epoch": 0.8884413046745829, "flos": 28324648874880.0, "grad_norm": 2.1465816680331926, "language_loss": 0.69638216, "learning_rate": 1.290713302796802e-07, "loss": 0.71778667, "num_input_tokens_seen": 318564350, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 14777, "time_per_iteration": 2.5222816467285156 }, { "auxiliary_loss_clip": 0.01101317, "auxiliary_loss_mlp": 0.01035031, "balance_loss_clip": 1.02309823, "balance_loss_mlp": 1.03337359, "epoch": 0.8885014279272508, "flos": 15158756649600.0, "grad_norm": 1.7030251021460174, "language_loss": 0.7059688, "learning_rate": 1.2893372177522e-07, "loss": 0.72733223, "num_input_tokens_seen": 318582275, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 14778, "time_per_iteration": 2.4274182319641113 }, { "auxiliary_loss_clip": 0.01103507, "auxiliary_loss_mlp": 0.01027417, "balance_loss_clip": 1.01579976, "balance_loss_mlp": 1.03481174, "epoch": 0.8885615511799189, "flos": 19099593498240.0, "grad_norm": 1.719233896879589, "language_loss": 0.7760964, "learning_rate": 1.287961842217804e-07, "loss": 0.7974056, "num_input_tokens_seen": 318601230, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6875, "step": 14779, "time_per_iteration": 2.4210078716278076 }, { "auxiliary_loss_clip": 0.0102864, "auxiliary_loss_mlp": 0.01000878, "balance_loss_clip": 0.99984694, "balance_loss_mlp": 1.00644004, "epoch": 0.8886216744325868, "flos": 51186567605760.0, "grad_norm": 0.8810066588444091, "language_loss": 0.56817806, "learning_rate": 1.2865871762457747e-07, "loss": 0.5884732, "num_input_tokens_seen": 318645595, "router_z_loss_clip": 0.01031494, "router_z_loss_mlp": 0.22265625, "step": 14780, "time_per_iteration": 2.9056034088134766 }, { "auxiliary_loss_clip": 0.01028841, "auxiliary_loss_mlp": 0.01002224, "balance_loss_clip": 1.00104415, "balance_loss_mlp": 1.00646126, "epoch": 0.8886817976852548, "flos": 61612981263360.0, "grad_norm": 0.8009575542652585, "language_loss": 0.62367463, "learning_rate": 1.2852132198882326e-07, "loss": 0.64398527, "num_input_tokens_seen": 318707850, "router_z_loss_clip": 0.01177979, "router_z_loss_mlp": 0.22460938, "step": 14781, "time_per_iteration": 3.1652517318725586 }, { "auxiliary_loss_clip": 0.01028883, "auxiliary_loss_mlp": 0.0100055, "balance_loss_clip": 0.99946541, "balance_loss_mlp": 1.00654805, "epoch": 0.8887419209379227, "flos": 60646946935680.0, "grad_norm": 0.7881478070149669, "language_loss": 0.58180964, "learning_rate": 1.2838399731972805e-07, "loss": 0.60210395, "num_input_tokens_seen": 318764915, "router_z_loss_clip": 0.01086426, "router_z_loss_mlp": 0.22363281, "step": 14782, "time_per_iteration": 2.940699577331543 }, { "auxiliary_loss_clip": 0.0110187, "auxiliary_loss_mlp": 0.01028024, "balance_loss_clip": 1.01652002, "balance_loss_mlp": 1.03525746, "epoch": 0.8888020441905907, "flos": 29205861235200.0, "grad_norm": 3.040340771567338, "language_loss": 0.65835816, "learning_rate": 1.2824674362249922e-07, "loss": 0.6796571, "num_input_tokens_seen": 318785660, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 14783, "time_per_iteration": 2.5436904430389404 }, { "auxiliary_loss_clip": 0.01108343, "auxiliary_loss_mlp": 0.01032449, "balance_loss_clip": 1.01921666, "balance_loss_mlp": 1.03672719, "epoch": 0.8888621674432586, "flos": 22162701605760.0, "grad_norm": 1.6541187705570481, "language_loss": 0.77575368, "learning_rate": 1.281095609023415e-07, "loss": 0.79716164, "num_input_tokens_seen": 318806080, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 14784, "time_per_iteration": 2.450160503387451 }, { "auxiliary_loss_clip": 0.01106171, "auxiliary_loss_mlp": 0.01034116, "balance_loss_clip": 1.02152157, "balance_loss_mlp": 1.0364449, "epoch": 0.8889222906959267, "flos": 27672834723840.0, "grad_norm": 4.385368981706124, "language_loss": 0.60646957, "learning_rate": 1.279724491644565e-07, "loss": 0.62787247, "num_input_tokens_seen": 318826445, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 14785, "time_per_iteration": 2.5371973514556885 }, { "auxiliary_loss_clip": 0.01106453, "auxiliary_loss_mlp": 0.01028193, "balance_loss_clip": 1.01604605, "balance_loss_mlp": 1.03811443, "epoch": 0.8889824139485947, "flos": 14168627274240.0, "grad_norm": 2.2152914138094646, "language_loss": 0.65022475, "learning_rate": 1.278354084140445e-07, "loss": 0.67157125, "num_input_tokens_seen": 318843915, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.68359375, "step": 14786, "time_per_iteration": 2.4223380088806152 }, { "auxiliary_loss_clip": 0.0110927, "auxiliary_loss_mlp": 0.01032509, "balance_loss_clip": 1.01868677, "balance_loss_mlp": 1.03639531, "epoch": 0.8890425372012626, "flos": 12853003829760.0, "grad_norm": 2.6028293104753737, "language_loss": 0.85428333, "learning_rate": 1.276984386563009e-07, "loss": 0.87570107, "num_input_tokens_seen": 318859670, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.7265625, "step": 14787, "time_per_iteration": 2.4278829097747803 }, { "auxiliary_loss_clip": 0.01103877, "auxiliary_loss_mlp": 0.0103039, "balance_loss_clip": 1.01826048, "balance_loss_mlp": 1.03533423, "epoch": 0.8891026604539306, "flos": 21689291329920.0, "grad_norm": 1.796348617560746, "language_loss": 0.70563388, "learning_rate": 1.2756153989642027e-07, "loss": 0.72697651, "num_input_tokens_seen": 318877855, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 14788, "time_per_iteration": 2.4744417667388916 }, { "auxiliary_loss_clip": 0.01101133, "auxiliary_loss_mlp": 0.01026872, "balance_loss_clip": 1.01532066, "balance_loss_mlp": 1.03499269, "epoch": 0.8891627837065985, "flos": 21871430219520.0, "grad_norm": 1.6182878238329175, "language_loss": 0.70058328, "learning_rate": 1.274247121395935e-07, "loss": 0.72186333, "num_input_tokens_seen": 318896045, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.66015625, "step": 14789, "time_per_iteration": 3.764434576034546 }, { "auxiliary_loss_clip": 0.01104542, "auxiliary_loss_mlp": 0.01023297, "balance_loss_clip": 1.01146579, "balance_loss_mlp": 1.03626561, "epoch": 0.8892229069592665, "flos": 21580230660480.0, "grad_norm": 1.8143952390244715, "language_loss": 0.70445776, "learning_rate": 1.2728795539100956e-07, "loss": 0.72573614, "num_input_tokens_seen": 318915515, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 14790, "time_per_iteration": 2.4480252265930176 }, { "auxiliary_loss_clip": 0.01104021, "auxiliary_loss_mlp": 0.0102749, "balance_loss_clip": 1.01632047, "balance_loss_mlp": 1.0356257, "epoch": 0.8892830302119344, "flos": 23075981832960.0, "grad_norm": 1.9964401138267958, "language_loss": 0.73076653, "learning_rate": 1.2715126965585387e-07, "loss": 0.75208163, "num_input_tokens_seen": 318934305, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.68359375, "step": 14791, "time_per_iteration": 2.5014820098876953 }, { "auxiliary_loss_clip": 0.01102667, "auxiliary_loss_mlp": 0.01033433, "balance_loss_clip": 1.02142286, "balance_loss_mlp": 1.03607059, "epoch": 0.8893431534646025, "flos": 23072139077760.0, "grad_norm": 1.439685121357616, "language_loss": 0.74079514, "learning_rate": 1.2701465493931008e-07, "loss": 0.76215625, "num_input_tokens_seen": 318953880, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.66796875, "step": 14792, "time_per_iteration": 2.445831298828125 }, { "auxiliary_loss_clip": 0.01107712, "auxiliary_loss_mlp": 0.01031367, "balance_loss_clip": 1.01796198, "balance_loss_mlp": 1.0356667, "epoch": 0.8894032767172704, "flos": 22454978572800.0, "grad_norm": 3.4558135215575194, "language_loss": 0.65966326, "learning_rate": 1.2687811124655801e-07, "loss": 0.681054, "num_input_tokens_seen": 318971395, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 14793, "time_per_iteration": 3.842465877532959 }, { "auxiliary_loss_clip": 0.0110664, "auxiliary_loss_mlp": 0.0103195, "balance_loss_clip": 1.01877689, "balance_loss_mlp": 1.03569257, "epoch": 0.8894633999699384, "flos": 25338246261120.0, "grad_norm": 1.660917820876462, "language_loss": 0.71589184, "learning_rate": 1.2674163858277552e-07, "loss": 0.73727775, "num_input_tokens_seen": 318990580, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 14794, "time_per_iteration": 2.479031801223755 }, { "auxiliary_loss_clip": 0.01109796, "auxiliary_loss_mlp": 0.01030128, "balance_loss_clip": 1.01714587, "balance_loss_mlp": 1.03740025, "epoch": 0.8895235232226063, "flos": 20994096528000.0, "grad_norm": 2.4841651679423737, "language_loss": 0.75363451, "learning_rate": 1.2660523695313785e-07, "loss": 0.77503377, "num_input_tokens_seen": 319010040, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.72265625, "step": 14795, "time_per_iteration": 3.815742015838623 }, { "auxiliary_loss_clip": 0.01028773, "auxiliary_loss_mlp": 0.01000252, "balance_loss_clip": 0.99918538, "balance_loss_mlp": 1.00644445, "epoch": 0.8895836464752743, "flos": 69732956764800.0, "grad_norm": 0.7714447912484077, "language_loss": 0.56083864, "learning_rate": 1.2646890636281727e-07, "loss": 0.58112884, "num_input_tokens_seen": 319063860, "router_z_loss_clip": 0.01068115, "router_z_loss_mlp": 0.22265625, "step": 14796, "time_per_iteration": 4.40482759475708 }, { "auxiliary_loss_clip": 0.01106363, "auxiliary_loss_mlp": 0.01030752, "balance_loss_clip": 1.01724553, "balance_loss_mlp": 1.03622007, "epoch": 0.8896437697279422, "flos": 23221815050880.0, "grad_norm": 2.73361078178719, "language_loss": 0.70052958, "learning_rate": 1.263326468169843e-07, "loss": 0.7219007, "num_input_tokens_seen": 319082335, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.703125, "step": 14797, "time_per_iteration": 2.4890127182006836 }, { "auxiliary_loss_clip": 0.01028964, "auxiliary_loss_mlp": 0.00999714, "balance_loss_clip": 0.99868309, "balance_loss_mlp": 1.00668192, "epoch": 0.8897038929806103, "flos": 70752711882240.0, "grad_norm": 0.7470851647914386, "language_loss": 0.58010769, "learning_rate": 1.2619645832080417e-07, "loss": 0.60039437, "num_input_tokens_seen": 319147075, "router_z_loss_clip": 0.01031494, "router_z_loss_mlp": 0.22265625, "step": 14798, "time_per_iteration": 3.1437697410583496 }, { "auxiliary_loss_clip": 0.01104978, "auxiliary_loss_mlp": 0.01027316, "balance_loss_clip": 1.01422048, "balance_loss_mlp": 1.03647828, "epoch": 0.8897640162332782, "flos": 19245103493760.0, "grad_norm": 1.6014681121527836, "language_loss": 0.79240024, "learning_rate": 1.2606034087944251e-07, "loss": 0.81372315, "num_input_tokens_seen": 319166630, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.68359375, "step": 14799, "time_per_iteration": 2.4945309162139893 }, { "auxiliary_loss_clip": 0.01029351, "auxiliary_loss_mlp": 0.0099923, "balance_loss_clip": 0.99813288, "balance_loss_mlp": 1.00695944, "epoch": 0.8898241394859462, "flos": 41356275039360.0, "grad_norm": 0.8843802374130578, "language_loss": 0.5810551, "learning_rate": 1.2592429449806053e-07, "loss": 0.60134095, "num_input_tokens_seen": 319221865, "router_z_loss_clip": 0.01098633, "router_z_loss_mlp": 0.22460938, "step": 14800, "time_per_iteration": 3.0253374576568604 }, { "auxiliary_loss_clip": 0.01104456, "auxiliary_loss_mlp": 0.01027806, "balance_loss_clip": 1.01596856, "balance_loss_mlp": 1.03606367, "epoch": 0.8898842627386142, "flos": 18986295024000.0, "grad_norm": 2.2179636743433027, "language_loss": 0.66536605, "learning_rate": 1.2578831918181698e-07, "loss": 0.68668866, "num_input_tokens_seen": 319240710, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 14801, "time_per_iteration": 2.523621082305908 }, { "auxiliary_loss_clip": 0.01108892, "auxiliary_loss_mlp": 0.01037704, "balance_loss_clip": 1.02364945, "balance_loss_mlp": 1.03762579, "epoch": 0.8899443859912821, "flos": 13217173868160.0, "grad_norm": 5.2246525074396635, "language_loss": 0.75609183, "learning_rate": 1.256524149358682e-07, "loss": 0.77755773, "num_input_tokens_seen": 319256495, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7109375, "step": 14802, "time_per_iteration": 2.4046499729156494 }, { "auxiliary_loss_clip": 0.0110268, "auxiliary_loss_mlp": 0.01032177, "balance_loss_clip": 1.02073884, "balance_loss_mlp": 1.03560603, "epoch": 0.8900045092439501, "flos": 22674680110080.0, "grad_norm": 2.151133199268768, "language_loss": 0.73232269, "learning_rate": 1.2551658176536805e-07, "loss": 0.75367123, "num_input_tokens_seen": 319273620, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.671875, "step": 14803, "time_per_iteration": 2.492058753967285 }, { "auxiliary_loss_clip": 0.01102949, "auxiliary_loss_mlp": 0.01032124, "balance_loss_clip": 1.01982188, "balance_loss_mlp": 1.03517318, "epoch": 0.890064632496618, "flos": 21141617685120.0, "grad_norm": 2.6038894035214804, "language_loss": 0.7162168, "learning_rate": 1.2538081967546664e-07, "loss": 0.73756754, "num_input_tokens_seen": 319291720, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 14804, "time_per_iteration": 2.437375783920288 }, { "auxiliary_loss_clip": 0.01104114, "auxiliary_loss_mlp": 0.01032446, "balance_loss_clip": 1.01958966, "balance_loss_mlp": 1.03402412, "epoch": 0.8901247557492861, "flos": 23397058529280.0, "grad_norm": 2.0384421413775455, "language_loss": 0.8110835, "learning_rate": 1.252451286713123e-07, "loss": 0.83244908, "num_input_tokens_seen": 319310380, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 14805, "time_per_iteration": 2.4753150939941406 }, { "auxiliary_loss_clip": 0.01107023, "auxiliary_loss_mlp": 0.01030884, "balance_loss_clip": 1.01855731, "balance_loss_mlp": 1.03575754, "epoch": 0.890184879001954, "flos": 29169591477120.0, "grad_norm": 5.249151456515436, "language_loss": 0.67079508, "learning_rate": 1.251095087580505e-07, "loss": 0.69217414, "num_input_tokens_seen": 319331765, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 14806, "time_per_iteration": 2.507202386856079 }, { "auxiliary_loss_clip": 0.01102383, "auxiliary_loss_mlp": 0.01030948, "balance_loss_clip": 1.01856804, "balance_loss_mlp": 1.03383934, "epoch": 0.890245002254622, "flos": 14427830793600.0, "grad_norm": 2.3423789035296187, "language_loss": 0.66929317, "learning_rate": 1.2497395994082438e-07, "loss": 0.69062644, "num_input_tokens_seen": 319349135, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 14807, "time_per_iteration": 2.4678919315338135 }, { "auxiliary_loss_clip": 0.01101174, "auxiliary_loss_mlp": 0.01029747, "balance_loss_clip": 1.01824927, "balance_loss_mlp": 1.03369188, "epoch": 0.8903051255072899, "flos": 22382187661440.0, "grad_norm": 2.8444774096445884, "language_loss": 0.75397468, "learning_rate": 1.248384822247732e-07, "loss": 0.77528387, "num_input_tokens_seen": 319368410, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.67578125, "step": 14808, "time_per_iteration": 2.4333574771881104 }, { "auxiliary_loss_clip": 0.01105271, "auxiliary_loss_mlp": 0.01032446, "balance_loss_clip": 1.02094817, "balance_loss_mlp": 1.03544545, "epoch": 0.8903652487599579, "flos": 20777375819520.0, "grad_norm": 1.9811107608554028, "language_loss": 0.81343651, "learning_rate": 1.2470307561503513e-07, "loss": 0.83481371, "num_input_tokens_seen": 319387535, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6953125, "step": 14809, "time_per_iteration": 2.4731216430664062 }, { "auxiliary_loss_clip": 0.01103431, "auxiliary_loss_mlp": 0.01029811, "balance_loss_clip": 1.0181464, "balance_loss_mlp": 1.03486431, "epoch": 0.8904253720126258, "flos": 24424499157120.0, "grad_norm": 2.2591135162387195, "language_loss": 0.68374419, "learning_rate": 1.2456774011674442e-07, "loss": 0.70507658, "num_input_tokens_seen": 319407210, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6875, "step": 14810, "time_per_iteration": 2.454364776611328 }, { "auxiliary_loss_clip": 0.0110523, "auxiliary_loss_mlp": 0.0102957, "balance_loss_clip": 1.01641512, "balance_loss_mlp": 1.0342772, "epoch": 0.8904854952652939, "flos": 19463871277440.0, "grad_norm": 2.6420988769640763, "language_loss": 0.70685852, "learning_rate": 1.2443247573503257e-07, "loss": 0.72820657, "num_input_tokens_seen": 319425340, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 14811, "time_per_iteration": 2.4661083221435547 }, { "auxiliary_loss_clip": 0.0110702, "auxiliary_loss_mlp": 0.01032966, "balance_loss_clip": 1.02084875, "balance_loss_mlp": 1.03683007, "epoch": 0.8905456185179618, "flos": 50800741666560.0, "grad_norm": 2.23464340843497, "language_loss": 0.65643293, "learning_rate": 1.2429728247502924e-07, "loss": 0.67783278, "num_input_tokens_seen": 319448150, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 14812, "time_per_iteration": 2.6935648918151855 }, { "auxiliary_loss_clip": 0.01103543, "auxiliary_loss_mlp": 0.01031215, "balance_loss_clip": 1.01938319, "balance_loss_mlp": 1.03591776, "epoch": 0.8906057417706298, "flos": 17784867893760.0, "grad_norm": 2.8105981969639036, "language_loss": 0.6820327, "learning_rate": 1.24162160341861e-07, "loss": 0.70338035, "num_input_tokens_seen": 319466115, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.67578125, "step": 14813, "time_per_iteration": 2.4894564151763916 }, { "auxiliary_loss_clip": 0.01110513, "auxiliary_loss_mlp": 0.01034029, "balance_loss_clip": 1.01967657, "balance_loss_mlp": 1.03675151, "epoch": 0.8906658650232978, "flos": 21944867575680.0, "grad_norm": 1.9062296129648415, "language_loss": 0.7551719, "learning_rate": 1.2402710934065198e-07, "loss": 0.77661735, "num_input_tokens_seen": 319485255, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.734375, "step": 14814, "time_per_iteration": 2.4448440074920654 }, { "auxiliary_loss_clip": 0.01105712, "auxiliary_loss_mlp": 0.01029443, "balance_loss_clip": 1.01661634, "balance_loss_mlp": 1.03520465, "epoch": 0.8907259882759657, "flos": 21287810039040.0, "grad_norm": 2.6702817183552194, "language_loss": 0.74503565, "learning_rate": 1.2389212947652229e-07, "loss": 0.76638722, "num_input_tokens_seen": 319501800, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 14815, "time_per_iteration": 2.4730358123779297 }, { "auxiliary_loss_clip": 0.01102521, "auxiliary_loss_mlp": 0.01028606, "balance_loss_clip": 1.01675653, "balance_loss_mlp": 1.03541064, "epoch": 0.8907861115286337, "flos": 20120426023680.0, "grad_norm": 2.3762399351949806, "language_loss": 0.75265729, "learning_rate": 1.237572207545914e-07, "loss": 0.77396852, "num_input_tokens_seen": 319520415, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 14816, "time_per_iteration": 2.441027879714966 }, { "auxiliary_loss_clip": 0.01104007, "auxiliary_loss_mlp": 0.01029668, "balance_loss_clip": 1.01759791, "balance_loss_mlp": 1.0344913, "epoch": 0.8908462347813016, "flos": 20084156265600.0, "grad_norm": 2.0547252993605136, "language_loss": 0.77570808, "learning_rate": 1.2362238317997476e-07, "loss": 0.79704475, "num_input_tokens_seen": 319538410, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 14817, "time_per_iteration": 2.462719440460205 }, { "auxiliary_loss_clip": 0.01029248, "auxiliary_loss_mlp": 0.01001229, "balance_loss_clip": 1.00014389, "balance_loss_mlp": 1.00689304, "epoch": 0.8909063580339697, "flos": 65503649790720.0, "grad_norm": 0.7541498092149606, "language_loss": 0.56534827, "learning_rate": 1.2348761675778517e-07, "loss": 0.58565295, "num_input_tokens_seen": 319602565, "router_z_loss_clip": 0.01086426, "router_z_loss_mlp": 0.22363281, "step": 14818, "time_per_iteration": 3.1325511932373047 }, { "auxiliary_loss_clip": 0.01104553, "auxiliary_loss_mlp": 0.01036211, "balance_loss_clip": 1.02386081, "balance_loss_mlp": 1.03607631, "epoch": 0.8909664812866376, "flos": 29863062426240.0, "grad_norm": 1.8203296173672139, "language_loss": 0.64411139, "learning_rate": 1.2335292149313325e-07, "loss": 0.665519, "num_input_tokens_seen": 319624645, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.68359375, "step": 14819, "time_per_iteration": 2.526853322982788 }, { "auxiliary_loss_clip": 0.0110451, "auxiliary_loss_mlp": 0.01029051, "balance_loss_clip": 1.01606941, "balance_loss_mlp": 1.03513384, "epoch": 0.8910266045393056, "flos": 25447127362560.0, "grad_norm": 1.8894993948902759, "language_loss": 0.78122628, "learning_rate": 1.2321829739112731e-07, "loss": 0.80256188, "num_input_tokens_seen": 319644040, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 14820, "time_per_iteration": 2.4800448417663574 }, { "auxiliary_loss_clip": 0.01104491, "auxiliary_loss_mlp": 0.01031897, "balance_loss_clip": 1.02006519, "balance_loss_mlp": 1.03530431, "epoch": 0.8910867277919735, "flos": 24499121662080.0, "grad_norm": 1.6827908555498707, "language_loss": 0.76601565, "learning_rate": 1.2308374445687087e-07, "loss": 0.78737962, "num_input_tokens_seen": 319663930, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 14821, "time_per_iteration": 2.5385560989379883 }, { "auxiliary_loss_clip": 0.01029252, "auxiliary_loss_mlp": 0.01001855, "balance_loss_clip": 1.00076377, "balance_loss_mlp": 1.00697708, "epoch": 0.8911468510446415, "flos": 60688136856960.0, "grad_norm": 0.8042142626119093, "language_loss": 0.59299308, "learning_rate": 1.2294926269546712e-07, "loss": 0.6133042, "num_input_tokens_seen": 319721245, "router_z_loss_clip": 0.01092529, "router_z_loss_mlp": 0.22265625, "step": 14822, "time_per_iteration": 2.9922962188720703 }, { "auxiliary_loss_clip": 0.01103579, "auxiliary_loss_mlp": 0.01032325, "balance_loss_clip": 1.02012372, "balance_loss_mlp": 1.03441727, "epoch": 0.8912069742973094, "flos": 25337492075520.0, "grad_norm": 2.0252853703352325, "language_loss": 0.68866646, "learning_rate": 1.2281485211201515e-07, "loss": 0.71002549, "num_input_tokens_seen": 319741200, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 14823, "time_per_iteration": 2.4715800285339355 }, { "auxiliary_loss_clip": 0.01101453, "auxiliary_loss_mlp": 0.01034498, "balance_loss_clip": 1.0221951, "balance_loss_mlp": 1.03424513, "epoch": 0.8912670975499775, "flos": 18223516782720.0, "grad_norm": 1.9834258374729221, "language_loss": 0.69608092, "learning_rate": 1.2268051271161262e-07, "loss": 0.71744043, "num_input_tokens_seen": 319759265, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 14824, "time_per_iteration": 2.4665420055389404 }, { "auxiliary_loss_clip": 0.01105908, "auxiliary_loss_mlp": 0.01032078, "balance_loss_clip": 1.018852, "balance_loss_mlp": 1.03465056, "epoch": 0.8913272208026454, "flos": 26504481041280.0, "grad_norm": 2.1941271447042, "language_loss": 0.70199442, "learning_rate": 1.2254624449935303e-07, "loss": 0.72337431, "num_input_tokens_seen": 319777560, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 14825, "time_per_iteration": 2.5401554107666016 }, { "auxiliary_loss_clip": 0.01103686, "auxiliary_loss_mlp": 0.0103061, "balance_loss_clip": 1.01815283, "balance_loss_mlp": 1.03620207, "epoch": 0.8913873440553134, "flos": 18802324540800.0, "grad_norm": 1.8254118903164975, "language_loss": 0.71403873, "learning_rate": 1.2241204748032786e-07, "loss": 0.73538172, "num_input_tokens_seen": 319794125, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.67578125, "step": 14826, "time_per_iteration": 2.448338747024536 }, { "auxiliary_loss_clip": 0.01103885, "auxiliary_loss_mlp": 0.01026303, "balance_loss_clip": 1.01389933, "balance_loss_mlp": 1.03561366, "epoch": 0.8914474673079814, "flos": 20884892204160.0, "grad_norm": 2.08867074255787, "language_loss": 0.75307405, "learning_rate": 1.2227792165962615e-07, "loss": 0.77437592, "num_input_tokens_seen": 319810310, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 14827, "time_per_iteration": 2.4419920444488525 }, { "auxiliary_loss_clip": 0.01105034, "auxiliary_loss_mlp": 0.01031576, "balance_loss_clip": 1.01913095, "balance_loss_mlp": 1.03533959, "epoch": 0.8915075905606493, "flos": 20952439729920.0, "grad_norm": 1.8066387086415157, "language_loss": 0.78117269, "learning_rate": 1.221438670423336e-07, "loss": 0.80253881, "num_input_tokens_seen": 319828505, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 14828, "time_per_iteration": 2.448078155517578 }, { "auxiliary_loss_clip": 0.0110262, "auxiliary_loss_mlp": 0.01032062, "balance_loss_clip": 1.01953292, "balance_loss_mlp": 1.03448462, "epoch": 0.8915677138133173, "flos": 23076305055360.0, "grad_norm": 1.8407035748648866, "language_loss": 0.75241745, "learning_rate": 1.2200988363353392e-07, "loss": 0.77376431, "num_input_tokens_seen": 319848680, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 14829, "time_per_iteration": 2.4604756832122803 }, { "auxiliary_loss_clip": 0.01103097, "auxiliary_loss_mlp": 0.01034983, "balance_loss_clip": 1.02338398, "balance_loss_mlp": 1.03350616, "epoch": 0.8916278370659853, "flos": 23440259612160.0, "grad_norm": 1.6155707417457326, "language_loss": 0.84667283, "learning_rate": 1.2187597143830773e-07, "loss": 0.86805367, "num_input_tokens_seen": 319868835, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 14830, "time_per_iteration": 2.452850818634033 }, { "auxiliary_loss_clip": 0.01101171, "auxiliary_loss_mlp": 0.01026501, "balance_loss_clip": 1.01525915, "balance_loss_mlp": 1.03462768, "epoch": 0.8916879603186533, "flos": 25160488830720.0, "grad_norm": 1.6662205951804383, "language_loss": 0.74820423, "learning_rate": 1.2174213046173299e-07, "loss": 0.76948088, "num_input_tokens_seen": 319891585, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6640625, "step": 14831, "time_per_iteration": 3.968169927597046 }, { "auxiliary_loss_clip": 0.01105413, "auxiliary_loss_mlp": 0.01027497, "balance_loss_clip": 1.01492655, "balance_loss_mlp": 1.03476548, "epoch": 0.8917480835713212, "flos": 20229845829120.0, "grad_norm": 2.8162101036226264, "language_loss": 0.73123801, "learning_rate": 1.216083607088847e-07, "loss": 0.75256711, "num_input_tokens_seen": 319910315, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 14832, "time_per_iteration": 2.4573447704315186 }, { "auxiliary_loss_clip": 0.01105265, "auxiliary_loss_mlp": 0.01029848, "balance_loss_clip": 1.01777244, "balance_loss_mlp": 1.03459501, "epoch": 0.8918082068239892, "flos": 26101922342400.0, "grad_norm": 2.0661006321877378, "language_loss": 0.67351472, "learning_rate": 1.214746621848355e-07, "loss": 0.69486582, "num_input_tokens_seen": 319932275, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.70703125, "step": 14833, "time_per_iteration": 2.477123498916626 }, { "auxiliary_loss_clip": 0.01110106, "auxiliary_loss_mlp": 0.01035747, "balance_loss_clip": 1.02224624, "balance_loss_mlp": 1.03808558, "epoch": 0.8918683300766571, "flos": 24831439315200.0, "grad_norm": 1.7167816217190266, "language_loss": 0.73933184, "learning_rate": 1.2134103489465575e-07, "loss": 0.76079035, "num_input_tokens_seen": 319955335, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 14834, "time_per_iteration": 4.007714509963989 }, { "auxiliary_loss_clip": 0.01103661, "auxiliary_loss_mlp": 0.01031091, "balance_loss_clip": 1.01948011, "balance_loss_mlp": 1.03562999, "epoch": 0.8919284533293251, "flos": 22305158945280.0, "grad_norm": 2.2921050211604066, "language_loss": 0.79071379, "learning_rate": 1.2120747884341188e-07, "loss": 0.81206137, "num_input_tokens_seen": 319973990, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 14835, "time_per_iteration": 2.4637451171875 }, { "auxiliary_loss_clip": 0.01099725, "auxiliary_loss_mlp": 0.01028434, "balance_loss_clip": 1.01712108, "balance_loss_mlp": 1.03350496, "epoch": 0.891988576581993, "flos": 30373532559360.0, "grad_norm": 1.4524798829755452, "language_loss": 0.73924494, "learning_rate": 1.210739940361689e-07, "loss": 0.76052648, "num_input_tokens_seen": 319995555, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6640625, "step": 14836, "time_per_iteration": 2.529411554336548 }, { "auxiliary_loss_clip": 0.01103131, "auxiliary_loss_mlp": 0.01031531, "balance_loss_clip": 1.01891828, "balance_loss_mlp": 1.03399372, "epoch": 0.8920486998346611, "flos": 15552947479680.0, "grad_norm": 4.5408443229435385, "language_loss": 0.6891436, "learning_rate": 1.2094058047798838e-07, "loss": 0.71049023, "num_input_tokens_seen": 320012385, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 14837, "time_per_iteration": 3.8188347816467285 }, { "auxiliary_loss_clip": 0.01108998, "auxiliary_loss_mlp": 0.0103283, "balance_loss_clip": 1.01972258, "balance_loss_mlp": 1.03610373, "epoch": 0.892108823087329, "flos": 21214983214080.0, "grad_norm": 2.0420281176880137, "language_loss": 0.67660511, "learning_rate": 1.2080723817392913e-07, "loss": 0.69802332, "num_input_tokens_seen": 320032390, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 14838, "time_per_iteration": 3.911189556121826 }, { "auxiliary_loss_clip": 0.01105668, "auxiliary_loss_mlp": 0.01029517, "balance_loss_clip": 1.01653528, "balance_loss_mlp": 1.03525734, "epoch": 0.892168946339997, "flos": 21978982517760.0, "grad_norm": 2.0946854415596543, "language_loss": 0.76524913, "learning_rate": 1.2067396712904777e-07, "loss": 0.78660095, "num_input_tokens_seen": 320052885, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 14839, "time_per_iteration": 2.480756998062134 }, { "auxiliary_loss_clip": 0.0102902, "auxiliary_loss_mlp": 0.01004381, "balance_loss_clip": 1.0032841, "balance_loss_mlp": 1.0066812, "epoch": 0.892229069592665, "flos": 67475289277440.0, "grad_norm": 0.68675381885676, "language_loss": 0.49448133, "learning_rate": 1.205407673483978e-07, "loss": 0.51481533, "num_input_tokens_seen": 320113685, "router_z_loss_clip": 0.01098633, "router_z_loss_mlp": 0.22363281, "step": 14840, "time_per_iteration": 3.0518627166748047 }, { "auxiliary_loss_clip": 0.01110903, "auxiliary_loss_mlp": 0.0103144, "balance_loss_clip": 1.01770675, "balance_loss_mlp": 1.03678381, "epoch": 0.8922891928453329, "flos": 19459561645440.0, "grad_norm": 2.637936885705978, "language_loss": 0.64044088, "learning_rate": 1.2040763883703074e-07, "loss": 0.66186428, "num_input_tokens_seen": 320130810, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7421875, "step": 14841, "time_per_iteration": 2.4249887466430664 }, { "auxiliary_loss_clip": 0.0110036, "auxiliary_loss_mlp": 0.0103335, "balance_loss_clip": 1.02220988, "balance_loss_mlp": 1.0343678, "epoch": 0.8923493160980009, "flos": 23367396873600.0, "grad_norm": 1.6998877382650721, "language_loss": 0.68231207, "learning_rate": 1.2027458159999438e-07, "loss": 0.70364916, "num_input_tokens_seen": 320152170, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.65625, "step": 14842, "time_per_iteration": 2.486980438232422 }, { "auxiliary_loss_clip": 0.01101958, "auxiliary_loss_mlp": 0.01034158, "balance_loss_clip": 1.02312517, "balance_loss_mlp": 1.03584242, "epoch": 0.8924094393506689, "flos": 26177047637760.0, "grad_norm": 7.699865646906407, "language_loss": 0.79908276, "learning_rate": 1.2014159564233373e-07, "loss": 0.82044399, "num_input_tokens_seen": 320172360, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.66015625, "step": 14843, "time_per_iteration": 2.4886882305145264 }, { "auxiliary_loss_clip": 0.01106971, "auxiliary_loss_mlp": 0.01033782, "balance_loss_clip": 1.02065659, "balance_loss_mlp": 1.03542399, "epoch": 0.8924695626033369, "flos": 22018520413440.0, "grad_norm": 1.9726299308764506, "language_loss": 0.68273091, "learning_rate": 1.2000868096909257e-07, "loss": 0.70413846, "num_input_tokens_seen": 320192130, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 14844, "time_per_iteration": 2.4526360034942627 }, { "auxiliary_loss_clip": 0.01104841, "auxiliary_loss_mlp": 0.01029223, "balance_loss_clip": 1.01740921, "balance_loss_mlp": 1.03636765, "epoch": 0.8925296858560048, "flos": 14793940166400.0, "grad_norm": 2.169229599939738, "language_loss": 0.91602463, "learning_rate": 1.1987583758531038e-07, "loss": 0.93736523, "num_input_tokens_seen": 320207760, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 14845, "time_per_iteration": 2.416904926300049 }, { "auxiliary_loss_clip": 0.01102156, "auxiliary_loss_mlp": 0.01024697, "balance_loss_clip": 1.01330614, "balance_loss_mlp": 1.035303, "epoch": 0.8925898091086728, "flos": 22346636175360.0, "grad_norm": 2.2915539133755685, "language_loss": 0.72484988, "learning_rate": 1.1974306549602476e-07, "loss": 0.74611843, "num_input_tokens_seen": 320225325, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.66796875, "step": 14846, "time_per_iteration": 2.4541163444519043 }, { "auxiliary_loss_clip": 0.01107416, "auxiliary_loss_mlp": 0.01034215, "balance_loss_clip": 1.02169168, "balance_loss_mlp": 1.03689158, "epoch": 0.8926499323613407, "flos": 45806322067200.0, "grad_norm": 1.772961681885717, "language_loss": 0.56808114, "learning_rate": 1.1961036470627094e-07, "loss": 0.58949745, "num_input_tokens_seen": 320247645, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 14847, "time_per_iteration": 2.6864335536956787 }, { "auxiliary_loss_clip": 0.01103535, "auxiliary_loss_mlp": 0.01031303, "balance_loss_clip": 1.02000213, "balance_loss_mlp": 1.03399241, "epoch": 0.8927100556140087, "flos": 22127042378880.0, "grad_norm": 2.025114398155318, "language_loss": 0.76891977, "learning_rate": 1.1947773522108052e-07, "loss": 0.79026818, "num_input_tokens_seen": 320266005, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6953125, "step": 14848, "time_per_iteration": 2.475431442260742 }, { "auxiliary_loss_clip": 0.01103246, "auxiliary_loss_mlp": 0.01031258, "balance_loss_clip": 1.01931298, "balance_loss_mlp": 1.03576303, "epoch": 0.8927701788666766, "flos": 28330143655680.0, "grad_norm": 1.8106170241685167, "language_loss": 0.69190735, "learning_rate": 1.1934517704548251e-07, "loss": 0.71325237, "num_input_tokens_seen": 320285555, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 14849, "time_per_iteration": 2.5039494037628174 }, { "auxiliary_loss_clip": 0.0110818, "auxiliary_loss_mlp": 0.01033988, "balance_loss_clip": 1.02173305, "balance_loss_mlp": 1.03837419, "epoch": 0.8928303021193447, "flos": 25294973351040.0, "grad_norm": 1.6996035620035683, "language_loss": 0.80709183, "learning_rate": 1.1921269018450364e-07, "loss": 0.8285135, "num_input_tokens_seen": 320305395, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 14850, "time_per_iteration": 2.501044511795044 }, { "auxiliary_loss_clip": 0.01102491, "auxiliary_loss_mlp": 0.01036045, "balance_loss_clip": 1.02407026, "balance_loss_mlp": 1.035043, "epoch": 0.8928904253720126, "flos": 22236713579520.0, "grad_norm": 2.668020110426299, "language_loss": 0.75055742, "learning_rate": 1.1908027464316872e-07, "loss": 0.77194273, "num_input_tokens_seen": 320324220, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 14851, "time_per_iteration": 2.4648306369781494 }, { "auxiliary_loss_clip": 0.01102674, "auxiliary_loss_mlp": 0.01030806, "balance_loss_clip": 1.01872373, "balance_loss_mlp": 1.03486872, "epoch": 0.8929505486246806, "flos": 27092374940160.0, "grad_norm": 1.929179941722222, "language_loss": 0.78576016, "learning_rate": 1.1894793042649775e-07, "loss": 0.80709493, "num_input_tokens_seen": 320347195, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 14852, "time_per_iteration": 2.5343146324157715 }, { "auxiliary_loss_clip": 0.01103718, "auxiliary_loss_mlp": 0.01030173, "balance_loss_clip": 1.01847219, "balance_loss_mlp": 1.03717196, "epoch": 0.8930106718773486, "flos": 23039352938880.0, "grad_norm": 2.0857569236330615, "language_loss": 0.69253165, "learning_rate": 1.1881565753951006e-07, "loss": 0.71387058, "num_input_tokens_seen": 320366850, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 14853, "time_per_iteration": 2.458214044570923 }, { "auxiliary_loss_clip": 0.01104282, "auxiliary_loss_mlp": 0.01032026, "balance_loss_clip": 1.01965189, "balance_loss_mlp": 1.03591704, "epoch": 0.8930707951300165, "flos": 35626652887680.0, "grad_norm": 1.7260871543681253, "language_loss": 0.67425907, "learning_rate": 1.1868345598722118e-07, "loss": 0.69562209, "num_input_tokens_seen": 320388895, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.68359375, "step": 14854, "time_per_iteration": 2.591348648071289 }, { "auxiliary_loss_clip": 0.01101446, "auxiliary_loss_mlp": 0.01033401, "balance_loss_clip": 1.0222255, "balance_loss_mlp": 1.03492808, "epoch": 0.8931309183826845, "flos": 23039891642880.0, "grad_norm": 1.4119908294665238, "language_loss": 0.74853289, "learning_rate": 1.1855132577464399e-07, "loss": 0.76988137, "num_input_tokens_seen": 320408520, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6640625, "step": 14855, "time_per_iteration": 2.463057279586792 }, { "auxiliary_loss_clip": 0.01102996, "auxiliary_loss_mlp": 0.01034222, "balance_loss_clip": 1.02188981, "balance_loss_mlp": 1.03485692, "epoch": 0.8931910416353525, "flos": 26504624695680.0, "grad_norm": 2.0174605279807247, "language_loss": 0.63981915, "learning_rate": 1.1841926690678893e-07, "loss": 0.66119134, "num_input_tokens_seen": 320427400, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.68359375, "step": 14856, "time_per_iteration": 2.5159645080566406 }, { "auxiliary_loss_clip": 0.01103751, "auxiliary_loss_mlp": 0.01028938, "balance_loss_clip": 1.01716018, "balance_loss_mlp": 1.03464365, "epoch": 0.8932511648880205, "flos": 24973609345920.0, "grad_norm": 2.763861418108804, "language_loss": 0.66403186, "learning_rate": 1.1828727938866378e-07, "loss": 0.68535876, "num_input_tokens_seen": 320447570, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.69140625, "step": 14857, "time_per_iteration": 2.490957498550415 }, { "auxiliary_loss_clip": 0.01105447, "auxiliary_loss_mlp": 0.01035338, "balance_loss_clip": 1.02252841, "balance_loss_mlp": 1.03549933, "epoch": 0.8933112881406884, "flos": 24460733001600.0, "grad_norm": 2.738542256791677, "language_loss": 0.75157177, "learning_rate": 1.1815536322527408e-07, "loss": 0.77297962, "num_input_tokens_seen": 320464405, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 14858, "time_per_iteration": 2.526606321334839 }, { "auxiliary_loss_clip": 0.01103988, "auxiliary_loss_mlp": 0.01030229, "balance_loss_clip": 1.01765251, "balance_loss_mlp": 1.03513503, "epoch": 0.8933714113933564, "flos": 28293083798400.0, "grad_norm": 1.8676733339665974, "language_loss": 0.69193423, "learning_rate": 1.1802351842162139e-07, "loss": 0.71327639, "num_input_tokens_seen": 320485525, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 14859, "time_per_iteration": 2.515707492828369 }, { "auxiliary_loss_clip": 0.01097715, "auxiliary_loss_mlp": 0.01028184, "balance_loss_clip": 1.01697278, "balance_loss_mlp": 1.03420281, "epoch": 0.8934315346460243, "flos": 21434864319360.0, "grad_norm": 1.6988676975227988, "language_loss": 0.75600815, "learning_rate": 1.1789174498270526e-07, "loss": 0.77726716, "num_input_tokens_seen": 320506725, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6328125, "step": 14860, "time_per_iteration": 2.4730210304260254 }, { "auxiliary_loss_clip": 0.01106748, "auxiliary_loss_mlp": 0.01033786, "balance_loss_clip": 1.02073884, "balance_loss_mlp": 1.03633833, "epoch": 0.8934916578986923, "flos": 23769596436480.0, "grad_norm": 2.8366922093254576, "language_loss": 0.58002102, "learning_rate": 1.1776004291352303e-07, "loss": 0.60142648, "num_input_tokens_seen": 320525425, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 14861, "time_per_iteration": 2.4772064685821533 }, { "auxiliary_loss_clip": 0.0110198, "auxiliary_loss_mlp": 0.01029469, "balance_loss_clip": 1.01756036, "balance_loss_mlp": 1.03412747, "epoch": 0.8935517811513602, "flos": 18916161719040.0, "grad_norm": 5.005353171210953, "language_loss": 0.638982, "learning_rate": 1.176284122190685e-07, "loss": 0.66029644, "num_input_tokens_seen": 320543010, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 14862, "time_per_iteration": 2.434818983078003 }, { "auxiliary_loss_clip": 0.0110061, "auxiliary_loss_mlp": 0.01029407, "balance_loss_clip": 1.01710486, "balance_loss_mlp": 1.03349686, "epoch": 0.8936119044040283, "flos": 24061370613120.0, "grad_norm": 7.0186251393743175, "language_loss": 0.7827515, "learning_rate": 1.1749685290433298e-07, "loss": 0.80405164, "num_input_tokens_seen": 320562180, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 14863, "time_per_iteration": 2.520338296890259 }, { "auxiliary_loss_clip": 0.01099243, "auxiliary_loss_mlp": 0.01028089, "balance_loss_clip": 1.01740754, "balance_loss_mlp": 1.03323853, "epoch": 0.8936720276566962, "flos": 21324079797120.0, "grad_norm": 1.9153536833623765, "language_loss": 0.70601952, "learning_rate": 1.1736536497430627e-07, "loss": 0.72729278, "num_input_tokens_seen": 320580395, "router_z_loss_clip": 0.10693359, "router_z_loss_mlp": 0.66015625, "step": 14864, "time_per_iteration": 2.4556849002838135 }, { "auxiliary_loss_clip": 0.01111863, "auxiliary_loss_mlp": 0.01036924, "balance_loss_clip": 1.02433586, "balance_loss_mlp": 1.03870416, "epoch": 0.8937321509093642, "flos": 18406122549120.0, "grad_norm": 2.0931960059616674, "language_loss": 0.7603144, "learning_rate": 1.1723394843397283e-07, "loss": 0.7818023, "num_input_tokens_seen": 320599505, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.73046875, "step": 14865, "time_per_iteration": 2.4464194774627686 }, { "auxiliary_loss_clip": 0.01101524, "auxiliary_loss_mlp": 0.01032556, "balance_loss_clip": 1.02104068, "balance_loss_mlp": 1.03413248, "epoch": 0.8937922741620322, "flos": 22054754257920.0, "grad_norm": 1.5713272529463571, "language_loss": 0.71649033, "learning_rate": 1.1710260328831668e-07, "loss": 0.73783118, "num_input_tokens_seen": 320619825, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 14866, "time_per_iteration": 2.4872286319732666 }, { "auxiliary_loss_clip": 0.01108411, "auxiliary_loss_mlp": 0.01029965, "balance_loss_clip": 1.01673925, "balance_loss_mlp": 1.0368973, "epoch": 0.8938523974147001, "flos": 25664386775040.0, "grad_norm": 1.7193229629668878, "language_loss": 0.84079194, "learning_rate": 1.1697132954231869e-07, "loss": 0.8621757, "num_input_tokens_seen": 320638515, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 14867, "time_per_iteration": 2.4844560623168945 }, { "auxiliary_loss_clip": 0.01103932, "auxiliary_loss_mlp": 0.01027784, "balance_loss_clip": 1.01697791, "balance_loss_mlp": 1.03453255, "epoch": 0.8939125206673681, "flos": 25742852035200.0, "grad_norm": 1.8155660800748943, "language_loss": 0.8074351, "learning_rate": 1.168401272009567e-07, "loss": 0.82875228, "num_input_tokens_seen": 320659430, "router_z_loss_clip": 0.10791016, "router_z_loss_mlp": 0.6953125, "step": 14868, "time_per_iteration": 2.5140929222106934 }, { "auxiliary_loss_clip": 0.01105393, "auxiliary_loss_mlp": 0.01031221, "balance_loss_clip": 1.0189662, "balance_loss_mlp": 1.03618515, "epoch": 0.8939726439200361, "flos": 27344503480320.0, "grad_norm": 1.9650135806087758, "language_loss": 0.77235347, "learning_rate": 1.167089962692056e-07, "loss": 0.79371953, "num_input_tokens_seen": 320679295, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69140625, "step": 14869, "time_per_iteration": 2.4986698627471924 }, { "auxiliary_loss_clip": 0.01103192, "auxiliary_loss_mlp": 0.01025374, "balance_loss_clip": 1.01311374, "balance_loss_mlp": 1.03490782, "epoch": 0.8940327671727041, "flos": 20338834671360.0, "grad_norm": 1.9571243952282926, "language_loss": 0.65906185, "learning_rate": 1.1657793675203853e-07, "loss": 0.68034756, "num_input_tokens_seen": 320697535, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 14870, "time_per_iteration": 2.448239803314209 }, { "auxiliary_loss_clip": 0.01029165, "auxiliary_loss_mlp": 0.01000819, "balance_loss_clip": 0.99975824, "balance_loss_mlp": 1.00687742, "epoch": 0.894092890425372, "flos": 58410573235200.0, "grad_norm": 0.7950938445078917, "language_loss": 0.56014401, "learning_rate": 1.1644694865442461e-07, "loss": 0.58044386, "num_input_tokens_seen": 320758635, "router_z_loss_clip": 0.01062012, "router_z_loss_mlp": 0.22265625, "step": 14871, "time_per_iteration": 3.160444974899292 }, { "auxiliary_loss_clip": 0.01102875, "auxiliary_loss_mlp": 0.01034714, "balance_loss_clip": 1.02301371, "balance_loss_mlp": 1.03606129, "epoch": 0.89415301367804, "flos": 19829657427840.0, "grad_norm": 2.123431843262431, "language_loss": 0.76793969, "learning_rate": 1.16316031981331e-07, "loss": 0.78931558, "num_input_tokens_seen": 320777175, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.66796875, "step": 14872, "time_per_iteration": 3.775794506072998 }, { "auxiliary_loss_clip": 0.01099872, "auxiliary_loss_mlp": 0.0102648, "balance_loss_clip": 1.01588845, "balance_loss_mlp": 1.03432512, "epoch": 0.8942131369307079, "flos": 25775781828480.0, "grad_norm": 1.567650563775556, "language_loss": 0.66894454, "learning_rate": 1.1618518673772215e-07, "loss": 0.69020808, "num_input_tokens_seen": 320797670, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.65625, "step": 14873, "time_per_iteration": 2.536745548248291 }, { "auxiliary_loss_clip": 0.01100548, "auxiliary_loss_mlp": 0.01034143, "balance_loss_clip": 1.02179909, "balance_loss_mlp": 1.03430557, "epoch": 0.8942732601833759, "flos": 23149024139520.0, "grad_norm": 1.5182093842096727, "language_loss": 0.5973804, "learning_rate": 1.1605441292856033e-07, "loss": 0.61872733, "num_input_tokens_seen": 320817410, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6640625, "step": 14874, "time_per_iteration": 2.4562876224517822 }, { "auxiliary_loss_clip": 0.0110785, "auxiliary_loss_mlp": 0.01031474, "balance_loss_clip": 1.01879621, "balance_loss_mlp": 1.03800607, "epoch": 0.8943333834360438, "flos": 27855548231040.0, "grad_norm": 1.9180655078564042, "language_loss": 0.75791347, "learning_rate": 1.1592371055880356e-07, "loss": 0.77930677, "num_input_tokens_seen": 320836745, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 14875, "time_per_iteration": 2.5677642822265625 }, { "auxiliary_loss_clip": 0.01109663, "auxiliary_loss_mlp": 0.01032615, "balance_loss_clip": 1.01845312, "balance_loss_mlp": 1.03694677, "epoch": 0.8943935066887119, "flos": 22163958581760.0, "grad_norm": 2.136961816873209, "language_loss": 0.77444255, "learning_rate": 1.1579307963340857e-07, "loss": 0.7958653, "num_input_tokens_seen": 320853305, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7265625, "step": 14876, "time_per_iteration": 3.8293747901916504 }, { "auxiliary_loss_clip": 0.0110235, "auxiliary_loss_mlp": 0.01027405, "balance_loss_clip": 1.01622868, "balance_loss_mlp": 1.03485608, "epoch": 0.8944536299413798, "flos": 21470056669440.0, "grad_norm": 1.708621654712331, "language_loss": 0.78889978, "learning_rate": 1.156625201573287e-07, "loss": 0.81019735, "num_input_tokens_seen": 320872885, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.671875, "step": 14877, "time_per_iteration": 2.4832756519317627 }, { "auxiliary_loss_clip": 0.01104062, "auxiliary_loss_mlp": 0.0102878, "balance_loss_clip": 1.01654315, "balance_loss_mlp": 1.03594947, "epoch": 0.8945137531940478, "flos": 17748777703680.0, "grad_norm": 2.1713135212957337, "language_loss": 0.75937337, "learning_rate": 1.155320321355151e-07, "loss": 0.78070176, "num_input_tokens_seen": 320889755, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 14878, "time_per_iteration": 3.8021767139434814 }, { "auxiliary_loss_clip": 0.0110408, "auxiliary_loss_mlp": 0.0102716, "balance_loss_clip": 1.01479769, "balance_loss_mlp": 1.03467071, "epoch": 0.8945738764467158, "flos": 21142264129920.0, "grad_norm": 1.8389102517083988, "language_loss": 0.76059222, "learning_rate": 1.1540161557291539e-07, "loss": 0.78190458, "num_input_tokens_seen": 320907860, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 14879, "time_per_iteration": 3.984123945236206 }, { "auxiliary_loss_clip": 0.01106843, "auxiliary_loss_mlp": 0.01032384, "balance_loss_clip": 1.0201298, "balance_loss_mlp": 1.03793955, "epoch": 0.8946339996993837, "flos": 14903000835840.0, "grad_norm": 2.575337504783017, "language_loss": 0.74567807, "learning_rate": 1.1527127047447538e-07, "loss": 0.76707035, "num_input_tokens_seen": 320925825, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 14880, "time_per_iteration": 2.4405460357666016 }, { "auxiliary_loss_clip": 0.01103527, "auxiliary_loss_mlp": 0.01028172, "balance_loss_clip": 1.01570916, "balance_loss_mlp": 1.03482127, "epoch": 0.8946941229520518, "flos": 27382173868800.0, "grad_norm": 2.2647260698052936, "language_loss": 0.83139145, "learning_rate": 1.1514099684513822e-07, "loss": 0.85270846, "num_input_tokens_seen": 320946165, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6875, "step": 14881, "time_per_iteration": 2.505863904953003 }, { "auxiliary_loss_clip": 0.01100016, "auxiliary_loss_mlp": 0.01028323, "balance_loss_clip": 1.01649141, "balance_loss_mlp": 1.03274906, "epoch": 0.8947542462047197, "flos": 31796277338880.0, "grad_norm": 2.0686956637269684, "language_loss": 0.67404133, "learning_rate": 1.1501079468984287e-07, "loss": 0.69532472, "num_input_tokens_seen": 320969330, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 14882, "time_per_iteration": 2.524707794189453 }, { "auxiliary_loss_clip": 0.01109491, "auxiliary_loss_mlp": 0.01029146, "balance_loss_clip": 1.01558018, "balance_loss_mlp": 1.03694224, "epoch": 0.8948143694573877, "flos": 20883599314560.0, "grad_norm": 2.248461238742758, "language_loss": 0.75201166, "learning_rate": 1.1488066401352691e-07, "loss": 0.77339792, "num_input_tokens_seen": 320985055, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 14883, "time_per_iteration": 2.4452171325683594 }, { "auxiliary_loss_clip": 0.01098812, "auxiliary_loss_mlp": 0.01033382, "balance_loss_clip": 1.02135432, "balance_loss_mlp": 1.03345394, "epoch": 0.8948744927100556, "flos": 28215552291840.0, "grad_norm": 1.690899344792564, "language_loss": 0.72210705, "learning_rate": 1.147506048211253e-07, "loss": 0.74342901, "num_input_tokens_seen": 321004720, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.65625, "step": 14884, "time_per_iteration": 2.504901170730591 }, { "auxiliary_loss_clip": 0.01098427, "auxiliary_loss_mlp": 0.01025233, "balance_loss_clip": 1.01397324, "balance_loss_mlp": 1.0322057, "epoch": 0.8949346159627236, "flos": 21902672073600.0, "grad_norm": 1.6811932496189494, "language_loss": 0.76057279, "learning_rate": 1.1462061711756987e-07, "loss": 0.78180945, "num_input_tokens_seen": 321022350, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6640625, "step": 14885, "time_per_iteration": 2.4353435039520264 }, { "auxiliary_loss_clip": 0.0110491, "auxiliary_loss_mlp": 0.01032773, "balance_loss_clip": 1.01974893, "balance_loss_mlp": 1.03419673, "epoch": 0.8949947392153915, "flos": 21359128492800.0, "grad_norm": 1.8740052072018578, "language_loss": 0.81471688, "learning_rate": 1.1449070090778911e-07, "loss": 0.83609366, "num_input_tokens_seen": 321040450, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 14886, "time_per_iteration": 2.450695276260376 }, { "auxiliary_loss_clip": 0.01103519, "auxiliary_loss_mlp": 0.01030925, "balance_loss_clip": 1.01906407, "balance_loss_mlp": 1.03504717, "epoch": 0.8950548624680595, "flos": 52445342799360.0, "grad_norm": 1.6177289812779876, "language_loss": 0.63999361, "learning_rate": 1.1436085619671043e-07, "loss": 0.66133803, "num_input_tokens_seen": 321063970, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.68359375, "step": 14887, "time_per_iteration": 2.7311580181121826 }, { "auxiliary_loss_clip": 0.01106841, "auxiliary_loss_mlp": 0.0103188, "balance_loss_clip": 1.01982236, "balance_loss_mlp": 1.03595591, "epoch": 0.8951149857207275, "flos": 20121323863680.0, "grad_norm": 2.653469341528898, "language_loss": 0.61394894, "learning_rate": 1.1423108298925698e-07, "loss": 0.6353361, "num_input_tokens_seen": 321083840, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.7109375, "step": 14888, "time_per_iteration": 2.4725992679595947 }, { "auxiliary_loss_clip": 0.0110502, "auxiliary_loss_mlp": 0.01023235, "balance_loss_clip": 1.01153421, "balance_loss_mlp": 1.03462517, "epoch": 0.8951751089733955, "flos": 29862631463040.0, "grad_norm": 2.5704619863887195, "language_loss": 0.70497078, "learning_rate": 1.1410138129034952e-07, "loss": 0.72625339, "num_input_tokens_seen": 321104165, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.703125, "step": 14889, "time_per_iteration": 2.5182881355285645 }, { "auxiliary_loss_clip": 0.01105801, "auxiliary_loss_mlp": 0.0103283, "balance_loss_clip": 1.01978827, "balance_loss_mlp": 1.03565407, "epoch": 0.8952352322260634, "flos": 15262789415040.0, "grad_norm": 3.2292078567405955, "language_loss": 0.71514428, "learning_rate": 1.1397175110490676e-07, "loss": 0.7365306, "num_input_tokens_seen": 321117290, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 14890, "time_per_iteration": 2.420440912246704 }, { "auxiliary_loss_clip": 0.01102694, "auxiliary_loss_mlp": 0.01031014, "balance_loss_clip": 1.01867557, "balance_loss_mlp": 1.03365588, "epoch": 0.8952953554787314, "flos": 26798338206720.0, "grad_norm": 1.7247126868102844, "language_loss": 0.7598753, "learning_rate": 1.1384219243784454e-07, "loss": 0.78121233, "num_input_tokens_seen": 321137115, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69140625, "step": 14891, "time_per_iteration": 2.488610029220581 }, { "auxiliary_loss_clip": 0.0110617, "auxiliary_loss_mlp": 0.01031134, "balance_loss_clip": 1.01816392, "balance_loss_mlp": 1.03429615, "epoch": 0.8953554787313994, "flos": 14137205852160.0, "grad_norm": 2.0572099585336074, "language_loss": 0.76539838, "learning_rate": 1.1371270529407517e-07, "loss": 0.78677142, "num_input_tokens_seen": 321154490, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 14892, "time_per_iteration": 2.452378273010254 }, { "auxiliary_loss_clip": 0.01104891, "auxiliary_loss_mlp": 0.01028849, "balance_loss_clip": 1.0169996, "balance_loss_mlp": 1.03589225, "epoch": 0.8954156019840673, "flos": 25703314139520.0, "grad_norm": 1.6359729899785398, "language_loss": 0.81746078, "learning_rate": 1.1358328967850895e-07, "loss": 0.83879822, "num_input_tokens_seen": 321175625, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 14893, "time_per_iteration": 2.4855856895446777 }, { "auxiliary_loss_clip": 0.01100043, "auxiliary_loss_mlp": 0.01027978, "balance_loss_clip": 1.01649213, "balance_loss_mlp": 1.03394592, "epoch": 0.8954757252367354, "flos": 21907987286400.0, "grad_norm": 1.7183738061451252, "language_loss": 0.74847555, "learning_rate": 1.1345394559605348e-07, "loss": 0.76975578, "num_input_tokens_seen": 321193895, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.66015625, "step": 14894, "time_per_iteration": 2.45687198638916 }, { "auxiliary_loss_clip": 0.01109969, "auxiliary_loss_mlp": 0.01032502, "balance_loss_clip": 1.01962101, "balance_loss_mlp": 1.03926635, "epoch": 0.8955358484894033, "flos": 12970396454400.0, "grad_norm": 2.1681497790415394, "language_loss": 0.66783094, "learning_rate": 1.1332467305161352e-07, "loss": 0.68925571, "num_input_tokens_seen": 321211610, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 14895, "time_per_iteration": 2.4054348468780518 }, { "auxiliary_loss_clip": 0.01108309, "auxiliary_loss_mlp": 0.01028662, "balance_loss_clip": 1.01556063, "balance_loss_mlp": 1.03681993, "epoch": 0.8955959717420713, "flos": 17273966797440.0, "grad_norm": 1.8566359272861641, "language_loss": 0.67345077, "learning_rate": 1.1319547205009094e-07, "loss": 0.69482052, "num_input_tokens_seen": 321229805, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 14896, "time_per_iteration": 2.4577765464782715 }, { "auxiliary_loss_clip": 0.01103122, "auxiliary_loss_mlp": 0.01029759, "balance_loss_clip": 1.0174154, "balance_loss_mlp": 1.0351094, "epoch": 0.8956560949947392, "flos": 14793868339200.0, "grad_norm": 1.8894823076153018, "language_loss": 0.75613809, "learning_rate": 1.1306634259638492e-07, "loss": 0.77746701, "num_input_tokens_seen": 321247165, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6796875, "step": 14897, "time_per_iteration": 2.4020235538482666 }, { "auxiliary_loss_clip": 0.01029364, "auxiliary_loss_mlp": 0.01001095, "balance_loss_clip": 1.00003457, "balance_loss_mlp": 1.0069176, "epoch": 0.8957162182474072, "flos": 63607817957760.0, "grad_norm": 0.7591459274841895, "language_loss": 0.55338609, "learning_rate": 1.129372846953931e-07, "loss": 0.57369065, "num_input_tokens_seen": 321308425, "router_z_loss_clip": 0.01062012, "router_z_loss_mlp": 0.22460938, "step": 14898, "time_per_iteration": 3.111382484436035 }, { "auxiliary_loss_clip": 0.01104016, "auxiliary_loss_mlp": 0.01029548, "balance_loss_clip": 1.0173527, "balance_loss_mlp": 1.03522575, "epoch": 0.8957763415000751, "flos": 25009843190400.0, "grad_norm": 1.8140696321140195, "language_loss": 0.70378625, "learning_rate": 1.12808298352008e-07, "loss": 0.72512186, "num_input_tokens_seen": 321329295, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 14899, "time_per_iteration": 2.490166425704956 }, { "auxiliary_loss_clip": 0.01106685, "auxiliary_loss_mlp": 0.01036707, "balance_loss_clip": 1.022861, "balance_loss_mlp": 1.0369997, "epoch": 0.8958364647527431, "flos": 19828615933440.0, "grad_norm": 2.1044167315945854, "language_loss": 0.74164611, "learning_rate": 1.1267938357112106e-07, "loss": 0.76308, "num_input_tokens_seen": 321347580, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.6953125, "step": 14900, "time_per_iteration": 2.448964834213257 }, { "auxiliary_loss_clip": 0.01029147, "auxiliary_loss_mlp": 0.01000208, "balance_loss_clip": 0.99916524, "balance_loss_mlp": 1.00685239, "epoch": 0.895896588005411, "flos": 65537190115200.0, "grad_norm": 0.7681802256575466, "language_loss": 0.61811334, "learning_rate": 1.1255054035762124e-07, "loss": 0.63840687, "num_input_tokens_seen": 321407820, "router_z_loss_clip": 0.01043701, "router_z_loss_mlp": 0.22265625, "step": 14901, "time_per_iteration": 3.077805757522583 }, { "auxiliary_loss_clip": 0.01104205, "auxiliary_loss_mlp": 0.01028915, "balance_loss_clip": 1.01704133, "balance_loss_mlp": 1.03411734, "epoch": 0.8959567112580791, "flos": 25591021246080.0, "grad_norm": 1.7080795679068763, "language_loss": 0.70466197, "learning_rate": 1.1242176871639441e-07, "loss": 0.72599316, "num_input_tokens_seen": 321426745, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69921875, "step": 14902, "time_per_iteration": 2.475184202194214 }, { "auxiliary_loss_clip": 0.0109944, "auxiliary_loss_mlp": 0.01028698, "balance_loss_clip": 1.01724792, "balance_loss_mlp": 1.03373075, "epoch": 0.896016834510747, "flos": 24201780877440.0, "grad_norm": 1.7289020761706744, "language_loss": 0.7816807, "learning_rate": 1.1229306865232313e-07, "loss": 0.80296206, "num_input_tokens_seen": 321446165, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.65625, "step": 14903, "time_per_iteration": 2.5011510848999023 }, { "auxiliary_loss_clip": 0.01108066, "auxiliary_loss_mlp": 0.01031514, "balance_loss_clip": 1.01864576, "balance_loss_mlp": 1.03679132, "epoch": 0.896076957763415, "flos": 23075945919360.0, "grad_norm": 15.974141319468883, "language_loss": 0.72857106, "learning_rate": 1.121644401702877e-07, "loss": 0.74996686, "num_input_tokens_seen": 321465285, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 14904, "time_per_iteration": 2.454212188720703 }, { "auxiliary_loss_clip": 0.01104521, "auxiliary_loss_mlp": 0.0102858, "balance_loss_clip": 1.01473379, "balance_loss_mlp": 1.03490698, "epoch": 0.8961370810160829, "flos": 22236605838720.0, "grad_norm": 2.0552432016078312, "language_loss": 0.75165296, "learning_rate": 1.12035883275166e-07, "loss": 0.77298397, "num_input_tokens_seen": 321483670, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.6953125, "step": 14905, "time_per_iteration": 2.4690487384796143 }, { "auxiliary_loss_clip": 0.01103068, "auxiliary_loss_mlp": 0.0102846, "balance_loss_clip": 1.01613331, "balance_loss_mlp": 1.03519845, "epoch": 0.8961972042687509, "flos": 23072318645760.0, "grad_norm": 1.6569693644127697, "language_loss": 0.76520544, "learning_rate": 1.1190739797183279e-07, "loss": 0.78652072, "num_input_tokens_seen": 321501190, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 14906, "time_per_iteration": 2.4840009212493896 }, { "auxiliary_loss_clip": 0.01105804, "auxiliary_loss_mlp": 0.01031421, "balance_loss_clip": 1.01927423, "balance_loss_mlp": 1.03669429, "epoch": 0.896257327521419, "flos": 18185882307840.0, "grad_norm": 1.6100104622184948, "language_loss": 0.7394948, "learning_rate": 1.1177898426515996e-07, "loss": 0.760867, "num_input_tokens_seen": 321518540, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69140625, "step": 14907, "time_per_iteration": 2.432305097579956 }, { "auxiliary_loss_clip": 0.01104605, "auxiliary_loss_mlp": 0.01033863, "balance_loss_clip": 1.02181697, "balance_loss_mlp": 1.03720737, "epoch": 0.8963174507740869, "flos": 17895472848000.0, "grad_norm": 2.4321583628250196, "language_loss": 0.83425057, "learning_rate": 1.1165064216001785e-07, "loss": 0.85563529, "num_input_tokens_seen": 321536555, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.671875, "step": 14908, "time_per_iteration": 2.4445271492004395 }, { "auxiliary_loss_clip": 0.01105649, "auxiliary_loss_mlp": 0.01030186, "balance_loss_clip": 1.01671529, "balance_loss_mlp": 1.03480434, "epoch": 0.8963775740267549, "flos": 21032269706880.0, "grad_norm": 2.0336887910713966, "language_loss": 0.70645773, "learning_rate": 1.1152237166127232e-07, "loss": 0.72781616, "num_input_tokens_seen": 321557655, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 14909, "time_per_iteration": 2.4843368530273438 }, { "auxiliary_loss_clip": 0.01107116, "auxiliary_loss_mlp": 0.0103432, "balance_loss_clip": 1.02186251, "balance_loss_mlp": 1.03701925, "epoch": 0.8964376972794228, "flos": 23179619548800.0, "grad_norm": 1.860516121700817, "language_loss": 0.71958977, "learning_rate": 1.113941727737877e-07, "loss": 0.74100411, "num_input_tokens_seen": 321576160, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 14910, "time_per_iteration": 2.457674980163574 }, { "auxiliary_loss_clip": 0.0110166, "auxiliary_loss_mlp": 0.01029063, "balance_loss_clip": 1.01718378, "balance_loss_mlp": 1.03352785, "epoch": 0.8964978205320908, "flos": 24972998814720.0, "grad_norm": 3.2576713137439013, "language_loss": 0.63206589, "learning_rate": 1.1126604550242502e-07, "loss": 0.65337312, "num_input_tokens_seen": 321596205, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 14911, "time_per_iteration": 2.4766392707824707 }, { "auxiliary_loss_clip": 0.01107121, "auxiliary_loss_mlp": 0.01030326, "balance_loss_clip": 1.01746893, "balance_loss_mlp": 1.03682506, "epoch": 0.8965579437847587, "flos": 19172025273600.0, "grad_norm": 2.0463669520482193, "language_loss": 0.75090808, "learning_rate": 1.111379898520437e-07, "loss": 0.77228254, "num_input_tokens_seen": 321614800, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 14912, "time_per_iteration": 2.4614272117614746 }, { "auxiliary_loss_clip": 0.01103693, "auxiliary_loss_mlp": 0.01031313, "balance_loss_clip": 1.01865911, "balance_loss_mlp": 1.03390074, "epoch": 0.8966180670374267, "flos": 24276690691200.0, "grad_norm": 3.079933432620596, "language_loss": 0.81878066, "learning_rate": 1.1101000582749876e-07, "loss": 0.84013081, "num_input_tokens_seen": 321633445, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 14913, "time_per_iteration": 2.4838943481445312 }, { "auxiliary_loss_clip": 0.01105789, "auxiliary_loss_mlp": 0.01033066, "balance_loss_clip": 1.01995921, "balance_loss_mlp": 1.035074, "epoch": 0.8966781902900947, "flos": 13553190622080.0, "grad_norm": 2.4224881316428193, "language_loss": 0.61242342, "learning_rate": 1.1088209343364407e-07, "loss": 0.63381201, "num_input_tokens_seen": 321650890, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 14914, "time_per_iteration": 3.9381299018859863 }, { "auxiliary_loss_clip": 0.01029336, "auxiliary_loss_mlp": 0.00999134, "balance_loss_clip": 0.9980554, "balance_loss_mlp": 1.00698137, "epoch": 0.8967383135427627, "flos": 65066114223360.0, "grad_norm": 0.7247340323610875, "language_loss": 0.55066502, "learning_rate": 1.1075425267532956e-07, "loss": 0.57094973, "num_input_tokens_seen": 321710960, "router_z_loss_clip": 0.01080322, "router_z_loss_mlp": 0.22363281, "step": 14915, "time_per_iteration": 3.0830752849578857 }, { "auxiliary_loss_clip": 0.01100815, "auxiliary_loss_mlp": 0.01028026, "balance_loss_clip": 1.01702309, "balance_loss_mlp": 1.03431821, "epoch": 0.8967984367954306, "flos": 29713027317120.0, "grad_norm": 1.4761451671875516, "language_loss": 0.71571577, "learning_rate": 1.1062648355740289e-07, "loss": 0.73700416, "num_input_tokens_seen": 321733290, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.6640625, "step": 14916, "time_per_iteration": 2.5077006816864014 }, { "auxiliary_loss_clip": 0.01105112, "auxiliary_loss_mlp": 0.01030607, "balance_loss_clip": 1.01882339, "balance_loss_mlp": 1.03561485, "epoch": 0.8968585600480986, "flos": 25702488126720.0, "grad_norm": 1.7129737553993905, "language_loss": 0.77839863, "learning_rate": 1.1049878608470931e-07, "loss": 0.79975581, "num_input_tokens_seen": 321753120, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 14917, "time_per_iteration": 3.840003252029419 }, { "auxiliary_loss_clip": 0.01110014, "auxiliary_loss_mlp": 0.01038693, "balance_loss_clip": 1.02537727, "balance_loss_mlp": 1.03751659, "epoch": 0.8969186833007665, "flos": 30044698525440.0, "grad_norm": 2.0057636129460588, "language_loss": 0.68110275, "learning_rate": 1.1037116026209137e-07, "loss": 0.70258975, "num_input_tokens_seen": 321772840, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 14918, "time_per_iteration": 2.520618200302124 }, { "auxiliary_loss_clip": 0.01103596, "auxiliary_loss_mlp": 0.01030343, "balance_loss_clip": 1.01870275, "balance_loss_mlp": 1.03387356, "epoch": 0.8969788065534345, "flos": 22818143030400.0, "grad_norm": 2.008964300003331, "language_loss": 0.83766413, "learning_rate": 1.102436060943881e-07, "loss": 0.85900354, "num_input_tokens_seen": 321791020, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.69921875, "step": 14919, "time_per_iteration": 2.444148302078247 }, { "auxiliary_loss_clip": 0.01105516, "auxiliary_loss_mlp": 0.0102871, "balance_loss_clip": 1.01575232, "balance_loss_mlp": 1.03488708, "epoch": 0.8970389298061026, "flos": 13261488272640.0, "grad_norm": 2.247620700587094, "language_loss": 0.7211979, "learning_rate": 1.1011612358643696e-07, "loss": 0.74254018, "num_input_tokens_seen": 321810075, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 14920, "time_per_iteration": 5.225815773010254 }, { "auxiliary_loss_clip": 0.0110497, "auxiliary_loss_mlp": 0.01029497, "balance_loss_clip": 1.01633072, "balance_loss_mlp": 1.03565133, "epoch": 0.8970990530587705, "flos": 10266071345280.0, "grad_norm": 2.4425367361274586, "language_loss": 0.90935767, "learning_rate": 1.0998871274307164e-07, "loss": 0.93070239, "num_input_tokens_seen": 321822635, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 14921, "time_per_iteration": 2.395789861679077 }, { "auxiliary_loss_clip": 0.01105521, "auxiliary_loss_mlp": 0.01029768, "balance_loss_clip": 1.01750755, "balance_loss_mlp": 1.03503001, "epoch": 0.8971591763114385, "flos": 20302708567680.0, "grad_norm": 1.8939230475924347, "language_loss": 0.74067205, "learning_rate": 1.0986137356912384e-07, "loss": 0.76202494, "num_input_tokens_seen": 321841130, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 14922, "time_per_iteration": 2.4324183464050293 }, { "auxiliary_loss_clip": 0.01102575, "auxiliary_loss_mlp": 0.01028806, "balance_loss_clip": 1.01626503, "balance_loss_mlp": 1.03407311, "epoch": 0.8972192995641064, "flos": 23257043314560.0, "grad_norm": 2.632114744832997, "language_loss": 0.70277959, "learning_rate": 1.097341060694219e-07, "loss": 0.72409344, "num_input_tokens_seen": 321859855, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 14923, "time_per_iteration": 2.460616111755371 }, { "auxiliary_loss_clip": 0.01104228, "auxiliary_loss_mlp": 0.01029993, "balance_loss_clip": 1.01674938, "balance_loss_mlp": 1.0340234, "epoch": 0.8972794228167744, "flos": 18369601395840.0, "grad_norm": 2.136554889909423, "language_loss": 0.70679355, "learning_rate": 1.0960691024879221e-07, "loss": 0.72813576, "num_input_tokens_seen": 321877990, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 14924, "time_per_iteration": 2.4407176971435547 }, { "auxiliary_loss_clip": 0.01102507, "auxiliary_loss_mlp": 0.01035413, "balance_loss_clip": 1.02399898, "balance_loss_mlp": 1.03395152, "epoch": 0.8973395460694423, "flos": 23952058548480.0, "grad_norm": 1.6729492024879224, "language_loss": 0.72242177, "learning_rate": 1.0947978611205844e-07, "loss": 0.743801, "num_input_tokens_seen": 321898120, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6875, "step": 14925, "time_per_iteration": 2.476149797439575 }, { "auxiliary_loss_clip": 0.01109776, "auxiliary_loss_mlp": 0.01030937, "balance_loss_clip": 1.0181756, "balance_loss_mlp": 1.03950083, "epoch": 0.8973996693221103, "flos": 24970843998720.0, "grad_norm": 2.9122761789904144, "language_loss": 0.82477564, "learning_rate": 1.0935273366404008e-07, "loss": 0.8461827, "num_input_tokens_seen": 321918140, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 14926, "time_per_iteration": 2.475386619567871 }, { "auxiliary_loss_clip": 0.0110254, "auxiliary_loss_mlp": 0.01030054, "balance_loss_clip": 1.01830626, "balance_loss_mlp": 1.03427374, "epoch": 0.8974597925747783, "flos": 25738937452800.0, "grad_norm": 1.691301735341115, "language_loss": 0.78989482, "learning_rate": 1.092257529095555e-07, "loss": 0.81122077, "num_input_tokens_seen": 321938580, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 14927, "time_per_iteration": 2.5029103755950928 }, { "auxiliary_loss_clip": 0.01102054, "auxiliary_loss_mlp": 0.01026736, "balance_loss_clip": 1.01508915, "balance_loss_mlp": 1.03428674, "epoch": 0.8975199158274463, "flos": 38071918131840.0, "grad_norm": 1.5913642385565714, "language_loss": 0.66408974, "learning_rate": 1.0909884385341994e-07, "loss": 0.6853776, "num_input_tokens_seen": 321961135, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 14928, "time_per_iteration": 2.6002657413482666 }, { "auxiliary_loss_clip": 0.01107186, "auxiliary_loss_mlp": 0.01041921, "balance_loss_clip": 1.02679968, "balance_loss_mlp": 1.03571546, "epoch": 0.8975800390801142, "flos": 25411683617280.0, "grad_norm": 2.5329444159204306, "language_loss": 0.71002543, "learning_rate": 1.0897200650044602e-07, "loss": 0.73151648, "num_input_tokens_seen": 321980945, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.71484375, "step": 14929, "time_per_iteration": 2.5004825592041016 }, { "auxiliary_loss_clip": 0.0110386, "auxiliary_loss_mlp": 0.01028328, "balance_loss_clip": 1.01705098, "balance_loss_mlp": 1.03562331, "epoch": 0.8976401623327822, "flos": 21759604202880.0, "grad_norm": 1.7749501837077986, "language_loss": 0.6772747, "learning_rate": 1.0884524085544256e-07, "loss": 0.6985966, "num_input_tokens_seen": 322000350, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.68359375, "step": 14930, "time_per_iteration": 2.4561493396759033 }, { "auxiliary_loss_clip": 0.01100752, "auxiliary_loss_mlp": 0.01027627, "balance_loss_clip": 1.0150919, "balance_loss_mlp": 1.03392386, "epoch": 0.8977002855854501, "flos": 13845323934720.0, "grad_norm": 1.973008632141582, "language_loss": 0.74605721, "learning_rate": 1.0871854692321769e-07, "loss": 0.76734102, "num_input_tokens_seen": 322018980, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.66796875, "step": 14931, "time_per_iteration": 2.4276716709136963 }, { "auxiliary_loss_clip": 0.01102968, "auxiliary_loss_mlp": 0.0102681, "balance_loss_clip": 1.01571178, "balance_loss_mlp": 1.03547192, "epoch": 0.8977604088381181, "flos": 19427529692160.0, "grad_norm": 1.7089141759637547, "language_loss": 0.63348389, "learning_rate": 1.0859192470857492e-07, "loss": 0.6547817, "num_input_tokens_seen": 322037675, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.671875, "step": 14932, "time_per_iteration": 2.4282405376434326 }, { "auxiliary_loss_clip": 0.01100249, "auxiliary_loss_mlp": 0.01029046, "balance_loss_clip": 1.01784039, "balance_loss_mlp": 1.03531957, "epoch": 0.8978205320907862, "flos": 22742083981440.0, "grad_norm": 1.722256797271666, "language_loss": 0.71823716, "learning_rate": 1.0846537421631552e-07, "loss": 0.73953015, "num_input_tokens_seen": 322055130, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6484375, "step": 14933, "time_per_iteration": 2.502758502960205 }, { "auxiliary_loss_clip": 0.01105013, "auxiliary_loss_mlp": 0.01030831, "balance_loss_clip": 1.01811194, "balance_loss_mlp": 1.03465974, "epoch": 0.8978806553434541, "flos": 21360529123200.0, "grad_norm": 1.6269661776049713, "language_loss": 0.74587464, "learning_rate": 1.0833889545123898e-07, "loss": 0.76723301, "num_input_tokens_seen": 322074850, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 14934, "time_per_iteration": 2.48652982711792 }, { "auxiliary_loss_clip": 0.01103294, "auxiliary_loss_mlp": 0.01030343, "balance_loss_clip": 1.01836824, "balance_loss_mlp": 1.03585255, "epoch": 0.8979407785961221, "flos": 20924178704640.0, "grad_norm": 1.7896547532153255, "language_loss": 0.60594362, "learning_rate": 1.0821248841814123e-07, "loss": 0.62728, "num_input_tokens_seen": 322093315, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.671875, "step": 14935, "time_per_iteration": 2.4584453105926514 }, { "auxiliary_loss_clip": 0.01102775, "auxiliary_loss_mlp": 0.01028382, "balance_loss_clip": 1.01581109, "balance_loss_mlp": 1.03606415, "epoch": 0.89800090184879, "flos": 25228934196480.0, "grad_norm": 2.1625836068704465, "language_loss": 0.77022976, "learning_rate": 1.0808615312181512e-07, "loss": 0.79154134, "num_input_tokens_seen": 322112555, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6640625, "step": 14936, "time_per_iteration": 2.4723663330078125 }, { "auxiliary_loss_clip": 0.01103934, "auxiliary_loss_mlp": 0.01030798, "balance_loss_clip": 1.0190084, "balance_loss_mlp": 1.03566122, "epoch": 0.898061025101458, "flos": 22562674525440.0, "grad_norm": 1.7495563744555676, "language_loss": 0.74150372, "learning_rate": 1.0795988956705193e-07, "loss": 0.762851, "num_input_tokens_seen": 322130440, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 14937, "time_per_iteration": 2.4746181964874268 }, { "auxiliary_loss_clip": 0.01029291, "auxiliary_loss_mlp": 0.01000389, "balance_loss_clip": 0.99933952, "balance_loss_mlp": 1.00704312, "epoch": 0.8981211483541259, "flos": 56192551384320.0, "grad_norm": 0.8434979489838281, "language_loss": 0.63476622, "learning_rate": 1.0783369775863915e-07, "loss": 0.65506297, "num_input_tokens_seen": 322187295, "router_z_loss_clip": 0.01049805, "router_z_loss_mlp": 0.22265625, "step": 14938, "time_per_iteration": 2.971391439437866 }, { "auxiliary_loss_clip": 0.01102681, "auxiliary_loss_mlp": 0.01030761, "balance_loss_clip": 1.01853037, "balance_loss_mlp": 1.03627157, "epoch": 0.898181271606794, "flos": 16392718523520.0, "grad_norm": 2.36284483109297, "language_loss": 0.80763018, "learning_rate": 1.0770757770136251e-07, "loss": 0.82896459, "num_input_tokens_seen": 322202965, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6640625, "step": 14939, "time_per_iteration": 2.4276883602142334 }, { "auxiliary_loss_clip": 0.01028934, "auxiliary_loss_mlp": 0.00999989, "balance_loss_clip": 0.99893409, "balance_loss_mlp": 1.00661159, "epoch": 0.8982413948594619, "flos": 63440259989760.0, "grad_norm": 0.7144734924217825, "language_loss": 0.52851576, "learning_rate": 1.0758152940000375e-07, "loss": 0.548805, "num_input_tokens_seen": 322269490, "router_z_loss_clip": 0.01055908, "router_z_loss_mlp": 0.22265625, "step": 14940, "time_per_iteration": 3.2142903804779053 }, { "auxiliary_loss_clip": 0.01103555, "auxiliary_loss_mlp": 0.01030567, "balance_loss_clip": 1.01754308, "balance_loss_mlp": 1.03435707, "epoch": 0.8983015181121299, "flos": 21835340029440.0, "grad_norm": 1.9026406823751933, "language_loss": 0.7783317, "learning_rate": 1.0745555285934327e-07, "loss": 0.79967296, "num_input_tokens_seen": 322288060, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 14941, "time_per_iteration": 2.4709434509277344 }, { "auxiliary_loss_clip": 0.01105081, "auxiliary_loss_mlp": 0.01033572, "balance_loss_clip": 1.02100778, "balance_loss_mlp": 1.03554392, "epoch": 0.8983616413647978, "flos": 28949961767040.0, "grad_norm": 2.8195793631605244, "language_loss": 0.73413134, "learning_rate": 1.0732964808415834e-07, "loss": 0.75551784, "num_input_tokens_seen": 322307930, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 14942, "time_per_iteration": 2.4936845302581787 }, { "auxiliary_loss_clip": 0.01107086, "auxiliary_loss_mlp": 0.01035373, "balance_loss_clip": 1.02287972, "balance_loss_mlp": 1.03648853, "epoch": 0.8984217646174658, "flos": 17785083375360.0, "grad_norm": 2.041886266081426, "language_loss": 0.79761982, "learning_rate": 1.0720381507922205e-07, "loss": 0.81904441, "num_input_tokens_seen": 322326155, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 14943, "time_per_iteration": 2.4349112510681152 }, { "auxiliary_loss_clip": 0.01106694, "auxiliary_loss_mlp": 0.01032101, "balance_loss_clip": 1.01854122, "balance_loss_mlp": 1.03546166, "epoch": 0.8984818878701337, "flos": 23404528558080.0, "grad_norm": 2.346266386179806, "language_loss": 0.71076685, "learning_rate": 1.0707805384930701e-07, "loss": 0.73215479, "num_input_tokens_seen": 322345850, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7109375, "step": 14944, "time_per_iteration": 2.44962215423584 }, { "auxiliary_loss_clip": 0.01109534, "auxiliary_loss_mlp": 0.01031169, "balance_loss_clip": 1.01837206, "balance_loss_mlp": 1.03793645, "epoch": 0.8985420111228017, "flos": 22346061557760.0, "grad_norm": 2.3118605867180224, "language_loss": 0.75695503, "learning_rate": 1.0695236439918187e-07, "loss": 0.77836204, "num_input_tokens_seen": 322364715, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 14945, "time_per_iteration": 2.4711291790008545 }, { "auxiliary_loss_clip": 0.01110017, "auxiliary_loss_mlp": 0.01036157, "balance_loss_clip": 1.02305019, "balance_loss_mlp": 1.03584886, "epoch": 0.8986021343754698, "flos": 21392776558080.0, "grad_norm": 2.60268039197608, "language_loss": 0.7383548, "learning_rate": 1.0682674673361302e-07, "loss": 0.75981653, "num_input_tokens_seen": 322383570, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7421875, "step": 14946, "time_per_iteration": 2.445060968399048 }, { "auxiliary_loss_clip": 0.01102409, "auxiliary_loss_mlp": 0.01029688, "balance_loss_clip": 1.01707006, "balance_loss_mlp": 1.03407407, "epoch": 0.8986622576281377, "flos": 21325372686720.0, "grad_norm": 1.9011627471952188, "language_loss": 0.641819, "learning_rate": 1.0670120085736334e-07, "loss": 0.66313994, "num_input_tokens_seen": 322401375, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 14947, "time_per_iteration": 2.4960055351257324 }, { "auxiliary_loss_clip": 0.01103807, "auxiliary_loss_mlp": 0.01035579, "balance_loss_clip": 1.02334225, "balance_loss_mlp": 1.03600705, "epoch": 0.8987223808808057, "flos": 23988292392960.0, "grad_norm": 1.635301844239344, "language_loss": 0.6953913, "learning_rate": 1.0657572677519411e-07, "loss": 0.71678513, "num_input_tokens_seen": 322421890, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 14948, "time_per_iteration": 2.4660019874572754 }, { "auxiliary_loss_clip": 0.01103573, "auxiliary_loss_mlp": 0.01029928, "balance_loss_clip": 1.017524, "balance_loss_mlp": 1.03454053, "epoch": 0.8987825041334736, "flos": 41500956044160.0, "grad_norm": 1.8391812573006026, "language_loss": 0.74839646, "learning_rate": 1.0645032449186309e-07, "loss": 0.76973152, "num_input_tokens_seen": 322445730, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 14949, "time_per_iteration": 2.6387531757354736 }, { "auxiliary_loss_clip": 0.01105914, "auxiliary_loss_mlp": 0.01036365, "balance_loss_clip": 1.02267945, "balance_loss_mlp": 1.03571177, "epoch": 0.8988426273861416, "flos": 27564276844800.0, "grad_norm": 2.9898990924295537, "language_loss": 0.75981379, "learning_rate": 1.0632499401212513e-07, "loss": 0.78123659, "num_input_tokens_seen": 322464595, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 14950, "time_per_iteration": 2.5020058155059814 }, { "auxiliary_loss_clip": 0.0110535, "auxiliary_loss_mlp": 0.01026862, "balance_loss_clip": 1.0153048, "balance_loss_mlp": 1.03764105, "epoch": 0.8989027506388095, "flos": 17092653920640.0, "grad_norm": 1.5942315143668169, "language_loss": 0.6659615, "learning_rate": 1.0619973534073334e-07, "loss": 0.68728364, "num_input_tokens_seen": 322483305, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.67578125, "step": 14951, "time_per_iteration": 2.451700448989868 }, { "auxiliary_loss_clip": 0.0110576, "auxiliary_loss_mlp": 0.01028791, "balance_loss_clip": 1.01697099, "balance_loss_mlp": 1.03344011, "epoch": 0.8989628738914776, "flos": 20555124416640.0, "grad_norm": 2.13387049395659, "language_loss": 0.74124956, "learning_rate": 1.0607454848243769e-07, "loss": 0.76259506, "num_input_tokens_seen": 322501905, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.7265625, "step": 14952, "time_per_iteration": 2.4439339637756348 }, { "auxiliary_loss_clip": 0.01104451, "auxiliary_loss_mlp": 0.01035219, "balance_loss_clip": 1.022964, "balance_loss_mlp": 1.03576851, "epoch": 0.8990229971441455, "flos": 16251087196800.0, "grad_norm": 2.1274120092634528, "language_loss": 0.56751192, "learning_rate": 1.0594943344198481e-07, "loss": 0.58890867, "num_input_tokens_seen": 322518135, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 14953, "time_per_iteration": 2.4944863319396973 }, { "auxiliary_loss_clip": 0.01103347, "auxiliary_loss_mlp": 0.01032513, "balance_loss_clip": 1.02065206, "balance_loss_mlp": 1.03538227, "epoch": 0.8990831203968135, "flos": 21981316901760.0, "grad_norm": 15.716483454401063, "language_loss": 0.81772292, "learning_rate": 1.0582439022411915e-07, "loss": 0.83908153, "num_input_tokens_seen": 322537905, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6796875, "step": 14954, "time_per_iteration": 2.4473958015441895 }, { "auxiliary_loss_clip": 0.01102103, "auxiliary_loss_mlp": 0.0103286, "balance_loss_clip": 1.0207305, "balance_loss_mlp": 1.0353651, "epoch": 0.8991432436494814, "flos": 27447171528960.0, "grad_norm": 1.9194316031640475, "language_loss": 0.60348356, "learning_rate": 1.0569941883358224e-07, "loss": 0.62483323, "num_input_tokens_seen": 322557945, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.66796875, "step": 14955, "time_per_iteration": 3.821556806564331 }, { "auxiliary_loss_clip": 0.01101133, "auxiliary_loss_mlp": 0.01027245, "balance_loss_clip": 1.01564622, "balance_loss_mlp": 1.0345397, "epoch": 0.8992033669021494, "flos": 21579835610880.0, "grad_norm": 2.296995283344768, "language_loss": 0.54493791, "learning_rate": 1.0557451927511341e-07, "loss": 0.56622171, "num_input_tokens_seen": 322575765, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6640625, "step": 14956, "time_per_iteration": 2.446436882019043 }, { "auxiliary_loss_clip": 0.01104659, "auxiliary_loss_mlp": 0.01032825, "balance_loss_clip": 1.02070165, "balance_loss_mlp": 1.0355922, "epoch": 0.8992634901548173, "flos": 28584211530240.0, "grad_norm": 2.005161222414963, "language_loss": 0.79834759, "learning_rate": 1.0544969155344863e-07, "loss": 0.81972247, "num_input_tokens_seen": 322595665, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 14957, "time_per_iteration": 2.5047543048858643 }, { "auxiliary_loss_clip": 0.01108497, "auxiliary_loss_mlp": 0.01029851, "balance_loss_clip": 1.01694036, "balance_loss_mlp": 1.03725755, "epoch": 0.8993236134074853, "flos": 19867435557120.0, "grad_norm": 1.7409138170146579, "language_loss": 0.78641772, "learning_rate": 1.0532493567332123e-07, "loss": 0.80780113, "num_input_tokens_seen": 322614755, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 14958, "time_per_iteration": 2.4670512676239014 }, { "auxiliary_loss_clip": 0.01104061, "auxiliary_loss_mlp": 0.01025715, "balance_loss_clip": 1.0142113, "balance_loss_mlp": 1.03706837, "epoch": 0.8993837366601534, "flos": 19390649402880.0, "grad_norm": 1.526332830703961, "language_loss": 0.74727571, "learning_rate": 1.0520025163946277e-07, "loss": 0.76857352, "num_input_tokens_seen": 322633425, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 14959, "time_per_iteration": 3.788839817047119 }, { "auxiliary_loss_clip": 0.01099186, "auxiliary_loss_mlp": 0.01029358, "balance_loss_clip": 1.01775312, "balance_loss_mlp": 1.03305197, "epoch": 0.8994438599128213, "flos": 18551740285440.0, "grad_norm": 2.3996445719966513, "language_loss": 0.68516195, "learning_rate": 1.0507563945660015e-07, "loss": 0.70644736, "num_input_tokens_seen": 322652065, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6640625, "step": 14960, "time_per_iteration": 2.429513931274414 }, { "auxiliary_loss_clip": 0.01104611, "auxiliary_loss_mlp": 0.01027503, "balance_loss_clip": 1.01602924, "balance_loss_mlp": 1.03610611, "epoch": 0.8995039831654893, "flos": 24427587726720.0, "grad_norm": 2.4319480203908608, "language_loss": 0.65975022, "learning_rate": 1.049510991294591e-07, "loss": 0.6810714, "num_input_tokens_seen": 322673275, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6875, "step": 14961, "time_per_iteration": 2.475139617919922 }, { "auxiliary_loss_clip": 0.01099934, "auxiliary_loss_mlp": 0.01027758, "balance_loss_clip": 1.01635611, "balance_loss_mlp": 1.03313863, "epoch": 0.8995641064181572, "flos": 21251324799360.0, "grad_norm": 1.616536741484402, "language_loss": 0.83099806, "learning_rate": 1.0482663066276254e-07, "loss": 0.85227501, "num_input_tokens_seen": 322693375, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66796875, "step": 14962, "time_per_iteration": 5.2811384201049805 }, { "auxiliary_loss_clip": 0.01108846, "auxiliary_loss_mlp": 0.01029439, "balance_loss_clip": 1.0161407, "balance_loss_mlp": 1.03721416, "epoch": 0.8996242296708252, "flos": 23513661054720.0, "grad_norm": 7.318230618479735, "language_loss": 0.7638225, "learning_rate": 1.047022340612298e-07, "loss": 0.78520536, "num_input_tokens_seen": 322712615, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 14963, "time_per_iteration": 2.4872655868530273 }, { "auxiliary_loss_clip": 0.01028987, "auxiliary_loss_mlp": 0.01001024, "balance_loss_clip": 1.00002885, "balance_loss_mlp": 1.00659466, "epoch": 0.8996843529234931, "flos": 62403230430720.0, "grad_norm": 0.7763775724261363, "language_loss": 0.57504922, "learning_rate": 1.0457790932957867e-07, "loss": 0.59534931, "num_input_tokens_seen": 322766855, "router_z_loss_clip": 0.00994873, "router_z_loss_mlp": 0.22460938, "step": 14964, "time_per_iteration": 2.9395229816436768 }, { "auxiliary_loss_clip": 0.01111142, "auxiliary_loss_mlp": 0.01034235, "balance_loss_clip": 1.02103293, "balance_loss_mlp": 1.03804708, "epoch": 0.8997444761761612, "flos": 24236829573120.0, "grad_norm": 3.7322247582229022, "language_loss": 0.68274617, "learning_rate": 1.0445365647252269e-07, "loss": 0.70419991, "num_input_tokens_seen": 322781130, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73046875, "step": 14965, "time_per_iteration": 2.446282148361206 }, { "auxiliary_loss_clip": 0.01104943, "auxiliary_loss_mlp": 0.01031066, "balance_loss_clip": 1.01919901, "balance_loss_mlp": 1.03517163, "epoch": 0.8998045994288291, "flos": 21361103740800.0, "grad_norm": 5.189902790907496, "language_loss": 0.71717596, "learning_rate": 1.0432947549477433e-07, "loss": 0.738536, "num_input_tokens_seen": 322800310, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69921875, "step": 14966, "time_per_iteration": 2.435528516769409 }, { "auxiliary_loss_clip": 0.01105589, "auxiliary_loss_mlp": 0.01031388, "balance_loss_clip": 1.01857853, "balance_loss_mlp": 1.03704214, "epoch": 0.8998647226814971, "flos": 28986159697920.0, "grad_norm": 2.097847667362585, "language_loss": 0.73141462, "learning_rate": 1.0420536640104205e-07, "loss": 0.75278437, "num_input_tokens_seen": 322820955, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.68359375, "step": 14967, "time_per_iteration": 2.5143144130706787 }, { "auxiliary_loss_clip": 0.0110217, "auxiliary_loss_mlp": 0.0103045, "balance_loss_clip": 1.01862454, "balance_loss_mlp": 1.03394651, "epoch": 0.899924845934165, "flos": 13625909706240.0, "grad_norm": 2.2702241034370134, "language_loss": 0.72303176, "learning_rate": 1.040813291960323e-07, "loss": 0.74435794, "num_input_tokens_seen": 322838780, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 14968, "time_per_iteration": 2.419447660446167 }, { "auxiliary_loss_clip": 0.01103917, "auxiliary_loss_mlp": 0.01030392, "balance_loss_clip": 1.01842332, "balance_loss_mlp": 1.03533173, "epoch": 0.899984969186833, "flos": 20882629647360.0, "grad_norm": 2.6124909717705007, "language_loss": 0.71033537, "learning_rate": 1.0395736388444864e-07, "loss": 0.73167843, "num_input_tokens_seen": 322856710, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 14969, "time_per_iteration": 2.470583438873291 }, { "auxiliary_loss_clip": 0.01106877, "auxiliary_loss_mlp": 0.01030593, "balance_loss_clip": 1.01855266, "balance_loss_mlp": 1.03731632, "epoch": 0.9000450924395009, "flos": 20921808407040.0, "grad_norm": 1.950183574304821, "language_loss": 0.76433146, "learning_rate": 1.0383347047099201e-07, "loss": 0.78570616, "num_input_tokens_seen": 322876070, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 14970, "time_per_iteration": 2.4454801082611084 }, { "auxiliary_loss_clip": 0.01105635, "auxiliary_loss_mlp": 0.01035617, "balance_loss_clip": 1.02398849, "balance_loss_mlp": 1.03523326, "epoch": 0.900105215692169, "flos": 17165049782400.0, "grad_norm": 1.714926727303716, "language_loss": 0.73098779, "learning_rate": 1.0370964896035972e-07, "loss": 0.75240028, "num_input_tokens_seen": 322895095, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.703125, "step": 14971, "time_per_iteration": 2.445263624191284 }, { "auxiliary_loss_clip": 0.01104122, "auxiliary_loss_mlp": 0.01027684, "balance_loss_clip": 1.0147264, "balance_loss_mlp": 1.03471732, "epoch": 0.900165338944837, "flos": 19931930426880.0, "grad_norm": 2.3009538481168472, "language_loss": 0.81450582, "learning_rate": 1.035858993572476e-07, "loss": 0.83582389, "num_input_tokens_seen": 322911845, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 14972, "time_per_iteration": 2.432135581970215 }, { "auxiliary_loss_clip": 0.01107818, "auxiliary_loss_mlp": 0.0103212, "balance_loss_clip": 1.01970387, "balance_loss_mlp": 1.03585958, "epoch": 0.9002254621975049, "flos": 16107085572480.0, "grad_norm": 2.1881170153230842, "language_loss": 0.81834662, "learning_rate": 1.0346222166634855e-07, "loss": 0.839746, "num_input_tokens_seen": 322928170, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 14973, "time_per_iteration": 2.434185028076172 }, { "auxiliary_loss_clip": 0.01102966, "auxiliary_loss_mlp": 0.01033502, "balance_loss_clip": 1.02101493, "balance_loss_mlp": 1.03543973, "epoch": 0.9002855854501729, "flos": 28476120528000.0, "grad_norm": 1.8616080652799458, "language_loss": 0.58532178, "learning_rate": 1.0333861589235193e-07, "loss": 0.60668647, "num_input_tokens_seen": 322948165, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.67578125, "step": 14974, "time_per_iteration": 2.509700298309326 }, { "auxiliary_loss_clip": 0.01107176, "auxiliary_loss_mlp": 0.01036072, "balance_loss_clip": 1.02405584, "balance_loss_mlp": 1.03771257, "epoch": 0.9003457087028408, "flos": 25630307746560.0, "grad_norm": 2.328975329719452, "language_loss": 0.63533533, "learning_rate": 1.0321508203994489e-07, "loss": 0.65676779, "num_input_tokens_seen": 322968880, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 14975, "time_per_iteration": 2.5155622959136963 }, { "auxiliary_loss_clip": 0.01104999, "auxiliary_loss_mlp": 0.01030179, "balance_loss_clip": 1.01782322, "balance_loss_mlp": 1.03525579, "epoch": 0.9004058319555088, "flos": 24389414547840.0, "grad_norm": 1.8312294969081855, "language_loss": 0.73280191, "learning_rate": 1.0309162011381257e-07, "loss": 0.75415373, "num_input_tokens_seen": 322989395, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69921875, "step": 14976, "time_per_iteration": 2.474827527999878 }, { "auxiliary_loss_clip": 0.01106609, "auxiliary_loss_mlp": 0.01031289, "balance_loss_clip": 1.01937389, "balance_loss_mlp": 1.03734326, "epoch": 0.9004659552081767, "flos": 29059345658880.0, "grad_norm": 1.808114201513914, "language_loss": 0.69764388, "learning_rate": 1.0296823011863565e-07, "loss": 0.71902287, "num_input_tokens_seen": 323009060, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 14977, "time_per_iteration": 2.5285897254943848 }, { "auxiliary_loss_clip": 0.01105585, "auxiliary_loss_mlp": 0.01034674, "balance_loss_clip": 1.02113223, "balance_loss_mlp": 1.03498423, "epoch": 0.9005260784608448, "flos": 16763855800320.0, "grad_norm": 2.1066638269224076, "language_loss": 0.65729839, "learning_rate": 1.0284491205909351e-07, "loss": 0.67870098, "num_input_tokens_seen": 323027530, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.70703125, "step": 14978, "time_per_iteration": 2.426825523376465 }, { "auxiliary_loss_clip": 0.01109744, "auxiliary_loss_mlp": 0.01035829, "balance_loss_clip": 1.02245998, "balance_loss_mlp": 1.03745401, "epoch": 0.9005862017135127, "flos": 20376002269440.0, "grad_norm": 1.732541580980458, "language_loss": 0.78792375, "learning_rate": 1.0272166593986286e-07, "loss": 0.80937946, "num_input_tokens_seen": 323045370, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.72265625, "step": 14979, "time_per_iteration": 2.4520909786224365 }, { "auxiliary_loss_clip": 0.01028595, "auxiliary_loss_mlp": 0.01002164, "balance_loss_clip": 1.00119293, "balance_loss_mlp": 1.00637031, "epoch": 0.9006463249661807, "flos": 67580255796480.0, "grad_norm": 0.8969217646243586, "language_loss": 0.53546119, "learning_rate": 1.0259849176561642e-07, "loss": 0.55576885, "num_input_tokens_seen": 323105660, "router_z_loss_clip": 0.00970459, "router_z_loss_mlp": 0.22265625, "step": 14980, "time_per_iteration": 3.115839719772339 }, { "auxiliary_loss_clip": 0.01108593, "auxiliary_loss_mlp": 0.01038556, "balance_loss_clip": 1.0252763, "balance_loss_mlp": 1.03717017, "epoch": 0.9007064482188486, "flos": 28293335193600.0, "grad_norm": 2.8882222754921343, "language_loss": 0.82326031, "learning_rate": 1.0247538954102553e-07, "loss": 0.84473181, "num_input_tokens_seen": 323126365, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 14981, "time_per_iteration": 2.5174155235290527 }, { "auxiliary_loss_clip": 0.01102119, "auxiliary_loss_mlp": 0.01029416, "balance_loss_clip": 1.0177331, "balance_loss_mlp": 1.03481197, "epoch": 0.9007665714715166, "flos": 21616320850560.0, "grad_norm": 2.5478476713465517, "language_loss": 0.81298631, "learning_rate": 1.0235235927075758e-07, "loss": 0.83430165, "num_input_tokens_seen": 323145655, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 14982, "time_per_iteration": 2.445222854614258 }, { "auxiliary_loss_clip": 0.01099787, "auxiliary_loss_mlp": 0.01031987, "balance_loss_clip": 1.02122283, "balance_loss_mlp": 1.03495264, "epoch": 0.9008266947241845, "flos": 26541864120960.0, "grad_norm": 2.4357957230001044, "language_loss": 0.71510595, "learning_rate": 1.0222940095947885e-07, "loss": 0.73642373, "num_input_tokens_seen": 323164540, "router_z_loss_clip": 0.10791016, "router_z_loss_mlp": 0.6484375, "step": 14983, "time_per_iteration": 2.496030330657959 }, { "auxiliary_loss_clip": 0.01103319, "auxiliary_loss_mlp": 0.01024531, "balance_loss_clip": 1.01334918, "balance_loss_mlp": 1.03695703, "epoch": 0.9008868179768525, "flos": 23110527738240.0, "grad_norm": 1.6712096579130615, "language_loss": 0.75154996, "learning_rate": 1.0210651461185115e-07, "loss": 0.77282846, "num_input_tokens_seen": 323186960, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6640625, "step": 14984, "time_per_iteration": 2.5526363849639893 }, { "auxiliary_loss_clip": 0.01099581, "auxiliary_loss_mlp": 0.0103441, "balance_loss_clip": 1.02242351, "balance_loss_mlp": 1.03330171, "epoch": 0.9009469412295206, "flos": 19060809788160.0, "grad_norm": 1.5329815799582907, "language_loss": 0.70284879, "learning_rate": 1.0198370023253456e-07, "loss": 0.72418869, "num_input_tokens_seen": 323206135, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6640625, "step": 14985, "time_per_iteration": 2.5388991832733154 }, { "auxiliary_loss_clip": 0.01104511, "auxiliary_loss_mlp": 0.01033892, "balance_loss_clip": 1.02105904, "balance_loss_mlp": 1.03334939, "epoch": 0.9010070644821885, "flos": 23222281927680.0, "grad_norm": 2.1535798094162146, "language_loss": 0.70346868, "learning_rate": 1.0186095782618643e-07, "loss": 0.72485268, "num_input_tokens_seen": 323225980, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 14986, "time_per_iteration": 2.459791421890259 }, { "auxiliary_loss_clip": 0.01104207, "auxiliary_loss_mlp": 0.01035905, "balance_loss_clip": 1.02347124, "balance_loss_mlp": 1.0344404, "epoch": 0.9010671877348565, "flos": 17384823146880.0, "grad_norm": 1.994655207238969, "language_loss": 0.76872402, "learning_rate": 1.0173828739746104e-07, "loss": 0.79012513, "num_input_tokens_seen": 323243700, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 14987, "time_per_iteration": 2.4832403659820557 }, { "auxiliary_loss_clip": 0.01104259, "auxiliary_loss_mlp": 0.01029336, "balance_loss_clip": 1.0170033, "balance_loss_mlp": 1.0359416, "epoch": 0.9011273109875244, "flos": 21908166854400.0, "grad_norm": 1.8717376906874694, "language_loss": 0.73936093, "learning_rate": 1.0161568895100981e-07, "loss": 0.76069689, "num_input_tokens_seen": 323261535, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 14988, "time_per_iteration": 2.442140579223633 }, { "auxiliary_loss_clip": 0.01108307, "auxiliary_loss_mlp": 0.01027195, "balance_loss_clip": 1.01423073, "balance_loss_mlp": 1.03662658, "epoch": 0.9011874342401924, "flos": 24060831909120.0, "grad_norm": 2.3489864435083057, "language_loss": 0.69347256, "learning_rate": 1.0149316249148188e-07, "loss": 0.7148276, "num_input_tokens_seen": 323281855, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 14989, "time_per_iteration": 2.48665452003479 }, { "auxiliary_loss_clip": 0.01104865, "auxiliary_loss_mlp": 0.01029253, "balance_loss_clip": 1.01724863, "balance_loss_mlp": 1.03561378, "epoch": 0.9012475574928603, "flos": 16758791982720.0, "grad_norm": 3.545708488103557, "language_loss": 0.79974508, "learning_rate": 1.0137070802352376e-07, "loss": 0.82108629, "num_input_tokens_seen": 323299505, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 14990, "time_per_iteration": 2.4053235054016113 }, { "auxiliary_loss_clip": 0.01109121, "auxiliary_loss_mlp": 0.01034885, "balance_loss_clip": 1.02249956, "balance_loss_mlp": 1.03790593, "epoch": 0.9013076807455284, "flos": 19971109186560.0, "grad_norm": 2.3629844590688447, "language_loss": 0.77894866, "learning_rate": 1.0124832555177842e-07, "loss": 0.80038869, "num_input_tokens_seen": 323318365, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 14991, "time_per_iteration": 2.466501474380493 }, { "auxiliary_loss_clip": 0.01029189, "auxiliary_loss_mlp": 0.01002629, "balance_loss_clip": 1.00163352, "balance_loss_mlp": 1.00679326, "epoch": 0.9013678039981963, "flos": 65180274624000.0, "grad_norm": 0.7782200578289884, "language_loss": 0.60237432, "learning_rate": 1.0112601508088726e-07, "loss": 0.62269247, "num_input_tokens_seen": 323371835, "router_z_loss_clip": 0.00994873, "router_z_loss_mlp": 0.22460938, "step": 14992, "time_per_iteration": 3.0122528076171875 }, { "auxiliary_loss_clip": 0.0110201, "auxiliary_loss_mlp": 0.0102605, "balance_loss_clip": 1.01379526, "balance_loss_mlp": 1.0335989, "epoch": 0.9014279272508643, "flos": 20521224956160.0, "grad_norm": 2.141405851832273, "language_loss": 0.82887191, "learning_rate": 1.0100377661548764e-07, "loss": 0.85015249, "num_input_tokens_seen": 323388495, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.68359375, "step": 14993, "time_per_iteration": 2.4583933353424072 }, { "auxiliary_loss_clip": 0.01103721, "auxiliary_loss_mlp": 0.01032261, "balance_loss_clip": 1.01986945, "balance_loss_mlp": 1.03452051, "epoch": 0.9014880505035322, "flos": 17309051406720.0, "grad_norm": 2.0883561786878047, "language_loss": 0.73143172, "learning_rate": 1.0088161016021502e-07, "loss": 0.75279152, "num_input_tokens_seen": 323405280, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69140625, "step": 14994, "time_per_iteration": 2.414203643798828 }, { "auxiliary_loss_clip": 0.01100182, "auxiliary_loss_mlp": 0.01028405, "balance_loss_clip": 1.01722908, "balance_loss_mlp": 1.03359795, "epoch": 0.9015481737562002, "flos": 28402862739840.0, "grad_norm": 3.3043148311940267, "language_loss": 0.64720309, "learning_rate": 1.0075951571970187e-07, "loss": 0.66848898, "num_input_tokens_seen": 323425310, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6640625, "step": 14995, "time_per_iteration": 2.5190861225128174 }, { "auxiliary_loss_clip": 0.01104741, "auxiliary_loss_mlp": 0.01030717, "balance_loss_clip": 1.0178957, "balance_loss_mlp": 1.03447878, "epoch": 0.9016082970088681, "flos": 29752672953600.0, "grad_norm": 1.7360862985768633, "language_loss": 0.66364634, "learning_rate": 1.0063749329857873e-07, "loss": 0.68500096, "num_input_tokens_seen": 323447805, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 14996, "time_per_iteration": 2.5099480152130127 }, { "auxiliary_loss_clip": 0.01100284, "auxiliary_loss_mlp": 0.01030104, "balance_loss_clip": 1.01831436, "balance_loss_mlp": 1.03302407, "epoch": 0.9016684202615362, "flos": 23513230091520.0, "grad_norm": 1.6635015961102444, "language_loss": 0.65892327, "learning_rate": 1.0051554290147168e-07, "loss": 0.68022722, "num_input_tokens_seen": 323467150, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 14997, "time_per_iteration": 3.938462734222412 }, { "auxiliary_loss_clip": 0.01103424, "auxiliary_loss_mlp": 0.01031926, "balance_loss_clip": 1.0198319, "balance_loss_mlp": 1.0349673, "epoch": 0.9017285435142042, "flos": 16979247705600.0, "grad_norm": 1.9288593524660471, "language_loss": 0.77329296, "learning_rate": 1.0039366453300613e-07, "loss": 0.79464644, "num_input_tokens_seen": 323484250, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 14998, "time_per_iteration": 2.455646276473999 }, { "auxiliary_loss_clip": 0.01103663, "auxiliary_loss_mlp": 0.01030126, "balance_loss_clip": 1.01746035, "balance_loss_mlp": 1.0341903, "epoch": 0.9017886667668721, "flos": 21393351175680.0, "grad_norm": 1.6598867291985757, "language_loss": 0.75138092, "learning_rate": 1.0027185819780281e-07, "loss": 0.77271879, "num_input_tokens_seen": 323502910, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 14999, "time_per_iteration": 2.468021869659424 }, { "auxiliary_loss_clip": 0.01102806, "auxiliary_loss_mlp": 0.01029551, "balance_loss_clip": 1.01697409, "balance_loss_mlp": 1.03477931, "epoch": 0.9018487900195401, "flos": 20996574566400.0, "grad_norm": 2.098888497785466, "language_loss": 0.75457954, "learning_rate": 1.0015012390048117e-07, "loss": 0.77590311, "num_input_tokens_seen": 323521820, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 15000, "time_per_iteration": 3.8140430450439453 }, { "auxiliary_loss_clip": 0.01100927, "auxiliary_loss_mlp": 0.01027968, "balance_loss_clip": 1.01669693, "balance_loss_mlp": 1.03392196, "epoch": 0.901908913272208, "flos": 53358443458560.0, "grad_norm": 2.1326107996105996, "language_loss": 0.80924749, "learning_rate": 1.0002846164565704e-07, "loss": 0.83053648, "num_input_tokens_seen": 323543200, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.671875, "step": 15001, "time_per_iteration": 2.7393085956573486 }, { "auxiliary_loss_clip": 0.01103127, "auxiliary_loss_mlp": 0.0102894, "balance_loss_clip": 1.01784205, "balance_loss_mlp": 1.03588724, "epoch": 0.901969036524876, "flos": 22089838867200.0, "grad_norm": 1.4817436972799325, "language_loss": 0.78378302, "learning_rate": 9.990687143794407e-08, "loss": 0.80510366, "num_input_tokens_seen": 323563075, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.671875, "step": 15002, "time_per_iteration": 2.452075958251953 }, { "auxiliary_loss_clip": 0.01106619, "auxiliary_loss_mlp": 0.01033247, "balance_loss_clip": 1.02048588, "balance_loss_mlp": 1.03764391, "epoch": 0.9020291597775439, "flos": 23835025059840.0, "grad_norm": 3.06322180589594, "language_loss": 0.67947501, "learning_rate": 9.978535328195347e-08, "loss": 0.70087367, "num_input_tokens_seen": 323579065, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 15003, "time_per_iteration": 3.879450798034668 }, { "auxiliary_loss_clip": 0.01105105, "auxiliary_loss_mlp": 0.01032565, "balance_loss_clip": 1.01988149, "balance_loss_mlp": 1.03462601, "epoch": 0.902089283030212, "flos": 18326005263360.0, "grad_norm": 1.8367362136637695, "language_loss": 0.85958844, "learning_rate": 9.9663907182292e-08, "loss": 0.88096517, "num_input_tokens_seen": 323594835, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 15004, "time_per_iteration": 3.9130916595458984 }, { "auxiliary_loss_clip": 0.01106116, "auxiliary_loss_mlp": 0.01031355, "balance_loss_clip": 1.01889789, "balance_loss_mlp": 1.03609741, "epoch": 0.9021494062828799, "flos": 24170359455360.0, "grad_norm": 3.160142670296263, "language_loss": 0.72272867, "learning_rate": 9.954253314356575e-08, "loss": 0.74410337, "num_input_tokens_seen": 323611475, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 15005, "time_per_iteration": 2.4708728790283203 }, { "auxiliary_loss_clip": 0.01105604, "auxiliary_loss_mlp": 0.01032974, "balance_loss_clip": 1.02023053, "balance_loss_mlp": 1.03321064, "epoch": 0.9022095295355479, "flos": 21616859554560.0, "grad_norm": 1.9690989045090703, "language_loss": 0.70903677, "learning_rate": 9.942123117037748e-08, "loss": 0.7304225, "num_input_tokens_seen": 323629730, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 15006, "time_per_iteration": 2.4669265747070312 }, { "auxiliary_loss_clip": 0.01107806, "auxiliary_loss_mlp": 0.01030543, "balance_loss_clip": 1.01883698, "balance_loss_mlp": 1.03702712, "epoch": 0.9022696527882158, "flos": 18726229578240.0, "grad_norm": 2.1440411419746037, "language_loss": 0.84501582, "learning_rate": 9.930000126732618e-08, "loss": 0.86639929, "num_input_tokens_seen": 323646000, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.7109375, "step": 15007, "time_per_iteration": 2.4172005653381348 }, { "auxiliary_loss_clip": 0.01102442, "auxiliary_loss_mlp": 0.01030757, "balance_loss_clip": 1.01869869, "balance_loss_mlp": 1.03548133, "epoch": 0.9023297760408838, "flos": 26761206522240.0, "grad_norm": 2.222539076097586, "language_loss": 0.78907901, "learning_rate": 9.917884343900928e-08, "loss": 0.81041098, "num_input_tokens_seen": 323667250, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.66796875, "step": 15008, "time_per_iteration": 2.5140817165374756 }, { "auxiliary_loss_clip": 0.01100346, "auxiliary_loss_mlp": 0.01029066, "balance_loss_clip": 1.01729441, "balance_loss_mlp": 1.03572619, "epoch": 0.9023898992935517, "flos": 20522553759360.0, "grad_norm": 1.9277536826283248, "language_loss": 0.73441517, "learning_rate": 9.905775769002156e-08, "loss": 0.75570935, "num_input_tokens_seen": 323687150, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6484375, "step": 15009, "time_per_iteration": 2.4840643405914307 }, { "auxiliary_loss_clip": 0.01103708, "auxiliary_loss_mlp": 0.01036378, "balance_loss_clip": 1.02432632, "balance_loss_mlp": 1.03569722, "epoch": 0.9024500225462198, "flos": 17456644391040.0, "grad_norm": 1.923035802158132, "language_loss": 0.73423398, "learning_rate": 9.893674402495399e-08, "loss": 0.75563478, "num_input_tokens_seen": 323703660, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 15010, "time_per_iteration": 2.4622178077697754 }, { "auxiliary_loss_clip": 0.01105462, "auxiliary_loss_mlp": 0.01032026, "balance_loss_clip": 1.01959276, "balance_loss_mlp": 1.03586102, "epoch": 0.9025101457988878, "flos": 20813609664000.0, "grad_norm": 1.9789856248775164, "language_loss": 0.74397284, "learning_rate": 9.881580244839538e-08, "loss": 0.76534772, "num_input_tokens_seen": 323722060, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 15011, "time_per_iteration": 2.456475019454956 }, { "auxiliary_loss_clip": 0.01107548, "auxiliary_loss_mlp": 0.01031073, "balance_loss_clip": 1.01828837, "balance_loss_mlp": 1.03542995, "epoch": 0.9025702690515557, "flos": 19026371623680.0, "grad_norm": 1.8233753624054339, "language_loss": 0.72989959, "learning_rate": 9.869493296493204e-08, "loss": 0.75128579, "num_input_tokens_seen": 323740645, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 15012, "time_per_iteration": 2.453585386276245 }, { "auxiliary_loss_clip": 0.01104755, "auxiliary_loss_mlp": 0.01034749, "balance_loss_clip": 1.02297091, "balance_loss_mlp": 1.03630865, "epoch": 0.9026303923042237, "flos": 19682818629120.0, "grad_norm": 1.6785523153667243, "language_loss": 0.69365597, "learning_rate": 9.857413557914763e-08, "loss": 0.71505105, "num_input_tokens_seen": 323758905, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.68359375, "step": 15013, "time_per_iteration": 2.455565929412842 }, { "auxiliary_loss_clip": 0.01099762, "auxiliary_loss_mlp": 0.01032012, "balance_loss_clip": 1.02029967, "balance_loss_mlp": 1.03360832, "epoch": 0.9026905155568916, "flos": 24608110504320.0, "grad_norm": 1.5839627856482374, "language_loss": 0.72982103, "learning_rate": 9.845341029562249e-08, "loss": 0.75113881, "num_input_tokens_seen": 323780595, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 15014, "time_per_iteration": 2.5042576789855957 }, { "auxiliary_loss_clip": 0.01102423, "auxiliary_loss_mlp": 0.01028789, "balance_loss_clip": 1.01604581, "balance_loss_mlp": 1.03321993, "epoch": 0.9027506388095596, "flos": 20521799573760.0, "grad_norm": 2.345878313425182, "language_loss": 0.71885848, "learning_rate": 9.833275711893474e-08, "loss": 0.7401706, "num_input_tokens_seen": 323798160, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 15015, "time_per_iteration": 2.4420547485351562 }, { "auxiliary_loss_clip": 0.01104355, "auxiliary_loss_mlp": 0.01027856, "balance_loss_clip": 1.01620317, "balance_loss_mlp": 1.03463697, "epoch": 0.9028107620622275, "flos": 22784494965120.0, "grad_norm": 2.140546295343701, "language_loss": 0.68941218, "learning_rate": 9.821217605365895e-08, "loss": 0.71073425, "num_input_tokens_seen": 323816810, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 15016, "time_per_iteration": 2.481015920639038 }, { "auxiliary_loss_clip": 0.01101786, "auxiliary_loss_mlp": 0.01026666, "balance_loss_clip": 1.01548445, "balance_loss_mlp": 1.03415012, "epoch": 0.9028708853148956, "flos": 25410534382080.0, "grad_norm": 1.8688898500724902, "language_loss": 0.70797706, "learning_rate": 9.809166710436855e-08, "loss": 0.72926152, "num_input_tokens_seen": 323836900, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.67578125, "step": 15017, "time_per_iteration": 2.487779378890991 }, { "auxiliary_loss_clip": 0.01107462, "auxiliary_loss_mlp": 0.01034968, "balance_loss_clip": 1.02325535, "balance_loss_mlp": 1.03877544, "epoch": 0.9029310085675635, "flos": 21871322478720.0, "grad_norm": 1.8717478517691521, "language_loss": 0.6969353, "learning_rate": 9.797123027563237e-08, "loss": 0.71835959, "num_input_tokens_seen": 323855325, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 15018, "time_per_iteration": 2.4814460277557373 }, { "auxiliary_loss_clip": 0.01105946, "auxiliary_loss_mlp": 0.01031883, "balance_loss_clip": 1.01941979, "balance_loss_mlp": 1.03616405, "epoch": 0.9029911318202315, "flos": 26214394803840.0, "grad_norm": 1.8643447944191238, "language_loss": 0.69216049, "learning_rate": 9.785086557201782e-08, "loss": 0.71353877, "num_input_tokens_seen": 323875650, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 15019, "time_per_iteration": 2.511072874069214 }, { "auxiliary_loss_clip": 0.01100368, "auxiliary_loss_mlp": 0.01029837, "balance_loss_clip": 1.01817846, "balance_loss_mlp": 1.03422844, "epoch": 0.9030512550728994, "flos": 15961360095360.0, "grad_norm": 2.918934977186504, "language_loss": 0.72209162, "learning_rate": 9.773057299808951e-08, "loss": 0.74339366, "num_input_tokens_seen": 323892920, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.66015625, "step": 15020, "time_per_iteration": 2.4305825233459473 }, { "auxiliary_loss_clip": 0.01102921, "auxiliary_loss_mlp": 0.0103043, "balance_loss_clip": 1.01776457, "balance_loss_mlp": 1.03307629, "epoch": 0.9031113783255674, "flos": 23987610034560.0, "grad_norm": 1.7484727565861935, "language_loss": 0.74131727, "learning_rate": 9.7610352558408e-08, "loss": 0.76265079, "num_input_tokens_seen": 323913835, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 15021, "time_per_iteration": 2.471824884414673 }, { "auxiliary_loss_clip": 0.01106864, "auxiliary_loss_mlp": 0.01030193, "balance_loss_clip": 1.01741958, "balance_loss_mlp": 1.03567553, "epoch": 0.9031715015782353, "flos": 22237216369920.0, "grad_norm": 3.136930776266228, "language_loss": 0.72742891, "learning_rate": 9.749020425753251e-08, "loss": 0.74879944, "num_input_tokens_seen": 323933440, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 15022, "time_per_iteration": 2.4760708808898926 }, { "auxiliary_loss_clip": 0.01098185, "auxiliary_loss_mlp": 0.0102573, "balance_loss_clip": 1.01442301, "balance_loss_mlp": 1.03431201, "epoch": 0.9032316248309034, "flos": 26323168164480.0, "grad_norm": 1.9694646699409504, "language_loss": 0.72227621, "learning_rate": 9.737012810001943e-08, "loss": 0.74351537, "num_input_tokens_seen": 323954090, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.63671875, "step": 15023, "time_per_iteration": 2.489856481552124 }, { "auxiliary_loss_clip": 0.0110332, "auxiliary_loss_mlp": 0.01032578, "balance_loss_clip": 1.02095532, "balance_loss_mlp": 1.03524244, "epoch": 0.9032917480835713, "flos": 22636686499200.0, "grad_norm": 2.1241424306695653, "language_loss": 0.82393599, "learning_rate": 9.725012409042155e-08, "loss": 0.84529495, "num_input_tokens_seen": 323974040, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 15024, "time_per_iteration": 2.4755449295043945 }, { "auxiliary_loss_clip": 0.01105054, "auxiliary_loss_mlp": 0.01024965, "balance_loss_clip": 1.01327634, "balance_loss_mlp": 1.03482723, "epoch": 0.9033518713362393, "flos": 23878764846720.0, "grad_norm": 1.5956246234313016, "language_loss": 0.69615418, "learning_rate": 9.713019223328966e-08, "loss": 0.71745437, "num_input_tokens_seen": 323996125, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.703125, "step": 15025, "time_per_iteration": 2.491847515106201 }, { "auxiliary_loss_clip": 0.01100026, "auxiliary_loss_mlp": 0.01029754, "balance_loss_clip": 1.01782703, "balance_loss_mlp": 1.03245282, "epoch": 0.9034119945889073, "flos": 26905279973760.0, "grad_norm": 1.6470803601295358, "language_loss": 0.76979268, "learning_rate": 9.70103325331717e-08, "loss": 0.79109049, "num_input_tokens_seen": 324017645, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 15026, "time_per_iteration": 2.5031793117523193 }, { "auxiliary_loss_clip": 0.01104464, "auxiliary_loss_mlp": 0.01029575, "balance_loss_clip": 1.01812506, "balance_loss_mlp": 1.03689277, "epoch": 0.9034721178415752, "flos": 20850166730880.0, "grad_norm": 1.8231886376949147, "language_loss": 0.68516892, "learning_rate": 9.68905449946129e-08, "loss": 0.70650935, "num_input_tokens_seen": 324036875, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.67578125, "step": 15027, "time_per_iteration": 2.4670910835266113 }, { "auxiliary_loss_clip": 0.01100277, "auxiliary_loss_mlp": 0.01027888, "balance_loss_clip": 1.01636076, "balance_loss_mlp": 1.0348258, "epoch": 0.9035322410942432, "flos": 22234307368320.0, "grad_norm": 1.7309367461962175, "language_loss": 0.7594952, "learning_rate": 9.677082962215477e-08, "loss": 0.78077686, "num_input_tokens_seen": 324057045, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.65625, "step": 15028, "time_per_iteration": 2.4774177074432373 }, { "auxiliary_loss_clip": 0.01102347, "auxiliary_loss_mlp": 0.01032372, "balance_loss_clip": 1.02104688, "balance_loss_mlp": 1.03527534, "epoch": 0.9035923643469111, "flos": 25923410726400.0, "grad_norm": 1.6961022084275827, "language_loss": 0.69074053, "learning_rate": 9.665118642033765e-08, "loss": 0.71208769, "num_input_tokens_seen": 324079735, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66796875, "step": 15029, "time_per_iteration": 2.5224688053131104 }, { "auxiliary_loss_clip": 0.01106644, "auxiliary_loss_mlp": 0.01030874, "balance_loss_clip": 1.01791549, "balance_loss_mlp": 1.03630686, "epoch": 0.9036524875995792, "flos": 20339804338560.0, "grad_norm": 2.1962476065974768, "language_loss": 0.73758703, "learning_rate": 9.653161539369858e-08, "loss": 0.75896227, "num_input_tokens_seen": 324097785, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 15030, "time_per_iteration": 2.461215019226074 }, { "auxiliary_loss_clip": 0.01105828, "auxiliary_loss_mlp": 0.01029546, "balance_loss_clip": 1.01741683, "balance_loss_mlp": 1.03535461, "epoch": 0.9037126108522471, "flos": 40114624677120.0, "grad_norm": 1.8966995438817915, "language_loss": 0.68250459, "learning_rate": 9.641211654677151e-08, "loss": 0.70385838, "num_input_tokens_seen": 324121625, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.703125, "step": 15031, "time_per_iteration": 2.617938995361328 }, { "auxiliary_loss_clip": 0.01100485, "auxiliary_loss_mlp": 0.01026259, "balance_loss_clip": 1.01482701, "balance_loss_mlp": 1.03371119, "epoch": 0.9037727341049151, "flos": 23332024955520.0, "grad_norm": 1.6615488778774314, "language_loss": 0.76592863, "learning_rate": 9.629268988408723e-08, "loss": 0.7871961, "num_input_tokens_seen": 324142535, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66796875, "step": 15032, "time_per_iteration": 2.4975295066833496 }, { "auxiliary_loss_clip": 0.01104329, "auxiliary_loss_mlp": 0.01030084, "balance_loss_clip": 1.01813352, "balance_loss_mlp": 1.03502572, "epoch": 0.903832857357583, "flos": 12822659815680.0, "grad_norm": 4.053122608217073, "language_loss": 0.74812907, "learning_rate": 9.617333541017502e-08, "loss": 0.7694732, "num_input_tokens_seen": 324159610, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 15033, "time_per_iteration": 2.4330520629882812 }, { "auxiliary_loss_clip": 0.01105008, "auxiliary_loss_mlp": 0.01036149, "balance_loss_clip": 1.02336931, "balance_loss_mlp": 1.03517342, "epoch": 0.903892980610251, "flos": 25703026830720.0, "grad_norm": 1.9480687614374017, "language_loss": 0.74003267, "learning_rate": 9.605405312956105e-08, "loss": 0.76144421, "num_input_tokens_seen": 324182510, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 15034, "time_per_iteration": 2.5287840366363525 }, { "auxiliary_loss_clip": 0.01104845, "auxiliary_loss_mlp": 0.01031991, "balance_loss_clip": 1.019701, "balance_loss_mlp": 1.03676736, "epoch": 0.9039531038629189, "flos": 14684089397760.0, "grad_norm": 1.7728189138337473, "language_loss": 0.63289618, "learning_rate": 9.593484304676791e-08, "loss": 0.65426457, "num_input_tokens_seen": 324200555, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 15035, "time_per_iteration": 2.4306375980377197 }, { "auxiliary_loss_clip": 0.01105345, "auxiliary_loss_mlp": 0.01030309, "balance_loss_clip": 1.01759541, "balance_loss_mlp": 1.03634536, "epoch": 0.904013227115587, "flos": 24024921287040.0, "grad_norm": 2.042753049774842, "language_loss": 0.61742008, "learning_rate": 9.581570516631643e-08, "loss": 0.63877666, "num_input_tokens_seen": 324220255, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 15036, "time_per_iteration": 2.4793014526367188 }, { "auxiliary_loss_clip": 0.01100129, "auxiliary_loss_mlp": 0.01030483, "balance_loss_clip": 1.01868081, "balance_loss_mlp": 1.03465593, "epoch": 0.9040733503682549, "flos": 22856459863680.0, "grad_norm": 1.9543476920546534, "language_loss": 0.823497, "learning_rate": 9.569663949272455e-08, "loss": 0.84480315, "num_input_tokens_seen": 324237855, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.65625, "step": 15037, "time_per_iteration": 2.495856523513794 }, { "auxiliary_loss_clip": 0.01107285, "auxiliary_loss_mlp": 0.01028354, "balance_loss_clip": 1.01607513, "balance_loss_mlp": 1.03669655, "epoch": 0.9041334736209229, "flos": 19974951941760.0, "grad_norm": 2.1129631263882813, "language_loss": 0.67514312, "learning_rate": 9.557764603050667e-08, "loss": 0.69649947, "num_input_tokens_seen": 324257050, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.70703125, "step": 15038, "time_per_iteration": 2.431957960128784 }, { "auxiliary_loss_clip": 0.01103885, "auxiliary_loss_mlp": 0.01037895, "balance_loss_clip": 1.02532494, "balance_loss_mlp": 1.03496647, "epoch": 0.9041935968735909, "flos": 17530548624000.0, "grad_norm": 2.224842743873458, "language_loss": 0.75402606, "learning_rate": 9.545872478417494e-08, "loss": 0.77544379, "num_input_tokens_seen": 324275510, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 15039, "time_per_iteration": 3.91967511177063 }, { "auxiliary_loss_clip": 0.01102367, "auxiliary_loss_mlp": 0.01028984, "balance_loss_clip": 1.01716495, "balance_loss_mlp": 1.03577125, "epoch": 0.9042537201262588, "flos": 22780149419520.0, "grad_norm": 1.9351966420768079, "language_loss": 0.70051825, "learning_rate": 9.533987575823977e-08, "loss": 0.72183174, "num_input_tokens_seen": 324295150, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6640625, "step": 15040, "time_per_iteration": 2.4743728637695312 }, { "auxiliary_loss_clip": 0.01101347, "auxiliary_loss_mlp": 0.01025572, "balance_loss_clip": 1.01394272, "balance_loss_mlp": 1.03391671, "epoch": 0.9043138433789268, "flos": 20595416497920.0, "grad_norm": 2.452020584255588, "language_loss": 0.67391539, "learning_rate": 9.522109895720709e-08, "loss": 0.69518459, "num_input_tokens_seen": 324313855, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 15041, "time_per_iteration": 2.4516067504882812 }, { "auxiliary_loss_clip": 0.01103037, "auxiliary_loss_mlp": 0.01028713, "balance_loss_clip": 1.01627982, "balance_loss_mlp": 1.03450108, "epoch": 0.9043739666315948, "flos": 32962978995840.0, "grad_norm": 2.1514485590648684, "language_loss": 0.57467622, "learning_rate": 9.510239438558155e-08, "loss": 0.59599376, "num_input_tokens_seen": 324338465, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6875, "step": 15042, "time_per_iteration": 3.987308979034424 }, { "auxiliary_loss_clip": 0.01028543, "auxiliary_loss_mlp": 0.01001339, "balance_loss_clip": 1.00029564, "balance_loss_mlp": 1.00608635, "epoch": 0.9044340898842628, "flos": 67296418525440.0, "grad_norm": 0.8162610970583137, "language_loss": 0.56967139, "learning_rate": 9.498376204786351e-08, "loss": 0.58997023, "num_input_tokens_seen": 324398740, "router_z_loss_clip": 0.01043701, "router_z_loss_mlp": 0.22460938, "step": 15043, "time_per_iteration": 3.070753574371338 }, { "auxiliary_loss_clip": 0.01103924, "auxiliary_loss_mlp": 0.01026289, "balance_loss_clip": 1.01356936, "balance_loss_mlp": 1.03460503, "epoch": 0.9044942131369307, "flos": 17713154390400.0, "grad_norm": 4.466030824646577, "language_loss": 0.70187968, "learning_rate": 9.486520194855274e-08, "loss": 0.72318184, "num_input_tokens_seen": 324417335, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 15044, "time_per_iteration": 2.4532032012939453 }, { "auxiliary_loss_clip": 0.01104723, "auxiliary_loss_mlp": 0.01036332, "balance_loss_clip": 1.02323079, "balance_loss_mlp": 1.03502345, "epoch": 0.9045543363895987, "flos": 17820563034240.0, "grad_norm": 5.11381633149618, "language_loss": 0.69911814, "learning_rate": 9.474671409214407e-08, "loss": 0.72052866, "num_input_tokens_seen": 324433240, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 15045, "time_per_iteration": 5.2618560791015625 }, { "auxiliary_loss_clip": 0.01109077, "auxiliary_loss_mlp": 0.01035201, "balance_loss_clip": 1.02228439, "balance_loss_mlp": 1.03824413, "epoch": 0.9046144596422666, "flos": 21872723109120.0, "grad_norm": 1.9439623508082882, "language_loss": 0.65680325, "learning_rate": 9.462829848313081e-08, "loss": 0.67824602, "num_input_tokens_seen": 324452675, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 15046, "time_per_iteration": 2.4836835861206055 }, { "auxiliary_loss_clip": 0.01105616, "auxiliary_loss_mlp": 0.01032829, "balance_loss_clip": 1.02017546, "balance_loss_mlp": 1.03457153, "epoch": 0.9046745828949346, "flos": 17672646827520.0, "grad_norm": 2.560195471821705, "language_loss": 0.62208182, "learning_rate": 9.450995512600379e-08, "loss": 0.64346629, "num_input_tokens_seen": 324467865, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 15047, "time_per_iteration": 2.4306230545043945 }, { "auxiliary_loss_clip": 0.01104048, "auxiliary_loss_mlp": 0.01028873, "balance_loss_clip": 1.01743495, "balance_loss_mlp": 1.03601325, "epoch": 0.9047347061476025, "flos": 25702559953920.0, "grad_norm": 1.6806129530817755, "language_loss": 0.71371239, "learning_rate": 9.439168402525032e-08, "loss": 0.73504156, "num_input_tokens_seen": 324490430, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 15048, "time_per_iteration": 2.5315182209014893 }, { "auxiliary_loss_clip": 0.01103787, "auxiliary_loss_mlp": 0.01031344, "balance_loss_clip": 1.01833797, "balance_loss_mlp": 1.03355229, "epoch": 0.9047948294002706, "flos": 15158146118400.0, "grad_norm": 2.6695281883651205, "language_loss": 0.75423127, "learning_rate": 9.427348518535483e-08, "loss": 0.77558261, "num_input_tokens_seen": 324506620, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 15049, "time_per_iteration": 2.4197633266448975 }, { "auxiliary_loss_clip": 0.01103636, "auxiliary_loss_mlp": 0.01027886, "balance_loss_clip": 1.01577413, "balance_loss_mlp": 1.03697395, "epoch": 0.9048549526529385, "flos": 21872292145920.0, "grad_norm": 2.158381680385367, "language_loss": 0.75781476, "learning_rate": 9.415535861079993e-08, "loss": 0.77912998, "num_input_tokens_seen": 324525505, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.66796875, "step": 15050, "time_per_iteration": 2.507033348083496 }, { "auxiliary_loss_clip": 0.01103634, "auxiliary_loss_mlp": 0.01033866, "balance_loss_clip": 1.021963, "balance_loss_mlp": 1.03471029, "epoch": 0.9049150759056065, "flos": 23546626761600.0, "grad_norm": 2.152999542451171, "language_loss": 0.81787246, "learning_rate": 9.403730430606472e-08, "loss": 0.83924747, "num_input_tokens_seen": 324544415, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 15051, "time_per_iteration": 2.4697210788726807 }, { "auxiliary_loss_clip": 0.01103747, "auxiliary_loss_mlp": 0.01031726, "balance_loss_clip": 1.01991236, "balance_loss_mlp": 1.03511465, "epoch": 0.9049751991582745, "flos": 19645902426240.0, "grad_norm": 2.206419732265858, "language_loss": 0.88901615, "learning_rate": 9.391932227562582e-08, "loss": 0.91037083, "num_input_tokens_seen": 324562555, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 15052, "time_per_iteration": 2.4515633583068848 }, { "auxiliary_loss_clip": 0.01109186, "auxiliary_loss_mlp": 0.01030212, "balance_loss_clip": 1.01783252, "balance_loss_mlp": 1.0374186, "epoch": 0.9050353224109424, "flos": 15596220389760.0, "grad_norm": 2.4745220757149062, "language_loss": 0.77328038, "learning_rate": 9.380141252395724e-08, "loss": 0.7946744, "num_input_tokens_seen": 324580865, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.71875, "step": 15053, "time_per_iteration": 2.4245121479034424 }, { "auxiliary_loss_clip": 0.01101343, "auxiliary_loss_mlp": 0.01035915, "balance_loss_clip": 1.02379751, "balance_loss_mlp": 1.03453493, "epoch": 0.9050954456636104, "flos": 28183592165760.0, "grad_norm": 4.209219289691103, "language_loss": 0.73393142, "learning_rate": 9.368357505553049e-08, "loss": 0.75530398, "num_input_tokens_seen": 324600665, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.66796875, "step": 15054, "time_per_iteration": 2.5319907665252686 }, { "auxiliary_loss_clip": 0.01103008, "auxiliary_loss_mlp": 0.01028379, "balance_loss_clip": 1.01670218, "balance_loss_mlp": 1.03524232, "epoch": 0.9051555689162784, "flos": 25731611078400.0, "grad_norm": 1.6774674636524243, "language_loss": 0.83347446, "learning_rate": 9.356580987481333e-08, "loss": 0.8547883, "num_input_tokens_seen": 324618145, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 15055, "time_per_iteration": 2.5094988346099854 }, { "auxiliary_loss_clip": 0.01102387, "auxiliary_loss_mlp": 0.0103254, "balance_loss_clip": 1.0204041, "balance_loss_mlp": 1.03516674, "epoch": 0.9052156921689464, "flos": 23257258796160.0, "grad_norm": 1.7565594871254635, "language_loss": 0.85199904, "learning_rate": 9.344811698627176e-08, "loss": 0.87334824, "num_input_tokens_seen": 324638165, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.671875, "step": 15056, "time_per_iteration": 2.4751663208007812 }, { "auxiliary_loss_clip": 0.01102838, "auxiliary_loss_mlp": 0.01029166, "balance_loss_clip": 1.01754951, "balance_loss_mlp": 1.03535056, "epoch": 0.9052758154216143, "flos": 29564285097600.0, "grad_norm": 1.9952871475656155, "language_loss": 0.72159135, "learning_rate": 9.333049639436863e-08, "loss": 0.74291134, "num_input_tokens_seen": 324658560, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 15057, "time_per_iteration": 2.51012921333313 }, { "auxiliary_loss_clip": 0.01101411, "auxiliary_loss_mlp": 0.01026746, "balance_loss_clip": 1.01517725, "balance_loss_mlp": 1.03433681, "epoch": 0.9053359386742823, "flos": 22127688823680.0, "grad_norm": 2.3389980464359112, "language_loss": 0.81110251, "learning_rate": 9.321294810356418e-08, "loss": 0.83238411, "num_input_tokens_seen": 324679185, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 15058, "time_per_iteration": 2.4705655574798584 }, { "auxiliary_loss_clip": 0.01028852, "auxiliary_loss_mlp": 0.01001486, "balance_loss_clip": 1.00047231, "balance_loss_mlp": 1.00637269, "epoch": 0.9053960619269502, "flos": 67090112760960.0, "grad_norm": 0.7402709342070719, "language_loss": 0.51390213, "learning_rate": 9.309547211831592e-08, "loss": 0.53420544, "num_input_tokens_seen": 324744830, "router_z_loss_clip": 0.01013184, "router_z_loss_mlp": 0.22460938, "step": 15059, "time_per_iteration": 3.185554027557373 }, { "auxiliary_loss_clip": 0.01101835, "auxiliary_loss_mlp": 0.01027703, "balance_loss_clip": 1.01554346, "balance_loss_mlp": 1.03358912, "epoch": 0.9054561851796182, "flos": 15815419136640.0, "grad_norm": 1.9663930656464679, "language_loss": 0.67395675, "learning_rate": 9.297806844307831e-08, "loss": 0.69525218, "num_input_tokens_seen": 324762905, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.68359375, "step": 15060, "time_per_iteration": 2.4373128414154053 }, { "auxiliary_loss_clip": 0.01106283, "auxiliary_loss_mlp": 0.01029434, "balance_loss_clip": 1.01760292, "balance_loss_mlp": 1.03655696, "epoch": 0.9055163084322861, "flos": 17566997950080.0, "grad_norm": 2.322446991157398, "language_loss": 0.64615476, "learning_rate": 9.286073708230357e-08, "loss": 0.66751194, "num_input_tokens_seen": 324781905, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 15061, "time_per_iteration": 2.427799940109253 }, { "auxiliary_loss_clip": 0.01106221, "auxiliary_loss_mlp": 0.01033802, "balance_loss_clip": 1.0218569, "balance_loss_mlp": 1.03741539, "epoch": 0.9055764316849542, "flos": 17639573379840.0, "grad_norm": 3.059872669071861, "language_loss": 0.71363807, "learning_rate": 9.274347804044058e-08, "loss": 0.73503828, "num_input_tokens_seen": 324799260, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 15062, "time_per_iteration": 2.4533097743988037 }, { "auxiliary_loss_clip": 0.01100256, "auxiliary_loss_mlp": 0.01030597, "balance_loss_clip": 1.01902747, "balance_loss_mlp": 1.03281987, "epoch": 0.9056365549376221, "flos": 20120856986880.0, "grad_norm": 1.6706141204006555, "language_loss": 0.70822173, "learning_rate": 9.2626291321936e-08, "loss": 0.72953027, "num_input_tokens_seen": 324817800, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.67578125, "step": 15063, "time_per_iteration": 2.4368371963500977 }, { "auxiliary_loss_clip": 0.01099314, "auxiliary_loss_mlp": 0.01030599, "balance_loss_clip": 1.01889825, "balance_loss_mlp": 1.03322792, "epoch": 0.9056966781902901, "flos": 27598786836480.0, "grad_norm": 1.7305376164282131, "language_loss": 0.72166258, "learning_rate": 9.250917693123406e-08, "loss": 0.7429617, "num_input_tokens_seen": 324838445, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.66015625, "step": 15064, "time_per_iteration": 2.5133094787597656 }, { "auxiliary_loss_clip": 0.01103345, "auxiliary_loss_mlp": 0.01030532, "balance_loss_clip": 1.01846206, "balance_loss_mlp": 1.03307629, "epoch": 0.9057568014429581, "flos": 25920106675200.0, "grad_norm": 2.8693453318847837, "language_loss": 0.69800705, "learning_rate": 9.23921348727752e-08, "loss": 0.71934581, "num_input_tokens_seen": 324859895, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.703125, "step": 15065, "time_per_iteration": 2.4946391582489014 }, { "auxiliary_loss_clip": 0.01104838, "auxiliary_loss_mlp": 0.01035759, "balance_loss_clip": 1.02396917, "balance_loss_mlp": 1.03631198, "epoch": 0.905816924695626, "flos": 22930364096640.0, "grad_norm": 1.6490891450348129, "language_loss": 0.63263613, "learning_rate": 9.227516515099743e-08, "loss": 0.65404212, "num_input_tokens_seen": 324879580, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 15066, "time_per_iteration": 2.5083415508270264 }, { "auxiliary_loss_clip": 0.01106121, "auxiliary_loss_mlp": 0.01027763, "balance_loss_clip": 1.01432776, "balance_loss_mlp": 1.03348947, "epoch": 0.905877047948294, "flos": 22157422306560.0, "grad_norm": 1.8340003819283488, "language_loss": 0.79691172, "learning_rate": 9.215826777033675e-08, "loss": 0.81825054, "num_input_tokens_seen": 324898950, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 15067, "time_per_iteration": 2.603487730026245 }, { "auxiliary_loss_clip": 0.01106211, "auxiliary_loss_mlp": 0.01032922, "balance_loss_clip": 1.01996446, "balance_loss_mlp": 1.0360539, "epoch": 0.905937171200962, "flos": 15304805349120.0, "grad_norm": 2.9966960143755466, "language_loss": 0.69932616, "learning_rate": 9.204144273522563e-08, "loss": 0.72071743, "num_input_tokens_seen": 324917455, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 15068, "time_per_iteration": 2.4966013431549072 }, { "auxiliary_loss_clip": 0.01099841, "auxiliary_loss_mlp": 0.01028189, "balance_loss_clip": 1.01664972, "balance_loss_mlp": 1.03352916, "epoch": 0.90599729445363, "flos": 19462973437440.0, "grad_norm": 2.6265216115783794, "language_loss": 0.85595101, "learning_rate": 9.19246900500943e-08, "loss": 0.87723136, "num_input_tokens_seen": 324934495, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 15069, "time_per_iteration": 2.4316933155059814 }, { "auxiliary_loss_clip": 0.01107384, "auxiliary_loss_mlp": 0.01029315, "balance_loss_clip": 1.01669049, "balance_loss_mlp": 1.03495693, "epoch": 0.9060574177062979, "flos": 23732967542400.0, "grad_norm": 2.033567492462709, "language_loss": 0.59626555, "learning_rate": 9.180800971936987e-08, "loss": 0.61763257, "num_input_tokens_seen": 324953230, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7265625, "step": 15070, "time_per_iteration": 2.4809181690216064 }, { "auxiliary_loss_clip": 0.0110607, "auxiliary_loss_mlp": 0.01025581, "balance_loss_clip": 1.01274824, "balance_loss_mlp": 1.03504133, "epoch": 0.9061175409589659, "flos": 17311134395520.0, "grad_norm": 4.368442336969501, "language_loss": 0.81742668, "learning_rate": 9.169140174747724e-08, "loss": 0.83874321, "num_input_tokens_seen": 324969880, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 15071, "time_per_iteration": 2.4159817695617676 }, { "auxiliary_loss_clip": 0.01107079, "auxiliary_loss_mlp": 0.01035304, "balance_loss_clip": 1.02206552, "balance_loss_mlp": 1.03526616, "epoch": 0.9061776642116338, "flos": 17778439359360.0, "grad_norm": 2.0974174105758254, "language_loss": 0.61957216, "learning_rate": 9.157486613883758e-08, "loss": 0.64099598, "num_input_tokens_seen": 324987005, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 15072, "time_per_iteration": 2.4316365718841553 }, { "auxiliary_loss_clip": 0.01101585, "auxiliary_loss_mlp": 0.01031488, "balance_loss_clip": 1.01972806, "balance_loss_mlp": 1.03341508, "epoch": 0.9062377874643018, "flos": 42777688037760.0, "grad_norm": 1.9701979849226432, "language_loss": 0.72984743, "learning_rate": 9.145840289787021e-08, "loss": 0.75117815, "num_input_tokens_seen": 325010700, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6796875, "step": 15073, "time_per_iteration": 2.645292282104492 }, { "auxiliary_loss_clip": 0.01101817, "auxiliary_loss_mlp": 0.01025547, "balance_loss_clip": 1.01442456, "balance_loss_mlp": 1.03491926, "epoch": 0.9062979107169697, "flos": 16361620323840.0, "grad_norm": 1.8248786932406018, "language_loss": 0.81200039, "learning_rate": 9.134201202899161e-08, "loss": 0.83327401, "num_input_tokens_seen": 325028760, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.671875, "step": 15074, "time_per_iteration": 2.4420015811920166 }, { "auxiliary_loss_clip": 0.01029101, "auxiliary_loss_mlp": 0.01000713, "balance_loss_clip": 0.99972916, "balance_loss_mlp": 1.00681341, "epoch": 0.9063580339696378, "flos": 69313988528640.0, "grad_norm": 0.7511075207953364, "language_loss": 0.52345383, "learning_rate": 9.122569353661513e-08, "loss": 0.54375196, "num_input_tokens_seen": 325093545, "router_z_loss_clip": 0.00982666, "router_z_loss_mlp": 0.22265625, "step": 15075, "time_per_iteration": 3.1455225944519043 }, { "auxiliary_loss_clip": 0.01028743, "auxiliary_loss_mlp": 0.01001368, "balance_loss_clip": 1.00032485, "balance_loss_mlp": 1.00629532, "epoch": 0.9064181572223057, "flos": 58794747148800.0, "grad_norm": 0.725243667094879, "language_loss": 0.62079918, "learning_rate": 9.11094474251517e-08, "loss": 0.64110029, "num_input_tokens_seen": 325152295, "router_z_loss_clip": 0.01043701, "router_z_loss_mlp": 0.22460938, "step": 15076, "time_per_iteration": 3.0004663467407227 }, { "auxiliary_loss_clip": 0.01102078, "auxiliary_loss_mlp": 0.01034044, "balance_loss_clip": 1.0226593, "balance_loss_mlp": 1.03434038, "epoch": 0.9064782804749737, "flos": 21762692772480.0, "grad_norm": 1.954553892719111, "language_loss": 0.82405251, "learning_rate": 9.09932736990091e-08, "loss": 0.84541374, "num_input_tokens_seen": 325169705, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6796875, "step": 15077, "time_per_iteration": 2.444687604904175 }, { "auxiliary_loss_clip": 0.0110002, "auxiliary_loss_mlp": 0.01021531, "balance_loss_clip": 1.01051581, "balance_loss_mlp": 1.03328347, "epoch": 0.9065384037276417, "flos": 21397373498880.0, "grad_norm": 1.5434332252457004, "language_loss": 0.84103274, "learning_rate": 9.08771723625934e-08, "loss": 0.8622483, "num_input_tokens_seen": 325189175, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.66796875, "step": 15078, "time_per_iteration": 2.482229471206665 }, { "auxiliary_loss_clip": 0.0110125, "auxiliary_loss_mlp": 0.01033922, "balance_loss_clip": 1.02168477, "balance_loss_mlp": 1.03606248, "epoch": 0.9065985269803096, "flos": 38283646849920.0, "grad_norm": 1.8659504348647769, "language_loss": 0.65253103, "learning_rate": 9.076114342030617e-08, "loss": 0.67388272, "num_input_tokens_seen": 325211020, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.65234375, "step": 15079, "time_per_iteration": 2.6021461486816406 }, { "auxiliary_loss_clip": 0.01102183, "auxiliary_loss_mlp": 0.01030088, "balance_loss_clip": 1.01805389, "balance_loss_mlp": 1.03398967, "epoch": 0.9066586502329776, "flos": 44818562989440.0, "grad_norm": 12.533931696891255, "language_loss": 0.71151525, "learning_rate": 9.064518687654765e-08, "loss": 0.73283798, "num_input_tokens_seen": 325236970, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 15080, "time_per_iteration": 4.261369228363037 }, { "auxiliary_loss_clip": 0.01106084, "auxiliary_loss_mlp": 0.01031494, "balance_loss_clip": 1.01905489, "balance_loss_mlp": 1.03585076, "epoch": 0.9067187734856456, "flos": 18623992492800.0, "grad_norm": 2.2609193223653983, "language_loss": 0.71220839, "learning_rate": 9.052930273571547e-08, "loss": 0.73358417, "num_input_tokens_seen": 325252670, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 15081, "time_per_iteration": 2.4449543952941895 }, { "auxiliary_loss_clip": 0.0110276, "auxiliary_loss_mlp": 0.01031582, "balance_loss_clip": 1.0195837, "balance_loss_mlp": 1.03569484, "epoch": 0.9067788967383136, "flos": 22747578762240.0, "grad_norm": 1.8256956706789007, "language_loss": 0.74166393, "learning_rate": 9.04134910022032e-08, "loss": 0.7630074, "num_input_tokens_seen": 325273860, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 15082, "time_per_iteration": 2.4674713611602783 }, { "auxiliary_loss_clip": 0.01102058, "auxiliary_loss_mlp": 0.01034463, "balance_loss_clip": 1.02241731, "balance_loss_mlp": 1.0351398, "epoch": 0.9068390199909815, "flos": 27670787648640.0, "grad_norm": 3.0200505375864237, "language_loss": 0.78196669, "learning_rate": 9.029775168040266e-08, "loss": 0.80333185, "num_input_tokens_seen": 325294140, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.66796875, "step": 15083, "time_per_iteration": 2.5144662857055664 }, { "auxiliary_loss_clip": 0.01101502, "auxiliary_loss_mlp": 0.01033298, "balance_loss_clip": 1.02205098, "balance_loss_mlp": 1.03594673, "epoch": 0.9068991432436495, "flos": 24244012293120.0, "grad_norm": 1.6125092854934489, "language_loss": 0.69278526, "learning_rate": 9.01820847747028e-08, "loss": 0.71413326, "num_input_tokens_seen": 325313130, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.65625, "step": 15084, "time_per_iteration": 3.8514883518218994 }, { "auxiliary_loss_clip": 0.01104464, "auxiliary_loss_mlp": 0.01029809, "balance_loss_clip": 1.01822758, "balance_loss_mlp": 1.03647792, "epoch": 0.9069592664963174, "flos": 28033305661440.0, "grad_norm": 3.512001097485327, "language_loss": 0.66942269, "learning_rate": 9.006649028948965e-08, "loss": 0.6907655, "num_input_tokens_seen": 325334880, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6796875, "step": 15085, "time_per_iteration": 2.4979841709136963 }, { "auxiliary_loss_clip": 0.01029221, "auxiliary_loss_mlp": 0.0100255, "balance_loss_clip": 1.00148952, "balance_loss_mlp": 1.0067122, "epoch": 0.9070193897489854, "flos": 68778414789120.0, "grad_norm": 0.7763298095189469, "language_loss": 0.61296678, "learning_rate": 8.995096822914638e-08, "loss": 0.63328457, "num_input_tokens_seen": 325394175, "router_z_loss_clip": 0.01062012, "router_z_loss_mlp": 0.22460938, "step": 15086, "time_per_iteration": 4.469374895095825 }, { "auxiliary_loss_clip": 0.01103264, "auxiliary_loss_mlp": 0.01037443, "balance_loss_clip": 1.02494407, "balance_loss_mlp": 1.03538167, "epoch": 0.9070795130016533, "flos": 23441624328960.0, "grad_norm": 1.500076790117068, "language_loss": 0.72121263, "learning_rate": 8.983551859805416e-08, "loss": 0.74261969, "num_input_tokens_seen": 325415020, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 15087, "time_per_iteration": 3.8751044273376465 }, { "auxiliary_loss_clip": 0.01103506, "auxiliary_loss_mlp": 0.01028562, "balance_loss_clip": 1.01707566, "balance_loss_mlp": 1.03575349, "epoch": 0.9071396362543214, "flos": 18916413114240.0, "grad_norm": 2.1039427389086494, "language_loss": 0.76888072, "learning_rate": 8.972014140059058e-08, "loss": 0.79020143, "num_input_tokens_seen": 325433595, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.67578125, "step": 15088, "time_per_iteration": 2.4234282970428467 }, { "auxiliary_loss_clip": 0.01099697, "auxiliary_loss_mlp": 0.01029695, "balance_loss_clip": 1.01863265, "balance_loss_mlp": 1.03442216, "epoch": 0.9071997595069893, "flos": 25228646887680.0, "grad_norm": 2.0620971599696296, "language_loss": 0.73376346, "learning_rate": 8.960483664113038e-08, "loss": 0.75505733, "num_input_tokens_seen": 325451605, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.65234375, "step": 15089, "time_per_iteration": 2.4918899536132812 }, { "auxiliary_loss_clip": 0.01099236, "auxiliary_loss_mlp": 0.01027732, "balance_loss_clip": 1.0165261, "balance_loss_mlp": 1.03481853, "epoch": 0.9072598827596573, "flos": 24346608514560.0, "grad_norm": 1.859076689549944, "language_loss": 0.75642967, "learning_rate": 8.948960432404628e-08, "loss": 0.77769935, "num_input_tokens_seen": 325470645, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.64453125, "step": 15090, "time_per_iteration": 2.4703214168548584 }, { "auxiliary_loss_clip": 0.01105565, "auxiliary_loss_mlp": 0.01026573, "balance_loss_clip": 1.01359081, "balance_loss_mlp": 1.03528357, "epoch": 0.9073200060123253, "flos": 22674967418880.0, "grad_norm": 2.713073586984337, "language_loss": 0.77630526, "learning_rate": 8.93744444537079e-08, "loss": 0.79762661, "num_input_tokens_seen": 325488070, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 15091, "time_per_iteration": 2.4708478450775146 }, { "auxiliary_loss_clip": 0.01096201, "auxiliary_loss_mlp": 0.01025495, "balance_loss_clip": 1.01459312, "balance_loss_mlp": 1.03270292, "epoch": 0.9073801292649932, "flos": 23695476721920.0, "grad_norm": 1.7894312476262495, "language_loss": 0.8606624, "learning_rate": 8.925935703448217e-08, "loss": 0.88187933, "num_input_tokens_seen": 325509285, "router_z_loss_clip": 0.10888672, "router_z_loss_mlp": 0.6328125, "step": 15092, "time_per_iteration": 2.4751193523406982 }, { "auxiliary_loss_clip": 0.01103089, "auxiliary_loss_mlp": 0.01031752, "balance_loss_clip": 1.0200814, "balance_loss_mlp": 1.03608704, "epoch": 0.9074402525176612, "flos": 25375413859200.0, "grad_norm": 1.7968164721342559, "language_loss": 0.78850681, "learning_rate": 8.914434207073296e-08, "loss": 0.80985516, "num_input_tokens_seen": 325529360, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.66796875, "step": 15093, "time_per_iteration": 2.5007877349853516 }, { "auxiliary_loss_clip": 0.01029041, "auxiliary_loss_mlp": 0.00999736, "balance_loss_clip": 0.99866879, "balance_loss_mlp": 1.00662565, "epoch": 0.9075003757703292, "flos": 67649024384640.0, "grad_norm": 0.7385604558643557, "language_loss": 0.57023013, "learning_rate": 8.902939956682188e-08, "loss": 0.59051788, "num_input_tokens_seen": 325583565, "router_z_loss_clip": 0.01068115, "router_z_loss_mlp": 0.22460938, "step": 15094, "time_per_iteration": 3.016348361968994 }, { "auxiliary_loss_clip": 0.01105714, "auxiliary_loss_mlp": 0.01032246, "balance_loss_clip": 1.01941943, "balance_loss_mlp": 1.03561401, "epoch": 0.9075604990229972, "flos": 22453649769600.0, "grad_norm": 1.8882168589709842, "language_loss": 0.71380913, "learning_rate": 8.891452952710742e-08, "loss": 0.73518872, "num_input_tokens_seen": 325603690, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 15095, "time_per_iteration": 2.4579379558563232 }, { "auxiliary_loss_clip": 0.01103729, "auxiliary_loss_mlp": 0.01032837, "balance_loss_clip": 1.02110684, "balance_loss_mlp": 1.03507113, "epoch": 0.9076206222756651, "flos": 19536662188800.0, "grad_norm": 1.6215890387430025, "language_loss": 0.74182826, "learning_rate": 8.879973195594526e-08, "loss": 0.76319391, "num_input_tokens_seen": 325622255, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 15096, "time_per_iteration": 2.412912607192993 }, { "auxiliary_loss_clip": 0.01104466, "auxiliary_loss_mlp": 0.01036819, "balance_loss_clip": 1.02338445, "balance_loss_mlp": 1.03508103, "epoch": 0.9076807455283331, "flos": 30116914819200.0, "grad_norm": 6.086394470305266, "language_loss": 0.56988019, "learning_rate": 8.868500685768898e-08, "loss": 0.59129304, "num_input_tokens_seen": 325640165, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6953125, "step": 15097, "time_per_iteration": 2.5354819297790527 }, { "auxiliary_loss_clip": 0.01099257, "auxiliary_loss_mlp": 0.01024354, "balance_loss_clip": 1.01301074, "balance_loss_mlp": 1.0320121, "epoch": 0.907740868781001, "flos": 18697537589760.0, "grad_norm": 1.742174880701817, "language_loss": 0.7975167, "learning_rate": 8.857035423668935e-08, "loss": 0.81875288, "num_input_tokens_seen": 325659455, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 15098, "time_per_iteration": 2.4283227920532227 }, { "auxiliary_loss_clip": 0.01106335, "auxiliary_loss_mlp": 0.01028263, "balance_loss_clip": 1.01574636, "balance_loss_mlp": 1.03557146, "epoch": 0.907800992033669, "flos": 22638805401600.0, "grad_norm": 2.260509794358381, "language_loss": 0.66165733, "learning_rate": 8.845577409729266e-08, "loss": 0.68300331, "num_input_tokens_seen": 325678095, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 15099, "time_per_iteration": 2.5035669803619385 }, { "auxiliary_loss_clip": 0.01106657, "auxiliary_loss_mlp": 0.01033428, "balance_loss_clip": 1.02107751, "balance_loss_mlp": 1.03651261, "epoch": 0.907861115286337, "flos": 21287666384640.0, "grad_norm": 2.1106671374546258, "language_loss": 0.70163679, "learning_rate": 8.834126644384477e-08, "loss": 0.7230376, "num_input_tokens_seen": 325695825, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69921875, "step": 15100, "time_per_iteration": 2.4431161880493164 }, { "auxiliary_loss_clip": 0.01028844, "auxiliary_loss_mlp": 0.01001392, "balance_loss_clip": 1.00028372, "balance_loss_mlp": 1.00644517, "epoch": 0.907921238539005, "flos": 69739493040000.0, "grad_norm": 0.6386810425081265, "language_loss": 0.53372914, "learning_rate": 8.822683128068775e-08, "loss": 0.55403149, "num_input_tokens_seen": 325764515, "router_z_loss_clip": 0.0111084, "router_z_loss_mlp": 0.22460938, "step": 15101, "time_per_iteration": 3.155816078186035 }, { "auxiliary_loss_clip": 0.01104315, "auxiliary_loss_mlp": 0.01027603, "balance_loss_clip": 1.01559305, "balance_loss_mlp": 1.03626812, "epoch": 0.9079813617916729, "flos": 23477391296640.0, "grad_norm": 1.7180328617781062, "language_loss": 0.6795758, "learning_rate": 8.811246861216081e-08, "loss": 0.70089501, "num_input_tokens_seen": 325783235, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 15102, "time_per_iteration": 2.4687724113464355 }, { "auxiliary_loss_clip": 0.01104065, "auxiliary_loss_mlp": 0.01028757, "balance_loss_clip": 1.01674056, "balance_loss_mlp": 1.03633261, "epoch": 0.9080414850443409, "flos": 22929933133440.0, "grad_norm": 2.8214437608625427, "language_loss": 0.78998613, "learning_rate": 8.799817844260049e-08, "loss": 0.8113144, "num_input_tokens_seen": 325800195, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.67578125, "step": 15103, "time_per_iteration": 2.4669415950775146 }, { "auxiliary_loss_clip": 0.0110457, "auxiliary_loss_mlp": 0.0103075, "balance_loss_clip": 1.01832867, "balance_loss_mlp": 1.03471172, "epoch": 0.9081016082970089, "flos": 26177083551360.0, "grad_norm": 1.9846948329038956, "language_loss": 0.71300054, "learning_rate": 8.78839607763413e-08, "loss": 0.73435378, "num_input_tokens_seen": 325820215, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 15104, "time_per_iteration": 2.541090488433838 }, { "auxiliary_loss_clip": 0.01102074, "auxiliary_loss_mlp": 0.01023743, "balance_loss_clip": 1.0128293, "balance_loss_mlp": 1.03442025, "epoch": 0.9081617315496768, "flos": 24462169545600.0, "grad_norm": 2.447666070559784, "language_loss": 0.77351719, "learning_rate": 8.77698156177138e-08, "loss": 0.79477537, "num_input_tokens_seen": 325838415, "router_z_loss_clip": 0.10888672, "router_z_loss_mlp": 0.67578125, "step": 15105, "time_per_iteration": 2.4950389862060547 }, { "auxiliary_loss_clip": 0.01102449, "auxiliary_loss_mlp": 0.01033352, "balance_loss_clip": 1.02102542, "balance_loss_mlp": 1.03370631, "epoch": 0.9082218548023449, "flos": 24746868743040.0, "grad_norm": 2.0219613603034587, "language_loss": 0.73940414, "learning_rate": 8.765574297104628e-08, "loss": 0.76076216, "num_input_tokens_seen": 325855580, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 15106, "time_per_iteration": 2.4750237464904785 }, { "auxiliary_loss_clip": 0.01103953, "auxiliary_loss_mlp": 0.010353, "balance_loss_clip": 1.02314091, "balance_loss_mlp": 1.03465176, "epoch": 0.9082819780550128, "flos": 24421302846720.0, "grad_norm": 1.8023637117490814, "language_loss": 0.80500907, "learning_rate": 8.754174284066462e-08, "loss": 0.82640159, "num_input_tokens_seen": 325874890, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6953125, "step": 15107, "time_per_iteration": 2.47629976272583 }, { "auxiliary_loss_clip": 0.01028852, "auxiliary_loss_mlp": 0.01000789, "balance_loss_clip": 0.99971026, "balance_loss_mlp": 1.00655735, "epoch": 0.9083421013076808, "flos": 59609704872960.0, "grad_norm": 0.8175410051982683, "language_loss": 0.59714621, "learning_rate": 8.742781523089205e-08, "loss": 0.61744261, "num_input_tokens_seen": 325935835, "router_z_loss_clip": 0.01080322, "router_z_loss_mlp": 0.22265625, "step": 15108, "time_per_iteration": 3.075493574142456 }, { "auxiliary_loss_clip": 0.01103639, "auxiliary_loss_mlp": 0.01024755, "balance_loss_clip": 1.01260722, "balance_loss_mlp": 1.03461337, "epoch": 0.9084022245603487, "flos": 33620216100480.0, "grad_norm": 3.906760818686636, "language_loss": 0.73539746, "learning_rate": 8.73139601460482e-08, "loss": 0.75668138, "num_input_tokens_seen": 325958035, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69140625, "step": 15109, "time_per_iteration": 2.5490002632141113 }, { "auxiliary_loss_clip": 0.01099822, "auxiliary_loss_mlp": 0.01028762, "balance_loss_clip": 1.01728201, "balance_loss_mlp": 1.03288698, "epoch": 0.9084623478130167, "flos": 24971705925120.0, "grad_norm": 1.8397358767364973, "language_loss": 0.71489263, "learning_rate": 8.720017759045073e-08, "loss": 0.73617846, "num_input_tokens_seen": 325979870, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 15110, "time_per_iteration": 2.4984130859375 }, { "auxiliary_loss_clip": 0.01101767, "auxiliary_loss_mlp": 0.01029097, "balance_loss_clip": 1.01730776, "balance_loss_mlp": 1.03472638, "epoch": 0.9085224710656846, "flos": 31461804869760.0, "grad_norm": 2.252174282655335, "language_loss": 0.68998182, "learning_rate": 8.708646756841421e-08, "loss": 0.71129048, "num_input_tokens_seen": 325998245, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 15111, "time_per_iteration": 2.5193729400634766 }, { "auxiliary_loss_clip": 0.01028675, "auxiliary_loss_mlp": 0.01000501, "balance_loss_clip": 0.99949342, "balance_loss_mlp": 1.00621045, "epoch": 0.9085825943183526, "flos": 64917012867840.0, "grad_norm": 1.3473539739001126, "language_loss": 0.51746261, "learning_rate": 8.697283008425026e-08, "loss": 0.5377543, "num_input_tokens_seen": 326061770, "router_z_loss_clip": 0.0100708, "router_z_loss_mlp": 0.22460938, "step": 15112, "time_per_iteration": 3.147771120071411 }, { "auxiliary_loss_clip": 0.01102522, "auxiliary_loss_mlp": 0.0103105, "balance_loss_clip": 1.01915896, "balance_loss_mlp": 1.03376222, "epoch": 0.9086427175710206, "flos": 18953221576320.0, "grad_norm": 1.8343830762645335, "language_loss": 0.69452274, "learning_rate": 8.685926514226837e-08, "loss": 0.71585846, "num_input_tokens_seen": 326080945, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 15113, "time_per_iteration": 2.4362497329711914 }, { "auxiliary_loss_clip": 0.01104176, "auxiliary_loss_mlp": 0.0102913, "balance_loss_clip": 1.01723313, "balance_loss_mlp": 1.03530765, "epoch": 0.9087028408236886, "flos": 34014873807360.0, "grad_norm": 2.1021639507634866, "language_loss": 0.79493266, "learning_rate": 8.674577274677508e-08, "loss": 0.8162657, "num_input_tokens_seen": 326100630, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 15114, "time_per_iteration": 2.5813708305358887 }, { "auxiliary_loss_clip": 0.01107499, "auxiliary_loss_mlp": 0.0103159, "balance_loss_clip": 1.01854229, "balance_loss_mlp": 1.03607821, "epoch": 0.9087629640763565, "flos": 21944580266880.0, "grad_norm": 4.944652190018464, "language_loss": 0.70505202, "learning_rate": 8.663235290207405e-08, "loss": 0.72644293, "num_input_tokens_seen": 326120145, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71484375, "step": 15115, "time_per_iteration": 2.4396681785583496 }, { "auxiliary_loss_clip": 0.01110924, "auxiliary_loss_mlp": 0.01032085, "balance_loss_clip": 1.01912081, "balance_loss_mlp": 1.03889966, "epoch": 0.9088230873290245, "flos": 21762908254080.0, "grad_norm": 2.0446287303540824, "language_loss": 0.6599524, "learning_rate": 8.651900561246561e-08, "loss": 0.68138254, "num_input_tokens_seen": 326140715, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 15116, "time_per_iteration": 2.4690630435943604 }, { "auxiliary_loss_clip": 0.01101708, "auxiliary_loss_mlp": 0.01032484, "balance_loss_clip": 1.02007461, "balance_loss_mlp": 1.03542364, "epoch": 0.9088832105816925, "flos": 21541267382400.0, "grad_norm": 2.078128860356159, "language_loss": 0.69710702, "learning_rate": 8.640573088224812e-08, "loss": 0.71844894, "num_input_tokens_seen": 326159130, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6640625, "step": 15117, "time_per_iteration": 2.444836139678955 }, { "auxiliary_loss_clip": 0.01102017, "auxiliary_loss_mlp": 0.01025641, "balance_loss_clip": 1.01423883, "balance_loss_mlp": 1.03475296, "epoch": 0.9089433338343604, "flos": 25996704428160.0, "grad_norm": 1.4989172739830923, "language_loss": 0.74454564, "learning_rate": 8.629252871571745e-08, "loss": 0.76582217, "num_input_tokens_seen": 326181375, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.671875, "step": 15118, "time_per_iteration": 2.5162322521209717 }, { "auxiliary_loss_clip": 0.0110752, "auxiliary_loss_mlp": 0.0103478, "balance_loss_clip": 1.02122593, "balance_loss_mlp": 1.03448117, "epoch": 0.9090034570870285, "flos": 21178426147200.0, "grad_norm": 2.432695121647472, "language_loss": 0.73120761, "learning_rate": 8.617939911716554e-08, "loss": 0.75263059, "num_input_tokens_seen": 326199740, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.73046875, "step": 15119, "time_per_iteration": 2.4423229694366455 }, { "auxiliary_loss_clip": 0.01109354, "auxiliary_loss_mlp": 0.01030263, "balance_loss_clip": 1.01692355, "balance_loss_mlp": 1.03742743, "epoch": 0.9090635803396964, "flos": 16141811045760.0, "grad_norm": 2.3813149787933154, "language_loss": 0.7042619, "learning_rate": 8.60663420908827e-08, "loss": 0.72565806, "num_input_tokens_seen": 326214350, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 15120, "time_per_iteration": 2.428581714630127 }, { "auxiliary_loss_clip": 0.01104823, "auxiliary_loss_mlp": 0.01028142, "balance_loss_clip": 1.01597703, "balance_loss_mlp": 1.0351665, "epoch": 0.9091237035923644, "flos": 20591537829120.0, "grad_norm": 3.4896703750886573, "language_loss": 0.66468024, "learning_rate": 8.595335764115596e-08, "loss": 0.68600994, "num_input_tokens_seen": 326234580, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 15121, "time_per_iteration": 2.4706573486328125 }, { "auxiliary_loss_clip": 0.01103002, "auxiliary_loss_mlp": 0.01038336, "balance_loss_clip": 1.02617121, "balance_loss_mlp": 1.03489971, "epoch": 0.9091838268450323, "flos": 52227760164480.0, "grad_norm": 1.7353375422880601, "language_loss": 0.70072961, "learning_rate": 8.58404457722699e-08, "loss": 0.72214305, "num_input_tokens_seen": 326259080, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6796875, "step": 15122, "time_per_iteration": 4.164681911468506 }, { "auxiliary_loss_clip": 0.01101247, "auxiliary_loss_mlp": 0.01030572, "balance_loss_clip": 1.01872301, "balance_loss_mlp": 1.03399062, "epoch": 0.9092439500977003, "flos": 20559613616640.0, "grad_norm": 1.4065702939460152, "language_loss": 0.74730563, "learning_rate": 8.572760648850575e-08, "loss": 0.76862383, "num_input_tokens_seen": 326280175, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 15123, "time_per_iteration": 2.4772701263427734 }, { "auxiliary_loss_clip": 0.01100285, "auxiliary_loss_mlp": 0.01028667, "balance_loss_clip": 1.01709807, "balance_loss_mlp": 1.03439248, "epoch": 0.9093040733503682, "flos": 28617859595520.0, "grad_norm": 2.317979122581173, "language_loss": 0.75959772, "learning_rate": 8.561483979414253e-08, "loss": 0.78088725, "num_input_tokens_seen": 326297990, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66015625, "step": 15124, "time_per_iteration": 2.5030508041381836 }, { "auxiliary_loss_clip": 0.01103573, "auxiliary_loss_mlp": 0.01030564, "balance_loss_clip": 1.01820755, "balance_loss_mlp": 1.03587389, "epoch": 0.9093641966030362, "flos": 23440187784960.0, "grad_norm": 2.175342842201619, "language_loss": 0.72433358, "learning_rate": 8.55021456934566e-08, "loss": 0.74567491, "num_input_tokens_seen": 326316735, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.67578125, "step": 15125, "time_per_iteration": 3.873112916946411 }, { "auxiliary_loss_clip": 0.01102974, "auxiliary_loss_mlp": 0.01035523, "balance_loss_clip": 1.02385879, "balance_loss_mlp": 1.03647232, "epoch": 0.9094243198557042, "flos": 16800197385600.0, "grad_norm": 1.7082818863994091, "language_loss": 0.79028267, "learning_rate": 8.538952419072143e-08, "loss": 0.81166768, "num_input_tokens_seen": 326334370, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6640625, "step": 15126, "time_per_iteration": 2.432356357574463 }, { "auxiliary_loss_clip": 0.01101318, "auxiliary_loss_mlp": 0.01032872, "balance_loss_clip": 1.02095723, "balance_loss_mlp": 1.03462029, "epoch": 0.9094844431083722, "flos": 24273278899200.0, "grad_norm": 5.367868368751144, "language_loss": 0.75441462, "learning_rate": 8.527697529020694e-08, "loss": 0.77575648, "num_input_tokens_seen": 326353435, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6640625, "step": 15127, "time_per_iteration": 2.4863901138305664 }, { "auxiliary_loss_clip": 0.01102209, "auxiliary_loss_mlp": 0.01031085, "balance_loss_clip": 1.01894355, "balance_loss_mlp": 1.03315282, "epoch": 0.9095445663610401, "flos": 21944652094080.0, "grad_norm": 1.835515418893183, "language_loss": 0.62814027, "learning_rate": 8.516449899618173e-08, "loss": 0.64947319, "num_input_tokens_seen": 326371810, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 15128, "time_per_iteration": 3.8180692195892334 }, { "auxiliary_loss_clip": 0.01100696, "auxiliary_loss_mlp": 0.01026016, "balance_loss_clip": 1.01421416, "balance_loss_mlp": 1.03354311, "epoch": 0.9096046896137081, "flos": 19792848965760.0, "grad_norm": 2.278453462682387, "language_loss": 0.77195984, "learning_rate": 8.505209531291013e-08, "loss": 0.79322696, "num_input_tokens_seen": 326391380, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 15129, "time_per_iteration": 3.8804078102111816 }, { "auxiliary_loss_clip": 0.01105967, "auxiliary_loss_mlp": 0.01026721, "balance_loss_clip": 1.01488996, "balance_loss_mlp": 1.03652585, "epoch": 0.909664812866376, "flos": 22638087129600.0, "grad_norm": 1.8201909515741412, "language_loss": 0.83400959, "learning_rate": 8.49397642446552e-08, "loss": 0.85533643, "num_input_tokens_seen": 326408800, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 15130, "time_per_iteration": 2.479175090789795 }, { "auxiliary_loss_clip": 0.01106476, "auxiliary_loss_mlp": 0.01032529, "balance_loss_clip": 1.01967263, "balance_loss_mlp": 1.03670967, "epoch": 0.909724936119044, "flos": 39852153020160.0, "grad_norm": 2.766449884796397, "language_loss": 0.75431055, "learning_rate": 8.482750579567644e-08, "loss": 0.77570063, "num_input_tokens_seen": 326431565, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 15131, "time_per_iteration": 2.596897840499878 }, { "auxiliary_loss_clip": 0.01105265, "auxiliary_loss_mlp": 0.01030116, "balance_loss_clip": 1.01786757, "balance_loss_mlp": 1.03659284, "epoch": 0.9097850593717121, "flos": 35071616954880.0, "grad_norm": 1.8573168948033862, "language_loss": 0.5954833, "learning_rate": 8.471531997023085e-08, "loss": 0.61683714, "num_input_tokens_seen": 326451715, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 15132, "time_per_iteration": 2.5909509658813477 }, { "auxiliary_loss_clip": 0.01104667, "auxiliary_loss_mlp": 0.01030418, "balance_loss_clip": 1.01890206, "balance_loss_mlp": 1.03640795, "epoch": 0.90984518262438, "flos": 23367468700800.0, "grad_norm": 1.4823180754527636, "language_loss": 0.82434928, "learning_rate": 8.460320677257193e-08, "loss": 0.84570014, "num_input_tokens_seen": 326470855, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.68359375, "step": 15133, "time_per_iteration": 2.477242946624756 }, { "auxiliary_loss_clip": 0.01103945, "auxiliary_loss_mlp": 0.0102926, "balance_loss_clip": 1.01708913, "balance_loss_mlp": 1.03423095, "epoch": 0.909905305877048, "flos": 27523302405120.0, "grad_norm": 2.155377209370412, "language_loss": 0.73700905, "learning_rate": 8.449116620695118e-08, "loss": 0.75834107, "num_input_tokens_seen": 326490480, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 15134, "time_per_iteration": 2.519826889038086 }, { "auxiliary_loss_clip": 0.01108024, "auxiliary_loss_mlp": 0.01031257, "balance_loss_clip": 1.01890123, "balance_loss_mlp": 1.03596795, "epoch": 0.9099654291297159, "flos": 24347865490560.0, "grad_norm": 1.5141291231315142, "language_loss": 0.72869289, "learning_rate": 8.437919827761786e-08, "loss": 0.75008565, "num_input_tokens_seen": 326509445, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.71875, "step": 15135, "time_per_iteration": 2.4733407497406006 }, { "auxiliary_loss_clip": 0.01102247, "auxiliary_loss_mlp": 0.01030221, "balance_loss_clip": 1.01850271, "balance_loss_mlp": 1.0359478, "epoch": 0.9100255523823839, "flos": 21215234609280.0, "grad_norm": 2.3134507000014066, "language_loss": 0.70100719, "learning_rate": 8.426730298881702e-08, "loss": 0.72233188, "num_input_tokens_seen": 326528380, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 15136, "time_per_iteration": 2.4791269302368164 }, { "auxiliary_loss_clip": 0.01028737, "auxiliary_loss_mlp": 0.01001263, "balance_loss_clip": 1.00021982, "balance_loss_mlp": 1.00644922, "epoch": 0.9100856756350518, "flos": 46052276446080.0, "grad_norm": 0.8172904873976614, "language_loss": 0.59227288, "learning_rate": 8.415548034479214e-08, "loss": 0.61257291, "num_input_tokens_seen": 326576940, "router_z_loss_clip": 0.01043701, "router_z_loss_mlp": 0.22265625, "step": 15137, "time_per_iteration": 2.842479944229126 }, { "auxiliary_loss_clip": 0.01103399, "auxiliary_loss_mlp": 0.01032909, "balance_loss_clip": 1.02161431, "balance_loss_mlp": 1.03486788, "epoch": 0.9101457988877198, "flos": 20229917656320.0, "grad_norm": 1.7429358846047145, "language_loss": 0.82223374, "learning_rate": 8.40437303497834e-08, "loss": 0.84359682, "num_input_tokens_seen": 326596100, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.68359375, "step": 15138, "time_per_iteration": 2.4749631881713867 }, { "auxiliary_loss_clip": 0.01099668, "auxiliary_loss_mlp": 0.01024442, "balance_loss_clip": 1.01357019, "balance_loss_mlp": 1.03514445, "epoch": 0.9102059221403878, "flos": 26615157822720.0, "grad_norm": 1.5835513144855267, "language_loss": 0.81028175, "learning_rate": 8.39320530080283e-08, "loss": 0.83152282, "num_input_tokens_seen": 326615700, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.64453125, "step": 15139, "time_per_iteration": 2.482414722442627 }, { "auxiliary_loss_clip": 0.01102923, "auxiliary_loss_mlp": 0.01031535, "balance_loss_clip": 1.02025771, "balance_loss_mlp": 1.03518271, "epoch": 0.9102660453930558, "flos": 21908561904000.0, "grad_norm": 1.7940397100942806, "language_loss": 0.77218264, "learning_rate": 8.382044832376167e-08, "loss": 0.79352725, "num_input_tokens_seen": 326635905, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6796875, "step": 15140, "time_per_iteration": 2.4751834869384766 }, { "auxiliary_loss_clip": 0.01102435, "auxiliary_loss_mlp": 0.01025653, "balance_loss_clip": 1.01406014, "balance_loss_mlp": 1.03440082, "epoch": 0.9103261686457237, "flos": 36176660916480.0, "grad_norm": 1.762922165957944, "language_loss": 0.66276777, "learning_rate": 8.370891630121569e-08, "loss": 0.68404865, "num_input_tokens_seen": 326661855, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 15141, "time_per_iteration": 2.5914032459259033 }, { "auxiliary_loss_clip": 0.01104276, "auxiliary_loss_mlp": 0.0103312, "balance_loss_clip": 1.02159226, "balance_loss_mlp": 1.03472018, "epoch": 0.9103862918983917, "flos": 23878549365120.0, "grad_norm": 1.7091196730936722, "language_loss": 0.75107795, "learning_rate": 8.359745694462005e-08, "loss": 0.77245188, "num_input_tokens_seen": 326679320, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6953125, "step": 15142, "time_per_iteration": 2.4765632152557373 }, { "auxiliary_loss_clip": 0.01100731, "auxiliary_loss_mlp": 0.0103109, "balance_loss_clip": 1.01959801, "balance_loss_mlp": 1.03316617, "epoch": 0.9104464151510596, "flos": 14939521989120.0, "grad_norm": 2.759815193851568, "language_loss": 0.64205438, "learning_rate": 8.348607025820076e-08, "loss": 0.66337252, "num_input_tokens_seen": 326698110, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.67578125, "step": 15143, "time_per_iteration": 2.4285523891448975 }, { "auxiliary_loss_clip": 0.01104358, "auxiliary_loss_mlp": 0.0102982, "balance_loss_clip": 1.01742184, "balance_loss_mlp": 1.03376579, "epoch": 0.9105065384037276, "flos": 33655803500160.0, "grad_norm": 1.9728199316965518, "language_loss": 0.61071765, "learning_rate": 8.337475624618152e-08, "loss": 0.63205945, "num_input_tokens_seen": 326718370, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 15144, "time_per_iteration": 2.5689849853515625 }, { "auxiliary_loss_clip": 0.01099573, "auxiliary_loss_mlp": 0.01028052, "balance_loss_clip": 1.01673305, "balance_loss_mlp": 1.03459954, "epoch": 0.9105666616563957, "flos": 24316695463680.0, "grad_norm": 3.040240635137317, "language_loss": 0.71338457, "learning_rate": 8.326351491278382e-08, "loss": 0.73466074, "num_input_tokens_seen": 326738445, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6484375, "step": 15145, "time_per_iteration": 2.4820854663848877 }, { "auxiliary_loss_clip": 0.01099285, "auxiliary_loss_mlp": 0.01028994, "balance_loss_clip": 1.0173825, "balance_loss_mlp": 1.03334188, "epoch": 0.9106267849090636, "flos": 29971692132480.0, "grad_norm": 2.2424022993986306, "language_loss": 0.70783126, "learning_rate": 8.315234626222545e-08, "loss": 0.72911406, "num_input_tokens_seen": 326758855, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66015625, "step": 15146, "time_per_iteration": 2.529702663421631 }, { "auxiliary_loss_clip": 0.01103059, "auxiliary_loss_mlp": 0.01028264, "balance_loss_clip": 1.01668239, "balance_loss_mlp": 1.03484118, "epoch": 0.9106869081617316, "flos": 25337743470720.0, "grad_norm": 2.2422080630893344, "language_loss": 0.73086411, "learning_rate": 8.304125029872233e-08, "loss": 0.75217736, "num_input_tokens_seen": 326777140, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.68359375, "step": 15147, "time_per_iteration": 2.4687178134918213 }, { "auxiliary_loss_clip": 0.01106056, "auxiliary_loss_mlp": 0.01028001, "balance_loss_clip": 1.01599717, "balance_loss_mlp": 1.03449869, "epoch": 0.9107470314143995, "flos": 18187031543040.0, "grad_norm": 2.917255520760988, "language_loss": 0.79820496, "learning_rate": 8.293022702648711e-08, "loss": 0.81954563, "num_input_tokens_seen": 326794070, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.71484375, "step": 15148, "time_per_iteration": 2.43587327003479 }, { "auxiliary_loss_clip": 0.01103416, "auxiliary_loss_mlp": 0.01037411, "balance_loss_clip": 1.0256331, "balance_loss_mlp": 1.033952, "epoch": 0.9108071546670675, "flos": 23550828652800.0, "grad_norm": 1.8833571945942253, "language_loss": 0.67842865, "learning_rate": 8.281927644972996e-08, "loss": 0.69983691, "num_input_tokens_seen": 326814695, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 15149, "time_per_iteration": 2.4593982696533203 }, { "auxiliary_loss_clip": 0.01103781, "auxiliary_loss_mlp": 0.0102771, "balance_loss_clip": 1.01513362, "balance_loss_mlp": 1.03613472, "epoch": 0.9108672779197354, "flos": 25630307746560.0, "grad_norm": 3.8847783419577633, "language_loss": 0.63358462, "learning_rate": 8.270839857265776e-08, "loss": 0.6548996, "num_input_tokens_seen": 326835295, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.67578125, "step": 15150, "time_per_iteration": 2.4997994899749756 }, { "auxiliary_loss_clip": 0.01102152, "auxiliary_loss_mlp": 0.01031145, "balance_loss_clip": 1.01872921, "balance_loss_mlp": 1.03421342, "epoch": 0.9109274011724035, "flos": 22339094319360.0, "grad_norm": 1.909732842097868, "language_loss": 0.72637188, "learning_rate": 8.259759339947514e-08, "loss": 0.74770486, "num_input_tokens_seen": 326853350, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 15151, "time_per_iteration": 2.442272186279297 }, { "auxiliary_loss_clip": 0.01102631, "auxiliary_loss_mlp": 0.01027697, "balance_loss_clip": 1.01567447, "balance_loss_mlp": 1.03468359, "epoch": 0.9109875244250714, "flos": 26688200129280.0, "grad_norm": 1.8024104540182155, "language_loss": 0.64379919, "learning_rate": 8.248686093438429e-08, "loss": 0.66510242, "num_input_tokens_seen": 326873425, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 15152, "time_per_iteration": 2.5076751708984375 }, { "auxiliary_loss_clip": 0.01103987, "auxiliary_loss_mlp": 0.01029123, "balance_loss_clip": 1.01616478, "balance_loss_mlp": 1.03558695, "epoch": 0.9110476476777394, "flos": 22930112701440.0, "grad_norm": 1.8205215319866968, "language_loss": 0.73283315, "learning_rate": 8.23762011815834e-08, "loss": 0.75416422, "num_input_tokens_seen": 326893455, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.68359375, "step": 15153, "time_per_iteration": 2.4575517177581787 }, { "auxiliary_loss_clip": 0.01105576, "auxiliary_loss_mlp": 0.01026799, "balance_loss_clip": 1.01488972, "balance_loss_mlp": 1.03643847, "epoch": 0.9111077709304073, "flos": 13472857854720.0, "grad_norm": 1.8954033425606949, "language_loss": 0.7202844, "learning_rate": 8.226561414526956e-08, "loss": 0.74160814, "num_input_tokens_seen": 326910210, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 15154, "time_per_iteration": 2.473618507385254 }, { "auxiliary_loss_clip": 0.01105575, "auxiliary_loss_mlp": 0.0103022, "balance_loss_clip": 1.01884711, "balance_loss_mlp": 1.0378294, "epoch": 0.9111678941830753, "flos": 20850561780480.0, "grad_norm": 2.2036544114379972, "language_loss": 0.82022643, "learning_rate": 8.215509982963564e-08, "loss": 0.84158432, "num_input_tokens_seen": 326929350, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6796875, "step": 15155, "time_per_iteration": 2.4447052478790283 }, { "auxiliary_loss_clip": 0.01104288, "auxiliary_loss_mlp": 0.01029066, "balance_loss_clip": 1.0171628, "balance_loss_mlp": 1.03631759, "epoch": 0.9112280174357432, "flos": 19682244011520.0, "grad_norm": 1.7867280809890709, "language_loss": 0.59629536, "learning_rate": 8.204465823887252e-08, "loss": 0.61762893, "num_input_tokens_seen": 326949060, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 15156, "time_per_iteration": 2.473160743713379 }, { "auxiliary_loss_clip": 0.01104917, "auxiliary_loss_mlp": 0.01028789, "balance_loss_clip": 1.01561618, "balance_loss_mlp": 1.03334808, "epoch": 0.9112881406884112, "flos": 25447163276160.0, "grad_norm": 1.8821860621223683, "language_loss": 0.73750305, "learning_rate": 8.193428937716796e-08, "loss": 0.75884002, "num_input_tokens_seen": 326968950, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 15157, "time_per_iteration": 2.4951443672180176 }, { "auxiliary_loss_clip": 0.01104019, "auxiliary_loss_mlp": 0.01032064, "balance_loss_clip": 1.02072155, "balance_loss_mlp": 1.03530955, "epoch": 0.9113482639410793, "flos": 33066975847680.0, "grad_norm": 1.8567727184201128, "language_loss": 0.59710968, "learning_rate": 8.182399324870747e-08, "loss": 0.61847055, "num_input_tokens_seen": 326989455, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6875, "step": 15158, "time_per_iteration": 2.5517232418060303 }, { "auxiliary_loss_clip": 0.01102353, "auxiliary_loss_mlp": 0.01033791, "balance_loss_clip": 1.02244258, "balance_loss_mlp": 1.03482378, "epoch": 0.9114083871937472, "flos": 21835591424640.0, "grad_norm": 1.8006380491285623, "language_loss": 0.67887247, "learning_rate": 8.171376985767375e-08, "loss": 0.70023394, "num_input_tokens_seen": 327009640, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.67578125, "step": 15159, "time_per_iteration": 2.4468090534210205 }, { "auxiliary_loss_clip": 0.01103131, "auxiliary_loss_mlp": 0.01027671, "balance_loss_clip": 1.01588082, "balance_loss_mlp": 1.03423858, "epoch": 0.9114685104464152, "flos": 27088999061760.0, "grad_norm": 6.381337139635099, "language_loss": 0.78501332, "learning_rate": 8.160361920824588e-08, "loss": 0.80632132, "num_input_tokens_seen": 327027690, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 15160, "time_per_iteration": 2.5077128410339355 }, { "auxiliary_loss_clip": 0.01105945, "auxiliary_loss_mlp": 0.01028004, "balance_loss_clip": 1.01471269, "balance_loss_mlp": 1.03655124, "epoch": 0.9115286336990831, "flos": 17967042696960.0, "grad_norm": 1.8100477840024074, "language_loss": 0.68762708, "learning_rate": 8.149354130460073e-08, "loss": 0.70896661, "num_input_tokens_seen": 327045915, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 15161, "time_per_iteration": 2.422703981399536 }, { "auxiliary_loss_clip": 0.01105844, "auxiliary_loss_mlp": 0.01029316, "balance_loss_clip": 1.01682281, "balance_loss_mlp": 1.03617358, "epoch": 0.9115887569517511, "flos": 22929861306240.0, "grad_norm": 1.9201741782405561, "language_loss": 0.75960422, "learning_rate": 8.138353615091321e-08, "loss": 0.78095585, "num_input_tokens_seen": 327066355, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 15162, "time_per_iteration": 2.474249839782715 }, { "auxiliary_loss_clip": 0.0110237, "auxiliary_loss_mlp": 0.01034238, "balance_loss_clip": 1.02214384, "balance_loss_mlp": 1.03433466, "epoch": 0.911648880204419, "flos": 23988436047360.0, "grad_norm": 3.8409524622586546, "language_loss": 0.66998708, "learning_rate": 8.127360375135395e-08, "loss": 0.6913532, "num_input_tokens_seen": 327086735, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 15163, "time_per_iteration": 3.9749324321746826 }, { "auxiliary_loss_clip": 0.01107442, "auxiliary_loss_mlp": 0.01032047, "balance_loss_clip": 1.01947594, "balance_loss_mlp": 1.03567672, "epoch": 0.911709003457087, "flos": 17055306754560.0, "grad_norm": 2.133549155606496, "language_loss": 0.7068994, "learning_rate": 8.116374411009186e-08, "loss": 0.72829431, "num_input_tokens_seen": 327104035, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 15164, "time_per_iteration": 2.423020362854004 }, { "auxiliary_loss_clip": 0.01102693, "auxiliary_loss_mlp": 0.01033655, "balance_loss_clip": 1.02179408, "balance_loss_mlp": 1.03744984, "epoch": 0.911769126709755, "flos": 21653344794240.0, "grad_norm": 1.6751407961150835, "language_loss": 0.7615521, "learning_rate": 8.105395723129315e-08, "loss": 0.78291553, "num_input_tokens_seen": 327124370, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.65234375, "step": 15165, "time_per_iteration": 2.467954397201538 }, { "auxiliary_loss_clip": 0.01105108, "auxiliary_loss_mlp": 0.01032306, "balance_loss_clip": 1.02015305, "balance_loss_mlp": 1.03626633, "epoch": 0.911829249962423, "flos": 24790321221120.0, "grad_norm": 2.9556586918172583, "language_loss": 0.72035587, "learning_rate": 8.094424311912074e-08, "loss": 0.74172997, "num_input_tokens_seen": 327140915, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6875, "step": 15166, "time_per_iteration": 2.469290018081665 }, { "auxiliary_loss_clip": 0.01104296, "auxiliary_loss_mlp": 0.01037609, "balance_loss_clip": 1.02488959, "balance_loss_mlp": 1.03447533, "epoch": 0.9118893732150909, "flos": 20959406968320.0, "grad_norm": 1.839707856827634, "language_loss": 0.72807771, "learning_rate": 8.083460177773482e-08, "loss": 0.7494967, "num_input_tokens_seen": 327158940, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 15167, "time_per_iteration": 3.8537893295288086 }, { "auxiliary_loss_clip": 0.01028818, "auxiliary_loss_mlp": 0.01000713, "balance_loss_clip": 0.99965763, "balance_loss_mlp": 1.00639558, "epoch": 0.9119494964677589, "flos": 67917385872000.0, "grad_norm": 0.7723611011491119, "language_loss": 0.65526301, "learning_rate": 8.072503321129298e-08, "loss": 0.67555833, "num_input_tokens_seen": 327217450, "router_z_loss_clip": 0.01055908, "router_z_loss_mlp": 0.22460938, "step": 15168, "time_per_iteration": 3.0544838905334473 }, { "auxiliary_loss_clip": 0.01101534, "auxiliary_loss_mlp": 0.0103036, "balance_loss_clip": 1.01913095, "balance_loss_mlp": 1.03436136, "epoch": 0.9120096197204268, "flos": 18551524803840.0, "grad_norm": 2.594486582069897, "language_loss": 0.78392696, "learning_rate": 8.061553742395033e-08, "loss": 0.80524588, "num_input_tokens_seen": 327233905, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.671875, "step": 15169, "time_per_iteration": 2.433950424194336 }, { "auxiliary_loss_clip": 0.01103767, "auxiliary_loss_mlp": 0.01027837, "balance_loss_clip": 1.01609468, "balance_loss_mlp": 1.03538728, "epoch": 0.9120697429730948, "flos": 19025725178880.0, "grad_norm": 1.7224806194931048, "language_loss": 0.82139575, "learning_rate": 8.05061144198591e-08, "loss": 0.84271181, "num_input_tokens_seen": 327252430, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6796875, "step": 15170, "time_per_iteration": 5.2606048583984375 }, { "auxiliary_loss_clip": 0.01107614, "auxiliary_loss_mlp": 0.0103032, "balance_loss_clip": 1.01820874, "balance_loss_mlp": 1.03822505, "epoch": 0.9121298662257629, "flos": 17163685065600.0, "grad_norm": 2.5297540003647803, "language_loss": 0.77455354, "learning_rate": 8.039676420316799e-08, "loss": 0.79593289, "num_input_tokens_seen": 327269215, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 15171, "time_per_iteration": 2.4123740196228027 }, { "auxiliary_loss_clip": 0.01101365, "auxiliary_loss_mlp": 0.01032243, "balance_loss_clip": 1.01986933, "balance_loss_mlp": 1.03316891, "epoch": 0.9121899894784308, "flos": 19682710888320.0, "grad_norm": 1.5623179078294662, "language_loss": 0.66918057, "learning_rate": 8.02874867780241e-08, "loss": 0.69051665, "num_input_tokens_seen": 327290320, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 15172, "time_per_iteration": 2.4690868854522705 }, { "auxiliary_loss_clip": 0.01107668, "auxiliary_loss_mlp": 0.01033226, "balance_loss_clip": 1.02076268, "balance_loss_mlp": 1.03799736, "epoch": 0.9122501127310988, "flos": 22235743912320.0, "grad_norm": 1.8578065703917606, "language_loss": 0.75023627, "learning_rate": 8.017828214857103e-08, "loss": 0.77164519, "num_input_tokens_seen": 327310150, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 15173, "time_per_iteration": 2.455268144607544 }, { "auxiliary_loss_clip": 0.01110232, "auxiliary_loss_mlp": 0.0103146, "balance_loss_clip": 1.01761973, "balance_loss_mlp": 1.0372262, "epoch": 0.9123102359837667, "flos": 15957122290560.0, "grad_norm": 2.9965718066925047, "language_loss": 0.65782082, "learning_rate": 8.00691503189499e-08, "loss": 0.67923766, "num_input_tokens_seen": 327326660, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.73046875, "step": 15174, "time_per_iteration": 2.4427809715270996 }, { "auxiliary_loss_clip": 0.01107554, "auxiliary_loss_mlp": 0.01033122, "balance_loss_clip": 1.019979, "balance_loss_mlp": 1.03728056, "epoch": 0.9123703592364347, "flos": 25155784149120.0, "grad_norm": 1.75761592400757, "language_loss": 0.75119007, "learning_rate": 7.996009129329894e-08, "loss": 0.77259684, "num_input_tokens_seen": 327346700, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 15175, "time_per_iteration": 2.470508575439453 }, { "auxiliary_loss_clip": 0.01028135, "auxiliary_loss_mlp": 0.01001081, "balance_loss_clip": 1.00005555, "balance_loss_mlp": 1.00587225, "epoch": 0.9124304824891026, "flos": 60801650812800.0, "grad_norm": 0.9584618931537071, "language_loss": 0.58410436, "learning_rate": 7.985110507575421e-08, "loss": 0.60439646, "num_input_tokens_seen": 327403050, "router_z_loss_clip": 0.01025391, "router_z_loss_mlp": 0.22265625, "step": 15176, "time_per_iteration": 3.13146710395813 }, { "auxiliary_loss_clip": 0.01104927, "auxiliary_loss_mlp": 0.01034586, "balance_loss_clip": 1.02316558, "balance_loss_mlp": 1.0357691, "epoch": 0.9124906057417707, "flos": 18150941352960.0, "grad_norm": 2.2647323449981895, "language_loss": 0.65324473, "learning_rate": 7.97421916704475e-08, "loss": 0.67463982, "num_input_tokens_seen": 327422225, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.69140625, "step": 15177, "time_per_iteration": 2.4443321228027344 }, { "auxiliary_loss_clip": 0.01102334, "auxiliary_loss_mlp": 0.0102866, "balance_loss_clip": 1.01698387, "balance_loss_mlp": 1.03447425, "epoch": 0.9125507289944386, "flos": 11686769049600.0, "grad_norm": 3.1206807564299797, "language_loss": 0.81382561, "learning_rate": 7.963335108150926e-08, "loss": 0.83513558, "num_input_tokens_seen": 327437025, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 15178, "time_per_iteration": 2.444506883621216 }, { "auxiliary_loss_clip": 0.0110405, "auxiliary_loss_mlp": 0.01027771, "balance_loss_clip": 1.01587975, "balance_loss_mlp": 1.0357132, "epoch": 0.9126108522471066, "flos": 17748813617280.0, "grad_norm": 2.488058823473273, "language_loss": 0.78818679, "learning_rate": 7.952458331306711e-08, "loss": 0.80950499, "num_input_tokens_seen": 327453915, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 15179, "time_per_iteration": 2.429805278778076 }, { "auxiliary_loss_clip": 0.0110086, "auxiliary_loss_mlp": 0.0102994, "balance_loss_clip": 1.01846027, "balance_loss_mlp": 1.03374243, "epoch": 0.9126709754997745, "flos": 27635738952960.0, "grad_norm": 20.268648683869735, "language_loss": 0.68154764, "learning_rate": 7.941588836924507e-08, "loss": 0.70285565, "num_input_tokens_seen": 327474415, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.671875, "step": 15180, "time_per_iteration": 2.523505449295044 }, { "auxiliary_loss_clip": 0.0110001, "auxiliary_loss_mlp": 0.01026319, "balance_loss_clip": 1.01519668, "balance_loss_mlp": 1.03343463, "epoch": 0.9127310987524425, "flos": 15924982596480.0, "grad_norm": 1.8154125716159186, "language_loss": 0.7573837, "learning_rate": 7.930726625416495e-08, "loss": 0.77864707, "num_input_tokens_seen": 327492750, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6640625, "step": 15181, "time_per_iteration": 2.4302406311035156 }, { "auxiliary_loss_clip": 0.0110872, "auxiliary_loss_mlp": 0.01030749, "balance_loss_clip": 1.01857138, "balance_loss_mlp": 1.03714561, "epoch": 0.9127912220051104, "flos": 21536885923200.0, "grad_norm": 1.7882325208388392, "language_loss": 0.75115275, "learning_rate": 7.919871697194614e-08, "loss": 0.77254748, "num_input_tokens_seen": 327509470, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.71484375, "step": 15182, "time_per_iteration": 2.4589383602142334 }, { "auxiliary_loss_clip": 0.01105584, "auxiliary_loss_mlp": 0.01031335, "balance_loss_clip": 1.01909232, "balance_loss_mlp": 1.03475213, "epoch": 0.9128513452577784, "flos": 24063561342720.0, "grad_norm": 1.463513847984632, "language_loss": 0.76370537, "learning_rate": 7.909024052670421e-08, "loss": 0.78507459, "num_input_tokens_seen": 327530520, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.7109375, "step": 15183, "time_per_iteration": 2.50016713142395 }, { "auxiliary_loss_clip": 0.0110612, "auxiliary_loss_mlp": 0.01032735, "balance_loss_clip": 1.02068901, "balance_loss_mlp": 1.03545988, "epoch": 0.9129114685104465, "flos": 16216469464320.0, "grad_norm": 14.937567143905726, "language_loss": 0.76583719, "learning_rate": 7.898183692255256e-08, "loss": 0.78722572, "num_input_tokens_seen": 327546960, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.70703125, "step": 15184, "time_per_iteration": 2.434480905532837 }, { "auxiliary_loss_clip": 0.01105678, "auxiliary_loss_mlp": 0.0103406, "balance_loss_clip": 1.02228212, "balance_loss_mlp": 1.03663611, "epoch": 0.9129715917631144, "flos": 19384364522880.0, "grad_norm": 1.9818351001866572, "language_loss": 0.74940848, "learning_rate": 7.887350616360233e-08, "loss": 0.77080584, "num_input_tokens_seen": 327564830, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.69140625, "step": 15185, "time_per_iteration": 2.4329564571380615 }, { "auxiliary_loss_clip": 0.01103735, "auxiliary_loss_mlp": 0.0102894, "balance_loss_clip": 1.01703656, "balance_loss_mlp": 1.03534245, "epoch": 0.9130317150157824, "flos": 20590460421120.0, "grad_norm": 2.1773789789983797, "language_loss": 0.69278955, "learning_rate": 7.876524825396158e-08, "loss": 0.71411633, "num_input_tokens_seen": 327583675, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 15186, "time_per_iteration": 2.4395837783813477 }, { "auxiliary_loss_clip": 0.01110678, "auxiliary_loss_mlp": 0.01034202, "balance_loss_clip": 1.02116656, "balance_loss_mlp": 1.03676438, "epoch": 0.9130918382684503, "flos": 20189230525440.0, "grad_norm": 3.929606382486174, "language_loss": 0.77418399, "learning_rate": 7.865706319773502e-08, "loss": 0.79563272, "num_input_tokens_seen": 327602280, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73828125, "step": 15187, "time_per_iteration": 2.4259753227233887 }, { "auxiliary_loss_clip": 0.01102916, "auxiliary_loss_mlp": 0.01029894, "balance_loss_clip": 1.0188731, "balance_loss_mlp": 1.03433824, "epoch": 0.9131519615211183, "flos": 25556870390400.0, "grad_norm": 2.0981268033185496, "language_loss": 0.65525359, "learning_rate": 7.854895099902515e-08, "loss": 0.67658168, "num_input_tokens_seen": 327623515, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6875, "step": 15188, "time_per_iteration": 2.498934030532837 }, { "auxiliary_loss_clip": 0.01100964, "auxiliary_loss_mlp": 0.01029592, "balance_loss_clip": 1.01797485, "balance_loss_mlp": 1.03350127, "epoch": 0.9132120847737862, "flos": 17931563038080.0, "grad_norm": 3.2897996060031125, "language_loss": 0.76055366, "learning_rate": 7.844091166193157e-08, "loss": 0.78185922, "num_input_tokens_seen": 327642875, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 15189, "time_per_iteration": 2.4288575649261475 }, { "auxiliary_loss_clip": 0.01102454, "auxiliary_loss_mlp": 0.01027746, "balance_loss_clip": 1.01703548, "balance_loss_mlp": 1.03552771, "epoch": 0.9132722080264543, "flos": 20047635112320.0, "grad_norm": 4.631912340820418, "language_loss": 0.75610214, "learning_rate": 7.8332945190551e-08, "loss": 0.77740419, "num_input_tokens_seen": 327662450, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.66796875, "step": 15190, "time_per_iteration": 2.4472978115081787 }, { "auxiliary_loss_clip": 0.01028303, "auxiliary_loss_mlp": 0.01000204, "balance_loss_clip": 0.99919635, "balance_loss_mlp": 1.00610471, "epoch": 0.9133323312791222, "flos": 70439967141120.0, "grad_norm": 0.6974605428532014, "language_loss": 0.57269526, "learning_rate": 7.822505158897797e-08, "loss": 0.59298027, "num_input_tokens_seen": 327723845, "router_z_loss_clip": 0.0100708, "router_z_loss_mlp": 0.22265625, "step": 15191, "time_per_iteration": 3.1239359378814697 }, { "auxiliary_loss_clip": 0.01106278, "auxiliary_loss_mlp": 0.01033428, "balance_loss_clip": 1.02071381, "balance_loss_mlp": 1.03600955, "epoch": 0.9133924545317902, "flos": 25483792170240.0, "grad_norm": 2.3490638834618243, "language_loss": 0.7420696, "learning_rate": 7.81172308613034e-08, "loss": 0.7634666, "num_input_tokens_seen": 327742590, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 15192, "time_per_iteration": 2.4814798831939697 }, { "auxiliary_loss_clip": 0.01102541, "auxiliary_loss_mlp": 0.01027516, "balance_loss_clip": 1.01593471, "balance_loss_mlp": 1.03633571, "epoch": 0.9134525777844581, "flos": 39930690107520.0, "grad_norm": 1.6433570322674074, "language_loss": 0.69256276, "learning_rate": 7.800948301161647e-08, "loss": 0.71386337, "num_input_tokens_seen": 327764350, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6640625, "step": 15193, "time_per_iteration": 2.6300621032714844 }, { "auxiliary_loss_clip": 0.01101638, "auxiliary_loss_mlp": 0.01038274, "balance_loss_clip": 1.02666903, "balance_loss_mlp": 1.03521454, "epoch": 0.9135127010371261, "flos": 20886723797760.0, "grad_norm": 2.3417907796409474, "language_loss": 0.73340178, "learning_rate": 7.790180804400215e-08, "loss": 0.75480092, "num_input_tokens_seen": 327783120, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6640625, "step": 15194, "time_per_iteration": 2.5274412631988525 }, { "auxiliary_loss_clip": 0.01106611, "auxiliary_loss_mlp": 0.01032335, "balance_loss_clip": 1.01859665, "balance_loss_mlp": 1.03487706, "epoch": 0.913572824289794, "flos": 20813250528000.0, "grad_norm": 3.79241709884669, "language_loss": 0.61822677, "learning_rate": 7.779420596254383e-08, "loss": 0.63961619, "num_input_tokens_seen": 327801960, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.71875, "step": 15195, "time_per_iteration": 2.4843523502349854 }, { "auxiliary_loss_clip": 0.01102493, "auxiliary_loss_mlp": 0.01033171, "balance_loss_clip": 1.02116644, "balance_loss_mlp": 1.03400517, "epoch": 0.913632947542462, "flos": 25703278225920.0, "grad_norm": 1.5943433880306912, "language_loss": 0.71259052, "learning_rate": 7.768667677132201e-08, "loss": 0.73394716, "num_input_tokens_seen": 327823795, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 15196, "time_per_iteration": 2.5075323581695557 }, { "auxiliary_loss_clip": 0.01104054, "auxiliary_loss_mlp": 0.01030814, "balance_loss_clip": 1.01917946, "balance_loss_mlp": 1.03607607, "epoch": 0.9136930707951301, "flos": 26286216048000.0, "grad_norm": 1.5468812472812268, "language_loss": 0.71180403, "learning_rate": 7.757922047441411e-08, "loss": 0.73315275, "num_input_tokens_seen": 327845175, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 15197, "time_per_iteration": 2.5192606449127197 }, { "auxiliary_loss_clip": 0.0110509, "auxiliary_loss_mlp": 0.0103074, "balance_loss_clip": 1.01788306, "balance_loss_mlp": 1.03389931, "epoch": 0.913753194047798, "flos": 22091885942400.0, "grad_norm": 1.8944155549718675, "language_loss": 0.78056568, "learning_rate": 7.747183707589489e-08, "loss": 0.80192399, "num_input_tokens_seen": 327863150, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 15198, "time_per_iteration": 2.4478461742401123 }, { "auxiliary_loss_clip": 0.01101116, "auxiliary_loss_mlp": 0.01030259, "balance_loss_clip": 1.01848745, "balance_loss_mlp": 1.03514016, "epoch": 0.913813317300466, "flos": 23587206151680.0, "grad_norm": 1.533571428650766, "language_loss": 0.6807633, "learning_rate": 7.736452657983616e-08, "loss": 0.70207703, "num_input_tokens_seen": 327883445, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.66015625, "step": 15199, "time_per_iteration": 2.483856678009033 }, { "auxiliary_loss_clip": 0.01103899, "auxiliary_loss_mlp": 0.01033433, "balance_loss_clip": 1.02168465, "balance_loss_mlp": 1.03497541, "epoch": 0.9138734405531339, "flos": 28876452583680.0, "grad_norm": 1.559471902569334, "language_loss": 0.6761806, "learning_rate": 7.725728899030714e-08, "loss": 0.69755393, "num_input_tokens_seen": 327905745, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 15200, "time_per_iteration": 2.507582902908325 }, { "auxiliary_loss_clip": 0.01103963, "auxiliary_loss_mlp": 0.01028518, "balance_loss_clip": 1.01748562, "balance_loss_mlp": 1.03696942, "epoch": 0.9139335638058019, "flos": 22821087945600.0, "grad_norm": 2.1334415623841587, "language_loss": 0.71047288, "learning_rate": 7.715012431137435e-08, "loss": 0.7317977, "num_input_tokens_seen": 327925435, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.671875, "step": 15201, "time_per_iteration": 2.477358102798462 }, { "auxiliary_loss_clip": 0.0110123, "auxiliary_loss_mlp": 0.01025303, "balance_loss_clip": 1.01452088, "balance_loss_mlp": 1.03357661, "epoch": 0.9139936870584698, "flos": 18004174381440.0, "grad_norm": 1.7022432066379725, "language_loss": 0.70892322, "learning_rate": 7.704303254710165e-08, "loss": 0.73018849, "num_input_tokens_seen": 327944145, "router_z_loss_clip": 0.10791016, "router_z_loss_mlp": 0.67578125, "step": 15202, "time_per_iteration": 2.4337961673736572 }, { "auxiliary_loss_clip": 0.01102636, "auxiliary_loss_mlp": 0.01031535, "balance_loss_clip": 1.01891088, "balance_loss_mlp": 1.03394985, "epoch": 0.9140538103111379, "flos": 15813767111040.0, "grad_norm": 2.3311537083293943, "language_loss": 0.66924471, "learning_rate": 7.693601370155001e-08, "loss": 0.69058645, "num_input_tokens_seen": 327960565, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 15203, "time_per_iteration": 2.4217164516448975 }, { "auxiliary_loss_clip": 0.01106191, "auxiliary_loss_mlp": 0.01029808, "balance_loss_clip": 1.01698709, "balance_loss_mlp": 1.03728068, "epoch": 0.9141139335638058, "flos": 23987035416960.0, "grad_norm": 1.6023981887370098, "language_loss": 0.68839365, "learning_rate": 7.682906777877751e-08, "loss": 0.70975363, "num_input_tokens_seen": 327981180, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 15204, "time_per_iteration": 2.4878146648406982 }, { "auxiliary_loss_clip": 0.01102423, "auxiliary_loss_mlp": 0.01028322, "balance_loss_clip": 1.01560283, "balance_loss_mlp": 1.03282332, "epoch": 0.9141740568164738, "flos": 24024418496640.0, "grad_norm": 1.9897238590809703, "language_loss": 0.59704614, "learning_rate": 7.672219478283915e-08, "loss": 0.61835361, "num_input_tokens_seen": 328001500, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 15205, "time_per_iteration": 3.7367258071899414 }, { "auxiliary_loss_clip": 0.01100491, "auxiliary_loss_mlp": 0.01030995, "balance_loss_clip": 1.01838279, "balance_loss_mlp": 1.03490472, "epoch": 0.9142341800691417, "flos": 27018291139200.0, "grad_norm": 1.6723921897311633, "language_loss": 0.81361282, "learning_rate": 7.661539471778811e-08, "loss": 0.83492768, "num_input_tokens_seen": 328023025, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.65625, "step": 15206, "time_per_iteration": 2.513841390609741 }, { "auxiliary_loss_clip": 0.01103296, "auxiliary_loss_mlp": 0.01027354, "balance_loss_clip": 1.01443148, "balance_loss_mlp": 1.03312397, "epoch": 0.9142943033218097, "flos": 20412487509120.0, "grad_norm": 4.091124253175859, "language_loss": 0.73436016, "learning_rate": 7.650866758767382e-08, "loss": 0.75566661, "num_input_tokens_seen": 328041410, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 15207, "time_per_iteration": 2.4567008018493652 }, { "auxiliary_loss_clip": 0.01103373, "auxiliary_loss_mlp": 0.01036396, "balance_loss_clip": 1.02351546, "balance_loss_mlp": 1.03481114, "epoch": 0.9143544265744776, "flos": 19755322231680.0, "grad_norm": 1.8360287783656963, "language_loss": 0.73168617, "learning_rate": 7.640201339654373e-08, "loss": 0.75308388, "num_input_tokens_seen": 328060495, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.68359375, "step": 15208, "time_per_iteration": 2.4350948333740234 }, { "auxiliary_loss_clip": 0.01102156, "auxiliary_loss_mlp": 0.01030934, "balance_loss_clip": 1.01973414, "balance_loss_mlp": 1.03524065, "epoch": 0.9144145498271457, "flos": 17165444832000.0, "grad_norm": 2.1318114399120045, "language_loss": 0.86092341, "learning_rate": 7.629543214844237e-08, "loss": 0.88225424, "num_input_tokens_seen": 328076905, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.671875, "step": 15209, "time_per_iteration": 3.848520517349243 }, { "auxiliary_loss_clip": 0.01102967, "auxiliary_loss_mlp": 0.01034176, "balance_loss_clip": 1.02289832, "balance_loss_mlp": 1.03553677, "epoch": 0.9144746730798137, "flos": 23726072131200.0, "grad_norm": 1.6610349674271547, "language_loss": 0.75180441, "learning_rate": 7.618892384741093e-08, "loss": 0.77317584, "num_input_tokens_seen": 328096960, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.671875, "step": 15210, "time_per_iteration": 2.4943509101867676 }, { "auxiliary_loss_clip": 0.01101218, "auxiliary_loss_mlp": 0.01032177, "balance_loss_clip": 1.02006507, "balance_loss_mlp": 1.03219926, "epoch": 0.9145347963324816, "flos": 25847854467840.0, "grad_norm": 1.9051554907097494, "language_loss": 0.78135514, "learning_rate": 7.6082488497488e-08, "loss": 0.80268908, "num_input_tokens_seen": 328115445, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 15211, "time_per_iteration": 2.463083267211914 }, { "auxiliary_loss_clip": 0.01105133, "auxiliary_loss_mlp": 0.01031499, "balance_loss_clip": 1.0203892, "balance_loss_mlp": 1.03622496, "epoch": 0.9145949195851496, "flos": 19242769109760.0, "grad_norm": 2.5480488914540964, "language_loss": 0.8286007, "learning_rate": 7.597612610270986e-08, "loss": 0.84996706, "num_input_tokens_seen": 328133965, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6875, "step": 15212, "time_per_iteration": 5.245059013366699 }, { "auxiliary_loss_clip": 0.01100959, "auxiliary_loss_mlp": 0.01026926, "balance_loss_clip": 1.01541066, "balance_loss_mlp": 1.03398204, "epoch": 0.9146550428378175, "flos": 18296379521280.0, "grad_norm": 2.0438808707831617, "language_loss": 0.83919805, "learning_rate": 7.586983666711022e-08, "loss": 0.86047691, "num_input_tokens_seen": 328151520, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 15213, "time_per_iteration": 2.4329323768615723 }, { "auxiliary_loss_clip": 0.01104009, "auxiliary_loss_mlp": 0.01027974, "balance_loss_clip": 1.01639843, "balance_loss_mlp": 1.03554654, "epoch": 0.9147151660904855, "flos": 20084264006400.0, "grad_norm": 1.7583236280763286, "language_loss": 0.7085681, "learning_rate": 7.576362019471894e-08, "loss": 0.72988796, "num_input_tokens_seen": 328171275, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.68359375, "step": 15214, "time_per_iteration": 2.4469590187072754 }, { "auxiliary_loss_clip": 0.01107495, "auxiliary_loss_mlp": 0.01036127, "balance_loss_clip": 1.0228647, "balance_loss_mlp": 1.03682065, "epoch": 0.9147752893431534, "flos": 24389127239040.0, "grad_norm": 1.9881639164596223, "language_loss": 0.63105822, "learning_rate": 7.565747668956413e-08, "loss": 0.65249443, "num_input_tokens_seen": 328192115, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 15215, "time_per_iteration": 2.4944632053375244 }, { "auxiliary_loss_clip": 0.01108742, "auxiliary_loss_mlp": 0.01031985, "balance_loss_clip": 1.01919997, "balance_loss_mlp": 1.03669, "epoch": 0.9148354125958215, "flos": 18150402648960.0, "grad_norm": 13.390611954722111, "language_loss": 0.76255536, "learning_rate": 7.555140615567058e-08, "loss": 0.78396261, "num_input_tokens_seen": 328208990, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 15216, "time_per_iteration": 2.4402554035186768 }, { "auxiliary_loss_clip": 0.01104391, "auxiliary_loss_mlp": 0.01030456, "balance_loss_clip": 1.01820755, "balance_loss_mlp": 1.03655052, "epoch": 0.9148955358484894, "flos": 23367540528000.0, "grad_norm": 7.538523125240036, "language_loss": 0.68137819, "learning_rate": 7.544540859706062e-08, "loss": 0.70272666, "num_input_tokens_seen": 328227840, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 15217, "time_per_iteration": 2.4987499713897705 }, { "auxiliary_loss_clip": 0.01102093, "auxiliary_loss_mlp": 0.01030849, "balance_loss_clip": 1.01936293, "balance_loss_mlp": 1.03529608, "epoch": 0.9149556591011574, "flos": 18076498416000.0, "grad_norm": 1.8480900924699646, "language_loss": 0.79768014, "learning_rate": 7.533948401775347e-08, "loss": 0.81900954, "num_input_tokens_seen": 328246250, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.66796875, "step": 15218, "time_per_iteration": 2.456069231033325 }, { "auxiliary_loss_clip": 0.01028967, "auxiliary_loss_mlp": 0.01000014, "balance_loss_clip": 0.9989832, "balance_loss_mlp": 1.00649822, "epoch": 0.9150157823538253, "flos": 54586374825600.0, "grad_norm": 0.8482430894413818, "language_loss": 0.59293365, "learning_rate": 7.523363242176595e-08, "loss": 0.61322355, "num_input_tokens_seen": 328303625, "router_z_loss_clip": 0.01031494, "router_z_loss_mlp": 0.22460938, "step": 15219, "time_per_iteration": 3.0499844551086426 }, { "auxiliary_loss_clip": 0.01101702, "auxiliary_loss_mlp": 0.0103211, "balance_loss_clip": 1.01981378, "balance_loss_mlp": 1.03471231, "epoch": 0.9150759056064933, "flos": 17893102550400.0, "grad_norm": 2.493349632596411, "language_loss": 0.7883327, "learning_rate": 7.512785381311216e-08, "loss": 0.80967081, "num_input_tokens_seen": 328322135, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 15220, "time_per_iteration": 2.462905168533325 }, { "auxiliary_loss_clip": 0.01105539, "auxiliary_loss_mlp": 0.01033776, "balance_loss_clip": 1.0203948, "balance_loss_mlp": 1.03413749, "epoch": 0.9151360288591612, "flos": 18073517587200.0, "grad_norm": 1.9291610286799072, "language_loss": 0.66364884, "learning_rate": 7.50221481958031e-08, "loss": 0.68504202, "num_input_tokens_seen": 328340750, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71484375, "step": 15221, "time_per_iteration": 2.440675735473633 }, { "auxiliary_loss_clip": 0.01104102, "auxiliary_loss_mlp": 0.01030885, "balance_loss_clip": 1.01920235, "balance_loss_mlp": 1.03506351, "epoch": 0.9151961521118293, "flos": 19354523299200.0, "grad_norm": 2.4546818388843232, "language_loss": 0.84402001, "learning_rate": 7.491651557384692e-08, "loss": 0.86536992, "num_input_tokens_seen": 328359995, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.69140625, "step": 15222, "time_per_iteration": 2.434382200241089 }, { "auxiliary_loss_clip": 0.01028584, "auxiliary_loss_mlp": 0.00999206, "balance_loss_clip": 0.99821645, "balance_loss_mlp": 1.00630379, "epoch": 0.9152562753644973, "flos": 72146621018880.0, "grad_norm": 0.7242740863116143, "language_loss": 0.49575862, "learning_rate": 7.481095595124953e-08, "loss": 0.51603651, "num_input_tokens_seen": 328426865, "router_z_loss_clip": 0.0098877, "router_z_loss_mlp": 0.22265625, "step": 15223, "time_per_iteration": 3.1304705142974854 }, { "auxiliary_loss_clip": 0.01106349, "auxiliary_loss_mlp": 0.01038474, "balance_loss_clip": 1.02572477, "balance_loss_mlp": 1.03651786, "epoch": 0.9153163986171652, "flos": 20777016683520.0, "grad_norm": 2.18834144493422, "language_loss": 0.721771, "learning_rate": 7.470546933201349e-08, "loss": 0.74321926, "num_input_tokens_seen": 328445970, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 15224, "time_per_iteration": 2.435460090637207 }, { "auxiliary_loss_clip": 0.01102194, "auxiliary_loss_mlp": 0.01026987, "balance_loss_clip": 1.01471472, "balance_loss_mlp": 1.0348258, "epoch": 0.9153765218698332, "flos": 23040107124480.0, "grad_norm": 4.786604524924507, "language_loss": 0.81222838, "learning_rate": 7.460005572013895e-08, "loss": 0.83352023, "num_input_tokens_seen": 328464585, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.671875, "step": 15225, "time_per_iteration": 2.481238842010498 }, { "auxiliary_loss_clip": 0.01102027, "auxiliary_loss_mlp": 0.01025047, "balance_loss_clip": 1.01345396, "balance_loss_mlp": 1.0338285, "epoch": 0.9154366451225011, "flos": 28990900293120.0, "grad_norm": 1.4158051805123057, "language_loss": 0.71493483, "learning_rate": 7.44947151196238e-08, "loss": 0.73620564, "num_input_tokens_seen": 328490155, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6796875, "step": 15226, "time_per_iteration": 2.535691976547241 }, { "auxiliary_loss_clip": 0.01105027, "auxiliary_loss_mlp": 0.01028448, "balance_loss_clip": 1.015728, "balance_loss_mlp": 1.03464437, "epoch": 0.9154967683751691, "flos": 22309504490880.0, "grad_norm": 2.3584350641626117, "language_loss": 0.74834311, "learning_rate": 7.43894475344613e-08, "loss": 0.76967788, "num_input_tokens_seen": 328508275, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 15227, "time_per_iteration": 2.46640944480896 }, { "auxiliary_loss_clip": 0.0110209, "auxiliary_loss_mlp": 0.01028488, "balance_loss_clip": 1.01707995, "balance_loss_mlp": 1.03491652, "epoch": 0.915556891627837, "flos": 24571481610240.0, "grad_norm": 1.6454838637298959, "language_loss": 0.74408621, "learning_rate": 7.428425296864404e-08, "loss": 0.76539201, "num_input_tokens_seen": 328529425, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.671875, "step": 15228, "time_per_iteration": 2.4757213592529297 }, { "auxiliary_loss_clip": 0.01100449, "auxiliary_loss_mlp": 0.01031915, "balance_loss_clip": 1.02010131, "balance_loss_mlp": 1.03290033, "epoch": 0.9156170148805051, "flos": 22164676853760.0, "grad_norm": 1.7974699494789361, "language_loss": 0.71835136, "learning_rate": 7.417913142616106e-08, "loss": 0.73967505, "num_input_tokens_seen": 328550200, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.67578125, "step": 15229, "time_per_iteration": 2.470113515853882 }, { "auxiliary_loss_clip": 0.01106515, "auxiliary_loss_mlp": 0.01032299, "balance_loss_clip": 1.01900077, "balance_loss_mlp": 1.03693056, "epoch": 0.915677138133173, "flos": 20920659171840.0, "grad_norm": 3.8697274682286067, "language_loss": 0.83414006, "learning_rate": 7.407408291099848e-08, "loss": 0.85552824, "num_input_tokens_seen": 328568540, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 15230, "time_per_iteration": 2.440944194793701 }, { "auxiliary_loss_clip": 0.01102286, "auxiliary_loss_mlp": 0.01031072, "balance_loss_clip": 1.01980662, "balance_loss_mlp": 1.03558838, "epoch": 0.915737261385841, "flos": 24345136056960.0, "grad_norm": 3.253357791187342, "language_loss": 0.83451217, "learning_rate": 7.396910742713957e-08, "loss": 0.85584581, "num_input_tokens_seen": 328587300, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.66796875, "step": 15231, "time_per_iteration": 2.495021343231201 }, { "auxiliary_loss_clip": 0.01099449, "auxiliary_loss_mlp": 0.0102683, "balance_loss_clip": 1.01532042, "balance_loss_mlp": 1.03188753, "epoch": 0.9157973846385089, "flos": 26761386090240.0, "grad_norm": 2.280136015837448, "language_loss": 0.72212589, "learning_rate": 7.386420497856516e-08, "loss": 0.74338865, "num_input_tokens_seen": 328610055, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.67578125, "step": 15232, "time_per_iteration": 2.4965362548828125 }, { "auxiliary_loss_clip": 0.01104584, "auxiliary_loss_mlp": 0.01030571, "balance_loss_clip": 1.01861393, "balance_loss_mlp": 1.03446722, "epoch": 0.9158575078911769, "flos": 18478733892480.0, "grad_norm": 3.835123469257852, "language_loss": 0.67334366, "learning_rate": 7.375937556925338e-08, "loss": 0.69469523, "num_input_tokens_seen": 328626815, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.69921875, "step": 15233, "time_per_iteration": 2.449956178665161 }, { "auxiliary_loss_clip": 0.01106364, "auxiliary_loss_mlp": 0.01036013, "balance_loss_clip": 1.02326405, "balance_loss_mlp": 1.03581524, "epoch": 0.9159176311438448, "flos": 21798926616960.0, "grad_norm": 2.3458324928154193, "language_loss": 0.69619811, "learning_rate": 7.365461920317861e-08, "loss": 0.71762186, "num_input_tokens_seen": 328643995, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 15234, "time_per_iteration": 2.4494130611419678 }, { "auxiliary_loss_clip": 0.01105867, "auxiliary_loss_mlp": 0.01031761, "balance_loss_clip": 1.01954222, "balance_loss_mlp": 1.0364151, "epoch": 0.9159777543965129, "flos": 24783749032320.0, "grad_norm": 2.22514990531948, "language_loss": 0.88140476, "learning_rate": 7.354993588431391e-08, "loss": 0.90278101, "num_input_tokens_seen": 328659565, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 15235, "time_per_iteration": 2.4818005561828613 }, { "auxiliary_loss_clip": 0.01106961, "auxiliary_loss_mlp": 0.01032253, "balance_loss_clip": 1.02026641, "balance_loss_mlp": 1.03737426, "epoch": 0.9160378776491809, "flos": 26868758820480.0, "grad_norm": 1.764896900011972, "language_loss": 0.77157712, "learning_rate": 7.344532561662853e-08, "loss": 0.79296923, "num_input_tokens_seen": 328679045, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 15236, "time_per_iteration": 2.497983455657959 }, { "auxiliary_loss_clip": 0.01028296, "auxiliary_loss_mlp": 0.00998379, "balance_loss_clip": 0.99737722, "balance_loss_mlp": 1.00605822, "epoch": 0.9160980009018488, "flos": 70578222589440.0, "grad_norm": 0.6923499507126514, "language_loss": 0.62238628, "learning_rate": 7.334078840409019e-08, "loss": 0.64265299, "num_input_tokens_seen": 328744565, "router_z_loss_clip": 0.01000977, "router_z_loss_mlp": 0.22265625, "step": 15237, "time_per_iteration": 3.051495313644409 }, { "auxiliary_loss_clip": 0.01106461, "auxiliary_loss_mlp": 0.01031335, "balance_loss_clip": 1.01839447, "balance_loss_mlp": 1.03566945, "epoch": 0.9161581241545168, "flos": 16289332202880.0, "grad_norm": 1.9385870628580504, "language_loss": 0.75013852, "learning_rate": 7.323632425066151e-08, "loss": 0.77151656, "num_input_tokens_seen": 328762455, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 15238, "time_per_iteration": 2.434680461883545 }, { "auxiliary_loss_clip": 0.01104511, "auxiliary_loss_mlp": 0.01025269, "balance_loss_clip": 1.01351452, "balance_loss_mlp": 1.03506947, "epoch": 0.9162182474071847, "flos": 18438154502400.0, "grad_norm": 3.8211627349029738, "language_loss": 0.74629366, "learning_rate": 7.313193316030464e-08, "loss": 0.76759148, "num_input_tokens_seen": 328780320, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6953125, "step": 15239, "time_per_iteration": 2.4487733840942383 }, { "auxiliary_loss_clip": 0.01105289, "auxiliary_loss_mlp": 0.01030729, "balance_loss_clip": 1.01865315, "balance_loss_mlp": 1.03510153, "epoch": 0.9162783706598527, "flos": 19167248764800.0, "grad_norm": 2.1831973910768543, "language_loss": 0.63570917, "learning_rate": 7.302761513697819e-08, "loss": 0.65706933, "num_input_tokens_seen": 328797570, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.703125, "step": 15240, "time_per_iteration": 2.430314540863037 }, { "auxiliary_loss_clip": 0.011042, "auxiliary_loss_mlp": 0.01024763, "balance_loss_clip": 1.01290119, "balance_loss_mlp": 1.0367012, "epoch": 0.9163384939125206, "flos": 20412990299520.0, "grad_norm": 4.189083475877129, "language_loss": 0.76509392, "learning_rate": 7.292337018463746e-08, "loss": 0.78638357, "num_input_tokens_seen": 328814075, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.67578125, "step": 15241, "time_per_iteration": 2.4631736278533936 }, { "auxiliary_loss_clip": 0.0111153, "auxiliary_loss_mlp": 0.01031317, "balance_loss_clip": 1.01726258, "balance_loss_mlp": 1.0362494, "epoch": 0.9163986171651887, "flos": 19645902426240.0, "grad_norm": 3.0177261143484126, "language_loss": 0.675861, "learning_rate": 7.281919830723549e-08, "loss": 0.69728947, "num_input_tokens_seen": 328831990, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.75390625, "step": 15242, "time_per_iteration": 2.423043966293335 }, { "auxiliary_loss_clip": 0.0110339, "auxiliary_loss_mlp": 0.010297, "balance_loss_clip": 1.01711726, "balance_loss_mlp": 1.03441107, "epoch": 0.9164587404178566, "flos": 12823054865280.0, "grad_norm": 2.3622552393327774, "language_loss": 0.80909246, "learning_rate": 7.271509950872334e-08, "loss": 0.83042336, "num_input_tokens_seen": 328849105, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 15243, "time_per_iteration": 2.4389779567718506 }, { "auxiliary_loss_clip": 0.01104066, "auxiliary_loss_mlp": 0.0102646, "balance_loss_clip": 1.01463425, "balance_loss_mlp": 1.03373384, "epoch": 0.9165188636705246, "flos": 22309396750080.0, "grad_norm": 2.164232124333268, "language_loss": 0.81976557, "learning_rate": 7.261107379304721e-08, "loss": 0.84107089, "num_input_tokens_seen": 328866810, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.703125, "step": 15244, "time_per_iteration": 2.4405055046081543 }, { "auxiliary_loss_clip": 0.01107099, "auxiliary_loss_mlp": 0.01033744, "balance_loss_clip": 1.02081025, "balance_loss_mlp": 1.03552926, "epoch": 0.9165789869231925, "flos": 18223337214720.0, "grad_norm": 3.4338199805477956, "language_loss": 0.72607714, "learning_rate": 7.250712116415214e-08, "loss": 0.74748558, "num_input_tokens_seen": 328885325, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 15245, "time_per_iteration": 2.4355409145355225 }, { "auxiliary_loss_clip": 0.01102475, "auxiliary_loss_mlp": 0.01031941, "balance_loss_clip": 1.02040195, "balance_loss_mlp": 1.03420687, "epoch": 0.9166391101758605, "flos": 13691553811200.0, "grad_norm": 2.0473916717210034, "language_loss": 0.74581468, "learning_rate": 7.240324162598033e-08, "loss": 0.76715875, "num_input_tokens_seen": 328902655, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.68359375, "step": 15246, "time_per_iteration": 3.8995461463928223 }, { "auxiliary_loss_clip": 0.01103938, "auxiliary_loss_mlp": 0.01031647, "balance_loss_clip": 1.01890361, "balance_loss_mlp": 1.0356319, "epoch": 0.9166992334285284, "flos": 17346793622400.0, "grad_norm": 2.148885099251566, "language_loss": 0.7550199, "learning_rate": 7.229943518247106e-08, "loss": 0.77637571, "num_input_tokens_seen": 328918440, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 15247, "time_per_iteration": 2.4579813480377197 }, { "auxiliary_loss_clip": 0.01108067, "auxiliary_loss_mlp": 0.01028604, "balance_loss_clip": 1.01626027, "balance_loss_mlp": 1.03792608, "epoch": 0.9167593566811965, "flos": 23731135948800.0, "grad_norm": 1.7604171173288374, "language_loss": 0.75997853, "learning_rate": 7.219570183756052e-08, "loss": 0.78134525, "num_input_tokens_seen": 328938055, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 15248, "time_per_iteration": 2.4612314701080322 }, { "auxiliary_loss_clip": 0.01105378, "auxiliary_loss_mlp": 0.01034646, "balance_loss_clip": 1.02119327, "balance_loss_mlp": 1.03548861, "epoch": 0.9168194799338644, "flos": 27818201064960.0, "grad_norm": 2.201400731959478, "language_loss": 0.72770661, "learning_rate": 7.209204159518178e-08, "loss": 0.74910682, "num_input_tokens_seen": 328957895, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.69921875, "step": 15249, "time_per_iteration": 2.507277250289917 }, { "auxiliary_loss_clip": 0.01106562, "auxiliary_loss_mlp": 0.01028885, "balance_loss_clip": 1.01587367, "balance_loss_mlp": 1.03725994, "epoch": 0.9168796031865324, "flos": 21717552355200.0, "grad_norm": 2.056572343381768, "language_loss": 0.75897086, "learning_rate": 7.198845445926616e-08, "loss": 0.78032535, "num_input_tokens_seen": 328971365, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69140625, "step": 15250, "time_per_iteration": 3.8187661170959473 }, { "auxiliary_loss_clip": 0.01102892, "auxiliary_loss_mlp": 0.01031115, "balance_loss_clip": 1.01879454, "balance_loss_mlp": 1.03507578, "epoch": 0.9169397264392004, "flos": 23404420817280.0, "grad_norm": 1.8360535264852187, "language_loss": 0.75952506, "learning_rate": 7.188494043374138e-08, "loss": 0.78086507, "num_input_tokens_seen": 328990830, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.67578125, "step": 15251, "time_per_iteration": 2.4571926593780518 }, { "auxiliary_loss_clip": 0.0110816, "auxiliary_loss_mlp": 0.01029926, "balance_loss_clip": 1.01613307, "balance_loss_mlp": 1.03792226, "epoch": 0.9169998496918683, "flos": 23950981140480.0, "grad_norm": 2.335448030908928, "language_loss": 0.79936528, "learning_rate": 7.178149952253298e-08, "loss": 0.82074606, "num_input_tokens_seen": 329008345, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.703125, "step": 15252, "time_per_iteration": 2.4895410537719727 }, { "auxiliary_loss_clip": 0.01104123, "auxiliary_loss_mlp": 0.0103064, "balance_loss_clip": 1.0185051, "balance_loss_mlp": 1.0348562, "epoch": 0.9170599729445363, "flos": 18332469711360.0, "grad_norm": 1.9513432114481433, "language_loss": 0.77144498, "learning_rate": 7.167813172956316e-08, "loss": 0.79279256, "num_input_tokens_seen": 329027440, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 15253, "time_per_iteration": 3.804204225540161 }, { "auxiliary_loss_clip": 0.01106392, "auxiliary_loss_mlp": 0.01026968, "balance_loss_clip": 1.01522017, "balance_loss_mlp": 1.03709292, "epoch": 0.9171200961972042, "flos": 22674859678080.0, "grad_norm": 1.9750581647456265, "language_loss": 0.73050582, "learning_rate": 7.157483705875256e-08, "loss": 0.7518394, "num_input_tokens_seen": 329046445, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6953125, "step": 15254, "time_per_iteration": 3.900503158569336 }, { "auxiliary_loss_clip": 0.01101151, "auxiliary_loss_mlp": 0.01024639, "balance_loss_clip": 1.01311159, "balance_loss_mlp": 1.0347569, "epoch": 0.9171802194498723, "flos": 26719298328960.0, "grad_norm": 1.5869744962624381, "language_loss": 0.7933712, "learning_rate": 7.14716155140167e-08, "loss": 0.81462908, "num_input_tokens_seen": 329065555, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 15255, "time_per_iteration": 2.515338897705078 }, { "auxiliary_loss_clip": 0.01105253, "auxiliary_loss_mlp": 0.01032127, "balance_loss_clip": 1.01975322, "balance_loss_mlp": 1.03452015, "epoch": 0.9172403427025402, "flos": 37889240538240.0, "grad_norm": 2.712102283569314, "language_loss": 0.68839025, "learning_rate": 7.136846709927047e-08, "loss": 0.70976412, "num_input_tokens_seen": 329087515, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 15256, "time_per_iteration": 2.6001956462860107 }, { "auxiliary_loss_clip": 0.01104358, "auxiliary_loss_mlp": 0.0103166, "balance_loss_clip": 1.01982808, "balance_loss_mlp": 1.03632808, "epoch": 0.9173004659552082, "flos": 17055163100160.0, "grad_norm": 1.6380840957504554, "language_loss": 0.83925623, "learning_rate": 7.126539181842561e-08, "loss": 0.86061639, "num_input_tokens_seen": 329106820, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 15257, "time_per_iteration": 2.50575590133667 }, { "auxiliary_loss_clip": 0.01101657, "auxiliary_loss_mlp": 0.01031618, "balance_loss_clip": 1.02027535, "balance_loss_mlp": 1.03486204, "epoch": 0.9173605892078761, "flos": 22201593056640.0, "grad_norm": 1.6377682928683015, "language_loss": 0.77545977, "learning_rate": 7.116238967539012e-08, "loss": 0.79679251, "num_input_tokens_seen": 329126515, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66796875, "step": 15258, "time_per_iteration": 2.459995985031128 }, { "auxiliary_loss_clip": 0.01105744, "auxiliary_loss_mlp": 0.01032871, "balance_loss_clip": 1.02083135, "balance_loss_mlp": 1.03777289, "epoch": 0.9174207124605441, "flos": 16507776764160.0, "grad_norm": 1.8504672291408752, "language_loss": 0.78794932, "learning_rate": 7.105946067406999e-08, "loss": 0.80933547, "num_input_tokens_seen": 329142660, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 15259, "time_per_iteration": 2.4244954586029053 }, { "auxiliary_loss_clip": 0.01102195, "auxiliary_loss_mlp": 0.01031659, "balance_loss_clip": 1.02003646, "balance_loss_mlp": 1.03471613, "epoch": 0.917480835713212, "flos": 24535606901760.0, "grad_norm": 1.566368623973325, "language_loss": 0.76413852, "learning_rate": 7.095660481836895e-08, "loss": 0.78547704, "num_input_tokens_seen": 329162575, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 15260, "time_per_iteration": 2.4638187885284424 }, { "auxiliary_loss_clip": 0.01101953, "auxiliary_loss_mlp": 0.01029725, "balance_loss_clip": 1.01780438, "balance_loss_mlp": 1.03364849, "epoch": 0.9175409589658801, "flos": 20880726226560.0, "grad_norm": 1.9908649280549007, "language_loss": 0.60940522, "learning_rate": 7.085382211218637e-08, "loss": 0.63072205, "num_input_tokens_seen": 329182090, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.68359375, "step": 15261, "time_per_iteration": 2.48588490486145 }, { "auxiliary_loss_clip": 0.0110149, "auxiliary_loss_mlp": 0.01027607, "balance_loss_clip": 1.01559687, "balance_loss_mlp": 1.03376162, "epoch": 0.917601082218548, "flos": 14276035918080.0, "grad_norm": 2.082329326339225, "language_loss": 0.73786867, "learning_rate": 7.075111255942002e-08, "loss": 0.75915968, "num_input_tokens_seen": 329196535, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 15262, "time_per_iteration": 2.400402545928955 }, { "auxiliary_loss_clip": 0.01105431, "auxiliary_loss_mlp": 0.01034172, "balance_loss_clip": 1.02204204, "balance_loss_mlp": 1.03344381, "epoch": 0.917661205471216, "flos": 19099234362240.0, "grad_norm": 1.9929107826227002, "language_loss": 0.78005797, "learning_rate": 7.064847616396496e-08, "loss": 0.80145395, "num_input_tokens_seen": 329215135, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.71875, "step": 15263, "time_per_iteration": 2.4421963691711426 }, { "auxiliary_loss_clip": 0.01106867, "auxiliary_loss_mlp": 0.01030037, "balance_loss_clip": 1.01782453, "balance_loss_mlp": 1.03523946, "epoch": 0.917721328723884, "flos": 21106568989440.0, "grad_norm": 2.4088023509769028, "language_loss": 0.75774646, "learning_rate": 7.054591292971324e-08, "loss": 0.7791155, "num_input_tokens_seen": 329235150, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71875, "step": 15264, "time_per_iteration": 2.450918197631836 }, { "auxiliary_loss_clip": 0.01104826, "auxiliary_loss_mlp": 0.01034393, "balance_loss_clip": 1.02293706, "balance_loss_mlp": 1.0357244, "epoch": 0.9177814519765519, "flos": 21943215550080.0, "grad_norm": 1.6372745174369168, "language_loss": 0.82960016, "learning_rate": 7.044342286055394e-08, "loss": 0.85099232, "num_input_tokens_seen": 329254365, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.69140625, "step": 15265, "time_per_iteration": 2.4571197032928467 }, { "auxiliary_loss_clip": 0.01109784, "auxiliary_loss_mlp": 0.0104006, "balance_loss_clip": 1.02670217, "balance_loss_mlp": 1.03671634, "epoch": 0.9178415752292199, "flos": 24205982768640.0, "grad_norm": 1.6704130890996527, "language_loss": 0.73435736, "learning_rate": 7.034100596037306e-08, "loss": 0.7558558, "num_input_tokens_seen": 329274385, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 15266, "time_per_iteration": 2.46632719039917 }, { "auxiliary_loss_clip": 0.01103557, "auxiliary_loss_mlp": 0.01029685, "balance_loss_clip": 1.01804435, "balance_loss_mlp": 1.03443885, "epoch": 0.9179016984818879, "flos": 20042068504320.0, "grad_norm": 1.8281022956141353, "language_loss": 0.77761042, "learning_rate": 7.023866223305486e-08, "loss": 0.79894286, "num_input_tokens_seen": 329292160, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.69140625, "step": 15267, "time_per_iteration": 2.4425723552703857 }, { "auxiliary_loss_clip": 0.01028322, "auxiliary_loss_mlp": 0.0100115, "balance_loss_clip": 1.00011337, "balance_loss_mlp": 1.00600767, "epoch": 0.9179618217345559, "flos": 65555901100800.0, "grad_norm": 0.7682475026296551, "language_loss": 0.56223881, "learning_rate": 7.013639168247975e-08, "loss": 0.5825336, "num_input_tokens_seen": 329351870, "router_z_loss_clip": 0.01037598, "router_z_loss_mlp": 0.22363281, "step": 15268, "time_per_iteration": 3.1065659523010254 }, { "auxiliary_loss_clip": 0.01107111, "auxiliary_loss_mlp": 0.01029966, "balance_loss_clip": 1.01747918, "balance_loss_mlp": 1.0362606, "epoch": 0.9180219449872238, "flos": 21324618501120.0, "grad_norm": 2.2773299233219935, "language_loss": 0.76642406, "learning_rate": 7.0034194312526e-08, "loss": 0.78779483, "num_input_tokens_seen": 329370930, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 15269, "time_per_iteration": 2.4721696376800537 }, { "auxiliary_loss_clip": 0.01102934, "auxiliary_loss_mlp": 0.01031765, "balance_loss_clip": 1.01938486, "balance_loss_mlp": 1.0342257, "epoch": 0.9180820682398918, "flos": 41060008684800.0, "grad_norm": 1.9655758780881745, "language_loss": 0.72877407, "learning_rate": 6.993207012706936e-08, "loss": 0.75012112, "num_input_tokens_seen": 329391275, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 15270, "time_per_iteration": 2.630018711090088 }, { "auxiliary_loss_clip": 0.01100908, "auxiliary_loss_mlp": 0.01030225, "balance_loss_clip": 1.01777971, "balance_loss_mlp": 1.0331912, "epoch": 0.9181421914925597, "flos": 28072915384320.0, "grad_norm": 1.5681711002451086, "language_loss": 0.79817069, "learning_rate": 6.98300191299821e-08, "loss": 0.81948203, "num_input_tokens_seen": 329412775, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.67578125, "step": 15271, "time_per_iteration": 2.525144577026367 }, { "auxiliary_loss_clip": 0.01103512, "auxiliary_loss_mlp": 0.01032634, "balance_loss_clip": 1.02023673, "balance_loss_mlp": 1.03393233, "epoch": 0.9182023147452277, "flos": 29169411909120.0, "grad_norm": 2.2069631758969863, "language_loss": 0.72792315, "learning_rate": 6.972804132513355e-08, "loss": 0.74928463, "num_input_tokens_seen": 329432440, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 15272, "time_per_iteration": 2.514082431793213 }, { "auxiliary_loss_clip": 0.01103619, "auxiliary_loss_mlp": 0.01030278, "balance_loss_clip": 1.01894689, "balance_loss_mlp": 1.03490257, "epoch": 0.9182624379978956, "flos": 24060831909120.0, "grad_norm": 2.0193386675474723, "language_loss": 0.72650564, "learning_rate": 6.962613671639105e-08, "loss": 0.74784458, "num_input_tokens_seen": 329450605, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6875, "step": 15273, "time_per_iteration": 2.479964017868042 }, { "auxiliary_loss_clip": 0.01096773, "auxiliary_loss_mlp": 0.01026776, "balance_loss_clip": 1.01583898, "balance_loss_mlp": 1.03225708, "epoch": 0.9183225612505637, "flos": 23293528554240.0, "grad_norm": 1.6398169067263961, "language_loss": 0.74035048, "learning_rate": 6.952430530761933e-08, "loss": 0.76158595, "num_input_tokens_seen": 329470550, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.64453125, "step": 15274, "time_per_iteration": 2.466515302658081 }, { "auxiliary_loss_clip": 0.01103595, "auxiliary_loss_mlp": 0.01039612, "balance_loss_clip": 1.0278939, "balance_loss_mlp": 1.03353953, "epoch": 0.9183826845032316, "flos": 19609237618560.0, "grad_norm": 1.6814005460671153, "language_loss": 0.68786621, "learning_rate": 6.942254710267902e-08, "loss": 0.70929825, "num_input_tokens_seen": 329489765, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.703125, "step": 15275, "time_per_iteration": 2.4645586013793945 }, { "auxiliary_loss_clip": 0.01102379, "auxiliary_loss_mlp": 0.01031211, "balance_loss_clip": 1.01921844, "balance_loss_mlp": 1.03432357, "epoch": 0.9184428077558996, "flos": 18479057114880.0, "grad_norm": 2.023450042118166, "language_loss": 0.72721016, "learning_rate": 6.932086210542953e-08, "loss": 0.748546, "num_input_tokens_seen": 329507040, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 15276, "time_per_iteration": 2.4490084648132324 }, { "auxiliary_loss_clip": 0.01105757, "auxiliary_loss_mlp": 0.01029852, "balance_loss_clip": 1.01822948, "balance_loss_mlp": 1.03716862, "epoch": 0.9185029310085676, "flos": 20741034234240.0, "grad_norm": 3.4684368372019305, "language_loss": 0.73187709, "learning_rate": 6.921925031972642e-08, "loss": 0.75323319, "num_input_tokens_seen": 329525540, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6875, "step": 15277, "time_per_iteration": 2.4822094440460205 }, { "auxiliary_loss_clip": 0.01028219, "auxiliary_loss_mlp": 0.01001699, "balance_loss_clip": 1.00069797, "balance_loss_mlp": 1.00583005, "epoch": 0.9185630542612355, "flos": 68209231875840.0, "grad_norm": 0.7231892255774365, "language_loss": 0.59207398, "learning_rate": 6.91177117494226e-08, "loss": 0.61237317, "num_input_tokens_seen": 329592905, "router_z_loss_clip": 0.01000977, "router_z_loss_mlp": 0.22460938, "step": 15278, "time_per_iteration": 3.1577515602111816 }, { "auxiliary_loss_clip": 0.01098212, "auxiliary_loss_mlp": 0.01025825, "balance_loss_clip": 1.01477456, "balance_loss_mlp": 1.03147912, "epoch": 0.9186231775139035, "flos": 12239470598400.0, "grad_norm": 1.8974055045147338, "language_loss": 0.64356768, "learning_rate": 6.901624639836879e-08, "loss": 0.66480803, "num_input_tokens_seen": 329610150, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.66796875, "step": 15279, "time_per_iteration": 2.431821823120117 }, { "auxiliary_loss_clip": 0.01028705, "auxiliary_loss_mlp": 0.00999355, "balance_loss_clip": 0.99839514, "balance_loss_mlp": 1.00627327, "epoch": 0.9186833007665715, "flos": 63939237770880.0, "grad_norm": 0.8550828863192297, "language_loss": 0.60193622, "learning_rate": 6.891485427041211e-08, "loss": 0.62221682, "num_input_tokens_seen": 329673650, "router_z_loss_clip": 0.00958252, "router_z_loss_mlp": 0.22460938, "step": 15280, "time_per_iteration": 3.0748493671417236 }, { "auxiliary_loss_clip": 0.01105392, "auxiliary_loss_mlp": 0.0103467, "balance_loss_clip": 1.02203953, "balance_loss_mlp": 1.03525937, "epoch": 0.9187434240192395, "flos": 19974700546560.0, "grad_norm": 1.6862967384629997, "language_loss": 0.6973263, "learning_rate": 6.881353536939815e-08, "loss": 0.71872687, "num_input_tokens_seen": 329692520, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 15281, "time_per_iteration": 2.4245047569274902 }, { "auxiliary_loss_clip": 0.01104905, "auxiliary_loss_mlp": 0.01029521, "balance_loss_clip": 1.01619923, "balance_loss_mlp": 1.03425336, "epoch": 0.9188035472719074, "flos": 25227820874880.0, "grad_norm": 3.9174672386933516, "language_loss": 0.84638077, "learning_rate": 6.871228969916831e-08, "loss": 0.86772501, "num_input_tokens_seen": 329713750, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 15282, "time_per_iteration": 2.496040105819702 }, { "auxiliary_loss_clip": 0.01102759, "auxiliary_loss_mlp": 0.01032456, "balance_loss_clip": 1.01970077, "balance_loss_mlp": 1.034881, "epoch": 0.9188636705245754, "flos": 18405547931520.0, "grad_norm": 1.876568237004075, "language_loss": 0.60512364, "learning_rate": 6.861111726356194e-08, "loss": 0.62647581, "num_input_tokens_seen": 329730960, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6796875, "step": 15283, "time_per_iteration": 2.4146740436553955 }, { "auxiliary_loss_clip": 0.01108505, "auxiliary_loss_mlp": 0.01031867, "balance_loss_clip": 1.01936173, "balance_loss_mlp": 1.0359813, "epoch": 0.9189237937772433, "flos": 23769129559680.0, "grad_norm": 1.6474008790723427, "language_loss": 0.65774536, "learning_rate": 6.851001806641554e-08, "loss": 0.67914903, "num_input_tokens_seen": 329750975, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 15284, "time_per_iteration": 2.4706039428710938 }, { "auxiliary_loss_clip": 0.01101782, "auxiliary_loss_mlp": 0.01031652, "balance_loss_clip": 1.01911104, "balance_loss_mlp": 1.03352249, "epoch": 0.9189839170299113, "flos": 21214624078080.0, "grad_norm": 1.7613035045794445, "language_loss": 0.73869109, "learning_rate": 6.840899211156292e-08, "loss": 0.76002538, "num_input_tokens_seen": 329769645, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.68359375, "step": 15285, "time_per_iteration": 2.4618477821350098 }, { "auxiliary_loss_clip": 0.01102373, "auxiliary_loss_mlp": 0.01030347, "balance_loss_clip": 1.01817596, "balance_loss_mlp": 1.03458714, "epoch": 0.9190440402825792, "flos": 16727370560640.0, "grad_norm": 1.9290674877627059, "language_loss": 0.71907216, "learning_rate": 6.830803940283458e-08, "loss": 0.74039936, "num_input_tokens_seen": 329788185, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6796875, "step": 15286, "time_per_iteration": 2.421664237976074 }, { "auxiliary_loss_clip": 0.01105152, "auxiliary_loss_mlp": 0.01030377, "balance_loss_clip": 1.01778841, "balance_loss_mlp": 1.03563046, "epoch": 0.9191041635352473, "flos": 23441193365760.0, "grad_norm": 2.8166219069989054, "language_loss": 0.73291415, "learning_rate": 6.820715994405945e-08, "loss": 0.75426942, "num_input_tokens_seen": 329806780, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 15287, "time_per_iteration": 2.4818007946014404 }, { "auxiliary_loss_clip": 0.01106648, "auxiliary_loss_mlp": 0.01029447, "balance_loss_clip": 1.01586354, "balance_loss_mlp": 1.03688586, "epoch": 0.9191642867879152, "flos": 18807532012800.0, "grad_norm": 2.0526344478020873, "language_loss": 0.65679908, "learning_rate": 6.810635373906226e-08, "loss": 0.67816007, "num_input_tokens_seen": 329826350, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.6953125, "step": 15288, "time_per_iteration": 3.762131452560425 }, { "auxiliary_loss_clip": 0.01108102, "auxiliary_loss_mlp": 0.01034176, "balance_loss_clip": 1.02174246, "balance_loss_mlp": 1.03891861, "epoch": 0.9192244100405832, "flos": 32160950167680.0, "grad_norm": 4.565049515354641, "language_loss": 0.71424872, "learning_rate": 6.800562079166549e-08, "loss": 0.73567152, "num_input_tokens_seen": 329846160, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 15289, "time_per_iteration": 2.5377109050750732 }, { "auxiliary_loss_clip": 0.01105973, "auxiliary_loss_mlp": 0.01030661, "balance_loss_clip": 1.01838219, "balance_loss_mlp": 1.03612781, "epoch": 0.9192845332932512, "flos": 16357669827840.0, "grad_norm": 2.1702419426476496, "language_loss": 0.74381781, "learning_rate": 6.790496110568921e-08, "loss": 0.7651841, "num_input_tokens_seen": 329862020, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 15290, "time_per_iteration": 2.4264137744903564 }, { "auxiliary_loss_clip": 0.01102379, "auxiliary_loss_mlp": 0.01029308, "balance_loss_clip": 1.01763701, "balance_loss_mlp": 1.03537703, "epoch": 0.9193446565459191, "flos": 26614475464320.0, "grad_norm": 2.03806929825342, "language_loss": 0.72185796, "learning_rate": 6.78043746849506e-08, "loss": 0.74317491, "num_input_tokens_seen": 329880185, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 15291, "time_per_iteration": 2.4912126064300537 }, { "auxiliary_loss_clip": 0.01103407, "auxiliary_loss_mlp": 0.01027575, "balance_loss_clip": 1.01576161, "balance_loss_mlp": 1.03569996, "epoch": 0.9194047797985871, "flos": 22492182084480.0, "grad_norm": 1.7425262548571059, "language_loss": 0.71070802, "learning_rate": 6.770386153326346e-08, "loss": 0.73201787, "num_input_tokens_seen": 329900255, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.67578125, "step": 15292, "time_per_iteration": 3.888129234313965 }, { "auxiliary_loss_clip": 0.01104458, "auxiliary_loss_mlp": 0.01028964, "balance_loss_clip": 1.01639318, "balance_loss_mlp": 1.03500867, "epoch": 0.9194649030512551, "flos": 25078791346560.0, "grad_norm": 2.144377949899426, "language_loss": 0.73413825, "learning_rate": 6.760342165443988e-08, "loss": 0.75547254, "num_input_tokens_seen": 329919095, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 15293, "time_per_iteration": 2.4687469005584717 }, { "auxiliary_loss_clip": 0.0110267, "auxiliary_loss_mlp": 0.01028975, "balance_loss_clip": 1.0168097, "balance_loss_mlp": 1.03490531, "epoch": 0.9195250263039231, "flos": 11911139354880.0, "grad_norm": 2.0573086289419806, "language_loss": 0.77886963, "learning_rate": 6.750305505228837e-08, "loss": 0.8001861, "num_input_tokens_seen": 329936505, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6796875, "step": 15294, "time_per_iteration": 2.4374947547912598 }, { "auxiliary_loss_clip": 0.01107592, "auxiliary_loss_mlp": 0.01031419, "balance_loss_clip": 1.01803732, "balance_loss_mlp": 1.03540683, "epoch": 0.919585149556591, "flos": 21834154880640.0, "grad_norm": 1.5935014671826475, "language_loss": 0.77379519, "learning_rate": 6.74027617306141e-08, "loss": 0.79518527, "num_input_tokens_seen": 329956795, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.72265625, "step": 15295, "time_per_iteration": 3.907710313796997 }, { "auxiliary_loss_clip": 0.01101828, "auxiliary_loss_mlp": 0.01025942, "balance_loss_clip": 1.01522517, "balance_loss_mlp": 1.03582692, "epoch": 0.919645272809259, "flos": 28184059042560.0, "grad_norm": 1.9665953698395404, "language_loss": 0.71295846, "learning_rate": 6.730254169322114e-08, "loss": 0.73423612, "num_input_tokens_seen": 329977195, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.66015625, "step": 15296, "time_per_iteration": 3.9500958919525146 }, { "auxiliary_loss_clip": 0.01103338, "auxiliary_loss_mlp": 0.01036809, "balance_loss_clip": 1.02502525, "balance_loss_mlp": 1.03493786, "epoch": 0.9197053960619269, "flos": 18332828847360.0, "grad_norm": 1.9745993429337891, "language_loss": 0.75360012, "learning_rate": 6.720239494390912e-08, "loss": 0.77500159, "num_input_tokens_seen": 329992095, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.68359375, "step": 15297, "time_per_iteration": 2.41904616355896 }, { "auxiliary_loss_clip": 0.01103209, "auxiliary_loss_mlp": 0.01027757, "balance_loss_clip": 1.01547813, "balance_loss_mlp": 1.03526843, "epoch": 0.9197655193145949, "flos": 28183448511360.0, "grad_norm": 1.8112583725852107, "language_loss": 0.73616481, "learning_rate": 6.710232148647676e-08, "loss": 0.75747442, "num_input_tokens_seen": 330011490, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6796875, "step": 15298, "time_per_iteration": 2.531247854232788 }, { "auxiliary_loss_clip": 0.01105319, "auxiliary_loss_mlp": 0.01033459, "balance_loss_clip": 1.02122259, "balance_loss_mlp": 1.03588867, "epoch": 0.9198256425672628, "flos": 17306321973120.0, "grad_norm": 2.1729609327003154, "language_loss": 0.7919575, "learning_rate": 6.70023213247175e-08, "loss": 0.81334525, "num_input_tokens_seen": 330027885, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 15299, "time_per_iteration": 2.4116809368133545 }, { "auxiliary_loss_clip": 0.01103704, "auxiliary_loss_mlp": 0.01027546, "balance_loss_clip": 1.01645398, "balance_loss_mlp": 1.0356648, "epoch": 0.9198857658199309, "flos": 17858520731520.0, "grad_norm": 2.104764964350845, "language_loss": 0.63970244, "learning_rate": 6.690239446242385e-08, "loss": 0.66101497, "num_input_tokens_seen": 330046230, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.6796875, "step": 15300, "time_per_iteration": 2.424867868423462 }, { "auxiliary_loss_clip": 0.01098127, "auxiliary_loss_mlp": 0.01029113, "balance_loss_clip": 1.01863408, "balance_loss_mlp": 1.03355265, "epoch": 0.9199458890725988, "flos": 22127545169280.0, "grad_norm": 2.102330650626103, "language_loss": 0.69526124, "learning_rate": 6.680254090338545e-08, "loss": 0.71653366, "num_input_tokens_seen": 330065535, "router_z_loss_clip": 0.10498047, "router_z_loss_mlp": 0.64453125, "step": 15301, "time_per_iteration": 2.458543539047241 }, { "auxiliary_loss_clip": 0.0110641, "auxiliary_loss_mlp": 0.01033565, "balance_loss_clip": 1.02041674, "balance_loss_mlp": 1.03573287, "epoch": 0.9200060123252668, "flos": 16034043265920.0, "grad_norm": 1.9437122481651967, "language_loss": 0.70861399, "learning_rate": 6.670276065138814e-08, "loss": 0.73001373, "num_input_tokens_seen": 330082920, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 15302, "time_per_iteration": 2.4243052005767822 }, { "auxiliary_loss_clip": 0.0110469, "auxiliary_loss_mlp": 0.01031737, "balance_loss_clip": 1.02006614, "balance_loss_mlp": 1.0353961, "epoch": 0.9200661355779348, "flos": 26864521015680.0, "grad_norm": 1.7634696100611538, "language_loss": 0.76350915, "learning_rate": 6.660305371021579e-08, "loss": 0.78487337, "num_input_tokens_seen": 330101165, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6953125, "step": 15303, "time_per_iteration": 2.501826286315918 }, { "auxiliary_loss_clip": 0.01106097, "auxiliary_loss_mlp": 0.01028982, "balance_loss_clip": 1.01704299, "balance_loss_mlp": 1.03706455, "epoch": 0.9201262588306027, "flos": 12786749193600.0, "grad_norm": 2.23846231297316, "language_loss": 0.87969559, "learning_rate": 6.650342008365006e-08, "loss": 0.9010464, "num_input_tokens_seen": 330118775, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69140625, "step": 15304, "time_per_iteration": 2.4222960472106934 }, { "auxiliary_loss_clip": 0.0110862, "auxiliary_loss_mlp": 0.01036147, "balance_loss_clip": 1.02169323, "balance_loss_mlp": 1.03696561, "epoch": 0.9201863820832707, "flos": 20631614428800.0, "grad_norm": 2.17866311145225, "language_loss": 0.7750448, "learning_rate": 6.64038597754677e-08, "loss": 0.79649246, "num_input_tokens_seen": 330135570, "router_z_loss_clip": 0.14453125, "router_z_loss_mlp": 0.71484375, "step": 15305, "time_per_iteration": 2.45361328125 }, { "auxiliary_loss_clip": 0.01103356, "auxiliary_loss_mlp": 0.010363, "balance_loss_clip": 1.02374172, "balance_loss_mlp": 1.03420353, "epoch": 0.9202465053359387, "flos": 26395815421440.0, "grad_norm": 4.70625640526602, "language_loss": 0.81505376, "learning_rate": 6.630437278944501e-08, "loss": 0.83645034, "num_input_tokens_seen": 330152840, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 15306, "time_per_iteration": 2.4801902770996094 }, { "auxiliary_loss_clip": 0.01101119, "auxiliary_loss_mlp": 0.01033237, "balance_loss_clip": 1.02215672, "balance_loss_mlp": 1.03458929, "epoch": 0.9203066285886067, "flos": 10488179093760.0, "grad_norm": 2.292287149640285, "language_loss": 0.72476184, "learning_rate": 6.62049591293541e-08, "loss": 0.74610543, "num_input_tokens_seen": 330168605, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.6640625, "step": 15307, "time_per_iteration": 2.4305977821350098 }, { "auxiliary_loss_clip": 0.01106034, "auxiliary_loss_mlp": 0.01030595, "balance_loss_clip": 1.01801229, "balance_loss_mlp": 1.03582144, "epoch": 0.9203667518412746, "flos": 19390721230080.0, "grad_norm": 5.137370315904447, "language_loss": 0.78603089, "learning_rate": 6.610561879896526e-08, "loss": 0.80739713, "num_input_tokens_seen": 330186160, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 15308, "time_per_iteration": 2.4324097633361816 }, { "auxiliary_loss_clip": 0.01101988, "auxiliary_loss_mlp": 0.01033377, "balance_loss_clip": 1.02093184, "balance_loss_mlp": 1.03370011, "epoch": 0.9204268750939426, "flos": 15924982596480.0, "grad_norm": 3.950425538603585, "language_loss": 0.78153825, "learning_rate": 6.600635180204484e-08, "loss": 0.80289185, "num_input_tokens_seen": 330201780, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.68359375, "step": 15309, "time_per_iteration": 2.4438607692718506 }, { "auxiliary_loss_clip": 0.01104036, "auxiliary_loss_mlp": 0.01028751, "balance_loss_clip": 1.01615095, "balance_loss_mlp": 1.03527212, "epoch": 0.9204869983466105, "flos": 16471758401280.0, "grad_norm": 2.6980644616031184, "language_loss": 0.66354728, "learning_rate": 6.590715814235781e-08, "loss": 0.68487513, "num_input_tokens_seen": 330219165, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 15310, "time_per_iteration": 2.4391520023345947 }, { "auxiliary_loss_clip": 0.01104281, "auxiliary_loss_mlp": 0.0103076, "balance_loss_clip": 1.01843393, "balance_loss_mlp": 1.03449631, "epoch": 0.9205471215992785, "flos": 21539220307200.0, "grad_norm": 1.7974134430965096, "language_loss": 0.66200083, "learning_rate": 6.580803782366495e-08, "loss": 0.68335128, "num_input_tokens_seen": 330238975, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 15311, "time_per_iteration": 2.451589584350586 }, { "auxiliary_loss_clip": 0.01103899, "auxiliary_loss_mlp": 0.01032798, "balance_loss_clip": 1.02099061, "balance_loss_mlp": 1.0346446, "epoch": 0.9206072448519464, "flos": 25005892694400.0, "grad_norm": 2.211913371589611, "language_loss": 0.76195717, "learning_rate": 6.570899084972503e-08, "loss": 0.78332406, "num_input_tokens_seen": 330259755, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 15312, "time_per_iteration": 2.4806840419769287 }, { "auxiliary_loss_clip": 0.01101949, "auxiliary_loss_mlp": 0.01032325, "balance_loss_clip": 1.02091694, "balance_loss_mlp": 1.03537726, "epoch": 0.9206673681046145, "flos": 20522661500160.0, "grad_norm": 2.2651683284826354, "language_loss": 0.79101706, "learning_rate": 6.561001722429394e-08, "loss": 0.81235981, "num_input_tokens_seen": 330277660, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6640625, "step": 15313, "time_per_iteration": 2.444669246673584 }, { "auxiliary_loss_clip": 0.01107028, "auxiliary_loss_mlp": 0.01033292, "balance_loss_clip": 1.02134085, "balance_loss_mlp": 1.03672945, "epoch": 0.9207274913572824, "flos": 20883455660160.0, "grad_norm": 1.8647780691309024, "language_loss": 0.78148687, "learning_rate": 6.55111169511251e-08, "loss": 0.80289006, "num_input_tokens_seen": 330295455, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.703125, "step": 15314, "time_per_iteration": 2.451388120651245 }, { "auxiliary_loss_clip": 0.01108958, "auxiliary_loss_mlp": 0.01035355, "balance_loss_clip": 1.02195001, "balance_loss_mlp": 1.03617525, "epoch": 0.9207876146099504, "flos": 22708256348160.0, "grad_norm": 1.9636539420255275, "language_loss": 0.79267514, "learning_rate": 6.541229003396864e-08, "loss": 0.81411827, "num_input_tokens_seen": 330315310, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 15315, "time_per_iteration": 2.4870500564575195 }, { "auxiliary_loss_clip": 0.0110842, "auxiliary_loss_mlp": 0.01032734, "balance_loss_clip": 1.01998413, "balance_loss_mlp": 1.03569579, "epoch": 0.9208477378626184, "flos": 18507354053760.0, "grad_norm": 2.3304676388195915, "language_loss": 0.76353317, "learning_rate": 6.531353647657156e-08, "loss": 0.78494477, "num_input_tokens_seen": 330333260, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7265625, "step": 15316, "time_per_iteration": 2.440495252609253 }, { "auxiliary_loss_clip": 0.01103325, "auxiliary_loss_mlp": 0.01035778, "balance_loss_clip": 1.02316618, "balance_loss_mlp": 1.03342664, "epoch": 0.9209078611152863, "flos": 22999635475200.0, "grad_norm": 1.8033185559020646, "language_loss": 0.6962266, "learning_rate": 6.521485628267931e-08, "loss": 0.71761763, "num_input_tokens_seen": 330352465, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 15317, "time_per_iteration": 2.475855827331543 }, { "auxiliary_loss_clip": 0.01105061, "auxiliary_loss_mlp": 0.01031582, "balance_loss_clip": 1.01918423, "balance_loss_mlp": 1.03635097, "epoch": 0.9209679843679544, "flos": 24061514267520.0, "grad_norm": 2.2573442206118846, "language_loss": 0.83424777, "learning_rate": 6.511624945603378e-08, "loss": 0.85561419, "num_input_tokens_seen": 330372685, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 15318, "time_per_iteration": 2.462261915206909 }, { "auxiliary_loss_clip": 0.01104553, "auxiliary_loss_mlp": 0.01031572, "balance_loss_clip": 1.01947176, "balance_loss_mlp": 1.03554761, "epoch": 0.9210281076206223, "flos": 13553370190080.0, "grad_norm": 1.9976459445364578, "language_loss": 0.85348892, "learning_rate": 6.501771600037354e-08, "loss": 0.87485015, "num_input_tokens_seen": 330388860, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 15319, "time_per_iteration": 2.4211959838867188 }, { "auxiliary_loss_clip": 0.01028232, "auxiliary_loss_mlp": 0.01002224, "balance_loss_clip": 1.00116313, "balance_loss_mlp": 1.00583112, "epoch": 0.9210882308732903, "flos": 71426289674880.0, "grad_norm": 0.7700388103718167, "language_loss": 0.56202316, "learning_rate": 6.491925591943559e-08, "loss": 0.58232772, "num_input_tokens_seen": 330448735, "router_z_loss_clip": 0.01062012, "router_z_loss_mlp": 0.22460938, "step": 15320, "time_per_iteration": 3.116410732269287 }, { "auxiliary_loss_clip": 0.01108525, "auxiliary_loss_mlp": 0.01039685, "balance_loss_clip": 1.0261066, "balance_loss_mlp": 1.03603446, "epoch": 0.9211483541259582, "flos": 18509113820160.0, "grad_norm": 2.4829955076271704, "language_loss": 0.63922083, "learning_rate": 6.482086921695384e-08, "loss": 0.66070294, "num_input_tokens_seen": 330465600, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 15321, "time_per_iteration": 2.425875425338745 }, { "auxiliary_loss_clip": 0.01099876, "auxiliary_loss_mlp": 0.01030316, "balance_loss_clip": 1.01876462, "balance_loss_mlp": 1.03477085, "epoch": 0.9212084773786262, "flos": 23258228463360.0, "grad_norm": 1.633385240156442, "language_loss": 0.71451306, "learning_rate": 6.47225558966582e-08, "loss": 0.73581493, "num_input_tokens_seen": 330485770, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6484375, "step": 15322, "time_per_iteration": 2.4930899143218994 }, { "auxiliary_loss_clip": 0.0110305, "auxiliary_loss_mlp": 0.01031464, "balance_loss_clip": 1.02006745, "balance_loss_mlp": 1.03480935, "epoch": 0.9212686006312941, "flos": 16289511770880.0, "grad_norm": 1.9339078938498855, "language_loss": 0.70137525, "learning_rate": 6.462431596227725e-08, "loss": 0.72272032, "num_input_tokens_seen": 330504255, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.68359375, "step": 15323, "time_per_iteration": 2.4575283527374268 }, { "auxiliary_loss_clip": 0.01105279, "auxiliary_loss_mlp": 0.01035263, "balance_loss_clip": 1.02184558, "balance_loss_mlp": 1.03380585, "epoch": 0.9213287238839621, "flos": 19785773986560.0, "grad_norm": 2.1661148195237745, "language_loss": 0.75068009, "learning_rate": 6.452614941753597e-08, "loss": 0.77208555, "num_input_tokens_seen": 330520705, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71484375, "step": 15324, "time_per_iteration": 2.4251692295074463 }, { "auxiliary_loss_clip": 0.011061, "auxiliary_loss_mlp": 0.01036055, "balance_loss_clip": 1.02376485, "balance_loss_mlp": 1.03699875, "epoch": 0.92138884713663, "flos": 21030402199680.0, "grad_norm": 2.196368763781427, "language_loss": 0.71341282, "learning_rate": 6.442805626615744e-08, "loss": 0.73483437, "num_input_tokens_seen": 330539245, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 15325, "time_per_iteration": 2.4616658687591553 }, { "auxiliary_loss_clip": 0.01102063, "auxiliary_loss_mlp": 0.01030863, "balance_loss_clip": 1.0187875, "balance_loss_mlp": 1.03419185, "epoch": 0.9214489703892981, "flos": 28587264186240.0, "grad_norm": 1.6659465867380152, "language_loss": 0.78563029, "learning_rate": 6.433003651186109e-08, "loss": 0.80695963, "num_input_tokens_seen": 330561815, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 15326, "time_per_iteration": 2.495901584625244 }, { "auxiliary_loss_clip": 0.0110762, "auxiliary_loss_mlp": 0.01033459, "balance_loss_clip": 1.020787, "balance_loss_mlp": 1.03685117, "epoch": 0.921509093641966, "flos": 16361476669440.0, "grad_norm": 2.4096421456169455, "language_loss": 0.7117011, "learning_rate": 6.42320901583635e-08, "loss": 0.73311186, "num_input_tokens_seen": 330579760, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.70703125, "step": 15327, "time_per_iteration": 2.4266953468322754 }, { "auxiliary_loss_clip": 0.01109779, "auxiliary_loss_mlp": 0.01040228, "balance_loss_clip": 1.02705598, "balance_loss_mlp": 1.03823805, "epoch": 0.921569216894634, "flos": 26830837036800.0, "grad_norm": 2.988777461028535, "language_loss": 0.77990299, "learning_rate": 6.413421720937906e-08, "loss": 0.80140305, "num_input_tokens_seen": 330598545, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71484375, "step": 15328, "time_per_iteration": 2.477238893508911 }, { "auxiliary_loss_clip": 0.01103557, "auxiliary_loss_mlp": 0.01032128, "balance_loss_clip": 1.02043307, "balance_loss_mlp": 1.03533685, "epoch": 0.921629340147302, "flos": 24645134448000.0, "grad_norm": 2.1478378049830082, "language_loss": 0.71293765, "learning_rate": 6.4036417668619e-08, "loss": 0.73429447, "num_input_tokens_seen": 330616700, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 15329, "time_per_iteration": 3.800200939178467 }, { "auxiliary_loss_clip": 0.01102413, "auxiliary_loss_mlp": 0.01025362, "balance_loss_clip": 1.01395953, "balance_loss_mlp": 1.0344106, "epoch": 0.9216894633999699, "flos": 15086504442240.0, "grad_norm": 1.9592994224868048, "language_loss": 0.86724752, "learning_rate": 6.393869153979192e-08, "loss": 0.88852525, "num_input_tokens_seen": 330633355, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6796875, "step": 15330, "time_per_iteration": 2.4199001789093018 }, { "auxiliary_loss_clip": 0.01104669, "auxiliary_loss_mlp": 0.01029914, "balance_loss_clip": 1.01813614, "balance_loss_mlp": 1.03507853, "epoch": 0.921749586652638, "flos": 19204524103680.0, "grad_norm": 2.0254527818955035, "language_loss": 0.75460064, "learning_rate": 6.384103882660397e-08, "loss": 0.7759465, "num_input_tokens_seen": 330651470, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6953125, "step": 15331, "time_per_iteration": 2.435554265975952 }, { "auxiliary_loss_clip": 0.01101915, "auxiliary_loss_mlp": 0.01027481, "balance_loss_clip": 1.01545238, "balance_loss_mlp": 1.03355598, "epoch": 0.9218097099053059, "flos": 20522446018560.0, "grad_norm": 1.787981059671185, "language_loss": 0.75397044, "learning_rate": 6.374345953275794e-08, "loss": 0.77526438, "num_input_tokens_seen": 330669170, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 15332, "time_per_iteration": 2.458894729614258 }, { "auxiliary_loss_clip": 0.01102324, "auxiliary_loss_mlp": 0.0102825, "balance_loss_clip": 1.01705027, "balance_loss_mlp": 1.03380251, "epoch": 0.9218698331579739, "flos": 17348625216000.0, "grad_norm": 1.7792383614119742, "language_loss": 0.74689317, "learning_rate": 6.364595366195358e-08, "loss": 0.76819885, "num_input_tokens_seen": 330686635, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.68359375, "step": 15333, "time_per_iteration": 2.4367215633392334 }, { "auxiliary_loss_clip": 0.01028247, "auxiliary_loss_mlp": 0.01000111, "balance_loss_clip": 0.99912161, "balance_loss_mlp": 1.00604105, "epoch": 0.9219299564106418, "flos": 61958332575360.0, "grad_norm": 0.8195698256845574, "language_loss": 0.52944517, "learning_rate": 6.354852121788879e-08, "loss": 0.54972875, "num_input_tokens_seen": 330749160, "router_z_loss_clip": 0.0098877, "router_z_loss_mlp": 0.22265625, "step": 15334, "time_per_iteration": 4.466628789901733 }, { "auxiliary_loss_clip": 0.0110047, "auxiliary_loss_mlp": 0.01031715, "balance_loss_clip": 1.02000308, "balance_loss_mlp": 1.0345763, "epoch": 0.9219900796633098, "flos": 15701761526400.0, "grad_norm": 3.2953049323296595, "language_loss": 0.62402356, "learning_rate": 6.345116220425839e-08, "loss": 0.64534545, "num_input_tokens_seen": 330766840, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.65625, "step": 15335, "time_per_iteration": 2.4300081729888916 }, { "auxiliary_loss_clip": 0.01102708, "auxiliary_loss_mlp": 0.01030851, "balance_loss_clip": 1.01887679, "balance_loss_mlp": 1.03502953, "epoch": 0.9220502029159777, "flos": 24932670819840.0, "grad_norm": 3.505329869211571, "language_loss": 0.71370357, "learning_rate": 6.335387662475366e-08, "loss": 0.73503911, "num_input_tokens_seen": 330785585, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.67578125, "step": 15336, "time_per_iteration": 3.8538482189178467 }, { "auxiliary_loss_clip": 0.01099425, "auxiliary_loss_mlp": 0.0102916, "balance_loss_clip": 1.01823497, "balance_loss_mlp": 1.03346705, "epoch": 0.9221103261686457, "flos": 15667215621120.0, "grad_norm": 1.917377012576526, "language_loss": 0.71854109, "learning_rate": 6.325666448306433e-08, "loss": 0.73982692, "num_input_tokens_seen": 330800750, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.66015625, "step": 15337, "time_per_iteration": 3.8658804893493652 }, { "auxiliary_loss_clip": 0.01027974, "auxiliary_loss_mlp": 0.01000861, "balance_loss_clip": 0.99983579, "balance_loss_mlp": 1.00577092, "epoch": 0.9221704494213137, "flos": 67516299630720.0, "grad_norm": 0.8811320682226693, "language_loss": 0.65317142, "learning_rate": 6.31595257828763e-08, "loss": 0.67345977, "num_input_tokens_seen": 330863640, "router_z_loss_clip": 0.01025391, "router_z_loss_mlp": 0.22265625, "step": 15338, "time_per_iteration": 3.0705723762512207 }, { "auxiliary_loss_clip": 0.01106229, "auxiliary_loss_mlp": 0.01034078, "balance_loss_clip": 1.02160859, "balance_loss_mlp": 1.03651166, "epoch": 0.9222305726739817, "flos": 30226945155840.0, "grad_norm": 2.7439432172110885, "language_loss": 0.67136735, "learning_rate": 6.306246052787289e-08, "loss": 0.69277042, "num_input_tokens_seen": 330884675, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 15339, "time_per_iteration": 2.5125608444213867 }, { "auxiliary_loss_clip": 0.01104704, "auxiliary_loss_mlp": 0.01029268, "balance_loss_clip": 1.01675081, "balance_loss_mlp": 1.03528857, "epoch": 0.9222906959266496, "flos": 25337204766720.0, "grad_norm": 1.773512808513323, "language_loss": 0.71766639, "learning_rate": 6.296546872173513e-08, "loss": 0.7390061, "num_input_tokens_seen": 330904125, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 15340, "time_per_iteration": 2.496365785598755 }, { "auxiliary_loss_clip": 0.01102779, "auxiliary_loss_mlp": 0.01030394, "balance_loss_clip": 1.01858604, "balance_loss_mlp": 1.03536224, "epoch": 0.9223508191793176, "flos": 27599864244480.0, "grad_norm": 1.8492126889346063, "language_loss": 0.70198691, "learning_rate": 6.286855036814098e-08, "loss": 0.72331858, "num_input_tokens_seen": 330925140, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 15341, "time_per_iteration": 2.498953104019165 }, { "auxiliary_loss_clip": 0.01098284, "auxiliary_loss_mlp": 0.01026579, "balance_loss_clip": 1.01579094, "balance_loss_mlp": 1.03311157, "epoch": 0.9224109424319856, "flos": 27307587277440.0, "grad_norm": 1.7379648742554663, "language_loss": 0.67372, "learning_rate": 6.277170547076571e-08, "loss": 0.69496858, "num_input_tokens_seen": 330946625, "router_z_loss_clip": 0.10791016, "router_z_loss_mlp": 0.6484375, "step": 15342, "time_per_iteration": 2.5116193294525146 }, { "auxiliary_loss_clip": 0.01105637, "auxiliary_loss_mlp": 0.01035734, "balance_loss_clip": 1.02437925, "balance_loss_mlp": 1.0361104, "epoch": 0.9224710656846535, "flos": 48208314401280.0, "grad_norm": 2.3617635326831223, "language_loss": 0.69781387, "learning_rate": 6.26749340332815e-08, "loss": 0.71922755, "num_input_tokens_seen": 330967795, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6953125, "step": 15343, "time_per_iteration": 2.6759657859802246 }, { "auxiliary_loss_clip": 0.0102798, "auxiliary_loss_mlp": 0.01000753, "balance_loss_clip": 0.99969161, "balance_loss_mlp": 1.00566578, "epoch": 0.9225311889373216, "flos": 66722171794560.0, "grad_norm": 0.777166718956653, "language_loss": 0.51992565, "learning_rate": 6.257823605935786e-08, "loss": 0.54021293, "num_input_tokens_seen": 331040850, "router_z_loss_clip": 0.01062012, "router_z_loss_mlp": 0.22265625, "step": 15344, "time_per_iteration": 3.2615585327148438 }, { "auxiliary_loss_clip": 0.01097753, "auxiliary_loss_mlp": 0.01028845, "balance_loss_clip": 1.01778817, "balance_loss_mlp": 1.03359389, "epoch": 0.9225913121899895, "flos": 22271295398400.0, "grad_norm": 1.6946305256448513, "language_loss": 0.7069205, "learning_rate": 6.248161155266162e-08, "loss": 0.72818649, "num_input_tokens_seen": 331060595, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.640625, "step": 15345, "time_per_iteration": 2.453268051147461 }, { "auxiliary_loss_clip": 0.01103842, "auxiliary_loss_mlp": 0.01033405, "balance_loss_clip": 1.02081013, "balance_loss_mlp": 1.03417516, "epoch": 0.9226514354426575, "flos": 20082719721600.0, "grad_norm": 1.921177051240554, "language_loss": 0.77123559, "learning_rate": 6.238506051685677e-08, "loss": 0.79260802, "num_input_tokens_seen": 331080195, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 15346, "time_per_iteration": 2.470862865447998 }, { "auxiliary_loss_clip": 0.01109204, "auxiliary_loss_mlp": 0.01036062, "balance_loss_clip": 1.02334797, "balance_loss_mlp": 1.03684485, "epoch": 0.9227115586953254, "flos": 16070851728000.0, "grad_norm": 2.1966951096910288, "language_loss": 0.76101601, "learning_rate": 6.228858295560457e-08, "loss": 0.7824688, "num_input_tokens_seen": 331097645, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 15347, "time_per_iteration": 2.4182043075561523 }, { "auxiliary_loss_clip": 0.01100833, "auxiliary_loss_mlp": 0.01032013, "balance_loss_clip": 1.02107596, "balance_loss_mlp": 1.03556681, "epoch": 0.9227716819479934, "flos": 20446027833600.0, "grad_norm": 1.6157999638524567, "language_loss": 0.76990771, "learning_rate": 6.219217887256367e-08, "loss": 0.79123616, "num_input_tokens_seen": 331116830, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.65234375, "step": 15348, "time_per_iteration": 2.4851925373077393 }, { "auxiliary_loss_clip": 0.01104104, "auxiliary_loss_mlp": 0.01030396, "balance_loss_clip": 1.01784277, "balance_loss_mlp": 1.03404355, "epoch": 0.9228318052006613, "flos": 25007401065600.0, "grad_norm": 2.359343389277399, "language_loss": 0.67691749, "learning_rate": 6.209584827138959e-08, "loss": 0.69826251, "num_input_tokens_seen": 331137235, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 15349, "time_per_iteration": 2.475681781768799 }, { "auxiliary_loss_clip": 0.01102775, "auxiliary_loss_mlp": 0.01029319, "balance_loss_clip": 1.01723719, "balance_loss_mlp": 1.03334391, "epoch": 0.9228919284533293, "flos": 12677257560960.0, "grad_norm": 2.7893091080492614, "language_loss": 0.87097967, "learning_rate": 6.199959115573495e-08, "loss": 0.89230061, "num_input_tokens_seen": 331153155, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 15350, "time_per_iteration": 2.452791213989258 }, { "auxiliary_loss_clip": 0.01028134, "auxiliary_loss_mlp": 0.01001754, "balance_loss_clip": 1.00066376, "balance_loss_mlp": 1.00591028, "epoch": 0.9229520517059973, "flos": 69986162712960.0, "grad_norm": 0.9486934287656318, "language_loss": 0.60359776, "learning_rate": 6.190340752924994e-08, "loss": 0.62389666, "num_input_tokens_seen": 331214895, "router_z_loss_clip": 0.01092529, "router_z_loss_mlp": 0.22265625, "step": 15351, "time_per_iteration": 3.040306329727173 }, { "auxiliary_loss_clip": 0.01104157, "auxiliary_loss_mlp": 0.01024003, "balance_loss_clip": 1.01230884, "balance_loss_mlp": 1.03378141, "epoch": 0.9230121749586653, "flos": 14793832425600.0, "grad_norm": 2.5800698161306888, "language_loss": 0.77563965, "learning_rate": 6.180729739558233e-08, "loss": 0.79692125, "num_input_tokens_seen": 331232185, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.703125, "step": 15352, "time_per_iteration": 2.4319918155670166 }, { "auxiliary_loss_clip": 0.0110645, "auxiliary_loss_mlp": 0.01037776, "balance_loss_clip": 1.02469921, "balance_loss_mlp": 1.03437006, "epoch": 0.9230722982113332, "flos": 22967208472320.0, "grad_norm": 2.5303311151936865, "language_loss": 0.59492618, "learning_rate": 6.171126075837585e-08, "loss": 0.61636847, "num_input_tokens_seen": 331251065, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 15353, "time_per_iteration": 2.459099769592285 }, { "auxiliary_loss_clip": 0.01102925, "auxiliary_loss_mlp": 0.01030751, "balance_loss_clip": 1.01966453, "balance_loss_mlp": 1.03574646, "epoch": 0.9231324214640012, "flos": 18551452976640.0, "grad_norm": 1.7089428244293074, "language_loss": 0.74547756, "learning_rate": 6.161529762127293e-08, "loss": 0.76681435, "num_input_tokens_seen": 331269110, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.671875, "step": 15354, "time_per_iteration": 2.437217950820923 }, { "auxiliary_loss_clip": 0.01106933, "auxiliary_loss_mlp": 0.01036837, "balance_loss_clip": 1.0237118, "balance_loss_mlp": 1.03527224, "epoch": 0.9231925447166691, "flos": 22082727974400.0, "grad_norm": 2.0347339218082388, "language_loss": 0.65108728, "learning_rate": 6.1519407987912e-08, "loss": 0.67252493, "num_input_tokens_seen": 331286555, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 15355, "time_per_iteration": 2.443004846572876 }, { "auxiliary_loss_clip": 0.01101768, "auxiliary_loss_mlp": 0.01032039, "balance_loss_clip": 1.02051759, "balance_loss_mlp": 1.03559017, "epoch": 0.9232526679693371, "flos": 26541145848960.0, "grad_norm": 1.6477253217901588, "language_loss": 0.74201393, "learning_rate": 6.142359186192947e-08, "loss": 0.76335204, "num_input_tokens_seen": 331307660, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 15356, "time_per_iteration": 2.4960856437683105 }, { "auxiliary_loss_clip": 0.01105965, "auxiliary_loss_mlp": 0.0103315, "balance_loss_clip": 1.02060962, "balance_loss_mlp": 1.03653109, "epoch": 0.9233127912220052, "flos": 14756664827520.0, "grad_norm": 1.7834953788963692, "language_loss": 0.61105788, "learning_rate": 6.132784924695844e-08, "loss": 0.63244903, "num_input_tokens_seen": 331324885, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 15357, "time_per_iteration": 2.4104530811309814 }, { "auxiliary_loss_clip": 0.01106284, "auxiliary_loss_mlp": 0.0103206, "balance_loss_clip": 1.01977539, "balance_loss_mlp": 1.03502953, "epoch": 0.9233729144746731, "flos": 25261792162560.0, "grad_norm": 1.566910015038041, "language_loss": 0.6984694, "learning_rate": 6.123218014662956e-08, "loss": 0.71985281, "num_input_tokens_seen": 331345885, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 15358, "time_per_iteration": 2.493748426437378 }, { "auxiliary_loss_clip": 0.01103954, "auxiliary_loss_mlp": 0.0102995, "balance_loss_clip": 1.01789832, "balance_loss_mlp": 1.03501153, "epoch": 0.9234330377273411, "flos": 27849837968640.0, "grad_norm": 2.190430643984881, "language_loss": 0.73585457, "learning_rate": 6.113658456457104e-08, "loss": 0.75719357, "num_input_tokens_seen": 331364320, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 15359, "time_per_iteration": 2.4873008728027344 }, { "auxiliary_loss_clip": 0.01105535, "auxiliary_loss_mlp": 0.01034354, "balance_loss_clip": 1.02208126, "balance_loss_mlp": 1.03625822, "epoch": 0.923493160980009, "flos": 24608361899520.0, "grad_norm": 2.0948890583032598, "language_loss": 0.64767557, "learning_rate": 6.104106250440732e-08, "loss": 0.66907448, "num_input_tokens_seen": 331384135, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 15360, "time_per_iteration": 2.5019497871398926 }, { "auxiliary_loss_clip": 0.01028876, "auxiliary_loss_mlp": 0.01001778, "balance_loss_clip": 1.00080037, "balance_loss_mlp": 1.00657582, "epoch": 0.923553284232677, "flos": 67700916558720.0, "grad_norm": 0.7625559911204399, "language_loss": 0.55104399, "learning_rate": 6.094561396976083e-08, "loss": 0.57135051, "num_input_tokens_seen": 331440645, "router_z_loss_clip": 0.00976562, "router_z_loss_mlp": 0.22363281, "step": 15361, "time_per_iteration": 3.0340383052825928 }, { "auxiliary_loss_clip": 0.0110589, "auxiliary_loss_mlp": 0.01029369, "balance_loss_clip": 1.01634514, "balance_loss_mlp": 1.03460121, "epoch": 0.9236134074853449, "flos": 18807244704000.0, "grad_norm": 2.4095973103180586, "language_loss": 0.69848281, "learning_rate": 6.085023896425112e-08, "loss": 0.7198354, "num_input_tokens_seen": 331459580, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 15362, "time_per_iteration": 2.4668445587158203 }, { "auxiliary_loss_clip": 0.01106811, "auxiliary_loss_mlp": 0.0103226, "balance_loss_clip": 1.018152, "balance_loss_mlp": 1.03550839, "epoch": 0.923673530738013, "flos": 27782362270080.0, "grad_norm": 1.7343907569973152, "language_loss": 0.75868034, "learning_rate": 6.075493749149463e-08, "loss": 0.78007102, "num_input_tokens_seen": 331481560, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7109375, "step": 15363, "time_per_iteration": 2.499537229537964 }, { "auxiliary_loss_clip": 0.01103437, "auxiliary_loss_mlp": 0.01030307, "balance_loss_clip": 1.01832604, "balance_loss_mlp": 1.03453898, "epoch": 0.9237336539906809, "flos": 26797117144320.0, "grad_norm": 1.9456864767785664, "language_loss": 0.83371288, "learning_rate": 6.065970955510514e-08, "loss": 0.85505033, "num_input_tokens_seen": 331499090, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 15364, "time_per_iteration": 2.4964699745178223 }, { "auxiliary_loss_clip": 0.01102472, "auxiliary_loss_mlp": 0.01023313, "balance_loss_clip": 1.0122143, "balance_loss_mlp": 1.03492665, "epoch": 0.9237937772433489, "flos": 23587708942080.0, "grad_norm": 1.5521723751509884, "language_loss": 0.67919183, "learning_rate": 6.056455515869419e-08, "loss": 0.70044971, "num_input_tokens_seen": 331519420, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.67578125, "step": 15365, "time_per_iteration": 2.4956936836242676 }, { "auxiliary_loss_clip": 0.011051, "auxiliary_loss_mlp": 0.01032657, "balance_loss_clip": 1.02083182, "balance_loss_mlp": 1.03583026, "epoch": 0.9238539004960168, "flos": 26140562398080.0, "grad_norm": 2.4970360770359177, "language_loss": 0.63256335, "learning_rate": 6.046947430586913e-08, "loss": 0.65394092, "num_input_tokens_seen": 331538720, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69140625, "step": 15366, "time_per_iteration": 2.5379462242126465 }, { "auxiliary_loss_clip": 0.01103717, "auxiliary_loss_mlp": 0.01029278, "balance_loss_clip": 1.01653445, "balance_loss_mlp": 1.03586447, "epoch": 0.9239140237486848, "flos": 21068000760960.0, "grad_norm": 1.5840933290933896, "language_loss": 0.74613494, "learning_rate": 6.037446700023619e-08, "loss": 0.76746488, "num_input_tokens_seen": 331558505, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 15367, "time_per_iteration": 2.449902057647705 }, { "auxiliary_loss_clip": 0.01100435, "auxiliary_loss_mlp": 0.01031942, "balance_loss_clip": 1.02069485, "balance_loss_mlp": 1.03505838, "epoch": 0.9239741470013527, "flos": 24607930936320.0, "grad_norm": 2.028058047415932, "language_loss": 0.64658594, "learning_rate": 6.027953324539759e-08, "loss": 0.66790968, "num_input_tokens_seen": 331578440, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.65625, "step": 15368, "time_per_iteration": 2.4891438484191895 }, { "auxiliary_loss_clip": 0.01106618, "auxiliary_loss_mlp": 0.01033096, "balance_loss_clip": 1.02044177, "balance_loss_mlp": 1.03543019, "epoch": 0.9240342702540207, "flos": 24718248581760.0, "grad_norm": 4.252347562755928, "language_loss": 0.74732757, "learning_rate": 6.018467304495401e-08, "loss": 0.76872474, "num_input_tokens_seen": 331598945, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 15369, "time_per_iteration": 2.483241319656372 }, { "auxiliary_loss_clip": 0.01109986, "auxiliary_loss_mlp": 0.01035487, "balance_loss_clip": 1.02179527, "balance_loss_mlp": 1.03745246, "epoch": 0.9240943935066888, "flos": 20849987162880.0, "grad_norm": 4.078162051468444, "language_loss": 0.76911116, "learning_rate": 6.008988640250145e-08, "loss": 0.79056591, "num_input_tokens_seen": 331616700, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 15370, "time_per_iteration": 2.464600086212158 }, { "auxiliary_loss_clip": 0.01104344, "auxiliary_loss_mlp": 0.01035097, "balance_loss_clip": 1.02307439, "balance_loss_mlp": 1.03480005, "epoch": 0.9241545167593567, "flos": 24462313200000.0, "grad_norm": 2.351697349612187, "language_loss": 0.67506462, "learning_rate": 5.999517332163528e-08, "loss": 0.69645905, "num_input_tokens_seen": 331635625, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 15371, "time_per_iteration": 3.790220022201538 }, { "auxiliary_loss_clip": 0.01028383, "auxiliary_loss_mlp": 0.01002225, "balance_loss_clip": 1.00117004, "balance_loss_mlp": 1.0060277, "epoch": 0.9242146400120247, "flos": 61827259847040.0, "grad_norm": 0.7234371848516196, "language_loss": 0.57743907, "learning_rate": 5.99005338059464e-08, "loss": 0.59774518, "num_input_tokens_seen": 331698595, "router_z_loss_clip": 0.01055908, "router_z_loss_mlp": 0.22363281, "step": 15372, "time_per_iteration": 3.0438780784606934 }, { "auxiliary_loss_clip": 0.01103478, "auxiliary_loss_mlp": 0.01031901, "balance_loss_clip": 1.02054596, "balance_loss_mlp": 1.03665876, "epoch": 0.9242747632646926, "flos": 22048397550720.0, "grad_norm": 3.07625405038741, "language_loss": 0.69609427, "learning_rate": 5.98059678590237e-08, "loss": 0.71744806, "num_input_tokens_seen": 331717975, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.66796875, "step": 15373, "time_per_iteration": 2.4795074462890625 }, { "auxiliary_loss_clip": 0.01105287, "auxiliary_loss_mlp": 0.01039628, "balance_loss_clip": 1.02792716, "balance_loss_mlp": 1.03614163, "epoch": 0.9243348865173606, "flos": 18478338842880.0, "grad_norm": 3.235276610365283, "language_loss": 0.75109929, "learning_rate": 5.971147548445299e-08, "loss": 0.7725485, "num_input_tokens_seen": 331737220, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.69140625, "step": 15374, "time_per_iteration": 2.417283535003662 }, { "auxiliary_loss_clip": 0.01107009, "auxiliary_loss_mlp": 0.01031616, "balance_loss_clip": 1.01973104, "balance_loss_mlp": 1.03707743, "epoch": 0.9243950097700285, "flos": 23258767167360.0, "grad_norm": 1.7169031248822795, "language_loss": 0.64835113, "learning_rate": 5.961705668581784e-08, "loss": 0.6697374, "num_input_tokens_seen": 331757300, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69921875, "step": 15375, "time_per_iteration": 3.8623600006103516 }, { "auxiliary_loss_clip": 0.01103654, "auxiliary_loss_mlp": 0.01032244, "balance_loss_clip": 1.02070999, "balance_loss_mlp": 1.03657413, "epoch": 0.9244551330226966, "flos": 29749081593600.0, "grad_norm": 7.379874370769505, "language_loss": 0.6605165, "learning_rate": 5.952271146669829e-08, "loss": 0.68187547, "num_input_tokens_seen": 331776995, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 15376, "time_per_iteration": 2.5289859771728516 }, { "auxiliary_loss_clip": 0.01027928, "auxiliary_loss_mlp": 0.01000244, "balance_loss_clip": 0.99927282, "balance_loss_mlp": 1.00572383, "epoch": 0.9245152562753645, "flos": 68864960609280.0, "grad_norm": 0.6556209179952912, "language_loss": 0.61172986, "learning_rate": 5.94284398306717e-08, "loss": 0.63201159, "num_input_tokens_seen": 331845015, "router_z_loss_clip": 0.00970459, "router_z_loss_mlp": 0.22265625, "step": 15377, "time_per_iteration": 3.1234703063964844 }, { "auxiliary_loss_clip": 0.01103786, "auxiliary_loss_mlp": 0.01036778, "balance_loss_clip": 1.02488053, "balance_loss_mlp": 1.03524864, "epoch": 0.9245753795280325, "flos": 21579260993280.0, "grad_norm": 1.9286457327847761, "language_loss": 0.73953271, "learning_rate": 5.933424178131341e-08, "loss": 0.76093829, "num_input_tokens_seen": 331862795, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 15378, "time_per_iteration": 3.82297682762146 }, { "auxiliary_loss_clip": 0.01105517, "auxiliary_loss_mlp": 0.0103233, "balance_loss_clip": 1.01963985, "balance_loss_mlp": 1.03631341, "epoch": 0.9246355027807004, "flos": 34496077334400.0, "grad_norm": 2.122217998480653, "language_loss": 0.62162352, "learning_rate": 5.924011732219503e-08, "loss": 0.64300203, "num_input_tokens_seen": 331882535, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 15379, "time_per_iteration": 4.0048487186431885 }, { "auxiliary_loss_clip": 0.01102298, "auxiliary_loss_mlp": 0.01027176, "balance_loss_clip": 1.0151062, "balance_loss_mlp": 1.03594744, "epoch": 0.9246956260333684, "flos": 15953854152960.0, "grad_norm": 2.15435721211821, "language_loss": 0.8368116, "learning_rate": 5.914606645688591e-08, "loss": 0.85810637, "num_input_tokens_seen": 331899335, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6640625, "step": 15380, "time_per_iteration": 2.4199647903442383 }, { "auxiliary_loss_clip": 0.01105302, "auxiliary_loss_mlp": 0.0103199, "balance_loss_clip": 1.01891315, "balance_loss_mlp": 1.03442144, "epoch": 0.9247557492860363, "flos": 23368366540800.0, "grad_norm": 1.7394099742973004, "language_loss": 0.73381257, "learning_rate": 5.905208918895233e-08, "loss": 0.75518548, "num_input_tokens_seen": 331919030, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 15381, "time_per_iteration": 2.478900671005249 }, { "auxiliary_loss_clip": 0.01103675, "auxiliary_loss_mlp": 0.01030214, "balance_loss_clip": 1.01887679, "balance_loss_mlp": 1.03520513, "epoch": 0.9248158725387043, "flos": 23039855729280.0, "grad_norm": 1.952124660000088, "language_loss": 0.78616673, "learning_rate": 5.8958185521958524e-08, "loss": 0.80750561, "num_input_tokens_seen": 331936465, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.68359375, "step": 15382, "time_per_iteration": 2.4609320163726807 }, { "auxiliary_loss_clip": 0.01103851, "auxiliary_loss_mlp": 0.01035273, "balance_loss_clip": 1.02240419, "balance_loss_mlp": 1.03375173, "epoch": 0.9248759957913724, "flos": 22522418357760.0, "grad_norm": 2.023045180832583, "language_loss": 0.74954075, "learning_rate": 5.886435545946455e-08, "loss": 0.77093196, "num_input_tokens_seen": 331954625, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 15383, "time_per_iteration": 2.4668891429901123 }, { "auxiliary_loss_clip": 0.01100263, "auxiliary_loss_mlp": 0.01030501, "balance_loss_clip": 1.01890826, "balance_loss_mlp": 1.03352153, "epoch": 0.9249361190440403, "flos": 25447271016960.0, "grad_norm": 1.8803004504988459, "language_loss": 0.7586391, "learning_rate": 5.8770599005028456e-08, "loss": 0.77994674, "num_input_tokens_seen": 331975865, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66796875, "step": 15384, "time_per_iteration": 2.5007247924804688 }, { "auxiliary_loss_clip": 0.01101006, "auxiliary_loss_mlp": 0.01032476, "balance_loss_clip": 1.02004898, "balance_loss_mlp": 1.03442621, "epoch": 0.9249962422967083, "flos": 12378623886720.0, "grad_norm": 1.992581609248843, "language_loss": 0.66246045, "learning_rate": 5.8676916162206045e-08, "loss": 0.68379521, "num_input_tokens_seen": 331992760, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.66796875, "step": 15385, "time_per_iteration": 2.4628782272338867 }, { "auxiliary_loss_clip": 0.01101638, "auxiliary_loss_mlp": 0.01033481, "balance_loss_clip": 1.0216434, "balance_loss_mlp": 1.03383613, "epoch": 0.9250563655493762, "flos": 22929430343040.0, "grad_norm": 1.908844407116279, "language_loss": 0.80488086, "learning_rate": 5.85833069345496e-08, "loss": 0.82623202, "num_input_tokens_seen": 332011890, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 15386, "time_per_iteration": 2.45890474319458 }, { "auxiliary_loss_clip": 0.01103408, "auxiliary_loss_mlp": 0.01039391, "balance_loss_clip": 1.02705312, "balance_loss_mlp": 1.03648186, "epoch": 0.9251164888020442, "flos": 18478662065280.0, "grad_norm": 2.212309047078155, "language_loss": 0.75183094, "learning_rate": 5.8489771325608504e-08, "loss": 0.77325892, "num_input_tokens_seen": 332029485, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.671875, "step": 15387, "time_per_iteration": 2.4396822452545166 }, { "auxiliary_loss_clip": 0.01101366, "auxiliary_loss_mlp": 0.01032041, "balance_loss_clip": 1.02095413, "balance_loss_mlp": 1.03429222, "epoch": 0.9251766120547121, "flos": 33037062796800.0, "grad_norm": 4.0193957974990075, "language_loss": 0.70120597, "learning_rate": 5.839630933893014e-08, "loss": 0.72254008, "num_input_tokens_seen": 332052970, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.66796875, "step": 15388, "time_per_iteration": 2.5485570430755615 }, { "auxiliary_loss_clip": 0.01105466, "auxiliary_loss_mlp": 0.01028089, "balance_loss_clip": 1.01594126, "balance_loss_mlp": 1.03510618, "epoch": 0.9252367353073802, "flos": 24387906176640.0, "grad_norm": 1.6804576761098549, "language_loss": 0.8207947, "learning_rate": 5.8302920978058115e-08, "loss": 0.8421303, "num_input_tokens_seen": 332070395, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.703125, "step": 15389, "time_per_iteration": 2.490328073501587 }, { "auxiliary_loss_clip": 0.01110767, "auxiliary_loss_mlp": 0.01033592, "balance_loss_clip": 1.02068174, "balance_loss_mlp": 1.03720951, "epoch": 0.9252968585600481, "flos": 18916844077440.0, "grad_norm": 2.7775512807884355, "language_loss": 0.79191726, "learning_rate": 5.820960624653381e-08, "loss": 0.81336081, "num_input_tokens_seen": 332090185, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 15390, "time_per_iteration": 2.445124626159668 }, { "auxiliary_loss_clip": 0.0110587, "auxiliary_loss_mlp": 0.0104013, "balance_loss_clip": 1.02724314, "balance_loss_mlp": 1.0350368, "epoch": 0.9253569818127161, "flos": 21725345606400.0, "grad_norm": 1.9877606996885602, "language_loss": 0.75658107, "learning_rate": 5.811636514789597e-08, "loss": 0.77804106, "num_input_tokens_seen": 332109050, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 15391, "time_per_iteration": 2.4672977924346924 }, { "auxiliary_loss_clip": 0.01103212, "auxiliary_loss_mlp": 0.01033211, "balance_loss_clip": 1.01987767, "balance_loss_mlp": 1.03356719, "epoch": 0.925417105065384, "flos": 34240357434240.0, "grad_norm": 2.5593736407480185, "language_loss": 0.53107619, "learning_rate": 5.80231976856802e-08, "loss": 0.5524404, "num_input_tokens_seen": 332131180, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 15392, "time_per_iteration": 2.5568387508392334 }, { "auxiliary_loss_clip": 0.01102635, "auxiliary_loss_mlp": 0.01027142, "balance_loss_clip": 1.01566231, "balance_loss_mlp": 1.03315639, "epoch": 0.925477228318052, "flos": 25959536830080.0, "grad_norm": 1.8127044208590024, "language_loss": 0.7711736, "learning_rate": 5.7930103863419454e-08, "loss": 0.79247129, "num_input_tokens_seen": 332149555, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6953125, "step": 15393, "time_per_iteration": 2.510708808898926 }, { "auxiliary_loss_clip": 0.01103073, "auxiliary_loss_mlp": 0.01031046, "balance_loss_clip": 1.01907134, "balance_loss_mlp": 1.03513527, "epoch": 0.9255373515707199, "flos": 11838240702720.0, "grad_norm": 2.9845972549134707, "language_loss": 0.69401503, "learning_rate": 5.783708368464357e-08, "loss": 0.71535623, "num_input_tokens_seen": 332165830, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 15394, "time_per_iteration": 2.4125590324401855 }, { "auxiliary_loss_clip": 0.01104166, "auxiliary_loss_mlp": 0.01024534, "balance_loss_clip": 1.01254129, "balance_loss_mlp": 1.03498316, "epoch": 0.925597474823388, "flos": 21434325615360.0, "grad_norm": 2.262575714678761, "language_loss": 0.72726023, "learning_rate": 5.7744137152879956e-08, "loss": 0.74854726, "num_input_tokens_seen": 332185130, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 15395, "time_per_iteration": 2.46966552734375 }, { "auxiliary_loss_clip": 0.01098404, "auxiliary_loss_mlp": 0.01029125, "balance_loss_clip": 1.01785398, "balance_loss_mlp": 1.03250468, "epoch": 0.925657598076056, "flos": 22857573185280.0, "grad_norm": 2.0034843935184847, "language_loss": 0.72034311, "learning_rate": 5.7651264271653785e-08, "loss": 0.74161839, "num_input_tokens_seen": 332203695, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.65625, "step": 15396, "time_per_iteration": 2.4457616806030273 }, { "auxiliary_loss_clip": 0.011016, "auxiliary_loss_mlp": 0.01027932, "balance_loss_clip": 1.01558793, "balance_loss_mlp": 1.03386068, "epoch": 0.9257177213287239, "flos": 25704032411520.0, "grad_norm": 1.589432273382533, "language_loss": 0.87524426, "learning_rate": 5.755846504448603e-08, "loss": 0.89653957, "num_input_tokens_seen": 332224850, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.67578125, "step": 15397, "time_per_iteration": 2.4894652366638184 }, { "auxiliary_loss_clip": 0.01028356, "auxiliary_loss_mlp": 0.01002252, "balance_loss_clip": 1.00116754, "balance_loss_mlp": 1.00600767, "epoch": 0.9257778445813919, "flos": 59592933221760.0, "grad_norm": 0.8037409391961301, "language_loss": 0.55154711, "learning_rate": 5.746573947489586e-08, "loss": 0.57185322, "num_input_tokens_seen": 332278085, "router_z_loss_clip": 0.01086426, "router_z_loss_mlp": 0.22363281, "step": 15398, "time_per_iteration": 2.954453945159912 }, { "auxiliary_loss_clip": 0.01109627, "auxiliary_loss_mlp": 0.01032924, "balance_loss_clip": 1.0190897, "balance_loss_mlp": 1.03614855, "epoch": 0.9258379678340598, "flos": 27709427704320.0, "grad_norm": 2.0100835766405747, "language_loss": 0.76251411, "learning_rate": 5.7373087566400025e-08, "loss": 0.78393966, "num_input_tokens_seen": 332297875, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.734375, "step": 15399, "time_per_iteration": 2.498613119125366 }, { "auxiliary_loss_clip": 0.01098724, "auxiliary_loss_mlp": 0.01031232, "balance_loss_clip": 1.02025926, "balance_loss_mlp": 1.03330898, "epoch": 0.9258980910867278, "flos": 24863543095680.0, "grad_norm": 1.4611050633993232, "language_loss": 0.78124928, "learning_rate": 5.7280509322510826e-08, "loss": 0.80254889, "num_input_tokens_seen": 332318500, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.65625, "step": 15400, "time_per_iteration": 2.485628604888916 }, { "auxiliary_loss_clip": 0.0102832, "auxiliary_loss_mlp": 0.00999507, "balance_loss_clip": 0.99845839, "balance_loss_mlp": 1.00606298, "epoch": 0.9259582143393957, "flos": 63134587249920.0, "grad_norm": 0.7280365879753068, "language_loss": 0.51319498, "learning_rate": 5.718800474673946e-08, "loss": 0.53347325, "num_input_tokens_seen": 332381980, "router_z_loss_clip": 0.01049805, "router_z_loss_mlp": 0.22265625, "step": 15401, "time_per_iteration": 3.0679218769073486 }, { "auxiliary_loss_clip": 0.0110219, "auxiliary_loss_mlp": 0.01033012, "balance_loss_clip": 1.02135336, "balance_loss_mlp": 1.03670788, "epoch": 0.9260183375920638, "flos": 24127122458880.0, "grad_norm": 1.886209883011213, "language_loss": 0.82340539, "learning_rate": 5.709557384259378e-08, "loss": 0.84475744, "num_input_tokens_seen": 332399510, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.65625, "step": 15402, "time_per_iteration": 2.4725182056427 }, { "auxiliary_loss_clip": 0.01028068, "auxiliary_loss_mlp": 0.01002024, "balance_loss_clip": 1.0008738, "balance_loss_mlp": 1.00579846, "epoch": 0.9260784608447317, "flos": 63042872849280.0, "grad_norm": 0.7399077824161059, "language_loss": 0.51114273, "learning_rate": 5.700321661357876e-08, "loss": 0.53144372, "num_input_tokens_seen": 332459130, "router_z_loss_clip": 0.01147461, "router_z_loss_mlp": 0.22265625, "step": 15403, "time_per_iteration": 3.173511505126953 }, { "auxiliary_loss_clip": 0.01028026, "auxiliary_loss_mlp": 0.01000759, "balance_loss_clip": 0.99965018, "balance_loss_mlp": 1.00577223, "epoch": 0.9261385840973997, "flos": 70585979927040.0, "grad_norm": 0.6892857000571079, "language_loss": 0.58769441, "learning_rate": 5.69109330631965e-08, "loss": 0.60798216, "num_input_tokens_seen": 332526555, "router_z_loss_clip": 0.0111084, "router_z_loss_mlp": 0.22265625, "step": 15404, "time_per_iteration": 3.1268160343170166 }, { "auxiliary_loss_clip": 0.01105495, "auxiliary_loss_mlp": 0.01032448, "balance_loss_clip": 1.01921618, "balance_loss_mlp": 1.03511214, "epoch": 0.9261987073500676, "flos": 20229917656320.0, "grad_norm": 4.1572631751840206, "language_loss": 0.71908331, "learning_rate": 5.681872319494596e-08, "loss": 0.74046278, "num_input_tokens_seen": 332544005, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 15405, "time_per_iteration": 2.490873098373413 }, { "auxiliary_loss_clip": 0.01107519, "auxiliary_loss_mlp": 0.01033669, "balance_loss_clip": 1.02049613, "balance_loss_mlp": 1.0363369, "epoch": 0.9262588306027356, "flos": 20954163582720.0, "grad_norm": 1.9741127933044718, "language_loss": 0.68912584, "learning_rate": 5.672658701232458e-08, "loss": 0.71053773, "num_input_tokens_seen": 332563070, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 15406, "time_per_iteration": 2.479433298110962 }, { "auxiliary_loss_clip": 0.01106251, "auxiliary_loss_mlp": 0.01033222, "balance_loss_clip": 1.01998937, "balance_loss_mlp": 1.03675318, "epoch": 0.9263189538554035, "flos": 22158679282560.0, "grad_norm": 2.270677232826802, "language_loss": 0.76550019, "learning_rate": 5.663452451882555e-08, "loss": 0.78689492, "num_input_tokens_seen": 332579620, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 15407, "time_per_iteration": 2.4581761360168457 }, { "auxiliary_loss_clip": 0.01106042, "auxiliary_loss_mlp": 0.01035167, "balance_loss_clip": 1.0222863, "balance_loss_mlp": 1.0327909, "epoch": 0.9263790771080715, "flos": 18187211111040.0, "grad_norm": 4.839180846700492, "language_loss": 0.72512001, "learning_rate": 5.6542535717940096e-08, "loss": 0.74653208, "num_input_tokens_seen": 332597795, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 15408, "time_per_iteration": 2.454145908355713 }, { "auxiliary_loss_clip": 0.01101567, "auxiliary_loss_mlp": 0.01026107, "balance_loss_clip": 1.01578391, "balance_loss_mlp": 1.03463602, "epoch": 0.9264392003607396, "flos": 48178545004800.0, "grad_norm": 2.3315044306182786, "language_loss": 0.68447638, "learning_rate": 5.645062061315675e-08, "loss": 0.70575309, "num_input_tokens_seen": 332620375, "router_z_loss_clip": 0.10302734, "router_z_loss_mlp": 0.66796875, "step": 15409, "time_per_iteration": 2.6898787021636963 }, { "auxiliary_loss_clip": 0.01105897, "auxiliary_loss_mlp": 0.01032339, "balance_loss_clip": 1.01938081, "balance_loss_mlp": 1.03649759, "epoch": 0.9264993236134075, "flos": 26389458714240.0, "grad_norm": 1.8155322287962612, "language_loss": 0.75459707, "learning_rate": 5.6358779207960506e-08, "loss": 0.77597952, "num_input_tokens_seen": 332639510, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 15410, "time_per_iteration": 2.5089519023895264 }, { "auxiliary_loss_clip": 0.01104721, "auxiliary_loss_mlp": 0.0102747, "balance_loss_clip": 1.01542449, "balance_loss_mlp": 1.03503287, "epoch": 0.9265594468660755, "flos": 20920084554240.0, "grad_norm": 1.5483667946110131, "language_loss": 0.81991196, "learning_rate": 5.6267011505833905e-08, "loss": 0.84123397, "num_input_tokens_seen": 332658350, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 15411, "time_per_iteration": 2.4583921432495117 }, { "auxiliary_loss_clip": 0.01109234, "auxiliary_loss_mlp": 0.01030496, "balance_loss_clip": 1.01900423, "balance_loss_mlp": 1.03981674, "epoch": 0.9266195701187434, "flos": 17525017929600.0, "grad_norm": 1.7028780776780357, "language_loss": 0.75537515, "learning_rate": 5.617531751025728e-08, "loss": 0.7767725, "num_input_tokens_seen": 332676715, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6953125, "step": 15412, "time_per_iteration": 2.453153133392334 }, { "auxiliary_loss_clip": 0.01101819, "auxiliary_loss_mlp": 0.01027847, "balance_loss_clip": 1.01599741, "balance_loss_mlp": 1.03287005, "epoch": 0.9266796933714114, "flos": 33688733293440.0, "grad_norm": 2.003651530001233, "language_loss": 0.66717744, "learning_rate": 5.6083697224707406e-08, "loss": 0.68847406, "num_input_tokens_seen": 332701470, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69140625, "step": 15413, "time_per_iteration": 3.956157684326172 }, { "auxiliary_loss_clip": 0.01106561, "auxiliary_loss_mlp": 0.01033703, "balance_loss_clip": 1.02067375, "balance_loss_mlp": 1.03592038, "epoch": 0.9267398166240793, "flos": 18916520855040.0, "grad_norm": 1.751725415129756, "language_loss": 0.75864822, "learning_rate": 5.5992150652658167e-08, "loss": 0.78005087, "num_input_tokens_seen": 332719060, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 15414, "time_per_iteration": 2.449108362197876 }, { "auxiliary_loss_clip": 0.01103083, "auxiliary_loss_mlp": 0.01029436, "balance_loss_clip": 1.01797438, "balance_loss_mlp": 1.03519416, "epoch": 0.9267999398767474, "flos": 20478957626880.0, "grad_norm": 2.566450438228259, "language_loss": 0.81701458, "learning_rate": 5.59006777975819e-08, "loss": 0.83833975, "num_input_tokens_seen": 332736345, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 15415, "time_per_iteration": 2.4458906650543213 }, { "auxiliary_loss_clip": 0.01104819, "auxiliary_loss_mlp": 0.01034615, "balance_loss_clip": 1.02243233, "balance_loss_mlp": 1.03437185, "epoch": 0.9268600631294153, "flos": 24789351553920.0, "grad_norm": 1.612012168246438, "language_loss": 0.54180604, "learning_rate": 5.580927866294671e-08, "loss": 0.56320041, "num_input_tokens_seen": 332756270, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 15416, "time_per_iteration": 2.4780821800231934 }, { "auxiliary_loss_clip": 0.01101168, "auxiliary_loss_mlp": 0.01029772, "balance_loss_clip": 1.0185014, "balance_loss_mlp": 1.0344367, "epoch": 0.9269201863820833, "flos": 18697178453760.0, "grad_norm": 6.817949426275836, "language_loss": 0.71815652, "learning_rate": 5.571795325221807e-08, "loss": 0.73946589, "num_input_tokens_seen": 332775185, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.66796875, "step": 15417, "time_per_iteration": 3.8260819911956787 }, { "auxiliary_loss_clip": 0.01104082, "auxiliary_loss_mlp": 0.01028824, "balance_loss_clip": 1.0163722, "balance_loss_mlp": 1.03593898, "epoch": 0.9269803096347512, "flos": 20923999136640.0, "grad_norm": 12.598833882409362, "language_loss": 0.76009619, "learning_rate": 5.5626701568859624e-08, "loss": 0.78142518, "num_input_tokens_seen": 332794320, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6796875, "step": 15418, "time_per_iteration": 2.4365034103393555 }, { "auxiliary_loss_clip": 0.01101999, "auxiliary_loss_mlp": 0.01030553, "balance_loss_clip": 1.01830459, "balance_loss_mlp": 1.0341692, "epoch": 0.9270404328874192, "flos": 28002710252160.0, "grad_norm": 2.8524946798566004, "language_loss": 0.76104218, "learning_rate": 5.553552361633174e-08, "loss": 0.78236771, "num_input_tokens_seen": 332818095, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 15419, "time_per_iteration": 2.5218334197998047 }, { "auxiliary_loss_clip": 0.01099094, "auxiliary_loss_mlp": 0.01030846, "balance_loss_clip": 1.01951551, "balance_loss_mlp": 1.03313172, "epoch": 0.9271005561400871, "flos": 25889870401920.0, "grad_norm": 1.8249293519058714, "language_loss": 0.75730097, "learning_rate": 5.5444419398091636e-08, "loss": 0.77860034, "num_input_tokens_seen": 332839860, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66015625, "step": 15420, "time_per_iteration": 5.288715362548828 }, { "auxiliary_loss_clip": 0.01106749, "auxiliary_loss_mlp": 0.01030022, "balance_loss_clip": 1.0173924, "balance_loss_mlp": 1.03520834, "epoch": 0.9271606793927551, "flos": 27053914452480.0, "grad_norm": 2.2288182317578973, "language_loss": 0.76643288, "learning_rate": 5.535338891759389e-08, "loss": 0.78780055, "num_input_tokens_seen": 332861155, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 15421, "time_per_iteration": 2.500701665878296 }, { "auxiliary_loss_clip": 0.01104574, "auxiliary_loss_mlp": 0.01030091, "balance_loss_clip": 1.01856327, "balance_loss_mlp": 1.03611708, "epoch": 0.9272208026454232, "flos": 26209869690240.0, "grad_norm": 2.0074164997947097, "language_loss": 0.73142803, "learning_rate": 5.526243217829041e-08, "loss": 0.7527746, "num_input_tokens_seen": 332881110, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.68359375, "step": 15422, "time_per_iteration": 2.5103342533111572 }, { "auxiliary_loss_clip": 0.01106924, "auxiliary_loss_mlp": 0.01037735, "balance_loss_clip": 1.02471793, "balance_loss_mlp": 1.03594542, "epoch": 0.9272809258980911, "flos": 12458453863680.0, "grad_norm": 2.7963340505129644, "language_loss": 0.77180552, "learning_rate": 5.517154918363065e-08, "loss": 0.79325211, "num_input_tokens_seen": 332899350, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 15423, "time_per_iteration": 2.4144153594970703 }, { "auxiliary_loss_clip": 0.0110561, "auxiliary_loss_mlp": 0.01030974, "balance_loss_clip": 1.01810479, "balance_loss_mlp": 1.0348835, "epoch": 0.9273410491507591, "flos": 22856890826880.0, "grad_norm": 1.9680930936657, "language_loss": 0.75450897, "learning_rate": 5.508073993706053e-08, "loss": 0.77587479, "num_input_tokens_seen": 332918105, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 15424, "time_per_iteration": 2.4686384201049805 }, { "auxiliary_loss_clip": 0.01027972, "auxiliary_loss_mlp": 0.01001022, "balance_loss_clip": 0.99993151, "balance_loss_mlp": 1.00553322, "epoch": 0.927401172403427, "flos": 47665384329600.0, "grad_norm": 0.7783226159108408, "language_loss": 0.60644352, "learning_rate": 5.499000444202351e-08, "loss": 0.62673348, "num_input_tokens_seen": 332969490, "router_z_loss_clip": 0.01092529, "router_z_loss_mlp": 0.22460938, "step": 15425, "time_per_iteration": 2.8613505363464355 }, { "auxiliary_loss_clip": 0.01105073, "auxiliary_loss_mlp": 0.01030883, "balance_loss_clip": 1.01844919, "balance_loss_mlp": 1.03626204, "epoch": 0.927461295656095, "flos": 29972374490880.0, "grad_norm": 1.5962331462443655, "language_loss": 0.70470291, "learning_rate": 5.489934270196106e-08, "loss": 0.72606248, "num_input_tokens_seen": 332988805, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6875, "step": 15426, "time_per_iteration": 2.534653902053833 }, { "auxiliary_loss_clip": 0.01103194, "auxiliary_loss_mlp": 0.01025438, "balance_loss_clip": 1.0137254, "balance_loss_mlp": 1.03535712, "epoch": 0.9275214189087629, "flos": 20375427651840.0, "grad_norm": 1.8570592240009736, "language_loss": 0.8284632, "learning_rate": 5.480875472030977e-08, "loss": 0.84974951, "num_input_tokens_seen": 333007960, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 15427, "time_per_iteration": 2.4666171073913574 }, { "auxiliary_loss_clip": 0.01105979, "auxiliary_loss_mlp": 0.01035201, "balance_loss_clip": 1.02292883, "balance_loss_mlp": 1.03707576, "epoch": 0.927581542161431, "flos": 22383193242240.0, "grad_norm": 1.8037828158648939, "language_loss": 0.76696628, "learning_rate": 5.471824050050555e-08, "loss": 0.78837812, "num_input_tokens_seen": 333026035, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 15428, "time_per_iteration": 2.4660091400146484 }, { "auxiliary_loss_clip": 0.01101223, "auxiliary_loss_mlp": 0.01032114, "balance_loss_clip": 1.01981163, "balance_loss_mlp": 1.0330236, "epoch": 0.9276416654140989, "flos": 23952453598080.0, "grad_norm": 2.0799747475202075, "language_loss": 0.74617517, "learning_rate": 5.4627800045980555e-08, "loss": 0.76750857, "num_input_tokens_seen": 333045590, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 15429, "time_per_iteration": 2.46636700630188 }, { "auxiliary_loss_clip": 0.0110199, "auxiliary_loss_mlp": 0.01032807, "balance_loss_clip": 1.02101111, "balance_loss_mlp": 1.03529465, "epoch": 0.9277017886667669, "flos": 13917719796480.0, "grad_norm": 1.7930910354202694, "language_loss": 0.74495935, "learning_rate": 5.45374333601647e-08, "loss": 0.76630735, "num_input_tokens_seen": 333063355, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.66796875, "step": 15430, "time_per_iteration": 2.4295616149902344 }, { "auxiliary_loss_clip": 0.01103615, "auxiliary_loss_mlp": 0.01032201, "balance_loss_clip": 1.01929617, "balance_loss_mlp": 1.0345577, "epoch": 0.9277619119194348, "flos": 35666478092160.0, "grad_norm": 1.5338919744059396, "language_loss": 0.76253974, "learning_rate": 5.444714044648391e-08, "loss": 0.78389794, "num_input_tokens_seen": 333088045, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 15431, "time_per_iteration": 2.5978329181671143 }, { "auxiliary_loss_clip": 0.01103497, "auxiliary_loss_mlp": 0.01030531, "balance_loss_clip": 1.01905096, "balance_loss_mlp": 1.0361588, "epoch": 0.9278220351721028, "flos": 23841238112640.0, "grad_norm": 2.1516971077549893, "language_loss": 0.70445228, "learning_rate": 5.4356921308363e-08, "loss": 0.72579253, "num_input_tokens_seen": 333108005, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.671875, "step": 15432, "time_per_iteration": 2.5005571842193604 }, { "auxiliary_loss_clip": 0.01105961, "auxiliary_loss_mlp": 0.01028943, "balance_loss_clip": 1.01656914, "balance_loss_mlp": 1.03542972, "epoch": 0.9278821584247707, "flos": 15228135768960.0, "grad_norm": 2.555088494941442, "language_loss": 0.81805956, "learning_rate": 5.4266775949222354e-08, "loss": 0.83940858, "num_input_tokens_seen": 333124335, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 15433, "time_per_iteration": 2.389780282974243 }, { "auxiliary_loss_clip": 0.01100078, "auxiliary_loss_mlp": 0.01029199, "balance_loss_clip": 1.01857793, "balance_loss_mlp": 1.03486836, "epoch": 0.9279422816774388, "flos": 24681404206080.0, "grad_norm": 2.113909628883914, "language_loss": 0.66616666, "learning_rate": 5.417670437248056e-08, "loss": 0.68745947, "num_input_tokens_seen": 333143995, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.65234375, "step": 15434, "time_per_iteration": 2.496626853942871 }, { "auxiliary_loss_clip": 0.01098749, "auxiliary_loss_mlp": 0.01028304, "balance_loss_clip": 1.01734257, "balance_loss_mlp": 1.03388011, "epoch": 0.9280024049301068, "flos": 19169188099200.0, "grad_norm": 1.976496247374368, "language_loss": 0.68889952, "learning_rate": 5.40867065815529e-08, "loss": 0.71017009, "num_input_tokens_seen": 333162805, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6484375, "step": 15435, "time_per_iteration": 2.4449777603149414 }, { "auxiliary_loss_clip": 0.01104372, "auxiliary_loss_mlp": 0.01031444, "balance_loss_clip": 1.01889157, "balance_loss_mlp": 1.03512526, "epoch": 0.9280625281827747, "flos": 11393701983360.0, "grad_norm": 54.75054683799928, "language_loss": 0.72271109, "learning_rate": 5.399678257985263e-08, "loss": 0.74406922, "num_input_tokens_seen": 333175770, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 15436, "time_per_iteration": 2.4368538856506348 }, { "auxiliary_loss_clip": 0.01102999, "auxiliary_loss_mlp": 0.01030277, "balance_loss_clip": 1.01845694, "balance_loss_mlp": 1.03489077, "epoch": 0.9281226514354427, "flos": 24785616539520.0, "grad_norm": 8.467731340518451, "language_loss": 0.66882604, "learning_rate": 5.390693237078925e-08, "loss": 0.69015884, "num_input_tokens_seen": 333194775, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 15437, "time_per_iteration": 2.4783976078033447 }, { "auxiliary_loss_clip": 0.01105944, "auxiliary_loss_mlp": 0.01032878, "balance_loss_clip": 1.01978278, "balance_loss_mlp": 1.03479946, "epoch": 0.9281827746881106, "flos": 15083128563840.0, "grad_norm": 2.030043185478654, "language_loss": 0.71433592, "learning_rate": 5.3817155957770254e-08, "loss": 0.73572421, "num_input_tokens_seen": 333208920, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 15438, "time_per_iteration": 2.4148354530334473 }, { "auxiliary_loss_clip": 0.01103914, "auxiliary_loss_mlp": 0.01030979, "balance_loss_clip": 1.01873016, "balance_loss_mlp": 1.0348345, "epoch": 0.9282428979407786, "flos": 24135059364480.0, "grad_norm": 2.027652064701521, "language_loss": 0.64700115, "learning_rate": 5.3727453344199366e-08, "loss": 0.6683501, "num_input_tokens_seen": 333229350, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69140625, "step": 15439, "time_per_iteration": 2.4666051864624023 }, { "auxiliary_loss_clip": 0.01104515, "auxiliary_loss_mlp": 0.010304, "balance_loss_clip": 1.01835418, "balance_loss_mlp": 1.03587377, "epoch": 0.9283030211934465, "flos": 24823215100800.0, "grad_norm": 1.8127243754190676, "language_loss": 0.6999352, "learning_rate": 5.363782453347876e-08, "loss": 0.72128439, "num_input_tokens_seen": 333246125, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 15440, "time_per_iteration": 2.485358953475952 }, { "auxiliary_loss_clip": 0.01107817, "auxiliary_loss_mlp": 0.01035319, "balance_loss_clip": 1.02238441, "balance_loss_mlp": 1.0361644, "epoch": 0.9283631444461146, "flos": 23981037845760.0, "grad_norm": 1.748130333097644, "language_loss": 0.76779127, "learning_rate": 5.354826952900682e-08, "loss": 0.7892226, "num_input_tokens_seen": 333263685, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 15441, "time_per_iteration": 2.4569828510284424 }, { "auxiliary_loss_clip": 0.01099565, "auxiliary_loss_mlp": 0.01026611, "balance_loss_clip": 1.01648962, "balance_loss_mlp": 1.03461194, "epoch": 0.9284232676987825, "flos": 22784530878720.0, "grad_norm": 1.6799835808093837, "language_loss": 0.64123642, "learning_rate": 5.345878833417949e-08, "loss": 0.66249812, "num_input_tokens_seen": 333282435, "router_z_loss_clip": 0.10107422, "router_z_loss_mlp": 0.6484375, "step": 15442, "time_per_iteration": 2.463585138320923 }, { "auxiliary_loss_clip": 0.01106978, "auxiliary_loss_mlp": 0.0103666, "balance_loss_clip": 1.02419639, "balance_loss_mlp": 1.03566647, "epoch": 0.9284833909514505, "flos": 19500500171520.0, "grad_norm": 2.291601437889485, "language_loss": 0.80716693, "learning_rate": 5.3369380952390295e-08, "loss": 0.82860333, "num_input_tokens_seen": 333300400, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7109375, "step": 15443, "time_per_iteration": 2.4322524070739746 }, { "auxiliary_loss_clip": 0.01104872, "auxiliary_loss_mlp": 0.01029872, "balance_loss_clip": 1.01785624, "balance_loss_mlp": 1.03600502, "epoch": 0.9285435142041184, "flos": 23185976256000.0, "grad_norm": 1.8880325603383679, "language_loss": 0.65511411, "learning_rate": 5.328004738702896e-08, "loss": 0.67646158, "num_input_tokens_seen": 333318980, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 15444, "time_per_iteration": 2.4833202362060547 }, { "auxiliary_loss_clip": 0.01103664, "auxiliary_loss_mlp": 0.01030851, "balance_loss_clip": 1.01898932, "balance_loss_mlp": 1.03439379, "epoch": 0.9286036374567864, "flos": 17675519915520.0, "grad_norm": 2.0393187733669746, "language_loss": 0.73507673, "learning_rate": 5.3190787641483215e-08, "loss": 0.75642186, "num_input_tokens_seen": 333334135, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6953125, "step": 15445, "time_per_iteration": 2.4132559299468994 }, { "auxiliary_loss_clip": 0.01105235, "auxiliary_loss_mlp": 0.01033329, "balance_loss_clip": 1.02057934, "balance_loss_mlp": 1.0365411, "epoch": 0.9286637607094543, "flos": 20886687884160.0, "grad_norm": 1.7522210887550476, "language_loss": 0.71408397, "learning_rate": 5.3101601719138135e-08, "loss": 0.73546958, "num_input_tokens_seen": 333353325, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 15446, "time_per_iteration": 2.476815700531006 }, { "auxiliary_loss_clip": 0.01107598, "auxiliary_loss_mlp": 0.01027946, "balance_loss_clip": 1.0151546, "balance_loss_mlp": 1.03519964, "epoch": 0.9287238839621224, "flos": 19026012487680.0, "grad_norm": 1.749730706361438, "language_loss": 0.69181073, "learning_rate": 5.301248962337523e-08, "loss": 0.71316618, "num_input_tokens_seen": 333371110, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 15447, "time_per_iteration": 2.4279427528381348 }, { "auxiliary_loss_clip": 0.01098165, "auxiliary_loss_mlp": 0.01027377, "balance_loss_clip": 1.01637352, "balance_loss_mlp": 1.03393793, "epoch": 0.9287840072147904, "flos": 20557027837440.0, "grad_norm": 1.8282471612477822, "language_loss": 0.72579587, "learning_rate": 5.292345135757403e-08, "loss": 0.74705124, "num_input_tokens_seen": 333391420, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.640625, "step": 15448, "time_per_iteration": 2.4879119396209717 }, { "auxiliary_loss_clip": 0.01103061, "auxiliary_loss_mlp": 0.01026735, "balance_loss_clip": 1.01357985, "balance_loss_mlp": 1.0345757, "epoch": 0.9288441304674583, "flos": 21250822008960.0, "grad_norm": 1.7387798662742844, "language_loss": 0.74329758, "learning_rate": 5.283448692511072e-08, "loss": 0.76459551, "num_input_tokens_seen": 333410365, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.68359375, "step": 15449, "time_per_iteration": 2.469477653503418 }, { "auxiliary_loss_clip": 0.01102981, "auxiliary_loss_mlp": 0.01025329, "balance_loss_clip": 1.01292503, "balance_loss_mlp": 1.03426528, "epoch": 0.9289042537201263, "flos": 27669853895040.0, "grad_norm": 1.8456870237242553, "language_loss": 0.68026739, "learning_rate": 5.27455963293586e-08, "loss": 0.70155048, "num_input_tokens_seen": 333430000, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 15450, "time_per_iteration": 2.5110130310058594 }, { "auxiliary_loss_clip": 0.01103946, "auxiliary_loss_mlp": 0.0102818, "balance_loss_clip": 1.01557946, "balance_loss_mlp": 1.03461862, "epoch": 0.9289643769727942, "flos": 19317750750720.0, "grad_norm": 1.960518869374049, "language_loss": 0.71929276, "learning_rate": 5.265677957368875e-08, "loss": 0.74061406, "num_input_tokens_seen": 333445800, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 15451, "time_per_iteration": 2.471369743347168 }, { "auxiliary_loss_clip": 0.01106309, "auxiliary_loss_mlp": 0.01035032, "balance_loss_clip": 1.02310538, "balance_loss_mlp": 1.03665602, "epoch": 0.9290245002254622, "flos": 14058058233600.0, "grad_norm": 2.2454369191780614, "language_loss": 0.73451746, "learning_rate": 5.25680366614687e-08, "loss": 0.7559309, "num_input_tokens_seen": 333461550, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 15452, "time_per_iteration": 2.425936460494995 }, { "auxiliary_loss_clip": 0.0110693, "auxiliary_loss_mlp": 0.0102613, "balance_loss_clip": 1.01399469, "balance_loss_mlp": 1.03864777, "epoch": 0.9290846234781301, "flos": 20047132321920.0, "grad_norm": 2.802656168191223, "language_loss": 0.73926729, "learning_rate": 5.2479367596064196e-08, "loss": 0.76059788, "num_input_tokens_seen": 333478835, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 15453, "time_per_iteration": 2.432260751724243 }, { "auxiliary_loss_clip": 0.01028319, "auxiliary_loss_mlp": 0.01001106, "balance_loss_clip": 1.00012898, "balance_loss_mlp": 1.00586605, "epoch": 0.9291447467307982, "flos": 61227514460160.0, "grad_norm": 0.8255987974398501, "language_loss": 0.60639697, "learning_rate": 5.2390772380837226e-08, "loss": 0.62669122, "num_input_tokens_seen": 333535250, "router_z_loss_clip": 0.00976562, "router_z_loss_mlp": 0.22460938, "step": 15454, "time_per_iteration": 4.363851070404053 }, { "auxiliary_loss_clip": 0.01103358, "auxiliary_loss_mlp": 0.01031678, "balance_loss_clip": 1.01934552, "balance_loss_mlp": 1.03380609, "epoch": 0.9292048699834661, "flos": 20553328736640.0, "grad_norm": 1.704485144356235, "language_loss": 0.69074714, "learning_rate": 5.230225101914709e-08, "loss": 0.71209747, "num_input_tokens_seen": 333553805, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 15455, "time_per_iteration": 2.4430718421936035 }, { "auxiliary_loss_clip": 0.01105291, "auxiliary_loss_mlp": 0.01033366, "balance_loss_clip": 1.02066994, "balance_loss_mlp": 1.03638554, "epoch": 0.9292649932361341, "flos": 23623655477760.0, "grad_norm": 1.7639656717705994, "language_loss": 0.64962834, "learning_rate": 5.22138035143509e-08, "loss": 0.6710149, "num_input_tokens_seen": 333572800, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 15456, "time_per_iteration": 2.474829912185669 }, { "auxiliary_loss_clip": 0.01104425, "auxiliary_loss_mlp": 0.01027226, "balance_loss_clip": 1.01476252, "balance_loss_mlp": 1.03706264, "epoch": 0.929325116488802, "flos": 15009942602880.0, "grad_norm": 1.9037971344309708, "language_loss": 0.68073219, "learning_rate": 5.2125429869802615e-08, "loss": 0.70204866, "num_input_tokens_seen": 333588520, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.671875, "step": 15457, "time_per_iteration": 2.394907236099243 }, { "auxiliary_loss_clip": 0.01104163, "auxiliary_loss_mlp": 0.0102846, "balance_loss_clip": 1.01657462, "balance_loss_mlp": 1.03353059, "epoch": 0.92938523974147, "flos": 17967365919360.0, "grad_norm": 1.9328595411689862, "language_loss": 0.80806065, "learning_rate": 5.203713008885291e-08, "loss": 0.82938689, "num_input_tokens_seen": 333603435, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.703125, "step": 15458, "time_per_iteration": 3.802706003189087 }, { "auxiliary_loss_clip": 0.0110304, "auxiliary_loss_mlp": 0.01030472, "balance_loss_clip": 1.01837182, "balance_loss_mlp": 1.03369761, "epoch": 0.9294453629941379, "flos": 23003047267200.0, "grad_norm": 1.7510638530785232, "language_loss": 0.7223472, "learning_rate": 5.194890417485065e-08, "loss": 0.74368227, "num_input_tokens_seen": 333623305, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 15459, "time_per_iteration": 2.4684722423553467 }, { "auxiliary_loss_clip": 0.01105267, "auxiliary_loss_mlp": 0.01034841, "balance_loss_clip": 1.02304554, "balance_loss_mlp": 1.03631461, "epoch": 0.929505486246806, "flos": 17055234927360.0, "grad_norm": 2.2755718435530152, "language_loss": 0.59272182, "learning_rate": 5.1860752131141384e-08, "loss": 0.61412287, "num_input_tokens_seen": 333641205, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 15460, "time_per_iteration": 2.415708065032959 }, { "auxiliary_loss_clip": 0.01107061, "auxiliary_loss_mlp": 0.01033251, "balance_loss_clip": 1.02063251, "balance_loss_mlp": 1.0361445, "epoch": 0.9295656094994739, "flos": 27340409329920.0, "grad_norm": 1.7227587533528257, "language_loss": 0.80636263, "learning_rate": 5.177267396106733e-08, "loss": 0.82776582, "num_input_tokens_seen": 333659615, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 15461, "time_per_iteration": 2.511362314224243 }, { "auxiliary_loss_clip": 0.01100907, "auxiliary_loss_mlp": 0.01029014, "balance_loss_clip": 1.01674151, "balance_loss_mlp": 1.03421509, "epoch": 0.9296257327521419, "flos": 21470954509440.0, "grad_norm": 1.875552539978444, "language_loss": 0.78476036, "learning_rate": 5.168466966796869e-08, "loss": 0.8060596, "num_input_tokens_seen": 333678985, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.66796875, "step": 15462, "time_per_iteration": 5.333228588104248 }, { "auxiliary_loss_clip": 0.0110217, "auxiliary_loss_mlp": 0.01025509, "balance_loss_clip": 1.01356459, "balance_loss_mlp": 1.03301072, "epoch": 0.9296858560048099, "flos": 16362661818240.0, "grad_norm": 1.9038565693713423, "language_loss": 0.62603116, "learning_rate": 5.159673925518282e-08, "loss": 0.64730799, "num_input_tokens_seen": 333696410, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.69140625, "step": 15463, "time_per_iteration": 2.4466965198516846 }, { "auxiliary_loss_clip": 0.01100715, "auxiliary_loss_mlp": 0.01030002, "balance_loss_clip": 1.01892173, "balance_loss_mlp": 1.03283548, "epoch": 0.9297459792574778, "flos": 29858609139840.0, "grad_norm": 3.6756611920751667, "language_loss": 0.71415305, "learning_rate": 5.15088827260437e-08, "loss": 0.73546028, "num_input_tokens_seen": 333716615, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.67578125, "step": 15464, "time_per_iteration": 2.5421154499053955 }, { "auxiliary_loss_clip": 0.01104283, "auxiliary_loss_mlp": 0.01029375, "balance_loss_clip": 1.01759076, "balance_loss_mlp": 1.03522277, "epoch": 0.9298061025101458, "flos": 15924838942080.0, "grad_norm": 1.968787795106715, "language_loss": 0.7773056, "learning_rate": 5.1421100083883115e-08, "loss": 0.79864222, "num_input_tokens_seen": 333732800, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69140625, "step": 15465, "time_per_iteration": 2.408475637435913 }, { "auxiliary_loss_clip": 0.01028197, "auxiliary_loss_mlp": 0.01001501, "balance_loss_clip": 1.00035083, "balance_loss_mlp": 1.00573206, "epoch": 0.9298662257628137, "flos": 64096994304000.0, "grad_norm": 0.704091213402388, "language_loss": 0.56438309, "learning_rate": 5.133339133202952e-08, "loss": 0.58468008, "num_input_tokens_seen": 333799300, "router_z_loss_clip": 0.01147461, "router_z_loss_mlp": 0.22460938, "step": 15466, "time_per_iteration": 3.209336280822754 }, { "auxiliary_loss_clip": 0.01103041, "auxiliary_loss_mlp": 0.01039905, "balance_loss_clip": 1.02627993, "balance_loss_mlp": 1.03349614, "epoch": 0.9299263490154818, "flos": 24280210224000.0, "grad_norm": 1.5571004322057425, "language_loss": 0.72904915, "learning_rate": 5.1245756473809355e-08, "loss": 0.75047863, "num_input_tokens_seen": 333820360, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.6953125, "step": 15467, "time_per_iteration": 2.4708826541900635 }, { "auxiliary_loss_clip": 0.01106099, "auxiliary_loss_mlp": 0.0103351, "balance_loss_clip": 1.02108288, "balance_loss_mlp": 1.03598177, "epoch": 0.9299864722681497, "flos": 23294354567040.0, "grad_norm": 1.7482376843011487, "language_loss": 0.71738291, "learning_rate": 5.1158195512545076e-08, "loss": 0.73877907, "num_input_tokens_seen": 333840415, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 15468, "time_per_iteration": 2.4735267162323 }, { "auxiliary_loss_clip": 0.01105144, "auxiliary_loss_mlp": 0.0102936, "balance_loss_clip": 1.01690269, "balance_loss_mlp": 1.03407562, "epoch": 0.9300465955208177, "flos": 21395972868480.0, "grad_norm": 2.4656396720062084, "language_loss": 0.75745904, "learning_rate": 5.107070845155737e-08, "loss": 0.77880412, "num_input_tokens_seen": 333859910, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 15469, "time_per_iteration": 2.4484760761260986 }, { "auxiliary_loss_clip": 0.01103088, "auxiliary_loss_mlp": 0.01033949, "balance_loss_clip": 1.02169442, "balance_loss_mlp": 1.03407514, "epoch": 0.9301067187734856, "flos": 24571445696640.0, "grad_norm": 2.0963948786333093, "language_loss": 0.75784361, "learning_rate": 5.098329529416379e-08, "loss": 0.77921402, "num_input_tokens_seen": 333880495, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 15470, "time_per_iteration": 2.504671573638916 }, { "auxiliary_loss_clip": 0.01101924, "auxiliary_loss_mlp": 0.0103072, "balance_loss_clip": 1.01913929, "balance_loss_mlp": 1.03383148, "epoch": 0.9301668420261536, "flos": 22196960202240.0, "grad_norm": 1.5578997652459419, "language_loss": 0.74745268, "learning_rate": 5.089595604367902e-08, "loss": 0.7687791, "num_input_tokens_seen": 333897640, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6796875, "step": 15471, "time_per_iteration": 2.4512832164764404 }, { "auxiliary_loss_clip": 0.0110316, "auxiliary_loss_mlp": 0.01028191, "balance_loss_clip": 1.01587641, "balance_loss_mlp": 1.03410697, "epoch": 0.9302269652788215, "flos": 17747628468480.0, "grad_norm": 2.375207010431937, "language_loss": 0.68597895, "learning_rate": 5.080869070341487e-08, "loss": 0.70729244, "num_input_tokens_seen": 333913670, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 15472, "time_per_iteration": 2.4488327503204346 }, { "auxiliary_loss_clip": 0.01097716, "auxiliary_loss_mlp": 0.01031089, "balance_loss_clip": 1.01980042, "balance_loss_mlp": 1.0332284, "epoch": 0.9302870885314896, "flos": 19390793057280.0, "grad_norm": 1.575925962960958, "language_loss": 0.88806057, "learning_rate": 5.0721499276680233e-08, "loss": 0.90934873, "num_input_tokens_seen": 333934105, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6484375, "step": 15473, "time_per_iteration": 2.4560446739196777 }, { "auxiliary_loss_clip": 0.01107409, "auxiliary_loss_mlp": 0.01036679, "balance_loss_clip": 1.02320862, "balance_loss_mlp": 1.03558517, "epoch": 0.9303472117841575, "flos": 21760286561280.0, "grad_norm": 3.1431909679314765, "language_loss": 0.64220166, "learning_rate": 5.063438176678203e-08, "loss": 0.66364253, "num_input_tokens_seen": 333953635, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 15474, "time_per_iteration": 2.472311019897461 }, { "auxiliary_loss_clip": 0.01103879, "auxiliary_loss_mlp": 0.01034732, "balance_loss_clip": 1.0226562, "balance_loss_mlp": 1.03533506, "epoch": 0.9304073350368255, "flos": 19609740408960.0, "grad_norm": 1.8925222147065057, "language_loss": 0.7446717, "learning_rate": 5.054733817702339e-08, "loss": 0.76605785, "num_input_tokens_seen": 333971825, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 15475, "time_per_iteration": 2.4378726482391357 }, { "auxiliary_loss_clip": 0.01102439, "auxiliary_loss_mlp": 0.01026276, "balance_loss_clip": 1.01451051, "balance_loss_mlp": 1.03401446, "epoch": 0.9304674582894935, "flos": 30441582875520.0, "grad_norm": 2.133194592320582, "language_loss": 0.66271967, "learning_rate": 5.0460368510704786e-08, "loss": 0.68400681, "num_input_tokens_seen": 333990120, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.68359375, "step": 15476, "time_per_iteration": 2.523101806640625 }, { "auxiliary_loss_clip": 0.01105565, "auxiliary_loss_mlp": 0.01033888, "balance_loss_clip": 1.02134693, "balance_loss_mlp": 1.03599286, "epoch": 0.9305275815421614, "flos": 17785693906560.0, "grad_norm": 2.845276980617397, "language_loss": 0.69328678, "learning_rate": 5.0373472771124914e-08, "loss": 0.71468127, "num_input_tokens_seen": 334007970, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 15477, "time_per_iteration": 2.4248454570770264 }, { "auxiliary_loss_clip": 0.01102963, "auxiliary_loss_mlp": 0.01029841, "balance_loss_clip": 1.01815867, "balance_loss_mlp": 1.03574193, "epoch": 0.9305877047948294, "flos": 25298456970240.0, "grad_norm": 2.6949716655528135, "language_loss": 0.58600152, "learning_rate": 5.0286650961578027e-08, "loss": 0.60732949, "num_input_tokens_seen": 334027120, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 15478, "time_per_iteration": 2.4986789226531982 }, { "auxiliary_loss_clip": 0.0110919, "auxiliary_loss_mlp": 0.01028336, "balance_loss_clip": 1.01481199, "balance_loss_mlp": 1.03590083, "epoch": 0.9306478280474973, "flos": 16977236544000.0, "grad_norm": 2.0182770948192235, "language_loss": 0.78673166, "learning_rate": 5.01999030853566e-08, "loss": 0.8081069, "num_input_tokens_seen": 334042785, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.734375, "step": 15479, "time_per_iteration": 2.3992929458618164 }, { "auxiliary_loss_clip": 0.01102324, "auxiliary_loss_mlp": 0.01029493, "balance_loss_clip": 1.01762009, "balance_loss_mlp": 1.03358567, "epoch": 0.9307079513001654, "flos": 35663353608960.0, "grad_norm": 1.8841536026784709, "language_loss": 0.68851048, "learning_rate": 5.0113229145750445e-08, "loss": 0.70982862, "num_input_tokens_seen": 334063480, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 15480, "time_per_iteration": 2.600372791290283 }, { "auxiliary_loss_clip": 0.0110427, "auxiliary_loss_mlp": 0.01032054, "balance_loss_clip": 1.01994812, "balance_loss_mlp": 1.03579712, "epoch": 0.9307680745528333, "flos": 19208151377280.0, "grad_norm": 1.761770534510588, "language_loss": 0.67531967, "learning_rate": 5.002662914604583e-08, "loss": 0.69668293, "num_input_tokens_seen": 334082005, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 15481, "time_per_iteration": 2.4339351654052734 }, { "auxiliary_loss_clip": 0.01100806, "auxiliary_loss_mlp": 0.01027626, "balance_loss_clip": 1.0163486, "balance_loss_mlp": 1.03376305, "epoch": 0.9308281978055013, "flos": 19062641381760.0, "grad_norm": 2.507208494300077, "language_loss": 0.7467134, "learning_rate": 4.994010308952701e-08, "loss": 0.76799774, "num_input_tokens_seen": 334101375, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.671875, "step": 15482, "time_per_iteration": 2.465191125869751 }, { "auxiliary_loss_clip": 0.01101148, "auxiliary_loss_mlp": 0.01026166, "balance_loss_clip": 1.01470399, "balance_loss_mlp": 1.03425002, "epoch": 0.9308883210581692, "flos": 20521548178560.0, "grad_norm": 1.9944564689093167, "language_loss": 0.80327344, "learning_rate": 4.985365097947469e-08, "loss": 0.82454658, "num_input_tokens_seen": 334119460, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.671875, "step": 15483, "time_per_iteration": 2.4397878646850586 }, { "auxiliary_loss_clip": 0.01104462, "auxiliary_loss_mlp": 0.01034643, "balance_loss_clip": 1.02207875, "balance_loss_mlp": 1.03555238, "epoch": 0.9309484443108372, "flos": 13001422826880.0, "grad_norm": 1.9515219808487345, "language_loss": 0.74562007, "learning_rate": 4.976727281916782e-08, "loss": 0.76701117, "num_input_tokens_seen": 334136065, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 15484, "time_per_iteration": 2.430774450302124 }, { "auxiliary_loss_clip": 0.01108215, "auxiliary_loss_mlp": 0.01031798, "balance_loss_clip": 1.01896513, "balance_loss_mlp": 1.03696036, "epoch": 0.9310085675635051, "flos": 12567765928320.0, "grad_norm": 2.1547766705796096, "language_loss": 0.76286209, "learning_rate": 4.968096861188087e-08, "loss": 0.78426218, "num_input_tokens_seen": 334153690, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 15485, "time_per_iteration": 2.4153926372528076 }, { "auxiliary_loss_clip": 0.01103573, "auxiliary_loss_mlp": 0.01030969, "balance_loss_clip": 1.01727223, "balance_loss_mlp": 1.03337109, "epoch": 0.9310686908161732, "flos": 23477570864640.0, "grad_norm": 1.819390171293516, "language_loss": 0.78125012, "learning_rate": 4.959473836088723e-08, "loss": 0.8025955, "num_input_tokens_seen": 334171880, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 15486, "time_per_iteration": 2.482117176055908 }, { "auxiliary_loss_clip": 0.01109928, "auxiliary_loss_mlp": 0.01029981, "balance_loss_clip": 1.01697493, "balance_loss_mlp": 1.03876996, "epoch": 0.9311288140688411, "flos": 24170287628160.0, "grad_norm": 1.8551204415421787, "language_loss": 0.76788914, "learning_rate": 4.950858206945674e-08, "loss": 0.78928822, "num_input_tokens_seen": 334190005, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 15487, "time_per_iteration": 2.4743130207061768 }, { "auxiliary_loss_clip": 0.0110292, "auxiliary_loss_mlp": 0.01029289, "balance_loss_clip": 1.01662874, "balance_loss_mlp": 1.03518987, "epoch": 0.9311889373215091, "flos": 35590203561600.0, "grad_norm": 2.7400608608519654, "language_loss": 0.67279238, "learning_rate": 4.942249974085633e-08, "loss": 0.69411451, "num_input_tokens_seen": 334209545, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 15488, "time_per_iteration": 2.5736496448516846 }, { "auxiliary_loss_clip": 0.01100336, "auxiliary_loss_mlp": 0.01033199, "balance_loss_clip": 1.02067053, "balance_loss_mlp": 1.03431773, "epoch": 0.9312490605741771, "flos": 20230528187520.0, "grad_norm": 5.680879605591114, "language_loss": 0.74635839, "learning_rate": 4.933649137834983e-08, "loss": 0.76769376, "num_input_tokens_seen": 334228900, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.66015625, "step": 15489, "time_per_iteration": 2.443751573562622 }, { "auxiliary_loss_clip": 0.01105588, "auxiliary_loss_mlp": 0.01031109, "balance_loss_clip": 1.01856804, "balance_loss_mlp": 1.03516901, "epoch": 0.931309183826845, "flos": 13950577762560.0, "grad_norm": 2.35803166194615, "language_loss": 0.81077254, "learning_rate": 4.925055698519931e-08, "loss": 0.83213949, "num_input_tokens_seen": 334245500, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 15490, "time_per_iteration": 2.445591449737549 }, { "auxiliary_loss_clip": 0.01105146, "auxiliary_loss_mlp": 0.01032332, "balance_loss_clip": 1.01957655, "balance_loss_mlp": 1.03513932, "epoch": 0.931369307079513, "flos": 20156731695360.0, "grad_norm": 2.383203572419261, "language_loss": 0.72232664, "learning_rate": 4.9164696564663264e-08, "loss": 0.7437014, "num_input_tokens_seen": 334264370, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 15491, "time_per_iteration": 2.4401490688323975 }, { "auxiliary_loss_clip": 0.0110001, "auxiliary_loss_mlp": 0.0102634, "balance_loss_clip": 1.01509905, "balance_loss_mlp": 1.03369856, "epoch": 0.931429430332181, "flos": 25338569483520.0, "grad_norm": 2.567342755437744, "language_loss": 0.74478233, "learning_rate": 4.9078910119997096e-08, "loss": 0.76604581, "num_input_tokens_seen": 334283905, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6640625, "step": 15492, "time_per_iteration": 2.5026543140411377 }, { "auxiliary_loss_clip": 0.0102844, "auxiliary_loss_mlp": 0.01001815, "balance_loss_clip": 1.00076032, "balance_loss_mlp": 1.00599456, "epoch": 0.931489553584849, "flos": 71226193985280.0, "grad_norm": 0.7934487010710999, "language_loss": 0.53448355, "learning_rate": 4.899319765445442e-08, "loss": 0.55478609, "num_input_tokens_seen": 334339925, "router_z_loss_clip": 0.01055908, "router_z_loss_mlp": 0.22460938, "step": 15493, "time_per_iteration": 2.9557886123657227 }, { "auxiliary_loss_clip": 0.01102865, "auxiliary_loss_mlp": 0.01029738, "balance_loss_clip": 1.0183655, "balance_loss_mlp": 1.03488958, "epoch": 0.9315496768375169, "flos": 14643653662080.0, "grad_norm": 1.914622413302856, "language_loss": 0.70982593, "learning_rate": 4.890755917128531e-08, "loss": 0.73115194, "num_input_tokens_seen": 334357225, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6796875, "step": 15494, "time_per_iteration": 2.421978712081909 }, { "auxiliary_loss_clip": 0.01105375, "auxiliary_loss_mlp": 0.01028174, "balance_loss_clip": 1.01579416, "balance_loss_mlp": 1.03466523, "epoch": 0.9316098000901849, "flos": 28329928174080.0, "grad_norm": 1.6272195130792184, "language_loss": 0.68415213, "learning_rate": 4.882199467373671e-08, "loss": 0.70548761, "num_input_tokens_seen": 334375945, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 15495, "time_per_iteration": 3.892132043838501 }, { "auxiliary_loss_clip": 0.01099261, "auxiliary_loss_mlp": 0.01032437, "balance_loss_clip": 1.02112377, "balance_loss_mlp": 1.03193414, "epoch": 0.9316699233428528, "flos": 28512677594880.0, "grad_norm": 2.2972711109715807, "language_loss": 0.61813235, "learning_rate": 4.8736504165053815e-08, "loss": 0.6394493, "num_input_tokens_seen": 334395310, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 15496, "time_per_iteration": 2.5002896785736084 }, { "auxiliary_loss_clip": 0.01103146, "auxiliary_loss_mlp": 0.01034623, "balance_loss_clip": 1.02224874, "balance_loss_mlp": 1.03438926, "epoch": 0.9317300465955208, "flos": 33693402061440.0, "grad_norm": 2.2702608713865065, "language_loss": 0.76985538, "learning_rate": 4.865108764847825e-08, "loss": 0.79123294, "num_input_tokens_seen": 334416965, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 15497, "time_per_iteration": 2.5812792778015137 }, { "auxiliary_loss_clip": 0.01107275, "auxiliary_loss_mlp": 0.01033942, "balance_loss_clip": 1.02103758, "balance_loss_mlp": 1.03650069, "epoch": 0.9317901698481887, "flos": 23658237296640.0, "grad_norm": 1.690556747038912, "language_loss": 0.66454834, "learning_rate": 4.856574512724898e-08, "loss": 0.68596053, "num_input_tokens_seen": 334435620, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 15498, "time_per_iteration": 2.483290433883667 }, { "auxiliary_loss_clip": 0.01106391, "auxiliary_loss_mlp": 0.01036396, "balance_loss_clip": 1.02307487, "balance_loss_mlp": 1.03653812, "epoch": 0.9318502931008568, "flos": 20960017499520.0, "grad_norm": 3.3870310863267123, "language_loss": 0.79313231, "learning_rate": 4.8480476604602305e-08, "loss": 0.81456017, "num_input_tokens_seen": 334456210, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 15499, "time_per_iteration": 2.4827206134796143 }, { "auxiliary_loss_clip": 0.01104362, "auxiliary_loss_mlp": 0.01031864, "balance_loss_clip": 1.01964474, "balance_loss_mlp": 1.03747046, "epoch": 0.9319104163535247, "flos": 23441049711360.0, "grad_norm": 1.620770128687872, "language_loss": 0.767488, "learning_rate": 4.8395282083771196e-08, "loss": 0.78885019, "num_input_tokens_seen": 334475485, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.66796875, "step": 15500, "time_per_iteration": 3.9623138904571533 }, { "auxiliary_loss_clip": 0.01100268, "auxiliary_loss_mlp": 0.01026243, "balance_loss_clip": 1.01442313, "balance_loss_mlp": 1.03292894, "epoch": 0.9319705396061927, "flos": 22347426274560.0, "grad_norm": 2.844961544192776, "language_loss": 0.72022671, "learning_rate": 4.8310161567987064e-08, "loss": 0.74149179, "num_input_tokens_seen": 334494740, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 15501, "time_per_iteration": 2.474942684173584 }, { "auxiliary_loss_clip": 0.01106645, "auxiliary_loss_mlp": 0.01035242, "balance_loss_clip": 1.02256453, "balance_loss_mlp": 1.03571439, "epoch": 0.9320306628588607, "flos": 20993557824000.0, "grad_norm": 2.0201912798196813, "language_loss": 0.66384089, "learning_rate": 4.822511506047666e-08, "loss": 0.6852597, "num_input_tokens_seen": 334511910, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 15502, "time_per_iteration": 2.425542116165161 }, { "auxiliary_loss_clip": 0.01105289, "auxiliary_loss_mlp": 0.01035711, "balance_loss_clip": 1.02389097, "balance_loss_mlp": 1.03518021, "epoch": 0.9320907861115286, "flos": 24538300421760.0, "grad_norm": 1.5495589130543936, "language_loss": 0.65763587, "learning_rate": 4.814014256446586e-08, "loss": 0.6790458, "num_input_tokens_seen": 334533150, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.703125, "step": 15503, "time_per_iteration": 2.5244991779327393 }, { "auxiliary_loss_clip": 0.01105824, "auxiliary_loss_mlp": 0.01031523, "balance_loss_clip": 1.01866019, "balance_loss_mlp": 1.03527534, "epoch": 0.9321509093641966, "flos": 19785414850560.0, "grad_norm": 1.5398619675073124, "language_loss": 0.75199348, "learning_rate": 4.805524408317652e-08, "loss": 0.77336693, "num_input_tokens_seen": 334550940, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 15504, "time_per_iteration": 5.3010711669921875 }, { "auxiliary_loss_clip": 0.01105868, "auxiliary_loss_mlp": 0.01026505, "balance_loss_clip": 1.01318383, "balance_loss_mlp": 1.03631854, "epoch": 0.9322110326168646, "flos": 24972675592320.0, "grad_norm": 2.203420448733145, "language_loss": 0.7110672, "learning_rate": 4.797041961982762e-08, "loss": 0.73239094, "num_input_tokens_seen": 334570935, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 15505, "time_per_iteration": 2.4657881259918213 }, { "auxiliary_loss_clip": 0.01105294, "auxiliary_loss_mlp": 0.01029712, "balance_loss_clip": 1.01677215, "balance_loss_mlp": 1.03555012, "epoch": 0.9322711558695326, "flos": 16143642639360.0, "grad_norm": 3.0598927900899, "language_loss": 0.74970126, "learning_rate": 4.788566917763614e-08, "loss": 0.77105129, "num_input_tokens_seen": 334589315, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 15506, "time_per_iteration": 2.4454433917999268 }, { "auxiliary_loss_clip": 0.01100998, "auxiliary_loss_mlp": 0.01028477, "balance_loss_clip": 1.01673532, "balance_loss_mlp": 1.03511965, "epoch": 0.9323312791222005, "flos": 23732428838400.0, "grad_norm": 2.0329962404074955, "language_loss": 0.83284533, "learning_rate": 4.780099275981597e-08, "loss": 0.85414004, "num_input_tokens_seen": 334608990, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.66015625, "step": 15507, "time_per_iteration": 2.454021692276001 }, { "auxiliary_loss_clip": 0.01105204, "auxiliary_loss_mlp": 0.01028638, "balance_loss_clip": 1.01629376, "balance_loss_mlp": 1.03476906, "epoch": 0.9323914023748685, "flos": 20777914523520.0, "grad_norm": 1.7469027443306686, "language_loss": 0.67662507, "learning_rate": 4.771639036957742e-08, "loss": 0.69796348, "num_input_tokens_seen": 334628655, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 15508, "time_per_iteration": 2.4647040367126465 }, { "auxiliary_loss_clip": 0.01103642, "auxiliary_loss_mlp": 0.01030304, "balance_loss_clip": 1.01837754, "balance_loss_mlp": 1.03593636, "epoch": 0.9324515256275364, "flos": 23915178259200.0, "grad_norm": 1.780600280214316, "language_loss": 0.72243559, "learning_rate": 4.7631862010129033e-08, "loss": 0.74377507, "num_input_tokens_seen": 334648295, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 15509, "time_per_iteration": 2.471862554550171 }, { "auxiliary_loss_clip": 0.01104959, "auxiliary_loss_mlp": 0.01031167, "balance_loss_clip": 1.01885247, "balance_loss_mlp": 1.03615177, "epoch": 0.9325116488802044, "flos": 18005215875840.0, "grad_norm": 6.5668242892147575, "language_loss": 0.74477386, "learning_rate": 4.754740768467624e-08, "loss": 0.7661351, "num_input_tokens_seen": 334666280, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 15510, "time_per_iteration": 2.4515299797058105 }, { "auxiliary_loss_clip": 0.01104951, "auxiliary_loss_mlp": 0.01025786, "balance_loss_clip": 1.0141269, "balance_loss_mlp": 1.0338186, "epoch": 0.9325717721328723, "flos": 29021603443200.0, "grad_norm": 1.8238916504261806, "language_loss": 0.70428288, "learning_rate": 4.746302739642161e-08, "loss": 0.72559029, "num_input_tokens_seen": 334688830, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.7109375, "step": 15511, "time_per_iteration": 2.508478879928589 }, { "auxiliary_loss_clip": 0.01104763, "auxiliary_loss_mlp": 0.01035201, "balance_loss_clip": 1.0231545, "balance_loss_mlp": 1.03531599, "epoch": 0.9326318953855404, "flos": 21646341642240.0, "grad_norm": 3.6654736900945495, "language_loss": 0.78104085, "learning_rate": 4.737872114856412e-08, "loss": 0.80244052, "num_input_tokens_seen": 334705205, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6953125, "step": 15512, "time_per_iteration": 2.4695565700531006 }, { "auxiliary_loss_clip": 0.01102654, "auxiliary_loss_mlp": 0.01028515, "balance_loss_clip": 1.01586711, "balance_loss_mlp": 1.03428018, "epoch": 0.9326920186382083, "flos": 26065724411520.0, "grad_norm": 1.5146173937784468, "language_loss": 0.80945659, "learning_rate": 4.7294488944301436e-08, "loss": 0.83076829, "num_input_tokens_seen": 334723830, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 15513, "time_per_iteration": 2.4982447624206543 }, { "auxiliary_loss_clip": 0.01109669, "auxiliary_loss_mlp": 0.01031378, "balance_loss_clip": 1.01763296, "balance_loss_mlp": 1.03631854, "epoch": 0.9327521418908763, "flos": 12057116227200.0, "grad_norm": 3.2026061375389654, "language_loss": 0.79888284, "learning_rate": 4.721033078682768e-08, "loss": 0.82029337, "num_input_tokens_seen": 334740825, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.734375, "step": 15514, "time_per_iteration": 2.433154821395874 }, { "auxiliary_loss_clip": 0.01101743, "auxiliary_loss_mlp": 0.01036773, "balance_loss_clip": 1.02503097, "balance_loss_mlp": 1.03525269, "epoch": 0.9328122651435443, "flos": 43834395271680.0, "grad_norm": 1.8833973309792988, "language_loss": 0.7145775, "learning_rate": 4.7126246679333626e-08, "loss": 0.73596269, "num_input_tokens_seen": 334765825, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6640625, "step": 15515, "time_per_iteration": 2.643052577972412 }, { "auxiliary_loss_clip": 0.01108716, "auxiliary_loss_mlp": 0.01032068, "balance_loss_clip": 1.0187223, "balance_loss_mlp": 1.03683972, "epoch": 0.9328723883962122, "flos": 15194954580480.0, "grad_norm": 4.191774327691757, "language_loss": 0.80670851, "learning_rate": 4.704223662500806e-08, "loss": 0.82811636, "num_input_tokens_seen": 334782680, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 15516, "time_per_iteration": 2.446073293685913 }, { "auxiliary_loss_clip": 0.01105727, "auxiliary_loss_mlp": 0.01030558, "balance_loss_clip": 1.01852942, "balance_loss_mlp": 1.03485036, "epoch": 0.9329325116488802, "flos": 20261770041600.0, "grad_norm": 1.9594890945769343, "language_loss": 0.81072021, "learning_rate": 4.695830062703643e-08, "loss": 0.83208305, "num_input_tokens_seen": 334800160, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 15517, "time_per_iteration": 2.431302070617676 }, { "auxiliary_loss_clip": 0.01105903, "auxiliary_loss_mlp": 0.01031625, "balance_loss_clip": 1.01855993, "balance_loss_mlp": 1.03541946, "epoch": 0.9329926349015482, "flos": 13115008609920.0, "grad_norm": 2.186216685346585, "language_loss": 0.74794292, "learning_rate": 4.687443868860219e-08, "loss": 0.76931822, "num_input_tokens_seen": 334815840, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 15518, "time_per_iteration": 2.4781477451324463 }, { "auxiliary_loss_clip": 0.01103661, "auxiliary_loss_mlp": 0.01034224, "balance_loss_clip": 1.02180862, "balance_loss_mlp": 1.03486776, "epoch": 0.9330527581542162, "flos": 23040250778880.0, "grad_norm": 2.0004104007415506, "language_loss": 0.75700581, "learning_rate": 4.679065081288458e-08, "loss": 0.77838469, "num_input_tokens_seen": 334834735, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 15519, "time_per_iteration": 2.4674198627471924 }, { "auxiliary_loss_clip": 0.01102544, "auxiliary_loss_mlp": 0.01034721, "balance_loss_clip": 1.02157784, "balance_loss_mlp": 1.03407562, "epoch": 0.9331128814068841, "flos": 15559627409280.0, "grad_norm": 2.1315483639485966, "language_loss": 0.83044946, "learning_rate": 4.6706937003061275e-08, "loss": 0.85182202, "num_input_tokens_seen": 334853490, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.68359375, "step": 15520, "time_per_iteration": 2.4483301639556885 }, { "auxiliary_loss_clip": 0.01101625, "auxiliary_loss_mlp": 0.01027517, "balance_loss_clip": 1.01566815, "balance_loss_mlp": 1.03415012, "epoch": 0.9331730046595521, "flos": 22271762275200.0, "grad_norm": 4.948546478052381, "language_loss": 0.76240164, "learning_rate": 4.6623297262306846e-08, "loss": 0.78369308, "num_input_tokens_seen": 334873675, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 15521, "time_per_iteration": 2.4559168815612793 }, { "auxiliary_loss_clip": 0.01105399, "auxiliary_loss_mlp": 0.01031054, "balance_loss_clip": 1.01938307, "balance_loss_mlp": 1.03748846, "epoch": 0.93323312791222, "flos": 15777641007360.0, "grad_norm": 1.9776966440139159, "language_loss": 0.77541041, "learning_rate": 4.6539731593792545e-08, "loss": 0.79677492, "num_input_tokens_seen": 334890970, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 15522, "time_per_iteration": 2.4596142768859863 }, { "auxiliary_loss_clip": 0.01104666, "auxiliary_loss_mlp": 0.01029491, "balance_loss_clip": 1.0170691, "balance_loss_mlp": 1.03549016, "epoch": 0.933293251164888, "flos": 22010978557440.0, "grad_norm": 2.723950828702512, "language_loss": 0.62749982, "learning_rate": 4.6456240000687373e-08, "loss": 0.64884144, "num_input_tokens_seen": 334906635, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 15523, "time_per_iteration": 2.424394369125366 }, { "auxiliary_loss_clip": 0.01103283, "auxiliary_loss_mlp": 0.01030005, "balance_loss_clip": 1.01780343, "balance_loss_mlp": 1.03580141, "epoch": 0.933353374417556, "flos": 26031358074240.0, "grad_norm": 1.5707832085285232, "language_loss": 0.67994159, "learning_rate": 4.63728224861577e-08, "loss": 0.70127451, "num_input_tokens_seen": 334926230, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 15524, "time_per_iteration": 2.5081357955932617 }, { "auxiliary_loss_clip": 0.01105057, "auxiliary_loss_mlp": 0.01032292, "balance_loss_clip": 1.02000797, "balance_loss_mlp": 1.03524446, "epoch": 0.933413497670224, "flos": 24900100162560.0, "grad_norm": 1.7903860884068648, "language_loss": 0.74069542, "learning_rate": 4.628947905336589e-08, "loss": 0.76206887, "num_input_tokens_seen": 334946680, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 15525, "time_per_iteration": 2.4813578128814697 }, { "auxiliary_loss_clip": 0.011015, "auxiliary_loss_mlp": 0.01034294, "balance_loss_clip": 1.02249241, "balance_loss_mlp": 1.034091, "epoch": 0.9334736209228919, "flos": 23688689051520.0, "grad_norm": 6.153479277715136, "language_loss": 0.84219766, "learning_rate": 4.6206209705473175e-08, "loss": 0.86355561, "num_input_tokens_seen": 334964785, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 15526, "time_per_iteration": 2.4875402450561523 }, { "auxiliary_loss_clip": 0.01105842, "auxiliary_loss_mlp": 0.0102839, "balance_loss_clip": 1.01603949, "balance_loss_mlp": 1.03606367, "epoch": 0.9335337441755599, "flos": 15377344865280.0, "grad_norm": 2.033555888603917, "language_loss": 0.69541931, "learning_rate": 4.61230144456366e-08, "loss": 0.71676165, "num_input_tokens_seen": 334982400, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 15527, "time_per_iteration": 2.4089019298553467 }, { "auxiliary_loss_clip": 0.01106667, "auxiliary_loss_mlp": 0.01029078, "balance_loss_clip": 1.0153091, "balance_loss_mlp": 1.03570926, "epoch": 0.9335938674282279, "flos": 16106726436480.0, "grad_norm": 2.351957144081449, "language_loss": 0.65100843, "learning_rate": 4.603989327701141e-08, "loss": 0.6723659, "num_input_tokens_seen": 334999685, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7109375, "step": 15528, "time_per_iteration": 2.434688091278076 }, { "auxiliary_loss_clip": 0.01105518, "auxiliary_loss_mlp": 0.0103011, "balance_loss_clip": 1.01732516, "balance_loss_mlp": 1.0345943, "epoch": 0.9336539906808958, "flos": 18952898353920.0, "grad_norm": 2.2958687959006507, "language_loss": 0.74754685, "learning_rate": 4.5956846202748867e-08, "loss": 0.76890314, "num_input_tokens_seen": 335019160, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 15529, "time_per_iteration": 2.4290997982025146 }, { "auxiliary_loss_clip": 0.01101571, "auxiliary_loss_mlp": 0.0103036, "balance_loss_clip": 1.01892805, "balance_loss_mlp": 1.03348279, "epoch": 0.9337141139335638, "flos": 18109104986880.0, "grad_norm": 1.8019219362011547, "language_loss": 0.63102865, "learning_rate": 4.5873873225998674e-08, "loss": 0.65234792, "num_input_tokens_seen": 335037350, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 15530, "time_per_iteration": 2.449726104736328 }, { "auxiliary_loss_clip": 0.01101806, "auxiliary_loss_mlp": 0.01028228, "balance_loss_clip": 1.01649141, "balance_loss_mlp": 1.03492153, "epoch": 0.9337742371862318, "flos": 17345716214400.0, "grad_norm": 1.7359776355149723, "language_loss": 0.72381681, "learning_rate": 4.5790974349907194e-08, "loss": 0.74511713, "num_input_tokens_seen": 335056060, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.66796875, "step": 15531, "time_per_iteration": 2.4446938037872314 }, { "auxiliary_loss_clip": 0.01102564, "auxiliary_loss_mlp": 0.01033336, "balance_loss_clip": 1.02066398, "balance_loss_mlp": 1.03493047, "epoch": 0.9338343604388998, "flos": 29058986522880.0, "grad_norm": 1.948234770787497, "language_loss": 0.71134096, "learning_rate": 4.5708149577617925e-08, "loss": 0.73269999, "num_input_tokens_seen": 335075410, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.67578125, "step": 15532, "time_per_iteration": 2.516399383544922 }, { "auxiliary_loss_clip": 0.01104568, "auxiliary_loss_mlp": 0.01031657, "balance_loss_clip": 1.01929498, "balance_loss_mlp": 1.03485596, "epoch": 0.9338944836915677, "flos": 18660908695680.0, "grad_norm": 1.7256130531521154, "language_loss": 0.73432428, "learning_rate": 4.5625398912271016e-08, "loss": 0.75568646, "num_input_tokens_seen": 335095190, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 15533, "time_per_iteration": 2.45418643951416 }, { "auxiliary_loss_clip": 0.01100546, "auxiliary_loss_mlp": 0.01028631, "balance_loss_clip": 1.01707947, "balance_loss_mlp": 1.03374004, "epoch": 0.9339546069442357, "flos": 16617735273600.0, "grad_norm": 1.7111206508000256, "language_loss": 0.79988205, "learning_rate": 4.554272235700507e-08, "loss": 0.82117379, "num_input_tokens_seen": 335113825, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.66796875, "step": 15534, "time_per_iteration": 2.433788299560547 }, { "auxiliary_loss_clip": 0.01097842, "auxiliary_loss_mlp": 0.01027826, "balance_loss_clip": 1.01695442, "balance_loss_mlp": 1.03483558, "epoch": 0.9340147301969036, "flos": 23693106424320.0, "grad_norm": 1.8013367820677557, "language_loss": 0.74312377, "learning_rate": 4.546011991495513e-08, "loss": 0.76438051, "num_input_tokens_seen": 335136425, "router_z_loss_clip": 0.10888672, "router_z_loss_mlp": 0.62890625, "step": 15535, "time_per_iteration": 2.5007922649383545 }, { "auxiliary_loss_clip": 0.01105659, "auxiliary_loss_mlp": 0.01029141, "balance_loss_clip": 1.01651645, "balance_loss_mlp": 1.03596604, "epoch": 0.9340748534495716, "flos": 28654452576000.0, "grad_norm": 3.4257509236165333, "language_loss": 0.77822804, "learning_rate": 4.537759158925292e-08, "loss": 0.7995761, "num_input_tokens_seen": 335157925, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 15536, "time_per_iteration": 2.5212526321411133 }, { "auxiliary_loss_clip": 0.01101836, "auxiliary_loss_mlp": 0.01025399, "balance_loss_clip": 1.01321018, "balance_loss_mlp": 1.034145, "epoch": 0.9341349767022396, "flos": 24899633285760.0, "grad_norm": 1.7866018835488437, "language_loss": 0.80522329, "learning_rate": 4.5295137383028593e-08, "loss": 0.82649565, "num_input_tokens_seen": 335177840, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 15537, "time_per_iteration": 3.7413628101348877 }, { "auxiliary_loss_clip": 0.01105753, "auxiliary_loss_mlp": 0.01031451, "balance_loss_clip": 1.01942873, "balance_loss_mlp": 1.03518963, "epoch": 0.9341950999549076, "flos": 29059525226880.0, "grad_norm": 1.8048119109008152, "language_loss": 0.77752209, "learning_rate": 4.5212757299408764e-08, "loss": 0.79889417, "num_input_tokens_seen": 335199470, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.70703125, "step": 15538, "time_per_iteration": 2.5495645999908447 }, { "auxiliary_loss_clip": 0.01102587, "auxiliary_loss_mlp": 0.01028388, "balance_loss_clip": 1.01613927, "balance_loss_mlp": 1.03430009, "epoch": 0.9342552232075755, "flos": 23587062497280.0, "grad_norm": 1.5811569478436345, "language_loss": 0.73403192, "learning_rate": 4.513045134151672e-08, "loss": 0.75534165, "num_input_tokens_seen": 335218885, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.68359375, "step": 15539, "time_per_iteration": 2.475064754486084 }, { "auxiliary_loss_clip": 0.01103214, "auxiliary_loss_mlp": 0.01028701, "balance_loss_clip": 1.01734018, "balance_loss_mlp": 1.03484988, "epoch": 0.9343153464602435, "flos": 36721389646080.0, "grad_norm": 1.677443572739889, "language_loss": 0.64969707, "learning_rate": 4.504821951247373e-08, "loss": 0.67101622, "num_input_tokens_seen": 335239485, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.68359375, "step": 15540, "time_per_iteration": 2.5879900455474854 }, { "auxiliary_loss_clip": 0.01101652, "auxiliary_loss_mlp": 0.01033932, "balance_loss_clip": 1.02196896, "balance_loss_mlp": 1.03320122, "epoch": 0.9343754697129115, "flos": 22236498097920.0, "grad_norm": 2.304304989641614, "language_loss": 0.76595664, "learning_rate": 4.496606181539864e-08, "loss": 0.78731239, "num_input_tokens_seen": 335258355, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.68359375, "step": 15541, "time_per_iteration": 3.8652503490448 }, { "auxiliary_loss_clip": 0.01106278, "auxiliary_loss_mlp": 0.01035783, "balance_loss_clip": 1.02365947, "balance_loss_mlp": 1.03752685, "epoch": 0.9344355929655794, "flos": 29710333797120.0, "grad_norm": 2.000791371476034, "language_loss": 0.66802323, "learning_rate": 4.4883978253406066e-08, "loss": 0.68944383, "num_input_tokens_seen": 335276835, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 15542, "time_per_iteration": 2.5125718116760254 }, { "auxiliary_loss_clip": 0.01103439, "auxiliary_loss_mlp": 0.01028695, "balance_loss_clip": 1.01635647, "balance_loss_mlp": 1.03430927, "epoch": 0.9344957162182475, "flos": 18880394751360.0, "grad_norm": 2.174839441217889, "language_loss": 0.69783568, "learning_rate": 4.480196882960907e-08, "loss": 0.7191571, "num_input_tokens_seen": 335296220, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69140625, "step": 15543, "time_per_iteration": 2.4376473426818848 }, { "auxiliary_loss_clip": 0.01106616, "auxiliary_loss_mlp": 0.01031877, "balance_loss_clip": 1.01891267, "balance_loss_mlp": 1.03482497, "epoch": 0.9345558394709154, "flos": 27417761268480.0, "grad_norm": 1.915507018261189, "language_loss": 0.69619346, "learning_rate": 4.4720033547117394e-08, "loss": 0.71757829, "num_input_tokens_seen": 335316335, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71875, "step": 15544, "time_per_iteration": 2.487269163131714 }, { "auxiliary_loss_clip": 0.01104164, "auxiliary_loss_mlp": 0.0103778, "balance_loss_clip": 1.02584112, "balance_loss_mlp": 1.03463972, "epoch": 0.9346159627235834, "flos": 20741285629440.0, "grad_norm": 1.6200797663235227, "language_loss": 0.77198756, "learning_rate": 4.463817240903789e-08, "loss": 0.79340696, "num_input_tokens_seen": 335335545, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 15545, "time_per_iteration": 5.287272691726685 }, { "auxiliary_loss_clip": 0.01105756, "auxiliary_loss_mlp": 0.01028121, "balance_loss_clip": 1.01664746, "balance_loss_mlp": 1.03538895, "epoch": 0.9346760859762513, "flos": 21069221823360.0, "grad_norm": 1.7010450630193517, "language_loss": 0.68760812, "learning_rate": 4.455638541847495e-08, "loss": 0.70894682, "num_input_tokens_seen": 335355350, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.703125, "step": 15546, "time_per_iteration": 2.480189085006714 }, { "auxiliary_loss_clip": 0.01099429, "auxiliary_loss_mlp": 0.01027812, "balance_loss_clip": 1.01666641, "balance_loss_mlp": 1.034302, "epoch": 0.9347362092289193, "flos": 29204927481600.0, "grad_norm": 1.8883579353298996, "language_loss": 0.82075197, "learning_rate": 4.447467257852966e-08, "loss": 0.84202445, "num_input_tokens_seen": 335375160, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6484375, "step": 15547, "time_per_iteration": 2.5023868083953857 }, { "auxiliary_loss_clip": 0.01100116, "auxiliary_loss_mlp": 0.01030869, "balance_loss_clip": 1.01980019, "balance_loss_mlp": 1.03293836, "epoch": 0.9347963324815872, "flos": 19427350124160.0, "grad_norm": 2.109248450733763, "language_loss": 0.8383233, "learning_rate": 4.439303389230087e-08, "loss": 0.85963315, "num_input_tokens_seen": 335394080, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.671875, "step": 15548, "time_per_iteration": 2.4391472339630127 }, { "auxiliary_loss_clip": 0.01107494, "auxiliary_loss_mlp": 0.01033127, "balance_loss_clip": 1.01950717, "balance_loss_mlp": 1.03543377, "epoch": 0.9348564557342552, "flos": 36901840596480.0, "grad_norm": 1.631058121489596, "language_loss": 0.65151179, "learning_rate": 4.4311469362884326e-08, "loss": 0.67291796, "num_input_tokens_seen": 335414230, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 15549, "time_per_iteration": 2.61844539642334 }, { "auxiliary_loss_clip": 0.0110567, "auxiliary_loss_mlp": 0.0103481, "balance_loss_clip": 1.02198291, "balance_loss_mlp": 1.035887, "epoch": 0.9349165789869232, "flos": 21690117342720.0, "grad_norm": 1.7878902864357364, "language_loss": 0.80117154, "learning_rate": 4.4229978993372665e-08, "loss": 0.82257628, "num_input_tokens_seen": 335432890, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 15550, "time_per_iteration": 2.4816782474517822 }, { "auxiliary_loss_clip": 0.01105646, "auxiliary_loss_mlp": 0.01029144, "balance_loss_clip": 1.01771784, "balance_loss_mlp": 1.03762031, "epoch": 0.9349767022395912, "flos": 18844053166080.0, "grad_norm": 2.3877787980413836, "language_loss": 0.75743461, "learning_rate": 4.4148562786856524e-08, "loss": 0.77878249, "num_input_tokens_seen": 335452085, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 15551, "time_per_iteration": 2.4410624504089355 }, { "auxiliary_loss_clip": 0.01099718, "auxiliary_loss_mlp": 0.01029462, "balance_loss_clip": 1.01899517, "balance_loss_mlp": 1.03421211, "epoch": 0.9350368254922591, "flos": 24973429777920.0, "grad_norm": 1.7697535470870644, "language_loss": 0.73844773, "learning_rate": 4.406722074642255e-08, "loss": 0.75973952, "num_input_tokens_seen": 335472130, "router_z_loss_clip": 0.10449219, "router_z_loss_mlp": 0.65625, "step": 15552, "time_per_iteration": 2.5023581981658936 }, { "auxiliary_loss_clip": 0.01103108, "auxiliary_loss_mlp": 0.01035067, "balance_loss_clip": 1.02304459, "balance_loss_mlp": 1.03500164, "epoch": 0.9350969487449271, "flos": 23070594792960.0, "grad_norm": 1.8077385239741337, "language_loss": 0.7728796, "learning_rate": 4.3985952875155386e-08, "loss": 0.79426134, "num_input_tokens_seen": 335489970, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 15553, "time_per_iteration": 2.449963331222534 }, { "auxiliary_loss_clip": 0.01105501, "auxiliary_loss_mlp": 0.0103888, "balance_loss_clip": 1.02593982, "balance_loss_mlp": 1.0350008, "epoch": 0.9351570719975951, "flos": 18625177641600.0, "grad_norm": 1.6608771091580337, "language_loss": 0.78279269, "learning_rate": 4.390475917613723e-08, "loss": 0.80423653, "num_input_tokens_seen": 335509125, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 15554, "time_per_iteration": 2.4419074058532715 }, { "auxiliary_loss_clip": 0.01099474, "auxiliary_loss_mlp": 0.01027224, "balance_loss_clip": 1.01663208, "balance_loss_mlp": 1.03417301, "epoch": 0.935217195250263, "flos": 15888353702400.0, "grad_norm": 1.584647048482468, "language_loss": 0.69149572, "learning_rate": 4.382363965244695e-08, "loss": 0.71276271, "num_input_tokens_seen": 335525620, "router_z_loss_clip": 0.10595703, "router_z_loss_mlp": 0.65234375, "step": 15555, "time_per_iteration": 2.4127755165100098 }, { "auxiliary_loss_clip": 0.01102892, "auxiliary_loss_mlp": 0.01038735, "balance_loss_clip": 1.02654028, "balance_loss_mlp": 1.03434014, "epoch": 0.935277318502931, "flos": 24390312387840.0, "grad_norm": 1.6344587534302992, "language_loss": 0.75617415, "learning_rate": 4.374259430715965e-08, "loss": 0.77759045, "num_input_tokens_seen": 335547565, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 15556, "time_per_iteration": 2.5113108158111572 }, { "auxiliary_loss_clip": 0.0110251, "auxiliary_loss_mlp": 0.01033625, "balance_loss_clip": 1.02216268, "balance_loss_mlp": 1.03378463, "epoch": 0.935337441755599, "flos": 27600259294080.0, "grad_norm": 1.7414089644676762, "language_loss": 0.72119319, "learning_rate": 4.366162314334953e-08, "loss": 0.74255449, "num_input_tokens_seen": 335570285, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6875, "step": 15557, "time_per_iteration": 2.5601351261138916 }, { "auxiliary_loss_clip": 0.01104041, "auxiliary_loss_mlp": 0.01029207, "balance_loss_clip": 1.0167737, "balance_loss_mlp": 1.03517699, "epoch": 0.935397565008267, "flos": 20482872209280.0, "grad_norm": 1.6084874901875126, "language_loss": 0.6318925, "learning_rate": 4.358072616408681e-08, "loss": 0.653225, "num_input_tokens_seen": 335588600, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 15558, "time_per_iteration": 2.519968032836914 }, { "auxiliary_loss_clip": 0.01102879, "auxiliary_loss_mlp": 0.01030394, "balance_loss_clip": 1.01718521, "balance_loss_mlp": 1.03414524, "epoch": 0.9354576882609349, "flos": 23654394541440.0, "grad_norm": 2.3218464600697084, "language_loss": 0.73276985, "learning_rate": 4.34999033724388e-08, "loss": 0.75410259, "num_input_tokens_seen": 335606235, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6875, "step": 15559, "time_per_iteration": 2.4664828777313232 }, { "auxiliary_loss_clip": 0.01100728, "auxiliary_loss_mlp": 0.01026286, "balance_loss_clip": 1.01587915, "balance_loss_mlp": 1.03487694, "epoch": 0.9355178115136029, "flos": 36684904406400.0, "grad_norm": 2.1404262846154047, "language_loss": 0.63618904, "learning_rate": 4.341915477147062e-08, "loss": 0.6574592, "num_input_tokens_seen": 335628240, "router_z_loss_clip": 0.10400391, "router_z_loss_mlp": 0.65625, "step": 15560, "time_per_iteration": 2.606765031814575 }, { "auxiliary_loss_clip": 0.01111386, "auxiliary_loss_mlp": 0.01035552, "balance_loss_clip": 1.02088952, "balance_loss_mlp": 1.03598249, "epoch": 0.9355779347662708, "flos": 14460401450880.0, "grad_norm": 2.445289017924144, "language_loss": 0.64334571, "learning_rate": 4.3338480364244034e-08, "loss": 0.66481507, "num_input_tokens_seen": 335643755, "router_z_loss_clip": 0.14648438, "router_z_loss_mlp": 0.75390625, "step": 15561, "time_per_iteration": 2.3964731693267822 }, { "auxiliary_loss_clip": 0.01104817, "auxiliary_loss_mlp": 0.0103512, "balance_loss_clip": 1.02251947, "balance_loss_mlp": 1.03687119, "epoch": 0.9356380580189388, "flos": 23185976256000.0, "grad_norm": 1.9932702241003373, "language_loss": 0.75760651, "learning_rate": 4.325788015381859e-08, "loss": 0.77900589, "num_input_tokens_seen": 335665160, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 15562, "time_per_iteration": 2.4786622524261475 }, { "auxiliary_loss_clip": 0.01028086, "auxiliary_loss_mlp": 0.01001552, "balance_loss_clip": 1.00046718, "balance_loss_mlp": 1.00586915, "epoch": 0.9356981812716068, "flos": 67471626090240.0, "grad_norm": 1.4536982355476542, "language_loss": 0.62328744, "learning_rate": 4.31773541432503e-08, "loss": 0.64358377, "num_input_tokens_seen": 335715240, "router_z_loss_clip": 0.01086426, "router_z_loss_mlp": 0.22265625, "step": 15563, "time_per_iteration": 2.9448463916778564 }, { "auxiliary_loss_clip": 0.01100662, "auxiliary_loss_mlp": 0.0103364, "balance_loss_clip": 1.02159977, "balance_loss_mlp": 1.03426993, "epoch": 0.9357583045242748, "flos": 24681619687680.0, "grad_norm": 2.0395843024866043, "language_loss": 0.78310508, "learning_rate": 4.3096902335592714e-08, "loss": 0.80444813, "num_input_tokens_seen": 335734970, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6640625, "step": 15564, "time_per_iteration": 2.5328476428985596 }, { "auxiliary_loss_clip": 0.01104384, "auxiliary_loss_mlp": 0.0103056, "balance_loss_clip": 1.01731622, "balance_loss_mlp": 1.03364766, "epoch": 0.9358184277769427, "flos": 19463727623040.0, "grad_norm": 2.4135735056910166, "language_loss": 0.78010553, "learning_rate": 4.301652473389694e-08, "loss": 0.80145496, "num_input_tokens_seen": 335753435, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 15565, "time_per_iteration": 2.472752332687378 }, { "auxiliary_loss_clip": 0.01101517, "auxiliary_loss_mlp": 0.01028819, "balance_loss_clip": 1.01713026, "balance_loss_mlp": 1.03469992, "epoch": 0.9358785510296107, "flos": 18916987731840.0, "grad_norm": 2.1830546785960223, "language_loss": 0.71878546, "learning_rate": 4.2936221341210774e-08, "loss": 0.74008882, "num_input_tokens_seen": 335772105, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.66796875, "step": 15566, "time_per_iteration": 2.4672582149505615 }, { "auxiliary_loss_clip": 0.01103194, "auxiliary_loss_mlp": 0.01025707, "balance_loss_clip": 1.01372075, "balance_loss_mlp": 1.0335629, "epoch": 0.9359386742822787, "flos": 23441265192960.0, "grad_norm": 4.311306438137456, "language_loss": 0.6758604, "learning_rate": 4.285599216057889e-08, "loss": 0.69714946, "num_input_tokens_seen": 335789125, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 15567, "time_per_iteration": 2.4614830017089844 }, { "auxiliary_loss_clip": 0.01104766, "auxiliary_loss_mlp": 0.01033738, "balance_loss_clip": 1.02144742, "balance_loss_mlp": 1.03637552, "epoch": 0.9359987975349466, "flos": 32744067557760.0, "grad_norm": 1.8851754699587582, "language_loss": 0.62082028, "learning_rate": 4.277583719504418e-08, "loss": 0.64220536, "num_input_tokens_seen": 335810995, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 15568, "time_per_iteration": 2.551767349243164 }, { "auxiliary_loss_clip": 0.01101746, "auxiliary_loss_mlp": 0.01032523, "balance_loss_clip": 1.01990485, "balance_loss_mlp": 1.03369308, "epoch": 0.9360589207876147, "flos": 22819651401600.0, "grad_norm": 1.5862876897043403, "language_loss": 0.78591138, "learning_rate": 4.269575644764556e-08, "loss": 0.80725408, "num_input_tokens_seen": 335830580, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 15569, "time_per_iteration": 2.4702963829040527 }, { "auxiliary_loss_clip": 0.01107114, "auxiliary_loss_mlp": 0.01032222, "balance_loss_clip": 1.01958632, "balance_loss_mlp": 1.03690743, "epoch": 0.9361190440402826, "flos": 20885251340160.0, "grad_norm": 2.94675125892893, "language_loss": 0.69253081, "learning_rate": 4.261574992142014e-08, "loss": 0.71392417, "num_input_tokens_seen": 335846515, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 15570, "time_per_iteration": 2.4677109718322754 }, { "auxiliary_loss_clip": 0.01103936, "auxiliary_loss_mlp": 0.01030491, "balance_loss_clip": 1.01792002, "balance_loss_mlp": 1.03439486, "epoch": 0.9361791672929506, "flos": 19317822577920.0, "grad_norm": 3.037925534271446, "language_loss": 0.79522741, "learning_rate": 4.2535817619401726e-08, "loss": 0.81657165, "num_input_tokens_seen": 335863350, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 15571, "time_per_iteration": 2.4505369663238525 }, { "auxiliary_loss_clip": 0.01103339, "auxiliary_loss_mlp": 0.01029598, "balance_loss_clip": 1.0170275, "balance_loss_mlp": 1.03403628, "epoch": 0.9362392905456185, "flos": 15158182032000.0, "grad_norm": 1.996141598730348, "language_loss": 0.77497864, "learning_rate": 4.2455959544621224e-08, "loss": 0.79630804, "num_input_tokens_seen": 335880510, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 15572, "time_per_iteration": 2.474592447280884 }, { "auxiliary_loss_clip": 0.01101498, "auxiliary_loss_mlp": 0.01036411, "balance_loss_clip": 1.0245316, "balance_loss_mlp": 1.03407085, "epoch": 0.9362994137982865, "flos": 22085888371200.0, "grad_norm": 4.027791618263026, "language_loss": 0.77793193, "learning_rate": 4.237617570010688e-08, "loss": 0.79931104, "num_input_tokens_seen": 335899440, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 15573, "time_per_iteration": 2.4761219024658203 }, { "auxiliary_loss_clip": 0.01101588, "auxiliary_loss_mlp": 0.01025781, "balance_loss_clip": 1.01443815, "balance_loss_mlp": 1.03542161, "epoch": 0.9363595370509544, "flos": 23512260424320.0, "grad_norm": 1.7273859795071969, "language_loss": 0.7471981, "learning_rate": 4.2296466088884044e-08, "loss": 0.76847172, "num_input_tokens_seen": 335919540, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66015625, "step": 15574, "time_per_iteration": 2.495650291442871 }, { "auxiliary_loss_clip": 0.01100827, "auxiliary_loss_mlp": 0.01032118, "balance_loss_clip": 1.01972651, "balance_loss_mlp": 1.03445113, "epoch": 0.9364196603036224, "flos": 27123473139840.0, "grad_norm": 3.08148181949307, "language_loss": 0.6832245, "learning_rate": 4.221683071397564e-08, "loss": 0.70455396, "num_input_tokens_seen": 335939665, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6640625, "step": 15575, "time_per_iteration": 2.507343292236328 }, { "auxiliary_loss_clip": 0.01100013, "auxiliary_loss_mlp": 0.01033949, "balance_loss_clip": 1.02183723, "balance_loss_mlp": 1.03408742, "epoch": 0.9364797835562904, "flos": 18479057114880.0, "grad_norm": 1.595303541281182, "language_loss": 0.65363908, "learning_rate": 4.2137269578401026e-08, "loss": 0.67497873, "num_input_tokens_seen": 335958580, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.66015625, "step": 15576, "time_per_iteration": 2.47202467918396 }, { "auxiliary_loss_clip": 0.01102895, "auxiliary_loss_mlp": 0.01028763, "balance_loss_clip": 1.01575112, "balance_loss_mlp": 1.03275967, "epoch": 0.9365399068089584, "flos": 13005552890880.0, "grad_norm": 2.978873971062542, "language_loss": 0.75921279, "learning_rate": 4.2057782685177566e-08, "loss": 0.78052932, "num_input_tokens_seen": 335974965, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 15577, "time_per_iteration": 2.4228968620300293 }, { "auxiliary_loss_clip": 0.01103289, "auxiliary_loss_mlp": 0.01028834, "balance_loss_clip": 1.01652527, "balance_loss_mlp": 1.03429604, "epoch": 0.9366000300616263, "flos": 25666433850240.0, "grad_norm": 4.0145145183349555, "language_loss": 0.51800144, "learning_rate": 4.1978370037318855e-08, "loss": 0.53932273, "num_input_tokens_seen": 335996575, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 15578, "time_per_iteration": 2.5316975116729736 }, { "auxiliary_loss_clip": 0.01102899, "auxiliary_loss_mlp": 0.01030823, "balance_loss_clip": 1.01930141, "balance_loss_mlp": 1.0350852, "epoch": 0.9366601533142943, "flos": 21433355948160.0, "grad_norm": 15.777549258288309, "language_loss": 0.70678234, "learning_rate": 4.189903163783692e-08, "loss": 0.72811961, "num_input_tokens_seen": 336017265, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 15579, "time_per_iteration": 3.7747867107391357 }, { "auxiliary_loss_clip": 0.01101561, "auxiliary_loss_mlp": 0.01027752, "balance_loss_clip": 1.01631951, "balance_loss_mlp": 1.03456688, "epoch": 0.9367202765669622, "flos": 24093222998400.0, "grad_norm": 1.739002283244408, "language_loss": 0.76330888, "learning_rate": 4.181976748973959e-08, "loss": 0.78460205, "num_input_tokens_seen": 336035905, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66796875, "step": 15580, "time_per_iteration": 2.468911647796631 }, { "auxiliary_loss_clip": 0.01107333, "auxiliary_loss_mlp": 0.01029854, "balance_loss_clip": 1.01697898, "balance_loss_mlp": 1.03604329, "epoch": 0.9367803998196302, "flos": 20888842700160.0, "grad_norm": 1.7861097049700954, "language_loss": 0.66451973, "learning_rate": 4.1740577596033114e-08, "loss": 0.68589163, "num_input_tokens_seen": 336055585, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 15581, "time_per_iteration": 2.466780424118042 }, { "auxiliary_loss_clip": 0.01103977, "auxiliary_loss_mlp": 0.01028331, "balance_loss_clip": 1.01601696, "balance_loss_mlp": 1.03546524, "epoch": 0.9368405230722983, "flos": 22564362464640.0, "grad_norm": 3.6916838373527834, "language_loss": 0.76666111, "learning_rate": 4.166146195972042e-08, "loss": 0.78798419, "num_input_tokens_seen": 336076695, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 15582, "time_per_iteration": 2.467240571975708 }, { "auxiliary_loss_clip": 0.01101822, "auxiliary_loss_mlp": 0.01035246, "balance_loss_clip": 1.02261567, "balance_loss_mlp": 1.03470421, "epoch": 0.9369006463249662, "flos": 18880215183360.0, "grad_norm": 1.7166546249622467, "language_loss": 0.73696905, "learning_rate": 4.1582420583800905e-08, "loss": 0.75833976, "num_input_tokens_seen": 336094740, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.671875, "step": 15583, "time_per_iteration": 3.867596387863159 }, { "auxiliary_loss_clip": 0.01107984, "auxiliary_loss_mlp": 0.01030449, "balance_loss_clip": 1.0176692, "balance_loss_mlp": 1.03595138, "epoch": 0.9369607695776342, "flos": 26432516142720.0, "grad_norm": 2.2338913832354854, "language_loss": 0.84659135, "learning_rate": 4.1503453471272376e-08, "loss": 0.86797571, "num_input_tokens_seen": 336113985, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71875, "step": 15584, "time_per_iteration": 2.496464729309082 }, { "auxiliary_loss_clip": 0.01109294, "auxiliary_loss_mlp": 0.01037907, "balance_loss_clip": 1.0250864, "balance_loss_mlp": 1.03682518, "epoch": 0.9370208928303021, "flos": 39567346081920.0, "grad_norm": 1.5906190664118993, "language_loss": 0.72075349, "learning_rate": 4.1424560625129334e-08, "loss": 0.74222553, "num_input_tokens_seen": 336136395, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 15585, "time_per_iteration": 2.61712646484375 }, { "auxiliary_loss_clip": 0.01098694, "auxiliary_loss_mlp": 0.01023049, "balance_loss_clip": 1.01208186, "balance_loss_mlp": 1.03228831, "epoch": 0.9370810160829701, "flos": 22963114321920.0, "grad_norm": 2.267726413716501, "language_loss": 0.80562615, "learning_rate": 4.134574204836316e-08, "loss": 0.82684356, "num_input_tokens_seen": 336156345, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.6640625, "step": 15586, "time_per_iteration": 3.953355073928833 }, { "auxiliary_loss_clip": 0.01104103, "auxiliary_loss_mlp": 0.0103395, "balance_loss_clip": 1.02174306, "balance_loss_mlp": 1.03598797, "epoch": 0.937141139335638, "flos": 23075048079360.0, "grad_norm": 1.5702635261077484, "language_loss": 0.76718223, "learning_rate": 4.126699774396258e-08, "loss": 0.78856277, "num_input_tokens_seen": 336176760, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 15587, "time_per_iteration": 3.889418601989746 }, { "auxiliary_loss_clip": 0.01106733, "auxiliary_loss_mlp": 0.01032223, "balance_loss_clip": 1.01962209, "balance_loss_mlp": 1.03527832, "epoch": 0.937201262588306, "flos": 16356664247040.0, "grad_norm": 1.882139171414839, "language_loss": 0.87758231, "learning_rate": 4.118832771491387e-08, "loss": 0.89897186, "num_input_tokens_seen": 336193285, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 15588, "time_per_iteration": 2.447922945022583 }, { "auxiliary_loss_clip": 0.01100689, "auxiliary_loss_mlp": 0.01029343, "balance_loss_clip": 1.01779723, "balance_loss_mlp": 1.03467906, "epoch": 0.937261385840974, "flos": 20194078861440.0, "grad_norm": 2.0113950370514413, "language_loss": 0.77992058, "learning_rate": 4.11097319642002e-08, "loss": 0.80122089, "num_input_tokens_seen": 336211425, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66015625, "step": 15589, "time_per_iteration": 2.4524073600769043 }, { "auxiliary_loss_clip": 0.01100582, "auxiliary_loss_mlp": 0.01030361, "balance_loss_clip": 1.01829159, "balance_loss_mlp": 1.03421509, "epoch": 0.937321509093642, "flos": 18295948558080.0, "grad_norm": 1.8333624300359745, "language_loss": 0.77850896, "learning_rate": 4.103121049480163e-08, "loss": 0.7998184, "num_input_tokens_seen": 336230205, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6640625, "step": 15590, "time_per_iteration": 2.441908597946167 }, { "auxiliary_loss_clip": 0.01106012, "auxiliary_loss_mlp": 0.01036719, "balance_loss_clip": 1.02303386, "balance_loss_mlp": 1.03467107, "epoch": 0.9373816323463099, "flos": 25884662929920.0, "grad_norm": 1.78442060645672, "language_loss": 0.71381176, "learning_rate": 4.095276330969577e-08, "loss": 0.73523903, "num_input_tokens_seen": 336252440, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71484375, "step": 15591, "time_per_iteration": 2.4902894496917725 }, { "auxiliary_loss_clip": 0.01109482, "auxiliary_loss_mlp": 0.01034732, "balance_loss_clip": 1.02071285, "balance_loss_mlp": 1.03721559, "epoch": 0.9374417555989779, "flos": 27198849830400.0, "grad_norm": 2.632742865088014, "language_loss": 0.5405612, "learning_rate": 4.0874390411857804e-08, "loss": 0.56200325, "num_input_tokens_seen": 336273845, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.72265625, "step": 15592, "time_per_iteration": 2.5111045837402344 }, { "auxiliary_loss_clip": 0.01101957, "auxiliary_loss_mlp": 0.01027307, "balance_loss_clip": 1.01592803, "balance_loss_mlp": 1.03474092, "epoch": 0.9375018788516458, "flos": 23621249266560.0, "grad_norm": 2.165880583105916, "language_loss": 0.67466867, "learning_rate": 4.0796091804259136e-08, "loss": 0.69596136, "num_input_tokens_seen": 336292790, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.671875, "step": 15593, "time_per_iteration": 2.468874216079712 }, { "auxiliary_loss_clip": 0.01103795, "auxiliary_loss_mlp": 0.01028888, "balance_loss_clip": 1.01730037, "balance_loss_mlp": 1.03393459, "epoch": 0.9375620021043138, "flos": 22678774260480.0, "grad_norm": 1.6503678316400787, "language_loss": 0.74322343, "learning_rate": 4.0717867489868715e-08, "loss": 0.76455021, "num_input_tokens_seen": 336312600, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.69921875, "step": 15594, "time_per_iteration": 2.492309808731079 }, { "auxiliary_loss_clip": 0.01098327, "auxiliary_loss_mlp": 0.01026452, "balance_loss_clip": 1.0153358, "balance_loss_mlp": 1.03260875, "epoch": 0.9376221253569819, "flos": 27560254521600.0, "grad_norm": 1.8489278283927424, "language_loss": 0.73580581, "learning_rate": 4.063971747165351e-08, "loss": 0.75705361, "num_input_tokens_seen": 336332770, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.65625, "step": 15595, "time_per_iteration": 2.4836981296539307 }, { "auxiliary_loss_clip": 0.01104447, "auxiliary_loss_mlp": 0.01032064, "balance_loss_clip": 1.01998818, "balance_loss_mlp": 1.03488672, "epoch": 0.9376822486096498, "flos": 24129887806080.0, "grad_norm": 1.9629430901410703, "language_loss": 0.7591033, "learning_rate": 4.056164175257626e-08, "loss": 0.78046846, "num_input_tokens_seen": 336351445, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 15596, "time_per_iteration": 2.4845473766326904 }, { "auxiliary_loss_clip": 0.01103579, "auxiliary_loss_mlp": 0.01030385, "balance_loss_clip": 1.01816654, "balance_loss_mlp": 1.03453314, "epoch": 0.9377423718623178, "flos": 22784028088320.0, "grad_norm": 2.125641056519235, "language_loss": 0.79094982, "learning_rate": 4.0483640335597926e-08, "loss": 0.81228948, "num_input_tokens_seen": 336368690, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 15597, "time_per_iteration": 2.474822521209717 }, { "auxiliary_loss_clip": 0.01108117, "auxiliary_loss_mlp": 0.01032546, "balance_loss_clip": 1.0199759, "balance_loss_mlp": 1.03590393, "epoch": 0.9378024951149857, "flos": 19168900790400.0, "grad_norm": 1.6429854850549486, "language_loss": 0.81138051, "learning_rate": 4.0405713223676363e-08, "loss": 0.83278716, "num_input_tokens_seen": 336388165, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.72265625, "step": 15598, "time_per_iteration": 2.487743616104126 }, { "auxiliary_loss_clip": 0.01107902, "auxiliary_loss_mlp": 0.01030076, "balance_loss_clip": 1.01752305, "balance_loss_mlp": 1.0346899, "epoch": 0.9378626183676537, "flos": 23505508667520.0, "grad_norm": 2.126355210394823, "language_loss": 0.63366389, "learning_rate": 4.0327860419766994e-08, "loss": 0.65504366, "num_input_tokens_seen": 336406475, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.73046875, "step": 15599, "time_per_iteration": 2.4523026943206787 }, { "auxiliary_loss_clip": 0.0110444, "auxiliary_loss_mlp": 0.01031236, "balance_loss_clip": 1.01879692, "balance_loss_mlp": 1.03458595, "epoch": 0.9379227416203216, "flos": 18405655672320.0, "grad_norm": 1.9767754560118063, "language_loss": 0.73464704, "learning_rate": 4.0250081926821e-08, "loss": 0.75600374, "num_input_tokens_seen": 336424690, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69921875, "step": 15600, "time_per_iteration": 2.4850172996520996 }, { "auxiliary_loss_clip": 0.01101709, "auxiliary_loss_mlp": 0.01028331, "balance_loss_clip": 1.01720905, "balance_loss_mlp": 1.03443527, "epoch": 0.9379828648729897, "flos": 17821855923840.0, "grad_norm": 1.974748061463766, "language_loss": 0.69453359, "learning_rate": 4.0172377747788474e-08, "loss": 0.71583402, "num_input_tokens_seen": 336443055, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.671875, "step": 15601, "time_per_iteration": 2.429306745529175 }, { "auxiliary_loss_clip": 0.01027972, "auxiliary_loss_mlp": 0.01001209, "balance_loss_clip": 1.00023162, "balance_loss_mlp": 1.00561452, "epoch": 0.9380429881256576, "flos": 68024399466240.0, "grad_norm": 0.7703400711885192, "language_loss": 0.58114421, "learning_rate": 4.009474788561573e-08, "loss": 0.60143602, "num_input_tokens_seen": 336510190, "router_z_loss_clip": 0.00976562, "router_z_loss_mlp": 0.22363281, "step": 15602, "time_per_iteration": 3.2554931640625 }, { "auxiliary_loss_clip": 0.01103447, "auxiliary_loss_mlp": 0.01031818, "balance_loss_clip": 1.0203737, "balance_loss_mlp": 1.0335505, "epoch": 0.9381031113783256, "flos": 20776980769920.0, "grad_norm": 1.954769646978802, "language_loss": 0.72325122, "learning_rate": 4.001719234324663e-08, "loss": 0.74460387, "num_input_tokens_seen": 336529250, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.69921875, "step": 15603, "time_per_iteration": 2.4562058448791504 }, { "auxiliary_loss_clip": 0.01095729, "auxiliary_loss_mlp": 0.01027184, "balance_loss_clip": 1.01591897, "balance_loss_mlp": 1.03228533, "epoch": 0.9381632346309935, "flos": 19025078734080.0, "grad_norm": 4.1544498087608375, "language_loss": 0.75927925, "learning_rate": 3.993971112362171e-08, "loss": 0.7805084, "num_input_tokens_seen": 336548530, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6328125, "step": 15604, "time_per_iteration": 2.457491636276245 }, { "auxiliary_loss_clip": 0.01105705, "auxiliary_loss_mlp": 0.01032773, "balance_loss_clip": 1.01952934, "balance_loss_mlp": 1.03540885, "epoch": 0.9382233578836615, "flos": 23513840622720.0, "grad_norm": 3.275364142207476, "language_loss": 0.65574276, "learning_rate": 3.9862304229679734e-08, "loss": 0.67712748, "num_input_tokens_seen": 336568510, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 15605, "time_per_iteration": 2.468252658843994 }, { "auxiliary_loss_clip": 0.01106926, "auxiliary_loss_mlp": 0.0103114, "balance_loss_clip": 1.01829481, "balance_loss_mlp": 1.03459787, "epoch": 0.9382834811363294, "flos": 43067882016000.0, "grad_norm": 1.7202194954166072, "language_loss": 0.67590743, "learning_rate": 3.9784971664355683e-08, "loss": 0.69728804, "num_input_tokens_seen": 336592020, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.72265625, "step": 15606, "time_per_iteration": 2.6449928283691406 }, { "auxiliary_loss_clip": 0.01098077, "auxiliary_loss_mlp": 0.01026549, "balance_loss_clip": 1.01525378, "balance_loss_mlp": 1.0326128, "epoch": 0.9383436043889974, "flos": 16436242828800.0, "grad_norm": 1.8735238950571864, "language_loss": 0.78121448, "learning_rate": 3.970771343058166e-08, "loss": 0.80246073, "num_input_tokens_seen": 336610010, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.65625, "step": 15607, "time_per_iteration": 2.421452283859253 }, { "auxiliary_loss_clip": 0.01104637, "auxiliary_loss_mlp": 0.01029343, "balance_loss_clip": 1.01769006, "balance_loss_mlp": 1.03517747, "epoch": 0.9384037276416655, "flos": 20740603271040.0, "grad_norm": 2.2007860257181404, "language_loss": 0.82955682, "learning_rate": 3.963052953128776e-08, "loss": 0.8508966, "num_input_tokens_seen": 336628520, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.69140625, "step": 15608, "time_per_iteration": 2.462867021560669 }, { "auxiliary_loss_clip": 0.01106863, "auxiliary_loss_mlp": 0.01035369, "balance_loss_clip": 1.02263093, "balance_loss_mlp": 1.03773415, "epoch": 0.9384638508943334, "flos": 19062677295360.0, "grad_norm": 5.175557885290405, "language_loss": 0.68943596, "learning_rate": 3.9553419969400536e-08, "loss": 0.71085829, "num_input_tokens_seen": 336647365, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 15609, "time_per_iteration": 2.4336206912994385 }, { "auxiliary_loss_clip": 0.0110449, "auxiliary_loss_mlp": 0.01029279, "balance_loss_clip": 1.01633906, "balance_loss_mlp": 1.03319871, "epoch": 0.9385239741470014, "flos": 23404887694080.0, "grad_norm": 3.3096973212936063, "language_loss": 0.74775136, "learning_rate": 3.9476384747844316e-08, "loss": 0.76908904, "num_input_tokens_seen": 336667165, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 15610, "time_per_iteration": 2.502577304840088 }, { "auxiliary_loss_clip": 0.0110399, "auxiliary_loss_mlp": 0.01030352, "balance_loss_clip": 1.01893771, "balance_loss_mlp": 1.03442729, "epoch": 0.9385840973996693, "flos": 12824742804480.0, "grad_norm": 2.7355046922483983, "language_loss": 0.75080526, "learning_rate": 3.939942386953987e-08, "loss": 0.77214867, "num_input_tokens_seen": 336684130, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6953125, "step": 15611, "time_per_iteration": 2.4331629276275635 }, { "auxiliary_loss_clip": 0.01104666, "auxiliary_loss_mlp": 0.01030671, "balance_loss_clip": 1.01852942, "balance_loss_mlp": 1.03580856, "epoch": 0.9386442206523373, "flos": 15486980152320.0, "grad_norm": 2.2575675548016867, "language_loss": 0.65773916, "learning_rate": 3.9322537337405756e-08, "loss": 0.67909253, "num_input_tokens_seen": 336701520, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 15612, "time_per_iteration": 2.45662522315979 }, { "auxiliary_loss_clip": 0.01099864, "auxiliary_loss_mlp": 0.0102652, "balance_loss_clip": 1.01494503, "balance_loss_mlp": 1.03341293, "epoch": 0.9387043439050052, "flos": 21178821196800.0, "grad_norm": 1.8492016543033483, "language_loss": 0.57164472, "learning_rate": 3.924572515435742e-08, "loss": 0.59290862, "num_input_tokens_seen": 336720675, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 15613, "time_per_iteration": 2.4592461585998535 }, { "auxiliary_loss_clip": 0.01102598, "auxiliary_loss_mlp": 0.0103317, "balance_loss_clip": 1.02092087, "balance_loss_mlp": 1.03335714, "epoch": 0.9387644671576733, "flos": 27668273696640.0, "grad_norm": 3.9070563736792683, "language_loss": 0.70826292, "learning_rate": 3.916898732330764e-08, "loss": 0.72962064, "num_input_tokens_seen": 336741005, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 15614, "time_per_iteration": 2.513291120529175 }, { "auxiliary_loss_clip": 0.01105869, "auxiliary_loss_mlp": 0.01029169, "balance_loss_clip": 1.01656294, "balance_loss_mlp": 1.03493547, "epoch": 0.9388245904103412, "flos": 18836331742080.0, "grad_norm": 1.7321530322083356, "language_loss": 0.8086428, "learning_rate": 3.9092323847166544e-08, "loss": 0.82999313, "num_input_tokens_seen": 336757990, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 15615, "time_per_iteration": 2.4284865856170654 }, { "auxiliary_loss_clip": 0.01100131, "auxiliary_loss_mlp": 0.01026477, "balance_loss_clip": 1.01492572, "balance_loss_mlp": 1.03333414, "epoch": 0.9388847136630092, "flos": 25483828083840.0, "grad_norm": 1.7901513362327068, "language_loss": 0.71998864, "learning_rate": 3.901573472884134e-08, "loss": 0.74125469, "num_input_tokens_seen": 336777705, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.66796875, "step": 15616, "time_per_iteration": 2.5132086277008057 }, { "auxiliary_loss_clip": 0.0110464, "auxiliary_loss_mlp": 0.01027069, "balance_loss_clip": 1.01468301, "balance_loss_mlp": 1.03635526, "epoch": 0.9389448369156771, "flos": 18734992496640.0, "grad_norm": 1.7831951370465728, "language_loss": 0.66082734, "learning_rate": 3.89392199712355e-08, "loss": 0.6821444, "num_input_tokens_seen": 336798275, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 15617, "time_per_iteration": 2.4639251232147217 }, { "auxiliary_loss_clip": 0.01107939, "auxiliary_loss_mlp": 0.01037172, "balance_loss_clip": 1.02385616, "balance_loss_mlp": 1.03609693, "epoch": 0.9390049601683451, "flos": 21717839664000.0, "grad_norm": 2.921961425056345, "language_loss": 0.73666453, "learning_rate": 3.886277957725092e-08, "loss": 0.75811565, "num_input_tokens_seen": 336813835, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 15618, "time_per_iteration": 2.4620461463928223 }, { "auxiliary_loss_clip": 0.01107791, "auxiliary_loss_mlp": 0.01032563, "balance_loss_clip": 1.01887178, "balance_loss_mlp": 1.03548145, "epoch": 0.939065083421013, "flos": 19391224020480.0, "grad_norm": 2.2786429081004376, "language_loss": 0.70480156, "learning_rate": 3.878641354978662e-08, "loss": 0.72620511, "num_input_tokens_seen": 336832210, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 15619, "time_per_iteration": 2.441542863845825 }, { "auxiliary_loss_clip": 0.01104645, "auxiliary_loss_mlp": 0.01033926, "balance_loss_clip": 1.02142072, "balance_loss_mlp": 1.03559351, "epoch": 0.939125206673681, "flos": 24681511946880.0, "grad_norm": 1.7930182444436353, "language_loss": 0.77631092, "learning_rate": 3.8710121891737834e-08, "loss": 0.79769659, "num_input_tokens_seen": 336851380, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 15620, "time_per_iteration": 3.8327479362487793 }, { "auxiliary_loss_clip": 0.01101121, "auxiliary_loss_mlp": 0.01026964, "balance_loss_clip": 1.01495957, "balance_loss_mlp": 1.03434789, "epoch": 0.9391853299263491, "flos": 16325961096960.0, "grad_norm": 2.6295999720339016, "language_loss": 0.73600614, "learning_rate": 3.8633904605998025e-08, "loss": 0.75728703, "num_input_tokens_seen": 336868525, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.66796875, "step": 15621, "time_per_iteration": 2.442375659942627 }, { "auxiliary_loss_clip": 0.01109006, "auxiliary_loss_mlp": 0.0103178, "balance_loss_clip": 1.01884007, "balance_loss_mlp": 1.03754091, "epoch": 0.939245453179017, "flos": 11655778590720.0, "grad_norm": 2.3366381866331567, "language_loss": 0.66467577, "learning_rate": 3.855776169545688e-08, "loss": 0.68608367, "num_input_tokens_seen": 336886200, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 15622, "time_per_iteration": 2.4296793937683105 }, { "auxiliary_loss_clip": 0.0110184, "auxiliary_loss_mlp": 0.01032016, "balance_loss_clip": 1.02024364, "balance_loss_mlp": 1.03430057, "epoch": 0.939305576431685, "flos": 23148700917120.0, "grad_norm": 1.5838216924479134, "language_loss": 0.71858382, "learning_rate": 3.848169316300209e-08, "loss": 0.73992234, "num_input_tokens_seen": 336905815, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.671875, "step": 15623, "time_per_iteration": 2.4849114418029785 }, { "auxiliary_loss_clip": 0.0110934, "auxiliary_loss_mlp": 0.01031818, "balance_loss_clip": 1.01961696, "balance_loss_mlp": 1.03887081, "epoch": 0.9393656996843529, "flos": 33287790706560.0, "grad_norm": 2.032493440841842, "language_loss": 0.72508991, "learning_rate": 3.84056990115178e-08, "loss": 0.74650145, "num_input_tokens_seen": 336928460, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.703125, "step": 15624, "time_per_iteration": 2.5683603286743164 }, { "auxiliary_loss_clip": 0.01101449, "auxiliary_loss_mlp": 0.01029833, "balance_loss_clip": 1.01773286, "balance_loss_mlp": 1.03375447, "epoch": 0.9394258229370209, "flos": 21689434984320.0, "grad_norm": 1.994543819036816, "language_loss": 0.89639562, "learning_rate": 3.832977924388614e-08, "loss": 0.91770846, "num_input_tokens_seen": 336948320, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 15625, "time_per_iteration": 3.8324666023254395 }, { "auxiliary_loss_clip": 0.01102525, "auxiliary_loss_mlp": 0.01030521, "balance_loss_clip": 1.01789033, "balance_loss_mlp": 1.03458667, "epoch": 0.9394859461896888, "flos": 23874203819520.0, "grad_norm": 1.5966094768447676, "language_loss": 0.83659106, "learning_rate": 3.825393386298592e-08, "loss": 0.85792148, "num_input_tokens_seen": 336967670, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 15626, "time_per_iteration": 2.456550121307373 }, { "auxiliary_loss_clip": 0.01028231, "auxiliary_loss_mlp": 0.01001321, "balance_loss_clip": 1.00025964, "balance_loss_mlp": 1.00586486, "epoch": 0.9395460694423569, "flos": 61566116993280.0, "grad_norm": 0.7869517928086818, "language_loss": 0.56131911, "learning_rate": 3.8178162871693284e-08, "loss": 0.58161467, "num_input_tokens_seen": 337028395, "router_z_loss_clip": 0.01062012, "router_z_loss_mlp": 0.22460938, "step": 15627, "time_per_iteration": 3.0410590171813965 }, { "auxiliary_loss_clip": 0.01103286, "auxiliary_loss_mlp": 0.01030051, "balance_loss_clip": 1.01845837, "balance_loss_mlp": 1.03640199, "epoch": 0.9396061926950248, "flos": 20995712640000.0, "grad_norm": 1.640637728522836, "language_loss": 0.70282674, "learning_rate": 3.810246627288105e-08, "loss": 0.72416008, "num_input_tokens_seen": 337048150, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 15628, "time_per_iteration": 5.341508865356445 }, { "auxiliary_loss_clip": 0.01103157, "auxiliary_loss_mlp": 0.01028136, "balance_loss_clip": 1.0159409, "balance_loss_mlp": 1.03534985, "epoch": 0.9396663159476928, "flos": 27487786832640.0, "grad_norm": 4.77284509098779, "language_loss": 0.75651622, "learning_rate": 3.8026844069420025e-08, "loss": 0.77782917, "num_input_tokens_seen": 337069315, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6796875, "step": 15629, "time_per_iteration": 2.5285370349884033 }, { "auxiliary_loss_clip": 0.01098581, "auxiliary_loss_mlp": 0.01029286, "balance_loss_clip": 1.0181756, "balance_loss_mlp": 1.03369927, "epoch": 0.9397264392003607, "flos": 19427457864960.0, "grad_norm": 1.7584744171114715, "language_loss": 0.74394989, "learning_rate": 3.795129626417748e-08, "loss": 0.76522851, "num_input_tokens_seen": 337087765, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6484375, "step": 15630, "time_per_iteration": 2.437934637069702 }, { "auxiliary_loss_clip": 0.01099638, "auxiliary_loss_mlp": 0.01029381, "balance_loss_clip": 1.01794338, "balance_loss_mlp": 1.03422117, "epoch": 0.9397865624530287, "flos": 18004820826240.0, "grad_norm": 1.936974730251846, "language_loss": 0.69508201, "learning_rate": 3.787582286001845e-08, "loss": 0.71637219, "num_input_tokens_seen": 337106265, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.65625, "step": 15631, "time_per_iteration": 2.444305658340454 }, { "auxiliary_loss_clip": 0.01102797, "auxiliary_loss_mlp": 0.01037678, "balance_loss_clip": 1.02608526, "balance_loss_mlp": 1.03506529, "epoch": 0.9398466857056966, "flos": 22564613859840.0, "grad_norm": 1.4546566995374037, "language_loss": 0.74853104, "learning_rate": 3.7800423859805086e-08, "loss": 0.76993585, "num_input_tokens_seen": 337126090, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6796875, "step": 15632, "time_per_iteration": 2.49906325340271 }, { "auxiliary_loss_clip": 0.01111066, "auxiliary_loss_mlp": 0.01035195, "balance_loss_clip": 1.02133667, "balance_loss_mlp": 1.03868258, "epoch": 0.9399068089583646, "flos": 24535678728960.0, "grad_norm": 1.5448293621237634, "language_loss": 0.74465621, "learning_rate": 3.772509926639622e-08, "loss": 0.76611876, "num_input_tokens_seen": 337145655, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.72265625, "step": 15633, "time_per_iteration": 2.5045509338378906 }, { "auxiliary_loss_clip": 0.01105765, "auxiliary_loss_mlp": 0.01032425, "balance_loss_clip": 1.01868033, "balance_loss_mlp": 1.03588223, "epoch": 0.9399669322110327, "flos": 25630343660160.0, "grad_norm": 1.7226061743505228, "language_loss": 0.72794604, "learning_rate": 3.764984908264823e-08, "loss": 0.7493279, "num_input_tokens_seen": 337164805, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.69921875, "step": 15634, "time_per_iteration": 2.5078811645507812 }, { "auxiliary_loss_clip": 0.01104843, "auxiliary_loss_mlp": 0.01029344, "balance_loss_clip": 1.01653445, "balance_loss_mlp": 1.03405011, "epoch": 0.9400270554637006, "flos": 17089385783040.0, "grad_norm": 2.003775781102798, "language_loss": 0.69087601, "learning_rate": 3.75746733114144e-08, "loss": 0.71221781, "num_input_tokens_seen": 337182280, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.70703125, "step": 15635, "time_per_iteration": 2.4868080615997314 }, { "auxiliary_loss_clip": 0.01101074, "auxiliary_loss_mlp": 0.01028399, "balance_loss_clip": 1.01685381, "balance_loss_mlp": 1.03532982, "epoch": 0.9400871787163686, "flos": 22055113393920.0, "grad_norm": 1.6115412006571235, "language_loss": 0.74027097, "learning_rate": 3.7499571955545985e-08, "loss": 0.76156569, "num_input_tokens_seen": 337203495, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.65625, "step": 15636, "time_per_iteration": 2.455615520477295 }, { "auxiliary_loss_clip": 0.01105137, "auxiliary_loss_mlp": 0.01030248, "balance_loss_clip": 1.01799345, "balance_loss_mlp": 1.03568792, "epoch": 0.9401473019690365, "flos": 16982767238400.0, "grad_norm": 2.389559804079875, "language_loss": 0.83141643, "learning_rate": 3.7424545017890054e-08, "loss": 0.85277027, "num_input_tokens_seen": 337220435, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 15637, "time_per_iteration": 2.4543263912200928 }, { "auxiliary_loss_clip": 0.01104067, "auxiliary_loss_mlp": 0.01028525, "balance_loss_clip": 1.01637745, "balance_loss_mlp": 1.03484988, "epoch": 0.9402074252217045, "flos": 19681956702720.0, "grad_norm": 2.4884075965117343, "language_loss": 0.69037992, "learning_rate": 3.7349592501292325e-08, "loss": 0.71170592, "num_input_tokens_seen": 337238095, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.69140625, "step": 15638, "time_per_iteration": 2.4268605709075928 }, { "auxiliary_loss_clip": 0.01098635, "auxiliary_loss_mlp": 0.01033855, "balance_loss_clip": 1.02297103, "balance_loss_mlp": 1.03406692, "epoch": 0.9402675484743724, "flos": 24754302858240.0, "grad_norm": 2.552314385759358, "language_loss": 0.84730577, "learning_rate": 3.727471440859498e-08, "loss": 0.86863077, "num_input_tokens_seen": 337256645, "router_z_loss_clip": 0.10888672, "router_z_loss_mlp": 0.6484375, "step": 15639, "time_per_iteration": 2.4980621337890625 }, { "auxiliary_loss_clip": 0.01103315, "auxiliary_loss_mlp": 0.01026114, "balance_loss_clip": 1.01419318, "balance_loss_mlp": 1.03426301, "epoch": 0.9403276717270405, "flos": 25558630156800.0, "grad_norm": 1.61032874133738, "language_loss": 0.78494877, "learning_rate": 3.719991074263662e-08, "loss": 0.80624306, "num_input_tokens_seen": 337278360, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 15640, "time_per_iteration": 2.4910008907318115 }, { "auxiliary_loss_clip": 0.01106776, "auxiliary_loss_mlp": 0.01032618, "balance_loss_clip": 1.02066147, "balance_loss_mlp": 1.03528392, "epoch": 0.9403877949797084, "flos": 26689852154880.0, "grad_norm": 1.640058497428672, "language_loss": 0.7462461, "learning_rate": 3.7125181506254544e-08, "loss": 0.76763999, "num_input_tokens_seen": 337302480, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.71484375, "step": 15641, "time_per_iteration": 2.5459847450256348 }, { "auxiliary_loss_clip": 0.01107505, "auxiliary_loss_mlp": 0.01034038, "balance_loss_clip": 1.02000666, "balance_loss_mlp": 1.03526819, "epoch": 0.9404479182323764, "flos": 15011666455680.0, "grad_norm": 2.203319474749606, "language_loss": 0.82275176, "learning_rate": 3.7050526702282256e-08, "loss": 0.84416711, "num_input_tokens_seen": 337316600, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.72265625, "step": 15642, "time_per_iteration": 2.405015707015991 }, { "auxiliary_loss_clip": 0.01100098, "auxiliary_loss_mlp": 0.01029647, "balance_loss_clip": 1.01816154, "balance_loss_mlp": 1.03361118, "epoch": 0.9405080414850443, "flos": 24973573432320.0, "grad_norm": 3.3077498922437316, "language_loss": 0.6831367, "learning_rate": 3.697594633355084e-08, "loss": 0.70443422, "num_input_tokens_seen": 337336895, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6640625, "step": 15643, "time_per_iteration": 2.4938442707061768 }, { "auxiliary_loss_clip": 0.01109264, "auxiliary_loss_mlp": 0.0103677, "balance_loss_clip": 1.02354431, "balance_loss_mlp": 1.03798437, "epoch": 0.9405681647377123, "flos": 20844743777280.0, "grad_norm": 1.907644373292258, "language_loss": 0.76458454, "learning_rate": 3.6901440402888226e-08, "loss": 0.78604484, "num_input_tokens_seen": 337355105, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 15644, "time_per_iteration": 2.4450454711914062 }, { "auxiliary_loss_clip": 0.01101255, "auxiliary_loss_mlp": 0.0103248, "balance_loss_clip": 1.02183461, "balance_loss_mlp": 1.03472519, "epoch": 0.9406282879903802, "flos": 23805578885760.0, "grad_norm": 1.6835732579574723, "language_loss": 0.67904353, "learning_rate": 3.682700891311974e-08, "loss": 0.70038092, "num_input_tokens_seen": 337374905, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.6640625, "step": 15645, "time_per_iteration": 2.4820175170898438 }, { "auxiliary_loss_clip": 0.0109771, "auxiliary_loss_mlp": 0.01031462, "balance_loss_clip": 1.01954746, "balance_loss_mlp": 1.0326376, "epoch": 0.9406884112430483, "flos": 27674953626240.0, "grad_norm": 1.4548724234397048, "language_loss": 0.70202529, "learning_rate": 3.6752651867067774e-08, "loss": 0.72331697, "num_input_tokens_seen": 337397130, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6484375, "step": 15646, "time_per_iteration": 2.5047380924224854 }, { "auxiliary_loss_clip": 0.01099899, "auxiliary_loss_mlp": 0.01031012, "balance_loss_clip": 1.01885307, "balance_loss_mlp": 1.03318524, "epoch": 0.9407485344957163, "flos": 23075048079360.0, "grad_norm": 2.474593702987953, "language_loss": 0.74358714, "learning_rate": 3.667836926755208e-08, "loss": 0.76489621, "num_input_tokens_seen": 337418660, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6640625, "step": 15647, "time_per_iteration": 2.495995044708252 }, { "auxiliary_loss_clip": 0.01027833, "auxiliary_loss_mlp": 0.01001399, "balance_loss_clip": 1.00049257, "balance_loss_mlp": 1.00553536, "epoch": 0.9408086577483842, "flos": 71014034304000.0, "grad_norm": 0.8962875208628127, "language_loss": 0.63512504, "learning_rate": 3.660416111738907e-08, "loss": 0.65541738, "num_input_tokens_seen": 337478055, "router_z_loss_clip": 0.0090332, "router_z_loss_mlp": 0.22363281, "step": 15648, "time_per_iteration": 3.1979973316192627 }, { "auxiliary_loss_clip": 0.01099936, "auxiliary_loss_mlp": 0.01031158, "balance_loss_clip": 1.02018523, "balance_loss_mlp": 1.03426278, "epoch": 0.9408687810010522, "flos": 23730956380800.0, "grad_norm": 1.5891712737912072, "language_loss": 0.66371715, "learning_rate": 3.653002741939337e-08, "loss": 0.68502808, "num_input_tokens_seen": 337499405, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.65625, "step": 15649, "time_per_iteration": 2.5044312477111816 }, { "auxiliary_loss_clip": 0.01101544, "auxiliary_loss_mlp": 0.01031129, "balance_loss_clip": 1.01928604, "balance_loss_mlp": 1.03300381, "epoch": 0.9409289042537201, "flos": 18369314087040.0, "grad_norm": 2.3739394316557774, "language_loss": 0.77909237, "learning_rate": 3.645596817637586e-08, "loss": 0.80041909, "num_input_tokens_seen": 337517195, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 15650, "time_per_iteration": 2.4396474361419678 }, { "auxiliary_loss_clip": 0.01104782, "auxiliary_loss_mlp": 0.01033034, "balance_loss_clip": 1.02129793, "balance_loss_mlp": 1.03711128, "epoch": 0.9409890275063881, "flos": 23878333883520.0, "grad_norm": 3.4295368473706125, "language_loss": 0.74498504, "learning_rate": 3.638198339114451e-08, "loss": 0.76636314, "num_input_tokens_seen": 337535245, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 15651, "time_per_iteration": 2.4924941062927246 }, { "auxiliary_loss_clip": 0.01103459, "auxiliary_loss_mlp": 0.01034428, "balance_loss_clip": 1.02156532, "balance_loss_mlp": 1.03518581, "epoch": 0.941049150759056, "flos": 16545088016640.0, "grad_norm": 1.8083595962924435, "language_loss": 0.72275209, "learning_rate": 3.630807306650507e-08, "loss": 0.74413091, "num_input_tokens_seen": 337553040, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.68359375, "step": 15652, "time_per_iteration": 2.4556939601898193 }, { "auxiliary_loss_clip": 0.01107992, "auxiliary_loss_mlp": 0.0103275, "balance_loss_clip": 1.01989913, "balance_loss_mlp": 1.03507638, "epoch": 0.9411092740117241, "flos": 25118401069440.0, "grad_norm": 1.9372758008445616, "language_loss": 0.66198081, "learning_rate": 3.6234237205260645e-08, "loss": 0.68338823, "num_input_tokens_seen": 337574580, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7265625, "step": 15653, "time_per_iteration": 2.517181873321533 }, { "auxiliary_loss_clip": 0.01104512, "auxiliary_loss_mlp": 0.01033923, "balance_loss_clip": 1.02130449, "balance_loss_mlp": 1.03525043, "epoch": 0.941169397264392, "flos": 21142264129920.0, "grad_norm": 1.9151813166673846, "language_loss": 0.77971923, "learning_rate": 3.6160475810210536e-08, "loss": 0.80110359, "num_input_tokens_seen": 337593010, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69140625, "step": 15654, "time_per_iteration": 2.4930033683776855 }, { "auxiliary_loss_clip": 0.0110782, "auxiliary_loss_mlp": 0.01031737, "balance_loss_clip": 1.01920176, "balance_loss_mlp": 1.03526366, "epoch": 0.94122952051706, "flos": 38508914995200.0, "grad_norm": 1.7863138085768018, "language_loss": 0.69994009, "learning_rate": 3.6086788884152065e-08, "loss": 0.72133571, "num_input_tokens_seen": 337616170, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7265625, "step": 15655, "time_per_iteration": 2.624237298965454 }, { "auxiliary_loss_clip": 0.01103689, "auxiliary_loss_mlp": 0.01032362, "balance_loss_clip": 1.01875997, "balance_loss_mlp": 1.03486252, "epoch": 0.9412896437697279, "flos": 18369206346240.0, "grad_norm": 2.02321308937992, "language_loss": 0.72206402, "learning_rate": 3.601317642987944e-08, "loss": 0.74342448, "num_input_tokens_seen": 337635215, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.6875, "step": 15656, "time_per_iteration": 2.488840341567993 }, { "auxiliary_loss_clip": 0.01102741, "auxiliary_loss_mlp": 0.01027739, "balance_loss_clip": 1.0160327, "balance_loss_mlp": 1.03459704, "epoch": 0.9413497670223959, "flos": 25884950238720.0, "grad_norm": 2.329698812295537, "language_loss": 0.78077638, "learning_rate": 3.593963845018377e-08, "loss": 0.80208117, "num_input_tokens_seen": 337654195, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 15657, "time_per_iteration": 2.504995107650757 }, { "auxiliary_loss_clip": 0.011007, "auxiliary_loss_mlp": 0.01028536, "balance_loss_clip": 1.01631081, "balance_loss_mlp": 1.03248906, "epoch": 0.9414098902750638, "flos": 16618309891200.0, "grad_norm": 2.35919245521197, "language_loss": 0.84215176, "learning_rate": 3.586617494785371e-08, "loss": 0.86344415, "num_input_tokens_seen": 337671810, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6796875, "step": 15658, "time_per_iteration": 2.41355037689209 }, { "auxiliary_loss_clip": 0.01108877, "auxiliary_loss_mlp": 0.01035218, "balance_loss_clip": 1.0210861, "balance_loss_mlp": 1.03635919, "epoch": 0.9414700135277319, "flos": 18625033987200.0, "grad_norm": 2.1598199167369034, "language_loss": 0.70770216, "learning_rate": 3.5792785925675254e-08, "loss": 0.72914314, "num_input_tokens_seen": 337689410, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.72265625, "step": 15659, "time_per_iteration": 2.4504647254943848 }, { "auxiliary_loss_clip": 0.01099716, "auxiliary_loss_mlp": 0.01036865, "balance_loss_clip": 1.02568328, "balance_loss_mlp": 1.03327084, "epoch": 0.9415301367803999, "flos": 26280146649600.0, "grad_norm": 1.80292595013577, "language_loss": 0.79904485, "learning_rate": 3.571947138643172e-08, "loss": 0.82041067, "num_input_tokens_seen": 337709950, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6640625, "step": 15660, "time_per_iteration": 2.479177951812744 }, { "auxiliary_loss_clip": 0.01100113, "auxiliary_loss_mlp": 0.01029085, "balance_loss_clip": 1.01753354, "balance_loss_mlp": 1.03403509, "epoch": 0.9415902600330678, "flos": 23261388860160.0, "grad_norm": 1.4584376025384507, "language_loss": 0.68024719, "learning_rate": 3.564623133290201e-08, "loss": 0.70153916, "num_input_tokens_seen": 337731320, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 15661, "time_per_iteration": 2.517301321029663 }, { "auxiliary_loss_clip": 0.01102146, "auxiliary_loss_mlp": 0.01030643, "balance_loss_clip": 1.01877522, "balance_loss_mlp": 1.033463, "epoch": 0.9416503832857358, "flos": 14719138093440.0, "grad_norm": 2.014956256344648, "language_loss": 0.66674554, "learning_rate": 3.557306576786434e-08, "loss": 0.6880734, "num_input_tokens_seen": 337747720, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 15662, "time_per_iteration": 3.7906813621520996 }, { "auxiliary_loss_clip": 0.01027692, "auxiliary_loss_mlp": 0.01001291, "balance_loss_clip": 1.00018239, "balance_loss_mlp": 1.00527525, "epoch": 0.9417105065384037, "flos": 70312698276480.0, "grad_norm": 0.775876966347202, "language_loss": 0.59293365, "learning_rate": 3.5499974694092935e-08, "loss": 0.61322343, "num_input_tokens_seen": 337806930, "router_z_loss_clip": 0.0111084, "router_z_loss_mlp": 0.22460938, "step": 15663, "time_per_iteration": 3.1685738563537598 }, { "auxiliary_loss_clip": 0.01107838, "auxiliary_loss_mlp": 0.01035022, "balance_loss_clip": 1.02162266, "balance_loss_mlp": 1.03546441, "epoch": 0.9417706297910717, "flos": 34057895322240.0, "grad_norm": 2.095410819011796, "language_loss": 0.66470987, "learning_rate": 3.542695811435914e-08, "loss": 0.68613845, "num_input_tokens_seen": 337828100, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 15664, "time_per_iteration": 2.566152811050415 }, { "auxiliary_loss_clip": 0.01104347, "auxiliary_loss_mlp": 0.0103025, "balance_loss_clip": 1.01863837, "balance_loss_mlp": 1.03633237, "epoch": 0.9418307530437396, "flos": 16471614746880.0, "grad_norm": 2.0245276253894726, "language_loss": 0.73205531, "learning_rate": 3.535401603143207e-08, "loss": 0.75340128, "num_input_tokens_seen": 337844805, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 15665, "time_per_iteration": 2.421699047088623 }, { "auxiliary_loss_clip": 0.01101879, "auxiliary_loss_mlp": 0.01035336, "balance_loss_clip": 1.02373123, "balance_loss_mlp": 1.03561342, "epoch": 0.9418908762964077, "flos": 11253543114240.0, "grad_norm": 2.412220443561427, "language_loss": 0.63810647, "learning_rate": 3.528114844807773e-08, "loss": 0.65947855, "num_input_tokens_seen": 337860490, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6640625, "step": 15666, "time_per_iteration": 3.8308513164520264 }, { "auxiliary_loss_clip": 0.01102515, "auxiliary_loss_mlp": 0.01035382, "balance_loss_clip": 1.02300763, "balance_loss_mlp": 1.03359556, "epoch": 0.9419509995490756, "flos": 18438836860800.0, "grad_norm": 1.8903330834356498, "language_loss": 0.7929163, "learning_rate": 3.520835536705902e-08, "loss": 0.81429523, "num_input_tokens_seen": 337878360, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 15667, "time_per_iteration": 2.421084403991699 }, { "auxiliary_loss_clip": 0.01100391, "auxiliary_loss_mlp": 0.01026936, "balance_loss_clip": 1.01592159, "balance_loss_mlp": 1.03368843, "epoch": 0.9420111228017436, "flos": 20737945664640.0, "grad_norm": 1.9323479648099444, "language_loss": 0.75466132, "learning_rate": 3.5135636791136404e-08, "loss": 0.77593458, "num_input_tokens_seen": 337895635, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6640625, "step": 15668, "time_per_iteration": 2.4690074920654297 }, { "auxiliary_loss_clip": 0.01105858, "auxiliary_loss_mlp": 0.01029639, "balance_loss_clip": 1.01724672, "balance_loss_mlp": 1.03494167, "epoch": 0.9420712460544115, "flos": 21141940907520.0, "grad_norm": 3.199085101126363, "language_loss": 0.59098768, "learning_rate": 3.506299272306723e-08, "loss": 0.61234266, "num_input_tokens_seen": 337913940, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 15669, "time_per_iteration": 2.4456839561462402 }, { "auxiliary_loss_clip": 0.01097699, "auxiliary_loss_mlp": 0.0102998, "balance_loss_clip": 1.01852953, "balance_loss_mlp": 1.03260553, "epoch": 0.9421313693070795, "flos": 15851760721920.0, "grad_norm": 1.674291328627731, "language_loss": 0.76851541, "learning_rate": 3.4990423165606406e-08, "loss": 0.78979218, "num_input_tokens_seen": 337932015, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.65234375, "step": 15670, "time_per_iteration": 5.324063539505005 }, { "auxiliary_loss_clip": 0.01104454, "auxiliary_loss_mlp": 0.01035575, "balance_loss_clip": 1.0230763, "balance_loss_mlp": 1.0359112, "epoch": 0.9421914925597474, "flos": 32415915882240.0, "grad_norm": 2.1321547012731434, "language_loss": 0.65128934, "learning_rate": 3.491792812150574e-08, "loss": 0.67268968, "num_input_tokens_seen": 337953345, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 15671, "time_per_iteration": 2.559993267059326 }, { "auxiliary_loss_clip": 0.01101987, "auxiliary_loss_mlp": 0.01030816, "balance_loss_clip": 1.01851308, "balance_loss_mlp": 1.03431845, "epoch": 0.9422516158124155, "flos": 19718513769600.0, "grad_norm": 1.5708898410684977, "language_loss": 0.79179722, "learning_rate": 3.48455075935139e-08, "loss": 0.81312525, "num_input_tokens_seen": 337973685, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.67578125, "step": 15672, "time_per_iteration": 2.452552318572998 }, { "auxiliary_loss_clip": 0.01107127, "auxiliary_loss_mlp": 0.010331, "balance_loss_clip": 1.01984394, "balance_loss_mlp": 1.03518426, "epoch": 0.9423117390650835, "flos": 16253277926400.0, "grad_norm": 3.974371494265877, "language_loss": 0.73528892, "learning_rate": 3.47731615843776e-08, "loss": 0.75669122, "num_input_tokens_seen": 337989175, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 15673, "time_per_iteration": 2.4517359733581543 }, { "auxiliary_loss_clip": 0.01103728, "auxiliary_loss_mlp": 0.01027284, "balance_loss_clip": 1.01514256, "balance_loss_mlp": 1.03512847, "epoch": 0.9423718623177514, "flos": 31796564647680.0, "grad_norm": 4.875606568909956, "language_loss": 0.70197308, "learning_rate": 3.470089009683974e-08, "loss": 0.72328317, "num_input_tokens_seen": 338011800, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 15674, "time_per_iteration": 2.5425875186920166 }, { "auxiliary_loss_clip": 0.0110201, "auxiliary_loss_mlp": 0.0102482, "balance_loss_clip": 1.01313734, "balance_loss_mlp": 1.03301787, "epoch": 0.9424319855704194, "flos": 23331809473920.0, "grad_norm": 3.1590142511019814, "language_loss": 0.81393397, "learning_rate": 3.462869313364125e-08, "loss": 0.83520222, "num_input_tokens_seen": 338032120, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6875, "step": 15675, "time_per_iteration": 2.4703352451324463 }, { "auxiliary_loss_clip": 0.01104008, "auxiliary_loss_mlp": 0.01026979, "balance_loss_clip": 1.01555228, "balance_loss_mlp": 1.03599584, "epoch": 0.9424921088230873, "flos": 20777627214720.0, "grad_norm": 5.403672440934052, "language_loss": 0.62838179, "learning_rate": 3.4556570697519494e-08, "loss": 0.64969164, "num_input_tokens_seen": 338051880, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6796875, "step": 15676, "time_per_iteration": 2.436384916305542 }, { "auxiliary_loss_clip": 0.01103061, "auxiliary_loss_mlp": 0.01035516, "balance_loss_clip": 1.02348757, "balance_loss_mlp": 1.03464937, "epoch": 0.9425522320757553, "flos": 19026658932480.0, "grad_norm": 1.8044138497439404, "language_loss": 0.67269123, "learning_rate": 3.448452279120984e-08, "loss": 0.69407696, "num_input_tokens_seen": 338069665, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 15677, "time_per_iteration": 2.4515068531036377 }, { "auxiliary_loss_clip": 0.01104815, "auxiliary_loss_mlp": 0.01032649, "balance_loss_clip": 1.01953626, "balance_loss_mlp": 1.03381157, "epoch": 0.9426123553284232, "flos": 25155353185920.0, "grad_norm": 2.050843138722842, "language_loss": 0.64292037, "learning_rate": 3.441254941744387e-08, "loss": 0.66429508, "num_input_tokens_seen": 338090490, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 15678, "time_per_iteration": 2.4738142490386963 }, { "auxiliary_loss_clip": 0.01102341, "auxiliary_loss_mlp": 0.01028422, "balance_loss_clip": 1.01625109, "balance_loss_mlp": 1.03465104, "epoch": 0.9426724785810913, "flos": 21179359900800.0, "grad_norm": 1.5171430114488953, "language_loss": 0.73973548, "learning_rate": 3.434065057895097e-08, "loss": 0.76104313, "num_input_tokens_seen": 338109825, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 15679, "time_per_iteration": 2.463740825653076 }, { "auxiliary_loss_clip": 0.01106832, "auxiliary_loss_mlp": 0.01032032, "balance_loss_clip": 1.01990259, "balance_loss_mlp": 1.03587699, "epoch": 0.9427326018337592, "flos": 14756916222720.0, "grad_norm": 3.0808636387631454, "language_loss": 0.77282912, "learning_rate": 3.426882627845762e-08, "loss": 0.79421777, "num_input_tokens_seen": 338125790, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.7109375, "step": 15680, "time_per_iteration": 2.4107489585876465 }, { "auxiliary_loss_clip": 0.01102235, "auxiliary_loss_mlp": 0.01035121, "balance_loss_clip": 1.02281308, "balance_loss_mlp": 1.03422904, "epoch": 0.9427927250864272, "flos": 20923640000640.0, "grad_norm": 1.9662257189111048, "language_loss": 0.7574231, "learning_rate": 3.419707651868742e-08, "loss": 0.77879667, "num_input_tokens_seen": 338145610, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 15681, "time_per_iteration": 2.4741647243499756 }, { "auxiliary_loss_clip": 0.01106068, "auxiliary_loss_mlp": 0.01033813, "balance_loss_clip": 1.02178526, "balance_loss_mlp": 1.03602242, "epoch": 0.9428528483390951, "flos": 19752520970880.0, "grad_norm": 2.4480703152460928, "language_loss": 0.6587258, "learning_rate": 3.412540130236086e-08, "loss": 0.68012464, "num_input_tokens_seen": 338165960, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 15682, "time_per_iteration": 2.4496397972106934 }, { "auxiliary_loss_clip": 0.01099591, "auxiliary_loss_mlp": 0.01028106, "balance_loss_clip": 1.01592267, "balance_loss_mlp": 1.03203034, "epoch": 0.9429129715917631, "flos": 24534996370560.0, "grad_norm": 1.853665177785085, "language_loss": 0.76755702, "learning_rate": 3.405380063219665e-08, "loss": 0.78883398, "num_input_tokens_seen": 338187215, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.67578125, "step": 15683, "time_per_iteration": 2.494096279144287 }, { "auxiliary_loss_clip": 0.01109007, "auxiliary_loss_mlp": 0.01037417, "balance_loss_clip": 1.02429175, "balance_loss_mlp": 1.03784597, "epoch": 0.942973094844431, "flos": 17959824063360.0, "grad_norm": 3.0588245626685593, "language_loss": 0.75664002, "learning_rate": 3.398227451090885e-08, "loss": 0.77810425, "num_input_tokens_seen": 338201825, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 15684, "time_per_iteration": 2.4039130210876465 }, { "auxiliary_loss_clip": 0.01098895, "auxiliary_loss_mlp": 0.01024069, "balance_loss_clip": 1.01254189, "balance_loss_mlp": 1.03315032, "epoch": 0.9430332180970991, "flos": 26137689310080.0, "grad_norm": 1.582589025636366, "language_loss": 0.77086669, "learning_rate": 3.391082294121017e-08, "loss": 0.79209626, "num_input_tokens_seen": 338220865, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.65625, "step": 15685, "time_per_iteration": 2.502429723739624 }, { "auxiliary_loss_clip": 0.01099994, "auxiliary_loss_mlp": 0.01028715, "balance_loss_clip": 1.01734281, "balance_loss_mlp": 1.03357518, "epoch": 0.943093341349767, "flos": 23951376190080.0, "grad_norm": 3.1819206192572085, "language_loss": 0.75676411, "learning_rate": 3.383944592581023e-08, "loss": 0.7780512, "num_input_tokens_seen": 338240160, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6640625, "step": 15686, "time_per_iteration": 2.472306489944458 }, { "auxiliary_loss_clip": 0.01104552, "auxiliary_loss_mlp": 0.01031414, "balance_loss_clip": 1.01883101, "balance_loss_mlp": 1.03395307, "epoch": 0.943153464602435, "flos": 17968407413760.0, "grad_norm": 1.7941288136442444, "language_loss": 0.80607176, "learning_rate": 3.376814346741575e-08, "loss": 0.82743144, "num_input_tokens_seen": 338259305, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 15687, "time_per_iteration": 2.456542730331421 }, { "auxiliary_loss_clip": 0.01108703, "auxiliary_loss_mlp": 0.01036002, "balance_loss_clip": 1.02207875, "balance_loss_mlp": 1.0366466, "epoch": 0.943213587855103, "flos": 14501519544960.0, "grad_norm": 5.182555578475263, "language_loss": 0.76033878, "learning_rate": 3.369691556873011e-08, "loss": 0.78178585, "num_input_tokens_seen": 338274950, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.71875, "step": 15688, "time_per_iteration": 2.4305331707000732 }, { "auxiliary_loss_clip": 0.01099007, "auxiliary_loss_mlp": 0.01026549, "balance_loss_clip": 1.01418102, "balance_loss_mlp": 1.03377962, "epoch": 0.9432737111077709, "flos": 28986411093120.0, "grad_norm": 1.9809757090052869, "language_loss": 0.68370783, "learning_rate": 3.3625762232454504e-08, "loss": 0.70496339, "num_input_tokens_seen": 338295585, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.65234375, "step": 15689, "time_per_iteration": 2.5194544792175293 }, { "auxiliary_loss_clip": 0.01101673, "auxiliary_loss_mlp": 0.01029393, "balance_loss_clip": 1.01914096, "balance_loss_mlp": 1.03436852, "epoch": 0.9433338343604389, "flos": 21609066303360.0, "grad_norm": 2.198565554046775, "language_loss": 0.80719721, "learning_rate": 3.35546834612872e-08, "loss": 0.8285079, "num_input_tokens_seen": 338314555, "router_z_loss_clip": 0.10253906, "router_z_loss_mlp": 0.671875, "step": 15690, "time_per_iteration": 2.443930149078369 }, { "auxiliary_loss_clip": 0.01102758, "auxiliary_loss_mlp": 0.01029542, "balance_loss_clip": 1.01786494, "balance_loss_mlp": 1.035218, "epoch": 0.9433939576131068, "flos": 33182285483520.0, "grad_norm": 1.8488163765044079, "language_loss": 0.60103214, "learning_rate": 3.348367925792317e-08, "loss": 0.6223551, "num_input_tokens_seen": 338336260, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 15691, "time_per_iteration": 2.5580973625183105 }, { "auxiliary_loss_clip": 0.01108302, "auxiliary_loss_mlp": 0.01029917, "balance_loss_clip": 1.0181272, "balance_loss_mlp": 1.03822851, "epoch": 0.9434540808657749, "flos": 20486391742080.0, "grad_norm": 1.5901852709804907, "language_loss": 0.6653713, "learning_rate": 3.341274962505514e-08, "loss": 0.68675351, "num_input_tokens_seen": 338354680, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.703125, "step": 15692, "time_per_iteration": 2.4540138244628906 }, { "auxiliary_loss_clip": 0.01105099, "auxiliary_loss_mlp": 0.01030694, "balance_loss_clip": 1.01870108, "balance_loss_mlp": 1.03561473, "epoch": 0.9435142041184428, "flos": 21542955321600.0, "grad_norm": 5.658951398426245, "language_loss": 0.7470839, "learning_rate": 3.334189456537251e-08, "loss": 0.7684418, "num_input_tokens_seen": 338372490, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 15693, "time_per_iteration": 2.4960474967956543 }, { "auxiliary_loss_clip": 0.01104366, "auxiliary_loss_mlp": 0.01032324, "balance_loss_clip": 1.01981902, "balance_loss_mlp": 1.03572261, "epoch": 0.9435743273711108, "flos": 25009089004800.0, "grad_norm": 1.6421330889405337, "language_loss": 0.73378301, "learning_rate": 3.327111408156291e-08, "loss": 0.7551499, "num_input_tokens_seen": 338390870, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 15694, "time_per_iteration": 2.479641914367676 }, { "auxiliary_loss_clip": 0.01028147, "auxiliary_loss_mlp": 0.01000009, "balance_loss_clip": 0.99892998, "balance_loss_mlp": 1.0058465, "epoch": 0.9436344506237787, "flos": 60158707320960.0, "grad_norm": 0.8619489335183576, "language_loss": 0.50605369, "learning_rate": 3.3200408176309316e-08, "loss": 0.52633524, "num_input_tokens_seen": 338453075, "router_z_loss_clip": 0.01080322, "router_z_loss_mlp": 0.22363281, "step": 15695, "time_per_iteration": 3.156944990158081 }, { "auxiliary_loss_clip": 0.01099055, "auxiliary_loss_mlp": 0.01027941, "balance_loss_clip": 1.01658642, "balance_loss_mlp": 1.03418529, "epoch": 0.9436945738764467, "flos": 22237252283520.0, "grad_norm": 4.196620236066197, "language_loss": 0.65120828, "learning_rate": 3.312977685229335e-08, "loss": 0.67247832, "num_input_tokens_seen": 338471770, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6484375, "step": 15696, "time_per_iteration": 2.452172040939331 }, { "auxiliary_loss_clip": 0.01103513, "auxiliary_loss_mlp": 0.01029958, "balance_loss_clip": 1.018067, "balance_loss_mlp": 1.03567982, "epoch": 0.9437546971291146, "flos": 25045179194880.0, "grad_norm": 8.79583014089453, "language_loss": 0.65927708, "learning_rate": 3.305922011219353e-08, "loss": 0.68061185, "num_input_tokens_seen": 338492190, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 15697, "time_per_iteration": 2.4780638217926025 }, { "auxiliary_loss_clip": 0.01027612, "auxiliary_loss_mlp": 0.01001256, "balance_loss_clip": 1.00021887, "balance_loss_mlp": 1.00530648, "epoch": 0.9438148203817827, "flos": 56790788400000.0, "grad_norm": 0.8449449653812693, "language_loss": 0.6322149, "learning_rate": 3.298873795868506e-08, "loss": 0.65250361, "num_input_tokens_seen": 338552560, "router_z_loss_clip": 0.01037598, "router_z_loss_mlp": 0.22265625, "step": 15698, "time_per_iteration": 3.003232955932617 }, { "auxiliary_loss_clip": 0.01107045, "auxiliary_loss_mlp": 0.01035601, "balance_loss_clip": 1.02272058, "balance_loss_mlp": 1.0354836, "epoch": 0.9438749436344506, "flos": 22346384780160.0, "grad_norm": 1.7953358464738325, "language_loss": 0.69790065, "learning_rate": 3.291833039444092e-08, "loss": 0.71932715, "num_input_tokens_seen": 338571770, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 15699, "time_per_iteration": 2.4901349544525146 }, { "auxiliary_loss_clip": 0.01100368, "auxiliary_loss_mlp": 0.01029989, "balance_loss_clip": 1.01820552, "balance_loss_mlp": 1.03410554, "epoch": 0.9439350668871186, "flos": 13370800337280.0, "grad_norm": 2.0739197995599845, "language_loss": 0.74520546, "learning_rate": 3.2847997422130734e-08, "loss": 0.76650906, "num_input_tokens_seen": 338587310, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6640625, "step": 15700, "time_per_iteration": 2.420700788497925 }, { "auxiliary_loss_clip": 0.01102412, "auxiliary_loss_mlp": 0.010314, "balance_loss_clip": 1.01971745, "balance_loss_mlp": 1.03521109, "epoch": 0.9439951901397866, "flos": 17785334770560.0, "grad_norm": 1.7385171650308746, "language_loss": 0.70246422, "learning_rate": 3.2777739044421495e-08, "loss": 0.72380233, "num_input_tokens_seen": 338606235, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 15701, "time_per_iteration": 2.4602653980255127 }, { "auxiliary_loss_clip": 0.01106958, "auxiliary_loss_mlp": 0.01027666, "balance_loss_clip": 1.01529264, "balance_loss_mlp": 1.03461599, "epoch": 0.9440553133924545, "flos": 18879568738560.0, "grad_norm": 2.010638449427024, "language_loss": 0.77953184, "learning_rate": 3.2707555263977505e-08, "loss": 0.80087805, "num_input_tokens_seen": 338624090, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7265625, "step": 15702, "time_per_iteration": 2.426851987838745 }, { "auxiliary_loss_clip": 0.01105295, "auxiliary_loss_mlp": 0.01033464, "balance_loss_clip": 1.02163887, "balance_loss_mlp": 1.03573525, "epoch": 0.9441154366451225, "flos": 19572967860480.0, "grad_norm": 1.8053219010581911, "language_loss": 0.66581941, "learning_rate": 3.2637446083460194e-08, "loss": 0.68720698, "num_input_tokens_seen": 338643695, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 15703, "time_per_iteration": 2.4449691772460938 }, { "auxiliary_loss_clip": 0.01106205, "auxiliary_loss_mlp": 0.01027475, "balance_loss_clip": 1.01479721, "balance_loss_mlp": 1.03579426, "epoch": 0.9441755598977905, "flos": 30294995472000.0, "grad_norm": 1.9276689171376393, "language_loss": 0.73409307, "learning_rate": 3.256741150552833e-08, "loss": 0.7554298, "num_input_tokens_seen": 338664725, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 15704, "time_per_iteration": 3.8638885021209717 }, { "auxiliary_loss_clip": 0.0110163, "auxiliary_loss_mlp": 0.0103307, "balance_loss_clip": 1.02048779, "balance_loss_mlp": 1.03419948, "epoch": 0.9442356831504585, "flos": 20667884186880.0, "grad_norm": 1.7067891852825074, "language_loss": 0.74947608, "learning_rate": 3.2497451532837336e-08, "loss": 0.77082312, "num_input_tokens_seen": 338683990, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.67578125, "step": 15705, "time_per_iteration": 2.5076024532318115 }, { "auxiliary_loss_clip": 0.01104492, "auxiliary_loss_mlp": 0.01035466, "balance_loss_clip": 1.02418852, "balance_loss_mlp": 1.03592646, "epoch": 0.9442958064031264, "flos": 16107265140480.0, "grad_norm": 2.1668789197778753, "language_loss": 0.77520514, "learning_rate": 3.2427566168039986e-08, "loss": 0.79660469, "num_input_tokens_seen": 338702025, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6875, "step": 15706, "time_per_iteration": 2.4418222904205322 }, { "auxiliary_loss_clip": 0.01098672, "auxiliary_loss_mlp": 0.01026322, "balance_loss_clip": 1.01499152, "balance_loss_mlp": 1.03296781, "epoch": 0.9443559296557944, "flos": 20447392550400.0, "grad_norm": 1.502014798770105, "language_loss": 0.69266558, "learning_rate": 3.23577554137866e-08, "loss": 0.71391547, "num_input_tokens_seen": 338720920, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.65625, "step": 15707, "time_per_iteration": 2.4359097480773926 }, { "auxiliary_loss_clip": 0.01095111, "auxiliary_loss_mlp": 0.0102585, "balance_loss_clip": 1.01527035, "balance_loss_mlp": 1.03089881, "epoch": 0.9444160529084623, "flos": 21610897896960.0, "grad_norm": 1.965898046865137, "language_loss": 0.69358838, "learning_rate": 3.22880192727244e-08, "loss": 0.71479797, "num_input_tokens_seen": 338739590, "router_z_loss_clip": 0.10595703, "router_z_loss_mlp": 0.640625, "step": 15708, "time_per_iteration": 3.8853938579559326 }, { "auxiliary_loss_clip": 0.01103077, "auxiliary_loss_mlp": 0.01027446, "balance_loss_clip": 1.0159781, "balance_loss_mlp": 1.03540206, "epoch": 0.9444761761611303, "flos": 18441781776000.0, "grad_norm": 2.1735219429238524, "language_loss": 0.70533395, "learning_rate": 3.221835774749748e-08, "loss": 0.72663921, "num_input_tokens_seen": 338757240, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.67578125, "step": 15709, "time_per_iteration": 2.435948133468628 }, { "auxiliary_loss_clip": 0.01104148, "auxiliary_loss_mlp": 0.01030683, "balance_loss_clip": 1.0188396, "balance_loss_mlp": 1.03732824, "epoch": 0.9445362994137982, "flos": 20957144411520.0, "grad_norm": 1.9619245126832143, "language_loss": 0.84765375, "learning_rate": 3.214877084074774e-08, "loss": 0.8690021, "num_input_tokens_seen": 338773750, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6640625, "step": 15710, "time_per_iteration": 2.443694591522217 }, { "auxiliary_loss_clip": 0.01109083, "auxiliary_loss_mlp": 0.01036289, "balance_loss_clip": 1.02306318, "balance_loss_mlp": 1.03790569, "epoch": 0.9445964226664663, "flos": 20303283185280.0, "grad_norm": 1.6532240694851341, "language_loss": 0.71600562, "learning_rate": 3.2079258555113956e-08, "loss": 0.73745936, "num_input_tokens_seen": 338792115, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 15711, "time_per_iteration": 2.4480600357055664 }, { "auxiliary_loss_clip": 0.01106081, "auxiliary_loss_mlp": 0.01026944, "balance_loss_clip": 1.0148921, "balance_loss_mlp": 1.0382669, "epoch": 0.9446565459191342, "flos": 26396030903040.0, "grad_norm": 2.405475483622152, "language_loss": 0.69229752, "learning_rate": 3.200982089323179e-08, "loss": 0.71362776, "num_input_tokens_seen": 338812480, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6796875, "step": 15712, "time_per_iteration": 3.9602174758911133 }, { "auxiliary_loss_clip": 0.01107975, "auxiliary_loss_mlp": 0.01035516, "balance_loss_clip": 1.02249241, "balance_loss_mlp": 1.03729188, "epoch": 0.9447166691718022, "flos": 16544764794240.0, "grad_norm": 3.104348928029723, "language_loss": 0.71074045, "learning_rate": 3.1940457857734246e-08, "loss": 0.73217535, "num_input_tokens_seen": 338829105, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 15713, "time_per_iteration": 2.4631922245025635 }, { "auxiliary_loss_clip": 0.01100527, "auxiliary_loss_mlp": 0.01032868, "balance_loss_clip": 1.02005911, "balance_loss_mlp": 1.03419888, "epoch": 0.9447767924244702, "flos": 29164635400320.0, "grad_norm": 1.601059760148871, "language_loss": 0.76497918, "learning_rate": 3.187116945125212e-08, "loss": 0.78631312, "num_input_tokens_seen": 338850670, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.66015625, "step": 15714, "time_per_iteration": 2.502467155456543 }, { "auxiliary_loss_clip": 0.01102493, "auxiliary_loss_mlp": 0.01031953, "balance_loss_clip": 1.01943648, "balance_loss_mlp": 1.03296125, "epoch": 0.9448369156771381, "flos": 19274908803840.0, "grad_norm": 2.086438451850651, "language_loss": 0.67690289, "learning_rate": 3.1801955676412194e-08, "loss": 0.69824731, "num_input_tokens_seen": 338867795, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 15715, "time_per_iteration": 2.4457650184631348 }, { "auxiliary_loss_clip": 0.01105623, "auxiliary_loss_mlp": 0.01030499, "balance_loss_clip": 1.01772547, "balance_loss_mlp": 1.03549933, "epoch": 0.9448970389298061, "flos": 23841166285440.0, "grad_norm": 2.1010452262324915, "language_loss": 0.74896538, "learning_rate": 3.173281653583948e-08, "loss": 0.77032661, "num_input_tokens_seen": 338887205, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 15716, "time_per_iteration": 2.491569995880127 }, { "auxiliary_loss_clip": 0.01107951, "auxiliary_loss_mlp": 0.01027731, "balance_loss_clip": 1.01526141, "balance_loss_mlp": 1.03814268, "epoch": 0.944957162182474, "flos": 22382259488640.0, "grad_norm": 1.7869496947661652, "language_loss": 0.62596726, "learning_rate": 3.166375203215565e-08, "loss": 0.64732409, "num_input_tokens_seen": 338906130, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69921875, "step": 15717, "time_per_iteration": 2.492479085922241 }, { "auxiliary_loss_clip": 0.01102509, "auxiliary_loss_mlp": 0.010297, "balance_loss_clip": 1.01811278, "balance_loss_mlp": 1.03473282, "epoch": 0.9450172854351421, "flos": 17383889393280.0, "grad_norm": 1.9165046505615952, "language_loss": 0.79370749, "learning_rate": 3.1594762167979514e-08, "loss": 0.81502956, "num_input_tokens_seen": 338923045, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.67578125, "step": 15718, "time_per_iteration": 2.413252830505371 }, { "auxiliary_loss_clip": 0.01027805, "auxiliary_loss_mlp": 0.01002693, "balance_loss_clip": 1.00169182, "balance_loss_mlp": 1.00541186, "epoch": 0.94507740868781, "flos": 68466352406400.0, "grad_norm": 0.7040587685882076, "language_loss": 0.5784207, "learning_rate": 3.152584694592719e-08, "loss": 0.59872562, "num_input_tokens_seen": 338987545, "router_z_loss_clip": 0.01000977, "router_z_loss_mlp": 0.22460938, "step": 15719, "time_per_iteration": 3.1248011589050293 }, { "auxiliary_loss_clip": 0.01106389, "auxiliary_loss_mlp": 0.01032437, "balance_loss_clip": 1.0203371, "balance_loss_mlp": 1.03588486, "epoch": 0.945137531940478, "flos": 21142479611520.0, "grad_norm": 1.5508784813556384, "language_loss": 0.75546175, "learning_rate": 3.145700636861193e-08, "loss": 0.77684999, "num_input_tokens_seen": 339007830, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.70703125, "step": 15720, "time_per_iteration": 2.443279266357422 }, { "auxiliary_loss_clip": 0.01101214, "auxiliary_loss_mlp": 0.01028286, "balance_loss_clip": 1.01736081, "balance_loss_mlp": 1.03376532, "epoch": 0.9451976551931459, "flos": 24533918962560.0, "grad_norm": 2.6009465607652142, "language_loss": 0.7263642, "learning_rate": 3.138824043864452e-08, "loss": 0.74765921, "num_input_tokens_seen": 339028980, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.67578125, "step": 15721, "time_per_iteration": 2.4963772296905518 }, { "auxiliary_loss_clip": 0.01103986, "auxiliary_loss_mlp": 0.01032628, "balance_loss_clip": 1.01993847, "balance_loss_mlp": 1.03516078, "epoch": 0.9452577784458139, "flos": 23440582834560.0, "grad_norm": 2.219033402259706, "language_loss": 0.85254312, "learning_rate": 3.131954915863244e-08, "loss": 0.87390924, "num_input_tokens_seen": 339047950, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 15722, "time_per_iteration": 2.466857433319092 }, { "auxiliary_loss_clip": 0.01027379, "auxiliary_loss_mlp": 0.01002087, "balance_loss_clip": 1.00100267, "balance_loss_mlp": 1.00511718, "epoch": 0.9453179016984818, "flos": 52017686449920.0, "grad_norm": 0.9054627286183956, "language_loss": 0.64451432, "learning_rate": 3.125093253118005e-08, "loss": 0.66480899, "num_input_tokens_seen": 339104535, "router_z_loss_clip": 0.01086426, "router_z_loss_mlp": 0.22265625, "step": 15723, "time_per_iteration": 3.0589985847473145 }, { "auxiliary_loss_clip": 0.01105453, "auxiliary_loss_mlp": 0.01029453, "balance_loss_clip": 1.01719809, "balance_loss_mlp": 1.03510165, "epoch": 0.9453780249511499, "flos": 13473001509120.0, "grad_norm": 2.6228234763720146, "language_loss": 0.73021328, "learning_rate": 3.1182390558889715e-08, "loss": 0.75156236, "num_input_tokens_seen": 339122050, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 15724, "time_per_iteration": 2.4464199542999268 }, { "auxiliary_loss_clip": 0.01103094, "auxiliary_loss_mlp": 0.01028003, "balance_loss_clip": 1.01620102, "balance_loss_mlp": 1.03445721, "epoch": 0.9454381482038178, "flos": 23258515772160.0, "grad_norm": 2.309789398351888, "language_loss": 0.85226738, "learning_rate": 3.111392324436024e-08, "loss": 0.87357843, "num_input_tokens_seen": 339138940, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 15725, "time_per_iteration": 2.494220018386841 }, { "auxiliary_loss_clip": 0.01105076, "auxiliary_loss_mlp": 0.01029983, "balance_loss_clip": 1.01776969, "balance_loss_mlp": 1.03569591, "epoch": 0.9454982714564858, "flos": 19496621502720.0, "grad_norm": 1.7774549167416596, "language_loss": 0.7110551, "learning_rate": 3.104553059018822e-08, "loss": 0.73240572, "num_input_tokens_seen": 339158245, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 15726, "time_per_iteration": 2.474078416824341 }, { "auxiliary_loss_clip": 0.01102888, "auxiliary_loss_mlp": 0.01031306, "balance_loss_clip": 1.01853871, "balance_loss_mlp": 1.03457475, "epoch": 0.9455583947091538, "flos": 23258120722560.0, "grad_norm": 1.7214257487466114, "language_loss": 0.60651672, "learning_rate": 3.097721259896735e-08, "loss": 0.6278587, "num_input_tokens_seen": 339178200, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6796875, "step": 15727, "time_per_iteration": 2.484616279602051 }, { "auxiliary_loss_clip": 0.01098714, "auxiliary_loss_mlp": 0.01033039, "balance_loss_clip": 1.022048, "balance_loss_mlp": 1.03319335, "epoch": 0.9456185179618217, "flos": 17673041877120.0, "grad_norm": 1.8290358787772547, "language_loss": 0.81584537, "learning_rate": 3.0908969273287566e-08, "loss": 0.83716297, "num_input_tokens_seen": 339193950, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.65625, "step": 15728, "time_per_iteration": 2.4480178356170654 }, { "auxiliary_loss_clip": 0.01028287, "auxiliary_loss_mlp": 0.01001773, "balance_loss_clip": 1.00073588, "balance_loss_mlp": 1.00579429, "epoch": 0.9456786412144897, "flos": 61415040389760.0, "grad_norm": 0.7472174067079782, "language_loss": 0.59226537, "learning_rate": 3.08408006157368e-08, "loss": 0.61256599, "num_input_tokens_seen": 339252330, "router_z_loss_clip": 0.01037598, "router_z_loss_mlp": 0.22460938, "step": 15729, "time_per_iteration": 3.0265889167785645 }, { "auxiliary_loss_clip": 0.01102185, "auxiliary_loss_mlp": 0.01025498, "balance_loss_clip": 1.013219, "balance_loss_mlp": 1.03419733, "epoch": 0.9457387644671577, "flos": 18588369179520.0, "grad_norm": 1.9030403824808708, "language_loss": 0.76917374, "learning_rate": 3.077270662890052e-08, "loss": 0.79045063, "num_input_tokens_seen": 339270325, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 15730, "time_per_iteration": 2.4175400733947754 }, { "auxiliary_loss_clip": 0.01102984, "auxiliary_loss_mlp": 0.01031541, "balance_loss_clip": 1.01846957, "balance_loss_mlp": 1.03401661, "epoch": 0.9457988877198257, "flos": 21108544237440.0, "grad_norm": 1.3885824286112955, "language_loss": 0.62441975, "learning_rate": 3.070468731536047e-08, "loss": 0.64576495, "num_input_tokens_seen": 339291980, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 15731, "time_per_iteration": 2.5175631046295166 }, { "auxiliary_loss_clip": 0.01104889, "auxiliary_loss_mlp": 0.01025006, "balance_loss_clip": 1.0125668, "balance_loss_mlp": 1.03450561, "epoch": 0.9458590109724936, "flos": 26688379697280.0, "grad_norm": 2.3098200450591104, "language_loss": 0.63983428, "learning_rate": 3.063674267769589e-08, "loss": 0.66113329, "num_input_tokens_seen": 339311795, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 15732, "time_per_iteration": 2.4770865440368652 }, { "auxiliary_loss_clip": 0.01108033, "auxiliary_loss_mlp": 0.01028021, "balance_loss_clip": 1.01462817, "balance_loss_mlp": 1.03515148, "epoch": 0.9459191342251616, "flos": 18661591054080.0, "grad_norm": 2.573436759726771, "language_loss": 0.84144759, "learning_rate": 3.056887271848363e-08, "loss": 0.86280817, "num_input_tokens_seen": 339327745, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 15733, "time_per_iteration": 2.4543681144714355 }, { "auxiliary_loss_clip": 0.01100822, "auxiliary_loss_mlp": 0.0102716, "balance_loss_clip": 1.01635957, "balance_loss_mlp": 1.03512144, "epoch": 0.9459792574778295, "flos": 23398459159680.0, "grad_norm": 1.4953304081191885, "language_loss": 0.72106576, "learning_rate": 3.0501077440297173e-08, "loss": 0.74234557, "num_input_tokens_seen": 339346445, "router_z_loss_clip": 0.10791016, "router_z_loss_mlp": 0.65625, "step": 15734, "time_per_iteration": 2.4608712196350098 }, { "auxiliary_loss_clip": 0.01096519, "auxiliary_loss_mlp": 0.01031341, "balance_loss_clip": 1.02090979, "balance_loss_mlp": 1.03153539, "epoch": 0.9460393807304975, "flos": 24392969994240.0, "grad_norm": 1.6099364322494127, "language_loss": 0.86735344, "learning_rate": 3.043335684570692e-08, "loss": 0.88863212, "num_input_tokens_seen": 339367945, "router_z_loss_clip": 0.10449219, "router_z_loss_mlp": 0.6484375, "step": 15735, "time_per_iteration": 2.499631643295288 }, { "auxiliary_loss_clip": 0.01103086, "auxiliary_loss_mlp": 0.01031759, "balance_loss_clip": 1.0201838, "balance_loss_mlp": 1.0347501, "epoch": 0.9460995039831654, "flos": 21939408708480.0, "grad_norm": 1.830620225288191, "language_loss": 0.67462707, "learning_rate": 3.036571093728102e-08, "loss": 0.69597542, "num_input_tokens_seen": 339386060, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.68359375, "step": 15736, "time_per_iteration": 2.447336435317993 }, { "auxiliary_loss_clip": 0.01028096, "auxiliary_loss_mlp": 0.01003657, "balance_loss_clip": 1.00266171, "balance_loss_mlp": 1.00585842, "epoch": 0.9461596272358335, "flos": 70322466775680.0, "grad_norm": 0.9258926518693704, "language_loss": 0.6526022, "learning_rate": 3.029813971758499e-08, "loss": 0.67291975, "num_input_tokens_seen": 339446695, "router_z_loss_clip": 0.00994873, "router_z_loss_mlp": 0.22265625, "step": 15737, "time_per_iteration": 3.106705665588379 }, { "auxiliary_loss_clip": 0.0102758, "auxiliary_loss_mlp": 0.01001077, "balance_loss_clip": 0.99999791, "balance_loss_mlp": 1.00534177, "epoch": 0.9462197504885014, "flos": 58591242645120.0, "grad_norm": 0.7999696318712316, "language_loss": 0.58785594, "learning_rate": 3.0230643189181225e-08, "loss": 0.6081425, "num_input_tokens_seen": 339510080, "router_z_loss_clip": 0.01080322, "router_z_loss_mlp": 0.22265625, "step": 15738, "time_per_iteration": 3.0716629028320312 }, { "auxiliary_loss_clip": 0.01101085, "auxiliary_loss_mlp": 0.0103207, "balance_loss_clip": 1.02118635, "balance_loss_mlp": 1.03471851, "epoch": 0.9462798737411694, "flos": 23433759250560.0, "grad_norm": 2.080313018850869, "language_loss": 0.72018659, "learning_rate": 3.016322135462834e-08, "loss": 0.74151814, "num_input_tokens_seen": 339529335, "router_z_loss_clip": 0.10888672, "router_z_loss_mlp": 0.6640625, "step": 15739, "time_per_iteration": 2.476867437362671 }, { "auxiliary_loss_clip": 0.01103867, "auxiliary_loss_mlp": 0.01034368, "balance_loss_clip": 1.02166581, "balance_loss_mlp": 1.0352037, "epoch": 0.9463399969938374, "flos": 25046077034880.0, "grad_norm": 2.7337556172424637, "language_loss": 0.6431632, "learning_rate": 3.009587421648363e-08, "loss": 0.66454554, "num_input_tokens_seen": 339548820, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 15740, "time_per_iteration": 2.539152145385742 }, { "auxiliary_loss_clip": 0.01101298, "auxiliary_loss_mlp": 0.01029161, "balance_loss_clip": 1.01724052, "balance_loss_mlp": 1.03443027, "epoch": 0.9464001202465053, "flos": 24352606085760.0, "grad_norm": 2.4390308167515786, "language_loss": 0.6664834, "learning_rate": 3.0028601777301045e-08, "loss": 0.68778801, "num_input_tokens_seen": 339566775, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.66796875, "step": 15741, "time_per_iteration": 2.470391035079956 }, { "auxiliary_loss_clip": 0.0110396, "auxiliary_loss_mlp": 0.01026121, "balance_loss_clip": 1.01444411, "balance_loss_mlp": 1.03521025, "epoch": 0.9464602434991733, "flos": 17165444832000.0, "grad_norm": 2.2359549074269647, "language_loss": 0.76198578, "learning_rate": 2.9961404039630987e-08, "loss": 0.78328657, "num_input_tokens_seen": 339581905, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6875, "step": 15742, "time_per_iteration": 2.4592442512512207 }, { "auxiliary_loss_clip": 0.01101528, "auxiliary_loss_mlp": 0.01029093, "balance_loss_clip": 1.01788688, "balance_loss_mlp": 1.03496635, "epoch": 0.9465203667518413, "flos": 19938107566080.0, "grad_norm": 4.389537245112282, "language_loss": 0.7234509, "learning_rate": 2.989428100602187e-08, "loss": 0.74475718, "num_input_tokens_seen": 339599870, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6640625, "step": 15743, "time_per_iteration": 2.4635958671569824 }, { "auxiliary_loss_clip": 0.01105932, "auxiliary_loss_mlp": 0.01032587, "balance_loss_clip": 1.02004623, "balance_loss_mlp": 1.03612745, "epoch": 0.9465804900045093, "flos": 20120318282880.0, "grad_norm": 1.8275481479546787, "language_loss": 0.79513657, "learning_rate": 2.982723267901943e-08, "loss": 0.81652176, "num_input_tokens_seen": 339620250, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 15744, "time_per_iteration": 2.4776039123535156 }, { "auxiliary_loss_clip": 0.01105757, "auxiliary_loss_mlp": 0.01036448, "balance_loss_clip": 1.02391338, "balance_loss_mlp": 1.03547335, "epoch": 0.9466406132571772, "flos": 23911622812800.0, "grad_norm": 1.6078783844947278, "language_loss": 0.78052527, "learning_rate": 2.9760259061165417e-08, "loss": 0.80194736, "num_input_tokens_seen": 339639900, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 15745, "time_per_iteration": 3.766026496887207 }, { "auxiliary_loss_clip": 0.01104468, "auxiliary_loss_mlp": 0.01030447, "balance_loss_clip": 1.01784635, "balance_loss_mlp": 1.0341177, "epoch": 0.9467007365098452, "flos": 19933223316480.0, "grad_norm": 2.375735995888604, "language_loss": 0.70143706, "learning_rate": 2.9693360155000014e-08, "loss": 0.72278625, "num_input_tokens_seen": 339658970, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 15746, "time_per_iteration": 2.495816946029663 }, { "auxiliary_loss_clip": 0.01104205, "auxiliary_loss_mlp": 0.01024029, "balance_loss_clip": 1.01127362, "balance_loss_mlp": 1.03581309, "epoch": 0.9467608597625131, "flos": 19310496203520.0, "grad_norm": 4.093731897774944, "language_loss": 0.5662576, "learning_rate": 2.962653596305964e-08, "loss": 0.58753997, "num_input_tokens_seen": 339675600, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.68359375, "step": 15747, "time_per_iteration": 2.428515911102295 }, { "auxiliary_loss_clip": 0.01027962, "auxiliary_loss_mlp": 0.01003505, "balance_loss_clip": 1.00250328, "balance_loss_mlp": 1.00569224, "epoch": 0.9468209830151811, "flos": 69630252802560.0, "grad_norm": 0.6565996906473062, "language_loss": 0.53263402, "learning_rate": 2.955978648787871e-08, "loss": 0.55294865, "num_input_tokens_seen": 339744505, "router_z_loss_clip": 0.01000977, "router_z_loss_mlp": 0.22265625, "step": 15748, "time_per_iteration": 3.289579391479492 }, { "auxiliary_loss_clip": 0.01104613, "auxiliary_loss_mlp": 0.01037706, "balance_loss_clip": 1.02552879, "balance_loss_mlp": 1.03602016, "epoch": 0.946881106267849, "flos": 27016639113600.0, "grad_norm": 2.125142300019739, "language_loss": 0.66357309, "learning_rate": 2.9493111731988096e-08, "loss": 0.68499631, "num_input_tokens_seen": 339765810, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.68359375, "step": 15749, "time_per_iteration": 3.9518275260925293 }, { "auxiliary_loss_clip": 0.01104599, "auxiliary_loss_mlp": 0.01030212, "balance_loss_clip": 1.01650333, "balance_loss_mlp": 1.03474164, "epoch": 0.9469412295205171, "flos": 20190092451840.0, "grad_norm": 2.0741612670924012, "language_loss": 0.76261115, "learning_rate": 2.942651169791621e-08, "loss": 0.78395927, "num_input_tokens_seen": 339784125, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.69921875, "step": 15750, "time_per_iteration": 2.438767910003662 }, { "auxiliary_loss_clip": 0.01104164, "auxiliary_loss_mlp": 0.01027867, "balance_loss_clip": 1.01613736, "balance_loss_mlp": 1.03609085, "epoch": 0.947001352773185, "flos": 21324905809920.0, "grad_norm": 1.8275461903082166, "language_loss": 0.68281716, "learning_rate": 2.9359986388188372e-08, "loss": 0.7041375, "num_input_tokens_seen": 339803450, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.6796875, "step": 15751, "time_per_iteration": 2.470576524734497 }, { "auxiliary_loss_clip": 0.01104202, "auxiliary_loss_mlp": 0.01027871, "balance_loss_clip": 1.01633728, "balance_loss_mlp": 1.03573501, "epoch": 0.947061476025853, "flos": 21944041562880.0, "grad_norm": 1.6408601401089578, "language_loss": 0.65759325, "learning_rate": 2.929353580532723e-08, "loss": 0.67891395, "num_input_tokens_seen": 339823215, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.68359375, "step": 15752, "time_per_iteration": 2.4426541328430176 }, { "auxiliary_loss_clip": 0.0110304, "auxiliary_loss_mlp": 0.01030888, "balance_loss_clip": 1.01872301, "balance_loss_mlp": 1.03417277, "epoch": 0.947121599278521, "flos": 21394715892480.0, "grad_norm": 5.278329151282107, "language_loss": 0.7173022, "learning_rate": 2.9227159951852764e-08, "loss": 0.7386415, "num_input_tokens_seen": 339842230, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6875, "step": 15753, "time_per_iteration": 2.48923921585083 }, { "auxiliary_loss_clip": 0.0110527, "auxiliary_loss_mlp": 0.01031481, "balance_loss_clip": 1.01796269, "balance_loss_mlp": 1.03444195, "epoch": 0.9471817225311889, "flos": 23075730437760.0, "grad_norm": 1.8837508876014983, "language_loss": 0.69947517, "learning_rate": 2.9160858830281855e-08, "loss": 0.72084272, "num_input_tokens_seen": 339861640, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.70703125, "step": 15754, "time_per_iteration": 5.273187160491943 }, { "auxiliary_loss_clip": 0.0110488, "auxiliary_loss_mlp": 0.0103209, "balance_loss_clip": 1.02006793, "balance_loss_mlp": 1.03386188, "epoch": 0.947241845783857, "flos": 11910744305280.0, "grad_norm": 2.302061043026509, "language_loss": 0.78923875, "learning_rate": 2.9094632443129153e-08, "loss": 0.81060845, "num_input_tokens_seen": 339878210, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.7109375, "step": 15755, "time_per_iteration": 2.419296979904175 }, { "auxiliary_loss_clip": 0.01110777, "auxiliary_loss_mlp": 0.01036515, "balance_loss_clip": 1.02172744, "balance_loss_mlp": 1.03636146, "epoch": 0.9473019690365249, "flos": 20740675098240.0, "grad_norm": 2.659814484221021, "language_loss": 0.75533009, "learning_rate": 2.9028480792904876e-08, "loss": 0.77680302, "num_input_tokens_seen": 339894255, "router_z_loss_clip": 0.1484375, "router_z_loss_mlp": 0.7421875, "step": 15756, "time_per_iteration": 2.458237886428833 }, { "auxiliary_loss_clip": 0.01103771, "auxiliary_loss_mlp": 0.01027919, "balance_loss_clip": 1.01665998, "balance_loss_mlp": 1.03456163, "epoch": 0.9473620922891929, "flos": 17639896602240.0, "grad_norm": 2.3054049007934574, "language_loss": 0.75024033, "learning_rate": 2.8962403882118347e-08, "loss": 0.77155721, "num_input_tokens_seen": 339912425, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.69140625, "step": 15757, "time_per_iteration": 2.4220376014709473 }, { "auxiliary_loss_clip": 0.01107744, "auxiliary_loss_mlp": 0.01033644, "balance_loss_clip": 1.02083516, "balance_loss_mlp": 1.03649449, "epoch": 0.9474222155418608, "flos": 23550002640000.0, "grad_norm": 2.7808316730075147, "language_loss": 0.79643458, "learning_rate": 2.889640171327512e-08, "loss": 0.81784844, "num_input_tokens_seen": 339929635, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.71484375, "step": 15758, "time_per_iteration": 2.4874603748321533 }, { "auxiliary_loss_clip": 0.01102285, "auxiliary_loss_mlp": 0.01031898, "balance_loss_clip": 1.02020955, "balance_loss_mlp": 1.03533268, "epoch": 0.9474823387945288, "flos": 27089753247360.0, "grad_norm": 1.6787295535838664, "language_loss": 0.72162807, "learning_rate": 2.8830474288877638e-08, "loss": 0.74296993, "num_input_tokens_seen": 339951200, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 15759, "time_per_iteration": 2.487847328186035 }, { "auxiliary_loss_clip": 0.01100178, "auxiliary_loss_mlp": 0.01024692, "balance_loss_clip": 1.01469588, "balance_loss_mlp": 1.035864, "epoch": 0.9475424620471967, "flos": 22966526113920.0, "grad_norm": 1.605407773395564, "language_loss": 0.75394469, "learning_rate": 2.8764621611426344e-08, "loss": 0.77519345, "num_input_tokens_seen": 339971820, "router_z_loss_clip": 0.10009766, "router_z_loss_mlp": 0.640625, "step": 15760, "time_per_iteration": 2.4688477516174316 }, { "auxiliary_loss_clip": 0.01104037, "auxiliary_loss_mlp": 0.01027303, "balance_loss_clip": 1.01565623, "balance_loss_mlp": 1.03636408, "epoch": 0.9476025852998647, "flos": 20047671025920.0, "grad_norm": 1.8071054704433533, "language_loss": 0.7262972, "learning_rate": 2.8698843683418128e-08, "loss": 0.74761063, "num_input_tokens_seen": 339989420, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 15761, "time_per_iteration": 2.4528746604919434 }, { "auxiliary_loss_clip": 0.01104233, "auxiliary_loss_mlp": 0.01035361, "balance_loss_clip": 1.02377999, "balance_loss_mlp": 1.03743291, "epoch": 0.9476627085525327, "flos": 14975468524800.0, "grad_norm": 2.0515437002786645, "language_loss": 0.71787572, "learning_rate": 2.863314050734722e-08, "loss": 0.73927164, "num_input_tokens_seen": 340006690, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66796875, "step": 15762, "time_per_iteration": 2.4898903369903564 }, { "auxiliary_loss_clip": 0.01107499, "auxiliary_loss_mlp": 0.01037, "balance_loss_clip": 1.02363634, "balance_loss_mlp": 1.03458416, "epoch": 0.9477228318052007, "flos": 18697788984960.0, "grad_norm": 2.5583931479394546, "language_loss": 0.67591101, "learning_rate": 2.856751208570518e-08, "loss": 0.69735599, "num_input_tokens_seen": 340025480, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.73046875, "step": 15763, "time_per_iteration": 2.4327962398529053 }, { "auxiliary_loss_clip": 0.01103793, "auxiliary_loss_mlp": 0.01034466, "balance_loss_clip": 1.02219391, "balance_loss_mlp": 1.03397548, "epoch": 0.9477829550578686, "flos": 23875065745920.0, "grad_norm": 2.1075845885170956, "language_loss": 0.69922531, "learning_rate": 2.8501958420980466e-08, "loss": 0.72060788, "num_input_tokens_seen": 340043785, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 15764, "time_per_iteration": 2.4808542728424072 }, { "auxiliary_loss_clip": 0.01100284, "auxiliary_loss_mlp": 0.01028735, "balance_loss_clip": 1.01845312, "balance_loss_mlp": 1.03686213, "epoch": 0.9478430783105366, "flos": 22562890007040.0, "grad_norm": 1.7057854306431575, "language_loss": 0.71098769, "learning_rate": 2.8436479515659306e-08, "loss": 0.73227781, "num_input_tokens_seen": 340064360, "router_z_loss_clip": 0.10253906, "router_z_loss_mlp": 0.6328125, "step": 15765, "time_per_iteration": 2.45184063911438 }, { "auxiliary_loss_clip": 0.01028035, "auxiliary_loss_mlp": 0.01000538, "balance_loss_clip": 0.99947131, "balance_loss_mlp": 1.00574636, "epoch": 0.9479032015632046, "flos": 60857885554560.0, "grad_norm": 0.8056463782395779, "language_loss": 0.59017724, "learning_rate": 2.8371075372224384e-08, "loss": 0.61046296, "num_input_tokens_seen": 340114425, "router_z_loss_clip": 0.01068115, "router_z_loss_mlp": 0.22265625, "step": 15766, "time_per_iteration": 2.8891777992248535 }, { "auxiliary_loss_clip": 0.01103744, "auxiliary_loss_mlp": 0.01034463, "balance_loss_clip": 1.02326906, "balance_loss_mlp": 1.0351069, "epoch": 0.9479633248158725, "flos": 14683873916160.0, "grad_norm": 1.9308787077081941, "language_loss": 0.74068701, "learning_rate": 2.8305745993155938e-08, "loss": 0.76206911, "num_input_tokens_seen": 340132200, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6875, "step": 15767, "time_per_iteration": 2.420196056365967 }, { "auxiliary_loss_clip": 0.01109193, "auxiliary_loss_mlp": 0.01033287, "balance_loss_clip": 1.02043653, "balance_loss_mlp": 1.03717971, "epoch": 0.9480234480685406, "flos": 20333878594560.0, "grad_norm": 3.0083036632885523, "language_loss": 0.73090065, "learning_rate": 2.8240491380931096e-08, "loss": 0.75232548, "num_input_tokens_seen": 340149175, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 15768, "time_per_iteration": 2.464022397994995 }, { "auxiliary_loss_clip": 0.01027963, "auxiliary_loss_mlp": 0.01001489, "balance_loss_clip": 1.0004456, "balance_loss_mlp": 1.00562358, "epoch": 0.9480835713212085, "flos": 70293092428800.0, "grad_norm": 0.7330849565994623, "language_loss": 0.55279243, "learning_rate": 2.8175311538024326e-08, "loss": 0.57308698, "num_input_tokens_seen": 340208155, "router_z_loss_clip": 0.01043701, "router_z_loss_mlp": 0.22363281, "step": 15769, "time_per_iteration": 3.1186227798461914 }, { "auxiliary_loss_clip": 0.01101654, "auxiliary_loss_mlp": 0.01031128, "balance_loss_clip": 1.0191772, "balance_loss_mlp": 1.03289986, "epoch": 0.9481436945738765, "flos": 25449749055360.0, "grad_norm": 1.5314058042940106, "language_loss": 0.77593815, "learning_rate": 2.8110206466907428e-08, "loss": 0.79726595, "num_input_tokens_seen": 340229275, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 15770, "time_per_iteration": 2.516606569290161 }, { "auxiliary_loss_clip": 0.01109511, "auxiliary_loss_mlp": 0.01034886, "balance_loss_clip": 1.02173173, "balance_loss_mlp": 1.03888941, "epoch": 0.9482038178265444, "flos": 26979902478720.0, "grad_norm": 3.088972298058707, "language_loss": 0.80039263, "learning_rate": 2.8045176170049313e-08, "loss": 0.82183665, "num_input_tokens_seen": 340248920, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 15771, "time_per_iteration": 2.479489803314209 }, { "auxiliary_loss_clip": 0.01102081, "auxiliary_loss_mlp": 0.01030438, "balance_loss_clip": 1.01827872, "balance_loss_mlp": 1.03502727, "epoch": 0.9482639410792124, "flos": 17785442511360.0, "grad_norm": 1.867920357980406, "language_loss": 0.69415534, "learning_rate": 2.7980220649915566e-08, "loss": 0.71548063, "num_input_tokens_seen": 340266775, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 15772, "time_per_iteration": 2.4226953983306885 }, { "auxiliary_loss_clip": 0.01104034, "auxiliary_loss_mlp": 0.01030191, "balance_loss_clip": 1.01769233, "balance_loss_mlp": 1.03638148, "epoch": 0.9483240643318803, "flos": 20996682307200.0, "grad_norm": 1.5917820145279586, "language_loss": 0.73893213, "learning_rate": 2.7915339908969327e-08, "loss": 0.76027435, "num_input_tokens_seen": 340285295, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.67578125, "step": 15773, "time_per_iteration": 2.4470815658569336 }, { "auxiliary_loss_clip": 0.01104369, "auxiliary_loss_mlp": 0.01038113, "balance_loss_clip": 1.02523863, "balance_loss_mlp": 1.03383994, "epoch": 0.9483841875845483, "flos": 20083294339200.0, "grad_norm": 2.3061731443704026, "language_loss": 0.62946427, "learning_rate": 2.7850533949671072e-08, "loss": 0.6508891, "num_input_tokens_seen": 340304265, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 15774, "time_per_iteration": 2.4582207202911377 }, { "auxiliary_loss_clip": 0.01104021, "auxiliary_loss_mlp": 0.01030054, "balance_loss_clip": 1.01745319, "balance_loss_mlp": 1.03437638, "epoch": 0.9484443108372163, "flos": 20813645577600.0, "grad_norm": 1.8256712176338519, "language_loss": 0.59520608, "learning_rate": 2.7785802774478396e-08, "loss": 0.61654687, "num_input_tokens_seen": 340323690, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 15775, "time_per_iteration": 2.60990834236145 }, { "auxiliary_loss_clip": 0.01106037, "auxiliary_loss_mlp": 0.01027485, "balance_loss_clip": 1.01433039, "balance_loss_mlp": 1.03578615, "epoch": 0.9485044340898843, "flos": 36429184506240.0, "grad_norm": 2.192712640433443, "language_loss": 0.61781108, "learning_rate": 2.772114638584555e-08, "loss": 0.63914627, "num_input_tokens_seen": 340345830, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.703125, "step": 15776, "time_per_iteration": 2.564572334289551 }, { "auxiliary_loss_clip": 0.01103727, "auxiliary_loss_mlp": 0.01030322, "balance_loss_clip": 1.01788807, "balance_loss_mlp": 1.03438354, "epoch": 0.9485645573425522, "flos": 22602535643520.0, "grad_norm": 2.0280551449556943, "language_loss": 0.73316103, "learning_rate": 2.765656478622458e-08, "loss": 0.75450146, "num_input_tokens_seen": 340365910, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69140625, "step": 15777, "time_per_iteration": 2.4498231410980225 }, { "auxiliary_loss_clip": 0.01113841, "auxiliary_loss_mlp": 0.01037539, "balance_loss_clip": 1.02484906, "balance_loss_mlp": 1.0384289, "epoch": 0.9486246805952202, "flos": 22017766227840.0, "grad_norm": 2.934087984535304, "language_loss": 0.7246685, "learning_rate": 2.759205797806441e-08, "loss": 0.74618226, "num_input_tokens_seen": 340383935, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7578125, "step": 15778, "time_per_iteration": 2.44333553314209 }, { "auxiliary_loss_clip": 0.01098626, "auxiliary_loss_mlp": 0.01030768, "balance_loss_clip": 1.02042615, "balance_loss_mlp": 1.03504264, "epoch": 0.9486848038478882, "flos": 16508674604160.0, "grad_norm": 1.8937471280953415, "language_loss": 0.70038152, "learning_rate": 2.7527625963810865e-08, "loss": 0.72167552, "num_input_tokens_seen": 340402760, "router_z_loss_clip": 0.10351562, "router_z_loss_mlp": 0.63671875, "step": 15779, "time_per_iteration": 2.432581901550293 }, { "auxiliary_loss_clip": 0.01104777, "auxiliary_loss_mlp": 0.01030024, "balance_loss_clip": 1.0175364, "balance_loss_mlp": 1.03569949, "epoch": 0.9487449271005561, "flos": 19244385221760.0, "grad_norm": 1.9266496307429402, "language_loss": 0.78860503, "learning_rate": 2.7463268745907542e-08, "loss": 0.80995303, "num_input_tokens_seen": 340422105, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 15780, "time_per_iteration": 2.440859794616699 }, { "auxiliary_loss_clip": 0.0110497, "auxiliary_loss_mlp": 0.01031424, "balance_loss_clip": 1.01943719, "balance_loss_mlp": 1.03634238, "epoch": 0.9488050503532242, "flos": 21762692772480.0, "grad_norm": 1.8381813159697389, "language_loss": 0.66191375, "learning_rate": 2.7398986326794494e-08, "loss": 0.68327767, "num_input_tokens_seen": 340441160, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 15781, "time_per_iteration": 2.470723867416382 }, { "auxiliary_loss_clip": 0.01101868, "auxiliary_loss_mlp": 0.0102888, "balance_loss_clip": 1.01654816, "balance_loss_mlp": 1.0349226, "epoch": 0.9488651736058921, "flos": 18368919037440.0, "grad_norm": 2.033161238430905, "language_loss": 0.8013975, "learning_rate": 2.733477870890999e-08, "loss": 0.82270497, "num_input_tokens_seen": 340458200, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.671875, "step": 15782, "time_per_iteration": 2.458845615386963 }, { "auxiliary_loss_clip": 0.01027835, "auxiliary_loss_mlp": 0.01000917, "balance_loss_clip": 0.99992126, "balance_loss_mlp": 1.00546122, "epoch": 0.9489252968585601, "flos": 70084057230720.0, "grad_norm": 0.728103049093095, "language_loss": 0.59818459, "learning_rate": 2.7270645894688082e-08, "loss": 0.6184721, "num_input_tokens_seen": 340526420, "router_z_loss_clip": 0.00994873, "router_z_loss_mlp": 0.22460938, "step": 15783, "time_per_iteration": 3.1970651149749756 }, { "auxiliary_loss_clip": 0.01105214, "auxiliary_loss_mlp": 0.01035979, "balance_loss_clip": 1.02287745, "balance_loss_mlp": 1.03516269, "epoch": 0.948985420111228, "flos": 27855440490240.0, "grad_norm": 10.06192922164422, "language_loss": 0.7394321, "learning_rate": 2.720658788656105e-08, "loss": 0.76084399, "num_input_tokens_seen": 340546325, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.69921875, "step": 15784, "time_per_iteration": 2.5257515907287598 }, { "auxiliary_loss_clip": 0.01104012, "auxiliary_loss_mlp": 0.01031185, "balance_loss_clip": 1.01809621, "balance_loss_mlp": 1.03449178, "epoch": 0.949045543363896, "flos": 24316049018880.0, "grad_norm": 2.1789204581025556, "language_loss": 0.70001304, "learning_rate": 2.714260468695806e-08, "loss": 0.72136497, "num_input_tokens_seen": 340565145, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 15785, "time_per_iteration": 2.4753658771514893 }, { "auxiliary_loss_clip": 0.01104152, "auxiliary_loss_mlp": 0.01027015, "balance_loss_clip": 1.01517141, "balance_loss_mlp": 1.03403497, "epoch": 0.9491056666165639, "flos": 24241677909120.0, "grad_norm": 1.482591105436613, "language_loss": 0.76008141, "learning_rate": 2.707869629830495e-08, "loss": 0.78139311, "num_input_tokens_seen": 340585465, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.703125, "step": 15786, "time_per_iteration": 2.5097568035125732 }, { "auxiliary_loss_clip": 0.01104836, "auxiliary_loss_mlp": 0.01029508, "balance_loss_clip": 1.0178138, "balance_loss_mlp": 1.03623307, "epoch": 0.949165789869232, "flos": 24531261356160.0, "grad_norm": 1.7179494126767447, "language_loss": 0.79145634, "learning_rate": 2.7014862723025335e-08, "loss": 0.81279975, "num_input_tokens_seen": 340606010, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 15787, "time_per_iteration": 4.041906833648682 }, { "auxiliary_loss_clip": 0.0110346, "auxiliary_loss_mlp": 0.01027455, "balance_loss_clip": 1.01595688, "balance_loss_mlp": 1.03785706, "epoch": 0.9492259131218999, "flos": 22235348862720.0, "grad_norm": 1.7742710300898583, "language_loss": 0.76394725, "learning_rate": 2.6951103963540388e-08, "loss": 0.78525639, "num_input_tokens_seen": 340626135, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.65625, "step": 15788, "time_per_iteration": 2.4858853816986084 }, { "auxiliary_loss_clip": 0.01105415, "auxiliary_loss_mlp": 0.01033585, "balance_loss_clip": 1.02016258, "balance_loss_mlp": 1.03497136, "epoch": 0.9492860363745679, "flos": 22966310632320.0, "grad_norm": 1.7558807460828783, "language_loss": 0.71779251, "learning_rate": 2.6887420022266848e-08, "loss": 0.73918247, "num_input_tokens_seen": 340644870, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.703125, "step": 15789, "time_per_iteration": 2.4769139289855957 }, { "auxiliary_loss_clip": 0.01101693, "auxiliary_loss_mlp": 0.01031913, "balance_loss_clip": 1.01839471, "balance_loss_mlp": 1.03510547, "epoch": 0.9493461596272358, "flos": 18370283754240.0, "grad_norm": 8.25840493942069, "language_loss": 0.73250806, "learning_rate": 2.682381090161989e-08, "loss": 0.75384414, "num_input_tokens_seen": 340663695, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.6640625, "step": 15790, "time_per_iteration": 2.4334545135498047 }, { "auxiliary_loss_clip": 0.01104516, "auxiliary_loss_mlp": 0.01031656, "balance_loss_clip": 1.01851916, "balance_loss_mlp": 1.03388262, "epoch": 0.9494062828799038, "flos": 20011724490240.0, "grad_norm": 1.8983441086678712, "language_loss": 0.77662987, "learning_rate": 2.6760276604012033e-08, "loss": 0.79799163, "num_input_tokens_seen": 340682970, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 15791, "time_per_iteration": 3.8558926582336426 }, { "auxiliary_loss_clip": 0.01107053, "auxiliary_loss_mlp": 0.01030666, "balance_loss_clip": 1.01801169, "balance_loss_mlp": 1.03511548, "epoch": 0.9494664061325718, "flos": 27228583313280.0, "grad_norm": 1.8299944199782634, "language_loss": 0.73844492, "learning_rate": 2.6696817131852234e-08, "loss": 0.75982213, "num_input_tokens_seen": 340702275, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 15792, "time_per_iteration": 2.5353751182556152 }, { "auxiliary_loss_clip": 0.0110296, "auxiliary_loss_mlp": 0.01032235, "balance_loss_clip": 1.02031434, "balance_loss_mlp": 1.03482556, "epoch": 0.9495265293852397, "flos": 18369816877440.0, "grad_norm": 1.857066183739074, "language_loss": 0.78156745, "learning_rate": 2.663343248754679e-08, "loss": 0.80291945, "num_input_tokens_seen": 340719060, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.68359375, "step": 15793, "time_per_iteration": 2.4363319873809814 }, { "auxiliary_loss_clip": 0.01103477, "auxiliary_loss_mlp": 0.01031333, "balance_loss_clip": 1.01955485, "balance_loss_mlp": 1.0351758, "epoch": 0.9495866526379078, "flos": 23075766351360.0, "grad_norm": 2.7860629722537906, "language_loss": 0.77638018, "learning_rate": 2.6570122673499562e-08, "loss": 0.79772824, "num_input_tokens_seen": 340737815, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 15794, "time_per_iteration": 2.469527244567871 }, { "auxiliary_loss_clip": 0.01106595, "auxiliary_loss_mlp": 0.01029741, "balance_loss_clip": 1.01624632, "balance_loss_mlp": 1.03497076, "epoch": 0.9496467758905757, "flos": 17529902179200.0, "grad_norm": 2.1352287365460083, "language_loss": 0.61290079, "learning_rate": 2.650688769211107e-08, "loss": 0.63426417, "num_input_tokens_seen": 340756035, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 15795, "time_per_iteration": 3.7989561557769775 }, { "auxiliary_loss_clip": 0.01102499, "auxiliary_loss_mlp": 0.01032898, "balance_loss_clip": 1.02069724, "balance_loss_mlp": 1.03634262, "epoch": 0.9497068991432437, "flos": 24133910129280.0, "grad_norm": 1.630811066981726, "language_loss": 0.79292011, "learning_rate": 2.644372754577895e-08, "loss": 0.81427407, "num_input_tokens_seen": 340775620, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6640625, "step": 15796, "time_per_iteration": 3.9524025917053223 }, { "auxiliary_loss_clip": 0.01104988, "auxiliary_loss_mlp": 0.01027718, "balance_loss_clip": 1.01516557, "balance_loss_mlp": 1.035604, "epoch": 0.9497670223959116, "flos": 20303319098880.0, "grad_norm": 2.804401423847825, "language_loss": 0.75353736, "learning_rate": 2.6380642236898398e-08, "loss": 0.77486438, "num_input_tokens_seen": 340794510, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 15797, "time_per_iteration": 2.508485794067383 }, { "auxiliary_loss_clip": 0.01105773, "auxiliary_loss_mlp": 0.01031625, "balance_loss_clip": 1.01966214, "balance_loss_mlp": 1.03647029, "epoch": 0.9498271456485796, "flos": 13698916099200.0, "grad_norm": 2.205660011721255, "language_loss": 0.65998757, "learning_rate": 2.6317631767861727e-08, "loss": 0.68136156, "num_input_tokens_seen": 340812955, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 15798, "time_per_iteration": 2.4423673152923584 }, { "auxiliary_loss_clip": 0.01106663, "auxiliary_loss_mlp": 0.01030199, "balance_loss_clip": 1.0182308, "balance_loss_mlp": 1.03638315, "epoch": 0.9498872689012475, "flos": 20814004713600.0, "grad_norm": 1.891821319304115, "language_loss": 0.77180982, "learning_rate": 2.6254696141058575e-08, "loss": 0.7931785, "num_input_tokens_seen": 340829200, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 15799, "time_per_iteration": 2.4808948040008545 }, { "auxiliary_loss_clip": 0.01102174, "auxiliary_loss_mlp": 0.01027897, "balance_loss_clip": 1.0161016, "balance_loss_mlp": 1.03517175, "epoch": 0.9499473921539155, "flos": 21032700670080.0, "grad_norm": 2.433281290748209, "language_loss": 0.70597267, "learning_rate": 2.6191835358874814e-08, "loss": 0.72727334, "num_input_tokens_seen": 340848035, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.671875, "step": 15800, "time_per_iteration": 2.4660916328430176 }, { "auxiliary_loss_clip": 0.01100505, "auxiliary_loss_mlp": 0.01027763, "balance_loss_clip": 1.01536512, "balance_loss_mlp": 1.03317618, "epoch": 0.9500075154065835, "flos": 20998693468800.0, "grad_norm": 2.2616277979523702, "language_loss": 0.72127759, "learning_rate": 2.6129049423694315e-08, "loss": 0.74256021, "num_input_tokens_seen": 340870025, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.671875, "step": 15801, "time_per_iteration": 2.520033597946167 }, { "auxiliary_loss_clip": 0.01103617, "auxiliary_loss_mlp": 0.01030655, "balance_loss_clip": 1.01863277, "balance_loss_mlp": 1.03510785, "epoch": 0.9500676386592515, "flos": 25121956515840.0, "grad_norm": 1.7100657818237321, "language_loss": 0.8083753, "learning_rate": 2.6066338337898508e-08, "loss": 0.82971805, "num_input_tokens_seen": 340892290, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 15802, "time_per_iteration": 2.5003416538238525 }, { "auxiliary_loss_clip": 0.01105956, "auxiliary_loss_mlp": 0.01030908, "balance_loss_clip": 1.01869535, "balance_loss_mlp": 1.03612435, "epoch": 0.9501277619119194, "flos": 27523625627520.0, "grad_norm": 1.7751173894839352, "language_loss": 0.67699134, "learning_rate": 2.60037021038646e-08, "loss": 0.69835997, "num_input_tokens_seen": 340912260, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 15803, "time_per_iteration": 2.5234460830688477 }, { "auxiliary_loss_clip": 0.01102388, "auxiliary_loss_mlp": 0.01030645, "balance_loss_clip": 1.01875985, "balance_loss_mlp": 1.03465772, "epoch": 0.9501878851645874, "flos": 20813968800000.0, "grad_norm": 2.4140798645567463, "language_loss": 0.7629528, "learning_rate": 2.5941140723968247e-08, "loss": 0.78428316, "num_input_tokens_seen": 340928930, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 15804, "time_per_iteration": 2.4319777488708496 }, { "auxiliary_loss_clip": 0.01106926, "auxiliary_loss_mlp": 0.01035002, "balance_loss_clip": 1.022843, "balance_loss_mlp": 1.03738904, "epoch": 0.9502480084172553, "flos": 18369385914240.0, "grad_norm": 2.070378521693149, "language_loss": 0.73538792, "learning_rate": 2.5878654200581775e-08, "loss": 0.75680721, "num_input_tokens_seen": 340946615, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6953125, "step": 15805, "time_per_iteration": 2.445870876312256 }, { "auxiliary_loss_clip": 0.01105416, "auxiliary_loss_mlp": 0.01035451, "balance_loss_clip": 1.02298152, "balance_loss_mlp": 1.03607368, "epoch": 0.9503081316699233, "flos": 23549607590400.0, "grad_norm": 1.42938948689767, "language_loss": 0.79950291, "learning_rate": 2.5816242536074618e-08, "loss": 0.82091159, "num_input_tokens_seen": 340967545, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6953125, "step": 15806, "time_per_iteration": 2.4758310317993164 }, { "auxiliary_loss_clip": 0.01108018, "auxiliary_loss_mlp": 0.01026789, "balance_loss_clip": 1.01485634, "balance_loss_mlp": 1.03747869, "epoch": 0.9503682549225914, "flos": 18040444139520.0, "grad_norm": 2.1325016800229912, "language_loss": 0.82489777, "learning_rate": 2.5753905732813108e-08, "loss": 0.84624588, "num_input_tokens_seen": 340984955, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.703125, "step": 15807, "time_per_iteration": 2.4806034564971924 }, { "auxiliary_loss_clip": 0.01100277, "auxiliary_loss_mlp": 0.0102886, "balance_loss_clip": 1.0171175, "balance_loss_mlp": 1.03246808, "epoch": 0.9504283781752593, "flos": 25886135387520.0, "grad_norm": 1.718424354794071, "language_loss": 0.7175945, "learning_rate": 2.5691643793161355e-08, "loss": 0.73888588, "num_input_tokens_seen": 341007300, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 15808, "time_per_iteration": 2.490354299545288 }, { "auxiliary_loss_clip": 0.01103175, "auxiliary_loss_mlp": 0.01028296, "balance_loss_clip": 1.01646411, "balance_loss_mlp": 1.03540528, "epoch": 0.9504885014279273, "flos": 22124025636480.0, "grad_norm": 1.6748819013742957, "language_loss": 0.69865501, "learning_rate": 2.562945671948058e-08, "loss": 0.71996975, "num_input_tokens_seen": 341026695, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 15809, "time_per_iteration": 2.458683967590332 }, { "auxiliary_loss_clip": 0.0110304, "auxiliary_loss_mlp": 0.01025272, "balance_loss_clip": 1.01347029, "balance_loss_mlp": 1.03423548, "epoch": 0.9505486246805952, "flos": 21615961714560.0, "grad_norm": 1.5715293481856076, "language_loss": 0.75415778, "learning_rate": 2.5567344514128452e-08, "loss": 0.77544081, "num_input_tokens_seen": 341047080, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 15810, "time_per_iteration": 2.451925754547119 }, { "auxiliary_loss_clip": 0.01102297, "auxiliary_loss_mlp": 0.01039324, "balance_loss_clip": 1.02680707, "balance_loss_mlp": 1.03380048, "epoch": 0.9506087479332632, "flos": 22528236360960.0, "grad_norm": 1.5282404041079678, "language_loss": 0.80072594, "learning_rate": 2.5505307179460643e-08, "loss": 0.82214212, "num_input_tokens_seen": 341067310, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 15811, "time_per_iteration": 2.475062131881714 }, { "auxiliary_loss_clip": 0.011019, "auxiliary_loss_mlp": 0.0103376, "balance_loss_clip": 1.02132094, "balance_loss_mlp": 1.03368235, "epoch": 0.9506688711859311, "flos": 27527360641920.0, "grad_norm": 2.387449836385203, "language_loss": 0.70312726, "learning_rate": 2.5443344717829495e-08, "loss": 0.72448385, "num_input_tokens_seen": 341085110, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.68359375, "step": 15812, "time_per_iteration": 2.49442720413208 }, { "auxiliary_loss_clip": 0.01104839, "auxiliary_loss_mlp": 0.01029911, "balance_loss_clip": 1.01797247, "balance_loss_mlp": 1.03543258, "epoch": 0.9507289944385992, "flos": 19865783531520.0, "grad_norm": 1.9420184851120414, "language_loss": 0.65635872, "learning_rate": 2.538145713158446e-08, "loss": 0.67770618, "num_input_tokens_seen": 341103190, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 15813, "time_per_iteration": 2.454704761505127 }, { "auxiliary_loss_clip": 0.01106095, "auxiliary_loss_mlp": 0.01032513, "balance_loss_clip": 1.01989436, "balance_loss_mlp": 1.0357554, "epoch": 0.9507891176912671, "flos": 25193274969600.0, "grad_norm": 1.6484865989972681, "language_loss": 0.70666331, "learning_rate": 2.5319644423072327e-08, "loss": 0.72804946, "num_input_tokens_seen": 341125695, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 15814, "time_per_iteration": 2.509549379348755 }, { "auxiliary_loss_clip": 0.01101259, "auxiliary_loss_mlp": 0.01027432, "balance_loss_clip": 1.01600611, "balance_loss_mlp": 1.03502989, "epoch": 0.9508492409439351, "flos": 24899561458560.0, "grad_norm": 2.0578443229577275, "language_loss": 0.62837183, "learning_rate": 2.5257906594637445e-08, "loss": 0.6496588, "num_input_tokens_seen": 341143930, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6640625, "step": 15815, "time_per_iteration": 2.4863007068634033 }, { "auxiliary_loss_clip": 0.01104178, "auxiliary_loss_mlp": 0.01026117, "balance_loss_clip": 1.01461291, "balance_loss_mlp": 1.03485298, "epoch": 0.950909364196603, "flos": 29784094375680.0, "grad_norm": 2.0686309070231044, "language_loss": 0.58766925, "learning_rate": 2.519624364862061e-08, "loss": 0.60897225, "num_input_tokens_seen": 341164280, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6953125, "step": 15816, "time_per_iteration": 2.4949727058410645 }, { "auxiliary_loss_clip": 0.01103006, "auxiliary_loss_mlp": 0.01034885, "balance_loss_clip": 1.02301741, "balance_loss_mlp": 1.03442633, "epoch": 0.950969487449271, "flos": 24717781704960.0, "grad_norm": 1.5228482584621936, "language_loss": 0.73616558, "learning_rate": 2.513465558735994e-08, "loss": 0.7575444, "num_input_tokens_seen": 341183670, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6875, "step": 15817, "time_per_iteration": 2.4757564067840576 }, { "auxiliary_loss_clip": 0.01105901, "auxiliary_loss_mlp": 0.01033981, "balance_loss_clip": 1.0196048, "balance_loss_mlp": 1.03522146, "epoch": 0.9510296107019389, "flos": 13699167494400.0, "grad_norm": 1.758464812567258, "language_loss": 0.60179353, "learning_rate": 2.5073142413190918e-08, "loss": 0.62319231, "num_input_tokens_seen": 341201900, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.70703125, "step": 15818, "time_per_iteration": 2.4233059883117676 }, { "auxiliary_loss_clip": 0.01104864, "auxiliary_loss_mlp": 0.01033322, "balance_loss_clip": 1.02041769, "balance_loss_mlp": 1.03634107, "epoch": 0.9510897339546069, "flos": 17311852667520.0, "grad_norm": 1.879856382664402, "language_loss": 0.69711155, "learning_rate": 2.5011704128446552e-08, "loss": 0.7184934, "num_input_tokens_seen": 341218340, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.68359375, "step": 15819, "time_per_iteration": 2.4328675270080566 }, { "auxiliary_loss_clip": 0.01108117, "auxiliary_loss_mlp": 0.01028417, "balance_loss_clip": 1.01639485, "balance_loss_mlp": 1.0374496, "epoch": 0.951149857207275, "flos": 14793940166400.0, "grad_norm": 2.162398439989887, "language_loss": 0.74004686, "learning_rate": 2.49503407354561e-08, "loss": 0.76141226, "num_input_tokens_seen": 341235885, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.70703125, "step": 15820, "time_per_iteration": 2.4699158668518066 }, { "auxiliary_loss_clip": 0.01107331, "auxiliary_loss_mlp": 0.01036753, "balance_loss_clip": 1.02417612, "balance_loss_mlp": 1.03650331, "epoch": 0.9512099804599429, "flos": 19391152193280.0, "grad_norm": 1.756112452753508, "language_loss": 0.78207928, "learning_rate": 2.4889052236546804e-08, "loss": 0.80352008, "num_input_tokens_seen": 341255280, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 15821, "time_per_iteration": 2.4537758827209473 }, { "auxiliary_loss_clip": 0.01103693, "auxiliary_loss_mlp": 0.01028152, "balance_loss_clip": 1.01555204, "balance_loss_mlp": 1.03479743, "epoch": 0.9512701037126109, "flos": 36757874885760.0, "grad_norm": 1.7638553429303567, "language_loss": 0.7116524, "learning_rate": 2.4827838634042586e-08, "loss": 0.73297077, "num_input_tokens_seen": 341279055, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6875, "step": 15822, "time_per_iteration": 2.5836405754089355 }, { "auxiliary_loss_clip": 0.01103638, "auxiliary_loss_mlp": 0.01034016, "balance_loss_clip": 1.02229738, "balance_loss_mlp": 1.03560328, "epoch": 0.9513302269652788, "flos": 22638266697600.0, "grad_norm": 1.7004419099874948, "language_loss": 0.66176605, "learning_rate": 2.47666999302647e-08, "loss": 0.6831426, "num_input_tokens_seen": 341298560, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 15823, "time_per_iteration": 2.452087879180908 }, { "auxiliary_loss_clip": 0.01102868, "auxiliary_loss_mlp": 0.01032732, "balance_loss_clip": 1.02137113, "balance_loss_mlp": 1.03633893, "epoch": 0.9513903502179468, "flos": 22893232412160.0, "grad_norm": 2.525329062621084, "language_loss": 0.77500975, "learning_rate": 2.4705636127531292e-08, "loss": 0.79636568, "num_input_tokens_seen": 341316650, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.6640625, "step": 15824, "time_per_iteration": 2.499990701675415 }, { "auxiliary_loss_clip": 0.01107793, "auxiliary_loss_mlp": 0.01027427, "balance_loss_clip": 1.01420057, "balance_loss_mlp": 1.03516853, "epoch": 0.9514504734706147, "flos": 27928626451200.0, "grad_norm": 2.228236007929143, "language_loss": 0.73761296, "learning_rate": 2.4644647228158065e-08, "loss": 0.75896513, "num_input_tokens_seen": 341336185, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 15825, "time_per_iteration": 2.4941272735595703 }, { "auxiliary_loss_clip": 0.01027965, "auxiliary_loss_mlp": 0.01000515, "balance_loss_clip": 0.99947751, "balance_loss_mlp": 1.00556517, "epoch": 0.9515105967232828, "flos": 67366767312000.0, "grad_norm": 0.931533112852407, "language_loss": 0.53394938, "learning_rate": 2.458373323445806e-08, "loss": 0.55423415, "num_input_tokens_seen": 341395795, "router_z_loss_clip": 0.01037598, "router_z_loss_mlp": 0.22460938, "step": 15826, "time_per_iteration": 3.041292667388916 }, { "auxiliary_loss_clip": 0.01104586, "auxiliary_loss_mlp": 0.01036023, "balance_loss_clip": 1.02399468, "balance_loss_mlp": 1.03600764, "epoch": 0.9515707199759507, "flos": 25846525664640.0, "grad_norm": 2.012389291993184, "language_loss": 0.72869444, "learning_rate": 2.452289414874076e-08, "loss": 0.75010061, "num_input_tokens_seen": 341415675, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 15827, "time_per_iteration": 2.4907517433166504 }, { "auxiliary_loss_clip": 0.01103456, "auxiliary_loss_mlp": 0.01031397, "balance_loss_clip": 1.01908898, "balance_loss_mlp": 1.03499043, "epoch": 0.9516308432286187, "flos": 21828983322240.0, "grad_norm": 1.9940016352474568, "language_loss": 0.74604654, "learning_rate": 2.4462129973313207e-08, "loss": 0.76739502, "num_input_tokens_seen": 341432990, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 15828, "time_per_iteration": 3.832029342651367 }, { "auxiliary_loss_clip": 0.01102162, "auxiliary_loss_mlp": 0.01032607, "balance_loss_clip": 1.02135324, "balance_loss_mlp": 1.03577328, "epoch": 0.9516909664812866, "flos": 27269593666560.0, "grad_norm": 1.6689157144426845, "language_loss": 0.73203999, "learning_rate": 2.440144071047978e-08, "loss": 0.75338769, "num_input_tokens_seen": 341454100, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6640625, "step": 15829, "time_per_iteration": 2.4977996349334717 }, { "auxiliary_loss_clip": 0.01103195, "auxiliary_loss_mlp": 0.01027646, "balance_loss_clip": 1.01617825, "balance_loss_mlp": 1.03463078, "epoch": 0.9517510897339546, "flos": 21215342350080.0, "grad_norm": 2.0691706124367046, "language_loss": 0.61551774, "learning_rate": 2.4340826362541533e-08, "loss": 0.63682616, "num_input_tokens_seen": 341472955, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.68359375, "step": 15830, "time_per_iteration": 2.4437296390533447 }, { "auxiliary_loss_clip": 0.01106388, "auxiliary_loss_mlp": 0.01033221, "balance_loss_clip": 1.01939297, "balance_loss_mlp": 1.03585613, "epoch": 0.9518112129866225, "flos": 18733986915840.0, "grad_norm": 2.565032348478942, "language_loss": 0.73032212, "learning_rate": 2.428028693179729e-08, "loss": 0.75171822, "num_input_tokens_seen": 341490165, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.703125, "step": 15831, "time_per_iteration": 2.424811601638794 }, { "auxiliary_loss_clip": 0.01098798, "auxiliary_loss_mlp": 0.01024217, "balance_loss_clip": 1.01392961, "balance_loss_mlp": 1.03232563, "epoch": 0.9518713362392905, "flos": 16763676232320.0, "grad_norm": 1.9387254503614912, "language_loss": 0.65451366, "learning_rate": 2.4219822420542545e-08, "loss": 0.67574382, "num_input_tokens_seen": 341508055, "router_z_loss_clip": 0.10253906, "router_z_loss_mlp": 0.6640625, "step": 15832, "time_per_iteration": 3.8518080711364746 }, { "auxiliary_loss_clip": 0.01105081, "auxiliary_loss_mlp": 0.01028501, "balance_loss_clip": 1.01705062, "balance_loss_mlp": 1.03957486, "epoch": 0.9519314594919586, "flos": 15230649720960.0, "grad_norm": 1.8291922939471177, "language_loss": 0.77949762, "learning_rate": 2.4159432831070135e-08, "loss": 0.80083346, "num_input_tokens_seen": 341526155, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.65625, "step": 15833, "time_per_iteration": 2.4322683811187744 }, { "auxiliary_loss_clip": 0.0110143, "auxiliary_loss_mlp": 0.01030821, "balance_loss_clip": 1.01928711, "balance_loss_mlp": 1.03542256, "epoch": 0.9519915827446265, "flos": 19352943100800.0, "grad_norm": 2.1247509347228233, "language_loss": 0.74889803, "learning_rate": 2.4099118165670007e-08, "loss": 0.77022052, "num_input_tokens_seen": 341540450, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66015625, "step": 15834, "time_per_iteration": 2.4527814388275146 }, { "auxiliary_loss_clip": 0.01109804, "auxiliary_loss_mlp": 0.01035616, "balance_loss_clip": 1.02238965, "balance_loss_mlp": 1.03679979, "epoch": 0.9520517059972945, "flos": 22266303408000.0, "grad_norm": 2.816831414428999, "language_loss": 0.76450789, "learning_rate": 2.4038878426629216e-08, "loss": 0.7859621, "num_input_tokens_seen": 341557865, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.73046875, "step": 15835, "time_per_iteration": 2.4326579570770264 }, { "auxiliary_loss_clip": 0.01104396, "auxiliary_loss_mlp": 0.01032097, "balance_loss_clip": 1.01937711, "balance_loss_mlp": 1.03441179, "epoch": 0.9521118292499624, "flos": 14862313704960.0, "grad_norm": 1.97106328257015, "language_loss": 0.66010153, "learning_rate": 2.397871361623238e-08, "loss": 0.68146646, "num_input_tokens_seen": 341573890, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 15836, "time_per_iteration": 3.8499040603637695 }, { "auxiliary_loss_clip": 0.01102264, "auxiliary_loss_mlp": 0.01026975, "balance_loss_clip": 1.01445818, "balance_loss_mlp": 1.03478241, "epoch": 0.9521719525026304, "flos": 23508812718720.0, "grad_norm": 1.7562413159351704, "language_loss": 0.70511663, "learning_rate": 2.391862373676057e-08, "loss": 0.72640896, "num_input_tokens_seen": 341593770, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.67578125, "step": 15837, "time_per_iteration": 3.9670169353485107 }, { "auxiliary_loss_clip": 0.01105809, "auxiliary_loss_mlp": 0.01035132, "balance_loss_clip": 1.02108312, "balance_loss_mlp": 1.03505278, "epoch": 0.9522320757552983, "flos": 19714922409600.0, "grad_norm": 2.0037242327662876, "language_loss": 0.73459941, "learning_rate": 2.3858608790492617e-08, "loss": 0.7560088, "num_input_tokens_seen": 341612065, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.70703125, "step": 15838, "time_per_iteration": 2.4524593353271484 }, { "auxiliary_loss_clip": 0.01104304, "auxiliary_loss_mlp": 0.01030367, "balance_loss_clip": 1.01770139, "balance_loss_mlp": 1.03463793, "epoch": 0.9522921990079664, "flos": 25921291824000.0, "grad_norm": 2.3354190269114423, "language_loss": 0.78446126, "learning_rate": 2.379866877970449e-08, "loss": 0.80580795, "num_input_tokens_seen": 341631365, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 15839, "time_per_iteration": 2.5173983573913574 }, { "auxiliary_loss_clip": 0.01105953, "auxiliary_loss_mlp": 0.01033336, "balance_loss_clip": 1.0215224, "balance_loss_mlp": 1.03588867, "epoch": 0.9523523222606343, "flos": 19208115463680.0, "grad_norm": 2.367303614086476, "language_loss": 0.80383599, "learning_rate": 2.3738803706668585e-08, "loss": 0.82522881, "num_input_tokens_seen": 341650300, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.703125, "step": 15840, "time_per_iteration": 2.4372477531433105 }, { "auxiliary_loss_clip": 0.01098285, "auxiliary_loss_mlp": 0.01029391, "balance_loss_clip": 1.01895392, "balance_loss_mlp": 1.03324413, "epoch": 0.9524124455133023, "flos": 20921269703040.0, "grad_norm": 3.7566125317406374, "language_loss": 0.73181099, "learning_rate": 2.3679013573655314e-08, "loss": 0.75308776, "num_input_tokens_seen": 341667680, "router_z_loss_clip": 0.10449219, "router_z_loss_mlp": 0.6484375, "step": 15841, "time_per_iteration": 2.445080280303955 }, { "auxiliary_loss_clip": 0.01097863, "auxiliary_loss_mlp": 0.01025674, "balance_loss_clip": 1.01451635, "balance_loss_mlp": 1.03390551, "epoch": 0.9524725687659702, "flos": 18843550375680.0, "grad_norm": 1.93764025256061, "language_loss": 0.79028535, "learning_rate": 2.3619298382931972e-08, "loss": 0.81152076, "num_input_tokens_seen": 341685760, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.640625, "step": 15842, "time_per_iteration": 2.460458993911743 }, { "auxiliary_loss_clip": 0.01104447, "auxiliary_loss_mlp": 0.01031633, "balance_loss_clip": 1.01971221, "balance_loss_mlp": 1.03682494, "epoch": 0.9525326920186382, "flos": 22674680110080.0, "grad_norm": 2.068289475390718, "language_loss": 0.72646606, "learning_rate": 2.3559658136762973e-08, "loss": 0.74782681, "num_input_tokens_seen": 341705300, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 15843, "time_per_iteration": 2.4729719161987305 }, { "auxiliary_loss_clip": 0.01105843, "auxiliary_loss_mlp": 0.01027445, "balance_loss_clip": 1.01387298, "balance_loss_mlp": 1.03592467, "epoch": 0.9525928152713061, "flos": 22086642556800.0, "grad_norm": 1.7513164370812684, "language_loss": 0.78481758, "learning_rate": 2.3500092837409612e-08, "loss": 0.80615044, "num_input_tokens_seen": 341724565, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.69921875, "step": 15844, "time_per_iteration": 2.5001258850097656 }, { "auxiliary_loss_clip": 0.01108298, "auxiliary_loss_mlp": 0.0103219, "balance_loss_clip": 1.01779556, "balance_loss_mlp": 1.03602529, "epoch": 0.9526529385239741, "flos": 20704728562560.0, "grad_norm": 2.065028697206672, "language_loss": 0.70178759, "learning_rate": 2.3440602487130977e-08, "loss": 0.72319251, "num_input_tokens_seen": 341743605, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.72265625, "step": 15845, "time_per_iteration": 2.500821590423584 }, { "auxiliary_loss_clip": 0.01105378, "auxiliary_loss_mlp": 0.01034273, "balance_loss_clip": 1.02225137, "balance_loss_mlp": 1.03486991, "epoch": 0.9527130617766422, "flos": 23368043318400.0, "grad_norm": 1.5111644382516274, "language_loss": 0.75571579, "learning_rate": 2.338118708818282e-08, "loss": 0.77711231, "num_input_tokens_seen": 341763475, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.703125, "step": 15846, "time_per_iteration": 2.462345838546753 }, { "auxiliary_loss_clip": 0.01103661, "auxiliary_loss_mlp": 0.01024957, "balance_loss_clip": 1.01264226, "balance_loss_mlp": 1.03436708, "epoch": 0.9527731850293101, "flos": 18985935888000.0, "grad_norm": 1.8441972581026207, "language_loss": 0.78430915, "learning_rate": 2.3321846642817998e-08, "loss": 0.80559534, "num_input_tokens_seen": 341781265, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69140625, "step": 15847, "time_per_iteration": 2.4533612728118896 }, { "auxiliary_loss_clip": 0.01101799, "auxiliary_loss_mlp": 0.01036334, "balance_loss_clip": 1.0248425, "balance_loss_mlp": 1.0340333, "epoch": 0.9528333082819781, "flos": 19318038059520.0, "grad_norm": 1.7177500831770833, "language_loss": 0.77931309, "learning_rate": 2.326258115328672e-08, "loss": 0.80069435, "num_input_tokens_seen": 341798825, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.67578125, "step": 15848, "time_per_iteration": 2.4623398780822754 }, { "auxiliary_loss_clip": 0.0110905, "auxiliary_loss_mlp": 0.01041728, "balance_loss_clip": 1.02847147, "balance_loss_mlp": 1.03704917, "epoch": 0.952893431534646, "flos": 23951340276480.0, "grad_norm": 1.6047845133783172, "language_loss": 0.7201075, "learning_rate": 2.320339062183674e-08, "loss": 0.7416153, "num_input_tokens_seen": 341819480, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 15849, "time_per_iteration": 2.4771857261657715 }, { "auxiliary_loss_clip": 0.01110807, "auxiliary_loss_mlp": 0.01036583, "balance_loss_clip": 1.02341664, "balance_loss_mlp": 1.03826344, "epoch": 0.952953554787314, "flos": 21030545854080.0, "grad_norm": 1.6740747785636036, "language_loss": 0.75282031, "learning_rate": 2.314427505071226e-08, "loss": 0.77429426, "num_input_tokens_seen": 341838035, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7265625, "step": 15850, "time_per_iteration": 2.4587299823760986 }, { "auxiliary_loss_clip": 0.01103266, "auxiliary_loss_mlp": 0.01033798, "balance_loss_clip": 1.02226496, "balance_loss_mlp": 1.03455925, "epoch": 0.9530136780399819, "flos": 22382870019840.0, "grad_norm": 2.0701088158779672, "language_loss": 0.72549409, "learning_rate": 2.308523444215482e-08, "loss": 0.7468648, "num_input_tokens_seen": 341855895, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 15851, "time_per_iteration": 2.455136775970459 }, { "auxiliary_loss_clip": 0.0110105, "auxiliary_loss_mlp": 0.01026643, "balance_loss_clip": 1.01462078, "balance_loss_mlp": 1.0336982, "epoch": 0.95307380129265, "flos": 22159613036160.0, "grad_norm": 1.8820896946934116, "language_loss": 0.79752797, "learning_rate": 2.3026268798403525e-08, "loss": 0.81880486, "num_input_tokens_seen": 341875240, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 15852, "time_per_iteration": 2.460906744003296 }, { "auxiliary_loss_clip": 0.01104836, "auxiliary_loss_mlp": 0.01033027, "balance_loss_clip": 1.02068257, "balance_loss_mlp": 1.03502667, "epoch": 0.9531339245453179, "flos": 44022747214080.0, "grad_norm": 2.2137387395793273, "language_loss": 0.59479809, "learning_rate": 2.2967378121694138e-08, "loss": 0.61617672, "num_input_tokens_seen": 341901020, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69921875, "step": 15853, "time_per_iteration": 2.6801350116729736 }, { "auxiliary_loss_clip": 0.0109879, "auxiliary_loss_mlp": 0.01029862, "balance_loss_clip": 1.01851296, "balance_loss_mlp": 1.03302956, "epoch": 0.9531940477979859, "flos": 20266690204800.0, "grad_norm": 1.9516206251414416, "language_loss": 0.72602534, "learning_rate": 2.290856241425998e-08, "loss": 0.74731189, "num_input_tokens_seen": 341919365, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.65625, "step": 15854, "time_per_iteration": 2.452241897583008 }, { "auxiliary_loss_clip": 0.01102376, "auxiliary_loss_mlp": 0.01028184, "balance_loss_clip": 1.01651883, "balance_loss_mlp": 1.03277314, "epoch": 0.9532541710506538, "flos": 25335732309120.0, "grad_norm": 1.9710442330725475, "language_loss": 0.67655098, "learning_rate": 2.284982167833127e-08, "loss": 0.69785661, "num_input_tokens_seen": 341939985, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 15855, "time_per_iteration": 2.5152697563171387 }, { "auxiliary_loss_clip": 0.01103486, "auxiliary_loss_mlp": 0.01029523, "balance_loss_clip": 1.0178529, "balance_loss_mlp": 1.03430128, "epoch": 0.9533142943033218, "flos": 26469288691200.0, "grad_norm": 3.9176446659852098, "language_loss": 0.76583892, "learning_rate": 2.279115591613556e-08, "loss": 0.78716898, "num_input_tokens_seen": 341959255, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.69140625, "step": 15856, "time_per_iteration": 2.488837480545044 }, { "auxiliary_loss_clip": 0.01101138, "auxiliary_loss_mlp": 0.0103178, "balance_loss_clip": 1.02060413, "balance_loss_mlp": 1.03336, "epoch": 0.9533744175559897, "flos": 23656944407040.0, "grad_norm": 1.7631005889663265, "language_loss": 0.77962846, "learning_rate": 2.2732565129897075e-08, "loss": 0.80095768, "num_input_tokens_seen": 341977205, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6796875, "step": 15857, "time_per_iteration": 2.481147289276123 }, { "auxiliary_loss_clip": 0.01028156, "auxiliary_loss_mlp": 0.01000891, "balance_loss_clip": 0.99983042, "balance_loss_mlp": 1.00581288, "epoch": 0.9534345408086577, "flos": 61052055500160.0, "grad_norm": 0.7098792547510102, "language_loss": 0.62611377, "learning_rate": 2.267404932183803e-08, "loss": 0.64640427, "num_input_tokens_seen": 342038545, "router_z_loss_clip": 0.01062012, "router_z_loss_mlp": 0.22363281, "step": 15858, "time_per_iteration": 3.092419385910034 }, { "auxiliary_loss_clip": 0.01103082, "auxiliary_loss_mlp": 0.01027429, "balance_loss_clip": 1.01600242, "balance_loss_mlp": 1.03538942, "epoch": 0.9534946640613258, "flos": 18951677291520.0, "grad_norm": 3.024165987263496, "language_loss": 0.56851649, "learning_rate": 2.2615608494177097e-08, "loss": 0.58982158, "num_input_tokens_seen": 342058195, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.67578125, "step": 15859, "time_per_iteration": 2.4648191928863525 }, { "auxiliary_loss_clip": 0.01099999, "auxiliary_loss_mlp": 0.01027154, "balance_loss_clip": 1.01614547, "balance_loss_mlp": 1.03452063, "epoch": 0.9535547873139937, "flos": 16654292340480.0, "grad_norm": 2.1719041322239443, "language_loss": 0.81753755, "learning_rate": 2.2557242649130504e-08, "loss": 0.83880907, "num_input_tokens_seen": 342075025, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.65234375, "step": 15860, "time_per_iteration": 2.4831161499023438 }, { "auxiliary_loss_clip": 0.01102406, "auxiliary_loss_mlp": 0.01026333, "balance_loss_clip": 1.01505625, "balance_loss_mlp": 1.0328052, "epoch": 0.9536149105666617, "flos": 20667776446080.0, "grad_norm": 1.952848971629022, "language_loss": 0.66837656, "learning_rate": 2.249895178891159e-08, "loss": 0.68966395, "num_input_tokens_seen": 342094595, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6953125, "step": 15861, "time_per_iteration": 2.4669673442840576 }, { "auxiliary_loss_clip": 0.01102607, "auxiliary_loss_mlp": 0.01034799, "balance_loss_clip": 1.02238333, "balance_loss_mlp": 1.03410709, "epoch": 0.9536750338193296, "flos": 30700499086080.0, "grad_norm": 3.1068070856460053, "language_loss": 0.65781331, "learning_rate": 2.244073591573037e-08, "loss": 0.67918736, "num_input_tokens_seen": 342115970, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.68359375, "step": 15862, "time_per_iteration": 2.5562081336975098 }, { "auxiliary_loss_clip": 0.01103175, "auxiliary_loss_mlp": 0.01029189, "balance_loss_clip": 1.01798308, "balance_loss_mlp": 1.03684974, "epoch": 0.9537351570719976, "flos": 20405484357120.0, "grad_norm": 1.6236141273510165, "language_loss": 0.67671645, "learning_rate": 2.238259503179485e-08, "loss": 0.69804007, "num_input_tokens_seen": 342134080, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6640625, "step": 15863, "time_per_iteration": 2.45656681060791 }, { "auxiliary_loss_clip": 0.01102625, "auxiliary_loss_mlp": 0.01028173, "balance_loss_clip": 1.01596642, "balance_loss_mlp": 1.03458786, "epoch": 0.9537952803246655, "flos": 29929245235200.0, "grad_norm": 2.472481003831763, "language_loss": 0.78040403, "learning_rate": 2.2324529139309267e-08, "loss": 0.80171204, "num_input_tokens_seen": 342154725, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 15864, "time_per_iteration": 2.5219931602478027 }, { "auxiliary_loss_clip": 0.01102782, "auxiliary_loss_mlp": 0.01027336, "balance_loss_clip": 1.01557624, "balance_loss_mlp": 1.03591073, "epoch": 0.9538554035773336, "flos": 20521404524160.0, "grad_norm": 2.208955479292089, "language_loss": 0.59799933, "learning_rate": 2.226653824047586e-08, "loss": 0.61930048, "num_input_tokens_seen": 342172275, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.66796875, "step": 15865, "time_per_iteration": 2.439206600189209 }, { "auxiliary_loss_clip": 0.01103863, "auxiliary_loss_mlp": 0.01029789, "balance_loss_clip": 1.0176239, "balance_loss_mlp": 1.03469908, "epoch": 0.9539155268300015, "flos": 18406517598720.0, "grad_norm": 2.32537984680362, "language_loss": 0.69515526, "learning_rate": 2.2208622337493765e-08, "loss": 0.71649182, "num_input_tokens_seen": 342190880, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 15866, "time_per_iteration": 2.417187213897705 }, { "auxiliary_loss_clip": 0.01103841, "auxiliary_loss_mlp": 0.01029855, "balance_loss_clip": 1.01732612, "balance_loss_mlp": 1.03462434, "epoch": 0.9539756500826695, "flos": 26213281482240.0, "grad_norm": 3.212648552831726, "language_loss": 0.8504234, "learning_rate": 2.215078143255855e-08, "loss": 0.87176037, "num_input_tokens_seen": 342208165, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 15867, "time_per_iteration": 2.492586612701416 }, { "auxiliary_loss_clip": 0.01027635, "auxiliary_loss_mlp": 0.01001846, "balance_loss_clip": 1.00083303, "balance_loss_mlp": 1.00541055, "epoch": 0.9540357733353374, "flos": 68289097766400.0, "grad_norm": 0.8911480023129551, "language_loss": 0.6183511, "learning_rate": 2.2093015527864024e-08, "loss": 0.63864589, "num_input_tokens_seen": 342277110, "router_z_loss_clip": 0.01013184, "router_z_loss_mlp": 0.22265625, "step": 15868, "time_per_iteration": 3.113243818283081 }, { "auxiliary_loss_clip": 0.01103125, "auxiliary_loss_mlp": 0.01027008, "balance_loss_clip": 1.01474166, "balance_loss_mlp": 1.03475487, "epoch": 0.9540958965880054, "flos": 21288276915840.0, "grad_norm": 2.1760763490783654, "language_loss": 0.60317123, "learning_rate": 2.2035324625600425e-08, "loss": 0.62447262, "num_input_tokens_seen": 342294695, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.68359375, "step": 15869, "time_per_iteration": 2.4557089805603027 }, { "auxiliary_loss_clip": 0.01103992, "auxiliary_loss_mlp": 0.01030716, "balance_loss_clip": 1.0200882, "balance_loss_mlp": 1.03591084, "epoch": 0.9541560198406733, "flos": 19751407649280.0, "grad_norm": 2.1904578177596914, "language_loss": 0.71108031, "learning_rate": 2.197770872795579e-08, "loss": 0.73242736, "num_input_tokens_seen": 342314970, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.6796875, "step": 15870, "time_per_iteration": 3.9471518993377686 }, { "auxiliary_loss_clip": 0.01100098, "auxiliary_loss_mlp": 0.01029366, "balance_loss_clip": 1.01740932, "balance_loss_mlp": 1.03293061, "epoch": 0.9542161430933414, "flos": 24715626888960.0, "grad_norm": 7.033516796390557, "language_loss": 0.76623261, "learning_rate": 2.1920167837114368e-08, "loss": 0.78752726, "num_input_tokens_seen": 342334255, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.671875, "step": 15871, "time_per_iteration": 2.4840123653411865 }, { "auxiliary_loss_clip": 0.0110562, "auxiliary_loss_mlp": 0.01031173, "balance_loss_clip": 1.01891875, "balance_loss_mlp": 1.03608775, "epoch": 0.9542762663460094, "flos": 31065818359680.0, "grad_norm": 1.9788670134793998, "language_loss": 0.5797922, "learning_rate": 2.1862701955258634e-08, "loss": 0.60116011, "num_input_tokens_seen": 342354730, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 15872, "time_per_iteration": 2.5497782230377197 }, { "auxiliary_loss_clip": 0.01107061, "auxiliary_loss_mlp": 0.01030823, "balance_loss_clip": 1.01745987, "balance_loss_mlp": 1.03592002, "epoch": 0.9543363895986773, "flos": 20776729374720.0, "grad_norm": 1.4586147208504316, "language_loss": 0.74751216, "learning_rate": 2.1805311084567514e-08, "loss": 0.76889098, "num_input_tokens_seen": 342374565, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 15873, "time_per_iteration": 2.4499192237854004 }, { "auxiliary_loss_clip": 0.01105456, "auxiliary_loss_mlp": 0.01030335, "balance_loss_clip": 1.0174005, "balance_loss_mlp": 1.03587794, "epoch": 0.9543965128513453, "flos": 24462744163200.0, "grad_norm": 1.9537486071784653, "language_loss": 0.62543321, "learning_rate": 2.1747995227217265e-08, "loss": 0.64679116, "num_input_tokens_seen": 342394590, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 15874, "time_per_iteration": 3.88138484954834 }, { "auxiliary_loss_clip": 0.01100923, "auxiliary_loss_mlp": 0.01035661, "balance_loss_clip": 1.02347827, "balance_loss_mlp": 1.03440118, "epoch": 0.9544566361040132, "flos": 15261532439040.0, "grad_norm": 2.04968448036113, "language_loss": 0.89586526, "learning_rate": 2.169075438538104e-08, "loss": 0.91723108, "num_input_tokens_seen": 342410445, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6640625, "step": 15875, "time_per_iteration": 2.3901309967041016 }, { "auxiliary_loss_clip": 0.011073, "auxiliary_loss_mlp": 0.01030209, "balance_loss_clip": 1.01745343, "balance_loss_mlp": 1.03522825, "epoch": 0.9545167593566812, "flos": 25918777872000.0, "grad_norm": 2.2072283084945963, "language_loss": 0.6771608, "learning_rate": 2.1633588561229765e-08, "loss": 0.69853586, "num_input_tokens_seen": 342430970, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.72265625, "step": 15876, "time_per_iteration": 2.4842441082000732 }, { "auxiliary_loss_clip": 0.01106487, "auxiliary_loss_mlp": 0.01030265, "balance_loss_clip": 1.01713383, "balance_loss_mlp": 1.03579497, "epoch": 0.9545768826093491, "flos": 25628188844160.0, "grad_norm": 1.7807090668162164, "language_loss": 0.69153917, "learning_rate": 2.1576497756931267e-08, "loss": 0.71290672, "num_input_tokens_seen": 342449505, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 15877, "time_per_iteration": 2.468933582305908 }, { "auxiliary_loss_clip": 0.01106118, "auxiliary_loss_mlp": 0.01031672, "balance_loss_clip": 1.0190599, "balance_loss_mlp": 1.03588879, "epoch": 0.9546370058620172, "flos": 22491499726080.0, "grad_norm": 1.8206093143017699, "language_loss": 0.70760274, "learning_rate": 2.1519481974650035e-08, "loss": 0.72898066, "num_input_tokens_seen": 342470390, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 15878, "time_per_iteration": 5.29342246055603 }, { "auxiliary_loss_clip": 0.01101598, "auxiliary_loss_mlp": 0.01028952, "balance_loss_clip": 1.01675081, "balance_loss_mlp": 1.03431654, "epoch": 0.9546971291146851, "flos": 24609582961920.0, "grad_norm": 1.9527752353679215, "language_loss": 0.68507195, "learning_rate": 2.1462541216548335e-08, "loss": 0.70637745, "num_input_tokens_seen": 342492560, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 15879, "time_per_iteration": 2.4975385665893555 }, { "auxiliary_loss_clip": 0.01102273, "auxiliary_loss_mlp": 0.01027091, "balance_loss_clip": 1.01543832, "balance_loss_mlp": 1.03503573, "epoch": 0.9547572523673531, "flos": 28657756627200.0, "grad_norm": 2.5090218980926733, "language_loss": 0.85010159, "learning_rate": 2.1405675484785334e-08, "loss": 0.87139523, "num_input_tokens_seen": 342512315, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 15880, "time_per_iteration": 2.4934492111206055 }, { "auxiliary_loss_clip": 0.01103496, "auxiliary_loss_mlp": 0.01032036, "balance_loss_clip": 1.01961434, "balance_loss_mlp": 1.03454733, "epoch": 0.954817375620021, "flos": 33802606385280.0, "grad_norm": 1.8330262840417482, "language_loss": 0.72389209, "learning_rate": 2.134888478151753e-08, "loss": 0.74524742, "num_input_tokens_seen": 342533060, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 15881, "time_per_iteration": 2.563396453857422 }, { "auxiliary_loss_clip": 0.01103076, "auxiliary_loss_mlp": 0.01034061, "balance_loss_clip": 1.0220747, "balance_loss_mlp": 1.03604245, "epoch": 0.954877498872689, "flos": 14428225843200.0, "grad_norm": 1.9175490333438605, "language_loss": 0.71921086, "learning_rate": 2.1292169108898083e-08, "loss": 0.74058229, "num_input_tokens_seen": 342550830, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.671875, "step": 15882, "time_per_iteration": 2.4326171875 }, { "auxiliary_loss_clip": 0.01103485, "auxiliary_loss_mlp": 0.01029325, "balance_loss_clip": 1.01764846, "balance_loss_mlp": 1.03511703, "epoch": 0.9549376221253569, "flos": 59269447336320.0, "grad_norm": 1.6787862509649225, "language_loss": 0.66036499, "learning_rate": 2.1235528469078168e-08, "loss": 0.68169308, "num_input_tokens_seen": 342575070, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.68359375, "step": 15883, "time_per_iteration": 2.7986514568328857 }, { "auxiliary_loss_clip": 0.01106504, "auxiliary_loss_mlp": 0.01028661, "balance_loss_clip": 1.01584005, "balance_loss_mlp": 1.03686976, "epoch": 0.954997745378025, "flos": 17274397760640.0, "grad_norm": 2.7711281926502522, "language_loss": 0.77857852, "learning_rate": 2.1178962864205175e-08, "loss": 0.79993021, "num_input_tokens_seen": 342592215, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 15884, "time_per_iteration": 2.4078927040100098 }, { "auxiliary_loss_clip": 0.01104991, "auxiliary_loss_mlp": 0.01027936, "balance_loss_clip": 1.01510298, "balance_loss_mlp": 1.03469181, "epoch": 0.955057868630693, "flos": 13006378903680.0, "grad_norm": 1.6947281766680393, "language_loss": 0.77942705, "learning_rate": 2.1122472296424054e-08, "loss": 0.80075628, "num_input_tokens_seen": 342610030, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 15885, "time_per_iteration": 2.4431049823760986 }, { "auxiliary_loss_clip": 0.0110414, "auxiliary_loss_mlp": 0.01030959, "balance_loss_clip": 1.01929438, "balance_loss_mlp": 1.03417325, "epoch": 0.9551179918833609, "flos": 22637692080000.0, "grad_norm": 1.7606432492649091, "language_loss": 0.69940019, "learning_rate": 2.1066056767877317e-08, "loss": 0.72075117, "num_input_tokens_seen": 342626475, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.69921875, "step": 15886, "time_per_iteration": 2.4447741508483887 }, { "auxiliary_loss_clip": 0.01109466, "auxiliary_loss_mlp": 0.01036408, "balance_loss_clip": 1.022645, "balance_loss_mlp": 1.03706384, "epoch": 0.9551781151360289, "flos": 21542811667200.0, "grad_norm": 1.801599319656546, "language_loss": 0.72488844, "learning_rate": 2.1009716280703916e-08, "loss": 0.74634719, "num_input_tokens_seen": 342646645, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.7265625, "step": 15887, "time_per_iteration": 2.451539993286133 }, { "auxiliary_loss_clip": 0.01099569, "auxiliary_loss_mlp": 0.01025374, "balance_loss_clip": 1.01405501, "balance_loss_mlp": 1.03367031, "epoch": 0.9552382383886968, "flos": 20702250524160.0, "grad_norm": 2.577274658056427, "language_loss": 0.56962943, "learning_rate": 2.0953450837040364e-08, "loss": 0.59087884, "num_input_tokens_seen": 342663615, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66015625, "step": 15888, "time_per_iteration": 2.4167332649230957 }, { "auxiliary_loss_clip": 0.01027999, "auxiliary_loss_mlp": 0.00999221, "balance_loss_clip": 0.9981665, "balance_loss_mlp": 1.00573874, "epoch": 0.9552983616413648, "flos": 67769792887680.0, "grad_norm": 0.7070111727447698, "language_loss": 0.5781312, "learning_rate": 2.0897260439020514e-08, "loss": 0.59840345, "num_input_tokens_seen": 342728275, "router_z_loss_clip": 0.01055908, "router_z_loss_mlp": 0.22265625, "step": 15889, "time_per_iteration": 3.1351635456085205 }, { "auxiliary_loss_clip": 0.01104212, "auxiliary_loss_mlp": 0.01027474, "balance_loss_clip": 1.01525521, "balance_loss_mlp": 1.03349113, "epoch": 0.9553584848940327, "flos": 21579979265280.0, "grad_norm": 1.5494755045111037, "language_loss": 0.66967547, "learning_rate": 2.084114508877466e-08, "loss": 0.69099236, "num_input_tokens_seen": 342748860, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.70703125, "step": 15890, "time_per_iteration": 2.4516448974609375 }, { "auxiliary_loss_clip": 0.01104769, "auxiliary_loss_mlp": 0.01032199, "balance_loss_clip": 1.02053404, "balance_loss_mlp": 1.03617096, "epoch": 0.9554186081467008, "flos": 24208173498240.0, "grad_norm": 1.5382153734994308, "language_loss": 0.74283946, "learning_rate": 2.0785104788430874e-08, "loss": 0.76420915, "num_input_tokens_seen": 342769705, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6875, "step": 15891, "time_per_iteration": 2.4864790439605713 }, { "auxiliary_loss_clip": 0.01099365, "auxiliary_loss_mlp": 0.01029927, "balance_loss_clip": 1.01928115, "balance_loss_mlp": 1.03416872, "epoch": 0.9554787313993687, "flos": 16251554073600.0, "grad_norm": 8.909466945319991, "language_loss": 0.7822299, "learning_rate": 2.072913954011435e-08, "loss": 0.80352283, "num_input_tokens_seen": 342787000, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.65234375, "step": 15892, "time_per_iteration": 2.4019014835357666 }, { "auxiliary_loss_clip": 0.01101937, "auxiliary_loss_mlp": 0.01031301, "balance_loss_clip": 1.0189333, "balance_loss_mlp": 1.03423107, "epoch": 0.9555388546520367, "flos": 23404133508480.0, "grad_norm": 1.5071035583874293, "language_loss": 0.69903374, "learning_rate": 2.0673249345947386e-08, "loss": 0.72036612, "num_input_tokens_seen": 342807795, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.67578125, "step": 15893, "time_per_iteration": 2.4721202850341797 }, { "auxiliary_loss_clip": 0.01105626, "auxiliary_loss_mlp": 0.01029266, "balance_loss_clip": 1.01561654, "balance_loss_mlp": 1.0379324, "epoch": 0.9555989779047046, "flos": 14794047907200.0, "grad_norm": 1.9472238663881392, "language_loss": 0.66126549, "learning_rate": 2.0617434208048955e-08, "loss": 0.68261445, "num_input_tokens_seen": 342825490, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.67578125, "step": 15894, "time_per_iteration": 2.4161670207977295 }, { "auxiliary_loss_clip": 0.01105492, "auxiliary_loss_mlp": 0.0103132, "balance_loss_clip": 1.01830816, "balance_loss_mlp": 1.03531098, "epoch": 0.9556591011573726, "flos": 22236749493120.0, "grad_norm": 2.2083891414053327, "language_loss": 0.823066, "learning_rate": 2.056169412853581e-08, "loss": 0.84443414, "num_input_tokens_seen": 342844965, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 15895, "time_per_iteration": 2.454176425933838 }, { "auxiliary_loss_clip": 0.01104123, "auxiliary_loss_mlp": 0.01032667, "balance_loss_clip": 1.02069235, "balance_loss_mlp": 1.03546166, "epoch": 0.9557192244100405, "flos": 27855296835840.0, "grad_norm": 1.9740653901658578, "language_loss": 0.72322464, "learning_rate": 2.0506029109521593e-08, "loss": 0.74459255, "num_input_tokens_seen": 342865915, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 15896, "time_per_iteration": 2.500558614730835 }, { "auxiliary_loss_clip": 0.01100539, "auxiliary_loss_mlp": 0.01032596, "balance_loss_clip": 1.0206337, "balance_loss_mlp": 1.03316832, "epoch": 0.9557793476627086, "flos": 17602800831360.0, "grad_norm": 2.116599337598084, "language_loss": 0.79488713, "learning_rate": 2.045043915311706e-08, "loss": 0.8162185, "num_input_tokens_seen": 342884000, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 15897, "time_per_iteration": 2.480339765548706 }, { "auxiliary_loss_clip": 0.01102344, "auxiliary_loss_mlp": 0.01031442, "balance_loss_clip": 1.01844835, "balance_loss_mlp": 1.0341692, "epoch": 0.9558394709153766, "flos": 23875496709120.0, "grad_norm": 1.682200633687038, "language_loss": 0.72903913, "learning_rate": 2.03949242614303e-08, "loss": 0.750377, "num_input_tokens_seen": 342903095, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6796875, "step": 15898, "time_per_iteration": 2.472421407699585 }, { "auxiliary_loss_clip": 0.01028246, "auxiliary_loss_mlp": 0.01001501, "balance_loss_clip": 1.00051773, "balance_loss_mlp": 1.00581384, "epoch": 0.9558995941680445, "flos": 53682001171200.0, "grad_norm": 0.9226950084973108, "language_loss": 0.5233736, "learning_rate": 2.033948443656652e-08, "loss": 0.54367101, "num_input_tokens_seen": 342958155, "router_z_loss_clip": 0.00982666, "router_z_loss_mlp": 0.22460938, "step": 15899, "time_per_iteration": 3.0206711292266846 }, { "auxiliary_loss_clip": 0.01109319, "auxiliary_loss_mlp": 0.01032549, "balance_loss_clip": 1.01902449, "balance_loss_mlp": 1.03671217, "epoch": 0.9559597174207125, "flos": 13764488376960.0, "grad_norm": 2.4368641403535545, "language_loss": 0.68578362, "learning_rate": 2.028411968062782e-08, "loss": 0.70720232, "num_input_tokens_seen": 342972500, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7265625, "step": 15900, "time_per_iteration": 2.417919158935547 }, { "auxiliary_loss_clip": 0.01105298, "auxiliary_loss_mlp": 0.01025137, "balance_loss_clip": 1.01304936, "balance_loss_mlp": 1.03559256, "epoch": 0.9560198406733804, "flos": 19936347799680.0, "grad_norm": 2.459672928206059, "language_loss": 0.82953513, "learning_rate": 2.0228829995713627e-08, "loss": 0.8508395, "num_input_tokens_seen": 342989035, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 15901, "time_per_iteration": 2.4438929557800293 }, { "auxiliary_loss_clip": 0.01028051, "auxiliary_loss_mlp": 0.0100154, "balance_loss_clip": 1.00044918, "balance_loss_mlp": 1.00580347, "epoch": 0.9560799639260484, "flos": 57289550699520.0, "grad_norm": 0.7118991665297201, "language_loss": 0.54280591, "learning_rate": 2.0173615383920485e-08, "loss": 0.56310189, "num_input_tokens_seen": 343051675, "router_z_loss_clip": 0.01092529, "router_z_loss_mlp": 0.22265625, "step": 15902, "time_per_iteration": 3.183953285217285 }, { "auxiliary_loss_clip": 0.01097065, "auxiliary_loss_mlp": 0.01028021, "balance_loss_clip": 1.01791167, "balance_loss_mlp": 1.0334897, "epoch": 0.9561400871787163, "flos": 18917167299840.0, "grad_norm": 1.9128210038633457, "language_loss": 0.8550117, "learning_rate": 2.01184758473425e-08, "loss": 0.87626255, "num_input_tokens_seen": 343068895, "router_z_loss_clip": 0.10107422, "router_z_loss_mlp": 0.63671875, "step": 15903, "time_per_iteration": 2.4351344108581543 }, { "auxiliary_loss_clip": 0.01101928, "auxiliary_loss_mlp": 0.01026188, "balance_loss_clip": 1.01502419, "balance_loss_mlp": 1.03466833, "epoch": 0.9562002104313844, "flos": 18038576632320.0, "grad_norm": 2.0401181544677147, "language_loss": 0.80562145, "learning_rate": 2.0063411388070217e-08, "loss": 0.82690263, "num_input_tokens_seen": 343087115, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.671875, "step": 15904, "time_per_iteration": 2.449664354324341 }, { "auxiliary_loss_clip": 0.01106743, "auxiliary_loss_mlp": 0.01031299, "balance_loss_clip": 1.01829934, "balance_loss_mlp": 1.0361228, "epoch": 0.9562603336840523, "flos": 24717673964160.0, "grad_norm": 2.5845293529684645, "language_loss": 0.60664195, "learning_rate": 2.0008422008191972e-08, "loss": 0.62802231, "num_input_tokens_seen": 343105575, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 15905, "time_per_iteration": 2.4756274223327637 }, { "auxiliary_loss_clip": 0.01102088, "auxiliary_loss_mlp": 0.01026316, "balance_loss_clip": 1.01443696, "balance_loss_mlp": 1.03420997, "epoch": 0.9563204569367203, "flos": 21177205084800.0, "grad_norm": 2.7037702528469953, "language_loss": 0.70889759, "learning_rate": 1.995350770979254e-08, "loss": 0.73018163, "num_input_tokens_seen": 343123025, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 15906, "time_per_iteration": 2.4454808235168457 }, { "auxiliary_loss_clip": 0.0110684, "auxiliary_loss_mlp": 0.0103048, "balance_loss_clip": 1.017802, "balance_loss_mlp": 1.03655148, "epoch": 0.9563805801893882, "flos": 20229738088320.0, "grad_norm": 1.916429680873688, "language_loss": 0.70557326, "learning_rate": 1.9898668494954473e-08, "loss": 0.72694647, "num_input_tokens_seen": 343141625, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 15907, "time_per_iteration": 2.4237923622131348 }, { "auxiliary_loss_clip": 0.01103208, "auxiliary_loss_mlp": 0.01028388, "balance_loss_clip": 1.01657486, "balance_loss_mlp": 1.03569388, "epoch": 0.9564407034420562, "flos": 25411001258880.0, "grad_norm": 2.4959150639558536, "language_loss": 0.70411897, "learning_rate": 1.9843904365757447e-08, "loss": 0.72543496, "num_input_tokens_seen": 343161300, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.67578125, "step": 15908, "time_per_iteration": 2.4946765899658203 }, { "auxiliary_loss_clip": 0.01104851, "auxiliary_loss_mlp": 0.01031482, "balance_loss_clip": 1.01922774, "balance_loss_mlp": 1.03655553, "epoch": 0.9565008266947241, "flos": 18623884752000.0, "grad_norm": 2.075827659834797, "language_loss": 0.82798457, "learning_rate": 1.978921532427802e-08, "loss": 0.84934789, "num_input_tokens_seen": 343177815, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.68359375, "step": 15909, "time_per_iteration": 2.432532787322998 }, { "auxiliary_loss_clip": 0.01102226, "auxiliary_loss_mlp": 0.01032212, "balance_loss_clip": 1.01990962, "balance_loss_mlp": 1.03335965, "epoch": 0.9565609499473922, "flos": 24862142465280.0, "grad_norm": 1.7208073893712326, "language_loss": 0.67568076, "learning_rate": 1.9734601372590086e-08, "loss": 0.69702518, "num_input_tokens_seen": 343198140, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 15910, "time_per_iteration": 2.5417263507843018 }, { "auxiliary_loss_clip": 0.01108469, "auxiliary_loss_mlp": 0.01033536, "balance_loss_clip": 1.02148986, "balance_loss_mlp": 1.03722596, "epoch": 0.9566210732000601, "flos": 21798459740160.0, "grad_norm": 1.922168628679682, "language_loss": 0.74149883, "learning_rate": 1.968006251276444e-08, "loss": 0.76291883, "num_input_tokens_seen": 343218280, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.7109375, "step": 15911, "time_per_iteration": 2.4659483432769775 }, { "auxiliary_loss_clip": 0.01102392, "auxiliary_loss_mlp": 0.01028083, "balance_loss_clip": 1.01618552, "balance_loss_mlp": 1.03380728, "epoch": 0.9566811964527281, "flos": 18697609416960.0, "grad_norm": 1.883469566387966, "language_loss": 0.69601423, "learning_rate": 1.9625598746869198e-08, "loss": 0.71731895, "num_input_tokens_seen": 343236850, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 15912, "time_per_iteration": 3.9022438526153564 }, { "auxiliary_loss_clip": 0.01105216, "auxiliary_loss_mlp": 0.010345, "balance_loss_clip": 1.02212644, "balance_loss_mlp": 1.03658581, "epoch": 0.9567413197053961, "flos": 13000632727680.0, "grad_norm": 2.8518563387107103, "language_loss": 0.72135818, "learning_rate": 1.95712100769696e-08, "loss": 0.74275529, "num_input_tokens_seen": 343253065, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 15913, "time_per_iteration": 2.4401304721832275 }, { "auxiliary_loss_clip": 0.01103029, "auxiliary_loss_mlp": 0.01029767, "balance_loss_clip": 1.01851916, "balance_loss_mlp": 1.03580189, "epoch": 0.956801442958064, "flos": 19719267955200.0, "grad_norm": 2.3645236932644926, "language_loss": 0.73362988, "learning_rate": 1.9516896505128444e-08, "loss": 0.75495785, "num_input_tokens_seen": 343270330, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.671875, "step": 15914, "time_per_iteration": 2.47109317779541 }, { "auxiliary_loss_clip": 0.01102663, "auxiliary_loss_mlp": 0.01026746, "balance_loss_clip": 1.0145514, "balance_loss_mlp": 1.03494024, "epoch": 0.956861566210732, "flos": 18222834424320.0, "grad_norm": 1.6970051722281538, "language_loss": 0.67311716, "learning_rate": 1.9462658033404965e-08, "loss": 0.69441128, "num_input_tokens_seen": 343289625, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 15915, "time_per_iteration": 2.422640323638916 }, { "auxiliary_loss_clip": 0.01100962, "auxiliary_loss_mlp": 0.01027586, "balance_loss_clip": 1.01590347, "balance_loss_mlp": 1.03408694, "epoch": 0.9569216894634, "flos": 22196960202240.0, "grad_norm": 1.7065274243423902, "language_loss": 0.64047658, "learning_rate": 1.9408494663855967e-08, "loss": 0.66176212, "num_input_tokens_seen": 343309200, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 15916, "time_per_iteration": 3.8610525131225586 }, { "auxiliary_loss_clip": 0.01097073, "auxiliary_loss_mlp": 0.01028249, "balance_loss_clip": 1.01727557, "balance_loss_mlp": 1.03385425, "epoch": 0.956981812716068, "flos": 21689291329920.0, "grad_norm": 2.0410917404640534, "language_loss": 0.80484968, "learning_rate": 1.935440639853536e-08, "loss": 0.82610291, "num_input_tokens_seen": 343326270, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.6328125, "step": 15917, "time_per_iteration": 2.471834182739258 }, { "auxiliary_loss_clip": 0.01101667, "auxiliary_loss_mlp": 0.01031916, "balance_loss_clip": 1.02037024, "balance_loss_mlp": 1.03494561, "epoch": 0.9570419359687359, "flos": 13990905757440.0, "grad_norm": 6.189630703693542, "language_loss": 0.72882211, "learning_rate": 1.9300393239494172e-08, "loss": 0.75015795, "num_input_tokens_seen": 343344430, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 15918, "time_per_iteration": 2.4233250617980957 }, { "auxiliary_loss_clip": 0.0102803, "auxiliary_loss_mlp": 0.01003814, "balance_loss_clip": 1.0027535, "balance_loss_mlp": 1.0057199, "epoch": 0.9571020592214039, "flos": 65196938534400.0, "grad_norm": 0.6275913282734245, "language_loss": 0.53124213, "learning_rate": 1.924645518878032e-08, "loss": 0.55156052, "num_input_tokens_seen": 343416155, "router_z_loss_clip": 0.01062012, "router_z_loss_mlp": 0.22265625, "step": 15919, "time_per_iteration": 3.196503162384033 }, { "auxiliary_loss_clip": 0.01110777, "auxiliary_loss_mlp": 0.0103608, "balance_loss_clip": 1.02280045, "balance_loss_mlp": 1.03883755, "epoch": 0.9571621824740718, "flos": 17384068961280.0, "grad_norm": 2.691680495245935, "language_loss": 0.75993538, "learning_rate": 1.919259224843972e-08, "loss": 0.7814039, "num_input_tokens_seen": 343431715, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 15920, "time_per_iteration": 5.322102069854736 }, { "auxiliary_loss_clip": 0.01108017, "auxiliary_loss_mlp": 0.01028464, "balance_loss_clip": 1.01569033, "balance_loss_mlp": 1.03691626, "epoch": 0.9572223057267398, "flos": 14538184352640.0, "grad_norm": 1.741224841455, "language_loss": 0.79281402, "learning_rate": 1.9138804420514298e-08, "loss": 0.81417882, "num_input_tokens_seen": 343450425, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 15921, "time_per_iteration": 2.4473178386688232 }, { "auxiliary_loss_clip": 0.01107887, "auxiliary_loss_mlp": 0.01031506, "balance_loss_clip": 1.01847649, "balance_loss_mlp": 1.03505111, "epoch": 0.9572824289794077, "flos": 33947793158400.0, "grad_norm": 3.932132329662778, "language_loss": 0.50966072, "learning_rate": 1.9085091707044197e-08, "loss": 0.53105462, "num_input_tokens_seen": 343470445, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.73046875, "step": 15922, "time_per_iteration": 2.562304973602295 }, { "auxiliary_loss_clip": 0.01104523, "auxiliary_loss_mlp": 0.01030127, "balance_loss_clip": 1.01749682, "balance_loss_mlp": 1.03473306, "epoch": 0.9573425522320758, "flos": 18694915896960.0, "grad_norm": 2.377377061156783, "language_loss": 0.83681214, "learning_rate": 1.903145411006557e-08, "loss": 0.85815859, "num_input_tokens_seen": 343485200, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 15923, "time_per_iteration": 2.399414300918579 }, { "auxiliary_loss_clip": 0.01100093, "auxiliary_loss_mlp": 0.01029588, "balance_loss_clip": 1.01840615, "balance_loss_mlp": 1.03291261, "epoch": 0.9574026754847437, "flos": 28510307297280.0, "grad_norm": 3.033144174096417, "language_loss": 0.7507509, "learning_rate": 1.8977891631613008e-08, "loss": 0.77204776, "num_input_tokens_seen": 343505080, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.671875, "step": 15924, "time_per_iteration": 2.525996685028076 }, { "auxiliary_loss_clip": 0.01103345, "auxiliary_loss_mlp": 0.010342, "balance_loss_clip": 1.02162349, "balance_loss_mlp": 1.03397202, "epoch": 0.9574627987374117, "flos": 24352390604160.0, "grad_norm": 2.380834494980714, "language_loss": 0.8596189, "learning_rate": 1.892440427371711e-08, "loss": 0.88099438, "num_input_tokens_seen": 343523995, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 15925, "time_per_iteration": 2.476987838745117 }, { "auxiliary_loss_clip": 0.01108468, "auxiliary_loss_mlp": 0.01032522, "balance_loss_clip": 1.01977289, "balance_loss_mlp": 1.03601241, "epoch": 0.9575229219900797, "flos": 23510680225920.0, "grad_norm": 1.8104297478365754, "language_loss": 0.75705606, "learning_rate": 1.8870992038406474e-08, "loss": 0.77846599, "num_input_tokens_seen": 343542015, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.72265625, "step": 15926, "time_per_iteration": 2.474318027496338 }, { "auxiliary_loss_clip": 0.01104737, "auxiliary_loss_mlp": 0.01028104, "balance_loss_clip": 1.01685095, "balance_loss_mlp": 1.03586912, "epoch": 0.9575830452427476, "flos": 22674823764480.0, "grad_norm": 1.8591839709541076, "language_loss": 0.77905351, "learning_rate": 1.8817654927706373e-08, "loss": 0.8003819, "num_input_tokens_seen": 343561680, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6875, "step": 15927, "time_per_iteration": 2.449816942214966 }, { "auxiliary_loss_clip": 0.01106092, "auxiliary_loss_mlp": 0.0103185, "balance_loss_clip": 1.01826012, "balance_loss_mlp": 1.03516245, "epoch": 0.9576431684954156, "flos": 30485250835200.0, "grad_norm": 1.9094458135061254, "language_loss": 0.68736285, "learning_rate": 1.8764392943639183e-08, "loss": 0.70874232, "num_input_tokens_seen": 343585290, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.70703125, "step": 15928, "time_per_iteration": 2.5955514907836914 }, { "auxiliary_loss_clip": 0.01103404, "auxiliary_loss_mlp": 0.01032507, "balance_loss_clip": 1.01998425, "balance_loss_mlp": 1.03483462, "epoch": 0.9577032917480836, "flos": 21687387909120.0, "grad_norm": 1.891646117512903, "language_loss": 0.81893623, "learning_rate": 1.871120608822485e-08, "loss": 0.84029531, "num_input_tokens_seen": 343604045, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 15929, "time_per_iteration": 2.4383747577667236 }, { "auxiliary_loss_clip": 0.01107991, "auxiliary_loss_mlp": 0.01039369, "balance_loss_clip": 1.02689946, "balance_loss_mlp": 1.03625643, "epoch": 0.9577634150007516, "flos": 29023147728000.0, "grad_norm": 1.6953243276123673, "language_loss": 0.72529542, "learning_rate": 1.8658094363480202e-08, "loss": 0.74676907, "num_input_tokens_seen": 343626595, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 15930, "time_per_iteration": 2.5054702758789062 }, { "auxiliary_loss_clip": 0.01101989, "auxiliary_loss_mlp": 0.01029612, "balance_loss_clip": 1.01802528, "balance_loss_mlp": 1.03442001, "epoch": 0.9578235382534195, "flos": 19282235178240.0, "grad_norm": 5.223981870990533, "language_loss": 0.62349224, "learning_rate": 1.8605057771419185e-08, "loss": 0.64480829, "num_input_tokens_seen": 343646195, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.67578125, "step": 15931, "time_per_iteration": 2.4336185455322266 }, { "auxiliary_loss_clip": 0.01101894, "auxiliary_loss_mlp": 0.01027024, "balance_loss_clip": 1.01574087, "balance_loss_mlp": 1.03507757, "epoch": 0.9578836615060875, "flos": 13699275235200.0, "grad_norm": 1.8526961014346537, "language_loss": 0.69194829, "learning_rate": 1.8552096314052633e-08, "loss": 0.71323752, "num_input_tokens_seen": 343663665, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.66796875, "step": 15932, "time_per_iteration": 2.4355216026306152 }, { "auxiliary_loss_clip": 0.01108734, "auxiliary_loss_mlp": 0.01034357, "balance_loss_clip": 1.02083242, "balance_loss_mlp": 1.03623223, "epoch": 0.9579437847587554, "flos": 17054516655360.0, "grad_norm": 1.743698108028559, "language_loss": 0.75364232, "learning_rate": 1.849920999338961e-08, "loss": 0.77507329, "num_input_tokens_seen": 343682145, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 15933, "time_per_iteration": 2.4141745567321777 }, { "auxiliary_loss_clip": 0.01028216, "auxiliary_loss_mlp": 0.01003595, "balance_loss_clip": 1.00251031, "balance_loss_mlp": 1.00592256, "epoch": 0.9580039080114234, "flos": 60570887886720.0, "grad_norm": 0.69758191899924, "language_loss": 0.57288277, "learning_rate": 1.8446398811434948e-08, "loss": 0.59320086, "num_input_tokens_seen": 343744685, "router_z_loss_clip": 0.01086426, "router_z_loss_mlp": 0.22265625, "step": 15934, "time_per_iteration": 3.2037386894226074 }, { "auxiliary_loss_clip": 0.01027959, "auxiliary_loss_mlp": 0.01001482, "balance_loss_clip": 1.00045729, "balance_loss_mlp": 1.0058229, "epoch": 0.9580640312640913, "flos": 66235365745920.0, "grad_norm": 0.9141804877667166, "language_loss": 0.65914071, "learning_rate": 1.8393662770191277e-08, "loss": 0.67943513, "num_input_tokens_seen": 343801835, "router_z_loss_clip": 0.01025391, "router_z_loss_mlp": 0.22167969, "step": 15935, "time_per_iteration": 3.0187785625457764 }, { "auxiliary_loss_clip": 0.01027901, "auxiliary_loss_mlp": 0.01001546, "balance_loss_clip": 1.00041318, "balance_loss_mlp": 1.00567544, "epoch": 0.9581241545167594, "flos": 62218002971520.0, "grad_norm": 0.7782025879119934, "language_loss": 0.57062697, "learning_rate": 1.8341001871658546e-08, "loss": 0.5909214, "num_input_tokens_seen": 343861515, "router_z_loss_clip": 0.01135254, "router_z_loss_mlp": 0.22265625, "step": 15936, "time_per_iteration": 3.0931663513183594 }, { "auxiliary_loss_clip": 0.01105638, "auxiliary_loss_mlp": 0.01032233, "balance_loss_clip": 1.01934671, "balance_loss_mlp": 1.03571796, "epoch": 0.9581842777694273, "flos": 23768088065280.0, "grad_norm": 2.3530230374521515, "language_loss": 0.78373069, "learning_rate": 1.8288416117833825e-08, "loss": 0.80510938, "num_input_tokens_seen": 343881240, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 15937, "time_per_iteration": 2.498623847961426 }, { "auxiliary_loss_clip": 0.01105932, "auxiliary_loss_mlp": 0.01028925, "balance_loss_clip": 1.01624131, "balance_loss_mlp": 1.03607631, "epoch": 0.9582444010220953, "flos": 21213079793280.0, "grad_norm": 2.130342587680523, "language_loss": 0.68101662, "learning_rate": 1.8235905510710636e-08, "loss": 0.70236522, "num_input_tokens_seen": 343900885, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 15938, "time_per_iteration": 2.4528555870056152 }, { "auxiliary_loss_clip": 0.01103233, "auxiliary_loss_mlp": 0.01030469, "balance_loss_clip": 1.01841688, "balance_loss_mlp": 1.0341543, "epoch": 0.9583045242747633, "flos": 23805147922560.0, "grad_norm": 3.119141045518828, "language_loss": 0.66174781, "learning_rate": 1.8183470052280712e-08, "loss": 0.68308485, "num_input_tokens_seen": 343918460, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69140625, "step": 15939, "time_per_iteration": 2.5229477882385254 }, { "auxiliary_loss_clip": 0.01103396, "auxiliary_loss_mlp": 0.01032715, "balance_loss_clip": 1.02064466, "balance_loss_mlp": 1.03478074, "epoch": 0.9583646475274312, "flos": 24131468004480.0, "grad_norm": 3.0258131046418235, "language_loss": 0.73503774, "learning_rate": 1.8131109744532025e-08, "loss": 0.75639886, "num_input_tokens_seen": 343938030, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 15940, "time_per_iteration": 2.4798529148101807 }, { "auxiliary_loss_clip": 0.01104582, "auxiliary_loss_mlp": 0.01032277, "balance_loss_clip": 1.01924181, "balance_loss_mlp": 1.03524399, "epoch": 0.9584247707800992, "flos": 20886651970560.0, "grad_norm": 1.8882798858646268, "language_loss": 0.73003423, "learning_rate": 1.8078824589450535e-08, "loss": 0.7514028, "num_input_tokens_seen": 343956635, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 15941, "time_per_iteration": 2.4599063396453857 }, { "auxiliary_loss_clip": 0.01103461, "auxiliary_loss_mlp": 0.01033915, "balance_loss_clip": 1.02219653, "balance_loss_mlp": 1.03548551, "epoch": 0.9584848940327672, "flos": 26067591918720.0, "grad_norm": 15.4549115883751, "language_loss": 0.71347201, "learning_rate": 1.8026614589018442e-08, "loss": 0.73484576, "num_input_tokens_seen": 343976625, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 15942, "time_per_iteration": 2.471547842025757 }, { "auxiliary_loss_clip": 0.01103944, "auxiliary_loss_mlp": 0.0103365, "balance_loss_clip": 1.02097178, "balance_loss_mlp": 1.03439522, "epoch": 0.9585450172854352, "flos": 34492988764800.0, "grad_norm": 1.5982836810442569, "language_loss": 0.72082543, "learning_rate": 1.797447974521571e-08, "loss": 0.74220133, "num_input_tokens_seen": 343997790, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 15943, "time_per_iteration": 2.6108522415161133 }, { "auxiliary_loss_clip": 0.01107272, "auxiliary_loss_mlp": 0.01036136, "balance_loss_clip": 1.0232259, "balance_loss_mlp": 1.0364728, "epoch": 0.9586051405381031, "flos": 23110743219840.0, "grad_norm": 1.6698463015959173, "language_loss": 0.68135321, "learning_rate": 1.792242006001965e-08, "loss": 0.70278728, "num_input_tokens_seen": 344016935, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.70703125, "step": 15944, "time_per_iteration": 2.4575812816619873 }, { "auxiliary_loss_clip": 0.01103018, "auxiliary_loss_mlp": 0.01033341, "balance_loss_clip": 1.02095461, "balance_loss_mlp": 1.03330541, "epoch": 0.9586652637907711, "flos": 19603994232960.0, "grad_norm": 1.7810358819750556, "language_loss": 0.66130388, "learning_rate": 1.7870435535403795e-08, "loss": 0.68266749, "num_input_tokens_seen": 344035590, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69921875, "step": 15945, "time_per_iteration": 2.4346413612365723 }, { "auxiliary_loss_clip": 0.01028174, "auxiliary_loss_mlp": 0.01001189, "balance_loss_clip": 1.00019348, "balance_loss_mlp": 1.00580323, "epoch": 0.958725387043439, "flos": 72073327317120.0, "grad_norm": 0.7675570582391223, "language_loss": 0.61871189, "learning_rate": 1.7818526173339678e-08, "loss": 0.63900554, "num_input_tokens_seen": 344100845, "router_z_loss_clip": 0.00994873, "router_z_loss_mlp": 0.22460938, "step": 15946, "time_per_iteration": 3.180636405944824 }, { "auxiliary_loss_clip": 0.01101702, "auxiliary_loss_mlp": 0.01026365, "balance_loss_clip": 1.01473653, "balance_loss_mlp": 1.03499925, "epoch": 0.958785510296107, "flos": 28911932242560.0, "grad_norm": 1.6916022039606196, "language_loss": 0.75296307, "learning_rate": 1.7766691975795723e-08, "loss": 0.77424377, "num_input_tokens_seen": 344121780, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66796875, "step": 15947, "time_per_iteration": 2.524651527404785 }, { "auxiliary_loss_clip": 0.01101805, "auxiliary_loss_mlp": 0.01024822, "balance_loss_clip": 1.01316941, "balance_loss_mlp": 1.03389955, "epoch": 0.958845633548775, "flos": 18477189607680.0, "grad_norm": 2.1727874663767386, "language_loss": 0.6928522, "learning_rate": 1.771493294473747e-08, "loss": 0.71411848, "num_input_tokens_seen": 344140150, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6796875, "step": 15948, "time_per_iteration": 2.431995153427124 }, { "auxiliary_loss_clip": 0.01101698, "auxiliary_loss_mlp": 0.01032377, "balance_loss_clip": 1.02067029, "balance_loss_mlp": 1.03402591, "epoch": 0.958905756801443, "flos": 24206916522240.0, "grad_norm": 2.059657577784872, "language_loss": 0.78611195, "learning_rate": 1.7663249082127574e-08, "loss": 0.80745268, "num_input_tokens_seen": 344158200, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.67578125, "step": 15949, "time_per_iteration": 2.4724674224853516 }, { "auxiliary_loss_clip": 0.0110678, "auxiliary_loss_mlp": 0.01034268, "balance_loss_clip": 1.02147126, "balance_loss_mlp": 1.03719592, "epoch": 0.9589658800541109, "flos": 25007939769600.0, "grad_norm": 1.8341672319498585, "language_loss": 0.68915665, "learning_rate": 1.761164038992602e-08, "loss": 0.71056712, "num_input_tokens_seen": 344174720, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 15950, "time_per_iteration": 2.4795334339141846 }, { "auxiliary_loss_clip": 0.01102559, "auxiliary_loss_mlp": 0.01032863, "balance_loss_clip": 1.02152634, "balance_loss_mlp": 1.03436613, "epoch": 0.9590260033067789, "flos": 23514558894720.0, "grad_norm": 1.7001208693002932, "language_loss": 0.85945535, "learning_rate": 1.7560106870089687e-08, "loss": 0.88080955, "num_input_tokens_seen": 344192580, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6796875, "step": 15951, "time_per_iteration": 2.4799139499664307 }, { "auxiliary_loss_clip": 0.01108173, "auxiliary_loss_mlp": 0.0103566, "balance_loss_clip": 1.02318454, "balance_loss_mlp": 1.03590739, "epoch": 0.9590861265594469, "flos": 25520349237120.0, "grad_norm": 3.144279388443274, "language_loss": 0.80203015, "learning_rate": 1.7508648524572568e-08, "loss": 0.82346851, "num_input_tokens_seen": 344210345, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.72265625, "step": 15952, "time_per_iteration": 2.4854421615600586 }, { "auxiliary_loss_clip": 0.01104708, "auxiliary_loss_mlp": 0.01030716, "balance_loss_clip": 1.01800776, "balance_loss_mlp": 1.03574121, "epoch": 0.9591462498121148, "flos": 21179323987200.0, "grad_norm": 1.6991758217094945, "language_loss": 0.69654131, "learning_rate": 1.7457265355326434e-08, "loss": 0.71789557, "num_input_tokens_seen": 344229540, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 15953, "time_per_iteration": 3.9897279739379883 }, { "auxiliary_loss_clip": 0.01106174, "auxiliary_loss_mlp": 0.01028943, "balance_loss_clip": 1.01577079, "balance_loss_mlp": 1.03612518, "epoch": 0.9592063730647828, "flos": 21723047136000.0, "grad_norm": 3.653526917344663, "language_loss": 0.58076823, "learning_rate": 1.7405957364299285e-08, "loss": 0.60211933, "num_input_tokens_seen": 344247830, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69921875, "step": 15954, "time_per_iteration": 2.4452242851257324 }, { "auxiliary_loss_clip": 0.01104611, "auxiliary_loss_mlp": 0.01035341, "balance_loss_clip": 1.02175105, "balance_loss_mlp": 1.03444886, "epoch": 0.9592664963174508, "flos": 29891395278720.0, "grad_norm": 2.7047275563458935, "language_loss": 0.7391417, "learning_rate": 1.7354724553437117e-08, "loss": 0.7605412, "num_input_tokens_seen": 344267760, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.703125, "step": 15955, "time_per_iteration": 2.52215576171875 }, { "auxiliary_loss_clip": 0.01104685, "auxiliary_loss_mlp": 0.01034831, "balance_loss_clip": 1.0221827, "balance_loss_mlp": 1.03478193, "epoch": 0.9593266195701188, "flos": 17999613354240.0, "grad_norm": 1.9497997733928574, "language_loss": 0.62909919, "learning_rate": 1.7303566924682378e-08, "loss": 0.65049434, "num_input_tokens_seen": 344284905, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 15956, "time_per_iteration": 2.4132604598999023 }, { "auxiliary_loss_clip": 0.01103676, "auxiliary_loss_mlp": 0.01029674, "balance_loss_clip": 1.01716304, "balance_loss_mlp": 1.03485727, "epoch": 0.9593867428227867, "flos": 18838271076480.0, "grad_norm": 2.0995136997413204, "language_loss": 0.59813535, "learning_rate": 1.725248447997507e-08, "loss": 0.61946881, "num_input_tokens_seen": 344302025, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 15957, "time_per_iteration": 3.8303699493408203 }, { "auxiliary_loss_clip": 0.01105435, "auxiliary_loss_mlp": 0.01034291, "balance_loss_clip": 1.02170825, "balance_loss_mlp": 1.03561401, "epoch": 0.9594468660754547, "flos": 29567050444800.0, "grad_norm": 3.0521975041861324, "language_loss": 0.74366677, "learning_rate": 1.7201477221252314e-08, "loss": 0.765064, "num_input_tokens_seen": 344321935, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.69921875, "step": 15958, "time_per_iteration": 2.5453596115112305 }, { "auxiliary_loss_clip": 0.01102333, "auxiliary_loss_mlp": 0.01026163, "balance_loss_clip": 1.01406384, "balance_loss_mlp": 1.03395772, "epoch": 0.9595069893281226, "flos": 20703256104960.0, "grad_norm": 1.845225929874108, "language_loss": 0.74554038, "learning_rate": 1.7150545150448116e-08, "loss": 0.76682532, "num_input_tokens_seen": 344340405, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 15959, "time_per_iteration": 2.4794507026672363 }, { "auxiliary_loss_clip": 0.0110741, "auxiliary_loss_mlp": 0.01032645, "balance_loss_clip": 1.02016354, "balance_loss_mlp": 1.03686023, "epoch": 0.9595671125807906, "flos": 22453613856000.0, "grad_norm": 2.2210058395019825, "language_loss": 0.65405905, "learning_rate": 1.7099688269493816e-08, "loss": 0.67545956, "num_input_tokens_seen": 344359925, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 15960, "time_per_iteration": 2.507780075073242 }, { "auxiliary_loss_clip": 0.01101807, "auxiliary_loss_mlp": 0.010369, "balance_loss_clip": 1.02434707, "balance_loss_mlp": 1.03528249, "epoch": 0.9596272358334585, "flos": 23915214172800.0, "grad_norm": 1.6488243562556772, "language_loss": 0.78043199, "learning_rate": 1.7048906580318544e-08, "loss": 0.80181909, "num_input_tokens_seen": 344379100, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6640625, "step": 15961, "time_per_iteration": 3.9440488815307617 }, { "auxiliary_loss_clip": 0.01101966, "auxiliary_loss_mlp": 0.01029428, "balance_loss_clip": 1.0175612, "balance_loss_mlp": 1.03462946, "epoch": 0.9596873590861266, "flos": 17672539086720.0, "grad_norm": 1.9435067053632062, "language_loss": 0.76014715, "learning_rate": 1.699820008484698e-08, "loss": 0.78146106, "num_input_tokens_seen": 344396895, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.671875, "step": 15962, "time_per_iteration": 3.869480609893799 }, { "auxiliary_loss_clip": 0.01106936, "auxiliary_loss_mlp": 0.01030879, "balance_loss_clip": 1.01785541, "balance_loss_mlp": 1.0354352, "epoch": 0.9597474823387945, "flos": 25808532053760.0, "grad_norm": 2.1653233508053225, "language_loss": 0.71145105, "learning_rate": 1.6947568785002698e-08, "loss": 0.73282921, "num_input_tokens_seen": 344415115, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 15963, "time_per_iteration": 2.4879872798919678 }, { "auxiliary_loss_clip": 0.01101169, "auxiliary_loss_mlp": 0.01029272, "balance_loss_clip": 1.01799512, "balance_loss_mlp": 1.03630304, "epoch": 0.9598076055914625, "flos": 23768519028480.0, "grad_norm": 2.6681423501035613, "language_loss": 0.74103856, "learning_rate": 1.689701268270527e-08, "loss": 0.76234299, "num_input_tokens_seen": 344435185, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6484375, "step": 15964, "time_per_iteration": 2.4907138347625732 }, { "auxiliary_loss_clip": 0.01028235, "auxiliary_loss_mlp": 0.01001269, "balance_loss_clip": 1.00024343, "balance_loss_mlp": 1.00602221, "epoch": 0.9598677288441305, "flos": 56515962464640.0, "grad_norm": 0.8788299070986572, "language_loss": 0.57519686, "learning_rate": 1.684653177987161e-08, "loss": 0.59549189, "num_input_tokens_seen": 344488950, "router_z_loss_clip": 0.01025391, "router_z_loss_mlp": 0.22265625, "step": 15965, "time_per_iteration": 3.0731046199798584 }, { "auxiliary_loss_clip": 0.0110397, "auxiliary_loss_mlp": 0.01029765, "balance_loss_clip": 1.01833916, "balance_loss_mlp": 1.03437543, "epoch": 0.9599278520967984, "flos": 22997480659200.0, "grad_norm": 3.275657762612661, "language_loss": 0.7894212, "learning_rate": 1.6796126078416627e-08, "loss": 0.81075859, "num_input_tokens_seen": 344506740, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6953125, "step": 15966, "time_per_iteration": 2.4646050930023193 }, { "auxiliary_loss_clip": 0.01099654, "auxiliary_loss_mlp": 0.01027831, "balance_loss_clip": 1.01573765, "balance_loss_mlp": 1.03279614, "epoch": 0.9599879753494664, "flos": 23039676161280.0, "grad_norm": 2.271843979137827, "language_loss": 0.79414403, "learning_rate": 1.674579558025102e-08, "loss": 0.8154189, "num_input_tokens_seen": 344526670, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.66796875, "step": 15967, "time_per_iteration": 2.4537625312805176 }, { "auxiliary_loss_clip": 0.01107679, "auxiliary_loss_mlp": 0.01028436, "balance_loss_clip": 1.01543033, "balance_loss_mlp": 1.03677821, "epoch": 0.9600480986021344, "flos": 16392287560320.0, "grad_norm": 2.078256521772493, "language_loss": 0.80670732, "learning_rate": 1.669554028728348e-08, "loss": 0.82806849, "num_input_tokens_seen": 344541995, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 15968, "time_per_iteration": 2.412681818008423 }, { "auxiliary_loss_clip": 0.01108204, "auxiliary_loss_mlp": 0.0103651, "balance_loss_clip": 1.02258015, "balance_loss_mlp": 1.0370487, "epoch": 0.9601082218548024, "flos": 24276439296000.0, "grad_norm": 4.754703880936643, "language_loss": 0.68005919, "learning_rate": 1.6645360201420044e-08, "loss": 0.70150638, "num_input_tokens_seen": 344559980, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.7109375, "step": 15969, "time_per_iteration": 2.520016670227051 }, { "auxiliary_loss_clip": 0.0110275, "auxiliary_loss_mlp": 0.01034985, "balance_loss_clip": 1.02358234, "balance_loss_mlp": 1.03555679, "epoch": 0.9601683451074703, "flos": 19609991804160.0, "grad_norm": 2.546756036598241, "language_loss": 0.79410285, "learning_rate": 1.6595255324563186e-08, "loss": 0.81548023, "num_input_tokens_seen": 344577765, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.671875, "step": 15970, "time_per_iteration": 2.470839738845825 }, { "auxiliary_loss_clip": 0.01101822, "auxiliary_loss_mlp": 0.01033977, "balance_loss_clip": 1.021734, "balance_loss_mlp": 1.03566504, "epoch": 0.9602284683601383, "flos": 26651104358400.0, "grad_norm": 1.665820472059124, "language_loss": 0.77310389, "learning_rate": 1.654522565861316e-08, "loss": 0.79446185, "num_input_tokens_seen": 344597650, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.66015625, "step": 15971, "time_per_iteration": 2.513909101486206 }, { "auxiliary_loss_clip": 0.01106318, "auxiliary_loss_mlp": 0.01028078, "balance_loss_clip": 1.01560283, "balance_loss_mlp": 1.03427863, "epoch": 0.9602885916128062, "flos": 15554096714880.0, "grad_norm": 2.2310546747629907, "language_loss": 0.67548621, "learning_rate": 1.64952712054669e-08, "loss": 0.69683015, "num_input_tokens_seen": 344613580, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.71875, "step": 15972, "time_per_iteration": 2.435153007507324 }, { "auxiliary_loss_clip": 0.0110123, "auxiliary_loss_mlp": 0.01028481, "balance_loss_clip": 1.01625609, "balance_loss_mlp": 1.03344381, "epoch": 0.9603487148654742, "flos": 16502353810560.0, "grad_norm": 2.4012629356823716, "language_loss": 0.76051295, "learning_rate": 1.644539196701844e-08, "loss": 0.78180999, "num_input_tokens_seen": 344626910, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6796875, "step": 15973, "time_per_iteration": 2.401559352874756 }, { "auxiliary_loss_clip": 0.01105028, "auxiliary_loss_mlp": 0.01035913, "balance_loss_clip": 1.02386141, "balance_loss_mlp": 1.03762054, "epoch": 0.9604088381181421, "flos": 20845354308480.0, "grad_norm": 1.6392134598256989, "language_loss": 0.69140851, "learning_rate": 1.639558794515983e-08, "loss": 0.71281791, "num_input_tokens_seen": 344644330, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.67578125, "step": 15974, "time_per_iteration": 2.438444137573242 }, { "auxiliary_loss_clip": 0.01104082, "auxiliary_loss_mlp": 0.01028939, "balance_loss_clip": 1.01604629, "balance_loss_mlp": 1.03371298, "epoch": 0.9604689613708102, "flos": 19683105937920.0, "grad_norm": 1.79234120234273, "language_loss": 0.6819132, "learning_rate": 1.6345859141779105e-08, "loss": 0.70324337, "num_input_tokens_seen": 344663910, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 15975, "time_per_iteration": 2.452404022216797 }, { "auxiliary_loss_clip": 0.01100118, "auxiliary_loss_mlp": 0.01029809, "balance_loss_clip": 1.01737547, "balance_loss_mlp": 1.03465533, "epoch": 0.9605290846234781, "flos": 24097568544000.0, "grad_norm": 2.3430879720718645, "language_loss": 0.55985451, "learning_rate": 1.6296205558762322e-08, "loss": 0.58115375, "num_input_tokens_seen": 344682320, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.65625, "step": 15976, "time_per_iteration": 2.456951141357422 }, { "auxiliary_loss_clip": 0.01099448, "auxiliary_loss_mlp": 0.0102593, "balance_loss_clip": 1.0145154, "balance_loss_mlp": 1.03261733, "epoch": 0.9605892078761461, "flos": 27122575299840.0, "grad_norm": 1.836589324333503, "language_loss": 0.68472952, "learning_rate": 1.624662719799219e-08, "loss": 0.70598322, "num_input_tokens_seen": 344701355, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66796875, "step": 15977, "time_per_iteration": 2.512943744659424 }, { "auxiliary_loss_clip": 0.01102777, "auxiliary_loss_mlp": 0.01036701, "balance_loss_clip": 1.02427936, "balance_loss_mlp": 1.03397524, "epoch": 0.9606493311288141, "flos": 14136918543360.0, "grad_norm": 2.6362135419664185, "language_loss": 0.82370067, "learning_rate": 1.6197124061348766e-08, "loss": 0.8450954, "num_input_tokens_seen": 344717980, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 15978, "time_per_iteration": 2.401398181915283 }, { "auxiliary_loss_clip": 0.01106316, "auxiliary_loss_mlp": 0.01030684, "balance_loss_clip": 1.01813149, "balance_loss_mlp": 1.03528714, "epoch": 0.960709454381482, "flos": 15813336147840.0, "grad_norm": 2.3402492728813082, "language_loss": 0.83675158, "learning_rate": 1.614769615070921e-08, "loss": 0.85812157, "num_input_tokens_seen": 344733480, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 15979, "time_per_iteration": 2.407437801361084 }, { "auxiliary_loss_clip": 0.01103412, "auxiliary_loss_mlp": 0.01036679, "balance_loss_clip": 1.02557492, "balance_loss_mlp": 1.03394055, "epoch": 0.96076957763415, "flos": 22565403959040.0, "grad_norm": 1.59964938271318, "language_loss": 0.80196762, "learning_rate": 1.6098343467947805e-08, "loss": 0.82336849, "num_input_tokens_seen": 344752130, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6953125, "step": 15980, "time_per_iteration": 2.4474565982818604 }, { "auxiliary_loss_clip": 0.01104408, "auxiliary_loss_mlp": 0.01028835, "balance_loss_clip": 1.01633644, "balance_loss_mlp": 1.03412676, "epoch": 0.960829700886818, "flos": 24681260551680.0, "grad_norm": 1.9386946399826877, "language_loss": 0.68551272, "learning_rate": 1.6049066014935942e-08, "loss": 0.70684516, "num_input_tokens_seen": 344771195, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 15981, "time_per_iteration": 2.480435609817505 }, { "auxiliary_loss_clip": 0.01101444, "auxiliary_loss_mlp": 0.01025534, "balance_loss_clip": 1.0142746, "balance_loss_mlp": 1.03419733, "epoch": 0.960889824139486, "flos": 26542223256960.0, "grad_norm": 1.5031008428484185, "language_loss": 0.69500363, "learning_rate": 1.5999863793542344e-08, "loss": 0.71627343, "num_input_tokens_seen": 344793150, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.671875, "step": 15982, "time_per_iteration": 2.4953644275665283 }, { "auxiliary_loss_clip": 0.01028199, "auxiliary_loss_mlp": 0.01000949, "balance_loss_clip": 0.99994165, "balance_loss_mlp": 1.00593078, "epoch": 0.9609499473921539, "flos": 71114942586240.0, "grad_norm": 0.6693058714286211, "language_loss": 0.53240585, "learning_rate": 1.595073680563286e-08, "loss": 0.5526973, "num_input_tokens_seen": 344852855, "router_z_loss_clip": 0.0100708, "router_z_loss_mlp": 0.22265625, "step": 15983, "time_per_iteration": 3.2085015773773193 }, { "auxiliary_loss_clip": 0.01103256, "auxiliary_loss_mlp": 0.01035471, "balance_loss_clip": 1.02331781, "balance_loss_mlp": 1.03511858, "epoch": 0.9610100706448219, "flos": 20552466810240.0, "grad_norm": 2.66577678772797, "language_loss": 0.68069726, "learning_rate": 1.5901685053070212e-08, "loss": 0.70208454, "num_input_tokens_seen": 344869830, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6796875, "step": 15984, "time_per_iteration": 2.4214046001434326 }, { "auxiliary_loss_clip": 0.01100116, "auxiliary_loss_mlp": 0.01032992, "balance_loss_clip": 1.02178645, "balance_loss_mlp": 1.03454769, "epoch": 0.9610701938974898, "flos": 14064199459200.0, "grad_norm": 2.4364490790256235, "language_loss": 0.67380273, "learning_rate": 1.5852708537714477e-08, "loss": 0.69513381, "num_input_tokens_seen": 344888905, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.65625, "step": 15985, "time_per_iteration": 2.4315640926361084 }, { "auxiliary_loss_clip": 0.01106182, "auxiliary_loss_mlp": 0.01029859, "balance_loss_clip": 1.01760411, "balance_loss_mlp": 1.03664303, "epoch": 0.9611303171501578, "flos": 20229989483520.0, "grad_norm": 2.117098644687345, "language_loss": 0.78956151, "learning_rate": 1.580380726142283e-08, "loss": 0.81092191, "num_input_tokens_seen": 344907160, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6953125, "step": 15986, "time_per_iteration": 2.4280972480773926 }, { "auxiliary_loss_clip": 0.01104967, "auxiliary_loss_mlp": 0.01030922, "balance_loss_clip": 1.01746392, "balance_loss_mlp": 1.03652883, "epoch": 0.9611904404028258, "flos": 20951075013120.0, "grad_norm": 2.9266743787609792, "language_loss": 0.64284205, "learning_rate": 1.5754981226049792e-08, "loss": 0.66420096, "num_input_tokens_seen": 344922400, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.68359375, "step": 15987, "time_per_iteration": 2.4228949546813965 }, { "auxiliary_loss_clip": 0.01099942, "auxiliary_loss_mlp": 0.01029667, "balance_loss_clip": 1.01833653, "balance_loss_mlp": 1.03484046, "epoch": 0.9612505636554938, "flos": 24827740214400.0, "grad_norm": 1.9492973899486274, "language_loss": 0.66921675, "learning_rate": 1.5706230433446544e-08, "loss": 0.69051284, "num_input_tokens_seen": 344941910, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6484375, "step": 15988, "time_per_iteration": 2.4633965492248535 }, { "auxiliary_loss_clip": 0.01102768, "auxiliary_loss_mlp": 0.01039494, "balance_loss_clip": 1.02797794, "balance_loss_mlp": 1.03416455, "epoch": 0.9613106869081617, "flos": 17164977955200.0, "grad_norm": 1.8977326205643532, "language_loss": 0.74966896, "learning_rate": 1.5657554885462055e-08, "loss": 0.77109152, "num_input_tokens_seen": 344960020, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6875, "step": 15989, "time_per_iteration": 2.4245622158050537 }, { "auxiliary_loss_clip": 0.01028154, "auxiliary_loss_mlp": 0.01000788, "balance_loss_clip": 0.99978113, "balance_loss_mlp": 1.00577092, "epoch": 0.9613708101608297, "flos": 61563818522880.0, "grad_norm": 0.8372641278149787, "language_loss": 0.63145071, "learning_rate": 1.5608954583941737e-08, "loss": 0.65174013, "num_input_tokens_seen": 345018290, "router_z_loss_clip": 0.0100708, "router_z_loss_mlp": 0.22460938, "step": 15990, "time_per_iteration": 2.9812002182006836 }, { "auxiliary_loss_clip": 0.01103219, "auxiliary_loss_mlp": 0.01031895, "balance_loss_clip": 1.02017128, "balance_loss_mlp": 1.03433561, "epoch": 0.9614309334134977, "flos": 27417904922880.0, "grad_norm": 2.921584490718105, "language_loss": 0.7783106, "learning_rate": 1.5560429530729003e-08, "loss": 0.79966176, "num_input_tokens_seen": 345040235, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 15991, "time_per_iteration": 2.4936611652374268 }, { "auxiliary_loss_clip": 0.01109558, "auxiliary_loss_mlp": 0.01032934, "balance_loss_clip": 1.01983881, "balance_loss_mlp": 1.03589308, "epoch": 0.9614910566661656, "flos": 22819148611200.0, "grad_norm": 2.445301743035091, "language_loss": 0.85245359, "learning_rate": 1.5511979727663493e-08, "loss": 0.87387848, "num_input_tokens_seen": 345054540, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 15992, "time_per_iteration": 2.4370667934417725 }, { "auxiliary_loss_clip": 0.01103537, "auxiliary_loss_mlp": 0.01032567, "balance_loss_clip": 1.01966286, "balance_loss_mlp": 1.03425264, "epoch": 0.9615511799188337, "flos": 20667812359680.0, "grad_norm": 2.1785249099867796, "language_loss": 0.71842706, "learning_rate": 1.5463605176582406e-08, "loss": 0.73978812, "num_input_tokens_seen": 345074035, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 15993, "time_per_iteration": 2.4505770206451416 }, { "auxiliary_loss_clip": 0.01102916, "auxiliary_loss_mlp": 0.01031313, "balance_loss_clip": 1.01897502, "balance_loss_mlp": 1.03347385, "epoch": 0.9616113031715016, "flos": 33149212035840.0, "grad_norm": 1.7303557469279915, "language_loss": 0.68016839, "learning_rate": 1.5415305879320716e-08, "loss": 0.70151067, "num_input_tokens_seen": 345099270, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 15994, "time_per_iteration": 2.58807635307312 }, { "auxiliary_loss_clip": 0.01104567, "auxiliary_loss_mlp": 0.01031335, "balance_loss_clip": 1.01873422, "balance_loss_mlp": 1.03616428, "epoch": 0.9616714264241696, "flos": 25009807276800.0, "grad_norm": 1.9504510440226457, "language_loss": 0.8447578, "learning_rate": 1.5367081837709183e-08, "loss": 0.86611676, "num_input_tokens_seen": 345116975, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 15995, "time_per_iteration": 4.0096213817596436 }, { "auxiliary_loss_clip": 0.01105079, "auxiliary_loss_mlp": 0.01032846, "balance_loss_clip": 1.01972103, "balance_loss_mlp": 1.03445101, "epoch": 0.9617315496768375, "flos": 13547480359680.0, "grad_norm": 2.0763549764179627, "language_loss": 0.76086366, "learning_rate": 1.5318933053576788e-08, "loss": 0.78224295, "num_input_tokens_seen": 345133645, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.70703125, "step": 15996, "time_per_iteration": 2.4551968574523926 }, { "auxiliary_loss_clip": 0.01101316, "auxiliary_loss_mlp": 0.01034447, "balance_loss_clip": 1.02181637, "balance_loss_mlp": 1.03322256, "epoch": 0.9617916729295055, "flos": 11254512781440.0, "grad_norm": 2.2269539027734875, "language_loss": 0.76959884, "learning_rate": 1.52708595287494e-08, "loss": 0.79095644, "num_input_tokens_seen": 345150740, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 15997, "time_per_iteration": 2.4020557403564453 }, { "auxiliary_loss_clip": 0.01098818, "auxiliary_loss_mlp": 0.01027015, "balance_loss_clip": 1.01589262, "balance_loss_mlp": 1.03304088, "epoch": 0.9618517961821734, "flos": 22819723228800.0, "grad_norm": 5.92223296359378, "language_loss": 0.67574334, "learning_rate": 1.522286126505001e-08, "loss": 0.69700158, "num_input_tokens_seen": 345170365, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.65625, "step": 15998, "time_per_iteration": 2.469374179840088 }, { "auxiliary_loss_clip": 0.0110203, "auxiliary_loss_mlp": 0.01031936, "balance_loss_clip": 1.01944256, "balance_loss_mlp": 1.03392446, "epoch": 0.9619119194348414, "flos": 16617340224000.0, "grad_norm": 1.7201532011609995, "language_loss": 0.72762096, "learning_rate": 1.5174938264298498e-08, "loss": 0.74896061, "num_input_tokens_seen": 345188930, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 15999, "time_per_iteration": 3.809495687484741 }, { "auxiliary_loss_clip": 0.01099643, "auxiliary_loss_mlp": 0.01026178, "balance_loss_clip": 1.01477003, "balance_loss_mlp": 1.03412032, "epoch": 0.9619720426875094, "flos": 24535140024960.0, "grad_norm": 1.8379916800501737, "language_loss": 0.65946853, "learning_rate": 1.5127090528312514e-08, "loss": 0.68072677, "num_input_tokens_seen": 345209615, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.65625, "step": 16000, "time_per_iteration": 2.4730422496795654 }, { "auxiliary_loss_clip": 0.01103156, "auxiliary_loss_mlp": 0.01029578, "balance_loss_clip": 1.01671505, "balance_loss_mlp": 1.0340699, "epoch": 0.9620321659401774, "flos": 20632224960000.0, "grad_norm": 1.7014678704256745, "language_loss": 0.75525856, "learning_rate": 1.5079318058905723e-08, "loss": 0.77658588, "num_input_tokens_seen": 345229175, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69140625, "step": 16001, "time_per_iteration": 2.4404027462005615 }, { "auxiliary_loss_clip": 0.0110213, "auxiliary_loss_mlp": 0.01030566, "balance_loss_clip": 1.01786447, "balance_loss_mlp": 1.03375793, "epoch": 0.9620922891928453, "flos": 18515290959360.0, "grad_norm": 1.985575210911474, "language_loss": 0.68393725, "learning_rate": 1.5031620857890447e-08, "loss": 0.70526421, "num_input_tokens_seen": 345247815, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 16002, "time_per_iteration": 2.4315237998962402 }, { "auxiliary_loss_clip": 0.01104104, "auxiliary_loss_mlp": 0.01033958, "balance_loss_clip": 1.02141726, "balance_loss_mlp": 1.03637123, "epoch": 0.9621524124455133, "flos": 28767391914240.0, "grad_norm": 1.4231357599695813, "language_loss": 0.64767236, "learning_rate": 1.4983998927074804e-08, "loss": 0.66905296, "num_input_tokens_seen": 345269935, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 16003, "time_per_iteration": 5.340189218521118 }, { "auxiliary_loss_clip": 0.01104243, "auxiliary_loss_mlp": 0.01035187, "balance_loss_clip": 1.02352262, "balance_loss_mlp": 1.03564143, "epoch": 0.9622125356981813, "flos": 19098875226240.0, "grad_norm": 2.0194865970609777, "language_loss": 0.75489515, "learning_rate": 1.493645226826512e-08, "loss": 0.77628946, "num_input_tokens_seen": 345288310, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 16004, "time_per_iteration": 2.4430642127990723 }, { "auxiliary_loss_clip": 0.01102585, "auxiliary_loss_mlp": 0.01031086, "balance_loss_clip": 1.01852691, "balance_loss_mlp": 1.03518593, "epoch": 0.9622726589508492, "flos": 20302816308480.0, "grad_norm": 6.226210540092099, "language_loss": 0.79609907, "learning_rate": 1.4888980883263958e-08, "loss": 0.81743574, "num_input_tokens_seen": 345306615, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.671875, "step": 16005, "time_per_iteration": 2.469744920730591 }, { "auxiliary_loss_clip": 0.01099582, "auxiliary_loss_mlp": 0.01028353, "balance_loss_clip": 1.0170455, "balance_loss_mlp": 1.03375936, "epoch": 0.9623327822035173, "flos": 54929750889600.0, "grad_norm": 2.1762781276553964, "language_loss": 0.67554402, "learning_rate": 1.4841584773871652e-08, "loss": 0.69682336, "num_input_tokens_seen": 345331935, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.65625, "step": 16006, "time_per_iteration": 2.785064458847046 }, { "auxiliary_loss_clip": 0.01098827, "auxiliary_loss_mlp": 0.01033444, "balance_loss_clip": 1.02174973, "balance_loss_mlp": 1.03453469, "epoch": 0.9623929054561852, "flos": 21759029585280.0, "grad_norm": 1.7023492658021142, "language_loss": 0.78271616, "learning_rate": 1.479426394188521e-08, "loss": 0.80403882, "num_input_tokens_seen": 345351510, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.640625, "step": 16007, "time_per_iteration": 2.457057476043701 }, { "auxiliary_loss_clip": 0.01105468, "auxiliary_loss_mlp": 0.01032271, "balance_loss_clip": 1.01984382, "balance_loss_mlp": 1.03589129, "epoch": 0.9624530287088532, "flos": 17931563038080.0, "grad_norm": 1.9957141359146886, "language_loss": 0.67604947, "learning_rate": 1.4747018389099198e-08, "loss": 0.69742692, "num_input_tokens_seen": 345367750, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 16008, "time_per_iteration": 2.4368581771850586 }, { "auxiliary_loss_clip": 0.01106536, "auxiliary_loss_mlp": 0.01034493, "balance_loss_clip": 1.02083182, "balance_loss_mlp": 1.03602374, "epoch": 0.9625131519615211, "flos": 23253739263360.0, "grad_norm": 2.2288903785221605, "language_loss": 0.72859609, "learning_rate": 1.469984811730529e-08, "loss": 0.75000644, "num_input_tokens_seen": 345384790, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.703125, "step": 16009, "time_per_iteration": 2.46628999710083 }, { "auxiliary_loss_clip": 0.01100514, "auxiliary_loss_mlp": 0.0102908, "balance_loss_clip": 1.01732564, "balance_loss_mlp": 1.03329957, "epoch": 0.9625732752141891, "flos": 18916628595840.0, "grad_norm": 1.7946254234148764, "language_loss": 0.75550067, "learning_rate": 1.4652753128292061e-08, "loss": 0.77679664, "num_input_tokens_seen": 345403390, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.671875, "step": 16010, "time_per_iteration": 2.439497232437134 }, { "auxiliary_loss_clip": 0.01109903, "auxiliary_loss_mlp": 0.01035216, "balance_loss_clip": 1.01987338, "balance_loss_mlp": 1.03656304, "epoch": 0.962633398466857, "flos": 16252918790400.0, "grad_norm": 4.76516407129392, "language_loss": 0.69464707, "learning_rate": 1.4605733423845635e-08, "loss": 0.71609831, "num_input_tokens_seen": 345418685, "router_z_loss_clip": 0.15332031, "router_z_loss_mlp": 0.734375, "step": 16011, "time_per_iteration": 2.4492337703704834 }, { "auxiliary_loss_clip": 0.01101815, "auxiliary_loss_mlp": 0.01030839, "balance_loss_clip": 1.01929343, "balance_loss_mlp": 1.03463626, "epoch": 0.962693521719525, "flos": 54197424403200.0, "grad_norm": 1.7450082071950899, "language_loss": 0.68702257, "learning_rate": 1.4558789005748585e-08, "loss": 0.70834911, "num_input_tokens_seen": 345442380, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 16012, "time_per_iteration": 2.7724504470825195 }, { "auxiliary_loss_clip": 0.01111259, "auxiliary_loss_mlp": 0.0103337, "balance_loss_clip": 1.01975656, "balance_loss_mlp": 1.03815222, "epoch": 0.962753644972193, "flos": 33105795471360.0, "grad_norm": 3.780022374539039, "language_loss": 0.7248432, "learning_rate": 1.4511919875781264e-08, "loss": 0.74628949, "num_input_tokens_seen": 345463815, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.734375, "step": 16013, "time_per_iteration": 2.536842107772827 }, { "auxiliary_loss_clip": 0.01103606, "auxiliary_loss_mlp": 0.01029523, "balance_loss_clip": 1.01664209, "balance_loss_mlp": 1.03517938, "epoch": 0.962813768224861, "flos": 42230660837760.0, "grad_norm": 2.7108644075937267, "language_loss": 0.63603151, "learning_rate": 1.4465126035720698e-08, "loss": 0.65736282, "num_input_tokens_seen": 345484525, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.68359375, "step": 16014, "time_per_iteration": 2.6547834873199463 }, { "auxiliary_loss_clip": 0.01099913, "auxiliary_loss_mlp": 0.01029685, "balance_loss_clip": 1.01891994, "balance_loss_mlp": 1.0349828, "epoch": 0.9628738914775289, "flos": 43944677003520.0, "grad_norm": 1.5916043251943297, "language_loss": 0.71662414, "learning_rate": 1.4418407487341688e-08, "loss": 0.73792005, "num_input_tokens_seen": 345508295, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.6484375, "step": 16015, "time_per_iteration": 2.658430576324463 }, { "auxiliary_loss_clip": 0.01103107, "auxiliary_loss_mlp": 0.01029412, "balance_loss_clip": 1.0171876, "balance_loss_mlp": 1.03474545, "epoch": 0.9629340147301969, "flos": 15596184476160.0, "grad_norm": 2.0641052224754035, "language_loss": 0.77632511, "learning_rate": 1.4371764232415707e-08, "loss": 0.79765034, "num_input_tokens_seen": 345525155, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 16016, "time_per_iteration": 2.4214284420013428 }, { "auxiliary_loss_clip": 0.01028424, "auxiliary_loss_mlp": 0.01000873, "balance_loss_clip": 0.99988359, "balance_loss_mlp": 1.0061965, "epoch": 0.9629941379828649, "flos": 62951011816320.0, "grad_norm": 0.8147617678738874, "language_loss": 0.6309315, "learning_rate": 1.4325196272711337e-08, "loss": 0.65122449, "num_input_tokens_seen": 345578905, "router_z_loss_clip": 0.0098877, "router_z_loss_mlp": 0.22265625, "step": 16017, "time_per_iteration": 2.9965476989746094 }, { "auxiliary_loss_clip": 0.01104608, "auxiliary_loss_mlp": 0.01025426, "balance_loss_clip": 1.01379681, "balance_loss_mlp": 1.03515697, "epoch": 0.9630542612355328, "flos": 29899116702720.0, "grad_norm": 1.8855397231703477, "language_loss": 0.66170043, "learning_rate": 1.4278703609994502e-08, "loss": 0.6830008, "num_input_tokens_seen": 345598965, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6953125, "step": 16018, "time_per_iteration": 2.528571128845215 }, { "auxiliary_loss_clip": 0.01103042, "auxiliary_loss_mlp": 0.01034633, "balance_loss_clip": 1.0225873, "balance_loss_mlp": 1.03488207, "epoch": 0.9631143844882009, "flos": 17894575008000.0, "grad_norm": 1.9519028222786754, "language_loss": 0.79339314, "learning_rate": 1.4232286246028457e-08, "loss": 0.81476986, "num_input_tokens_seen": 345617945, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 16019, "time_per_iteration": 2.4142231941223145 }, { "auxiliary_loss_clip": 0.01099368, "auxiliary_loss_mlp": 0.01027278, "balance_loss_clip": 1.01635873, "balance_loss_mlp": 1.0326128, "epoch": 0.9631745077408688, "flos": 26139161767680.0, "grad_norm": 1.843045260347786, "language_loss": 0.71463281, "learning_rate": 1.4185944182572907e-08, "loss": 0.73589927, "num_input_tokens_seen": 345637920, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.66796875, "step": 16020, "time_per_iteration": 2.5157954692840576 }, { "auxiliary_loss_clip": 0.01104235, "auxiliary_loss_mlp": 0.01027655, "balance_loss_clip": 1.01634777, "balance_loss_mlp": 1.03553414, "epoch": 0.9632346309935368, "flos": 24973645259520.0, "grad_norm": 1.9418523293183734, "language_loss": 0.77330172, "learning_rate": 1.4139677421385331e-08, "loss": 0.79462063, "num_input_tokens_seen": 345656195, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6875, "step": 16021, "time_per_iteration": 2.4688947200775146 }, { "auxiliary_loss_clip": 0.01108857, "auxiliary_loss_mlp": 0.01031287, "balance_loss_clip": 1.01695168, "balance_loss_mlp": 1.03618479, "epoch": 0.9632947542462047, "flos": 23617226943360.0, "grad_norm": 2.8987553222972484, "language_loss": 0.64953005, "learning_rate": 1.4093485964220331e-08, "loss": 0.67093152, "num_input_tokens_seen": 345676700, "router_z_loss_clip": 0.14355469, "router_z_loss_mlp": 0.7265625, "step": 16022, "time_per_iteration": 2.4770333766937256 }, { "auxiliary_loss_clip": 0.01100089, "auxiliary_loss_mlp": 0.01031768, "balance_loss_clip": 1.02026439, "balance_loss_mlp": 1.03372943, "epoch": 0.9633548774988727, "flos": 26395599939840.0, "grad_norm": 2.371992442665119, "language_loss": 0.73493302, "learning_rate": 1.4047369812829168e-08, "loss": 0.75625157, "num_input_tokens_seen": 345696725, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 16023, "time_per_iteration": 2.4701149463653564 }, { "auxiliary_loss_clip": 0.01102261, "auxiliary_loss_mlp": 0.01025543, "balance_loss_clip": 1.01392007, "balance_loss_mlp": 1.03424346, "epoch": 0.9634150007515406, "flos": 23767728929280.0, "grad_norm": 1.6577744186024965, "language_loss": 0.81576812, "learning_rate": 1.4001328968960891e-08, "loss": 0.83704621, "num_input_tokens_seen": 345716245, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 16024, "time_per_iteration": 2.4780819416046143 }, { "auxiliary_loss_clip": 0.0110783, "auxiliary_loss_mlp": 0.01031962, "balance_loss_clip": 1.01899838, "balance_loss_mlp": 1.03578997, "epoch": 0.9634751240042086, "flos": 24135346673280.0, "grad_norm": 1.4425118368139231, "language_loss": 0.81562364, "learning_rate": 1.3955363434361212e-08, "loss": 0.83702153, "num_input_tokens_seen": 345739060, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71875, "step": 16025, "time_per_iteration": 2.4820103645324707 }, { "auxiliary_loss_clip": 0.0110489, "auxiliary_loss_mlp": 0.0102711, "balance_loss_clip": 1.01497447, "balance_loss_mlp": 1.03419757, "epoch": 0.9635352472568766, "flos": 24349086552960.0, "grad_norm": 1.7848570376919084, "language_loss": 0.770114, "learning_rate": 1.3909473210773181e-08, "loss": 0.79143405, "num_input_tokens_seen": 345758325, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.70703125, "step": 16026, "time_per_iteration": 2.4904890060424805 }, { "auxiliary_loss_clip": 0.01104215, "auxiliary_loss_mlp": 0.01033103, "balance_loss_clip": 1.01989508, "balance_loss_mlp": 1.03416753, "epoch": 0.9635953705095446, "flos": 23984772860160.0, "grad_norm": 1.7679376845137067, "language_loss": 0.63211852, "learning_rate": 1.3863658299936965e-08, "loss": 0.65349174, "num_input_tokens_seen": 345778530, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.703125, "step": 16027, "time_per_iteration": 2.4757585525512695 }, { "auxiliary_loss_clip": 0.01107621, "auxiliary_loss_mlp": 0.01027341, "balance_loss_clip": 1.01463306, "balance_loss_mlp": 1.0371927, "epoch": 0.9636554937622125, "flos": 19828436365440.0, "grad_norm": 2.1158646984261447, "language_loss": 0.8766349, "learning_rate": 1.3817918703589837e-08, "loss": 0.89798456, "num_input_tokens_seen": 345796535, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 16028, "time_per_iteration": 2.4586644172668457 }, { "auxiliary_loss_clip": 0.01027674, "auxiliary_loss_mlp": 0.01002135, "balance_loss_clip": 1.00113392, "balance_loss_mlp": 1.00540662, "epoch": 0.9637156170148805, "flos": 67435499986560.0, "grad_norm": 0.7109080766542636, "language_loss": 0.53220236, "learning_rate": 1.3772254423466412e-08, "loss": 0.55250049, "num_input_tokens_seen": 345859700, "router_z_loss_clip": 0.01000977, "router_z_loss_mlp": 0.22265625, "step": 16029, "time_per_iteration": 3.0464117527008057 }, { "auxiliary_loss_clip": 0.01104266, "auxiliary_loss_mlp": 0.01028133, "balance_loss_clip": 1.01625967, "balance_loss_mlp": 1.03474617, "epoch": 0.9637757402675484, "flos": 20300912887680.0, "grad_norm": 1.5844337802324313, "language_loss": 0.74123621, "learning_rate": 1.372666546129797e-08, "loss": 0.76256019, "num_input_tokens_seen": 345878760, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6953125, "step": 16030, "time_per_iteration": 2.465341329574585 }, { "auxiliary_loss_clip": 0.01101832, "auxiliary_loss_mlp": 0.01027482, "balance_loss_clip": 1.01572752, "balance_loss_mlp": 1.03507125, "epoch": 0.9638358635202164, "flos": 27234544970880.0, "grad_norm": 1.9762984854991066, "language_loss": 0.65858972, "learning_rate": 1.3681151818813575e-08, "loss": 0.67988282, "num_input_tokens_seen": 345900445, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.66796875, "step": 16031, "time_per_iteration": 2.4866409301757812 }, { "auxiliary_loss_clip": 0.01028187, "auxiliary_loss_mlp": 0.01001225, "balance_loss_clip": 1.00010419, "balance_loss_mlp": 1.00580549, "epoch": 0.9638959867728845, "flos": 70288998278400.0, "grad_norm": 0.8686680776255509, "language_loss": 0.60719842, "learning_rate": 1.3635713497738955e-08, "loss": 0.62749261, "num_input_tokens_seen": 345961020, "router_z_loss_clip": 0.01123047, "router_z_loss_mlp": 0.22460938, "step": 16032, "time_per_iteration": 3.1375324726104736 }, { "auxiliary_loss_clip": 0.01097823, "auxiliary_loss_mlp": 0.0102882, "balance_loss_clip": 1.01790035, "balance_loss_mlp": 1.0335629, "epoch": 0.9639561100255524, "flos": 25407517639680.0, "grad_norm": 1.8374907417120832, "language_loss": 0.66521651, "learning_rate": 1.3590350499796954e-08, "loss": 0.68648291, "num_input_tokens_seen": 345980210, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.640625, "step": 16033, "time_per_iteration": 2.4982242584228516 }, { "auxiliary_loss_clip": 0.01103544, "auxiliary_loss_mlp": 0.01031182, "balance_loss_clip": 1.01922512, "balance_loss_mlp": 1.03585339, "epoch": 0.9640162332782204, "flos": 18113881495680.0, "grad_norm": 1.8161964083689517, "language_loss": 0.65437853, "learning_rate": 1.3545062826707976e-08, "loss": 0.67572582, "num_input_tokens_seen": 345998280, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6796875, "step": 16034, "time_per_iteration": 2.4422128200531006 }, { "auxiliary_loss_clip": 0.01104498, "auxiliary_loss_mlp": 0.01030432, "balance_loss_clip": 1.01815319, "balance_loss_mlp": 1.03544974, "epoch": 0.9640763565308883, "flos": 23440295525760.0, "grad_norm": 2.537145801538884, "language_loss": 0.74117249, "learning_rate": 1.3499850480189313e-08, "loss": 0.7625218, "num_input_tokens_seen": 346015545, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69140625, "step": 16035, "time_per_iteration": 2.451974391937256 }, { "auxiliary_loss_clip": 0.01105021, "auxiliary_loss_mlp": 0.01028666, "balance_loss_clip": 1.01648307, "balance_loss_mlp": 1.03702927, "epoch": 0.9641364797835563, "flos": 22419355259520.0, "grad_norm": 3.5746819482066283, "language_loss": 0.81850529, "learning_rate": 1.3454713461955591e-08, "loss": 0.83984214, "num_input_tokens_seen": 346034055, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6796875, "step": 16036, "time_per_iteration": 3.9571964740753174 }, { "auxiliary_loss_clip": 0.01102063, "auxiliary_loss_mlp": 0.01028594, "balance_loss_clip": 1.01610696, "balance_loss_mlp": 1.03335619, "epoch": 0.9641966030362242, "flos": 30622357048320.0, "grad_norm": 3.148973869347377, "language_loss": 0.6988405, "learning_rate": 1.340965177371789e-08, "loss": 0.72014713, "num_input_tokens_seen": 346054130, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 16037, "time_per_iteration": 2.5089375972747803 }, { "auxiliary_loss_clip": 0.01103624, "auxiliary_loss_mlp": 0.01028273, "balance_loss_clip": 1.01652443, "balance_loss_mlp": 1.03462696, "epoch": 0.9642567262888923, "flos": 20953122088320.0, "grad_norm": 1.8970873013637208, "language_loss": 0.62809527, "learning_rate": 1.3364665417185506e-08, "loss": 0.6494143, "num_input_tokens_seen": 346072990, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 16038, "time_per_iteration": 2.4507555961608887 }, { "auxiliary_loss_clip": 0.01104673, "auxiliary_loss_mlp": 0.01031568, "balance_loss_clip": 1.01908672, "balance_loss_mlp": 1.0348146, "epoch": 0.9643168495415602, "flos": 22639415932800.0, "grad_norm": 2.213837521273671, "language_loss": 0.71034616, "learning_rate": 1.3319754394064187e-08, "loss": 0.73170853, "num_input_tokens_seen": 346093745, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 16039, "time_per_iteration": 2.5056238174438477 }, { "auxiliary_loss_clip": 0.01105303, "auxiliary_loss_mlp": 0.01028677, "balance_loss_clip": 1.01636851, "balance_loss_mlp": 1.03585339, "epoch": 0.9643769727942282, "flos": 20266259241600.0, "grad_norm": 3.437906000124932, "language_loss": 0.73267019, "learning_rate": 1.327491870605657e-08, "loss": 0.75400996, "num_input_tokens_seen": 346110115, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 16040, "time_per_iteration": 3.819448709487915 }, { "auxiliary_loss_clip": 0.01106357, "auxiliary_loss_mlp": 0.01032354, "balance_loss_clip": 1.01987278, "balance_loss_mlp": 1.03624713, "epoch": 0.9644370960468961, "flos": 13881845088000.0, "grad_norm": 2.049757671958776, "language_loss": 0.72920501, "learning_rate": 1.3230158354863296e-08, "loss": 0.75059211, "num_input_tokens_seen": 346127165, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.703125, "step": 16041, "time_per_iteration": 2.4216179847717285 }, { "auxiliary_loss_clip": 0.01099664, "auxiliary_loss_mlp": 0.01029924, "balance_loss_clip": 1.01877761, "balance_loss_mlp": 1.03491163, "epoch": 0.9644972192995641, "flos": 17238199829760.0, "grad_norm": 2.006179797964617, "language_loss": 0.71849132, "learning_rate": 1.3185473342181674e-08, "loss": 0.73978722, "num_input_tokens_seen": 346145950, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6484375, "step": 16042, "time_per_iteration": 2.4235429763793945 }, { "auxiliary_loss_clip": 0.01104748, "auxiliary_loss_mlp": 0.01031929, "balance_loss_clip": 1.02001977, "balance_loss_mlp": 1.03385794, "epoch": 0.964557342552232, "flos": 23840340272640.0, "grad_norm": 1.629070778105524, "language_loss": 0.81294507, "learning_rate": 1.3140863669705683e-08, "loss": 0.83431184, "num_input_tokens_seen": 346165005, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.7109375, "step": 16043, "time_per_iteration": 2.47078800201416 }, { "auxiliary_loss_clip": 0.01103107, "auxiliary_loss_mlp": 0.01028285, "balance_loss_clip": 1.01695383, "balance_loss_mlp": 1.03552842, "epoch": 0.9646174658049, "flos": 21653129312640.0, "grad_norm": 1.8569589109294855, "language_loss": 0.71825659, "learning_rate": 1.3096329339127522e-08, "loss": 0.7395705, "num_input_tokens_seen": 346185095, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.67578125, "step": 16044, "time_per_iteration": 2.454280138015747 }, { "auxiliary_loss_clip": 0.01101339, "auxiliary_loss_mlp": 0.01027022, "balance_loss_clip": 1.01463056, "balance_loss_mlp": 1.03388619, "epoch": 0.9646775890575681, "flos": 17129570123520.0, "grad_norm": 1.8852102816421379, "language_loss": 0.69789648, "learning_rate": 1.3051870352135397e-08, "loss": 0.71918011, "num_input_tokens_seen": 346202580, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.671875, "step": 16045, "time_per_iteration": 5.234684467315674 }, { "auxiliary_loss_clip": 0.01105469, "auxiliary_loss_mlp": 0.01032831, "balance_loss_clip": 1.01998019, "balance_loss_mlp": 1.03541589, "epoch": 0.964737712310236, "flos": 13005732458880.0, "grad_norm": 2.0691213421286556, "language_loss": 0.75218999, "learning_rate": 1.3007486710415737e-08, "loss": 0.77357298, "num_input_tokens_seen": 346219395, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 16046, "time_per_iteration": 2.423786163330078 }, { "auxiliary_loss_clip": 0.01105566, "auxiliary_loss_mlp": 0.01035553, "balance_loss_clip": 1.02229083, "balance_loss_mlp": 1.03461015, "epoch": 0.964797835562904, "flos": 24279240556800.0, "grad_norm": 2.1752868788543243, "language_loss": 0.62796247, "learning_rate": 1.2963178415651199e-08, "loss": 0.64937365, "num_input_tokens_seen": 346239715, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 16047, "time_per_iteration": 2.455381155014038 }, { "auxiliary_loss_clip": 0.01104816, "auxiliary_loss_mlp": 0.01032062, "balance_loss_clip": 1.01968193, "balance_loss_mlp": 1.03660035, "epoch": 0.9648579588155719, "flos": 20522697413760.0, "grad_norm": 2.0858436173047497, "language_loss": 0.69068146, "learning_rate": 1.2918945469521992e-08, "loss": 0.71205032, "num_input_tokens_seen": 346258500, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 16048, "time_per_iteration": 2.4627203941345215 }, { "auxiliary_loss_clip": 0.01106524, "auxiliary_loss_mlp": 0.01029738, "balance_loss_clip": 1.01694107, "balance_loss_mlp": 1.03598499, "epoch": 0.9649180820682399, "flos": 32154844855680.0, "grad_norm": 5.269480410105386, "language_loss": 0.63767457, "learning_rate": 1.2874787873705662e-08, "loss": 0.65903723, "num_input_tokens_seen": 346279110, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 16049, "time_per_iteration": 2.5049917697906494 }, { "auxiliary_loss_clip": 0.01106522, "auxiliary_loss_mlp": 0.01028216, "balance_loss_clip": 1.01637292, "balance_loss_mlp": 1.0374651, "epoch": 0.9649782053209078, "flos": 20522589672960.0, "grad_norm": 2.0360296912772684, "language_loss": 0.70971465, "learning_rate": 1.2830705629876427e-08, "loss": 0.73106205, "num_input_tokens_seen": 346297860, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6953125, "step": 16050, "time_per_iteration": 2.4855058193206787 }, { "auxiliary_loss_clip": 0.01107029, "auxiliary_loss_mlp": 0.01035926, "balance_loss_clip": 1.02233636, "balance_loss_mlp": 1.03460717, "epoch": 0.9650383285735759, "flos": 43067953843200.0, "grad_norm": 3.71974958777835, "language_loss": 0.69796896, "learning_rate": 1.278669873970606e-08, "loss": 0.71939844, "num_input_tokens_seen": 346319860, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.7265625, "step": 16051, "time_per_iteration": 2.6526412963867188 }, { "auxiliary_loss_clip": 0.01027894, "auxiliary_loss_mlp": 0.01001418, "balance_loss_clip": 1.00035703, "balance_loss_mlp": 1.005422, "epoch": 0.9650984518262438, "flos": 61748255882880.0, "grad_norm": 0.8440113694471761, "language_loss": 0.59191906, "learning_rate": 1.2742767204863004e-08, "loss": 0.61221218, "num_input_tokens_seen": 346379025, "router_z_loss_clip": 0.01062012, "router_z_loss_mlp": 0.22460938, "step": 16052, "time_per_iteration": 3.1200554370880127 }, { "auxiliary_loss_clip": 0.01099836, "auxiliary_loss_mlp": 0.01024696, "balance_loss_clip": 1.0127151, "balance_loss_mlp": 1.03351665, "epoch": 0.9651585750789118, "flos": 29789337761280.0, "grad_norm": 2.5977378894498973, "language_loss": 0.74162751, "learning_rate": 1.2698911027013482e-08, "loss": 0.76287282, "num_input_tokens_seen": 346402250, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6640625, "step": 16053, "time_per_iteration": 2.5514652729034424 }, { "auxiliary_loss_clip": 0.01105357, "auxiliary_loss_mlp": 0.01032415, "balance_loss_clip": 1.02025008, "balance_loss_mlp": 1.03605711, "epoch": 0.9652186983315797, "flos": 16873060124160.0, "grad_norm": 2.0035880993566977, "language_loss": 0.68666124, "learning_rate": 1.2655130207820386e-08, "loss": 0.70803893, "num_input_tokens_seen": 346419555, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6953125, "step": 16054, "time_per_iteration": 2.4029488563537598 }, { "auxiliary_loss_clip": 0.0110161, "auxiliary_loss_mlp": 0.01034292, "balance_loss_clip": 1.02288985, "balance_loss_mlp": 1.03450012, "epoch": 0.9652788215842477, "flos": 31649761762560.0, "grad_norm": 1.6427722040712476, "language_loss": 0.62423301, "learning_rate": 1.2611424748943944e-08, "loss": 0.64559209, "num_input_tokens_seen": 346441245, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.671875, "step": 16055, "time_per_iteration": 2.5383594036102295 }, { "auxiliary_loss_clip": 0.01099976, "auxiliary_loss_mlp": 0.01033021, "balance_loss_clip": 1.02093315, "balance_loss_mlp": 1.03378582, "epoch": 0.9653389448369156, "flos": 24754266944640.0, "grad_norm": 1.9410885183358937, "language_loss": 0.77039087, "learning_rate": 1.2567794652041719e-08, "loss": 0.79172087, "num_input_tokens_seen": 346460065, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.66015625, "step": 16056, "time_per_iteration": 2.4670190811157227 }, { "auxiliary_loss_clip": 0.01102479, "auxiliary_loss_mlp": 0.01028873, "balance_loss_clip": 1.0172025, "balance_loss_mlp": 1.03347349, "epoch": 0.9653990680895836, "flos": 20297249700480.0, "grad_norm": 1.6331774581544065, "language_loss": 0.71373761, "learning_rate": 1.2524239918767498e-08, "loss": 0.73505116, "num_input_tokens_seen": 346478005, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6875, "step": 16057, "time_per_iteration": 2.4264144897460938 }, { "auxiliary_loss_clip": 0.01101188, "auxiliary_loss_mlp": 0.01031299, "balance_loss_clip": 1.01935434, "balance_loss_mlp": 1.03442907, "epoch": 0.9654591913422517, "flos": 22528775064960.0, "grad_norm": 1.9412154204243008, "language_loss": 0.7165668, "learning_rate": 1.2480760550773295e-08, "loss": 0.73789167, "num_input_tokens_seen": 346497575, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.66796875, "step": 16058, "time_per_iteration": 2.50557017326355 }, { "auxiliary_loss_clip": 0.01100395, "auxiliary_loss_mlp": 0.01031673, "balance_loss_clip": 1.01972795, "balance_loss_mlp": 1.03323829, "epoch": 0.9655193145949196, "flos": 26763002202240.0, "grad_norm": 1.5129233492794165, "language_loss": 0.7410987, "learning_rate": 1.2437356549708011e-08, "loss": 0.7624194, "num_input_tokens_seen": 346520000, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.671875, "step": 16059, "time_per_iteration": 2.5065431594848633 }, { "auxiliary_loss_clip": 0.0110672, "auxiliary_loss_mlp": 0.01034293, "balance_loss_clip": 1.02241397, "balance_loss_mlp": 1.03561902, "epoch": 0.9655794378475876, "flos": 41970703132800.0, "grad_norm": 2.6875584640247094, "language_loss": 0.73675096, "learning_rate": 1.239402791721722e-08, "loss": 0.75816107, "num_input_tokens_seen": 346541605, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.7109375, "step": 16060, "time_per_iteration": 2.632553815841675 }, { "auxiliary_loss_clip": 0.010995, "auxiliary_loss_mlp": 0.01027544, "balance_loss_clip": 1.01682675, "balance_loss_mlp": 1.03467572, "epoch": 0.9656395611002555, "flos": 27709427704320.0, "grad_norm": 1.7010118265181544, "language_loss": 0.76676166, "learning_rate": 1.2350774654944273e-08, "loss": 0.78803211, "num_input_tokens_seen": 346560955, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.6484375, "step": 16061, "time_per_iteration": 2.5164008140563965 }, { "auxiliary_loss_clip": 0.01027417, "auxiliary_loss_mlp": 0.01001907, "balance_loss_clip": 1.00092983, "balance_loss_mlp": 1.00517333, "epoch": 0.9656996843529235, "flos": 68968562411520.0, "grad_norm": 0.7269687822928205, "language_loss": 0.64147472, "learning_rate": 1.2307596764528749e-08, "loss": 0.66176796, "num_input_tokens_seen": 346621615, "router_z_loss_clip": 0.00976562, "router_z_loss_mlp": 0.22265625, "step": 16062, "time_per_iteration": 3.1338274478912354 }, { "auxiliary_loss_clip": 0.01098388, "auxiliary_loss_mlp": 0.01026908, "balance_loss_clip": 1.01597047, "balance_loss_mlp": 1.032637, "epoch": 0.9657598076055914, "flos": 20631327120000.0, "grad_norm": 15.696443470439505, "language_loss": 0.93439233, "learning_rate": 1.226449424760867e-08, "loss": 0.9556452, "num_input_tokens_seen": 346637460, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.65625, "step": 16063, "time_per_iteration": 2.435528516769409 }, { "auxiliary_loss_clip": 0.01105231, "auxiliary_loss_mlp": 0.01033381, "balance_loss_clip": 1.02097738, "balance_loss_mlp": 1.03634143, "epoch": 0.9658199308582595, "flos": 20448577699200.0, "grad_norm": 2.429017515725559, "language_loss": 0.82205391, "learning_rate": 1.2221467105818062e-08, "loss": 0.84344006, "num_input_tokens_seen": 346655625, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 16064, "time_per_iteration": 2.4386892318725586 }, { "auxiliary_loss_clip": 0.01106075, "auxiliary_loss_mlp": 0.01028318, "balance_loss_clip": 1.01706493, "balance_loss_mlp": 1.03843307, "epoch": 0.9658800541109274, "flos": 24718033100160.0, "grad_norm": 1.775518095837309, "language_loss": 0.84123427, "learning_rate": 1.2178515340788731e-08, "loss": 0.86257821, "num_input_tokens_seen": 346675220, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.67578125, "step": 16065, "time_per_iteration": 2.495495557785034 }, { "auxiliary_loss_clip": 0.011014, "auxiliary_loss_mlp": 0.01028442, "balance_loss_clip": 1.01604438, "balance_loss_mlp": 1.03356481, "epoch": 0.9659401773635954, "flos": 21610035970560.0, "grad_norm": 2.338870366978562, "language_loss": 0.67442983, "learning_rate": 1.2135638954149151e-08, "loss": 0.69572818, "num_input_tokens_seen": 346694710, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 16066, "time_per_iteration": 2.4548871517181396 }, { "auxiliary_loss_clip": 0.01103141, "auxiliary_loss_mlp": 0.01025095, "balance_loss_clip": 1.01348424, "balance_loss_mlp": 1.03457832, "epoch": 0.9660003006162633, "flos": 20301200196480.0, "grad_norm": 4.0416134281164515, "language_loss": 0.81904447, "learning_rate": 1.209283794752558e-08, "loss": 0.84032691, "num_input_tokens_seen": 346712645, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6875, "step": 16067, "time_per_iteration": 2.4524948596954346 }, { "auxiliary_loss_clip": 0.01101281, "auxiliary_loss_mlp": 0.01026953, "balance_loss_clip": 1.01487732, "balance_loss_mlp": 1.03404713, "epoch": 0.9660604238689313, "flos": 24461954064000.0, "grad_norm": 1.7208498212440844, "language_loss": 0.69260037, "learning_rate": 1.2050112322540496e-08, "loss": 0.71388268, "num_input_tokens_seen": 346732375, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.671875, "step": 16068, "time_per_iteration": 2.4650373458862305 }, { "auxiliary_loss_clip": 0.01097698, "auxiliary_loss_mlp": 0.01027984, "balance_loss_clip": 1.01766682, "balance_loss_mlp": 1.03369713, "epoch": 0.9661205471215992, "flos": 19864023765120.0, "grad_norm": 1.9243313786378662, "language_loss": 0.68371427, "learning_rate": 1.20074620808146e-08, "loss": 0.70497108, "num_input_tokens_seen": 346750430, "router_z_loss_clip": 0.10302734, "router_z_loss_mlp": 0.640625, "step": 16069, "time_per_iteration": 2.441521644592285 }, { "auxiliary_loss_clip": 0.01105771, "auxiliary_loss_mlp": 0.01027963, "balance_loss_clip": 1.01595855, "balance_loss_mlp": 1.03694344, "epoch": 0.9661806703742672, "flos": 20557889763840.0, "grad_norm": 1.9788422934424583, "language_loss": 0.89019936, "learning_rate": 1.1964887223964826e-08, "loss": 0.91153669, "num_input_tokens_seen": 346768455, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 16070, "time_per_iteration": 2.445417881011963 }, { "auxiliary_loss_clip": 0.01106702, "auxiliary_loss_mlp": 0.01031783, "balance_loss_clip": 1.01902747, "balance_loss_mlp": 1.03794456, "epoch": 0.9662407936269353, "flos": 21430949736960.0, "grad_norm": 1.9135603612129206, "language_loss": 0.77245671, "learning_rate": 1.1922387753605878e-08, "loss": 0.7938416, "num_input_tokens_seen": 346786530, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6875, "step": 16071, "time_per_iteration": 2.4557089805603027 }, { "auxiliary_loss_clip": 0.01102206, "auxiliary_loss_mlp": 0.01029257, "balance_loss_clip": 1.01622713, "balance_loss_mlp": 1.03448105, "epoch": 0.9663009168796032, "flos": 14902893095040.0, "grad_norm": 1.743895305571274, "language_loss": 0.65840775, "learning_rate": 1.1879963671349137e-08, "loss": 0.67972243, "num_input_tokens_seen": 346804635, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6796875, "step": 16072, "time_per_iteration": 2.4189064502716064 }, { "auxiliary_loss_clip": 0.01106084, "auxiliary_loss_mlp": 0.01031268, "balance_loss_clip": 1.01931143, "balance_loss_mlp": 1.03637397, "epoch": 0.9663610401322712, "flos": 24310877460480.0, "grad_norm": 1.629280769162032, "language_loss": 0.77362299, "learning_rate": 1.1837614978803534e-08, "loss": 0.7949965, "num_input_tokens_seen": 346823070, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 16073, "time_per_iteration": 2.48207688331604 }, { "auxiliary_loss_clip": 0.01106626, "auxiliary_loss_mlp": 0.01033106, "balance_loss_clip": 1.02083385, "balance_loss_mlp": 1.03558493, "epoch": 0.9664211633849391, "flos": 17637849527040.0, "grad_norm": 2.4067779478393163, "language_loss": 0.7630145, "learning_rate": 1.1795341677574677e-08, "loss": 0.78441185, "num_input_tokens_seen": 346841180, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.7109375, "step": 16074, "time_per_iteration": 2.4174914360046387 }, { "auxiliary_loss_clip": 0.01105734, "auxiliary_loss_mlp": 0.01031806, "balance_loss_clip": 1.01906252, "balance_loss_mlp": 1.03610468, "epoch": 0.9664812866376071, "flos": 29789409588480.0, "grad_norm": 2.244265393310534, "language_loss": 0.75560963, "learning_rate": 1.1753143769265728e-08, "loss": 0.77698505, "num_input_tokens_seen": 346864250, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 16075, "time_per_iteration": 2.536141872406006 }, { "auxiliary_loss_clip": 0.01105595, "auxiliary_loss_mlp": 0.01031685, "balance_loss_clip": 1.01959682, "balance_loss_mlp": 1.03662407, "epoch": 0.966541409890275, "flos": 14282320798080.0, "grad_norm": 4.9480931769412795, "language_loss": 0.78895909, "learning_rate": 1.171102125547696e-08, "loss": 0.81033188, "num_input_tokens_seen": 346881955, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 16076, "time_per_iteration": 2.4051666259765625 }, { "auxiliary_loss_clip": 0.01105824, "auxiliary_loss_mlp": 0.0103737, "balance_loss_clip": 1.02449536, "balance_loss_mlp": 1.03598523, "epoch": 0.9666015331429431, "flos": 19860432405120.0, "grad_norm": 1.726837169953725, "language_loss": 0.72283554, "learning_rate": 1.166897413780532e-08, "loss": 0.74426752, "num_input_tokens_seen": 346900445, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 16077, "time_per_iteration": 2.4369590282440186 }, { "auxiliary_loss_clip": 0.01102702, "auxiliary_loss_mlp": 0.01032171, "balance_loss_clip": 1.01992202, "balance_loss_mlp": 1.03389537, "epoch": 0.966661656395611, "flos": 27125951178240.0, "grad_norm": 2.049374165524448, "language_loss": 0.59351623, "learning_rate": 1.1627002417845533e-08, "loss": 0.61486495, "num_input_tokens_seen": 346920135, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 16078, "time_per_iteration": 4.002780199050903 }, { "auxiliary_loss_clip": 0.0110629, "auxiliary_loss_mlp": 0.01034182, "balance_loss_clip": 1.02140307, "balance_loss_mlp": 1.03526294, "epoch": 0.966721779648279, "flos": 21508229848320.0, "grad_norm": 1.9133175320344795, "language_loss": 0.72182304, "learning_rate": 1.158510609718899e-08, "loss": 0.74322778, "num_input_tokens_seen": 346940450, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 16079, "time_per_iteration": 2.4680895805358887 }, { "auxiliary_loss_clip": 0.01099878, "auxiliary_loss_mlp": 0.01029195, "balance_loss_clip": 1.0177505, "balance_loss_mlp": 1.03392339, "epoch": 0.9667819029009469, "flos": 23878118401920.0, "grad_norm": 5.367706762150467, "language_loss": 0.72484392, "learning_rate": 1.1543285177424644e-08, "loss": 0.74613464, "num_input_tokens_seen": 346960935, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66015625, "step": 16080, "time_per_iteration": 2.457184314727783 }, { "auxiliary_loss_clip": 0.01103974, "auxiliary_loss_mlp": 0.01028075, "balance_loss_clip": 1.01654804, "balance_loss_mlp": 1.03592443, "epoch": 0.9668420261536149, "flos": 21507224267520.0, "grad_norm": 2.5568568908239357, "language_loss": 0.7399531, "learning_rate": 1.1501539660138115e-08, "loss": 0.76127362, "num_input_tokens_seen": 346980100, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 16081, "time_per_iteration": 2.4657506942749023 }, { "auxiliary_loss_clip": 0.01102918, "auxiliary_loss_mlp": 0.01025742, "balance_loss_clip": 1.01364255, "balance_loss_mlp": 1.03380799, "epoch": 0.9669021494062828, "flos": 26687266375680.0, "grad_norm": 1.9523267504521118, "language_loss": 0.67598188, "learning_rate": 1.145986954691236e-08, "loss": 0.69726849, "num_input_tokens_seen": 347001250, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 16082, "time_per_iteration": 3.8958559036254883 }, { "auxiliary_loss_clip": 0.01100626, "auxiliary_loss_mlp": 0.01030393, "balance_loss_clip": 1.01871645, "balance_loss_mlp": 1.03365552, "epoch": 0.9669622726589508, "flos": 29825032901760.0, "grad_norm": 1.7414843141209357, "language_loss": 0.76618207, "learning_rate": 1.141827483932789e-08, "loss": 0.78749228, "num_input_tokens_seen": 347022975, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 16083, "time_per_iteration": 2.4967732429504395 }, { "auxiliary_loss_clip": 0.01105839, "auxiliary_loss_mlp": 0.01036073, "balance_loss_clip": 1.02363396, "balance_loss_mlp": 1.03591156, "epoch": 0.9670223959116189, "flos": 22922499018240.0, "grad_norm": 1.9847351125178847, "language_loss": 0.79642451, "learning_rate": 1.1376755538961669e-08, "loss": 0.81784362, "num_input_tokens_seen": 347038780, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69921875, "step": 16084, "time_per_iteration": 2.4545960426330566 }, { "auxiliary_loss_clip": 0.01105671, "auxiliary_loss_mlp": 0.01029059, "balance_loss_clip": 1.01619589, "balance_loss_mlp": 1.03455627, "epoch": 0.9670825191642868, "flos": 18624495283200.0, "grad_norm": 2.562174658290235, "language_loss": 0.67771673, "learning_rate": 1.1335311647387991e-08, "loss": 0.69906408, "num_input_tokens_seen": 347056705, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.7109375, "step": 16085, "time_per_iteration": 2.4549996852874756 }, { "auxiliary_loss_clip": 0.01109641, "auxiliary_loss_mlp": 0.01029688, "balance_loss_clip": 1.01639581, "balance_loss_mlp": 1.03732347, "epoch": 0.9671426424169548, "flos": 24497936513280.0, "grad_norm": 2.057262885557747, "language_loss": 0.69227684, "learning_rate": 1.1293943166178709e-08, "loss": 0.71367013, "num_input_tokens_seen": 347075710, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.72265625, "step": 16086, "time_per_iteration": 3.893667697906494 }, { "auxiliary_loss_clip": 0.01102591, "auxiliary_loss_mlp": 0.01029755, "balance_loss_clip": 1.01748824, "balance_loss_mlp": 1.03549659, "epoch": 0.9672027656696227, "flos": 20371189847040.0, "grad_norm": 2.491357453101444, "language_loss": 0.78377336, "learning_rate": 1.125265009690235e-08, "loss": 0.80509686, "num_input_tokens_seen": 347092325, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.671875, "step": 16087, "time_per_iteration": 3.9313814640045166 }, { "auxiliary_loss_clip": 0.01102714, "auxiliary_loss_mlp": 0.01025458, "balance_loss_clip": 1.01385272, "balance_loss_mlp": 1.0345248, "epoch": 0.9672628889222907, "flos": 18880179269760.0, "grad_norm": 1.8671795021789939, "language_loss": 0.71497804, "learning_rate": 1.1211432441124769e-08, "loss": 0.7362597, "num_input_tokens_seen": 347110595, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 16088, "time_per_iteration": 2.421428918838501 }, { "auxiliary_loss_clip": 0.01102016, "auxiliary_loss_mlp": 0.01028211, "balance_loss_clip": 1.01664782, "balance_loss_mlp": 1.03528571, "epoch": 0.9673230121749586, "flos": 28695247447680.0, "grad_norm": 1.5060876714430063, "language_loss": 0.70513409, "learning_rate": 1.117029020040916e-08, "loss": 0.72643638, "num_input_tokens_seen": 347131625, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.66796875, "step": 16089, "time_per_iteration": 2.525843858718872 }, { "auxiliary_loss_clip": 0.01105693, "auxiliary_loss_mlp": 0.01031096, "balance_loss_clip": 1.0193243, "balance_loss_mlp": 1.0355047, "epoch": 0.9673831354276267, "flos": 20484452407680.0, "grad_norm": 2.430622571770293, "language_loss": 0.74964726, "learning_rate": 1.1129223376315167e-08, "loss": 0.77101511, "num_input_tokens_seen": 347147910, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.703125, "step": 16090, "time_per_iteration": 2.440645933151245 }, { "auxiliary_loss_clip": 0.01106509, "auxiliary_loss_mlp": 0.01030393, "balance_loss_clip": 1.01810861, "balance_loss_mlp": 1.03386426, "epoch": 0.9674432586802946, "flos": 26797548107520.0, "grad_norm": 1.8384154886261654, "language_loss": 0.69028735, "learning_rate": 1.1088231970400653e-08, "loss": 0.71165639, "num_input_tokens_seen": 347168805, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7265625, "step": 16091, "time_per_iteration": 2.5011143684387207 }, { "auxiliary_loss_clip": 0.01102405, "auxiliary_loss_mlp": 0.01030332, "balance_loss_clip": 1.01805329, "balance_loss_mlp": 1.03459692, "epoch": 0.9675033819329626, "flos": 22310941034880.0, "grad_norm": 1.858416269564867, "language_loss": 0.76941061, "learning_rate": 1.1047315984219484e-08, "loss": 0.79073799, "num_input_tokens_seen": 347189455, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 16092, "time_per_iteration": 2.4669485092163086 }, { "auxiliary_loss_clip": 0.01104767, "auxiliary_loss_mlp": 0.0102757, "balance_loss_clip": 1.01601315, "balance_loss_mlp": 1.03693938, "epoch": 0.9675635051856305, "flos": 12675713276160.0, "grad_norm": 2.101464847323642, "language_loss": 0.76004338, "learning_rate": 1.1006475419323313e-08, "loss": 0.78136677, "num_input_tokens_seen": 347206030, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 16093, "time_per_iteration": 2.4201250076293945 }, { "auxiliary_loss_clip": 0.01101887, "auxiliary_loss_mlp": 0.01026694, "balance_loss_clip": 1.0133189, "balance_loss_mlp": 1.03413653, "epoch": 0.9676236284382985, "flos": 24608469640320.0, "grad_norm": 1.561198354350834, "language_loss": 0.69236016, "learning_rate": 1.096571027726112e-08, "loss": 0.71364594, "num_input_tokens_seen": 347226250, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6796875, "step": 16094, "time_per_iteration": 2.5078470706939697 }, { "auxiliary_loss_clip": 0.01105671, "auxiliary_loss_mlp": 0.01029402, "balance_loss_clip": 1.01757646, "balance_loss_mlp": 1.03479087, "epoch": 0.9676837516909664, "flos": 23367145478400.0, "grad_norm": 1.5690599102090188, "language_loss": 0.76218104, "learning_rate": 1.0925020559578557e-08, "loss": 0.78353179, "num_input_tokens_seen": 347247350, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.7109375, "step": 16095, "time_per_iteration": 2.4707460403442383 }, { "auxiliary_loss_clip": 0.01108173, "auxiliary_loss_mlp": 0.01036254, "balance_loss_clip": 1.02367139, "balance_loss_mlp": 1.03623426, "epoch": 0.9677438749436345, "flos": 20486894532480.0, "grad_norm": 2.342099287436205, "language_loss": 0.7018618, "learning_rate": 1.0884406267818392e-08, "loss": 0.72330606, "num_input_tokens_seen": 347266870, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 16096, "time_per_iteration": 2.4403016567230225 }, { "auxiliary_loss_clip": 0.01106693, "auxiliary_loss_mlp": 0.01027588, "balance_loss_clip": 1.01532769, "balance_loss_mlp": 1.03669429, "epoch": 0.9678039981963025, "flos": 47555889719040.0, "grad_norm": 1.7921577457090352, "language_loss": 0.7199856, "learning_rate": 1.0843867403520946e-08, "loss": 0.74132842, "num_input_tokens_seen": 347290120, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69921875, "step": 16097, "time_per_iteration": 2.7108078002929688 }, { "auxiliary_loss_clip": 0.0110238, "auxiliary_loss_mlp": 0.01032611, "balance_loss_clip": 1.02042186, "balance_loss_mlp": 1.03520954, "epoch": 0.9678641214489704, "flos": 25040474513280.0, "grad_norm": 1.633614246369255, "language_loss": 0.77739942, "learning_rate": 1.0803403968223434e-08, "loss": 0.79874933, "num_input_tokens_seen": 347308785, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 16098, "time_per_iteration": 2.473188638687134 }, { "auxiliary_loss_clip": 0.01100526, "auxiliary_loss_mlp": 0.01029815, "balance_loss_clip": 1.01849043, "balance_loss_mlp": 1.03371286, "epoch": 0.9679242447016384, "flos": 19240937516160.0, "grad_norm": 2.0040718591744464, "language_loss": 0.90736645, "learning_rate": 1.0763015963459965e-08, "loss": 0.92866981, "num_input_tokens_seen": 347326375, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.66796875, "step": 16099, "time_per_iteration": 2.4345626831054688 }, { "auxiliary_loss_clip": 0.01105559, "auxiliary_loss_mlp": 0.01032048, "balance_loss_clip": 1.01961398, "balance_loss_mlp": 1.03433907, "epoch": 0.9679843679543063, "flos": 33254681345280.0, "grad_norm": 2.166287487275681, "language_loss": 0.66251713, "learning_rate": 1.0722703390762643e-08, "loss": 0.6838932, "num_input_tokens_seen": 347348250, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 16100, "time_per_iteration": 2.553767442703247 }, { "auxiliary_loss_clip": 0.01105814, "auxiliary_loss_mlp": 0.01034351, "balance_loss_clip": 1.02184606, "balance_loss_mlp": 1.03637707, "epoch": 0.9680444912069743, "flos": 22783633038720.0, "grad_norm": 1.5220473210147365, "language_loss": 0.73365498, "learning_rate": 1.0682466251659584e-08, "loss": 0.75505656, "num_input_tokens_seen": 347367400, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 16101, "time_per_iteration": 2.4614171981811523 }, { "auxiliary_loss_clip": 0.01101884, "auxiliary_loss_mlp": 0.01031501, "balance_loss_clip": 1.01919901, "balance_loss_mlp": 1.03442311, "epoch": 0.9681046144596422, "flos": 24024095274240.0, "grad_norm": 1.5211548584188708, "language_loss": 0.73563349, "learning_rate": 1.0642304547676672e-08, "loss": 0.75696737, "num_input_tokens_seen": 347387600, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.67578125, "step": 16102, "time_per_iteration": 2.4633355140686035 }, { "auxiliary_loss_clip": 0.01105249, "auxiliary_loss_mlp": 0.01035119, "balance_loss_clip": 1.02196431, "balance_loss_mlp": 1.0360868, "epoch": 0.9681647377123103, "flos": 23441013797760.0, "grad_norm": 2.0045631108139457, "language_loss": 0.77470905, "learning_rate": 1.0602218280337139e-08, "loss": 0.79611266, "num_input_tokens_seen": 347406915, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6953125, "step": 16103, "time_per_iteration": 2.4834632873535156 }, { "auxiliary_loss_clip": 0.01104968, "auxiliary_loss_mlp": 0.01029026, "balance_loss_clip": 1.01785564, "balance_loss_mlp": 1.0366739, "epoch": 0.9682248609649782, "flos": 22675075159680.0, "grad_norm": 1.6289824083983795, "language_loss": 0.80583441, "learning_rate": 1.0562207451160655e-08, "loss": 0.82717443, "num_input_tokens_seen": 347425140, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.68359375, "step": 16104, "time_per_iteration": 2.456270694732666 }, { "auxiliary_loss_clip": 0.01096881, "auxiliary_loss_mlp": 0.01029805, "balance_loss_clip": 1.01915395, "balance_loss_mlp": 1.03111959, "epoch": 0.9682849842176462, "flos": 24428413739520.0, "grad_norm": 1.4727150042008639, "language_loss": 0.77648926, "learning_rate": 1.0522272061664672e-08, "loss": 0.79775608, "num_input_tokens_seen": 347446350, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.65625, "step": 16105, "time_per_iteration": 2.498138427734375 }, { "auxiliary_loss_clip": 0.01027605, "auxiliary_loss_mlp": 0.01000411, "balance_loss_clip": 0.99939197, "balance_loss_mlp": 1.00537705, "epoch": 0.9683451074703141, "flos": 59995132784640.0, "grad_norm": 0.8446854817438941, "language_loss": 0.5676083, "learning_rate": 1.0482412113363536e-08, "loss": 0.58788842, "num_input_tokens_seen": 347510135, "router_z_loss_clip": 0.01019287, "router_z_loss_mlp": 0.22265625, "step": 16106, "time_per_iteration": 3.137357473373413 }, { "auxiliary_loss_clip": 0.01027342, "auxiliary_loss_mlp": 0.01001271, "balance_loss_clip": 1.00026941, "balance_loss_mlp": 1.00509024, "epoch": 0.9684052307229821, "flos": 52696145514240.0, "grad_norm": 0.8758164457841725, "language_loss": 0.61546606, "learning_rate": 1.0442627607768707e-08, "loss": 0.6357522, "num_input_tokens_seen": 347562505, "router_z_loss_clip": 0.01000977, "router_z_loss_mlp": 0.22265625, "step": 16107, "time_per_iteration": 3.00793719291687 }, { "auxiliary_loss_clip": 0.01104999, "auxiliary_loss_mlp": 0.01032381, "balance_loss_clip": 1.01886868, "balance_loss_mlp": 1.03622007, "epoch": 0.96846535397565, "flos": 22783848520320.0, "grad_norm": 2.0057178558341295, "language_loss": 0.73602843, "learning_rate": 1.040291854638875e-08, "loss": 0.75740224, "num_input_tokens_seen": 347579150, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6875, "step": 16108, "time_per_iteration": 2.496338129043579 }, { "auxiliary_loss_clip": 0.01105528, "auxiliary_loss_mlp": 0.01027808, "balance_loss_clip": 1.01499319, "balance_loss_mlp": 1.03592873, "epoch": 0.968525477228318, "flos": 23323980309120.0, "grad_norm": 2.553677172345196, "language_loss": 0.57322371, "learning_rate": 1.0363284930729576e-08, "loss": 0.59455705, "num_input_tokens_seen": 347596705, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.6953125, "step": 16109, "time_per_iteration": 2.4647204875946045 }, { "auxiliary_loss_clip": 0.0102802, "auxiliary_loss_mlp": 0.01001271, "balance_loss_clip": 1.00027597, "balance_loss_mlp": 1.00569046, "epoch": 0.9685856004809861, "flos": 67882947707520.0, "grad_norm": 0.9543368528532313, "language_loss": 0.54229814, "learning_rate": 1.0323726762294205e-08, "loss": 0.56259102, "num_input_tokens_seen": 347661870, "router_z_loss_clip": 0.00994873, "router_z_loss_mlp": 0.22363281, "step": 16110, "time_per_iteration": 3.0673604011535645 }, { "auxiliary_loss_clip": 0.01108591, "auxiliary_loss_mlp": 0.01039103, "balance_loss_clip": 1.02556086, "balance_loss_mlp": 1.03730309, "epoch": 0.968645723733654, "flos": 33947900899200.0, "grad_norm": 1.5195276542116023, "language_loss": 0.62586963, "learning_rate": 1.0284244042582325e-08, "loss": 0.64734656, "num_input_tokens_seen": 347684295, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 16111, "time_per_iteration": 2.5822298526763916 }, { "auxiliary_loss_clip": 0.01100731, "auxiliary_loss_mlp": 0.0102823, "balance_loss_clip": 1.01734018, "balance_loss_mlp": 1.03310859, "epoch": 0.968705846986322, "flos": 18551488890240.0, "grad_norm": 27.923974066588492, "language_loss": 0.7499131, "learning_rate": 1.024483677309118e-08, "loss": 0.77120268, "num_input_tokens_seen": 347702585, "router_z_loss_clip": 0.10888672, "router_z_loss_mlp": 0.67578125, "step": 16112, "time_per_iteration": 2.421440362930298 }, { "auxiliary_loss_clip": 0.01101273, "auxiliary_loss_mlp": 0.01026249, "balance_loss_clip": 1.01472723, "balance_loss_mlp": 1.03438044, "epoch": 0.9687659702389899, "flos": 17420913336960.0, "grad_norm": 6.2116482650277876, "language_loss": 0.66809583, "learning_rate": 1.020550495531558e-08, "loss": 0.68937099, "num_input_tokens_seen": 347721810, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 16113, "time_per_iteration": 2.447709083557129 }, { "auxiliary_loss_clip": 0.01027888, "auxiliary_loss_mlp": 0.01001657, "balance_loss_clip": 1.0006144, "balance_loss_mlp": 1.00559759, "epoch": 0.9688260934916579, "flos": 62047176865920.0, "grad_norm": 0.6978031709220917, "language_loss": 0.56537962, "learning_rate": 1.0166248590746329e-08, "loss": 0.58567512, "num_input_tokens_seen": 347782330, "router_z_loss_clip": 0.01043701, "router_z_loss_mlp": 0.22265625, "step": 16114, "time_per_iteration": 3.088639259338379 }, { "auxiliary_loss_clip": 0.01104032, "auxiliary_loss_mlp": 0.0103136, "balance_loss_clip": 1.01907551, "balance_loss_mlp": 1.03560376, "epoch": 0.9688862167443258, "flos": 15076520461440.0, "grad_norm": 1.958408064907713, "language_loss": 0.82970613, "learning_rate": 1.0127067680872458e-08, "loss": 0.85106003, "num_input_tokens_seen": 347794835, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.68359375, "step": 16115, "time_per_iteration": 2.3927736282348633 }, { "auxiliary_loss_clip": 0.01097808, "auxiliary_loss_mlp": 0.01028788, "balance_loss_clip": 1.0178268, "balance_loss_mlp": 1.03377199, "epoch": 0.9689463399969939, "flos": 19938215306880.0, "grad_norm": 1.716616914155662, "language_loss": 0.72050709, "learning_rate": 1.0087962227179448e-08, "loss": 0.74177307, "num_input_tokens_seen": 347814320, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.640625, "step": 16116, "time_per_iteration": 2.451651096343994 }, { "auxiliary_loss_clip": 0.01105591, "auxiliary_loss_mlp": 0.01031698, "balance_loss_clip": 1.01946664, "balance_loss_mlp": 1.03553081, "epoch": 0.9690064632496618, "flos": 19573039687680.0, "grad_norm": 3.805279683338944, "language_loss": 0.75383174, "learning_rate": 1.0048932231150553e-08, "loss": 0.77520466, "num_input_tokens_seen": 347832125, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 16117, "time_per_iteration": 2.4490232467651367 }, { "auxiliary_loss_clip": 0.01103992, "auxiliary_loss_mlp": 0.01027652, "balance_loss_clip": 1.0147711, "balance_loss_mlp": 1.03368437, "epoch": 0.9690665865023298, "flos": 21872292145920.0, "grad_norm": 2.4084873141963197, "language_loss": 0.7772963, "learning_rate": 1.000997769426548e-08, "loss": 0.79861271, "num_input_tokens_seen": 347850765, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 16118, "time_per_iteration": 2.452383041381836 }, { "auxiliary_loss_clip": 0.01106945, "auxiliary_loss_mlp": 0.01035637, "balance_loss_clip": 1.02388859, "balance_loss_mlp": 1.0372622, "epoch": 0.9691267097549977, "flos": 20994491577600.0, "grad_norm": 1.8336539297649757, "language_loss": 0.77973443, "learning_rate": 9.971098618001272e-09, "loss": 0.80116022, "num_input_tokens_seen": 347870125, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.69921875, "step": 16119, "time_per_iteration": 2.481095314025879 }, { "auxiliary_loss_clip": 0.01099798, "auxiliary_loss_mlp": 0.01026893, "balance_loss_clip": 1.01602077, "balance_loss_mlp": 1.03496432, "epoch": 0.9691868330076657, "flos": 24279132816000.0, "grad_norm": 1.4747121588495504, "language_loss": 0.76110166, "learning_rate": 9.932295003832747e-09, "loss": 0.78236854, "num_input_tokens_seen": 347890615, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.6484375, "step": 16120, "time_per_iteration": 4.0574235916137695 }, { "auxiliary_loss_clip": 0.01102946, "auxiliary_loss_mlp": 0.01027087, "balance_loss_clip": 1.01552939, "balance_loss_mlp": 1.03531122, "epoch": 0.9692469562603336, "flos": 17675699483520.0, "grad_norm": 4.715743768275542, "language_loss": 0.69407582, "learning_rate": 9.89356685323095e-09, "loss": 0.71537614, "num_input_tokens_seen": 347908685, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.67578125, "step": 16121, "time_per_iteration": 2.407849073410034 }, { "auxiliary_loss_clip": 0.01102532, "auxiliary_loss_mlp": 0.01030005, "balance_loss_clip": 1.0182209, "balance_loss_mlp": 1.03416491, "epoch": 0.9693070795130017, "flos": 26834392483200.0, "grad_norm": 2.241763995140158, "language_loss": 0.69242227, "learning_rate": 9.854914167664486e-09, "loss": 0.71374774, "num_input_tokens_seen": 347926385, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 16122, "time_per_iteration": 2.499983310699463 }, { "auxiliary_loss_clip": 0.01101652, "auxiliary_loss_mlp": 0.01026701, "balance_loss_clip": 1.015692, "balance_loss_mlp": 1.03338718, "epoch": 0.9693672027656697, "flos": 18077288515200.0, "grad_norm": 1.721088557763244, "language_loss": 0.75924373, "learning_rate": 9.81633694859907e-09, "loss": 0.78052723, "num_input_tokens_seen": 347945290, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.68359375, "step": 16123, "time_per_iteration": 3.927419662475586 }, { "auxiliary_loss_clip": 0.01102921, "auxiliary_loss_mlp": 0.01033624, "balance_loss_clip": 1.02107716, "balance_loss_mlp": 1.03340602, "epoch": 0.9694273260183376, "flos": 21763015994880.0, "grad_norm": 1.612993345250429, "language_loss": 0.74741137, "learning_rate": 9.777835197497753e-09, "loss": 0.76877677, "num_input_tokens_seen": 347966330, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 16124, "time_per_iteration": 2.460822820663452 }, { "auxiliary_loss_clip": 0.01106312, "auxiliary_loss_mlp": 0.01034683, "balance_loss_clip": 1.02236307, "balance_loss_mlp": 1.03605938, "epoch": 0.9694874492710056, "flos": 24426115269120.0, "grad_norm": 3.1449616654621986, "language_loss": 0.74304664, "learning_rate": 9.739408915820258e-09, "loss": 0.76445657, "num_input_tokens_seen": 347982590, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 16125, "time_per_iteration": 2.504171133041382 }, { "auxiliary_loss_clip": 0.01027963, "auxiliary_loss_mlp": 0.0100003, "balance_loss_clip": 0.99901628, "balance_loss_mlp": 1.00550818, "epoch": 0.9695475725236735, "flos": 67650748237440.0, "grad_norm": 0.921319462615276, "language_loss": 0.61497438, "learning_rate": 9.70105810502364e-09, "loss": 0.63525426, "num_input_tokens_seen": 348043310, "router_z_loss_clip": 0.01013184, "router_z_loss_mlp": 0.22460938, "step": 16126, "time_per_iteration": 3.0354082584381104 }, { "auxiliary_loss_clip": 0.0110281, "auxiliary_loss_mlp": 0.01033571, "balance_loss_clip": 1.02186441, "balance_loss_mlp": 1.03595579, "epoch": 0.9696076957763415, "flos": 19129326981120.0, "grad_norm": 1.6284408655430638, "language_loss": 0.75042576, "learning_rate": 9.662782766562738e-09, "loss": 0.77178955, "num_input_tokens_seen": 348062200, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.66796875, "step": 16127, "time_per_iteration": 2.4512364864349365 }, { "auxiliary_loss_clip": 0.01105145, "auxiliary_loss_mlp": 0.010341, "balance_loss_clip": 1.02110648, "balance_loss_mlp": 1.03397763, "epoch": 0.9696678190290094, "flos": 15486836497920.0, "grad_norm": 1.6649768322817298, "language_loss": 0.69274044, "learning_rate": 9.62458290188839e-09, "loss": 0.71413291, "num_input_tokens_seen": 348080685, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7109375, "step": 16128, "time_per_iteration": 5.268558025360107 }, { "auxiliary_loss_clip": 0.01104763, "auxiliary_loss_mlp": 0.0103285, "balance_loss_clip": 1.02050614, "balance_loss_mlp": 1.03650713, "epoch": 0.9697279422816775, "flos": 36208692869760.0, "grad_norm": 3.119684744457104, "language_loss": 0.64975667, "learning_rate": 9.586458512449213e-09, "loss": 0.6711328, "num_input_tokens_seen": 348102500, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6796875, "step": 16129, "time_per_iteration": 2.5765480995178223 }, { "auxiliary_loss_clip": 0.01107172, "auxiliary_loss_mlp": 0.01032395, "balance_loss_clip": 1.01929379, "balance_loss_mlp": 1.03531957, "epoch": 0.9697880655343454, "flos": 25484007651840.0, "grad_norm": 1.8287357454688853, "language_loss": 0.63222164, "learning_rate": 9.548409599691166e-09, "loss": 0.65361738, "num_input_tokens_seen": 348122515, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.71875, "step": 16130, "time_per_iteration": 2.5198841094970703 }, { "auxiliary_loss_clip": 0.01106745, "auxiliary_loss_mlp": 0.0102921, "balance_loss_clip": 1.01709294, "balance_loss_mlp": 1.03523207, "epoch": 0.9698481887870134, "flos": 15333533251200.0, "grad_norm": 2.1135412946471255, "language_loss": 0.70117116, "learning_rate": 9.510436165056867e-09, "loss": 0.72253072, "num_input_tokens_seen": 348138775, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.71484375, "step": 16131, "time_per_iteration": 2.395972728729248 }, { "auxiliary_loss_clip": 0.0110452, "auxiliary_loss_mlp": 0.01032231, "balance_loss_clip": 1.01959538, "balance_loss_mlp": 1.03480482, "epoch": 0.9699083120396813, "flos": 21982250655360.0, "grad_norm": 1.8494970922599805, "language_loss": 0.76646054, "learning_rate": 9.472538209986058e-09, "loss": 0.78782809, "num_input_tokens_seen": 348157115, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 16132, "time_per_iteration": 2.467102527618408 }, { "auxiliary_loss_clip": 0.01107069, "auxiliary_loss_mlp": 0.01031923, "balance_loss_clip": 1.01928067, "balance_loss_mlp": 1.03693044, "epoch": 0.9699684352923493, "flos": 15664055224320.0, "grad_norm": 2.8551980258284644, "language_loss": 0.78943151, "learning_rate": 9.434715735916477e-09, "loss": 0.81082141, "num_input_tokens_seen": 348173035, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 16133, "time_per_iteration": 2.3832225799560547 }, { "auxiliary_loss_clip": 0.01100004, "auxiliary_loss_mlp": 0.01027793, "balance_loss_clip": 1.01629496, "balance_loss_mlp": 1.0336473, "epoch": 0.9700285585450172, "flos": 21908382336000.0, "grad_norm": 1.9207599320256965, "language_loss": 0.64472324, "learning_rate": 9.396968744281863e-09, "loss": 0.6660012, "num_input_tokens_seen": 348192960, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6640625, "step": 16134, "time_per_iteration": 2.4487476348876953 }, { "auxiliary_loss_clip": 0.01103319, "auxiliary_loss_mlp": 0.01031686, "balance_loss_clip": 1.01919889, "balance_loss_mlp": 1.03446865, "epoch": 0.9700886817976853, "flos": 23914890950400.0, "grad_norm": 2.21627633423252, "language_loss": 0.81141925, "learning_rate": 9.359297236513519e-09, "loss": 0.83276933, "num_input_tokens_seen": 348212805, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 16135, "time_per_iteration": 2.4625015258789062 }, { "auxiliary_loss_clip": 0.0110666, "auxiliary_loss_mlp": 0.01031698, "balance_loss_clip": 1.01855493, "balance_loss_mlp": 1.03540206, "epoch": 0.9701488050503532, "flos": 25447845634560.0, "grad_norm": 2.5404078552948492, "language_loss": 0.73315895, "learning_rate": 9.321701214040079e-09, "loss": 0.75454253, "num_input_tokens_seen": 348232900, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 16136, "time_per_iteration": 2.488983392715454 }, { "auxiliary_loss_clip": 0.01102693, "auxiliary_loss_mlp": 0.01029586, "balance_loss_clip": 1.018857, "balance_loss_mlp": 1.0351882, "epoch": 0.9702089283030212, "flos": 20590855470720.0, "grad_norm": 1.6085411587677965, "language_loss": 0.76105654, "learning_rate": 9.28418067828729e-09, "loss": 0.78237933, "num_input_tokens_seen": 348253065, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.671875, "step": 16137, "time_per_iteration": 2.473989963531494 }, { "auxiliary_loss_clip": 0.01027692, "auxiliary_loss_mlp": 0.00999888, "balance_loss_clip": 0.99884474, "balance_loss_mlp": 1.00539565, "epoch": 0.9702690515556892, "flos": 70651516291200.0, "grad_norm": 0.7975852233856408, "language_loss": 0.54882479, "learning_rate": 9.246735630678015e-09, "loss": 0.56910056, "num_input_tokens_seen": 348316075, "router_z_loss_clip": 0.01043701, "router_z_loss_mlp": 0.22265625, "step": 16138, "time_per_iteration": 3.1831324100494385 }, { "auxiliary_loss_clip": 0.01102583, "auxiliary_loss_mlp": 0.01026594, "balance_loss_clip": 1.01476252, "balance_loss_mlp": 1.03366852, "epoch": 0.9703291748083571, "flos": 35881439034240.0, "grad_norm": 1.8832600153362609, "language_loss": 0.70985276, "learning_rate": 9.209366072632007e-09, "loss": 0.73114449, "num_input_tokens_seen": 348337605, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 16139, "time_per_iteration": 2.5626494884490967 }, { "auxiliary_loss_clip": 0.01105211, "auxiliary_loss_mlp": 0.01029011, "balance_loss_clip": 1.01662517, "balance_loss_mlp": 1.03589964, "epoch": 0.9703892980610251, "flos": 24316479982080.0, "grad_norm": 1.6970721158550206, "language_loss": 0.72436237, "learning_rate": 9.172072005566134e-09, "loss": 0.74570453, "num_input_tokens_seen": 348359430, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 16140, "time_per_iteration": 2.487363576889038 }, { "auxiliary_loss_clip": 0.01107715, "auxiliary_loss_mlp": 0.01039872, "balance_loss_clip": 1.02724791, "balance_loss_mlp": 1.03633714, "epoch": 0.970449421313693, "flos": 18003743418240.0, "grad_norm": 2.563801536870731, "language_loss": 0.68384844, "learning_rate": 9.13485343089504e-09, "loss": 0.70532435, "num_input_tokens_seen": 348377890, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71484375, "step": 16141, "time_per_iteration": 2.4142637252807617 }, { "auxiliary_loss_clip": 0.01100131, "auxiliary_loss_mlp": 0.01029457, "balance_loss_clip": 1.01751769, "balance_loss_mlp": 1.03362548, "epoch": 0.9705095445663611, "flos": 25337994865920.0, "grad_norm": 2.2376353634381645, "language_loss": 0.68348211, "learning_rate": 9.097710350029597e-09, "loss": 0.70477796, "num_input_tokens_seen": 348396550, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6640625, "step": 16142, "time_per_iteration": 2.503575325012207 }, { "auxiliary_loss_clip": 0.01101433, "auxiliary_loss_mlp": 0.01027416, "balance_loss_clip": 1.01541162, "balance_loss_mlp": 1.03343654, "epoch": 0.970569667819029, "flos": 26833602384000.0, "grad_norm": 1.805166728952023, "language_loss": 0.55912173, "learning_rate": 9.060642764378457e-09, "loss": 0.58041024, "num_input_tokens_seen": 348417120, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 16143, "time_per_iteration": 2.5311942100524902 }, { "auxiliary_loss_clip": 0.01105632, "auxiliary_loss_mlp": 0.01030826, "balance_loss_clip": 1.01907206, "balance_loss_mlp": 1.03599405, "epoch": 0.970629791071697, "flos": 25848644567040.0, "grad_norm": 2.5692989795549463, "language_loss": 0.68393815, "learning_rate": 9.023650675347382e-09, "loss": 0.70530272, "num_input_tokens_seen": 348437750, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6953125, "step": 16144, "time_per_iteration": 2.4976840019226074 }, { "auxiliary_loss_clip": 0.01104434, "auxiliary_loss_mlp": 0.01039236, "balance_loss_clip": 1.02751148, "balance_loss_mlp": 1.03622293, "epoch": 0.9706899143243649, "flos": 36540184510080.0, "grad_norm": 2.6702393543188245, "language_loss": 0.72361678, "learning_rate": 8.986734084339253e-09, "loss": 0.74505353, "num_input_tokens_seen": 348460935, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.68359375, "step": 16145, "time_per_iteration": 2.583324432373047 }, { "auxiliary_loss_clip": 0.01103894, "auxiliary_loss_mlp": 0.01031031, "balance_loss_clip": 1.01771522, "balance_loss_mlp": 1.03310931, "epoch": 0.9707500375770329, "flos": 12268234414080.0, "grad_norm": 3.1492266113834493, "language_loss": 0.80730969, "learning_rate": 8.949892992753395e-09, "loss": 0.82865894, "num_input_tokens_seen": 348474480, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 16146, "time_per_iteration": 2.4284873008728027 }, { "auxiliary_loss_clip": 0.01027585, "auxiliary_loss_mlp": 0.01004207, "balance_loss_clip": 1.00309837, "balance_loss_mlp": 1.00527716, "epoch": 0.9708101608297008, "flos": 60853040196480.0, "grad_norm": 0.7647552278992633, "language_loss": 0.54531622, "learning_rate": 8.91312740198713e-09, "loss": 0.56563413, "num_input_tokens_seen": 348541220, "router_z_loss_clip": 0.0111084, "router_z_loss_mlp": 0.22265625, "step": 16147, "time_per_iteration": 3.1123273372650146 }, { "auxiliary_loss_clip": 0.01106367, "auxiliary_loss_mlp": 0.01033358, "balance_loss_clip": 1.0203104, "balance_loss_mlp": 1.03514338, "epoch": 0.9708702840823689, "flos": 27124766029440.0, "grad_norm": 15.847887467072214, "language_loss": 0.61825097, "learning_rate": 8.876437313434682e-09, "loss": 0.6396482, "num_input_tokens_seen": 348559230, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 16148, "time_per_iteration": 2.50016450881958 }, { "auxiliary_loss_clip": 0.01102228, "auxiliary_loss_mlp": 0.01037427, "balance_loss_clip": 1.02545822, "balance_loss_mlp": 1.03514493, "epoch": 0.9709304073350368, "flos": 20777699041920.0, "grad_norm": 2.0196757849786175, "language_loss": 0.74046397, "learning_rate": 8.839822728487155e-09, "loss": 0.76186049, "num_input_tokens_seen": 348577850, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.671875, "step": 16149, "time_per_iteration": 2.4440255165100098 }, { "auxiliary_loss_clip": 0.0110327, "auxiliary_loss_mlp": 0.01034234, "balance_loss_clip": 1.02191949, "balance_loss_mlp": 1.03349566, "epoch": 0.9709905305877048, "flos": 41934541115520.0, "grad_norm": 2.617152201966, "language_loss": 0.75229639, "learning_rate": 8.803283648533222e-09, "loss": 0.77367151, "num_input_tokens_seen": 348598345, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 16150, "time_per_iteration": 2.6142845153808594 }, { "auxiliary_loss_clip": 0.01111298, "auxiliary_loss_mlp": 0.01027918, "balance_loss_clip": 1.01340413, "balance_loss_mlp": 1.037606, "epoch": 0.9710506538403728, "flos": 17165588486400.0, "grad_norm": 2.0586892619949992, "language_loss": 0.73773563, "learning_rate": 8.766820074958214e-09, "loss": 0.75912786, "num_input_tokens_seen": 348616300, "router_z_loss_clip": 0.14550781, "router_z_loss_mlp": 0.734375, "step": 16151, "time_per_iteration": 2.4088640213012695 }, { "auxiliary_loss_clip": 0.01100765, "auxiliary_loss_mlp": 0.01029236, "balance_loss_clip": 1.01722002, "balance_loss_mlp": 1.03391588, "epoch": 0.9711107770930407, "flos": 21173470070400.0, "grad_norm": 1.861601129628814, "language_loss": 0.74551678, "learning_rate": 8.730432009145027e-09, "loss": 0.76681679, "num_input_tokens_seen": 348633845, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6640625, "step": 16152, "time_per_iteration": 2.4613454341888428 }, { "auxiliary_loss_clip": 0.01103418, "auxiliary_loss_mlp": 0.01033041, "balance_loss_clip": 1.0210371, "balance_loss_mlp": 1.03543985, "epoch": 0.9711709003457087, "flos": 22237072715520.0, "grad_norm": 2.277706869260519, "language_loss": 0.6703499, "learning_rate": 8.694119452473448e-09, "loss": 0.69171453, "num_input_tokens_seen": 348653070, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6796875, "step": 16153, "time_per_iteration": 2.451695203781128 }, { "auxiliary_loss_clip": 0.01103296, "auxiliary_loss_mlp": 0.01028723, "balance_loss_clip": 1.01773167, "balance_loss_mlp": 1.03424573, "epoch": 0.9712310235983767, "flos": 26213856099840.0, "grad_norm": 1.6681266423490093, "language_loss": 0.70492715, "learning_rate": 8.65788240632037e-09, "loss": 0.72624731, "num_input_tokens_seen": 348672145, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.6875, "step": 16154, "time_per_iteration": 2.4881091117858887 }, { "auxiliary_loss_clip": 0.01107855, "auxiliary_loss_mlp": 0.01032369, "balance_loss_clip": 1.01886261, "balance_loss_mlp": 1.03769553, "epoch": 0.9712911468510447, "flos": 20668171495680.0, "grad_norm": 1.8222510114088821, "language_loss": 0.81044883, "learning_rate": 8.621720872059812e-09, "loss": 0.83185112, "num_input_tokens_seen": 348690615, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.703125, "step": 16155, "time_per_iteration": 2.4368042945861816 }, { "auxiliary_loss_clip": 0.01106943, "auxiliary_loss_mlp": 0.01036451, "balance_loss_clip": 1.02322519, "balance_loss_mlp": 1.03521729, "epoch": 0.9713512701037126, "flos": 13552903313280.0, "grad_norm": 2.158553724555787, "language_loss": 0.67393208, "learning_rate": 8.58563485106334e-09, "loss": 0.69536602, "num_input_tokens_seen": 348708665, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71484375, "step": 16156, "time_per_iteration": 2.417623996734619 }, { "auxiliary_loss_clip": 0.01104568, "auxiliary_loss_mlp": 0.01030338, "balance_loss_clip": 1.01848304, "balance_loss_mlp": 1.03358281, "epoch": 0.9714113933563806, "flos": 25848752307840.0, "grad_norm": 3.0539878037955392, "language_loss": 0.90706468, "learning_rate": 8.54962434469919e-09, "loss": 0.92841375, "num_input_tokens_seen": 348726105, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.7109375, "step": 16157, "time_per_iteration": 2.4595696926116943 }, { "auxiliary_loss_clip": 0.01105604, "auxiliary_loss_mlp": 0.01027882, "balance_loss_clip": 1.01633644, "balance_loss_mlp": 1.03583968, "epoch": 0.9714715166090485, "flos": 12743081233920.0, "grad_norm": 2.0831501518237014, "language_loss": 0.72481215, "learning_rate": 8.513689354332721e-09, "loss": 0.74614704, "num_input_tokens_seen": 348743360, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6953125, "step": 16158, "time_per_iteration": 2.4135899543762207 }, { "auxiliary_loss_clip": 0.01102211, "auxiliary_loss_mlp": 0.01033268, "balance_loss_clip": 1.02094805, "balance_loss_mlp": 1.03501046, "epoch": 0.9715316398617165, "flos": 18405547931520.0, "grad_norm": 2.2406619414363065, "language_loss": 0.60318685, "learning_rate": 8.477829881326836e-09, "loss": 0.62454164, "num_input_tokens_seen": 348759045, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.671875, "step": 16159, "time_per_iteration": 2.39665150642395 }, { "auxiliary_loss_clip": 0.01099231, "auxiliary_loss_mlp": 0.01026462, "balance_loss_clip": 1.01525617, "balance_loss_mlp": 1.03440523, "epoch": 0.9715917631143844, "flos": 28913799749760.0, "grad_norm": 1.6676047656617474, "language_loss": 0.78566867, "learning_rate": 8.44204592704112e-09, "loss": 0.80692559, "num_input_tokens_seen": 348779910, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6484375, "step": 16160, "time_per_iteration": 2.5105910301208496 }, { "auxiliary_loss_clip": 0.01027822, "auxiliary_loss_mlp": 0.01002173, "balance_loss_clip": 1.0010941, "balance_loss_mlp": 1.00568914, "epoch": 0.9716518863670525, "flos": 65939712900480.0, "grad_norm": 0.8250133904044423, "language_loss": 0.54330754, "learning_rate": 8.406337492832704e-09, "loss": 0.56360745, "num_input_tokens_seen": 348838995, "router_z_loss_clip": 0.01080322, "router_z_loss_mlp": 0.22167969, "step": 16161, "time_per_iteration": 4.586691856384277 }, { "auxiliary_loss_clip": 0.01100913, "auxiliary_loss_mlp": 0.01030259, "balance_loss_clip": 1.01873112, "balance_loss_mlp": 1.03523421, "epoch": 0.9717120096197204, "flos": 17712759340800.0, "grad_norm": 1.7949211951471726, "language_loss": 0.71788704, "learning_rate": 8.3707045800554e-09, "loss": 0.73919868, "num_input_tokens_seen": 348858090, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.65625, "step": 16162, "time_per_iteration": 2.4312336444854736 }, { "auxiliary_loss_clip": 0.01101139, "auxiliary_loss_mlp": 0.01027343, "balance_loss_clip": 1.01510036, "balance_loss_mlp": 1.03358746, "epoch": 0.9717721328723884, "flos": 24463426521600.0, "grad_norm": 1.7026177375547975, "language_loss": 0.79038715, "learning_rate": 8.335147190060787e-09, "loss": 0.81167197, "num_input_tokens_seen": 348877885, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.67578125, "step": 16163, "time_per_iteration": 2.5040485858917236 }, { "auxiliary_loss_clip": 0.01102472, "auxiliary_loss_mlp": 0.01027689, "balance_loss_clip": 1.01604176, "balance_loss_mlp": 1.03563654, "epoch": 0.9718322561250564, "flos": 20776477979520.0, "grad_norm": 2.0781755636725014, "language_loss": 0.73119986, "learning_rate": 8.299665324196903e-09, "loss": 0.75250149, "num_input_tokens_seen": 348897720, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66796875, "step": 16164, "time_per_iteration": 2.4361732006073 }, { "auxiliary_loss_clip": 0.01104478, "auxiliary_loss_mlp": 0.01032498, "balance_loss_clip": 1.02024305, "balance_loss_mlp": 1.03520882, "epoch": 0.9718923793777243, "flos": 19025904746880.0, "grad_norm": 2.226285203189017, "language_loss": 0.83566296, "learning_rate": 8.264258983809114e-09, "loss": 0.85703278, "num_input_tokens_seen": 348915410, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69140625, "step": 16165, "time_per_iteration": 3.865579128265381 }, { "auxiliary_loss_clip": 0.0110369, "auxiliary_loss_mlp": 0.0102502, "balance_loss_clip": 1.01437473, "balance_loss_mlp": 1.035483, "epoch": 0.9719525026303923, "flos": 21871717528320.0, "grad_norm": 1.5885412889752628, "language_loss": 0.79162818, "learning_rate": 8.228928170240345e-09, "loss": 0.81291527, "num_input_tokens_seen": 348934335, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.6796875, "step": 16166, "time_per_iteration": 2.4552011489868164 }, { "auxiliary_loss_clip": 0.01103717, "auxiliary_loss_mlp": 0.01026289, "balance_loss_clip": 1.01472521, "balance_loss_mlp": 1.03524518, "epoch": 0.9720126258830603, "flos": 14429303251200.0, "grad_norm": 1.917990548659436, "language_loss": 0.709427, "learning_rate": 8.193672884830195e-09, "loss": 0.73072708, "num_input_tokens_seen": 348952405, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6875, "step": 16167, "time_per_iteration": 2.4377200603485107 }, { "auxiliary_loss_clip": 0.01103317, "auxiliary_loss_mlp": 0.01029949, "balance_loss_clip": 1.0185343, "balance_loss_mlp": 1.03686738, "epoch": 0.9720727491357283, "flos": 26251167352320.0, "grad_norm": 1.6821119594598617, "language_loss": 0.75463623, "learning_rate": 8.158493128915812e-09, "loss": 0.77596885, "num_input_tokens_seen": 348973580, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6640625, "step": 16168, "time_per_iteration": 2.4919016361236572 }, { "auxiliary_loss_clip": 0.01105351, "auxiliary_loss_mlp": 0.01036507, "balance_loss_clip": 1.02378094, "balance_loss_mlp": 1.0357672, "epoch": 0.9721328723883962, "flos": 22674105492480.0, "grad_norm": 2.8572342049300516, "language_loss": 0.73079163, "learning_rate": 8.123388903830797e-09, "loss": 0.7522102, "num_input_tokens_seen": 348992035, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 16169, "time_per_iteration": 2.4533913135528564 }, { "auxiliary_loss_clip": 0.01104484, "auxiliary_loss_mlp": 0.01031051, "balance_loss_clip": 1.01807475, "balance_loss_mlp": 1.03316844, "epoch": 0.9721929956410642, "flos": 28074172360320.0, "grad_norm": 2.013050218252105, "language_loss": 0.57892823, "learning_rate": 8.088360210906309e-09, "loss": 0.60028362, "num_input_tokens_seen": 349013160, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 16170, "time_per_iteration": 5.242861032485962 }, { "auxiliary_loss_clip": 0.01105513, "auxiliary_loss_mlp": 0.01029048, "balance_loss_clip": 1.01625741, "balance_loss_mlp": 1.03528345, "epoch": 0.9722531188937321, "flos": 20996251344000.0, "grad_norm": 2.3449584171325903, "language_loss": 0.71730214, "learning_rate": 8.053407051471062e-09, "loss": 0.7386477, "num_input_tokens_seen": 349033485, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 16171, "time_per_iteration": 2.445753574371338 }, { "auxiliary_loss_clip": 0.01104365, "auxiliary_loss_mlp": 0.01035345, "balance_loss_clip": 1.0235374, "balance_loss_mlp": 1.03560781, "epoch": 0.9723132421464001, "flos": 16070600332800.0, "grad_norm": 1.8840040206234017, "language_loss": 0.6899296, "learning_rate": 8.018529426850218e-09, "loss": 0.71132678, "num_input_tokens_seen": 349051705, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 16172, "time_per_iteration": 2.418041229248047 }, { "auxiliary_loss_clip": 0.01102389, "auxiliary_loss_mlp": 0.01027291, "balance_loss_clip": 1.01536965, "balance_loss_mlp": 1.03478992, "epoch": 0.972373365399068, "flos": 27745769289600.0, "grad_norm": 1.8012357523092797, "language_loss": 0.85792041, "learning_rate": 7.983727338366274e-09, "loss": 0.87921721, "num_input_tokens_seen": 349070825, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.67578125, "step": 16173, "time_per_iteration": 2.478119373321533 }, { "auxiliary_loss_clip": 0.01108451, "auxiliary_loss_mlp": 0.0103247, "balance_loss_clip": 1.01842713, "balance_loss_mlp": 1.0354799, "epoch": 0.9724334886517361, "flos": 23002939526400.0, "grad_norm": 1.9759178497075214, "language_loss": 0.64413536, "learning_rate": 7.949000787339289e-09, "loss": 0.66554457, "num_input_tokens_seen": 349089730, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.73046875, "step": 16174, "time_per_iteration": 2.479815721511841 }, { "auxiliary_loss_clip": 0.01103609, "auxiliary_loss_mlp": 0.01027724, "balance_loss_clip": 1.01616657, "balance_loss_mlp": 1.03575253, "epoch": 0.972493611904404, "flos": 25447055535360.0, "grad_norm": 5.477863484844537, "language_loss": 0.78237098, "learning_rate": 7.914349775085538e-09, "loss": 0.80368435, "num_input_tokens_seen": 349111315, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6796875, "step": 16175, "time_per_iteration": 2.4883530139923096 }, { "auxiliary_loss_clip": 0.01102577, "auxiliary_loss_mlp": 0.01034334, "balance_loss_clip": 1.02097642, "balance_loss_mlp": 1.03397107, "epoch": 0.972553735157072, "flos": 16983054547200.0, "grad_norm": 2.514578330141643, "language_loss": 0.57144439, "learning_rate": 7.879774302919307e-09, "loss": 0.59281349, "num_input_tokens_seen": 349129495, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.6875, "step": 16176, "time_per_iteration": 2.4379019737243652 }, { "auxiliary_loss_clip": 0.01104719, "auxiliary_loss_mlp": 0.01030486, "balance_loss_clip": 1.01882124, "balance_loss_mlp": 1.03737664, "epoch": 0.97261385840974, "flos": 26104651776000.0, "grad_norm": 3.290530111864211, "language_loss": 0.72487706, "learning_rate": 7.845274372151545e-09, "loss": 0.74622905, "num_input_tokens_seen": 349148850, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 16177, "time_per_iteration": 2.486752986907959 }, { "auxiliary_loss_clip": 0.01103265, "auxiliary_loss_mlp": 0.01029602, "balance_loss_clip": 1.01796174, "balance_loss_mlp": 1.03275919, "epoch": 0.9726739816624079, "flos": 25447881548160.0, "grad_norm": 2.1890097376418085, "language_loss": 0.68806332, "learning_rate": 7.810849984090984e-09, "loss": 0.70939201, "num_input_tokens_seen": 349167620, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.703125, "step": 16178, "time_per_iteration": 2.4899075031280518 }, { "auxiliary_loss_clip": 0.01104422, "auxiliary_loss_mlp": 0.01029748, "balance_loss_clip": 1.01705205, "balance_loss_mlp": 1.03397703, "epoch": 0.972734104915076, "flos": 29014923513600.0, "grad_norm": 4.327655928400076, "language_loss": 0.67082399, "learning_rate": 7.776501140042358e-09, "loss": 0.69216573, "num_input_tokens_seen": 349185845, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 16179, "time_per_iteration": 2.5010132789611816 }, { "auxiliary_loss_clip": 0.01101862, "auxiliary_loss_mlp": 0.01032949, "balance_loss_clip": 1.02128434, "balance_loss_mlp": 1.03523421, "epoch": 0.9727942281677439, "flos": 23437637919360.0, "grad_norm": 1.5748629677916715, "language_loss": 0.77286935, "learning_rate": 7.742227841308624e-09, "loss": 0.79421747, "num_input_tokens_seen": 349204525, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6640625, "step": 16180, "time_per_iteration": 2.502211093902588 }, { "auxiliary_loss_clip": 0.01105426, "auxiliary_loss_mlp": 0.01031206, "balance_loss_clip": 1.01861131, "balance_loss_mlp": 1.03374457, "epoch": 0.9728543514204119, "flos": 31724599749120.0, "grad_norm": 2.647685724003289, "language_loss": 0.76690161, "learning_rate": 7.708030089189188e-09, "loss": 0.78826791, "num_input_tokens_seen": 349228075, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 16181, "time_per_iteration": 2.5354089736938477 }, { "auxiliary_loss_clip": 0.01101582, "auxiliary_loss_mlp": 0.01031913, "balance_loss_clip": 1.02005744, "balance_loss_mlp": 1.03389955, "epoch": 0.9729144746730798, "flos": 16289368116480.0, "grad_norm": 1.8419241635800327, "language_loss": 0.63388741, "learning_rate": 7.67390788498079e-09, "loss": 0.65522242, "num_input_tokens_seen": 349246990, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.67578125, "step": 16182, "time_per_iteration": 2.440183162689209 }, { "auxiliary_loss_clip": 0.01106342, "auxiliary_loss_mlp": 0.01031562, "balance_loss_clip": 1.01914656, "balance_loss_mlp": 1.03627777, "epoch": 0.9729745979257478, "flos": 25041408266880.0, "grad_norm": 2.207260912060643, "language_loss": 0.62369138, "learning_rate": 7.639861229977507e-09, "loss": 0.64507043, "num_input_tokens_seen": 349265890, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69921875, "step": 16183, "time_per_iteration": 2.4663326740264893 }, { "auxiliary_loss_clip": 0.01101832, "auxiliary_loss_mlp": 0.01031131, "balance_loss_clip": 1.01878047, "balance_loss_mlp": 1.03499126, "epoch": 0.9730347211784157, "flos": 22638733574400.0, "grad_norm": 1.778084802004912, "language_loss": 0.78035605, "learning_rate": 7.605890125470527e-09, "loss": 0.80168563, "num_input_tokens_seen": 349285275, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.66796875, "step": 16184, "time_per_iteration": 2.470935821533203 }, { "auxiliary_loss_clip": 0.01101007, "auxiliary_loss_mlp": 0.01028431, "balance_loss_clip": 1.0161165, "balance_loss_mlp": 1.03352249, "epoch": 0.9730948444310837, "flos": 10998613313280.0, "grad_norm": 2.5698664502731847, "language_loss": 0.79249632, "learning_rate": 7.571994572747709e-09, "loss": 0.8137908, "num_input_tokens_seen": 349301515, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.67578125, "step": 16185, "time_per_iteration": 2.4054644107818604 }, { "auxiliary_loss_clip": 0.0110486, "auxiliary_loss_mlp": 0.01030034, "balance_loss_clip": 1.01799369, "balance_loss_mlp": 1.03515744, "epoch": 0.9731549676837516, "flos": 16799479113600.0, "grad_norm": 3.354096115354843, "language_loss": 0.77554452, "learning_rate": 7.538174573094469e-09, "loss": 0.79689348, "num_input_tokens_seen": 349319590, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 16186, "time_per_iteration": 2.6125426292419434 }, { "auxiliary_loss_clip": 0.01100496, "auxiliary_loss_mlp": 0.01027962, "balance_loss_clip": 1.01613688, "balance_loss_mlp": 1.0336206, "epoch": 0.9732150909364197, "flos": 21141761339520.0, "grad_norm": 1.9510143954780446, "language_loss": 0.65182835, "learning_rate": 7.504430127793337e-09, "loss": 0.67311293, "num_input_tokens_seen": 349339230, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.66796875, "step": 16187, "time_per_iteration": 2.439833402633667 }, { "auxiliary_loss_clip": 0.01100735, "auxiliary_loss_mlp": 0.01030305, "balance_loss_clip": 1.01858664, "balance_loss_mlp": 1.03290236, "epoch": 0.9732752141890876, "flos": 33727337435520.0, "grad_norm": 2.057952650333405, "language_loss": 0.80816472, "learning_rate": 7.47076123812418e-09, "loss": 0.8294751, "num_input_tokens_seen": 349361155, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 16188, "time_per_iteration": 2.5540332794189453 }, { "auxiliary_loss_clip": 0.01100603, "auxiliary_loss_mlp": 0.0103081, "balance_loss_clip": 1.01946712, "balance_loss_mlp": 1.03412426, "epoch": 0.9733353374417556, "flos": 23404384903680.0, "grad_norm": 1.9253015921217882, "language_loss": 0.78409892, "learning_rate": 7.437167905363084e-09, "loss": 0.80541301, "num_input_tokens_seen": 349379335, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6640625, "step": 16189, "time_per_iteration": 2.469255208969116 }, { "auxiliary_loss_clip": 0.01100194, "auxiliary_loss_mlp": 0.01025493, "balance_loss_clip": 1.01366186, "balance_loss_mlp": 1.03303504, "epoch": 0.9733954606944236, "flos": 39165792963840.0, "grad_norm": 2.107775375414647, "language_loss": 0.51533395, "learning_rate": 7.403650130784367e-09, "loss": 0.53659081, "num_input_tokens_seen": 349401575, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.671875, "step": 16190, "time_per_iteration": 2.5991010665893555 }, { "auxiliary_loss_clip": 0.0110323, "auxiliary_loss_mlp": 0.01029768, "balance_loss_clip": 1.01746607, "balance_loss_mlp": 1.03455746, "epoch": 0.9734555839470915, "flos": 21981819692160.0, "grad_norm": 1.7205259107465016, "language_loss": 0.81143236, "learning_rate": 7.3702079156590105e-09, "loss": 0.83276242, "num_input_tokens_seen": 349420650, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 16191, "time_per_iteration": 2.4393270015716553 }, { "auxiliary_loss_clip": 0.01101172, "auxiliary_loss_mlp": 0.01027387, "balance_loss_clip": 1.01658058, "balance_loss_mlp": 1.0338248, "epoch": 0.9735157071997596, "flos": 16575539771520.0, "grad_norm": 1.7254146400210637, "language_loss": 0.82808661, "learning_rate": 7.336841261255111e-09, "loss": 0.84937221, "num_input_tokens_seen": 349436830, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.671875, "step": 16192, "time_per_iteration": 2.4308528900146484 }, { "auxiliary_loss_clip": 0.01107232, "auxiliary_loss_mlp": 0.01028889, "balance_loss_clip": 1.01709318, "balance_loss_mlp": 1.03773224, "epoch": 0.9735758304524275, "flos": 20223237726720.0, "grad_norm": 3.221174672718499, "language_loss": 0.74837303, "learning_rate": 7.303550168837658e-09, "loss": 0.76973426, "num_input_tokens_seen": 349454325, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 16193, "time_per_iteration": 2.464613437652588 }, { "auxiliary_loss_clip": 0.01099505, "auxiliary_loss_mlp": 0.0102878, "balance_loss_clip": 1.01777732, "balance_loss_mlp": 1.03359032, "epoch": 0.9736359537050955, "flos": 23653353047040.0, "grad_norm": 2.404070714081214, "language_loss": 0.85504305, "learning_rate": 7.270334639669417e-09, "loss": 0.87632596, "num_input_tokens_seen": 349470230, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.66015625, "step": 16194, "time_per_iteration": 2.462925672531128 }, { "auxiliary_loss_clip": 0.01101527, "auxiliary_loss_mlp": 0.01032472, "balance_loss_clip": 1.02092075, "balance_loss_mlp": 1.03572035, "epoch": 0.9736960769577634, "flos": 15560202026880.0, "grad_norm": 1.8298074068495227, "language_loss": 0.76194918, "learning_rate": 7.237194675009828e-09, "loss": 0.78328919, "num_input_tokens_seen": 349486250, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.65625, "step": 16195, "time_per_iteration": 2.4071342945098877 }, { "auxiliary_loss_clip": 0.01027612, "auxiliary_loss_mlp": 0.00999296, "balance_loss_clip": 0.99827689, "balance_loss_mlp": 1.00550795, "epoch": 0.9737562002104314, "flos": 65351783088000.0, "grad_norm": 0.7796874461388104, "language_loss": 0.52421772, "learning_rate": 7.204130276115439e-09, "loss": 0.54448682, "num_input_tokens_seen": 349545865, "router_z_loss_clip": 0.01019287, "router_z_loss_mlp": 0.22070312, "step": 16196, "time_per_iteration": 3.035365581512451 }, { "auxiliary_loss_clip": 0.01103516, "auxiliary_loss_mlp": 0.01032187, "balance_loss_clip": 1.02023077, "balance_loss_mlp": 1.03568196, "epoch": 0.9738163234630993, "flos": 27196730928000.0, "grad_norm": 2.110068750889671, "language_loss": 0.76485503, "learning_rate": 7.171141444240136e-09, "loss": 0.78621209, "num_input_tokens_seen": 349566080, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 16197, "time_per_iteration": 2.5021042823791504 }, { "auxiliary_loss_clip": 0.01106336, "auxiliary_loss_mlp": 0.01025926, "balance_loss_clip": 1.01362932, "balance_loss_mlp": 1.03500938, "epoch": 0.9738764467157673, "flos": 21069365477760.0, "grad_norm": 1.836699979184248, "language_loss": 0.6748364, "learning_rate": 7.13822818063492e-09, "loss": 0.69615906, "num_input_tokens_seen": 349585665, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 16198, "time_per_iteration": 2.482393980026245 }, { "auxiliary_loss_clip": 0.01102653, "auxiliary_loss_mlp": 0.01030246, "balance_loss_clip": 1.01747894, "balance_loss_mlp": 1.0333066, "epoch": 0.9739365699684353, "flos": 21361211481600.0, "grad_norm": 1.7839151645625413, "language_loss": 0.77970481, "learning_rate": 7.10539048654768e-09, "loss": 0.80103379, "num_input_tokens_seen": 349605125, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69140625, "step": 16199, "time_per_iteration": 2.4449315071105957 }, { "auxiliary_loss_clip": 0.01103396, "auxiliary_loss_mlp": 0.010326, "balance_loss_clip": 1.02031589, "balance_loss_mlp": 1.035496, "epoch": 0.9739966932211033, "flos": 21902061542400.0, "grad_norm": 1.8434700584802757, "language_loss": 0.79366076, "learning_rate": 7.072628363223865e-09, "loss": 0.81502074, "num_input_tokens_seen": 349623360, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6796875, "step": 16200, "time_per_iteration": 2.45944881439209 }, { "auxiliary_loss_clip": 0.01109289, "auxiliary_loss_mlp": 0.01037116, "balance_loss_clip": 1.0241518, "balance_loss_mlp": 1.03596556, "epoch": 0.9740568164737712, "flos": 24827345164800.0, "grad_norm": 2.219543804105009, "language_loss": 0.68273401, "learning_rate": 7.039941811905592e-09, "loss": 0.704198, "num_input_tokens_seen": 349644390, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.734375, "step": 16201, "time_per_iteration": 2.5039238929748535 }, { "auxiliary_loss_clip": 0.01102936, "auxiliary_loss_mlp": 0.0102873, "balance_loss_clip": 1.01686251, "balance_loss_mlp": 1.03366399, "epoch": 0.9741169397264392, "flos": 23623583650560.0, "grad_norm": 1.4358203742927114, "language_loss": 0.72885358, "learning_rate": 7.0073308338325364e-09, "loss": 0.75017023, "num_input_tokens_seen": 349663200, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69140625, "step": 16202, "time_per_iteration": 2.511384963989258 }, { "auxiliary_loss_clip": 0.01105893, "auxiliary_loss_mlp": 0.01031105, "balance_loss_clip": 1.01818228, "balance_loss_mlp": 1.03613448, "epoch": 0.9741770629791072, "flos": 18841144164480.0, "grad_norm": 1.7139383188619428, "language_loss": 0.73005557, "learning_rate": 6.974795430241265e-09, "loss": 0.7514255, "num_input_tokens_seen": 349681975, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 16203, "time_per_iteration": 3.9242613315582275 }, { "auxiliary_loss_clip": 0.01104485, "auxiliary_loss_mlp": 0.01031299, "balance_loss_clip": 1.0193423, "balance_loss_mlp": 1.03516686, "epoch": 0.9742371862317751, "flos": 22346241125760.0, "grad_norm": 3.6637827497053452, "language_loss": 0.77603471, "learning_rate": 6.942335602365235e-09, "loss": 0.79739261, "num_input_tokens_seen": 349701185, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 16204, "time_per_iteration": 2.4550299644470215 }, { "auxiliary_loss_clip": 0.01106892, "auxiliary_loss_mlp": 0.01032554, "balance_loss_clip": 1.01954806, "balance_loss_mlp": 1.03672957, "epoch": 0.9742973094844432, "flos": 21762764599680.0, "grad_norm": 6.660494400961362, "language_loss": 0.79594731, "learning_rate": 6.909951351435905e-09, "loss": 0.81734174, "num_input_tokens_seen": 349720360, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.703125, "step": 16205, "time_per_iteration": 2.507476568222046 }, { "auxiliary_loss_clip": 0.01103246, "auxiliary_loss_mlp": 0.01029766, "balance_loss_clip": 1.01839936, "balance_loss_mlp": 1.03560948, "epoch": 0.9743574327371111, "flos": 26248725227520.0, "grad_norm": 1.6298219490505421, "language_loss": 0.74412215, "learning_rate": 6.87764267868074e-09, "loss": 0.76545227, "num_input_tokens_seen": 349741040, "router_z_loss_clip": 0.11376953, "router_z_loss_mlp": 0.67578125, "step": 16206, "time_per_iteration": 3.9093093872070312 }, { "auxiliary_loss_clip": 0.01102333, "auxiliary_loss_mlp": 0.01031028, "balance_loss_clip": 1.01841521, "balance_loss_mlp": 1.03261876, "epoch": 0.9744175559897791, "flos": 12349321367040.0, "grad_norm": 4.591001837267476, "language_loss": 0.840554, "learning_rate": 6.8454095853252015e-09, "loss": 0.86188769, "num_input_tokens_seen": 349758895, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 16207, "time_per_iteration": 2.4379148483276367 }, { "auxiliary_loss_clip": 0.01101497, "auxiliary_loss_mlp": 0.01034639, "balance_loss_clip": 1.02273655, "balance_loss_mlp": 1.03413153, "epoch": 0.974477679242447, "flos": 28397834835840.0, "grad_norm": 1.772656690627642, "language_loss": 0.70805746, "learning_rate": 6.813252072591425e-09, "loss": 0.72941887, "num_input_tokens_seen": 349779740, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 16208, "time_per_iteration": 2.506570339202881 }, { "auxiliary_loss_clip": 0.01096234, "auxiliary_loss_mlp": 0.01024129, "balance_loss_clip": 1.01382315, "balance_loss_mlp": 1.03351784, "epoch": 0.974537802495115, "flos": 17785370684160.0, "grad_norm": 1.8011660864257413, "language_loss": 0.77128863, "learning_rate": 6.781170141698878e-09, "loss": 0.79249227, "num_input_tokens_seen": 349796820, "router_z_loss_clip": 0.10302734, "router_z_loss_mlp": 0.625, "step": 16209, "time_per_iteration": 2.4160449504852295 }, { "auxiliary_loss_clip": 0.01105575, "auxiliary_loss_mlp": 0.01028915, "balance_loss_clip": 1.01635039, "balance_loss_mlp": 1.03425264, "epoch": 0.9745979257477829, "flos": 23842315520640.0, "grad_norm": 2.7207938092990056, "language_loss": 0.78957319, "learning_rate": 6.749163793864144e-09, "loss": 0.81091803, "num_input_tokens_seen": 349816550, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 16210, "time_per_iteration": 2.483757972717285 }, { "auxiliary_loss_clip": 0.01103269, "auxiliary_loss_mlp": 0.01032865, "balance_loss_clip": 1.02099824, "balance_loss_mlp": 1.03373432, "epoch": 0.9746580490004509, "flos": 27016172236800.0, "grad_norm": 2.4307020122967646, "language_loss": 0.77923268, "learning_rate": 6.7172330303009176e-09, "loss": 0.80059397, "num_input_tokens_seen": 349834350, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6953125, "step": 16211, "time_per_iteration": 3.847215414047241 }, { "auxiliary_loss_clip": 0.01108644, "auxiliary_loss_mlp": 0.01030862, "balance_loss_clip": 1.01761174, "balance_loss_mlp": 1.03592992, "epoch": 0.9747181722531189, "flos": 19792022952960.0, "grad_norm": 2.066912999411206, "language_loss": 0.77996576, "learning_rate": 6.685377852219787e-09, "loss": 0.80136085, "num_input_tokens_seen": 349853460, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7265625, "step": 16212, "time_per_iteration": 3.9079325199127197 }, { "auxiliary_loss_clip": 0.01102859, "auxiliary_loss_mlp": 0.0103026, "balance_loss_clip": 1.01885724, "balance_loss_mlp": 1.03542721, "epoch": 0.9747782955057869, "flos": 31430598929280.0, "grad_norm": 1.7548659687417023, "language_loss": 0.8017, "learning_rate": 6.653598260829118e-09, "loss": 0.82303119, "num_input_tokens_seen": 349874830, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.67578125, "step": 16213, "time_per_iteration": 2.5381126403808594 }, { "auxiliary_loss_clip": 0.01103093, "auxiliary_loss_mlp": 0.010289, "balance_loss_clip": 1.01674092, "balance_loss_mlp": 1.03392649, "epoch": 0.9748384187584548, "flos": 15961288268160.0, "grad_norm": 2.1393980292842474, "language_loss": 0.66470093, "learning_rate": 6.6218942573335044e-09, "loss": 0.68602085, "num_input_tokens_seen": 349893690, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 16214, "time_per_iteration": 2.417762041091919 }, { "auxiliary_loss_clip": 0.01105676, "auxiliary_loss_mlp": 0.01030816, "balance_loss_clip": 1.01832879, "balance_loss_mlp": 1.03588176, "epoch": 0.9748985420111228, "flos": 20558715776640.0, "grad_norm": 1.755962469612079, "language_loss": 0.74488068, "learning_rate": 6.5902658429355386e-09, "loss": 0.7662456, "num_input_tokens_seen": 349912480, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 16215, "time_per_iteration": 2.4432778358459473 }, { "auxiliary_loss_clip": 0.01101448, "auxiliary_loss_mlp": 0.01033035, "balance_loss_clip": 1.02089381, "balance_loss_mlp": 1.03355992, "epoch": 0.9749586652637908, "flos": 36721605127680.0, "grad_norm": 1.7259794917327327, "language_loss": 0.66756225, "learning_rate": 6.558713018834483e-09, "loss": 0.68890715, "num_input_tokens_seen": 349932470, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6796875, "step": 16216, "time_per_iteration": 2.554581880569458 }, { "auxiliary_loss_clip": 0.01107196, "auxiliary_loss_mlp": 0.01027116, "balance_loss_clip": 1.01455736, "balance_loss_mlp": 1.03634048, "epoch": 0.9750187885164587, "flos": 10999223844480.0, "grad_norm": 1.9167969924069497, "language_loss": 0.72077388, "learning_rate": 6.527235786226937e-09, "loss": 0.74211693, "num_input_tokens_seen": 349949060, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 16217, "time_per_iteration": 2.4514925479888916 }, { "auxiliary_loss_clip": 0.01103939, "auxiliary_loss_mlp": 0.01028994, "balance_loss_clip": 1.01703119, "balance_loss_mlp": 1.03532696, "epoch": 0.9750789117691268, "flos": 25739512070400.0, "grad_norm": 1.6508785746134187, "language_loss": 0.78561187, "learning_rate": 6.495834146306167e-09, "loss": 0.80694115, "num_input_tokens_seen": 349968010, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6875, "step": 16218, "time_per_iteration": 2.486685037612915 }, { "auxiliary_loss_clip": 0.01100514, "auxiliary_loss_mlp": 0.01028941, "balance_loss_clip": 1.01676929, "balance_loss_mlp": 1.03418159, "epoch": 0.9751390350217947, "flos": 13333955961600.0, "grad_norm": 2.311909054465669, "language_loss": 0.77509248, "learning_rate": 6.464508100263222e-09, "loss": 0.79638696, "num_input_tokens_seen": 349985270, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6640625, "step": 16219, "time_per_iteration": 2.4335784912109375 }, { "auxiliary_loss_clip": 0.01105512, "auxiliary_loss_mlp": 0.01031799, "balance_loss_clip": 1.0199492, "balance_loss_mlp": 1.03524566, "epoch": 0.9751991582744627, "flos": 22820621068800.0, "grad_norm": 3.651759421906874, "language_loss": 0.81363678, "learning_rate": 6.433257649285817e-09, "loss": 0.83500993, "num_input_tokens_seen": 350003935, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.703125, "step": 16220, "time_per_iteration": 2.484633207321167 }, { "auxiliary_loss_clip": 0.01101322, "auxiliary_loss_mlp": 0.0102943, "balance_loss_clip": 1.01774192, "balance_loss_mlp": 1.03409481, "epoch": 0.9752592815271306, "flos": 19646189735040.0, "grad_norm": 2.436691386130809, "language_loss": 0.75337029, "learning_rate": 6.402082794559227e-09, "loss": 0.77467775, "num_input_tokens_seen": 350023595, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 16221, "time_per_iteration": 2.452920913696289 }, { "auxiliary_loss_clip": 0.01099864, "auxiliary_loss_mlp": 0.01032247, "balance_loss_clip": 1.02000999, "balance_loss_mlp": 1.03321588, "epoch": 0.9753194047797986, "flos": 26690462686080.0, "grad_norm": 1.748799796830498, "language_loss": 0.66750044, "learning_rate": 6.370983537265395e-09, "loss": 0.68882155, "num_input_tokens_seen": 350045920, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.66796875, "step": 16222, "time_per_iteration": 2.5056378841400146 }, { "auxiliary_loss_clip": 0.01101131, "auxiliary_loss_mlp": 0.01028609, "balance_loss_clip": 1.01697457, "balance_loss_mlp": 1.03345609, "epoch": 0.9753795280324665, "flos": 23221779137280.0, "grad_norm": 1.7214485545686868, "language_loss": 0.88112652, "learning_rate": 6.3399598785836004e-09, "loss": 0.90242392, "num_input_tokens_seen": 350063925, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6796875, "step": 16223, "time_per_iteration": 2.4751603603363037 }, { "auxiliary_loss_clip": 0.01101233, "auxiliary_loss_mlp": 0.01030237, "balance_loss_clip": 1.01859045, "balance_loss_mlp": 1.03419328, "epoch": 0.9754396512851345, "flos": 19463835363840.0, "grad_norm": 2.0822016060936, "language_loss": 0.747486, "learning_rate": 6.309011819690457e-09, "loss": 0.76880074, "num_input_tokens_seen": 350080900, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 16224, "time_per_iteration": 2.500849962234497 }, { "auxiliary_loss_clip": 0.01027931, "auxiliary_loss_mlp": 0.01003155, "balance_loss_clip": 1.00210607, "balance_loss_mlp": 1.00560486, "epoch": 0.9754997745378025, "flos": 68459313340800.0, "grad_norm": 0.809547323759434, "language_loss": 0.59097338, "learning_rate": 6.278139361759249e-09, "loss": 0.61128426, "num_input_tokens_seen": 350144550, "router_z_loss_clip": 0.01049805, "router_z_loss_mlp": 0.22363281, "step": 16225, "time_per_iteration": 3.0398430824279785 }, { "auxiliary_loss_clip": 0.0110282, "auxiliary_loss_mlp": 0.01035116, "balance_loss_clip": 1.02315307, "balance_loss_mlp": 1.03500032, "epoch": 0.9755598977904705, "flos": 26395168976640.0, "grad_norm": 9.017100018143909, "language_loss": 0.68951595, "learning_rate": 6.247342505960818e-09, "loss": 0.7108953, "num_input_tokens_seen": 350164050, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 16226, "time_per_iteration": 2.4804935455322266 }, { "auxiliary_loss_clip": 0.01103073, "auxiliary_loss_mlp": 0.01036727, "balance_loss_clip": 1.02405488, "balance_loss_mlp": 1.0346837, "epoch": 0.9756200210431384, "flos": 16617663446400.0, "grad_norm": 1.7856291261690946, "language_loss": 0.82840091, "learning_rate": 6.216621253462894e-09, "loss": 0.84979892, "num_input_tokens_seen": 350181350, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.68359375, "step": 16227, "time_per_iteration": 2.4296696186065674 }, { "auxiliary_loss_clip": 0.01100901, "auxiliary_loss_mlp": 0.01028395, "balance_loss_clip": 1.01688552, "balance_loss_mlp": 1.03404903, "epoch": 0.9756801442958064, "flos": 23623044946560.0, "grad_norm": 1.838527957265554, "language_loss": 0.7786715, "learning_rate": 6.185975605430549e-09, "loss": 0.79996443, "num_input_tokens_seen": 350199765, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 16228, "time_per_iteration": 2.4466447830200195 }, { "auxiliary_loss_clip": 0.0102764, "auxiliary_loss_mlp": 0.01001651, "balance_loss_clip": 1.00064361, "balance_loss_mlp": 1.00539494, "epoch": 0.9757402675484744, "flos": 61625799440640.0, "grad_norm": 0.8478313024619346, "language_loss": 0.55829138, "learning_rate": 6.155405563025962e-09, "loss": 0.57858425, "num_input_tokens_seen": 350256420, "router_z_loss_clip": 0.0100708, "router_z_loss_mlp": 0.22265625, "step": 16229, "time_per_iteration": 3.0045833587646484 }, { "auxiliary_loss_clip": 0.0110356, "auxiliary_loss_mlp": 0.01029141, "balance_loss_clip": 1.01705265, "balance_loss_mlp": 1.03499079, "epoch": 0.9758003908011423, "flos": 24058964401920.0, "grad_norm": 1.7617489729945546, "language_loss": 0.74927962, "learning_rate": 6.124911127407984e-09, "loss": 0.77060658, "num_input_tokens_seen": 350276270, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6875, "step": 16230, "time_per_iteration": 2.4812214374542236 }, { "auxiliary_loss_clip": 0.01098964, "auxiliary_loss_mlp": 0.01026513, "balance_loss_clip": 1.01517642, "balance_loss_mlp": 1.03469896, "epoch": 0.9758605140538104, "flos": 17493093717120.0, "grad_norm": 1.8984556737013696, "language_loss": 0.72208667, "learning_rate": 6.094492299733245e-09, "loss": 0.74334145, "num_input_tokens_seen": 350295000, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.64453125, "step": 16231, "time_per_iteration": 2.434783935546875 }, { "auxiliary_loss_clip": 0.01107155, "auxiliary_loss_mlp": 0.01030254, "balance_loss_clip": 1.01782012, "balance_loss_mlp": 1.03679764, "epoch": 0.9759206373064783, "flos": 24826950115200.0, "grad_norm": 2.4034371142576916, "language_loss": 0.76858932, "learning_rate": 6.064149081155267e-09, "loss": 0.78996342, "num_input_tokens_seen": 350314980, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 16232, "time_per_iteration": 2.473611354827881 }, { "auxiliary_loss_clip": 0.01027817, "auxiliary_loss_mlp": 0.01002671, "balance_loss_clip": 1.0016222, "balance_loss_mlp": 1.00543308, "epoch": 0.9759807605591463, "flos": 68161182456960.0, "grad_norm": 0.734631267610308, "language_loss": 0.53806281, "learning_rate": 6.033881472824465e-09, "loss": 0.55836761, "num_input_tokens_seen": 350371985, "router_z_loss_clip": 0.01049805, "router_z_loss_mlp": 0.22363281, "step": 16233, "time_per_iteration": 2.959219217300415 }, { "auxiliary_loss_clip": 0.01102607, "auxiliary_loss_mlp": 0.01031881, "balance_loss_clip": 1.02022815, "balance_loss_mlp": 1.03403127, "epoch": 0.9760408838118142, "flos": 18989239939200.0, "grad_norm": 2.4081266984552556, "language_loss": 0.71572936, "learning_rate": 6.003689475888807e-09, "loss": 0.73707426, "num_input_tokens_seen": 350390590, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6875, "step": 16234, "time_per_iteration": 2.4238243103027344 }, { "auxiliary_loss_clip": 0.01105226, "auxiliary_loss_mlp": 0.01033209, "balance_loss_clip": 1.02057254, "balance_loss_mlp": 1.03389692, "epoch": 0.9761010070644822, "flos": 17125978763520.0, "grad_norm": 2.406739929110624, "language_loss": 0.79005837, "learning_rate": 5.973573091493156e-09, "loss": 0.81144279, "num_input_tokens_seen": 350403770, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.7109375, "step": 16235, "time_per_iteration": 2.413738965988159 }, { "auxiliary_loss_clip": 0.0110346, "auxiliary_loss_mlp": 0.0103222, "balance_loss_clip": 1.01863003, "balance_loss_mlp": 1.03522086, "epoch": 0.9761611303171501, "flos": 22052599441920.0, "grad_norm": 1.8987557022051125, "language_loss": 0.76994157, "learning_rate": 5.943532320779265e-09, "loss": 0.79129839, "num_input_tokens_seen": 350421870, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.68359375, "step": 16236, "time_per_iteration": 2.4730496406555176 }, { "auxiliary_loss_clip": 0.01102499, "auxiliary_loss_mlp": 0.01026348, "balance_loss_clip": 1.01439142, "balance_loss_mlp": 1.03482628, "epoch": 0.9762212535698181, "flos": 21757521214080.0, "grad_norm": 2.1903954314041605, "language_loss": 0.75519347, "learning_rate": 5.913567164886446e-09, "loss": 0.77648187, "num_input_tokens_seen": 350440025, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.67578125, "step": 16237, "time_per_iteration": 2.4760775566101074 }, { "auxiliary_loss_clip": 0.01103426, "auxiliary_loss_mlp": 0.01033507, "balance_loss_clip": 1.02007258, "balance_loss_mlp": 1.03365088, "epoch": 0.9762813768224861, "flos": 25921615046400.0, "grad_norm": 2.073712588864602, "language_loss": 0.73151457, "learning_rate": 5.8836776249509e-09, "loss": 0.75288391, "num_input_tokens_seen": 350459435, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6953125, "step": 16238, "time_per_iteration": 2.4821760654449463 }, { "auxiliary_loss_clip": 0.01102886, "auxiliary_loss_mlp": 0.01033451, "balance_loss_clip": 1.0205344, "balance_loss_mlp": 1.03452015, "epoch": 0.9763415000751541, "flos": 24051853509120.0, "grad_norm": 2.13115657471131, "language_loss": 0.83573008, "learning_rate": 5.8538637021063875e-09, "loss": 0.85709345, "num_input_tokens_seen": 350472655, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.68359375, "step": 16239, "time_per_iteration": 2.4829623699188232 }, { "auxiliary_loss_clip": 0.01104542, "auxiliary_loss_mlp": 0.010295, "balance_loss_clip": 1.01715577, "balance_loss_mlp": 1.03584671, "epoch": 0.976401623327822, "flos": 17018677860480.0, "grad_norm": 2.678820262518993, "language_loss": 0.60804927, "learning_rate": 5.824125397483115e-09, "loss": 0.62938964, "num_input_tokens_seen": 350488160, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6875, "step": 16240, "time_per_iteration": 2.4356110095977783 }, { "auxiliary_loss_clip": 0.01102928, "auxiliary_loss_mlp": 0.01033165, "balance_loss_clip": 1.02111292, "balance_loss_mlp": 1.03671718, "epoch": 0.97646174658049, "flos": 16106941918080.0, "grad_norm": 1.722901311751199, "language_loss": 0.8248592, "learning_rate": 5.7944627122088474e-09, "loss": 0.84622014, "num_input_tokens_seen": 350506065, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6640625, "step": 16241, "time_per_iteration": 2.4334006309509277 }, { "auxiliary_loss_clip": 0.01102833, "auxiliary_loss_mlp": 0.01033262, "balance_loss_clip": 1.02195501, "balance_loss_mlp": 1.03458071, "epoch": 0.9765218698331579, "flos": 21252725429760.0, "grad_norm": 1.968048928274115, "language_loss": 0.83343685, "learning_rate": 5.764875647408463e-09, "loss": 0.85479784, "num_input_tokens_seen": 350524495, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6796875, "step": 16242, "time_per_iteration": 2.4681520462036133 }, { "auxiliary_loss_clip": 0.01105255, "auxiliary_loss_mlp": 0.01028654, "balance_loss_clip": 1.01652479, "balance_loss_mlp": 1.03617382, "epoch": 0.9765819930858259, "flos": 18588045957120.0, "grad_norm": 1.6775694102584675, "language_loss": 0.75152183, "learning_rate": 5.7353642042037294e-09, "loss": 0.77286088, "num_input_tokens_seen": 350544185, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 16243, "time_per_iteration": 2.5404574871063232 }, { "auxiliary_loss_clip": 0.0110336, "auxiliary_loss_mlp": 0.01033861, "balance_loss_clip": 1.02133811, "balance_loss_mlp": 1.03402495, "epoch": 0.976642116338494, "flos": 20266833859200.0, "grad_norm": 1.707686838220716, "language_loss": 0.70061976, "learning_rate": 5.705928383713754e-09, "loss": 0.72199196, "num_input_tokens_seen": 350562675, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 16244, "time_per_iteration": 3.919025421142578 }, { "auxiliary_loss_clip": 0.01106952, "auxiliary_loss_mlp": 0.01028822, "balance_loss_clip": 1.01617956, "balance_loss_mlp": 1.03767204, "epoch": 0.9767022395911619, "flos": 25550477769600.0, "grad_norm": 1.9449657251217447, "language_loss": 0.83497834, "learning_rate": 5.676568187055197e-09, "loss": 0.85633612, "num_input_tokens_seen": 350581535, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 16245, "time_per_iteration": 2.4706199169158936 }, { "auxiliary_loss_clip": 0.01100672, "auxiliary_loss_mlp": 0.01024931, "balance_loss_clip": 1.01336813, "balance_loss_mlp": 1.03370237, "epoch": 0.9767623628438299, "flos": 21762656858880.0, "grad_norm": 1.4827812934410434, "language_loss": 0.78606665, "learning_rate": 5.647283615340726e-09, "loss": 0.80732274, "num_input_tokens_seen": 350601615, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 16246, "time_per_iteration": 2.4370241165161133 }, { "auxiliary_loss_clip": 0.01096504, "auxiliary_loss_mlp": 0.0103091, "balance_loss_clip": 1.02046728, "balance_loss_mlp": 1.03375125, "epoch": 0.9768224860964978, "flos": 15851114277120.0, "grad_norm": 1.667897259381724, "language_loss": 0.74180263, "learning_rate": 5.6180746696812275e-09, "loss": 0.76307678, "num_input_tokens_seen": 350619580, "router_z_loss_clip": 0.10449219, "router_z_loss_mlp": 0.62890625, "step": 16247, "time_per_iteration": 2.425206422805786 }, { "auxiliary_loss_clip": 0.01103933, "auxiliary_loss_mlp": 0.01029804, "balance_loss_clip": 1.01811528, "balance_loss_mlp": 1.03529048, "epoch": 0.9768826093491658, "flos": 25151151294720.0, "grad_norm": 1.7468440255294253, "language_loss": 0.79895663, "learning_rate": 5.58894135118404e-09, "loss": 0.82029402, "num_input_tokens_seen": 350640015, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 16248, "time_per_iteration": 3.9010801315307617 }, { "auxiliary_loss_clip": 0.01110591, "auxiliary_loss_mlp": 0.01039537, "balance_loss_clip": 1.02614379, "balance_loss_mlp": 1.03810048, "epoch": 0.9769427326018337, "flos": 22967028904320.0, "grad_norm": 1.8327989253791372, "language_loss": 0.79188383, "learning_rate": 5.559883660954278e-09, "loss": 0.81338519, "num_input_tokens_seen": 350659155, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 16249, "time_per_iteration": 2.451611042022705 }, { "auxiliary_loss_clip": 0.010995, "auxiliary_loss_mlp": 0.01031506, "balance_loss_clip": 1.01943612, "balance_loss_mlp": 1.03375387, "epoch": 0.9770028558545018, "flos": 15264297786240.0, "grad_norm": 2.3237010455548646, "language_loss": 0.66682947, "learning_rate": 5.530901600093507e-09, "loss": 0.68813956, "num_input_tokens_seen": 350676615, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.65625, "step": 16250, "time_per_iteration": 2.4237983226776123 }, { "auxiliary_loss_clip": 0.01027644, "auxiliary_loss_mlp": 0.01000977, "balance_loss_clip": 1.0, "balance_loss_mlp": 1.00553155, "epoch": 0.9770629791071697, "flos": 71450348808960.0, "grad_norm": 0.789398068587704, "language_loss": 0.59824789, "learning_rate": 5.501995169700846e-09, "loss": 0.61853409, "num_input_tokens_seen": 350736805, "router_z_loss_clip": 0.00976562, "router_z_loss_mlp": 0.22167969, "step": 16251, "time_per_iteration": 3.124054193496704 }, { "auxiliary_loss_clip": 0.01102217, "auxiliary_loss_mlp": 0.01029316, "balance_loss_clip": 1.01705503, "balance_loss_mlp": 1.03380013, "epoch": 0.9771231023598377, "flos": 22412854897920.0, "grad_norm": 4.820895755246817, "language_loss": 0.78700089, "learning_rate": 5.473164370872307e-09, "loss": 0.80831623, "num_input_tokens_seen": 350753600, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.68359375, "step": 16252, "time_per_iteration": 2.450570583343506 }, { "auxiliary_loss_clip": 0.01101992, "auxiliary_loss_mlp": 0.01030842, "balance_loss_clip": 1.01865911, "balance_loss_mlp": 1.03462851, "epoch": 0.9771832256125056, "flos": 19025940660480.0, "grad_norm": 2.701906498823525, "language_loss": 0.64196408, "learning_rate": 5.444409204701461e-09, "loss": 0.66329241, "num_input_tokens_seen": 350771225, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.671875, "step": 16253, "time_per_iteration": 5.2128355503082275 }, { "auxiliary_loss_clip": 0.01106417, "auxiliary_loss_mlp": 0.01031796, "balance_loss_clip": 1.01846814, "balance_loss_mlp": 1.03701246, "epoch": 0.9772433488651736, "flos": 17822143232640.0, "grad_norm": 2.614542471519321, "language_loss": 0.7666961, "learning_rate": 5.415729672278324e-09, "loss": 0.78807819, "num_input_tokens_seen": 350789100, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 16254, "time_per_iteration": 2.425750255584717 }, { "auxiliary_loss_clip": 0.01105256, "auxiliary_loss_mlp": 0.01030064, "balance_loss_clip": 1.01782155, "balance_loss_mlp": 1.03437734, "epoch": 0.9773034721178415, "flos": 37629785623680.0, "grad_norm": 1.9297861709695134, "language_loss": 0.64284241, "learning_rate": 5.387125774690471e-09, "loss": 0.6641956, "num_input_tokens_seen": 350811085, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.70703125, "step": 16255, "time_per_iteration": 2.620025873184204 }, { "auxiliary_loss_clip": 0.01108583, "auxiliary_loss_mlp": 0.0103547, "balance_loss_clip": 1.02190423, "balance_loss_mlp": 1.03659213, "epoch": 0.9773635953705095, "flos": 20302457172480.0, "grad_norm": 1.7878586367500418, "language_loss": 0.75845122, "learning_rate": 5.358597513023033e-09, "loss": 0.77989173, "num_input_tokens_seen": 350831065, "router_z_loss_clip": 0.13574219, "router_z_loss_mlp": 0.71875, "step": 16256, "time_per_iteration": 2.4617974758148193 }, { "auxiliary_loss_clip": 0.01102241, "auxiliary_loss_mlp": 0.01036661, "balance_loss_clip": 1.02349472, "balance_loss_mlp": 1.0362916, "epoch": 0.9774237186231776, "flos": 22309253095680.0, "grad_norm": 2.3592476402980007, "language_loss": 0.78171402, "learning_rate": 5.330144888357369e-09, "loss": 0.80310309, "num_input_tokens_seen": 350849675, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.65625, "step": 16257, "time_per_iteration": 2.4562692642211914 }, { "auxiliary_loss_clip": 0.01103291, "auxiliary_loss_mlp": 0.01033329, "balance_loss_clip": 1.02068067, "balance_loss_mlp": 1.03525257, "epoch": 0.9774838418758455, "flos": 24204905360640.0, "grad_norm": 1.962595248535481, "language_loss": 0.74910033, "learning_rate": 5.301767901772391e-09, "loss": 0.77046657, "num_input_tokens_seen": 350868955, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 16258, "time_per_iteration": 2.477869749069214 }, { "auxiliary_loss_clip": 0.01027634, "auxiliary_loss_mlp": 0.0100183, "balance_loss_clip": 1.00083447, "balance_loss_mlp": 1.0055244, "epoch": 0.9775439651285135, "flos": 66357139829760.0, "grad_norm": 0.6987790860631652, "language_loss": 0.59801525, "learning_rate": 5.273466554344353e-09, "loss": 0.61830986, "num_input_tokens_seen": 350935110, "router_z_loss_clip": 0.00994873, "router_z_loss_mlp": 0.22070312, "step": 16259, "time_per_iteration": 3.162698268890381 }, { "auxiliary_loss_clip": 0.01107045, "auxiliary_loss_mlp": 0.01029645, "balance_loss_clip": 1.01708674, "balance_loss_mlp": 1.03571725, "epoch": 0.9776040883811814, "flos": 22601565976320.0, "grad_norm": 2.236185379209168, "language_loss": 0.73733073, "learning_rate": 5.2452408471461705e-09, "loss": 0.75869769, "num_input_tokens_seen": 350953220, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 16260, "time_per_iteration": 2.4522829055786133 }, { "auxiliary_loss_clip": 0.01104655, "auxiliary_loss_mlp": 0.0102882, "balance_loss_clip": 1.01640475, "balance_loss_mlp": 1.03465843, "epoch": 0.9776642116338494, "flos": 18442176825600.0, "grad_norm": 3.7696406608534825, "language_loss": 0.79607916, "learning_rate": 5.2170907812485456e-09, "loss": 0.81741393, "num_input_tokens_seen": 350971915, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.703125, "step": 16261, "time_per_iteration": 2.439194440841675 }, { "auxiliary_loss_clip": 0.01106601, "auxiliary_loss_mlp": 0.01025987, "balance_loss_clip": 1.01340497, "balance_loss_mlp": 1.03638411, "epoch": 0.9777243348865173, "flos": 22638446265600.0, "grad_norm": 2.1533992289763035, "language_loss": 0.74383247, "learning_rate": 5.189016357718845e-09, "loss": 0.7651583, "num_input_tokens_seen": 350990470, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 16262, "time_per_iteration": 2.4489176273345947 }, { "auxiliary_loss_clip": 0.01106245, "auxiliary_loss_mlp": 0.01032349, "balance_loss_clip": 1.0184257, "balance_loss_mlp": 1.03603792, "epoch": 0.9777844581391854, "flos": 31321394605440.0, "grad_norm": 2.3044238725326927, "language_loss": 0.70773888, "learning_rate": 5.16101757762133e-09, "loss": 0.7291249, "num_input_tokens_seen": 351010755, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.703125, "step": 16263, "time_per_iteration": 2.5464303493499756 }, { "auxiliary_loss_clip": 0.01106393, "auxiliary_loss_mlp": 0.01029797, "balance_loss_clip": 1.01802516, "balance_loss_mlp": 1.03659093, "epoch": 0.9778445813918533, "flos": 23039101543680.0, "grad_norm": 1.6870445966631864, "language_loss": 0.66559809, "learning_rate": 5.133094442018038e-09, "loss": 0.68695998, "num_input_tokens_seen": 351029965, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69921875, "step": 16264, "time_per_iteration": 2.46403431892395 }, { "auxiliary_loss_clip": 0.01109544, "auxiliary_loss_mlp": 0.01028826, "balance_loss_clip": 1.01508164, "balance_loss_mlp": 1.03719616, "epoch": 0.9779047046445213, "flos": 17566351505280.0, "grad_norm": 1.9813763949615193, "language_loss": 0.7331422, "learning_rate": 5.105246951967679e-09, "loss": 0.75452584, "num_input_tokens_seen": 351046205, "router_z_loss_clip": 0.13769531, "router_z_loss_mlp": 0.72265625, "step": 16265, "time_per_iteration": 2.4488608837127686 }, { "auxiliary_loss_clip": 0.0110093, "auxiliary_loss_mlp": 0.01031632, "balance_loss_clip": 1.01970458, "balance_loss_mlp": 1.03447723, "epoch": 0.9779648278971892, "flos": 20741141975040.0, "grad_norm": 1.9184055911152338, "language_loss": 0.68271065, "learning_rate": 5.077475108526297e-09, "loss": 0.70403636, "num_input_tokens_seen": 351065390, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6640625, "step": 16266, "time_per_iteration": 2.4497337341308594 }, { "auxiliary_loss_clip": 0.0110037, "auxiliary_loss_mlp": 0.01026556, "balance_loss_clip": 1.01573777, "balance_loss_mlp": 1.03516984, "epoch": 0.9780249511498572, "flos": 21026954494080.0, "grad_norm": 8.788535118873325, "language_loss": 0.86799461, "learning_rate": 5.049778912747049e-09, "loss": 0.88926387, "num_input_tokens_seen": 351084355, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.65234375, "step": 16267, "time_per_iteration": 2.4604873657226562 }, { "auxiliary_loss_clip": 0.01105953, "auxiliary_loss_mlp": 0.0102809, "balance_loss_clip": 1.01545429, "balance_loss_mlp": 1.03521979, "epoch": 0.9780850744025251, "flos": 30774223751040.0, "grad_norm": 1.939289957191016, "language_loss": 0.70110703, "learning_rate": 5.022158365679985e-09, "loss": 0.72244751, "num_input_tokens_seen": 351105870, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.70703125, "step": 16268, "time_per_iteration": 2.506068706512451 }, { "auxiliary_loss_clip": 0.01104208, "auxiliary_loss_mlp": 0.01025143, "balance_loss_clip": 1.01369882, "balance_loss_mlp": 1.03536606, "epoch": 0.9781451976551931, "flos": 20302995876480.0, "grad_norm": 1.6914792958676332, "language_loss": 0.73871851, "learning_rate": 4.994613468372711e-09, "loss": 0.76001197, "num_input_tokens_seen": 351124760, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6875, "step": 16269, "time_per_iteration": 2.4348742961883545 }, { "auxiliary_loss_clip": 0.01105861, "auxiliary_loss_mlp": 0.01029215, "balance_loss_clip": 1.01625693, "balance_loss_mlp": 1.03620672, "epoch": 0.9782053209078612, "flos": 24316479982080.0, "grad_norm": 1.6388286810500292, "language_loss": 0.70779866, "learning_rate": 4.967144221869501e-09, "loss": 0.72914946, "num_input_tokens_seen": 351142820, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 16270, "time_per_iteration": 2.4735286235809326 }, { "auxiliary_loss_clip": 0.01105052, "auxiliary_loss_mlp": 0.01035114, "balance_loss_clip": 1.02296662, "balance_loss_mlp": 1.03539133, "epoch": 0.9782654441605291, "flos": 32489425065600.0, "grad_norm": 3.0027317730400327, "language_loss": 0.64222443, "learning_rate": 4.939750627212191e-09, "loss": 0.66362607, "num_input_tokens_seen": 351164805, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 16271, "time_per_iteration": 2.5411839485168457 }, { "auxiliary_loss_clip": 0.01101621, "auxiliary_loss_mlp": 0.0103063, "balance_loss_clip": 1.01842916, "balance_loss_mlp": 1.03628516, "epoch": 0.9783255674131971, "flos": 26979076465920.0, "grad_norm": 1.480315763131132, "language_loss": 0.70664847, "learning_rate": 4.912432685439505e-09, "loss": 0.72797096, "num_input_tokens_seen": 351187005, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.65625, "step": 16272, "time_per_iteration": 2.489668130874634 }, { "auxiliary_loss_clip": 0.01106597, "auxiliary_loss_mlp": 0.01034154, "balance_loss_clip": 1.0213275, "balance_loss_mlp": 1.03658438, "epoch": 0.978385690665865, "flos": 23112251591040.0, "grad_norm": 1.7416586415103807, "language_loss": 0.66908729, "learning_rate": 4.88519039758728e-09, "loss": 0.69049478, "num_input_tokens_seen": 351208450, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 16273, "time_per_iteration": 2.461449146270752 }, { "auxiliary_loss_clip": 0.01104142, "auxiliary_loss_mlp": 0.01026359, "balance_loss_clip": 1.01390135, "balance_loss_mlp": 1.03532946, "epoch": 0.978445813918533, "flos": 25409672455680.0, "grad_norm": 2.29604069764761, "language_loss": 0.74048996, "learning_rate": 4.85802376468869e-09, "loss": 0.76179498, "num_input_tokens_seen": 351229585, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6875, "step": 16274, "time_per_iteration": 2.4771640300750732 }, { "auxiliary_loss_clip": 0.01104509, "auxiliary_loss_mlp": 0.01030137, "balance_loss_clip": 1.01880622, "balance_loss_mlp": 1.03655493, "epoch": 0.9785059371712009, "flos": 23550218121600.0, "grad_norm": 1.8920802245981785, "language_loss": 0.77806854, "learning_rate": 4.830932787773579e-09, "loss": 0.79941505, "num_input_tokens_seen": 351249525, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6796875, "step": 16275, "time_per_iteration": 2.4815261363983154 }, { "auxiliary_loss_clip": 0.01105821, "auxiliary_loss_mlp": 0.01031021, "balance_loss_clip": 1.01845026, "balance_loss_mlp": 1.03540993, "epoch": 0.978566060423869, "flos": 34351177870080.0, "grad_norm": 1.5574686524462558, "language_loss": 0.70804918, "learning_rate": 4.803917467869567e-09, "loss": 0.72941756, "num_input_tokens_seen": 351272530, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 16276, "time_per_iteration": 2.551236391067505 }, { "auxiliary_loss_clip": 0.01099193, "auxiliary_loss_mlp": 0.01027272, "balance_loss_clip": 1.015751, "balance_loss_mlp": 1.03347921, "epoch": 0.9786261836765369, "flos": 11618862387840.0, "grad_norm": 1.8348791083464215, "language_loss": 0.85513294, "learning_rate": 4.776977806000726e-09, "loss": 0.87639767, "num_input_tokens_seen": 351288530, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.65625, "step": 16277, "time_per_iteration": 2.4355039596557617 }, { "auxiliary_loss_clip": 0.01100316, "auxiliary_loss_mlp": 0.01026648, "balance_loss_clip": 1.01460767, "balance_loss_mlp": 1.03402901, "epoch": 0.9786863069292049, "flos": 17420949250560.0, "grad_norm": 2.0478384409611974, "language_loss": 0.7092495, "learning_rate": 4.7501138031891264e-09, "loss": 0.73051918, "num_input_tokens_seen": 351305890, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6640625, "step": 16278, "time_per_iteration": 2.4092602729797363 }, { "auxiliary_loss_clip": 0.01101075, "auxiliary_loss_mlp": 0.0102939, "balance_loss_clip": 1.01674199, "balance_loss_mlp": 1.03335142, "epoch": 0.9787464301818728, "flos": 20844923345280.0, "grad_norm": 1.819822844209691, "language_loss": 0.84427744, "learning_rate": 4.723325460453065e-09, "loss": 0.86558211, "num_input_tokens_seen": 351325010, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.67578125, "step": 16279, "time_per_iteration": 2.4633126258850098 }, { "auxiliary_loss_clip": 0.01103376, "auxiliary_loss_mlp": 0.01027345, "balance_loss_clip": 1.01484597, "balance_loss_mlp": 1.03452945, "epoch": 0.9788065534345408, "flos": 18222942165120.0, "grad_norm": 1.8251371388835722, "language_loss": 0.78809679, "learning_rate": 4.696612778808395e-09, "loss": 0.80940402, "num_input_tokens_seen": 351343060, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6875, "step": 16280, "time_per_iteration": 2.4378740787506104 }, { "auxiliary_loss_clip": 0.01100342, "auxiliary_loss_mlp": 0.01030731, "balance_loss_clip": 1.01975751, "balance_loss_mlp": 1.03519905, "epoch": 0.9788666766872087, "flos": 21578219498880.0, "grad_norm": 3.786548271779147, "language_loss": 0.79644585, "learning_rate": 4.669975759268085e-09, "loss": 0.81775653, "num_input_tokens_seen": 351363260, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.65234375, "step": 16281, "time_per_iteration": 2.5323116779327393 }, { "auxiliary_loss_clip": 0.01105552, "auxiliary_loss_mlp": 0.01031047, "balance_loss_clip": 1.01820278, "balance_loss_mlp": 1.03560638, "epoch": 0.9789267999398767, "flos": 24900495212160.0, "grad_norm": 2.061852807529627, "language_loss": 0.80320454, "learning_rate": 4.643414402842216e-09, "loss": 0.82457054, "num_input_tokens_seen": 351382610, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 16282, "time_per_iteration": 2.5020902156829834 }, { "auxiliary_loss_clip": 0.01104295, "auxiliary_loss_mlp": 0.01037279, "balance_loss_clip": 1.02498281, "balance_loss_mlp": 1.03561449, "epoch": 0.9789869231925448, "flos": 19573111514880.0, "grad_norm": 2.211056786580574, "language_loss": 0.83349323, "learning_rate": 4.616928710538204e-09, "loss": 0.854909, "num_input_tokens_seen": 351401075, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6875, "step": 16283, "time_per_iteration": 2.4384419918060303 }, { "auxiliary_loss_clip": 0.0110288, "auxiliary_loss_mlp": 0.01033964, "balance_loss_clip": 1.02196527, "balance_loss_mlp": 1.03389573, "epoch": 0.9790470464452127, "flos": 16796641939200.0, "grad_norm": 1.959008925243237, "language_loss": 0.72365248, "learning_rate": 4.590518683360134e-09, "loss": 0.74502093, "num_input_tokens_seen": 351419275, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6875, "step": 16284, "time_per_iteration": 2.4328248500823975 }, { "auxiliary_loss_clip": 0.01100923, "auxiliary_loss_mlp": 0.01033581, "balance_loss_clip": 1.02225041, "balance_loss_mlp": 1.03497195, "epoch": 0.9791071696978807, "flos": 18369350000640.0, "grad_norm": 2.6595597515877487, "language_loss": 0.64471203, "learning_rate": 4.56418432230965e-09, "loss": 0.66605711, "num_input_tokens_seen": 351437375, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.65625, "step": 16285, "time_per_iteration": 2.4437201023101807 }, { "auxiliary_loss_clip": 0.01103375, "auxiliary_loss_mlp": 0.01029908, "balance_loss_clip": 1.01757574, "balance_loss_mlp": 1.03536355, "epoch": 0.9791672929505486, "flos": 24170323541760.0, "grad_norm": 1.5810137480348219, "language_loss": 0.70436215, "learning_rate": 4.537925628385286e-09, "loss": 0.72569501, "num_input_tokens_seen": 351457810, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6796875, "step": 16286, "time_per_iteration": 3.9659695625305176 }, { "auxiliary_loss_clip": 0.01100249, "auxiliary_loss_mlp": 0.01026243, "balance_loss_clip": 1.0145669, "balance_loss_mlp": 1.03360701, "epoch": 0.9792274162032166, "flos": 24354114456960.0, "grad_norm": 1.6792505812284544, "language_loss": 0.58209866, "learning_rate": 4.511742602582691e-09, "loss": 0.60336357, "num_input_tokens_seen": 351478825, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.66796875, "step": 16287, "time_per_iteration": 2.487123966217041 }, { "auxiliary_loss_clip": 0.01102915, "auxiliary_loss_mlp": 0.01032123, "balance_loss_clip": 1.01979113, "balance_loss_mlp": 1.03491664, "epoch": 0.9792875394558845, "flos": 26395779507840.0, "grad_norm": 1.90738840317967, "language_loss": 0.81850362, "learning_rate": 4.485635245894626e-09, "loss": 0.83985406, "num_input_tokens_seen": 351498785, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6796875, "step": 16288, "time_per_iteration": 2.503758668899536 }, { "auxiliary_loss_clip": 0.01103729, "auxiliary_loss_mlp": 0.01024822, "balance_loss_clip": 1.01258552, "balance_loss_mlp": 1.03445125, "epoch": 0.9793476627085526, "flos": 28148004766080.0, "grad_norm": 1.668549010315426, "language_loss": 0.71828222, "learning_rate": 4.459603559311631e-09, "loss": 0.73956776, "num_input_tokens_seen": 351520235, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69140625, "step": 16289, "time_per_iteration": 3.903360366821289 }, { "auxiliary_loss_clip": 0.01103919, "auxiliary_loss_mlp": 0.0103433, "balance_loss_clip": 1.02199149, "balance_loss_mlp": 1.03657126, "epoch": 0.9794077859612205, "flos": 16763927627520.0, "grad_norm": 3.792323991166249, "language_loss": 0.75655508, "learning_rate": 4.43364754382003e-09, "loss": 0.77793753, "num_input_tokens_seen": 351538900, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.671875, "step": 16290, "time_per_iteration": 2.4648404121398926 }, { "auxiliary_loss_clip": 0.01105628, "auxiliary_loss_mlp": 0.01032353, "balance_loss_clip": 1.0191927, "balance_loss_mlp": 1.03504646, "epoch": 0.9794679092138885, "flos": 19280834547840.0, "grad_norm": 1.628891523068749, "language_loss": 0.67384291, "learning_rate": 4.4077672004048105e-09, "loss": 0.69522274, "num_input_tokens_seen": 351558715, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.70703125, "step": 16291, "time_per_iteration": 2.4516499042510986 }, { "auxiliary_loss_clip": 0.01106524, "auxiliary_loss_mlp": 0.01026814, "balance_loss_clip": 1.01409447, "balance_loss_mlp": 1.03552985, "epoch": 0.9795280324665564, "flos": 32156640535680.0, "grad_norm": 1.8965016208621124, "language_loss": 0.62495291, "learning_rate": 4.3819625300467456e-09, "loss": 0.64628637, "num_input_tokens_seen": 351578450, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 16292, "time_per_iteration": 2.5857837200164795 }, { "auxiliary_loss_clip": 0.01104296, "auxiliary_loss_mlp": 0.01031439, "balance_loss_clip": 1.0193398, "balance_loss_mlp": 1.03599453, "epoch": 0.9795881557192244, "flos": 19060953442560.0, "grad_norm": 1.7710927463350026, "language_loss": 0.73355591, "learning_rate": 4.356233533724829e-09, "loss": 0.75491333, "num_input_tokens_seen": 351597195, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.68359375, "step": 16293, "time_per_iteration": 2.4702019691467285 }, { "auxiliary_loss_clip": 0.01104738, "auxiliary_loss_mlp": 0.01030684, "balance_loss_clip": 1.01812482, "balance_loss_mlp": 1.03450274, "epoch": 0.9796482789718923, "flos": 28329928174080.0, "grad_norm": 50.08629109650354, "language_loss": 0.83988643, "learning_rate": 4.330580212414503e-09, "loss": 0.86124063, "num_input_tokens_seen": 351617460, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.703125, "step": 16294, "time_per_iteration": 2.5589945316314697 }, { "auxiliary_loss_clip": 0.0109945, "auxiliary_loss_mlp": 0.01032572, "balance_loss_clip": 1.02115202, "balance_loss_mlp": 1.03414106, "epoch": 0.9797084022245603, "flos": 17967976450560.0, "grad_norm": 2.2286484861467057, "language_loss": 0.71995616, "learning_rate": 4.305002567088767e-09, "loss": 0.74127638, "num_input_tokens_seen": 351635900, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.65234375, "step": 16295, "time_per_iteration": 5.296590089797974 }, { "auxiliary_loss_clip": 0.01108701, "auxiliary_loss_mlp": 0.01035841, "balance_loss_clip": 1.02338362, "balance_loss_mlp": 1.03744483, "epoch": 0.9797685254772284, "flos": 20266726118400.0, "grad_norm": 2.1720344443706634, "language_loss": 0.80793619, "learning_rate": 4.2795005987170674e-09, "loss": 0.82938164, "num_input_tokens_seen": 351655400, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 16296, "time_per_iteration": 2.461958885192871 }, { "auxiliary_loss_clip": 0.01102805, "auxiliary_loss_mlp": 0.0103361, "balance_loss_clip": 1.02180815, "balance_loss_mlp": 1.03487396, "epoch": 0.9798286487298963, "flos": 26907147480960.0, "grad_norm": 1.9598743866094825, "language_loss": 0.75508833, "learning_rate": 4.254074308266853e-09, "loss": 0.77645242, "num_input_tokens_seen": 351675505, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6796875, "step": 16297, "time_per_iteration": 2.509955883026123 }, { "auxiliary_loss_clip": 0.01107577, "auxiliary_loss_mlp": 0.01030487, "balance_loss_clip": 1.01837587, "balance_loss_mlp": 1.03622937, "epoch": 0.9798887719825643, "flos": 27161071701120.0, "grad_norm": 1.5961968182536344, "language_loss": 0.78522652, "learning_rate": 4.228723696702019e-09, "loss": 0.80660719, "num_input_tokens_seen": 351697920, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.71484375, "step": 16298, "time_per_iteration": 2.528857469558716 }, { "auxiliary_loss_clip": 0.01099382, "auxiliary_loss_mlp": 0.01027923, "balance_loss_clip": 1.01604998, "balance_loss_mlp": 1.03359056, "epoch": 0.9799488952352322, "flos": 20668422890880.0, "grad_norm": 5.6865953281667005, "language_loss": 0.72562152, "learning_rate": 4.203448764984019e-09, "loss": 0.7468946, "num_input_tokens_seen": 351717615, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.65625, "step": 16299, "time_per_iteration": 2.513666868209839 }, { "auxiliary_loss_clip": 0.01106361, "auxiliary_loss_mlp": 0.01028881, "balance_loss_clip": 1.01641142, "balance_loss_mlp": 1.03547394, "epoch": 0.9800090184879002, "flos": 21981209160960.0, "grad_norm": 2.8318401220465637, "language_loss": 0.89721024, "learning_rate": 4.178249514071419e-09, "loss": 0.91856265, "num_input_tokens_seen": 351735260, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.7109375, "step": 16300, "time_per_iteration": 2.4583115577697754 }, { "auxiliary_loss_clip": 0.0110512, "auxiliary_loss_mlp": 0.01031421, "balance_loss_clip": 1.01916659, "balance_loss_mlp": 1.03438902, "epoch": 0.9800691417405681, "flos": 21288420570240.0, "grad_norm": 2.179385652038805, "language_loss": 0.78053451, "learning_rate": 4.1531259449194555e-09, "loss": 0.80189991, "num_input_tokens_seen": 351755800, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.70703125, "step": 16301, "time_per_iteration": 2.448690176010132 }, { "auxiliary_loss_clip": 0.01104693, "auxiliary_loss_mlp": 0.0103764, "balance_loss_clip": 1.02462792, "balance_loss_mlp": 1.03533268, "epoch": 0.9801292649932362, "flos": 18439878355200.0, "grad_norm": 2.3455853632512684, "language_loss": 0.75208032, "learning_rate": 4.128078058480921e-09, "loss": 0.77350366, "num_input_tokens_seen": 351774790, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.6953125, "step": 16302, "time_per_iteration": 2.4460997581481934 }, { "auxiliary_loss_clip": 0.01103203, "auxiliary_loss_mlp": 0.01030009, "balance_loss_clip": 1.01758707, "balance_loss_mlp": 1.03523111, "epoch": 0.9801893882459041, "flos": 25046364343680.0, "grad_norm": 1.6668183067301838, "language_loss": 0.79799342, "learning_rate": 4.103105855705724e-09, "loss": 0.81932557, "num_input_tokens_seen": 351792855, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6796875, "step": 16303, "time_per_iteration": 2.5143940448760986 }, { "auxiliary_loss_clip": 0.01106224, "auxiliary_loss_mlp": 0.01029191, "balance_loss_clip": 1.01663816, "balance_loss_mlp": 1.03502822, "epoch": 0.9802495114985721, "flos": 18511484117760.0, "grad_norm": 2.386248364437594, "language_loss": 0.83111185, "learning_rate": 4.078209337540883e-09, "loss": 0.85246599, "num_input_tokens_seen": 351811450, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.7109375, "step": 16304, "time_per_iteration": 2.4455347061157227 }, { "auxiliary_loss_clip": 0.01097578, "auxiliary_loss_mlp": 0.01027236, "balance_loss_clip": 1.01644731, "balance_loss_mlp": 1.03333688, "epoch": 0.98030963475124, "flos": 21469841187840.0, "grad_norm": 1.9225626359254004, "language_loss": 0.70451713, "learning_rate": 4.053388504930089e-09, "loss": 0.72576529, "num_input_tokens_seen": 351831960, "router_z_loss_clip": 0.10791016, "router_z_loss_mlp": 0.64453125, "step": 16305, "time_per_iteration": 2.501410484313965 }, { "auxiliary_loss_clip": 0.01105154, "auxiliary_loss_mlp": 0.01033679, "balance_loss_clip": 1.02114975, "balance_loss_mlp": 1.0360744, "epoch": 0.980369758003908, "flos": 20412272027520.0, "grad_norm": 1.8312900605046223, "language_loss": 0.71864903, "learning_rate": 4.028643358815032e-09, "loss": 0.74003744, "num_input_tokens_seen": 351851585, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 16306, "time_per_iteration": 2.464324951171875 }, { "auxiliary_loss_clip": 0.01099745, "auxiliary_loss_mlp": 0.01029887, "balance_loss_clip": 1.018538, "balance_loss_mlp": 1.03433585, "epoch": 0.9804298812565759, "flos": 23399177431680.0, "grad_norm": 2.7971507417664117, "language_loss": 0.73344439, "learning_rate": 4.00397390013385e-09, "loss": 0.75474072, "num_input_tokens_seen": 351871085, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.65625, "step": 16307, "time_per_iteration": 2.4743261337280273 }, { "auxiliary_loss_clip": 0.01097417, "auxiliary_loss_mlp": 0.01029766, "balance_loss_clip": 1.01929975, "balance_loss_mlp": 1.03392458, "epoch": 0.980490004509244, "flos": 23292666627840.0, "grad_norm": 1.5128161071260415, "language_loss": 0.74781036, "learning_rate": 3.979380129822018e-09, "loss": 0.76908225, "num_input_tokens_seen": 351891775, "router_z_loss_clip": 0.10449219, "router_z_loss_mlp": 0.6328125, "step": 16308, "time_per_iteration": 2.4756288528442383 }, { "auxiliary_loss_clip": 0.01027646, "auxiliary_loss_mlp": 0.01001653, "balance_loss_clip": 1.00065792, "balance_loss_mlp": 1.00550091, "epoch": 0.980550127761912, "flos": 56051027798400.0, "grad_norm": 0.8558634553977762, "language_loss": 0.57711905, "learning_rate": 3.954862048811902e-09, "loss": 0.59741205, "num_input_tokens_seen": 351946770, "router_z_loss_clip": 0.00994873, "router_z_loss_mlp": 0.22167969, "step": 16309, "time_per_iteration": 2.97322416305542 }, { "auxiliary_loss_clip": 0.01102954, "auxiliary_loss_mlp": 0.01033447, "balance_loss_clip": 1.02095938, "balance_loss_mlp": 1.03353393, "epoch": 0.9806102510145799, "flos": 25333290184320.0, "grad_norm": 1.8550138467719317, "language_loss": 0.65983081, "learning_rate": 3.930419658033646e-09, "loss": 0.68119478, "num_input_tokens_seen": 351966155, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 16310, "time_per_iteration": 2.5000131130218506 }, { "auxiliary_loss_clip": 0.01027788, "auxiliary_loss_mlp": 0.01000652, "balance_loss_clip": 0.99961489, "balance_loss_mlp": 1.00560856, "epoch": 0.9806703742672479, "flos": 67274837429760.0, "grad_norm": 0.8690717095450183, "language_loss": 0.54546607, "learning_rate": 3.906052958413841e-09, "loss": 0.56575048, "num_input_tokens_seen": 352031655, "router_z_loss_clip": 0.01037598, "router_z_loss_mlp": 0.22167969, "step": 16311, "time_per_iteration": 3.1699118614196777 }, { "auxiliary_loss_clip": 0.01103747, "auxiliary_loss_mlp": 0.01025244, "balance_loss_clip": 1.01363873, "balance_loss_mlp": 1.03541756, "epoch": 0.9807304975199158, "flos": 25228970110080.0, "grad_norm": 1.9219059081808876, "language_loss": 0.7983433, "learning_rate": 3.881761950876638e-09, "loss": 0.81963319, "num_input_tokens_seen": 352051920, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.68359375, "step": 16312, "time_per_iteration": 2.5118653774261475 }, { "auxiliary_loss_clip": 0.01101944, "auxiliary_loss_mlp": 0.01027733, "balance_loss_clip": 1.01642561, "balance_loss_mlp": 1.03489137, "epoch": 0.9807906207725838, "flos": 17456392995840.0, "grad_norm": 1.9189230150681278, "language_loss": 0.63822603, "learning_rate": 3.8575466363430785e-09, "loss": 0.65952277, "num_input_tokens_seen": 352069315, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 16313, "time_per_iteration": 2.4329307079315186 }, { "auxiliary_loss_clip": 0.01103991, "auxiliary_loss_mlp": 0.01032463, "balance_loss_clip": 1.01978517, "balance_loss_mlp": 1.03605008, "epoch": 0.9808507440252517, "flos": 21032413361280.0, "grad_norm": 2.2808368389897873, "language_loss": 0.72545886, "learning_rate": 3.833407015731316e-09, "loss": 0.74682337, "num_input_tokens_seen": 352089480, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6796875, "step": 16314, "time_per_iteration": 2.483494758605957 }, { "auxiliary_loss_clip": 0.01027657, "auxiliary_loss_mlp": 0.01000157, "balance_loss_clip": 0.99912566, "balance_loss_mlp": 1.00544202, "epoch": 0.9809108672779198, "flos": 64044491598720.0, "grad_norm": 0.695740862659974, "language_loss": 0.51703358, "learning_rate": 3.80934308995684e-09, "loss": 0.53731167, "num_input_tokens_seen": 352150000, "router_z_loss_clip": 0.01031494, "router_z_loss_mlp": 0.22265625, "step": 16315, "time_per_iteration": 3.1024889945983887 }, { "auxiliary_loss_clip": 0.01102153, "auxiliary_loss_mlp": 0.01031011, "balance_loss_clip": 1.0195787, "balance_loss_mlp": 1.03387582, "epoch": 0.9809709905305877, "flos": 22780616296320.0, "grad_norm": 1.4010970595255405, "language_loss": 0.69811511, "learning_rate": 3.785354859932033e-09, "loss": 0.71944666, "num_input_tokens_seen": 352170990, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.68359375, "step": 16316, "time_per_iteration": 2.5218822956085205 }, { "auxiliary_loss_clip": 0.01104773, "auxiliary_loss_mlp": 0.01027236, "balance_loss_clip": 1.01593542, "balance_loss_mlp": 1.03470027, "epoch": 0.9810311137832557, "flos": 37013415217920.0, "grad_norm": 3.484463725777891, "language_loss": 0.55323672, "learning_rate": 3.76144232656661e-09, "loss": 0.57455683, "num_input_tokens_seen": 352195335, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.69921875, "step": 16317, "time_per_iteration": 2.6031575202941895 }, { "auxiliary_loss_clip": 0.01101443, "auxiliary_loss_mlp": 0.01028142, "balance_loss_clip": 1.01719236, "balance_loss_mlp": 1.03467202, "epoch": 0.9810912370359236, "flos": 18916305373440.0, "grad_norm": 1.6902484866940475, "language_loss": 0.7317425, "learning_rate": 3.737605490767404e-09, "loss": 0.75303829, "num_input_tokens_seen": 352214170, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.66796875, "step": 16318, "time_per_iteration": 2.4680545330047607 }, { "auxiliary_loss_clip": 0.01101377, "auxiliary_loss_mlp": 0.01026933, "balance_loss_clip": 1.0156436, "balance_loss_mlp": 1.03494394, "epoch": 0.9811513602885916, "flos": 18441602208000.0, "grad_norm": 2.208895290606069, "language_loss": 0.82475597, "learning_rate": 3.7138443534383555e-09, "loss": 0.84603906, "num_input_tokens_seen": 352231470, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.6640625, "step": 16319, "time_per_iteration": 2.466550350189209 }, { "auxiliary_loss_clip": 0.01027484, "auxiliary_loss_mlp": 0.01001962, "balance_loss_clip": 1.00094914, "balance_loss_mlp": 1.00529826, "epoch": 0.9812114835412595, "flos": 68058945371520.0, "grad_norm": 0.7301174400639631, "language_loss": 0.53534508, "learning_rate": 3.6901589154803014e-09, "loss": 0.55563956, "num_input_tokens_seen": 352291770, "router_z_loss_clip": 0.01013184, "router_z_loss_mlp": 0.22265625, "step": 16320, "time_per_iteration": 2.967801094055176 }, { "auxiliary_loss_clip": 0.01103855, "auxiliary_loss_mlp": 0.01035054, "balance_loss_clip": 1.02298415, "balance_loss_mlp": 1.03511381, "epoch": 0.9812716067939276, "flos": 25373007648000.0, "grad_norm": 1.8532216505731791, "language_loss": 0.73323137, "learning_rate": 3.6665491777914116e-09, "loss": 0.75462043, "num_input_tokens_seen": 352310735, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 16321, "time_per_iteration": 2.500295400619507 }, { "auxiliary_loss_clip": 0.0110484, "auxiliary_loss_mlp": 0.0102855, "balance_loss_clip": 1.01690936, "balance_loss_mlp": 1.03847575, "epoch": 0.9813317300465956, "flos": 22856818999680.0, "grad_norm": 1.724916369494126, "language_loss": 0.7852816, "learning_rate": 3.6430151412669698e-09, "loss": 0.80661547, "num_input_tokens_seen": 352329545, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6640625, "step": 16322, "time_per_iteration": 2.4788718223571777 }, { "auxiliary_loss_clip": 0.01103744, "auxiliary_loss_mlp": 0.01035099, "balance_loss_clip": 1.02248073, "balance_loss_mlp": 1.03580117, "epoch": 0.9813918532992635, "flos": 23586954756480.0, "grad_norm": 1.6261166431832932, "language_loss": 0.80466926, "learning_rate": 3.619556806799595e-09, "loss": 0.82605767, "num_input_tokens_seen": 352352080, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 16323, "time_per_iteration": 2.5202646255493164 }, { "auxiliary_loss_clip": 0.01107692, "auxiliary_loss_mlp": 0.01032351, "balance_loss_clip": 1.02065694, "balance_loss_mlp": 1.03694522, "epoch": 0.9814519765519315, "flos": 19606328616960.0, "grad_norm": 5.752045367702657, "language_loss": 0.84919679, "learning_rate": 3.596174175278799e-09, "loss": 0.87059724, "num_input_tokens_seen": 352366455, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.70703125, "step": 16324, "time_per_iteration": 2.4445557594299316 }, { "auxiliary_loss_clip": 0.0110328, "auxiliary_loss_mlp": 0.01030096, "balance_loss_clip": 1.01722765, "balance_loss_mlp": 1.03472662, "epoch": 0.9815120998045994, "flos": 33946284787200.0, "grad_norm": 1.5804584693338117, "language_loss": 0.74558806, "learning_rate": 3.5728672475909827e-09, "loss": 0.76692176, "num_input_tokens_seen": 352386090, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 16325, "time_per_iteration": 2.600590467453003 }, { "auxiliary_loss_clip": 0.01099691, "auxiliary_loss_mlp": 0.01033511, "balance_loss_clip": 1.02250814, "balance_loss_mlp": 1.03414559, "epoch": 0.9815722230572674, "flos": 20850023076480.0, "grad_norm": 1.7832117269225654, "language_loss": 0.76483911, "learning_rate": 3.5496360246201063e-09, "loss": 0.7861712, "num_input_tokens_seen": 352404000, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.65625, "step": 16326, "time_per_iteration": 2.4671714305877686 }, { "auxiliary_loss_clip": 0.01106722, "auxiliary_loss_mlp": 0.01029478, "balance_loss_clip": 1.01705086, "balance_loss_mlp": 1.03700316, "epoch": 0.9816323463099353, "flos": 22894525301760.0, "grad_norm": 1.8458835749529467, "language_loss": 0.67065245, "learning_rate": 3.5264805072470205e-09, "loss": 0.69201446, "num_input_tokens_seen": 352423540, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 16327, "time_per_iteration": 2.496443271636963 }, { "auxiliary_loss_clip": 0.01109173, "auxiliary_loss_mlp": 0.01035415, "balance_loss_clip": 1.02246284, "balance_loss_mlp": 1.03647089, "epoch": 0.9816924695626034, "flos": 31539444117120.0, "grad_norm": 2.4779774326432937, "language_loss": 0.74038094, "learning_rate": 3.5034006963501337e-09, "loss": 0.76182681, "num_input_tokens_seen": 352445530, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.7265625, "step": 16328, "time_per_iteration": 4.033248424530029 }, { "auxiliary_loss_clip": 0.01111178, "auxiliary_loss_mlp": 0.01035143, "balance_loss_clip": 1.02185762, "balance_loss_mlp": 1.03724086, "epoch": 0.9817525928152713, "flos": 21506901045120.0, "grad_norm": 1.886483495343433, "language_loss": 0.81633967, "learning_rate": 3.4803965928040802e-09, "loss": 0.83780289, "num_input_tokens_seen": 352466325, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.73828125, "step": 16329, "time_per_iteration": 2.5140321254730225 }, { "auxiliary_loss_clip": 0.01106208, "auxiliary_loss_mlp": 0.01030059, "balance_loss_clip": 1.01683843, "balance_loss_mlp": 1.03413033, "epoch": 0.9818127160679393, "flos": 25550513683200.0, "grad_norm": 2.2448911300079812, "language_loss": 0.75930196, "learning_rate": 3.4574681974817168e-09, "loss": 0.78066462, "num_input_tokens_seen": 352485505, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.71875, "step": 16330, "time_per_iteration": 2.486720085144043 }, { "auxiliary_loss_clip": 0.01112642, "auxiliary_loss_mlp": 0.01033292, "balance_loss_clip": 1.01817667, "balance_loss_mlp": 1.03708732, "epoch": 0.9818728393206072, "flos": 28803661672320.0, "grad_norm": 2.786334665210625, "language_loss": 0.66815609, "learning_rate": 3.434615511252126e-09, "loss": 0.68961537, "num_input_tokens_seen": 352505360, "router_z_loss_clip": 0.15136719, "router_z_loss_mlp": 0.75390625, "step": 16331, "time_per_iteration": 3.959516763687134 }, { "auxiliary_loss_clip": 0.01102143, "auxiliary_loss_mlp": 0.01030144, "balance_loss_clip": 1.01843739, "balance_loss_mlp": 1.03404105, "epoch": 0.9819329625732752, "flos": 23222246014080.0, "grad_norm": 1.8841001608062657, "language_loss": 0.73442274, "learning_rate": 3.411838534981948e-09, "loss": 0.75574559, "num_input_tokens_seen": 352524035, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 16332, "time_per_iteration": 2.482886791229248 }, { "auxiliary_loss_clip": 0.01103544, "auxiliary_loss_mlp": 0.01026195, "balance_loss_clip": 1.01515675, "balance_loss_mlp": 1.03649426, "epoch": 0.9819930858259431, "flos": 17530440883200.0, "grad_norm": 1.803118231956261, "language_loss": 0.76795065, "learning_rate": 3.389137269534936e-09, "loss": 0.78924805, "num_input_tokens_seen": 352543210, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.671875, "step": 16333, "time_per_iteration": 2.446132183074951 }, { "auxiliary_loss_clip": 0.01102191, "auxiliary_loss_mlp": 0.0103023, "balance_loss_clip": 1.01801074, "balance_loss_mlp": 1.03430557, "epoch": 0.9820532090786112, "flos": 12529915971840.0, "grad_norm": 2.2441061510675726, "language_loss": 0.72950613, "learning_rate": 3.366511715771958e-09, "loss": 0.75083035, "num_input_tokens_seen": 352559770, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.6796875, "step": 16334, "time_per_iteration": 2.427671432495117 }, { "auxiliary_loss_clip": 0.01103514, "auxiliary_loss_mlp": 0.01036507, "balance_loss_clip": 1.02438366, "balance_loss_mlp": 1.03419435, "epoch": 0.9821133323312792, "flos": 18840174497280.0, "grad_norm": 5.790513638207259, "language_loss": 0.78457057, "learning_rate": 3.3439618745509934e-09, "loss": 0.80597079, "num_input_tokens_seen": 352577690, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 16335, "time_per_iteration": 2.450021982192993 }, { "auxiliary_loss_clip": 0.01109288, "auxiliary_loss_mlp": 0.01037856, "balance_loss_clip": 1.02376592, "balance_loss_mlp": 1.03658319, "epoch": 0.9821734555839471, "flos": 34824013528320.0, "grad_norm": 1.9545942020648572, "language_loss": 0.64279109, "learning_rate": 3.3214877467271362e-09, "loss": 0.66426253, "num_input_tokens_seen": 352598850, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.7265625, "step": 16336, "time_per_iteration": 3.9642910957336426 }, { "auxiliary_loss_clip": 0.01108191, "auxiliary_loss_mlp": 0.01035133, "balance_loss_clip": 1.0210309, "balance_loss_mlp": 1.03540826, "epoch": 0.9822335788366151, "flos": 17128169493120.0, "grad_norm": 1.857899936637487, "language_loss": 0.73272169, "learning_rate": 3.299089333152372e-09, "loss": 0.75415492, "num_input_tokens_seen": 352616130, "router_z_loss_clip": 0.14160156, "router_z_loss_mlp": 0.7265625, "step": 16337, "time_per_iteration": 3.864236354827881 }, { "auxiliary_loss_clip": 0.01105026, "auxiliary_loss_mlp": 0.01031607, "balance_loss_clip": 1.01824975, "balance_loss_mlp": 1.03451443, "epoch": 0.982293702089283, "flos": 20813250528000.0, "grad_norm": 1.5836998531690818, "language_loss": 0.73391902, "learning_rate": 3.2767666346764645e-09, "loss": 0.75528538, "num_input_tokens_seen": 352636885, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.70703125, "step": 16338, "time_per_iteration": 2.461205244064331 }, { "auxiliary_loss_clip": 0.01101352, "auxiliary_loss_mlp": 0.01029221, "balance_loss_clip": 1.01731801, "balance_loss_mlp": 1.03325486, "epoch": 0.982353825341951, "flos": 24680829588480.0, "grad_norm": 1.668774492988766, "language_loss": 0.81564313, "learning_rate": 3.2545196521454045e-09, "loss": 0.83694887, "num_input_tokens_seen": 352657905, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 16339, "time_per_iteration": 2.485968589782715 }, { "auxiliary_loss_clip": 0.01100473, "auxiliary_loss_mlp": 0.01031651, "balance_loss_clip": 1.02011752, "balance_loss_mlp": 1.03436661, "epoch": 0.982413948594619, "flos": 20850489953280.0, "grad_norm": 1.8345766545074953, "language_loss": 0.62730598, "learning_rate": 3.232348386403405e-09, "loss": 0.64862722, "num_input_tokens_seen": 352676320, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66015625, "step": 16340, "time_per_iteration": 2.480482816696167 }, { "auxiliary_loss_clip": 0.01108569, "auxiliary_loss_mlp": 0.01031233, "balance_loss_clip": 1.01846004, "balance_loss_mlp": 1.03762984, "epoch": 0.982474071847287, "flos": 15377380778880.0, "grad_norm": 2.789934465674263, "language_loss": 0.86311543, "learning_rate": 3.2102528382904613e-09, "loss": 0.88451344, "num_input_tokens_seen": 352692665, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.7109375, "step": 16341, "time_per_iteration": 2.449172258377075 }, { "auxiliary_loss_clip": 0.01099848, "auxiliary_loss_mlp": 0.01026124, "balance_loss_clip": 1.01440537, "balance_loss_mlp": 1.03418303, "epoch": 0.9825341950999549, "flos": 23774732081280.0, "grad_norm": 1.533194069626265, "language_loss": 0.67022562, "learning_rate": 3.188233008645014e-09, "loss": 0.69148529, "num_input_tokens_seen": 352716130, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.65625, "step": 16342, "time_per_iteration": 2.532597780227661 }, { "auxiliary_loss_clip": 0.01103868, "auxiliary_loss_mlp": 0.01025455, "balance_loss_clip": 1.01337886, "balance_loss_mlp": 1.03391361, "epoch": 0.9825943183526229, "flos": 22746285872640.0, "grad_norm": 1.5688719799529383, "language_loss": 0.77709764, "learning_rate": 3.16628889830195e-09, "loss": 0.79839087, "num_input_tokens_seen": 352734705, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69921875, "step": 16343, "time_per_iteration": 2.4745075702667236 }, { "auxiliary_loss_clip": 0.01101129, "auxiliary_loss_mlp": 0.01029679, "balance_loss_clip": 1.0188669, "balance_loss_mlp": 1.03380692, "epoch": 0.9826544416052908, "flos": 27709966408320.0, "grad_norm": 1.6634797068373024, "language_loss": 0.75551081, "learning_rate": 3.1444205080932707e-09, "loss": 0.77681893, "num_input_tokens_seen": 352756225, "router_z_loss_clip": 0.10791016, "router_z_loss_mlp": 0.671875, "step": 16344, "time_per_iteration": 2.5124895572662354 }, { "auxiliary_loss_clip": 0.01104715, "auxiliary_loss_mlp": 0.01032164, "balance_loss_clip": 1.02015972, "balance_loss_mlp": 1.03551245, "epoch": 0.9827145648579588, "flos": 26941657472640.0, "grad_norm": 2.0337808415757053, "language_loss": 0.66331416, "learning_rate": 3.122627838848313e-09, "loss": 0.68468297, "num_input_tokens_seen": 352776210, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 16345, "time_per_iteration": 2.511167287826538 }, { "auxiliary_loss_clip": 0.01098601, "auxiliary_loss_mlp": 0.01025887, "balance_loss_clip": 1.01512837, "balance_loss_mlp": 1.03341591, "epoch": 0.9827746881106267, "flos": 21866545969920.0, "grad_norm": 1.5329429093352727, "language_loss": 0.79518104, "learning_rate": 3.1009108913933045e-09, "loss": 0.81642592, "num_input_tokens_seen": 352795455, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.65234375, "step": 16346, "time_per_iteration": 2.4593310356140137 }, { "auxiliary_loss_clip": 0.01108985, "auxiliary_loss_mlp": 0.01033263, "balance_loss_clip": 1.02022743, "balance_loss_mlp": 1.0356046, "epoch": 0.9828348113632948, "flos": 20850777262080.0, "grad_norm": 5.167048285755752, "language_loss": 0.75068909, "learning_rate": 3.079269666552031e-09, "loss": 0.77211154, "num_input_tokens_seen": 352812895, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.734375, "step": 16347, "time_per_iteration": 2.4686083793640137 }, { "auxiliary_loss_clip": 0.01098731, "auxiliary_loss_mlp": 0.01032233, "balance_loss_clip": 1.02115226, "balance_loss_mlp": 1.03261387, "epoch": 0.9828949346159628, "flos": 34569227381760.0, "grad_norm": 1.777312078441802, "language_loss": 0.66633409, "learning_rate": 3.0577041651449474e-09, "loss": 0.68764377, "num_input_tokens_seen": 352835470, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.66015625, "step": 16348, "time_per_iteration": 2.57529616355896 }, { "auxiliary_loss_clip": 0.01104801, "auxiliary_loss_mlp": 0.01029352, "balance_loss_clip": 1.0168829, "balance_loss_mlp": 1.03600836, "epoch": 0.9829550578686307, "flos": 24457464864000.0, "grad_norm": 1.833503678764719, "language_loss": 0.69335741, "learning_rate": 3.0362143879898437e-09, "loss": 0.71469891, "num_input_tokens_seen": 352854295, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.6875, "step": 16349, "time_per_iteration": 2.5058064460754395 }, { "auxiliary_loss_clip": 0.01098825, "auxiliary_loss_mlp": 0.01030368, "balance_loss_clip": 1.01938915, "balance_loss_mlp": 1.03449082, "epoch": 0.9830151811212987, "flos": 16910084067840.0, "grad_norm": 2.4011683468752083, "language_loss": 0.76450491, "learning_rate": 3.0148003359014018e-09, "loss": 0.78579688, "num_input_tokens_seen": 352869695, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.640625, "step": 16350, "time_per_iteration": 2.4416911602020264 }, { "auxiliary_loss_clip": 0.01102864, "auxiliary_loss_mlp": 0.01033118, "balance_loss_clip": 1.0202316, "balance_loss_mlp": 1.03394508, "epoch": 0.9830753043739666, "flos": 21288312829440.0, "grad_norm": 2.505705263847144, "language_loss": 0.84287918, "learning_rate": 2.9934620096920826e-09, "loss": 0.86423898, "num_input_tokens_seen": 352887430, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6875, "step": 16351, "time_per_iteration": 2.4785847663879395 }, { "auxiliary_loss_clip": 0.01101799, "auxiliary_loss_mlp": 0.01024819, "balance_loss_clip": 1.01293397, "balance_loss_mlp": 1.03290343, "epoch": 0.9831354276266346, "flos": 31723522341120.0, "grad_norm": 3.155536445095927, "language_loss": 0.68590665, "learning_rate": 2.972199410170795e-09, "loss": 0.70717287, "num_input_tokens_seen": 352907555, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 16352, "time_per_iteration": 2.537083864212036 }, { "auxiliary_loss_clip": 0.0110023, "auxiliary_loss_mlp": 0.01029793, "balance_loss_clip": 1.01860499, "balance_loss_mlp": 1.03403175, "epoch": 0.9831955508793025, "flos": 21619050284160.0, "grad_norm": 2.1964965975892317, "language_loss": 0.66505969, "learning_rate": 2.951012538143782e-09, "loss": 0.68635988, "num_input_tokens_seen": 352928670, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.66015625, "step": 16353, "time_per_iteration": 2.501433849334717 }, { "auxiliary_loss_clip": 0.01099335, "auxiliary_loss_mlp": 0.01027864, "balance_loss_clip": 1.0166409, "balance_loss_mlp": 1.03263688, "epoch": 0.9832556741319706, "flos": 22968214053120.0, "grad_norm": 1.6202243245543901, "language_loss": 0.74189711, "learning_rate": 2.9299013944144025e-09, "loss": 0.76316905, "num_input_tokens_seen": 352948345, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.66796875, "step": 16354, "time_per_iteration": 2.5112597942352295 }, { "auxiliary_loss_clip": 0.01101627, "auxiliary_loss_mlp": 0.01029858, "balance_loss_clip": 1.01756155, "balance_loss_mlp": 1.03404355, "epoch": 0.9833157973846385, "flos": 21323900229120.0, "grad_norm": 3.642550810978028, "language_loss": 0.77560884, "learning_rate": 2.9088659797835702e-09, "loss": 0.79692364, "num_input_tokens_seen": 352967250, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.67578125, "step": 16355, "time_per_iteration": 2.5512068271636963 }, { "auxiliary_loss_clip": 0.01101634, "auxiliary_loss_mlp": 0.01029606, "balance_loss_clip": 1.01803076, "balance_loss_mlp": 1.03444123, "epoch": 0.9833759206373065, "flos": 21068719032960.0, "grad_norm": 1.7629674331485334, "language_loss": 0.73081422, "learning_rate": 2.8879062950484256e-09, "loss": 0.75212663, "num_input_tokens_seen": 352984725, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 16356, "time_per_iteration": 2.475022554397583 }, { "auxiliary_loss_clip": 0.01101609, "auxiliary_loss_mlp": 0.01029182, "balance_loss_clip": 1.01705861, "balance_loss_mlp": 1.03501058, "epoch": 0.9834360438899744, "flos": 18697322108160.0, "grad_norm": 2.9286472709321836, "language_loss": 0.75859636, "learning_rate": 2.8670223410041104e-09, "loss": 0.77990431, "num_input_tokens_seen": 353003480, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.66796875, "step": 16357, "time_per_iteration": 2.4979047775268555 }, { "auxiliary_loss_clip": 0.01102726, "auxiliary_loss_mlp": 0.01025534, "balance_loss_clip": 1.01329088, "balance_loss_mlp": 1.03534293, "epoch": 0.9834961671426424, "flos": 21105240186240.0, "grad_norm": 4.572212538839715, "language_loss": 0.80000758, "learning_rate": 2.846214118442436e-09, "loss": 0.82129019, "num_input_tokens_seen": 353021425, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.67578125, "step": 16358, "time_per_iteration": 2.440824031829834 }, { "auxiliary_loss_clip": 0.01101777, "auxiliary_loss_mlp": 0.01025777, "balance_loss_clip": 1.01396966, "balance_loss_mlp": 1.03347909, "epoch": 0.9835562903953103, "flos": 26687625511680.0, "grad_norm": 1.9996710454871767, "language_loss": 0.67542595, "learning_rate": 2.8254816281523263e-09, "loss": 0.69670153, "num_input_tokens_seen": 353039870, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 16359, "time_per_iteration": 2.5107128620147705 }, { "auxiliary_loss_clip": 0.01099939, "auxiliary_loss_mlp": 0.01029316, "balance_loss_clip": 1.01831341, "balance_loss_mlp": 1.03344166, "epoch": 0.9836164136479784, "flos": 22090162089600.0, "grad_norm": 1.622660109505336, "language_loss": 0.69545686, "learning_rate": 2.804824870920264e-09, "loss": 0.71674943, "num_input_tokens_seen": 353059750, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6640625, "step": 16360, "time_per_iteration": 2.489349842071533 }, { "auxiliary_loss_clip": 0.01106082, "auxiliary_loss_mlp": 0.01035363, "balance_loss_clip": 1.02267313, "balance_loss_mlp": 1.03577936, "epoch": 0.9836765369006463, "flos": 23878405710720.0, "grad_norm": 1.9793474073372168, "language_loss": 0.8402276, "learning_rate": 2.7842438475293996e-09, "loss": 0.861642, "num_input_tokens_seen": 353079940, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.703125, "step": 16361, "time_per_iteration": 2.499021530151367 }, { "auxiliary_loss_clip": 0.01102475, "auxiliary_loss_mlp": 0.01028707, "balance_loss_clip": 1.01722121, "balance_loss_mlp": 1.03400397, "epoch": 0.9837366601533143, "flos": 25845017293440.0, "grad_norm": 1.9296288233976688, "language_loss": 0.76073217, "learning_rate": 2.76373855876022e-09, "loss": 0.78204405, "num_input_tokens_seen": 353099990, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.68359375, "step": 16362, "time_per_iteration": 2.4732449054718018 }, { "auxiliary_loss_clip": 0.01103885, "auxiliary_loss_mlp": 0.01031648, "balance_loss_clip": 1.01926184, "balance_loss_mlp": 1.03574526, "epoch": 0.9837967834059823, "flos": 21358015171200.0, "grad_norm": 2.6620125636235166, "language_loss": 0.70950365, "learning_rate": 2.7433090053901043e-09, "loss": 0.73085898, "num_input_tokens_seen": 353118710, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 16363, "time_per_iteration": 2.4690675735473633 }, { "auxiliary_loss_clip": 0.01098521, "auxiliary_loss_mlp": 0.01029553, "balance_loss_clip": 1.01821649, "balance_loss_mlp": 1.03339767, "epoch": 0.9838569066586502, "flos": 18515793749760.0, "grad_norm": 3.6444463660315387, "language_loss": 0.63202804, "learning_rate": 2.7229551881937653e-09, "loss": 0.65330881, "num_input_tokens_seen": 353136415, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.65234375, "step": 16364, "time_per_iteration": 2.3982207775115967 }, { "auxiliary_loss_clip": 0.01103591, "auxiliary_loss_mlp": 0.01030408, "balance_loss_clip": 1.01925588, "balance_loss_mlp": 1.0350287, "epoch": 0.9839170299113182, "flos": 22452392793600.0, "grad_norm": 1.6409110897575434, "language_loss": 0.75064284, "learning_rate": 2.702677107943252e-09, "loss": 0.77198279, "num_input_tokens_seen": 353154650, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6875, "step": 16365, "time_per_iteration": 2.423957109451294 }, { "auxiliary_loss_clip": 0.01100821, "auxiliary_loss_mlp": 0.01027522, "balance_loss_clip": 1.01501703, "balance_loss_mlp": 1.03352475, "epoch": 0.9839771531639862, "flos": 27892320779520.0, "grad_norm": 2.279004256171764, "language_loss": 0.75839365, "learning_rate": 2.6824747654072832e-09, "loss": 0.77967703, "num_input_tokens_seen": 353174065, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.671875, "step": 16366, "time_per_iteration": 2.5044119358062744 }, { "auxiliary_loss_clip": 0.01100637, "auxiliary_loss_mlp": 0.01027369, "balance_loss_clip": 1.01619291, "balance_loss_mlp": 1.03396869, "epoch": 0.9840372764166542, "flos": 28214510797440.0, "grad_norm": 1.955031285681805, "language_loss": 0.77425086, "learning_rate": 2.662348161352357e-09, "loss": 0.79553092, "num_input_tokens_seen": 353193560, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.6640625, "step": 16367, "time_per_iteration": 2.468851089477539 }, { "auxiliary_loss_clip": 0.01103597, "auxiliary_loss_mlp": 0.01032108, "balance_loss_clip": 1.01919174, "balance_loss_mlp": 1.03700328, "epoch": 0.9840973996693221, "flos": 23403989854080.0, "grad_norm": 4.926984027981053, "language_loss": 0.61193651, "learning_rate": 2.642297296540974e-09, "loss": 0.63329351, "num_input_tokens_seen": 353213525, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6640625, "step": 16368, "time_per_iteration": 2.471798896789551 }, { "auxiliary_loss_clip": 0.01098842, "auxiliary_loss_mlp": 0.01032394, "balance_loss_clip": 1.02164125, "balance_loss_mlp": 1.03365803, "epoch": 0.9841575229219901, "flos": 21395865127680.0, "grad_norm": 1.950674410620547, "language_loss": 0.65566087, "learning_rate": 2.6223221717340816e-09, "loss": 0.67697328, "num_input_tokens_seen": 353234000, "router_z_loss_clip": 0.10742188, "router_z_loss_mlp": 0.65234375, "step": 16369, "time_per_iteration": 3.918022871017456 }, { "auxiliary_loss_clip": 0.01105118, "auxiliary_loss_mlp": 0.01036525, "balance_loss_clip": 1.02393043, "balance_loss_mlp": 1.03567624, "epoch": 0.984217646174658, "flos": 24464072966400.0, "grad_norm": 1.889070082198475, "language_loss": 0.68565375, "learning_rate": 2.6024227876886295e-09, "loss": 0.70707023, "num_input_tokens_seen": 353254940, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 16370, "time_per_iteration": 2.4741482734680176 }, { "auxiliary_loss_clip": 0.01103345, "auxiliary_loss_mlp": 0.01033406, "balance_loss_clip": 1.02018547, "balance_loss_mlp": 1.03384805, "epoch": 0.984277769427326, "flos": 16435057680000.0, "grad_norm": 2.076589506399239, "language_loss": 0.73617637, "learning_rate": 2.582599145159792e-09, "loss": 0.7575438, "num_input_tokens_seen": 353272590, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.6953125, "step": 16371, "time_per_iteration": 2.4340314865112305 }, { "auxiliary_loss_clip": 0.01027833, "auxiliary_loss_mlp": 0.01001428, "balance_loss_clip": 1.00045002, "balance_loss_mlp": 1.00557351, "epoch": 0.9843378926799939, "flos": 64530615288960.0, "grad_norm": 0.7762681307089215, "language_loss": 0.65136462, "learning_rate": 2.562851244898745e-09, "loss": 0.67165726, "num_input_tokens_seen": 353334380, "router_z_loss_clip": 0.00976562, "router_z_loss_mlp": 0.22265625, "step": 16372, "time_per_iteration": 3.088104724884033 }, { "auxiliary_loss_clip": 0.01100661, "auxiliary_loss_mlp": 0.01027419, "balance_loss_clip": 1.01573074, "balance_loss_mlp": 1.03350735, "epoch": 0.984398015932662, "flos": 17382811985280.0, "grad_norm": 1.833775177667484, "language_loss": 0.70439088, "learning_rate": 2.5431790876544456e-09, "loss": 0.72567177, "num_input_tokens_seen": 353351640, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.671875, "step": 16373, "time_per_iteration": 3.7964017391204834 }, { "auxiliary_loss_clip": 0.01102386, "auxiliary_loss_mlp": 0.01028376, "balance_loss_clip": 1.01695633, "balance_loss_mlp": 1.03584266, "epoch": 0.9844581391853299, "flos": 23879088069120.0, "grad_norm": 1.898992145694327, "language_loss": 0.81658864, "learning_rate": 2.523582674173186e-09, "loss": 0.83789623, "num_input_tokens_seen": 353372555, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6640625, "step": 16374, "time_per_iteration": 2.5035853385925293 }, { "auxiliary_loss_clip": 0.01105558, "auxiliary_loss_mlp": 0.01034631, "balance_loss_clip": 1.02248931, "balance_loss_mlp": 1.03607798, "epoch": 0.9845182624379979, "flos": 19865352568320.0, "grad_norm": 1.7814458833759568, "language_loss": 0.69284838, "learning_rate": 2.504062005197927e-09, "loss": 0.71425033, "num_input_tokens_seen": 353391385, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6953125, "step": 16375, "time_per_iteration": 2.433270215988159 }, { "auxiliary_loss_clip": 0.01104927, "auxiliary_loss_mlp": 0.01035085, "balance_loss_clip": 1.02203786, "balance_loss_mlp": 1.03423643, "epoch": 0.9845783856906659, "flos": 28254659224320.0, "grad_norm": 1.8185589459716478, "language_loss": 0.80866849, "learning_rate": 2.484617081468521e-09, "loss": 0.83006865, "num_input_tokens_seen": 353411630, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.70703125, "step": 16376, "time_per_iteration": 2.5093202590942383 }, { "auxiliary_loss_clip": 0.0110043, "auxiliary_loss_mlp": 0.0103137, "balance_loss_clip": 1.01950884, "balance_loss_mlp": 1.0338223, "epoch": 0.9846385089433338, "flos": 28328383889280.0, "grad_norm": 2.1214097602879867, "language_loss": 0.6225639, "learning_rate": 2.4652479037228224e-09, "loss": 0.64388192, "num_input_tokens_seen": 353432895, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6640625, "step": 16377, "time_per_iteration": 2.4947755336761475 }, { "auxiliary_loss_clip": 0.01104809, "auxiliary_loss_mlp": 0.01036447, "balance_loss_clip": 1.02409697, "balance_loss_mlp": 1.03540897, "epoch": 0.9846986321960018, "flos": 24316767290880.0, "grad_norm": 1.67703633037335, "language_loss": 0.73183101, "learning_rate": 2.445954472695133e-09, "loss": 0.75324357, "num_input_tokens_seen": 353454195, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.6953125, "step": 16378, "time_per_iteration": 5.248405933380127 }, { "auxiliary_loss_clip": 0.0110282, "auxiliary_loss_mlp": 0.01034521, "balance_loss_clip": 1.02272534, "balance_loss_mlp": 1.03440309, "epoch": 0.9847587554486698, "flos": 27271999877760.0, "grad_norm": 1.6699923671359604, "language_loss": 0.71091616, "learning_rate": 2.426736789116868e-09, "loss": 0.73228955, "num_input_tokens_seen": 353475125, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.68359375, "step": 16379, "time_per_iteration": 2.5030412673950195 }, { "auxiliary_loss_clip": 0.01105681, "auxiliary_loss_mlp": 0.01030014, "balance_loss_clip": 1.01785469, "balance_loss_mlp": 1.03620458, "epoch": 0.9848188787013378, "flos": 16542717719040.0, "grad_norm": 1.9460483043541916, "language_loss": 0.68164611, "learning_rate": 2.407594853716999e-09, "loss": 0.70300305, "num_input_tokens_seen": 353493265, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6953125, "step": 16380, "time_per_iteration": 2.4169723987579346 }, { "auxiliary_loss_clip": 0.01107573, "auxiliary_loss_mlp": 0.01033149, "balance_loss_clip": 1.0210073, "balance_loss_mlp": 1.03568316, "epoch": 0.9848790019540057, "flos": 20193647898240.0, "grad_norm": 1.9130187629214837, "language_loss": 0.79029161, "learning_rate": 2.38852866722139e-09, "loss": 0.81169879, "num_input_tokens_seen": 353511650, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.71875, "step": 16381, "time_per_iteration": 2.443521738052368 }, { "auxiliary_loss_clip": 0.01104703, "auxiliary_loss_mlp": 0.01028764, "balance_loss_clip": 1.01689625, "balance_loss_mlp": 1.03507435, "epoch": 0.9849391252066737, "flos": 28259723041920.0, "grad_norm": 1.6131359832082086, "language_loss": 0.8259564, "learning_rate": 2.3695382303527965e-09, "loss": 0.84729105, "num_input_tokens_seen": 353534035, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 16382, "time_per_iteration": 2.513484239578247 }, { "auxiliary_loss_clip": 0.01106165, "auxiliary_loss_mlp": 0.01033667, "balance_loss_clip": 1.02041149, "balance_loss_mlp": 1.03435874, "epoch": 0.9849992484593416, "flos": 22454942659200.0, "grad_norm": 2.606992035626309, "language_loss": 0.74519658, "learning_rate": 2.3506235438315316e-09, "loss": 0.76659495, "num_input_tokens_seen": 353549950, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.71875, "step": 16383, "time_per_iteration": 2.446392297744751 }, { "auxiliary_loss_clip": 0.01105205, "auxiliary_loss_mlp": 0.01031017, "balance_loss_clip": 1.01929832, "balance_loss_mlp": 1.03660262, "epoch": 0.9850593717120096, "flos": 34497190656000.0, "grad_norm": 2.4183953758228407, "language_loss": 0.6609093, "learning_rate": 2.3317846083750203e-09, "loss": 0.68227148, "num_input_tokens_seen": 353573745, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 16384, "time_per_iteration": 2.5642523765563965 }, { "auxiliary_loss_clip": 0.01109154, "auxiliary_loss_mlp": 0.01033732, "balance_loss_clip": 1.02007639, "balance_loss_mlp": 1.03735828, "epoch": 0.9851194949646775, "flos": 38837282152320.0, "grad_norm": 1.750675944926548, "language_loss": 0.70156842, "learning_rate": 2.313021424697359e-09, "loss": 0.72299731, "num_input_tokens_seen": 353595335, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.71875, "step": 16385, "time_per_iteration": 2.596649408340454 }, { "auxiliary_loss_clip": 0.01108557, "auxiliary_loss_mlp": 0.01034039, "balance_loss_clip": 1.02171898, "balance_loss_mlp": 1.03932917, "epoch": 0.9851796182173456, "flos": 17712436118400.0, "grad_norm": 1.9981020979692283, "language_loss": 0.80676109, "learning_rate": 2.294333993509978e-09, "loss": 0.82818705, "num_input_tokens_seen": 353614270, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.69140625, "step": 16386, "time_per_iteration": 2.4254324436187744 }, { "auxiliary_loss_clip": 0.01104845, "auxiliary_loss_mlp": 0.01035345, "balance_loss_clip": 1.02284026, "balance_loss_mlp": 1.03561151, "epoch": 0.9852397414700135, "flos": 27454318335360.0, "grad_norm": 1.9937328009417181, "language_loss": 0.6809606, "learning_rate": 2.2757223155216442e-09, "loss": 0.70236248, "num_input_tokens_seen": 353634900, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 16387, "time_per_iteration": 2.507279872894287 }, { "auxiliary_loss_clip": 0.01095493, "auxiliary_loss_mlp": 0.01026609, "balance_loss_clip": 1.01532638, "balance_loss_mlp": 1.03120828, "epoch": 0.9852998647226815, "flos": 18296702743680.0, "grad_norm": 2.1229972634939878, "language_loss": 0.73732489, "learning_rate": 2.257186391438237e-09, "loss": 0.75854594, "num_input_tokens_seen": 353652890, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.640625, "step": 16388, "time_per_iteration": 2.4150049686431885 }, { "auxiliary_loss_clip": 0.01100147, "auxiliary_loss_mlp": 0.01027798, "balance_loss_clip": 1.01637816, "balance_loss_mlp": 1.03265619, "epoch": 0.9853599879753495, "flos": 19642562461440.0, "grad_norm": 1.7050202408143342, "language_loss": 0.81980497, "learning_rate": 2.238726221962528e-09, "loss": 0.84108436, "num_input_tokens_seen": 353671295, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.67578125, "step": 16389, "time_per_iteration": 2.4315624237060547 }, { "auxiliary_loss_clip": 0.01101498, "auxiliary_loss_mlp": 0.01025907, "balance_loss_clip": 1.01388466, "balance_loss_mlp": 1.03430438, "epoch": 0.9854201112280174, "flos": 23841956384640.0, "grad_norm": 2.0306857519414563, "language_loss": 0.67144173, "learning_rate": 2.2203418077946234e-09, "loss": 0.69271576, "num_input_tokens_seen": 353690560, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.671875, "step": 16390, "time_per_iteration": 2.454068183898926 }, { "auxiliary_loss_clip": 0.01104547, "auxiliary_loss_mlp": 0.01032189, "balance_loss_clip": 1.01921296, "balance_loss_mlp": 1.03541684, "epoch": 0.9854802344806854, "flos": 30080573233920.0, "grad_norm": 1.6639107424144277, "language_loss": 0.77240515, "learning_rate": 2.2020331496312994e-09, "loss": 0.79377258, "num_input_tokens_seen": 353710660, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69140625, "step": 16391, "time_per_iteration": 2.548011064529419 }, { "auxiliary_loss_clip": 0.01097241, "auxiliary_loss_mlp": 0.01029741, "balance_loss_clip": 1.01876783, "balance_loss_mlp": 1.03325939, "epoch": 0.9855403577333534, "flos": 21907412668800.0, "grad_norm": 1.9083505993761885, "language_loss": 0.68210733, "learning_rate": 2.1838002481673333e-09, "loss": 0.70337713, "num_input_tokens_seen": 353730440, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.640625, "step": 16392, "time_per_iteration": 2.4772794246673584 }, { "auxiliary_loss_clip": 0.01107352, "auxiliary_loss_mlp": 0.01031689, "balance_loss_clip": 1.01843929, "balance_loss_mlp": 1.03499293, "epoch": 0.9856004809860214, "flos": 15413794191360.0, "grad_norm": 2.061203815342255, "language_loss": 0.56032264, "learning_rate": 2.1656431040937286e-09, "loss": 0.58171308, "num_input_tokens_seen": 353748360, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.72265625, "step": 16393, "time_per_iteration": 2.4208872318267822 }, { "auxiliary_loss_clip": 0.01109404, "auxiliary_loss_mlp": 0.01030493, "balance_loss_clip": 1.01746953, "balance_loss_mlp": 1.03667712, "epoch": 0.9856606042386893, "flos": 13653201064320.0, "grad_norm": 3.13848392041226, "language_loss": 0.78734207, "learning_rate": 2.1475617180990444e-09, "loss": 0.80874109, "num_input_tokens_seen": 353760880, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7265625, "step": 16394, "time_per_iteration": 2.3760766983032227 }, { "auxiliary_loss_clip": 0.01107418, "auxiliary_loss_mlp": 0.01031767, "balance_loss_clip": 1.01958418, "balance_loss_mlp": 1.03578067, "epoch": 0.9857207274913573, "flos": 23479151063040.0, "grad_norm": 3.8407670089289287, "language_loss": 0.76124811, "learning_rate": 2.129556090869178e-09, "loss": 0.78263992, "num_input_tokens_seen": 353782255, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.71875, "step": 16395, "time_per_iteration": 2.5007591247558594 }, { "auxiliary_loss_clip": 0.01102167, "auxiliary_loss_mlp": 0.01027909, "balance_loss_clip": 1.01601219, "balance_loss_mlp": 1.03448486, "epoch": 0.9857808507440252, "flos": 21065486808960.0, "grad_norm": 2.0824371288919363, "language_loss": 0.74937832, "learning_rate": 2.1116262230866933e-09, "loss": 0.77067912, "num_input_tokens_seen": 353803580, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 16396, "time_per_iteration": 2.4850106239318848 }, { "auxiliary_loss_clip": 0.01101313, "auxiliary_loss_mlp": 0.0102622, "balance_loss_clip": 1.01407862, "balance_loss_mlp": 1.03419924, "epoch": 0.9858409739966932, "flos": 25301365971840.0, "grad_norm": 1.4675063440973675, "language_loss": 0.70921957, "learning_rate": 2.0937721154317133e-09, "loss": 0.73049492, "num_input_tokens_seen": 353824200, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.671875, "step": 16397, "time_per_iteration": 2.4838473796844482 }, { "auxiliary_loss_clip": 0.01100019, "auxiliary_loss_mlp": 0.0102906, "balance_loss_clip": 1.01791954, "balance_loss_mlp": 1.03607035, "epoch": 0.9859010972493611, "flos": 20558751690240.0, "grad_norm": 1.9717752331507696, "language_loss": 0.71616238, "learning_rate": 2.0759937685810304e-09, "loss": 0.73745322, "num_input_tokens_seen": 353843350, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.640625, "step": 16398, "time_per_iteration": 2.443953275680542 }, { "auxiliary_loss_clip": 0.01101128, "auxiliary_loss_mlp": 0.01025638, "balance_loss_clip": 1.01416433, "balance_loss_mlp": 1.03473115, "epoch": 0.9859612205020292, "flos": 24754985216640.0, "grad_norm": 1.5507095487272404, "language_loss": 0.74157953, "learning_rate": 2.058291183208771e-09, "loss": 0.76284713, "num_input_tokens_seen": 353864520, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6640625, "step": 16399, "time_per_iteration": 2.4961724281311035 }, { "auxiliary_loss_clip": 0.01104282, "auxiliary_loss_mlp": 0.01027382, "balance_loss_clip": 1.01503778, "balance_loss_mlp": 1.03449452, "epoch": 0.9860213437546971, "flos": 21105850717440.0, "grad_norm": 2.553704884421358, "language_loss": 0.57659149, "learning_rate": 2.0406643599863993e-09, "loss": 0.59790814, "num_input_tokens_seen": 353882240, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.69921875, "step": 16400, "time_per_iteration": 2.4318158626556396 }, { "auxiliary_loss_clip": 0.01107818, "auxiliary_loss_mlp": 0.01029728, "balance_loss_clip": 1.01705027, "balance_loss_mlp": 1.03500175, "epoch": 0.9860814670073651, "flos": 19136078737920.0, "grad_norm": 2.1020392933937564, "language_loss": 0.80726808, "learning_rate": 2.023113299582491e-09, "loss": 0.82864356, "num_input_tokens_seen": 353901590, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7265625, "step": 16401, "time_per_iteration": 2.45634126663208 }, { "auxiliary_loss_clip": 0.01102057, "auxiliary_loss_mlp": 0.01031634, "balance_loss_clip": 1.01852679, "balance_loss_mlp": 1.03506231, "epoch": 0.9861415902600331, "flos": 17237050594560.0, "grad_norm": 1.8572284894446367, "language_loss": 0.78117204, "learning_rate": 2.005638002662069e-09, "loss": 0.80250895, "num_input_tokens_seen": 353918785, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.66796875, "step": 16402, "time_per_iteration": 2.409649610519409 }, { "auxiliary_loss_clip": 0.01104363, "auxiliary_loss_mlp": 0.0103078, "balance_loss_clip": 1.01874638, "balance_loss_mlp": 1.03535759, "epoch": 0.986201713512701, "flos": 27782577751680.0, "grad_norm": 1.6711890034257073, "language_loss": 0.70107257, "learning_rate": 1.9882384698881596e-09, "loss": 0.72242403, "num_input_tokens_seen": 353940390, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.69140625, "step": 16403, "time_per_iteration": 2.4891655445098877 }, { "auxiliary_loss_clip": 0.01100603, "auxiliary_loss_mlp": 0.01028732, "balance_loss_clip": 1.01732397, "balance_loss_mlp": 1.03294325, "epoch": 0.986261836765369, "flos": 28730403884160.0, "grad_norm": 1.9917341483703774, "language_loss": 0.74430388, "learning_rate": 1.9709147019204566e-09, "loss": 0.76559722, "num_input_tokens_seen": 353962180, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.67578125, "step": 16404, "time_per_iteration": 2.4802052974700928 }, { "auxiliary_loss_clip": 0.01102854, "auxiliary_loss_mlp": 0.01027975, "balance_loss_clip": 1.01624441, "balance_loss_mlp": 1.03462863, "epoch": 0.986321960018037, "flos": 34313471568000.0, "grad_norm": 1.667986341876088, "language_loss": 0.69611537, "learning_rate": 1.953666699415768e-09, "loss": 0.71742368, "num_input_tokens_seen": 353984305, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6796875, "step": 16405, "time_per_iteration": 2.588062047958374 }, { "auxiliary_loss_clip": 0.01103035, "auxiliary_loss_mlp": 0.01031419, "balance_loss_clip": 1.02029097, "balance_loss_mlp": 1.03689003, "epoch": 0.986382083270705, "flos": 25189755436800.0, "grad_norm": 1.7920664478270774, "language_loss": 0.69518447, "learning_rate": 1.93649446302846e-09, "loss": 0.71652901, "num_input_tokens_seen": 354004495, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6640625, "step": 16406, "time_per_iteration": 2.5027077198028564 }, { "auxiliary_loss_clip": 0.01101586, "auxiliary_loss_mlp": 0.01035281, "balance_loss_clip": 1.02335382, "balance_loss_mlp": 1.03539753, "epoch": 0.9864422065233729, "flos": 11025904671360.0, "grad_norm": 4.181394083867114, "language_loss": 0.75419021, "learning_rate": 1.9193979934095663e-09, "loss": 0.77555883, "num_input_tokens_seen": 354015985, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6640625, "step": 16407, "time_per_iteration": 2.3691658973693848 }, { "auxiliary_loss_clip": 0.01100714, "auxiliary_loss_mlp": 0.010325, "balance_loss_clip": 1.02037656, "balance_loss_mlp": 1.03291321, "epoch": 0.9865023297760409, "flos": 16545590807040.0, "grad_norm": 2.1589760320084386, "language_loss": 0.77138948, "learning_rate": 1.9023772912072357e-09, "loss": 0.79272163, "num_input_tokens_seen": 354033260, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6796875, "step": 16408, "time_per_iteration": 2.4160845279693604 }, { "auxiliary_loss_clip": 0.01107666, "auxiliary_loss_mlp": 0.01032054, "balance_loss_clip": 1.01866055, "balance_loss_mlp": 1.03635693, "epoch": 0.9865624530287088, "flos": 18880179269760.0, "grad_norm": 1.8306527401715444, "language_loss": 0.67704642, "learning_rate": 1.8854323570669515e-09, "loss": 0.69844365, "num_input_tokens_seen": 354052825, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7109375, "step": 16409, "time_per_iteration": 2.4160046577453613 }, { "auxiliary_loss_clip": 0.01027867, "auxiliary_loss_mlp": 0.01001798, "balance_loss_clip": 1.00080252, "balance_loss_mlp": 1.00566339, "epoch": 0.9866225762813768, "flos": 68887798680960.0, "grad_norm": 0.8084096973752917, "language_loss": 0.61080581, "learning_rate": 1.8685631916313118e-09, "loss": 0.6311025, "num_input_tokens_seen": 354113920, "router_z_loss_clip": 0.00994873, "router_z_loss_mlp": 0.22265625, "step": 16410, "time_per_iteration": 3.1528987884521484 }, { "auxiliary_loss_clip": 0.01103273, "auxiliary_loss_mlp": 0.01029841, "balance_loss_clip": 1.0177232, "balance_loss_mlp": 1.03421903, "epoch": 0.9866826995340447, "flos": 29023111814400.0, "grad_norm": 2.350092849968041, "language_loss": 0.66448194, "learning_rate": 1.8517697955400258e-09, "loss": 0.68581307, "num_input_tokens_seen": 354134210, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 16411, "time_per_iteration": 4.001407861709595 }, { "auxiliary_loss_clip": 0.01028015, "auxiliary_loss_mlp": 0.01000013, "balance_loss_clip": 0.99898833, "balance_loss_mlp": 1.00568664, "epoch": 0.9867428227867128, "flos": 65376814867200.0, "grad_norm": 0.7237293300254081, "language_loss": 0.56251073, "learning_rate": 1.8350521694299182e-09, "loss": 0.58279097, "num_input_tokens_seen": 354198010, "router_z_loss_clip": 0.01025391, "router_z_loss_mlp": 0.22363281, "step": 16412, "time_per_iteration": 3.1685667037963867 }, { "auxiliary_loss_clip": 0.01106193, "auxiliary_loss_mlp": 0.01035542, "balance_loss_clip": 1.02226853, "balance_loss_mlp": 1.03530741, "epoch": 0.9868029460393807, "flos": 26506312634880.0, "grad_norm": 2.8732691217400195, "language_loss": 0.73456633, "learning_rate": 1.818410313934926e-09, "loss": 0.75598371, "num_input_tokens_seen": 354220000, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 16413, "time_per_iteration": 2.5034127235412598 }, { "auxiliary_loss_clip": 0.01102875, "auxiliary_loss_mlp": 0.01028831, "balance_loss_clip": 1.01686811, "balance_loss_mlp": 1.03351843, "epoch": 0.9868630692920487, "flos": 22967280299520.0, "grad_norm": 3.560253662965518, "language_loss": 0.71565402, "learning_rate": 1.8018442296858782e-09, "loss": 0.73697102, "num_input_tokens_seen": 354240910, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 16414, "time_per_iteration": 3.8609719276428223 }, { "auxiliary_loss_clip": 0.01101412, "auxiliary_loss_mlp": 0.01033717, "balance_loss_clip": 1.02229095, "balance_loss_mlp": 1.03647482, "epoch": 0.9869231925447167, "flos": 19828687760640.0, "grad_norm": 1.9192408869136888, "language_loss": 0.70258427, "learning_rate": 1.7853539173111608e-09, "loss": 0.7239356, "num_input_tokens_seen": 354259430, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.6484375, "step": 16415, "time_per_iteration": 2.4207470417022705 }, { "auxiliary_loss_clip": 0.0109685, "auxiliary_loss_mlp": 0.01028203, "balance_loss_clip": 1.01736701, "balance_loss_mlp": 1.03267145, "epoch": 0.9869833157973846, "flos": 20195228096640.0, "grad_norm": 1.7624913190069413, "language_loss": 0.75580263, "learning_rate": 1.7689393774362737e-09, "loss": 0.77705312, "num_input_tokens_seen": 354279490, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.640625, "step": 16416, "time_per_iteration": 2.4587979316711426 }, { "auxiliary_loss_clip": 0.01103544, "auxiliary_loss_mlp": 0.01027303, "balance_loss_clip": 1.01510167, "balance_loss_mlp": 1.03628063, "epoch": 0.9870434390500527, "flos": 16099507802880.0, "grad_norm": 1.9256542717702458, "language_loss": 0.71006137, "learning_rate": 1.7526006106833858e-09, "loss": 0.73136991, "num_input_tokens_seen": 354295080, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 16417, "time_per_iteration": 2.401844024658203 }, { "auxiliary_loss_clip": 0.01108261, "auxiliary_loss_mlp": 0.01034352, "balance_loss_clip": 1.02150726, "balance_loss_mlp": 1.03677607, "epoch": 0.9871035623027206, "flos": 21760753438080.0, "grad_norm": 1.646843904911971, "language_loss": 0.70743477, "learning_rate": 1.7363376176720013e-09, "loss": 0.72886086, "num_input_tokens_seen": 354314610, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.71484375, "step": 16418, "time_per_iteration": 2.4663240909576416 }, { "auxiliary_loss_clip": 0.01027866, "auxiliary_loss_mlp": 0.01000923, "balance_loss_clip": 0.99983829, "balance_loss_mlp": 1.00557089, "epoch": 0.9871636855553886, "flos": 70219583245440.0, "grad_norm": 0.65821201125496, "language_loss": 0.5369752, "learning_rate": 1.7201503990189603e-09, "loss": 0.55726308, "num_input_tokens_seen": 354383115, "router_z_loss_clip": 0.01086426, "router_z_loss_mlp": 0.22265625, "step": 16419, "time_per_iteration": 3.199040174484253 }, { "auxiliary_loss_clip": 0.01106459, "auxiliary_loss_mlp": 0.0103583, "balance_loss_clip": 1.02237082, "balance_loss_mlp": 1.03449917, "epoch": 0.9872238088080565, "flos": 25045825639680.0, "grad_norm": 1.8726988491480838, "language_loss": 0.78033036, "learning_rate": 1.7040389553382162e-09, "loss": 0.80175322, "num_input_tokens_seen": 354403115, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.71875, "step": 16420, "time_per_iteration": 5.295129776000977 }, { "auxiliary_loss_clip": 0.01105907, "auxiliary_loss_mlp": 0.0102789, "balance_loss_clip": 1.01538503, "balance_loss_mlp": 1.03853965, "epoch": 0.9872839320607245, "flos": 19465846525440.0, "grad_norm": 1.9913868171551856, "language_loss": 0.71242404, "learning_rate": 1.6880032872403916e-09, "loss": 0.73376203, "num_input_tokens_seen": 354424520, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.671875, "step": 16421, "time_per_iteration": 2.4558215141296387 }, { "auxiliary_loss_clip": 0.01105437, "auxiliary_loss_mlp": 0.01035847, "balance_loss_clip": 1.02270389, "balance_loss_mlp": 1.03438151, "epoch": 0.9873440553133924, "flos": 26942914448640.0, "grad_norm": 12.991077169685221, "language_loss": 0.81930107, "learning_rate": 1.6720433953338886e-09, "loss": 0.84071392, "num_input_tokens_seen": 354444800, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.7109375, "step": 16422, "time_per_iteration": 2.471095323562622 }, { "auxiliary_loss_clip": 0.01102211, "auxiliary_loss_mlp": 0.01025498, "balance_loss_clip": 1.01400661, "balance_loss_mlp": 1.03544986, "epoch": 0.9874041785660604, "flos": 19062210418560.0, "grad_norm": 1.8521524208556024, "language_loss": 0.86371422, "learning_rate": 1.656159280223779e-09, "loss": 0.88499129, "num_input_tokens_seen": 354464590, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 16423, "time_per_iteration": 2.4453115463256836 }, { "auxiliary_loss_clip": 0.01107157, "auxiliary_loss_mlp": 0.0102656, "balance_loss_clip": 1.01433516, "balance_loss_mlp": 1.03714395, "epoch": 0.9874643018187284, "flos": 21105814803840.0, "grad_norm": 2.0423655853230915, "language_loss": 0.70487475, "learning_rate": 1.6403509425122475e-09, "loss": 0.72621191, "num_input_tokens_seen": 354484145, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.69921875, "step": 16424, "time_per_iteration": 2.46199893951416 }, { "auxiliary_loss_clip": 0.01102818, "auxiliary_loss_mlp": 0.0102998, "balance_loss_clip": 1.01772571, "balance_loss_mlp": 1.03355455, "epoch": 0.9875244250713964, "flos": 24426043441920.0, "grad_norm": 2.021901147222342, "language_loss": 0.80753297, "learning_rate": 1.6246183827990366e-09, "loss": 0.828861, "num_input_tokens_seen": 354502475, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69140625, "step": 16425, "time_per_iteration": 2.473400354385376 }, { "auxiliary_loss_clip": 0.01105292, "auxiliary_loss_mlp": 0.0103048, "balance_loss_clip": 1.01748013, "balance_loss_mlp": 1.03534663, "epoch": 0.9875845483240643, "flos": 25117610970240.0, "grad_norm": 2.2830746058719633, "language_loss": 0.79957199, "learning_rate": 1.6089616016803364e-09, "loss": 0.82092977, "num_input_tokens_seen": 354521855, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.69921875, "step": 16426, "time_per_iteration": 2.4634695053100586 }, { "auxiliary_loss_clip": 0.01104192, "auxiliary_loss_mlp": 0.01031683, "balance_loss_clip": 1.01993465, "balance_loss_mlp": 1.03662086, "epoch": 0.9876446715767323, "flos": 16581788737920.0, "grad_norm": 1.949434552142492, "language_loss": 0.84871852, "learning_rate": 1.593380599750338e-09, "loss": 0.87007719, "num_input_tokens_seen": 354539535, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.671875, "step": 16427, "time_per_iteration": 2.43326473236084 }, { "auxiliary_loss_clip": 0.01103535, "auxiliary_loss_mlp": 0.0103266, "balance_loss_clip": 1.02069092, "balance_loss_mlp": 1.03637266, "epoch": 0.9877047948294003, "flos": 21616141282560.0, "grad_norm": 1.7668884662576623, "language_loss": 0.7027539, "learning_rate": 1.577875377599458e-09, "loss": 0.72411585, "num_input_tokens_seen": 354557430, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.671875, "step": 16428, "time_per_iteration": 2.4260549545288086 }, { "auxiliary_loss_clip": 0.01100764, "auxiliary_loss_mlp": 0.01032542, "balance_loss_clip": 1.02084184, "balance_loss_mlp": 1.03400695, "epoch": 0.9877649180820682, "flos": 21178497974400.0, "grad_norm": 2.145896372727404, "language_loss": 0.79710144, "learning_rate": 1.5624459358158926e-09, "loss": 0.81843448, "num_input_tokens_seen": 354574735, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.66796875, "step": 16429, "time_per_iteration": 2.449788808822632 }, { "auxiliary_loss_clip": 0.01101724, "auxiliary_loss_mlp": 0.01028428, "balance_loss_clip": 1.01692462, "balance_loss_mlp": 1.03375185, "epoch": 0.9878250413347363, "flos": 39749233576320.0, "grad_norm": 2.6133657206117156, "language_loss": 0.62247479, "learning_rate": 1.5470922749845073e-09, "loss": 0.6437763, "num_input_tokens_seen": 354597050, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 16430, "time_per_iteration": 2.5988399982452393 }, { "auxiliary_loss_clip": 0.01104942, "auxiliary_loss_mlp": 0.01032153, "balance_loss_clip": 1.02022004, "balance_loss_mlp": 1.03555548, "epoch": 0.9878851645874042, "flos": 29425634599680.0, "grad_norm": 1.2571567435431084, "language_loss": 0.73272794, "learning_rate": 1.531814395687725e-09, "loss": 0.75409889, "num_input_tokens_seen": 354619095, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6953125, "step": 16431, "time_per_iteration": 2.532837390899658 }, { "auxiliary_loss_clip": 0.01103516, "auxiliary_loss_mlp": 0.0103297, "balance_loss_clip": 1.02069771, "balance_loss_mlp": 1.03582978, "epoch": 0.9879452878400722, "flos": 15806261168640.0, "grad_norm": 2.450293601610162, "language_loss": 0.80946171, "learning_rate": 1.5166122985048602e-09, "loss": 0.83082658, "num_input_tokens_seen": 354633790, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.67578125, "step": 16432, "time_per_iteration": 2.404179573059082 }, { "auxiliary_loss_clip": 0.01100382, "auxiliary_loss_mlp": 0.0102737, "balance_loss_clip": 1.0163846, "balance_loss_mlp": 1.03367949, "epoch": 0.9880054110927401, "flos": 22233912318720.0, "grad_norm": 1.943201100107817, "language_loss": 0.80559194, "learning_rate": 1.5014859840123405e-09, "loss": 0.82686949, "num_input_tokens_seen": 354653180, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.6640625, "step": 16433, "time_per_iteration": 2.4676122665405273 }, { "auxiliary_loss_clip": 0.01100928, "auxiliary_loss_mlp": 0.01032687, "balance_loss_clip": 1.02040839, "balance_loss_mlp": 1.03518391, "epoch": 0.9880655343454081, "flos": 28763836467840.0, "grad_norm": 2.9561570121362313, "language_loss": 0.65104699, "learning_rate": 1.4864354527837075e-09, "loss": 0.67238319, "num_input_tokens_seen": 354669900, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.65625, "step": 16434, "time_per_iteration": 2.4668240547180176 }, { "auxiliary_loss_clip": 0.01103724, "auxiliary_loss_mlp": 0.01030662, "balance_loss_clip": 1.01840782, "balance_loss_mlp": 1.0335114, "epoch": 0.988125657598076, "flos": 32853379622400.0, "grad_norm": 1.5303866608337215, "language_loss": 0.69531429, "learning_rate": 1.4714607053896154e-09, "loss": 0.71665812, "num_input_tokens_seen": 354693165, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.703125, "step": 16435, "time_per_iteration": 2.537802219390869 }, { "auxiliary_loss_clip": 0.01103799, "auxiliary_loss_mlp": 0.01031571, "balance_loss_clip": 1.01952541, "balance_loss_mlp": 1.03620636, "epoch": 0.988185780850744, "flos": 19390685316480.0, "grad_norm": 2.225051321270245, "language_loss": 0.75130439, "learning_rate": 1.4565617423980548e-09, "loss": 0.77265811, "num_input_tokens_seen": 354711915, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.67578125, "step": 16436, "time_per_iteration": 2.418198347091675 }, { "auxiliary_loss_clip": 0.01104473, "auxiliary_loss_mlp": 0.01028633, "balance_loss_clip": 1.01550186, "balance_loss_mlp": 1.03568745, "epoch": 0.988245904103412, "flos": 22528415928960.0, "grad_norm": 2.0664217980487587, "language_loss": 0.73906797, "learning_rate": 1.4417385643741286e-09, "loss": 0.76039898, "num_input_tokens_seen": 354729135, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6875, "step": 16437, "time_per_iteration": 2.4425084590911865 }, { "auxiliary_loss_clip": 0.01099557, "auxiliary_loss_mlp": 0.01033481, "balance_loss_clip": 1.0215838, "balance_loss_mlp": 1.03400612, "epoch": 0.98830602735608, "flos": 28659193171200.0, "grad_norm": 1.9779173241502161, "language_loss": 0.60216367, "learning_rate": 1.4269911718796103e-09, "loss": 0.62349403, "num_input_tokens_seen": 354752530, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.65625, "step": 16438, "time_per_iteration": 2.496407985687256 }, { "auxiliary_loss_clip": 0.01103746, "auxiliary_loss_mlp": 0.01030052, "balance_loss_clip": 1.01741004, "balance_loss_mlp": 1.03627396, "epoch": 0.9883661506087479, "flos": 20996035862400.0, "grad_norm": 1.9335113686071388, "language_loss": 0.71701247, "learning_rate": 1.4123195654738295e-09, "loss": 0.73835039, "num_input_tokens_seen": 354771135, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.671875, "step": 16439, "time_per_iteration": 2.4410290718078613 }, { "auxiliary_loss_clip": 0.01101347, "auxiliary_loss_mlp": 0.01030539, "balance_loss_clip": 1.01849318, "balance_loss_mlp": 1.03436184, "epoch": 0.9884262738614159, "flos": 32706109860480.0, "grad_norm": 1.77371475050484, "language_loss": 0.60414869, "learning_rate": 1.3977237457134528e-09, "loss": 0.62546754, "num_input_tokens_seen": 354791800, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.66796875, "step": 16440, "time_per_iteration": 2.5262153148651123 }, { "auxiliary_loss_clip": 0.01103247, "auxiliary_loss_mlp": 0.01029657, "balance_loss_clip": 1.01760459, "balance_loss_mlp": 1.03331828, "epoch": 0.9884863971140839, "flos": 17564699479680.0, "grad_norm": 3.086299177934394, "language_loss": 0.76507902, "learning_rate": 1.3832037131513707e-09, "loss": 0.78640807, "num_input_tokens_seen": 354809200, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.69921875, "step": 16441, "time_per_iteration": 2.4244072437286377 }, { "auxiliary_loss_clip": 0.01104049, "auxiliary_loss_mlp": 0.01028495, "balance_loss_clip": 1.01604915, "balance_loss_mlp": 1.03499389, "epoch": 0.9885465203667518, "flos": 40552519380480.0, "grad_norm": 2.720955437777101, "language_loss": 0.67819631, "learning_rate": 1.3687594683386982e-09, "loss": 0.69952172, "num_input_tokens_seen": 354829945, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69140625, "step": 16442, "time_per_iteration": 2.5972867012023926 }, { "auxiliary_loss_clip": 0.01102671, "auxiliary_loss_mlp": 0.01028471, "balance_loss_clip": 1.01686645, "balance_loss_mlp": 1.03527379, "epoch": 0.9886066436194199, "flos": 13807976768640.0, "grad_norm": 4.728372681595715, "language_loss": 0.74480903, "learning_rate": 1.3543910118227753e-09, "loss": 0.76612043, "num_input_tokens_seen": 354845055, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.671875, "step": 16443, "time_per_iteration": 2.41772198677063 }, { "auxiliary_loss_clip": 0.01103517, "auxiliary_loss_mlp": 0.01030634, "balance_loss_clip": 1.01754475, "balance_loss_mlp": 1.03403366, "epoch": 0.9886667668720878, "flos": 23325129544320.0, "grad_norm": 2.1239773143945095, "language_loss": 0.73734283, "learning_rate": 1.3400983441487213e-09, "loss": 0.7586844, "num_input_tokens_seen": 354864680, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.6953125, "step": 16444, "time_per_iteration": 2.455435037612915 }, { "auxiliary_loss_clip": 0.01102967, "auxiliary_loss_mlp": 0.01029999, "balance_loss_clip": 1.01860893, "balance_loss_mlp": 1.03679073, "epoch": 0.9887268901247558, "flos": 22706029704960.0, "grad_norm": 3.1607320969901678, "language_loss": 0.69297624, "learning_rate": 1.325881465858547e-09, "loss": 0.71430588, "num_input_tokens_seen": 354885685, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.66015625, "step": 16445, "time_per_iteration": 2.4950404167175293 }, { "auxiliary_loss_clip": 0.01105631, "auxiliary_loss_mlp": 0.01028391, "balance_loss_clip": 1.01577306, "balance_loss_mlp": 1.03649163, "epoch": 0.9887870133774237, "flos": 13041283944960.0, "grad_norm": 3.5528294972858876, "language_loss": 0.60509855, "learning_rate": 1.311740377491155e-09, "loss": 0.62643874, "num_input_tokens_seen": 354901505, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69140625, "step": 16446, "time_per_iteration": 2.398345470428467 }, { "auxiliary_loss_clip": 0.01103284, "auxiliary_loss_mlp": 0.01033575, "balance_loss_clip": 1.0221076, "balance_loss_mlp": 1.0348289, "epoch": 0.9888471366300917, "flos": 15158864390400.0, "grad_norm": 3.557629985210482, "language_loss": 0.71148735, "learning_rate": 1.297675079582783e-09, "loss": 0.73285592, "num_input_tokens_seen": 354920060, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.68359375, "step": 16447, "time_per_iteration": 2.4433066844940186 }, { "auxiliary_loss_clip": 0.01102594, "auxiliary_loss_mlp": 0.01028335, "balance_loss_clip": 1.01704013, "balance_loss_mlp": 1.03509116, "epoch": 0.9889072598827596, "flos": 25118796119040.0, "grad_norm": 2.408878272674168, "language_loss": 0.83860946, "learning_rate": 1.2836855726667818e-09, "loss": 0.85991871, "num_input_tokens_seen": 354938690, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 16448, "time_per_iteration": 2.470581293106079 }, { "auxiliary_loss_clip": 0.01100129, "auxiliary_loss_mlp": 0.01024969, "balance_loss_clip": 1.01410353, "balance_loss_mlp": 1.03400731, "epoch": 0.9889673831354276, "flos": 16728663450240.0, "grad_norm": 1.6410613964828813, "language_loss": 0.70226574, "learning_rate": 1.26977185727406e-09, "loss": 0.7235167, "num_input_tokens_seen": 354956955, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.66015625, "step": 16449, "time_per_iteration": 2.438507556915283 }, { "auxiliary_loss_clip": 0.0110567, "auxiliary_loss_mlp": 0.01028324, "balance_loss_clip": 1.01615918, "balance_loss_mlp": 1.0351553, "epoch": 0.9890275063880956, "flos": 35585175657600.0, "grad_norm": 2.7328773987090984, "language_loss": 0.74292701, "learning_rate": 1.25593393393153e-09, "loss": 0.76426697, "num_input_tokens_seen": 354976800, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.70703125, "step": 16450, "time_per_iteration": 2.5493667125701904 }, { "auxiliary_loss_clip": 0.01104003, "auxiliary_loss_mlp": 0.01030954, "balance_loss_clip": 1.01906848, "balance_loss_mlp": 1.03280044, "epoch": 0.9890876296407636, "flos": 18952359649920.0, "grad_norm": 2.0233659769929457, "language_loss": 0.79328912, "learning_rate": 1.242171803164549e-09, "loss": 0.81463867, "num_input_tokens_seen": 354996625, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.7109375, "step": 16451, "time_per_iteration": 2.4404423236846924 }, { "auxiliary_loss_clip": 0.01103165, "auxiliary_loss_mlp": 0.01031106, "balance_loss_clip": 1.01797485, "balance_loss_mlp": 1.03325319, "epoch": 0.9891477528934315, "flos": 23769309127680.0, "grad_norm": 1.8940817962330962, "language_loss": 0.70075262, "learning_rate": 1.2284854654946996e-09, "loss": 0.72209525, "num_input_tokens_seen": 355014535, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.69921875, "step": 16452, "time_per_iteration": 3.909342050552368 }, { "auxiliary_loss_clip": 0.01099758, "auxiliary_loss_mlp": 0.01025177, "balance_loss_clip": 1.01447797, "balance_loss_mlp": 1.03445089, "epoch": 0.9892078761460995, "flos": 20772922533120.0, "grad_norm": 1.5544130538073193, "language_loss": 0.73746562, "learning_rate": 1.2148749214409004e-09, "loss": 0.75871491, "num_input_tokens_seen": 355033280, "router_z_loss_clip": 0.10693359, "router_z_loss_mlp": 0.65234375, "step": 16453, "time_per_iteration": 2.428351640701294 }, { "auxiliary_loss_clip": 0.01103985, "auxiliary_loss_mlp": 0.01036298, "balance_loss_clip": 1.02444267, "balance_loss_mlp": 1.03433692, "epoch": 0.9892679993987675, "flos": 23367827836800.0, "grad_norm": 2.2717003205787236, "language_loss": 0.7005738, "learning_rate": 1.2013401715191828e-09, "loss": 0.72197664, "num_input_tokens_seen": 355053320, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.6953125, "step": 16454, "time_per_iteration": 2.447819232940674 }, { "auxiliary_loss_clip": 0.01099052, "auxiliary_loss_mlp": 0.01031156, "balance_loss_clip": 1.02015305, "balance_loss_mlp": 1.03457546, "epoch": 0.9893281226514354, "flos": 22705419173760.0, "grad_norm": 2.444023499515594, "language_loss": 0.76109642, "learning_rate": 1.1878812162433583e-09, "loss": 0.78239858, "num_input_tokens_seen": 355070230, "router_z_loss_clip": 0.10986328, "router_z_loss_mlp": 0.64453125, "step": 16455, "time_per_iteration": 2.4797964096069336 }, { "auxiliary_loss_clip": 0.01101722, "auxiliary_loss_mlp": 0.01025038, "balance_loss_clip": 1.01318824, "balance_loss_mlp": 1.03493571, "epoch": 0.9893882459041035, "flos": 21796664060160.0, "grad_norm": 2.603386151563394, "language_loss": 0.6538434, "learning_rate": 1.1744980561230188e-09, "loss": 0.67511094, "num_input_tokens_seen": 355090125, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.66796875, "step": 16456, "time_per_iteration": 3.8933985233306885 }, { "auxiliary_loss_clip": 0.01106497, "auxiliary_loss_mlp": 0.01028243, "balance_loss_clip": 1.01635742, "balance_loss_mlp": 1.0366683, "epoch": 0.9894483691567714, "flos": 18113773754880.0, "grad_norm": 1.9096894870150374, "language_loss": 0.7383213, "learning_rate": 1.161190691666203e-09, "loss": 0.75966871, "num_input_tokens_seen": 355107890, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69921875, "step": 16457, "time_per_iteration": 2.3996336460113525 }, { "auxiliary_loss_clip": 0.01104326, "auxiliary_loss_mlp": 0.01029526, "balance_loss_clip": 1.01733041, "balance_loss_mlp": 1.03585756, "epoch": 0.9895084924094394, "flos": 31211615664000.0, "grad_norm": 2.1710751021843966, "language_loss": 0.69108319, "learning_rate": 1.1479591233773954e-09, "loss": 0.71242172, "num_input_tokens_seen": 355126340, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.68359375, "step": 16458, "time_per_iteration": 2.5134668350219727 }, { "auxiliary_loss_clip": 0.0109974, "auxiliary_loss_mlp": 0.01028966, "balance_loss_clip": 1.01687241, "balance_loss_mlp": 1.03355157, "epoch": 0.9895686156621073, "flos": 19678042120320.0, "grad_norm": 1.859336286799458, "language_loss": 0.79436779, "learning_rate": 1.1348033517581956e-09, "loss": 0.81565487, "num_input_tokens_seen": 355144025, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.6640625, "step": 16459, "time_per_iteration": 2.4099950790405273 }, { "auxiliary_loss_clip": 0.01103352, "auxiliary_loss_mlp": 0.01033162, "balance_loss_clip": 1.02147365, "balance_loss_mlp": 1.03368938, "epoch": 0.9896287389147753, "flos": 23581675457280.0, "grad_norm": 2.0634313636080135, "language_loss": 0.7108385, "learning_rate": 1.1217233773075373e-09, "loss": 0.7322036, "num_input_tokens_seen": 355163125, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.6953125, "step": 16460, "time_per_iteration": 2.4658215045928955 }, { "auxiliary_loss_clip": 0.01105554, "auxiliary_loss_mlp": 0.01027987, "balance_loss_clip": 1.01560736, "balance_loss_mlp": 1.03488088, "epoch": 0.9896888621674432, "flos": 29605331364480.0, "grad_norm": 1.6451950721120463, "language_loss": 0.87725794, "learning_rate": 1.1087192005214685e-09, "loss": 0.89859331, "num_input_tokens_seen": 355184060, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.70703125, "step": 16461, "time_per_iteration": 3.910621166229248 }, { "auxiliary_loss_clip": 0.0110273, "auxiliary_loss_mlp": 0.0103349, "balance_loss_clip": 1.02025223, "balance_loss_mlp": 1.03449678, "epoch": 0.9897489854201112, "flos": 23695045758720.0, "grad_norm": 2.9191189481147384, "language_loss": 0.62886178, "learning_rate": 1.09579082189315e-09, "loss": 0.65022397, "num_input_tokens_seen": 355204505, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.68359375, "step": 16462, "time_per_iteration": 3.9065983295440674 }, { "auxiliary_loss_clip": 0.01105059, "auxiliary_loss_mlp": 0.01028858, "balance_loss_clip": 1.01738453, "balance_loss_mlp": 1.0370121, "epoch": 0.9898091086727792, "flos": 13225146687360.0, "grad_norm": 1.6995429396485082, "language_loss": 0.7293551, "learning_rate": 1.0829382419126343e-09, "loss": 0.75069422, "num_input_tokens_seen": 355223055, "router_z_loss_clip": 0.11474609, "router_z_loss_mlp": 0.6796875, "step": 16463, "time_per_iteration": 2.4228193759918213 }, { "auxiliary_loss_clip": 0.01102283, "auxiliary_loss_mlp": 0.01027719, "balance_loss_clip": 1.01542902, "balance_loss_mlp": 1.03405917, "epoch": 0.9898692319254472, "flos": 22930400010240.0, "grad_norm": 2.363774692594866, "language_loss": 0.69709527, "learning_rate": 1.0701614610675314e-09, "loss": 0.71839535, "num_input_tokens_seen": 355242000, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 16464, "time_per_iteration": 2.44659686088562 }, { "auxiliary_loss_clip": 0.01104462, "auxiliary_loss_mlp": 0.01030305, "balance_loss_clip": 1.01785398, "balance_loss_mlp": 1.03437161, "epoch": 0.9899293551781151, "flos": 12458346122880.0, "grad_norm": 2.099010608394906, "language_loss": 0.73238611, "learning_rate": 1.0574604798421204e-09, "loss": 0.75373387, "num_input_tokens_seen": 355260175, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 16465, "time_per_iteration": 2.430859327316284 }, { "auxiliary_loss_clip": 0.01101026, "auxiliary_loss_mlp": 0.010319, "balance_loss_clip": 1.02083158, "balance_loss_mlp": 1.03369176, "epoch": 0.9899894784307831, "flos": 26871129118080.0, "grad_norm": 1.8293837545427127, "language_loss": 0.86462367, "learning_rate": 1.0448352987182386e-09, "loss": 0.88595295, "num_input_tokens_seen": 355281930, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.671875, "step": 16466, "time_per_iteration": 2.510002851486206 }, { "auxiliary_loss_clip": 0.01103082, "auxiliary_loss_mlp": 0.01024104, "balance_loss_clip": 1.01220679, "balance_loss_mlp": 1.03531146, "epoch": 0.990049601683451, "flos": 21542093395200.0, "grad_norm": 1.6490075624761615, "language_loss": 0.71756995, "learning_rate": 1.0322859181743915e-09, "loss": 0.73884177, "num_input_tokens_seen": 355301555, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6796875, "step": 16467, "time_per_iteration": 2.4807515144348145 }, { "auxiliary_loss_clip": 0.01102337, "auxiliary_loss_mlp": 0.01028962, "balance_loss_clip": 1.01738727, "balance_loss_mlp": 1.03502882, "epoch": 0.990109724936119, "flos": 28771809287040.0, "grad_norm": 1.3774355551161408, "language_loss": 0.64900422, "learning_rate": 1.019812338686643e-09, "loss": 0.67031717, "num_input_tokens_seen": 355324925, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.671875, "step": 16468, "time_per_iteration": 2.5248985290527344 }, { "auxiliary_loss_clip": 0.0110797, "auxiliary_loss_mlp": 0.01030457, "balance_loss_clip": 1.0178982, "balance_loss_mlp": 1.03603888, "epoch": 0.9901698481887871, "flos": 29274270687360.0, "grad_norm": 1.8941054773245525, "language_loss": 0.62276804, "learning_rate": 1.0074145607281704e-09, "loss": 0.64415228, "num_input_tokens_seen": 355343875, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.71875, "step": 16469, "time_per_iteration": 2.5282106399536133 }, { "auxiliary_loss_clip": 0.01106707, "auxiliary_loss_mlp": 0.01031138, "balance_loss_clip": 1.01934791, "balance_loss_mlp": 1.03590035, "epoch": 0.990229971441455, "flos": 15959025711360.0, "grad_norm": 2.3981838034786063, "language_loss": 0.69958389, "learning_rate": 9.950925847685976e-10, "loss": 0.72096241, "num_input_tokens_seen": 355358835, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.70703125, "step": 16470, "time_per_iteration": 2.425872325897217 }, { "auxiliary_loss_clip": 0.01027907, "auxiliary_loss_mlp": 0.01001817, "balance_loss_clip": 1.00076246, "balance_loss_mlp": 1.00561976, "epoch": 0.990290094694123, "flos": 69780287911680.0, "grad_norm": 0.6570133552348703, "language_loss": 0.55577725, "learning_rate": 9.828464112755509e-10, "loss": 0.57607448, "num_input_tokens_seen": 355431225, "router_z_loss_clip": 0.01055908, "router_z_loss_mlp": 0.22265625, "step": 16471, "time_per_iteration": 3.265469551086426 }, { "auxiliary_loss_clip": 0.01105039, "auxiliary_loss_mlp": 0.01032269, "balance_loss_clip": 1.02005577, "balance_loss_mlp": 1.03673625, "epoch": 0.9903502179467909, "flos": 16252451913600.0, "grad_norm": 2.326226391094935, "language_loss": 0.8344211, "learning_rate": 9.706760407131032e-10, "loss": 0.85579413, "num_input_tokens_seen": 355448250, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 16472, "time_per_iteration": 2.4044413566589355 }, { "auxiliary_loss_clip": 0.01103957, "auxiliary_loss_mlp": 0.01027788, "balance_loss_clip": 1.01574206, "balance_loss_mlp": 1.0353744, "epoch": 0.9904103411994589, "flos": 21688393489920.0, "grad_norm": 2.703866409129693, "language_loss": 0.85950005, "learning_rate": 9.585814735431075e-10, "loss": 0.88081753, "num_input_tokens_seen": 355467040, "router_z_loss_clip": 0.12060547, "router_z_loss_mlp": 0.6875, "step": 16473, "time_per_iteration": 2.4513304233551025 }, { "auxiliary_loss_clip": 0.01100689, "auxiliary_loss_mlp": 0.01029285, "balance_loss_clip": 1.01806784, "balance_loss_mlp": 1.03298259, "epoch": 0.9904704644521268, "flos": 25739440243200.0, "grad_norm": 1.8721483425106142, "language_loss": 0.84396213, "learning_rate": 9.465627102240859e-10, "loss": 0.86526185, "num_input_tokens_seen": 355487825, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.67578125, "step": 16474, "time_per_iteration": 2.469205379486084 }, { "auxiliary_loss_clip": 0.0109798, "auxiliary_loss_mlp": 0.01034609, "balance_loss_clip": 1.02312326, "balance_loss_mlp": 1.0301528, "epoch": 0.9905305877047949, "flos": 21908346422400.0, "grad_norm": 1.8082677371174996, "language_loss": 0.76507187, "learning_rate": 9.346197512116738e-10, "loss": 0.78639776, "num_input_tokens_seen": 355507445, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.67578125, "step": 16475, "time_per_iteration": 2.464966058731079 }, { "auxiliary_loss_clip": 0.01101409, "auxiliary_loss_mlp": 0.01031127, "balance_loss_clip": 1.01942635, "balance_loss_mlp": 1.03239596, "epoch": 0.9905907109574628, "flos": 21392417422080.0, "grad_norm": 1.5562537069477758, "language_loss": 0.75918567, "learning_rate": 9.227525969588423e-10, "loss": 0.78051102, "num_input_tokens_seen": 355527205, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69140625, "step": 16476, "time_per_iteration": 2.4398391246795654 }, { "auxiliary_loss_clip": 0.01107993, "auxiliary_loss_mlp": 0.01028518, "balance_loss_clip": 1.01519597, "balance_loss_mlp": 1.03556085, "epoch": 0.9906508342101308, "flos": 20521620005760.0, "grad_norm": 2.5520440192747307, "language_loss": 0.67629123, "learning_rate": 9.109612479154538e-10, "loss": 0.69765633, "num_input_tokens_seen": 355544740, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.72265625, "step": 16477, "time_per_iteration": 2.424790620803833 }, { "auxiliary_loss_clip": 0.01108258, "auxiliary_loss_mlp": 0.0103233, "balance_loss_clip": 1.01958704, "balance_loss_mlp": 1.03724325, "epoch": 0.9907109574627987, "flos": 21361211481600.0, "grad_norm": 1.9432860277406168, "language_loss": 0.71749675, "learning_rate": 8.992457045289282e-10, "loss": 0.73890263, "num_input_tokens_seen": 355564385, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.7109375, "step": 16478, "time_per_iteration": 2.424637794494629 }, { "auxiliary_loss_clip": 0.01104766, "auxiliary_loss_mlp": 0.01036074, "balance_loss_clip": 1.02253199, "balance_loss_mlp": 1.03520191, "epoch": 0.9907710807154667, "flos": 17338605321600.0, "grad_norm": 2.3454685726429387, "language_loss": 0.81021881, "learning_rate": 8.876059672433545e-10, "loss": 0.83162725, "num_input_tokens_seen": 355579260, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.6953125, "step": 16479, "time_per_iteration": 2.427614688873291 }, { "auxiliary_loss_clip": 0.01105656, "auxiliary_loss_mlp": 0.01031131, "balance_loss_clip": 1.01935267, "balance_loss_mlp": 1.03603506, "epoch": 0.9908312039681346, "flos": 28621881918720.0, "grad_norm": 2.755515486352648, "language_loss": 0.66353977, "learning_rate": 8.760420364999355e-10, "loss": 0.68490756, "num_input_tokens_seen": 355599790, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6953125, "step": 16480, "time_per_iteration": 2.4863779544830322 }, { "auxiliary_loss_clip": 0.01100499, "auxiliary_loss_mlp": 0.01030217, "balance_loss_clip": 1.01789045, "balance_loss_mlp": 1.03332305, "epoch": 0.9908913272208026, "flos": 35770654512000.0, "grad_norm": 1.8523008536859729, "language_loss": 0.72018111, "learning_rate": 8.645539127374313e-10, "loss": 0.74148828, "num_input_tokens_seen": 355620925, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.671875, "step": 16481, "time_per_iteration": 2.5590267181396484 }, { "auxiliary_loss_clip": 0.01101898, "auxiliary_loss_mlp": 0.01023682, "balance_loss_clip": 1.01173162, "balance_loss_mlp": 1.03536177, "epoch": 0.9909514504734707, "flos": 19902196944000.0, "grad_norm": 1.7905764358595784, "language_loss": 0.77539396, "learning_rate": 8.531415963912713e-10, "loss": 0.79664981, "num_input_tokens_seen": 355639165, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6640625, "step": 16482, "time_per_iteration": 2.4312241077423096 }, { "auxiliary_loss_clip": 0.01105552, "auxiliary_loss_mlp": 0.01029928, "balance_loss_clip": 1.01803124, "balance_loss_mlp": 1.03562474, "epoch": 0.9910115737261386, "flos": 20004793165440.0, "grad_norm": 2.3773639101857658, "language_loss": 0.75823569, "learning_rate": 8.418050878944427e-10, "loss": 0.77959061, "num_input_tokens_seen": 355657320, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69921875, "step": 16483, "time_per_iteration": 2.4268178939819336 }, { "auxiliary_loss_clip": 0.0102786, "auxiliary_loss_mlp": 0.01001989, "balance_loss_clip": 1.00092816, "balance_loss_mlp": 1.00559282, "epoch": 0.9910716969788066, "flos": 70688432494080.0, "grad_norm": 0.6741920274648993, "language_loss": 0.53672928, "learning_rate": 8.305443876768237e-10, "loss": 0.55702776, "num_input_tokens_seen": 355726370, "router_z_loss_clip": 0.01062012, "router_z_loss_mlp": 0.22265625, "step": 16484, "time_per_iteration": 3.1996283531188965 }, { "auxiliary_loss_clip": 0.01097613, "auxiliary_loss_mlp": 0.01028575, "balance_loss_clip": 1.0172441, "balance_loss_mlp": 1.03260386, "epoch": 0.9911318202314745, "flos": 21434038306560.0, "grad_norm": 1.7540405377523063, "language_loss": 0.81912345, "learning_rate": 8.19359496165184e-10, "loss": 0.84038532, "num_input_tokens_seen": 355745840, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6484375, "step": 16485, "time_per_iteration": 2.47115421295166 }, { "auxiliary_loss_clip": 0.01102841, "auxiliary_loss_mlp": 0.01033934, "balance_loss_clip": 1.02149463, "balance_loss_mlp": 1.03618777, "epoch": 0.9911919434841425, "flos": 19826820253440.0, "grad_norm": 1.7397884514759148, "language_loss": 0.81517136, "learning_rate": 8.082504137836288e-10, "loss": 0.83653909, "num_input_tokens_seen": 355763385, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6640625, "step": 16486, "time_per_iteration": 2.410264730453491 }, { "auxiliary_loss_clip": 0.01104431, "auxiliary_loss_mlp": 0.01028472, "balance_loss_clip": 1.016927, "balance_loss_mlp": 1.03504539, "epoch": 0.9912520667368104, "flos": 41719364691840.0, "grad_norm": 3.806933499044248, "language_loss": 0.65816516, "learning_rate": 7.972171409538209e-10, "loss": 0.6794942, "num_input_tokens_seen": 355786075, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.6953125, "step": 16487, "time_per_iteration": 2.638550281524658 }, { "auxiliary_loss_clip": 0.01100819, "auxiliary_loss_mlp": 0.01027028, "balance_loss_clip": 1.01595986, "balance_loss_mlp": 1.03395605, "epoch": 0.9913121899894785, "flos": 23769668263680.0, "grad_norm": 128.61984801344295, "language_loss": 0.76619947, "learning_rate": 7.862596780936481e-10, "loss": 0.78747797, "num_input_tokens_seen": 355806295, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.671875, "step": 16488, "time_per_iteration": 2.4568140506744385 }, { "auxiliary_loss_clip": 0.01107413, "auxiliary_loss_mlp": 0.01029164, "balance_loss_clip": 1.01643193, "balance_loss_mlp": 1.03538084, "epoch": 0.9913723132421464, "flos": 23769668263680.0, "grad_norm": 2.3211608188000037, "language_loss": 0.68424463, "learning_rate": 7.753780256190001e-10, "loss": 0.70561039, "num_input_tokens_seen": 355825730, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71875, "step": 16489, "time_per_iteration": 2.480130434036255 }, { "auxiliary_loss_clip": 0.01027704, "auxiliary_loss_mlp": 0.01003774, "balance_loss_clip": 1.00269544, "balance_loss_mlp": 1.0054996, "epoch": 0.9914324364948144, "flos": 71267419820160.0, "grad_norm": 0.6062108054161365, "language_loss": 0.52555728, "learning_rate": 7.645721839424357e-10, "loss": 0.54587209, "num_input_tokens_seen": 355891545, "router_z_loss_clip": 0.01080322, "router_z_loss_mlp": 0.22265625, "step": 16490, "time_per_iteration": 3.2060256004333496 }, { "auxiliary_loss_clip": 0.01110249, "auxiliary_loss_mlp": 0.01032263, "balance_loss_clip": 1.01865518, "balance_loss_mlp": 1.03792, "epoch": 0.9914925597474823, "flos": 23695440808320.0, "grad_norm": 1.717658454916735, "language_loss": 0.75777763, "learning_rate": 7.538421534734052e-10, "loss": 0.77920276, "num_input_tokens_seen": 355909920, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 16491, "time_per_iteration": 2.466858386993408 }, { "auxiliary_loss_clip": 0.01109046, "auxiliary_loss_mlp": 0.01031547, "balance_loss_clip": 1.01832628, "balance_loss_mlp": 1.03824997, "epoch": 0.9915526830001503, "flos": 13433822749440.0, "grad_norm": 1.9704675103001519, "language_loss": 0.70739698, "learning_rate": 7.431879346191383e-10, "loss": 0.72880292, "num_input_tokens_seen": 355923130, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.70703125, "step": 16492, "time_per_iteration": 2.4012527465820312 }, { "auxiliary_loss_clip": 0.01102703, "auxiliary_loss_mlp": 0.01031659, "balance_loss_clip": 1.01846218, "balance_loss_mlp": 1.03401542, "epoch": 0.9916128062528182, "flos": 20740962407040.0, "grad_norm": 10.35405625867132, "language_loss": 0.68136054, "learning_rate": 7.326095277837563e-10, "loss": 0.70270419, "num_input_tokens_seen": 355941960, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6875, "step": 16493, "time_per_iteration": 2.441256523132324 }, { "auxiliary_loss_clip": 0.01106571, "auxiliary_loss_mlp": 0.01032591, "balance_loss_clip": 1.02059233, "balance_loss_mlp": 1.03596616, "epoch": 0.9916729295054862, "flos": 22487082353280.0, "grad_norm": 1.793572416539433, "language_loss": 0.71413058, "learning_rate": 7.221069333678276e-10, "loss": 0.73552215, "num_input_tokens_seen": 355961640, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.70703125, "step": 16494, "time_per_iteration": 3.932345390319824 }, { "auxiliary_loss_clip": 0.01104255, "auxiliary_loss_mlp": 0.01030319, "balance_loss_clip": 1.0171349, "balance_loss_mlp": 1.0348016, "epoch": 0.9917330527581543, "flos": 14792467708800.0, "grad_norm": 2.0372777399046105, "language_loss": 0.68129063, "learning_rate": 7.116801517701443e-10, "loss": 0.70263642, "num_input_tokens_seen": 355977980, "router_z_loss_clip": 0.13183594, "router_z_loss_mlp": 0.6953125, "step": 16495, "time_per_iteration": 2.4027154445648193 }, { "auxiliary_loss_clip": 0.01027543, "auxiliary_loss_mlp": 0.01000963, "balance_loss_clip": 0.99986595, "balance_loss_mlp": 1.00534391, "epoch": 0.9917931760108222, "flos": 59191595585280.0, "grad_norm": 1.181299669190586, "language_loss": 0.53490579, "learning_rate": 7.013291833859458e-10, "loss": 0.55519086, "num_input_tokens_seen": 356042900, "router_z_loss_clip": 0.01098633, "router_z_loss_mlp": 0.22265625, "step": 16496, "time_per_iteration": 3.177849531173706 }, { "auxiliary_loss_clip": 0.0110561, "auxiliary_loss_mlp": 0.01031148, "balance_loss_clip": 1.01781464, "balance_loss_mlp": 1.03538132, "epoch": 0.9918532992634902, "flos": 26761637485440.0, "grad_norm": 1.902999982926975, "language_loss": 0.7147069, "learning_rate": 6.91054028607585e-10, "loss": 0.73607445, "num_input_tokens_seen": 356063000, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 16497, "time_per_iteration": 3.908104419708252 }, { "auxiliary_loss_clip": 0.01107474, "auxiliary_loss_mlp": 0.01034913, "balance_loss_clip": 1.02157366, "balance_loss_mlp": 1.03551149, "epoch": 0.9919134225161581, "flos": 14975719920000.0, "grad_norm": 4.640827878728116, "language_loss": 0.82153112, "learning_rate": 6.808546878249721e-10, "loss": 0.84295499, "num_input_tokens_seen": 356078130, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 16498, "time_per_iteration": 2.398078441619873 }, { "auxiliary_loss_clip": 0.01106794, "auxiliary_loss_mlp": 0.01037329, "balance_loss_clip": 1.02492523, "balance_loss_mlp": 1.0370996, "epoch": 0.9919735457688261, "flos": 27818201064960.0, "grad_norm": 1.6635811710829806, "language_loss": 0.68253309, "learning_rate": 6.707311614246869e-10, "loss": 0.70397425, "num_input_tokens_seen": 356101655, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6953125, "step": 16499, "time_per_iteration": 2.5135810375213623 }, { "auxiliary_loss_clip": 0.01105395, "auxiliary_loss_mlp": 0.01025733, "balance_loss_clip": 1.01391923, "balance_loss_mlp": 1.03620982, "epoch": 0.992033669021494, "flos": 22562782266240.0, "grad_norm": 2.28376399900316, "language_loss": 0.82303727, "learning_rate": 6.606834497904223e-10, "loss": 0.84434849, "num_input_tokens_seen": 356121425, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69140625, "step": 16500, "time_per_iteration": 2.4338390827178955 }, { "auxiliary_loss_clip": 0.01106694, "auxiliary_loss_mlp": 0.01028947, "balance_loss_clip": 1.01665604, "balance_loss_mlp": 1.03619802, "epoch": 0.9920937922741621, "flos": 25374587846400.0, "grad_norm": 2.9827962616648858, "language_loss": 0.81906247, "learning_rate": 6.507115533036511e-10, "loss": 0.84041893, "num_input_tokens_seen": 356140710, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.703125, "step": 16501, "time_per_iteration": 2.4896347522735596 }, { "auxiliary_loss_clip": 0.01104809, "auxiliary_loss_mlp": 0.01028204, "balance_loss_clip": 1.01589561, "balance_loss_mlp": 1.03548002, "epoch": 0.99215391552683, "flos": 22054466949120.0, "grad_norm": 2.0422552221452173, "language_loss": 0.77043128, "learning_rate": 6.408154723420711e-10, "loss": 0.7917614, "num_input_tokens_seen": 356159835, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 16502, "time_per_iteration": 2.4555604457855225 }, { "auxiliary_loss_clip": 0.01105344, "auxiliary_loss_mlp": 0.01029789, "balance_loss_clip": 1.01648498, "balance_loss_mlp": 1.03492093, "epoch": 0.992214038779498, "flos": 15413937845760.0, "grad_norm": 2.543757486193513, "language_loss": 0.71342659, "learning_rate": 6.309952072811597e-10, "loss": 0.73477781, "num_input_tokens_seen": 356177555, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.703125, "step": 16503, "time_per_iteration": 5.279479265213013 }, { "auxiliary_loss_clip": 0.01027616, "auxiliary_loss_mlp": 0.01002548, "balance_loss_clip": 1.00147462, "balance_loss_mlp": 1.00549126, "epoch": 0.9922741620321659, "flos": 62014498467840.0, "grad_norm": 0.6305969234295338, "language_loss": 0.55143404, "learning_rate": 6.212507584932858e-10, "loss": 0.57173562, "num_input_tokens_seen": 356244975, "router_z_loss_clip": 0.01074219, "router_z_loss_mlp": 0.22070312, "step": 16504, "time_per_iteration": 3.174103021621704 }, { "auxiliary_loss_clip": 0.01102381, "auxiliary_loss_mlp": 0.01026671, "balance_loss_clip": 1.01551294, "balance_loss_mlp": 1.03414583, "epoch": 0.9923342852848339, "flos": 17165480745600.0, "grad_norm": 2.143160449453085, "language_loss": 0.69597542, "learning_rate": 6.115821263481536e-10, "loss": 0.7172659, "num_input_tokens_seen": 356262605, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.68359375, "step": 16505, "time_per_iteration": 2.412611246109009 }, { "auxiliary_loss_clip": 0.01106118, "auxiliary_loss_mlp": 0.01029487, "balance_loss_clip": 1.01612377, "balance_loss_mlp": 1.03425372, "epoch": 0.9923944085375018, "flos": 23183210908800.0, "grad_norm": 2.311737144113983, "language_loss": 0.65717703, "learning_rate": 6.019893112119146e-10, "loss": 0.67853308, "num_input_tokens_seen": 356278935, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.71875, "step": 16506, "time_per_iteration": 2.4640676975250244 }, { "auxiliary_loss_clip": 0.0110255, "auxiliary_loss_mlp": 0.01028373, "balance_loss_clip": 1.01635087, "balance_loss_mlp": 1.03407598, "epoch": 0.9924545317901698, "flos": 20813861059200.0, "grad_norm": 1.9393383549759138, "language_loss": 0.62994313, "learning_rate": 5.924723134487219e-10, "loss": 0.65125239, "num_input_tokens_seen": 356295675, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.68359375, "step": 16507, "time_per_iteration": 2.4333274364471436 }, { "auxiliary_loss_clip": 0.01104628, "auxiliary_loss_mlp": 0.01036211, "balance_loss_clip": 1.023628, "balance_loss_mlp": 1.03536201, "epoch": 0.9925146550428379, "flos": 20083437993600.0, "grad_norm": 2.659706488100528, "language_loss": 0.72401333, "learning_rate": 5.830311334193983e-10, "loss": 0.74542171, "num_input_tokens_seen": 356312885, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6953125, "step": 16508, "time_per_iteration": 2.4293010234832764 }, { "auxiliary_loss_clip": 0.01103495, "auxiliary_loss_mlp": 0.01030387, "balance_loss_clip": 1.01752472, "balance_loss_mlp": 1.0340395, "epoch": 0.9925747782955058, "flos": 24973717086720.0, "grad_norm": 1.7506263485092004, "language_loss": 0.70229429, "learning_rate": 5.736657714818793e-10, "loss": 0.72363305, "num_input_tokens_seen": 356334070, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 16509, "time_per_iteration": 2.4543604850769043 }, { "auxiliary_loss_clip": 0.01104198, "auxiliary_loss_mlp": 0.0103392, "balance_loss_clip": 1.02112269, "balance_loss_mlp": 1.0343219, "epoch": 0.9926349015481738, "flos": 60472526492160.0, "grad_norm": 1.6732996293100635, "language_loss": 0.68580663, "learning_rate": 5.643762279912146e-10, "loss": 0.70718783, "num_input_tokens_seen": 356359410, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.69921875, "step": 16510, "time_per_iteration": 2.7883517742156982 }, { "auxiliary_loss_clip": 0.01106424, "auxiliary_loss_mlp": 0.0103612, "balance_loss_clip": 1.02384162, "balance_loss_mlp": 1.03574538, "epoch": 0.9926950248008417, "flos": 20741716592640.0, "grad_norm": 2.1054104871385526, "language_loss": 0.81447834, "learning_rate": 5.551625032997886e-10, "loss": 0.83590376, "num_input_tokens_seen": 356378345, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.70703125, "step": 16511, "time_per_iteration": 2.425261974334717 }, { "auxiliary_loss_clip": 0.01100625, "auxiliary_loss_mlp": 0.01029595, "balance_loss_clip": 1.01779282, "balance_loss_mlp": 1.03333759, "epoch": 0.9927551480535097, "flos": 24352965221760.0, "grad_norm": 1.814103912670115, "language_loss": 0.91847825, "learning_rate": 5.460245977570998e-10, "loss": 0.93978047, "num_input_tokens_seen": 356397345, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.671875, "step": 16512, "time_per_iteration": 2.4737606048583984 }, { "auxiliary_loss_clip": 0.01027811, "auxiliary_loss_mlp": 0.01000369, "balance_loss_clip": 0.99933827, "balance_loss_mlp": 1.00563645, "epoch": 0.9928152713061776, "flos": 71275572207360.0, "grad_norm": 0.7009684707449748, "language_loss": 0.55218017, "learning_rate": 5.369625117095378e-10, "loss": 0.57246196, "num_input_tokens_seen": 356459160, "router_z_loss_clip": 0.01031494, "router_z_loss_mlp": 0.22167969, "step": 16513, "time_per_iteration": 3.1770100593566895 }, { "auxiliary_loss_clip": 0.01101637, "auxiliary_loss_mlp": 0.01028065, "balance_loss_clip": 1.01571465, "balance_loss_mlp": 1.03417742, "epoch": 0.9928753945588457, "flos": 57809499045120.0, "grad_norm": 1.4567833007674085, "language_loss": 0.6510905, "learning_rate": 5.279762455006054e-10, "loss": 0.67238748, "num_input_tokens_seen": 356486405, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.67578125, "step": 16514, "time_per_iteration": 2.8007278442382812 }, { "auxiliary_loss_clip": 0.01106366, "auxiliary_loss_mlp": 0.01029519, "balance_loss_clip": 1.01627493, "balance_loss_mlp": 1.03634357, "epoch": 0.9929355178115136, "flos": 19568981450880.0, "grad_norm": 2.047516719260903, "language_loss": 0.73192424, "learning_rate": 5.190657994713632e-10, "loss": 0.75328314, "num_input_tokens_seen": 356502905, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.69921875, "step": 16515, "time_per_iteration": 2.4351048469543457 }, { "auxiliary_loss_clip": 0.01106032, "auxiliary_loss_mlp": 0.01030682, "balance_loss_clip": 1.01886845, "balance_loss_mlp": 1.03709972, "epoch": 0.9929956410641816, "flos": 22964658606720.0, "grad_norm": 1.454000056681413, "language_loss": 0.77089953, "learning_rate": 5.102311739593191e-10, "loss": 0.79226673, "num_input_tokens_seen": 356523830, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 16516, "time_per_iteration": 2.4630134105682373 }, { "auxiliary_loss_clip": 0.01101221, "auxiliary_loss_mlp": 0.01025847, "balance_loss_clip": 1.01465344, "balance_loss_mlp": 1.03317332, "epoch": 0.9930557643168495, "flos": 22566409539840.0, "grad_norm": 2.487693069608404, "language_loss": 0.78007835, "learning_rate": 5.014723692997602e-10, "loss": 0.80134904, "num_input_tokens_seen": 356543965, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.6796875, "step": 16517, "time_per_iteration": 2.4694297313690186 }, { "auxiliary_loss_clip": 0.01108058, "auxiliary_loss_mlp": 0.01038044, "balance_loss_clip": 1.02406669, "balance_loss_mlp": 1.03569627, "epoch": 0.9931158875695175, "flos": 17201032231680.0, "grad_norm": 2.286828346236953, "language_loss": 0.67605937, "learning_rate": 4.927893858248655e-10, "loss": 0.69752038, "num_input_tokens_seen": 356561530, "router_z_loss_clip": 0.13964844, "router_z_loss_mlp": 0.72265625, "step": 16518, "time_per_iteration": 2.41292667388916 }, { "auxiliary_loss_clip": 0.01027701, "auxiliary_loss_mlp": 0.01002861, "balance_loss_clip": 1.0018059, "balance_loss_mlp": 1.0054704, "epoch": 0.9931760108221854, "flos": 63711204278400.0, "grad_norm": 0.7408033965523709, "language_loss": 0.53447235, "learning_rate": 4.84182223863483e-10, "loss": 0.55477798, "num_input_tokens_seen": 356616845, "router_z_loss_clip": 0.01055908, "router_z_loss_mlp": 0.22265625, "step": 16519, "time_per_iteration": 2.956960916519165 }, { "auxiliary_loss_clip": 0.01102026, "auxiliary_loss_mlp": 0.01030933, "balance_loss_clip": 1.01917934, "balance_loss_mlp": 1.03499222, "epoch": 0.9932361340748534, "flos": 15304805349120.0, "grad_norm": 1.8937227435739366, "language_loss": 0.59986472, "learning_rate": 4.756508837426842e-10, "loss": 0.6211943, "num_input_tokens_seen": 356633560, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.671875, "step": 16520, "time_per_iteration": 2.4355359077453613 }, { "auxiliary_loss_clip": 0.0110313, "auxiliary_loss_mlp": 0.01032938, "balance_loss_clip": 1.02037954, "balance_loss_mlp": 1.03504252, "epoch": 0.9932962573275215, "flos": 36064906727040.0, "grad_norm": 1.9891449702981374, "language_loss": 0.62061656, "learning_rate": 4.671953657853223e-10, "loss": 0.64197731, "num_input_tokens_seen": 356657600, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.6796875, "step": 16521, "time_per_iteration": 2.597259998321533 }, { "auxiliary_loss_clip": 0.01106926, "auxiliary_loss_mlp": 0.01035057, "balance_loss_clip": 1.02194405, "balance_loss_mlp": 1.03613424, "epoch": 0.9933563805801894, "flos": 21470523546240.0, "grad_norm": 1.9374820482444846, "language_loss": 0.74217176, "learning_rate": 4.5881567031225145e-10, "loss": 0.76359159, "num_input_tokens_seen": 356675880, "router_z_loss_clip": 0.13085938, "router_z_loss_mlp": 0.7109375, "step": 16522, "time_per_iteration": 2.4368233680725098 }, { "auxiliary_loss_clip": 0.01102396, "auxiliary_loss_mlp": 0.01028841, "balance_loss_clip": 1.01690769, "balance_loss_mlp": 1.03482342, "epoch": 0.9934165038328574, "flos": 23986532626560.0, "grad_norm": 1.8185251278300152, "language_loss": 0.7299751, "learning_rate": 4.5051179764143964e-10, "loss": 0.75128746, "num_input_tokens_seen": 356696000, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.671875, "step": 16523, "time_per_iteration": 2.4895336627960205 }, { "auxiliary_loss_clip": 0.01101852, "auxiliary_loss_mlp": 0.01028889, "balance_loss_clip": 1.01693821, "balance_loss_mlp": 1.03341866, "epoch": 0.9934766270855253, "flos": 21907807718400.0, "grad_norm": 2.7315002942260995, "language_loss": 0.71313059, "learning_rate": 4.422837480875241e-10, "loss": 0.734438, "num_input_tokens_seen": 356716845, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.68359375, "step": 16524, "time_per_iteration": 2.454694986343384 }, { "auxiliary_loss_clip": 0.01104445, "auxiliary_loss_mlp": 0.0102949, "balance_loss_clip": 1.01773024, "balance_loss_mlp": 1.03600955, "epoch": 0.9935367503381933, "flos": 17129139160320.0, "grad_norm": 2.18746009015348, "language_loss": 0.79546517, "learning_rate": 4.341315219624775e-10, "loss": 0.81680453, "num_input_tokens_seen": 356732100, "router_z_loss_clip": 0.11767578, "router_z_loss_mlp": 0.68359375, "step": 16525, "time_per_iteration": 2.4388647079467773 }, { "auxiliary_loss_clip": 0.01101266, "auxiliary_loss_mlp": 0.01026201, "balance_loss_clip": 1.01401722, "balance_loss_mlp": 1.03420615, "epoch": 0.9935968735908612, "flos": 22346241125760.0, "grad_norm": 3.0932469998385272, "language_loss": 0.74632812, "learning_rate": 4.2605511957582995e-10, "loss": 0.7676028, "num_input_tokens_seen": 356751480, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.671875, "step": 16526, "time_per_iteration": 2.438371181488037 }, { "auxiliary_loss_clip": 0.01098813, "auxiliary_loss_mlp": 0.01026937, "balance_loss_clip": 1.01567173, "balance_loss_mlp": 1.03280711, "epoch": 0.9936569968435293, "flos": 29460539640960.0, "grad_norm": 1.7750135893047954, "language_loss": 0.72295046, "learning_rate": 4.180545412333369e-10, "loss": 0.74420798, "num_input_tokens_seen": 356772650, "router_z_loss_clip": 0.11230469, "router_z_loss_mlp": 0.66015625, "step": 16527, "time_per_iteration": 2.5228431224823 }, { "auxiliary_loss_clip": 0.01104765, "auxiliary_loss_mlp": 0.01030394, "balance_loss_clip": 1.01804399, "balance_loss_mlp": 1.03428793, "epoch": 0.9937171200961972, "flos": 16544046522240.0, "grad_norm": 2.087763598118057, "language_loss": 0.76277989, "learning_rate": 4.1012978723875547e-10, "loss": 0.78413153, "num_input_tokens_seen": 356788510, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 16528, "time_per_iteration": 2.397705316543579 }, { "auxiliary_loss_clip": 0.01104257, "auxiliary_loss_mlp": 0.01025581, "balance_loss_clip": 1.01284957, "balance_loss_mlp": 1.03451622, "epoch": 0.9937772433488652, "flos": 24390276474240.0, "grad_norm": 6.560548501818992, "language_loss": 0.67360783, "learning_rate": 4.022808578922898e-10, "loss": 0.69490618, "num_input_tokens_seen": 356809115, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 16529, "time_per_iteration": 2.4889230728149414 }, { "auxiliary_loss_clip": 0.01109261, "auxiliary_loss_mlp": 0.01036083, "balance_loss_clip": 1.02246928, "balance_loss_mlp": 1.03677726, "epoch": 0.9938373666015331, "flos": 15669909141120.0, "grad_norm": 3.23796909847354, "language_loss": 0.6553027, "learning_rate": 3.9450775349170186e-10, "loss": 0.67675614, "num_input_tokens_seen": 356826410, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.7265625, "step": 16530, "time_per_iteration": 2.4364802837371826 }, { "auxiliary_loss_clip": 0.01105508, "auxiliary_loss_mlp": 0.0102488, "balance_loss_clip": 1.01374531, "balance_loss_mlp": 1.03620219, "epoch": 0.9938974898542011, "flos": 19496190539520.0, "grad_norm": 4.182531108213863, "language_loss": 0.7127794, "learning_rate": 3.8681047433186676e-10, "loss": 0.73408324, "num_input_tokens_seen": 356844990, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6953125, "step": 16531, "time_per_iteration": 2.4290130138397217 }, { "auxiliary_loss_clip": 0.01106629, "auxiliary_loss_mlp": 0.01029525, "balance_loss_clip": 1.01716304, "balance_loss_mlp": 1.03643179, "epoch": 0.993957613106869, "flos": 26906896085760.0, "grad_norm": 1.7734804278728031, "language_loss": 0.74303657, "learning_rate": 3.791890207045512e-10, "loss": 0.7643981, "num_input_tokens_seen": 356866530, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.703125, "step": 16532, "time_per_iteration": 2.496840000152588 }, { "auxiliary_loss_clip": 0.01098937, "auxiliary_loss_mlp": 0.01030359, "balance_loss_clip": 1.01971936, "balance_loss_mlp": 1.03543544, "epoch": 0.994017736359537, "flos": 14939593816320.0, "grad_norm": 1.8238833959620524, "language_loss": 0.70568848, "learning_rate": 3.7164339289885717e-10, "loss": 0.72698146, "num_input_tokens_seen": 356884660, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.6328125, "step": 16533, "time_per_iteration": 2.430436849594116 }, { "auxiliary_loss_clip": 0.01105839, "auxiliary_loss_mlp": 0.01026181, "balance_loss_clip": 1.01338398, "balance_loss_mlp": 1.03541589, "epoch": 0.9940778596122051, "flos": 15377883569280.0, "grad_norm": 2.4371207660688956, "language_loss": 0.8388741, "learning_rate": 3.641735912007782e-10, "loss": 0.86019427, "num_input_tokens_seen": 356900895, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.703125, "step": 16534, "time_per_iteration": 2.4145846366882324 }, { "auxiliary_loss_clip": 0.01098512, "auxiliary_loss_mlp": 0.01027295, "balance_loss_clip": 1.01598811, "balance_loss_mlp": 1.03389287, "epoch": 0.994137982864873, "flos": 25228108183680.0, "grad_norm": 1.8991217169473515, "language_loss": 0.6605072, "learning_rate": 3.567796158934211e-10, "loss": 0.6817652, "num_input_tokens_seen": 356920985, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.64453125, "step": 16535, "time_per_iteration": 2.481966733932495 }, { "auxiliary_loss_clip": 0.01102633, "auxiliary_loss_mlp": 0.01028679, "balance_loss_clip": 1.01759839, "balance_loss_mlp": 1.03615117, "epoch": 0.994198106117541, "flos": 18442140912000.0, "grad_norm": 1.7776435230254035, "language_loss": 0.64564347, "learning_rate": 3.4946146725767235e-10, "loss": 0.66695666, "num_input_tokens_seen": 356939800, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.6640625, "step": 16536, "time_per_iteration": 3.8803422451019287 }, { "auxiliary_loss_clip": 0.01101881, "auxiliary_loss_mlp": 0.01031033, "balance_loss_clip": 1.01844406, "balance_loss_mlp": 1.0344739, "epoch": 0.9942582293702089, "flos": 16654112772480.0, "grad_norm": 3.9043966397596397, "language_loss": 0.7895152, "learning_rate": 3.4221914557064357e-10, "loss": 0.8108443, "num_input_tokens_seen": 356957780, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.67578125, "step": 16537, "time_per_iteration": 2.4271583557128906 }, { "auxiliary_loss_clip": 0.01109635, "auxiliary_loss_mlp": 0.01031841, "balance_loss_clip": 1.01898384, "balance_loss_mlp": 1.03648734, "epoch": 0.9943183526228769, "flos": 21944580266880.0, "grad_norm": 1.5641157991006498, "language_loss": 0.6908071, "learning_rate": 3.35052651107004e-10, "loss": 0.7122218, "num_input_tokens_seen": 356979185, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.73046875, "step": 16538, "time_per_iteration": 2.4981749057769775 }, { "auxiliary_loss_clip": 0.0109942, "auxiliary_loss_mlp": 0.01030545, "balance_loss_clip": 1.01890433, "balance_loss_mlp": 1.03313923, "epoch": 0.9943784758755448, "flos": 23842566915840.0, "grad_norm": 1.8432372978712153, "language_loss": 0.75199175, "learning_rate": 3.2796198413853614e-10, "loss": 0.77329141, "num_input_tokens_seen": 356997735, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.6640625, "step": 16539, "time_per_iteration": 3.923583745956421 }, { "auxiliary_loss_clip": 0.0110614, "auxiliary_loss_mlp": 0.01033754, "balance_loss_clip": 1.0217917, "balance_loss_mlp": 1.03664863, "epoch": 0.9944385991282129, "flos": 21469984842240.0, "grad_norm": 2.7175859447685276, "language_loss": 0.70787454, "learning_rate": 3.209471449341361e-10, "loss": 0.72927344, "num_input_tokens_seen": 357015660, "router_z_loss_clip": 0.11962891, "router_z_loss_mlp": 0.6953125, "step": 16540, "time_per_iteration": 2.4944934844970703 }, { "auxiliary_loss_clip": 0.01100656, "auxiliary_loss_mlp": 0.01025236, "balance_loss_clip": 1.01443017, "balance_loss_mlp": 1.03352737, "epoch": 0.9944987223808808, "flos": 22927024131840.0, "grad_norm": 2.4944804586513847, "language_loss": 0.75106752, "learning_rate": 3.140081337600353e-10, "loss": 0.77232647, "num_input_tokens_seen": 357034800, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.671875, "step": 16541, "time_per_iteration": 2.437852144241333 }, { "auxiliary_loss_clip": 0.01102066, "auxiliary_loss_mlp": 0.01033709, "balance_loss_clip": 1.02110302, "balance_loss_mlp": 1.03361368, "epoch": 0.9945588456335488, "flos": 22383013674240.0, "grad_norm": 1.8990291553530112, "language_loss": 0.76748037, "learning_rate": 3.0714495087891255e-10, "loss": 0.78883815, "num_input_tokens_seen": 357053785, "router_z_loss_clip": 0.12597656, "router_z_loss_mlp": 0.68359375, "step": 16542, "time_per_iteration": 2.4581990242004395 }, { "auxiliary_loss_clip": 0.01106492, "auxiliary_loss_mlp": 0.01026412, "balance_loss_clip": 1.01356125, "balance_loss_mlp": 1.03618538, "epoch": 0.9946189688862167, "flos": 21397517153280.0, "grad_norm": 2.2460844423455106, "language_loss": 0.74204034, "learning_rate": 3.0035759655122615e-10, "loss": 0.76336938, "num_input_tokens_seen": 357072025, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.703125, "step": 16543, "time_per_iteration": 2.4349308013916016 }, { "auxiliary_loss_clip": 0.01106176, "auxiliary_loss_mlp": 0.01028506, "balance_loss_clip": 1.01594758, "balance_loss_mlp": 1.03533447, "epoch": 0.9946790921388847, "flos": 12416545670400.0, "grad_norm": 3.532212971642618, "language_loss": 0.82101166, "learning_rate": 2.9364607103454785e-10, "loss": 0.84235847, "num_input_tokens_seen": 357086960, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.70703125, "step": 16544, "time_per_iteration": 2.3875787258148193 }, { "auxiliary_loss_clip": 0.01103243, "auxiliary_loss_mlp": 0.01029588, "balance_loss_clip": 1.01742816, "balance_loss_mlp": 1.03484416, "epoch": 0.9947392153915526, "flos": 19058295836160.0, "grad_norm": 1.9557090493717786, "language_loss": 0.79011446, "learning_rate": 2.870103745831187e-10, "loss": 0.81144279, "num_input_tokens_seen": 357105095, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.68359375, "step": 16545, "time_per_iteration": 5.307588815689087 }, { "auxiliary_loss_clip": 0.01107806, "auxiliary_loss_mlp": 0.01026895, "balance_loss_clip": 1.01458633, "balance_loss_mlp": 1.03678381, "epoch": 0.9947993386442207, "flos": 27308808339840.0, "grad_norm": 1.8605072673800092, "language_loss": 0.7268194, "learning_rate": 2.8045050744873733e-10, "loss": 0.74816644, "num_input_tokens_seen": 357125065, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.7109375, "step": 16546, "time_per_iteration": 2.506237506866455 }, { "auxiliary_loss_clip": 0.01101356, "auxiliary_loss_mlp": 0.01033593, "balance_loss_clip": 1.02203548, "balance_loss_mlp": 1.03418255, "epoch": 0.9948594618968887, "flos": 20806498771200.0, "grad_norm": 2.3789284892281892, "language_loss": 0.77434719, "learning_rate": 2.739664698798716e-10, "loss": 0.79569674, "num_input_tokens_seen": 357141600, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.671875, "step": 16547, "time_per_iteration": 2.4274678230285645 }, { "auxiliary_loss_clip": 0.01102989, "auxiliary_loss_mlp": 0.01031025, "balance_loss_clip": 1.01970017, "balance_loss_mlp": 1.03439665, "epoch": 0.9949195851495566, "flos": 23292953936640.0, "grad_norm": 2.4532994794342082, "language_loss": 0.70147443, "learning_rate": 2.67558262122769e-10, "loss": 0.72281456, "num_input_tokens_seen": 357157880, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6875, "step": 16548, "time_per_iteration": 2.4329562187194824 }, { "auxiliary_loss_clip": 0.01103685, "auxiliary_loss_mlp": 0.01029676, "balance_loss_clip": 1.01803505, "balance_loss_mlp": 1.03539073, "epoch": 0.9949797084022246, "flos": 18515470527360.0, "grad_norm": 1.7856804562646846, "language_loss": 0.75550359, "learning_rate": 2.6122588442012427e-10, "loss": 0.77683723, "num_input_tokens_seen": 357176705, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.68359375, "step": 16549, "time_per_iteration": 2.4310927391052246 }, { "auxiliary_loss_clip": 0.01107485, "auxiliary_loss_mlp": 0.01031916, "balance_loss_clip": 1.01861238, "balance_loss_mlp": 1.0366137, "epoch": 0.9950398316548925, "flos": 30407719328640.0, "grad_norm": 2.370255803008857, "language_loss": 0.74525017, "learning_rate": 2.5496933701241177e-10, "loss": 0.76664418, "num_input_tokens_seen": 357197630, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 16550, "time_per_iteration": 2.5090548992156982 }, { "auxiliary_loss_clip": 0.01102903, "auxiliary_loss_mlp": 0.01028172, "balance_loss_clip": 1.01673388, "balance_loss_mlp": 1.03449035, "epoch": 0.9950999549075605, "flos": 19900868140800.0, "grad_norm": 1.6450507237661065, "language_loss": 0.77696878, "learning_rate": 2.4878862013655297e-10, "loss": 0.79827952, "num_input_tokens_seen": 357215445, "router_z_loss_clip": 0.11425781, "router_z_loss_mlp": 0.68359375, "step": 16551, "time_per_iteration": 2.4385576248168945 }, { "auxiliary_loss_clip": 0.01096908, "auxiliary_loss_mlp": 0.01028749, "balance_loss_clip": 1.01815128, "balance_loss_mlp": 1.03335583, "epoch": 0.9951600781602284, "flos": 17603555016960.0, "grad_norm": 1.6487238443409968, "language_loss": 0.66444051, "learning_rate": 2.426837340270271e-10, "loss": 0.68569708, "num_input_tokens_seen": 357234285, "router_z_loss_clip": 0.10644531, "router_z_loss_mlp": 0.63671875, "step": 16552, "time_per_iteration": 2.421987295150757 }, { "auxiliary_loss_clip": 0.01102935, "auxiliary_loss_mlp": 0.01026603, "balance_loss_clip": 1.01456332, "balance_loss_mlp": 1.033324, "epoch": 0.9952202014128965, "flos": 28950715952640.0, "grad_norm": 1.513337858231281, "language_loss": 0.81535733, "learning_rate": 2.3665467891520465e-10, "loss": 0.83665276, "num_input_tokens_seen": 357257565, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 16553, "time_per_iteration": 2.5391998291015625 }, { "auxiliary_loss_clip": 0.01027562, "auxiliary_loss_mlp": 0.01001876, "balance_loss_clip": 1.00089228, "balance_loss_mlp": 1.00552869, "epoch": 0.9952803246655644, "flos": 70810386145920.0, "grad_norm": 0.7407649851111329, "language_loss": 0.57318401, "learning_rate": 2.3070145503001348e-10, "loss": 0.59347844, "num_input_tokens_seen": 357320205, "router_z_loss_clip": 0.00982666, "router_z_loss_mlp": 0.22070312, "step": 16554, "time_per_iteration": 3.175626516342163 }, { "auxiliary_loss_clip": 0.01105038, "auxiliary_loss_mlp": 0.01033849, "balance_loss_clip": 1.02201772, "balance_loss_mlp": 1.03513145, "epoch": 0.9953404479182324, "flos": 21799070271360.0, "grad_norm": 2.649938245584799, "language_loss": 0.77032918, "learning_rate": 2.24824062597051e-10, "loss": 0.79171807, "num_input_tokens_seen": 357340695, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.69921875, "step": 16555, "time_per_iteration": 2.4583370685577393 }, { "auxiliary_loss_clip": 0.0110434, "auxiliary_loss_mlp": 0.01033858, "balance_loss_clip": 1.02133465, "balance_loss_mlp": 1.03497362, "epoch": 0.9954005711709003, "flos": 21937397546880.0, "grad_norm": 1.9533294934149876, "language_loss": 0.86065239, "learning_rate": 2.1902250183902793e-10, "loss": 0.88203436, "num_input_tokens_seen": 357357505, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6953125, "step": 16556, "time_per_iteration": 2.430067300796509 }, { "auxiliary_loss_clip": 0.01101639, "auxiliary_loss_mlp": 0.0103152, "balance_loss_clip": 1.0191642, "balance_loss_mlp": 1.03495717, "epoch": 0.9954606944235683, "flos": 19354559212800.0, "grad_norm": 1.77411200758014, "language_loss": 0.73197305, "learning_rate": 2.132967729762125e-10, "loss": 0.7533046, "num_input_tokens_seen": 357375395, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.66796875, "step": 16557, "time_per_iteration": 2.4382553100585938 }, { "auxiliary_loss_clip": 0.01101474, "auxiliary_loss_mlp": 0.01030997, "balance_loss_clip": 1.01935053, "balance_loss_mlp": 1.035496, "epoch": 0.9955208176762362, "flos": 30518611591680.0, "grad_norm": 3.3263587179493395, "language_loss": 0.7655524, "learning_rate": 2.0764687622554233e-10, "loss": 0.78687716, "num_input_tokens_seen": 357397375, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.66015625, "step": 16558, "time_per_iteration": 2.518697500228882 }, { "auxiliary_loss_clip": 0.01102844, "auxiliary_loss_mlp": 0.01031091, "balance_loss_clip": 1.01843691, "balance_loss_mlp": 1.0332768, "epoch": 0.9955809409289043, "flos": 30008249199360.0, "grad_norm": 2.625762666916366, "language_loss": 0.6384812, "learning_rate": 2.0207281180129044e-10, "loss": 0.65982056, "num_input_tokens_seen": 357418880, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6953125, "step": 16559, "time_per_iteration": 2.531848430633545 }, { "auxiliary_loss_clip": 0.0110199, "auxiliary_loss_mlp": 0.01026304, "balance_loss_clip": 1.0146873, "balance_loss_mlp": 1.03505397, "epoch": 0.9956410641815723, "flos": 21543278544000.0, "grad_norm": 1.8854316201021786, "language_loss": 0.74195486, "learning_rate": 1.965745799148433e-10, "loss": 0.76323783, "num_input_tokens_seen": 357438310, "router_z_loss_clip": 0.11621094, "router_z_loss_mlp": 0.66796875, "step": 16560, "time_per_iteration": 2.474566698074341 }, { "auxiliary_loss_clip": 0.01102049, "auxiliary_loss_mlp": 0.01027909, "balance_loss_clip": 1.01657808, "balance_loss_mlp": 1.03471208, "epoch": 0.9957011874342402, "flos": 21689470897920.0, "grad_norm": 1.9200224651107256, "language_loss": 0.79066378, "learning_rate": 1.9115218077470073e-10, "loss": 0.81196332, "num_input_tokens_seen": 357457155, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.671875, "step": 16561, "time_per_iteration": 2.458150625228882 }, { "auxiliary_loss_clip": 0.01100811, "auxiliary_loss_mlp": 0.01028593, "balance_loss_clip": 1.01747084, "balance_loss_mlp": 1.03519845, "epoch": 0.9957613106869082, "flos": 17702667619200.0, "grad_norm": 3.2451300956165365, "language_loss": 0.6503464, "learning_rate": 1.8580561458647614e-10, "loss": 0.6716404, "num_input_tokens_seen": 357468060, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.65625, "step": 16562, "time_per_iteration": 2.3865580558776855 }, { "auxiliary_loss_clip": 0.01107031, "auxiliary_loss_mlp": 0.01037706, "balance_loss_clip": 1.02426469, "balance_loss_mlp": 1.03566372, "epoch": 0.9958214339395761, "flos": 30555994671360.0, "grad_norm": 2.2937273431016463, "language_loss": 0.64499444, "learning_rate": 1.805348815528962e-10, "loss": 0.6664418, "num_input_tokens_seen": 357489665, "router_z_loss_clip": 0.13476562, "router_z_loss_mlp": 0.7109375, "step": 16563, "time_per_iteration": 2.518521547317505 }, { "auxiliary_loss_clip": 0.01101946, "auxiliary_loss_mlp": 0.01031531, "balance_loss_clip": 1.01909184, "balance_loss_mlp": 1.0345614, "epoch": 0.9958815571922441, "flos": 24169174306560.0, "grad_norm": 1.5509331542904843, "language_loss": 0.64660645, "learning_rate": 1.7533998187380105e-10, "loss": 0.66794121, "num_input_tokens_seen": 357511975, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.671875, "step": 16564, "time_per_iteration": 2.48917555809021 }, { "auxiliary_loss_clip": 0.01101714, "auxiliary_loss_mlp": 0.01027697, "balance_loss_clip": 1.01535916, "balance_loss_mlp": 1.03479266, "epoch": 0.995941680444912, "flos": 15487016065920.0, "grad_norm": 1.9094450166482357, "language_loss": 0.74138558, "learning_rate": 1.7022091574636633e-10, "loss": 0.7626797, "num_input_tokens_seen": 357529345, "router_z_loss_clip": 0.12353516, "router_z_loss_mlp": 0.66796875, "step": 16565, "time_per_iteration": 2.4232521057128906 }, { "auxiliary_loss_clip": 0.01103869, "auxiliary_loss_mlp": 0.01028576, "balance_loss_clip": 1.01690555, "balance_loss_mlp": 1.03376245, "epoch": 0.9960018036975801, "flos": 18621227145600.0, "grad_norm": 1.8839579881740705, "language_loss": 0.78982532, "learning_rate": 1.6517768336443694e-10, "loss": 0.81114978, "num_input_tokens_seen": 357547615, "router_z_loss_clip": 0.11669922, "router_z_loss_mlp": 0.69921875, "step": 16566, "time_per_iteration": 2.4499080181121826 }, { "auxiliary_loss_clip": 0.01100698, "auxiliary_loss_mlp": 0.01027776, "balance_loss_clip": 1.01658845, "balance_loss_mlp": 1.03402352, "epoch": 0.996061926950248, "flos": 20084120352000.0, "grad_norm": 1.569140549422977, "language_loss": 0.70414674, "learning_rate": 1.6021028491941535e-10, "loss": 0.72543144, "num_input_tokens_seen": 357567380, "router_z_loss_clip": 0.11181641, "router_z_loss_mlp": 0.66796875, "step": 16567, "time_per_iteration": 2.44661808013916 }, { "auxiliary_loss_clip": 0.01105319, "auxiliary_loss_mlp": 0.01031524, "balance_loss_clip": 1.0188638, "balance_loss_mlp": 1.03559101, "epoch": 0.996122050202916, "flos": 24347829576960.0, "grad_norm": 2.1087635031449485, "language_loss": 0.78695226, "learning_rate": 1.5531872059959538e-10, "loss": 0.8083207, "num_input_tokens_seen": 357586435, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 16568, "time_per_iteration": 2.457986354827881 }, { "auxiliary_loss_clip": 0.01099269, "auxiliary_loss_mlp": 0.01027566, "balance_loss_clip": 1.01674151, "balance_loss_mlp": 1.03360236, "epoch": 0.9961821734555839, "flos": 24199302839040.0, "grad_norm": 1.8770927524667853, "language_loss": 0.8207792, "learning_rate": 1.5050299059060634e-10, "loss": 0.84204751, "num_input_tokens_seen": 357604720, "router_z_loss_clip": 0.10839844, "router_z_loss_mlp": 0.65625, "step": 16569, "time_per_iteration": 2.4584853649139404 }, { "auxiliary_loss_clip": 0.01101726, "auxiliary_loss_mlp": 0.01032964, "balance_loss_clip": 1.02147233, "balance_loss_mlp": 1.03576112, "epoch": 0.9962422967082519, "flos": 22633741584000.0, "grad_norm": 2.2774311117419748, "language_loss": 0.70311671, "learning_rate": 1.457630950747468e-10, "loss": 0.72446358, "num_input_tokens_seen": 357622345, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66015625, "step": 16570, "time_per_iteration": 2.4273035526275635 }, { "auxiliary_loss_clip": 0.0110453, "auxiliary_loss_mlp": 0.01026857, "balance_loss_clip": 1.01466155, "balance_loss_mlp": 1.03637683, "epoch": 0.9963024199609198, "flos": 26396030903040.0, "grad_norm": 2.1661286350832527, "language_loss": 0.7494483, "learning_rate": 1.4109903423209502e-10, "loss": 0.77076215, "num_input_tokens_seen": 357642710, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6796875, "step": 16571, "time_per_iteration": 2.5035183429718018 }, { "auxiliary_loss_clip": 0.0110315, "auxiliary_loss_mlp": 0.01030696, "balance_loss_clip": 1.01828659, "balance_loss_mlp": 1.03480744, "epoch": 0.9963625432135879, "flos": 16581537342720.0, "grad_norm": 2.098248407906464, "language_loss": 0.7989161, "learning_rate": 1.3651080823939843e-10, "loss": 0.82025456, "num_input_tokens_seen": 357659870, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 16572, "time_per_iteration": 2.4071125984191895 }, { "auxiliary_loss_clip": 0.01103456, "auxiliary_loss_mlp": 0.01033767, "balance_loss_clip": 1.02123165, "balance_loss_mlp": 1.03550148, "epoch": 0.9964226664662559, "flos": 26468534505600.0, "grad_norm": 1.821089431975031, "language_loss": 0.70504844, "learning_rate": 1.3199841727074e-10, "loss": 0.72642064, "num_input_tokens_seen": 357677075, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.6796875, "step": 16573, "time_per_iteration": 2.4833431243896484 }, { "auxiliary_loss_clip": 0.0110598, "auxiliary_loss_mlp": 0.01033503, "balance_loss_clip": 1.02016377, "balance_loss_mlp": 1.03484035, "epoch": 0.9964827897189238, "flos": 27448320764160.0, "grad_norm": 1.710068488324502, "language_loss": 0.63355458, "learning_rate": 1.275618614968721e-10, "loss": 0.65494943, "num_input_tokens_seen": 357696715, "router_z_loss_clip": 0.1328125, "router_z_loss_mlp": 0.7109375, "step": 16574, "time_per_iteration": 2.4859819412231445 }, { "auxiliary_loss_clip": 0.01110736, "auxiliary_loss_mlp": 0.01032962, "balance_loss_clip": 1.01961076, "balance_loss_mlp": 1.03827155, "epoch": 0.9965429129715918, "flos": 11721566350080.0, "grad_norm": 2.918261948193309, "language_loss": 0.76461732, "learning_rate": 1.2320114108654856e-10, "loss": 0.78605437, "num_input_tokens_seen": 357712345, "router_z_loss_clip": 0.13378906, "router_z_loss_mlp": 0.7265625, "step": 16575, "time_per_iteration": 2.397263765335083 }, { "auxiliary_loss_clip": 0.01102995, "auxiliary_loss_mlp": 0.01032319, "balance_loss_clip": 1.01992106, "balance_loss_mlp": 1.03497481, "epoch": 0.9966030362242597, "flos": 19756004590080.0, "grad_norm": 1.7005714029434078, "language_loss": 0.70325625, "learning_rate": 1.1891625620474855e-10, "loss": 0.72460938, "num_input_tokens_seen": 357731815, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6796875, "step": 16576, "time_per_iteration": 2.439373731613159 }, { "auxiliary_loss_clip": 0.01101002, "auxiliary_loss_mlp": 0.01028235, "balance_loss_clip": 1.01603961, "balance_loss_mlp": 1.03517127, "epoch": 0.9966631594769277, "flos": 23915178259200.0, "grad_norm": 12.735029618785374, "language_loss": 0.71681058, "learning_rate": 1.1470720701400871e-10, "loss": 0.73810291, "num_input_tokens_seen": 357751640, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.66015625, "step": 16577, "time_per_iteration": 3.9469919204711914 }, { "auxiliary_loss_clip": 0.01102579, "auxiliary_loss_mlp": 0.01031714, "balance_loss_clip": 1.01943564, "balance_loss_mlp": 1.03419495, "epoch": 0.9967232827295956, "flos": 15559591495680.0, "grad_norm": 1.985344235652026, "language_loss": 0.78204226, "learning_rate": 1.1057399367397912e-10, "loss": 0.8033852, "num_input_tokens_seen": 357769850, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.68359375, "step": 16578, "time_per_iteration": 2.4375948905944824 }, { "auxiliary_loss_clip": 0.01105198, "auxiliary_loss_mlp": 0.01030778, "balance_loss_clip": 1.01878524, "balance_loss_mlp": 1.03577054, "epoch": 0.9967834059822637, "flos": 20813035046400.0, "grad_norm": 5.368066257098124, "language_loss": 0.75972855, "learning_rate": 1.0651661634142328e-10, "loss": 0.78108835, "num_input_tokens_seen": 357789550, "router_z_loss_clip": 0.12011719, "router_z_loss_mlp": 0.6953125, "step": 16579, "time_per_iteration": 2.4475231170654297 }, { "auxiliary_loss_clip": 0.01106559, "auxiliary_loss_mlp": 0.01037149, "balance_loss_clip": 1.02324939, "balance_loss_mlp": 1.03756797, "epoch": 0.9968435292349316, "flos": 36719234830080.0, "grad_norm": 2.0910901151280754, "language_loss": 0.69193685, "learning_rate": 1.0253507516999604e-10, "loss": 0.7133739, "num_input_tokens_seen": 357809525, "router_z_loss_clip": 0.13867188, "router_z_loss_mlp": 0.69140625, "step": 16580, "time_per_iteration": 2.5714364051818848 }, { "auxiliary_loss_clip": 0.0110376, "auxiliary_loss_mlp": 0.01028591, "balance_loss_clip": 1.016873, "balance_loss_mlp": 1.03460288, "epoch": 0.9969036524875996, "flos": 26760919213440.0, "grad_norm": 1.7484740357682407, "language_loss": 0.79643637, "learning_rate": 9.862937031113184e-11, "loss": 0.81775987, "num_input_tokens_seen": 357829795, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.69140625, "step": 16581, "time_per_iteration": 3.9347493648529053 }, { "auxiliary_loss_clip": 0.01099958, "auxiliary_loss_mlp": 0.01027237, "balance_loss_clip": 1.0164609, "balance_loss_mlp": 1.03378677, "epoch": 0.9969637757402675, "flos": 24827237424000.0, "grad_norm": 1.7637906541901809, "language_loss": 0.80060762, "learning_rate": 9.479950191249031e-11, "loss": 0.82187951, "num_input_tokens_seen": 357851655, "router_z_loss_clip": 0.10791016, "router_z_loss_mlp": 0.66015625, "step": 16582, "time_per_iteration": 2.502664089202881 }, { "auxiliary_loss_clip": 0.01099649, "auxiliary_loss_mlp": 0.01028978, "balance_loss_clip": 1.01740885, "balance_loss_mlp": 1.03424764, "epoch": 0.9970238989929355, "flos": 23038742407680.0, "grad_norm": 1.6365847272074285, "language_loss": 0.60231167, "learning_rate": 9.104547011951069e-11, "loss": 0.62359792, "num_input_tokens_seen": 357871205, "router_z_loss_clip": 0.11572266, "router_z_loss_mlp": 0.65234375, "step": 16583, "time_per_iteration": 2.483130693435669 }, { "auxiliary_loss_clip": 0.01102913, "auxiliary_loss_mlp": 0.01034933, "balance_loss_clip": 1.02308345, "balance_loss_mlp": 1.03381157, "epoch": 0.9970840222456034, "flos": 25298816106240.0, "grad_norm": 1.561272127738993, "language_loss": 0.77941716, "learning_rate": 8.736727507452357e-11, "loss": 0.80079561, "num_input_tokens_seen": 357892145, "router_z_loss_clip": 0.11865234, "router_z_loss_mlp": 0.69140625, "step": 16584, "time_per_iteration": 2.4964840412139893 }, { "auxiliary_loss_clip": 0.01098449, "auxiliary_loss_mlp": 0.01028312, "balance_loss_clip": 1.01727891, "balance_loss_mlp": 1.03247976, "epoch": 0.9971441454982715, "flos": 21615602578560.0, "grad_norm": 1.763220727621928, "language_loss": 0.69267511, "learning_rate": 8.376491691697297e-11, "loss": 0.71394265, "num_input_tokens_seen": 357911205, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.66015625, "step": 16585, "time_per_iteration": 2.4583182334899902 }, { "auxiliary_loss_clip": 0.01102792, "auxiliary_loss_mlp": 0.0102833, "balance_loss_clip": 1.01593828, "balance_loss_mlp": 1.03573823, "epoch": 0.9972042687509394, "flos": 14975612179200.0, "grad_norm": 2.7936477432284104, "language_loss": 0.81569815, "learning_rate": 8.023839578363834e-11, "loss": 0.83700943, "num_input_tokens_seen": 357928190, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.671875, "step": 16586, "time_per_iteration": 3.8036632537841797 }, { "auxiliary_loss_clip": 0.01102535, "auxiliary_loss_mlp": 0.01033059, "balance_loss_clip": 1.02134109, "balance_loss_mlp": 1.03349519, "epoch": 0.9972643920036074, "flos": 25806664546560.0, "grad_norm": 1.8383207403212312, "language_loss": 0.78029805, "learning_rate": 7.678771180796851e-11, "loss": 0.80165398, "num_input_tokens_seen": 357946985, "router_z_loss_clip": 0.1171875, "router_z_loss_mlp": 0.6875, "step": 16587, "time_per_iteration": 3.986821174621582 }, { "auxiliary_loss_clip": 0.0110664, "auxiliary_loss_mlp": 0.01033743, "balance_loss_clip": 1.02126801, "balance_loss_mlp": 1.03662026, "epoch": 0.9973245152562754, "flos": 23326242865920.0, "grad_norm": 1.6640908496575886, "language_loss": 0.72809458, "learning_rate": 7.341286512074773e-11, "loss": 0.74949843, "num_input_tokens_seen": 357966720, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69921875, "step": 16588, "time_per_iteration": 2.4610393047332764 }, { "auxiliary_loss_clip": 0.01108812, "auxiliary_loss_mlp": 0.01026693, "balance_loss_clip": 1.01400924, "balance_loss_mlp": 1.03555846, "epoch": 0.9973846385089433, "flos": 12166212810240.0, "grad_norm": 2.929506004059809, "language_loss": 0.82604194, "learning_rate": 7.011385585031781e-11, "loss": 0.84739697, "num_input_tokens_seen": 357981375, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.73046875, "step": 16589, "time_per_iteration": 2.417017936706543 }, { "auxiliary_loss_clip": 0.01108062, "auxiliary_loss_mlp": 0.01034049, "balance_loss_clip": 1.02005422, "balance_loss_mlp": 1.03637981, "epoch": 0.9974447617616113, "flos": 20045157073920.0, "grad_norm": 2.0678192502034114, "language_loss": 0.70773101, "learning_rate": 6.689068412168986e-11, "loss": 0.72915208, "num_input_tokens_seen": 358000290, "router_z_loss_clip": 0.140625, "router_z_loss_mlp": 0.71484375, "step": 16590, "time_per_iteration": 2.4493651390075684 }, { "auxiliary_loss_clip": 0.0110601, "auxiliary_loss_mlp": 0.01029352, "balance_loss_clip": 1.0164479, "balance_loss_mlp": 1.03627133, "epoch": 0.9975048850142793, "flos": 32014614159360.0, "grad_norm": 2.3372344684803523, "language_loss": 0.63447654, "learning_rate": 6.374335005676634e-11, "loss": 0.65583014, "num_input_tokens_seen": 358022075, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.6953125, "step": 16591, "time_per_iteration": 2.538595199584961 }, { "auxiliary_loss_clip": 0.0110091, "auxiliary_loss_mlp": 0.01029559, "balance_loss_clip": 1.01748323, "balance_loss_mlp": 1.03194571, "epoch": 0.9975650082669473, "flos": 36933728895360.0, "grad_norm": 1.7220514921895798, "language_loss": 0.73355258, "learning_rate": 6.067185377522933e-11, "loss": 0.7548573, "num_input_tokens_seen": 358043940, "router_z_loss_clip": 0.12109375, "router_z_loss_mlp": 0.69140625, "step": 16592, "time_per_iteration": 2.5489308834075928 }, { "auxiliary_loss_clip": 0.0110414, "auxiliary_loss_mlp": 0.0102839, "balance_loss_clip": 1.01598656, "balance_loss_mlp": 1.03484666, "epoch": 0.9976251315196152, "flos": 16472117537280.0, "grad_norm": 1.6371491064017967, "language_loss": 0.85188162, "learning_rate": 5.767619539343016e-11, "loss": 0.87320685, "num_input_tokens_seen": 358062720, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.69140625, "step": 16593, "time_per_iteration": 2.4370148181915283 }, { "auxiliary_loss_clip": 0.01099363, "auxiliary_loss_mlp": 0.01027512, "balance_loss_clip": 1.0162524, "balance_loss_mlp": 1.03395712, "epoch": 0.9976852547722832, "flos": 19646836179840.0, "grad_norm": 1.8439783097441815, "language_loss": 0.6958921, "learning_rate": 5.4756375024833656e-11, "loss": 0.71716076, "num_input_tokens_seen": 358081560, "router_z_loss_clip": 0.11279297, "router_z_loss_mlp": 0.65625, "step": 16594, "time_per_iteration": 2.4426486492156982 }, { "auxiliary_loss_clip": 0.01105642, "auxiliary_loss_mlp": 0.0102802, "balance_loss_clip": 1.0161109, "balance_loss_mlp": 1.03582263, "epoch": 0.9977453780249511, "flos": 20448434044800.0, "grad_norm": 3.390182545633768, "language_loss": 0.72882318, "learning_rate": 5.1912392780462113e-11, "loss": 0.7501598, "num_input_tokens_seen": 358099065, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.69921875, "step": 16595, "time_per_iteration": 2.4475178718566895 }, { "auxiliary_loss_clip": 0.01027294, "auxiliary_loss_mlp": 0.01003199, "balance_loss_clip": 1.0021919, "balance_loss_mlp": 1.00517488, "epoch": 0.9978055012776191, "flos": 65455097581440.0, "grad_norm": 0.8145696212197245, "language_loss": 0.60374093, "learning_rate": 4.9144248768007156e-11, "loss": 0.62404585, "num_input_tokens_seen": 358156095, "router_z_loss_clip": 0.0100708, "router_z_loss_mlp": 0.22167969, "step": 16596, "time_per_iteration": 2.956878662109375 }, { "auxiliary_loss_clip": 0.01104255, "auxiliary_loss_mlp": 0.01031437, "balance_loss_clip": 1.01907492, "balance_loss_mlp": 1.03587389, "epoch": 0.997865624530287, "flos": 20631506688000.0, "grad_norm": 1.8282259326961594, "language_loss": 0.77592796, "learning_rate": 4.645194309227385e-11, "loss": 0.79728484, "num_input_tokens_seen": 358175230, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.68359375, "step": 16597, "time_per_iteration": 2.4504759311676025 }, { "auxiliary_loss_clip": 0.01104704, "auxiliary_loss_mlp": 0.01030109, "balance_loss_clip": 1.0174551, "balance_loss_mlp": 1.0349195, "epoch": 0.9979257477829551, "flos": 29387102284800.0, "grad_norm": 2.4712760158221356, "language_loss": 0.82041109, "learning_rate": 4.383547585562475e-11, "loss": 0.8417592, "num_input_tokens_seen": 358197075, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.69921875, "step": 16598, "time_per_iteration": 2.5095057487487793 }, { "auxiliary_loss_clip": 0.0110826, "auxiliary_loss_mlp": 0.01040592, "balance_loss_clip": 1.02697873, "balance_loss_mlp": 1.03592706, "epoch": 0.997985871035623, "flos": 22635070387200.0, "grad_norm": 1.907441296819634, "language_loss": 0.6451568, "learning_rate": 4.129484715709175e-11, "loss": 0.66664529, "num_input_tokens_seen": 358215925, "router_z_loss_clip": 0.13671875, "router_z_loss_mlp": 0.72265625, "step": 16599, "time_per_iteration": 2.4715585708618164 }, { "auxiliary_loss_clip": 0.01027544, "auxiliary_loss_mlp": 0.0100244, "balance_loss_clip": 1.00143266, "balance_loss_mlp": 1.00535131, "epoch": 0.998045994288291, "flos": 61806968663040.0, "grad_norm": 0.8564862097931818, "language_loss": 0.62291324, "learning_rate": 3.8830057093264256e-11, "loss": 0.64321315, "num_input_tokens_seen": 358269035, "router_z_loss_clip": 0.0100708, "router_z_loss_mlp": 0.22265625, "step": 16600, "time_per_iteration": 2.9941253662109375 }, { "auxiliary_loss_clip": 0.01102078, "auxiliary_loss_mlp": 0.01029475, "balance_loss_clip": 1.01853204, "balance_loss_mlp": 1.03465986, "epoch": 0.998106117540959, "flos": 19245534456960.0, "grad_norm": 1.6748748994909932, "language_loss": 0.78555417, "learning_rate": 3.644110575717896e-11, "loss": 0.80686969, "num_input_tokens_seen": 358287680, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.671875, "step": 16601, "time_per_iteration": 2.4469480514526367 }, { "auxiliary_loss_clip": 0.01107074, "auxiliary_loss_mlp": 0.01028091, "balance_loss_clip": 1.01630163, "balance_loss_mlp": 1.03556466, "epoch": 0.9981662407936269, "flos": 21106209853440.0, "grad_norm": 4.822327358794947, "language_loss": 0.82730722, "learning_rate": 3.412799323987414e-11, "loss": 0.8486588, "num_input_tokens_seen": 358304080, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.71484375, "step": 16602, "time_per_iteration": 2.4352104663848877 }, { "auxiliary_loss_clip": 0.01104536, "auxiliary_loss_mlp": 0.01035929, "balance_loss_clip": 1.0240438, "balance_loss_mlp": 1.03574026, "epoch": 0.998226364046295, "flos": 24316839118080.0, "grad_norm": 3.436565068470833, "language_loss": 0.6291396, "learning_rate": 3.189071962883538e-11, "loss": 0.65054423, "num_input_tokens_seen": 358323670, "router_z_loss_clip": 0.11914062, "router_z_loss_mlp": 0.6875, "step": 16603, "time_per_iteration": 2.5385570526123047 }, { "auxiliary_loss_clip": 0.01102708, "auxiliary_loss_mlp": 0.01028303, "balance_loss_clip": 1.0158813, "balance_loss_mlp": 1.03365874, "epoch": 0.9982864872989629, "flos": 23836389776640.0, "grad_norm": 2.577244716034012, "language_loss": 0.71065021, "learning_rate": 2.972928500866168e-11, "loss": 0.7319603, "num_input_tokens_seen": 358341980, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.6875, "step": 16604, "time_per_iteration": 2.451148748397827 }, { "auxiliary_loss_clip": 0.01102691, "auxiliary_loss_mlp": 0.01026942, "balance_loss_clip": 1.01462197, "balance_loss_mlp": 1.03394306, "epoch": 0.9983466105516309, "flos": 18333116156160.0, "grad_norm": 1.4885992491200797, "language_loss": 0.64260232, "learning_rate": 2.7643689461953613e-11, "loss": 0.66389865, "num_input_tokens_seen": 358360400, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6875, "step": 16605, "time_per_iteration": 2.4335415363311768 }, { "auxiliary_loss_clip": 0.01101256, "auxiliary_loss_mlp": 0.01028918, "balance_loss_clip": 1.01742673, "balance_loss_mlp": 1.03449917, "epoch": 0.9984067338042988, "flos": 17236763285760.0, "grad_norm": 1.7967784230272257, "language_loss": 0.71723115, "learning_rate": 2.5633933067092938e-11, "loss": 0.7385329, "num_input_tokens_seen": 358378990, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66796875, "step": 16606, "time_per_iteration": 2.4490199089050293 }, { "auxiliary_loss_clip": 0.01103782, "auxiliary_loss_mlp": 0.01026898, "balance_loss_clip": 1.01505494, "balance_loss_mlp": 1.03490138, "epoch": 0.9984668570569668, "flos": 20667884186880.0, "grad_norm": 2.0214320533982, "language_loss": 0.81870747, "learning_rate": 2.370001590090709e-11, "loss": 0.84001422, "num_input_tokens_seen": 358395970, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6875, "step": 16607, "time_per_iteration": 2.4452338218688965 }, { "auxiliary_loss_clip": 0.01104609, "auxiliary_loss_mlp": 0.01030638, "balance_loss_clip": 1.01762021, "balance_loss_mlp": 1.03339875, "epoch": 0.9985269803096347, "flos": 30262532555520.0, "grad_norm": 5.956074719385704, "language_loss": 0.67130697, "learning_rate": 2.184193803622669e-11, "loss": 0.6926595, "num_input_tokens_seen": 358417355, "router_z_loss_clip": 0.12988281, "router_z_loss_mlp": 0.71484375, "step": 16608, "time_per_iteration": 2.5087404251098633 }, { "auxiliary_loss_clip": 0.01104861, "auxiliary_loss_mlp": 0.01031516, "balance_loss_clip": 1.01926732, "balance_loss_mlp": 1.03580415, "epoch": 0.9985871035623027, "flos": 10560970005120.0, "grad_norm": 3.888780262792226, "language_loss": 0.8078413, "learning_rate": 2.0059699543883978e-11, "loss": 0.8292051, "num_input_tokens_seen": 358434345, "router_z_loss_clip": 0.12255859, "router_z_loss_mlp": 0.69140625, "step": 16609, "time_per_iteration": 2.4158120155334473 }, { "auxiliary_loss_clip": 0.01102523, "auxiliary_loss_mlp": 0.01033081, "balance_loss_clip": 1.0205822, "balance_loss_mlp": 1.03333974, "epoch": 0.9986472268149706, "flos": 16873455173760.0, "grad_norm": 1.766515853159119, "language_loss": 0.62971008, "learning_rate": 1.8353300491158462e-11, "loss": 0.65106618, "num_input_tokens_seen": 358452870, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.69140625, "step": 16610, "time_per_iteration": 2.4258108139038086 }, { "auxiliary_loss_clip": 0.01101945, "auxiliary_loss_mlp": 0.01030325, "balance_loss_clip": 1.01897669, "balance_loss_mlp": 1.0333693, "epoch": 0.9987073500676387, "flos": 22054538776320.0, "grad_norm": 2.146310624656144, "language_loss": 0.66693664, "learning_rate": 1.672274094288717e-11, "loss": 0.68825936, "num_input_tokens_seen": 358472210, "router_z_loss_clip": 0.11328125, "router_z_loss_mlp": 0.6875, "step": 16611, "time_per_iteration": 2.460242986679077 }, { "auxiliary_loss_clip": 0.01103283, "auxiliary_loss_mlp": 0.01032304, "balance_loss_clip": 1.0196681, "balance_loss_mlp": 1.03475308, "epoch": 0.9987674733203066, "flos": 30482880537600.0, "grad_norm": 1.4705746825676034, "language_loss": 0.69762075, "learning_rate": 1.5168020961020544e-11, "loss": 0.71897662, "num_input_tokens_seen": 358493840, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 16612, "time_per_iteration": 2.506176233291626 }, { "auxiliary_loss_clip": 0.01100025, "auxiliary_loss_mlp": 0.01027697, "balance_loss_clip": 1.01661015, "balance_loss_mlp": 1.03447545, "epoch": 0.9988275965729746, "flos": 27745230585600.0, "grad_norm": 1.5213944962912902, "language_loss": 0.73888379, "learning_rate": 1.3689140604400407e-11, "loss": 0.76016104, "num_input_tokens_seen": 358515060, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.65625, "step": 16613, "time_per_iteration": 2.516723155975342 }, { "auxiliary_loss_clip": 0.01104618, "auxiliary_loss_mlp": 0.01029003, "balance_loss_clip": 1.01606917, "balance_loss_mlp": 1.03468943, "epoch": 0.9988877198256426, "flos": 17524191916800.0, "grad_norm": 2.258176652572995, "language_loss": 0.73476028, "learning_rate": 1.2286099928981996e-11, "loss": 0.75609654, "num_input_tokens_seen": 358528200, "router_z_loss_clip": 0.12890625, "router_z_loss_mlp": 0.69921875, "step": 16614, "time_per_iteration": 2.380295991897583 }, { "auxiliary_loss_clip": 0.01104067, "auxiliary_loss_mlp": 0.01028825, "balance_loss_clip": 1.01727343, "balance_loss_mlp": 1.03615904, "epoch": 0.9989478430783105, "flos": 20996502739200.0, "grad_norm": 1.6215738399686215, "language_loss": 0.72662812, "learning_rate": 1.0958898988278065e-11, "loss": 0.74795699, "num_input_tokens_seen": 358548360, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.6796875, "step": 16615, "time_per_iteration": 2.4589831829071045 }, { "auxiliary_loss_clip": 0.01107224, "auxiliary_loss_mlp": 0.01028417, "balance_loss_clip": 1.0159421, "balance_loss_mlp": 1.03668833, "epoch": 0.9990079663309785, "flos": 13370620769280.0, "grad_norm": 2.5741689139828994, "language_loss": 0.77381611, "learning_rate": 9.70753783247069e-12, "loss": 0.79517245, "num_input_tokens_seen": 358566270, "router_z_loss_clip": 0.125, "router_z_loss_mlp": 0.703125, "step": 16616, "time_per_iteration": 2.432251214981079 }, { "auxiliary_loss_clip": 0.01102316, "auxiliary_loss_mlp": 0.01028008, "balance_loss_clip": 1.01585436, "balance_loss_mlp": 1.03424692, "epoch": 0.9990680895836465, "flos": 17310236555520.0, "grad_norm": 1.8426496575352471, "language_loss": 0.83076656, "learning_rate": 8.532016508855378e-12, "loss": 0.8520698, "num_input_tokens_seen": 358584710, "router_z_loss_clip": 0.12158203, "router_z_loss_mlp": 0.6796875, "step": 16617, "time_per_iteration": 2.421769142150879 }, { "auxiliary_loss_clip": 0.01102298, "auxiliary_loss_mlp": 0.01025515, "balance_loss_clip": 1.01437521, "balance_loss_mlp": 1.03453612, "epoch": 0.9991282128363145, "flos": 24207993930240.0, "grad_norm": 1.5532126860961835, "language_loss": 0.78645074, "learning_rate": 7.43233506206309e-12, "loss": 0.80772889, "num_input_tokens_seen": 358606750, "router_z_loss_clip": 0.11132812, "router_z_loss_mlp": 0.6796875, "step": 16618, "time_per_iteration": 2.466270923614502 }, { "auxiliary_loss_clip": 0.01100767, "auxiliary_loss_mlp": 0.01029978, "balance_loss_clip": 1.0181942, "balance_loss_mlp": 1.03321242, "epoch": 0.9991883360889824, "flos": 21175301664000.0, "grad_norm": 1.6298317690935848, "language_loss": 0.74855787, "learning_rate": 6.408493534060255e-12, "loss": 0.76986533, "num_input_tokens_seen": 358624675, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.67578125, "step": 16619, "time_per_iteration": 3.9175591468811035 }, { "auxiliary_loss_clip": 0.01099353, "auxiliary_loss_mlp": 0.01027303, "balance_loss_clip": 1.01637769, "balance_loss_mlp": 1.03351855, "epoch": 0.9992484593416504, "flos": 19901155449600.0, "grad_norm": 1.9985442853593558, "language_loss": 0.86996925, "learning_rate": 5.460491963260594e-12, "loss": 0.89123583, "num_input_tokens_seen": 358640715, "router_z_loss_clip": 0.109375, "router_z_loss_mlp": 0.66015625, "step": 16620, "time_per_iteration": 2.4221811294555664 }, { "auxiliary_loss_clip": 0.01098043, "auxiliary_loss_mlp": 0.01024114, "balance_loss_clip": 1.01301527, "balance_loss_mlp": 1.03215134, "epoch": 0.9993085825943183, "flos": 24857832833280.0, "grad_norm": 2.1614331549797785, "language_loss": 0.7295714, "learning_rate": 4.58833038607942e-12, "loss": 0.75079292, "num_input_tokens_seen": 358659630, "router_z_loss_clip": 0.11083984, "router_z_loss_mlp": 0.65625, "step": 16621, "time_per_iteration": 2.4731404781341553 }, { "auxiliary_loss_clip": 0.01027622, "auxiliary_loss_mlp": 0.01002511, "balance_loss_clip": 1.00155187, "balance_loss_mlp": 1.00548697, "epoch": 0.9993687058469863, "flos": 71284478780160.0, "grad_norm": 0.7330098070789148, "language_loss": 0.56464446, "learning_rate": 3.79200883515729e-12, "loss": 0.5849458, "num_input_tokens_seen": 358727840, "router_z_loss_clip": 0.00958252, "router_z_loss_mlp": 0.22167969, "step": 16622, "time_per_iteration": 4.60698938369751 }, { "auxiliary_loss_clip": 0.01103876, "auxiliary_loss_mlp": 0.01027669, "balance_loss_clip": 1.01521754, "balance_loss_mlp": 1.03468108, "epoch": 0.9994288290996542, "flos": 12199573566720.0, "grad_norm": 1.8559050713958285, "language_loss": 0.71513146, "learning_rate": 3.071527340914315e-12, "loss": 0.73644686, "num_input_tokens_seen": 358744125, "router_z_loss_clip": 0.12451172, "router_z_loss_mlp": 0.69140625, "step": 16623, "time_per_iteration": 2.4089667797088623 }, { "auxiliary_loss_clip": 0.01101264, "auxiliary_loss_mlp": 0.01031785, "balance_loss_clip": 1.01898766, "balance_loss_mlp": 1.03412509, "epoch": 0.9994889523523223, "flos": 17889942153600.0, "grad_norm": 1.8253620904876993, "language_loss": 0.74782461, "learning_rate": 2.4268859304399368e-12, "loss": 0.76915514, "num_input_tokens_seen": 358761420, "router_z_loss_clip": 0.12792969, "router_z_loss_mlp": 0.671875, "step": 16624, "time_per_iteration": 2.4149861335754395 }, { "auxiliary_loss_clip": 0.01101745, "auxiliary_loss_mlp": 0.0102979, "balance_loss_clip": 1.01713049, "balance_loss_mlp": 1.03298521, "epoch": 0.9995490756049902, "flos": 26578888064640.0, "grad_norm": 1.7853278636764425, "language_loss": 0.73659539, "learning_rate": 1.8580846286031514e-12, "loss": 0.75791073, "num_input_tokens_seen": 358782600, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.6875, "step": 16625, "time_per_iteration": 2.5019731521606445 }, { "auxiliary_loss_clip": 0.01100705, "auxiliary_loss_mlp": 0.01032162, "balance_loss_clip": 1.02065194, "balance_loss_mlp": 1.03454089, "epoch": 0.9996091988576582, "flos": 22200048771840.0, "grad_norm": 2.589205315699116, "language_loss": 0.76646841, "learning_rate": 1.3651234567202408e-12, "loss": 0.78779703, "num_input_tokens_seen": 358801220, "router_z_loss_clip": 0.11523438, "router_z_loss_mlp": 0.66015625, "step": 16626, "time_per_iteration": 2.5756866931915283 }, { "auxiliary_loss_clip": 0.01102834, "auxiliary_loss_mlp": 0.0103207, "balance_loss_clip": 1.02028584, "balance_loss_mlp": 1.03649092, "epoch": 0.9996693221103262, "flos": 27373195468800.0, "grad_norm": 1.709786942064651, "language_loss": 0.82552838, "learning_rate": 9.480024334429515e-13, "loss": 0.84687746, "num_input_tokens_seen": 358819190, "router_z_loss_clip": 0.11816406, "router_z_loss_mlp": 0.6640625, "step": 16627, "time_per_iteration": 2.60577392578125 }, { "auxiliary_loss_clip": 0.01107812, "auxiliary_loss_mlp": 0.01034172, "balance_loss_clip": 1.02144587, "balance_loss_mlp": 1.03626287, "epoch": 0.9997294453629941, "flos": 26870410846080.0, "grad_norm": 2.012938506313586, "language_loss": 0.70851642, "learning_rate": 6.067215747584952e-13, "loss": 0.72993624, "num_input_tokens_seen": 358839850, "router_z_loss_clip": 0.12695312, "router_z_loss_mlp": 0.71484375, "step": 16628, "time_per_iteration": 5.283509254455566 }, { "auxiliary_loss_clip": 0.01102754, "auxiliary_loss_mlp": 0.01027616, "balance_loss_clip": 1.0154388, "balance_loss_mlp": 1.03322196, "epoch": 0.9997895686156621, "flos": 23476996247040.0, "grad_norm": 1.9041002039270591, "language_loss": 0.75565451, "learning_rate": 3.4128089332341456e-13, "loss": 0.77695823, "num_input_tokens_seen": 358859805, "router_z_loss_clip": 0.12207031, "router_z_loss_mlp": 0.6953125, "step": 16629, "time_per_iteration": 2.4665653705596924 }, { "auxiliary_loss_clip": 0.01107636, "auxiliary_loss_mlp": 0.01036438, "balance_loss_clip": 1.02403986, "balance_loss_mlp": 1.03634059, "epoch": 0.9998496918683301, "flos": 20224961579520.0, "grad_norm": 1.6467024169845204, "language_loss": 0.60442334, "learning_rate": 1.5168039935176126e-13, "loss": 0.62586403, "num_input_tokens_seen": 358877900, "router_z_loss_clip": 0.12402344, "router_z_loss_mlp": 0.7109375, "step": 16630, "time_per_iteration": 2.415832281112671 }, { "auxiliary_loss_clip": 0.01105635, "auxiliary_loss_mlp": 0.01026504, "balance_loss_clip": 1.01422584, "balance_loss_mlp": 1.03623533, "epoch": 0.9999098151209981, "flos": 21652913831040.0, "grad_norm": 1.7617381575398354, "language_loss": 0.60163867, "learning_rate": 3.792010017100722e-14, "loss": 0.62296009, "num_input_tokens_seen": 358897285, "router_z_loss_clip": 0.12304688, "router_z_loss_mlp": 0.6953125, "step": 16631, "time_per_iteration": 2.4491724967956543 }, { "auxiliary_loss_clip": 0.01102056, "auxiliary_loss_mlp": 0.01027401, "balance_loss_clip": 1.01637375, "balance_loss_mlp": 1.03572929, "epoch": 0.999969938373666, "flos": 11544599018880.0, "grad_norm": 1.8534794710118447, "language_loss": 0.73024958, "learning_rate": 0.0, "loss": 0.75154412, "num_input_tokens_seen": 358911570, "router_z_loss_clip": 0.11035156, "router_z_loss_mlp": 0.6640625, "step": 16632, "time_per_iteration": 2.3874118328094482 } ], "logging_steps": 1.0, "max_steps": 16632, "num_input_tokens_seen": 358911570, "num_train_epochs": 1, "save_steps": 3328, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3992169073237033e+18, "train_batch_size": 5, "trial_name": null, "trial_params": null }